{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999969938373666, "eval_steps": 500, "global_step": 16632, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "auxiliary_loss_clip": 0.0592014, "auxiliary_loss_mlp": 0.02849952, "balance_loss_clip": 1.9061954, "balance_loss_mlp": 2.26414394, "epoch": 6.012325266796934e-05, "flos": 24455432897280.0, "grad_norm": 59.88248960013876, "language_loss": 2.96915507, "learning_rate": 0.0, "loss": 2.02837133, "num_input_tokens_seen": 19155, "router_z_loss_clip": 9.4375, "router_z_loss_mlp": 36.5, "step": 1, "time_per_iteration": 13.95829463005066 }, { "auxiliary_loss_clip": 0.03875387, "auxiliary_loss_mlp": 0.01847448, "balance_loss_clip": 1.26494408, "balance_loss_mlp": 1.51304102, "epoch": 0.00012024650533593868, "flos": 20225010188160.0, "grad_norm": 38.091340254623105, "language_loss": 1.8701098, "learning_rate": 4.4628432569317594e-07, "loss": 1.92733812, "num_input_tokens_seen": 36175, "router_z_loss_clip": 5.84375, "router_z_loss_mlp": 23.625, "step": 2, "time_per_iteration": 2.3926379680633545 }, { "auxiliary_loss_clip": 0.03867972, "auxiliary_loss_mlp": 0.01837649, "balance_loss_clip": 1.24942243, "balance_loss_mlp": 1.51165771, "epoch": 0.000180369758003908, "flos": 22308835996800.0, "grad_norm": 34.54398734614813, "language_loss": 1.63752937, "learning_rate": 7.073439208833112e-07, "loss": 1.69458556, "num_input_tokens_seen": 54870, "router_z_loss_clip": 5.875, "router_z_loss_mlp": 23.5, "step": 3, "time_per_iteration": 2.3708834648132324 }, { "auxiliary_loss_clip": 0.03834647, "auxiliary_loss_mlp": 0.01874712, "balance_loss_clip": 1.23155427, "balance_loss_mlp": 1.51126909, "epoch": 0.00024049301067187735, "flos": 22413680409600.0, "grad_norm": 51.58421907872779, "language_loss": 1.74334764, "learning_rate": 8.925686513863519e-07, "loss": 1.80044127, "num_input_tokens_seen": 74575, "router_z_loss_clip": 6.4375, "router_z_loss_mlp": 23.25, "step": 4, "time_per_iteration": 2.3859238624572754 }, { "auxiliary_loss_clip": 0.0384364, "auxiliary_loss_mlp": 0.01827733, "balance_loss_clip": 1.2551465, "balance_loss_mlp": 1.50975645, "epoch": 0.0003006162633398467, "flos": 21395927099520.0, "grad_norm": 55.60498654380099, "language_loss": 1.97572052, "learning_rate": 1.0362401141348472e-06, "loss": 2.03243423, "num_input_tokens_seen": 92580, "router_z_loss_clip": 5.71875, "router_z_loss_mlp": 23.375, "step": 5, "time_per_iteration": 2.3426313400268555 }, { "auxiliary_loss_clip": 0.0384073, "auxiliary_loss_mlp": 0.01876276, "balance_loss_clip": 1.24265456, "balance_loss_mlp": 1.51395893, "epoch": 0.000360739516007816, "flos": 21651316761600.0, "grad_norm": 34.10565653870119, "language_loss": 1.6289196, "learning_rate": 1.153628246576487e-06, "loss": 1.68608975, "num_input_tokens_seen": 109705, "router_z_loss_clip": 6.3125, "router_z_loss_mlp": 23.25, "step": 6, "time_per_iteration": 2.3758718967437744 }, { "auxiliary_loss_clip": 0.03817913, "auxiliary_loss_mlp": 0.01907599, "balance_loss_clip": 1.25719333, "balance_loss_mlp": 1.51139355, "epoch": 0.0004208627686757854, "flos": 27158586312960.0, "grad_norm": 24.29854221742999, "language_loss": 1.55889654, "learning_rate": 1.2528784983718962e-06, "loss": 1.61615169, "num_input_tokens_seen": 129425, "router_z_loss_clip": 6.5, "router_z_loss_mlp": 23.0, "step": 7, "time_per_iteration": 2.497342348098755 }, { "auxiliary_loss_clip": 0.03645904, "auxiliary_loss_mlp": 0.01732322, "balance_loss_clip": 1.18567562, "balance_loss_mlp": 1.50970292, "epoch": 0.0004809860213437547, "flos": 31317824292480.0, "grad_norm": 30.418445323895252, "language_loss": 1.43859577, "learning_rate": 1.338852977079528e-06, "loss": 1.492378, "num_input_tokens_seen": 149210, "router_z_loss_clip": 5.46875, "router_z_loss_mlp": 21.375, "step": 8, "time_per_iteration": 2.58284592628479 }, { "auxiliary_loss_clip": 0.03627739, "auxiliary_loss_mlp": 0.01695781, "balance_loss_clip": 1.16935301, "balance_loss_mlp": 1.50706589, "epoch": 0.000541109274011724, "flos": 32159056435200.0, "grad_norm": 23.488022276991867, "language_loss": 1.57478893, "learning_rate": 1.4146878417666224e-06, "loss": 1.6280241, "num_input_tokens_seen": 169055, "router_z_loss_clip": 5.25, "router_z_loss_mlp": 21.25, "step": 9, "time_per_iteration": 2.5866003036499023 }, { "auxiliary_loss_clip": 0.03584784, "auxiliary_loss_mlp": 0.01715115, "balance_loss_clip": 1.16579831, "balance_loss_mlp": 1.50366664, "epoch": 0.0006012325266796934, "flos": 18915801914880.0, "grad_norm": 19.568016533137865, "language_loss": 1.5172112, "learning_rate": 1.4825244398280232e-06, "loss": 1.57021022, "num_input_tokens_seen": 188045, "router_z_loss_clip": 5.5, "router_z_loss_mlp": 20.75, "step": 10, "time_per_iteration": 2.5052900314331055 }, { "auxiliary_loss_clip": 0.0355306, "auxiliary_loss_mlp": 0.01714114, "balance_loss_clip": 1.20065534, "balance_loss_mlp": 1.5034039, "epoch": 0.0006613557793476627, "flos": 20773879672320.0, "grad_norm": 15.82446653794725, "language_loss": 1.48783445, "learning_rate": 1.5438901072051983e-06, "loss": 1.54050612, "num_input_tokens_seen": 207035, "router_z_loss_clip": 5.125, "router_z_loss_mlp": 20.5, "step": 11, "time_per_iteration": 2.5404746532440186 }, { "auxiliary_loss_clip": 0.03492447, "auxiliary_loss_mlp": 0.01690133, "balance_loss_clip": 1.15874553, "balance_loss_mlp": 1.5042367, "epoch": 0.000721479032015632, "flos": 16580740896000.0, "grad_norm": 12.687790286873813, "language_loss": 1.47094202, "learning_rate": 1.5999125722696629e-06, "loss": 1.5227679, "num_input_tokens_seen": 223225, "router_z_loss_clip": 5.3125, "router_z_loss_mlp": 19.875, "step": 12, "time_per_iteration": 2.496410846710205 }, { "auxiliary_loss_clip": 0.03339392, "auxiliary_loss_mlp": 0.01550267, "balance_loss_clip": 1.08887899, "balance_loss_mlp": 1.51099777, "epoch": 0.0007816022846836014, "flos": 23804340618240.0, "grad_norm": 8.758946386735074, "language_loss": 1.35767293, "learning_rate": 1.6514482443788434e-06, "loss": 1.40656948, "num_input_tokens_seen": 242570, "router_z_loss_clip": 4.625, "router_z_loss_mlp": 18.25, "step": 13, "time_per_iteration": 2.5658016204833984 }, { "auxiliary_loss_clip": 0.02829984, "auxiliary_loss_mlp": 0.01373452, "balance_loss_clip": 1.02517021, "balance_loss_mlp": 1.52180982, "epoch": 0.0008417255373515708, "flos": 19171191576960.0, "grad_norm": 5.41326447908647, "language_loss": 1.3153739, "learning_rate": 1.6991628240650723e-06, "loss": 1.3574084, "num_input_tokens_seen": 261215, "router_z_loss_clip": 3.484375, "router_z_loss_mlp": 13.125, "step": 14, "time_per_iteration": 2.4941866397857666 }, { "auxiliary_loss_clip": 0.02720327, "auxiliary_loss_mlp": 0.0137032, "balance_loss_clip": 1.03348207, "balance_loss_mlp": 1.52959895, "epoch": 0.00090184879001954, "flos": 26394372362880.0, "grad_norm": 4.764776492194351, "language_loss": 1.20919251, "learning_rate": 1.7435840350181584e-06, "loss": 1.25009894, "num_input_tokens_seen": 280035, "router_z_loss_clip": 3.375, "router_z_loss_mlp": 11.875, "step": 15, "time_per_iteration": 2.579803466796875 }, { "auxiliary_loss_clip": 0.02641292, "auxiliary_loss_mlp": 0.01373756, "balance_loss_clip": 1.03405726, "balance_loss_mlp": 1.5335772, "epoch": 0.0009619720426875094, "flos": 24678391305600.0, "grad_norm": 3.968460904230528, "language_loss": 1.19075227, "learning_rate": 1.7851373027727038e-06, "loss": 1.23090279, "num_input_tokens_seen": 300265, "router_z_loss_clip": 3.40625, "router_z_loss_mlp": 11.0625, "step": 16, "time_per_iteration": 2.5591747760772705 }, { "auxiliary_loss_clip": 0.02610849, "auxiliary_loss_mlp": 0.01333515, "balance_loss_clip": 1.03768528, "balance_loss_mlp": 1.53663445, "epoch": 0.0010220952953554788, "flos": 18623543990400.0, "grad_norm": 4.419232927622343, "language_loss": 1.27668214, "learning_rate": 1.8241705979033208e-06, "loss": 1.31612563, "num_input_tokens_seen": 317375, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 10.75, "step": 17, "time_per_iteration": 5.376318693161011 }, { "auxiliary_loss_clip": 0.02503798, "auxiliary_loss_mlp": 0.0134741, "balance_loss_clip": 1.03460526, "balance_loss_mlp": 1.54176152, "epoch": 0.001082218548023448, "flos": 26141286850560.0, "grad_norm": 3.2883009857040406, "language_loss": 1.14481819, "learning_rate": 1.860972167459798e-06, "loss": 1.1833303, "num_input_tokens_seen": 337975, "router_z_loss_clip": 3.125, "router_z_loss_mlp": 9.625, "step": 18, "time_per_iteration": 2.5738518238067627 }, { "auxiliary_loss_clip": 0.02425537, "auxiliary_loss_mlp": 0.01364425, "balance_loss_clip": 1.05944014, "balance_loss_mlp": 1.53502774, "epoch": 0.0011423418006914173, "flos": 19608758046720.0, "grad_norm": 3.386770393584129, "language_loss": 1.14091611, "learning_rate": 1.89578346593066e-06, "loss": 1.1788156, "num_input_tokens_seen": 356635, "router_z_loss_clip": 3.046875, "router_z_loss_mlp": 8.9375, "step": 19, "time_per_iteration": 2.4791295528411865 }, { "auxiliary_loss_clip": 0.02353366, "auxiliary_loss_mlp": 0.01330185, "balance_loss_clip": 1.05724311, "balance_loss_mlp": 1.5460546, "epoch": 0.0012024650533593868, "flos": 17894382912000.0, "grad_norm": 3.4597418022705453, "language_loss": 1.2497611, "learning_rate": 1.928808765521199e-06, "loss": 1.28659654, "num_input_tokens_seen": 375625, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 8.125, "step": 20, "time_per_iteration": 2.6001813411712646 }, { "auxiliary_loss_clip": 0.02273422, "auxiliary_loss_mlp": 0.013496, "balance_loss_clip": 1.0778029, "balance_loss_mlp": 1.54572868, "epoch": 0.001262588306027356, "flos": 21250967667840.0, "grad_norm": 3.6037352982033153, "language_loss": 1.1877749, "learning_rate": 1.9602224192552076e-06, "loss": 1.2240051, "num_input_tokens_seen": 394350, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 7.28125, "step": 21, "time_per_iteration": 2.5836637020111084 }, { "auxiliary_loss_clip": 0.02011067, "auxiliary_loss_mlp": 0.01442024, "balance_loss_clip": 1.14924622, "balance_loss_mlp": 1.55435538, "epoch": 0.0013227115586953253, "flos": 26102882488320.0, "grad_norm": 3.083812497930254, "language_loss": 1.19608283, "learning_rate": 1.9901744328983746e-06, "loss": 1.23061383, "num_input_tokens_seen": 413255, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 4.5625, "step": 22, "time_per_iteration": 2.539775848388672 }, { "auxiliary_loss_clip": 0.01926986, "auxiliary_loss_mlp": 0.01505657, "balance_loss_clip": 1.20963633, "balance_loss_mlp": 1.56594384, "epoch": 0.0013828348113632948, "flos": 23950242656640.0, "grad_norm": 2.827621860614114, "language_loss": 1.03031397, "learning_rate": 2.018794797290208e-06, "loss": 1.06464052, "num_input_tokens_seen": 433065, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 3.609375, "step": 23, "time_per_iteration": 2.514194965362549 }, { "auxiliary_loss_clip": 0.01894492, "auxiliary_loss_mlp": 0.01460161, "balance_loss_clip": 1.14869106, "balance_loss_mlp": 1.57090414, "epoch": 0.001442958064031264, "flos": 15958972759680.0, "grad_norm": 2.703874562223705, "language_loss": 1.15945315, "learning_rate": 2.046196897962839e-06, "loss": 1.19299972, "num_input_tokens_seen": 451175, "router_z_loss_clip": 3.109375, "router_z_loss_mlp": 3.234375, "step": 24, "time_per_iteration": 2.4529855251312256 }, { "auxiliary_loss_clip": 0.01871654, "auxiliary_loss_mlp": 0.01432521, "balance_loss_clip": 1.16129601, "balance_loss_mlp": 1.56573081, "epoch": 0.0015030813166992333, "flos": 18107527962240.0, "grad_norm": 3.48604146843946, "language_loss": 1.17391658, "learning_rate": 2.0724802282696944e-06, "loss": 1.20695829, "num_input_tokens_seen": 468775, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 3.0625, "step": 25, "time_per_iteration": 2.442079782485962 }, { "auxiliary_loss_clip": 0.01863681, "auxiliary_loss_mlp": 0.01389685, "balance_loss_clip": 1.12818706, "balance_loss_mlp": 1.56979775, "epoch": 0.0015632045693672028, "flos": 22233528460800.0, "grad_norm": 2.3852005903269413, "language_loss": 1.15666127, "learning_rate": 2.0977325700720194e-06, "loss": 1.18919492, "num_input_tokens_seen": 488530, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 2.9375, "step": 26, "time_per_iteration": 2.494749069213867 }, { "auxiliary_loss_clip": 0.01859968, "auxiliary_loss_mlp": 0.01338449, "balance_loss_clip": 1.10689712, "balance_loss_mlp": 1.5667156, "epoch": 0.001623327822035172, "flos": 23990706789120.0, "grad_norm": 2.314721074457863, "language_loss": 1.03393078, "learning_rate": 2.122031762649933e-06, "loss": 1.06591487, "num_input_tokens_seen": 510495, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 2.9375, "step": 27, "time_per_iteration": 2.513204574584961 }, { "auxiliary_loss_clip": 0.01847053, "auxiliary_loss_mlp": 0.01306927, "balance_loss_clip": 1.09387577, "balance_loss_mlp": 1.56402767, "epoch": 0.0016834510747031415, "flos": 19676769108480.0, "grad_norm": 2.190330751737024, "language_loss": 1.15545738, "learning_rate": 2.1454471497582483e-06, "loss": 1.18699718, "num_input_tokens_seen": 528605, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 2.828125, "step": 28, "time_per_iteration": 2.471423387527466 }, { "auxiliary_loss_clip": 0.01838201, "auxiliary_loss_mlp": 0.01268613, "balance_loss_clip": 1.0743494, "balance_loss_mlp": 1.55438936, "epoch": 0.0017435743273711108, "flos": 20922749176320.0, "grad_norm": 2.075086891165793, "language_loss": 1.1381247, "learning_rate": 2.1680407726407727e-06, "loss": 1.16919291, "num_input_tokens_seen": 548515, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 2.828125, "step": 29, "time_per_iteration": 2.4642415046691895 }, { "auxiliary_loss_clip": 0.01820986, "auxiliary_loss_mlp": 0.01259961, "balance_loss_clip": 1.07828605, "balance_loss_mlp": 1.54057741, "epoch": 0.00180369758003908, "flos": 19528178895360.0, "grad_norm": 2.510270472528044, "language_loss": 1.34787011, "learning_rate": 2.189868360711334e-06, "loss": 1.37867963, "num_input_tokens_seen": 564025, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 2.8125, "step": 30, "time_per_iteration": 2.4827816486358643 }, { "auxiliary_loss_clip": 0.01815259, "auxiliary_loss_mlp": 0.01236805, "balance_loss_clip": 1.05951691, "balance_loss_mlp": 1.53890336, "epoch": 0.0018638208327070496, "flos": 27451961400960.0, "grad_norm": 2.1540247244676842, "language_loss": 1.15840447, "learning_rate": 2.2109801597326265e-06, "loss": 1.18892515, "num_input_tokens_seen": 583345, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 2.75, "step": 31, "time_per_iteration": 2.514981269836426 }, { "auxiliary_loss_clip": 0.01797245, "auxiliary_loss_mlp": 0.01202668, "balance_loss_clip": 1.02967131, "balance_loss_mlp": 1.52960193, "epoch": 0.0019239440853750188, "flos": 13588614489600.0, "grad_norm": 1.958003866574222, "language_loss": 1.04776859, "learning_rate": 2.2314216284658796e-06, "loss": 1.07776773, "num_input_tokens_seen": 600010, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 2.671875, "step": 32, "time_per_iteration": 2.4984066486358643 }, { "auxiliary_loss_clip": 0.01792347, "auxiliary_loss_mlp": 0.01218719, "balance_loss_clip": 1.03799748, "balance_loss_mlp": 1.52601659, "epoch": 0.001984067338042988, "flos": 11253099623040.0, "grad_norm": 2.540123101717717, "language_loss": 1.10896111, "learning_rate": 2.2512340280885094e-06, "loss": 1.1390717, "num_input_tokens_seen": 616295, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 2.65625, "step": 33, "time_per_iteration": 2.44722843170166 }, { "auxiliary_loss_clip": 0.01751008, "auxiliary_loss_mlp": 0.01202346, "balance_loss_clip": 1.03211534, "balance_loss_mlp": 1.50218964, "epoch": 0.0020441905907109576, "flos": 22385051228160.0, "grad_norm": 1.766705137559389, "language_loss": 0.98252147, "learning_rate": 2.270454923596497e-06, "loss": 1.01205504, "num_input_tokens_seen": 637640, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 2.484375, "step": 34, "time_per_iteration": 2.5171329975128174 }, { "auxiliary_loss_clip": 0.0171905, "auxiliary_loss_mlp": 0.01198976, "balance_loss_clip": 1.03961742, "balance_loss_mlp": 1.47864413, "epoch": 0.0021043138433789266, "flos": 49776858489600.0, "grad_norm": 1.934875159455613, "language_loss": 0.87226588, "learning_rate": 2.2891186125067434e-06, "loss": 0.90144616, "num_input_tokens_seen": 659710, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 2.40625, "step": 35, "time_per_iteration": 2.731252908706665 }, { "auxiliary_loss_clip": 0.01696442, "auxiliary_loss_mlp": 0.0120763, "balance_loss_clip": 1.05141854, "balance_loss_mlp": 1.46685505, "epoch": 0.002164437096046896, "flos": 20556929283840.0, "grad_norm": 1.852836988720178, "language_loss": 0.99594855, "learning_rate": 2.307256493152974e-06, "loss": 1.02498937, "num_input_tokens_seen": 679670, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 2.296875, "step": 36, "time_per_iteration": 2.523585557937622 }, { "auxiliary_loss_clip": 0.0166853, "auxiliary_loss_mlp": 0.01179437, "balance_loss_clip": 1.0360043, "balance_loss_mlp": 1.45414805, "epoch": 0.0022245603487148656, "flos": 26541077362560.0, "grad_norm": 1.8982971776811952, "language_loss": 1.05561364, "learning_rate": 2.3248973825097614e-06, "loss": 1.08409333, "num_input_tokens_seen": 700170, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 2.140625, "step": 37, "time_per_iteration": 2.5188474655151367 }, { "auxiliary_loss_clip": 0.01645951, "auxiliary_loss_mlp": 0.01172056, "balance_loss_clip": 1.03701544, "balance_loss_mlp": 1.43997324, "epoch": 0.0022846836013828346, "flos": 20337185986560.0, "grad_norm": 1.7940956563805204, "language_loss": 1.10813737, "learning_rate": 2.3420677916238357e-06, "loss": 1.13631737, "num_input_tokens_seen": 718545, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 2.0625, "step": 38, "time_per_iteration": 2.4835011959075928 }, { "auxiliary_loss_clip": 0.01615695, "auxiliary_loss_mlp": 0.01168341, "balance_loss_clip": 1.03816485, "balance_loss_mlp": 1.42254043, "epoch": 0.002344806854050804, "flos": 26246445465600.0, "grad_norm": 1.797232245845728, "language_loss": 0.94572014, "learning_rate": 2.358792165262154e-06, "loss": 0.97356045, "num_input_tokens_seen": 739865, "router_z_loss_clip": 1.3046875, "router_z_loss_mlp": 1.9296875, "step": 39, "time_per_iteration": 2.545525550842285 }, { "auxiliary_loss_clip": 0.01589535, "auxiliary_loss_mlp": 0.01171658, "balance_loss_clip": 1.04415178, "balance_loss_mlp": 1.40366411, "epoch": 0.0024049301067187736, "flos": 11800747209600.0, "grad_norm": 2.3252134330432153, "language_loss": 1.05606556, "learning_rate": 2.3750930912143747e-06, "loss": 1.08367753, "num_input_tokens_seen": 755770, "router_z_loss_clip": 1.2734375, "router_z_loss_mlp": 1.859375, "step": 40, "time_per_iteration": 2.4814131259918213 }, { "auxiliary_loss_clip": 0.0156809, "auxiliary_loss_mlp": 0.01151629, "balance_loss_clip": 1.03137076, "balance_loss_mlp": 1.39235711, "epoch": 0.0024650533593867426, "flos": 20630456340480.0, "grad_norm": 1.9304736304141732, "language_loss": 1.04341471, "learning_rate": 2.3909914837471044e-06, "loss": 1.07061172, "num_input_tokens_seen": 773440, "router_z_loss_clip": 1.203125, "router_z_loss_mlp": 1.7578125, "step": 41, "time_per_iteration": 2.4887442588806152 }, { "auxiliary_loss_clip": 0.01547533, "auxiliary_loss_mlp": 0.0115471, "balance_loss_clip": 1.03435612, "balance_loss_mlp": 1.37903047, "epoch": 0.002525176612054712, "flos": 18405127324800.0, "grad_norm": 1.8610384533041682, "language_loss": 1.05148911, "learning_rate": 2.4065067449483835e-06, "loss": 1.07851148, "num_input_tokens_seen": 790455, "router_z_loss_clip": 1.203125, "router_z_loss_mlp": 1.6875, "step": 42, "time_per_iteration": 2.5541391372680664 }, { "auxiliary_loss_clip": 0.01525354, "auxiliary_loss_mlp": 0.01151981, "balance_loss_clip": 1.04001999, "balance_loss_mlp": 1.36522567, "epoch": 0.0025852998647226816, "flos": 28182763313280.0, "grad_norm": 2.0744596214174855, "language_loss": 1.10404158, "learning_rate": 2.4216569070848724e-06, "loss": 1.13081491, "num_input_tokens_seen": 810645, "router_z_loss_clip": 1.125, "router_z_loss_mlp": 1.6015625, "step": 43, "time_per_iteration": 2.533997058868408 }, { "auxiliary_loss_clip": 0.01508479, "auxiliary_loss_mlp": 0.01141589, "balance_loss_clip": 1.03248906, "balance_loss_mlp": 1.35540771, "epoch": 0.0026454231173906506, "flos": 14282233937280.0, "grad_norm": 1.7923717148249703, "language_loss": 1.07973742, "learning_rate": 2.4364587585915504e-06, "loss": 1.10623813, "num_input_tokens_seen": 827470, "router_z_loss_clip": 1.09375, "router_z_loss_mlp": 1.53125, "step": 44, "time_per_iteration": 2.4795925617218018 }, { "auxiliary_loss_clip": 0.01497536, "auxiliary_loss_mlp": 0.01137356, "balance_loss_clip": 1.03011465, "balance_loss_mlp": 1.34896302, "epoch": 0.00270554637005862, "flos": 22418114152320.0, "grad_norm": 1.6965701630206476, "language_loss": 1.09130347, "learning_rate": 2.450927955901469e-06, "loss": 1.11765242, "num_input_tokens_seen": 847285, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 1.484375, "step": 45, "time_per_iteration": 2.570896863937378 }, { "auxiliary_loss_clip": 0.01485914, "auxiliary_loss_mlp": 0.01129942, "balance_loss_clip": 1.02408433, "balance_loss_mlp": 1.34306371, "epoch": 0.0027656696227265896, "flos": 23984702035200.0, "grad_norm": 1.5400208136049292, "language_loss": 1.10024905, "learning_rate": 2.465079122983384e-06, "loss": 1.12640762, "num_input_tokens_seen": 867545, "router_z_loss_clip": 1.0546875, "router_z_loss_mlp": 1.4296875, "step": 46, "time_per_iteration": 2.5858287811279297 }, { "auxiliary_loss_clip": 0.01480002, "auxiliary_loss_mlp": 0.01128872, "balance_loss_clip": 1.02701914, "balance_loss_mlp": 1.33938384, "epoch": 0.0028257928753945586, "flos": 37668001731840.0, "grad_norm": 1.646375280954016, "language_loss": 0.98705554, "learning_rate": 2.4789259401737868e-06, "loss": 1.01314437, "num_input_tokens_seen": 889915, "router_z_loss_clip": 1.015625, "router_z_loss_mlp": 1.40625, "step": 47, "time_per_iteration": 2.662827253341675 }, { "auxiliary_loss_clip": 0.01477236, "auxiliary_loss_mlp": 0.01140824, "balance_loss_clip": 1.04202318, "balance_loss_mlp": 1.34053612, "epoch": 0.002885916128062528, "flos": 22453481226240.0, "grad_norm": 1.6837192815734772, "language_loss": 0.94916022, "learning_rate": 2.492481223656015e-06, "loss": 0.97534078, "num_input_tokens_seen": 908975, "router_z_loss_clip": 0.98828125, "router_z_loss_mlp": 1.3671875, "step": 48, "time_per_iteration": 2.646942377090454 }, { "auxiliary_loss_clip": 0.01472159, "auxiliary_loss_mlp": 0.01137421, "balance_loss_clip": 1.0374279, "balance_loss_mlp": 1.3341831, "epoch": 0.0029460393807304976, "flos": 27011671845120.0, "grad_norm": 1.7167334518358937, "language_loss": 0.97937459, "learning_rate": 2.5057569967437924e-06, "loss": 1.00547028, "num_input_tokens_seen": 929810, "router_z_loss_clip": 1.0, "router_z_loss_mlp": 1.375, "step": 49, "time_per_iteration": 2.7306461334228516 }, { "auxiliary_loss_clip": 0.01463358, "auxiliary_loss_mlp": 0.01122727, "balance_loss_clip": 1.03303349, "balance_loss_mlp": 1.33046627, "epoch": 0.0030061626333984666, "flos": 15850916501760.0, "grad_norm": 1.7760834902183293, "language_loss": 0.98691154, "learning_rate": 2.51876455396287e-06, "loss": 1.01277232, "num_input_tokens_seen": 948650, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 1.328125, "step": 50, "time_per_iteration": 2.6136646270751953 }, { "auxiliary_loss_clip": 0.01461312, "auxiliary_loss_mlp": 0.01126198, "balance_loss_clip": 1.04046226, "balance_loss_mlp": 1.32714772, "epoch": 0.003066285886066436, "flos": 31825845619200.0, "grad_norm": 1.8773957611407543, "language_loss": 0.99911571, "learning_rate": 2.5315145187866316e-06, "loss": 1.0249908, "num_input_tokens_seen": 966455, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 1.34375, "step": 51, "time_per_iteration": 2.694885492324829 }, { "auxiliary_loss_clip": 0.01448632, "auxiliary_loss_mlp": 0.0112479, "balance_loss_clip": 1.03767216, "balance_loss_mlp": 1.32371688, "epoch": 0.0031264091387344056, "flos": 41425878188160.0, "grad_norm": 1.755027267036778, "language_loss": 1.0240953, "learning_rate": 2.5440168957651953e-06, "loss": 1.04982948, "num_input_tokens_seen": 988110, "router_z_loss_clip": 0.87109375, "router_z_loss_mlp": 1.25, "step": 52, "time_per_iteration": 2.738954544067383 }, { "auxiliary_loss_clip": 0.01438265, "auxiliary_loss_mlp": 0.01107397, "balance_loss_clip": 1.0242846, "balance_loss_mlp": 1.31720757, "epoch": 0.0031865323914023747, "flos": 23439812446080.0, "grad_norm": 1.6129298069702809, "language_loss": 1.01391912, "learning_rate": 2.5562811176888872e-06, "loss": 1.03937578, "num_input_tokens_seen": 1008550, "router_z_loss_clip": 0.828125, "router_z_loss_mlp": 1.2109375, "step": 53, "time_per_iteration": 2.570388078689575 }, { "auxiliary_loss_clip": 0.01428581, "auxiliary_loss_mlp": 0.01096553, "balance_loss_clip": 1.02097392, "balance_loss_mlp": 1.30994177, "epoch": 0.003246655644070344, "flos": 14428310532480.0, "grad_norm": 1.8091849462926297, "language_loss": 0.93399954, "learning_rate": 2.5683160883431093e-06, "loss": 0.95925093, "num_input_tokens_seen": 1026840, "router_z_loss_clip": 0.7578125, "router_z_loss_mlp": 1.1875, "step": 54, "time_per_iteration": 2.6554181575775146 }, { "auxiliary_loss_clip": 0.01420672, "auxiliary_loss_mlp": 0.01098998, "balance_loss_clip": 1.02031994, "balance_loss_mlp": 1.30313826, "epoch": 0.0033067788967383136, "flos": 35916793246080.0, "grad_norm": 1.9390605439657922, "language_loss": 0.92271101, "learning_rate": 2.580130221340046e-06, "loss": 0.94790775, "num_input_tokens_seen": 1048875, "router_z_loss_clip": 0.7890625, "router_z_loss_mlp": 1.171875, "step": 55, "time_per_iteration": 2.6824378967285156 }, { "auxiliary_loss_clip": 0.01416151, "auxiliary_loss_mlp": 0.01100073, "balance_loss_clip": 1.02072692, "balance_loss_mlp": 1.29935884, "epoch": 0.003366902149406283, "flos": 22957836860160.0, "grad_norm": 2.2229776628331526, "language_loss": 1.02354062, "learning_rate": 2.5917314754514246e-06, "loss": 1.04870272, "num_input_tokens_seen": 1066435, "router_z_loss_clip": 0.79296875, "router_z_loss_mlp": 1.171875, "step": 56, "time_per_iteration": 4.229781866073608 }, { "auxiliary_loss_clip": 0.0141158, "auxiliary_loss_mlp": 0.01098514, "balance_loss_clip": 1.02756071, "balance_loss_mlp": 1.29667664, "epoch": 0.003427025402074252, "flos": 26581506583680.0, "grad_norm": 1.6610750254902438, "language_loss": 1.03736138, "learning_rate": 2.6031273868139713e-06, "loss": 1.06246233, "num_input_tokens_seen": 1090330, "router_z_loss_clip": 0.7109375, "router_z_loss_mlp": 1.1484375, "step": 57, "time_per_iteration": 4.268309831619263 }, { "auxiliary_loss_clip": 0.01403879, "auxiliary_loss_mlp": 0.01084009, "balance_loss_clip": 1.0168705, "balance_loss_mlp": 1.29193509, "epoch": 0.0034871486547422216, "flos": 23950068099840.0, "grad_norm": 1.7818773582281597, "language_loss": 1.07827854, "learning_rate": 2.614325098333948e-06, "loss": 1.1031574, "num_input_tokens_seen": 1109840, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 1.1171875, "step": 58, "time_per_iteration": 2.533069372177124 }, { "auxiliary_loss_clip": 0.01391815, "auxiliary_loss_mlp": 0.01077123, "balance_loss_clip": 1.01570654, "balance_loss_mlp": 1.28348184, "epoch": 0.003547271907410191, "flos": 21213924848640.0, "grad_norm": 1.8801083671758678, "language_loss": 0.98636395, "learning_rate": 2.625331386578098e-06, "loss": 1.01105332, "num_input_tokens_seen": 1128415, "router_z_loss_clip": 0.61328125, "router_z_loss_mlp": 1.0859375, "step": 59, "time_per_iteration": 2.63613224029541 }, { "auxiliary_loss_clip": 0.01389579, "auxiliary_loss_mlp": 0.01094339, "balance_loss_clip": 1.02772522, "balance_loss_mlp": 1.27919149, "epoch": 0.00360739516007816, "flos": 16504071816960.0, "grad_norm": 1.7572322462792613, "language_loss": 1.03732479, "learning_rate": 2.63615268640451e-06, "loss": 1.06216395, "num_input_tokens_seen": 1146515, "router_z_loss_clip": 0.66796875, "router_z_loss_mlp": 1.109375, "step": 60, "time_per_iteration": 2.5318427085876465 }, { "auxiliary_loss_clip": 0.01380882, "auxiliary_loss_mlp": 0.0109855, "balance_loss_clip": 1.03532171, "balance_loss_mlp": 1.27369034, "epoch": 0.0036675184127461296, "flos": 19463763703680.0, "grad_norm": 1.96366380895085, "language_loss": 1.00712299, "learning_rate": 2.6467951135575943e-06, "loss": 1.03191733, "num_input_tokens_seen": 1166330, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 1.078125, "step": 61, "time_per_iteration": 2.5204458236694336 }, { "auxiliary_loss_clip": 0.01377498, "auxiliary_loss_mlp": 0.0109224, "balance_loss_clip": 1.02743793, "balance_loss_mlp": 1.27065504, "epoch": 0.003727641665414099, "flos": 20956335770880.0, "grad_norm": 1.6938978530507907, "language_loss": 0.97925043, "learning_rate": 2.657264485425803e-06, "loss": 1.00394773, "num_input_tokens_seen": 1186010, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 1.0703125, "step": 62, "time_per_iteration": 2.580873489379883 }, { "auxiliary_loss_clip": 0.01369197, "auxiliary_loss_mlp": 0.01073564, "balance_loss_clip": 1.01529503, "balance_loss_mlp": 1.26563585, "epoch": 0.003787764918082068, "flos": 18405057502080.0, "grad_norm": 1.6056544984710879, "language_loss": 1.02707219, "learning_rate": 2.6675663401385186e-06, "loss": 1.05149984, "num_input_tokens_seen": 1204985, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 1.0390625, "step": 63, "time_per_iteration": 2.5049960613250732 }, { "auxiliary_loss_clip": 0.01366937, "auxiliary_loss_mlp": 0.01081452, "balance_loss_clip": 1.02194309, "balance_loss_mlp": 1.26473141, "epoch": 0.0038478881707500376, "flos": 12458406090240.0, "grad_norm": 1.9646688865502346, "language_loss": 1.10748148, "learning_rate": 2.677705954159056e-06, "loss": 1.13196528, "num_input_tokens_seen": 1223545, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 1.0234375, "step": 64, "time_per_iteration": 3.22587251663208 }, { "auxiliary_loss_clip": 0.01361245, "auxiliary_loss_mlp": 0.01080482, "balance_loss_clip": 1.02288008, "balance_loss_mlp": 1.25962412, "epoch": 0.003908011423418007, "flos": 13552479365760.0, "grad_norm": 1.925475193310242, "language_loss": 1.00453138, "learning_rate": 2.6876883585136904e-06, "loss": 1.02894866, "num_input_tokens_seen": 1241175, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 1.015625, "step": 65, "time_per_iteration": 4.017646551132202 }, { "auxiliary_loss_clip": 0.01357257, "auxiliary_loss_mlp": 0.01080493, "balance_loss_clip": 1.02112687, "balance_loss_mlp": 1.25508583, "epoch": 0.003968134676085976, "flos": 18332473052160.0, "grad_norm": 1.5746626958295853, "language_loss": 0.98168778, "learning_rate": 2.697518353781685e-06, "loss": 1.00606525, "num_input_tokens_seen": 1259315, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 1.0234375, "step": 66, "time_per_iteration": 4.332080364227295 }, { "auxiliary_loss_clip": 0.01348039, "auxiliary_loss_mlp": 0.01080387, "balance_loss_clip": 1.02288032, "balance_loss_mlp": 1.24804461, "epoch": 0.004028257928753946, "flos": 20484205188480.0, "grad_norm": 1.9783854595226316, "language_loss": 1.10036111, "learning_rate": 2.7072005239581103e-06, "loss": 1.12464547, "num_input_tokens_seen": 1277055, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.99609375, "step": 67, "time_per_iteration": 4.574966192245483 }, { "auxiliary_loss_clip": 0.01343347, "auxiliary_loss_mlp": 0.01079155, "balance_loss_clip": 1.02431893, "balance_loss_mlp": 1.24566841, "epoch": 0.004088381181421915, "flos": 18842833440000.0, "grad_norm": 1.707964296993641, "language_loss": 1.01583052, "learning_rate": 2.7167392492896727e-06, "loss": 1.04005563, "num_input_tokens_seen": 1294355, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.9765625, "step": 68, "time_per_iteration": 2.9387080669403076 }, { "auxiliary_loss_clip": 0.01336421, "auxiliary_loss_mlp": 0.01077488, "balance_loss_clip": 1.02522683, "balance_loss_mlp": 1.23911297, "epoch": 0.004148504434089885, "flos": 19426790707200.0, "grad_norm": 1.6095956306570647, "language_loss": 1.05997467, "learning_rate": 2.7261387181735195e-06, "loss": 1.0841136, "num_input_tokens_seen": 1313525, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.97265625, "step": 69, "time_per_iteration": 2.608163356781006 }, { "auxiliary_loss_clip": 0.01332264, "auxiliary_loss_mlp": 0.01069159, "balance_loss_clip": 1.01677835, "balance_loss_mlp": 1.23615658, "epoch": 0.004208627686757853, "flos": 20810049707520.0, "grad_norm": 2.0607956530144578, "language_loss": 1.0951407, "learning_rate": 2.7354029381999196e-06, "loss": 1.11915493, "num_input_tokens_seen": 1330505, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.9609375, "step": 70, "time_per_iteration": 2.6302578449249268 }, { "auxiliary_loss_clip": 0.01328541, "auxiliary_loss_mlp": 0.01075035, "balance_loss_clip": 1.018291, "balance_loss_mlp": 1.23221707, "epoch": 0.004268750939425823, "flos": 19097629608960.0, "grad_norm": 2.0416179902127864, "language_loss": 1.17451215, "learning_rate": 2.7445357464116983e-06, "loss": 1.19854784, "num_input_tokens_seen": 1349615, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.9609375, "step": 71, "time_per_iteration": 2.5630364418029785 }, { "auxiliary_loss_clip": 0.01353489, "auxiliary_loss_mlp": 0.01059675, "balance_loss_clip": 1.00321722, "balance_loss_mlp": 1.27533937, "epoch": 0.004328874192093792, "flos": 52436889377280.0, "grad_norm": 2.3851030891106486, "language_loss": 0.657565, "learning_rate": 2.75354081884615e-06, "loss": 0.68169665, "num_input_tokens_seen": 1410275, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.78125, "step": 72, "time_per_iteration": 3.2282726764678955 }, { "auxiliary_loss_clip": 0.01345302, "auxiliary_loss_mlp": 0.0105828, "balance_loss_clip": 1.00907028, "balance_loss_mlp": 1.26883674, "epoch": 0.004388997444761762, "flos": 66469459115520.0, "grad_norm": 2.246094963193842, "language_loss": 0.63956976, "learning_rate": 2.7624216794188286e-06, "loss": 0.66360557, "num_input_tokens_seen": 1473020, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 0.765625, "step": 73, "time_per_iteration": 3.206207036972046 }, { "auxiliary_loss_clip": 0.01312721, "auxiliary_loss_mlp": 0.01078268, "balance_loss_clip": 1.02445734, "balance_loss_mlp": 1.21759152, "epoch": 0.004449120697429731, "flos": 18951971950080.0, "grad_norm": 1.8000698391709369, "language_loss": 0.98054969, "learning_rate": 2.771181708202938e-06, "loss": 1.00445962, "num_input_tokens_seen": 1490385, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.953125, "step": 74, "time_per_iteration": 2.5685036182403564 }, { "auxiliary_loss_clip": 0.01302433, "auxiliary_loss_mlp": 0.01066214, "balance_loss_clip": 1.0158602, "balance_loss_mlp": 1.21137786, "epoch": 0.004509243950097701, "flos": 21104437224960.0, "grad_norm": 1.7946804275055441, "language_loss": 1.07349372, "learning_rate": 2.779824149153005e-06, "loss": 1.09718025, "num_input_tokens_seen": 1509725, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.9140625, "step": 75, "time_per_iteration": 2.5850989818573 }, { "auxiliary_loss_clip": 0.01301294, "auxiliary_loss_mlp": 0.0107544, "balance_loss_clip": 1.02284467, "balance_loss_mlp": 1.20859528, "epoch": 0.004569367202765669, "flos": 20697838997760.0, "grad_norm": 1.8031505198727962, "language_loss": 0.98880738, "learning_rate": 2.788352117317012e-06, "loss": 1.01257467, "num_input_tokens_seen": 1527245, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.92578125, "step": 76, "time_per_iteration": 2.566347360610962 }, { "auxiliary_loss_clip": 0.01297131, "auxiliary_loss_mlp": 0.01088256, "balance_loss_clip": 1.03680515, "balance_loss_mlp": 1.20522153, "epoch": 0.004629490455433639, "flos": 28657198045440.0, "grad_norm": 1.6570994166061204, "language_loss": 1.02446651, "learning_rate": 2.796768605577095e-06, "loss": 1.04832041, "num_input_tokens_seen": 1548930, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.91796875, "step": 77, "time_per_iteration": 2.6131017208099365 }, { "auxiliary_loss_clip": 0.01287556, "auxiliary_loss_mlp": 0.01072594, "balance_loss_clip": 1.02483845, "balance_loss_mlp": 1.19914961, "epoch": 0.004689613708101608, "flos": 11071621042560.0, "grad_norm": 1.8950067555859382, "language_loss": 1.0496515, "learning_rate": 2.80507649095533e-06, "loss": 1.07325292, "num_input_tokens_seen": 1565695, "router_z_loss_clip": 0.47851562, "router_z_loss_mlp": 0.8828125, "step": 78, "time_per_iteration": 2.61002516746521 }, { "auxiliary_loss_clip": 0.01283501, "auxiliary_loss_mlp": 0.01062978, "balance_loss_clip": 1.01422167, "balance_loss_mlp": 1.19583321, "epoch": 0.004749736960769578, "flos": 21798021761280.0, "grad_norm": 2.079147920979707, "language_loss": 0.96072763, "learning_rate": 2.813278540517843e-06, "loss": 0.98419249, "num_input_tokens_seen": 1582625, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.875, "step": 79, "time_per_iteration": 2.736137628555298 }, { "auxiliary_loss_clip": 0.01282169, "auxiliary_loss_mlp": 0.01069946, "balance_loss_clip": 1.01854336, "balance_loss_mlp": 1.19400644, "epoch": 0.004809860213437547, "flos": 19791563258880.0, "grad_norm": 1.6633581471839818, "language_loss": 0.99232942, "learning_rate": 2.8213774169075505e-06, "loss": 1.01585054, "num_input_tokens_seen": 1601725, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.8828125, "step": 80, "time_per_iteration": 2.6624741554260254 }, { "auxiliary_loss_clip": 0.01275935, "auxiliary_loss_mlp": 0.01068509, "balance_loss_clip": 1.02072954, "balance_loss_mlp": 1.19066119, "epoch": 0.004869983466105517, "flos": 26573232591360.0, "grad_norm": 1.8958047428715168, "language_loss": 1.04964733, "learning_rate": 2.829375683533245e-06, "loss": 1.07309175, "num_input_tokens_seen": 1622420, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.8515625, "step": 81, "time_per_iteration": 2.5657668113708496 }, { "auxiliary_loss_clip": 0.01272667, "auxiliary_loss_mlp": 0.01070946, "balance_loss_clip": 1.02178407, "balance_loss_mlp": 1.18862176, "epoch": 0.004930106718773485, "flos": 12822550237440.0, "grad_norm": 2.36093047531353, "language_loss": 1.13654375, "learning_rate": 2.8372758094402803e-06, "loss": 1.15997982, "num_input_tokens_seen": 1640715, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 0.83984375, "step": 82, "time_per_iteration": 2.5176568031311035 }, { "auxiliary_loss_clip": 0.01269172, "auxiliary_loss_mlp": 0.0106399, "balance_loss_clip": 1.01776123, "balance_loss_mlp": 1.18479371, "epoch": 0.004990229971441455, "flos": 25773756301440.0, "grad_norm": 1.7625678439477694, "language_loss": 0.94652849, "learning_rate": 2.84508017388607e-06, "loss": 0.96986014, "num_input_tokens_seen": 1662210, "router_z_loss_clip": 0.46289062, "router_z_loss_mlp": 0.84375, "step": 83, "time_per_iteration": 2.597201108932495 }, { "auxiliary_loss_clip": 0.01267738, "auxiliary_loss_mlp": 0.01066804, "balance_loss_clip": 1.01833367, "balance_loss_mlp": 1.18357682, "epoch": 0.005050353224109424, "flos": 17456292771840.0, "grad_norm": 1.9647619703085009, "language_loss": 1.03632414, "learning_rate": 2.852791070641559e-06, "loss": 1.05966961, "num_input_tokens_seen": 1681070, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.83984375, "step": 84, "time_per_iteration": 2.5145950317382812 }, { "auxiliary_loss_clip": 0.01266584, "auxiliary_loss_mlp": 0.01095994, "balance_loss_clip": 1.07234323, "balance_loss_mlp": 1.20612133, "epoch": 0.005110476476777394, "flos": 69802269235200.0, "grad_norm": 1.6251546937924461, "language_loss": 0.63005388, "learning_rate": 2.8604107120381682e-06, "loss": 0.65367967, "num_input_tokens_seen": 1747140, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.6015625, "step": 85, "time_per_iteration": 3.1483633518218994 }, { "auxiliary_loss_clip": 0.01262125, "auxiliary_loss_mlp": 0.01065209, "balance_loss_clip": 1.01676273, "balance_loss_mlp": 1.18012619, "epoch": 0.005170599729445363, "flos": 24788961181440.0, "grad_norm": 1.5118799067699176, "language_loss": 0.9598431, "learning_rate": 2.8679412327780482e-06, "loss": 0.98311651, "num_input_tokens_seen": 1767475, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.8203125, "step": 86, "time_per_iteration": 2.582326889038086 }, { "auxiliary_loss_clip": 0.01260811, "auxiliary_loss_mlp": 0.01081091, "balance_loss_clip": 1.03531432, "balance_loss_mlp": 1.1800735, "epoch": 0.005230722982113333, "flos": 23256937411200.0, "grad_norm": 2.016924833118203, "language_loss": 0.9565239, "learning_rate": 2.8753846935240833e-06, "loss": 0.97994292, "num_input_tokens_seen": 1784980, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.8046875, "step": 87, "time_per_iteration": 2.543916940689087 }, { "auxiliary_loss_clip": 0.01258088, "auxiliary_loss_mlp": 0.01076995, "balance_loss_clip": 1.0333643, "balance_loss_mlp": 1.17947674, "epoch": 0.005290846234781301, "flos": 16726957136640.0, "grad_norm": 1.5902949013477015, "language_loss": 1.03879786, "learning_rate": 2.8827430842847267e-06, "loss": 1.06214881, "num_input_tokens_seen": 1803030, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.7890625, "step": 88, "time_per_iteration": 2.593050956726074 }, { "auxiliary_loss_clip": 0.01253169, "auxiliary_loss_mlp": 0.01059565, "balance_loss_clip": 1.01695991, "balance_loss_mlp": 1.17404974, "epoch": 0.005350969487449271, "flos": 20885043041280.0, "grad_norm": 1.651093208188181, "language_loss": 0.95645708, "learning_rate": 2.8900183276075957e-06, "loss": 0.97958446, "num_input_tokens_seen": 1822865, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.7890625, "step": 89, "time_per_iteration": 2.5296895503997803 }, { "auxiliary_loss_clip": 0.012481, "auxiliary_loss_mlp": 0.0106714, "balance_loss_clip": 1.02329516, "balance_loss_mlp": 1.16942477, "epoch": 0.00541109274011724, "flos": 26208878976000.0, "grad_norm": 1.8725408861909332, "language_loss": 1.0131824, "learning_rate": 2.8972122815946455e-06, "loss": 1.03633487, "num_input_tokens_seen": 1842435, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 0.78515625, "step": 90, "time_per_iteration": 2.578850746154785 }, { "auxiliary_loss_clip": 0.01243275, "auxiliary_loss_mlp": 0.01080846, "balance_loss_clip": 1.0369767, "balance_loss_mlp": 1.16579485, "epoch": 0.00547121599278521, "flos": 21177510433920.0, "grad_norm": 1.8228078622960453, "language_loss": 0.9514904, "learning_rate": 2.90432674275074e-06, "loss": 0.97473168, "num_input_tokens_seen": 1860065, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 0.7734375, "step": 91, "time_per_iteration": 2.555706024169922 }, { "auxiliary_loss_clip": 0.01241529, "auxiliary_loss_mlp": 0.01071585, "balance_loss_clip": 1.02728713, "balance_loss_mlp": 1.16463041, "epoch": 0.005531339245453179, "flos": 19717791822720.0, "grad_norm": 1.908948796633992, "language_loss": 1.00433707, "learning_rate": 2.91136344867656e-06, "loss": 1.0274682, "num_input_tokens_seen": 1878135, "router_z_loss_clip": 0.44335938, "router_z_loss_mlp": 0.76953125, "step": 92, "time_per_iteration": 2.6066746711730957 }, { "auxiliary_loss_clip": 0.01237542, "auxiliary_loss_mlp": 0.0105625, "balance_loss_clip": 1.01643443, "balance_loss_mlp": 1.16334677, "epoch": 0.005591462498121149, "flos": 17635222823040.0, "grad_norm": 2.2342423337120603, "language_loss": 1.11653519, "learning_rate": 2.918324080615938e-06, "loss": 1.13947308, "num_input_tokens_seen": 1894895, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.7421875, "step": 93, "time_per_iteration": 2.499570846557617 }, { "auxiliary_loss_clip": 0.01239023, "auxiliary_loss_mlp": 0.01060369, "balance_loss_clip": 1.01983762, "balance_loss_mlp": 1.16351247, "epoch": 0.005651585750789117, "flos": 20010189392640.0, "grad_norm": 1.8136616275795483, "language_loss": 1.00727367, "learning_rate": 2.925210265866963e-06, "loss": 1.0302676, "num_input_tokens_seen": 1913220, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.75390625, "step": 94, "time_per_iteration": 2.5293662548065186 }, { "auxiliary_loss_clip": 0.01242183, "auxiliary_loss_mlp": 0.01299076, "balance_loss_clip": 1.27351713, "balance_loss_mlp": 1.1909281, "epoch": 0.005711709003457087, "flos": 59809917185280.0, "grad_norm": 1.5632806356434423, "language_loss": 0.68391669, "learning_rate": 2.932023580065507e-06, "loss": 0.70932931, "num_input_tokens_seen": 1970970, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.51171875, "step": 95, "time_per_iteration": 2.987596273422241 }, { "auxiliary_loss_clip": 0.01235533, "auxiliary_loss_mlp": 0.01096045, "balance_loss_clip": 1.05508494, "balance_loss_mlp": 1.1599288, "epoch": 0.005771832256125056, "flos": 15558693488640.0, "grad_norm": 1.7943235705518, "language_loss": 1.02090073, "learning_rate": 2.9387655493491906e-06, "loss": 1.04421663, "num_input_tokens_seen": 1988930, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.7578125, "step": 96, "time_per_iteration": 3.9576892852783203 }, { "auxiliary_loss_clip": 0.01237975, "auxiliary_loss_mlp": 0.01192051, "balance_loss_clip": 1.14813447, "balance_loss_mlp": 1.16075039, "epoch": 0.005831955508793026, "flos": 22527287573760.0, "grad_norm": 2.012633007760356, "language_loss": 1.05167651, "learning_rate": 2.9454376524092147e-06, "loss": 1.07597673, "num_input_tokens_seen": 2006285, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 0.7734375, "step": 97, "time_per_iteration": 3.9907126426696777 }, { "auxiliary_loss_clip": 0.01236739, "auxiliary_loss_mlp": 0.01164693, "balance_loss_clip": 1.11698508, "balance_loss_mlp": 1.15853596, "epoch": 0.005892078761460995, "flos": 22048872946560.0, "grad_norm": 1.8680019172606075, "language_loss": 0.84544736, "learning_rate": 2.952041322436969e-06, "loss": 0.86946172, "num_input_tokens_seen": 2024905, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.78125, "step": 98, "time_per_iteration": 2.5436413288116455 }, { "auxiliary_loss_clip": 0.01228903, "auxiliary_loss_mlp": 0.01271366, "balance_loss_clip": 1.24733341, "balance_loss_mlp": 1.17839527, "epoch": 0.005952202014128965, "flos": 68535689598720.0, "grad_norm": 1.2114521693221167, "language_loss": 0.6578337, "learning_rate": 2.9585779489718204e-06, "loss": 0.68283641, "num_input_tokens_seen": 2086220, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.5078125, "step": 99, "time_per_iteration": 3.1738245487213135 }, { "auxiliary_loss_clip": 0.01237051, "auxiliary_loss_mlp": 0.01095208, "balance_loss_clip": 1.04056239, "balance_loss_mlp": 1.15641499, "epoch": 0.006012325266796933, "flos": 22959931541760.0, "grad_norm": 1.7681555320421318, "language_loss": 1.03088045, "learning_rate": 2.9650488796560464e-06, "loss": 1.05420291, "num_input_tokens_seen": 2103365, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.80859375, "step": 100, "time_per_iteration": 2.554457664489746 }, { "auxiliary_loss_clip": 0.01233963, "auxiliary_loss_mlp": 0.01201699, "balance_loss_clip": 1.15279961, "balance_loss_mlp": 1.15491962, "epoch": 0.006072448519464903, "flos": 17346979704960.0, "grad_norm": 1.97824695090203, "language_loss": 1.01097822, "learning_rate": 2.971455421902446e-06, "loss": 1.03533506, "num_input_tokens_seen": 2121995, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 0.7890625, "step": 101, "time_per_iteration": 2.5085086822509766 }, { "auxiliary_loss_clip": 0.01227307, "auxiliary_loss_mlp": 0.0116684, "balance_loss_clip": 1.12227917, "balance_loss_mlp": 1.15449381, "epoch": 0.006132571772132872, "flos": 24679962316800.0, "grad_norm": 1.7771090996803043, "language_loss": 1.03684199, "learning_rate": 2.9777988444798075e-06, "loss": 1.06078339, "num_input_tokens_seen": 2141815, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.7265625, "step": 102, "time_per_iteration": 2.5612902641296387 }, { "auxiliary_loss_clip": 0.01222353, "auxiliary_loss_mlp": 0.01137773, "balance_loss_clip": 1.09419012, "balance_loss_mlp": 1.15168059, "epoch": 0.006192695024800842, "flos": 21464741122560.0, "grad_norm": 1.970948373339513, "language_loss": 0.97891676, "learning_rate": 2.9840803790210285e-06, "loss": 1.00251794, "num_input_tokens_seen": 2161125, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 0.70703125, "step": 103, "time_per_iteration": 2.532970666885376 }, { "auxiliary_loss_clip": 0.01217977, "auxiliary_loss_mlp": 0.0108734, "balance_loss_clip": 1.04873979, "balance_loss_mlp": 1.14728475, "epoch": 0.006252818277468811, "flos": 17419459420800.0, "grad_norm": 1.702537147167651, "language_loss": 1.00995564, "learning_rate": 2.990301221458371e-06, "loss": 1.03300893, "num_input_tokens_seen": 2179510, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.70703125, "step": 104, "time_per_iteration": 2.5523431301116943 }, { "auxiliary_loss_clip": 0.01232147, "auxiliary_loss_mlp": 0.01064679, "balance_loss_clip": 1.02929747, "balance_loss_mlp": 1.15771067, "epoch": 0.006312941530136781, "flos": 19098537304320.0, "grad_norm": 1.9100223115199066, "language_loss": 1.08033586, "learning_rate": 2.9964625333900544e-06, "loss": 1.10330415, "num_input_tokens_seen": 2197870, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.7421875, "step": 105, "time_per_iteration": 2.5272581577301025 }, { "auxiliary_loss_clip": 0.01252854, "auxiliary_loss_mlp": 0.01151895, "balance_loss_clip": 1.11341465, "balance_loss_mlp": 1.17395711, "epoch": 0.006373064782804749, "flos": 24059695368960.0, "grad_norm": 2.0496554687033752, "language_loss": 1.04248106, "learning_rate": 3.002565443382063e-06, "loss": 1.06652856, "num_input_tokens_seen": 2217495, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.7890625, "step": 106, "time_per_iteration": 2.574812889099121 }, { "auxiliary_loss_clip": 0.01245715, "auxiliary_loss_mlp": 0.01190595, "balance_loss_clip": 1.15201867, "balance_loss_mlp": 1.16812122, "epoch": 0.006433188035472719, "flos": 18331460622720.0, "grad_norm": 1.8950084325220462, "language_loss": 0.99633414, "learning_rate": 3.008611048208843e-06, "loss": 1.02069736, "num_input_tokens_seen": 2236520, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.7734375, "step": 107, "time_per_iteration": 2.481133222579956 }, { "auxiliary_loss_clip": 0.01245767, "auxiliary_loss_mlp": 0.01089366, "balance_loss_clip": 1.06981611, "balance_loss_mlp": 1.18363643, "epoch": 0.006493311288140688, "flos": 62558907816960.0, "grad_norm": 1.0933082034188868, "language_loss": 0.65013921, "learning_rate": 3.014600414036285e-06, "loss": 0.67349052, "num_input_tokens_seen": 2300140, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.62109375, "step": 108, "time_per_iteration": 3.1186723709106445 }, { "auxiliary_loss_clip": 0.01216437, "auxiliary_loss_mlp": 0.01141232, "balance_loss_clip": 1.10146368, "balance_loss_mlp": 1.14801717, "epoch": 0.006553434540808658, "flos": 19499130777600.0, "grad_norm": 1.8055960807637508, "language_loss": 1.08323288, "learning_rate": 3.0205345775501937e-06, "loss": 1.10680962, "num_input_tokens_seen": 2317320, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.6875, "step": 109, "time_per_iteration": 2.4944097995758057 }, { "auxiliary_loss_clip": 0.01207168, "auxiliary_loss_mlp": 0.01182229, "balance_loss_clip": 1.14262795, "balance_loss_mlp": 1.14232838, "epoch": 0.006613557793476627, "flos": 21104088111360.0, "grad_norm": 1.5176005721534047, "language_loss": 0.94617105, "learning_rate": 3.0264145470332218e-06, "loss": 0.970065, "num_input_tokens_seen": 2337820, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.6484375, "step": 110, "time_per_iteration": 2.5456273555755615 }, { "auxiliary_loss_clip": 0.01207781, "auxiliary_loss_mlp": 0.01078394, "balance_loss_clip": 1.04039049, "balance_loss_mlp": 1.14323533, "epoch": 0.006673681046144597, "flos": 26029564899840.0, "grad_norm": 1.7413916722577985, "language_loss": 0.89089143, "learning_rate": 3.032241303393073e-06, "loss": 0.91375327, "num_input_tokens_seen": 2358560, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.6484375, "step": 111, "time_per_iteration": 2.546675443649292 }, { "auxiliary_loss_clip": 0.01212908, "auxiliary_loss_mlp": 0.01074555, "balance_loss_clip": 1.03700376, "balance_loss_mlp": 1.14756036, "epoch": 0.006733804298812566, "flos": 23146681737600.0, "grad_norm": 1.6461578012583646, "language_loss": 1.02043927, "learning_rate": 3.0380158011446e-06, "loss": 1.04331386, "num_input_tokens_seen": 2379005, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.65625, "step": 112, "time_per_iteration": 2.530122995376587 }, { "auxiliary_loss_clip": 0.01208815, "auxiliary_loss_mlp": 0.01157701, "balance_loss_clip": 1.12007833, "balance_loss_mlp": 1.14477301, "epoch": 0.006793927551480535, "flos": 11763669479040.0, "grad_norm": 2.0180373693240323, "language_loss": 0.93481207, "learning_rate": 3.0437389693482466e-06, "loss": 0.9584772, "num_input_tokens_seen": 2395610, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.640625, "step": 113, "time_per_iteration": 2.502267360687256 }, { "auxiliary_loss_clip": 0.01196857, "auxiliary_loss_mlp": 0.01117322, "balance_loss_clip": 1.08148742, "balance_loss_mlp": 1.13487744, "epoch": 0.006854050804148504, "flos": 19170947197440.0, "grad_norm": 1.6503455504244549, "language_loss": 1.04036093, "learning_rate": 3.0494117125071475e-06, "loss": 1.06350279, "num_input_tokens_seen": 2415005, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.62109375, "step": 114, "time_per_iteration": 2.512989044189453 }, { "auxiliary_loss_clip": 0.01189554, "auxiliary_loss_mlp": 0.01067583, "balance_loss_clip": 1.03422856, "balance_loss_mlp": 1.1274718, "epoch": 0.006914174056816474, "flos": 21980792062080.0, "grad_norm": 1.7224638794336582, "language_loss": 1.05511618, "learning_rate": 3.055034911425055e-06, "loss": 1.0776875, "num_input_tokens_seen": 2433965, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.62109375, "step": 115, "time_per_iteration": 2.5234997272491455 }, { "auxiliary_loss_clip": 0.01194388, "auxiliary_loss_mlp": 0.0109411, "balance_loss_clip": 1.05694044, "balance_loss_mlp": 1.12844658, "epoch": 0.006974297309484443, "flos": 16288238592000.0, "grad_norm": 1.7813885375950698, "language_loss": 0.96385103, "learning_rate": 3.0606094240271244e-06, "loss": 0.98673606, "num_input_tokens_seen": 2451605, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.66015625, "step": 116, "time_per_iteration": 2.473341464996338 }, { "auxiliary_loss_clip": 0.01196451, "auxiliary_loss_mlp": 0.01101771, "balance_loss_clip": 1.06689072, "balance_loss_mlp": 1.13037086, "epoch": 0.007034420562152413, "flos": 26102812665600.0, "grad_norm": 1.8036944158860786, "language_loss": 1.02992558, "learning_rate": 3.0661360861454656e-06, "loss": 1.05290771, "num_input_tokens_seen": 2472035, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.66015625, "step": 117, "time_per_iteration": 2.5610122680664062 }, { "auxiliary_loss_clip": 0.01192572, "auxiliary_loss_mlp": 0.01069941, "balance_loss_clip": 1.03587055, "balance_loss_mlp": 1.12674475, "epoch": 0.007094543814820382, "flos": 14203889112960.0, "grad_norm": 1.9399774393714495, "language_loss": 0.98128647, "learning_rate": 3.071615712271274e-06, "loss": 1.00391161, "num_input_tokens_seen": 2489285, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.65625, "step": 118, "time_per_iteration": 2.489553928375244 }, { "auxiliary_loss_clip": 0.01187548, "auxiliary_loss_mlp": 0.01046178, "balance_loss_clip": 1.01375341, "balance_loss_mlp": 1.12392044, "epoch": 0.007154667067488351, "flos": 14975120246400.0, "grad_norm": 1.8259466580912889, "language_loss": 1.08616149, "learning_rate": 3.0770490962752172e-06, "loss": 1.10849881, "num_input_tokens_seen": 2506460, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.63671875, "step": 119, "time_per_iteration": 2.467879056930542 }, { "auxiliary_loss_clip": 0.01186702, "auxiliary_loss_mlp": 0.01074925, "balance_loss_clip": 1.03842318, "balance_loss_mlp": 1.12444115, "epoch": 0.00721479032015632, "flos": 20192261466240.0, "grad_norm": 2.033249640870371, "language_loss": 1.10876846, "learning_rate": 3.082437012097686e-06, "loss": 1.13138461, "num_input_tokens_seen": 2525565, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.625, "step": 120, "time_per_iteration": 2.505545139312744 }, { "auxiliary_loss_clip": 0.01178172, "auxiliary_loss_mlp": 0.01076028, "balance_loss_clip": 1.04298306, "balance_loss_mlp": 1.11830711, "epoch": 0.00727491357282429, "flos": 23146158067200.0, "grad_norm": 1.6564435994101478, "language_loss": 0.99066359, "learning_rate": 3.0877802144103967e-06, "loss": 1.01320553, "num_input_tokens_seen": 2546605, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.6015625, "step": 121, "time_per_iteration": 2.5707788467407227 }, { "auxiliary_loss_clip": 0.01176372, "auxiliary_loss_mlp": 0.01050122, "balance_loss_clip": 1.01605237, "balance_loss_mlp": 1.11646891, "epoch": 0.007335036825492259, "flos": 15520812796800.0, "grad_norm": 2.1572009005213566, "language_loss": 1.04852247, "learning_rate": 3.09307943925077e-06, "loss": 1.07078755, "num_input_tokens_seen": 2560730, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.59765625, "step": 122, "time_per_iteration": 2.4875452518463135 }, { "auxiliary_loss_clip": 0.0117421, "auxiliary_loss_mlp": 0.01073096, "balance_loss_clip": 1.03807211, "balance_loss_mlp": 1.11477709, "epoch": 0.007395160078160229, "flos": 24242221290240.0, "grad_norm": 1.8772083341634107, "language_loss": 1.04477477, "learning_rate": 3.0983354046304154e-06, "loss": 1.06724787, "num_input_tokens_seen": 2579550, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.59375, "step": 123, "time_per_iteration": 2.528247117996216 }, { "auxiliary_loss_clip": 0.01176097, "auxiliary_loss_mlp": 0.01078962, "balance_loss_clip": 1.04555905, "balance_loss_mlp": 1.11549723, "epoch": 0.007455283330828198, "flos": 31758428050560.0, "grad_norm": 1.6752973589855071, "language_loss": 0.8493703, "learning_rate": 3.103548811118979e-06, "loss": 0.87192088, "num_input_tokens_seen": 2600390, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.60546875, "step": 124, "time_per_iteration": 2.571991443634033 }, { "auxiliary_loss_clip": 0.01170827, "auxiliary_loss_mlp": 0.01042275, "balance_loss_clip": 1.01230574, "balance_loss_mlp": 1.11192513, "epoch": 0.007515406583496167, "flos": 26613941103360.0, "grad_norm": 1.8416054911415718, "language_loss": 1.00744247, "learning_rate": 3.108720342404542e-06, "loss": 1.02957344, "num_input_tokens_seen": 2620770, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.58984375, "step": 125, "time_per_iteration": 2.5586955547332764 }, { "auxiliary_loss_clip": 0.01168693, "auxiliary_loss_mlp": 0.01042229, "balance_loss_clip": 1.0125463, "balance_loss_mlp": 1.11198676, "epoch": 0.007575529836164136, "flos": 18222706137600.0, "grad_norm": 3.002509575058275, "language_loss": 1.01587784, "learning_rate": 3.1138506658316945e-06, "loss": 1.03798699, "num_input_tokens_seen": 2639900, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.56640625, "step": 126, "time_per_iteration": 2.4801864624023438 }, { "auxiliary_loss_clip": 0.01170648, "auxiliary_loss_mlp": 0.01054595, "balance_loss_clip": 1.02290893, "balance_loss_mlp": 1.11264396, "epoch": 0.007635653088832106, "flos": 21579325804800.0, "grad_norm": 2.0439053980232926, "language_loss": 0.84967506, "learning_rate": 3.1189404329183404e-06, "loss": 0.87192738, "num_input_tokens_seen": 2657450, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.578125, "step": 127, "time_per_iteration": 2.5059218406677246 }, { "auxiliary_loss_clip": 0.01169939, "auxiliary_loss_mlp": 0.01065317, "balance_loss_clip": 1.03293967, "balance_loss_mlp": 1.11319852, "epoch": 0.007695776341500075, "flos": 25373861055360.0, "grad_norm": 1.776339512470131, "language_loss": 0.96083999, "learning_rate": 3.1239902798522317e-06, "loss": 0.98319256, "num_input_tokens_seen": 2678150, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.56640625, "step": 128, "time_per_iteration": 2.5194311141967773 }, { "auxiliary_loss_clip": 0.01164918, "auxiliary_loss_mlp": 0.01044446, "balance_loss_clip": 1.0119257, "balance_loss_mlp": 1.10898721, "epoch": 0.007755899594168045, "flos": 22342876439040.0, "grad_norm": 1.5367217544527998, "language_loss": 0.91637528, "learning_rate": 3.129000827968184e-06, "loss": 0.93846893, "num_input_tokens_seen": 2698290, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.55859375, "step": 129, "time_per_iteration": 2.5285797119140625 }, { "auxiliary_loss_clip": 0.01165619, "auxiliary_loss_mlp": 0.01053511, "balance_loss_clip": 1.02192056, "balance_loss_mlp": 1.10991979, "epoch": 0.007816022846836013, "flos": 22637124311040.0, "grad_norm": 1.7814462279799346, "language_loss": 1.06674862, "learning_rate": 3.133972684206866e-06, "loss": 1.08894002, "num_input_tokens_seen": 2717630, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.5546875, "step": 130, "time_per_iteration": 2.5282108783721924 }, { "auxiliary_loss_clip": 0.01170575, "auxiliary_loss_mlp": 0.01076454, "balance_loss_clip": 1.04297984, "balance_loss_mlp": 1.11391091, "epoch": 0.007876146099503984, "flos": 18182032536960.0, "grad_norm": 1.7820110478159792, "language_loss": 0.91168344, "learning_rate": 3.138906441556014e-06, "loss": 0.93415374, "num_input_tokens_seen": 2735835, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.56640625, "step": 131, "time_per_iteration": 2.503371238708496 }, { "auxiliary_loss_clip": 0.01166018, "auxiliary_loss_mlp": 0.01065416, "balance_loss_clip": 1.03554177, "balance_loss_mlp": 1.10995984, "epoch": 0.007936269352171952, "flos": 27118436382720.0, "grad_norm": 1.807337774587413, "language_loss": 0.91810304, "learning_rate": 3.143802679474861e-06, "loss": 0.94041741, "num_input_tokens_seen": 2756335, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.5625, "step": 132, "time_per_iteration": 2.5488345623016357 }, { "auxiliary_loss_clip": 0.01158313, "auxiliary_loss_mlp": 0.01041248, "balance_loss_clip": 1.01266193, "balance_loss_mlp": 1.10263288, "epoch": 0.007996392604839923, "flos": 19025324449920.0, "grad_norm": 1.8219003459879866, "language_loss": 1.04806709, "learning_rate": 3.1486619643025565e-06, "loss": 1.07006276, "num_input_tokens_seen": 2775090, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.5546875, "step": 133, "time_per_iteration": 2.536303997039795 }, { "auxiliary_loss_clip": 0.01156624, "auxiliary_loss_mlp": 0.0103903, "balance_loss_clip": 1.01113486, "balance_loss_mlp": 1.10205114, "epoch": 0.008056515857507891, "flos": 25482964654080.0, "grad_norm": 1.3873604671091446, "language_loss": 0.80927575, "learning_rate": 3.153484849651286e-06, "loss": 0.83123219, "num_input_tokens_seen": 2795320, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.546875, "step": 134, "time_per_iteration": 2.541942596435547 }, { "auxiliary_loss_clip": 0.01159421, "auxiliary_loss_mlp": 0.01048514, "balance_loss_clip": 1.01704311, "balance_loss_mlp": 1.10331714, "epoch": 0.00811663911017586, "flos": 20556545258880.0, "grad_norm": 2.385722404614008, "language_loss": 1.03546047, "learning_rate": 3.1582718767847806e-06, "loss": 1.05753994, "num_input_tokens_seen": 2812815, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.5625, "step": 135, "time_per_iteration": 5.4628942012786865 }, { "auxiliary_loss_clip": 0.01154965, "auxiliary_loss_mlp": 0.01049689, "balance_loss_clip": 1.01929116, "balance_loss_mlp": 1.09991395, "epoch": 0.00817676236284383, "flos": 18798947994240.0, "grad_norm": 2.3107108551537663, "language_loss": 1.03597558, "learning_rate": 3.1630235749828485e-06, "loss": 1.05802214, "num_input_tokens_seen": 2830445, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.55078125, "step": 136, "time_per_iteration": 2.5820934772491455 }, { "auxiliary_loss_clip": 0.01153489, "auxiliary_loss_mlp": 0.01044293, "balance_loss_clip": 1.01446652, "balance_loss_mlp": 1.09903002, "epoch": 0.008236885615511799, "flos": 23872596059520.0, "grad_norm": 1.8074169417862755, "language_loss": 0.96794724, "learning_rate": 3.1677404618925676e-06, "loss": 0.98992509, "num_input_tokens_seen": 2846965, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.546875, "step": 137, "time_per_iteration": 4.063216924667358 }, { "auxiliary_loss_clip": 0.01147655, "auxiliary_loss_mlp": 0.01038887, "balance_loss_clip": 1.00898957, "balance_loss_mlp": 1.09404254, "epoch": 0.00829700886817977, "flos": 24642500561280.0, "grad_norm": 1.5223026522970442, "language_loss": 0.97398341, "learning_rate": 3.1724230438666953e-06, "loss": 0.99584889, "num_input_tokens_seen": 2867520, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.53515625, "step": 138, "time_per_iteration": 2.5396857261657715 }, { "auxiliary_loss_clip": 0.01145923, "auxiliary_loss_mlp": 0.01047484, "balance_loss_clip": 1.01837313, "balance_loss_mlp": 1.09264004, "epoch": 0.008357132120847738, "flos": 25260917207040.0, "grad_norm": 1.8098998313141175, "language_loss": 0.98372579, "learning_rate": 3.177071816289865e-06, "loss": 1.00565982, "num_input_tokens_seen": 2885675, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.53125, "step": 139, "time_per_iteration": 2.545809030532837 }, { "auxiliary_loss_clip": 0.01147291, "auxiliary_loss_mlp": 0.01047596, "balance_loss_clip": 1.01779318, "balance_loss_mlp": 1.09388614, "epoch": 0.008417255373515706, "flos": 27343660763520.0, "grad_norm": 1.8951216184509516, "language_loss": 1.0093379, "learning_rate": 3.181687263893095e-06, "loss": 1.03128684, "num_input_tokens_seen": 2905960, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.53125, "step": 140, "time_per_iteration": 2.5713443756103516 }, { "auxiliary_loss_clip": 0.01148874, "auxiliary_loss_mlp": 0.01049423, "balance_loss_clip": 1.02033627, "balance_loss_mlp": 1.09526396, "epoch": 0.008477378626183677, "flos": 17638120465920.0, "grad_norm": 2.004789896250355, "language_loss": 0.98197573, "learning_rate": 3.186269861057098e-06, "loss": 1.0039587, "num_input_tokens_seen": 2922780, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.53515625, "step": 141, "time_per_iteration": 2.620731830596924 }, { "auxiliary_loss_clip": 0.0114262, "auxiliary_loss_mlp": 0.010425, "balance_loss_clip": 1.01462853, "balance_loss_mlp": 1.09023046, "epoch": 0.008537501878851645, "flos": 13880488389120.0, "grad_norm": 1.852408252827294, "language_loss": 0.97836852, "learning_rate": 3.1908200721048745e-06, "loss": 1.00021958, "num_input_tokens_seen": 2938765, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.5234375, "step": 142, "time_per_iteration": 2.4873030185699463 }, { "auxiliary_loss_clip": 0.01130772, "auxiliary_loss_mlp": 0.01092867, "balance_loss_clip": 1.07493758, "balance_loss_mlp": 1.09406507, "epoch": 0.008597625131519616, "flos": 71244320832000.0, "grad_norm": 1.619044840787843, "language_loss": 0.67234296, "learning_rate": 3.195338351584042e-06, "loss": 0.69457936, "num_input_tokens_seen": 3006665, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.3671875, "step": 143, "time_per_iteration": 3.2432479858398438 }, { "auxiliary_loss_clip": 0.01139552, "auxiliary_loss_mlp": 0.0108049, "balance_loss_clip": 1.05173695, "balance_loss_mlp": 1.08797789, "epoch": 0.008657748384187584, "flos": 17601880608000.0, "grad_norm": 1.8105849728074743, "language_loss": 0.9700436, "learning_rate": 3.1998251445393258e-06, "loss": 0.99224401, "num_input_tokens_seen": 3024335, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.515625, "step": 144, "time_per_iteration": 2.467697858810425 }, { "auxiliary_loss_clip": 0.0113955, "auxiliary_loss_mlp": 0.01094197, "balance_loss_clip": 1.06476378, "balance_loss_mlp": 1.0864017, "epoch": 0.008717871636855555, "flos": 19714405420800.0, "grad_norm": 1.5708548721482403, "language_loss": 0.99818945, "learning_rate": 3.204280886775619e-06, "loss": 1.02052689, "num_input_tokens_seen": 3043300, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.53125, "step": 145, "time_per_iteration": 2.4968101978302 }, { "auxiliary_loss_clip": 0.01139541, "auxiliary_loss_mlp": 0.01076553, "balance_loss_clip": 1.04696488, "balance_loss_mlp": 1.08617938, "epoch": 0.008777994889523523, "flos": 24716271997440.0, "grad_norm": 1.5243322166195072, "language_loss": 0.97406828, "learning_rate": 3.208706005112005e-06, "loss": 0.99622923, "num_input_tokens_seen": 3064610, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.53125, "step": 146, "time_per_iteration": 2.558898448944092 }, { "auxiliary_loss_clip": 0.01119786, "auxiliary_loss_mlp": 0.01038787, "balance_loss_clip": 1.02805781, "balance_loss_mlp": 1.0857501, "epoch": 0.008838118142191492, "flos": 70128916715520.0, "grad_norm": 0.9323611416996458, "language_loss": 0.60507274, "learning_rate": 3.213100917627104e-06, "loss": 0.62665844, "num_input_tokens_seen": 3130385, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.33984375, "step": 147, "time_per_iteration": 3.1915223598480225 }, { "auxiliary_loss_clip": 0.01138769, "auxiliary_loss_mlp": 0.01080127, "balance_loss_clip": 1.05213618, "balance_loss_mlp": 1.08553386, "epoch": 0.008898241394859462, "flos": 20043845809920.0, "grad_norm": 1.81740094174444, "language_loss": 0.91860807, "learning_rate": 3.2174660338961135e-06, "loss": 0.94079697, "num_input_tokens_seen": 3149760, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.53125, "step": 148, "time_per_iteration": 2.505553960800171 }, { "auxiliary_loss_clip": 0.01140306, "auxiliary_loss_mlp": 0.0110827, "balance_loss_clip": 1.07756126, "balance_loss_mlp": 1.08673239, "epoch": 0.008958364647527431, "flos": 10742843969280.0, "grad_norm": 1.8750336527530238, "language_loss": 0.97847569, "learning_rate": 3.2218017552198588e-06, "loss": 1.00096154, "num_input_tokens_seen": 3164500, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.53515625, "step": 149, "time_per_iteration": 2.4914822578430176 }, { "auxiliary_loss_clip": 0.01140386, "auxiliary_loss_mlp": 0.01092028, "balance_loss_clip": 1.05931664, "balance_loss_mlp": 1.08699977, "epoch": 0.009018487900195401, "flos": 29126326250880.0, "grad_norm": 1.8228253489488129, "language_loss": 1.04904997, "learning_rate": 3.226108474846181e-06, "loss": 1.07137418, "num_input_tokens_seen": 3182455, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.53515625, "step": 150, "time_per_iteration": 2.5454583168029785 }, { "auxiliary_loss_clip": 0.0113672, "auxiliary_loss_mlp": 0.01063869, "balance_loss_clip": 1.03273177, "balance_loss_mlp": 1.08456898, "epoch": 0.00907861115286337, "flos": 32962268240640.0, "grad_norm": 1.6721955613619137, "language_loss": 0.85564417, "learning_rate": 3.2303865781839817e-06, "loss": 0.87765008, "num_input_tokens_seen": 3203995, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.5234375, "step": 151, "time_per_iteration": 2.598466634750366 }, { "auxiliary_loss_clip": 0.01142015, "auxiliary_loss_mlp": 0.01048713, "balance_loss_clip": 1.01860094, "balance_loss_mlp": 1.09065437, "epoch": 0.009138734405531338, "flos": 21761362967040.0, "grad_norm": 1.8065339788154082, "language_loss": 1.01437366, "learning_rate": 3.234636443010188e-06, "loss": 1.03628099, "num_input_tokens_seen": 3222575, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.515625, "step": 152, "time_per_iteration": 2.549598455429077 }, { "auxiliary_loss_clip": 0.01149248, "auxiliary_loss_mlp": 0.01091286, "balance_loss_clip": 1.0635221, "balance_loss_mlp": 1.09889424, "epoch": 0.009198857658199309, "flos": 20841681265920.0, "grad_norm": 2.3865125594860905, "language_loss": 1.02526176, "learning_rate": 3.238858439669943e-06, "loss": 1.04766715, "num_input_tokens_seen": 3240180, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.50390625, "step": 153, "time_per_iteration": 2.5359206199645996 }, { "auxiliary_loss_clip": 0.01151709, "auxiliary_loss_mlp": 0.01135706, "balance_loss_clip": 1.10788298, "balance_loss_mlp": 1.10075128, "epoch": 0.009258980910867277, "flos": 24826213468800.0, "grad_norm": 1.6487137201427435, "language_loss": 0.95505238, "learning_rate": 3.2430529312702712e-06, "loss": 0.97792649, "num_input_tokens_seen": 3259800, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.5078125, "step": 154, "time_per_iteration": 2.5430989265441895 }, { "auxiliary_loss_clip": 0.0113804, "auxiliary_loss_mlp": 0.01051828, "balance_loss_clip": 1.0253154, "balance_loss_mlp": 1.08814311, "epoch": 0.009319104163535248, "flos": 28766511112320.0, "grad_norm": 1.7505227703301158, "language_loss": 0.96661776, "learning_rate": 3.2472202738674737e-06, "loss": 0.98851645, "num_input_tokens_seen": 3280400, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.5, "step": 155, "time_per_iteration": 2.5747148990631104 }, { "auxiliary_loss_clip": 0.01131053, "auxiliary_loss_mlp": 0.01040511, "balance_loss_clip": 1.01144826, "balance_loss_mlp": 1.08071637, "epoch": 0.009379227416203216, "flos": 16581055098240.0, "grad_norm": 1.8411651345899376, "language_loss": 0.9960739, "learning_rate": 3.2513608166485063e-06, "loss": 1.0177896, "num_input_tokens_seen": 3297600, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.50390625, "step": 156, "time_per_iteration": 2.4934210777282715 }, { "auxiliary_loss_clip": 0.01135313, "auxiliary_loss_mlp": 0.01070251, "balance_loss_clip": 1.04013824, "balance_loss_mlp": 1.08538198, "epoch": 0.009439350668871187, "flos": 18329016827520.0, "grad_norm": 2.0317943904965476, "language_loss": 1.11902428, "learning_rate": 3.2554749021065498e-06, "loss": 1.1410799, "num_input_tokens_seen": 3313635, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.5, "step": 157, "time_per_iteration": 2.487980604171753 }, { "auxiliary_loss_clip": 0.0113337, "auxiliary_loss_mlp": 0.01066998, "balance_loss_clip": 1.04098606, "balance_loss_mlp": 1.08573306, "epoch": 0.009499473921539155, "flos": 24348846182400.0, "grad_norm": 1.7869285150680119, "language_loss": 0.99281025, "learning_rate": 3.2595628662110186e-06, "loss": 1.0148139, "num_input_tokens_seen": 3333735, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.4765625, "step": 158, "time_per_iteration": 2.5409553050994873 }, { "auxiliary_loss_clip": 0.01133501, "auxiliary_loss_mlp": 0.01048139, "balance_loss_clip": 1.02364159, "balance_loss_mlp": 1.08584833, "epoch": 0.009559597174207124, "flos": 16398389531520.0, "grad_norm": 2.016681159532475, "language_loss": 1.00034785, "learning_rate": 3.2636250385721982e-06, "loss": 1.02216423, "num_input_tokens_seen": 3348800, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.4765625, "step": 159, "time_per_iteration": 2.5823416709899902 }, { "auxiliary_loss_clip": 0.01132978, "auxiliary_loss_mlp": 0.01040574, "balance_loss_clip": 1.01565897, "balance_loss_mlp": 1.08596921, "epoch": 0.009619720426875094, "flos": 22855785356160.0, "grad_norm": 1.4648504807235845, "language_loss": 0.93612546, "learning_rate": 3.2676617426007263e-06, "loss": 0.95786095, "num_input_tokens_seen": 3368595, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.47070312, "step": 160, "time_per_iteration": 2.5437843799591064 }, { "auxiliary_loss_clip": 0.01133021, "auxiliary_loss_mlp": 0.01053092, "balance_loss_clip": 1.02617431, "balance_loss_mlp": 1.0849607, "epoch": 0.009679843679543063, "flos": 19134009112320.0, "grad_norm": 1.913749567925863, "language_loss": 1.04869103, "learning_rate": 3.2716732956621042e-06, "loss": 1.07055211, "num_input_tokens_seen": 3384975, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.48046875, "step": 161, "time_per_iteration": 2.5234079360961914 }, { "auxiliary_loss_clip": 0.01132437, "auxiliary_loss_mlp": 0.01081099, "balance_loss_clip": 1.05380058, "balance_loss_mlp": 1.08474565, "epoch": 0.009739966932211033, "flos": 20301958558080.0, "grad_norm": 1.6130591069013445, "language_loss": 1.0267837, "learning_rate": 3.2756600092264203e-06, "loss": 1.0489192, "num_input_tokens_seen": 3404755, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.4765625, "step": 162, "time_per_iteration": 2.4932761192321777 }, { "auxiliary_loss_clip": 0.01112098, "auxiliary_loss_mlp": 0.01132878, "balance_loss_clip": 1.11695135, "balance_loss_mlp": 1.08284581, "epoch": 0.009800090184879002, "flos": 67031073112320.0, "grad_norm": 1.261144539344647, "language_loss": 0.72789085, "learning_rate": 3.279622189013474e-06, "loss": 0.75034058, "num_input_tokens_seen": 3467210, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.29296875, "step": 163, "time_per_iteration": 3.085137367248535 }, { "auxiliary_loss_clip": 0.01128606, "auxiliary_loss_mlp": 0.01038348, "balance_loss_clip": 1.01294446, "balance_loss_mlp": 1.08287776, "epoch": 0.00986021343754697, "flos": 17163755556480.0, "grad_norm": 1.8763686953702905, "language_loss": 0.97455072, "learning_rate": 3.283560135133457e-06, "loss": 0.99622023, "num_input_tokens_seen": 3483220, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.45703125, "step": 164, "time_per_iteration": 2.4988625049591064 }, { "auxiliary_loss_clip": 0.01125758, "auxiliary_loss_mlp": 0.01082773, "balance_loss_clip": 1.05584359, "balance_loss_mlp": 1.07921314, "epoch": 0.00992033669021494, "flos": 17748445962240.0, "grad_norm": 3.409909989672282, "language_loss": 1.01168895, "learning_rate": 3.2874741422233565e-06, "loss": 1.03377426, "num_input_tokens_seen": 3501465, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.46484375, "step": 165, "time_per_iteration": 2.4588077068328857 }, { "auxiliary_loss_clip": 0.01130602, "auxiliary_loss_mlp": 0.01098895, "balance_loss_clip": 1.07002246, "balance_loss_mlp": 1.08373833, "epoch": 0.00998045994288291, "flos": 25296109724160.0, "grad_norm": 1.606780359975797, "language_loss": 0.90116942, "learning_rate": 3.2913644995792465e-06, "loss": 0.92346436, "num_input_tokens_seen": 3520480, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.46875, "step": 166, "time_per_iteration": 2.5571889877319336 }, { "auxiliary_loss_clip": 0.01128015, "auxiliary_loss_mlp": 0.01042029, "balance_loss_clip": 1.01583886, "balance_loss_mlp": 1.08215737, "epoch": 0.01004058319555088, "flos": 32297801644800.0, "grad_norm": 1.884502432506522, "language_loss": 1.01637363, "learning_rate": 3.2952314912845914e-06, "loss": 1.03807402, "num_input_tokens_seen": 3539570, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.45898438, "step": 167, "time_per_iteration": 2.6032469272613525 }, { "auxiliary_loss_clip": 0.01129957, "auxiliary_loss_mlp": 0.01061388, "balance_loss_clip": 1.03569889, "balance_loss_mlp": 1.08292508, "epoch": 0.010100706448218848, "flos": 11319365116800.0, "grad_norm": 1.99882234204152, "language_loss": 1.06026495, "learning_rate": 3.299075396334735e-06, "loss": 1.08217835, "num_input_tokens_seen": 3555465, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.47070312, "step": 168, "time_per_iteration": 2.495055675506592 }, { "auxiliary_loss_clip": 0.01131425, "auxiliary_loss_mlp": 0.01094566, "balance_loss_clip": 1.0677793, "balance_loss_mlp": 1.08162379, "epoch": 0.010160829700886819, "flos": 29718103662720.0, "grad_norm": 1.4340699365950984, "language_loss": 0.94515836, "learning_rate": 3.3028964887576868e-06, "loss": 0.96741831, "num_input_tokens_seen": 3578970, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.49804688, "step": 169, "time_per_iteration": 2.6222143173217773 }, { "auxiliary_loss_clip": 0.0112907, "auxiliary_loss_mlp": 0.01047869, "balance_loss_clip": 1.02018809, "balance_loss_mlp": 1.08031499, "epoch": 0.010220952953554787, "flos": 20411306536320.0, "grad_norm": 1.5025721976504178, "language_loss": 0.95044106, "learning_rate": 3.306695037731344e-06, "loss": 0.97221047, "num_input_tokens_seen": 3597275, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.48828125, "step": 170, "time_per_iteration": 2.5119643211364746 }, { "auxiliary_loss_clip": 0.01128434, "auxiliary_loss_mlp": 0.01049745, "balance_loss_clip": 1.02148008, "balance_loss_mlp": 1.07969916, "epoch": 0.010281076206222756, "flos": 31283783850240.0, "grad_norm": 1.6022968043148427, "language_loss": 1.00824785, "learning_rate": 3.3104713076972827e-06, "loss": 1.03002965, "num_input_tokens_seen": 3618905, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.48828125, "step": 171, "time_per_iteration": 2.606609582901001 }, { "auxiliary_loss_clip": 0.01122217, "auxiliary_loss_mlp": 0.01083674, "balance_loss_clip": 1.05616105, "balance_loss_mlp": 1.07616639, "epoch": 0.010341199458890726, "flos": 21981176087040.0, "grad_norm": 1.6616138774649498, "language_loss": 0.98271638, "learning_rate": 3.314225558471224e-06, "loss": 1.00477529, "num_input_tokens_seen": 3639610, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.4609375, "step": 172, "time_per_iteration": 2.5501956939697266 }, { "auxiliary_loss_clip": 0.01114095, "auxiliary_loss_mlp": 0.01053214, "balance_loss_clip": 1.02593875, "balance_loss_mlp": 1.07071829, "epoch": 0.010401322711558695, "flos": 30809209472640.0, "grad_norm": 1.4117990446047644, "language_loss": 0.89141834, "learning_rate": 3.317958045350308e-06, "loss": 0.91309136, "num_input_tokens_seen": 3664030, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.43359375, "step": 173, "time_per_iteration": 2.5926830768585205 }, { "auxiliary_loss_clip": 0.01113294, "auxiliary_loss_mlp": 0.01039942, "balance_loss_clip": 1.01331067, "balance_loss_mlp": 1.0681175, "epoch": 0.010461445964226665, "flos": 24714037670400.0, "grad_norm": 1.5824782001419004, "language_loss": 0.92250144, "learning_rate": 3.3216690192172596e-06, "loss": 0.94403386, "num_input_tokens_seen": 3683615, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.453125, "step": 174, "time_per_iteration": 2.5231244564056396 }, { "auxiliary_loss_clip": 0.01111766, "auxiliary_loss_mlp": 0.01073474, "balance_loss_clip": 1.04599655, "balance_loss_mlp": 1.06691909, "epoch": 0.010521569216894634, "flos": 27709096631040.0, "grad_norm": 1.7159644708178656, "language_loss": 0.82094216, "learning_rate": 3.325358726641591e-06, "loss": 0.84279454, "num_input_tokens_seen": 3704540, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.44921875, "step": 175, "time_per_iteration": 5.420367240905762 }, { "auxiliary_loss_clip": 0.01108329, "auxiliary_loss_mlp": 0.01061841, "balance_loss_clip": 1.03462589, "balance_loss_mlp": 1.06344867, "epoch": 0.010581692469562603, "flos": 12457533306240.0, "grad_norm": 1.855225605181663, "language_loss": 1.09006524, "learning_rate": 3.329027409977902e-06, "loss": 1.11176705, "num_input_tokens_seen": 3721320, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.44921875, "step": 176, "time_per_iteration": 3.932236909866333 }, { "auxiliary_loss_clip": 0.01107977, "auxiliary_loss_mlp": 0.01042396, "balance_loss_clip": 1.01726687, "balance_loss_mlp": 1.06352115, "epoch": 0.010641815722230573, "flos": 19426581239040.0, "grad_norm": 2.0633428337284245, "language_loss": 0.90053284, "learning_rate": 3.3326753074614087e-06, "loss": 0.92203653, "num_input_tokens_seen": 3739385, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.4453125, "step": 177, "time_per_iteration": 2.5009655952453613 }, { "auxiliary_loss_clip": 0.01108465, "auxiliary_loss_mlp": 0.01046372, "balance_loss_clip": 1.01986027, "balance_loss_mlp": 1.06119859, "epoch": 0.010701938974898541, "flos": 18331600268160.0, "grad_norm": 2.8810073302678676, "language_loss": 0.94233507, "learning_rate": 3.3363026533007716e-06, "loss": 0.96388352, "num_input_tokens_seen": 3756360, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.47265625, "step": 178, "time_per_iteration": 2.4481000900268555 }, { "auxiliary_loss_clip": 0.01108904, "auxiliary_loss_mlp": 0.01054608, "balance_loss_clip": 1.02767897, "balance_loss_mlp": 1.06241047, "epoch": 0.010762062227566512, "flos": 19203102426240.0, "grad_norm": 1.935276908590826, "language_loss": 0.95267212, "learning_rate": 3.3399096777683303e-06, "loss": 0.9743073, "num_input_tokens_seen": 3773930, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.46484375, "step": 179, "time_per_iteration": 2.462662696838379 }, { "auxiliary_loss_clip": 0.01108332, "auxiliary_loss_mlp": 0.01043485, "balance_loss_clip": 1.01535177, "balance_loss_mlp": 1.06018186, "epoch": 0.01082218548023448, "flos": 31424239716480.0, "grad_norm": 1.824050483614449, "language_loss": 0.97232193, "learning_rate": 3.3434966072878213e-06, "loss": 0.99384004, "num_input_tokens_seen": 3793630, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.48242188, "step": 180, "time_per_iteration": 2.625880002975464 }, { "auxiliary_loss_clip": 0.01103794, "auxiliary_loss_mlp": 0.01043804, "balance_loss_clip": 1.0162189, "balance_loss_mlp": 1.05589795, "epoch": 0.01088230873290245, "flos": 25045258538880.0, "grad_norm": 1.7445022715107348, "language_loss": 0.87962955, "learning_rate": 3.3470636645196674e-06, "loss": 0.90110552, "num_input_tokens_seen": 3813610, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.47851562, "step": 181, "time_per_iteration": 2.5555903911590576 }, { "auxiliary_loss_clip": 0.01103581, "auxiliary_loss_mlp": 0.01063185, "balance_loss_clip": 1.03059316, "balance_loss_mlp": 1.056005, "epoch": 0.01094243198557042, "flos": 22892304504960.0, "grad_norm": 2.126886827733957, "language_loss": 0.99448544, "learning_rate": 3.3506110684439156e-06, "loss": 1.0161531, "num_input_tokens_seen": 3831390, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.47460938, "step": 182, "time_per_iteration": 2.558154344558716 }, { "auxiliary_loss_clip": 0.01103192, "auxiliary_loss_mlp": 0.01050057, "balance_loss_clip": 1.01801348, "balance_loss_mlp": 1.05479455, "epoch": 0.011002555238238388, "flos": 17164104670080.0, "grad_norm": 1.845868485975377, "language_loss": 1.01241052, "learning_rate": 3.3541390344409054e-06, "loss": 1.03394306, "num_input_tokens_seen": 3849705, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.484375, "step": 183, "time_per_iteration": 2.480976104736328 }, { "auxiliary_loss_clip": 0.01100503, "auxiliary_loss_mlp": 0.01036958, "balance_loss_clip": 1.00949216, "balance_loss_mlp": 1.05313444, "epoch": 0.011062678490906358, "flos": 22309045464960.0, "grad_norm": 1.717824545683583, "language_loss": 0.98364997, "learning_rate": 3.357647774369736e-06, "loss": 1.00502455, "num_input_tokens_seen": 3869230, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.47460938, "step": 184, "time_per_iteration": 2.50250506401062 }, { "auxiliary_loss_clip": 0.0109969, "auxiliary_loss_mlp": 0.01039525, "balance_loss_clip": 1.01297688, "balance_loss_mlp": 1.05309284, "epoch": 0.011122801743574327, "flos": 24387250544640.0, "grad_norm": 1.5629507167603618, "language_loss": 0.94785511, "learning_rate": 3.3611374966446085e-06, "loss": 0.96924728, "num_input_tokens_seen": 3889735, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.46484375, "step": 185, "time_per_iteration": 2.5196001529693604 }, { "auxiliary_loss_clip": 0.01097175, "auxiliary_loss_mlp": 0.01048778, "balance_loss_clip": 1.0211935, "balance_loss_mlp": 1.0516212, "epoch": 0.011182924996242297, "flos": 18149283815040.0, "grad_norm": 1.7713677097629736, "language_loss": 0.86431944, "learning_rate": 3.3646084063091142e-06, "loss": 0.88577902, "num_input_tokens_seen": 3908855, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.45507812, "step": 186, "time_per_iteration": 2.525308847427368 }, { "auxiliary_loss_clip": 0.01096939, "auxiliary_loss_mlp": 0.0104441, "balance_loss_clip": 1.01812482, "balance_loss_mlp": 1.05212104, "epoch": 0.011243048248910266, "flos": 15485899570560.0, "grad_norm": 2.194222615678212, "language_loss": 1.16813791, "learning_rate": 3.3680607051085194e-06, "loss": 1.18955147, "num_input_tokens_seen": 3923865, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.44726562, "step": 187, "time_per_iteration": 2.4834649562835693 }, { "auxiliary_loss_clip": 0.01094342, "auxiliary_loss_mlp": 0.01042195, "balance_loss_clip": 1.01674342, "balance_loss_mlp": 1.05074596, "epoch": 0.011303171501578235, "flos": 40915273420800.0, "grad_norm": 1.4044566971179826, "language_loss": 0.82456887, "learning_rate": 3.371494591560139e-06, "loss": 0.84593427, "num_input_tokens_seen": 3946870, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.4375, "step": 188, "time_per_iteration": 2.7538275718688965 }, { "auxiliary_loss_clip": 0.0107885, "auxiliary_loss_mlp": 0.01078219, "balance_loss_clip": 1.06267381, "balance_loss_mlp": 1.04969454, "epoch": 0.011363294754246205, "flos": 66299607884160.0, "grad_norm": 0.8769107283055859, "language_loss": 0.56446695, "learning_rate": 3.3749102610218297e-06, "loss": 0.58603764, "num_input_tokens_seen": 4010005, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.29101562, "step": 189, "time_per_iteration": 3.1571950912475586 }, { "auxiliary_loss_clip": 0.01098987, "auxiliary_loss_mlp": 0.01191718, "balance_loss_clip": 1.16007996, "balance_loss_mlp": 1.05176306, "epoch": 0.011423418006914174, "flos": 24899112120960.0, "grad_norm": 1.8049669485500117, "language_loss": 1.06696558, "learning_rate": 3.3783079057586833e-06, "loss": 1.0898726, "num_input_tokens_seen": 4029035, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.47265625, "step": 190, "time_per_iteration": 2.53869891166687 }, { "auxiliary_loss_clip": 0.01103717, "auxiliary_loss_mlp": 0.01235216, "balance_loss_clip": 1.20071638, "balance_loss_mlp": 1.05140197, "epoch": 0.011483541259582144, "flos": 19790865031680.0, "grad_norm": 2.004491653650201, "language_loss": 0.98771751, "learning_rate": 3.3816877150079665e-06, "loss": 1.01110685, "num_input_tokens_seen": 4046995, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.5234375, "step": 191, "time_per_iteration": 2.4821109771728516 }, { "auxiliary_loss_clip": 0.01103148, "auxiliary_loss_mlp": 0.01225375, "balance_loss_clip": 1.19154346, "balance_loss_mlp": 1.05140662, "epoch": 0.011543664512250112, "flos": 26175746229120.0, "grad_norm": 1.691901910399317, "language_loss": 0.99615991, "learning_rate": 3.385049875042367e-06, "loss": 1.01944518, "num_input_tokens_seen": 4065865, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.515625, "step": 192, "time_per_iteration": 2.544553518295288 }, { "auxiliary_loss_clip": 0.01102695, "auxiliary_loss_mlp": 0.01117862, "balance_loss_clip": 1.08391166, "balance_loss_mlp": 1.05200636, "epoch": 0.011603787764918083, "flos": 23767856380800.0, "grad_norm": 2.0334207566118487, "language_loss": 0.99136633, "learning_rate": 3.3883945692315938e-06, "loss": 1.01357186, "num_input_tokens_seen": 4085305, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.5078125, "step": 193, "time_per_iteration": 2.5115790367126465 }, { "auxiliary_loss_clip": 0.01096818, "auxiliary_loss_mlp": 0.01045923, "balance_loss_clip": 1.01166224, "balance_loss_mlp": 1.05028796, "epoch": 0.011663911017586051, "flos": 25953594048000.0, "grad_norm": 1.7614406224923902, "language_loss": 1.03044677, "learning_rate": 3.3917219781023906e-06, "loss": 1.05187416, "num_input_tokens_seen": 4105185, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.46484375, "step": 194, "time_per_iteration": 2.557671070098877 }, { "auxiliary_loss_clip": 0.01099663, "auxiliary_loss_mlp": 0.01108333, "balance_loss_clip": 1.06820667, "balance_loss_mlp": 1.05228031, "epoch": 0.01172403427025402, "flos": 17894173443840.0, "grad_norm": 1.927176303647054, "language_loss": 1.06707811, "learning_rate": 3.3950322793970014e-06, "loss": 1.08915806, "num_input_tokens_seen": 4123160, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 0.47265625, "step": 195, "time_per_iteration": 2.495694875717163 }, { "auxiliary_loss_clip": 0.010981, "auxiliary_loss_mlp": 0.01137273, "balance_loss_clip": 1.09757662, "balance_loss_mlp": 1.05124545, "epoch": 0.01178415752292199, "flos": 17893579950720.0, "grad_norm": 2.0585798463572424, "language_loss": 0.99742883, "learning_rate": 3.3983256481301445e-06, "loss": 1.01978254, "num_input_tokens_seen": 4140425, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.46875, "step": 196, "time_per_iteration": 2.521138906478882 }, { "auxiliary_loss_clip": 0.01097922, "auxiliary_loss_mlp": 0.01108691, "balance_loss_clip": 1.07516909, "balance_loss_mlp": 1.0514127, "epoch": 0.011844280775589959, "flos": 22892444150400.0, "grad_norm": 1.9618744077688632, "language_loss": 1.06366789, "learning_rate": 3.4016022566445335e-06, "loss": 1.08573401, "num_input_tokens_seen": 4159555, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.46484375, "step": 197, "time_per_iteration": 2.4926061630249023 }, { "auxiliary_loss_clip": 0.0109545, "auxiliary_loss_mlp": 0.01053314, "balance_loss_clip": 1.02420259, "balance_loss_mlp": 1.05084753, "epoch": 0.01190440402825793, "flos": 26979097680000.0, "grad_norm": 1.794604503777132, "language_loss": 0.90271187, "learning_rate": 3.4048622746649966e-06, "loss": 0.92419952, "num_input_tokens_seen": 4180480, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.44726562, "step": 198, "time_per_iteration": 2.5743231773376465 }, { "auxiliary_loss_clip": 0.01097158, "auxiliary_loss_mlp": 0.01072629, "balance_loss_clip": 1.0424211, "balance_loss_mlp": 1.05109262, "epoch": 0.011964527280925898, "flos": 20520549780480.0, "grad_norm": 1.5716242810570824, "language_loss": 0.94445264, "learning_rate": 3.4081058693512278e-06, "loss": 0.96615058, "num_input_tokens_seen": 4198835, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.4609375, "step": 199, "time_per_iteration": 2.49124813079834 }, { "auxiliary_loss_clip": 0.01101112, "auxiliary_loss_mlp": 0.01105727, "balance_loss_clip": 1.07194304, "balance_loss_mlp": 1.05356622, "epoch": 0.012024650533593867, "flos": 27744742995840.0, "grad_norm": 1.6261571136584236, "language_loss": 0.92443496, "learning_rate": 3.411333205349222e-06, "loss": 0.9465034, "num_input_tokens_seen": 4219335, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.4765625, "step": 200, "time_per_iteration": 2.550828695297241 }, { "auxiliary_loss_clip": 0.01093348, "auxiliary_loss_mlp": 0.01101853, "balance_loss_clip": 1.0685699, "balance_loss_mlp": 1.04718041, "epoch": 0.012084773786261837, "flos": 10451249360640.0, "grad_norm": 1.6938137071463262, "language_loss": 1.01948857, "learning_rate": 3.4145444448414217e-06, "loss": 1.04144073, "num_input_tokens_seen": 4236940, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.4609375, "step": 201, "time_per_iteration": 2.4822611808776855 }, { "auxiliary_loss_clip": 0.01084611, "auxiliary_loss_mlp": 0.0106071, "balance_loss_clip": 1.03052664, "balance_loss_mlp": 1.04157591, "epoch": 0.012144897038929806, "flos": 23104821150720.0, "grad_norm": 1.6048851073238817, "language_loss": 0.93243027, "learning_rate": 3.4177397475956223e-06, "loss": 0.95388341, "num_input_tokens_seen": 4256755, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.4296875, "step": 202, "time_per_iteration": 2.5191028118133545 }, { "auxiliary_loss_clip": 0.01082212, "auxiliary_loss_mlp": 0.01043154, "balance_loss_clip": 1.01575935, "balance_loss_mlp": 1.03899705, "epoch": 0.012205020291597776, "flos": 21032132065920.0, "grad_norm": 1.6181792641688677, "language_loss": 0.99432361, "learning_rate": 3.4209192710126685e-06, "loss": 1.01557732, "num_input_tokens_seen": 4276505, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.43359375, "step": 203, "time_per_iteration": 2.476510524749756 }, { "auxiliary_loss_clip": 0.01062666, "auxiliary_loss_mlp": 0.01152707, "balance_loss_clip": 1.14212155, "balance_loss_mlp": 1.03697503, "epoch": 0.012265143544265745, "flos": 68444846507520.0, "grad_norm": 1.1245228372156577, "language_loss": 0.61589622, "learning_rate": 3.4240831701729837e-06, "loss": 0.63804996, "num_input_tokens_seen": 4330965, "router_z_loss_clip": 0.10595703, "router_z_loss_mlp": 0.2578125, "step": 204, "time_per_iteration": 3.0146005153656006 }, { "auxiliary_loss_clip": 0.0110672, "auxiliary_loss_mlp": 0.01243779, "balance_loss_clip": 1.20835042, "balance_loss_mlp": 1.05874193, "epoch": 0.012325266796933715, "flos": 17018307365760.0, "grad_norm": 1.9755689809381638, "language_loss": 1.04080296, "learning_rate": 3.4272315978819516e-06, "loss": 1.06430793, "num_input_tokens_seen": 4348200, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.48046875, "step": 205, "time_per_iteration": 2.4901349544525146 }, { "auxiliary_loss_clip": 0.011189, "auxiliary_loss_mlp": 0.01304909, "balance_loss_clip": 1.26585567, "balance_loss_mlp": 1.06587601, "epoch": 0.012385390049601683, "flos": 20189119443840.0, "grad_norm": 1.8699489769743696, "language_loss": 1.01561546, "learning_rate": 3.4303647047142043e-06, "loss": 1.03985357, "num_input_tokens_seen": 4365460, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.53125, "step": 206, "time_per_iteration": 2.504542589187622 }, { "auxiliary_loss_clip": 0.01107157, "auxiliary_loss_mlp": 0.01264451, "balance_loss_clip": 1.23185897, "balance_loss_mlp": 1.0563736, "epoch": 0.012445513302269652, "flos": 16252208202240.0, "grad_norm": 1.7505488440962755, "language_loss": 1.08401203, "learning_rate": 3.43348263905683e-06, "loss": 1.10772812, "num_input_tokens_seen": 4383650, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.5078125, "step": 207, "time_per_iteration": 2.4713680744171143 }, { "auxiliary_loss_clip": 0.01093147, "auxiliary_loss_mlp": 0.01239402, "balance_loss_clip": 1.20707214, "balance_loss_mlp": 1.04464293, "epoch": 0.012505636554937622, "flos": 23768240405760.0, "grad_norm": 1.7038821994544826, "language_loss": 0.8575018, "learning_rate": 3.436585547151547e-06, "loss": 0.88082731, "num_input_tokens_seen": 4403765, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.484375, "step": 208, "time_per_iteration": 2.566667318344116 }, { "auxiliary_loss_clip": 0.01101981, "auxiliary_loss_mlp": 0.01113928, "balance_loss_clip": 1.07597136, "balance_loss_mlp": 1.05930662, "epoch": 0.012565759807605591, "flos": 30590234225280.0, "grad_norm": 1.918460532501461, "language_loss": 1.10475111, "learning_rate": 3.4396735731358586e-06, "loss": 1.12691021, "num_input_tokens_seen": 4421935, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.42578125, "step": 209, "time_per_iteration": 2.571448564529419 }, { "auxiliary_loss_clip": 0.01146159, "auxiliary_loss_mlp": 0.01064591, "balance_loss_clip": 1.01964951, "balance_loss_mlp": 1.10477829, "epoch": 0.012625883060273561, "flos": 40111956881280.0, "grad_norm": 2.404581684592111, "language_loss": 0.97718978, "learning_rate": 3.4427468590832302e-06, "loss": 0.9992972, "num_input_tokens_seen": 4441470, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.4140625, "step": 210, "time_per_iteration": 2.663059711456299 }, { "auxiliary_loss_clip": 0.01169735, "auxiliary_loss_mlp": 0.01247272, "balance_loss_clip": 1.2077657, "balance_loss_mlp": 1.12621319, "epoch": 0.01268600631294153, "flos": 27087956899200.0, "grad_norm": 1.9748600183347633, "language_loss": 1.05048847, "learning_rate": 3.445805545042314e-06, "loss": 1.07465851, "num_input_tokens_seen": 4459950, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.43554688, "step": 211, "time_per_iteration": 2.576044797897339 }, { "auxiliary_loss_clip": 0.01148725, "auxiliary_loss_mlp": 0.01269692, "balance_loss_clip": 1.23457336, "balance_loss_mlp": 1.10369885, "epoch": 0.012746129565609499, "flos": 16981823128320.0, "grad_norm": 2.0005912457661315, "language_loss": 1.0922991, "learning_rate": 3.448849769075239e-06, "loss": 1.11648321, "num_input_tokens_seen": 4478390, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.45117188, "step": 212, "time_per_iteration": 2.5996060371398926 }, { "auxiliary_loss_clip": 0.01103402, "auxiliary_loss_mlp": 0.01114701, "balance_loss_clip": 1.08470774, "balance_loss_mlp": 1.05919051, "epoch": 0.012806252818277469, "flos": 46531786216320.0, "grad_norm": 1.5769440776846986, "language_loss": 0.87272328, "learning_rate": 3.4518796672950093e-06, "loss": 0.89490432, "num_input_tokens_seen": 4501665, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.44140625, "step": 213, "time_per_iteration": 2.699709415435791 }, { "auxiliary_loss_clip": 0.01091771, "auxiliary_loss_mlp": 0.01037055, "balance_loss_clip": 1.00830221, "balance_loss_mlp": 1.04605031, "epoch": 0.012866376070945438, "flos": 14387846400000.0, "grad_norm": 2.3194899577081682, "language_loss": 0.99835563, "learning_rate": 3.4548953739020187e-06, "loss": 1.0196439, "num_input_tokens_seen": 4519055, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.45703125, "step": 214, "time_per_iteration": 2.4537081718444824 }, { "auxiliary_loss_clip": 0.01118137, "auxiliary_loss_mlp": 0.01097442, "balance_loss_clip": 1.06294298, "balance_loss_mlp": 1.07102633, "epoch": 0.012926499323613408, "flos": 26139611105280.0, "grad_norm": 1.8247650237274122, "language_loss": 0.90476429, "learning_rate": 3.4578970212197196e-06, "loss": 0.92692012, "num_input_tokens_seen": 4540870, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.47070312, "step": 215, "time_per_iteration": 4.073654413223267 }, { "auxiliary_loss_clip": 0.01137959, "auxiliary_loss_mlp": 0.01154666, "balance_loss_clip": 1.10900891, "balance_loss_mlp": 1.08928561, "epoch": 0.012986622576281377, "flos": 30115904227200.0, "grad_norm": 1.9232658366492974, "language_loss": 1.0574342, "learning_rate": 3.460884739729461e-06, "loss": 1.08036041, "num_input_tokens_seen": 4560395, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.48632812, "step": 216, "time_per_iteration": 3.9853289127349854 }, { "auxiliary_loss_clip": 0.0113832, "auxiliary_loss_mlp": 0.0115294, "balance_loss_clip": 1.1091187, "balance_loss_mlp": 1.09031796, "epoch": 0.013046745828949347, "flos": 13953177573120.0, "grad_norm": 2.1747942725460545, "language_loss": 1.12298131, "learning_rate": 3.463858658104523e-06, "loss": 1.14589393, "num_input_tokens_seen": 4575785, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.48046875, "step": 217, "time_per_iteration": 2.463624954223633 }, { "auxiliary_loss_clip": 0.01126002, "auxiliary_loss_mlp": 0.01138359, "balance_loss_clip": 1.09792292, "balance_loss_mlp": 1.0798049, "epoch": 0.013106869081617315, "flos": 17346874970880.0, "grad_norm": 1.6824502379540924, "language_loss": 1.03864717, "learning_rate": 3.4668189032433696e-06, "loss": 1.06129086, "num_input_tokens_seen": 4594985, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.4609375, "step": 218, "time_per_iteration": 2.5329208374023438 }, { "auxiliary_loss_clip": 0.01106929, "auxiliary_loss_mlp": 0.01080042, "balance_loss_clip": 1.05047822, "balance_loss_mlp": 1.06290102, "epoch": 0.013166992334285284, "flos": 25883732684160.0, "grad_norm": 1.7209989260840117, "language_loss": 0.9726072, "learning_rate": 3.46976560030214e-06, "loss": 0.99447691, "num_input_tokens_seen": 4616125, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.44140625, "step": 219, "time_per_iteration": 2.524907350540161 }, { "auxiliary_loss_clip": 0.01089994, "auxiliary_loss_mlp": 0.01038807, "balance_loss_clip": 1.01105499, "balance_loss_mlp": 1.04625654, "epoch": 0.013227115586953254, "flos": 31174610428800.0, "grad_norm": 1.5088127628256938, "language_loss": 0.980483, "learning_rate": 3.4726988727263976e-06, "loss": 1.00177097, "num_input_tokens_seen": 4637795, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.4375, "step": 220, "time_per_iteration": 2.6165785789489746 }, { "auxiliary_loss_clip": 0.01083868, "auxiliary_loss_mlp": 0.01051757, "balance_loss_clip": 1.02104831, "balance_loss_mlp": 1.04074812, "epoch": 0.013287238839621223, "flos": 20408513627520.0, "grad_norm": 1.6444710778523064, "language_loss": 0.97700977, "learning_rate": 3.475618842282164e-06, "loss": 0.99836606, "num_input_tokens_seen": 4656835, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.43164062, "step": 221, "time_per_iteration": 2.50985050201416 }, { "auxiliary_loss_clip": 0.01089744, "auxiliary_loss_mlp": 0.01099199, "balance_loss_clip": 1.06663144, "balance_loss_mlp": 1.04638612, "epoch": 0.013347362092289193, "flos": 14136262076160.0, "grad_norm": 1.8308138252476334, "language_loss": 1.06829596, "learning_rate": 3.4785256290862486e-06, "loss": 1.0901854, "num_input_tokens_seen": 4673015, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.43359375, "step": 222, "time_per_iteration": 2.470869779586792 }, { "auxiliary_loss_clip": 0.01097978, "auxiliary_loss_mlp": 0.01115348, "balance_loss_clip": 1.07853651, "balance_loss_mlp": 1.05494082, "epoch": 0.013407485344957162, "flos": 21796660218240.0, "grad_norm": 1.902695554458468, "language_loss": 1.05840564, "learning_rate": 3.481419351635897e-06, "loss": 1.08053899, "num_input_tokens_seen": 4692355, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.4296875, "step": 223, "time_per_iteration": 2.492043972015381 }, { "auxiliary_loss_clip": 0.01098679, "auxiliary_loss_mlp": 0.01105397, "balance_loss_clip": 1.06796527, "balance_loss_mlp": 1.05564547, "epoch": 0.013467608597625132, "flos": 18620716170240.0, "grad_norm": 2.031665178165983, "language_loss": 1.01738811, "learning_rate": 3.484300126837776e-06, "loss": 1.03942895, "num_input_tokens_seen": 4710080, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.4296875, "step": 224, "time_per_iteration": 2.4889323711395264 }, { "auxiliary_loss_clip": 0.01096586, "auxiliary_loss_mlp": 0.01077493, "balance_loss_clip": 1.04354191, "balance_loss_mlp": 1.05369246, "epoch": 0.013527731850293101, "flos": 18551308654080.0, "grad_norm": 1.635028854133641, "language_loss": 1.02950668, "learning_rate": 3.487168070036317e-06, "loss": 1.0512476, "num_input_tokens_seen": 4728980, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.4296875, "step": 225, "time_per_iteration": 2.4698359966278076 }, { "auxiliary_loss_clip": 0.01088983, "auxiliary_loss_mlp": 0.01049318, "balance_loss_clip": 1.0203023, "balance_loss_mlp": 1.04679775, "epoch": 0.01358785510296107, "flos": 19164558418560.0, "grad_norm": 1.668201941654053, "language_loss": 1.08488941, "learning_rate": 3.4900232950414224e-06, "loss": 1.10627246, "num_input_tokens_seen": 4747020, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.421875, "step": 226, "time_per_iteration": 2.4657299518585205 }, { "auxiliary_loss_clip": 0.01080702, "auxiliary_loss_mlp": 0.01043698, "balance_loss_clip": 1.01372886, "balance_loss_mlp": 1.03907442, "epoch": 0.01364797835562904, "flos": 23328858545280.0, "grad_norm": 1.8529547712715548, "language_loss": 1.03664923, "learning_rate": 3.4928659141555727e-06, "loss": 1.05789328, "num_input_tokens_seen": 4765000, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.41601562, "step": 227, "time_per_iteration": 2.4552206993103027 }, { "auxiliary_loss_clip": 0.01071787, "auxiliary_loss_mlp": 0.0113349, "balance_loss_clip": 1.11394, "balance_loss_mlp": 1.04463649, "epoch": 0.013708101608297009, "flos": 70989943599360.0, "grad_norm": 1.0500160154291867, "language_loss": 0.57966822, "learning_rate": 3.4956960382003234e-06, "loss": 0.60172099, "num_input_tokens_seen": 4833210, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.27148438, "step": 228, "time_per_iteration": 3.1548004150390625 }, { "auxiliary_loss_clip": 0.01075507, "auxiliary_loss_mlp": 0.01049621, "balance_loss_clip": 1.02358627, "balance_loss_mlp": 1.03497553, "epoch": 0.013768224860964979, "flos": 16324268981760.0, "grad_norm": 2.0749228903278825, "language_loss": 1.02698469, "learning_rate": 3.4985137765422354e-06, "loss": 1.04823601, "num_input_tokens_seen": 4850120, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40429688, "step": 229, "time_per_iteration": 2.4137885570526123 }, { "auxiliary_loss_clip": 0.0107582, "auxiliary_loss_mlp": 0.01096173, "balance_loss_clip": 1.06734776, "balance_loss_mlp": 1.03477514, "epoch": 0.013828348113632948, "flos": 20192017086720.0, "grad_norm": 2.0340409472213365, "language_loss": 0.98320317, "learning_rate": 3.501319237118231e-06, "loss": 1.00492311, "num_input_tokens_seen": 4866215, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41015625, "step": 230, "time_per_iteration": 2.4391608238220215 }, { "auxiliary_loss_clip": 0.01078928, "auxiliary_loss_mlp": 0.01109966, "balance_loss_clip": 1.08059335, "balance_loss_mlp": 1.03632855, "epoch": 0.013888471366300916, "flos": 20740013786880.0, "grad_norm": 1.561018678690937, "language_loss": 1.00255001, "learning_rate": 3.5041125264604056e-06, "loss": 1.02443886, "num_input_tokens_seen": 4885630, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.42578125, "step": 231, "time_per_iteration": 2.43204402923584 }, { "auxiliary_loss_clip": 0.01077624, "auxiliary_loss_mlp": 0.01088841, "balance_loss_clip": 1.06013572, "balance_loss_mlp": 1.03567123, "epoch": 0.013948594618968886, "flos": 22089546547200.0, "grad_norm": 1.6758753163861133, "language_loss": 0.98966426, "learning_rate": 3.5068937497203002e-06, "loss": 1.01132882, "num_input_tokens_seen": 4905570, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.41992188, "step": 232, "time_per_iteration": 2.457054615020752 }, { "auxiliary_loss_clip": 0.01073614, "auxiliary_loss_mlp": 0.01048586, "balance_loss_clip": 1.02159727, "balance_loss_mlp": 1.03352594, "epoch": 0.014008717871636855, "flos": 19062087978240.0, "grad_norm": 2.371221136412141, "language_loss": 0.89359134, "learning_rate": 3.509663010692652e-06, "loss": 0.9148134, "num_input_tokens_seen": 4923535, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40234375, "step": 233, "time_per_iteration": 2.431537389755249 }, { "auxiliary_loss_clip": 0.01076656, "auxiliary_loss_mlp": 0.01048181, "balance_loss_clip": 1.01859379, "balance_loss_mlp": 1.03608823, "epoch": 0.014068841124304825, "flos": 14530152568320.0, "grad_norm": 1.8277487307236124, "language_loss": 0.99442166, "learning_rate": 3.512420411838642e-06, "loss": 1.01567006, "num_input_tokens_seen": 4939200, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.40625, "step": 234, "time_per_iteration": 2.4785776138305664 }, { "auxiliary_loss_clip": 0.01075426, "auxiliary_loss_mlp": 0.01059718, "balance_loss_clip": 1.02707899, "balance_loss_mlp": 1.03561401, "epoch": 0.014128964376972794, "flos": 18076420074240.0, "grad_norm": 1.965602055508437, "language_loss": 1.06779504, "learning_rate": 3.515166054308634e-06, "loss": 1.08914638, "num_input_tokens_seen": 4956620, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.3984375, "step": 235, "time_per_iteration": 2.450258731842041 }, { "auxiliary_loss_clip": 0.01076743, "auxiliary_loss_mlp": 0.0106071, "balance_loss_clip": 1.02840436, "balance_loss_mlp": 1.03590918, "epoch": 0.014189087629640764, "flos": 25333257277440.0, "grad_norm": 1.967850403399697, "language_loss": 0.95677412, "learning_rate": 3.5179000379644498e-06, "loss": 0.97814864, "num_input_tokens_seen": 4975650, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.40820312, "step": 236, "time_per_iteration": 2.6083762645721436 }, { "auxiliary_loss_clip": 0.01078437, "auxiliary_loss_mlp": 0.01057042, "balance_loss_clip": 1.0254519, "balance_loss_mlp": 1.03690863, "epoch": 0.014249210882308733, "flos": 36138212288640.0, "grad_norm": 1.6101465146016263, "language_loss": 0.94536513, "learning_rate": 3.520622461401154e-06, "loss": 0.96671987, "num_input_tokens_seen": 4997415, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.4140625, "step": 237, "time_per_iteration": 2.6023366451263428 }, { "auxiliary_loss_clip": 0.01075778, "auxiliary_loss_mlp": 0.01044183, "balance_loss_clip": 1.01414251, "balance_loss_mlp": 1.03647864, "epoch": 0.014309334134976702, "flos": 12932142595200.0, "grad_norm": 1.7039806804781887, "language_loss": 0.9228217, "learning_rate": 3.5233334219683935e-06, "loss": 0.94402122, "num_input_tokens_seen": 5013905, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.39257812, "step": 238, "time_per_iteration": 2.4322116374969482 }, { "auxiliary_loss_clip": 0.01075948, "auxiliary_loss_mlp": 0.01041194, "balance_loss_clip": 1.01462257, "balance_loss_mlp": 1.03708851, "epoch": 0.014369457387644672, "flos": 20776463112960.0, "grad_norm": 1.4593327963501108, "language_loss": 0.96789944, "learning_rate": 3.526033015791284e-06, "loss": 0.98907089, "num_input_tokens_seen": 5033645, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.38867188, "step": 239, "time_per_iteration": 2.4896762371063232 }, { "auxiliary_loss_clip": 0.01078415, "auxiliary_loss_mlp": 0.01036202, "balance_loss_clip": 1.01070309, "balance_loss_mlp": 1.0398531, "epoch": 0.01442958064031264, "flos": 25847353180800.0, "grad_norm": 1.8139910450337617, "language_loss": 1.0199244, "learning_rate": 3.528721337790862e-06, "loss": 1.04107058, "num_input_tokens_seen": 5052875, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38476562, "step": 240, "time_per_iteration": 2.5304315090179443 }, { "auxiliary_loss_clip": 0.01081899, "auxiliary_loss_mlp": 0.01058553, "balance_loss_clip": 1.02996683, "balance_loss_mlp": 1.0421263, "epoch": 0.014489703892980611, "flos": 28218479500800.0, "grad_norm": 1.5811174619170947, "language_loss": 0.96160829, "learning_rate": 3.531398481704111e-06, "loss": 0.9830128, "num_input_tokens_seen": 5075005, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.3984375, "step": 241, "time_per_iteration": 2.51960825920105 }, { "auxiliary_loss_clip": 0.01080413, "auxiliary_loss_mlp": 0.01063672, "balance_loss_clip": 1.03785181, "balance_loss_mlp": 1.04137993, "epoch": 0.01454982714564858, "flos": 22489860729600.0, "grad_norm": 1.5906862979908856, "language_loss": 0.97562319, "learning_rate": 3.534064540103573e-06, "loss": 0.99706411, "num_input_tokens_seen": 5091875, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.390625, "step": 242, "time_per_iteration": 2.4700756072998047 }, { "auxiliary_loss_clip": 0.01075915, "auxiliary_loss_mlp": 0.01078374, "balance_loss_clip": 1.05218327, "balance_loss_mlp": 1.03710485, "epoch": 0.014609950398316548, "flos": 21652119722880.0, "grad_norm": 1.7859696660501259, "language_loss": 0.96531606, "learning_rate": 3.536719604416555e-06, "loss": 0.98685902, "num_input_tokens_seen": 5111290, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38867188, "step": 243, "time_per_iteration": 2.461885452270508 }, { "auxiliary_loss_clip": 0.01074115, "auxiliary_loss_mlp": 0.01070905, "balance_loss_clip": 1.04452419, "balance_loss_mlp": 1.03551435, "epoch": 0.014670073650984519, "flos": 21868965377280.0, "grad_norm": 1.54499021698165, "language_loss": 0.93622619, "learning_rate": 3.5393637649439464e-06, "loss": 0.95767641, "num_input_tokens_seen": 5132265, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.38671875, "step": 244, "time_per_iteration": 2.4596147537231445 }, { "auxiliary_loss_clip": 0.01076645, "auxiliary_loss_mlp": 0.01038598, "balance_loss_clip": 1.0103457, "balance_loss_mlp": 1.03671432, "epoch": 0.014730196903652487, "flos": 23182642304640.0, "grad_norm": 2.1456827382680492, "language_loss": 0.9655484, "learning_rate": 3.54199711087864e-06, "loss": 0.98670083, "num_input_tokens_seen": 5148575, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.3984375, "step": 245, "time_per_iteration": 2.4329512119293213 }, { "auxiliary_loss_clip": 0.01078689, "auxiliary_loss_mlp": 0.01043874, "balance_loss_clip": 1.01665831, "balance_loss_mlp": 1.04003501, "epoch": 0.014790320156320457, "flos": 23221465603200.0, "grad_norm": 1.7582671076352405, "language_loss": 0.94322646, "learning_rate": 3.5446197303235913e-06, "loss": 0.96445209, "num_input_tokens_seen": 5170415, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.38671875, "step": 246, "time_per_iteration": 2.5023117065429688 }, { "auxiliary_loss_clip": 0.01083627, "auxiliary_loss_mlp": 0.01069723, "balance_loss_clip": 1.04211402, "balance_loss_mlp": 1.04438996, "epoch": 0.014850443408988426, "flos": 15814571909760.0, "grad_norm": 1.566293307979433, "language_loss": 1.00349915, "learning_rate": 3.5472317103095034e-06, "loss": 1.02503276, "num_input_tokens_seen": 5188565, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.390625, "step": 247, "time_per_iteration": 2.421661615371704 }, { "auxiliary_loss_clip": 0.01080176, "auxiliary_loss_mlp": 0.01069717, "balance_loss_clip": 1.04172671, "balance_loss_mlp": 1.04135633, "epoch": 0.014910566661656396, "flos": 22780617465600.0, "grad_norm": 1.9211376406260587, "language_loss": 0.90626585, "learning_rate": 3.549833136812155e-06, "loss": 0.92776477, "num_input_tokens_seen": 5207810, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.38671875, "step": 248, "time_per_iteration": 2.5011179447174072 }, { "auxiliary_loss_clip": 0.01076011, "auxiliary_loss_mlp": 0.010757, "balance_loss_clip": 1.04852009, "balance_loss_mlp": 1.03747702, "epoch": 0.014970689914324365, "flos": 26863954416000.0, "grad_norm": 1.6565379334890873, "language_loss": 0.93743962, "learning_rate": 3.552424094769381e-06, "loss": 0.95895672, "num_input_tokens_seen": 5226210, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.38476562, "step": 249, "time_per_iteration": 2.5460221767425537 }, { "auxiliary_loss_clip": 0.01070466, "auxiliary_loss_mlp": 0.01076226, "balance_loss_clip": 1.04912972, "balance_loss_mlp": 1.03327656, "epoch": 0.015030813166992334, "flos": 13984948776960.0, "grad_norm": 2.0559168560461942, "language_loss": 1.06404042, "learning_rate": 3.5550046680977174e-06, "loss": 1.08550739, "num_input_tokens_seen": 5241660, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.37109375, "step": 250, "time_per_iteration": 2.4159696102142334 }, { "auxiliary_loss_clip": 0.01070828, "auxiliary_loss_mlp": 0.0105464, "balance_loss_clip": 1.02688777, "balance_loss_mlp": 1.03454304, "epoch": 0.015090936419660304, "flos": 24716656022400.0, "grad_norm": 1.76409725680884, "language_loss": 1.09362614, "learning_rate": 3.5575749397087034e-06, "loss": 1.1148808, "num_input_tokens_seen": 5261090, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.36328125, "step": 251, "time_per_iteration": 2.5014548301696777 }, { "auxiliary_loss_clip": 0.01072752, "auxiliary_loss_mlp": 0.01037333, "balance_loss_clip": 1.0117867, "balance_loss_mlp": 1.03694236, "epoch": 0.015151059672328273, "flos": 25737621177600.0, "grad_norm": 1.7091590897612239, "language_loss": 0.9745695, "learning_rate": 3.5601349915248707e-06, "loss": 0.99567032, "num_input_tokens_seen": 5279175, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.35742188, "step": 252, "time_per_iteration": 2.493452787399292 }, { "auxiliary_loss_clip": 0.01076742, "auxiliary_loss_mlp": 0.01055807, "balance_loss_clip": 1.02903247, "balance_loss_mlp": 1.04086673, "epoch": 0.015211182924996243, "flos": 21870152363520.0, "grad_norm": 1.8688358806116068, "language_loss": 1.12032104, "learning_rate": 3.5626849044954064e-06, "loss": 1.14164662, "num_input_tokens_seen": 5296975, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.359375, "step": 253, "time_per_iteration": 2.4544849395751953 }, { "auxiliary_loss_clip": 0.01083387, "auxiliary_loss_mlp": 0.01183997, "balance_loss_clip": 1.16931033, "balance_loss_mlp": 1.0582068, "epoch": 0.015271306177664212, "flos": 66891734409600.0, "grad_norm": 0.9777478791121624, "language_loss": 0.55954546, "learning_rate": 3.5652247586115167e-06, "loss": 0.5822193, "num_input_tokens_seen": 5358375, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.25195312, "step": 254, "time_per_iteration": 4.458518743515015 }, { "auxiliary_loss_clip": 0.01074842, "auxiliary_loss_mlp": 0.01068142, "balance_loss_clip": 1.04085517, "balance_loss_mlp": 1.03857684, "epoch": 0.01533142943033218, "flos": 26832846528000.0, "grad_norm": 1.6855824759183693, "language_loss": 1.03326499, "learning_rate": 3.567754632921479e-06, "loss": 1.05469477, "num_input_tokens_seen": 5377255, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.36328125, "step": 255, "time_per_iteration": 3.9672932624816895 }, { "auxiliary_loss_clip": 0.01071325, "auxiliary_loss_mlp": 0.01061342, "balance_loss_clip": 1.03542614, "balance_loss_mlp": 1.03577185, "epoch": 0.01539155268300015, "flos": 20812702970880.0, "grad_norm": 1.8813299776665477, "language_loss": 0.97233367, "learning_rate": 3.5702746055454075e-06, "loss": 0.99366027, "num_input_tokens_seen": 5395320, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.35546875, "step": 256, "time_per_iteration": 3.900709629058838 }, { "auxiliary_loss_clip": 0.01075141, "auxiliary_loss_mlp": 0.01055303, "balance_loss_clip": 1.02650189, "balance_loss_mlp": 1.03796566, "epoch": 0.01545167593566812, "flos": 15960927795840.0, "grad_norm": 2.2502576049601557, "language_loss": 0.92342532, "learning_rate": 3.5727847536897254e-06, "loss": 0.94472975, "num_input_tokens_seen": 5411970, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.37109375, "step": 257, "time_per_iteration": 2.4153144359588623 }, { "auxiliary_loss_clip": 0.01078318, "auxiliary_loss_mlp": 0.01038936, "balance_loss_clip": 1.01341343, "balance_loss_mlp": 1.04083276, "epoch": 0.01551179918833609, "flos": 22600640073600.0, "grad_norm": 1.9301110015952734, "language_loss": 1.05837071, "learning_rate": 3.5752851536613596e-06, "loss": 1.07954311, "num_input_tokens_seen": 5430245, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.375, "step": 258, "time_per_iteration": 2.4784247875213623 }, { "auxiliary_loss_clip": 0.0108204, "auxiliary_loss_mlp": 0.01035681, "balance_loss_clip": 1.01113629, "balance_loss_mlp": 1.04383373, "epoch": 0.015571922441004058, "flos": 22815705248640.0, "grad_norm": 1.9274837672864904, "language_loss": 1.01914859, "learning_rate": 3.577775880881658e-06, "loss": 1.04032576, "num_input_tokens_seen": 5448905, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3828125, "step": 259, "time_per_iteration": 2.5347282886505127 }, { "auxiliary_loss_clip": 0.01088531, "auxiliary_loss_mlp": 0.01043368, "balance_loss_clip": 1.01693892, "balance_loss_mlp": 1.0490309, "epoch": 0.015632045693672027, "flos": 18946595600640.0, "grad_norm": 1.6641482068588285, "language_loss": 1.02975178, "learning_rate": 3.5802570099000424e-06, "loss": 1.05107081, "num_input_tokens_seen": 5466405, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39453125, "step": 260, "time_per_iteration": 2.504532814025879 }, { "auxiliary_loss_clip": 0.01084408, "auxiliary_loss_mlp": 0.01042269, "balance_loss_clip": 1.01513672, "balance_loss_mlp": 1.04483366, "epoch": 0.015692168946339995, "flos": 29970421125120.0, "grad_norm": 1.7862701647716586, "language_loss": 1.0257014, "learning_rate": 3.5827286144073947e-06, "loss": 1.04696822, "num_input_tokens_seen": 5487055, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.39453125, "step": 261, "time_per_iteration": 2.5264780521392822 }, { "auxiliary_loss_clip": 0.01083054, "auxiliary_loss_mlp": 0.01037566, "balance_loss_clip": 1.01076746, "balance_loss_mlp": 1.04234612, "epoch": 0.015752292199007967, "flos": 19391039608320.0, "grad_norm": 1.6031367656723223, "language_loss": 0.76873946, "learning_rate": 3.5851907672491904e-06, "loss": 0.78994572, "num_input_tokens_seen": 5506600, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.40625, "step": 262, "time_per_iteration": 2.48397159576416 }, { "auxiliary_loss_clip": 0.01079737, "auxiliary_loss_mlp": 0.01041867, "balance_loss_clip": 1.01423419, "balance_loss_mlp": 1.03847504, "epoch": 0.015812415451675936, "flos": 20338756997760.0, "grad_norm": 1.7084775626780269, "language_loss": 0.82158327, "learning_rate": 3.587643540438383e-06, "loss": 0.84279919, "num_input_tokens_seen": 5524350, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41210938, "step": 263, "time_per_iteration": 2.4642598628997803 }, { "auxiliary_loss_clip": 0.01077048, "auxiliary_loss_mlp": 0.01045534, "balance_loss_clip": 1.01613736, "balance_loss_mlp": 1.03553402, "epoch": 0.015872538704343905, "flos": 17524583124480.0, "grad_norm": 2.17296017702561, "language_loss": 1.03400373, "learning_rate": 3.590087005168037e-06, "loss": 1.05522966, "num_input_tokens_seen": 5542145, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4140625, "step": 264, "time_per_iteration": 2.43809175491333 }, { "auxiliary_loss_clip": 0.01077684, "auxiliary_loss_mlp": 0.01044866, "balance_loss_clip": 1.01645851, "balance_loss_mlp": 1.03632593, "epoch": 0.015932661957011873, "flos": 15259802405760.0, "grad_norm": 1.9415994157280507, "language_loss": 1.14071894, "learning_rate": 3.5925212318237344e-06, "loss": 1.16194439, "num_input_tokens_seen": 5557920, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4140625, "step": 265, "time_per_iteration": 2.4155664443969727 }, { "auxiliary_loss_clip": 0.01084177, "auxiliary_loss_mlp": 0.01048072, "balance_loss_clip": 1.01667213, "balance_loss_mlp": 1.04137063, "epoch": 0.015992785209679845, "flos": 20301504710400.0, "grad_norm": 1.855790177529862, "language_loss": 0.91434687, "learning_rate": 3.5949462899957323e-06, "loss": 0.9356693, "num_input_tokens_seen": 5576290, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.42773438, "step": 266, "time_per_iteration": 2.503396987915039 }, { "auxiliary_loss_clip": 0.0108016, "auxiliary_loss_mlp": 0.01042997, "balance_loss_clip": 1.01576996, "balance_loss_mlp": 1.04015231, "epoch": 0.016052908462347814, "flos": 23361397799040.0, "grad_norm": 1.6726723919324413, "language_loss": 0.97525406, "learning_rate": 3.5973622484909068e-06, "loss": 0.99648565, "num_input_tokens_seen": 5595205, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.3984375, "step": 267, "time_per_iteration": 2.4556307792663574 }, { "auxiliary_loss_clip": 0.0107762, "auxiliary_loss_mlp": 0.01049832, "balance_loss_clip": 1.0219605, "balance_loss_mlp": 1.03791082, "epoch": 0.016113031715015783, "flos": 21285566691840.0, "grad_norm": 1.8959558945497696, "language_loss": 0.99569213, "learning_rate": 3.599769175344462e-06, "loss": 1.01696658, "num_input_tokens_seen": 5612645, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.39648438, "step": 268, "time_per_iteration": 2.4850926399230957 }, { "auxiliary_loss_clip": 0.01078513, "auxiliary_loss_mlp": 0.01057554, "balance_loss_clip": 1.02858627, "balance_loss_mlp": 1.03886533, "epoch": 0.01617315496768375, "flos": 18913742144640.0, "grad_norm": 1.7734706586745663, "language_loss": 0.99278879, "learning_rate": 3.602167137831432e-06, "loss": 1.01414943, "num_input_tokens_seen": 5628345, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.39648438, "step": 269, "time_per_iteration": 2.4067189693450928 }, { "auxiliary_loss_clip": 0.01076683, "auxiliary_loss_mlp": 0.01055979, "balance_loss_clip": 1.02846527, "balance_loss_mlp": 1.03716183, "epoch": 0.01623327822035172, "flos": 16545513467520.0, "grad_norm": 1.787914617446807, "language_loss": 1.07600427, "learning_rate": 3.6045562024779565e-06, "loss": 1.09733081, "num_input_tokens_seen": 5645940, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.39648438, "step": 270, "time_per_iteration": 2.4217727184295654 }, { "auxiliary_loss_clip": 0.01075132, "auxiliary_loss_mlp": 0.01046632, "balance_loss_clip": 1.01967883, "balance_loss_mlp": 1.03623009, "epoch": 0.016293401473019692, "flos": 23512361984640.0, "grad_norm": 1.7442076079546263, "language_loss": 0.9867897, "learning_rate": 3.606936435072361e-06, "loss": 1.00800729, "num_input_tokens_seen": 5665690, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.38867188, "step": 271, "time_per_iteration": 2.448598623275757 }, { "auxiliary_loss_clip": 0.01074273, "auxiliary_loss_mlp": 0.01037717, "balance_loss_clip": 1.01104999, "balance_loss_mlp": 1.03570163, "epoch": 0.01635352472568766, "flos": 29014988325120.0, "grad_norm": 1.9345535097035402, "language_loss": 0.96648163, "learning_rate": 3.609307900676025e-06, "loss": 0.98760152, "num_input_tokens_seen": 5683190, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.38671875, "step": 272, "time_per_iteration": 2.51503586769104 }, { "auxiliary_loss_clip": 0.01074452, "auxiliary_loss_mlp": 0.01037413, "balance_loss_clip": 1.01177073, "balance_loss_mlp": 1.03689492, "epoch": 0.01641364797835563, "flos": 13369674153600.0, "grad_norm": 1.8193532645749473, "language_loss": 0.93630731, "learning_rate": 3.611670663634051e-06, "loss": 0.95742595, "num_input_tokens_seen": 5699780, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.375, "step": 273, "time_per_iteration": 2.408466339111328 }, { "auxiliary_loss_clip": 0.01076213, "auxiliary_loss_mlp": 0.01038936, "balance_loss_clip": 1.01204228, "balance_loss_mlp": 1.038903, "epoch": 0.016473771231023598, "flos": 18877292818560.0, "grad_norm": 1.8054884306408012, "language_loss": 1.05533934, "learning_rate": 3.614024787585744e-06, "loss": 1.07649088, "num_input_tokens_seen": 5716980, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.37304688, "step": 274, "time_per_iteration": 2.462442398071289 }, { "auxiliary_loss_clip": 0.01081764, "auxiliary_loss_mlp": 0.0104442, "balance_loss_clip": 1.01539207, "balance_loss_mlp": 1.0430696, "epoch": 0.016533894483691566, "flos": 22600535339520.0, "grad_norm": 1.6466746471753408, "language_loss": 0.99748546, "learning_rate": 3.6163703354748927e-06, "loss": 1.01874733, "num_input_tokens_seen": 5737780, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.38671875, "step": 275, "time_per_iteration": 2.4583442211151123 }, { "auxiliary_loss_clip": 0.01082498, "auxiliary_loss_mlp": 0.01044385, "balance_loss_clip": 1.01415372, "balance_loss_mlp": 1.04391456, "epoch": 0.01659401773635954, "flos": 21506112950400.0, "grad_norm": 1.4341877670405285, "language_loss": 0.90382957, "learning_rate": 3.6187073695598707e-06, "loss": 0.92509842, "num_input_tokens_seen": 5758330, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.38671875, "step": 276, "time_per_iteration": 2.487459421157837 }, { "auxiliary_loss_clip": 0.01081887, "auxiliary_loss_mlp": 0.01043553, "balance_loss_clip": 1.01432228, "balance_loss_mlp": 1.04412079, "epoch": 0.016654140989027507, "flos": 32849673505920.0, "grad_norm": 1.5719930585942532, "language_loss": 0.89018023, "learning_rate": 3.621035951423551e-06, "loss": 0.91143465, "num_input_tokens_seen": 5778340, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.37890625, "step": 277, "time_per_iteration": 2.576155662536621 }, { "auxiliary_loss_clip": 0.01082033, "auxiliary_loss_mlp": 0.01048118, "balance_loss_clip": 1.01776719, "balance_loss_mlp": 1.0435276, "epoch": 0.016714264241695476, "flos": 12305591602560.0, "grad_norm": 1.8584143024471391, "language_loss": 0.90343451, "learning_rate": 3.623356141983041e-06, "loss": 0.92473608, "num_input_tokens_seen": 5794295, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.38476562, "step": 278, "time_per_iteration": 2.4338877201080322 }, { "auxiliary_loss_clip": 0.01085335, "auxiliary_loss_mlp": 0.01044002, "balance_loss_clip": 1.01405632, "balance_loss_mlp": 1.04590058, "epoch": 0.016774387494363444, "flos": 27122625745920.0, "grad_norm": 1.6622574612475278, "language_loss": 1.02717173, "learning_rate": 3.6256680014992486e-06, "loss": 1.04846501, "num_input_tokens_seen": 5814405, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.39453125, "step": 279, "time_per_iteration": 2.520209789276123 }, { "auxiliary_loss_clip": 0.0108561, "auxiliary_loss_mlp": 0.01043472, "balance_loss_clip": 1.01321697, "balance_loss_mlp": 1.04529035, "epoch": 0.016834510747031413, "flos": 20190515898240.0, "grad_norm": 1.8222456034663919, "language_loss": 1.07618022, "learning_rate": 3.6279715895862713e-06, "loss": 1.097471, "num_input_tokens_seen": 5832795, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.40234375, "step": 280, "time_per_iteration": 2.4726343154907227 }, { "auxiliary_loss_clip": 0.01082315, "auxiliary_loss_mlp": 0.01042702, "balance_loss_clip": 1.01587999, "balance_loss_mlp": 1.04281974, "epoch": 0.016894633999699385, "flos": 27272961527040.0, "grad_norm": 1.5111924662742857, "language_loss": 0.84199786, "learning_rate": 3.6302669652206183e-06, "loss": 0.86324799, "num_input_tokens_seen": 5855750, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.39453125, "step": 281, "time_per_iteration": 2.579535722732544 }, { "auxiliary_loss_clip": 0.0107898, "auxiliary_loss_mlp": 0.01040723, "balance_loss_clip": 1.01362658, "balance_loss_mlp": 1.04010093, "epoch": 0.016954757252367354, "flos": 14902081948800.0, "grad_norm": 2.091330182018933, "language_loss": 1.00837684, "learning_rate": 3.632554186750274e-06, "loss": 1.02957392, "num_input_tokens_seen": 5872610, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.38867188, "step": 282, "time_per_iteration": 2.4451372623443604 }, { "auxiliary_loss_clip": 0.0107842, "auxiliary_loss_mlp": 0.01035429, "balance_loss_clip": 1.00846422, "balance_loss_mlp": 1.03927469, "epoch": 0.017014880505035322, "flos": 21357802028160.0, "grad_norm": 1.6630350944167145, "language_loss": 0.90203714, "learning_rate": 3.6348333119035937e-06, "loss": 0.92317563, "num_input_tokens_seen": 5892985, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.390625, "step": 283, "time_per_iteration": 2.4600670337677 }, { "auxiliary_loss_clip": 0.01075972, "auxiliary_loss_mlp": 0.0103327, "balance_loss_clip": 1.00730634, "balance_loss_mlp": 1.03771234, "epoch": 0.01707500375770329, "flos": 35331753726720.0, "grad_norm": 1.7651061365582053, "language_loss": 0.94349569, "learning_rate": 3.6371043977980503e-06, "loss": 0.96458817, "num_input_tokens_seen": 5914060, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3828125, "step": 284, "time_per_iteration": 2.6087069511413574 }, { "auxiliary_loss_clip": 0.01074357, "auxiliary_loss_mlp": 0.01032587, "balance_loss_clip": 1.00675452, "balance_loss_mlp": 1.03615403, "epoch": 0.01713512701037126, "flos": 23581071273600.0, "grad_norm": 1.8840048980892565, "language_loss": 1.09616125, "learning_rate": 3.639367500948819e-06, "loss": 1.11723065, "num_input_tokens_seen": 5932860, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38085938, "step": 285, "time_per_iteration": 2.503856897354126 }, { "auxiliary_loss_clip": 0.01071632, "auxiliary_loss_mlp": 0.01034385, "balance_loss_clip": 1.00981605, "balance_loss_mlp": 1.03465748, "epoch": 0.01719525026303923, "flos": 27633474892800.0, "grad_norm": 1.7121594971970922, "language_loss": 1.05195332, "learning_rate": 3.6416226772772178e-06, "loss": 1.07301342, "num_input_tokens_seen": 5952725, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36914062, "step": 286, "time_per_iteration": 2.5478200912475586 }, { "auxiliary_loss_clip": 0.01070873, "auxiliary_loss_mlp": 0.01034055, "balance_loss_clip": 1.00747168, "balance_loss_mlp": 1.03270841, "epoch": 0.0172553735157072, "flos": 26978504186880.0, "grad_norm": 1.4830744707498988, "language_loss": 0.99835181, "learning_rate": 3.643869982119001e-06, "loss": 1.01940107, "num_input_tokens_seen": 5970560, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.38085938, "step": 287, "time_per_iteration": 2.4996163845062256 }, { "auxiliary_loss_clip": 0.01069401, "auxiliary_loss_mlp": 0.01034481, "balance_loss_clip": 1.0102222, "balance_loss_mlp": 1.03192806, "epoch": 0.01731549676837517, "flos": 14055962215680.0, "grad_norm": 2.1222890079899344, "language_loss": 1.16811979, "learning_rate": 3.646109470232502e-06, "loss": 1.18915868, "num_input_tokens_seen": 5982980, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.375, "step": 288, "time_per_iteration": 2.4296839237213135 }, { "auxiliary_loss_clip": 0.01058296, "auxiliary_loss_mlp": 0.01039085, "balance_loss_clip": 1.02411199, "balance_loss_mlp": 1.0347544, "epoch": 0.017375620021043137, "flos": 66506885959680.0, "grad_norm": 0.932411853858951, "language_loss": 0.64186275, "learning_rate": 3.6483411958066417e-06, "loss": 0.66283655, "num_input_tokens_seen": 6049445, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.23535156, "step": 289, "time_per_iteration": 3.1736297607421875 }, { "auxiliary_loss_clip": 0.01067776, "auxiliary_loss_mlp": 0.01038898, "balance_loss_clip": 1.01436472, "balance_loss_mlp": 1.0318439, "epoch": 0.01743574327371111, "flos": 15224435331840.0, "grad_norm": 2.077931203850872, "language_loss": 1.00649023, "learning_rate": 3.6505652124687957e-06, "loss": 1.02755702, "num_input_tokens_seen": 6064150, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.359375, "step": 290, "time_per_iteration": 2.555510997772217 }, { "auxiliary_loss_clip": 0.01070476, "auxiliary_loss_mlp": 0.01056412, "balance_loss_clip": 1.03283286, "balance_loss_mlp": 1.03435135, "epoch": 0.017495866526379078, "flos": 25372708980480.0, "grad_norm": 1.5804229723503513, "language_loss": 0.9709962, "learning_rate": 3.6527815732925258e-06, "loss": 0.99226511, "num_input_tokens_seen": 6083920, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36132812, "step": 291, "time_per_iteration": 2.5116424560546875 }, { "auxiliary_loss_clip": 0.0107268, "auxiliary_loss_mlp": 0.01057612, "balance_loss_clip": 1.0335201, "balance_loss_mlp": 1.03733194, "epoch": 0.017555989779047047, "flos": 26358272150400.0, "grad_norm": 1.5396855663765308, "language_loss": 0.81635725, "learning_rate": 3.6549903308051806e-06, "loss": 0.83766013, "num_input_tokens_seen": 6105460, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.35351562, "step": 292, "time_per_iteration": 2.5562639236450195 }, { "auxiliary_loss_clip": 0.0106819, "auxiliary_loss_mlp": 0.01052813, "balance_loss_clip": 1.02935243, "balance_loss_mlp": 1.03485465, "epoch": 0.017616113031715015, "flos": 22337919025920.0, "grad_norm": 1.9071288259160022, "language_loss": 0.98447442, "learning_rate": 3.6571915369953646e-06, "loss": 1.00568449, "num_input_tokens_seen": 6122890, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.33203125, "step": 293, "time_per_iteration": 2.485985040664673 }, { "auxiliary_loss_clip": 0.01065642, "auxiliary_loss_mlp": 0.01044677, "balance_loss_clip": 1.02115679, "balance_loss_mlp": 1.03222585, "epoch": 0.017676236284382984, "flos": 20155881962880.0, "grad_norm": 1.5493453494418794, "language_loss": 0.94352484, "learning_rate": 3.6593852433202797e-06, "loss": 0.9646281, "num_input_tokens_seen": 6142890, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.33398438, "step": 294, "time_per_iteration": 6.391979694366455 }, { "auxiliary_loss_clip": 0.01065311, "auxiliary_loss_mlp": 0.01033254, "balance_loss_clip": 1.00950718, "balance_loss_mlp": 1.03034544, "epoch": 0.017736359537050956, "flos": 25222303376640.0, "grad_norm": 1.4990014546844366, "language_loss": 0.93051851, "learning_rate": 3.6615715007129453e-06, "loss": 0.95150423, "num_input_tokens_seen": 6162030, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.34960938, "step": 295, "time_per_iteration": 5.337981939315796 }, { "auxiliary_loss_clip": 0.01066937, "auxiliary_loss_mlp": 0.01037031, "balance_loss_clip": 1.01364255, "balance_loss_mlp": 1.03156435, "epoch": 0.017796482789718925, "flos": 20337779479680.0, "grad_norm": 1.8545510366592748, "language_loss": 0.93144822, "learning_rate": 3.6637503595892897e-06, "loss": 0.95248789, "num_input_tokens_seen": 6180540, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35351562, "step": 296, "time_per_iteration": 2.4316158294677734 }, { "auxiliary_loss_clip": 0.01065236, "auxiliary_loss_mlp": 0.01036072, "balance_loss_clip": 1.01369596, "balance_loss_mlp": 1.03088462, "epoch": 0.017856606042386893, "flos": 22378208601600.0, "grad_norm": 1.7150698571905323, "language_loss": 0.98072588, "learning_rate": 3.665921869855132e-06, "loss": 1.00173903, "num_input_tokens_seen": 6199425, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34375, "step": 297, "time_per_iteration": 2.4523074626922607 }, { "auxiliary_loss_clip": 0.01065319, "auxiliary_loss_mlp": 0.01039131, "balance_loss_clip": 1.01766133, "balance_loss_mlp": 1.03130651, "epoch": 0.017916729295054862, "flos": 20229024994560.0, "grad_norm": 1.6845333048798552, "language_loss": 1.01746178, "learning_rate": 3.6680860809130346e-06, "loss": 1.03850627, "num_input_tokens_seen": 6219170, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.33984375, "step": 298, "time_per_iteration": 2.435473680496216 }, { "auxiliary_loss_clip": 0.01066698, "auxiliary_loss_mlp": 0.01039627, "balance_loss_clip": 1.01650035, "balance_loss_mlp": 1.03078902, "epoch": 0.01797685254772283, "flos": 19389957356160.0, "grad_norm": 1.5144232823295136, "language_loss": 0.97376871, "learning_rate": 3.6702430416690516e-06, "loss": 0.99483192, "num_input_tokens_seen": 6237930, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.359375, "step": 299, "time_per_iteration": 2.472338914871216 }, { "auxiliary_loss_clip": 0.01066046, "auxiliary_loss_mlp": 0.01042092, "balance_loss_clip": 1.01832175, "balance_loss_mlp": 1.02983689, "epoch": 0.018036975800390802, "flos": 24424851945600.0, "grad_norm": 2.0186355357643553, "language_loss": 0.81916797, "learning_rate": 3.672392800539357e-06, "loss": 0.84024936, "num_input_tokens_seen": 6257170, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36132812, "step": 300, "time_per_iteration": 2.487420082092285 }, { "auxiliary_loss_clip": 0.01063066, "auxiliary_loss_mlp": 0.01035699, "balance_loss_clip": 1.01330018, "balance_loss_mlp": 1.02848518, "epoch": 0.01809709905305877, "flos": 15778017849600.0, "grad_norm": 1.770811728359909, "language_loss": 1.01069212, "learning_rate": 3.6745354054567686e-06, "loss": 1.03167975, "num_input_tokens_seen": 6274780, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34570312, "step": 301, "time_per_iteration": 2.457226276397705 }, { "auxiliary_loss_clip": 0.01048458, "auxiliary_loss_mlp": 0.01025203, "balance_loss_clip": 1.01442671, "balance_loss_mlp": 1.02741838, "epoch": 0.01815722230572674, "flos": 67344592055040.0, "grad_norm": 0.8756461282280742, "language_loss": 0.62486458, "learning_rate": 3.676670903877158e-06, "loss": 0.64560127, "num_input_tokens_seen": 6340435, "router_z_loss_clip": 0.10791016, "router_z_loss_mlp": 0.2109375, "step": 302, "time_per_iteration": 3.1609270572662354 }, { "auxiliary_loss_clip": 0.01069446, "auxiliary_loss_mlp": 0.01071462, "balance_loss_clip": 1.04638028, "balance_loss_mlp": 1.0350486, "epoch": 0.01821734555839471, "flos": 15484747495680.0, "grad_norm": 1.8035649950172024, "language_loss": 1.02879417, "learning_rate": 3.6787993427857567e-06, "loss": 1.05020332, "num_input_tokens_seen": 6358160, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.34375, "step": 303, "time_per_iteration": 2.5072736740112305 }, { "auxiliary_loss_clip": 0.01076853, "auxiliary_loss_mlp": 0.01140822, "balance_loss_clip": 1.11328518, "balance_loss_mlp": 1.04200816, "epoch": 0.018277468811062677, "flos": 24096284340480.0, "grad_norm": 1.6162251255892142, "language_loss": 0.9114241, "learning_rate": 3.680920768703364e-06, "loss": 0.93360078, "num_input_tokens_seen": 6378485, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.34765625, "step": 304, "time_per_iteration": 2.4957633018493652 }, { "auxiliary_loss_clip": 0.01071762, "auxiliary_loss_mlp": 0.01118964, "balance_loss_clip": 1.09332228, "balance_loss_mlp": 1.03888655, "epoch": 0.01833759206373065, "flos": 20958290807040.0, "grad_norm": 1.4730770474972743, "language_loss": 0.88069034, "learning_rate": 3.6830352276924415e-06, "loss": 0.90259761, "num_input_tokens_seen": 6397845, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.328125, "step": 305, "time_per_iteration": 2.488272190093994 }, { "auxiliary_loss_clip": 0.01060205, "auxiliary_loss_mlp": 0.01064406, "balance_loss_clip": 1.04186368, "balance_loss_mlp": 1.02768803, "epoch": 0.018397715316398618, "flos": 19389747888000.0, "grad_norm": 1.6983109741610847, "language_loss": 0.99082673, "learning_rate": 3.685142765363119e-06, "loss": 1.0120728, "num_input_tokens_seen": 6416475, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.32421875, "step": 306, "time_per_iteration": 2.557779550552368 }, { "auxiliary_loss_clip": 0.01052935, "auxiliary_loss_mlp": 0.0104256, "balance_loss_clip": 1.02005291, "balance_loss_mlp": 1.02125943, "epoch": 0.018457838569066586, "flos": 29131248752640.0, "grad_norm": 1.6367791771873348, "language_loss": 0.97924662, "learning_rate": 3.687243426879095e-06, "loss": 1.00020158, "num_input_tokens_seen": 6437520, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.31640625, "step": 307, "time_per_iteration": 2.5334136486053467 }, { "auxiliary_loss_clip": 0.01053412, "auxiliary_loss_mlp": 0.0103482, "balance_loss_clip": 1.01318312, "balance_loss_mlp": 1.02171671, "epoch": 0.018517961821734555, "flos": 19207640903040.0, "grad_norm": 1.7612655894679548, "language_loss": 0.86022341, "learning_rate": 3.6893372569634466e-06, "loss": 0.88110566, "num_input_tokens_seen": 6455680, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.31640625, "step": 308, "time_per_iteration": 2.5304481983184814 }, { "auxiliary_loss_clip": 0.01061589, "auxiliary_loss_mlp": 0.01038722, "balance_loss_clip": 1.01480842, "balance_loss_mlp": 1.02706218, "epoch": 0.018578085074402523, "flos": 19862053027200.0, "grad_norm": 1.6695985682296173, "language_loss": 0.97483426, "learning_rate": 3.6914242999043395e-06, "loss": 0.99583733, "num_input_tokens_seen": 6474880, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.34375, "step": 309, "time_per_iteration": 2.467846632003784 }, { "auxiliary_loss_clip": 0.01061935, "auxiliary_loss_mlp": 0.0104664, "balance_loss_clip": 1.0232271, "balance_loss_mlp": 1.02861905, "epoch": 0.018638208327070496, "flos": 29605648573440.0, "grad_norm": 1.8144101277553621, "language_loss": 0.88109088, "learning_rate": 3.69350459956065e-06, "loss": 0.90217662, "num_input_tokens_seen": 6495945, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.33398438, "step": 310, "time_per_iteration": 2.5191166400909424 }, { "auxiliary_loss_clip": 0.01060532, "auxiliary_loss_mlp": 0.01039677, "balance_loss_clip": 1.01604962, "balance_loss_mlp": 1.02774298, "epoch": 0.018698331579738464, "flos": 45729866131200.0, "grad_norm": 1.5115441118037374, "language_loss": 0.8324126, "learning_rate": 3.695578199367497e-06, "loss": 0.85341471, "num_input_tokens_seen": 6519930, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.328125, "step": 311, "time_per_iteration": 2.698251485824585 }, { "auxiliary_loss_clip": 0.01056954, "auxiliary_loss_mlp": 0.01035712, "balance_loss_clip": 1.01408768, "balance_loss_mlp": 1.02504182, "epoch": 0.018758454832406433, "flos": 20482669088640.0, "grad_norm": 1.9394295833107467, "language_loss": 1.02466917, "learning_rate": 3.6976451423416825e-06, "loss": 1.04559577, "num_input_tokens_seen": 6535070, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.31835938, "step": 312, "time_per_iteration": 2.4265899658203125 }, { "auxiliary_loss_clip": 0.01053329, "auxiliary_loss_mlp": 0.01031988, "balance_loss_clip": 1.01061344, "balance_loss_mlp": 1.02112341, "epoch": 0.0188185780850744, "flos": 15776900686080.0, "grad_norm": 1.7456041971512823, "language_loss": 1.03895962, "learning_rate": 3.699705471087043e-06, "loss": 1.05981278, "num_input_tokens_seen": 6554135, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.32226562, "step": 313, "time_per_iteration": 2.451723098754883 }, { "auxiliary_loss_clip": 0.01053627, "auxiliary_loss_mlp": 0.01035547, "balance_loss_clip": 1.0125401, "balance_loss_mlp": 1.02035475, "epoch": 0.018878701337742373, "flos": 22454633301120.0, "grad_norm": 1.9561360397861012, "language_loss": 0.9421671, "learning_rate": 3.7017592277997256e-06, "loss": 0.96305895, "num_input_tokens_seen": 6572275, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.33203125, "step": 314, "time_per_iteration": 2.46235728263855 }, { "auxiliary_loss_clip": 0.01047211, "auxiliary_loss_mlp": 0.01028138, "balance_loss_clip": 1.00823021, "balance_loss_mlp": 1.01513386, "epoch": 0.018938824590410342, "flos": 30992189241600.0, "grad_norm": 2.2802747700380714, "language_loss": 1.04170799, "learning_rate": 3.7038064542733654e-06, "loss": 1.0624615, "num_input_tokens_seen": 6594520, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.3203125, "step": 315, "time_per_iteration": 2.525439500808716 }, { "auxiliary_loss_clip": 0.01048698, "auxiliary_loss_mlp": 0.01037689, "balance_loss_clip": 1.01680362, "balance_loss_mlp": 1.01567197, "epoch": 0.01899894784307831, "flos": 23257775283840.0, "grad_norm": 1.5642686849792493, "language_loss": 0.91748989, "learning_rate": 3.7058471919041945e-06, "loss": 0.93835378, "num_input_tokens_seen": 6614245, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.33007812, "step": 316, "time_per_iteration": 2.4626078605651855 }, { "auxiliary_loss_clip": 0.01048181, "auxiliary_loss_mlp": 0.01030851, "balance_loss_clip": 1.0103116, "balance_loss_mlp": 1.0150373, "epoch": 0.01905907109574628, "flos": 17456921176320.0, "grad_norm": 2.001937900836701, "language_loss": 0.9683249, "learning_rate": 3.7078814816960605e-06, "loss": 0.9891153, "num_input_tokens_seen": 6632015, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.33203125, "step": 317, "time_per_iteration": 2.449324607849121 }, { "auxiliary_loss_clip": 0.01050135, "auxiliary_loss_mlp": 0.01032794, "balance_loss_clip": 1.01026332, "balance_loss_mlp": 1.01637101, "epoch": 0.019119194348414248, "flos": 14969499517440.0, "grad_norm": 1.9113955023332994, "language_loss": 1.05841649, "learning_rate": 3.709909364265374e-06, "loss": 1.07924581, "num_input_tokens_seen": 6649015, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.3359375, "step": 318, "time_per_iteration": 2.4522011280059814 }, { "auxiliary_loss_clip": 0.01051392, "auxiliary_loss_mlp": 0.01032271, "balance_loss_clip": 1.0105865, "balance_loss_mlp": 1.01729941, "epoch": 0.01917931760108222, "flos": 25481672933760.0, "grad_norm": 1.92243857292623, "language_loss": 1.05108058, "learning_rate": 3.7119308798459706e-06, "loss": 1.07191718, "num_input_tokens_seen": 6669225, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34179688, "step": 319, "time_per_iteration": 2.521923542022705 }, { "auxiliary_loss_clip": 0.01042419, "auxiliary_loss_mlp": 0.01038305, "balance_loss_clip": 1.02857721, "balance_loss_mlp": 1.01906168, "epoch": 0.01923944085375019, "flos": 71553722100480.0, "grad_norm": 0.9766901159740592, "language_loss": 0.60036325, "learning_rate": 3.7139460682939026e-06, "loss": 0.62117052, "num_input_tokens_seen": 6725775, "router_z_loss_clip": 0.09716797, "router_z_loss_mlp": 0.234375, "step": 320, "time_per_iteration": 2.970048666000366 }, { "auxiliary_loss_clip": 0.01051188, "auxiliary_loss_mlp": 0.01036281, "balance_loss_clip": 1.01553869, "balance_loss_mlp": 1.01721931, "epoch": 0.019299564106418157, "flos": 19681482142080.0, "grad_norm": 1.7650813310083497, "language_loss": 1.06671751, "learning_rate": 3.715954969092154e-06, "loss": 1.08759224, "num_input_tokens_seen": 6744170, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.33984375, "step": 321, "time_per_iteration": 2.4633588790893555 }, { "auxiliary_loss_clip": 0.01051129, "auxiliary_loss_mlp": 0.01049862, "balance_loss_clip": 1.0271647, "balance_loss_mlp": 1.01753688, "epoch": 0.019359687359086126, "flos": 24386063558400.0, "grad_norm": 1.7235707672074083, "language_loss": 0.96146882, "learning_rate": 3.7179576213552805e-06, "loss": 0.98247874, "num_input_tokens_seen": 6764565, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3359375, "step": 322, "time_per_iteration": 2.4831669330596924 }, { "auxiliary_loss_clip": 0.01050158, "auxiliary_loss_mlp": 0.01037326, "balance_loss_clip": 1.01613092, "balance_loss_mlp": 1.01716709, "epoch": 0.019419810611754094, "flos": 23950242656640.0, "grad_norm": 1.6842862244031123, "language_loss": 0.87644994, "learning_rate": 3.719954063833981e-06, "loss": 0.8973248, "num_input_tokens_seen": 6785310, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.33007812, "step": 323, "time_per_iteration": 2.5170605182647705 }, { "auxiliary_loss_clip": 0.01049933, "auxiliary_loss_mlp": 0.01033123, "balance_loss_clip": 1.01094961, "balance_loss_mlp": 1.01710343, "epoch": 0.019479933864422067, "flos": 22159233354240.0, "grad_norm": 1.5670535378546833, "language_loss": 1.01539159, "learning_rate": 3.721944334919596e-06, "loss": 1.03622198, "num_input_tokens_seen": 6803290, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.328125, "step": 324, "time_per_iteration": 2.469804048538208 }, { "auxiliary_loss_clip": 0.0105158, "auxiliary_loss_mlp": 0.01035542, "balance_loss_clip": 1.01360798, "balance_loss_mlp": 1.01898265, "epoch": 0.019540057117090035, "flos": 22235727876480.0, "grad_norm": 1.7785446031554615, "language_loss": 0.83181769, "learning_rate": 3.7239284726485375e-06, "loss": 0.85268891, "num_input_tokens_seen": 6822570, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.32617188, "step": 325, "time_per_iteration": 2.6482508182525635 }, { "auxiliary_loss_clip": 0.01052508, "auxiliary_loss_mlp": 0.01037493, "balance_loss_clip": 1.01610637, "balance_loss_mlp": 1.01879108, "epoch": 0.019600180369758004, "flos": 23075633387520.0, "grad_norm": 1.4824096814011942, "language_loss": 0.86647522, "learning_rate": 3.72590651470665e-06, "loss": 0.88737524, "num_input_tokens_seen": 6841910, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.3359375, "step": 326, "time_per_iteration": 2.453738212585449 }, { "auxiliary_loss_clip": 0.01054779, "auxiliary_loss_mlp": 0.01042616, "balance_loss_clip": 1.0175581, "balance_loss_mlp": 1.02065158, "epoch": 0.019660303622425972, "flos": 25409681976960.0, "grad_norm": 1.7600522765780668, "language_loss": 0.89101493, "learning_rate": 3.727878498433505e-06, "loss": 0.91198885, "num_input_tokens_seen": 6862480, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.34179688, "step": 327, "time_per_iteration": 2.4836325645446777 }, { "auxiliary_loss_clip": 0.01051374, "auxiliary_loss_mlp": 0.01038975, "balance_loss_clip": 1.01429844, "balance_loss_mlp": 1.01822317, "epoch": 0.01972042687509394, "flos": 23656448632320.0, "grad_norm": 1.8354560326831015, "language_loss": 0.89268327, "learning_rate": 3.7298444608266328e-06, "loss": 0.91358674, "num_input_tokens_seen": 6882015, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.33203125, "step": 328, "time_per_iteration": 2.4601492881774902 }, { "auxiliary_loss_clip": 0.01050994, "auxiliary_loss_mlp": 0.01039396, "balance_loss_clip": 1.016186, "balance_loss_mlp": 1.01748824, "epoch": 0.019780550127761913, "flos": 18222496669440.0, "grad_norm": 2.019168521692278, "language_loss": 1.14488649, "learning_rate": 3.731804438545683e-06, "loss": 1.16579032, "num_input_tokens_seen": 6899785, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.33398438, "step": 329, "time_per_iteration": 2.4218249320983887 }, { "auxiliary_loss_clip": 0.01052977, "auxiliary_loss_mlp": 0.01035849, "balance_loss_clip": 1.0122577, "balance_loss_mlp": 1.01951194, "epoch": 0.01984067338042988, "flos": 22417695216000.0, "grad_norm": 1.9570761831401466, "language_loss": 0.86860716, "learning_rate": 3.7337584679165324e-06, "loss": 0.88949549, "num_input_tokens_seen": 6918575, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.33398438, "step": 330, "time_per_iteration": 2.474691152572632 }, { "auxiliary_loss_clip": 0.01055725, "auxiliary_loss_mlp": 0.01035959, "balance_loss_clip": 1.01241541, "balance_loss_mlp": 1.02247572, "epoch": 0.01990079663309785, "flos": 17054267932800.0, "grad_norm": 1.935387035738896, "language_loss": 1.11344814, "learning_rate": 3.7357065849353186e-06, "loss": 1.13436496, "num_input_tokens_seen": 6936965, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.33203125, "step": 331, "time_per_iteration": 2.433915376663208 }, { "auxiliary_loss_clip": 0.01054813, "auxiliary_loss_mlp": 0.01043352, "balance_loss_clip": 1.02033257, "balance_loss_mlp": 1.02226162, "epoch": 0.01996091988576582, "flos": 15960857973120.0, "grad_norm": 1.617532465745625, "language_loss": 1.03551733, "learning_rate": 3.737648825272422e-06, "loss": 1.056499, "num_input_tokens_seen": 6953475, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.32617188, "step": 332, "time_per_iteration": 2.4041998386383057 }, { "auxiliary_loss_clip": 0.01057276, "auxiliary_loss_mlp": 0.01049653, "balance_loss_clip": 1.02571607, "balance_loss_mlp": 1.02257323, "epoch": 0.02002104313843379, "flos": 23585330459520.0, "grad_norm": 2.0657487740342004, "language_loss": 0.93461829, "learning_rate": 3.739585224276384e-06, "loss": 0.95568752, "num_input_tokens_seen": 6971630, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.34765625, "step": 333, "time_per_iteration": 5.939366340637207 }, { "auxiliary_loss_clip": 0.01059113, "auxiliary_loss_mlp": 0.01064763, "balance_loss_clip": 1.03975272, "balance_loss_mlp": 1.02462626, "epoch": 0.02008116639110176, "flos": 34093454158080.0, "grad_norm": 1.6296505044350587, "language_loss": 0.93251032, "learning_rate": 3.7415158169777673e-06, "loss": 0.953749, "num_input_tokens_seen": 6992775, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.34570312, "step": 334, "time_per_iteration": 4.023910760879517 }, { "auxiliary_loss_clip": 0.01058272, "auxiliary_loss_mlp": 0.01047056, "balance_loss_clip": 1.0222609, "balance_loss_mlp": 1.02456069, "epoch": 0.020141289643769728, "flos": 19682669128320.0, "grad_norm": 1.5500544188091645, "language_loss": 0.92126071, "learning_rate": 3.7434406380929575e-06, "loss": 0.94231397, "num_input_tokens_seen": 7011425, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3359375, "step": 335, "time_per_iteration": 2.4827654361724854 }, { "auxiliary_loss_clip": 0.01057233, "auxiliary_loss_mlp": 0.01041972, "balance_loss_clip": 1.0189054, "balance_loss_mlp": 1.02497244, "epoch": 0.020201412896437697, "flos": 20739525027840.0, "grad_norm": 1.9435041870189311, "language_loss": 1.04189467, "learning_rate": 3.745359722027911e-06, "loss": 1.06288683, "num_input_tokens_seen": 7029450, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.32226562, "step": 336, "time_per_iteration": 2.4678378105163574 }, { "auxiliary_loss_clip": 0.01057538, "auxiliary_loss_mlp": 0.01035613, "balance_loss_clip": 1.00890994, "balance_loss_mlp": 1.02422345, "epoch": 0.020261536149105665, "flos": 20265474320640.0, "grad_norm": 1.5308863184395802, "language_loss": 0.96066093, "learning_rate": 3.7472731028818428e-06, "loss": 0.98159242, "num_input_tokens_seen": 7047555, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.33203125, "step": 337, "time_per_iteration": 2.4676594734191895 }, { "auxiliary_loss_clip": 0.01052949, "auxiliary_loss_mlp": 0.01037087, "balance_loss_clip": 1.01258981, "balance_loss_mlp": 1.02134871, "epoch": 0.020321659401773638, "flos": 25847562648960.0, "grad_norm": 1.2770784875378334, "language_loss": 0.95205188, "learning_rate": 3.7491808144508626e-06, "loss": 0.97295225, "num_input_tokens_seen": 7068185, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.31640625, "step": 338, "time_per_iteration": 2.5320656299591064 }, { "auxiliary_loss_clip": 0.01051566, "auxiliary_loss_mlp": 0.01040676, "balance_loss_clip": 1.01685834, "balance_loss_mlp": 1.02052879, "epoch": 0.020381782654441606, "flos": 17494033818240.0, "grad_norm": 1.6374697253380042, "language_loss": 0.95828885, "learning_rate": 3.7510828902315576e-06, "loss": 0.97921127, "num_input_tokens_seen": 7085955, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.30859375, "step": 339, "time_per_iteration": 2.4545087814331055 }, { "auxiliary_loss_clip": 0.01054079, "auxiliary_loss_mlp": 0.01033526, "balance_loss_clip": 1.01008928, "balance_loss_mlp": 1.02278709, "epoch": 0.020441905907109575, "flos": 24242779872000.0, "grad_norm": 1.5248951722341324, "language_loss": 0.97586143, "learning_rate": 3.75297936342452e-06, "loss": 0.99673748, "num_input_tokens_seen": 7106345, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3125, "step": 340, "time_per_iteration": 2.4710581302642822 }, { "auxiliary_loss_clip": 0.01055147, "auxiliary_loss_mlp": 0.01037996, "balance_loss_clip": 1.01520276, "balance_loss_mlp": 1.02346206, "epoch": 0.020502029159777543, "flos": 22232306563200.0, "grad_norm": 1.6479682549609678, "language_loss": 0.97359681, "learning_rate": 3.7548702669378253e-06, "loss": 0.99452817, "num_input_tokens_seen": 7125070, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.31640625, "step": 341, "time_per_iteration": 2.513909339904785 }, { "auxiliary_loss_clip": 0.01053362, "auxiliary_loss_mlp": 0.01042291, "balance_loss_clip": 1.01914048, "balance_loss_mlp": 1.02214515, "epoch": 0.020562152412445512, "flos": 23986726894080.0, "grad_norm": 2.442168980302781, "language_loss": 0.96863955, "learning_rate": 3.756755633390458e-06, "loss": 0.98959607, "num_input_tokens_seen": 7144675, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.3125, "step": 342, "time_per_iteration": 2.480635166168213 }, { "auxiliary_loss_clip": 0.01053921, "auxiliary_loss_mlp": 0.01032949, "balance_loss_clip": 1.01089549, "balance_loss_mlp": 1.0227809, "epoch": 0.020622275665113484, "flos": 26974210089600.0, "grad_norm": 1.4513318263038222, "language_loss": 0.97555053, "learning_rate": 3.7586354951156886e-06, "loss": 0.99641925, "num_input_tokens_seen": 7165505, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3125, "step": 343, "time_per_iteration": 2.5186679363250732 }, { "auxiliary_loss_clip": 0.01052964, "auxiliary_loss_mlp": 0.01033084, "balance_loss_clip": 1.01142383, "balance_loss_mlp": 1.02210093, "epoch": 0.020682398917781453, "flos": 22599627644160.0, "grad_norm": 1.5596747561495, "language_loss": 0.87155473, "learning_rate": 3.7605098841644e-06, "loss": 0.89241517, "num_input_tokens_seen": 7184605, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.30859375, "step": 344, "time_per_iteration": 2.4981014728546143 }, { "auxiliary_loss_clip": 0.01052181, "auxiliary_loss_mlp": 0.0103953, "balance_loss_clip": 1.01758397, "balance_loss_mlp": 1.02081704, "epoch": 0.02074252217044942, "flos": 15012686736000.0, "grad_norm": 1.4032766726695793, "language_loss": 0.88269889, "learning_rate": 3.7623788323083666e-06, "loss": 0.90361595, "num_input_tokens_seen": 7203065, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.31445312, "step": 345, "time_per_iteration": 2.511490821838379 }, { "auxiliary_loss_clip": 0.01050155, "auxiliary_loss_mlp": 0.01043267, "balance_loss_clip": 1.0206883, "balance_loss_mlp": 1.0178318, "epoch": 0.02080264542311739, "flos": 25336783324800.0, "grad_norm": 1.7472236291153327, "language_loss": 0.97384208, "learning_rate": 3.7642423710434837e-06, "loss": 0.99477637, "num_input_tokens_seen": 7222995, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.32421875, "step": 346, "time_per_iteration": 2.5483906269073486 }, { "auxiliary_loss_clip": 0.01046587, "auxiliary_loss_mlp": 0.01040461, "balance_loss_clip": 1.01759708, "balance_loss_mlp": 1.01515579, "epoch": 0.02086276867578536, "flos": 24387669480960.0, "grad_norm": 1.6794710914870128, "language_loss": 0.91511512, "learning_rate": 3.7661005315929563e-06, "loss": 0.93598562, "num_input_tokens_seen": 7244625, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.31445312, "step": 347, "time_per_iteration": 2.651024341583252 }, { "auxiliary_loss_clip": 0.01045041, "auxiliary_loss_mlp": 0.01037937, "balance_loss_clip": 1.01579976, "balance_loss_mlp": 1.0138284, "epoch": 0.02092289192845333, "flos": 24461056892160.0, "grad_norm": 1.5253953557080513, "language_loss": 0.82996881, "learning_rate": 3.7679533449104354e-06, "loss": 0.85079861, "num_input_tokens_seen": 7263255, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.3125, "step": 348, "time_per_iteration": 2.568885564804077 }, { "auxiliary_loss_clip": 0.01046997, "auxiliary_loss_mlp": 0.01029582, "balance_loss_clip": 1.00860095, "balance_loss_mlp": 1.01523268, "epoch": 0.0209830151811213, "flos": 17450392752000.0, "grad_norm": 1.8800966145467317, "language_loss": 0.92315567, "learning_rate": 3.7698008416831116e-06, "loss": 0.94392145, "num_input_tokens_seen": 7279275, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.31835938, "step": 349, "time_per_iteration": 2.4211666584014893 }, { "auxiliary_loss_clip": 0.01052959, "auxiliary_loss_mlp": 0.01052669, "balance_loss_clip": 1.0276711, "balance_loss_mlp": 1.01985884, "epoch": 0.021043138433789268, "flos": 24572778842880.0, "grad_norm": 1.6042996410551307, "language_loss": 0.92770702, "learning_rate": 3.7716430523347664e-06, "loss": 0.94876331, "num_input_tokens_seen": 7300180, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.33203125, "step": 350, "time_per_iteration": 2.514585018157959 }, { "auxiliary_loss_clip": 0.01056169, "auxiliary_loss_mlp": 0.01056151, "balance_loss_clip": 1.03041339, "balance_loss_mlp": 1.02121615, "epoch": 0.021103261686457236, "flos": 24453132013440.0, "grad_norm": 1.7415094709691268, "language_loss": 0.89059722, "learning_rate": 3.773480007028776e-06, "loss": 0.91172045, "num_input_tokens_seen": 7317430, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.34960938, "step": 351, "time_per_iteration": 2.4895401000976562 }, { "auxiliary_loss_clip": 0.01055467, "auxiliary_loss_mlp": 0.01055435, "balance_loss_clip": 1.03000808, "balance_loss_mlp": 1.02140582, "epoch": 0.021163384939125205, "flos": 14682233917440.0, "grad_norm": 1.6683731560860307, "language_loss": 0.96391118, "learning_rate": 3.775311735671078e-06, "loss": 0.98502004, "num_input_tokens_seen": 7334875, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.33984375, "step": 352, "time_per_iteration": 2.4473307132720947 }, { "auxiliary_loss_clip": 0.01054252, "auxiliary_loss_mlp": 0.01038237, "balance_loss_clip": 1.01323867, "balance_loss_mlp": 1.02056229, "epoch": 0.021223508191793177, "flos": 24492199691520.0, "grad_norm": 1.7019599123993663, "language_loss": 0.91113013, "learning_rate": 3.7771382679130878e-06, "loss": 0.932055, "num_input_tokens_seen": 7355185, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3359375, "step": 353, "time_per_iteration": 2.553689956665039 }, { "auxiliary_loss_clip": 0.01053495, "auxiliary_loss_mlp": 0.01040308, "balance_loss_clip": 1.01581073, "balance_loss_mlp": 1.02077854, "epoch": 0.021283631444461146, "flos": 24126030685440.0, "grad_norm": 1.7864862226037206, "language_loss": 0.88779557, "learning_rate": 3.7789596331545845e-06, "loss": 0.90873361, "num_input_tokens_seen": 7374425, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.328125, "step": 354, "time_per_iteration": 2.511162757873535 }, { "auxiliary_loss_clip": 0.01056654, "auxiliary_loss_mlp": 0.0103833, "balance_loss_clip": 1.01117396, "balance_loss_mlp": 1.0234791, "epoch": 0.021343754697129114, "flos": 25191055843200.0, "grad_norm": 1.751334386513014, "language_loss": 0.91461623, "learning_rate": 3.780775860546545e-06, "loss": 0.93556607, "num_input_tokens_seen": 7394175, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.33203125, "step": 355, "time_per_iteration": 2.551647901535034 }, { "auxiliary_loss_clip": 0.01055938, "auxiliary_loss_mlp": 0.01037525, "balance_loss_clip": 1.01176405, "balance_loss_mlp": 1.02295291, "epoch": 0.021403877949797083, "flos": 17273243180160.0, "grad_norm": 1.8561163947937789, "language_loss": 1.03026938, "learning_rate": 3.7825869789939474e-06, "loss": 1.05120397, "num_input_tokens_seen": 7412645, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.33007812, "step": 356, "time_per_iteration": 2.4424355030059814 }, { "auxiliary_loss_clip": 0.01051904, "auxiliary_loss_mlp": 0.01040378, "balance_loss_clip": 1.01509404, "balance_loss_mlp": 1.02103949, "epoch": 0.021464001202465055, "flos": 30916183478400.0, "grad_norm": 1.6393247965911175, "language_loss": 0.90655422, "learning_rate": 3.784393017158528e-06, "loss": 0.927477, "num_input_tokens_seen": 7432275, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.30859375, "step": 357, "time_per_iteration": 2.550257444381714 }, { "auxiliary_loss_clip": 0.01049646, "auxiliary_loss_mlp": 0.01031571, "balance_loss_clip": 1.00806332, "balance_loss_mlp": 1.01878524, "epoch": 0.021524124455133024, "flos": 18185418938880.0, "grad_norm": 2.033949581243725, "language_loss": 0.89859986, "learning_rate": 3.786194003461506e-06, "loss": 0.91941202, "num_input_tokens_seen": 7450245, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.30859375, "step": 358, "time_per_iteration": 2.420276641845703 }, { "auxiliary_loss_clip": 0.01052544, "auxiliary_loss_mlp": 0.01035153, "balance_loss_clip": 1.0091536, "balance_loss_mlp": 1.01939023, "epoch": 0.021584247707800992, "flos": 13805006296320.0, "grad_norm": 1.7411050690070378, "language_loss": 1.01038039, "learning_rate": 3.787989966086264e-06, "loss": 1.03125739, "num_input_tokens_seen": 7466845, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.33203125, "step": 359, "time_per_iteration": 2.4263339042663574 }, { "auxiliary_loss_clip": 0.0105698, "auxiliary_loss_mlp": 0.0103848, "balance_loss_clip": 1.0122776, "balance_loss_mlp": 1.02343392, "epoch": 0.02164437096046896, "flos": 23293596205440.0, "grad_norm": 1.975109024806782, "language_loss": 0.95856643, "learning_rate": 3.789780932980997e-06, "loss": 0.97952104, "num_input_tokens_seen": 7485450, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3359375, "step": 360, "time_per_iteration": 2.4664599895477295 }, { "auxiliary_loss_clip": 0.01040569, "auxiliary_loss_mlp": 0.01018223, "balance_loss_clip": 1.00878167, "balance_loss_mlp": 1.02093506, "epoch": 0.02170449421313693, "flos": 68896237875840.0, "grad_norm": 0.8994916100683901, "language_loss": 0.65260315, "learning_rate": 3.79156693186132e-06, "loss": 0.67319101, "num_input_tokens_seen": 7553780, "router_z_loss_clip": 0.09423828, "router_z_loss_mlp": 0.19628906, "step": 361, "time_per_iteration": 3.174626588821411 }, { "auxiliary_loss_clip": 0.01062255, "auxiliary_loss_mlp": 0.01083123, "balance_loss_clip": 1.05673039, "balance_loss_mlp": 1.02799392, "epoch": 0.0217646174658049, "flos": 25227365523840.0, "grad_norm": 2.1069719654627637, "language_loss": 0.9514755, "learning_rate": 3.7933479902128433e-06, "loss": 0.9729293, "num_input_tokens_seen": 7574155, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.34179688, "step": 362, "time_per_iteration": 2.489434003829956 }, { "auxiliary_loss_clip": 0.01065318, "auxiliary_loss_mlp": 0.01135035, "balance_loss_clip": 1.10979843, "balance_loss_mlp": 1.02964151, "epoch": 0.02182474071847287, "flos": 22892025214080.0, "grad_norm": 1.7524061382011709, "language_loss": 1.01685953, "learning_rate": 3.7951241352937077e-06, "loss": 1.03886318, "num_input_tokens_seen": 7592320, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.35546875, "step": 363, "time_per_iteration": 2.4795219898223877 }, { "auxiliary_loss_clip": 0.01058261, "auxiliary_loss_mlp": 0.01096419, "balance_loss_clip": 1.07308948, "balance_loss_mlp": 1.0255233, "epoch": 0.02188486397114084, "flos": 23657879998080.0, "grad_norm": 1.7047753876417933, "language_loss": 1.00673032, "learning_rate": 3.7968953941370915e-06, "loss": 1.02827704, "num_input_tokens_seen": 7611185, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.328125, "step": 364, "time_per_iteration": 2.5590672492980957 }, { "auxiliary_loss_clip": 0.01055543, "auxiliary_loss_mlp": 0.01062395, "balance_loss_clip": 1.04016221, "balance_loss_mlp": 1.02388453, "epoch": 0.021944987223808807, "flos": 21542562276480.0, "grad_norm": 1.7183442361974162, "language_loss": 0.93013781, "learning_rate": 3.798661793553676e-06, "loss": 0.95131719, "num_input_tokens_seen": 7631970, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.31640625, "step": 365, "time_per_iteration": 2.543931007385254 }, { "auxiliary_loss_clip": 0.01061743, "auxiliary_loss_mlp": 0.01031775, "balance_loss_clip": 1.00805211, "balance_loss_mlp": 1.02955437, "epoch": 0.022005110476476776, "flos": 16069961571840.0, "grad_norm": 1.4965838347684615, "language_loss": 0.91469157, "learning_rate": 3.8004233601340808e-06, "loss": 0.93562675, "num_input_tokens_seen": 7649745, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.32226562, "step": 366, "time_per_iteration": 2.461550712585449 }, { "auxiliary_loss_clip": 0.0107226, "auxiliary_loss_mlp": 0.01053499, "balance_loss_clip": 1.02729654, "balance_loss_mlp": 1.03793919, "epoch": 0.022065233729144748, "flos": 21432655716480.0, "grad_norm": 1.7436805288872164, "language_loss": 1.00496304, "learning_rate": 3.8021801202512694e-06, "loss": 1.02622056, "num_input_tokens_seen": 7668830, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.34375, "step": 367, "time_per_iteration": 2.482743740081787 }, { "auxiliary_loss_clip": 0.0107694, "auxiliary_loss_mlp": 0.01082567, "balance_loss_clip": 1.05421877, "balance_loss_mlp": 1.04025459, "epoch": 0.022125356981812717, "flos": 21542632099200.0, "grad_norm": 1.5872538070346651, "language_loss": 0.96469468, "learning_rate": 3.803932100062912e-06, "loss": 0.98628974, "num_input_tokens_seen": 7687240, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.3671875, "step": 368, "time_per_iteration": 2.473029136657715 }, { "auxiliary_loss_clip": 0.01070991, "auxiliary_loss_mlp": 0.01094667, "balance_loss_clip": 1.06771421, "balance_loss_mlp": 1.03560328, "epoch": 0.022185480234480685, "flos": 20703110613120.0, "grad_norm": 2.188996590859147, "language_loss": 0.96622384, "learning_rate": 3.8056793255137264e-06, "loss": 0.98788047, "num_input_tokens_seen": 7704440, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.35351562, "step": 369, "time_per_iteration": 2.4747204780578613 }, { "auxiliary_loss_clip": 0.01067443, "auxiliary_loss_mlp": 0.01107009, "balance_loss_clip": 1.08313107, "balance_loss_mlp": 1.03387809, "epoch": 0.022245603487148654, "flos": 25191998449920.0, "grad_norm": 1.7452480599431255, "language_loss": 0.94708717, "learning_rate": 3.8074218223377844e-06, "loss": 0.96883172, "num_input_tokens_seen": 7727160, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.3359375, "step": 370, "time_per_iteration": 2.5162887573242188 }, { "auxiliary_loss_clip": 0.01065255, "auxiliary_loss_mlp": 0.01076542, "balance_loss_clip": 1.05291522, "balance_loss_mlp": 1.0340147, "epoch": 0.022305726739816623, "flos": 21394914670080.0, "grad_norm": 1.4209812785347278, "language_loss": 0.89684153, "learning_rate": 3.8091596160607834e-06, "loss": 0.9182595, "num_input_tokens_seen": 7747730, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3125, "step": 371, "time_per_iteration": 2.4882850646972656 }, { "auxiliary_loss_clip": 0.01063969, "auxiliary_loss_mlp": 0.01051573, "balance_loss_clip": 1.02742112, "balance_loss_mlp": 1.03394675, "epoch": 0.022365849992484595, "flos": 22491047715840.0, "grad_norm": 1.993940242841274, "language_loss": 0.982355, "learning_rate": 3.8108927320022896e-06, "loss": 1.00351048, "num_input_tokens_seen": 7766765, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.30078125, "step": 372, "time_per_iteration": 2.459507703781128 }, { "auxiliary_loss_clip": 0.01057762, "auxiliary_loss_mlp": 0.01037863, "balance_loss_clip": 1.01113582, "balance_loss_mlp": 1.0273459, "epoch": 0.022425973245152563, "flos": 17855664347520.0, "grad_norm": 2.018832828161093, "language_loss": 0.94146943, "learning_rate": 3.8126211952779548e-06, "loss": 0.96242565, "num_input_tokens_seen": 7784010, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.3046875, "step": 373, "time_per_iteration": 7.351617336273193 }, { "auxiliary_loss_clip": 0.01059714, "auxiliary_loss_mlp": 0.01083468, "balance_loss_clip": 1.05523908, "balance_loss_mlp": 1.02653718, "epoch": 0.022486096497820532, "flos": 15482233877760.0, "grad_norm": 1.9791609312613758, "language_loss": 0.9598487, "learning_rate": 3.8143450308016952e-06, "loss": 0.98128051, "num_input_tokens_seen": 7801305, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.33203125, "step": 374, "time_per_iteration": 3.8822717666625977 }, { "auxiliary_loss_clip": 0.01057041, "auxiliary_loss_mlp": 0.01137261, "balance_loss_clip": 1.10824502, "balance_loss_mlp": 1.02234483, "epoch": 0.0225462197504885, "flos": 27782868067200.0, "grad_norm": 1.5808787138599067, "language_loss": 0.92508572, "learning_rate": 3.8160642632878525e-06, "loss": 0.94702882, "num_input_tokens_seen": 7823965, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.34765625, "step": 375, "time_per_iteration": 2.5588536262512207 }, { "auxiliary_loss_clip": 0.01058581, "auxiliary_loss_mlp": 0.01095103, "balance_loss_clip": 1.07077229, "balance_loss_mlp": 1.02340436, "epoch": 0.02260634300315647, "flos": 19974438293760.0, "grad_norm": 1.9567445915504975, "language_loss": 0.98526859, "learning_rate": 3.817778917253314e-06, "loss": 1.00680542, "num_input_tokens_seen": 7842115, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.3515625, "step": 376, "time_per_iteration": 2.4672186374664307 }, { "auxiliary_loss_clip": 0.01058407, "auxiliary_loss_mlp": 0.01062511, "balance_loss_clip": 1.03661895, "balance_loss_mlp": 1.0230149, "epoch": 0.02266646625582444, "flos": 16027437669120.0, "grad_norm": 2.7299721499892424, "language_loss": 0.93942362, "learning_rate": 3.8194890170196155e-06, "loss": 0.96063268, "num_input_tokens_seen": 7857830, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.35351562, "step": 377, "time_per_iteration": 2.4267687797546387 }, { "auxiliary_loss_clip": 0.01056123, "auxiliary_loss_mlp": 0.01033517, "balance_loss_clip": 1.01075971, "balance_loss_mlp": 1.02317297, "epoch": 0.02272658950849241, "flos": 20403800593920.0, "grad_norm": 1.7576562012774108, "language_loss": 1.07824039, "learning_rate": 3.8211945867150055e-06, "loss": 1.09913683, "num_input_tokens_seen": 7875840, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.33007812, "step": 378, "time_per_iteration": 2.473787307739258 }, { "auxiliary_loss_clip": 0.01042357, "auxiliary_loss_mlp": 0.01039295, "balance_loss_clip": 1.02851868, "balance_loss_mlp": 1.02332401, "epoch": 0.02278671276116038, "flos": 69843990176640.0, "grad_norm": 1.0172823467558303, "language_loss": 0.75652194, "learning_rate": 3.822895650276492e-06, "loss": 0.7773385, "num_input_tokens_seen": 7940190, "router_z_loss_clip": 0.10791016, "router_z_loss_mlp": 0.19042969, "step": 379, "time_per_iteration": 3.078988790512085 }, { "auxiliary_loss_clip": 0.01050903, "auxiliary_loss_mlp": 0.01095351, "balance_loss_clip": 1.06593013, "balance_loss_mlp": 1.01971483, "epoch": 0.022846836013828347, "flos": 38507243927040.0, "grad_norm": 1.8581431618441673, "language_loss": 0.92617601, "learning_rate": 3.824592231451859e-06, "loss": 0.94763857, "num_input_tokens_seen": 7960840, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.3125, "step": 380, "time_per_iteration": 2.626676559448242 }, { "auxiliary_loss_clip": 0.01049226, "auxiliary_loss_mlp": 0.01140949, "balance_loss_clip": 1.10895348, "balance_loss_mlp": 1.01942563, "epoch": 0.02290695926649632, "flos": 20958430452480.0, "grad_norm": 1.8444911428781403, "language_loss": 1.07154179, "learning_rate": 3.826284353801652e-06, "loss": 1.09344351, "num_input_tokens_seen": 7975500, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.296875, "step": 381, "time_per_iteration": 2.442504405975342 }, { "auxiliary_loss_clip": 0.01046808, "auxiliary_loss_mlp": 0.01159748, "balance_loss_clip": 1.12853909, "balance_loss_mlp": 1.0167253, "epoch": 0.022967082519164288, "flos": 24021325918080.0, "grad_norm": 1.8375417247472114, "language_loss": 0.98562652, "learning_rate": 3.827972040701142e-06, "loss": 1.0076921, "num_input_tokens_seen": 7993880, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.30078125, "step": 382, "time_per_iteration": 2.524433135986328 }, { "auxiliary_loss_clip": 0.01045008, "auxiliary_loss_mlp": 0.01154577, "balance_loss_clip": 1.12532306, "balance_loss_mlp": 1.01346707, "epoch": 0.023027205771832256, "flos": 20996066764800.0, "grad_norm": 1.5887092471825892, "language_loss": 0.96227187, "learning_rate": 3.829655315342268e-06, "loss": 0.98426771, "num_input_tokens_seen": 8012730, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.31445312, "step": 383, "time_per_iteration": 2.5006637573242188 }, { "auxiliary_loss_clip": 0.01049137, "auxiliary_loss_mlp": 0.01132752, "balance_loss_clip": 1.10856426, "balance_loss_mlp": 1.01547778, "epoch": 0.023087329024500225, "flos": 21359757064320.0, "grad_norm": 1.67865832263442, "language_loss": 0.96721143, "learning_rate": 3.831334200735543e-06, "loss": 0.98903036, "num_input_tokens_seen": 8031275, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3359375, "step": 384, "time_per_iteration": 2.4451935291290283 }, { "auxiliary_loss_clip": 0.01054972, "auxiliary_loss_mlp": 0.01067441, "balance_loss_clip": 1.04473174, "balance_loss_mlp": 1.02155185, "epoch": 0.023147452277168194, "flos": 21871339349760.0, "grad_norm": 1.5653887655972567, "language_loss": 0.96656799, "learning_rate": 3.8330087197119426e-06, "loss": 0.98779213, "num_input_tokens_seen": 8051600, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.33398438, "step": 385, "time_per_iteration": 2.4414944648742676 }, { "auxiliary_loss_clip": 0.01060448, "auxiliary_loss_mlp": 0.01038805, "balance_loss_clip": 1.01226926, "balance_loss_mlp": 1.02736878, "epoch": 0.023207575529836166, "flos": 18915697180800.0, "grad_norm": 1.530763170308701, "language_loss": 0.75778854, "learning_rate": 3.83467889492477e-06, "loss": 0.77878106, "num_input_tokens_seen": 8070600, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.33203125, "step": 386, "time_per_iteration": 2.4757237434387207 }, { "auxiliary_loss_clip": 0.01067537, "auxiliary_loss_mlp": 0.01124646, "balance_loss_clip": 1.09753752, "balance_loss_mlp": 1.03222609, "epoch": 0.023267698782504134, "flos": 25044839602560.0, "grad_norm": 1.578774471492193, "language_loss": 0.95899045, "learning_rate": 3.836344748851495e-06, "loss": 0.98091227, "num_input_tokens_seen": 8090680, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.35351562, "step": 387, "time_per_iteration": 2.5410001277923584 }, { "auxiliary_loss_clip": 0.0107026, "auxiliary_loss_mlp": 0.01244227, "balance_loss_clip": 1.21561658, "balance_loss_mlp": 1.033319, "epoch": 0.023327822035172103, "flos": 28877883949440.0, "grad_norm": 1.6360884572049952, "language_loss": 0.94518, "learning_rate": 3.838006303795566e-06, "loss": 0.9683249, "num_input_tokens_seen": 8114610, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.36914062, "step": 388, "time_per_iteration": 2.583221197128296 }, { "auxiliary_loss_clip": 0.01061667, "auxiliary_loss_mlp": 0.01245961, "balance_loss_clip": 1.21696925, "balance_loss_mlp": 1.02636814, "epoch": 0.02338794528784007, "flos": 27120426330240.0, "grad_norm": 1.9083147035832377, "language_loss": 1.06098771, "learning_rate": 3.839663581888206e-06, "loss": 1.08406401, "num_input_tokens_seen": 8133975, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.35351562, "step": 389, "time_per_iteration": 2.4842751026153564 }, { "auxiliary_loss_clip": 0.01054832, "auxiliary_loss_mlp": 0.01149739, "balance_loss_clip": 1.12221324, "balance_loss_mlp": 1.02061367, "epoch": 0.02344806854050804, "flos": 21321352702080.0, "grad_norm": 1.719351910528869, "language_loss": 0.97008938, "learning_rate": 3.841316605090178e-06, "loss": 0.99213517, "num_input_tokens_seen": 8153570, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.34375, "step": 390, "time_per_iteration": 2.4864470958709717 }, { "auxiliary_loss_clip": 0.0105106, "auxiliary_loss_mlp": 0.01061157, "balance_loss_clip": 1.03557444, "balance_loss_mlp": 1.01914644, "epoch": 0.023508191793176012, "flos": 24788856447360.0, "grad_norm": 1.939306325924463, "language_loss": 1.04287207, "learning_rate": 3.842965395193529e-06, "loss": 1.06399429, "num_input_tokens_seen": 8170075, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.31835938, "step": 391, "time_per_iteration": 2.491482973098755 }, { "auxiliary_loss_clip": 0.01058708, "auxiliary_loss_mlp": 0.01034715, "balance_loss_clip": 1.01037192, "balance_loss_mlp": 1.02537847, "epoch": 0.02356831504584398, "flos": 25994162914560.0, "grad_norm": 1.6564997196821902, "language_loss": 0.97874367, "learning_rate": 3.84460997382332e-06, "loss": 0.9996779, "num_input_tokens_seen": 8190420, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.33398438, "step": 392, "time_per_iteration": 2.5364246368408203 }, { "auxiliary_loss_clip": 0.01071197, "auxiliary_loss_mlp": 0.01052446, "balance_loss_clip": 1.0235858, "balance_loss_mlp": 1.03618598, "epoch": 0.02362843829851195, "flos": 19061459573760.0, "grad_norm": 1.677648731787316, "language_loss": 0.97780597, "learning_rate": 3.8462503624393256e-06, "loss": 0.99904239, "num_input_tokens_seen": 8208790, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.34960938, "step": 393, "time_per_iteration": 2.453315258026123 }, { "auxiliary_loss_clip": 0.01086634, "auxiliary_loss_mlp": 0.01083953, "balance_loss_clip": 1.05202913, "balance_loss_mlp": 1.04925573, "epoch": 0.023688561551179918, "flos": 16070101217280.0, "grad_norm": 1.504603378010179, "language_loss": 0.89725667, "learning_rate": 3.84788658233771e-06, "loss": 0.9189626, "num_input_tokens_seen": 8226885, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.375, "step": 394, "time_per_iteration": 2.4793944358825684 }, { "auxiliary_loss_clip": 0.01083467, "auxiliary_loss_mlp": 0.01116137, "balance_loss_clip": 1.08147073, "balance_loss_mlp": 1.04631972, "epoch": 0.023748684803847887, "flos": 21723342629760.0, "grad_norm": 1.5473601816260492, "language_loss": 0.93547249, "learning_rate": 3.84951865465269e-06, "loss": 0.95746851, "num_input_tokens_seen": 8246825, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.37109375, "step": 395, "time_per_iteration": 2.499711751937866 }, { "auxiliary_loss_clip": 0.01068875, "auxiliary_loss_mlp": 0.01130705, "balance_loss_clip": 1.11344326, "balance_loss_mlp": 1.04772222, "epoch": 0.02380880805651586, "flos": 61923175136640.0, "grad_norm": 1.034173276045498, "language_loss": 0.64108658, "learning_rate": 3.851146600358172e-06, "loss": 0.66308236, "num_input_tokens_seen": 8302835, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.2109375, "step": 396, "time_per_iteration": 2.9469611644744873 }, { "auxiliary_loss_clip": 0.01062368, "auxiliary_loss_mlp": 0.01176483, "balance_loss_clip": 1.1380024, "balance_loss_mlp": 1.02662802, "epoch": 0.023868931309183827, "flos": 20265299763840.0, "grad_norm": 1.9732415497440556, "language_loss": 0.99557865, "learning_rate": 3.852770440269372e-06, "loss": 1.01796722, "num_input_tokens_seen": 8320745, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.35742188, "step": 397, "time_per_iteration": 2.4573190212249756 }, { "auxiliary_loss_clip": 0.01056196, "auxiliary_loss_mlp": 0.01191142, "balance_loss_clip": 1.15587997, "balance_loss_mlp": 1.02041304, "epoch": 0.023929054561851796, "flos": 21138128553600.0, "grad_norm": 1.7634831749766737, "language_loss": 1.00372171, "learning_rate": 3.854390195044404e-06, "loss": 1.02619505, "num_input_tokens_seen": 8339540, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.35742188, "step": 398, "time_per_iteration": 2.5018601417541504 }, { "auxiliary_loss_clip": 0.01055112, "auxiliary_loss_mlp": 0.01157312, "balance_loss_clip": 1.12543523, "balance_loss_mlp": 1.02073753, "epoch": 0.023989177814519765, "flos": 13697683176960.0, "grad_norm": 2.095343063557732, "language_loss": 1.07544744, "learning_rate": 3.856005885185868e-06, "loss": 1.09757161, "num_input_tokens_seen": 8354890, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.34375, "step": 399, "time_per_iteration": 2.452336072921753 }, { "auxiliary_loss_clip": 0.01061664, "auxiliary_loss_mlp": 0.01128832, "balance_loss_clip": 1.10282016, "balance_loss_mlp": 1.02799106, "epoch": 0.024049301067187733, "flos": 26320845306240.0, "grad_norm": 1.71131071693994, "language_loss": 0.93634069, "learning_rate": 3.857617531042398e-06, "loss": 0.95824564, "num_input_tokens_seen": 8375845, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3359375, "step": 400, "time_per_iteration": 2.5170485973358154 }, { "auxiliary_loss_clip": 0.01077872, "auxiliary_loss_mlp": 0.010685, "balance_loss_clip": 1.04061723, "balance_loss_mlp": 1.04331744, "epoch": 0.024109424319855705, "flos": 24424293363840.0, "grad_norm": 1.5299244848717946, "language_loss": 0.88781524, "learning_rate": 3.8592251528102065e-06, "loss": 0.90927899, "num_input_tokens_seen": 8395240, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.34570312, "step": 401, "time_per_iteration": 2.51882004737854 }, { "auxiliary_loss_clip": 0.01105542, "auxiliary_loss_mlp": 0.01036072, "balance_loss_clip": 1.00915468, "balance_loss_mlp": 1.0690577, "epoch": 0.024169547572523674, "flos": 29603169866880.0, "grad_norm": 1.6525525823408749, "language_loss": 0.90059721, "learning_rate": 3.8608287705345976e-06, "loss": 0.9220134, "num_input_tokens_seen": 8416950, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.36523438, "step": 402, "time_per_iteration": 2.570519208908081 }, { "auxiliary_loss_clip": 0.01135296, "auxiliary_loss_mlp": 0.01137525, "balance_loss_clip": 1.10328794, "balance_loss_mlp": 1.09048748, "epoch": 0.024229670825191642, "flos": 22600360782720.0, "grad_norm": 2.1212160614934357, "language_loss": 1.08712173, "learning_rate": 3.86242840411147e-06, "loss": 1.10984993, "num_input_tokens_seen": 8433660, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.44726562, "step": 403, "time_per_iteration": 2.5086300373077393 }, { "auxiliary_loss_clip": 0.01152256, "auxiliary_loss_mlp": 0.01310818, "balance_loss_clip": 1.27186012, "balance_loss_mlp": 1.09858251, "epoch": 0.02428979407785961, "flos": 18149283815040.0, "grad_norm": 2.012260642036847, "language_loss": 1.10839891, "learning_rate": 3.864024073288798e-06, "loss": 1.1330297, "num_input_tokens_seen": 8450180, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.53515625, "step": 404, "time_per_iteration": 2.5056493282318115 }, { "auxiliary_loss_clip": 0.01141174, "auxiliary_loss_mlp": 0.01313721, "balance_loss_clip": 1.26961398, "balance_loss_mlp": 1.08737814, "epoch": 0.024349917330527583, "flos": 15304071876480.0, "grad_norm": 1.7520327690697088, "language_loss": 0.99499238, "learning_rate": 3.865615797668091e-06, "loss": 1.01954138, "num_input_tokens_seen": 8467775, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.5390625, "step": 405, "time_per_iteration": 2.4998738765716553 }, { "auxiliary_loss_clip": 0.01106758, "auxiliary_loss_mlp": 0.01188182, "balance_loss_clip": 1.14018893, "balance_loss_mlp": 1.05776453, "epoch": 0.024410040583195552, "flos": 20772937065600.0, "grad_norm": 1.7700789791114835, "language_loss": 1.04371715, "learning_rate": 3.867203596705844e-06, "loss": 1.06666672, "num_input_tokens_seen": 8486765, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.49023438, "step": 406, "time_per_iteration": 2.4956769943237305 }, { "auxiliary_loss_clip": 0.0108177, "auxiliary_loss_mlp": 0.01118397, "balance_loss_clip": 1.07083273, "balance_loss_mlp": 1.03623927, "epoch": 0.02447016383586352, "flos": 21797777381760.0, "grad_norm": 1.7172087592256737, "language_loss": 0.97895312, "learning_rate": 3.86878748971496e-06, "loss": 1.00095487, "num_input_tokens_seen": 8506515, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.45507812, "step": 407, "time_per_iteration": 2.485947370529175 }, { "auxiliary_loss_clip": 0.01070793, "auxiliary_loss_mlp": 0.01068357, "balance_loss_clip": 1.02053034, "balance_loss_mlp": 1.02987337, "epoch": 0.02453028708853149, "flos": 33946714247040.0, "grad_norm": 1.4417177831272252, "language_loss": 0.82290316, "learning_rate": 3.8703674958661596e-06, "loss": 0.84429467, "num_input_tokens_seen": 8528035, "router_z_loss_clip": 0.47851562, "router_z_loss_mlp": 0.40820312, "step": 408, "time_per_iteration": 2.583381414413452 }, { "auxiliary_loss_clip": 0.01074591, "auxiliary_loss_mlp": 0.01082551, "balance_loss_clip": 1.03734636, "balance_loss_mlp": 1.03423309, "epoch": 0.024590410341199458, "flos": 21792086830080.0, "grad_norm": 2.234857793951266, "language_loss": 1.06532502, "learning_rate": 3.871943634189376e-06, "loss": 1.08689642, "num_input_tokens_seen": 8546455, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.40429688, "step": 409, "time_per_iteration": 2.458040952682495 }, { "auxiliary_loss_clip": 0.010873, "auxiliary_loss_mlp": 0.01085392, "balance_loss_clip": 1.03890085, "balance_loss_mlp": 1.04669189, "epoch": 0.02465053359386743, "flos": 35113371972480.0, "grad_norm": 1.782492068289133, "language_loss": 0.93966281, "learning_rate": 3.873515923575128e-06, "loss": 0.96138972, "num_input_tokens_seen": 8568450, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 0.40625, "step": 410, "time_per_iteration": 2.591573476791382 }, { "auxiliary_loss_clip": 0.01097663, "auxiliary_loss_mlp": 0.01091273, "balance_loss_clip": 1.04718947, "balance_loss_mlp": 1.0561378, "epoch": 0.0247106568465354, "flos": 27450250744320.0, "grad_norm": 2.0100138812826267, "language_loss": 0.91941869, "learning_rate": 3.875084382775879e-06, "loss": 0.94130802, "num_input_tokens_seen": 8589340, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.4140625, "step": 411, "time_per_iteration": 2.520829916000366 }, { "auxiliary_loss_clip": 0.01102946, "auxiliary_loss_mlp": 0.01104003, "balance_loss_clip": 1.05677199, "balance_loss_mlp": 1.06026542, "epoch": 0.024770780099203367, "flos": 20702761499520.0, "grad_norm": 1.7488256910988416, "language_loss": 0.98558384, "learning_rate": 3.87664903040738e-06, "loss": 1.00765324, "num_input_tokens_seen": 8607150, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.42578125, "step": 412, "time_per_iteration": 4.009392499923706 }, { "auxiliary_loss_clip": 0.01067275, "auxiliary_loss_mlp": 0.01123078, "balance_loss_clip": 1.09828234, "balance_loss_mlp": 1.04154396, "epoch": 0.024830903351871336, "flos": 69548625141120.0, "grad_norm": 0.964304275836295, "language_loss": 0.58828115, "learning_rate": 3.878209884949994e-06, "loss": 0.61018467, "num_input_tokens_seen": 8669865, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.2578125, "step": 413, "time_per_iteration": 4.518808841705322 }, { "auxiliary_loss_clip": 0.01080489, "auxiliary_loss_mlp": 0.01352252, "balance_loss_clip": 1.29920411, "balance_loss_mlp": 1.0407443, "epoch": 0.024891026604539304, "flos": 32269102640640.0, "grad_norm": 1.630355473054429, "language_loss": 0.88304049, "learning_rate": 3.879766964750006e-06, "loss": 0.90736794, "num_input_tokens_seen": 8690235, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.3984375, "step": 414, "time_per_iteration": 3.9202213287353516 }, { "auxiliary_loss_clip": 0.01061057, "auxiliary_loss_mlp": 0.01217555, "balance_loss_clip": 1.17027617, "balance_loss_mlp": 1.02325416, "epoch": 0.024951149857207276, "flos": 18839377215360.0, "grad_norm": 1.7733515973750797, "language_loss": 0.91122979, "learning_rate": 3.881320288020917e-06, "loss": 0.93401593, "num_input_tokens_seen": 8706295, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.37890625, "step": 415, "time_per_iteration": 2.4326889514923096 }, { "auxiliary_loss_clip": 0.01069004, "auxiliary_loss_mlp": 0.01109241, "balance_loss_clip": 1.06813788, "balance_loss_mlp": 1.02867365, "epoch": 0.025011273109875245, "flos": 15376307212800.0, "grad_norm": 1.9580423386031647, "language_loss": 1.17924917, "learning_rate": 3.882869872844723e-06, "loss": 1.20103168, "num_input_tokens_seen": 8724200, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 0.40234375, "step": 416, "time_per_iteration": 2.460941791534424 }, { "auxiliary_loss_clip": 0.01079348, "auxiliary_loss_mlp": 0.01226701, "balance_loss_clip": 1.18521583, "balance_loss_mlp": 1.03785563, "epoch": 0.025071396362543213, "flos": 18914545105920.0, "grad_norm": 1.4849245568933516, "language_loss": 0.86919314, "learning_rate": 3.884415737173176e-06, "loss": 0.89225364, "num_input_tokens_seen": 8744170, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.4140625, "step": 417, "time_per_iteration": 2.480829954147339 }, { "auxiliary_loss_clip": 0.01092358, "auxiliary_loss_mlp": 0.01151558, "balance_loss_clip": 1.11403108, "balance_loss_mlp": 1.04928017, "epoch": 0.025131519615211182, "flos": 25336783324800.0, "grad_norm": 1.5055364611404225, "language_loss": 0.8663283, "learning_rate": 3.8859578988290344e-06, "loss": 0.88876748, "num_input_tokens_seen": 8765120, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.4296875, "step": 418, "time_per_iteration": 2.5229387283325195 }, { "auxiliary_loss_clip": 0.0110301, "auxiliary_loss_mlp": 0.01208428, "balance_loss_clip": 1.17230773, "balance_loss_mlp": 1.05734265, "epoch": 0.02519164286787915, "flos": 18952146506880.0, "grad_norm": 2.0411450314123916, "language_loss": 1.01694596, "learning_rate": 3.887496375507294e-06, "loss": 1.0400604, "num_input_tokens_seen": 8783500, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.45703125, "step": 419, "time_per_iteration": 2.4592461585998535 }, { "auxiliary_loss_clip": 0.01102145, "auxiliary_loss_mlp": 0.01289402, "balance_loss_clip": 1.24755907, "balance_loss_mlp": 1.05516112, "epoch": 0.025251766120547123, "flos": 17420122736640.0, "grad_norm": 1.5802325254694738, "language_loss": 0.84811753, "learning_rate": 3.8890311847764065e-06, "loss": 0.872033, "num_input_tokens_seen": 8801175, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.47070312, "step": 420, "time_per_iteration": 2.489466905593872 }, { "auxiliary_loss_clip": 0.01095744, "auxiliary_loss_mlp": 0.01359643, "balance_loss_clip": 1.31355679, "balance_loss_mlp": 1.04864407, "epoch": 0.02531188937321509, "flos": 25044281020800.0, "grad_norm": 1.5752372439744768, "language_loss": 0.88626528, "learning_rate": 3.890562344079484e-06, "loss": 0.91081917, "num_input_tokens_seen": 8820215, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.47070312, "step": 421, "time_per_iteration": 2.5057857036590576 }, { "auxiliary_loss_clip": 0.0107962, "auxiliary_loss_mlp": 0.0130183, "balance_loss_clip": 1.25331223, "balance_loss_mlp": 1.03527308, "epoch": 0.02537201262588306, "flos": 30590897541120.0, "grad_norm": 1.7501331531352018, "language_loss": 0.97287649, "learning_rate": 3.89208987073549e-06, "loss": 0.99669105, "num_input_tokens_seen": 8839660, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.44335938, "step": 422, "time_per_iteration": 2.546980857849121 }, { "auxiliary_loss_clip": 0.01074358, "auxiliary_loss_mlp": 0.01267467, "balance_loss_clip": 1.21546841, "balance_loss_mlp": 1.03082883, "epoch": 0.02543213587855103, "flos": 26064233746560.0, "grad_norm": 1.4963160418317423, "language_loss": 0.9285996, "learning_rate": 3.893613781940409e-06, "loss": 0.95201784, "num_input_tokens_seen": 8859280, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.43554688, "step": 423, "time_per_iteration": 2.4973714351654053 }, { "auxiliary_loss_clip": 0.010686, "auxiliary_loss_mlp": 0.01198454, "balance_loss_clip": 1.14442837, "balance_loss_mlp": 1.02503848, "epoch": 0.025492259131218997, "flos": 36021498013440.0, "grad_norm": 1.5120011326170943, "language_loss": 0.84542441, "learning_rate": 3.895134094768415e-06, "loss": 0.86809498, "num_input_tokens_seen": 8880560, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.43554688, "step": 424, "time_per_iteration": 2.6108956336975098 }, { "auxiliary_loss_clip": 0.01075044, "auxiliary_loss_mlp": 0.01102674, "balance_loss_clip": 1.04340315, "balance_loss_mlp": 1.02645218, "epoch": 0.02555238238388697, "flos": 18587059752960.0, "grad_norm": 1.747182477324269, "language_loss": 0.97283477, "learning_rate": 3.896650826173015e-06, "loss": 0.99461192, "num_input_tokens_seen": 8899155, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.484375, "step": 425, "time_per_iteration": 2.442579984664917 }, { "auxiliary_loss_clip": 0.01078244, "auxiliary_loss_mlp": 0.0110228, "balance_loss_clip": 1.05273676, "balance_loss_mlp": 1.02673221, "epoch": 0.025612505636554938, "flos": 24242046733440.0, "grad_norm": 1.858634003781625, "language_loss": 0.98814982, "learning_rate": 3.898163992988186e-06, "loss": 1.00995517, "num_input_tokens_seen": 8917890, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.515625, "step": 426, "time_per_iteration": 2.491123914718628 }, { "auxiliary_loss_clip": 0.01035619, "auxiliary_loss_mlp": 0.01247714, "balance_loss_clip": 1.22921276, "balance_loss_mlp": 1.01196575, "epoch": 0.025672628889222907, "flos": 60583661936640.0, "grad_norm": 0.9953989383190229, "language_loss": 0.57471186, "learning_rate": 3.899673611929491e-06, "loss": 0.59754521, "num_input_tokens_seen": 8978260, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.23632812, "step": 427, "time_per_iteration": 3.14373517036438 }, { "auxiliary_loss_clip": 0.01093529, "auxiliary_loss_mlp": 0.01514068, "balance_loss_clip": 1.43658161, "balance_loss_mlp": 1.0367291, "epoch": 0.025732752141890875, "flos": 19572238897920.0, "grad_norm": 1.9609025011286223, "language_loss": 1.01531613, "learning_rate": 3.901179699595194e-06, "loss": 1.04139209, "num_input_tokens_seen": 8994460, "router_z_loss_clip": 0.7734375, "router_z_loss_mlp": 0.5703125, "step": 428, "time_per_iteration": 2.4653875827789307 }, { "auxiliary_loss_clip": 0.01102117, "auxiliary_loss_mlp": 0.01536129, "balance_loss_clip": 1.45220518, "balance_loss_mlp": 1.04303503, "epoch": 0.025792875394558847, "flos": 31282945977600.0, "grad_norm": 1.5666351018907405, "language_loss": 0.92477018, "learning_rate": 3.902682272467353e-06, "loss": 0.95115268, "num_input_tokens_seen": 9016670, "router_z_loss_clip": 0.83984375, "router_z_loss_mlp": 0.58984375, "step": 429, "time_per_iteration": 2.5686259269714355 }, { "auxiliary_loss_clip": 0.01109063, "auxiliary_loss_mlp": 0.01175055, "balance_loss_clip": 1.11597538, "balance_loss_mlp": 1.05489016, "epoch": 0.025852998647226816, "flos": 32378241150720.0, "grad_norm": 1.7194285060258234, "language_loss": 0.96532792, "learning_rate": 3.904181346912895e-06, "loss": 0.98816907, "num_input_tokens_seen": 9039720, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 0.54296875, "step": 430, "time_per_iteration": 2.5670180320739746 }, { "auxiliary_loss_clip": 0.01104334, "auxiliary_loss_mlp": 0.01212343, "balance_loss_clip": 1.16098726, "balance_loss_mlp": 1.0569948, "epoch": 0.025913121899894784, "flos": 20192261466240.0, "grad_norm": 1.4125027672327777, "language_loss": 0.92967927, "learning_rate": 3.905676939184698e-06, "loss": 0.95284599, "num_input_tokens_seen": 9059850, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.47265625, "step": 431, "time_per_iteration": 2.48945951461792 }, { "auxiliary_loss_clip": 0.01096897, "auxiliary_loss_mlp": 0.01415654, "balance_loss_clip": 1.35256827, "balance_loss_mlp": 1.05194187, "epoch": 0.025973245152562753, "flos": 14719556027520.0, "grad_norm": 1.9494570201501953, "language_loss": 1.02119184, "learning_rate": 3.907169065422638e-06, "loss": 1.04631734, "num_input_tokens_seen": 9077590, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.44921875, "step": 432, "time_per_iteration": 2.441380500793457 }, { "auxiliary_loss_clip": 0.01081184, "auxiliary_loss_mlp": 0.01464531, "balance_loss_clip": 1.4023037, "balance_loss_mlp": 1.04050517, "epoch": 0.02603336840523072, "flos": 30991665571200.0, "grad_norm": 1.7110813320340912, "language_loss": 0.88180423, "learning_rate": 3.908657741654636e-06, "loss": 0.90726137, "num_input_tokens_seen": 9099880, "router_z_loss_clip": 0.62109375, "router_z_loss_mlp": 0.40625, "step": 433, "time_per_iteration": 2.611130714416504 }, { "auxiliary_loss_clip": 0.01067657, "auxiliary_loss_mlp": 0.01241945, "balance_loss_clip": 1.19564462, "balance_loss_mlp": 1.02843809, "epoch": 0.026093491657898694, "flos": 17673347894400.0, "grad_norm": 1.8078536597896537, "language_loss": 1.00427544, "learning_rate": 3.910142983797699e-06, "loss": 1.02737153, "num_input_tokens_seen": 9118620, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 0.39257812, "step": 434, "time_per_iteration": 2.4487080574035645 }, { "auxiliary_loss_clip": 0.01068704, "auxiliary_loss_mlp": 0.01114978, "balance_loss_clip": 1.07153773, "balance_loss_mlp": 1.02613163, "epoch": 0.026153614910566662, "flos": 17856921156480.0, "grad_norm": 1.7393823453049984, "language_loss": 0.94274235, "learning_rate": 3.9116248076589305e-06, "loss": 0.96457911, "num_input_tokens_seen": 9135655, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.42578125, "step": 435, "time_per_iteration": 2.4638381004333496 }, { "auxiliary_loss_clip": 0.01074687, "auxiliary_loss_mlp": 0.01286979, "balance_loss_clip": 1.23042583, "balance_loss_mlp": 1.03001726, "epoch": 0.02621373816323463, "flos": 20010084658560.0, "grad_norm": 1.855561554180425, "language_loss": 1.00319719, "learning_rate": 3.913103228936546e-06, "loss": 1.02681375, "num_input_tokens_seen": 9153520, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.44726562, "step": 436, "time_per_iteration": 2.464911460876465 }, { "auxiliary_loss_clip": 0.01081567, "auxiliary_loss_mlp": 0.01325425, "balance_loss_clip": 1.26515269, "balance_loss_mlp": 1.03569412, "epoch": 0.0262738614159026, "flos": 19280190441600.0, "grad_norm": 1.889354115675155, "language_loss": 0.90361893, "learning_rate": 3.914578263220868e-06, "loss": 0.9276889, "num_input_tokens_seen": 9170750, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.45703125, "step": 437, "time_per_iteration": 2.528285026550293 }, { "auxiliary_loss_clip": 0.01092816, "auxiliary_loss_mlp": 0.01185407, "balance_loss_clip": 1.12151098, "balance_loss_mlp": 1.04545319, "epoch": 0.026333984668570568, "flos": 18806209557120.0, "grad_norm": 1.9311327244759642, "language_loss": 1.02307796, "learning_rate": 3.916049925995316e-06, "loss": 1.04586017, "num_input_tokens_seen": 9188430, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 0.47265625, "step": 438, "time_per_iteration": 2.4848694801330566 }, { "auxiliary_loss_clip": 0.01061256, "auxiliary_loss_mlp": 0.01229617, "balance_loss_clip": 1.18956256, "balance_loss_mlp": 1.03786755, "epoch": 0.02639410792123854, "flos": 64568403607680.0, "grad_norm": 0.9687376022112427, "language_loss": 0.62851357, "learning_rate": 3.917518232637377e-06, "loss": 0.65142226, "num_input_tokens_seen": 9255835, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 0.234375, "step": 439, "time_per_iteration": 3.152158737182617 }, { "auxiliary_loss_clip": 0.01111209, "auxiliary_loss_mlp": 0.01143553, "balance_loss_clip": 1.07560325, "balance_loss_mlp": 1.05496895, "epoch": 0.02645423117390651, "flos": 28472263240320.0, "grad_norm": 1.643052309564428, "language_loss": 0.87152088, "learning_rate": 3.918983198419573e-06, "loss": 0.89406848, "num_input_tokens_seen": 9276835, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 0.5625, "step": 440, "time_per_iteration": 2.5506815910339355 }, { "auxiliary_loss_clip": 0.01111826, "auxiliary_loss_mlp": 0.01180838, "balance_loss_clip": 1.11045647, "balance_loss_mlp": 1.05053329, "epoch": 0.026514354426574478, "flos": 18550261313280.0, "grad_norm": 1.583446230047119, "language_loss": 0.93723452, "learning_rate": 3.920444838510415e-06, "loss": 0.96016109, "num_input_tokens_seen": 9295075, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 0.61328125, "step": 441, "time_per_iteration": 2.4710886478424072 }, { "auxiliary_loss_clip": 0.01105112, "auxiliary_loss_mlp": 0.01166443, "balance_loss_clip": 1.10264266, "balance_loss_mlp": 1.04616296, "epoch": 0.026574477679242446, "flos": 20666766021120.0, "grad_norm": 1.5838662080562382, "language_loss": 0.89557087, "learning_rate": 3.92190316797534e-06, "loss": 0.91828644, "num_input_tokens_seen": 9314205, "router_z_loss_clip": 0.63671875, "router_z_loss_mlp": 0.58984375, "step": 442, "time_per_iteration": 2.4895575046539307 }, { "auxiliary_loss_clip": 0.0104997, "auxiliary_loss_mlp": 0.01034661, "balance_loss_clip": 1.01291728, "balance_loss_mlp": 1.02347493, "epoch": 0.026634600931910415, "flos": 57953026414080.0, "grad_norm": 0.9732443204481646, "language_loss": 0.64726448, "learning_rate": 3.92335820177765e-06, "loss": 0.66811079, "num_input_tokens_seen": 9367395, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.265625, "step": 443, "time_per_iteration": 2.9266908168792725 }, { "auxiliary_loss_clip": 0.01090088, "auxiliary_loss_mlp": 0.01119116, "balance_loss_clip": 1.05560172, "balance_loss_mlp": 1.03570509, "epoch": 0.026694724184578387, "flos": 15814222796160.0, "grad_norm": 1.7843176004192642, "language_loss": 0.99075246, "learning_rate": 3.924809954779425e-06, "loss": 1.01284444, "num_input_tokens_seen": 9385185, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.54296875, "step": 444, "time_per_iteration": 2.464545249938965 }, { "auxiliary_loss_clip": 0.0108764, "auxiliary_loss_mlp": 0.01135656, "balance_loss_clip": 1.07166433, "balance_loss_mlp": 1.03387642, "epoch": 0.026754847437246355, "flos": 23439149130240.0, "grad_norm": 1.7650013699669371, "language_loss": 1.05818057, "learning_rate": 3.9262584417424425e-06, "loss": 1.08041358, "num_input_tokens_seen": 9403225, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 0.5390625, "step": 445, "time_per_iteration": 2.5354983806610107 }, { "auxiliary_loss_clip": 0.01077301, "auxiliary_loss_mlp": 0.01121267, "balance_loss_clip": 1.06914878, "balance_loss_mlp": 1.02889597, "epoch": 0.026814970689914324, "flos": 17341009862400.0, "grad_norm": 1.7453518220001314, "language_loss": 1.06053925, "learning_rate": 3.9277036773290725e-06, "loss": 1.08252501, "num_input_tokens_seen": 9420540, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.484375, "step": 446, "time_per_iteration": 2.4397237300872803 }, { "auxiliary_loss_clip": 0.01073684, "auxiliary_loss_mlp": 0.01084602, "balance_loss_clip": 1.03250742, "balance_loss_mlp": 1.02750671, "epoch": 0.026875093942582293, "flos": 17893754507520.0, "grad_norm": 1.6624574624385358, "language_loss": 0.90394521, "learning_rate": 3.92914567610317e-06, "loss": 0.92552805, "num_input_tokens_seen": 9438840, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.4609375, "step": 447, "time_per_iteration": 2.467102527618408 }, { "auxiliary_loss_clip": 0.01071468, "auxiliary_loss_mlp": 0.01069429, "balance_loss_clip": 1.01895559, "balance_loss_mlp": 1.02658105, "epoch": 0.026935217195250265, "flos": 21722958604800.0, "grad_norm": 1.6072632850483282, "language_loss": 0.97541744, "learning_rate": 3.930584452530952e-06, "loss": 0.99682641, "num_input_tokens_seen": 9457215, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.44921875, "step": 448, "time_per_iteration": 2.45749568939209 }, { "auxiliary_loss_clip": 0.01073408, "auxiliary_loss_mlp": 0.01077867, "balance_loss_clip": 1.02865767, "balance_loss_mlp": 1.02684307, "epoch": 0.026995340447918233, "flos": 23621570317440.0, "grad_norm": 1.9791859654580417, "language_loss": 0.97788966, "learning_rate": 3.9320200209818755e-06, "loss": 0.9994024, "num_input_tokens_seen": 9475615, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 0.46484375, "step": 449, "time_per_iteration": 2.477748394012451 }, { "auxiliary_loss_clip": 0.01072197, "auxiliary_loss_mlp": 0.01097728, "balance_loss_clip": 1.0498302, "balance_loss_mlp": 1.02680922, "epoch": 0.027055463700586202, "flos": 17930308567680.0, "grad_norm": 1.6624340948183838, "language_loss": 0.95286608, "learning_rate": 3.933452395729493e-06, "loss": 0.97456545, "num_input_tokens_seen": 9493975, "router_z_loss_clip": 0.47851562, "router_z_loss_mlp": 0.453125, "step": 450, "time_per_iteration": 2.444392204284668 }, { "auxiliary_loss_clip": 0.01075371, "auxiliary_loss_mlp": 0.01106679, "balance_loss_clip": 1.06025934, "balance_loss_mlp": 1.03107512, "epoch": 0.02711558695325417, "flos": 25117738254720.0, "grad_norm": 1.3830171268415852, "language_loss": 0.87715119, "learning_rate": 3.934881590952304e-06, "loss": 0.89897174, "num_input_tokens_seen": 9514810, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 0.44335938, "step": 451, "time_per_iteration": 3.945362091064453 }, { "auxiliary_loss_clip": 0.01076533, "auxiliary_loss_mlp": 0.01093663, "balance_loss_clip": 1.04814911, "balance_loss_mlp": 1.03162563, "epoch": 0.02717571020592214, "flos": 24238520686080.0, "grad_norm": 1.4247779251996742, "language_loss": 0.82774174, "learning_rate": 3.936307620734599e-06, "loss": 0.84944367, "num_input_tokens_seen": 9533635, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 0.44921875, "step": 452, "time_per_iteration": 3.9083104133605957 }, { "auxiliary_loss_clip": 0.0107286, "auxiliary_loss_mlp": 0.01071755, "balance_loss_clip": 1.02795792, "balance_loss_mlp": 1.02930152, "epoch": 0.02723583345859011, "flos": 25117773166080.0, "grad_norm": 1.4037185699395758, "language_loss": 0.78768903, "learning_rate": 3.937730499067294e-06, "loss": 0.80913514, "num_input_tokens_seen": 9555420, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.43554688, "step": 453, "time_per_iteration": 3.910768508911133 }, { "auxiliary_loss_clip": 0.01067028, "auxiliary_loss_mlp": 0.01114327, "balance_loss_clip": 1.07057726, "balance_loss_mlp": 1.02542186, "epoch": 0.02729595671125808, "flos": 42739939140480.0, "grad_norm": 1.5802973028391543, "language_loss": 0.93361497, "learning_rate": 3.939150239848748e-06, "loss": 0.95542854, "num_input_tokens_seen": 9578950, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.41601562, "step": 454, "time_per_iteration": 2.6732449531555176 }, { "auxiliary_loss_clip": 0.01064768, "auxiliary_loss_mlp": 0.01103791, "balance_loss_clip": 1.06073308, "balance_loss_mlp": 1.02387166, "epoch": 0.02735607996392605, "flos": 21430002453120.0, "grad_norm": 1.40996443550794, "language_loss": 0.83596015, "learning_rate": 3.9405668568855866e-06, "loss": 0.85764575, "num_input_tokens_seen": 9598160, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.41015625, "step": 455, "time_per_iteration": 2.504915237426758 }, { "auxiliary_loss_clip": 0.01062349, "auxiliary_loss_mlp": 0.01074541, "balance_loss_clip": 1.03508329, "balance_loss_mlp": 1.02212381, "epoch": 0.027416203216594017, "flos": 20850199637760.0, "grad_norm": 1.60295818078564, "language_loss": 0.92798924, "learning_rate": 3.941980363893499e-06, "loss": 0.94935822, "num_input_tokens_seen": 9616010, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.40234375, "step": 456, "time_per_iteration": 2.490342617034912 }, { "auxiliary_loss_clip": 0.01061423, "auxiliary_loss_mlp": 0.01082777, "balance_loss_clip": 1.04377198, "balance_loss_mlp": 1.02204454, "epoch": 0.027476326469261986, "flos": 13223667381120.0, "grad_norm": 1.5015077123030105, "language_loss": 0.89764625, "learning_rate": 3.9433907744980384e-06, "loss": 0.9190883, "num_input_tokens_seen": 9634000, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.39453125, "step": 457, "time_per_iteration": 2.4761478900909424 }, { "auxiliary_loss_clip": 0.01058649, "auxiliary_loss_mlp": 0.01081982, "balance_loss_clip": 1.04440701, "balance_loss_mlp": 1.01990438, "epoch": 0.027536449721929958, "flos": 24023385688320.0, "grad_norm": 1.869561994849557, "language_loss": 1.03928685, "learning_rate": 3.944798102235412e-06, "loss": 1.06069314, "num_input_tokens_seen": 9653455, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.38671875, "step": 458, "time_per_iteration": 2.5039923191070557 }, { "auxiliary_loss_clip": 0.01057831, "auxiliary_loss_mlp": 0.01064508, "balance_loss_clip": 1.02946067, "balance_loss_mlp": 1.02015853, "epoch": 0.027596572974597926, "flos": 13005215804160.0, "grad_norm": 2.028272082687578, "language_loss": 0.94367236, "learning_rate": 3.9462023605532545e-06, "loss": 0.96489573, "num_input_tokens_seen": 9669650, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.37695312, "step": 459, "time_per_iteration": 2.4560458660125732 }, { "auxiliary_loss_clip": 0.01056359, "auxiliary_loss_mlp": 0.01051928, "balance_loss_clip": 1.01802516, "balance_loss_mlp": 1.01991963, "epoch": 0.027656696227265895, "flos": 26141810520960.0, "grad_norm": 1.4720825176737136, "language_loss": 0.90831614, "learning_rate": 3.947603562811407e-06, "loss": 0.92939901, "num_input_tokens_seen": 9691415, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.36328125, "step": 460, "time_per_iteration": 2.5009100437164307 }, { "auxiliary_loss_clip": 0.01047951, "auxiliary_loss_mlp": 0.01371652, "balance_loss_clip": 1.33045363, "balance_loss_mlp": 1.02201283, "epoch": 0.027716819479933864, "flos": 60693917610240.0, "grad_norm": 1.7228985869413633, "language_loss": 0.73757339, "learning_rate": 3.949001722282675e-06, "loss": 0.76176947, "num_input_tokens_seen": 9755605, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 0.25976562, "step": 461, "time_per_iteration": 3.033900499343872 }, { "auxiliary_loss_clip": 0.01059002, "auxiliary_loss_mlp": 0.01072757, "balance_loss_clip": 1.03918791, "balance_loss_mlp": 1.02266741, "epoch": 0.027776942732601832, "flos": 31210605907200.0, "grad_norm": 1.9347250579229809, "language_loss": 0.95866072, "learning_rate": 3.950396852153582e-06, "loss": 0.97997832, "num_input_tokens_seen": 9776270, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.36328125, "step": 462, "time_per_iteration": 2.522268295288086 }, { "auxiliary_loss_clip": 0.01060527, "auxiliary_loss_mlp": 0.01135243, "balance_loss_clip": 1.09921789, "balance_loss_mlp": 1.02257156, "epoch": 0.027837065985269804, "flos": 22673538725760.0, "grad_norm": 1.8424971046146295, "language_loss": 1.04239631, "learning_rate": 3.951788965525118e-06, "loss": 1.06435394, "num_input_tokens_seen": 9794465, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.37890625, "step": 463, "time_per_iteration": 2.4867489337921143 }, { "auxiliary_loss_clip": 0.01046778, "auxiliary_loss_mlp": 0.01021885, "balance_loss_clip": 1.0062449, "balance_loss_mlp": 1.02434325, "epoch": 0.027897189237937773, "flos": 62179437582720.0, "grad_norm": 0.8973351139160131, "language_loss": 0.59326446, "learning_rate": 3.953178075413476e-06, "loss": 0.61395109, "num_input_tokens_seen": 9849685, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.22460938, "step": 464, "time_per_iteration": 3.030200481414795 }, { "auxiliary_loss_clip": 0.01062848, "auxiliary_loss_mlp": 0.01198682, "balance_loss_clip": 1.15948558, "balance_loss_mlp": 1.02381897, "epoch": 0.02795731249060574, "flos": 24492164780160.0, "grad_norm": 1.6567602474438679, "language_loss": 0.96851265, "learning_rate": 3.954564194750784e-06, "loss": 0.99112797, "num_input_tokens_seen": 9869505, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.390625, "step": 465, "time_per_iteration": 2.4986178874969482 }, { "auxiliary_loss_clip": 0.01061766, "auxiliary_loss_mlp": 0.01174991, "balance_loss_clip": 1.13536596, "balance_loss_mlp": 1.02291989, "epoch": 0.02801743574327371, "flos": 23731860902400.0, "grad_norm": 1.684077725179182, "language_loss": 0.87185907, "learning_rate": 3.955947336385828e-06, "loss": 0.89422661, "num_input_tokens_seen": 9890950, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.38867188, "step": 466, "time_per_iteration": 2.470954656600952 }, { "auxiliary_loss_clip": 0.01058656, "auxiliary_loss_mlp": 0.01127028, "balance_loss_clip": 1.08601975, "balance_loss_mlp": 1.02179849, "epoch": 0.02807755899594168, "flos": 20628117279360.0, "grad_norm": 1.6112438676202752, "language_loss": 0.93925571, "learning_rate": 3.957327513084761e-06, "loss": 0.96111262, "num_input_tokens_seen": 9911265, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.3671875, "step": 467, "time_per_iteration": 2.482576847076416 }, { "auxiliary_loss_clip": 0.01063363, "auxiliary_loss_mlp": 0.01079824, "balance_loss_clip": 1.04058015, "balance_loss_mlp": 1.02423286, "epoch": 0.02813768224860965, "flos": 19243566558720.0, "grad_norm": 1.7647817159058723, "language_loss": 0.96234852, "learning_rate": 3.958704737531818e-06, "loss": 0.98378038, "num_input_tokens_seen": 9929025, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.390625, "step": 468, "time_per_iteration": 2.527853488922119 }, { "auxiliary_loss_clip": 0.01063887, "auxiliary_loss_mlp": 0.01062662, "balance_loss_clip": 1.02275062, "balance_loss_mlp": 1.02488852, "epoch": 0.02819780550127762, "flos": 20812912439040.0, "grad_norm": 1.8090786064206932, "language_loss": 1.04676318, "learning_rate": 3.9600790223300065e-06, "loss": 1.06802869, "num_input_tokens_seen": 9945190, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.390625, "step": 469, "time_per_iteration": 2.491145372390747 }, { "auxiliary_loss_clip": 0.01063796, "auxiliary_loss_mlp": 0.01058231, "balance_loss_clip": 1.01684213, "balance_loss_mlp": 1.02440226, "epoch": 0.028257928753945588, "flos": 19973111662080.0, "grad_norm": 1.8249365771843946, "language_loss": 0.9798125, "learning_rate": 3.96145038000181e-06, "loss": 1.00103283, "num_input_tokens_seen": 9962820, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.39453125, "step": 470, "time_per_iteration": 2.4382712841033936 }, { "auxiliary_loss_clip": 0.01075298, "auxiliary_loss_mlp": 0.01063437, "balance_loss_clip": 1.0202117, "balance_loss_mlp": 1.03350055, "epoch": 0.028318052006613557, "flos": 20483472049920.0, "grad_norm": 1.6536804897011441, "language_loss": 1.04656124, "learning_rate": 3.962818822989861e-06, "loss": 1.06794846, "num_input_tokens_seen": 9982595, "router_z_loss_clip": 0.43164062, "router_z_loss_mlp": 0.41796875, "step": 471, "time_per_iteration": 2.486631155014038 }, { "auxiliary_loss_clip": 0.01070454, "auxiliary_loss_mlp": 0.01084664, "balance_loss_clip": 1.03955543, "balance_loss_mlp": 1.02979302, "epoch": 0.02837817525928153, "flos": 28513495422720.0, "grad_norm": 1.5594555198240558, "language_loss": 0.85379982, "learning_rate": 3.964184363657625e-06, "loss": 0.87535095, "num_input_tokens_seen": 10004645, "router_z_loss_clip": 0.45117188, "router_z_loss_mlp": 0.40625, "step": 472, "time_per_iteration": 2.520949125289917 }, { "auxiliary_loss_clip": 0.01073506, "auxiliary_loss_mlp": 0.01099429, "balance_loss_clip": 1.05660939, "balance_loss_mlp": 1.03324246, "epoch": 0.028438298511949497, "flos": 18550680249600.0, "grad_norm": 1.5507133161459867, "language_loss": 1.03927302, "learning_rate": 3.965547014290071e-06, "loss": 1.06100225, "num_input_tokens_seen": 10022555, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.40234375, "step": 473, "time_per_iteration": 2.4519848823547363 }, { "auxiliary_loss_clip": 0.01068118, "auxiliary_loss_mlp": 0.01157544, "balance_loss_clip": 1.10714245, "balance_loss_mlp": 1.02916074, "epoch": 0.028498421764617466, "flos": 16909273589760.0, "grad_norm": 1.8957181580051394, "language_loss": 1.04659927, "learning_rate": 3.96690678709433e-06, "loss": 1.06885588, "num_input_tokens_seen": 10041025, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.390625, "step": 474, "time_per_iteration": 2.430156707763672 }, { "auxiliary_loss_clip": 0.01067562, "auxiliary_loss_mlp": 0.01185116, "balance_loss_clip": 1.12827742, "balance_loss_mlp": 1.02766824, "epoch": 0.028558545017285435, "flos": 27777561540480.0, "grad_norm": 1.7384403101353583, "language_loss": 0.89234042, "learning_rate": 3.968263694200355e-06, "loss": 0.91486716, "num_input_tokens_seen": 10060775, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.3984375, "step": 475, "time_per_iteration": 2.515697956085205 }, { "auxiliary_loss_clip": 0.01046359, "auxiliary_loss_mlp": 0.01302395, "balance_loss_clip": 1.25948, "balance_loss_mlp": 1.02134895, "epoch": 0.028618668269953403, "flos": 65651060868480.0, "grad_norm": 1.108212568755077, "language_loss": 0.67332512, "learning_rate": 3.969617747661569e-06, "loss": 0.69681269, "num_input_tokens_seen": 10120225, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.25, "step": 476, "time_per_iteration": 2.980910301208496 }, { "auxiliary_loss_clip": 0.01063847, "auxiliary_loss_mlp": 0.01297294, "balance_loss_clip": 1.23358881, "balance_loss_mlp": 1.02015924, "epoch": 0.028678791522621375, "flos": 21936208389120.0, "grad_norm": 1.838459948772356, "language_loss": 0.98401761, "learning_rate": 3.970968959455509e-06, "loss": 1.00762892, "num_input_tokens_seen": 10137880, "router_z_loss_clip": 0.63671875, "router_z_loss_mlp": 0.4375, "step": 477, "time_per_iteration": 2.4759461879730225 }, { "auxiliary_loss_clip": 0.01072471, "auxiliary_loss_mlp": 0.01375246, "balance_loss_clip": 1.31197023, "balance_loss_mlp": 1.02801323, "epoch": 0.028738914775289344, "flos": 24570963452160.0, "grad_norm": 2.06680580994056, "language_loss": 0.9559828, "learning_rate": 3.97231734148446e-06, "loss": 0.98045999, "num_input_tokens_seen": 10156930, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.44335938, "step": 478, "time_per_iteration": 2.4877047538757324 }, { "auxiliary_loss_clip": 0.01082779, "auxiliary_loss_mlp": 0.01245624, "balance_loss_clip": 1.1939826, "balance_loss_mlp": 1.03950381, "epoch": 0.028799038027957313, "flos": 23256867588480.0, "grad_norm": 1.5081175216432052, "language_loss": 0.9190942, "learning_rate": 3.973662905576082e-06, "loss": 0.94237828, "num_input_tokens_seen": 10176295, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.43164062, "step": 479, "time_per_iteration": 2.506470203399658 }, { "auxiliary_loss_clip": 0.0109537, "auxiliary_loss_mlp": 0.01069175, "balance_loss_clip": 1.03453255, "balance_loss_mlp": 1.05306149, "epoch": 0.02885916128062528, "flos": 22163003781120.0, "grad_norm": 1.7414786868711547, "language_loss": 0.84525621, "learning_rate": 3.975005663484038e-06, "loss": 0.86690164, "num_input_tokens_seen": 10195790, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.42382812, "step": 480, "time_per_iteration": 2.482440948486328 }, { "auxiliary_loss_clip": 0.01114211, "auxiliary_loss_mlp": 0.01097172, "balance_loss_clip": 1.05974054, "balance_loss_mlp": 1.06619561, "epoch": 0.02891928453329325, "flos": 22931651473920.0, "grad_norm": 1.4651378733657758, "language_loss": 0.95342934, "learning_rate": 3.976345626888605e-06, "loss": 0.97554314, "num_input_tokens_seen": 10218405, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.47851562, "step": 481, "time_per_iteration": 2.562629222869873 }, { "auxiliary_loss_clip": 0.01138624, "auxiliary_loss_mlp": 0.01123518, "balance_loss_clip": 1.08766019, "balance_loss_mlp": 1.08891475, "epoch": 0.028979407785961222, "flos": 57430202670720.0, "grad_norm": 0.9052968901909274, "language_loss": 0.66359693, "learning_rate": 3.9776828073972864e-06, "loss": 0.68621832, "num_input_tokens_seen": 10271005, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.49609375, "step": 482, "time_per_iteration": 2.8132543563842773 }, { "auxiliary_loss_clip": 0.01140211, "auxiliary_loss_mlp": 0.01237639, "balance_loss_clip": 1.18893015, "balance_loss_mlp": 1.07987344, "epoch": 0.02903953103862919, "flos": 16721929900800.0, "grad_norm": 2.009995415563254, "language_loss": 0.97913986, "learning_rate": 3.979017216545415e-06, "loss": 1.00291824, "num_input_tokens_seen": 10288405, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.60546875, "step": 483, "time_per_iteration": 2.5561890602111816 }, { "auxiliary_loss_clip": 0.01137553, "auxiliary_loss_mlp": 0.01299003, "balance_loss_clip": 1.25081873, "balance_loss_mlp": 1.07552803, "epoch": 0.02909965429129716, "flos": 16762708235520.0, "grad_norm": 1.5692790691675422, "language_loss": 0.86888838, "learning_rate": 3.980348865796749e-06, "loss": 0.89325392, "num_input_tokens_seen": 10306875, "router_z_loss_clip": 0.48242188, "router_z_loss_mlp": 0.62109375, "step": 484, "time_per_iteration": 2.441469192504883 }, { "auxiliary_loss_clip": 0.01118154, "auxiliary_loss_mlp": 0.0129426, "balance_loss_clip": 1.2446692, "balance_loss_mlp": 1.06238127, "epoch": 0.029159777543965128, "flos": 19784511164160.0, "grad_norm": 1.8453913858402686, "language_loss": 0.94359136, "learning_rate": 3.9816777665440615e-06, "loss": 0.96771544, "num_input_tokens_seen": 10323965, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.5546875, "step": 485, "time_per_iteration": 2.491818904876709 }, { "auxiliary_loss_clip": 0.01101327, "auxiliary_loss_mlp": 0.01224207, "balance_loss_clip": 1.16920376, "balance_loss_mlp": 1.04975796, "epoch": 0.029219900796633096, "flos": 19641751148160.0, "grad_norm": 1.7592660679479901, "language_loss": 0.9997077, "learning_rate": 3.983003930109732e-06, "loss": 1.02296305, "num_input_tokens_seen": 10342620, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.515625, "step": 486, "time_per_iteration": 2.4576265811920166 }, { "auxiliary_loss_clip": 0.01087293, "auxiliary_loss_mlp": 0.0122641, "balance_loss_clip": 1.16675723, "balance_loss_mlp": 1.03734541, "epoch": 0.02928002404930107, "flos": 25884500734080.0, "grad_norm": 1.5523878104721587, "language_loss": 0.9864561, "learning_rate": 3.984327367746315e-06, "loss": 1.00959301, "num_input_tokens_seen": 10364610, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.5, "step": 487, "time_per_iteration": 2.531792640686035 }, { "auxiliary_loss_clip": 0.01093668, "auxiliary_loss_mlp": 0.01202614, "balance_loss_clip": 1.13261414, "balance_loss_mlp": 1.04040742, "epoch": 0.029340147301969037, "flos": 20659399724160.0, "grad_norm": 1.9972596833023266, "language_loss": 1.05266643, "learning_rate": 3.985648090637122e-06, "loss": 1.07562923, "num_input_tokens_seen": 10380910, "router_z_loss_clip": 0.69921875, "router_z_loss_mlp": 0.53125, "step": 488, "time_per_iteration": 2.4513723850250244 }, { "auxiliary_loss_clip": 0.01098791, "auxiliary_loss_mlp": 0.01178871, "balance_loss_clip": 1.10624862, "balance_loss_mlp": 1.04263854, "epoch": 0.029400270554637006, "flos": 24426806981760.0, "grad_norm": 1.612030519160693, "language_loss": 0.95104742, "learning_rate": 3.986966109896785e-06, "loss": 0.97382402, "num_input_tokens_seen": 10400665, "router_z_loss_clip": 0.7265625, "router_z_loss_mlp": 0.5625, "step": 489, "time_per_iteration": 2.5253055095672607 }, { "auxiliary_loss_clip": 0.01118395, "auxiliary_loss_mlp": 0.01124979, "balance_loss_clip": 1.05269074, "balance_loss_mlp": 1.05359125, "epoch": 0.029460393807304974, "flos": 20119851573120.0, "grad_norm": 1.5117878485635015, "language_loss": 0.94966328, "learning_rate": 3.988281436571815e-06, "loss": 0.97209704, "num_input_tokens_seen": 10420150, "router_z_loss_clip": 0.72265625, "router_z_loss_mlp": 0.6484375, "step": 490, "time_per_iteration": 2.483485460281372 }, { "auxiliary_loss_clip": 0.0113758, "auxiliary_loss_mlp": 0.01099364, "balance_loss_clip": 1.02602625, "balance_loss_mlp": 1.06776619, "epoch": 0.029520517059972943, "flos": 17674953816960.0, "grad_norm": 1.8937820445965092, "language_loss": 1.05084133, "learning_rate": 3.989594081641164e-06, "loss": 1.07321072, "num_input_tokens_seen": 10438210, "router_z_loss_clip": 0.734375, "router_z_loss_mlp": 0.69921875, "step": 491, "time_per_iteration": 5.441805362701416 }, { "auxiliary_loss_clip": 0.0114447, "auxiliary_loss_mlp": 0.01192518, "balance_loss_clip": 1.113554, "balance_loss_mlp": 1.07219505, "epoch": 0.029580640312640915, "flos": 18952181418240.0, "grad_norm": 1.6406449666500702, "language_loss": 0.93240142, "learning_rate": 3.9909040560167675e-06, "loss": 0.95577139, "num_input_tokens_seen": 10455125, "router_z_loss_clip": 0.7890625, "router_z_loss_mlp": 0.72265625, "step": 492, "time_per_iteration": 3.90789532661438 }, { "auxiliary_loss_clip": 0.01129758, "auxiliary_loss_mlp": 0.01186463, "balance_loss_clip": 1.10425639, "balance_loss_mlp": 1.06729031, "epoch": 0.029640763565308884, "flos": 18725351114880.0, "grad_norm": 2.016961366388103, "language_loss": 0.99007815, "learning_rate": 3.992211370544093e-06, "loss": 1.01324034, "num_input_tokens_seen": 10470990, "router_z_loss_clip": 0.82421875, "router_z_loss_mlp": 0.625, "step": 493, "time_per_iteration": 3.875744581222534 }, { "auxiliary_loss_clip": 0.01114203, "auxiliary_loss_mlp": 0.01147368, "balance_loss_clip": 1.07064462, "balance_loss_mlp": 1.05397987, "epoch": 0.029700886817976852, "flos": 20594251393920.0, "grad_norm": 1.4909158347075284, "language_loss": 0.98815495, "learning_rate": 3.99351603600268e-06, "loss": 1.01077056, "num_input_tokens_seen": 10490685, "router_z_loss_clip": 0.765625, "router_z_loss_mlp": 0.6015625, "step": 494, "time_per_iteration": 2.510373592376709 }, { "auxiliary_loss_clip": 0.01104537, "auxiliary_loss_mlp": 0.01133556, "balance_loss_clip": 1.05854952, "balance_loss_mlp": 1.04807806, "epoch": 0.02976101007064482, "flos": 22235762787840.0, "grad_norm": 1.8895961131602586, "language_loss": 0.97594285, "learning_rate": 3.994818063106668e-06, "loss": 0.9983238, "num_input_tokens_seen": 10509435, "router_z_loss_clip": 0.75, "router_z_loss_mlp": 0.5625, "step": 495, "time_per_iteration": 2.4800100326538086 }, { "auxiliary_loss_clip": 0.01103096, "auxiliary_loss_mlp": 0.01125985, "balance_loss_clip": 1.04964375, "balance_loss_mlp": 1.04405785, "epoch": 0.029821133323312793, "flos": 23731511788800.0, "grad_norm": 1.4766247403847093, "language_loss": 0.71986866, "learning_rate": 3.99611746250533e-06, "loss": 0.74215949, "num_input_tokens_seen": 10530050, "router_z_loss_clip": 0.76171875, "router_z_loss_mlp": 0.58984375, "step": 496, "time_per_iteration": 2.5348434448242188 }, { "auxiliary_loss_clip": 0.01107344, "auxiliary_loss_mlp": 0.01121402, "balance_loss_clip": 1.05311882, "balance_loss_mlp": 1.04771471, "epoch": 0.02988125657598076, "flos": 22418393443200.0, "grad_norm": 1.4250591486584456, "language_loss": 0.96096951, "learning_rate": 3.997414244783595e-06, "loss": 0.98325694, "num_input_tokens_seen": 10551370, "router_z_loss_clip": 0.68359375, "router_z_loss_mlp": 0.59375, "step": 497, "time_per_iteration": 2.499897003173828 }, { "auxiliary_loss_clip": 0.01108402, "auxiliary_loss_mlp": 0.01094779, "balance_loss_clip": 1.02558994, "balance_loss_mlp": 1.04693675, "epoch": 0.02994137982864873, "flos": 13844248531200.0, "grad_norm": 1.9704547132895474, "language_loss": 1.01603985, "learning_rate": 3.998708420462557e-06, "loss": 1.03807151, "num_input_tokens_seen": 10569225, "router_z_loss_clip": 0.69140625, "router_z_loss_mlp": 0.61328125, "step": 498, "time_per_iteration": 2.4617421627044678 }, { "auxiliary_loss_clip": 0.01112543, "auxiliary_loss_mlp": 0.01106553, "balance_loss_clip": 1.02334476, "balance_loss_mlp": 1.04628384, "epoch": 0.0300015030813167, "flos": 23907404551680.0, "grad_norm": 2.0235955988153487, "language_loss": 0.96577203, "learning_rate": 4e-06, "loss": 0.98796308, "num_input_tokens_seen": 10586170, "router_z_loss_clip": 0.83203125, "router_z_loss_mlp": 0.6640625, "step": 499, "time_per_iteration": 2.455439329147339 }, { "auxiliary_loss_clip": 0.01113809, "auxiliary_loss_mlp": 0.01124168, "balance_loss_clip": 1.04210448, "balance_loss_mlp": 1.04805195, "epoch": 0.030061626333984667, "flos": 22015740199680.0, "grad_norm": 1.450328724173176, "language_loss": 0.90556049, "learning_rate": 3.9999999620799e-06, "loss": 0.92794025, "num_input_tokens_seen": 10606205, "router_z_loss_clip": 0.8203125, "router_z_loss_mlp": 0.65625, "step": 500, "time_per_iteration": 2.4918484687805176 }, { "auxiliary_loss_clip": 0.01112615, "auxiliary_loss_mlp": 0.01138084, "balance_loss_clip": 1.06417418, "balance_loss_mlp": 1.04953122, "epoch": 0.03012174958665264, "flos": 23038625479680.0, "grad_norm": 1.9245725410329135, "language_loss": 1.03260815, "learning_rate": 3.9999998483196e-06, "loss": 1.05511522, "num_input_tokens_seen": 10625995, "router_z_loss_clip": 0.73828125, "router_z_loss_mlp": 0.6328125, "step": 501, "time_per_iteration": 2.459219217300415 }, { "auxiliary_loss_clip": 0.01107122, "auxiliary_loss_mlp": 0.01126613, "balance_loss_clip": 1.05289388, "balance_loss_mlp": 1.04742968, "epoch": 0.030181872839320608, "flos": 18952251240960.0, "grad_norm": 1.9278549276928703, "language_loss": 0.98206395, "learning_rate": 3.9999996587191065e-06, "loss": 1.00440121, "num_input_tokens_seen": 10644105, "router_z_loss_clip": 0.73828125, "router_z_loss_mlp": 0.59765625, "step": 502, "time_per_iteration": 2.488969326019287 }, { "auxiliary_loss_clip": 0.0110702, "auxiliary_loss_mlp": 0.01140594, "balance_loss_clip": 1.06453848, "balance_loss_mlp": 1.04990494, "epoch": 0.030241996091988577, "flos": 16727061870720.0, "grad_norm": 2.0312172336926313, "language_loss": 0.95668173, "learning_rate": 3.999999393278425e-06, "loss": 0.97915787, "num_input_tokens_seen": 10661090, "router_z_loss_clip": 0.76171875, "router_z_loss_mlp": 0.5703125, "step": 503, "time_per_iteration": 2.462157726287842 }, { "auxiliary_loss_clip": 0.01103482, "auxiliary_loss_mlp": 0.01150776, "balance_loss_clip": 1.05698252, "balance_loss_mlp": 1.04730058, "epoch": 0.030302119344656545, "flos": 28620015580800.0, "grad_norm": 1.5971093431692582, "language_loss": 0.97744256, "learning_rate": 3.999999051997567e-06, "loss": 0.99998516, "num_input_tokens_seen": 10682380, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.5625, "step": 504, "time_per_iteration": 2.585536479949951 }, { "auxiliary_loss_clip": 0.01102356, "auxiliary_loss_mlp": 0.01148119, "balance_loss_clip": 1.05108261, "balance_loss_mlp": 1.04616618, "epoch": 0.030362242597324514, "flos": 15668425491840.0, "grad_norm": 1.519782239364022, "language_loss": 0.84274971, "learning_rate": 3.9999986348765425e-06, "loss": 0.86525452, "num_input_tokens_seen": 10699925, "router_z_loss_clip": 0.96875, "router_z_loss_mlp": 0.5625, "step": 505, "time_per_iteration": 2.4598066806793213 }, { "auxiliary_loss_clip": 0.0106289, "auxiliary_loss_mlp": 0.01158018, "balance_loss_clip": 1.10957122, "balance_loss_mlp": 1.03374863, "epoch": 0.030422365849992486, "flos": 72122107034880.0, "grad_norm": 0.9083511625743905, "language_loss": 0.5529021, "learning_rate": 3.999998141915371e-06, "loss": 0.57511115, "num_input_tokens_seen": 10766525, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.29296875, "step": 506, "time_per_iteration": 3.2049031257629395 }, { "auxiliary_loss_clip": 0.01099584, "auxiliary_loss_mlp": 0.01155772, "balance_loss_clip": 1.07013237, "balance_loss_mlp": 1.04265046, "epoch": 0.030482489102660455, "flos": 19426790707200.0, "grad_norm": 1.54761936163015, "language_loss": 0.90868783, "learning_rate": 3.999997573114069e-06, "loss": 0.93124139, "num_input_tokens_seen": 10786725, "router_z_loss_clip": 0.85546875, "router_z_loss_mlp": 0.5703125, "step": 507, "time_per_iteration": 2.5416057109832764 }, { "auxiliary_loss_clip": 0.01089259, "auxiliary_loss_mlp": 0.011534, "balance_loss_clip": 1.07515156, "balance_loss_mlp": 1.03643203, "epoch": 0.030542612355328423, "flos": 20374787387520.0, "grad_norm": 1.7865720078895793, "language_loss": 1.00743413, "learning_rate": 3.999996928472659e-06, "loss": 1.02986073, "num_input_tokens_seen": 10805390, "router_z_loss_clip": 0.78125, "router_z_loss_mlp": 0.52734375, "step": 508, "time_per_iteration": 2.5111982822418213 }, { "auxiliary_loss_clip": 0.01091777, "auxiliary_loss_mlp": 0.01157533, "balance_loss_clip": 1.07761478, "balance_loss_mlp": 1.03450191, "epoch": 0.030602735607996392, "flos": 34675945148160.0, "grad_norm": 1.5576848542450341, "language_loss": 0.79518276, "learning_rate": 3.999996207991165e-06, "loss": 0.81767583, "num_input_tokens_seen": 10828030, "router_z_loss_clip": 0.796875, "router_z_loss_mlp": 0.5703125, "step": 509, "time_per_iteration": 2.5608036518096924 }, { "auxiliary_loss_clip": 0.01089366, "auxiliary_loss_mlp": 0.011369, "balance_loss_clip": 1.0546937, "balance_loss_mlp": 1.03308797, "epoch": 0.03066285886066436, "flos": 23657565795840.0, "grad_norm": 1.8118574763559583, "language_loss": 0.92320585, "learning_rate": 3.999995411669614e-06, "loss": 0.94546854, "num_input_tokens_seen": 10845240, "router_z_loss_clip": 0.8203125, "router_z_loss_mlp": 0.5625, "step": 510, "time_per_iteration": 2.4971930980682373 }, { "auxiliary_loss_clip": 0.01088133, "auxiliary_loss_mlp": 0.01109449, "balance_loss_clip": 1.03930604, "balance_loss_mlp": 1.03266156, "epoch": 0.030722982113332332, "flos": 23001861951360.0, "grad_norm": 1.818955433352456, "language_loss": 0.96844339, "learning_rate": 3.999994539508036e-06, "loss": 0.99041927, "num_input_tokens_seen": 10864325, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 0.5546875, "step": 511, "time_per_iteration": 2.4615046977996826 }, { "auxiliary_loss_clip": 0.01090055, "auxiliary_loss_mlp": 0.01094922, "balance_loss_clip": 1.02244306, "balance_loss_mlp": 1.03563952, "epoch": 0.0307831053660003, "flos": 24749788769280.0, "grad_norm": 1.5119013925501068, "language_loss": 0.92847729, "learning_rate": 3.9999935915064655e-06, "loss": 0.95032704, "num_input_tokens_seen": 10883860, "router_z_loss_clip": 0.72265625, "router_z_loss_mlp": 0.546875, "step": 512, "time_per_iteration": 2.5186767578125 }, { "auxiliary_loss_clip": 0.01099037, "auxiliary_loss_mlp": 0.01122197, "balance_loss_clip": 1.0268774, "balance_loss_mlp": 1.03957152, "epoch": 0.03084322861866827, "flos": 26139680928000.0, "grad_norm": 1.668883507733051, "language_loss": 0.96336728, "learning_rate": 3.9999925676649374e-06, "loss": 0.98557961, "num_input_tokens_seen": 10904555, "router_z_loss_clip": 0.953125, "router_z_loss_mlp": 0.59375, "step": 513, "time_per_iteration": 2.499514579772949 }, { "auxiliary_loss_clip": 0.01098186, "auxiliary_loss_mlp": 0.01123851, "balance_loss_clip": 1.04441011, "balance_loss_mlp": 1.04187703, "epoch": 0.03090335187133624, "flos": 18770283901440.0, "grad_norm": 1.3112978803390427, "language_loss": 0.90259612, "learning_rate": 3.999991467983491e-06, "loss": 0.92481643, "num_input_tokens_seen": 10923700, "router_z_loss_clip": 0.796875, "router_z_loss_mlp": 0.5625, "step": 514, "time_per_iteration": 2.4696667194366455 }, { "auxiliary_loss_clip": 0.01102758, "auxiliary_loss_mlp": 0.0111079, "balance_loss_clip": 1.03020465, "balance_loss_mlp": 1.04537988, "epoch": 0.030963475124004207, "flos": 23220767376000.0, "grad_norm": 2.443021249961568, "language_loss": 0.90932828, "learning_rate": 3.999990292462167e-06, "loss": 0.93146378, "num_input_tokens_seen": 10942730, "router_z_loss_clip": 0.8046875, "router_z_loss_mlp": 0.57421875, "step": 515, "time_per_iteration": 2.493640184402466 }, { "auxiliary_loss_clip": 0.01109739, "auxiliary_loss_mlp": 0.01134942, "balance_loss_clip": 1.03814387, "balance_loss_mlp": 1.04790258, "epoch": 0.03102359837667218, "flos": 42523861536000.0, "grad_norm": 1.5017812315555825, "language_loss": 0.95326561, "learning_rate": 3.999989041101011e-06, "loss": 0.97571248, "num_input_tokens_seen": 10967120, "router_z_loss_clip": 0.96875, "router_z_loss_mlp": 0.6171875, "step": 516, "time_per_iteration": 2.6898787021636963 }, { "auxiliary_loss_clip": 0.01109095, "auxiliary_loss_mlp": 0.01147333, "balance_loss_clip": 1.03622949, "balance_loss_mlp": 1.04604602, "epoch": 0.031083721629340148, "flos": 21175939422720.0, "grad_norm": 1.5843742027639292, "language_loss": 0.86425745, "learning_rate": 3.999987713900071e-06, "loss": 0.88682175, "num_input_tokens_seen": 10986775, "router_z_loss_clip": 1.109375, "router_z_loss_mlp": 0.62890625, "step": 517, "time_per_iteration": 2.4948623180389404 }, { "auxiliary_loss_clip": 0.01104503, "auxiliary_loss_mlp": 0.01126735, "balance_loss_clip": 1.03394306, "balance_loss_mlp": 1.0430367, "epoch": 0.031143844882008116, "flos": 29714891817600.0, "grad_norm": 1.3503588235869797, "language_loss": 0.95947772, "learning_rate": 3.999986310859396e-06, "loss": 0.98179018, "num_input_tokens_seen": 11011360, "router_z_loss_clip": 0.9296875, "router_z_loss_mlp": 0.61328125, "step": 518, "time_per_iteration": 2.5664560794830322 }, { "auxiliary_loss_clip": 0.01108074, "auxiliary_loss_mlp": 0.0115579, "balance_loss_clip": 1.04878747, "balance_loss_mlp": 1.04225373, "epoch": 0.031203968134676085, "flos": 23111349575040.0, "grad_norm": 1.6921325038477357, "language_loss": 0.9641695, "learning_rate": 3.999984831979039e-06, "loss": 0.986808, "num_input_tokens_seen": 11030150, "router_z_loss_clip": 1.0703125, "router_z_loss_mlp": 0.65625, "step": 519, "time_per_iteration": 2.568657398223877 }, { "auxiliary_loss_clip": 0.01102498, "auxiliary_loss_mlp": 0.01182164, "balance_loss_clip": 1.08245707, "balance_loss_mlp": 1.04172552, "epoch": 0.03126409138734405, "flos": 20953473039360.0, "grad_norm": 1.6542053395696847, "language_loss": 0.95543146, "learning_rate": 3.999983277259057e-06, "loss": 0.9782781, "num_input_tokens_seen": 11049145, "router_z_loss_clip": 0.99609375, "router_z_loss_mlp": 0.609375, "step": 520, "time_per_iteration": 2.5302679538726807 }, { "auxiliary_loss_clip": 0.01091642, "auxiliary_loss_mlp": 0.01189027, "balance_loss_clip": 1.09027398, "balance_loss_mlp": 1.03398323, "epoch": 0.031324214640012026, "flos": 21649117345920.0, "grad_norm": 1.5931616236821584, "language_loss": 0.9621582, "learning_rate": 3.999981646699509e-06, "loss": 0.98496497, "num_input_tokens_seen": 11068835, "router_z_loss_clip": 0.98828125, "router_z_loss_mlp": 0.57421875, "step": 521, "time_per_iteration": 2.4701414108276367 }, { "auxiliary_loss_clip": 0.01096876, "auxiliary_loss_mlp": 0.01205215, "balance_loss_clip": 1.09301519, "balance_loss_mlp": 1.03767633, "epoch": 0.03138433789267999, "flos": 23440196471040.0, "grad_norm": 1.6480498504105687, "language_loss": 0.79155159, "learning_rate": 3.999979940300456e-06, "loss": 0.81457245, "num_input_tokens_seen": 11088980, "router_z_loss_clip": 1.125, "router_z_loss_mlp": 0.59375, "step": 522, "time_per_iteration": 2.5112197399139404 }, { "auxiliary_loss_clip": 0.01097384, "auxiliary_loss_mlp": 0.01150734, "balance_loss_clip": 1.05083704, "balance_loss_mlp": 1.03663826, "epoch": 0.03144446114534796, "flos": 18981369181440.0, "grad_norm": 2.0942728152601306, "language_loss": 0.99647903, "learning_rate": 3.999978158061963e-06, "loss": 1.01896024, "num_input_tokens_seen": 11104300, "router_z_loss_clip": 1.0, "router_z_loss_mlp": 0.609375, "step": 523, "time_per_iteration": 2.43190598487854 }, { "auxiliary_loss_clip": 0.01099987, "auxiliary_loss_mlp": 0.0113624, "balance_loss_clip": 1.04449654, "balance_loss_mlp": 1.04056573, "epoch": 0.031504584398015935, "flos": 22636600640640.0, "grad_norm": 1.7602689256321742, "language_loss": 1.03405833, "learning_rate": 3.999976299984099e-06, "loss": 1.05642056, "num_input_tokens_seen": 11123335, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.59375, "step": 524, "time_per_iteration": 2.5049948692321777 }, { "auxiliary_loss_clip": 0.0110226, "auxiliary_loss_mlp": 0.01133099, "balance_loss_clip": 1.03820872, "balance_loss_mlp": 1.04332471, "epoch": 0.0315647076506839, "flos": 25296004990080.0, "grad_norm": 1.835632317280912, "language_loss": 0.92858231, "learning_rate": 3.999974366066933e-06, "loss": 0.95093596, "num_input_tokens_seen": 11140880, "router_z_loss_clip": 0.94921875, "router_z_loss_mlp": 0.58984375, "step": 525, "time_per_iteration": 2.49580979347229 }, { "auxiliary_loss_clip": 0.01103175, "auxiliary_loss_mlp": 0.01121206, "balance_loss_clip": 1.02517056, "balance_loss_mlp": 1.04288244, "epoch": 0.03162483090335187, "flos": 16981892951040.0, "grad_norm": 1.6643463217271257, "language_loss": 0.90448618, "learning_rate": 3.999972356310538e-06, "loss": 0.92672992, "num_input_tokens_seen": 11158710, "router_z_loss_clip": 0.9609375, "router_z_loss_mlp": 0.6015625, "step": 526, "time_per_iteration": 2.502254009246826 }, { "auxiliary_loss_clip": 0.01107912, "auxiliary_loss_mlp": 0.01124799, "balance_loss_clip": 1.02771544, "balance_loss_mlp": 1.0449096, "epoch": 0.03168495415601984, "flos": 18733485461760.0, "grad_norm": 1.6378996589415575, "language_loss": 0.94918996, "learning_rate": 3.999970270714991e-06, "loss": 0.97151715, "num_input_tokens_seen": 11177550, "router_z_loss_clip": 0.96875, "router_z_loss_mlp": 0.62890625, "step": 527, "time_per_iteration": 2.451977014541626 }, { "auxiliary_loss_clip": 0.0111541, "auxiliary_loss_mlp": 0.01156017, "balance_loss_clip": 1.05850339, "balance_loss_mlp": 1.04665124, "epoch": 0.03174507740868781, "flos": 21213820114560.0, "grad_norm": 1.7345732638730849, "language_loss": 1.07938552, "learning_rate": 3.999968109280371e-06, "loss": 1.1020999, "num_input_tokens_seen": 11196230, "router_z_loss_clip": 0.9765625, "router_z_loss_mlp": 0.6875, "step": 528, "time_per_iteration": 2.497335910797119 }, { "auxiliary_loss_clip": 0.01111602, "auxiliary_loss_mlp": 0.0116157, "balance_loss_clip": 1.06567764, "balance_loss_mlp": 1.0431422, "epoch": 0.03180520066135578, "flos": 24786587208960.0, "grad_norm": 1.6708697127592953, "language_loss": 0.93226665, "learning_rate": 3.99996587200676e-06, "loss": 0.95499837, "num_input_tokens_seen": 11214935, "router_z_loss_clip": 0.9609375, "router_z_loss_mlp": 0.68359375, "step": 529, "time_per_iteration": 2.4965312480926514 }, { "auxiliary_loss_clip": 0.01113208, "auxiliary_loss_mlp": 0.01190402, "balance_loss_clip": 1.08983696, "balance_loss_mlp": 1.04383636, "epoch": 0.03186532391402375, "flos": 24863081731200.0, "grad_norm": 1.5876529627092344, "language_loss": 0.99495316, "learning_rate": 3.999963558894243e-06, "loss": 1.01798928, "num_input_tokens_seen": 11235310, "router_z_loss_clip": 1.0078125, "router_z_loss_mlp": 0.6953125, "step": 530, "time_per_iteration": 3.93658709526062 }, { "auxiliary_loss_clip": 0.01110733, "auxiliary_loss_mlp": 0.01200385, "balance_loss_clip": 1.09252501, "balance_loss_mlp": 1.04129124, "epoch": 0.03192544716669172, "flos": 21213994671360.0, "grad_norm": 1.7409412276121312, "language_loss": 0.89042419, "learning_rate": 3.999961169942907e-06, "loss": 0.91353536, "num_input_tokens_seen": 11254425, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 0.6953125, "step": 531, "time_per_iteration": 3.947244644165039 }, { "auxiliary_loss_clip": 0.01107662, "auxiliary_loss_mlp": 0.01181013, "balance_loss_clip": 1.0749166, "balance_loss_mlp": 1.03721142, "epoch": 0.03198557041935969, "flos": 24352058027520.0, "grad_norm": 1.5922783889218952, "language_loss": 1.01070762, "learning_rate": 3.999958705152843e-06, "loss": 1.03359437, "num_input_tokens_seen": 11274595, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 0.703125, "step": 532, "time_per_iteration": 3.98683762550354 }, { "auxiliary_loss_clip": 0.01054155, "auxiliary_loss_mlp": 0.01112228, "balance_loss_clip": 1.0855248, "balance_loss_mlp": 1.02263224, "epoch": 0.032045693672027656, "flos": 61824056186880.0, "grad_norm": 0.8177710683862445, "language_loss": 0.5813067, "learning_rate": 3.9999561645241445e-06, "loss": 0.60297054, "num_input_tokens_seen": 11336705, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.31640625, "step": 533, "time_per_iteration": 4.461689472198486 }, { "auxiliary_loss_clip": 0.01100474, "auxiliary_loss_mlp": 0.011704, "balance_loss_clip": 1.05901051, "balance_loss_mlp": 1.03504872, "epoch": 0.03210581692469563, "flos": 28399958081280.0, "grad_norm": 1.5133056646533105, "language_loss": 0.96741486, "learning_rate": 3.999953548056907e-06, "loss": 0.99012357, "num_input_tokens_seen": 11356820, "router_z_loss_clip": 1.1171875, "router_z_loss_mlp": 0.65625, "step": 534, "time_per_iteration": 2.5278666019439697 }, { "auxiliary_loss_clip": 0.01106248, "auxiliary_loss_mlp": 0.01194806, "balance_loss_clip": 1.06319904, "balance_loss_mlp": 1.03651822, "epoch": 0.03216594017736359, "flos": 24716551288320.0, "grad_norm": 1.7792757982625182, "language_loss": 0.90488517, "learning_rate": 3.999950855751232e-06, "loss": 0.92789572, "num_input_tokens_seen": 11376645, "router_z_loss_clip": 1.3203125, "router_z_loss_mlp": 0.6953125, "step": 535, "time_per_iteration": 2.557374954223633 }, { "auxiliary_loss_clip": 0.01104064, "auxiliary_loss_mlp": 0.01178931, "balance_loss_clip": 1.06353593, "balance_loss_mlp": 1.03565001, "epoch": 0.032226063430031565, "flos": 31174121669760.0, "grad_norm": 2.0280163740028647, "language_loss": 0.92853439, "learning_rate": 3.999948087607219e-06, "loss": 0.95136434, "num_input_tokens_seen": 11397310, "router_z_loss_clip": 1.15625, "router_z_loss_mlp": 0.68359375, "step": 536, "time_per_iteration": 2.5691068172454834 }, { "auxiliary_loss_clip": 0.01110484, "auxiliary_loss_mlp": 0.01188241, "balance_loss_clip": 1.0478605, "balance_loss_mlp": 1.03895593, "epoch": 0.03228618668269954, "flos": 32196832392960.0, "grad_norm": 1.5284148689318524, "language_loss": 0.83409375, "learning_rate": 3.999945243624975e-06, "loss": 0.85708106, "num_input_tokens_seen": 11418475, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.71484375, "step": 537, "time_per_iteration": 2.602902889251709 }, { "auxiliary_loss_clip": 0.01125477, "auxiliary_loss_mlp": 0.01154761, "balance_loss_clip": 1.02477503, "balance_loss_mlp": 1.04873323, "epoch": 0.0323463099353675, "flos": 22669174805760.0, "grad_norm": 1.6777262703084497, "language_loss": 0.9344542, "learning_rate": 3.999942323804607e-06, "loss": 0.95725667, "num_input_tokens_seen": 11436630, "router_z_loss_clip": 1.296875, "router_z_loss_mlp": 0.765625, "step": 538, "time_per_iteration": 2.4709980487823486 }, { "auxiliary_loss_clip": 0.01129259, "auxiliary_loss_mlp": 0.01194416, "balance_loss_clip": 1.04220963, "balance_loss_mlp": 1.05134463, "epoch": 0.032406433188035474, "flos": 26903999612160.0, "grad_norm": 1.6959415512883778, "language_loss": 0.8825298, "learning_rate": 3.999939328146225e-06, "loss": 0.90576661, "num_input_tokens_seen": 11457275, "router_z_loss_clip": 1.5234375, "router_z_loss_mlp": 0.77734375, "step": 539, "time_per_iteration": 2.5402956008911133 }, { "auxiliary_loss_clip": 0.01130928, "auxiliary_loss_mlp": 0.01210736, "balance_loss_clip": 1.06749415, "balance_loss_mlp": 1.05148911, "epoch": 0.03246655644070344, "flos": 31502584540800.0, "grad_norm": 1.6262645838172556, "language_loss": 0.87846148, "learning_rate": 3.999936256649943e-06, "loss": 0.90187812, "num_input_tokens_seen": 11476925, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.796875, "step": 540, "time_per_iteration": 2.535794734954834 }, { "auxiliary_loss_clip": 0.01133982, "auxiliary_loss_mlp": 0.01220164, "balance_loss_clip": 1.08293033, "balance_loss_mlp": 1.057127, "epoch": 0.03252667969337141, "flos": 23217311151360.0, "grad_norm": 1.7183399200697322, "language_loss": 0.97069895, "learning_rate": 3.999933109315878e-06, "loss": 0.99424052, "num_input_tokens_seen": 11496830, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.765625, "step": 541, "time_per_iteration": 2.5130021572113037 }, { "auxiliary_loss_clip": 0.01133363, "auxiliary_loss_mlp": 0.01260855, "balance_loss_clip": 1.10025585, "balance_loss_mlp": 1.05313075, "epoch": 0.032586802946039384, "flos": 14756563935360.0, "grad_norm": 1.757498706555251, "language_loss": 0.96902514, "learning_rate": 3.9999298861441496e-06, "loss": 0.99296731, "num_input_tokens_seen": 11515605, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.80078125, "step": 542, "time_per_iteration": 2.4482169151306152 }, { "auxiliary_loss_clip": 0.01130744, "auxiliary_loss_mlp": 0.01273974, "balance_loss_clip": 1.09029579, "balance_loss_mlp": 1.0473417, "epoch": 0.03264692619870735, "flos": 24279508488960.0, "grad_norm": 1.5747198311958899, "language_loss": 0.8122493, "learning_rate": 3.999926587134879e-06, "loss": 0.8362965, "num_input_tokens_seen": 11536230, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.8359375, "step": 543, "time_per_iteration": 2.5225167274475098 }, { "auxiliary_loss_clip": 0.01127858, "auxiliary_loss_mlp": 0.01273549, "balance_loss_clip": 1.0967381, "balance_loss_mlp": 1.04508126, "epoch": 0.03270704945137532, "flos": 22892060125440.0, "grad_norm": 1.9400828914122876, "language_loss": 1.05496657, "learning_rate": 3.999923212288192e-06, "loss": 1.07898068, "num_input_tokens_seen": 11554715, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.828125, "step": 544, "time_per_iteration": 2.475820302963257 }, { "auxiliary_loss_clip": 0.0112007, "auxiliary_loss_mlp": 0.01246125, "balance_loss_clip": 1.09677923, "balance_loss_mlp": 1.03863096, "epoch": 0.032767172704043286, "flos": 18040040570880.0, "grad_norm": 2.0253150590157976, "language_loss": 0.83212256, "learning_rate": 3.999919761604216e-06, "loss": 0.85578454, "num_input_tokens_seen": 11571370, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.8125, "step": 545, "time_per_iteration": 2.446779251098633 }, { "auxiliary_loss_clip": 0.01121591, "auxiliary_loss_mlp": 0.01239558, "balance_loss_clip": 1.09135747, "balance_loss_mlp": 1.03943312, "epoch": 0.03282729595671126, "flos": 22527636687360.0, "grad_norm": 1.8455715661743097, "language_loss": 1.02270365, "learning_rate": 3.999916235083083e-06, "loss": 1.04631519, "num_input_tokens_seen": 11588560, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.8203125, "step": 546, "time_per_iteration": 2.4760241508483887 }, { "auxiliary_loss_clip": 0.01116349, "auxiliary_loss_mlp": 0.01210784, "balance_loss_clip": 1.0752672, "balance_loss_mlp": 1.03537107, "epoch": 0.03288741920937923, "flos": 20409630791040.0, "grad_norm": 1.9035107248633822, "language_loss": 0.99299961, "learning_rate": 3.999912632724925e-06, "loss": 1.01627088, "num_input_tokens_seen": 11605685, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.80859375, "step": 547, "time_per_iteration": 2.4931209087371826 }, { "auxiliary_loss_clip": 0.01129005, "auxiliary_loss_mlp": 0.01232072, "balance_loss_clip": 1.07462072, "balance_loss_mlp": 1.04002047, "epoch": 0.032947542462047195, "flos": 20776916960640.0, "grad_norm": 1.5837130300785742, "language_loss": 0.94542706, "learning_rate": 3.999908954529881e-06, "loss": 0.96903789, "num_input_tokens_seen": 11626290, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.890625, "step": 548, "time_per_iteration": 2.4664671421051025 }, { "auxiliary_loss_clip": 0.01136493, "auxiliary_loss_mlp": 0.01196867, "balance_loss_clip": 1.04480338, "balance_loss_mlp": 1.04186535, "epoch": 0.03300766571471517, "flos": 19900247921280.0, "grad_norm": 1.8770912022006996, "language_loss": 0.86237824, "learning_rate": 3.999905200498087e-06, "loss": 0.88571191, "num_input_tokens_seen": 11643950, "router_z_loss_clip": 1.5234375, "router_z_loss_mlp": 0.9453125, "step": 549, "time_per_iteration": 2.501680374145508 }, { "auxiliary_loss_clip": 0.01139474, "auxiliary_loss_mlp": 0.01201756, "balance_loss_clip": 1.04039443, "balance_loss_mlp": 1.04646051, "epoch": 0.03306778896738313, "flos": 17966792805120.0, "grad_norm": 1.6231718541416202, "language_loss": 0.93893534, "learning_rate": 3.999901370629689e-06, "loss": 0.96234763, "num_input_tokens_seen": 11662560, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.9296875, "step": 550, "time_per_iteration": 2.4385581016540527 }, { "auxiliary_loss_clip": 0.01147395, "auxiliary_loss_mlp": 0.01172734, "balance_loss_clip": 1.03607237, "balance_loss_mlp": 1.05177951, "epoch": 0.033127912220051105, "flos": 21652294279680.0, "grad_norm": 1.4128984923864125, "language_loss": 0.86525953, "learning_rate": 3.99989746492483e-06, "loss": 0.88846081, "num_input_tokens_seen": 11682265, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.95703125, "step": 551, "time_per_iteration": 2.487537145614624 }, { "auxiliary_loss_clip": 0.01135096, "auxiliary_loss_mlp": 0.01197826, "balance_loss_clip": 1.0693661, "balance_loss_mlp": 1.04413843, "epoch": 0.03318803547271908, "flos": 30187127134080.0, "grad_norm": 3.182602773534329, "language_loss": 1.06004131, "learning_rate": 3.999893483383658e-06, "loss": 1.08337045, "num_input_tokens_seen": 11699300, "router_z_loss_clip": 1.28125, "router_z_loss_mlp": 0.91015625, "step": 552, "time_per_iteration": 2.5393924713134766 }, { "auxiliary_loss_clip": 0.01137627, "auxiliary_loss_mlp": 0.01204695, "balance_loss_clip": 1.06808162, "balance_loss_mlp": 1.04647756, "epoch": 0.03324815872538704, "flos": 20374996855680.0, "grad_norm": 1.9751188836200375, "language_loss": 1.04130173, "learning_rate": 3.999889426006326e-06, "loss": 1.06472492, "num_input_tokens_seen": 11716955, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.91015625, "step": 553, "time_per_iteration": 2.4983417987823486 }, { "auxiliary_loss_clip": 0.01151044, "auxiliary_loss_mlp": 0.01244142, "balance_loss_clip": 1.10261655, "balance_loss_mlp": 1.04873753, "epoch": 0.033308281978055014, "flos": 24493526323200.0, "grad_norm": 1.7742589723238658, "language_loss": 0.88081658, "learning_rate": 3.999885292792986e-06, "loss": 0.90476841, "num_input_tokens_seen": 11736130, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 1.0234375, "step": 554, "time_per_iteration": 2.4930005073547363 }, { "auxiliary_loss_clip": 0.01147339, "auxiliary_loss_mlp": 0.01247258, "balance_loss_clip": 1.10792613, "balance_loss_mlp": 1.04858994, "epoch": 0.03336840523072298, "flos": 23399313402240.0, "grad_norm": 2.4006587351921294, "language_loss": 0.90614688, "learning_rate": 3.999881083743795e-06, "loss": 0.93009281, "num_input_tokens_seen": 11754425, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.98828125, "step": 555, "time_per_iteration": 2.486687660217285 }, { "auxiliary_loss_clip": 0.01140244, "auxiliary_loss_mlp": 0.01243849, "balance_loss_clip": 1.11147952, "balance_loss_mlp": 1.04319668, "epoch": 0.03342852848339095, "flos": 30549386067840.0, "grad_norm": 1.9108890543740997, "language_loss": 1.01771629, "learning_rate": 3.999876798858914e-06, "loss": 1.04155731, "num_input_tokens_seen": 11772845, "router_z_loss_clip": 1.328125, "router_z_loss_mlp": 0.97265625, "step": 556, "time_per_iteration": 2.507275104522705 }, { "auxiliary_loss_clip": 0.01139649, "auxiliary_loss_mlp": 0.01233307, "balance_loss_clip": 1.08787167, "balance_loss_mlp": 1.04195929, "epoch": 0.03348865173605892, "flos": 22892199770880.0, "grad_norm": 1.797722026952748, "language_loss": 0.94498062, "learning_rate": 3.999872438138503e-06, "loss": 0.96871006, "num_input_tokens_seen": 11792850, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.9765625, "step": 557, "time_per_iteration": 2.526979684829712 }, { "auxiliary_loss_clip": 0.0112784, "auxiliary_loss_mlp": 0.01203832, "balance_loss_clip": 1.07918715, "balance_loss_mlp": 1.03668118, "epoch": 0.03354877498872689, "flos": 17675058551040.0, "grad_norm": 2.2127831879194377, "language_loss": 1.1035949, "learning_rate": 3.999868001582729e-06, "loss": 1.12691164, "num_input_tokens_seen": 11809670, "router_z_loss_clip": 1.25, "router_z_loss_mlp": 0.91015625, "step": 558, "time_per_iteration": 2.4273765087127686 }, { "auxiliary_loss_clip": 0.01129122, "auxiliary_loss_mlp": 0.01196892, "balance_loss_clip": 1.05517602, "balance_loss_mlp": 1.03869724, "epoch": 0.03360889824139486, "flos": 21651910254720.0, "grad_norm": 1.8722720853276784, "language_loss": 0.92296618, "learning_rate": 3.99986348919176e-06, "loss": 0.9462263, "num_input_tokens_seen": 11829665, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.90234375, "step": 559, "time_per_iteration": 2.5072474479675293 }, { "auxiliary_loss_clip": 0.01119332, "auxiliary_loss_mlp": 0.01177146, "balance_loss_clip": 1.044204, "balance_loss_mlp": 1.03072166, "epoch": 0.033669021494062826, "flos": 21794740093440.0, "grad_norm": 1.5855640139217508, "language_loss": 0.9336009, "learning_rate": 3.9998589009657675e-06, "loss": 0.95656562, "num_input_tokens_seen": 11848190, "router_z_loss_clip": 1.328125, "router_z_loss_mlp": 0.88671875, "step": 560, "time_per_iteration": 2.4519424438476562 }, { "auxiliary_loss_clip": 0.0111538, "auxiliary_loss_mlp": 0.0118726, "balance_loss_clip": 1.04315996, "balance_loss_mlp": 1.03105402, "epoch": 0.0337291447467308, "flos": 21865299684480.0, "grad_norm": 1.828883306704858, "language_loss": 0.90188801, "learning_rate": 3.999854236904925e-06, "loss": 0.92491442, "num_input_tokens_seen": 11864795, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.84375, "step": 561, "time_per_iteration": 2.483736515045166 }, { "auxiliary_loss_clip": 0.01116414, "auxiliary_loss_mlp": 0.01176057, "balance_loss_clip": 1.04025352, "balance_loss_mlp": 1.03113675, "epoch": 0.03378926799939877, "flos": 24244734908160.0, "grad_norm": 1.53276061490754, "language_loss": 0.88441819, "learning_rate": 3.999849497009409e-06, "loss": 0.90734291, "num_input_tokens_seen": 11885275, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.8515625, "step": 562, "time_per_iteration": 2.4966485500335693 }, { "auxiliary_loss_clip": 0.0111902, "auxiliary_loss_mlp": 0.01173969, "balance_loss_clip": 1.04722595, "balance_loss_mlp": 1.03180313, "epoch": 0.033849391252066735, "flos": 16506899637120.0, "grad_norm": 1.7433612505801317, "language_loss": 0.92252147, "learning_rate": 3.999844681279401e-06, "loss": 0.94545144, "num_input_tokens_seen": 11903595, "router_z_loss_clip": 1.265625, "router_z_loss_mlp": 0.87109375, "step": 563, "time_per_iteration": 2.455982208251953 }, { "auxiliary_loss_clip": 0.01118452, "auxiliary_loss_mlp": 0.01153023, "balance_loss_clip": 1.02990413, "balance_loss_mlp": 1.0319984, "epoch": 0.03390951450473471, "flos": 15668390580480.0, "grad_norm": 1.8861857747296413, "language_loss": 0.99484402, "learning_rate": 3.99983978971508e-06, "loss": 1.01755869, "num_input_tokens_seen": 11917815, "router_z_loss_clip": 1.234375, "router_z_loss_mlp": 0.8671875, "step": 564, "time_per_iteration": 2.4077341556549072 }, { "auxiliary_loss_clip": 0.01117822, "auxiliary_loss_mlp": 0.01146901, "balance_loss_clip": 1.02917027, "balance_loss_mlp": 1.03201461, "epoch": 0.03396963775740267, "flos": 22673678371200.0, "grad_norm": 1.9683834492964154, "language_loss": 1.03429043, "learning_rate": 3.999834822316635e-06, "loss": 1.05693769, "num_input_tokens_seen": 11936305, "router_z_loss_clip": 1.1796875, "router_z_loss_mlp": 0.859375, "step": 565, "time_per_iteration": 2.4581360816955566 }, { "auxiliary_loss_clip": 0.01048328, "auxiliary_loss_mlp": 0.01045479, "balance_loss_clip": 1.02535641, "balance_loss_mlp": 1.02438474, "epoch": 0.034029761010070644, "flos": 64388984797440.0, "grad_norm": 0.9766516319306671, "language_loss": 0.55027646, "learning_rate": 3.9998297790842535e-06, "loss": 0.57121456, "num_input_tokens_seen": 11998940, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.24023438, "step": 566, "time_per_iteration": 3.1307201385498047 }, { "auxiliary_loss_clip": 0.0112585, "auxiliary_loss_mlp": 0.0116581, "balance_loss_clip": 1.04607654, "balance_loss_mlp": 1.03932071, "epoch": 0.034089884262738616, "flos": 25003188483840.0, "grad_norm": 1.8891349744273729, "language_loss": 0.89690077, "learning_rate": 3.999824660018126e-06, "loss": 0.91981733, "num_input_tokens_seen": 12018860, "router_z_loss_clip": 1.1953125, "router_z_loss_mlp": 0.8671875, "step": 567, "time_per_iteration": 2.4997751712799072 }, { "auxiliary_loss_clip": 0.01119206, "auxiliary_loss_mlp": 0.01204005, "balance_loss_clip": 1.07907414, "balance_loss_mlp": 1.0362916, "epoch": 0.03415000751540658, "flos": 28437838773120.0, "grad_norm": 1.6184569789964445, "language_loss": 0.88866186, "learning_rate": 3.999819465118447e-06, "loss": 0.91189396, "num_input_tokens_seen": 12039675, "router_z_loss_clip": 1.25, "router_z_loss_mlp": 0.828125, "step": 568, "time_per_iteration": 2.502458095550537 }, { "auxiliary_loss_clip": 0.01127194, "auxiliary_loss_mlp": 0.01221816, "balance_loss_clip": 1.09435725, "balance_loss_mlp": 1.04144526, "epoch": 0.034210130768074554, "flos": 21467708588160.0, "grad_norm": 1.5547682955162685, "language_loss": 0.94873005, "learning_rate": 3.999814194385413e-06, "loss": 0.97222018, "num_input_tokens_seen": 12057680, "router_z_loss_clip": 1.2734375, "router_z_loss_mlp": 0.859375, "step": 569, "time_per_iteration": 3.972914457321167 }, { "auxiliary_loss_clip": 0.01133259, "auxiliary_loss_mlp": 0.01244152, "balance_loss_clip": 1.1077764, "balance_loss_mlp": 1.03843021, "epoch": 0.03427025402074252, "flos": 18696512465280.0, "grad_norm": 1.5575524223318786, "language_loss": 1.0156033, "learning_rate": 3.9998088478192255e-06, "loss": 1.03937745, "num_input_tokens_seen": 12076135, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.94921875, "step": 570, "time_per_iteration": 2.4375224113464355 }, { "auxiliary_loss_clip": 0.0112952, "auxiliary_loss_mlp": 0.01186745, "balance_loss_clip": 1.06143188, "balance_loss_mlp": 1.03812456, "epoch": 0.03433037727341049, "flos": 20848942828800.0, "grad_norm": 1.913000886526044, "language_loss": 0.90328473, "learning_rate": 3.9998034254200846e-06, "loss": 0.92644733, "num_input_tokens_seen": 12094785, "router_z_loss_clip": 1.25, "router_z_loss_mlp": 0.9140625, "step": 571, "time_per_iteration": 2.452617645263672 }, { "auxiliary_loss_clip": 0.01118482, "auxiliary_loss_mlp": 0.01176553, "balance_loss_clip": 1.04837918, "balance_loss_mlp": 1.03448927, "epoch": 0.03439050052607846, "flos": 25409123395200.0, "grad_norm": 1.876046048035974, "language_loss": 0.91276026, "learning_rate": 3.999797927188199e-06, "loss": 0.93571061, "num_input_tokens_seen": 12114590, "router_z_loss_clip": 1.28125, "router_z_loss_mlp": 0.8359375, "step": 572, "time_per_iteration": 6.690124273300171 }, { "auxiliary_loss_clip": 0.01123119, "auxiliary_loss_mlp": 0.01152595, "balance_loss_clip": 1.03438687, "balance_loss_mlp": 1.03611279, "epoch": 0.03445062377874643, "flos": 17639167806720.0, "grad_norm": 1.659871319825706, "language_loss": 0.92387295, "learning_rate": 3.999792353123774e-06, "loss": 0.94663006, "num_input_tokens_seen": 12132390, "router_z_loss_clip": 1.1875, "router_z_loss_mlp": 0.8671875, "step": 573, "time_per_iteration": 2.439091682434082 }, { "auxiliary_loss_clip": 0.01119368, "auxiliary_loss_mlp": 0.01162519, "balance_loss_clip": 1.03997183, "balance_loss_mlp": 1.03149724, "epoch": 0.0345107470314144, "flos": 16763546108160.0, "grad_norm": 1.893990148939269, "language_loss": 0.91562116, "learning_rate": 3.999786703227023e-06, "loss": 0.93844008, "num_input_tokens_seen": 12149035, "router_z_loss_clip": 1.2265625, "router_z_loss_mlp": 0.87890625, "step": 574, "time_per_iteration": 2.4105968475341797 }, { "auxiliary_loss_clip": 0.01122081, "auxiliary_loss_mlp": 0.01211753, "balance_loss_clip": 1.0810045, "balance_loss_mlp": 1.03274226, "epoch": 0.03457087028408237, "flos": 14683560549120.0, "grad_norm": 1.9178032909202491, "language_loss": 0.94081104, "learning_rate": 3.9997809774981606e-06, "loss": 0.96414936, "num_input_tokens_seen": 12167530, "router_z_loss_clip": 1.3125, "router_z_loss_mlp": 0.890625, "step": 575, "time_per_iteration": 2.4481594562530518 }, { "auxiliary_loss_clip": 0.0111962, "auxiliary_loss_mlp": 0.01212054, "balance_loss_clip": 1.08931613, "balance_loss_mlp": 1.03160977, "epoch": 0.03463099353675034, "flos": 20010259215360.0, "grad_norm": 1.928200201612274, "language_loss": 0.89894342, "learning_rate": 3.9997751759374025e-06, "loss": 0.92226011, "num_input_tokens_seen": 12186340, "router_z_loss_clip": 1.2265625, "router_z_loss_mlp": 0.8828125, "step": 576, "time_per_iteration": 2.4339497089385986 }, { "auxiliary_loss_clip": 0.01124952, "auxiliary_loss_mlp": 0.01221854, "balance_loss_clip": 1.08624172, "balance_loss_mlp": 1.03464127, "epoch": 0.03469111678941831, "flos": 25299984885120.0, "grad_norm": 1.822266975770665, "language_loss": 0.92172527, "learning_rate": 3.99976929854497e-06, "loss": 0.94519341, "num_input_tokens_seen": 12204090, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.90625, "step": 577, "time_per_iteration": 2.5071544647216797 }, { "auxiliary_loss_clip": 0.01129671, "auxiliary_loss_mlp": 0.01199416, "balance_loss_clip": 1.05359983, "balance_loss_mlp": 1.03464389, "epoch": 0.034751240042086275, "flos": 23258264042880.0, "grad_norm": 1.7449173582018862, "language_loss": 0.79331028, "learning_rate": 3.9997633453210845e-06, "loss": 0.81660116, "num_input_tokens_seen": 12224850, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.94921875, "step": 578, "time_per_iteration": 2.464688777923584 }, { "auxiliary_loss_clip": 0.01135186, "auxiliary_loss_mlp": 0.01175142, "balance_loss_clip": 1.02875292, "balance_loss_mlp": 1.03696179, "epoch": 0.03481136329475425, "flos": 23768100760320.0, "grad_norm": 1.6107996176960915, "language_loss": 0.83964503, "learning_rate": 3.999757316265973e-06, "loss": 0.86274827, "num_input_tokens_seen": 12244935, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.984375, "step": 579, "time_per_iteration": 2.4916505813598633 }, { "auxiliary_loss_clip": 0.01126882, "auxiliary_loss_mlp": 0.01144654, "balance_loss_clip": 1.02067578, "balance_loss_mlp": 1.0336169, "epoch": 0.03487148654742222, "flos": 20156475456000.0, "grad_norm": 1.7741074525628746, "language_loss": 0.93820435, "learning_rate": 3.999751211379863e-06, "loss": 0.96091962, "num_input_tokens_seen": 12262140, "router_z_loss_clip": 1.2421875, "router_z_loss_mlp": 0.9296875, "step": 580, "time_per_iteration": 2.509655237197876 }, { "auxiliary_loss_clip": 0.01130375, "auxiliary_loss_mlp": 0.01149336, "balance_loss_clip": 1.02726579, "balance_loss_mlp": 1.03658557, "epoch": 0.034931609800090184, "flos": 15668669871360.0, "grad_norm": 2.069002442033303, "language_loss": 0.94589162, "learning_rate": 3.999745030662987e-06, "loss": 0.96868879, "num_input_tokens_seen": 12280930, "router_z_loss_clip": 1.21875, "router_z_loss_mlp": 0.9375, "step": 581, "time_per_iteration": 2.461562156677246 }, { "auxiliary_loss_clip": 0.01138039, "auxiliary_loss_mlp": 0.01180326, "balance_loss_clip": 1.04890943, "balance_loss_mlp": 1.03759456, "epoch": 0.034991733052758156, "flos": 16361451446400.0, "grad_norm": 1.7015709136785961, "language_loss": 0.84837043, "learning_rate": 3.99973877411558e-06, "loss": 0.87155414, "num_input_tokens_seen": 12299125, "router_z_loss_clip": 1.3125, "router_z_loss_mlp": 1.0078125, "step": 582, "time_per_iteration": 2.410998582839966 }, { "auxiliary_loss_clip": 0.01135532, "auxiliary_loss_mlp": 0.0118301, "balance_loss_clip": 1.05502641, "balance_loss_mlp": 1.03843379, "epoch": 0.03505185630542612, "flos": 19386396397440.0, "grad_norm": 1.6203561778939457, "language_loss": 0.93527448, "learning_rate": 3.999732441737877e-06, "loss": 0.95845991, "num_input_tokens_seen": 12316905, "router_z_loss_clip": 1.28125, "router_z_loss_mlp": 0.96875, "step": 583, "time_per_iteration": 2.470585823059082 }, { "auxiliary_loss_clip": 0.01133185, "auxiliary_loss_mlp": 0.01170752, "balance_loss_clip": 1.05306816, "balance_loss_mlp": 1.03542304, "epoch": 0.03511197955809409, "flos": 21322784067840.0, "grad_norm": 1.9805693861181615, "language_loss": 0.92181969, "learning_rate": 3.99972603353012e-06, "loss": 0.94485909, "num_input_tokens_seen": 12335070, "router_z_loss_clip": 1.171875, "router_z_loss_mlp": 0.9765625, "step": 584, "time_per_iteration": 2.529553174972534 }, { "auxiliary_loss_clip": 0.01136966, "auxiliary_loss_mlp": 0.01184859, "balance_loss_clip": 1.05573189, "balance_loss_mlp": 1.03474641, "epoch": 0.035172102810762065, "flos": 14135738405760.0, "grad_norm": 2.4129239320597478, "language_loss": 1.06750739, "learning_rate": 3.999719549492551e-06, "loss": 1.09072566, "num_input_tokens_seen": 12350315, "router_z_loss_clip": 1.2890625, "router_z_loss_mlp": 1.0234375, "step": 585, "time_per_iteration": 2.4398443698883057 }, { "auxiliary_loss_clip": 0.01135905, "auxiliary_loss_mlp": 0.01191041, "balance_loss_clip": 1.05523801, "balance_loss_mlp": 1.03748393, "epoch": 0.03523222606343003, "flos": 20296023626880.0, "grad_norm": 2.0376822303572695, "language_loss": 0.96939135, "learning_rate": 3.9997129896254165e-06, "loss": 0.99266076, "num_input_tokens_seen": 12366030, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.984375, "step": 586, "time_per_iteration": 2.4211606979370117 }, { "auxiliary_loss_clip": 0.01139563, "auxiliary_loss_mlp": 0.01180904, "balance_loss_clip": 1.0544467, "balance_loss_mlp": 1.03403854, "epoch": 0.035292349316098, "flos": 20374787387520.0, "grad_norm": 1.6885230957616433, "language_loss": 0.8697657, "learning_rate": 3.999706353928965e-06, "loss": 0.89297038, "num_input_tokens_seen": 12384895, "router_z_loss_clip": 1.265625, "router_z_loss_mlp": 1.0546875, "step": 587, "time_per_iteration": 2.445727825164795 }, { "auxiliary_loss_clip": 0.01135822, "auxiliary_loss_mlp": 0.01178345, "balance_loss_clip": 1.04397202, "balance_loss_mlp": 1.03322887, "epoch": 0.03535247256876597, "flos": 21467848233600.0, "grad_norm": 1.4798219771882806, "language_loss": 0.87201285, "learning_rate": 3.999699642403449e-06, "loss": 0.89515448, "num_input_tokens_seen": 12404980, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 1.03125, "step": 588, "time_per_iteration": 2.4359219074249268 }, { "auxiliary_loss_clip": 0.01135184, "auxiliary_loss_mlp": 0.01179778, "balance_loss_clip": 1.03214931, "balance_loss_mlp": 1.03024328, "epoch": 0.03541259582143394, "flos": 23621919431040.0, "grad_norm": 1.8870066143249948, "language_loss": 1.05879104, "learning_rate": 3.99969285504912e-06, "loss": 1.08194065, "num_input_tokens_seen": 12423835, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 1.046875, "step": 589, "time_per_iteration": 2.453648090362549 }, { "auxiliary_loss_clip": 0.01131056, "auxiliary_loss_mlp": 0.01162723, "balance_loss_clip": 1.02424955, "balance_loss_mlp": 1.03055692, "epoch": 0.03547271907410191, "flos": 33725050824960.0, "grad_norm": 1.9168849169932591, "language_loss": 0.92424816, "learning_rate": 3.99968599186624e-06, "loss": 0.94718587, "num_input_tokens_seen": 12443135, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 1.0, "step": 590, "time_per_iteration": 2.548443555831909 }, { "auxiliary_loss_clip": 0.01133026, "auxiliary_loss_mlp": 0.0116205, "balance_loss_clip": 1.02958465, "balance_loss_mlp": 1.0304544, "epoch": 0.03553284232676988, "flos": 21141619689600.0, "grad_norm": 1.904244731809144, "language_loss": 0.95059144, "learning_rate": 3.999679052855065e-06, "loss": 0.97354221, "num_input_tokens_seen": 12462895, "router_z_loss_clip": 1.328125, "router_z_loss_mlp": 1.0234375, "step": 591, "time_per_iteration": 2.45732045173645 }, { "auxiliary_loss_clip": 0.01135159, "auxiliary_loss_mlp": 0.01178403, "balance_loss_clip": 1.03830838, "balance_loss_mlp": 1.03139913, "epoch": 0.03559296557943785, "flos": 20045591377920.0, "grad_norm": 1.662101568159591, "language_loss": 0.91738057, "learning_rate": 3.999672038015861e-06, "loss": 0.94051623, "num_input_tokens_seen": 12481515, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 1.0390625, "step": 592, "time_per_iteration": 2.433211088180542 }, { "auxiliary_loss_clip": 0.01044255, "auxiliary_loss_mlp": 0.01103515, "balance_loss_clip": 1.08234298, "balance_loss_mlp": 1.02112067, "epoch": 0.035653088832105814, "flos": 60331239740160.0, "grad_norm": 0.9100897362399614, "language_loss": 0.59922636, "learning_rate": 3.999664947348893e-06, "loss": 0.62070405, "num_input_tokens_seen": 12548220, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.23046875, "step": 593, "time_per_iteration": 3.0719518661499023 }, { "auxiliary_loss_clip": 0.01139564, "auxiliary_loss_mlp": 0.01209852, "balance_loss_clip": 1.06346262, "balance_loss_mlp": 1.03209782, "epoch": 0.035713212084773786, "flos": 20112310719360.0, "grad_norm": 1.7407052258697056, "language_loss": 0.93002135, "learning_rate": 3.999657780854429e-06, "loss": 0.95351553, "num_input_tokens_seen": 12566105, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 1.078125, "step": 594, "time_per_iteration": 2.4300620555877686 }, { "auxiliary_loss_clip": 0.01143001, "auxiliary_loss_mlp": 0.0122224, "balance_loss_clip": 1.08214498, "balance_loss_mlp": 1.03211153, "epoch": 0.03577333533744176, "flos": 26284605448320.0, "grad_norm": 1.821200500500637, "language_loss": 0.91260576, "learning_rate": 3.999650538532742e-06, "loss": 0.93625814, "num_input_tokens_seen": 12586680, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 1.109375, "step": 595, "time_per_iteration": 2.499069929122925 }, { "auxiliary_loss_clip": 0.01140779, "auxiliary_loss_mlp": 0.01232191, "balance_loss_clip": 1.08074725, "balance_loss_mlp": 1.03046095, "epoch": 0.035833458590109724, "flos": 10888955475840.0, "grad_norm": 2.2216642179293693, "language_loss": 1.07016993, "learning_rate": 3.999643220384106e-06, "loss": 1.09389973, "num_input_tokens_seen": 12601605, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 1.1015625, "step": 596, "time_per_iteration": 2.397531270980835 }, { "auxiliary_loss_clip": 0.01148663, "auxiliary_loss_mlp": 0.0121052, "balance_loss_clip": 1.07700562, "balance_loss_mlp": 1.03398108, "epoch": 0.035893581842777696, "flos": 22089127610880.0, "grad_norm": 2.1432945611476892, "language_loss": 0.92265987, "learning_rate": 3.999635826408799e-06, "loss": 0.94625169, "num_input_tokens_seen": 12620365, "router_z_loss_clip": 1.3359375, "router_z_loss_mlp": 1.1484375, "step": 597, "time_per_iteration": 2.4649407863616943 }, { "auxiliary_loss_clip": 0.01136751, "auxiliary_loss_mlp": 0.01218642, "balance_loss_clip": 1.07482839, "balance_loss_mlp": 1.030936, "epoch": 0.03595370509544566, "flos": 23037263936640.0, "grad_norm": 1.6013077074951683, "language_loss": 0.858154, "learning_rate": 3.999628356607101e-06, "loss": 0.88170791, "num_input_tokens_seen": 12641140, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 1.0546875, "step": 598, "time_per_iteration": 2.4549970626831055 }, { "auxiliary_loss_clip": 0.01131057, "auxiliary_loss_mlp": 0.01165755, "balance_loss_clip": 1.0394882, "balance_loss_mlp": 1.03080678, "epoch": 0.03601382834811363, "flos": 20776672581120.0, "grad_norm": 1.646228355223748, "language_loss": 0.86395174, "learning_rate": 3.999620810979295e-06, "loss": 0.88691986, "num_input_tokens_seen": 12661080, "router_z_loss_clip": 1.265625, "router_z_loss_mlp": 1.0078125, "step": 599, "time_per_iteration": 2.4681897163391113 }, { "auxiliary_loss_clip": 0.01152795, "auxiliary_loss_mlp": 0.01159652, "balance_loss_clip": 1.03367186, "balance_loss_mlp": 1.0365355, "epoch": 0.036073951600781605, "flos": 23950487036160.0, "grad_norm": 1.9726730442747542, "language_loss": 0.96328104, "learning_rate": 3.999613189525668e-06, "loss": 0.98640549, "num_input_tokens_seen": 12678270, "router_z_loss_clip": 1.2578125, "router_z_loss_mlp": 1.1640625, "step": 600, "time_per_iteration": 2.4512691497802734 }, { "auxiliary_loss_clip": 0.01140167, "auxiliary_loss_mlp": 0.01149151, "balance_loss_clip": 1.02765274, "balance_loss_mlp": 1.03376889, "epoch": 0.03613407485344957, "flos": 18911403083520.0, "grad_norm": 1.583812149546343, "language_loss": 0.88016117, "learning_rate": 3.999605492246508e-06, "loss": 0.90305436, "num_input_tokens_seen": 12697295, "router_z_loss_clip": 1.2109375, "router_z_loss_mlp": 1.0625, "step": 601, "time_per_iteration": 2.4613654613494873 }, { "auxiliary_loss_clip": 0.01137278, "auxiliary_loss_mlp": 0.01157257, "balance_loss_clip": 1.03847671, "balance_loss_mlp": 1.03210258, "epoch": 0.03619419810611754, "flos": 23037438493440.0, "grad_norm": 2.1863642945842585, "language_loss": 0.84624588, "learning_rate": 3.999597719142107e-06, "loss": 0.86919129, "num_input_tokens_seen": 12716165, "router_z_loss_clip": 1.1875, "router_z_loss_mlp": 1.0546875, "step": 602, "time_per_iteration": 2.4436893463134766 }, { "auxiliary_loss_clip": 0.01146275, "auxiliary_loss_mlp": 0.01188324, "balance_loss_clip": 1.05767083, "balance_loss_mlp": 1.03426147, "epoch": 0.03625432135878551, "flos": 29456569601280.0, "grad_norm": 1.6668777311786453, "language_loss": 0.85847187, "learning_rate": 3.999589870212761e-06, "loss": 0.88181782, "num_input_tokens_seen": 12735475, "router_z_loss_clip": 1.3046875, "router_z_loss_mlp": 1.1171875, "step": 603, "time_per_iteration": 2.5141797065734863 }, { "auxiliary_loss_clip": 0.01137624, "auxiliary_loss_mlp": 0.01174297, "balance_loss_clip": 1.06252623, "balance_loss_mlp": 1.03375828, "epoch": 0.03631444461145348, "flos": 23507544216960.0, "grad_norm": 1.7633756512242336, "language_loss": 0.92174476, "learning_rate": 3.9995819454587664e-06, "loss": 0.94486403, "num_input_tokens_seen": 12754540, "router_z_loss_clip": 1.125, "router_z_loss_mlp": 1.0390625, "step": 604, "time_per_iteration": 2.4520962238311768 }, { "auxiliary_loss_clip": 0.01145165, "auxiliary_loss_mlp": 0.01195868, "balance_loss_clip": 1.06769419, "balance_loss_mlp": 1.03581345, "epoch": 0.03637456786412145, "flos": 16617190222080.0, "grad_norm": 1.9060720903789012, "language_loss": 0.90218383, "learning_rate": 3.999573944880424e-06, "loss": 0.92559421, "num_input_tokens_seen": 12773050, "router_z_loss_clip": 1.28125, "router_z_loss_mlp": 1.09375, "step": 605, "time_per_iteration": 2.432851552963257 }, { "auxiliary_loss_clip": 0.01145378, "auxiliary_loss_mlp": 0.01164865, "balance_loss_clip": 1.05342865, "balance_loss_mlp": 1.03662348, "epoch": 0.03643469111678942, "flos": 15850916501760.0, "grad_norm": 2.189037031962612, "language_loss": 0.94489419, "learning_rate": 3.9995658684780375e-06, "loss": 0.9679966, "num_input_tokens_seen": 12791240, "router_z_loss_clip": 1.1171875, "router_z_loss_mlp": 1.09375, "step": 606, "time_per_iteration": 2.4093234539031982 }, { "auxiliary_loss_clip": 0.0114942, "auxiliary_loss_mlp": 0.01181308, "balance_loss_clip": 1.04769862, "balance_loss_mlp": 1.03741181, "epoch": 0.03649481436945739, "flos": 23619335990400.0, "grad_norm": 2.2750634880678406, "language_loss": 0.88896465, "learning_rate": 3.999557716251912e-06, "loss": 0.91227192, "num_input_tokens_seen": 12812245, "router_z_loss_clip": 1.3359375, "router_z_loss_mlp": 1.125, "step": 607, "time_per_iteration": 2.5047388076782227 }, { "auxiliary_loss_clip": 0.01139677, "auxiliary_loss_mlp": 0.01156641, "balance_loss_clip": 1.04191446, "balance_loss_mlp": 1.03562021, "epoch": 0.036554937622125354, "flos": 21754694897280.0, "grad_norm": 2.3149950976431146, "language_loss": 0.88564229, "learning_rate": 3.999549488202358e-06, "loss": 0.90860552, "num_input_tokens_seen": 12831085, "router_z_loss_clip": 1.1484375, "router_z_loss_mlp": 1.046875, "step": 608, "time_per_iteration": 2.426285743713379 }, { "auxiliary_loss_clip": 0.01136598, "auxiliary_loss_mlp": 0.01134869, "balance_loss_clip": 1.02853394, "balance_loss_mlp": 1.0341115, "epoch": 0.036615060874793326, "flos": 17818865907840.0, "grad_norm": 1.859383743505045, "language_loss": 0.88021719, "learning_rate": 3.999541184329688e-06, "loss": 0.90293187, "num_input_tokens_seen": 12849115, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 1.0234375, "step": 609, "time_per_iteration": 3.864445686340332 }, { "auxiliary_loss_clip": 0.01143784, "auxiliary_loss_mlp": 0.01141999, "balance_loss_clip": 1.03566492, "balance_loss_mlp": 1.03638268, "epoch": 0.0366751841274613, "flos": 26752791047040.0, "grad_norm": 1.900608640244239, "language_loss": 0.86938202, "learning_rate": 3.999532804634215e-06, "loss": 0.89223981, "num_input_tokens_seen": 12868005, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 1.078125, "step": 610, "time_per_iteration": 2.475282669067383 }, { "auxiliary_loss_clip": 0.01145349, "auxiliary_loss_mlp": 0.0113481, "balance_loss_clip": 1.02180004, "balance_loss_mlp": 1.03796923, "epoch": 0.03673530738012926, "flos": 22195961971200.0, "grad_norm": 1.855841745920249, "language_loss": 0.94348156, "learning_rate": 3.9995243491162575e-06, "loss": 0.9662832, "num_input_tokens_seen": 12886890, "router_z_loss_clip": 1.125, "router_z_loss_mlp": 1.078125, "step": 611, "time_per_iteration": 5.377411603927612 }, { "auxiliary_loss_clip": 0.01141231, "auxiliary_loss_mlp": 0.01142037, "balance_loss_clip": 1.02416253, "balance_loss_mlp": 1.03334546, "epoch": 0.036795430632797235, "flos": 24680485987200.0, "grad_norm": 1.8377996266837493, "language_loss": 0.80533373, "learning_rate": 3.999515817776136e-06, "loss": 0.82816648, "num_input_tokens_seen": 12906130, "router_z_loss_clip": 1.1796875, "router_z_loss_mlp": 1.078125, "step": 612, "time_per_iteration": 3.8703882694244385 }, { "auxiliary_loss_clip": 0.01144723, "auxiliary_loss_mlp": 0.01135816, "balance_loss_clip": 1.02681112, "balance_loss_mlp": 1.03488398, "epoch": 0.0368555538854652, "flos": 17747957203200.0, "grad_norm": 2.502692943427083, "language_loss": 0.88265264, "learning_rate": 3.999507210614175e-06, "loss": 0.90545809, "num_input_tokens_seen": 12925260, "router_z_loss_clip": 1.09375, "router_z_loss_mlp": 1.09375, "step": 613, "time_per_iteration": 2.447810649871826 }, { "auxiliary_loss_clip": 0.01136952, "auxiliary_loss_mlp": 0.01146764, "balance_loss_clip": 1.03156042, "balance_loss_mlp": 1.0319196, "epoch": 0.03691567713813317, "flos": 20593518255360.0, "grad_norm": 1.6746319516059052, "language_loss": 0.99686742, "learning_rate": 3.9994985276307e-06, "loss": 1.01970458, "num_input_tokens_seen": 12944590, "router_z_loss_clip": 1.1484375, "router_z_loss_mlp": 1.0546875, "step": 614, "time_per_iteration": 2.4973056316375732 }, { "auxiliary_loss_clip": 0.01143539, "auxiliary_loss_mlp": 0.01138837, "balance_loss_clip": 1.02945089, "balance_loss_mlp": 1.03434527, "epoch": 0.036975800390801145, "flos": 33649149795840.0, "grad_norm": 2.3394643251372016, "language_loss": 0.84657586, "learning_rate": 3.999489768826041e-06, "loss": 0.86939967, "num_input_tokens_seen": 12964785, "router_z_loss_clip": 1.09375, "router_z_loss_mlp": 1.09375, "step": 615, "time_per_iteration": 2.550053119659424 }, { "auxiliary_loss_clip": 0.01149275, "auxiliary_loss_mlp": 0.01149109, "balance_loss_clip": 1.02880275, "balance_loss_mlp": 1.03408265, "epoch": 0.03703592364346911, "flos": 28292425493760.0, "grad_norm": 1.7307014896371944, "language_loss": 0.88484526, "learning_rate": 3.999480934200528e-06, "loss": 0.90782917, "num_input_tokens_seen": 12986705, "router_z_loss_clip": 1.203125, "router_z_loss_mlp": 1.15625, "step": 616, "time_per_iteration": 2.5172057151794434 }, { "auxiliary_loss_clip": 0.01142722, "auxiliary_loss_mlp": 0.01148217, "balance_loss_clip": 1.02605176, "balance_loss_mlp": 1.0331043, "epoch": 0.03709604689613708, "flos": 31502863831680.0, "grad_norm": 2.0226835265390304, "language_loss": 0.7443614, "learning_rate": 3.999472023754499e-06, "loss": 0.76727086, "num_input_tokens_seen": 13010560, "router_z_loss_clip": 1.21875, "router_z_loss_mlp": 1.09375, "step": 617, "time_per_iteration": 2.5514779090881348 }, { "auxiliary_loss_clip": 0.01144035, "auxiliary_loss_mlp": 0.01153274, "balance_loss_clip": 1.03206182, "balance_loss_mlp": 1.03471375, "epoch": 0.03715617014880505, "flos": 19608374021760.0, "grad_norm": 1.9091637159595967, "language_loss": 0.87663281, "learning_rate": 3.99946303748829e-06, "loss": 0.89960587, "num_input_tokens_seen": 13028935, "router_z_loss_clip": 1.2109375, "router_z_loss_mlp": 1.09375, "step": 618, "time_per_iteration": 2.4170682430267334 }, { "auxiliary_loss_clip": 0.01141964, "auxiliary_loss_mlp": 0.01132541, "balance_loss_clip": 1.0255388, "balance_loss_mlp": 1.03270066, "epoch": 0.03721629340147302, "flos": 15923291483520.0, "grad_norm": 2.2025431207740165, "language_loss": 0.97853041, "learning_rate": 3.999453975402242e-06, "loss": 1.00127554, "num_input_tokens_seen": 13046000, "router_z_loss_clip": 1.0703125, "router_z_loss_mlp": 1.09375, "step": 619, "time_per_iteration": 2.4366865158081055 }, { "auxiliary_loss_clip": 0.01144644, "auxiliary_loss_mlp": 0.01140787, "balance_loss_clip": 1.02963603, "balance_loss_mlp": 1.03562737, "epoch": 0.03727641665414099, "flos": 21103075681920.0, "grad_norm": 2.922265213518912, "language_loss": 1.00434625, "learning_rate": 3.9994448374967e-06, "loss": 1.02720046, "num_input_tokens_seen": 13062995, "router_z_loss_clip": 1.109375, "router_z_loss_mlp": 1.09375, "step": 620, "time_per_iteration": 2.423802614212036 }, { "auxiliary_loss_clip": 0.01147724, "auxiliary_loss_mlp": 0.01143387, "balance_loss_clip": 1.03047216, "balance_loss_mlp": 1.03511846, "epoch": 0.037336539906808956, "flos": 24130604073600.0, "grad_norm": 1.7414318203517491, "language_loss": 0.82710987, "learning_rate": 3.999435623772008e-06, "loss": 0.850021, "num_input_tokens_seen": 13084120, "router_z_loss_clip": 1.1328125, "router_z_loss_mlp": 1.125, "step": 621, "time_per_iteration": 2.4973435401916504 }, { "auxiliary_loss_clip": 0.01146479, "auxiliary_loss_mlp": 0.01140138, "balance_loss_clip": 1.03375554, "balance_loss_mlp": 1.03534997, "epoch": 0.03739666315947693, "flos": 22345285322880.0, "grad_norm": 2.2279396189109626, "language_loss": 0.92843008, "learning_rate": 3.999426334228518e-06, "loss": 0.95129633, "num_input_tokens_seen": 13100035, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 1.109375, "step": 622, "time_per_iteration": 2.432361125946045 }, { "auxiliary_loss_clip": 0.01146456, "auxiliary_loss_mlp": 0.01143284, "balance_loss_clip": 1.02626777, "balance_loss_mlp": 1.03450549, "epoch": 0.0374567864121449, "flos": 20448454089600.0, "grad_norm": 2.032015196544754, "language_loss": 0.95863605, "learning_rate": 3.999416968866581e-06, "loss": 0.98153341, "num_input_tokens_seen": 13118070, "router_z_loss_clip": 1.171875, "router_z_loss_mlp": 1.1171875, "step": 623, "time_per_iteration": 2.4633841514587402 }, { "auxiliary_loss_clip": 0.01151717, "auxiliary_loss_mlp": 0.01162799, "balance_loss_clip": 1.04292214, "balance_loss_mlp": 1.03735971, "epoch": 0.037516909664812866, "flos": 19207047409920.0, "grad_norm": 1.7952810784722342, "language_loss": 0.87904549, "learning_rate": 3.999407527686551e-06, "loss": 0.90219063, "num_input_tokens_seen": 13136355, "router_z_loss_clip": 1.1953125, "router_z_loss_mlp": 1.140625, "step": 624, "time_per_iteration": 2.436494827270508 }, { "auxiliary_loss_clip": 0.01148249, "auxiliary_loss_mlp": 0.01144412, "balance_loss_clip": 1.03683746, "balance_loss_mlp": 1.03573513, "epoch": 0.03757703291748084, "flos": 35003814526080.0, "grad_norm": 2.8860014183493523, "language_loss": 0.73953319, "learning_rate": 3.999398010688788e-06, "loss": 0.76245981, "num_input_tokens_seen": 13155435, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 1.125, "step": 625, "time_per_iteration": 2.6041533946990967 }, { "auxiliary_loss_clip": 0.01150095, "auxiliary_loss_mlp": 0.01149034, "balance_loss_clip": 1.03254294, "balance_loss_mlp": 1.03531814, "epoch": 0.0376371561701488, "flos": 25482720274560.0, "grad_norm": 1.8207282661875475, "language_loss": 0.82669246, "learning_rate": 3.999388417873652e-06, "loss": 0.84968376, "num_input_tokens_seen": 13174295, "router_z_loss_clip": 1.1640625, "router_z_loss_mlp": 1.1484375, "step": 626, "time_per_iteration": 2.490727186203003 }, { "auxiliary_loss_clip": 0.01150833, "auxiliary_loss_mlp": 0.01153904, "balance_loss_clip": 1.03679323, "balance_loss_mlp": 1.0368886, "epoch": 0.037697279422816775, "flos": 18184685800320.0, "grad_norm": 1.6761634543459292, "language_loss": 0.85668659, "learning_rate": 3.999378749241506e-06, "loss": 0.87973398, "num_input_tokens_seen": 13192500, "router_z_loss_clip": 1.171875, "router_z_loss_mlp": 1.140625, "step": 627, "time_per_iteration": 2.4729204177856445 }, { "auxiliary_loss_clip": 0.01145017, "auxiliary_loss_mlp": 0.01134961, "balance_loss_clip": 1.02776766, "balance_loss_mlp": 1.03444088, "epoch": 0.03775740267548475, "flos": 24643128965760.0, "grad_norm": 1.5134426086173418, "language_loss": 0.92669588, "learning_rate": 3.999369004792719e-06, "loss": 0.94949561, "num_input_tokens_seen": 13213470, "router_z_loss_clip": 1.0703125, "router_z_loss_mlp": 1.109375, "step": 628, "time_per_iteration": 2.4645371437072754 }, { "auxiliary_loss_clip": 0.01150354, "auxiliary_loss_mlp": 0.01141764, "balance_loss_clip": 1.02436709, "balance_loss_mlp": 1.03583467, "epoch": 0.03781752592815271, "flos": 21287137703040.0, "grad_norm": 2.0370317312948325, "language_loss": 0.8416189, "learning_rate": 3.999359184527658e-06, "loss": 0.8645401, "num_input_tokens_seen": 13232365, "router_z_loss_clip": 1.171875, "router_z_loss_mlp": 1.1484375, "step": 629, "time_per_iteration": 2.462101459503174 }, { "auxiliary_loss_clip": 0.01146355, "auxiliary_loss_mlp": 0.011391, "balance_loss_clip": 1.02294207, "balance_loss_mlp": 1.03396928, "epoch": 0.037877649180820684, "flos": 22088569029120.0, "grad_norm": 1.6481701926445964, "language_loss": 0.82757461, "learning_rate": 3.999349288446696e-06, "loss": 0.85042918, "num_input_tokens_seen": 13251920, "router_z_loss_clip": 1.15625, "router_z_loss_mlp": 1.125, "step": 630, "time_per_iteration": 2.584721803665161 }, { "auxiliary_loss_clip": 0.01151643, "auxiliary_loss_mlp": 0.01137685, "balance_loss_clip": 1.02901435, "balance_loss_mlp": 1.03617048, "epoch": 0.03793777243348865, "flos": 14500476046080.0, "grad_norm": 2.799449965772281, "language_loss": 0.99718016, "learning_rate": 3.99933931655021e-06, "loss": 1.02007341, "num_input_tokens_seen": 13267440, "router_z_loss_clip": 1.0859375, "router_z_loss_mlp": 1.15625, "step": 631, "time_per_iteration": 2.505333662033081 }, { "auxiliary_loss_clip": 0.01137196, "auxiliary_loss_mlp": 0.01132671, "balance_loss_clip": 1.02781439, "balance_loss_mlp": 1.03332996, "epoch": 0.03799789568615662, "flos": 21907334828160.0, "grad_norm": 1.4941902100665034, "language_loss": 0.94728148, "learning_rate": 3.999329268838575e-06, "loss": 0.96998012, "num_input_tokens_seen": 13287850, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 1.0390625, "step": 632, "time_per_iteration": 2.4616222381591797 }, { "auxiliary_loss_clip": 0.01137809, "auxiliary_loss_mlp": 0.01120377, "balance_loss_clip": 1.02396011, "balance_loss_mlp": 1.03341722, "epoch": 0.03805801893882459, "flos": 24825864355200.0, "grad_norm": 1.738479481222945, "language_loss": 0.88349819, "learning_rate": 3.999319145312175e-06, "loss": 0.90608013, "num_input_tokens_seen": 13307760, "router_z_loss_clip": 0.96484375, "router_z_loss_mlp": 1.046875, "step": 633, "time_per_iteration": 2.4975996017456055 }, { "auxiliary_loss_clip": 0.0114388, "auxiliary_loss_mlp": 0.01133259, "balance_loss_clip": 1.03317142, "balance_loss_mlp": 1.03329587, "epoch": 0.03811814219149256, "flos": 30481619385600.0, "grad_norm": 1.673016660576088, "language_loss": 0.75777364, "learning_rate": 3.999308945971392e-06, "loss": 0.78054506, "num_input_tokens_seen": 13331230, "router_z_loss_clip": 1.0, "router_z_loss_mlp": 1.109375, "step": 634, "time_per_iteration": 2.5212392807006836 }, { "auxiliary_loss_clip": 0.01053407, "auxiliary_loss_mlp": 0.01050714, "balance_loss_clip": 1.03412056, "balance_loss_mlp": 1.02602744, "epoch": 0.03817826544416053, "flos": 66989561639040.0, "grad_norm": 1.080090002084096, "language_loss": 0.61783701, "learning_rate": 3.999298670816614e-06, "loss": 0.63887829, "num_input_tokens_seen": 13394760, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.2734375, "step": 635, "time_per_iteration": 3.095073938369751 }, { "auxiliary_loss_clip": 0.01144458, "auxiliary_loss_mlp": 0.01137539, "balance_loss_clip": 1.03540051, "balance_loss_mlp": 1.03319693, "epoch": 0.038238388696828496, "flos": 20484309922560.0, "grad_norm": 2.1375099227535066, "language_loss": 0.90742528, "learning_rate": 3.9992883198482294e-06, "loss": 0.93024528, "num_input_tokens_seen": 13412775, "router_z_loss_clip": 1.0234375, "router_z_loss_mlp": 1.109375, "step": 636, "time_per_iteration": 2.4218077659606934 }, { "auxiliary_loss_clip": 0.01140279, "auxiliary_loss_mlp": 0.01143451, "balance_loss_clip": 1.03220439, "balance_loss_mlp": 1.03110051, "epoch": 0.03829851194949647, "flos": 17964977414400.0, "grad_norm": 2.108904257511354, "language_loss": 0.87953991, "learning_rate": 3.999277893066632e-06, "loss": 0.90237719, "num_input_tokens_seen": 13427835, "router_z_loss_clip": 1.109375, "router_z_loss_mlp": 1.09375, "step": 637, "time_per_iteration": 2.4062628746032715 }, { "auxiliary_loss_clip": 0.01149598, "auxiliary_loss_mlp": 0.01140114, "balance_loss_clip": 1.03215766, "balance_loss_mlp": 1.03322482, "epoch": 0.03835863520216444, "flos": 22455401351040.0, "grad_norm": 1.7172727993800396, "language_loss": 0.89876044, "learning_rate": 3.999267390472215e-06, "loss": 0.92165744, "num_input_tokens_seen": 13447295, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 1.1640625, "step": 638, "time_per_iteration": 2.427471160888672 }, { "auxiliary_loss_clip": 0.01146479, "auxiliary_loss_mlp": 0.01150787, "balance_loss_clip": 1.04578769, "balance_loss_mlp": 1.03394055, "epoch": 0.038418758454832405, "flos": 22163317983360.0, "grad_norm": 2.331586236551516, "language_loss": 0.77012938, "learning_rate": 3.999256812065381e-06, "loss": 0.79310209, "num_input_tokens_seen": 13468455, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 1.125, "step": 639, "time_per_iteration": 2.4867639541625977 }, { "auxiliary_loss_clip": 0.01152412, "auxiliary_loss_mlp": 0.01154368, "balance_loss_clip": 1.04593539, "balance_loss_mlp": 1.03479469, "epoch": 0.03847888170750038, "flos": 22746332643840.0, "grad_norm": 2.0925507270613792, "language_loss": 0.93403959, "learning_rate": 3.999246157846526e-06, "loss": 0.95710731, "num_input_tokens_seen": 13489085, "router_z_loss_clip": 1.0859375, "router_z_loss_mlp": 1.171875, "step": 640, "time_per_iteration": 2.442833185195923 }, { "auxiliary_loss_clip": 0.01144826, "auxiliary_loss_mlp": 0.01143048, "balance_loss_clip": 1.04396117, "balance_loss_mlp": 1.03502786, "epoch": 0.03853900496016834, "flos": 22710092785920.0, "grad_norm": 2.049803715612257, "language_loss": 0.89439285, "learning_rate": 3.9992354278160574e-06, "loss": 0.91727161, "num_input_tokens_seen": 13509120, "router_z_loss_clip": 0.9921875, "router_z_loss_mlp": 1.09375, "step": 641, "time_per_iteration": 2.4766299724578857 }, { "auxiliary_loss_clip": 0.01045718, "auxiliary_loss_mlp": 0.01157494, "balance_loss_clip": 1.1399461, "balance_loss_mlp": 1.0203532, "epoch": 0.038599128212836314, "flos": 70395652569600.0, "grad_norm": 0.954832839638245, "language_loss": 0.65817571, "learning_rate": 3.999224621974381e-06, "loss": 0.68020779, "num_input_tokens_seen": 13562005, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.25390625, "step": 642, "time_per_iteration": 3.0364344120025635 }, { "auxiliary_loss_clip": 0.01145578, "auxiliary_loss_mlp": 0.011214, "balance_loss_clip": 1.0282737, "balance_loss_mlp": 1.0339222, "epoch": 0.03865925146550429, "flos": 23294015141760.0, "grad_norm": 1.6105373145386477, "language_loss": 0.84267819, "learning_rate": 3.999213740321906e-06, "loss": 0.86534798, "num_input_tokens_seen": 13582185, "router_z_loss_clip": 0.93359375, "router_z_loss_mlp": 1.1171875, "step": 643, "time_per_iteration": 2.470949411392212 }, { "auxiliary_loss_clip": 0.01138372, "auxiliary_loss_mlp": 0.01116481, "balance_loss_clip": 1.02240109, "balance_loss_mlp": 1.03272557, "epoch": 0.03871937471817225, "flos": 21429478782720.0, "grad_norm": 1.7282296521325415, "language_loss": 0.8665489, "learning_rate": 3.999202782859046e-06, "loss": 0.88909739, "num_input_tokens_seen": 13599555, "router_z_loss_clip": 0.94140625, "router_z_loss_mlp": 1.0546875, "step": 644, "time_per_iteration": 2.4602267742156982 }, { "auxiliary_loss_clip": 0.0114193, "auxiliary_loss_mlp": 0.0113537, "balance_loss_clip": 1.03051376, "balance_loss_mlp": 1.03361261, "epoch": 0.038779497970840224, "flos": 34275875345280.0, "grad_norm": 1.9212685796933018, "language_loss": 0.87557673, "learning_rate": 3.9991917495862165e-06, "loss": 0.8983497, "num_input_tokens_seen": 13621160, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 1.078125, "step": 645, "time_per_iteration": 2.5726985931396484 }, { "auxiliary_loss_clip": 0.01145175, "auxiliary_loss_mlp": 0.01140511, "balance_loss_clip": 1.03670359, "balance_loss_mlp": 1.034168, "epoch": 0.03883962122350819, "flos": 22747065782400.0, "grad_norm": 2.3097887274972857, "language_loss": 0.87386203, "learning_rate": 3.9991806405038345e-06, "loss": 0.89671886, "num_input_tokens_seen": 13641915, "router_z_loss_clip": 1.0390625, "router_z_loss_mlp": 1.109375, "step": 646, "time_per_iteration": 2.4509987831115723 }, { "auxiliary_loss_clip": 0.01139482, "auxiliary_loss_mlp": 0.0114241, "balance_loss_clip": 1.04399121, "balance_loss_mlp": 1.0324012, "epoch": 0.03889974447617616, "flos": 21944726760960.0, "grad_norm": 1.8819525249691549, "language_loss": 0.85945958, "learning_rate": 3.999169455612323e-06, "loss": 0.8822785, "num_input_tokens_seen": 13661410, "router_z_loss_clip": 0.984375, "router_z_loss_mlp": 1.0703125, "step": 647, "time_per_iteration": 2.514894485473633 }, { "auxiliary_loss_clip": 0.01141196, "auxiliary_loss_mlp": 0.01148608, "balance_loss_clip": 1.03864908, "balance_loss_mlp": 1.03167391, "epoch": 0.03895986772884413, "flos": 31503457324800.0, "grad_norm": 1.923486594833773, "language_loss": 0.90020525, "learning_rate": 3.999158194912106e-06, "loss": 0.92310333, "num_input_tokens_seen": 13681705, "router_z_loss_clip": 1.09375, "router_z_loss_mlp": 1.09375, "step": 648, "time_per_iteration": 2.5370564460754395 }, { "auxiliary_loss_clip": 0.01135196, "auxiliary_loss_mlp": 0.01175276, "balance_loss_clip": 1.06641376, "balance_loss_mlp": 1.02939034, "epoch": 0.0390199909815121, "flos": 19900003541760.0, "grad_norm": 1.7761805985334822, "language_loss": 0.89493334, "learning_rate": 3.9991468584036086e-06, "loss": 0.91803813, "num_input_tokens_seen": 13700400, "router_z_loss_clip": 1.09375, "router_z_loss_mlp": 1.0625, "step": 649, "time_per_iteration": 3.85715389251709 }, { "auxiliary_loss_clip": 0.01144721, "auxiliary_loss_mlp": 0.01139222, "balance_loss_clip": 1.03202879, "balance_loss_mlp": 1.03192854, "epoch": 0.03908011423418007, "flos": 21611515944960.0, "grad_norm": 1.898034747366542, "language_loss": 0.84001702, "learning_rate": 3.999135446087263e-06, "loss": 0.86285645, "num_input_tokens_seen": 13720145, "router_z_loss_clip": 1.0703125, "router_z_loss_mlp": 1.125, "step": 650, "time_per_iteration": 5.374142408370972 }, { "auxiliary_loss_clip": 0.01138069, "auxiliary_loss_mlp": 0.01130687, "balance_loss_clip": 1.03121901, "balance_loss_mlp": 1.0306164, "epoch": 0.039140237486848035, "flos": 18660412252800.0, "grad_norm": 2.1249664287301133, "language_loss": 0.82968974, "learning_rate": 3.9991239579635e-06, "loss": 0.8523773, "num_input_tokens_seen": 13737500, "router_z_loss_clip": 0.9921875, "router_z_loss_mlp": 1.078125, "step": 651, "time_per_iteration": 3.827467679977417 }, { "auxiliary_loss_clip": 0.01144146, "auxiliary_loss_mlp": 0.01143147, "balance_loss_clip": 1.03681254, "balance_loss_mlp": 1.03242385, "epoch": 0.03920036073951601, "flos": 18660132961920.0, "grad_norm": 2.3254845210019863, "language_loss": 0.93467844, "learning_rate": 3.999112394032757e-06, "loss": 0.95755136, "num_input_tokens_seen": 13754750, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 1.1171875, "step": 652, "time_per_iteration": 2.397010564804077 }, { "auxiliary_loss_clip": 0.01134054, "auxiliary_loss_mlp": 0.01132591, "balance_loss_clip": 1.02840257, "balance_loss_mlp": 1.03009403, "epoch": 0.03926048399218398, "flos": 31353226277760.0, "grad_norm": 2.292943232036688, "language_loss": 0.87760836, "learning_rate": 3.999100754295471e-06, "loss": 0.90027481, "num_input_tokens_seen": 13771990, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 1.0390625, "step": 653, "time_per_iteration": 2.5087904930114746 }, { "auxiliary_loss_clip": 0.01149826, "auxiliary_loss_mlp": 0.01150856, "balance_loss_clip": 1.03374493, "balance_loss_mlp": 1.03430355, "epoch": 0.039320607244851945, "flos": 29602297082880.0, "grad_norm": 2.0606702949693037, "language_loss": 0.92878038, "learning_rate": 3.999089038752085e-06, "loss": 0.95178723, "num_input_tokens_seen": 13792750, "router_z_loss_clip": 1.171875, "router_z_loss_mlp": 1.15625, "step": 654, "time_per_iteration": 2.4998490810394287 }, { "auxiliary_loss_clip": 0.01040154, "auxiliary_loss_mlp": 0.01019751, "balance_loss_clip": 1.00067747, "balance_loss_mlp": 1.01383352, "epoch": 0.03938073049751992, "flos": 66531151221120.0, "grad_norm": 0.7438771244747499, "language_loss": 0.50222671, "learning_rate": 3.999077247403041e-06, "loss": 0.52282578, "num_input_tokens_seen": 13858570, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.26367188, "step": 655, "time_per_iteration": 3.104891538619995 }, { "auxiliary_loss_clip": 0.01145112, "auxiliary_loss_mlp": 0.01141032, "balance_loss_clip": 1.03636646, "balance_loss_mlp": 1.03428555, "epoch": 0.03944085375018788, "flos": 23366704325760.0, "grad_norm": 1.9825798270969837, "language_loss": 0.84229481, "learning_rate": 3.9990653802487886e-06, "loss": 0.86515629, "num_input_tokens_seen": 13876335, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 1.109375, "step": 656, "time_per_iteration": 2.456942558288574 }, { "auxiliary_loss_clip": 0.011591, "auxiliary_loss_mlp": 0.01138425, "balance_loss_clip": 1.03027868, "balance_loss_mlp": 1.03786945, "epoch": 0.039500977002855854, "flos": 18547398581760.0, "grad_norm": 2.993029478405108, "language_loss": 0.83184087, "learning_rate": 3.999053437289776e-06, "loss": 0.8548162, "num_input_tokens_seen": 13892640, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 1.2109375, "step": 657, "time_per_iteration": 2.396207094192505 }, { "auxiliary_loss_clip": 0.01160207, "auxiliary_loss_mlp": 0.01150377, "balance_loss_clip": 1.03040516, "balance_loss_mlp": 1.03921843, "epoch": 0.039561100255523826, "flos": 25336992792960.0, "grad_norm": 2.2994740733883288, "language_loss": 0.85964495, "learning_rate": 3.999041418526457e-06, "loss": 0.88275075, "num_input_tokens_seen": 13910085, "router_z_loss_clip": 1.1953125, "router_z_loss_mlp": 1.2109375, "step": 658, "time_per_iteration": 2.494626998901367 }, { "auxiliary_loss_clip": 0.01147782, "auxiliary_loss_mlp": 0.01146308, "balance_loss_clip": 1.0417856, "balance_loss_mlp": 1.03554034, "epoch": 0.03962122350819179, "flos": 18219005533440.0, "grad_norm": 1.8461537837749333, "language_loss": 0.9614377, "learning_rate": 3.999029323959287e-06, "loss": 0.9843787, "num_input_tokens_seen": 13928800, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 1.125, "step": 659, "time_per_iteration": 2.4406585693359375 }, { "auxiliary_loss_clip": 0.01154311, "auxiliary_loss_mlp": 0.0114721, "balance_loss_clip": 1.03954029, "balance_loss_mlp": 1.03571033, "epoch": 0.03968134676085976, "flos": 20521178184960.0, "grad_norm": 2.085971986846673, "language_loss": 0.85468787, "learning_rate": 3.999017153588724e-06, "loss": 0.87770307, "num_input_tokens_seen": 13948325, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 1.1875, "step": 660, "time_per_iteration": 2.431804656982422 }, { "auxiliary_loss_clip": 0.0115663, "auxiliary_loss_mlp": 0.01155076, "balance_loss_clip": 1.05126858, "balance_loss_mlp": 1.03981268, "epoch": 0.03974147001352773, "flos": 22421395820160.0, "grad_norm": 1.6034772804171846, "language_loss": 0.85248554, "learning_rate": 3.999004907415231e-06, "loss": 0.8756026, "num_input_tokens_seen": 13969090, "router_z_loss_clip": 1.0390625, "router_z_loss_mlp": 1.171875, "step": 661, "time_per_iteration": 2.4936578273773193 }, { "auxiliary_loss_clip": 0.01043831, "auxiliary_loss_mlp": 0.01065092, "balance_loss_clip": 1.04973829, "balance_loss_mlp": 1.01680923, "epoch": 0.0398015932661957, "flos": 71125267495680.0, "grad_norm": 0.946722031743666, "language_loss": 0.69565392, "learning_rate": 3.998992585439272e-06, "loss": 0.71674323, "num_input_tokens_seen": 14037555, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.26953125, "step": 662, "time_per_iteration": 3.176159143447876 }, { "auxiliary_loss_clip": 0.01150752, "auxiliary_loss_mlp": 0.01142488, "balance_loss_clip": 1.03515172, "balance_loss_mlp": 1.03584886, "epoch": 0.03986171651886367, "flos": 16799995434240.0, "grad_norm": 1.7247541789998526, "language_loss": 0.87811625, "learning_rate": 3.998980187661314e-06, "loss": 0.90104866, "num_input_tokens_seen": 14055765, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 1.140625, "step": 663, "time_per_iteration": 2.496194839477539 }, { "auxiliary_loss_clip": 0.01153436, "auxiliary_loss_mlp": 0.01141784, "balance_loss_clip": 1.0307765, "balance_loss_mlp": 1.03455997, "epoch": 0.03992183977153164, "flos": 24533920632960.0, "grad_norm": 2.29137104944411, "language_loss": 0.92828369, "learning_rate": 3.998967714081826e-06, "loss": 0.95123589, "num_input_tokens_seen": 14074195, "router_z_loss_clip": 1.109375, "router_z_loss_mlp": 1.1875, "step": 664, "time_per_iteration": 2.492635726928711 }, { "auxiliary_loss_clip": 0.01141531, "auxiliary_loss_mlp": 0.01129279, "balance_loss_clip": 1.02928603, "balance_loss_mlp": 1.03249192, "epoch": 0.03998196302419961, "flos": 15595003169280.0, "grad_norm": 1.9046914955046066, "language_loss": 0.89442289, "learning_rate": 3.998955164701281e-06, "loss": 0.91713107, "num_input_tokens_seen": 14090215, "router_z_loss_clip": 1.0, "router_z_loss_mlp": 1.09375, "step": 665, "time_per_iteration": 2.403488874435425 }, { "auxiliary_loss_clip": 0.01146659, "auxiliary_loss_mlp": 0.01129273, "balance_loss_clip": 1.02632403, "balance_loss_mlp": 1.0332005, "epoch": 0.04004208627686758, "flos": 25303790223360.0, "grad_norm": 2.040123180001308, "language_loss": 0.84858418, "learning_rate": 3.998942539520158e-06, "loss": 0.87134349, "num_input_tokens_seen": 14112150, "router_z_loss_clip": 1.03125, "router_z_loss_mlp": 1.1328125, "step": 666, "time_per_iteration": 2.533057689666748 }, { "auxiliary_loss_clip": 0.01142127, "auxiliary_loss_mlp": 0.01121325, "balance_loss_clip": 1.01961589, "balance_loss_mlp": 1.03244996, "epoch": 0.04010220952953555, "flos": 23474760583680.0, "grad_norm": 1.8487774465223867, "language_loss": 0.91057909, "learning_rate": 3.998929838538932e-06, "loss": 0.93321365, "num_input_tokens_seen": 14131475, "router_z_loss_clip": 1.015625, "router_z_loss_mlp": 1.09375, "step": 667, "time_per_iteration": 2.475883960723877 }, { "auxiliary_loss_clip": 0.01143159, "auxiliary_loss_mlp": 0.0112864, "balance_loss_clip": 1.0239743, "balance_loss_mlp": 1.03338122, "epoch": 0.04016233278220352, "flos": 18616247516160.0, "grad_norm": 2.1902831871626067, "language_loss": 0.85681808, "learning_rate": 3.998917061758087e-06, "loss": 0.87953603, "num_input_tokens_seen": 14146165, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 1.09375, "step": 668, "time_per_iteration": 2.403777599334717 }, { "auxiliary_loss_clip": 0.01045396, "auxiliary_loss_mlp": 0.01038828, "balance_loss_clip": 1.01937294, "balance_loss_mlp": 1.01646304, "epoch": 0.040222456034871484, "flos": 70902801112320.0, "grad_norm": 0.8154666165093641, "language_loss": 0.60316014, "learning_rate": 3.998904209178107e-06, "loss": 0.62400234, "num_input_tokens_seen": 14215005, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.2890625, "step": 669, "time_per_iteration": 3.1351687908172607 }, { "auxiliary_loss_clip": 0.01148064, "auxiliary_loss_mlp": 0.01135891, "balance_loss_clip": 1.02946138, "balance_loss_mlp": 1.03482938, "epoch": 0.040282579287539456, "flos": 23763701928960.0, "grad_norm": 1.662387864976036, "language_loss": 0.90439564, "learning_rate": 3.9988912807994785e-06, "loss": 0.92723525, "num_input_tokens_seen": 14235510, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 1.1328125, "step": 670, "time_per_iteration": 2.486539363861084 }, { "auxiliary_loss_clip": 0.01139737, "auxiliary_loss_mlp": 0.011163, "balance_loss_clip": 1.02188587, "balance_loss_mlp": 1.03324819, "epoch": 0.04034270254020743, "flos": 18477537217920.0, "grad_norm": 1.8615681522588239, "language_loss": 0.79142499, "learning_rate": 3.998878276622692e-06, "loss": 0.81398535, "num_input_tokens_seen": 14254565, "router_z_loss_clip": 0.9453125, "router_z_loss_mlp": 1.0625, "step": 671, "time_per_iteration": 2.4270260334014893 }, { "auxiliary_loss_clip": 0.0115208, "auxiliary_loss_mlp": 0.01131061, "balance_loss_clip": 1.02973342, "balance_loss_mlp": 1.03835058, "epoch": 0.040402825792875394, "flos": 17200903109760.0, "grad_norm": 1.8956433288339951, "language_loss": 0.97127789, "learning_rate": 3.998865196648242e-06, "loss": 0.99410939, "num_input_tokens_seen": 14271885, "router_z_loss_clip": 1.015625, "router_z_loss_mlp": 1.1328125, "step": 672, "time_per_iteration": 2.4225685596466064 }, { "auxiliary_loss_clip": 0.01151095, "auxiliary_loss_mlp": 0.01135538, "balance_loss_clip": 1.03330457, "balance_loss_mlp": 1.03608048, "epoch": 0.040462949045543366, "flos": 19171156665600.0, "grad_norm": 1.8477654372546233, "language_loss": 0.93808079, "learning_rate": 3.998852040876622e-06, "loss": 0.96094704, "num_input_tokens_seen": 14289670, "router_z_loss_clip": 1.0234375, "router_z_loss_mlp": 1.1484375, "step": 673, "time_per_iteration": 2.4184083938598633 }, { "auxiliary_loss_clip": 0.01145377, "auxiliary_loss_mlp": 0.01129033, "balance_loss_clip": 1.03099537, "balance_loss_mlp": 1.03231931, "epoch": 0.04052307229821133, "flos": 24018812300160.0, "grad_norm": 1.9040232467614557, "language_loss": 0.79716676, "learning_rate": 3.998838809308334e-06, "loss": 0.81991088, "num_input_tokens_seen": 14309285, "router_z_loss_clip": 0.98046875, "router_z_loss_mlp": 1.1328125, "step": 674, "time_per_iteration": 2.477980136871338 }, { "auxiliary_loss_clip": 0.01160325, "auxiliary_loss_mlp": 0.01141965, "balance_loss_clip": 1.03071928, "balance_loss_mlp": 1.03637743, "epoch": 0.0405831955508793, "flos": 16435641818880.0, "grad_norm": 2.215652634157601, "language_loss": 0.85125887, "learning_rate": 3.9988255019438766e-06, "loss": 0.87428176, "num_input_tokens_seen": 14328300, "router_z_loss_clip": 1.109375, "router_z_loss_mlp": 1.234375, "step": 675, "time_per_iteration": 2.422987222671509 }, { "auxiliary_loss_clip": 0.01146998, "auxiliary_loss_mlp": 0.01114603, "balance_loss_clip": 1.01785231, "balance_loss_mlp": 1.03223252, "epoch": 0.040643318803547275, "flos": 24278775350400.0, "grad_norm": 1.6483841310272274, "language_loss": 0.80701369, "learning_rate": 3.998812118783757e-06, "loss": 0.82962966, "num_input_tokens_seen": 14346395, "router_z_loss_clip": 0.96875, "router_z_loss_mlp": 1.1484375, "step": 676, "time_per_iteration": 2.481201410293579 }, { "auxiliary_loss_clip": 0.01148228, "auxiliary_loss_mlp": 0.0111558, "balance_loss_clip": 1.01959252, "balance_loss_mlp": 1.03390324, "epoch": 0.04070344205621524, "flos": 17711123852160.0, "grad_norm": 2.1627593240706826, "language_loss": 0.89751595, "learning_rate": 3.9987986598284804e-06, "loss": 0.92015398, "num_input_tokens_seen": 14364605, "router_z_loss_clip": 0.9609375, "router_z_loss_mlp": 1.140625, "step": 677, "time_per_iteration": 2.414358139038086 }, { "auxiliary_loss_clip": 0.01143863, "auxiliary_loss_mlp": 0.01112875, "balance_loss_clip": 1.01984406, "balance_loss_mlp": 1.03332329, "epoch": 0.04076356530888321, "flos": 26176444456320.0, "grad_norm": 2.1219640420479324, "language_loss": 0.80295086, "learning_rate": 3.998785125078559e-06, "loss": 0.82551825, "num_input_tokens_seen": 14385265, "router_z_loss_clip": 0.9296875, "router_z_loss_mlp": 1.109375, "step": 678, "time_per_iteration": 2.5327956676483154 }, { "auxiliary_loss_clip": 0.01146948, "auxiliary_loss_mlp": 0.01124406, "balance_loss_clip": 1.02498579, "balance_loss_mlp": 1.03048611, "epoch": 0.04082368856155118, "flos": 35771973459840.0, "grad_norm": 1.5857690280125762, "language_loss": 0.85437536, "learning_rate": 3.998771514534505e-06, "loss": 0.87708896, "num_input_tokens_seen": 14406090, "router_z_loss_clip": 0.99609375, "router_z_loss_mlp": 1.1640625, "step": 679, "time_per_iteration": 2.5361883640289307 }, { "auxiliary_loss_clip": 0.01143064, "auxiliary_loss_mlp": 0.01110961, "balance_loss_clip": 1.01702392, "balance_loss_mlp": 1.03150892, "epoch": 0.04088381181421915, "flos": 28145406291840.0, "grad_norm": 1.8122299387912468, "language_loss": 0.81136459, "learning_rate": 3.998757828196835e-06, "loss": 0.8339048, "num_input_tokens_seen": 14425130, "router_z_loss_clip": 0.94140625, "router_z_loss_mlp": 1.109375, "step": 680, "time_per_iteration": 2.537060260772705 }, { "auxiliary_loss_clip": 0.01147984, "auxiliary_loss_mlp": 0.01118835, "balance_loss_clip": 1.01784062, "balance_loss_mlp": 1.03201079, "epoch": 0.04094393506688712, "flos": 27596501896320.0, "grad_norm": 1.7487901662373744, "language_loss": 0.86967373, "learning_rate": 3.9987440660660685e-06, "loss": 0.89234191, "num_input_tokens_seen": 14447355, "router_z_loss_clip": 1.0078125, "router_z_loss_mlp": 1.15625, "step": 681, "time_per_iteration": 2.4821507930755615 }, { "auxiliary_loss_clip": 0.01149803, "auxiliary_loss_mlp": 0.01123164, "balance_loss_clip": 1.02140665, "balance_loss_mlp": 1.03132033, "epoch": 0.04100405831955509, "flos": 23110930638720.0, "grad_norm": 1.633952853797651, "language_loss": 0.7617293, "learning_rate": 3.998730228142726e-06, "loss": 0.78445894, "num_input_tokens_seen": 14466790, "router_z_loss_clip": 1.015625, "router_z_loss_mlp": 1.1875, "step": 682, "time_per_iteration": 2.4639172554016113 }, { "auxiliary_loss_clip": 0.01148086, "auxiliary_loss_mlp": 0.01117165, "balance_loss_clip": 1.01741064, "balance_loss_mlp": 1.03163362, "epoch": 0.04106418157222306, "flos": 20155707406080.0, "grad_norm": 1.7166485726616771, "language_loss": 0.76514614, "learning_rate": 3.998716314427333e-06, "loss": 0.78779864, "num_input_tokens_seen": 14485195, "router_z_loss_clip": 0.99609375, "router_z_loss_mlp": 1.1640625, "step": 683, "time_per_iteration": 2.4202606678009033 }, { "auxiliary_loss_clip": 0.01144307, "auxiliary_loss_mlp": 0.01109018, "balance_loss_clip": 1.02261484, "balance_loss_mlp": 1.02928138, "epoch": 0.041124304824891024, "flos": 17419738711680.0, "grad_norm": 2.695420921015649, "language_loss": 0.85175836, "learning_rate": 3.998702324920417e-06, "loss": 0.87429166, "num_input_tokens_seen": 14503370, "router_z_loss_clip": 0.86328125, "router_z_loss_mlp": 1.1484375, "step": 684, "time_per_iteration": 2.473734140396118 }, { "auxiliary_loss_clip": 0.01141528, "auxiliary_loss_mlp": 0.01114277, "balance_loss_clip": 1.01933861, "balance_loss_mlp": 1.02941537, "epoch": 0.041184428077558996, "flos": 25778853360000.0, "grad_norm": 1.4670487658655595, "language_loss": 0.93872792, "learning_rate": 3.9986882596225085e-06, "loss": 0.96128601, "num_input_tokens_seen": 14526415, "router_z_loss_clip": 0.94921875, "router_z_loss_mlp": 1.125, "step": 685, "time_per_iteration": 2.506664514541626 }, { "auxiliary_loss_clip": 0.01143432, "auxiliary_loss_mlp": 0.01104638, "balance_loss_clip": 1.01551676, "balance_loss_mlp": 1.02931964, "epoch": 0.04124455133022697, "flos": 22963701968640.0, "grad_norm": 2.2013557400519232, "language_loss": 0.93457246, "learning_rate": 3.998674118534141e-06, "loss": 0.95705312, "num_input_tokens_seen": 14546595, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 1.140625, "step": 686, "time_per_iteration": 2.479328155517578 }, { "auxiliary_loss_clip": 0.01145387, "auxiliary_loss_mlp": 0.01113393, "balance_loss_clip": 1.01964641, "balance_loss_mlp": 1.02947474, "epoch": 0.04130467458289493, "flos": 21287975575680.0, "grad_norm": 1.8003524058405944, "language_loss": 0.75618255, "learning_rate": 3.998659901655851e-06, "loss": 0.77877033, "num_input_tokens_seen": 14566590, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 1.15625, "step": 687, "time_per_iteration": 2.440997838973999 }, { "auxiliary_loss_clip": 0.01130864, "auxiliary_loss_mlp": 0.01101142, "balance_loss_clip": 1.02108145, "balance_loss_mlp": 1.02755916, "epoch": 0.041364797835562905, "flos": 19973216396160.0, "grad_norm": 1.4241847846972275, "language_loss": 0.89603168, "learning_rate": 3.998645608988177e-06, "loss": 0.91835177, "num_input_tokens_seen": 14585965, "router_z_loss_clip": 0.80078125, "router_z_loss_mlp": 1.03125, "step": 688, "time_per_iteration": 3.9174094200134277 }, { "auxiliary_loss_clip": 0.01135141, "auxiliary_loss_mlp": 0.0110081, "balance_loss_clip": 1.02022409, "balance_loss_mlp": 1.02849007, "epoch": 0.04142492108823087, "flos": 21905205235200.0, "grad_norm": 1.753687891361087, "language_loss": 0.87174642, "learning_rate": 3.998631240531661e-06, "loss": 0.89410603, "num_input_tokens_seen": 14606015, "router_z_loss_clip": 0.8046875, "router_z_loss_mlp": 1.0625, "step": 689, "time_per_iteration": 2.437152624130249 }, { "auxiliary_loss_clip": 0.01138413, "auxiliary_loss_mlp": 0.01107427, "balance_loss_clip": 1.0261265, "balance_loss_mlp": 1.02782011, "epoch": 0.04148504434089884, "flos": 27638292660480.0, "grad_norm": 1.9561709728017773, "language_loss": 0.72685653, "learning_rate": 3.998616796286848e-06, "loss": 0.7493149, "num_input_tokens_seen": 14629955, "router_z_loss_clip": 0.8125, "router_z_loss_mlp": 1.109375, "step": 690, "time_per_iteration": 3.929933547973633 }, { "auxiliary_loss_clip": 0.01138071, "auxiliary_loss_mlp": 0.01109711, "balance_loss_clip": 1.02449965, "balance_loss_mlp": 1.0274477, "epoch": 0.041545167593566815, "flos": 20517442669440.0, "grad_norm": 1.6165250474725026, "language_loss": 0.78118956, "learning_rate": 3.998602276254286e-06, "loss": 0.80366731, "num_input_tokens_seen": 14648000, "router_z_loss_clip": 0.8515625, "router_z_loss_mlp": 1.109375, "step": 691, "time_per_iteration": 3.800966739654541 }, { "auxiliary_loss_clip": 0.01140213, "auxiliary_loss_mlp": 0.01110365, "balance_loss_clip": 1.02334201, "balance_loss_mlp": 1.02730799, "epoch": 0.04160529084623478, "flos": 11868269512320.0, "grad_norm": 1.9734362678836048, "language_loss": 0.8775323, "learning_rate": 3.998587680434526e-06, "loss": 0.90003806, "num_input_tokens_seen": 14662235, "router_z_loss_clip": 0.87109375, "router_z_loss_mlp": 1.125, "step": 692, "time_per_iteration": 2.3723509311676025 }, { "auxiliary_loss_clip": 0.01143444, "auxiliary_loss_mlp": 0.01106273, "balance_loss_clip": 1.0180105, "balance_loss_mlp": 1.02847564, "epoch": 0.04166541409890275, "flos": 14827472640000.0, "grad_norm": 2.1374499543792664, "language_loss": 0.95044053, "learning_rate": 3.99857300882812e-06, "loss": 0.9729377, "num_input_tokens_seen": 14676065, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 1.1484375, "step": 693, "time_per_iteration": 2.399405002593994 }, { "auxiliary_loss_clip": 0.01145031, "auxiliary_loss_mlp": 0.0110449, "balance_loss_clip": 1.0169431, "balance_loss_mlp": 1.03120589, "epoch": 0.04172553735157072, "flos": 25807063605120.0, "grad_norm": 2.0002385278993358, "language_loss": 0.86368775, "learning_rate": 3.998558261435626e-06, "loss": 0.88618302, "num_input_tokens_seen": 14694955, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 1.140625, "step": 694, "time_per_iteration": 2.4540154933929443 }, { "auxiliary_loss_clip": 0.01148437, "auxiliary_loss_mlp": 0.0111921, "balance_loss_clip": 1.01916981, "balance_loss_mlp": 1.03047276, "epoch": 0.04178566060423869, "flos": 24278670616320.0, "grad_norm": 1.8546510084979992, "language_loss": 0.89156288, "learning_rate": 3.9985434382576015e-06, "loss": 0.91423929, "num_input_tokens_seen": 14715510, "router_z_loss_clip": 1.0, "router_z_loss_mlp": 1.171875, "step": 695, "time_per_iteration": 2.4542396068573 }, { "auxiliary_loss_clip": 0.01141274, "auxiliary_loss_mlp": 0.01111994, "balance_loss_clip": 1.0255909, "balance_loss_mlp": 1.02842283, "epoch": 0.04184578385690666, "flos": 18221065303680.0, "grad_norm": 2.457914240917448, "language_loss": 0.90388429, "learning_rate": 3.99852853929461e-06, "loss": 0.92641699, "num_input_tokens_seen": 14731755, "router_z_loss_clip": 0.86328125, "router_z_loss_mlp": 1.125, "step": 696, "time_per_iteration": 2.380408763885498 }, { "auxiliary_loss_clip": 0.01143746, "auxiliary_loss_mlp": 0.01116715, "balance_loss_clip": 1.02454209, "balance_loss_mlp": 1.0283854, "epoch": 0.041905907109574626, "flos": 22775450584320.0, "grad_norm": 2.330216831203628, "language_loss": 0.9735322, "learning_rate": 3.998513564547216e-06, "loss": 0.99613678, "num_input_tokens_seen": 14750810, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 1.15625, "step": 697, "time_per_iteration": 2.4298970699310303 }, { "auxiliary_loss_clip": 0.0113553, "auxiliary_loss_mlp": 0.01107672, "balance_loss_clip": 1.02141225, "balance_loss_mlp": 1.02829766, "epoch": 0.0419660303622426, "flos": 20155916874240.0, "grad_norm": 2.117415220399418, "language_loss": 0.89289582, "learning_rate": 3.998498514015987e-06, "loss": 0.91532779, "num_input_tokens_seen": 14768435, "router_z_loss_clip": 0.86328125, "router_z_loss_mlp": 1.078125, "step": 698, "time_per_iteration": 2.406831979751587 }, { "auxiliary_loss_clip": 0.0114367, "auxiliary_loss_mlp": 0.011163, "balance_loss_clip": 1.02226782, "balance_loss_mlp": 1.02739084, "epoch": 0.042026153614910564, "flos": 23075249362560.0, "grad_norm": 1.8068258593235993, "language_loss": 0.94687164, "learning_rate": 3.998483387701495e-06, "loss": 0.96947134, "num_input_tokens_seen": 14786690, "router_z_loss_clip": 0.94140625, "router_z_loss_mlp": 1.1640625, "step": 699, "time_per_iteration": 2.4555959701538086 }, { "auxiliary_loss_clip": 0.01059863, "auxiliary_loss_mlp": 0.01034091, "balance_loss_clip": 1.0172112, "balance_loss_mlp": 1.0290184, "epoch": 0.042086276867578536, "flos": 64491734528640.0, "grad_norm": 0.967139958296333, "language_loss": 0.68031961, "learning_rate": 3.998468185604312e-06, "loss": 0.70125914, "num_input_tokens_seen": 14853840, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.30859375, "step": 700, "time_per_iteration": 3.0838208198547363 }, { "auxiliary_loss_clip": 0.01144112, "auxiliary_loss_mlp": 0.01118248, "balance_loss_clip": 1.02278471, "balance_loss_mlp": 1.02894843, "epoch": 0.04214640012024651, "flos": 15486109038720.0, "grad_norm": 2.2284658935471056, "language_loss": 0.9273811, "learning_rate": 3.998452907725016e-06, "loss": 0.9500047, "num_input_tokens_seen": 14869580, "router_z_loss_clip": 0.953125, "router_z_loss_mlp": 1.15625, "step": 701, "time_per_iteration": 2.4092633724212646 }, { "auxiliary_loss_clip": 0.01142335, "auxiliary_loss_mlp": 0.01110361, "balance_loss_clip": 1.0225755, "balance_loss_mlp": 1.02932513, "epoch": 0.04220652337291447, "flos": 23875947550080.0, "grad_norm": 1.7732673197037274, "language_loss": 0.73061061, "learning_rate": 3.998437554064184e-06, "loss": 0.75313759, "num_input_tokens_seen": 14891065, "router_z_loss_clip": 0.87890625, "router_z_loss_mlp": 1.1328125, "step": 702, "time_per_iteration": 2.4450647830963135 }, { "auxiliary_loss_clip": 0.01052494, "auxiliary_loss_mlp": 0.01023664, "balance_loss_clip": 1.00850046, "balance_loss_mlp": 1.02324557, "epoch": 0.042266646625582445, "flos": 63792145238400.0, "grad_norm": 0.8712367276957804, "language_loss": 0.61116236, "learning_rate": 3.9984221246224006e-06, "loss": 0.63192391, "num_input_tokens_seen": 14954815, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.29296875, "step": 703, "time_per_iteration": 3.116760492324829 }, { "auxiliary_loss_clip": 0.01051436, "auxiliary_loss_mlp": 0.01013246, "balance_loss_clip": 0.99875015, "balance_loss_mlp": 1.02073598, "epoch": 0.04232676987825041, "flos": 50015521877760.0, "grad_norm": 1.0704243783074525, "language_loss": 0.5788312, "learning_rate": 3.9984066194002494e-06, "loss": 0.59947801, "num_input_tokens_seen": 15003050, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.30664062, "step": 704, "time_per_iteration": 2.9471654891967773 }, { "auxiliary_loss_clip": 0.01147149, "auxiliary_loss_mlp": 0.01121852, "balance_loss_clip": 1.03625989, "balance_loss_mlp": 1.03329575, "epoch": 0.04238689313091838, "flos": 21615041992320.0, "grad_norm": 2.2101748565416046, "language_loss": 0.91898108, "learning_rate": 3.998391038398319e-06, "loss": 0.94167113, "num_input_tokens_seen": 15021990, "router_z_loss_clip": 0.85546875, "router_z_loss_mlp": 1.140625, "step": 705, "time_per_iteration": 2.438861608505249 }, { "auxiliary_loss_clip": 0.01141899, "auxiliary_loss_mlp": 0.01129015, "balance_loss_clip": 1.04466295, "balance_loss_mlp": 1.0315094, "epoch": 0.042447016383586354, "flos": 19134113846400.0, "grad_norm": 2.4709033476251605, "language_loss": 0.75250739, "learning_rate": 3.998375381617201e-06, "loss": 0.77521658, "num_input_tokens_seen": 15040700, "router_z_loss_clip": 0.84375, "router_z_loss_mlp": 1.109375, "step": 706, "time_per_iteration": 2.4444239139556885 }, { "auxiliary_loss_clip": 0.01150915, "auxiliary_loss_mlp": 0.01141375, "balance_loss_clip": 1.05072856, "balance_loss_mlp": 1.03545868, "epoch": 0.04250713963625432, "flos": 24424851945600.0, "grad_norm": 2.7454612573551307, "language_loss": 0.96787024, "learning_rate": 3.9983596490574875e-06, "loss": 0.99079311, "num_input_tokens_seen": 15056725, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 1.15625, "step": 707, "time_per_iteration": 2.448821783065796 }, { "auxiliary_loss_clip": 0.01152416, "auxiliary_loss_mlp": 0.01135119, "balance_loss_clip": 1.04576015, "balance_loss_mlp": 1.03550673, "epoch": 0.04256726288892229, "flos": 30366231742080.0, "grad_norm": 1.6581984549677886, "language_loss": 0.85144758, "learning_rate": 3.998343840719776e-06, "loss": 0.87432295, "num_input_tokens_seen": 15077550, "router_z_loss_clip": 0.89453125, "router_z_loss_mlp": 1.171875, "step": 708, "time_per_iteration": 2.516350507736206 }, { "auxiliary_loss_clip": 0.01154444, "auxiliary_loss_mlp": 0.01155713, "balance_loss_clip": 1.05867708, "balance_loss_mlp": 1.03431833, "epoch": 0.04262738614159026, "flos": 16361730737280.0, "grad_norm": 1.976981271756763, "language_loss": 0.87505841, "learning_rate": 3.998327956604666e-06, "loss": 0.89815998, "num_input_tokens_seen": 15094955, "router_z_loss_clip": 0.96875, "router_z_loss_mlp": 1.203125, "step": 709, "time_per_iteration": 2.3963935375213623 }, { "auxiliary_loss_clip": 0.0114777, "auxiliary_loss_mlp": 0.01146419, "balance_loss_clip": 1.05605829, "balance_loss_mlp": 1.03262353, "epoch": 0.04268750939425823, "flos": 20411341447680.0, "grad_norm": 3.1010226778975363, "language_loss": 0.91445822, "learning_rate": 3.99831199671276e-06, "loss": 0.9374001, "num_input_tokens_seen": 15113395, "router_z_loss_clip": 0.90234375, "router_z_loss_mlp": 1.1484375, "step": 710, "time_per_iteration": 2.4525625705718994 }, { "auxiliary_loss_clip": 0.01155516, "auxiliary_loss_mlp": 0.01147531, "balance_loss_clip": 1.05345154, "balance_loss_mlp": 1.03446317, "epoch": 0.0427476326469262, "flos": 20301923646720.0, "grad_norm": 1.8841525420833416, "language_loss": 0.88040853, "learning_rate": 3.998295961044662e-06, "loss": 0.90343893, "num_input_tokens_seen": 15132920, "router_z_loss_clip": 0.94140625, "router_z_loss_mlp": 1.2109375, "step": 711, "time_per_iteration": 2.417354106903076 }, { "auxiliary_loss_clip": 0.01144454, "auxiliary_loss_mlp": 0.01128539, "balance_loss_clip": 1.04232681, "balance_loss_mlp": 1.03117323, "epoch": 0.042807755899594166, "flos": 21649780661760.0, "grad_norm": 1.64860752759726, "language_loss": 0.88556767, "learning_rate": 3.9982798496009804e-06, "loss": 0.9082976, "num_input_tokens_seen": 15153115, "router_z_loss_clip": 0.86328125, "router_z_loss_mlp": 1.140625, "step": 712, "time_per_iteration": 2.4566781520843506 }, { "auxiliary_loss_clip": 0.01152712, "auxiliary_loss_mlp": 0.01138287, "balance_loss_clip": 1.04291964, "balance_loss_mlp": 1.03289008, "epoch": 0.04286787915226214, "flos": 21433912525440.0, "grad_norm": 2.3930566971177276, "language_loss": 0.96439731, "learning_rate": 3.998263662382328e-06, "loss": 0.98730725, "num_input_tokens_seen": 15172770, "router_z_loss_clip": 0.953125, "router_z_loss_mlp": 1.1953125, "step": 713, "time_per_iteration": 2.473478078842163 }, { "auxiliary_loss_clip": 0.01038652, "auxiliary_loss_mlp": 0.01092257, "balance_loss_clip": 1.07928669, "balance_loss_mlp": 1.01230037, "epoch": 0.04292800240493011, "flos": 66394256313600.0, "grad_norm": 0.9014068670757661, "language_loss": 0.63926554, "learning_rate": 3.9982473993893165e-06, "loss": 0.66057467, "num_input_tokens_seen": 15240055, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.26367188, "step": 714, "time_per_iteration": 3.1448700428009033 }, { "auxiliary_loss_clip": 0.01141506, "auxiliary_loss_mlp": 0.01115124, "balance_loss_clip": 1.02972221, "balance_loss_mlp": 1.03291035, "epoch": 0.042988125657598075, "flos": 31648905515520.0, "grad_norm": 1.6810060794282315, "language_loss": 0.78302395, "learning_rate": 3.998231060622563e-06, "loss": 0.80559027, "num_input_tokens_seen": 15261585, "router_z_loss_clip": 0.8515625, "router_z_loss_mlp": 1.0859375, "step": 715, "time_per_iteration": 2.524284839630127 }, { "auxiliary_loss_clip": 0.01145289, "auxiliary_loss_mlp": 0.01119189, "balance_loss_clip": 1.02425051, "balance_loss_mlp": 1.03274786, "epoch": 0.04304824891026605, "flos": 33247264602240.0, "grad_norm": 1.7573901763432593, "language_loss": 0.76250398, "learning_rate": 3.998214646082688e-06, "loss": 0.78514874, "num_input_tokens_seen": 15281160, "router_z_loss_clip": 0.9453125, "router_z_loss_mlp": 1.125, "step": 716, "time_per_iteration": 2.547414779663086 }, { "auxiliary_loss_clip": 0.01037703, "auxiliary_loss_mlp": 0.01013978, "balance_loss_clip": 0.99995852, "balance_loss_mlp": 1.01001048, "epoch": 0.04310837216293401, "flos": 64061080508160.0, "grad_norm": 0.9041181067697918, "language_loss": 0.65790331, "learning_rate": 3.998198155770314e-06, "loss": 0.67842013, "num_input_tokens_seen": 15344505, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.27734375, "step": 717, "time_per_iteration": 3.149366617202759 }, { "auxiliary_loss_clip": 0.01043218, "auxiliary_loss_mlp": 0.01017201, "balance_loss_clip": 1.00442147, "balance_loss_mlp": 1.01441026, "epoch": 0.043168495415601985, "flos": 61340719057920.0, "grad_norm": 0.9950550243141916, "language_loss": 0.59116203, "learning_rate": 3.998181589686065e-06, "loss": 0.61176616, "num_input_tokens_seen": 15404050, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.2890625, "step": 718, "time_per_iteration": 2.9409382343292236 }, { "auxiliary_loss_clip": 0.0115902, "auxiliary_loss_mlp": 0.01130951, "balance_loss_clip": 1.03610849, "balance_loss_mlp": 1.04256403, "epoch": 0.04322861866826996, "flos": 20703215347200.0, "grad_norm": 1.8148965741547145, "language_loss": 0.9526943, "learning_rate": 3.99816494783057e-06, "loss": 0.97559398, "num_input_tokens_seen": 15424190, "router_z_loss_clip": 0.94921875, "router_z_loss_mlp": 1.1640625, "step": 719, "time_per_iteration": 2.45314359664917 }, { "auxiliary_loss_clip": 0.01161045, "auxiliary_loss_mlp": 0.01126274, "balance_loss_clip": 1.03615177, "balance_loss_mlp": 1.04308903, "epoch": 0.04328874192093792, "flos": 30372027027840.0, "grad_norm": 1.4297366166222838, "language_loss": 0.68987429, "learning_rate": 3.99814823020446e-06, "loss": 0.71274751, "num_input_tokens_seen": 15446500, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 1.1796875, "step": 720, "time_per_iteration": 2.5416030883789062 }, { "auxiliary_loss_clip": 0.01157038, "auxiliary_loss_mlp": 0.01128176, "balance_loss_clip": 1.04096293, "balance_loss_mlp": 1.04451132, "epoch": 0.043348865173605894, "flos": 21943714331520.0, "grad_norm": 1.9215818591168672, "language_loss": 0.80529994, "learning_rate": 3.9981314368083684e-06, "loss": 0.82815206, "num_input_tokens_seen": 15465830, "router_z_loss_clip": 0.87109375, "router_z_loss_mlp": 1.125, "step": 721, "time_per_iteration": 2.465538740158081 }, { "auxiliary_loss_clip": 0.01162103, "auxiliary_loss_mlp": 0.01125861, "balance_loss_clip": 1.03693056, "balance_loss_mlp": 1.04607975, "epoch": 0.04340898842627386, "flos": 15263433187200.0, "grad_norm": 2.551966770720661, "language_loss": 0.93448949, "learning_rate": 3.998114567642933e-06, "loss": 0.95736915, "num_input_tokens_seen": 15479985, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 1.1640625, "step": 722, "time_per_iteration": 2.441356658935547 }, { "auxiliary_loss_clip": 0.01160653, "auxiliary_loss_mlp": 0.01121257, "balance_loss_clip": 1.03604567, "balance_loss_mlp": 1.04438806, "epoch": 0.04346911167894183, "flos": 27964172090880.0, "grad_norm": 1.7760254459726104, "language_loss": 0.89578849, "learning_rate": 3.998097622708792e-06, "loss": 0.91860759, "num_input_tokens_seen": 15501545, "router_z_loss_clip": 0.8515625, "router_z_loss_mlp": 1.1640625, "step": 723, "time_per_iteration": 2.5088438987731934 }, { "auxiliary_loss_clip": 0.01154337, "auxiliary_loss_mlp": 0.01118766, "balance_loss_clip": 1.03336465, "balance_loss_mlp": 1.04160857, "epoch": 0.0435292349316098, "flos": 29240910933120.0, "grad_norm": 1.6843211822434485, "language_loss": 0.86046994, "learning_rate": 3.99808060200659e-06, "loss": 0.883201, "num_input_tokens_seen": 15521725, "router_z_loss_clip": 0.8515625, "router_z_loss_mlp": 1.125, "step": 724, "time_per_iteration": 2.572150230407715 }, { "auxiliary_loss_clip": 0.01149898, "auxiliary_loss_mlp": 0.01125642, "balance_loss_clip": 1.03885722, "balance_loss_mlp": 1.04166746, "epoch": 0.04358935818427777, "flos": 20557313308800.0, "grad_norm": 1.7737384397129539, "language_loss": 0.83929199, "learning_rate": 3.998063505536971e-06, "loss": 0.86204737, "num_input_tokens_seen": 15540910, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 1.0859375, "step": 725, "time_per_iteration": 2.460517406463623 }, { "auxiliary_loss_clip": 0.01160372, "auxiliary_loss_mlp": 0.01120347, "balance_loss_clip": 1.03623319, "balance_loss_mlp": 1.04292965, "epoch": 0.04364948143694574, "flos": 14464061631360.0, "grad_norm": 1.9363191651098222, "language_loss": 0.9248907, "learning_rate": 3.998046333300584e-06, "loss": 0.94769788, "num_input_tokens_seen": 15558640, "router_z_loss_clip": 0.84375, "router_z_loss_mlp": 1.171875, "step": 726, "time_per_iteration": 2.4446539878845215 }, { "auxiliary_loss_clip": 0.01040842, "auxiliary_loss_mlp": 0.01032295, "balance_loss_clip": 1.02023053, "balance_loss_mlp": 1.01525593, "epoch": 0.043709604689613706, "flos": 50064610982400.0, "grad_norm": 0.9196165543991929, "language_loss": 0.56134468, "learning_rate": 3.998029085298079e-06, "loss": 0.58207601, "num_input_tokens_seen": 15612975, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.25585938, "step": 727, "time_per_iteration": 4.6004369258880615 }, { "auxiliary_loss_clip": 0.01143707, "auxiliary_loss_mlp": 0.0110247, "balance_loss_clip": 1.02107394, "balance_loss_mlp": 1.03138566, "epoch": 0.04376972794228168, "flos": 13990709151360.0, "grad_norm": 2.1946081947748133, "language_loss": 0.86729062, "learning_rate": 3.998011761530112e-06, "loss": 0.88975233, "num_input_tokens_seen": 15631070, "router_z_loss_clip": 0.81640625, "router_z_loss_mlp": 1.125, "step": 728, "time_per_iteration": 2.424870491027832 }, { "auxiliary_loss_clip": 0.01139082, "auxiliary_loss_mlp": 0.01099467, "balance_loss_clip": 1.02021623, "balance_loss_mlp": 1.03237259, "epoch": 0.04382985119494965, "flos": 22009037218560.0, "grad_norm": 2.1532141704767183, "language_loss": 0.79052758, "learning_rate": 3.997994361997338e-06, "loss": 0.81291306, "num_input_tokens_seen": 15647825, "router_z_loss_clip": 0.79296875, "router_z_loss_mlp": 1.0625, "step": 729, "time_per_iteration": 3.9228484630584717 }, { "auxiliary_loss_clip": 0.01148438, "auxiliary_loss_mlp": 0.01102248, "balance_loss_clip": 1.01842046, "balance_loss_mlp": 1.03554714, "epoch": 0.043889974447617615, "flos": 24205387939200.0, "grad_norm": 1.8937880481048945, "language_loss": 0.9912231, "learning_rate": 3.997976886700417e-06, "loss": 1.01372993, "num_input_tokens_seen": 15668260, "router_z_loss_clip": 0.83984375, "router_z_loss_mlp": 1.125, "step": 730, "time_per_iteration": 3.9357101917266846 }, { "auxiliary_loss_clip": 0.01144626, "auxiliary_loss_mlp": 0.01096822, "balance_loss_clip": 1.01637924, "balance_loss_mlp": 1.03328729, "epoch": 0.04395009770028559, "flos": 17273592293760.0, "grad_norm": 2.4081699568166632, "language_loss": 0.93713468, "learning_rate": 3.997959335640013e-06, "loss": 0.95954919, "num_input_tokens_seen": 15685630, "router_z_loss_clip": 0.8046875, "router_z_loss_mlp": 1.109375, "step": 731, "time_per_iteration": 2.4546773433685303 }, { "auxiliary_loss_clip": 0.01147668, "auxiliary_loss_mlp": 0.01110516, "balance_loss_clip": 1.02196741, "balance_loss_mlp": 1.03637171, "epoch": 0.04401022095295355, "flos": 12309536586240.0, "grad_norm": 2.8717169498735915, "language_loss": 0.95605797, "learning_rate": 3.997941708816791e-06, "loss": 0.97863984, "num_input_tokens_seen": 15698645, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 1.1171875, "step": 732, "time_per_iteration": 2.451756477355957 }, { "auxiliary_loss_clip": 0.01152254, "auxiliary_loss_mlp": 0.01111422, "balance_loss_clip": 1.02783227, "balance_loss_mlp": 1.0397439, "epoch": 0.044070344205621524, "flos": 20958605009280.0, "grad_norm": 2.04184133923239, "language_loss": 0.90404558, "learning_rate": 3.997924006231419e-06, "loss": 0.92668235, "num_input_tokens_seen": 15716775, "router_z_loss_clip": 0.8359375, "router_z_loss_mlp": 1.125, "step": 733, "time_per_iteration": 2.453716993331909 }, { "auxiliary_loss_clip": 0.01155701, "auxiliary_loss_mlp": 0.01120224, "balance_loss_clip": 1.03124654, "balance_loss_mlp": 1.0405035, "epoch": 0.044130467458289496, "flos": 13844423088000.0, "grad_norm": 2.2516441753610117, "language_loss": 0.94911051, "learning_rate": 3.9979062278845685e-06, "loss": 0.97186971, "num_input_tokens_seen": 15733320, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 1.1484375, "step": 734, "time_per_iteration": 2.43974232673645 }, { "auxiliary_loss_clip": 0.01153703, "auxiliary_loss_mlp": 0.01118044, "balance_loss_clip": 1.0334053, "balance_loss_mlp": 1.04253638, "epoch": 0.04419059071095746, "flos": 28653881466240.0, "grad_norm": 1.7596240525812477, "language_loss": 0.82477832, "learning_rate": 3.9978883737769125e-06, "loss": 0.84749579, "num_input_tokens_seen": 15752705, "router_z_loss_clip": 0.84765625, "router_z_loss_mlp": 1.109375, "step": 735, "time_per_iteration": 2.5013484954833984 }, { "auxiliary_loss_clip": 0.0115145, "auxiliary_loss_mlp": 0.01113023, "balance_loss_clip": 1.03262782, "balance_loss_mlp": 1.04059815, "epoch": 0.04425071396362543, "flos": 28182065086080.0, "grad_norm": 1.958572002443253, "language_loss": 0.92400038, "learning_rate": 3.9978704439091305e-06, "loss": 0.94664514, "num_input_tokens_seen": 15772800, "router_z_loss_clip": 0.8046875, "router_z_loss_mlp": 1.109375, "step": 736, "time_per_iteration": 2.520101547241211 }, { "auxiliary_loss_clip": 0.01153039, "auxiliary_loss_mlp": 0.01122166, "balance_loss_clip": 1.03328347, "balance_loss_mlp": 1.04079342, "epoch": 0.0443108372162934, "flos": 23657356327680.0, "grad_norm": 1.6545055352494313, "language_loss": 0.88095534, "learning_rate": 3.997852438281901e-06, "loss": 0.90370739, "num_input_tokens_seen": 15793665, "router_z_loss_clip": 0.88671875, "router_z_loss_mlp": 1.125, "step": 737, "time_per_iteration": 2.5043704509735107 }, { "auxiliary_loss_clip": 0.01153469, "auxiliary_loss_mlp": 0.01120557, "balance_loss_clip": 1.0332005, "balance_loss_mlp": 1.04126489, "epoch": 0.04437096046896137, "flos": 33978590184960.0, "grad_norm": 1.8760504001142357, "language_loss": 0.88670981, "learning_rate": 3.997834356895906e-06, "loss": 0.90945005, "num_input_tokens_seen": 15813175, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 1.125, "step": 738, "time_per_iteration": 2.580198287963867 }, { "auxiliary_loss_clip": 0.0104627, "auxiliary_loss_mlp": 0.01020977, "balance_loss_clip": 1.00695777, "balance_loss_mlp": 1.0171442, "epoch": 0.04443108372162934, "flos": 67394379386880.0, "grad_norm": 0.8809992072563103, "language_loss": 0.59327638, "learning_rate": 3.9978161997518324e-06, "loss": 0.61394882, "num_input_tokens_seen": 15872050, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.29101562, "step": 739, "time_per_iteration": 3.019779682159424 }, { "auxiliary_loss_clip": 0.01147755, "auxiliary_loss_mlp": 0.01117163, "balance_loss_clip": 1.03323972, "balance_loss_mlp": 1.03867161, "epoch": 0.04449120697429731, "flos": 29751376055040.0, "grad_norm": 2.210708701997334, "language_loss": 0.97377348, "learning_rate": 3.997797966850369e-06, "loss": 0.99642277, "num_input_tokens_seen": 15891085, "router_z_loss_clip": 0.8359375, "router_z_loss_mlp": 1.09375, "step": 740, "time_per_iteration": 2.5146284103393555 }, { "auxiliary_loss_clip": 0.01146668, "auxiliary_loss_mlp": 0.01116899, "balance_loss_clip": 1.03173625, "balance_loss_mlp": 1.03367734, "epoch": 0.04455133022696528, "flos": 36500645779200.0, "grad_norm": 1.822140058037994, "language_loss": 0.76034653, "learning_rate": 3.997779658192205e-06, "loss": 0.78298223, "num_input_tokens_seen": 15914225, "router_z_loss_clip": 0.8515625, "router_z_loss_mlp": 1.1328125, "step": 741, "time_per_iteration": 2.5871119499206543 }, { "auxiliary_loss_clip": 0.01135098, "auxiliary_loss_mlp": 0.01106665, "balance_loss_clip": 1.02054882, "balance_loss_mlp": 1.02734685, "epoch": 0.044611453479633245, "flos": 28802401856640.0, "grad_norm": 1.6392228533763398, "language_loss": 0.9079752, "learning_rate": 3.997761273778037e-06, "loss": 0.9303928, "num_input_tokens_seen": 15934540, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 1.078125, "step": 742, "time_per_iteration": 2.496699333190918 }, { "auxiliary_loss_clip": 0.01138247, "auxiliary_loss_mlp": 0.01120232, "balance_loss_clip": 1.02443576, "balance_loss_mlp": 1.02893972, "epoch": 0.04467157673230122, "flos": 20009945013120.0, "grad_norm": 1.8766350539023289, "language_loss": 0.87973946, "learning_rate": 3.997742813608561e-06, "loss": 0.90232432, "num_input_tokens_seen": 15952560, "router_z_loss_clip": 0.95703125, "router_z_loss_mlp": 1.09375, "step": 743, "time_per_iteration": 2.4465954303741455 }, { "auxiliary_loss_clip": 0.01140816, "auxiliary_loss_mlp": 0.01113915, "balance_loss_clip": 1.02670169, "balance_loss_mlp": 1.02928197, "epoch": 0.04473169998496919, "flos": 18003975269760.0, "grad_norm": 2.2816699584426328, "language_loss": 0.85252416, "learning_rate": 3.997724277684479e-06, "loss": 0.87507147, "num_input_tokens_seen": 15970620, "router_z_loss_clip": 0.87109375, "router_z_loss_mlp": 1.109375, "step": 744, "time_per_iteration": 2.4268076419830322 }, { "auxiliary_loss_clip": 0.01137097, "auxiliary_loss_mlp": 0.01116476, "balance_loss_clip": 1.02730727, "balance_loss_mlp": 1.02858961, "epoch": 0.044791823237637154, "flos": 20630665808640.0, "grad_norm": 2.2616490585746267, "language_loss": 0.88658237, "learning_rate": 3.99770566600649e-06, "loss": 0.90911818, "num_input_tokens_seen": 15987325, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 1.0859375, "step": 745, "time_per_iteration": 2.4623963832855225 }, { "auxiliary_loss_clip": 0.0113805, "auxiliary_loss_mlp": 0.01113571, "balance_loss_clip": 1.02235186, "balance_loss_mlp": 1.02790534, "epoch": 0.04485194649030513, "flos": 31174819896960.0, "grad_norm": 1.681727306137203, "language_loss": 0.72507191, "learning_rate": 3.997686978575302e-06, "loss": 0.74758816, "num_input_tokens_seen": 16008310, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 1.1015625, "step": 746, "time_per_iteration": 2.5230627059936523 }, { "auxiliary_loss_clip": 0.01135766, "auxiliary_loss_mlp": 0.01108748, "balance_loss_clip": 1.02582586, "balance_loss_mlp": 1.02934778, "epoch": 0.04491206974297309, "flos": 26142019989120.0, "grad_norm": 1.7980778513982356, "language_loss": 0.73686814, "learning_rate": 3.997668215391625e-06, "loss": 0.75931334, "num_input_tokens_seen": 16029620, "router_z_loss_clip": 0.828125, "router_z_loss_mlp": 1.0625, "step": 747, "time_per_iteration": 2.4970059394836426 }, { "auxiliary_loss_clip": 0.01132827, "auxiliary_loss_mlp": 0.01109177, "balance_loss_clip": 1.0300225, "balance_loss_mlp": 1.02718925, "epoch": 0.044972192995641064, "flos": 20666626375680.0, "grad_norm": 1.6733235936810396, "language_loss": 0.705854, "learning_rate": 3.997649376456168e-06, "loss": 0.72827411, "num_input_tokens_seen": 16049065, "router_z_loss_clip": 0.79296875, "router_z_loss_mlp": 1.0546875, "step": 748, "time_per_iteration": 2.4578843116760254 }, { "auxiliary_loss_clip": 0.01133152, "auxiliary_loss_mlp": 0.01108276, "balance_loss_clip": 1.02506781, "balance_loss_mlp": 1.02805603, "epoch": 0.045032316248309036, "flos": 16105922138880.0, "grad_norm": 2.0145462783121304, "language_loss": 0.81297636, "learning_rate": 3.997630461769647e-06, "loss": 0.83539069, "num_input_tokens_seen": 16066765, "router_z_loss_clip": 0.83203125, "router_z_loss_mlp": 1.046875, "step": 749, "time_per_iteration": 2.4110422134399414 }, { "auxiliary_loss_clip": 0.01133525, "auxiliary_loss_mlp": 0.01097532, "balance_loss_clip": 1.02266872, "balance_loss_mlp": 1.02717578, "epoch": 0.045092439500977, "flos": 17857863763200.0, "grad_norm": 1.9740615004970081, "language_loss": 0.9338429, "learning_rate": 3.997611471332778e-06, "loss": 0.95615345, "num_input_tokens_seen": 16085980, "router_z_loss_clip": 0.75, "router_z_loss_mlp": 1.0625, "step": 750, "time_per_iteration": 2.403404951095581 }, { "auxiliary_loss_clip": 0.01133673, "auxiliary_loss_mlp": 0.01110672, "balance_loss_clip": 1.02107453, "balance_loss_mlp": 1.02723479, "epoch": 0.04515256275364497, "flos": 24461650385280.0, "grad_norm": 1.8150889031213246, "language_loss": 0.78918195, "learning_rate": 3.9975924051462825e-06, "loss": 0.81162548, "num_input_tokens_seen": 16106260, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 1.0625, "step": 751, "time_per_iteration": 2.4733352661132812 }, { "auxiliary_loss_clip": 0.01129511, "auxiliary_loss_mlp": 0.0110942, "balance_loss_clip": 1.02683175, "balance_loss_mlp": 1.02687716, "epoch": 0.04521268600631294, "flos": 20915522524800.0, "grad_norm": 2.1476393589833953, "language_loss": 0.73663074, "learning_rate": 3.997573263210883e-06, "loss": 0.75902009, "num_input_tokens_seen": 16123475, "router_z_loss_clip": 0.82421875, "router_z_loss_mlp": 1.0234375, "step": 752, "time_per_iteration": 2.4177374839782715 }, { "auxiliary_loss_clip": 0.01125974, "auxiliary_loss_mlp": 0.01095165, "balance_loss_clip": 1.0210644, "balance_loss_mlp": 1.02512336, "epoch": 0.04527280925898091, "flos": 13370512026240.0, "grad_norm": 2.4592828739466985, "language_loss": 0.98795986, "learning_rate": 3.997554045527305e-06, "loss": 1.01017118, "num_input_tokens_seen": 16138335, "router_z_loss_clip": 0.73828125, "router_z_loss_mlp": 1.0078125, "step": 753, "time_per_iteration": 2.3938474655151367 }, { "auxiliary_loss_clip": 0.01132138, "auxiliary_loss_mlp": 0.01102924, "balance_loss_clip": 1.01881051, "balance_loss_mlp": 1.02710354, "epoch": 0.04533293251164888, "flos": 23253551009280.0, "grad_norm": 1.7829900085674473, "language_loss": 0.94655603, "learning_rate": 3.997534752096277e-06, "loss": 0.96890664, "num_input_tokens_seen": 16157110, "router_z_loss_clip": 0.83984375, "router_z_loss_mlp": 1.046875, "step": 754, "time_per_iteration": 2.4401543140411377 }, { "auxiliary_loss_clip": 0.01119839, "auxiliary_loss_mlp": 0.01100307, "balance_loss_clip": 1.03130913, "balance_loss_mlp": 1.02705443, "epoch": 0.04539305576431685, "flos": 12421188714240.0, "grad_norm": 2.2233125280831234, "language_loss": 0.8334623, "learning_rate": 3.997515382918531e-06, "loss": 0.85566378, "num_input_tokens_seen": 16174155, "router_z_loss_clip": 0.69140625, "router_z_loss_mlp": 0.9296875, "step": 755, "time_per_iteration": 2.423874855041504 }, { "auxiliary_loss_clip": 0.01126963, "auxiliary_loss_mlp": 0.01094351, "balance_loss_clip": 1.01767564, "balance_loss_mlp": 1.02748966, "epoch": 0.04545317901698482, "flos": 16070066305920.0, "grad_norm": 2.7768742739392476, "language_loss": 0.83761609, "learning_rate": 3.9974959379948015e-06, "loss": 0.85982925, "num_input_tokens_seen": 16192240, "router_z_loss_clip": 0.765625, "router_z_loss_mlp": 0.9921875, "step": 756, "time_per_iteration": 2.423295021057129 }, { "auxiliary_loss_clip": 0.01038475, "auxiliary_loss_mlp": 0.01020994, "balance_loss_clip": 1.00993145, "balance_loss_mlp": 1.01612711, "epoch": 0.045513302269652785, "flos": 66392475834240.0, "grad_norm": 0.8147330362799062, "language_loss": 0.62927663, "learning_rate": 3.997476417325827e-06, "loss": 0.64987135, "num_input_tokens_seen": 16255775, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.22363281, "step": 757, "time_per_iteration": 3.1076056957244873 }, { "auxiliary_loss_clip": 0.01123728, "auxiliary_loss_mlp": 0.01089517, "balance_loss_clip": 1.01756227, "balance_loss_mlp": 1.02637184, "epoch": 0.04557342552232076, "flos": 21470082560640.0, "grad_norm": 1.4475162366835992, "language_loss": 0.86870086, "learning_rate": 3.997456820912346e-06, "loss": 0.89083326, "num_input_tokens_seen": 16277015, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 0.97265625, "step": 758, "time_per_iteration": 2.4468162059783936 }, { "auxiliary_loss_clip": 0.01122527, "auxiliary_loss_mlp": 0.01096152, "balance_loss_clip": 1.02457893, "balance_loss_mlp": 1.02613902, "epoch": 0.04563354877498873, "flos": 23731546700160.0, "grad_norm": 1.6421709814566503, "language_loss": 0.92035711, "learning_rate": 3.997437148755101e-06, "loss": 0.94254386, "num_input_tokens_seen": 16296005, "router_z_loss_clip": 0.71484375, "router_z_loss_mlp": 0.96484375, "step": 759, "time_per_iteration": 2.4394891262054443 }, { "auxiliary_loss_clip": 0.01127605, "auxiliary_loss_mlp": 0.01089758, "balance_loss_clip": 1.01952028, "balance_loss_mlp": 1.02720857, "epoch": 0.045693672027656694, "flos": 25734653712000.0, "grad_norm": 1.9414847033246145, "language_loss": 0.78815544, "learning_rate": 3.9974174008548405e-06, "loss": 0.81032914, "num_input_tokens_seen": 16315300, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 1.0, "step": 760, "time_per_iteration": 2.4857101440429688 }, { "auxiliary_loss_clip": 0.01123582, "auxiliary_loss_mlp": 0.01086036, "balance_loss_clip": 1.0194695, "balance_loss_mlp": 1.02658474, "epoch": 0.045753795280324666, "flos": 19718001290880.0, "grad_norm": 1.8907724451366423, "language_loss": 0.86819911, "learning_rate": 3.9973975772123105e-06, "loss": 0.89029527, "num_input_tokens_seen": 16333820, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 0.97265625, "step": 761, "time_per_iteration": 2.426213026046753 }, { "auxiliary_loss_clip": 0.01124746, "auxiliary_loss_mlp": 0.0109556, "balance_loss_clip": 1.02222252, "balance_loss_mlp": 1.02707672, "epoch": 0.04581391853299264, "flos": 23254737995520.0, "grad_norm": 1.5763461964755348, "language_loss": 0.81917208, "learning_rate": 3.997377677828266e-06, "loss": 0.84137511, "num_input_tokens_seen": 16355290, "router_z_loss_clip": 0.73046875, "router_z_loss_mlp": 0.9765625, "step": 762, "time_per_iteration": 2.4767606258392334 }, { "auxiliary_loss_clip": 0.0103563, "auxiliary_loss_mlp": 0.01017786, "balance_loss_clip": 1.00557923, "balance_loss_mlp": 1.01358962, "epoch": 0.0458740417856606, "flos": 64227896317440.0, "grad_norm": 1.0099804210804244, "language_loss": 0.58912963, "learning_rate": 3.9973577027034585e-06, "loss": 0.60966378, "num_input_tokens_seen": 16415995, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.22070312, "step": 763, "time_per_iteration": 3.0731773376464844 }, { "auxiliary_loss_clip": 0.01121011, "auxiliary_loss_mlp": 0.01086608, "balance_loss_clip": 1.01884973, "balance_loss_mlp": 1.02573705, "epoch": 0.045934165038328575, "flos": 20769271372800.0, "grad_norm": 2.0696924660042004, "language_loss": 0.9309442, "learning_rate": 3.9973376518386475e-06, "loss": 0.95302039, "num_input_tokens_seen": 16433120, "router_z_loss_clip": 0.67578125, "router_z_loss_mlp": 0.953125, "step": 764, "time_per_iteration": 2.44710111618042 }, { "auxiliary_loss_clip": 0.01128728, "auxiliary_loss_mlp": 0.01092947, "balance_loss_clip": 1.02146888, "balance_loss_mlp": 1.02825689, "epoch": 0.04599428829099654, "flos": 30261596797440.0, "grad_norm": 1.8970109995098774, "language_loss": 0.91690052, "learning_rate": 3.997317525234592e-06, "loss": 0.93911725, "num_input_tokens_seen": 16453360, "router_z_loss_clip": 0.71484375, "router_z_loss_mlp": 1.0078125, "step": 765, "time_per_iteration": 2.4943723678588867 }, { "auxiliary_loss_clip": 0.01127246, "auxiliary_loss_mlp": 0.01095254, "balance_loss_clip": 1.02358532, "balance_loss_mlp": 1.02612019, "epoch": 0.04605441154366451, "flos": 23037822518400.0, "grad_norm": 2.3386482294694195, "language_loss": 0.94150591, "learning_rate": 3.997297322892056e-06, "loss": 0.96373093, "num_input_tokens_seen": 16471160, "router_z_loss_clip": 0.71484375, "router_z_loss_mlp": 1.0078125, "step": 766, "time_per_iteration": 3.94807767868042 }, { "auxiliary_loss_clip": 0.0112332, "auxiliary_loss_mlp": 0.01088985, "balance_loss_clip": 1.02084517, "balance_loss_mlp": 1.02624428, "epoch": 0.046114534796332485, "flos": 22016333692800.0, "grad_norm": 2.01040258366671, "language_loss": 0.87956429, "learning_rate": 3.997277044811806e-06, "loss": 0.90168738, "num_input_tokens_seen": 16488940, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 0.96875, "step": 767, "time_per_iteration": 2.4641506671905518 }, { "auxiliary_loss_clip": 0.01126415, "auxiliary_loss_mlp": 0.01092933, "balance_loss_clip": 1.0230763, "balance_loss_mlp": 1.02637029, "epoch": 0.04617465804900045, "flos": 29861073146880.0, "grad_norm": 1.8448964343350567, "language_loss": 0.90976912, "learning_rate": 3.99725669099461e-06, "loss": 0.93196261, "num_input_tokens_seen": 16509505, "router_z_loss_clip": 0.69921875, "router_z_loss_mlp": 1.0, "step": 768, "time_per_iteration": 2.4838666915893555 }, { "auxiliary_loss_clip": 0.01129397, "auxiliary_loss_mlp": 0.01089388, "balance_loss_clip": 1.01905465, "balance_loss_mlp": 1.02606571, "epoch": 0.04623478130166842, "flos": 25628866692480.0, "grad_norm": 1.9684060631435032, "language_loss": 0.78897661, "learning_rate": 3.9972362614412395e-06, "loss": 0.81116444, "num_input_tokens_seen": 16528840, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 1.03125, "step": 769, "time_per_iteration": 5.420944452285767 }, { "auxiliary_loss_clip": 0.0112043, "auxiliary_loss_mlp": 0.0109677, "balance_loss_clip": 1.02567363, "balance_loss_mlp": 1.02694619, "epoch": 0.04629490455433639, "flos": 20448035153280.0, "grad_norm": 1.724662341571376, "language_loss": 0.89795363, "learning_rate": 3.997215756152471e-06, "loss": 0.92012566, "num_input_tokens_seen": 16548335, "router_z_loss_clip": 0.7109375, "router_z_loss_mlp": 0.9375, "step": 770, "time_per_iteration": 3.9440112113952637 }, { "auxiliary_loss_clip": 0.01128884, "auxiliary_loss_mlp": 0.01094301, "balance_loss_clip": 1.02482605, "balance_loss_mlp": 1.02562356, "epoch": 0.04635502780700436, "flos": 23147624344320.0, "grad_norm": 2.0470585001970005, "language_loss": 0.91511911, "learning_rate": 3.99719517512908e-06, "loss": 0.93735099, "num_input_tokens_seen": 16567725, "router_z_loss_clip": 0.6953125, "router_z_loss_mlp": 1.03125, "step": 771, "time_per_iteration": 2.4450881481170654 }, { "auxiliary_loss_clip": 0.01136375, "auxiliary_loss_mlp": 0.01100635, "balance_loss_clip": 1.0188576, "balance_loss_mlp": 1.02673697, "epoch": 0.04641515105967233, "flos": 23290977853440.0, "grad_norm": 1.9674777994919002, "language_loss": 0.87871599, "learning_rate": 3.997174518371848e-06, "loss": 0.90108621, "num_input_tokens_seen": 16588175, "router_z_loss_clip": 0.8203125, "router_z_loss_mlp": 1.09375, "step": 772, "time_per_iteration": 2.435906171798706 }, { "auxiliary_loss_clip": 0.01128413, "auxiliary_loss_mlp": 0.01088221, "balance_loss_clip": 1.02098715, "balance_loss_mlp": 1.02830124, "epoch": 0.046475274312340296, "flos": 25114142384640.0, "grad_norm": 1.7385662105911095, "language_loss": 0.78276026, "learning_rate": 3.997153785881557e-06, "loss": 0.80492663, "num_input_tokens_seen": 16607735, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 1.0, "step": 773, "time_per_iteration": 2.4614150524139404 }, { "auxiliary_loss_clip": 0.01123253, "auxiliary_loss_mlp": 0.01081742, "balance_loss_clip": 1.01784563, "balance_loss_mlp": 1.02816975, "epoch": 0.04653539756500827, "flos": 25263745027200.0, "grad_norm": 1.8042543341097, "language_loss": 0.81806386, "learning_rate": 3.997132977658996e-06, "loss": 0.84011382, "num_input_tokens_seen": 16627225, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 0.953125, "step": 774, "time_per_iteration": 2.4512312412261963 }, { "auxiliary_loss_clip": 0.01130333, "auxiliary_loss_mlp": 0.01101352, "balance_loss_clip": 1.02782345, "balance_loss_mlp": 1.02870679, "epoch": 0.046595520817676234, "flos": 35402802076800.0, "grad_norm": 2.5106479490493854, "language_loss": 0.76514322, "learning_rate": 3.997112093704952e-06, "loss": 0.78746003, "num_input_tokens_seen": 16647785, "router_z_loss_clip": 0.734375, "router_z_loss_mlp": 1.015625, "step": 775, "time_per_iteration": 2.5525665283203125 }, { "auxiliary_loss_clip": 0.01129184, "auxiliary_loss_mlp": 0.01092556, "balance_loss_clip": 1.01878905, "balance_loss_mlp": 1.02747607, "epoch": 0.046655644070344206, "flos": 18111577680000.0, "grad_norm": 1.612297059918002, "language_loss": 0.81280386, "learning_rate": 3.997091134020217e-06, "loss": 0.83502126, "num_input_tokens_seen": 16667555, "router_z_loss_clip": 0.73828125, "router_z_loss_mlp": 1.015625, "step": 776, "time_per_iteration": 2.4103944301605225 }, { "auxiliary_loss_clip": 0.01127989, "auxiliary_loss_mlp": 0.01089115, "balance_loss_clip": 1.01549149, "balance_loss_mlp": 1.02790773, "epoch": 0.04671576732301218, "flos": 29204007759360.0, "grad_norm": 1.7187579355062241, "language_loss": 0.76234835, "learning_rate": 3.997070098605585e-06, "loss": 0.78451943, "num_input_tokens_seen": 16686875, "router_z_loss_clip": 0.734375, "router_z_loss_mlp": 1.0, "step": 777, "time_per_iteration": 2.4932217597961426 }, { "auxiliary_loss_clip": 0.01127021, "auxiliary_loss_mlp": 0.01092988, "balance_loss_clip": 1.01812434, "balance_loss_mlp": 1.02743447, "epoch": 0.04677589057568014, "flos": 30477115820160.0, "grad_norm": 1.7224883389611743, "language_loss": 0.79182601, "learning_rate": 3.997048987461856e-06, "loss": 0.81402612, "num_input_tokens_seen": 16706420, "router_z_loss_clip": 0.75, "router_z_loss_mlp": 0.99609375, "step": 778, "time_per_iteration": 2.5414860248565674 }, { "auxiliary_loss_clip": 0.0113126, "auxiliary_loss_mlp": 0.0110191, "balance_loss_clip": 1.02594984, "balance_loss_mlp": 1.02887797, "epoch": 0.046836013828348115, "flos": 20556649992960.0, "grad_norm": 1.8248308589518587, "language_loss": 0.82879174, "learning_rate": 3.997027800589829e-06, "loss": 0.85112345, "num_input_tokens_seen": 16726390, "router_z_loss_clip": 0.7578125, "router_z_loss_mlp": 1.0234375, "step": 779, "time_per_iteration": 2.4907591342926025 }, { "auxiliary_loss_clip": 0.01124461, "auxiliary_loss_mlp": 0.01090545, "balance_loss_clip": 1.02126074, "balance_loss_mlp": 1.02609038, "epoch": 0.04689613708101608, "flos": 25446201125760.0, "grad_norm": 1.9988860237486408, "language_loss": 0.79348707, "learning_rate": 3.997006537990308e-06, "loss": 0.81563711, "num_input_tokens_seen": 16748965, "router_z_loss_clip": 0.6953125, "router_z_loss_mlp": 0.984375, "step": 780, "time_per_iteration": 2.5092263221740723 }, { "auxiliary_loss_clip": 0.01124685, "auxiliary_loss_mlp": 0.01087951, "balance_loss_clip": 1.01766503, "balance_loss_mlp": 1.0277046, "epoch": 0.04695626033368405, "flos": 23000325851520.0, "grad_norm": 1.5564591161405106, "language_loss": 0.78871608, "learning_rate": 3.996985199664099e-06, "loss": 0.81084239, "num_input_tokens_seen": 16768620, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 0.96875, "step": 781, "time_per_iteration": 2.4851717948913574 }, { "auxiliary_loss_clip": 0.01134739, "auxiliary_loss_mlp": 0.01101609, "balance_loss_clip": 1.02035594, "balance_loss_mlp": 1.02886438, "epoch": 0.047016383586352024, "flos": 29132051713920.0, "grad_norm": 1.8514347251301078, "language_loss": 0.79326612, "learning_rate": 3.99696378561201e-06, "loss": 0.8156296, "num_input_tokens_seen": 16789755, "router_z_loss_clip": 0.8125, "router_z_loss_mlp": 1.0625, "step": 782, "time_per_iteration": 2.5721004009246826 }, { "auxiliary_loss_clip": 0.01133448, "auxiliary_loss_mlp": 0.01093575, "balance_loss_clip": 1.02033305, "balance_loss_mlp": 1.03003383, "epoch": 0.04707650683901999, "flos": 14975434448640.0, "grad_norm": 1.8153019745488344, "language_loss": 0.83836341, "learning_rate": 3.996942295834855e-06, "loss": 0.86063361, "num_input_tokens_seen": 16807585, "router_z_loss_clip": 0.73046875, "router_z_loss_mlp": 1.03125, "step": 783, "time_per_iteration": 2.442685842514038 }, { "auxiliary_loss_clip": 0.01126048, "auxiliary_loss_mlp": 0.01096142, "balance_loss_clip": 1.02514052, "balance_loss_mlp": 1.02859151, "epoch": 0.04713663009168796, "flos": 21650094864000.0, "grad_norm": 1.6397611669243433, "language_loss": 0.84214222, "learning_rate": 3.996920730333448e-06, "loss": 0.86436403, "num_input_tokens_seen": 16827220, "router_z_loss_clip": 0.7109375, "router_z_loss_mlp": 0.97265625, "step": 784, "time_per_iteration": 2.445434093475342 }, { "auxiliary_loss_clip": 0.01132832, "auxiliary_loss_mlp": 0.01094157, "balance_loss_clip": 1.02019954, "balance_loss_mlp": 1.02982461, "epoch": 0.04719675334435593, "flos": 21324320167680.0, "grad_norm": 1.9516384574821146, "language_loss": 0.83121675, "learning_rate": 3.996899089108607e-06, "loss": 0.85348666, "num_input_tokens_seen": 16846230, "router_z_loss_clip": 0.7421875, "router_z_loss_mlp": 1.0234375, "step": 785, "time_per_iteration": 2.462984085083008 }, { "auxiliary_loss_clip": 0.01128416, "auxiliary_loss_mlp": 0.01093777, "balance_loss_clip": 1.02163196, "balance_loss_mlp": 1.02940941, "epoch": 0.0472568765970239, "flos": 17930413301760.0, "grad_norm": 1.8294102202585873, "language_loss": 0.94905436, "learning_rate": 3.996877372161152e-06, "loss": 0.97127628, "num_input_tokens_seen": 16865325, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 0.98828125, "step": 786, "time_per_iteration": 2.3854053020477295 }, { "auxiliary_loss_clip": 0.01136259, "auxiliary_loss_mlp": 0.01107894, "balance_loss_clip": 1.02282584, "balance_loss_mlp": 1.03083968, "epoch": 0.04731699984969187, "flos": 18076350251520.0, "grad_norm": 2.0750957617473507, "language_loss": 0.83329582, "learning_rate": 3.9968555794919065e-06, "loss": 0.85573733, "num_input_tokens_seen": 16882930, "router_z_loss_clip": 0.8515625, "router_z_loss_mlp": 1.0546875, "step": 787, "time_per_iteration": 2.4238786697387695 }, { "auxiliary_loss_clip": 0.01135761, "auxiliary_loss_mlp": 0.01096024, "balance_loss_clip": 1.02135134, "balance_loss_mlp": 1.03283858, "epoch": 0.047377123102359836, "flos": 23183968936320.0, "grad_norm": 2.171102484013912, "language_loss": 0.86099792, "learning_rate": 3.996833711101698e-06, "loss": 0.88331574, "num_input_tokens_seen": 16900710, "router_z_loss_clip": 0.74609375, "router_z_loss_mlp": 1.03125, "step": 788, "time_per_iteration": 2.429232358932495 }, { "auxiliary_loss_clip": 0.01130573, "auxiliary_loss_mlp": 0.0109877, "balance_loss_clip": 1.02266729, "balance_loss_mlp": 1.03172731, "epoch": 0.04743724635502781, "flos": 22746681757440.0, "grad_norm": 1.780541112028689, "language_loss": 0.87988985, "learning_rate": 3.996811766991355e-06, "loss": 0.90218329, "num_input_tokens_seen": 16919210, "router_z_loss_clip": 0.76171875, "router_z_loss_mlp": 0.9921875, "step": 789, "time_per_iteration": 2.422647476196289 }, { "auxiliary_loss_clip": 0.01129348, "auxiliary_loss_mlp": 0.01096613, "balance_loss_clip": 1.02465832, "balance_loss_mlp": 1.02794719, "epoch": 0.04749736960769577, "flos": 17237736460800.0, "grad_norm": 2.3503811297891493, "language_loss": 0.86072397, "learning_rate": 3.996789747161709e-06, "loss": 0.88298362, "num_input_tokens_seen": 16937125, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 1.015625, "step": 790, "time_per_iteration": 2.4177567958831787 }, { "auxiliary_loss_clip": 0.01128664, "auxiliary_loss_mlp": 0.01090908, "balance_loss_clip": 1.01942992, "balance_loss_mlp": 1.02669954, "epoch": 0.047557492860363745, "flos": 40477672039680.0, "grad_norm": 1.812654727552757, "language_loss": 0.91726911, "learning_rate": 3.996767651613597e-06, "loss": 0.93946481, "num_input_tokens_seen": 16958610, "router_z_loss_clip": 0.71484375, "router_z_loss_mlp": 1.015625, "step": 791, "time_per_iteration": 2.5801892280578613 }, { "auxiliary_loss_clip": 0.01130046, "auxiliary_loss_mlp": 0.01089521, "balance_loss_clip": 1.02147579, "balance_loss_mlp": 1.02863991, "epoch": 0.04761761611303172, "flos": 18697001224320.0, "grad_norm": 1.8184137166363679, "language_loss": 0.93475795, "learning_rate": 3.996745480347854e-06, "loss": 0.95695364, "num_input_tokens_seen": 16977300, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 1.015625, "step": 792, "time_per_iteration": 2.4260759353637695 }, { "auxiliary_loss_clip": 0.01126688, "auxiliary_loss_mlp": 0.0109598, "balance_loss_clip": 1.02164102, "balance_loss_mlp": 1.02642488, "epoch": 0.04767773936569968, "flos": 20920968696960.0, "grad_norm": 2.114892430515852, "language_loss": 0.76931417, "learning_rate": 3.996723233365324e-06, "loss": 0.79154086, "num_input_tokens_seen": 16994950, "router_z_loss_clip": 0.7421875, "router_z_loss_mlp": 1.0, "step": 793, "time_per_iteration": 2.4195919036865234 }, { "auxiliary_loss_clip": 0.0112963, "auxiliary_loss_mlp": 0.01093968, "balance_loss_clip": 1.01862741, "balance_loss_mlp": 1.02825058, "epoch": 0.047737862618367655, "flos": 23731546700160.0, "grad_norm": 1.8922723863531363, "language_loss": 0.91048574, "learning_rate": 3.996700910666847e-06, "loss": 0.93272173, "num_input_tokens_seen": 17014760, "router_z_loss_clip": 0.75390625, "router_z_loss_mlp": 1.015625, "step": 794, "time_per_iteration": 2.4750466346740723 }, { "auxiliary_loss_clip": 0.01129898, "auxiliary_loss_mlp": 0.01086515, "balance_loss_clip": 1.01584744, "balance_loss_mlp": 1.02604842, "epoch": 0.04779798587103562, "flos": 23694643526400.0, "grad_norm": 2.576090113652724, "language_loss": 0.75514168, "learning_rate": 3.996678512253272e-06, "loss": 0.77730578, "num_input_tokens_seen": 17032715, "router_z_loss_clip": 0.70703125, "router_z_loss_mlp": 1.0390625, "step": 795, "time_per_iteration": 2.440300226211548 }, { "auxiliary_loss_clip": 0.01121473, "auxiliary_loss_mlp": 0.0109148, "balance_loss_clip": 1.01919115, "balance_loss_mlp": 1.02514851, "epoch": 0.04785810912370359, "flos": 23182572481920.0, "grad_norm": 1.6778087643319475, "language_loss": 0.83677101, "learning_rate": 3.996656038125449e-06, "loss": 0.85890055, "num_input_tokens_seen": 17052215, "router_z_loss_clip": 0.72265625, "router_z_loss_mlp": 0.9609375, "step": 796, "time_per_iteration": 2.438460111618042 }, { "auxiliary_loss_clip": 0.01126284, "auxiliary_loss_mlp": 0.01085635, "balance_loss_clip": 1.01611233, "balance_loss_mlp": 1.02805769, "epoch": 0.047918232376371564, "flos": 18039656545920.0, "grad_norm": 1.8707317942598143, "language_loss": 0.86171526, "learning_rate": 3.996633488284228e-06, "loss": 0.88383442, "num_input_tokens_seen": 17069225, "router_z_loss_clip": 0.6953125, "router_z_loss_mlp": 0.984375, "step": 797, "time_per_iteration": 2.394491672515869 }, { "auxiliary_loss_clip": 0.01038824, "auxiliary_loss_mlp": 0.01018545, "balance_loss_clip": 1.00700581, "balance_loss_mlp": 1.01667762, "epoch": 0.04797835562903953, "flos": 62439400632960.0, "grad_norm": 0.9351228017063494, "language_loss": 0.64602757, "learning_rate": 3.996610862730465e-06, "loss": 0.6666013, "num_input_tokens_seen": 17126680, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.22070312, "step": 798, "time_per_iteration": 2.948089361190796 }, { "auxiliary_loss_clip": 0.01135993, "auxiliary_loss_mlp": 0.01099678, "balance_loss_clip": 1.02243054, "balance_loss_mlp": 1.0282104, "epoch": 0.0480384788817075, "flos": 21506217684480.0, "grad_norm": 1.8207314241628734, "language_loss": 0.94829297, "learning_rate": 3.996588161465018e-06, "loss": 0.97064972, "num_input_tokens_seen": 17144835, "router_z_loss_clip": 0.7734375, "router_z_loss_mlp": 1.078125, "step": 799, "time_per_iteration": 2.4190673828125 }, { "auxiliary_loss_clip": 0.01119699, "auxiliary_loss_mlp": 0.01097683, "balance_loss_clip": 1.02401185, "balance_loss_mlp": 1.02583659, "epoch": 0.048098602134375466, "flos": 21725611868160.0, "grad_norm": 2.010004037721749, "language_loss": 0.9065969, "learning_rate": 3.996565384488748e-06, "loss": 0.92877072, "num_input_tokens_seen": 17165030, "router_z_loss_clip": 0.734375, "router_z_loss_mlp": 0.9375, "step": 800, "time_per_iteration": 2.4437150955200195 }, { "auxiliary_loss_clip": 0.0112911, "auxiliary_loss_mlp": 0.01105532, "balance_loss_clip": 1.03367305, "balance_loss_mlp": 1.02780533, "epoch": 0.04815872538704344, "flos": 22929940817280.0, "grad_norm": 1.919406211562986, "language_loss": 0.88225609, "learning_rate": 3.996542531802518e-06, "loss": 0.90460253, "num_input_tokens_seen": 17184895, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 1.015625, "step": 801, "time_per_iteration": 2.4226744174957275 }, { "auxiliary_loss_clip": 0.0112781, "auxiliary_loss_mlp": 0.0110204, "balance_loss_clip": 1.02932215, "balance_loss_mlp": 1.02885723, "epoch": 0.04821884863971141, "flos": 43173176601600.0, "grad_norm": 1.8101202191822818, "language_loss": 0.83623838, "learning_rate": 3.996519603407196e-06, "loss": 0.8585369, "num_input_tokens_seen": 17208225, "router_z_loss_clip": 0.7265625, "router_z_loss_mlp": 0.98828125, "step": 802, "time_per_iteration": 2.6308212280273438 }, { "auxiliary_loss_clip": 0.01125116, "auxiliary_loss_mlp": 0.01109157, "balance_loss_clip": 1.03739333, "balance_loss_mlp": 1.02919221, "epoch": 0.048278971892379376, "flos": 18619145159040.0, "grad_norm": 1.7674656946439435, "language_loss": 0.89733648, "learning_rate": 3.996496599303649e-06, "loss": 0.91967928, "num_input_tokens_seen": 17226305, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 0.95703125, "step": 803, "time_per_iteration": 2.4383859634399414 }, { "auxiliary_loss_clip": 0.01124105, "auxiliary_loss_mlp": 0.01102861, "balance_loss_clip": 1.03109694, "balance_loss_mlp": 1.02629101, "epoch": 0.04833909514504735, "flos": 20229024994560.0, "grad_norm": 1.993864127437173, "language_loss": 0.89977777, "learning_rate": 3.996473519492753e-06, "loss": 0.92204738, "num_input_tokens_seen": 17244545, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 0.9765625, "step": 804, "time_per_iteration": 2.434561014175415 }, { "auxiliary_loss_clip": 0.01127648, "auxiliary_loss_mlp": 0.0110738, "balance_loss_clip": 1.03914428, "balance_loss_mlp": 1.02809668, "epoch": 0.04839921839771532, "flos": 24644001749760.0, "grad_norm": 1.9710814926157487, "language_loss": 0.89173484, "learning_rate": 3.99645036397538e-06, "loss": 0.91408515, "num_input_tokens_seen": 17265730, "router_z_loss_clip": 0.68359375, "router_z_loss_mlp": 0.9921875, "step": 805, "time_per_iteration": 3.940985679626465 }, { "auxiliary_loss_clip": 0.01126997, "auxiliary_loss_mlp": 0.01099673, "balance_loss_clip": 1.02795672, "balance_loss_mlp": 1.02706385, "epoch": 0.048459341650383285, "flos": 24826283291520.0, "grad_norm": 2.0533771923172317, "language_loss": 0.70781481, "learning_rate": 3.9964271327524085e-06, "loss": 0.73008156, "num_input_tokens_seen": 17284820, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 1.0, "step": 806, "time_per_iteration": 2.4611496925354004 }, { "auxiliary_loss_clip": 0.0112498, "auxiliary_loss_mlp": 0.01105411, "balance_loss_clip": 1.03173971, "balance_loss_mlp": 1.02855229, "epoch": 0.04851946490305126, "flos": 22162130997120.0, "grad_norm": 2.016635483805228, "language_loss": 0.80266404, "learning_rate": 3.9964038258247214e-06, "loss": 0.82496798, "num_input_tokens_seen": 17305085, "router_z_loss_clip": 0.734375, "router_z_loss_mlp": 0.96484375, "step": 807, "time_per_iteration": 2.550133228302002 }, { "auxiliary_loss_clip": 0.0112265, "auxiliary_loss_mlp": 0.01094868, "balance_loss_clip": 1.02462971, "balance_loss_mlp": 1.02706468, "epoch": 0.04857958815571922, "flos": 19791004677120.0, "grad_norm": 1.8971304209440776, "language_loss": 0.8991456, "learning_rate": 3.9963804431932005e-06, "loss": 0.92132086, "num_input_tokens_seen": 17322715, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 0.95703125, "step": 808, "time_per_iteration": 2.4279072284698486 }, { "auxiliary_loss_clip": 0.01129648, "auxiliary_loss_mlp": 0.01107149, "balance_loss_clip": 1.02708781, "balance_loss_mlp": 1.02667928, "epoch": 0.048639711408387194, "flos": 18696966312960.0, "grad_norm": 1.5979004793448004, "language_loss": 0.93540597, "learning_rate": 3.996356984858732e-06, "loss": 0.95777398, "num_input_tokens_seen": 17341455, "router_z_loss_clip": 0.80078125, "router_z_loss_mlp": 1.03125, "step": 809, "time_per_iteration": 5.257675886154175 }, { "auxiliary_loss_clip": 0.01126257, "auxiliary_loss_mlp": 0.01090004, "balance_loss_clip": 1.01709557, "balance_loss_mlp": 1.0290637, "epoch": 0.048699834661055166, "flos": 24862348592640.0, "grad_norm": 1.8828613243198968, "language_loss": 0.88903427, "learning_rate": 3.996333450822208e-06, "loss": 0.91119689, "num_input_tokens_seen": 17360765, "router_z_loss_clip": 0.7265625, "router_z_loss_mlp": 0.97265625, "step": 810, "time_per_iteration": 3.866572380065918 }, { "auxiliary_loss_clip": 0.01130846, "auxiliary_loss_mlp": 0.01084792, "balance_loss_clip": 1.01522171, "balance_loss_mlp": 1.02959275, "epoch": 0.04875995791372313, "flos": 20702970967680.0, "grad_norm": 1.682357340063889, "language_loss": 0.83875442, "learning_rate": 3.99630984108452e-06, "loss": 0.86091083, "num_input_tokens_seen": 17380625, "router_z_loss_clip": 0.6953125, "router_z_loss_mlp": 1.015625, "step": 811, "time_per_iteration": 2.45694637298584 }, { "auxiliary_loss_clip": 0.0112501, "auxiliary_loss_mlp": 0.01097166, "balance_loss_clip": 1.02735698, "balance_loss_mlp": 1.02807593, "epoch": 0.048820081166391104, "flos": 18587304132480.0, "grad_norm": 1.7785973269450905, "language_loss": 0.77735823, "learning_rate": 3.9962861556465615e-06, "loss": 0.79957998, "num_input_tokens_seen": 17399355, "router_z_loss_clip": 0.69921875, "router_z_loss_mlp": 0.96875, "step": 812, "time_per_iteration": 2.3841586112976074 }, { "auxiliary_loss_clip": 0.01118452, "auxiliary_loss_mlp": 0.01094484, "balance_loss_clip": 1.02753603, "balance_loss_mlp": 1.02711451, "epoch": 0.04888020441905907, "flos": 22706322359040.0, "grad_norm": 2.009119264157794, "language_loss": 0.92772406, "learning_rate": 3.996262394509233e-06, "loss": 0.94985342, "num_input_tokens_seen": 17418240, "router_z_loss_clip": 0.66796875, "router_z_loss_mlp": 0.9140625, "step": 813, "time_per_iteration": 2.4460930824279785 }, { "auxiliary_loss_clip": 0.01122485, "auxiliary_loss_mlp": 0.01090506, "balance_loss_clip": 1.02441633, "balance_loss_mlp": 1.02621555, "epoch": 0.04894032767172704, "flos": 22783235817600.0, "grad_norm": 2.068441860286538, "language_loss": 0.78264749, "learning_rate": 3.9962385576734335e-06, "loss": 0.8047775, "num_input_tokens_seen": 17436250, "router_z_loss_clip": 0.66015625, "router_z_loss_mlp": 0.9609375, "step": 814, "time_per_iteration": 2.420793056488037 }, { "auxiliary_loss_clip": 0.011234, "auxiliary_loss_mlp": 0.011012, "balance_loss_clip": 1.02838731, "balance_loss_mlp": 1.02567816, "epoch": 0.04900045092439501, "flos": 25515084971520.0, "grad_norm": 1.8166781578221358, "language_loss": 0.8717823, "learning_rate": 3.9962146451400675e-06, "loss": 0.89402831, "num_input_tokens_seen": 17455750, "router_z_loss_clip": 0.7265625, "router_z_loss_mlp": 0.9765625, "step": 815, "time_per_iteration": 2.4551053047180176 }, { "auxiliary_loss_clip": 0.01128477, "auxiliary_loss_mlp": 0.01097516, "balance_loss_clip": 1.02265215, "balance_loss_mlp": 1.02689028, "epoch": 0.04906057417706298, "flos": 25956945538560.0, "grad_norm": 2.0875320817834893, "language_loss": 0.96493769, "learning_rate": 3.996190656910043e-06, "loss": 0.98719758, "num_input_tokens_seen": 17474995, "router_z_loss_clip": 0.75, "router_z_loss_mlp": 1.015625, "step": 816, "time_per_iteration": 2.4652719497680664 }, { "auxiliary_loss_clip": 0.01124956, "auxiliary_loss_mlp": 0.01086402, "balance_loss_clip": 1.01482892, "balance_loss_mlp": 1.02478743, "epoch": 0.04912069742973095, "flos": 18623648724480.0, "grad_norm": 2.0971487976054997, "language_loss": 0.84503907, "learning_rate": 3.996166592984268e-06, "loss": 0.86715263, "num_input_tokens_seen": 17493395, "router_z_loss_clip": 0.71484375, "router_z_loss_mlp": 1.0, "step": 817, "time_per_iteration": 2.400782346725464 }, { "auxiliary_loss_clip": 0.01124415, "auxiliary_loss_mlp": 0.01108134, "balance_loss_clip": 1.03141093, "balance_loss_mlp": 1.02671444, "epoch": 0.049180820682398915, "flos": 23698553598720.0, "grad_norm": 1.5808824118039877, "language_loss": 0.87186432, "learning_rate": 3.996142453363656e-06, "loss": 0.89418983, "num_input_tokens_seen": 17514565, "router_z_loss_clip": 0.765625, "router_z_loss_mlp": 0.9765625, "step": 818, "time_per_iteration": 2.4802234172821045 }, { "auxiliary_loss_clip": 0.01130484, "auxiliary_loss_mlp": 0.01102548, "balance_loss_clip": 1.02468061, "balance_loss_mlp": 1.02723527, "epoch": 0.04924094393506689, "flos": 22419266227200.0, "grad_norm": 2.367986260900332, "language_loss": 0.82921702, "learning_rate": 3.996118238049124e-06, "loss": 0.85154736, "num_input_tokens_seen": 17534590, "router_z_loss_clip": 0.78125, "router_z_loss_mlp": 1.03125, "step": 819, "time_per_iteration": 2.4196581840515137 }, { "auxiliary_loss_clip": 0.01126639, "auxiliary_loss_mlp": 0.01094052, "balance_loss_clip": 1.02686572, "balance_loss_mlp": 1.02769327, "epoch": 0.04930106718773486, "flos": 15737448983040.0, "grad_norm": 2.340973671253265, "language_loss": 0.87657368, "learning_rate": 3.996093947041586e-06, "loss": 0.89878058, "num_input_tokens_seen": 17551900, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 0.98828125, "step": 820, "time_per_iteration": 2.4172985553741455 }, { "auxiliary_loss_clip": 0.01130159, "auxiliary_loss_mlp": 0.01090882, "balance_loss_clip": 1.02045345, "balance_loss_mlp": 1.02736664, "epoch": 0.049361190440402825, "flos": 26249412931200.0, "grad_norm": 1.7740636162689962, "language_loss": 0.93433547, "learning_rate": 3.996069580341966e-06, "loss": 0.95654583, "num_input_tokens_seen": 17571485, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 1.03125, "step": 821, "time_per_iteration": 2.441948652267456 }, { "auxiliary_loss_clip": 0.01130009, "auxiliary_loss_mlp": 0.01103275, "balance_loss_clip": 1.02407241, "balance_loss_mlp": 1.02945018, "epoch": 0.0494213136930708, "flos": 21251281870080.0, "grad_norm": 2.2523934561042753, "language_loss": 0.92241263, "learning_rate": 3.996045137951188e-06, "loss": 0.94474542, "num_input_tokens_seen": 17591410, "router_z_loss_clip": 0.79296875, "router_z_loss_mlp": 1.0078125, "step": 822, "time_per_iteration": 2.447622060775757 }, { "auxiliary_loss_clip": 0.01130032, "auxiliary_loss_mlp": 0.01091187, "balance_loss_clip": 1.01684785, "balance_loss_mlp": 1.02943933, "epoch": 0.04948143694573876, "flos": 27964241913600.0, "grad_norm": 2.048651541065716, "language_loss": 0.70931351, "learning_rate": 3.996020619870178e-06, "loss": 0.73152566, "num_input_tokens_seen": 17612010, "router_z_loss_clip": 0.7421875, "router_z_loss_mlp": 1.0078125, "step": 823, "time_per_iteration": 2.4577138423919678 }, { "auxiliary_loss_clip": 0.01051613, "auxiliary_loss_mlp": 0.01038857, "balance_loss_clip": 1.02955818, "balance_loss_mlp": 1.02661741, "epoch": 0.049541560198406734, "flos": 66178250398080.0, "grad_norm": 1.3714006389048436, "language_loss": 0.62486339, "learning_rate": 3.995996026099866e-06, "loss": 0.64576805, "num_input_tokens_seen": 17673430, "router_z_loss_clip": 0.09277344, "router_z_loss_mlp": 0.25, "step": 824, "time_per_iteration": 3.1256790161132812 }, { "auxiliary_loss_clip": 0.01136179, "auxiliary_loss_mlp": 0.01108715, "balance_loss_clip": 1.02784312, "balance_loss_mlp": 1.03218424, "epoch": 0.049601683451074706, "flos": 22891606277760.0, "grad_norm": 1.950089683384892, "language_loss": 0.94378054, "learning_rate": 3.995971356641185e-06, "loss": 0.96622944, "num_input_tokens_seen": 17689545, "router_z_loss_clip": 0.80859375, "router_z_loss_mlp": 1.0390625, "step": 825, "time_per_iteration": 2.447047472000122 }, { "auxiliary_loss_clip": 0.011298, "auxiliary_loss_mlp": 0.0110263, "balance_loss_clip": 1.02838635, "balance_loss_mlp": 1.02891612, "epoch": 0.04966180670374267, "flos": 21432585893760.0, "grad_norm": 2.4839616786413083, "language_loss": 0.71979761, "learning_rate": 3.9959466114950695e-06, "loss": 0.74212193, "num_input_tokens_seen": 17705965, "router_z_loss_clip": 0.7421875, "router_z_loss_mlp": 1.0078125, "step": 826, "time_per_iteration": 2.4005424976348877 }, { "auxiliary_loss_clip": 0.01134133, "auxiliary_loss_mlp": 0.01104253, "balance_loss_clip": 1.0245254, "balance_loss_mlp": 1.02904248, "epoch": 0.04972192995641064, "flos": 23106392161920.0, "grad_norm": 1.752457603278906, "language_loss": 0.82754141, "learning_rate": 3.995921790662459e-06, "loss": 0.84992528, "num_input_tokens_seen": 17724580, "router_z_loss_clip": 0.796875, "router_z_loss_mlp": 1.046875, "step": 827, "time_per_iteration": 2.464085578918457 }, { "auxiliary_loss_clip": 0.01133215, "auxiliary_loss_mlp": 0.0111074, "balance_loss_clip": 1.02691174, "balance_loss_mlp": 1.02864671, "epoch": 0.04978205320907861, "flos": 40404563919360.0, "grad_norm": 2.0308059899730937, "language_loss": 0.82910037, "learning_rate": 3.995896894144294e-06, "loss": 0.85153997, "num_input_tokens_seen": 17747755, "router_z_loss_clip": 0.8359375, "router_z_loss_mlp": 1.046875, "step": 828, "time_per_iteration": 2.762592077255249 }, { "auxiliary_loss_clip": 0.01128613, "auxiliary_loss_mlp": 0.01094514, "balance_loss_clip": 1.02084279, "balance_loss_mlp": 1.02726007, "epoch": 0.04984217646174658, "flos": 25227365523840.0, "grad_norm": 1.835989445728411, "language_loss": 0.872908, "learning_rate": 3.995871921941519e-06, "loss": 0.89513928, "num_input_tokens_seen": 17768550, "router_z_loss_clip": 0.734375, "router_z_loss_mlp": 1.015625, "step": 829, "time_per_iteration": 2.5734376907348633 }, { "auxiliary_loss_clip": 0.01133377, "auxiliary_loss_mlp": 0.01109738, "balance_loss_clip": 1.02848506, "balance_loss_mlp": 1.02867651, "epoch": 0.04990229971441455, "flos": 15958763291520.0, "grad_norm": 1.873953114359596, "language_loss": 0.78572136, "learning_rate": 3.99584687405508e-06, "loss": 0.80815256, "num_input_tokens_seen": 17786080, "router_z_loss_clip": 0.8125, "router_z_loss_mlp": 1.046875, "step": 830, "time_per_iteration": 2.4663054943084717 }, { "auxiliary_loss_clip": 0.01132258, "auxiliary_loss_mlp": 0.01114485, "balance_loss_clip": 1.02255106, "balance_loss_mlp": 1.02574801, "epoch": 0.04996242296708252, "flos": 18404149806720.0, "grad_norm": 1.7680049350202567, "language_loss": 0.8146385, "learning_rate": 3.995821750485929e-06, "loss": 0.83710599, "num_input_tokens_seen": 17803635, "router_z_loss_clip": 0.91796875, "router_z_loss_mlp": 1.0625, "step": 831, "time_per_iteration": 2.462139844894409 }, { "auxiliary_loss_clip": 0.0113452, "auxiliary_loss_mlp": 0.0110743, "balance_loss_clip": 1.02708244, "balance_loss_mlp": 1.02893734, "epoch": 0.05002254621975049, "flos": 17857095713280.0, "grad_norm": 2.3540284908975644, "language_loss": 0.95906782, "learning_rate": 3.995796551235016e-06, "loss": 0.98148727, "num_input_tokens_seen": 17822190, "router_z_loss_clip": 0.8046875, "router_z_loss_mlp": 1.0546875, "step": 832, "time_per_iteration": 2.4712274074554443 }, { "auxiliary_loss_clip": 0.0112218, "auxiliary_loss_mlp": 0.0109016, "balance_loss_clip": 1.02206767, "balance_loss_mlp": 1.02536106, "epoch": 0.050082669472418455, "flos": 45658538490240.0, "grad_norm": 1.7717772174750868, "language_loss": 0.85488385, "learning_rate": 3.9957712763032974e-06, "loss": 0.87700725, "num_input_tokens_seen": 17846915, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 0.96875, "step": 833, "time_per_iteration": 2.7135560512542725 }, { "auxiliary_loss_clip": 0.01129931, "auxiliary_loss_mlp": 0.01103536, "balance_loss_clip": 1.02485824, "balance_loss_mlp": 1.02585721, "epoch": 0.05014279272508643, "flos": 37960538947200.0, "grad_norm": 1.8868049448423627, "language_loss": 0.85936546, "learning_rate": 3.995745925691733e-06, "loss": 0.88170016, "num_input_tokens_seen": 17867270, "router_z_loss_clip": 0.7890625, "router_z_loss_mlp": 1.046875, "step": 834, "time_per_iteration": 2.6495156288146973 }, { "auxiliary_loss_clip": 0.0113259, "auxiliary_loss_mlp": 0.01090849, "balance_loss_clip": 1.01736856, "balance_loss_mlp": 1.02794921, "epoch": 0.0502029159777544, "flos": 20995124158080.0, "grad_norm": 1.9846737858935943, "language_loss": 0.9617635, "learning_rate": 3.995720499401282e-06, "loss": 0.98399782, "num_input_tokens_seen": 17884880, "router_z_loss_clip": 0.734375, "router_z_loss_mlp": 1.046875, "step": 835, "time_per_iteration": 2.5073297023773193 }, { "auxiliary_loss_clip": 0.0113215, "auxiliary_loss_mlp": 0.01102783, "balance_loss_clip": 1.02558303, "balance_loss_mlp": 1.02677274, "epoch": 0.050263039230422364, "flos": 15887156359680.0, "grad_norm": 1.7716826333032227, "language_loss": 0.81808454, "learning_rate": 3.995694997432911e-06, "loss": 0.8404339, "num_input_tokens_seen": 17903695, "router_z_loss_clip": 0.7734375, "router_z_loss_mlp": 1.046875, "step": 836, "time_per_iteration": 2.4563841819763184 }, { "auxiliary_loss_clip": 0.0112006, "auxiliary_loss_mlp": 0.01085969, "balance_loss_clip": 1.01968837, "balance_loss_mlp": 1.02622104, "epoch": 0.050323162483090336, "flos": 23731616522880.0, "grad_norm": 2.3205625918079082, "language_loss": 0.87536287, "learning_rate": 3.9956694197875855e-06, "loss": 0.89742315, "num_input_tokens_seen": 17920745, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 0.9375, "step": 837, "time_per_iteration": 2.5013794898986816 }, { "auxiliary_loss_clip": 0.01122412, "auxiliary_loss_mlp": 0.01099207, "balance_loss_clip": 1.02973211, "balance_loss_mlp": 1.02733886, "epoch": 0.0503832857357583, "flos": 20265195029760.0, "grad_norm": 2.56780608115119, "language_loss": 0.77931446, "learning_rate": 3.995643766466275e-06, "loss": 0.8015306, "num_input_tokens_seen": 17938220, "router_z_loss_clip": 0.6953125, "router_z_loss_mlp": 0.953125, "step": 838, "time_per_iteration": 2.4540815353393555 }, { "auxiliary_loss_clip": 0.01125337, "auxiliary_loss_mlp": 0.01095978, "balance_loss_clip": 1.02717018, "balance_loss_mlp": 1.02619982, "epoch": 0.05044340898842627, "flos": 17784057415680.0, "grad_norm": 1.7458688580060588, "language_loss": 0.86139184, "learning_rate": 3.995618037469953e-06, "loss": 0.883605, "num_input_tokens_seen": 17957325, "router_z_loss_clip": 0.6875, "router_z_loss_mlp": 0.9921875, "step": 839, "time_per_iteration": 2.4471378326416016 }, { "auxiliary_loss_clip": 0.01128292, "auxiliary_loss_mlp": 0.01100836, "balance_loss_clip": 1.03093207, "balance_loss_mlp": 1.02837932, "epoch": 0.050503532241094246, "flos": 22965412625280.0, "grad_norm": 2.1940416372116553, "language_loss": 0.88223338, "learning_rate": 3.995592232799595e-06, "loss": 0.90452462, "num_input_tokens_seen": 17975875, "router_z_loss_clip": 0.69921875, "router_z_loss_mlp": 0.99609375, "step": 840, "time_per_iteration": 2.459482431411743 }, { "auxiliary_loss_clip": 0.01128945, "auxiliary_loss_mlp": 0.01099091, "balance_loss_clip": 1.02441835, "balance_loss_mlp": 1.02688992, "epoch": 0.05056365549376221, "flos": 22776078988800.0, "grad_norm": 1.8051545138055862, "language_loss": 0.97429597, "learning_rate": 3.99556635245618e-06, "loss": 0.99657631, "num_input_tokens_seen": 17994340, "router_z_loss_clip": 0.74609375, "router_z_loss_mlp": 1.015625, "step": 841, "time_per_iteration": 2.4701220989227295 }, { "auxiliary_loss_clip": 0.01125666, "auxiliary_loss_mlp": 0.01090117, "balance_loss_clip": 1.02541065, "balance_loss_mlp": 1.02667594, "epoch": 0.05062377874643018, "flos": 30915729630720.0, "grad_norm": 2.226784535004877, "language_loss": 0.80644429, "learning_rate": 3.995540396440688e-06, "loss": 0.82860214, "num_input_tokens_seen": 18015260, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 0.9921875, "step": 842, "time_per_iteration": 2.581712245941162 }, { "auxiliary_loss_clip": 0.01131404, "auxiliary_loss_mlp": 0.01102603, "balance_loss_clip": 1.03064871, "balance_loss_mlp": 1.02810645, "epoch": 0.05068390199909815, "flos": 19646115068160.0, "grad_norm": 2.379461286570494, "language_loss": 0.81725568, "learning_rate": 3.995514364754105e-06, "loss": 0.83959579, "num_input_tokens_seen": 18033960, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 1.03125, "step": 843, "time_per_iteration": 2.479529857635498 }, { "auxiliary_loss_clip": 0.01132361, "auxiliary_loss_mlp": 0.01091869, "balance_loss_clip": 1.01767302, "balance_loss_mlp": 1.02786183, "epoch": 0.05074402525176612, "flos": 37960573858560.0, "grad_norm": 1.8797006763876272, "language_loss": 0.85929048, "learning_rate": 3.995488257397417e-06, "loss": 0.88153279, "num_input_tokens_seen": 18056700, "router_z_loss_clip": 0.7421875, "router_z_loss_mlp": 1.046875, "step": 844, "time_per_iteration": 2.627268075942993 }, { "auxiliary_loss_clip": 0.01125239, "auxiliary_loss_mlp": 0.01091194, "balance_loss_clip": 1.02157545, "balance_loss_mlp": 1.02420568, "epoch": 0.05080414850443409, "flos": 22053516157440.0, "grad_norm": 1.9509915749054254, "language_loss": 0.78852189, "learning_rate": 3.995462074371614e-06, "loss": 0.81068623, "num_input_tokens_seen": 18075815, "router_z_loss_clip": 0.6953125, "router_z_loss_mlp": 1.015625, "step": 845, "time_per_iteration": 3.9310946464538574 }, { "auxiliary_loss_clip": 0.01124335, "auxiliary_loss_mlp": 0.01088083, "balance_loss_clip": 1.01994276, "balance_loss_mlp": 1.02573383, "epoch": 0.05086427175710206, "flos": 20224870542720.0, "grad_norm": 1.6835496432971293, "language_loss": 0.90483904, "learning_rate": 3.99543581567769e-06, "loss": 0.92696327, "num_input_tokens_seen": 18095095, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 0.984375, "step": 846, "time_per_iteration": 2.42999529838562 }, { "auxiliary_loss_clip": 0.01127, "auxiliary_loss_mlp": 0.01089823, "balance_loss_clip": 1.02151585, "balance_loss_mlp": 1.02541673, "epoch": 0.05092439500977003, "flos": 15158309483520.0, "grad_norm": 1.6830884935417534, "language_loss": 0.90893376, "learning_rate": 3.9954094813166394e-06, "loss": 0.93110198, "num_input_tokens_seen": 18112675, "router_z_loss_clip": 0.68359375, "router_z_loss_mlp": 1.015625, "step": 847, "time_per_iteration": 2.437636613845825 }, { "auxiliary_loss_clip": 0.01124722, "auxiliary_loss_mlp": 0.01096013, "balance_loss_clip": 1.0289216, "balance_loss_mlp": 1.02545738, "epoch": 0.050984518262437994, "flos": 22054039827840.0, "grad_norm": 2.799328696583664, "language_loss": 0.86242688, "learning_rate": 3.995383071289462e-06, "loss": 0.8846342, "num_input_tokens_seen": 18130745, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 0.99609375, "step": 848, "time_per_iteration": 3.9412245750427246 }, { "auxiliary_loss_clip": 0.01131091, "auxiliary_loss_mlp": 0.01088485, "balance_loss_clip": 1.02110744, "balance_loss_mlp": 1.02829468, "epoch": 0.05104464151510597, "flos": 30224065219200.0, "grad_norm": 1.9426385944931062, "language_loss": 0.90142244, "learning_rate": 3.995356585597158e-06, "loss": 0.9236182, "num_input_tokens_seen": 18152410, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 1.03125, "step": 849, "time_per_iteration": 5.342249155044556 }, { "auxiliary_loss_clip": 0.01126329, "auxiliary_loss_mlp": 0.01091284, "balance_loss_clip": 1.0230968, "balance_loss_mlp": 1.02584982, "epoch": 0.05110476476777394, "flos": 18331914470400.0, "grad_norm": 1.7760995981333365, "language_loss": 0.8730967, "learning_rate": 3.995330024240732e-06, "loss": 0.89527285, "num_input_tokens_seen": 18170870, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 1.0078125, "step": 850, "time_per_iteration": 2.4158575534820557 }, { "auxiliary_loss_clip": 0.01127017, "auxiliary_loss_mlp": 0.01086713, "balance_loss_clip": 1.01905036, "balance_loss_mlp": 1.02708244, "epoch": 0.051164888020441904, "flos": 37997197741440.0, "grad_norm": 2.556591989838514, "language_loss": 0.70103139, "learning_rate": 3.995303387221192e-06, "loss": 0.72316873, "num_input_tokens_seen": 18191555, "router_z_loss_clip": 0.67578125, "router_z_loss_mlp": 1.0, "step": 851, "time_per_iteration": 2.567460775375366 }, { "auxiliary_loss_clip": 0.01129788, "auxiliary_loss_mlp": 0.01087196, "balance_loss_clip": 1.01991379, "balance_loss_mlp": 1.02702141, "epoch": 0.051225011273109876, "flos": 23037543227520.0, "grad_norm": 2.285884356837248, "language_loss": 0.86837685, "learning_rate": 3.995276674539547e-06, "loss": 0.89054674, "num_input_tokens_seen": 18208620, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 1.03125, "step": 852, "time_per_iteration": 2.423408031463623 }, { "auxiliary_loss_clip": 0.01127191, "auxiliary_loss_mlp": 0.01081896, "balance_loss_clip": 1.01375628, "balance_loss_mlp": 1.02675819, "epoch": 0.05128513452577785, "flos": 18258841261440.0, "grad_norm": 2.229355998138035, "language_loss": 0.83204579, "learning_rate": 3.995249886196811e-06, "loss": 0.85413671, "num_input_tokens_seen": 18226370, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 1.0078125, "step": 853, "time_per_iteration": 2.4253132343292236 }, { "auxiliary_loss_clip": 0.01127914, "auxiliary_loss_mlp": 0.01091916, "balance_loss_clip": 1.02139187, "balance_loss_mlp": 1.02738333, "epoch": 0.05134525777844581, "flos": 27197723813760.0, "grad_norm": 2.1471528847269576, "language_loss": 0.81310225, "learning_rate": 3.995223022193999e-06, "loss": 0.83530045, "num_input_tokens_seen": 18247075, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 1.0078125, "step": 854, "time_per_iteration": 2.4777135848999023 }, { "auxiliary_loss_clip": 0.01126009, "auxiliary_loss_mlp": 0.01089657, "balance_loss_clip": 1.02351987, "balance_loss_mlp": 1.02583981, "epoch": 0.051405381031113785, "flos": 28361099871360.0, "grad_norm": 2.1740625440374193, "language_loss": 0.85286057, "learning_rate": 3.99519608253213e-06, "loss": 0.87501729, "num_input_tokens_seen": 18265680, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 1.0, "step": 855, "time_per_iteration": 2.4987337589263916 }, { "auxiliary_loss_clip": 0.01044204, "auxiliary_loss_mlp": 0.01015265, "balance_loss_clip": 1.00415492, "balance_loss_mlp": 1.01840544, "epoch": 0.05146550428378175, "flos": 65614855921920.0, "grad_norm": 0.9861385312864569, "language_loss": 0.65863013, "learning_rate": 3.995169067212227e-06, "loss": 0.67922485, "num_input_tokens_seen": 18327015, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.2578125, "step": 856, "time_per_iteration": 3.0258331298828125 }, { "auxiliary_loss_clip": 0.01120249, "auxiliary_loss_mlp": 0.01080859, "balance_loss_clip": 1.01720107, "balance_loss_mlp": 1.0247848, "epoch": 0.05152562753644972, "flos": 22053760536960.0, "grad_norm": 1.9686873824562292, "language_loss": 0.792606, "learning_rate": 3.9951419762353116e-06, "loss": 0.81461704, "num_input_tokens_seen": 18345235, "router_z_loss_clip": 0.63671875, "router_z_loss_mlp": 0.95703125, "step": 857, "time_per_iteration": 2.464237689971924 }, { "auxiliary_loss_clip": 0.01125959, "auxiliary_loss_mlp": 0.01092239, "balance_loss_clip": 1.02304995, "balance_loss_mlp": 1.02511215, "epoch": 0.051585750789117694, "flos": 18508714928640.0, "grad_norm": 2.0338238251209204, "language_loss": 0.91288197, "learning_rate": 3.995114809602412e-06, "loss": 0.93506396, "num_input_tokens_seen": 18362350, "router_z_loss_clip": 0.6953125, "router_z_loss_mlp": 1.0078125, "step": 858, "time_per_iteration": 2.388768434524536 }, { "auxiliary_loss_clip": 0.01123081, "auxiliary_loss_mlp": 0.01087648, "balance_loss_clip": 1.02430058, "balance_loss_mlp": 1.02513862, "epoch": 0.05164587404178566, "flos": 23729172727680.0, "grad_norm": 1.878235479469795, "language_loss": 0.80082572, "learning_rate": 3.9950875673145605e-06, "loss": 0.82293296, "num_input_tokens_seen": 18383390, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.98046875, "step": 859, "time_per_iteration": 2.5003578662872314 }, { "auxiliary_loss_clip": 0.01128921, "auxiliary_loss_mlp": 0.01105744, "balance_loss_clip": 1.03660238, "balance_loss_mlp": 1.02623785, "epoch": 0.05170599729445363, "flos": 16251963822720.0, "grad_norm": 2.110308659642544, "language_loss": 0.94403362, "learning_rate": 3.995060249372788e-06, "loss": 0.96638024, "num_input_tokens_seen": 18399220, "router_z_loss_clip": 0.69140625, "router_z_loss_mlp": 1.03125, "step": 860, "time_per_iteration": 2.42598295211792 }, { "auxiliary_loss_clip": 0.01122266, "auxiliary_loss_mlp": 0.01087832, "balance_loss_clip": 1.02493739, "balance_loss_mlp": 1.0246284, "epoch": 0.0517661205471216, "flos": 23984841680640.0, "grad_norm": 1.8724142506380805, "language_loss": 0.8434664, "learning_rate": 3.99503285577813e-06, "loss": 0.86556733, "num_input_tokens_seen": 18419005, "router_z_loss_clip": 0.62890625, "router_z_loss_mlp": 0.9765625, "step": 861, "time_per_iteration": 2.4750988483428955 }, { "auxiliary_loss_clip": 0.01127942, "auxiliary_loss_mlp": 0.01095504, "balance_loss_clip": 1.0302726, "balance_loss_mlp": 1.02795827, "epoch": 0.05182624379978957, "flos": 29276452563840.0, "grad_norm": 2.0169936540183597, "language_loss": 0.81250715, "learning_rate": 3.995005386531627e-06, "loss": 0.83474159, "num_input_tokens_seen": 18440550, "router_z_loss_clip": 0.65234375, "router_z_loss_mlp": 0.99609375, "step": 862, "time_per_iteration": 2.5066378116607666 }, { "auxiliary_loss_clip": 0.0111986, "auxiliary_loss_mlp": 0.0108717, "balance_loss_clip": 1.02828062, "balance_loss_mlp": 1.02662253, "epoch": 0.05188636705245754, "flos": 24169671751680.0, "grad_norm": 3.4825332529018818, "language_loss": 0.92340934, "learning_rate": 3.9949778416343195e-06, "loss": 0.94547963, "num_input_tokens_seen": 18461950, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 0.93359375, "step": 863, "time_per_iteration": 2.502131938934326 }, { "auxiliary_loss_clip": 0.01125404, "auxiliary_loss_mlp": 0.01094033, "balance_loss_clip": 1.02207804, "balance_loss_mlp": 1.02691793, "epoch": 0.051946490305125506, "flos": 26759494028160.0, "grad_norm": 2.0709947360471945, "language_loss": 0.79882979, "learning_rate": 3.9949502210872525e-06, "loss": 0.82102424, "num_input_tokens_seen": 18480555, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 0.984375, "step": 864, "time_per_iteration": 2.4973390102386475 }, { "auxiliary_loss_clip": 0.01128601, "auxiliary_loss_mlp": 0.01086955, "balance_loss_clip": 1.02115154, "balance_loss_mlp": 1.02599359, "epoch": 0.05200661355779348, "flos": 21501574473600.0, "grad_norm": 2.3385804449765324, "language_loss": 0.8155117, "learning_rate": 3.994922524891474e-06, "loss": 0.83766729, "num_input_tokens_seen": 18499645, "router_z_loss_clip": 0.65625, "router_z_loss_mlp": 1.03125, "step": 865, "time_per_iteration": 2.4627766609191895 }, { "auxiliary_loss_clip": 0.0112672, "auxiliary_loss_mlp": 0.01089402, "balance_loss_clip": 1.02030826, "balance_loss_mlp": 1.02646124, "epoch": 0.05206673681046144, "flos": 18113497804800.0, "grad_norm": 2.260164986095091, "language_loss": 0.90035486, "learning_rate": 3.994894753048032e-06, "loss": 0.92251617, "num_input_tokens_seen": 18516810, "router_z_loss_clip": 0.6875, "router_z_loss_mlp": 1.0, "step": 866, "time_per_iteration": 2.3968911170959473 }, { "auxiliary_loss_clip": 0.01125059, "auxiliary_loss_mlp": 0.01093193, "balance_loss_clip": 1.02896333, "balance_loss_mlp": 1.02739251, "epoch": 0.052126860063129415, "flos": 17523396138240.0, "grad_norm": 2.146389669745918, "language_loss": 0.91947442, "learning_rate": 3.9948669055579815e-06, "loss": 0.94165695, "num_input_tokens_seen": 18532510, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 0.9765625, "step": 867, "time_per_iteration": 2.414842128753662 }, { "auxiliary_loss_clip": 0.01117931, "auxiliary_loss_mlp": 0.01076522, "balance_loss_clip": 1.01891971, "balance_loss_mlp": 1.025738, "epoch": 0.05218698331579739, "flos": 32596692727680.0, "grad_norm": 1.4538185033286133, "language_loss": 0.65122497, "learning_rate": 3.9948389824223785e-06, "loss": 0.67316949, "num_input_tokens_seen": 18557380, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.921875, "step": 868, "time_per_iteration": 2.5167908668518066 }, { "auxiliary_loss_clip": 0.01128349, "auxiliary_loss_mlp": 0.01091046, "balance_loss_clip": 1.01851892, "balance_loss_mlp": 1.02672362, "epoch": 0.05224710656846535, "flos": 22126205341440.0, "grad_norm": 1.9551923609882471, "language_loss": 0.87578464, "learning_rate": 3.994810983642281e-06, "loss": 0.8979786, "num_input_tokens_seen": 18575720, "router_z_loss_clip": 0.7265625, "router_z_loss_mlp": 1.015625, "step": 869, "time_per_iteration": 2.455733060836792 }, { "auxiliary_loss_clip": 0.01127832, "auxiliary_loss_mlp": 0.01091027, "balance_loss_clip": 1.02202857, "balance_loss_mlp": 1.02732813, "epoch": 0.052307229821133325, "flos": 11144310226560.0, "grad_norm": 2.004737040523193, "language_loss": 0.91783607, "learning_rate": 3.994782909218751e-06, "loss": 0.94002467, "num_input_tokens_seen": 18592185, "router_z_loss_clip": 0.69140625, "router_z_loss_mlp": 1.0078125, "step": 870, "time_per_iteration": 2.392753839492798 }, { "auxiliary_loss_clip": 0.01124768, "auxiliary_loss_mlp": 0.01082604, "balance_loss_clip": 1.01770639, "balance_loss_mlp": 1.02529752, "epoch": 0.05236735307380129, "flos": 19127271219840.0, "grad_norm": 2.223060039598315, "language_loss": 0.83374041, "learning_rate": 3.994754759152854e-06, "loss": 0.8558141, "num_input_tokens_seen": 18609560, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 0.99609375, "step": 871, "time_per_iteration": 2.405123472213745 }, { "auxiliary_loss_clip": 0.011215, "auxiliary_loss_mlp": 0.01080171, "balance_loss_clip": 1.01889741, "balance_loss_mlp": 1.02528739, "epoch": 0.05242747632646926, "flos": 20959582527360.0, "grad_norm": 3.3363116458221524, "language_loss": 0.8311621, "learning_rate": 3.994726533445656e-06, "loss": 0.85317874, "num_input_tokens_seen": 18629405, "router_z_loss_clip": 0.61328125, "router_z_loss_mlp": 0.9609375, "step": 872, "time_per_iteration": 2.418765068054199 }, { "auxiliary_loss_clip": 0.01036752, "auxiliary_loss_mlp": 0.01030934, "balance_loss_clip": 1.01968074, "balance_loss_mlp": 1.00802565, "epoch": 0.052487599579137234, "flos": 65017632337920.0, "grad_norm": 0.9114903374827709, "language_loss": 0.61834103, "learning_rate": 3.9946982320982274e-06, "loss": 0.63901788, "num_input_tokens_seen": 18681480, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.28710938, "step": 873, "time_per_iteration": 2.936467409133911 }, { "auxiliary_loss_clip": 0.01129797, "auxiliary_loss_mlp": 0.01084549, "balance_loss_clip": 1.01750553, "balance_loss_mlp": 1.02730465, "epoch": 0.0525477228318052, "flos": 23287905653760.0, "grad_norm": 1.7195783521390156, "language_loss": 0.9251098, "learning_rate": 3.994669855111643e-06, "loss": 0.94725329, "num_input_tokens_seen": 18700390, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 1.0234375, "step": 874, "time_per_iteration": 2.5425920486450195 }, { "auxiliary_loss_clip": 0.01129105, "auxiliary_loss_mlp": 0.01088692, "balance_loss_clip": 1.01835823, "balance_loss_mlp": 1.02709103, "epoch": 0.05260784608447317, "flos": 32228952710400.0, "grad_norm": 2.210023215168436, "language_loss": 0.77426052, "learning_rate": 3.994641402486977e-06, "loss": 0.79643846, "num_input_tokens_seen": 18721280, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 1.015625, "step": 875, "time_per_iteration": 2.546679973602295 }, { "auxiliary_loss_clip": 0.01120347, "auxiliary_loss_mlp": 0.01082298, "balance_loss_clip": 1.0160656, "balance_loss_mlp": 1.02403259, "epoch": 0.052667969337141136, "flos": 24462034410240.0, "grad_norm": 1.6378506458604296, "language_loss": 0.950863, "learning_rate": 3.99461287422531e-06, "loss": 0.97288948, "num_input_tokens_seen": 18741545, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 0.96484375, "step": 876, "time_per_iteration": 2.442812442779541 }, { "auxiliary_loss_clip": 0.01033385, "auxiliary_loss_mlp": 0.01011917, "balance_loss_clip": 1.00095022, "balance_loss_mlp": 1.00505638, "epoch": 0.05272809258980911, "flos": 57780938989440.0, "grad_norm": 0.8214649011612507, "language_loss": 0.63142455, "learning_rate": 3.994584270327722e-06, "loss": 0.65187758, "num_input_tokens_seen": 18801400, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.28320312, "step": 877, "time_per_iteration": 3.0710692405700684 }, { "auxiliary_loss_clip": 0.01130417, "auxiliary_loss_mlp": 0.01092054, "balance_loss_clip": 1.01890695, "balance_loss_mlp": 1.02706611, "epoch": 0.05278821584247708, "flos": 17419843445760.0, "grad_norm": 6.678741295999182, "language_loss": 0.89217949, "learning_rate": 3.994555590795299e-06, "loss": 0.91440421, "num_input_tokens_seen": 18819670, "router_z_loss_clip": 0.73046875, "router_z_loss_mlp": 1.03125, "step": 878, "time_per_iteration": 2.412493944168091 }, { "auxiliary_loss_clip": 0.0113142, "auxiliary_loss_mlp": 0.01090617, "balance_loss_clip": 1.02343011, "balance_loss_mlp": 1.0286057, "epoch": 0.052848339095145046, "flos": 26136154880640.0, "grad_norm": 1.8732732135628374, "language_loss": 0.88152182, "learning_rate": 3.9945268356291275e-06, "loss": 0.90374219, "num_input_tokens_seen": 18840580, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 1.03125, "step": 879, "time_per_iteration": 2.4723246097564697 }, { "auxiliary_loss_clip": 0.01128562, "auxiliary_loss_mlp": 0.0110982, "balance_loss_clip": 1.03562427, "balance_loss_mlp": 1.02803516, "epoch": 0.05290846234781302, "flos": 16471148538240.0, "grad_norm": 2.037612015787909, "language_loss": 0.8841238, "learning_rate": 3.9944980048302985e-06, "loss": 0.90650761, "num_input_tokens_seen": 18859295, "router_z_loss_clip": 0.7421875, "router_z_loss_mlp": 1.0078125, "step": 880, "time_per_iteration": 2.396439790725708 }, { "auxiliary_loss_clip": 0.0113371, "auxiliary_loss_mlp": 0.01103912, "balance_loss_clip": 1.03448415, "balance_loss_mlp": 1.02936578, "epoch": 0.05296858560048098, "flos": 19864147708800.0, "grad_norm": 1.8321939524643087, "language_loss": 0.90907663, "learning_rate": 3.994469098399906e-06, "loss": 0.93145287, "num_input_tokens_seen": 18877485, "router_z_loss_clip": 0.6953125, "router_z_loss_mlp": 1.046875, "step": 881, "time_per_iteration": 2.418199062347412 }, { "auxiliary_loss_clip": 0.01133214, "auxiliary_loss_mlp": 0.01100032, "balance_loss_clip": 1.01982844, "balance_loss_mlp": 1.02751732, "epoch": 0.053028708853148955, "flos": 24387460012800.0, "grad_norm": 2.030668744442987, "language_loss": 0.9086293, "learning_rate": 3.994440116339046e-06, "loss": 0.93096173, "num_input_tokens_seen": 18898275, "router_z_loss_clip": 0.80078125, "router_z_loss_mlp": 1.0625, "step": 882, "time_per_iteration": 2.4479787349700928 }, { "auxiliary_loss_clip": 0.01133334, "auxiliary_loss_mlp": 0.01100334, "balance_loss_clip": 1.02070212, "balance_loss_mlp": 1.02888429, "epoch": 0.05308883210581693, "flos": 36391681825920.0, "grad_norm": 2.2494771315648867, "language_loss": 0.74744678, "learning_rate": 3.994411058648816e-06, "loss": 0.76978344, "num_input_tokens_seen": 18920665, "router_z_loss_clip": 0.796875, "router_z_loss_mlp": 1.046875, "step": 883, "time_per_iteration": 2.54573655128479 }, { "auxiliary_loss_clip": 0.01124566, "auxiliary_loss_mlp": 0.01089165, "balance_loss_clip": 1.01697147, "balance_loss_mlp": 1.02643287, "epoch": 0.05314895535848489, "flos": 22854039788160.0, "grad_norm": 1.9281765646791185, "language_loss": 0.79975688, "learning_rate": 3.994381925330319e-06, "loss": 0.82189417, "num_input_tokens_seen": 18939835, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 0.98046875, "step": 884, "time_per_iteration": 2.4227118492126465 }, { "auxiliary_loss_clip": 0.01122629, "auxiliary_loss_mlp": 0.01082049, "balance_loss_clip": 1.01638901, "balance_loss_mlp": 1.02468765, "epoch": 0.053209078611152864, "flos": 12859488322560.0, "grad_norm": 1.9290258101334734, "language_loss": 0.88879204, "learning_rate": 3.994352716384659e-06, "loss": 0.91083884, "num_input_tokens_seen": 18958405, "router_z_loss_clip": 0.65625, "router_z_loss_mlp": 0.98046875, "step": 885, "time_per_iteration": 3.84979248046875 }, { "auxiliary_loss_clip": 0.0113127, "auxiliary_loss_mlp": 0.0109572, "balance_loss_clip": 1.02033186, "balance_loss_mlp": 1.0271107, "epoch": 0.05326920186382083, "flos": 12163844016000.0, "grad_norm": 2.364942727061361, "language_loss": 0.91269279, "learning_rate": 3.994323431812945e-06, "loss": 0.93496263, "num_input_tokens_seen": 18975445, "router_z_loss_clip": 0.75390625, "router_z_loss_mlp": 1.0390625, "step": 886, "time_per_iteration": 2.3995745182037354 }, { "auxiliary_loss_clip": 0.01127869, "auxiliary_loss_mlp": 0.01086933, "balance_loss_clip": 1.01926994, "balance_loss_mlp": 1.02735138, "epoch": 0.0533293251164888, "flos": 22703564361600.0, "grad_norm": 1.8117351758222557, "language_loss": 0.91718292, "learning_rate": 3.994294071616286e-06, "loss": 0.93933094, "num_input_tokens_seen": 18991930, "router_z_loss_clip": 0.67578125, "router_z_loss_mlp": 1.0078125, "step": 887, "time_per_iteration": 2.417062520980835 }, { "auxiliary_loss_clip": 0.01126988, "auxiliary_loss_mlp": 0.01089025, "balance_loss_clip": 1.0215044, "balance_loss_mlp": 1.02687526, "epoch": 0.053389448369156774, "flos": 26939785622400.0, "grad_norm": 1.9364517538200954, "language_loss": 0.78306127, "learning_rate": 3.994264635795796e-06, "loss": 0.80522138, "num_input_tokens_seen": 19009790, "router_z_loss_clip": 0.67578125, "router_z_loss_mlp": 1.0, "step": 888, "time_per_iteration": 3.940000295639038 }, { "auxiliary_loss_clip": 0.01128296, "auxiliary_loss_mlp": 0.01091383, "balance_loss_clip": 1.02290893, "balance_loss_mlp": 1.02751827, "epoch": 0.05344957162182474, "flos": 25555165079040.0, "grad_norm": 1.8613417505828236, "language_loss": 0.91851896, "learning_rate": 3.994235124352592e-06, "loss": 0.94071573, "num_input_tokens_seen": 19030170, "router_z_loss_clip": 0.68359375, "router_z_loss_mlp": 1.0078125, "step": 889, "time_per_iteration": 3.9605112075805664 }, { "auxiliary_loss_clip": 0.011257, "auxiliary_loss_mlp": 0.01085746, "balance_loss_clip": 1.02213621, "balance_loss_mlp": 1.02664995, "epoch": 0.05350969487449271, "flos": 19718559872640.0, "grad_norm": 1.8244271556415097, "language_loss": 0.90625685, "learning_rate": 3.994205537287791e-06, "loss": 0.92837131, "num_input_tokens_seen": 19048075, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.9921875, "step": 890, "time_per_iteration": 2.401724100112915 }, { "auxiliary_loss_clip": 0.01127688, "auxiliary_loss_mlp": 0.01086174, "balance_loss_clip": 1.02079964, "balance_loss_mlp": 1.0270673, "epoch": 0.053569818127160676, "flos": 27015128069760.0, "grad_norm": 2.0732640865031944, "language_loss": 0.96809208, "learning_rate": 3.994175874602517e-06, "loss": 0.99023068, "num_input_tokens_seen": 19067465, "router_z_loss_clip": 0.65625, "router_z_loss_mlp": 1.0078125, "step": 891, "time_per_iteration": 2.462536334991455 }, { "auxiliary_loss_clip": 0.0112356, "auxiliary_loss_mlp": 0.01091786, "balance_loss_clip": 1.02798498, "balance_loss_mlp": 1.02535248, "epoch": 0.05362994137982865, "flos": 13187497345920.0, "grad_norm": 1.8540104871664411, "language_loss": 0.74823928, "learning_rate": 3.994146136297893e-06, "loss": 0.77039266, "num_input_tokens_seen": 19085505, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 0.98046875, "step": 892, "time_per_iteration": 2.3760876655578613 }, { "auxiliary_loss_clip": 0.01134497, "auxiliary_loss_mlp": 0.0110204, "balance_loss_clip": 1.02717626, "balance_loss_mlp": 1.02947116, "epoch": 0.05369006463249662, "flos": 28656744197760.0, "grad_norm": 1.6247873890802804, "language_loss": 0.85717261, "learning_rate": 3.994116322375049e-06, "loss": 0.87953794, "num_input_tokens_seen": 19104360, "router_z_loss_clip": 0.75, "router_z_loss_mlp": 1.046875, "step": 893, "time_per_iteration": 2.475463628768921 }, { "auxiliary_loss_clip": 0.01129285, "auxiliary_loss_mlp": 0.01093756, "balance_loss_clip": 1.02385163, "balance_loss_mlp": 1.02812362, "epoch": 0.053750187885164585, "flos": 28911889480320.0, "grad_norm": 2.8570813573311096, "language_loss": 0.84611565, "learning_rate": 3.994086432835114e-06, "loss": 0.86834604, "num_input_tokens_seen": 19124680, "router_z_loss_clip": 0.69921875, "router_z_loss_mlp": 1.0078125, "step": 894, "time_per_iteration": 2.5759589672088623 }, { "auxiliary_loss_clip": 0.01125738, "auxiliary_loss_mlp": 0.01081075, "balance_loss_clip": 1.02056432, "balance_loss_mlp": 1.02675855, "epoch": 0.05381031113783256, "flos": 15157925458560.0, "grad_norm": 2.1791027684585287, "language_loss": 0.7903257, "learning_rate": 3.994056467679221e-06, "loss": 0.81239378, "num_input_tokens_seen": 19142895, "router_z_loss_clip": 0.60546875, "router_z_loss_mlp": 0.98828125, "step": 895, "time_per_iteration": 2.3923048973083496 }, { "auxiliary_loss_clip": 0.01133077, "auxiliary_loss_mlp": 0.01088739, "balance_loss_clip": 1.0233649, "balance_loss_mlp": 1.02901173, "epoch": 0.05387043439050053, "flos": 21834156885120.0, "grad_norm": 2.064382236376656, "language_loss": 0.90006709, "learning_rate": 3.9940264269085065e-06, "loss": 0.92228526, "num_input_tokens_seen": 19163125, "router_z_loss_clip": 0.65234375, "router_z_loss_mlp": 1.0390625, "step": 896, "time_per_iteration": 2.428237199783325 }, { "auxiliary_loss_clip": 0.01127722, "auxiliary_loss_mlp": 0.01077478, "balance_loss_clip": 1.01558506, "balance_loss_mlp": 1.02597606, "epoch": 0.053930557643168495, "flos": 17309378304000.0, "grad_norm": 2.1055391679896536, "language_loss": 0.9142698, "learning_rate": 3.9939963105241115e-06, "loss": 0.93632179, "num_input_tokens_seen": 19179385, "router_z_loss_clip": 0.62109375, "router_z_loss_mlp": 1.015625, "step": 897, "time_per_iteration": 2.427826166152954 }, { "auxiliary_loss_clip": 0.01120636, "auxiliary_loss_mlp": 0.01085708, "balance_loss_clip": 1.01713932, "balance_loss_mlp": 1.02409196, "epoch": 0.05399068089583647, "flos": 17347503375360.0, "grad_norm": 1.7610670172856944, "language_loss": 0.92936295, "learning_rate": 3.993966118527175e-06, "loss": 0.95142639, "num_input_tokens_seen": 19198725, "router_z_loss_clip": 0.6875, "router_z_loss_mlp": 0.96484375, "step": 898, "time_per_iteration": 2.383394479751587 }, { "auxiliary_loss_clip": 0.01125532, "auxiliary_loss_mlp": 0.01080409, "balance_loss_clip": 1.02001762, "balance_loss_mlp": 1.02590048, "epoch": 0.05405080414850443, "flos": 17486178762240.0, "grad_norm": 2.6895572538912713, "language_loss": 0.95975626, "learning_rate": 3.993935850918845e-06, "loss": 0.9818157, "num_input_tokens_seen": 19212380, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.99609375, "step": 899, "time_per_iteration": 2.437697172164917 }, { "auxiliary_loss_clip": 0.01120608, "auxiliary_loss_mlp": 0.01077013, "balance_loss_clip": 1.01693189, "balance_loss_mlp": 1.02492046, "epoch": 0.054110927401172404, "flos": 24495690827520.0, "grad_norm": 2.402747554203343, "language_loss": 0.78818405, "learning_rate": 3.9939055077002665e-06, "loss": 0.81016028, "num_input_tokens_seen": 19232235, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.95703125, "step": 900, "time_per_iteration": 2.517084836959839 }, { "auxiliary_loss_clip": 0.01122646, "auxiliary_loss_mlp": 0.01081948, "balance_loss_clip": 1.01790905, "balance_loss_mlp": 1.02480602, "epoch": 0.054171050653840376, "flos": 22928928387840.0, "grad_norm": 2.4125771537949126, "language_loss": 0.7967546, "learning_rate": 3.993875088872592e-06, "loss": 0.81880057, "num_input_tokens_seen": 19251460, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 0.9765625, "step": 901, "time_per_iteration": 2.435633420944214 }, { "auxiliary_loss_clip": 0.01113055, "auxiliary_loss_mlp": 0.01075452, "balance_loss_clip": 1.02087796, "balance_loss_mlp": 1.0233562, "epoch": 0.05423117390650834, "flos": 12932352063360.0, "grad_norm": 2.018364363717726, "language_loss": 0.88925475, "learning_rate": 3.9938445944369745e-06, "loss": 0.91113985, "num_input_tokens_seen": 19269060, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.8984375, "step": 902, "time_per_iteration": 2.399024724960327 }, { "auxiliary_loss_clip": 0.01119928, "auxiliary_loss_mlp": 0.01081189, "balance_loss_clip": 1.01853251, "balance_loss_mlp": 1.02378201, "epoch": 0.05429129715917631, "flos": 19900317744000.0, "grad_norm": 2.117994284166333, "language_loss": 0.89757168, "learning_rate": 3.993814024394569e-06, "loss": 0.9195829, "num_input_tokens_seen": 19288620, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.9609375, "step": 903, "time_per_iteration": 2.4367218017578125 }, { "auxiliary_loss_clip": 0.01122128, "auxiliary_loss_mlp": 0.01087548, "balance_loss_clip": 1.02517772, "balance_loss_mlp": 1.0242548, "epoch": 0.05435142041184428, "flos": 16907702578560.0, "grad_norm": 2.419387024230688, "language_loss": 0.79452693, "learning_rate": 3.993783378746537e-06, "loss": 0.81662375, "num_input_tokens_seen": 19306615, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.9765625, "step": 904, "time_per_iteration": 2.404392957687378 }, { "auxiliary_loss_clip": 0.01121452, "auxiliary_loss_mlp": 0.01089645, "balance_loss_clip": 1.02496159, "balance_loss_mlp": 1.02453566, "epoch": 0.05441154366451225, "flos": 23947275191040.0, "grad_norm": 2.2481546222984004, "language_loss": 0.89883184, "learning_rate": 3.993752657494039e-06, "loss": 0.9209429, "num_input_tokens_seen": 19321680, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 0.96875, "step": 905, "time_per_iteration": 2.4161629676818848 }, { "auxiliary_loss_clip": 0.01120703, "auxiliary_loss_mlp": 0.01077425, "balance_loss_clip": 1.01915503, "balance_loss_mlp": 1.02700603, "epoch": 0.05447166691718022, "flos": 19974333559680.0, "grad_norm": 2.0231286971312854, "language_loss": 0.77343166, "learning_rate": 3.993721860638241e-06, "loss": 0.79541296, "num_input_tokens_seen": 19339760, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.9375, "step": 906, "time_per_iteration": 2.421051502227783 }, { "auxiliary_loss_clip": 0.01123828, "auxiliary_loss_mlp": 0.01083488, "balance_loss_clip": 1.01782787, "balance_loss_mlp": 1.0250771, "epoch": 0.05453179016984819, "flos": 24935351978880.0, "grad_norm": 1.951739030132688, "language_loss": 0.91334414, "learning_rate": 3.993690988180309e-06, "loss": 0.93541729, "num_input_tokens_seen": 19359585, "router_z_loss_clip": 0.65625, "router_z_loss_mlp": 0.98828125, "step": 907, "time_per_iteration": 2.440842866897583 }, { "auxiliary_loss_clip": 0.01124339, "auxiliary_loss_mlp": 0.01081378, "balance_loss_clip": 1.01795912, "balance_loss_mlp": 1.02663255, "epoch": 0.05459191342251616, "flos": 18114091297920.0, "grad_norm": 1.6651852626431427, "language_loss": 0.90196919, "learning_rate": 3.9936600401214165e-06, "loss": 0.92402637, "num_input_tokens_seen": 19378590, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.9765625, "step": 908, "time_per_iteration": 2.424984931945801 }, { "auxiliary_loss_clip": 0.01121081, "auxiliary_loss_mlp": 0.01081023, "balance_loss_clip": 1.02170455, "balance_loss_mlp": 1.0249052, "epoch": 0.054652036675184125, "flos": 19207291789440.0, "grad_norm": 2.1868488728281776, "language_loss": 0.9354558, "learning_rate": 3.9936290164627345e-06, "loss": 0.95747685, "num_input_tokens_seen": 19397910, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.9609375, "step": 909, "time_per_iteration": 2.4045894145965576 }, { "auxiliary_loss_clip": 0.01124886, "auxiliary_loss_mlp": 0.01088544, "balance_loss_clip": 1.02259696, "balance_loss_mlp": 1.02507257, "epoch": 0.0547121599278521, "flos": 16324827563520.0, "grad_norm": 2.2424132423421317, "language_loss": 0.74963367, "learning_rate": 3.99359791720544e-06, "loss": 0.77176797, "num_input_tokens_seen": 19415950, "router_z_loss_clip": 0.66015625, "router_z_loss_mlp": 1.0, "step": 910, "time_per_iteration": 2.4162583351135254 }, { "auxiliary_loss_clip": 0.01119444, "auxiliary_loss_mlp": 0.01083427, "balance_loss_clip": 1.02289271, "balance_loss_mlp": 1.02491653, "epoch": 0.05477228318052007, "flos": 20337988947840.0, "grad_norm": 1.5909643428255626, "language_loss": 0.86067891, "learning_rate": 3.993566742350714e-06, "loss": 0.8827076, "num_input_tokens_seen": 19435275, "router_z_loss_clip": 0.60546875, "router_z_loss_mlp": 0.9453125, "step": 911, "time_per_iteration": 2.4164538383483887 }, { "auxiliary_loss_clip": 0.01122473, "auxiliary_loss_mlp": 0.01075329, "balance_loss_clip": 1.01372194, "balance_loss_mlp": 1.02588105, "epoch": 0.054832406433188034, "flos": 21972238778880.0, "grad_norm": 2.3361856754714774, "language_loss": 0.79422903, "learning_rate": 3.993535491899736e-06, "loss": 0.81620705, "num_input_tokens_seen": 19452090, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.96484375, "step": 912, "time_per_iteration": 2.4688615798950195 }, { "auxiliary_loss_clip": 0.01115557, "auxiliary_loss_mlp": 0.01081524, "balance_loss_clip": 1.02373123, "balance_loss_mlp": 1.02335024, "epoch": 0.054892529685856006, "flos": 16398005506560.0, "grad_norm": 2.465567818491581, "language_loss": 0.86453819, "learning_rate": 3.993504165853694e-06, "loss": 0.886509, "num_input_tokens_seen": 19470865, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.921875, "step": 913, "time_per_iteration": 2.459686040878296 }, { "auxiliary_loss_clip": 0.01118012, "auxiliary_loss_mlp": 0.01073888, "balance_loss_clip": 1.01804996, "balance_loss_mlp": 1.02544069, "epoch": 0.05495265293852397, "flos": 23911279712640.0, "grad_norm": 1.8023726948470293, "language_loss": 0.86560982, "learning_rate": 3.993472764213772e-06, "loss": 0.88752878, "num_input_tokens_seen": 19492145, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.92578125, "step": 914, "time_per_iteration": 2.5117075443267822 }, { "auxiliary_loss_clip": 0.01117667, "auxiliary_loss_mlp": 0.01082086, "balance_loss_clip": 1.02147996, "balance_loss_mlp": 1.0246532, "epoch": 0.055012776191191944, "flos": 23585819218560.0, "grad_norm": 2.158671037614002, "language_loss": 0.95001459, "learning_rate": 3.9934412869811655e-06, "loss": 0.9720121, "num_input_tokens_seen": 19511015, "router_z_loss_clip": 0.60546875, "router_z_loss_mlp": 0.9296875, "step": 915, "time_per_iteration": 2.469024896621704 }, { "auxiliary_loss_clip": 0.01119895, "auxiliary_loss_mlp": 0.01072452, "balance_loss_clip": 1.0150888, "balance_loss_mlp": 1.02534914, "epoch": 0.055072899443859916, "flos": 17527585501440.0, "grad_norm": 1.793766418693055, "language_loss": 0.91853392, "learning_rate": 3.993409734157064e-06, "loss": 0.94045734, "num_input_tokens_seen": 19529040, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.9453125, "step": 916, "time_per_iteration": 2.480485677719116 }, { "auxiliary_loss_clip": 0.0111952, "auxiliary_loss_mlp": 0.01079747, "balance_loss_clip": 1.02157331, "balance_loss_mlp": 1.0245626, "epoch": 0.05513302269652788, "flos": 21686160165120.0, "grad_norm": 1.7354715677036034, "language_loss": 0.83472085, "learning_rate": 3.993378105742666e-06, "loss": 0.85671353, "num_input_tokens_seen": 19549540, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 0.94921875, "step": 917, "time_per_iteration": 2.5028650760650635 }, { "auxiliary_loss_clip": 0.01123176, "auxiliary_loss_mlp": 0.01073306, "balance_loss_clip": 1.01391625, "balance_loss_mlp": 1.02447474, "epoch": 0.05519314594919585, "flos": 21612353817600.0, "grad_norm": 1.8069606342306848, "language_loss": 0.83700562, "learning_rate": 3.9933464017391705e-06, "loss": 0.8589704, "num_input_tokens_seen": 19567570, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.984375, "step": 918, "time_per_iteration": 2.536362886428833 }, { "auxiliary_loss_clip": 0.01123927, "auxiliary_loss_mlp": 0.01079284, "balance_loss_clip": 1.02077639, "balance_loss_mlp": 1.02532208, "epoch": 0.05525326920186382, "flos": 21797498090880.0, "grad_norm": 2.1422499519691276, "language_loss": 0.92001951, "learning_rate": 3.99331462214778e-06, "loss": 0.94205165, "num_input_tokens_seen": 19585330, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.984375, "step": 919, "time_per_iteration": 2.5601465702056885 }, { "auxiliary_loss_clip": 0.01117876, "auxiliary_loss_mlp": 0.01078159, "balance_loss_clip": 1.01860189, "balance_loss_mlp": 1.02267587, "epoch": 0.05531339245453179, "flos": 28438362443520.0, "grad_norm": 2.0839333675131035, "language_loss": 0.91133678, "learning_rate": 3.993282766969699e-06, "loss": 0.93329704, "num_input_tokens_seen": 19604970, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.953125, "step": 920, "time_per_iteration": 2.5991275310516357 }, { "auxiliary_loss_clip": 0.01119383, "auxiliary_loss_mlp": 0.01083533, "balance_loss_clip": 1.01915991, "balance_loss_mlp": 1.02339458, "epoch": 0.05537351570719976, "flos": 37373718948480.0, "grad_norm": 1.9122346867378892, "language_loss": 0.69504476, "learning_rate": 3.993250836206136e-06, "loss": 0.71707392, "num_input_tokens_seen": 19626235, "router_z_loss_clip": 0.64453125, "router_z_loss_mlp": 0.9609375, "step": 921, "time_per_iteration": 2.5270113945007324 }, { "auxiliary_loss_clip": 0.01122078, "auxiliary_loss_mlp": 0.01080594, "balance_loss_clip": 1.01989269, "balance_loss_mlp": 1.02504683, "epoch": 0.05543363895986773, "flos": 20083437158400.0, "grad_norm": 2.253623353244792, "language_loss": 0.76224709, "learning_rate": 3.993218829858301e-06, "loss": 0.7842738, "num_input_tokens_seen": 19644305, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.96875, "step": 922, "time_per_iteration": 2.421757459640503 }, { "auxiliary_loss_clip": 0.01120593, "auxiliary_loss_mlp": 0.01088235, "balance_loss_clip": 1.02348006, "balance_loss_mlp": 1.02319288, "epoch": 0.0554937622125357, "flos": 24532105242240.0, "grad_norm": 4.12314418254269, "language_loss": 0.86530393, "learning_rate": 3.993186747927408e-06, "loss": 0.88739222, "num_input_tokens_seen": 19662130, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 0.97265625, "step": 923, "time_per_iteration": 2.426964521408081 }, { "auxiliary_loss_clip": 0.01121567, "auxiliary_loss_mlp": 0.01083764, "balance_loss_clip": 1.01724517, "balance_loss_mlp": 1.02369547, "epoch": 0.055553885465203665, "flos": 14319172022400.0, "grad_norm": 2.012388989202911, "language_loss": 0.82161134, "learning_rate": 3.993154590414675e-06, "loss": 0.84366465, "num_input_tokens_seen": 19680715, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 0.9765625, "step": 924, "time_per_iteration": 2.4108219146728516 }, { "auxiliary_loss_clip": 0.01113395, "auxiliary_loss_mlp": 0.0107746, "balance_loss_clip": 1.019238, "balance_loss_mlp": 1.02218843, "epoch": 0.05561400871787164, "flos": 27379900621440.0, "grad_norm": 1.841892424677068, "language_loss": 1.04974174, "learning_rate": 3.993122357321319e-06, "loss": 1.07165027, "num_input_tokens_seen": 19700535, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 0.9140625, "step": 925, "time_per_iteration": 3.9872593879699707 }, { "auxiliary_loss_clip": 0.01116847, "auxiliary_loss_mlp": 0.01082197, "balance_loss_clip": 1.02097142, "balance_loss_mlp": 1.02292633, "epoch": 0.05567413197053961, "flos": 23219999326080.0, "grad_norm": 1.8957289546810663, "language_loss": 0.83292049, "learning_rate": 3.993090048648564e-06, "loss": 0.85491097, "num_input_tokens_seen": 19718825, "router_z_loss_clip": 0.61328125, "router_z_loss_mlp": 0.9375, "step": 926, "time_per_iteration": 2.43587064743042 }, { "auxiliary_loss_clip": 0.01127628, "auxiliary_loss_mlp": 0.01089114, "balance_loss_clip": 1.01992488, "balance_loss_mlp": 1.0257163, "epoch": 0.055734255223207574, "flos": 25263779938560.0, "grad_norm": 2.513368132909658, "language_loss": 0.79492259, "learning_rate": 3.993057664397634e-06, "loss": 0.81709003, "num_input_tokens_seen": 19739080, "router_z_loss_clip": 0.69140625, "router_z_loss_mlp": 1.015625, "step": 927, "time_per_iteration": 3.8844234943389893 }, { "auxiliary_loss_clip": 0.01041675, "auxiliary_loss_mlp": 0.01028511, "balance_loss_clip": 1.01735342, "balance_loss_mlp": 1.0134151, "epoch": 0.055794378475875546, "flos": 66499519662720.0, "grad_norm": 0.7896404882410542, "language_loss": 0.6007551, "learning_rate": 3.9930252045697585e-06, "loss": 0.62145698, "num_input_tokens_seen": 19802960, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.28125, "step": 928, "time_per_iteration": 5.964838743209839 }, { "auxiliary_loss_clip": 0.01120305, "auxiliary_loss_mlp": 0.01082399, "balance_loss_clip": 1.01850319, "balance_loss_mlp": 1.02474117, "epoch": 0.05585450172854351, "flos": 25336469122560.0, "grad_norm": 1.9746621175107022, "language_loss": 0.98213017, "learning_rate": 3.992992669166168e-06, "loss": 1.00415719, "num_input_tokens_seen": 19822765, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 0.95703125, "step": 929, "time_per_iteration": 2.4845173358917236 }, { "auxiliary_loss_clip": 0.01123352, "auxiliary_loss_mlp": 0.01095529, "balance_loss_clip": 1.02657819, "balance_loss_mlp": 1.02726007, "epoch": 0.05591462498121148, "flos": 33910334743680.0, "grad_norm": 2.3685247279258586, "language_loss": 0.75268775, "learning_rate": 3.992960058188094e-06, "loss": 0.77487659, "num_input_tokens_seen": 19843590, "router_z_loss_clip": 0.6875, "router_z_loss_mlp": 0.9609375, "step": 930, "time_per_iteration": 2.5293941497802734 }, { "auxiliary_loss_clip": 0.01124642, "auxiliary_loss_mlp": 0.01086576, "balance_loss_clip": 1.01915073, "balance_loss_mlp": 1.02594137, "epoch": 0.055974748233879455, "flos": 17929924542720.0, "grad_norm": 2.0989753990332525, "language_loss": 0.8866356, "learning_rate": 3.992927371636776e-06, "loss": 0.90874773, "num_input_tokens_seen": 19860230, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 0.984375, "step": 931, "time_per_iteration": 2.434673547744751 }, { "auxiliary_loss_clip": 0.01124577, "auxiliary_loss_mlp": 0.01092734, "balance_loss_clip": 1.0255481, "balance_loss_mlp": 1.02567458, "epoch": 0.05603487148654742, "flos": 24020906981760.0, "grad_norm": 1.6080111837647388, "language_loss": 0.86948848, "learning_rate": 3.9928946095134525e-06, "loss": 0.89166164, "num_input_tokens_seen": 19880795, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 0.9921875, "step": 932, "time_per_iteration": 2.445666551589966 }, { "auxiliary_loss_clip": 0.01127364, "auxiliary_loss_mlp": 0.01095551, "balance_loss_clip": 1.02769756, "balance_loss_mlp": 1.0281142, "epoch": 0.05609499473921539, "flos": 17306899597440.0, "grad_norm": 1.95286087742154, "language_loss": 0.78279328, "learning_rate": 3.992861771819365e-06, "loss": 0.80502248, "num_input_tokens_seen": 19897960, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 0.9921875, "step": 933, "time_per_iteration": 2.416311502456665 }, { "auxiliary_loss_clip": 0.01125054, "auxiliary_loss_mlp": 0.01089712, "balance_loss_clip": 1.02214432, "balance_loss_mlp": 1.02645922, "epoch": 0.05615511799188336, "flos": 20993727703680.0, "grad_norm": 2.246288009154978, "language_loss": 0.89897525, "learning_rate": 3.99282885855576e-06, "loss": 0.92112285, "num_input_tokens_seen": 19913315, "router_z_loss_clip": 0.67578125, "router_z_loss_mlp": 0.984375, "step": 934, "time_per_iteration": 2.397213935852051 }, { "auxiliary_loss_clip": 0.01122841, "auxiliary_loss_mlp": 0.01083441, "balance_loss_clip": 1.02054572, "balance_loss_mlp": 1.02791095, "epoch": 0.05621524124455133, "flos": 17272614775680.0, "grad_norm": 3.5601917310719227, "language_loss": 0.83352959, "learning_rate": 3.992795869723885e-06, "loss": 0.85559237, "num_input_tokens_seen": 19928790, "router_z_loss_clip": 0.62890625, "router_z_loss_mlp": 0.94921875, "step": 935, "time_per_iteration": 2.397948980331421 }, { "auxiliary_loss_clip": 0.01041933, "auxiliary_loss_mlp": 0.01014674, "balance_loss_clip": 1.00303924, "balance_loss_mlp": 1.01508749, "epoch": 0.0562753644972193, "flos": 58716332668800.0, "grad_norm": 0.8231284325586253, "language_loss": 0.69276774, "learning_rate": 3.99276280532499e-06, "loss": 0.71333373, "num_input_tokens_seen": 19988785, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.26953125, "step": 936, "time_per_iteration": 2.9382803440093994 }, { "auxiliary_loss_clip": 0.01127981, "auxiliary_loss_mlp": 0.01098956, "balance_loss_clip": 1.02437842, "balance_loss_mlp": 1.02713513, "epoch": 0.05633548774988727, "flos": 17456083303680.0, "grad_norm": 2.6184879557465806, "language_loss": 0.80036873, "learning_rate": 3.992729665360331e-06, "loss": 0.82263803, "num_input_tokens_seen": 20007685, "router_z_loss_clip": 0.74609375, "router_z_loss_mlp": 1.0078125, "step": 937, "time_per_iteration": 2.430989980697632 }, { "auxiliary_loss_clip": 0.01039498, "auxiliary_loss_mlp": 0.01022038, "balance_loss_clip": 1.00706565, "balance_loss_mlp": 1.01346707, "epoch": 0.05639561100255524, "flos": 70651426256640.0, "grad_norm": 0.8843280779576959, "language_loss": 0.6462326, "learning_rate": 3.992696449831162e-06, "loss": 0.66684794, "num_input_tokens_seen": 20072750, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.25976562, "step": 938, "time_per_iteration": 3.003065824508667 }, { "auxiliary_loss_clip": 0.01136436, "auxiliary_loss_mlp": 0.01100261, "balance_loss_clip": 1.02286994, "balance_loss_mlp": 1.0268991, "epoch": 0.056455734255223204, "flos": 20484938327040.0, "grad_norm": 4.53488490516945, "language_loss": 0.84203786, "learning_rate": 3.992663158738745e-06, "loss": 0.8644048, "num_input_tokens_seen": 20089070, "router_z_loss_clip": 0.7734375, "router_z_loss_mlp": 1.09375, "step": 939, "time_per_iteration": 2.4258644580841064 }, { "auxiliary_loss_clip": 0.01125301, "auxiliary_loss_mlp": 0.01102785, "balance_loss_clip": 1.03249943, "balance_loss_mlp": 1.02609599, "epoch": 0.056515857507891176, "flos": 22052503728000.0, "grad_norm": 1.6001810504644367, "language_loss": 0.765172, "learning_rate": 3.992629792084341e-06, "loss": 0.78745288, "num_input_tokens_seen": 20108790, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 0.9921875, "step": 940, "time_per_iteration": 2.4411253929138184 }, { "auxiliary_loss_clip": 0.01123712, "auxiliary_loss_mlp": 0.01096962, "balance_loss_clip": 1.02758241, "balance_loss_mlp": 1.02566171, "epoch": 0.05657598076055915, "flos": 24024153738240.0, "grad_norm": 1.7973713209528892, "language_loss": 0.74662936, "learning_rate": 3.992596349869216e-06, "loss": 0.76883602, "num_input_tokens_seen": 20128455, "router_z_loss_clip": 0.6953125, "router_z_loss_mlp": 0.98046875, "step": 941, "time_per_iteration": 2.4572362899780273 }, { "auxiliary_loss_clip": 0.01124782, "auxiliary_loss_mlp": 0.01092796, "balance_loss_clip": 1.02122307, "balance_loss_mlp": 1.02705717, "epoch": 0.05663610401322711, "flos": 20479701623040.0, "grad_norm": 1.9433687439825826, "language_loss": 0.83690298, "learning_rate": 3.992562832094637e-06, "loss": 0.85907871, "num_input_tokens_seen": 20145775, "router_z_loss_clip": 0.71484375, "router_z_loss_mlp": 0.9765625, "step": 942, "time_per_iteration": 2.4054033756256104 }, { "auxiliary_loss_clip": 0.01130259, "auxiliary_loss_mlp": 0.01095179, "balance_loss_clip": 1.02093577, "balance_loss_mlp": 1.02776945, "epoch": 0.056696227265895086, "flos": 21067987898880.0, "grad_norm": 1.8091049806021222, "language_loss": 0.91744149, "learning_rate": 3.9925292387618755e-06, "loss": 0.93969584, "num_input_tokens_seen": 20164315, "router_z_loss_clip": 0.7421875, "router_z_loss_mlp": 1.0234375, "step": 943, "time_per_iteration": 2.570345878601074 }, { "auxiliary_loss_clip": 0.01127175, "auxiliary_loss_mlp": 0.01097243, "balance_loss_clip": 1.02347648, "balance_loss_mlp": 1.02733016, "epoch": 0.05675635051856306, "flos": 17820367096320.0, "grad_norm": 2.0768882442487433, "language_loss": 0.80012816, "learning_rate": 3.992495569872206e-06, "loss": 0.82237238, "num_input_tokens_seen": 20182760, "router_z_loss_clip": 0.73828125, "router_z_loss_mlp": 1.0, "step": 944, "time_per_iteration": 2.394005298614502 }, { "auxiliary_loss_clip": 0.01126464, "auxiliary_loss_mlp": 0.01090083, "balance_loss_clip": 1.02079844, "balance_loss_mlp": 1.02629352, "epoch": 0.05681647377123102, "flos": 23113758458880.0, "grad_norm": 1.943433641906643, "language_loss": 0.81938994, "learning_rate": 3.992461825426906e-06, "loss": 0.84155536, "num_input_tokens_seen": 20203830, "router_z_loss_clip": 0.6953125, "router_z_loss_mlp": 1.0, "step": 945, "time_per_iteration": 2.454765558242798 }, { "auxiliary_loss_clip": 0.01131934, "auxiliary_loss_mlp": 0.01093296, "balance_loss_clip": 1.0210073, "balance_loss_mlp": 1.02966034, "epoch": 0.056876597023898995, "flos": 16069612458240.0, "grad_norm": 2.077289481158631, "language_loss": 0.85102606, "learning_rate": 3.992428005427252e-06, "loss": 0.87327838, "num_input_tokens_seen": 20220365, "router_z_loss_clip": 0.72265625, "router_z_loss_mlp": 1.0234375, "step": 946, "time_per_iteration": 2.4211196899414062 }, { "auxiliary_loss_clip": 0.01135463, "auxiliary_loss_mlp": 0.01104019, "balance_loss_clip": 1.02557981, "balance_loss_mlp": 1.03040671, "epoch": 0.05693672027656696, "flos": 16834734103680.0, "grad_norm": 1.8605156197164077, "language_loss": 0.83768058, "learning_rate": 3.992394109874529e-06, "loss": 0.86007535, "num_input_tokens_seen": 20238640, "router_z_loss_clip": 0.78515625, "router_z_loss_mlp": 1.046875, "step": 947, "time_per_iteration": 2.4302000999450684 }, { "auxiliary_loss_clip": 0.01134947, "auxiliary_loss_mlp": 0.01099227, "balance_loss_clip": 1.02751064, "balance_loss_mlp": 1.03228641, "epoch": 0.05699684352923493, "flos": 21388281511680.0, "grad_norm": 5.617782041802679, "language_loss": 0.90811485, "learning_rate": 3.9923601387700225e-06, "loss": 0.93045652, "num_input_tokens_seen": 20251025, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 1.0234375, "step": 948, "time_per_iteration": 2.4176712036132812 }, { "auxiliary_loss_clip": 0.01131975, "auxiliary_loss_mlp": 0.01094095, "balance_loss_clip": 1.01923108, "balance_loss_mlp": 1.0299089, "epoch": 0.057056966781902904, "flos": 15559391715840.0, "grad_norm": 1.673510645622403, "language_loss": 0.90460217, "learning_rate": 3.992326092115019e-06, "loss": 0.92686284, "num_input_tokens_seen": 20269775, "router_z_loss_clip": 0.75, "router_z_loss_mlp": 1.0234375, "step": 949, "time_per_iteration": 2.4255564212799072 }, { "auxiliary_loss_clip": 0.01127478, "auxiliary_loss_mlp": 0.01086656, "balance_loss_clip": 1.01985145, "balance_loss_mlp": 1.02921772, "epoch": 0.05711709003457087, "flos": 19936836892800.0, "grad_norm": 2.008457109782295, "language_loss": 0.82122171, "learning_rate": 3.992291969910811e-06, "loss": 0.84336311, "num_input_tokens_seen": 20287715, "router_z_loss_clip": 0.66796875, "router_z_loss_mlp": 0.984375, "step": 950, "time_per_iteration": 2.4229536056518555 }, { "auxiliary_loss_clip": 0.01132337, "auxiliary_loss_mlp": 0.0110328, "balance_loss_clip": 1.02469742, "balance_loss_mlp": 1.02949297, "epoch": 0.05717721328723884, "flos": 30331493072640.0, "grad_norm": 1.8936392398058866, "language_loss": 0.85878342, "learning_rate": 3.992257772158691e-06, "loss": 0.88113964, "num_input_tokens_seen": 20307070, "router_z_loss_clip": 0.78515625, "router_z_loss_mlp": 1.03125, "step": 951, "time_per_iteration": 2.513164758682251 }, { "auxiliary_loss_clip": 0.01128298, "auxiliary_loss_mlp": 0.01093425, "balance_loss_clip": 1.02161348, "balance_loss_mlp": 1.02677917, "epoch": 0.05723733653990681, "flos": 23653376432640.0, "grad_norm": 4.040509312164673, "language_loss": 0.89580774, "learning_rate": 3.992223498859958e-06, "loss": 0.91802502, "num_input_tokens_seen": 20324945, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 1.015625, "step": 952, "time_per_iteration": 2.429227352142334 }, { "auxiliary_loss_clip": 0.01135727, "auxiliary_loss_mlp": 0.01103819, "balance_loss_clip": 1.02456903, "balance_loss_mlp": 1.02878428, "epoch": 0.05729745979257478, "flos": 22054633320960.0, "grad_norm": 1.8116323273873673, "language_loss": 0.82249457, "learning_rate": 3.9921891500159084e-06, "loss": 0.84489, "num_input_tokens_seen": 20346135, "router_z_loss_clip": 0.79296875, "router_z_loss_mlp": 1.0703125, "step": 953, "time_per_iteration": 2.460824489593506 }, { "auxiliary_loss_clip": 0.01126616, "auxiliary_loss_mlp": 0.01091546, "balance_loss_clip": 1.02483654, "balance_loss_mlp": 1.02781487, "epoch": 0.05735758304524275, "flos": 19603486431360.0, "grad_norm": 1.8456119833675457, "language_loss": 0.90118545, "learning_rate": 3.992154725627848e-06, "loss": 0.92336714, "num_input_tokens_seen": 20364450, "router_z_loss_clip": 0.66796875, "router_z_loss_mlp": 0.98828125, "step": 954, "time_per_iteration": 2.40238356590271 }, { "auxiliary_loss_clip": 0.01131357, "auxiliary_loss_mlp": 0.01092334, "balance_loss_clip": 1.0194732, "balance_loss_mlp": 1.02684367, "epoch": 0.057417706297910716, "flos": 19098013633920.0, "grad_norm": 3.5739281035110317, "language_loss": 0.93333107, "learning_rate": 3.9921202256970804e-06, "loss": 0.95556796, "num_input_tokens_seen": 20383500, "router_z_loss_clip": 0.7265625, "router_z_loss_mlp": 1.046875, "step": 955, "time_per_iteration": 2.400515556335449 }, { "auxiliary_loss_clip": 0.01127711, "auxiliary_loss_mlp": 0.01091233, "balance_loss_clip": 1.02523839, "balance_loss_mlp": 1.02722752, "epoch": 0.05747782955057869, "flos": 16653569725440.0, "grad_norm": 2.4589728774130917, "language_loss": 0.93558955, "learning_rate": 3.992085650224914e-06, "loss": 0.95777893, "num_input_tokens_seen": 20400295, "router_z_loss_clip": 0.66015625, "router_z_loss_mlp": 1.0, "step": 956, "time_per_iteration": 2.428107500076294 }, { "auxiliary_loss_clip": 0.01123254, "auxiliary_loss_mlp": 0.01085309, "balance_loss_clip": 1.01993442, "balance_loss_mlp": 1.02665424, "epoch": 0.05753795280324665, "flos": 14501174273280.0, "grad_norm": 2.0580916057709144, "language_loss": 0.78504288, "learning_rate": 3.99205099921266e-06, "loss": 0.80712855, "num_input_tokens_seen": 20419085, "router_z_loss_clip": 0.65234375, "router_z_loss_mlp": 0.96484375, "step": 957, "time_per_iteration": 2.3925962448120117 }, { "auxiliary_loss_clip": 0.0112932, "auxiliary_loss_mlp": 0.01103349, "balance_loss_clip": 1.02901018, "balance_loss_mlp": 1.02769423, "epoch": 0.057598076055914625, "flos": 18075372733440.0, "grad_norm": 1.8558662791525078, "language_loss": 0.82246041, "learning_rate": 3.992016272661633e-06, "loss": 0.84478706, "num_input_tokens_seen": 20437465, "router_z_loss_clip": 0.7421875, "router_z_loss_mlp": 1.015625, "step": 958, "time_per_iteration": 2.418532609939575 }, { "auxiliary_loss_clip": 0.01125241, "auxiliary_loss_mlp": 0.01092919, "balance_loss_clip": 1.02387261, "balance_loss_mlp": 1.02585578, "epoch": 0.0576581993085826, "flos": 22123586989440.0, "grad_norm": 2.0437774758620795, "language_loss": 0.90743577, "learning_rate": 3.99198147057315e-06, "loss": 0.9296174, "num_input_tokens_seen": 20456235, "router_z_loss_clip": 0.69140625, "router_z_loss_mlp": 0.9921875, "step": 959, "time_per_iteration": 2.4289016723632812 }, { "auxiliary_loss_clip": 0.01118484, "auxiliary_loss_mlp": 0.01087763, "balance_loss_clip": 1.02224541, "balance_loss_mlp": 1.02510524, "epoch": 0.05771832256125056, "flos": 33180370704000.0, "grad_norm": 3.1681646693218983, "language_loss": 0.82189268, "learning_rate": 3.991946592948529e-06, "loss": 0.84395516, "num_input_tokens_seen": 20476825, "router_z_loss_clip": 0.65234375, "router_z_loss_mlp": 0.93359375, "step": 960, "time_per_iteration": 2.5346662998199463 }, { "auxiliary_loss_clip": 0.01126532, "auxiliary_loss_mlp": 0.01090026, "balance_loss_clip": 1.01797545, "balance_loss_mlp": 1.02592087, "epoch": 0.057778445813918534, "flos": 24169008435840.0, "grad_norm": 1.8925108964751758, "language_loss": 0.97161686, "learning_rate": 3.991911639789094e-06, "loss": 0.9937824, "num_input_tokens_seen": 20496965, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 1.0078125, "step": 961, "time_per_iteration": 2.4395880699157715 }, { "auxiliary_loss_clip": 0.01130132, "auxiliary_loss_mlp": 0.01099052, "balance_loss_clip": 1.02099371, "balance_loss_mlp": 1.02804279, "epoch": 0.0578385690665865, "flos": 29641748785920.0, "grad_norm": 2.5390266776576005, "language_loss": 0.72748172, "learning_rate": 3.991876611096169e-06, "loss": 0.74977356, "num_input_tokens_seen": 20518035, "router_z_loss_clip": 0.78125, "router_z_loss_mlp": 1.0234375, "step": 962, "time_per_iteration": 2.505279302597046 }, { "auxiliary_loss_clip": 0.01124621, "auxiliary_loss_mlp": 0.01091321, "balance_loss_clip": 1.01731586, "balance_loss_mlp": 1.02808583, "epoch": 0.05789869231925447, "flos": 20884414636800.0, "grad_norm": 2.58501559503198, "language_loss": 0.91616815, "learning_rate": 3.991841506871084e-06, "loss": 0.93832755, "num_input_tokens_seen": 20534740, "router_z_loss_clip": 0.7421875, "router_z_loss_mlp": 0.96484375, "step": 963, "time_per_iteration": 2.408115863800049 }, { "auxiliary_loss_clip": 0.01132555, "auxiliary_loss_mlp": 0.01094141, "balance_loss_clip": 1.01818109, "balance_loss_mlp": 1.02800548, "epoch": 0.057958815571922444, "flos": 26029914013440.0, "grad_norm": 2.4241300721880426, "language_loss": 0.89892358, "learning_rate": 3.99180632711517e-06, "loss": 0.92119056, "num_input_tokens_seen": 20553485, "router_z_loss_clip": 0.7578125, "router_z_loss_mlp": 1.046875, "step": 964, "time_per_iteration": 3.908592700958252 }, { "auxiliary_loss_clip": 0.01126985, "auxiliary_loss_mlp": 0.01103138, "balance_loss_clip": 1.02775037, "balance_loss_mlp": 1.02587056, "epoch": 0.05801893882459041, "flos": 18076699365120.0, "grad_norm": 2.697802868720696, "language_loss": 0.80888444, "learning_rate": 3.99177107182976e-06, "loss": 0.8311857, "num_input_tokens_seen": 20572155, "router_z_loss_clip": 0.75390625, "router_z_loss_mlp": 1.0078125, "step": 965, "time_per_iteration": 2.447664260864258 }, { "auxiliary_loss_clip": 0.01125824, "auxiliary_loss_mlp": 0.0109215, "balance_loss_clip": 1.02172065, "balance_loss_mlp": 1.02556205, "epoch": 0.05807906207725838, "flos": 17747922291840.0, "grad_norm": 1.9660934455239516, "language_loss": 0.85969114, "learning_rate": 3.99173574101619e-06, "loss": 0.88187087, "num_input_tokens_seen": 20590395, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 1.0, "step": 966, "time_per_iteration": 2.408773422241211 }, { "auxiliary_loss_clip": 0.01125602, "auxiliary_loss_mlp": 0.0109356, "balance_loss_clip": 1.02136683, "balance_loss_mlp": 1.02658224, "epoch": 0.058139185329926346, "flos": 18039412166400.0, "grad_norm": 1.956085797783424, "language_loss": 0.80072606, "learning_rate": 3.9917003346758035e-06, "loss": 0.82291764, "num_input_tokens_seen": 20608435, "router_z_loss_clip": 0.72265625, "router_z_loss_mlp": 0.9921875, "step": 967, "time_per_iteration": 5.366400957107544 }, { "auxiliary_loss_clip": 0.01044823, "auxiliary_loss_mlp": 0.01036081, "balance_loss_clip": 1.02139413, "balance_loss_mlp": 1.01678514, "epoch": 0.05819930858259432, "flos": 62360287758720.0, "grad_norm": 0.8031589151860283, "language_loss": 0.57549012, "learning_rate": 3.991664852809939e-06, "loss": 0.59629917, "num_input_tokens_seen": 20668575, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.28125, "step": 968, "time_per_iteration": 3.0211095809936523 }, { "auxiliary_loss_clip": 0.01129065, "auxiliary_loss_mlp": 0.0109852, "balance_loss_clip": 1.02241695, "balance_loss_mlp": 1.02706146, "epoch": 0.05825943183526229, "flos": 19134358225920.0, "grad_norm": 2.135629676756832, "language_loss": 0.85367376, "learning_rate": 3.991629295419945e-06, "loss": 0.87594962, "num_input_tokens_seen": 20687355, "router_z_loss_clip": 0.76171875, "router_z_loss_mlp": 1.0234375, "step": 969, "time_per_iteration": 2.3907978534698486 }, { "auxiliary_loss_clip": 0.01132043, "auxiliary_loss_mlp": 0.01097581, "balance_loss_clip": 1.02100074, "balance_loss_mlp": 1.02741516, "epoch": 0.058319555087930255, "flos": 29021202547200.0, "grad_norm": 2.17073942942579, "language_loss": 0.81194687, "learning_rate": 3.991593662507167e-06, "loss": 0.83424312, "num_input_tokens_seen": 20705710, "router_z_loss_clip": 0.765625, "router_z_loss_mlp": 1.046875, "step": 970, "time_per_iteration": 2.4756600856781006 }, { "auxiliary_loss_clip": 0.0113399, "auxiliary_loss_mlp": 0.01100688, "balance_loss_clip": 1.02262926, "balance_loss_mlp": 1.02932334, "epoch": 0.05837967834059823, "flos": 18879003475200.0, "grad_norm": 2.349614030792172, "language_loss": 0.95256007, "learning_rate": 3.991557954072958e-06, "loss": 0.97490686, "num_input_tokens_seen": 20722405, "router_z_loss_clip": 0.78125, "router_z_loss_mlp": 1.046875, "step": 971, "time_per_iteration": 2.4527878761291504 }, { "auxiliary_loss_clip": 0.01131574, "auxiliary_loss_mlp": 0.01095685, "balance_loss_clip": 1.02287197, "balance_loss_mlp": 1.0280596, "epoch": 0.05843980159326619, "flos": 25701870078720.0, "grad_norm": 1.648915557947185, "language_loss": 0.88691032, "learning_rate": 3.991522170118673e-06, "loss": 0.90918291, "num_input_tokens_seen": 20741480, "router_z_loss_clip": 0.7265625, "router_z_loss_mlp": 1.0390625, "step": 972, "time_per_iteration": 2.467953681945801 }, { "auxiliary_loss_clip": 0.01129729, "auxiliary_loss_mlp": 0.01103289, "balance_loss_clip": 1.03300285, "balance_loss_mlp": 1.02864647, "epoch": 0.058499924845934165, "flos": 25551080449920.0, "grad_norm": 1.974001269919108, "language_loss": 0.89990461, "learning_rate": 3.991486310645667e-06, "loss": 0.92223477, "num_input_tokens_seen": 20759685, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 1.015625, "step": 973, "time_per_iteration": 2.4539103507995605 }, { "auxiliary_loss_clip": 0.01127564, "auxiliary_loss_mlp": 0.0111272, "balance_loss_clip": 1.03642631, "balance_loss_mlp": 1.02655494, "epoch": 0.05856004809860214, "flos": 16435222882560.0, "grad_norm": 1.7522613854206017, "language_loss": 0.77836841, "learning_rate": 3.991450375655301e-06, "loss": 0.80077124, "num_input_tokens_seen": 20778180, "router_z_loss_clip": 0.76171875, "router_z_loss_mlp": 1.0078125, "step": 974, "time_per_iteration": 2.40959095954895 }, { "auxiliary_loss_clip": 0.01125559, "auxiliary_loss_mlp": 0.01109093, "balance_loss_clip": 1.03065324, "balance_loss_mlp": 1.02649522, "epoch": 0.0586201713512701, "flos": 39457230554880.0, "grad_norm": 1.4281097460993528, "language_loss": 0.78482264, "learning_rate": 3.991414365148936e-06, "loss": 0.80716914, "num_input_tokens_seen": 20802705, "router_z_loss_clip": 0.78125, "router_z_loss_mlp": 0.9921875, "step": 975, "time_per_iteration": 2.5659992694854736 }, { "auxiliary_loss_clip": 0.01129565, "auxiliary_loss_mlp": 0.01104298, "balance_loss_clip": 1.02843356, "balance_loss_mlp": 1.02630043, "epoch": 0.058680294603938074, "flos": 23364120885120.0, "grad_norm": 2.036632895548704, "language_loss": 0.80002582, "learning_rate": 3.99137827912794e-06, "loss": 0.82236445, "num_input_tokens_seen": 20822540, "router_z_loss_clip": 0.7578125, "router_z_loss_mlp": 1.03125, "step": 976, "time_per_iteration": 2.4385452270507812 }, { "auxiliary_loss_clip": 0.01125149, "auxiliary_loss_mlp": 0.01098623, "balance_loss_clip": 1.02552378, "balance_loss_mlp": 1.02643919, "epoch": 0.05874041785660604, "flos": 32230698278400.0, "grad_norm": 1.6919907319595964, "language_loss": 0.89246011, "learning_rate": 3.991342117593679e-06, "loss": 0.91469783, "num_input_tokens_seen": 20844175, "router_z_loss_clip": 0.73046875, "router_z_loss_mlp": 0.984375, "step": 977, "time_per_iteration": 2.501647710800171 }, { "auxiliary_loss_clip": 0.01127052, "auxiliary_loss_mlp": 0.01091713, "balance_loss_clip": 1.02095032, "balance_loss_mlp": 1.02556419, "epoch": 0.05880054110927401, "flos": 22308940730880.0, "grad_norm": 1.5605598580318714, "language_loss": 0.8205657, "learning_rate": 3.991305880547527e-06, "loss": 0.84275341, "num_input_tokens_seen": 20864730, "router_z_loss_clip": 0.70703125, "router_z_loss_mlp": 1.015625, "step": 978, "time_per_iteration": 2.4952917098999023 }, { "auxiliary_loss_clip": 0.01130386, "auxiliary_loss_mlp": 0.01100412, "balance_loss_clip": 1.02507186, "balance_loss_mlp": 1.02712965, "epoch": 0.05886066436194198, "flos": 27379237305600.0, "grad_norm": 1.8530220101614145, "language_loss": 0.83436739, "learning_rate": 3.991269567990855e-06, "loss": 0.85667533, "num_input_tokens_seen": 20885200, "router_z_loss_clip": 0.75390625, "router_z_loss_mlp": 1.03125, "step": 979, "time_per_iteration": 2.544856548309326 }, { "auxiliary_loss_clip": 0.0104495, "auxiliary_loss_mlp": 0.0101797, "balance_loss_clip": 1.00185323, "balance_loss_mlp": 1.0160768, "epoch": 0.05892078761460995, "flos": 59581725338880.0, "grad_norm": 0.9455884025861344, "language_loss": 0.59177703, "learning_rate": 3.9912331799250415e-06, "loss": 0.61240625, "num_input_tokens_seen": 20940325, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.2890625, "step": 980, "time_per_iteration": 2.947077989578247 }, { "auxiliary_loss_clip": 0.01123254, "auxiliary_loss_mlp": 0.01091589, "balance_loss_clip": 1.02364016, "balance_loss_mlp": 1.0260489, "epoch": 0.05898091086727792, "flos": 15413175475200.0, "grad_norm": 2.1947772391637574, "language_loss": 0.90017039, "learning_rate": 3.9911967163514665e-06, "loss": 0.92231882, "num_input_tokens_seen": 20958220, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 0.97265625, "step": 981, "time_per_iteration": 2.385772943496704 }, { "auxiliary_loss_clip": 0.01127395, "auxiliary_loss_mlp": 0.01084126, "balance_loss_clip": 1.01775062, "balance_loss_mlp": 1.02742434, "epoch": 0.059041034119945886, "flos": 23654319039360.0, "grad_norm": 1.9181546500720876, "language_loss": 0.81091762, "learning_rate": 3.991160177271513e-06, "loss": 0.83303285, "num_input_tokens_seen": 20978920, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 1.0, "step": 982, "time_per_iteration": 2.4679689407348633 }, { "auxiliary_loss_clip": 0.01136352, "auxiliary_loss_mlp": 0.01096767, "balance_loss_clip": 1.02362049, "balance_loss_mlp": 1.02906358, "epoch": 0.05910115737261386, "flos": 24752930791680.0, "grad_norm": 1.9883143890633186, "language_loss": 0.88401842, "learning_rate": 3.9911235626865654e-06, "loss": 0.9063496, "num_input_tokens_seen": 20999490, "router_z_loss_clip": 0.73046875, "router_z_loss_mlp": 1.0703125, "step": 983, "time_per_iteration": 2.475595235824585 }, { "auxiliary_loss_clip": 0.01127514, "auxiliary_loss_mlp": 0.01096753, "balance_loss_clip": 1.02093637, "balance_loss_mlp": 1.0264163, "epoch": 0.05916128062528183, "flos": 11727953291520.0, "grad_norm": 1.8307139798266348, "language_loss": 0.87248522, "learning_rate": 3.9910868725980125e-06, "loss": 0.89472783, "num_input_tokens_seen": 21017865, "router_z_loss_clip": 0.7578125, "router_z_loss_mlp": 1.0078125, "step": 984, "time_per_iteration": 2.4062790870666504 }, { "auxiliary_loss_clip": 0.01124834, "auxiliary_loss_mlp": 0.01087978, "balance_loss_clip": 1.02112556, "balance_loss_mlp": 1.02881384, "epoch": 0.059221403877949795, "flos": 21902063212800.0, "grad_norm": 2.085748640143121, "language_loss": 0.7996968, "learning_rate": 3.9910501070072465e-06, "loss": 0.82182491, "num_input_tokens_seen": 21035900, "router_z_loss_clip": 0.66796875, "router_z_loss_mlp": 0.9609375, "step": 985, "time_per_iteration": 2.4216248989105225 }, { "auxiliary_loss_clip": 0.0112967, "auxiliary_loss_mlp": 0.01095754, "balance_loss_clip": 1.02880573, "balance_loss_mlp": 1.02739823, "epoch": 0.05928152713061777, "flos": 20513742065280.0, "grad_norm": 1.824564772438856, "language_loss": 0.92660165, "learning_rate": 3.991013265915661e-06, "loss": 0.94885588, "num_input_tokens_seen": 21053235, "router_z_loss_clip": 0.66796875, "router_z_loss_mlp": 1.0234375, "step": 986, "time_per_iteration": 2.42228364944458 }, { "auxiliary_loss_clip": 0.01130297, "auxiliary_loss_mlp": 0.0109569, "balance_loss_clip": 1.02115989, "balance_loss_mlp": 1.02724624, "epoch": 0.05934165038328574, "flos": 24494084904960.0, "grad_norm": 1.9736339688601048, "language_loss": 0.78733784, "learning_rate": 3.9909763493246525e-06, "loss": 0.80959773, "num_input_tokens_seen": 21073090, "router_z_loss_clip": 0.74609375, "router_z_loss_mlp": 1.03125, "step": 987, "time_per_iteration": 2.449246644973755 }, { "auxiliary_loss_clip": 0.01132334, "auxiliary_loss_mlp": 0.01091562, "balance_loss_clip": 1.0168891, "balance_loss_mlp": 1.02811456, "epoch": 0.059401773635953704, "flos": 38726498465280.0, "grad_norm": 1.8510213112989038, "language_loss": 0.75301909, "learning_rate": 3.990939357235621e-06, "loss": 0.77525806, "num_input_tokens_seen": 21094895, "router_z_loss_clip": 0.74609375, "router_z_loss_mlp": 1.046875, "step": 988, "time_per_iteration": 2.591006278991699 }, { "auxiliary_loss_clip": 0.01040563, "auxiliary_loss_mlp": 0.01022688, "balance_loss_clip": 1.00676167, "balance_loss_mlp": 1.01165795, "epoch": 0.059461896888621676, "flos": 58020618539520.0, "grad_norm": 0.9645505870189149, "language_loss": 0.71348822, "learning_rate": 3.99090228964997e-06, "loss": 0.73412079, "num_input_tokens_seen": 21147555, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.2890625, "step": 989, "time_per_iteration": 2.898162841796875 }, { "auxiliary_loss_clip": 0.01133227, "auxiliary_loss_mlp": 0.01102607, "balance_loss_clip": 1.02593184, "balance_loss_mlp": 1.03012383, "epoch": 0.05952202014128964, "flos": 22126659189120.0, "grad_norm": 2.1417111949179812, "language_loss": 0.83003402, "learning_rate": 3.990865146569105e-06, "loss": 0.85239238, "num_input_tokens_seen": 21167845, "router_z_loss_clip": 0.765625, "router_z_loss_mlp": 1.03125, "step": 990, "time_per_iteration": 2.4373950958251953 }, { "auxiliary_loss_clip": 0.01125331, "auxiliary_loss_mlp": 0.01092499, "balance_loss_clip": 1.01715875, "balance_loss_mlp": 1.02652311, "epoch": 0.059582143393957614, "flos": 20444823308160.0, "grad_norm": 2.1669255441445934, "language_loss": 0.87995899, "learning_rate": 3.990827927994434e-06, "loss": 0.90213722, "num_input_tokens_seen": 21185085, "router_z_loss_clip": 0.75, "router_z_loss_mlp": 0.98828125, "step": 991, "time_per_iteration": 2.427744150161743 }, { "auxiliary_loss_clip": 0.01131123, "auxiliary_loss_mlp": 0.01089779, "balance_loss_clip": 1.01992249, "balance_loss_mlp": 1.02690172, "epoch": 0.059642266646625586, "flos": 20593832457600.0, "grad_norm": 1.7707128244897044, "language_loss": 0.80476785, "learning_rate": 3.9907906339273674e-06, "loss": 0.82697684, "num_input_tokens_seen": 21204230, "router_z_loss_clip": 0.69921875, "router_z_loss_mlp": 1.0390625, "step": 992, "time_per_iteration": 2.4672327041625977 }, { "auxiliary_loss_clip": 0.01129771, "auxiliary_loss_mlp": 0.01095396, "balance_loss_clip": 1.02878189, "balance_loss_mlp": 1.02716327, "epoch": 0.05970238989929355, "flos": 19351692639360.0, "grad_norm": 2.3491214801954032, "language_loss": 0.78657597, "learning_rate": 3.9907532643693215e-06, "loss": 0.80882764, "num_input_tokens_seen": 21222655, "router_z_loss_clip": 0.66796875, "router_z_loss_mlp": 1.0234375, "step": 993, "time_per_iteration": 2.3871331214904785 }, { "auxiliary_loss_clip": 0.01124075, "auxiliary_loss_mlp": 0.01097182, "balance_loss_clip": 1.02570462, "balance_loss_mlp": 1.02677011, "epoch": 0.05976251315196152, "flos": 30262713960960.0, "grad_norm": 2.0933487783578064, "language_loss": 0.82146722, "learning_rate": 3.990715819321712e-06, "loss": 0.84367979, "num_input_tokens_seen": 21242310, "router_z_loss_clip": 0.71484375, "router_z_loss_mlp": 0.97265625, "step": 994, "time_per_iteration": 2.4808077812194824 }, { "auxiliary_loss_clip": 0.01129265, "auxiliary_loss_mlp": 0.0110893, "balance_loss_clip": 1.0372138, "balance_loss_mlp": 1.02835047, "epoch": 0.05982263640462949, "flos": 23184038759040.0, "grad_norm": 2.9910757369075758, "language_loss": 0.82906139, "learning_rate": 3.99067829878596e-06, "loss": 0.85144335, "num_input_tokens_seen": 21261410, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 1.015625, "step": 995, "time_per_iteration": 2.4094972610473633 }, { "auxiliary_loss_clip": 0.01124242, "auxiliary_loss_mlp": 0.01106382, "balance_loss_clip": 1.03528547, "balance_loss_mlp": 1.02604425, "epoch": 0.05988275965729746, "flos": 27849761965440.0, "grad_norm": 2.160637101156907, "language_loss": 0.89495534, "learning_rate": 3.990640702763487e-06, "loss": 0.9172616, "num_input_tokens_seen": 21280080, "router_z_loss_clip": 0.7109375, "router_z_loss_mlp": 0.984375, "step": 996, "time_per_iteration": 2.4765777587890625 }, { "auxiliary_loss_clip": 0.01126012, "auxiliary_loss_mlp": 0.01098069, "balance_loss_clip": 1.02663851, "balance_loss_mlp": 1.02726793, "epoch": 0.05994288290996543, "flos": 24678880064640.0, "grad_norm": 2.783047225703568, "language_loss": 0.91457498, "learning_rate": 3.990603031255718e-06, "loss": 0.93681592, "num_input_tokens_seen": 21296765, "router_z_loss_clip": 0.71484375, "router_z_loss_mlp": 0.98828125, "step": 997, "time_per_iteration": 2.4281632900238037 }, { "auxiliary_loss_clip": 0.01041926, "auxiliary_loss_mlp": 0.01020887, "balance_loss_clip": 1.00534177, "balance_loss_mlp": 1.00967336, "epoch": 0.0600030061626334, "flos": 69925965782400.0, "grad_norm": 1.0421868728509203, "language_loss": 0.75635278, "learning_rate": 3.990565284264083e-06, "loss": 0.77698088, "num_input_tokens_seen": 21363345, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.32226562, "step": 998, "time_per_iteration": 3.1077888011932373 }, { "auxiliary_loss_clip": 0.01119311, "auxiliary_loss_mlp": 0.01087833, "balance_loss_clip": 1.02388883, "balance_loss_mlp": 1.02611005, "epoch": 0.06006312941530137, "flos": 26538982680960.0, "grad_norm": 2.0418756904794084, "language_loss": 0.7906177, "learning_rate": 3.990527461790013e-06, "loss": 0.81268913, "num_input_tokens_seen": 21385290, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 0.9296875, "step": 999, "time_per_iteration": 2.440178394317627 }, { "auxiliary_loss_clip": 0.01129103, "auxiliary_loss_mlp": 0.01091238, "balance_loss_clip": 1.018139, "balance_loss_mlp": 1.02713966, "epoch": 0.060123252667969335, "flos": 27342787979520.0, "grad_norm": 1.705774846965125, "language_loss": 0.8448599, "learning_rate": 3.990489563834943e-06, "loss": 0.86706334, "num_input_tokens_seen": 21407625, "router_z_loss_clip": 0.73046875, "router_z_loss_mlp": 1.015625, "step": 1000, "time_per_iteration": 2.4486825466156006 }, { "auxiliary_loss_clip": 0.01123979, "auxiliary_loss_mlp": 0.01092467, "balance_loss_clip": 1.02103686, "balance_loss_mlp": 1.02594709, "epoch": 0.06018337592063731, "flos": 27015477183360.0, "grad_norm": 2.0880864904568814, "language_loss": 0.89541829, "learning_rate": 3.990451590400309e-06, "loss": 0.91758275, "num_input_tokens_seen": 21426835, "router_z_loss_clip": 0.71484375, "router_z_loss_mlp": 0.98046875, "step": 1001, "time_per_iteration": 2.4317262172698975 }, { "auxiliary_loss_clip": 0.01125175, "auxiliary_loss_mlp": 0.01088186, "balance_loss_clip": 1.01952112, "balance_loss_mlp": 1.02728069, "epoch": 0.06024349917330528, "flos": 25591788961920.0, "grad_norm": 2.0395459777277427, "language_loss": 0.76422989, "learning_rate": 3.990413541487551e-06, "loss": 0.78636342, "num_input_tokens_seen": 21444920, "router_z_loss_clip": 0.6875, "router_z_loss_mlp": 0.98046875, "step": 1002, "time_per_iteration": 2.4245893955230713 }, { "auxiliary_loss_clip": 0.0112736, "auxiliary_loss_mlp": 0.01094445, "balance_loss_clip": 1.01886606, "balance_loss_mlp": 1.02682328, "epoch": 0.060303622425973244, "flos": 26132279719680.0, "grad_norm": 2.331451198561699, "language_loss": 0.79068947, "learning_rate": 3.990375417098112e-06, "loss": 0.81290758, "num_input_tokens_seen": 21463555, "router_z_loss_clip": 0.75390625, "router_z_loss_mlp": 1.0, "step": 1003, "time_per_iteration": 3.8609912395477295 }, { "auxiliary_loss_clip": 0.01133453, "auxiliary_loss_mlp": 0.01103605, "balance_loss_clip": 1.02340078, "balance_loss_mlp": 1.02931833, "epoch": 0.060363745678641216, "flos": 20376114019200.0, "grad_norm": 2.629000738335318, "language_loss": 0.73495626, "learning_rate": 3.990337217233437e-06, "loss": 0.75732684, "num_input_tokens_seen": 21481990, "router_z_loss_clip": 0.80078125, "router_z_loss_mlp": 1.046875, "step": 1004, "time_per_iteration": 2.411811351776123 }, { "auxiliary_loss_clip": 0.0113358, "auxiliary_loss_mlp": 0.01103084, "balance_loss_clip": 1.0242151, "balance_loss_mlp": 1.02763677, "epoch": 0.06042386893130918, "flos": 17748201582720.0, "grad_norm": 2.7364271437941654, "language_loss": 0.86672491, "learning_rate": 3.990298941894976e-06, "loss": 0.88909155, "num_input_tokens_seen": 21500385, "router_z_loss_clip": 0.7890625, "router_z_loss_mlp": 1.0625, "step": 1005, "time_per_iteration": 2.40199613571167 }, { "auxiliary_loss_clip": 0.01044407, "auxiliary_loss_mlp": 0.01021358, "balance_loss_clip": 1.00161648, "balance_loss_mlp": 1.01261878, "epoch": 0.06048399218397715, "flos": 68535061194240.0, "grad_norm": 0.9332703699946714, "language_loss": 0.59206235, "learning_rate": 3.9902605910841794e-06, "loss": 0.61271989, "num_input_tokens_seen": 21561040, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.31835938, "step": 1006, "time_per_iteration": 4.595729112625122 }, { "auxiliary_loss_clip": 0.01127408, "auxiliary_loss_mlp": 0.0109366, "balance_loss_clip": 1.01679337, "balance_loss_mlp": 1.02574062, "epoch": 0.060544115436645125, "flos": 23257391258880.0, "grad_norm": 2.0632220513829975, "language_loss": 0.78616261, "learning_rate": 3.990222164802503e-06, "loss": 0.80837327, "num_input_tokens_seen": 21580655, "router_z_loss_clip": 0.76953125, "router_z_loss_mlp": 1.015625, "step": 1007, "time_per_iteration": 3.910252571105957 }, { "auxiliary_loss_clip": 0.01128302, "auxiliary_loss_mlp": 0.01096327, "balance_loss_clip": 1.02155876, "balance_loss_mlp": 1.02737665, "epoch": 0.06060423868931309, "flos": 23877309093120.0, "grad_norm": 1.8110772081952613, "language_loss": 0.839454, "learning_rate": 3.9901836630514006e-06, "loss": 0.86170024, "num_input_tokens_seen": 21599650, "router_z_loss_clip": 0.75, "router_z_loss_mlp": 1.0078125, "step": 1008, "time_per_iteration": 2.4718194007873535 }, { "auxiliary_loss_clip": 0.0112531, "auxiliary_loss_mlp": 0.0109185, "balance_loss_clip": 1.02142155, "balance_loss_mlp": 1.02681816, "epoch": 0.06066436194198106, "flos": 18727236328320.0, "grad_norm": 1.680063284662544, "language_loss": 0.81262296, "learning_rate": 3.990145085832335e-06, "loss": 0.83479458, "num_input_tokens_seen": 21617550, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 0.984375, "step": 1009, "time_per_iteration": 2.403305768966675 }, { "auxiliary_loss_clip": 0.01120654, "auxiliary_loss_mlp": 0.01093241, "balance_loss_clip": 1.02152514, "balance_loss_mlp": 1.02543366, "epoch": 0.06072448519464903, "flos": 24639428361600.0, "grad_norm": 1.7696400988824703, "language_loss": 0.9448337, "learning_rate": 3.990106433146769e-06, "loss": 0.96697271, "num_input_tokens_seen": 21635865, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 0.953125, "step": 1010, "time_per_iteration": 2.4620039463043213 }, { "auxiliary_loss_clip": 0.01135929, "auxiliary_loss_mlp": 0.01099439, "balance_loss_clip": 1.02009344, "balance_loss_mlp": 1.02835917, "epoch": 0.060784608447317, "flos": 17378017770240.0, "grad_norm": 2.222828808431505, "language_loss": 0.75482929, "learning_rate": 3.9900677049961665e-06, "loss": 0.777183, "num_input_tokens_seen": 21653945, "router_z_loss_clip": 0.79296875, "router_z_loss_mlp": 1.078125, "step": 1011, "time_per_iteration": 2.4503891468048096 }, { "auxiliary_loss_clip": 0.01125509, "auxiliary_loss_mlp": 0.0109771, "balance_loss_clip": 1.02313316, "balance_loss_mlp": 1.02678943, "epoch": 0.06084473169998497, "flos": 23691187301760.0, "grad_norm": 1.7846043873876567, "language_loss": 0.89260995, "learning_rate": 3.990028901381999e-06, "loss": 0.91484219, "num_input_tokens_seen": 21671230, "router_z_loss_clip": 0.74609375, "router_z_loss_mlp": 0.98828125, "step": 1012, "time_per_iteration": 2.4589455127716064 }, { "auxiliary_loss_clip": 0.01124339, "auxiliary_loss_mlp": 0.01093836, "balance_loss_clip": 1.02231002, "balance_loss_mlp": 1.02477717, "epoch": 0.06090485495265294, "flos": 23545320174720.0, "grad_norm": 1.7186659283016437, "language_loss": 0.79806948, "learning_rate": 3.989990022305734e-06, "loss": 0.82025123, "num_input_tokens_seen": 21691155, "router_z_loss_clip": 0.71484375, "router_z_loss_mlp": 0.99609375, "step": 1013, "time_per_iteration": 2.413548707962036 }, { "auxiliary_loss_clip": 0.01130347, "auxiliary_loss_mlp": 0.01100286, "balance_loss_clip": 1.02547002, "balance_loss_mlp": 1.02708149, "epoch": 0.06096497820532091, "flos": 20338268238720.0, "grad_norm": 2.2365622617191647, "language_loss": 0.8890422, "learning_rate": 3.98995106776885e-06, "loss": 0.91134852, "num_input_tokens_seen": 21707405, "router_z_loss_clip": 0.74609375, "router_z_loss_mlp": 1.03125, "step": 1014, "time_per_iteration": 2.461885452270508 }, { "auxiliary_loss_clip": 0.01132685, "auxiliary_loss_mlp": 0.01098485, "balance_loss_clip": 1.02552927, "balance_loss_mlp": 1.02941501, "epoch": 0.061025101457988874, "flos": 26937935320320.0, "grad_norm": 1.9892464152835103, "language_loss": 0.77418143, "learning_rate": 3.98991203777282e-06, "loss": 0.79649317, "num_input_tokens_seen": 21728090, "router_z_loss_clip": 0.7265625, "router_z_loss_mlp": 1.03125, "step": 1015, "time_per_iteration": 2.461822509765625 }, { "auxiliary_loss_clip": 0.01119242, "auxiliary_loss_mlp": 0.01093237, "balance_loss_clip": 1.0259552, "balance_loss_mlp": 1.02563667, "epoch": 0.061085224710656846, "flos": 25373861055360.0, "grad_norm": 1.9335358993192098, "language_loss": 0.81727445, "learning_rate": 3.9898729323191275e-06, "loss": 0.83939916, "num_input_tokens_seen": 21747950, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 0.9375, "step": 1016, "time_per_iteration": 2.518801689147949 }, { "auxiliary_loss_clip": 0.01124394, "auxiliary_loss_mlp": 0.01083535, "balance_loss_clip": 1.01463199, "balance_loss_mlp": 1.02592719, "epoch": 0.06114534796332482, "flos": 24823664939520.0, "grad_norm": 1.9487878983565214, "language_loss": 0.78400135, "learning_rate": 3.989833751409254e-06, "loss": 0.80608064, "num_input_tokens_seen": 21767900, "router_z_loss_clip": 0.6875, "router_z_loss_mlp": 0.984375, "step": 1017, "time_per_iteration": 2.432973861694336 }, { "auxiliary_loss_clip": 0.01135069, "auxiliary_loss_mlp": 0.01108516, "balance_loss_clip": 1.03241277, "balance_loss_mlp": 1.0312767, "epoch": 0.061205471215992784, "flos": 20630386517760.0, "grad_norm": 1.6789295244751747, "language_loss": 0.88619113, "learning_rate": 3.989794495044685e-06, "loss": 0.90862697, "num_input_tokens_seen": 21787375, "router_z_loss_clip": 0.76171875, "router_z_loss_mlp": 1.03125, "step": 1018, "time_per_iteration": 2.427917957305908 }, { "auxiliary_loss_clip": 0.01121603, "auxiliary_loss_mlp": 0.01100773, "balance_loss_clip": 1.03120232, "balance_loss_mlp": 1.02634573, "epoch": 0.061265594468660756, "flos": 16507423307520.0, "grad_norm": 3.73986103754787, "language_loss": 0.8295331, "learning_rate": 3.989755163226909e-06, "loss": 0.85175681, "num_input_tokens_seen": 21806275, "router_z_loss_clip": 0.6953125, "router_z_loss_mlp": 0.953125, "step": 1019, "time_per_iteration": 2.4019839763641357 }, { "auxiliary_loss_clip": 0.01125408, "auxiliary_loss_mlp": 0.01091075, "balance_loss_clip": 1.02026463, "balance_loss_mlp": 1.02759457, "epoch": 0.06132571772132872, "flos": 26245118833920.0, "grad_norm": 1.8426924374851184, "language_loss": 0.85705459, "learning_rate": 3.989715755957418e-06, "loss": 0.87921941, "num_input_tokens_seen": 21826430, "router_z_loss_clip": 0.70703125, "router_z_loss_mlp": 0.9765625, "step": 1020, "time_per_iteration": 2.4584877490997314 }, { "auxiliary_loss_clip": 0.01122658, "auxiliary_loss_mlp": 0.01089941, "balance_loss_clip": 1.02261138, "balance_loss_mlp": 1.02635193, "epoch": 0.06138584097399669, "flos": 37413275385600.0, "grad_norm": 1.9112892155786403, "language_loss": 0.81077987, "learning_rate": 3.989676273237705e-06, "loss": 0.83290589, "num_input_tokens_seen": 21847800, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 0.96484375, "step": 1021, "time_per_iteration": 2.543074607849121 }, { "auxiliary_loss_clip": 0.01120304, "auxiliary_loss_mlp": 0.01093257, "balance_loss_clip": 1.02692902, "balance_loss_mlp": 1.02532387, "epoch": 0.061445964226664665, "flos": 17419703800320.0, "grad_norm": 1.9207476478332044, "language_loss": 0.90067899, "learning_rate": 3.9896367150692705e-06, "loss": 0.92281461, "num_input_tokens_seen": 21863385, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 0.94921875, "step": 1022, "time_per_iteration": 2.3972439765930176 }, { "auxiliary_loss_clip": 0.01120043, "auxiliary_loss_mlp": 0.01085176, "balance_loss_clip": 1.01894331, "balance_loss_mlp": 1.02614224, "epoch": 0.06150608747933263, "flos": 22598964328320.0, "grad_norm": 1.5899042550673956, "language_loss": 0.84694982, "learning_rate": 3.989597081453611e-06, "loss": 0.86900198, "num_input_tokens_seen": 21881880, "router_z_loss_clip": 0.66015625, "router_z_loss_mlp": 0.94140625, "step": 1023, "time_per_iteration": 2.425142526626587 }, { "auxiliary_loss_clip": 0.01042381, "auxiliary_loss_mlp": 0.01032013, "balance_loss_clip": 1.01723135, "balance_loss_mlp": 1.01605856, "epoch": 0.0615662107320006, "flos": 56738712816000.0, "grad_norm": 0.9173402749138183, "language_loss": 0.65254587, "learning_rate": 3.989557372392231e-06, "loss": 0.67328978, "num_input_tokens_seen": 21940550, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.26367188, "step": 1024, "time_per_iteration": 3.051473617553711 }, { "auxiliary_loss_clip": 0.01123627, "auxiliary_loss_mlp": 0.01091455, "balance_loss_clip": 1.02102637, "balance_loss_mlp": 1.02805865, "epoch": 0.06162633398466857, "flos": 22563701988480.0, "grad_norm": 1.929982686054449, "language_loss": 0.9136706, "learning_rate": 3.989517587886636e-06, "loss": 0.93582141, "num_input_tokens_seen": 21958390, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 0.953125, "step": 1025, "time_per_iteration": 2.418164014816284 }, { "auxiliary_loss_clip": 0.01121206, "auxiliary_loss_mlp": 0.0108356, "balance_loss_clip": 1.01797104, "balance_loss_mlp": 1.02517581, "epoch": 0.06168645723733654, "flos": 25591928607360.0, "grad_norm": 1.5744310892839763, "language_loss": 0.86198819, "learning_rate": 3.989477727938335e-06, "loss": 0.88403589, "num_input_tokens_seen": 21978625, "router_z_loss_clip": 0.65625, "router_z_loss_mlp": 0.9609375, "step": 1026, "time_per_iteration": 2.4618115425109863 }, { "auxiliary_loss_clip": 0.01125572, "auxiliary_loss_mlp": 0.01096842, "balance_loss_clip": 1.02813005, "balance_loss_mlp": 1.02566648, "epoch": 0.06174658049000451, "flos": 15996993096960.0, "grad_norm": 2.0899207214694018, "language_loss": 0.85217607, "learning_rate": 3.989437792548839e-06, "loss": 0.8744002, "num_input_tokens_seen": 21996035, "router_z_loss_clip": 0.6875, "router_z_loss_mlp": 1.0, "step": 1027, "time_per_iteration": 2.383760690689087 }, { "auxiliary_loss_clip": 0.01120497, "auxiliary_loss_mlp": 0.01090677, "balance_loss_clip": 1.02816403, "balance_loss_mlp": 1.02672338, "epoch": 0.06180670374267248, "flos": 11285324674560.0, "grad_norm": 2.2859821390063115, "language_loss": 0.86628294, "learning_rate": 3.989397781719663e-06, "loss": 0.88839465, "num_input_tokens_seen": 22011625, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.9375, "step": 1028, "time_per_iteration": 2.3931970596313477 }, { "auxiliary_loss_clip": 0.01040075, "auxiliary_loss_mlp": 0.01024563, "balance_loss_clip": 1.01130724, "balance_loss_mlp": 1.01365328, "epoch": 0.06186682699534045, "flos": 65127224695680.0, "grad_norm": 0.9543270552395677, "language_loss": 0.60728455, "learning_rate": 3.989357695452323e-06, "loss": 0.627931, "num_input_tokens_seen": 22066035, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.265625, "step": 1029, "time_per_iteration": 2.911198377609253 }, { "auxiliary_loss_clip": 0.01122117, "auxiliary_loss_mlp": 0.0110372, "balance_loss_clip": 1.0340066, "balance_loss_mlp": 1.02625489, "epoch": 0.061926950248008414, "flos": 21104681604480.0, "grad_norm": 1.9962560984165834, "language_loss": 0.85088837, "learning_rate": 3.98931753374834e-06, "loss": 0.87314671, "num_input_tokens_seen": 22085015, "router_z_loss_clip": 0.6953125, "router_z_loss_mlp": 0.95703125, "step": 1030, "time_per_iteration": 2.4927802085876465 }, { "auxiliary_loss_clip": 0.01126456, "auxiliary_loss_mlp": 0.01111307, "balance_loss_clip": 1.03982925, "balance_loss_mlp": 1.02838099, "epoch": 0.061987073500676386, "flos": 17747503355520.0, "grad_norm": 2.381949717331762, "language_loss": 0.84579551, "learning_rate": 3.989277296609237e-06, "loss": 0.86817312, "num_input_tokens_seen": 22102775, "router_z_loss_clip": 0.71484375, "router_z_loss_mlp": 0.98046875, "step": 1031, "time_per_iteration": 2.5611655712127686 }, { "auxiliary_loss_clip": 0.01119073, "auxiliary_loss_mlp": 0.01102307, "balance_loss_clip": 1.03755212, "balance_loss_mlp": 1.02531815, "epoch": 0.06204719675334436, "flos": 21835134403200.0, "grad_norm": 1.4828372411818636, "language_loss": 0.78956044, "learning_rate": 3.98923698403654e-06, "loss": 0.81177419, "num_input_tokens_seen": 22121680, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 0.9375, "step": 1032, "time_per_iteration": 2.5441999435424805 }, { "auxiliary_loss_clip": 0.01128614, "auxiliary_loss_mlp": 0.01093413, "balance_loss_clip": 1.02675104, "balance_loss_mlp": 1.02712679, "epoch": 0.06210732000601232, "flos": 19352705068800.0, "grad_norm": 2.128034416515746, "language_loss": 0.92314547, "learning_rate": 3.989196596031776e-06, "loss": 0.94536573, "num_input_tokens_seen": 22138155, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 1.015625, "step": 1033, "time_per_iteration": 2.51632022857666 }, { "auxiliary_loss_clip": 0.01125029, "auxiliary_loss_mlp": 0.01094553, "balance_loss_clip": 1.03439975, "balance_loss_mlp": 1.02745986, "epoch": 0.062167443258680295, "flos": 24748357403520.0, "grad_norm": 2.2873166466435784, "language_loss": 0.88127214, "learning_rate": 3.989156132596479e-06, "loss": 0.90346795, "num_input_tokens_seen": 22157420, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.9765625, "step": 1034, "time_per_iteration": 2.508610486984253 }, { "auxiliary_loss_clip": 0.01116404, "auxiliary_loss_mlp": 0.01085141, "balance_loss_clip": 1.02632368, "balance_loss_mlp": 1.02681756, "epoch": 0.06222756651134827, "flos": 34457074634880.0, "grad_norm": 1.8945276774460322, "language_loss": 0.83572543, "learning_rate": 3.989115593732182e-06, "loss": 0.85774082, "num_input_tokens_seen": 22178620, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.8984375, "step": 1035, "time_per_iteration": 2.509218454360962 }, { "auxiliary_loss_clip": 0.01124265, "auxiliary_loss_mlp": 0.01078861, "balance_loss_clip": 1.01563263, "balance_loss_mlp": 1.03003216, "epoch": 0.06228768976401623, "flos": 25665281107200.0, "grad_norm": 1.8757286985992025, "language_loss": 0.81553674, "learning_rate": 3.989074979440421e-06, "loss": 0.83756799, "num_input_tokens_seen": 22197125, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.94140625, "step": 1036, "time_per_iteration": 2.500920295715332 }, { "auxiliary_loss_clip": 0.01120647, "auxiliary_loss_mlp": 0.010817, "balance_loss_clip": 1.02149892, "balance_loss_mlp": 1.02839124, "epoch": 0.062347813016684205, "flos": 25294608535680.0, "grad_norm": 1.840717521557544, "language_loss": 0.8892284, "learning_rate": 3.989034289722739e-06, "loss": 0.9112519, "num_input_tokens_seen": 22217575, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.921875, "step": 1037, "time_per_iteration": 2.5002129077911377 }, { "auxiliary_loss_clip": 0.01123696, "auxiliary_loss_mlp": 0.01092036, "balance_loss_clip": 1.02456403, "balance_loss_mlp": 1.03040874, "epoch": 0.06240793626935217, "flos": 26905815002880.0, "grad_norm": 2.066469722728655, "language_loss": 0.84340858, "learning_rate": 3.988993524580676e-06, "loss": 0.8655659, "num_input_tokens_seen": 22236840, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 0.9296875, "step": 1038, "time_per_iteration": 2.5259828567504883 }, { "auxiliary_loss_clip": 0.01124296, "auxiliary_loss_mlp": 0.0109562, "balance_loss_clip": 1.03155708, "balance_loss_mlp": 1.0315671, "epoch": 0.06246805952202014, "flos": 21614727790080.0, "grad_norm": 2.079839169787104, "language_loss": 0.89440054, "learning_rate": 3.98895268401578e-06, "loss": 0.91659975, "num_input_tokens_seen": 22256465, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 0.9296875, "step": 1039, "time_per_iteration": 2.464094638824463 }, { "auxiliary_loss_clip": 0.01121981, "auxiliary_loss_mlp": 0.01092519, "balance_loss_clip": 1.02666807, "balance_loss_mlp": 1.02759457, "epoch": 0.0625281827746881, "flos": 19311053950080.0, "grad_norm": 2.024780827523462, "language_loss": 0.83123702, "learning_rate": 3.9889117680296e-06, "loss": 0.85338205, "num_input_tokens_seen": 22274025, "router_z_loss_clip": 0.66015625, "router_z_loss_mlp": 0.9453125, "step": 1040, "time_per_iteration": 2.4867732524871826 }, { "auxiliary_loss_clip": 0.01120928, "auxiliary_loss_mlp": 0.01091156, "balance_loss_clip": 1.03031123, "balance_loss_mlp": 1.03144467, "epoch": 0.06258830602735609, "flos": 27744533527680.0, "grad_norm": 2.3310679075405, "language_loss": 0.73142481, "learning_rate": 3.988870776623685e-06, "loss": 0.75354564, "num_input_tokens_seen": 22292245, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.89453125, "step": 1041, "time_per_iteration": 2.522988796234131 }, { "auxiliary_loss_clip": 0.01119462, "auxiliary_loss_mlp": 0.010963, "balance_loss_clip": 1.03006768, "balance_loss_mlp": 1.02643991, "epoch": 0.06264842928002405, "flos": 23221465603200.0, "grad_norm": 1.963326904830616, "language_loss": 0.84113556, "learning_rate": 3.9888297097995905e-06, "loss": 0.86329317, "num_input_tokens_seen": 22311455, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 0.9296875, "step": 1042, "time_per_iteration": 2.454434633255005 }, { "auxiliary_loss_clip": 0.01121524, "auxiliary_loss_mlp": 0.01090381, "balance_loss_clip": 1.03072917, "balance_loss_mlp": 1.0271163, "epoch": 0.06270855253269202, "flos": 38397965771520.0, "grad_norm": 1.6266546397167794, "language_loss": 0.79744434, "learning_rate": 3.988788567558874e-06, "loss": 0.81956339, "num_input_tokens_seen": 22333750, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.9453125, "step": 1043, "time_per_iteration": 4.0658957958221436 }, { "auxiliary_loss_clip": 0.01114728, "auxiliary_loss_mlp": 0.01079668, "balance_loss_clip": 1.02123189, "balance_loss_mlp": 1.02563965, "epoch": 0.06276867578535998, "flos": 22452503708160.0, "grad_norm": 1.97361183981175, "language_loss": 0.95319337, "learning_rate": 3.988747349903097e-06, "loss": 0.97513735, "num_input_tokens_seen": 22351940, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.890625, "step": 1044, "time_per_iteration": 2.4186480045318604 }, { "auxiliary_loss_clip": 0.01120944, "auxiliary_loss_mlp": 0.01085232, "balance_loss_clip": 1.02355337, "balance_loss_mlp": 1.02598763, "epoch": 0.06282879903802796, "flos": 22929312412800.0, "grad_norm": 1.936695404506749, "language_loss": 0.8825779, "learning_rate": 3.988706056833821e-06, "loss": 0.9046396, "num_input_tokens_seen": 22372085, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.94921875, "step": 1045, "time_per_iteration": 2.4632999897003174 }, { "auxiliary_loss_clip": 0.01115101, "auxiliary_loss_mlp": 0.01082772, "balance_loss_clip": 1.02517033, "balance_loss_mlp": 1.02473474, "epoch": 0.06288892229069593, "flos": 34817937114240.0, "grad_norm": 2.219264745329713, "language_loss": 0.81954104, "learning_rate": 3.9886646883526125e-06, "loss": 0.84151971, "num_input_tokens_seen": 22392020, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.90625, "step": 1046, "time_per_iteration": 5.377129316329956 }, { "auxiliary_loss_clip": 0.01118294, "auxiliary_loss_mlp": 0.01093742, "balance_loss_clip": 1.03168166, "balance_loss_mlp": 1.0258224, "epoch": 0.06294904554336389, "flos": 19426127391360.0, "grad_norm": 2.047324705182207, "language_loss": 0.80351937, "learning_rate": 3.988623244461039e-06, "loss": 0.82563967, "num_input_tokens_seen": 22411180, "router_z_loss_clip": 0.62109375, "router_z_loss_mlp": 0.92578125, "step": 1047, "time_per_iteration": 3.8437821865081787 }, { "auxiliary_loss_clip": 0.01126066, "auxiliary_loss_mlp": 0.01092804, "balance_loss_clip": 1.02752519, "balance_loss_mlp": 1.02771652, "epoch": 0.06300916879603187, "flos": 40660267783680.0, "grad_norm": 2.791299523368332, "language_loss": 0.79515481, "learning_rate": 3.988581725160672e-06, "loss": 0.81734347, "num_input_tokens_seen": 22435105, "router_z_loss_clip": 0.65234375, "router_z_loss_mlp": 0.984375, "step": 1048, "time_per_iteration": 2.576902389526367 }, { "auxiliary_loss_clip": 0.01122805, "auxiliary_loss_mlp": 0.01087589, "balance_loss_clip": 1.02552867, "balance_loss_mlp": 1.02606678, "epoch": 0.06306929204869983, "flos": 23803048897920.0, "grad_norm": 2.0886795309169455, "language_loss": 0.80409116, "learning_rate": 3.988540130453087e-06, "loss": 0.82619512, "num_input_tokens_seen": 22452710, "router_z_loss_clip": 0.62109375, "router_z_loss_mlp": 0.96875, "step": 1049, "time_per_iteration": 2.4208083152770996 }, { "auxiliary_loss_clip": 0.01119807, "auxiliary_loss_mlp": 0.01092957, "balance_loss_clip": 1.02505553, "balance_loss_mlp": 1.02550459, "epoch": 0.0631294153013678, "flos": 18914824396800.0, "grad_norm": 2.0890934669558656, "language_loss": 0.85480368, "learning_rate": 3.988498460339862e-06, "loss": 0.87693131, "num_input_tokens_seen": 22470175, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 0.9453125, "step": 1050, "time_per_iteration": 2.4068799018859863 }, { "auxiliary_loss_clip": 0.01117889, "auxiliary_loss_mlp": 0.01074428, "balance_loss_clip": 1.01751745, "balance_loss_mlp": 1.02725577, "epoch": 0.06318953855403578, "flos": 24279019729920.0, "grad_norm": 1.7001516318288077, "language_loss": 0.79598558, "learning_rate": 3.988456714822575e-06, "loss": 0.81790876, "num_input_tokens_seen": 22490020, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.90625, "step": 1051, "time_per_iteration": 2.4582767486572266 }, { "auxiliary_loss_clip": 0.01123244, "auxiliary_loss_mlp": 0.01092525, "balance_loss_clip": 1.02853346, "balance_loss_mlp": 1.02813864, "epoch": 0.06324966180670374, "flos": 22527811244160.0, "grad_norm": 2.3002736669031187, "language_loss": 0.84092963, "learning_rate": 3.98841489390281e-06, "loss": 0.8630873, "num_input_tokens_seen": 22509685, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 0.953125, "step": 1052, "time_per_iteration": 2.4346559047698975 }, { "auxiliary_loss_clip": 0.01122476, "auxiliary_loss_mlp": 0.01088316, "balance_loss_clip": 1.02537346, "balance_loss_mlp": 1.02801478, "epoch": 0.06330978505937171, "flos": 15777214888320.0, "grad_norm": 1.9936604449039912, "language_loss": 0.82026523, "learning_rate": 3.988372997582155e-06, "loss": 0.84237313, "num_input_tokens_seen": 22527905, "router_z_loss_clip": 0.62890625, "router_z_loss_mlp": 0.9453125, "step": 1053, "time_per_iteration": 2.3937292098999023 }, { "auxiliary_loss_clip": 0.01122594, "auxiliary_loss_mlp": 0.01082185, "balance_loss_clip": 1.01757371, "balance_loss_mlp": 1.02595663, "epoch": 0.06336990831203967, "flos": 21470012737920.0, "grad_norm": 1.8938669357093418, "language_loss": 0.87145114, "learning_rate": 3.988331025862195e-06, "loss": 0.8934989, "num_input_tokens_seen": 22546335, "router_z_loss_clip": 0.64453125, "router_z_loss_mlp": 0.96875, "step": 1054, "time_per_iteration": 2.432512044906616 }, { "auxiliary_loss_clip": 0.01121532, "auxiliary_loss_mlp": 0.01084468, "balance_loss_clip": 1.02472055, "balance_loss_mlp": 1.02723479, "epoch": 0.06343003156470765, "flos": 18477886331520.0, "grad_norm": 2.2997934050843134, "language_loss": 0.88589448, "learning_rate": 3.9882889787445225e-06, "loss": 0.90795445, "num_input_tokens_seen": 22563885, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.9453125, "step": 1055, "time_per_iteration": 2.374807596206665 }, { "auxiliary_loss_clip": 0.01124597, "auxiliary_loss_mlp": 0.01094489, "balance_loss_clip": 1.0252049, "balance_loss_mlp": 1.02733898, "epoch": 0.06349015481737562, "flos": 25153733733120.0, "grad_norm": 3.6571621070237463, "language_loss": 0.87692791, "learning_rate": 3.988246856230734e-06, "loss": 0.89911878, "num_input_tokens_seen": 22583035, "router_z_loss_clip": 0.69140625, "router_z_loss_mlp": 0.9765625, "step": 1056, "time_per_iteration": 2.461125135421753 }, { "auxiliary_loss_clip": 0.0112646, "auxiliary_loss_mlp": 0.01097148, "balance_loss_clip": 1.02276111, "balance_loss_mlp": 1.02575731, "epoch": 0.06355027807004358, "flos": 26870517751680.0, "grad_norm": 2.340082951683335, "language_loss": 0.85647762, "learning_rate": 3.988204658322426e-06, "loss": 0.87871367, "num_input_tokens_seen": 22605055, "router_z_loss_clip": 0.7421875, "router_z_loss_mlp": 1.0078125, "step": 1057, "time_per_iteration": 2.4642553329467773 }, { "auxiliary_loss_clip": 0.01113159, "auxiliary_loss_mlp": 0.01081281, "balance_loss_clip": 1.01905417, "balance_loss_mlp": 1.02528191, "epoch": 0.06361040132271156, "flos": 21395647808640.0, "grad_norm": 1.8343253404194075, "language_loss": 0.85614014, "learning_rate": 3.988162385021196e-06, "loss": 0.87808454, "num_input_tokens_seen": 22623760, "router_z_loss_clip": 0.62109375, "router_z_loss_mlp": 0.87890625, "step": 1058, "time_per_iteration": 2.415755033493042 }, { "auxiliary_loss_clip": 0.01120057, "auxiliary_loss_mlp": 0.01101414, "balance_loss_clip": 1.027933, "balance_loss_mlp": 1.02598333, "epoch": 0.06367052457537953, "flos": 25732733587200.0, "grad_norm": 2.228391340296399, "language_loss": 0.90098339, "learning_rate": 3.988120036328651e-06, "loss": 0.92319798, "num_input_tokens_seen": 22643000, "router_z_loss_clip": 0.734375, "router_z_loss_mlp": 0.9375, "step": 1059, "time_per_iteration": 2.419654369354248 }, { "auxiliary_loss_clip": 0.01122992, "auxiliary_loss_mlp": 0.01080912, "balance_loss_clip": 1.01582408, "balance_loss_mlp": 1.02689362, "epoch": 0.0637306478280475, "flos": 17630684346240.0, "grad_norm": 2.3007004233747494, "language_loss": 0.94364297, "learning_rate": 3.988077612246394e-06, "loss": 0.96568203, "num_input_tokens_seen": 22660460, "router_z_loss_clip": 0.65234375, "router_z_loss_mlp": 0.9609375, "step": 1060, "time_per_iteration": 2.395505428314209 }, { "auxiliary_loss_clip": 0.0111891, "auxiliary_loss_mlp": 0.01084301, "balance_loss_clip": 1.02240729, "balance_loss_mlp": 1.02564621, "epoch": 0.06379077108071547, "flos": 13661757521280.0, "grad_norm": 1.947431582406713, "language_loss": 0.90625364, "learning_rate": 3.988035112776035e-06, "loss": 0.92828572, "num_input_tokens_seen": 22679270, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.93359375, "step": 1061, "time_per_iteration": 2.3723816871643066 }, { "auxiliary_loss_clip": 0.01123006, "auxiliary_loss_mlp": 0.01094058, "balance_loss_clip": 1.02658546, "balance_loss_mlp": 1.0237143, "epoch": 0.06385089433338344, "flos": 28477499944320.0, "grad_norm": 2.381135429320362, "language_loss": 0.80243617, "learning_rate": 3.987992537919185e-06, "loss": 0.8246069, "num_input_tokens_seen": 22699330, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 0.9921875, "step": 1062, "time_per_iteration": 2.4985103607177734 }, { "auxiliary_loss_clip": 0.01120905, "auxiliary_loss_mlp": 0.01081172, "balance_loss_clip": 1.01617932, "balance_loss_mlp": 1.02306414, "epoch": 0.0639110175860514, "flos": 24310057795200.0, "grad_norm": 2.0000955970980203, "language_loss": 0.89178491, "learning_rate": 3.987949887677459e-06, "loss": 0.91380566, "num_input_tokens_seen": 22717945, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 0.9765625, "step": 1063, "time_per_iteration": 2.4254629611968994 }, { "auxiliary_loss_clip": 0.01123622, "auxiliary_loss_mlp": 0.01091497, "balance_loss_clip": 1.0242157, "balance_loss_mlp": 1.02561641, "epoch": 0.06397114083871938, "flos": 22089686192640.0, "grad_norm": 1.96234639238302, "language_loss": 0.8321234, "learning_rate": 3.9879071620524744e-06, "loss": 0.85427451, "num_input_tokens_seen": 22736790, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 0.98046875, "step": 1064, "time_per_iteration": 2.4293994903564453 }, { "auxiliary_loss_clip": 0.01120403, "auxiliary_loss_mlp": 0.01090139, "balance_loss_clip": 1.02567041, "balance_loss_mlp": 1.02647913, "epoch": 0.06403126409138735, "flos": 19571819961600.0, "grad_norm": 2.0400156021515072, "language_loss": 0.8702895, "learning_rate": 3.987864361045851e-06, "loss": 0.89239496, "num_input_tokens_seen": 22754745, "router_z_loss_clip": 0.64453125, "router_z_loss_mlp": 0.9375, "step": 1065, "time_per_iteration": 2.3946800231933594 }, { "auxiliary_loss_clip": 0.01121485, "auxiliary_loss_mlp": 0.0108212, "balance_loss_clip": 1.01774669, "balance_loss_mlp": 1.02662969, "epoch": 0.06409138734405531, "flos": 40805820708480.0, "grad_norm": 1.7385896987598173, "language_loss": 0.70013899, "learning_rate": 3.987821484659211e-06, "loss": 0.722175, "num_input_tokens_seen": 22776780, "router_z_loss_clip": 0.64453125, "router_z_loss_mlp": 0.9453125, "step": 1066, "time_per_iteration": 2.5871217250823975 }, { "auxiliary_loss_clip": 0.01122842, "auxiliary_loss_mlp": 0.01099292, "balance_loss_clip": 1.02810073, "balance_loss_mlp": 1.02791452, "epoch": 0.06415151059672328, "flos": 20440773590400.0, "grad_norm": 2.482341930729839, "language_loss": 0.93501902, "learning_rate": 3.987778532894181e-06, "loss": 0.95724034, "num_input_tokens_seen": 22793915, "router_z_loss_clip": 0.7109375, "router_z_loss_mlp": 0.94921875, "step": 1067, "time_per_iteration": 2.3952934741973877 }, { "auxiliary_loss_clip": 0.01124211, "auxiliary_loss_mlp": 0.01089905, "balance_loss_clip": 1.02395844, "balance_loss_mlp": 1.02650857, "epoch": 0.06421163384939126, "flos": 18071218281600.0, "grad_norm": 2.1168191476421754, "language_loss": 0.8741498, "learning_rate": 3.987735505752391e-06, "loss": 0.89629096, "num_input_tokens_seen": 22812670, "router_z_loss_clip": 0.66015625, "router_z_loss_mlp": 0.9765625, "step": 1068, "time_per_iteration": 2.4017884731292725 }, { "auxiliary_loss_clip": 0.01120972, "auxiliary_loss_mlp": 0.01083717, "balance_loss_clip": 1.023206, "balance_loss_mlp": 1.02648103, "epoch": 0.06427175710205922, "flos": 25118261925120.0, "grad_norm": 2.41480106391959, "language_loss": 0.92427808, "learning_rate": 3.987692403235471e-06, "loss": 0.94632494, "num_input_tokens_seen": 22832440, "router_z_loss_clip": 0.60546875, "router_z_loss_mlp": 0.9453125, "step": 1069, "time_per_iteration": 2.434234857559204 }, { "auxiliary_loss_clip": 0.01125251, "auxiliary_loss_mlp": 0.01095476, "balance_loss_clip": 1.02719295, "balance_loss_mlp": 1.02661729, "epoch": 0.06433188035472719, "flos": 17379693515520.0, "grad_norm": 2.464797247156926, "language_loss": 0.99638212, "learning_rate": 3.987649225345056e-06, "loss": 1.01858938, "num_input_tokens_seen": 22845495, "router_z_loss_clip": 0.68359375, "router_z_loss_mlp": 0.984375, "step": 1070, "time_per_iteration": 2.382152557373047 }, { "auxiliary_loss_clip": 0.01125855, "auxiliary_loss_mlp": 0.01090611, "balance_loss_clip": 1.02018189, "balance_loss_mlp": 1.0274061, "epoch": 0.06439200360739517, "flos": 23545250352000.0, "grad_norm": 1.611547189992938, "language_loss": 0.89884496, "learning_rate": 3.987605972082782e-06, "loss": 0.9210096, "num_input_tokens_seen": 22865390, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 0.984375, "step": 1071, "time_per_iteration": 2.4362175464630127 }, { "auxiliary_loss_clip": 0.01118616, "auxiliary_loss_mlp": 0.01092878, "balance_loss_clip": 1.02373648, "balance_loss_mlp": 1.02501035, "epoch": 0.06445212686006313, "flos": 21978732291840.0, "grad_norm": 1.660792292067065, "language_loss": 0.80094004, "learning_rate": 3.987562643450292e-06, "loss": 0.82305497, "num_input_tokens_seen": 22885495, "router_z_loss_clip": 0.69140625, "router_z_loss_mlp": 0.9375, "step": 1072, "time_per_iteration": 2.434375286102295 }, { "auxiliary_loss_clip": 0.01124437, "auxiliary_loss_mlp": 0.01092933, "balance_loss_clip": 1.02417302, "balance_loss_mlp": 1.02694046, "epoch": 0.0645122501127311, "flos": 25920112187520.0, "grad_norm": 2.112999654607481, "language_loss": 0.83960772, "learning_rate": 3.987519239449226e-06, "loss": 0.86178148, "num_input_tokens_seen": 22904845, "router_z_loss_clip": 0.6875, "router_z_loss_mlp": 0.9765625, "step": 1073, "time_per_iteration": 2.447216510772705 }, { "auxiliary_loss_clip": 0.01121902, "auxiliary_loss_mlp": 0.01090209, "balance_loss_clip": 1.02168775, "balance_loss_mlp": 1.02630675, "epoch": 0.06457237336539907, "flos": 25624956620160.0, "grad_norm": 1.8802980392483422, "language_loss": 0.82009876, "learning_rate": 3.987475760081233e-06, "loss": 0.84221989, "num_input_tokens_seen": 22925940, "router_z_loss_clip": 0.6875, "router_z_loss_mlp": 0.95703125, "step": 1074, "time_per_iteration": 2.4824140071868896 }, { "auxiliary_loss_clip": 0.01121573, "auxiliary_loss_mlp": 0.01094172, "balance_loss_clip": 1.02503073, "balance_loss_mlp": 1.0261662, "epoch": 0.06463249661806704, "flos": 19462960742400.0, "grad_norm": 1.6310999107561297, "language_loss": 0.82577121, "learning_rate": 3.987432205347958e-06, "loss": 0.8479287, "num_input_tokens_seen": 22944375, "router_z_loss_clip": 0.69140625, "router_z_loss_mlp": 0.953125, "step": 1075, "time_per_iteration": 2.394707441329956 }, { "auxiliary_loss_clip": 0.01126363, "auxiliary_loss_mlp": 0.01085825, "balance_loss_clip": 1.01968789, "balance_loss_mlp": 1.02775788, "epoch": 0.064692619870735, "flos": 24496912725120.0, "grad_norm": 3.660783459659181, "language_loss": 0.91883302, "learning_rate": 3.987388575251055e-06, "loss": 0.94095492, "num_input_tokens_seen": 22959145, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 0.98828125, "step": 1076, "time_per_iteration": 2.4476311206817627 }, { "auxiliary_loss_clip": 0.01121398, "auxiliary_loss_mlp": 0.01085063, "balance_loss_clip": 1.01921165, "balance_loss_mlp": 1.02620292, "epoch": 0.06475274312340297, "flos": 17017748784000.0, "grad_norm": 1.8854817509594821, "language_loss": 0.84750068, "learning_rate": 3.98734486979218e-06, "loss": 0.86956525, "num_input_tokens_seen": 22978100, "router_z_loss_clip": 0.66015625, "router_z_loss_mlp": 0.953125, "step": 1077, "time_per_iteration": 2.3808188438415527 }, { "auxiliary_loss_clip": 0.01126964, "auxiliary_loss_mlp": 0.01098882, "balance_loss_clip": 1.02916861, "balance_loss_mlp": 1.02873921, "epoch": 0.06481286637607095, "flos": 24571207831680.0, "grad_norm": 2.015731985806654, "language_loss": 0.94662684, "learning_rate": 3.987301088972986e-06, "loss": 0.96888524, "num_input_tokens_seen": 22997285, "router_z_loss_clip": 0.6953125, "router_z_loss_mlp": 0.98046875, "step": 1078, "time_per_iteration": 2.44041109085083 }, { "auxiliary_loss_clip": 0.01131238, "auxiliary_loss_mlp": 0.01097308, "balance_loss_clip": 1.02730823, "balance_loss_mlp": 1.02946138, "epoch": 0.06487298962873891, "flos": 21104576870400.0, "grad_norm": 1.8996078223216881, "language_loss": 0.81335634, "learning_rate": 3.987257232795137e-06, "loss": 0.83564186, "num_input_tokens_seen": 23016285, "router_z_loss_clip": 0.69921875, "router_z_loss_mlp": 1.015625, "step": 1079, "time_per_iteration": 2.547140121459961 }, { "auxiliary_loss_clip": 0.01122766, "auxiliary_loss_mlp": 0.01092215, "balance_loss_clip": 1.02393222, "balance_loss_mlp": 1.02702093, "epoch": 0.06493311288140688, "flos": 24607028753280.0, "grad_norm": 1.6574004791619237, "language_loss": 0.72627282, "learning_rate": 3.987213301260294e-06, "loss": 0.74842262, "num_input_tokens_seen": 23036420, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 0.9609375, "step": 1080, "time_per_iteration": 2.4899046421051025 }, { "auxiliary_loss_clip": 0.01121816, "auxiliary_loss_mlp": 0.01081533, "balance_loss_clip": 1.01787555, "balance_loss_mlp": 1.02541375, "epoch": 0.06499323613407486, "flos": 25336818236160.0, "grad_norm": 1.8396558955000455, "language_loss": 0.7595588, "learning_rate": 3.987169294370123e-06, "loss": 0.78159231, "num_input_tokens_seen": 23056945, "router_z_loss_clip": 0.63671875, "router_z_loss_mlp": 0.96484375, "step": 1081, "time_per_iteration": 2.4479482173919678 }, { "auxiliary_loss_clip": 0.01118475, "auxiliary_loss_mlp": 0.0108487, "balance_loss_clip": 1.02097356, "balance_loss_mlp": 1.02585471, "epoch": 0.06505335938674282, "flos": 20374682653440.0, "grad_norm": 2.3818817102333636, "language_loss": 0.87467372, "learning_rate": 3.987125212126294e-06, "loss": 0.89670718, "num_input_tokens_seen": 23074940, "router_z_loss_clip": 0.63671875, "router_z_loss_mlp": 0.92578125, "step": 1082, "time_per_iteration": 2.3888351917266846 }, { "auxiliary_loss_clip": 0.01133713, "auxiliary_loss_mlp": 0.0110356, "balance_loss_clip": 1.02912593, "balance_loss_mlp": 1.02908659, "epoch": 0.06511348263941079, "flos": 25336748413440.0, "grad_norm": 2.062456197724076, "language_loss": 0.8509053, "learning_rate": 3.987081054530478e-06, "loss": 0.87327802, "num_input_tokens_seen": 23093420, "router_z_loss_clip": 0.74609375, "router_z_loss_mlp": 1.046875, "step": 1083, "time_per_iteration": 3.9143543243408203 }, { "auxiliary_loss_clip": 0.01119878, "auxiliary_loss_mlp": 0.01083064, "balance_loss_clip": 1.01988316, "balance_loss_mlp": 1.0279175, "epoch": 0.06517360589207877, "flos": 20331949282560.0, "grad_norm": 2.651307631539728, "language_loss": 0.83543038, "learning_rate": 3.987036821584348e-06, "loss": 0.85745978, "num_input_tokens_seen": 23111550, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.91796875, "step": 1084, "time_per_iteration": 3.856062173843384 }, { "auxiliary_loss_clip": 0.01119746, "auxiliary_loss_mlp": 0.01075087, "balance_loss_clip": 1.0174613, "balance_loss_mlp": 1.02678192, "epoch": 0.06523372914474673, "flos": 31680432339840.0, "grad_norm": 2.27366259774032, "language_loss": 0.68827015, "learning_rate": 3.986992513289584e-06, "loss": 0.71021849, "num_input_tokens_seen": 23130335, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.9296875, "step": 1085, "time_per_iteration": 2.518594264984131 }, { "auxiliary_loss_clip": 0.011177, "auxiliary_loss_mlp": 0.01073352, "balance_loss_clip": 1.01834846, "balance_loss_mlp": 1.0260303, "epoch": 0.0652938523974147, "flos": 20777091517440.0, "grad_norm": 1.907629077623935, "language_loss": 0.79065573, "learning_rate": 3.9869481296478645e-06, "loss": 0.81256628, "num_input_tokens_seen": 23152380, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.9140625, "step": 1086, "time_per_iteration": 3.844931125640869 }, { "auxiliary_loss_clip": 0.01118694, "auxiliary_loss_mlp": 0.01072338, "balance_loss_clip": 1.01819277, "balance_loss_mlp": 1.02719188, "epoch": 0.06535397565008266, "flos": 16690053962880.0, "grad_norm": 2.1770142014706977, "language_loss": 0.87295473, "learning_rate": 3.986903670660872e-06, "loss": 0.89486504, "num_input_tokens_seen": 23171630, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.9140625, "step": 1087, "time_per_iteration": 3.793470859527588 }, { "auxiliary_loss_clip": 0.01119643, "auxiliary_loss_mlp": 0.01086772, "balance_loss_clip": 1.02654719, "balance_loss_mlp": 1.02743101, "epoch": 0.06541409890275064, "flos": 26867061527040.0, "grad_norm": 1.9955980550976617, "language_loss": 0.80957699, "learning_rate": 3.9868591363302945e-06, "loss": 0.83164108, "num_input_tokens_seen": 23192520, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.921875, "step": 1088, "time_per_iteration": 2.4624691009521484 }, { "auxiliary_loss_clip": 0.01117962, "auxiliary_loss_mlp": 0.01077206, "balance_loss_clip": 1.01834023, "balance_loss_mlp": 1.0256604, "epoch": 0.06547422215541861, "flos": 20520584691840.0, "grad_norm": 1.8567809076853405, "language_loss": 0.73969108, "learning_rate": 3.9868145266578186e-06, "loss": 0.76164275, "num_input_tokens_seen": 23210710, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 0.921875, "step": 1089, "time_per_iteration": 2.428581476211548 }, { "auxiliary_loss_clip": 0.0111687, "auxiliary_loss_mlp": 0.01080845, "balance_loss_clip": 1.02803564, "balance_loss_mlp": 1.02648377, "epoch": 0.06553434540808657, "flos": 22015565642880.0, "grad_norm": 1.6623054340837042, "language_loss": 0.8774364, "learning_rate": 3.9867698416451366e-06, "loss": 0.89941359, "num_input_tokens_seen": 23230305, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.90234375, "step": 1090, "time_per_iteration": 2.433183193206787 }, { "auxiliary_loss_clip": 0.01117311, "auxiliary_loss_mlp": 0.01077902, "balance_loss_clip": 1.01877451, "balance_loss_mlp": 1.02709794, "epoch": 0.06559446866075455, "flos": 24607482600960.0, "grad_norm": 1.8544865479239219, "language_loss": 0.7470156, "learning_rate": 3.9867250812939434e-06, "loss": 0.76896769, "num_input_tokens_seen": 23249015, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 0.90234375, "step": 1091, "time_per_iteration": 2.447695255279541 }, { "auxiliary_loss_clip": 0.01117971, "auxiliary_loss_mlp": 0.0108346, "balance_loss_clip": 1.02428472, "balance_loss_mlp": 1.02604508, "epoch": 0.06565459191342252, "flos": 24273678291840.0, "grad_norm": 2.286918635892109, "language_loss": 0.85563594, "learning_rate": 3.986680245605936e-06, "loss": 0.87765026, "num_input_tokens_seen": 23265105, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.921875, "step": 1092, "time_per_iteration": 2.4320003986358643 }, { "auxiliary_loss_clip": 0.0112053, "auxiliary_loss_mlp": 0.01087363, "balance_loss_clip": 1.02699494, "balance_loss_mlp": 1.02588415, "epoch": 0.06571471516609048, "flos": 24786063538560.0, "grad_norm": 2.2270837940348107, "language_loss": 0.73814356, "learning_rate": 3.986635334582814e-06, "loss": 0.76022249, "num_input_tokens_seen": 23283950, "router_z_loss_clip": 0.60546875, "router_z_loss_mlp": 0.9453125, "step": 1093, "time_per_iteration": 2.4481000900268555 }, { "auxiliary_loss_clip": 0.01118743, "auxiliary_loss_mlp": 0.01082699, "balance_loss_clip": 1.02113914, "balance_loss_mlp": 1.02671051, "epoch": 0.06577483841875846, "flos": 26212858871040.0, "grad_norm": 1.604752633499389, "language_loss": 0.90374291, "learning_rate": 3.986590348226282e-06, "loss": 0.92575741, "num_input_tokens_seen": 23305005, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.921875, "step": 1094, "time_per_iteration": 2.4544970989227295 }, { "auxiliary_loss_clip": 0.01120645, "auxiliary_loss_mlp": 0.01081226, "balance_loss_clip": 1.02033389, "balance_loss_mlp": 1.02774262, "epoch": 0.06583496167142643, "flos": 25079683006080.0, "grad_norm": 1.5905818614331495, "language_loss": 0.83708948, "learning_rate": 3.986545286538044e-06, "loss": 0.85910821, "num_input_tokens_seen": 23323220, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.9296875, "step": 1095, "time_per_iteration": 2.4460747241973877 }, { "auxiliary_loss_clip": 0.01118077, "auxiliary_loss_mlp": 0.01077466, "balance_loss_clip": 1.02241492, "balance_loss_mlp": 1.02531183, "epoch": 0.06589508492409439, "flos": 25628622312960.0, "grad_norm": 2.1936971704059385, "language_loss": 0.7455287, "learning_rate": 3.986500149519811e-06, "loss": 0.76748407, "num_input_tokens_seen": 23342235, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.9296875, "step": 1096, "time_per_iteration": 2.473275899887085 }, { "auxiliary_loss_clip": 0.0111817, "auxiliary_loss_mlp": 0.01070183, "balance_loss_clip": 1.01775455, "balance_loss_mlp": 1.0258646, "epoch": 0.06595520817676236, "flos": 23620173863040.0, "grad_norm": 1.9152610164693724, "language_loss": 0.79522955, "learning_rate": 3.986454937173292e-06, "loss": 0.81711304, "num_input_tokens_seen": 23363680, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.921875, "step": 1097, "time_per_iteration": 2.443470001220703 }, { "auxiliary_loss_clip": 0.01121931, "auxiliary_loss_mlp": 0.01078039, "balance_loss_clip": 1.02201068, "balance_loss_mlp": 1.02572334, "epoch": 0.06601533142943034, "flos": 33800323449600.0, "grad_norm": 2.0040873426898913, "language_loss": 0.80652416, "learning_rate": 3.986409649500203e-06, "loss": 0.82852387, "num_input_tokens_seen": 23385590, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.9609375, "step": 1098, "time_per_iteration": 2.5048789978027344 }, { "auxiliary_loss_clip": 0.01115229, "auxiliary_loss_mlp": 0.01081685, "balance_loss_clip": 1.0231055, "balance_loss_mlp": 1.02524841, "epoch": 0.0660754546820983, "flos": 20258352403200.0, "grad_norm": 1.8866886674671082, "language_loss": 0.84760594, "learning_rate": 3.986364286502261e-06, "loss": 0.86957514, "num_input_tokens_seen": 23402945, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.8984375, "step": 1099, "time_per_iteration": 2.405137777328491 }, { "auxiliary_loss_clip": 0.01113493, "auxiliary_loss_mlp": 0.01071534, "balance_loss_clip": 1.01891482, "balance_loss_mlp": 1.02531719, "epoch": 0.06613557793476627, "flos": 19353158916480.0, "grad_norm": 1.9630346137601873, "language_loss": 0.86324638, "learning_rate": 3.986318848181186e-06, "loss": 0.88509667, "num_input_tokens_seen": 23421410, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.87890625, "step": 1100, "time_per_iteration": 2.3810274600982666 }, { "auxiliary_loss_clip": 0.0111834, "auxiliary_loss_mlp": 0.01085337, "balance_loss_clip": 1.02427745, "balance_loss_mlp": 1.02663732, "epoch": 0.06619570118743424, "flos": 13771698992640.0, "grad_norm": 2.194754349172351, "language_loss": 0.76269424, "learning_rate": 3.986273334538702e-06, "loss": 0.78473103, "num_input_tokens_seen": 23438870, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.9140625, "step": 1101, "time_per_iteration": 2.4132373332977295 }, { "auxiliary_loss_clip": 0.01115933, "auxiliary_loss_mlp": 0.01079949, "balance_loss_clip": 1.02053523, "balance_loss_mlp": 1.02458668, "epoch": 0.06625582444010221, "flos": 17856921156480.0, "grad_norm": 2.2659518661782663, "language_loss": 0.90404302, "learning_rate": 3.986227745576533e-06, "loss": 0.92600191, "num_input_tokens_seen": 23456975, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.9140625, "step": 1102, "time_per_iteration": 2.389693260192871 }, { "auxiliary_loss_clip": 0.01117092, "auxiliary_loss_mlp": 0.01087484, "balance_loss_clip": 1.02656841, "balance_loss_mlp": 1.02725148, "epoch": 0.06631594769277017, "flos": 11837894762880.0, "grad_norm": 2.100413544997422, "language_loss": 0.85788357, "learning_rate": 3.98618208129641e-06, "loss": 0.87992936, "num_input_tokens_seen": 23473440, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.8984375, "step": 1103, "time_per_iteration": 2.3741064071655273 }, { "auxiliary_loss_clip": 0.01113874, "auxiliary_loss_mlp": 0.01077781, "balance_loss_clip": 1.02304006, "balance_loss_mlp": 1.0259726, "epoch": 0.06637607094543815, "flos": 19792296397440.0, "grad_norm": 1.7627943560956827, "language_loss": 0.84274757, "learning_rate": 3.986136341700063e-06, "loss": 0.86466408, "num_input_tokens_seen": 23493880, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.87890625, "step": 1104, "time_per_iteration": 2.401047945022583 }, { "auxiliary_loss_clip": 0.01115866, "auxiliary_loss_mlp": 0.0107462, "balance_loss_clip": 1.01825738, "balance_loss_mlp": 1.02511764, "epoch": 0.06643619419810612, "flos": 25484430931200.0, "grad_norm": 1.7037591587565213, "language_loss": 0.81663072, "learning_rate": 3.986090526789227e-06, "loss": 0.83853555, "num_input_tokens_seen": 23514920, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.90625, "step": 1105, "time_per_iteration": 2.456904172897339 }, { "auxiliary_loss_clip": 0.01114306, "auxiliary_loss_mlp": 0.0107557, "balance_loss_clip": 1.01944685, "balance_loss_mlp": 1.02570915, "epoch": 0.06649631745077408, "flos": 16945583270400.0, "grad_norm": 2.06042190957479, "language_loss": 0.98223692, "learning_rate": 3.986044636565639e-06, "loss": 1.00413585, "num_input_tokens_seen": 23531635, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.8828125, "step": 1106, "time_per_iteration": 2.3892481327056885 }, { "auxiliary_loss_clip": 0.0112116, "auxiliary_loss_mlp": 0.01077339, "balance_loss_clip": 1.01697171, "balance_loss_mlp": 1.02492881, "epoch": 0.06655644070344206, "flos": 17857619383680.0, "grad_norm": 1.770210665320836, "language_loss": 0.85268295, "learning_rate": 3.985998671031039e-06, "loss": 0.874668, "num_input_tokens_seen": 23551020, "router_z_loss_clip": 0.60546875, "router_z_loss_mlp": 0.9609375, "step": 1107, "time_per_iteration": 2.412248134613037 }, { "auxiliary_loss_clip": 0.01036255, "auxiliary_loss_mlp": 0.01020116, "balance_loss_clip": 1.00638294, "balance_loss_mlp": 1.01024175, "epoch": 0.06661656395611003, "flos": 61416236062080.0, "grad_norm": 0.8238038018806919, "language_loss": 0.56833071, "learning_rate": 3.9859526301871705e-06, "loss": 0.58889443, "num_input_tokens_seen": 23610675, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.26171875, "step": 1108, "time_per_iteration": 2.9792981147766113 }, { "auxiliary_loss_clip": 0.01119922, "auxiliary_loss_mlp": 0.01091585, "balance_loss_clip": 1.02649713, "balance_loss_mlp": 1.02596259, "epoch": 0.066676687208778, "flos": 20661948253440.0, "grad_norm": 3.487870976022965, "language_loss": 0.75580716, "learning_rate": 3.9859065140357795e-06, "loss": 0.77792221, "num_input_tokens_seen": 23628710, "router_z_loss_clip": 0.65234375, "router_z_loss_mlp": 0.9375, "step": 1109, "time_per_iteration": 2.4156947135925293 }, { "auxiliary_loss_clip": 0.01116635, "auxiliary_loss_mlp": 0.01073149, "balance_loss_clip": 1.01645315, "balance_loss_mlp": 1.02510285, "epoch": 0.06673681046144596, "flos": 20922225505920.0, "grad_norm": 1.602397685293025, "language_loss": 0.80923939, "learning_rate": 3.985860322578614e-06, "loss": 0.83113718, "num_input_tokens_seen": 23649160, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.91796875, "step": 1110, "time_per_iteration": 2.4009132385253906 }, { "auxiliary_loss_clip": 0.01121021, "auxiliary_loss_mlp": 0.01084553, "balance_loss_clip": 1.02461505, "balance_loss_mlp": 1.02673864, "epoch": 0.06679693371411394, "flos": 31064494400640.0, "grad_norm": 2.013706849262472, "language_loss": 0.75132871, "learning_rate": 3.985814055817427e-06, "loss": 0.77338445, "num_input_tokens_seen": 23671995, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.94140625, "step": 1111, "time_per_iteration": 2.4835853576660156 }, { "auxiliary_loss_clip": 0.01122443, "auxiliary_loss_mlp": 0.01088815, "balance_loss_clip": 1.02975917, "balance_loss_mlp": 1.02592778, "epoch": 0.0668570569667819, "flos": 21725053286400.0, "grad_norm": 1.7691223670096392, "language_loss": 0.8153646, "learning_rate": 3.985767713753971e-06, "loss": 0.83747715, "num_input_tokens_seen": 23690705, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 0.96484375, "step": 1112, "time_per_iteration": 2.4132843017578125 }, { "auxiliary_loss_clip": 0.01118617, "auxiliary_loss_mlp": 0.01089497, "balance_loss_clip": 1.02636349, "balance_loss_mlp": 1.02671099, "epoch": 0.06691718021944987, "flos": 22746158087040.0, "grad_norm": 2.012456003082321, "language_loss": 0.84165847, "learning_rate": 3.985721296390005e-06, "loss": 0.86373967, "num_input_tokens_seen": 23709990, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.91796875, "step": 1113, "time_per_iteration": 2.424727439880371 }, { "auxiliary_loss_clip": 0.01113045, "auxiliary_loss_mlp": 0.0108052, "balance_loss_clip": 1.01743436, "balance_loss_mlp": 1.023929, "epoch": 0.06697730347211785, "flos": 16544675594880.0, "grad_norm": 1.8507004345553795, "language_loss": 0.85286343, "learning_rate": 3.985674803727289e-06, "loss": 0.87479907, "num_input_tokens_seen": 23728485, "router_z_loss_clip": 0.62890625, "router_z_loss_mlp": 0.890625, "step": 1114, "time_per_iteration": 2.4428517818450928 }, { "auxiliary_loss_clip": 0.0103323, "auxiliary_loss_mlp": 0.01023788, "balance_loss_clip": 1.0107224, "balance_loss_mlp": 1.00802636, "epoch": 0.06703742672478581, "flos": 59779123499520.0, "grad_norm": 0.852720138796611, "language_loss": 0.5836007, "learning_rate": 3.985628235767584e-06, "loss": 0.60417086, "num_input_tokens_seen": 23786650, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.25195312, "step": 1115, "time_per_iteration": 2.9749999046325684 }, { "auxiliary_loss_clip": 0.01124546, "auxiliary_loss_mlp": 0.0109687, "balance_loss_clip": 1.0301609, "balance_loss_mlp": 1.02850115, "epoch": 0.06709754997745378, "flos": 16799262295680.0, "grad_norm": 2.972201632054534, "language_loss": 0.94455981, "learning_rate": 3.985581592512658e-06, "loss": 0.96677411, "num_input_tokens_seen": 23802555, "router_z_loss_clip": 0.66796875, "router_z_loss_mlp": 0.9609375, "step": 1116, "time_per_iteration": 2.3827598094940186 }, { "auxiliary_loss_clip": 0.01122696, "auxiliary_loss_mlp": 0.01083418, "balance_loss_clip": 1.02371812, "balance_loss_mlp": 1.02691305, "epoch": 0.06715767323012176, "flos": 22122923673600.0, "grad_norm": 1.7840851487949978, "language_loss": 0.89795351, "learning_rate": 3.985534873964279e-06, "loss": 0.92001468, "num_input_tokens_seen": 23822945, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.95703125, "step": 1117, "time_per_iteration": 2.4365878105163574 }, { "auxiliary_loss_clip": 0.01034245, "auxiliary_loss_mlp": 0.01014564, "balance_loss_clip": 1.00345373, "balance_loss_mlp": 1.00915062, "epoch": 0.06721779648278972, "flos": 66615363020160.0, "grad_norm": 0.8653409761736361, "language_loss": 0.5997349, "learning_rate": 3.985488080124218e-06, "loss": 0.62022305, "num_input_tokens_seen": 23874075, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.25, "step": 1118, "time_per_iteration": 2.9569766521453857 }, { "auxiliary_loss_clip": 0.01122303, "auxiliary_loss_mlp": 0.01081181, "balance_loss_clip": 1.02176738, "balance_loss_mlp": 1.02562976, "epoch": 0.06727791973545769, "flos": 22381385535360.0, "grad_norm": 2.649699368813853, "language_loss": 0.87965488, "learning_rate": 3.985441210994251e-06, "loss": 0.90168977, "num_input_tokens_seen": 23889720, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.96875, "step": 1119, "time_per_iteration": 2.4172894954681396 }, { "auxiliary_loss_clip": 0.01115957, "auxiliary_loss_mlp": 0.01073688, "balance_loss_clip": 1.01956713, "balance_loss_mlp": 1.02567744, "epoch": 0.06733804298812565, "flos": 24279054641280.0, "grad_norm": 1.9909298106634399, "language_loss": 0.87397975, "learning_rate": 3.9853942665761545e-06, "loss": 0.89587623, "num_input_tokens_seen": 23909385, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.90234375, "step": 1120, "time_per_iteration": 2.420795440673828 }, { "auxiliary_loss_clip": 0.01123296, "auxiliary_loss_mlp": 0.01093124, "balance_loss_clip": 1.02832222, "balance_loss_mlp": 1.02761793, "epoch": 0.06739816624079363, "flos": 15917496197760.0, "grad_norm": 1.9998410486036524, "language_loss": 0.8118242, "learning_rate": 3.985347246871708e-06, "loss": 0.83398843, "num_input_tokens_seen": 23926830, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 0.95703125, "step": 1121, "time_per_iteration": 2.4120850563049316 }, { "auxiliary_loss_clip": 0.01035083, "auxiliary_loss_mlp": 0.01054152, "balance_loss_clip": 1.03917885, "balance_loss_mlp": 1.00847876, "epoch": 0.0674582894934616, "flos": 71394656613120.0, "grad_norm": 0.7980851824815436, "language_loss": 0.58567643, "learning_rate": 3.985300151882694e-06, "loss": 0.60656875, "num_input_tokens_seen": 23992640, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.265625, "step": 1122, "time_per_iteration": 3.184535026550293 }, { "auxiliary_loss_clip": 0.01123495, "auxiliary_loss_mlp": 0.01085879, "balance_loss_clip": 1.02193558, "balance_loss_mlp": 1.02801776, "epoch": 0.06751841274612956, "flos": 25263779938560.0, "grad_norm": 2.214353176070726, "language_loss": 0.75975633, "learning_rate": 3.985252981610901e-06, "loss": 0.7818501, "num_input_tokens_seen": 24011135, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 0.95703125, "step": 1123, "time_per_iteration": 3.932253837585449 }, { "auxiliary_loss_clip": 0.011247, "auxiliary_loss_mlp": 0.01084773, "balance_loss_clip": 1.02340448, "balance_loss_mlp": 1.02773237, "epoch": 0.06757853599879754, "flos": 23801687354880.0, "grad_norm": 1.8035342248159478, "language_loss": 0.81587684, "learning_rate": 3.985205736058114e-06, "loss": 0.83797157, "num_input_tokens_seen": 24030695, "router_z_loss_clip": 0.61328125, "router_z_loss_mlp": 0.96875, "step": 1124, "time_per_iteration": 3.9603469371795654 }, { "auxiliary_loss_clip": 0.01118417, "auxiliary_loss_mlp": 0.01076933, "balance_loss_clip": 1.02197719, "balance_loss_mlp": 1.02609396, "epoch": 0.0676386592514655, "flos": 21032655736320.0, "grad_norm": 1.8904031565188635, "language_loss": 0.74719065, "learning_rate": 3.985158415226128e-06, "loss": 0.76914418, "num_input_tokens_seen": 24050680, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.921875, "step": 1125, "time_per_iteration": 3.834554433822632 }, { "auxiliary_loss_clip": 0.01121382, "auxiliary_loss_mlp": 0.01083275, "balance_loss_clip": 1.02376568, "balance_loss_mlp": 1.02692199, "epoch": 0.06769878250413347, "flos": 25555165079040.0, "grad_norm": 3.32834237866338, "language_loss": 0.83633471, "learning_rate": 3.985111019116736e-06, "loss": 0.85838127, "num_input_tokens_seen": 24067205, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.9453125, "step": 1126, "time_per_iteration": 3.899857997894287 }, { "auxiliary_loss_clip": 0.01029995, "auxiliary_loss_mlp": 0.01056445, "balance_loss_clip": 1.04328477, "balance_loss_mlp": 1.00515544, "epoch": 0.06775890575680145, "flos": 70651740458880.0, "grad_norm": 0.8617601963732228, "language_loss": 0.59859312, "learning_rate": 3.985063547731735e-06, "loss": 0.61945748, "num_input_tokens_seen": 24131320, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.24804688, "step": 1127, "time_per_iteration": 3.0482993125915527 }, { "auxiliary_loss_clip": 0.01123007, "auxiliary_loss_mlp": 0.01085624, "balance_loss_clip": 1.02528024, "balance_loss_mlp": 1.02925444, "epoch": 0.06781902900946941, "flos": 24234575702400.0, "grad_norm": 2.042128775577491, "language_loss": 0.83990121, "learning_rate": 3.985016001072925e-06, "loss": 0.86198753, "num_input_tokens_seen": 24149930, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.9375, "step": 1128, "time_per_iteration": 2.4523684978485107 }, { "auxiliary_loss_clip": 0.01129507, "auxiliary_loss_mlp": 0.01094309, "balance_loss_clip": 1.03141379, "balance_loss_mlp": 1.03060877, "epoch": 0.06787915226213738, "flos": 22416473318400.0, "grad_norm": 1.9041930503931996, "language_loss": 0.78820133, "learning_rate": 3.984968379142109e-06, "loss": 0.81043947, "num_input_tokens_seen": 24169590, "router_z_loss_clip": 0.62890625, "router_z_loss_mlp": 0.98828125, "step": 1129, "time_per_iteration": 2.4221837520599365 }, { "auxiliary_loss_clip": 0.01126317, "auxiliary_loss_mlp": 0.01088032, "balance_loss_clip": 1.02589965, "balance_loss_mlp": 1.03075194, "epoch": 0.06793927551480534, "flos": 37705393664640.0, "grad_norm": 1.9418144790439253, "language_loss": 0.75519568, "learning_rate": 3.984920681941094e-06, "loss": 0.77733916, "num_input_tokens_seen": 24189965, "router_z_loss_clip": 0.62109375, "router_z_loss_mlp": 0.953125, "step": 1130, "time_per_iteration": 2.571518659591675 }, { "auxiliary_loss_clip": 0.01125286, "auxiliary_loss_mlp": 0.01101638, "balance_loss_clip": 1.03807545, "balance_loss_mlp": 1.03185916, "epoch": 0.06799939876747332, "flos": 20630351606400.0, "grad_norm": 2.5223262058806584, "language_loss": 0.83711624, "learning_rate": 3.984872909471688e-06, "loss": 0.85938543, "num_input_tokens_seen": 24208045, "router_z_loss_clip": 0.63671875, "router_z_loss_mlp": 0.93359375, "step": 1131, "time_per_iteration": 2.482433795928955 }, { "auxiliary_loss_clip": 0.01123376, "auxiliary_loss_mlp": 0.01100627, "balance_loss_clip": 1.04192817, "balance_loss_mlp": 1.0323621, "epoch": 0.06805952202014129, "flos": 14863921966080.0, "grad_norm": 2.0117562256844272, "language_loss": 0.83521867, "learning_rate": 3.984825061735701e-06, "loss": 0.85745865, "num_input_tokens_seen": 24223805, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.91015625, "step": 1132, "time_per_iteration": 2.4478507041931152 }, { "auxiliary_loss_clip": 0.01127269, "auxiliary_loss_mlp": 0.01092815, "balance_loss_clip": 1.03416383, "balance_loss_mlp": 1.03286517, "epoch": 0.06811964527280925, "flos": 48907555747200.0, "grad_norm": 1.5167081980092614, "language_loss": 0.66709793, "learning_rate": 3.9847771387349495e-06, "loss": 0.68929875, "num_input_tokens_seen": 24249475, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.9453125, "step": 1133, "time_per_iteration": 2.676922082901001 }, { "auxiliary_loss_clip": 0.01130865, "auxiliary_loss_mlp": 0.01103078, "balance_loss_clip": 1.03584409, "balance_loss_mlp": 1.0317868, "epoch": 0.06817976852547723, "flos": 15376377035520.0, "grad_norm": 2.2872385688703463, "language_loss": 0.78900862, "learning_rate": 3.9847291404712506e-06, "loss": 0.81134808, "num_input_tokens_seen": 24267980, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 0.98828125, "step": 1134, "time_per_iteration": 2.41567325592041 }, { "auxiliary_loss_clip": 0.01121873, "auxiliary_loss_mlp": 0.01091113, "balance_loss_clip": 1.03389275, "balance_loss_mlp": 1.03119636, "epoch": 0.0682398917781452, "flos": 20154694976640.0, "grad_norm": 1.7019263422609774, "language_loss": 0.90251881, "learning_rate": 3.984681066946423e-06, "loss": 0.92464864, "num_input_tokens_seen": 24286805, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.90625, "step": 1135, "time_per_iteration": 2.4410014152526855 }, { "auxiliary_loss_clip": 0.01121429, "auxiliary_loss_mlp": 0.01088732, "balance_loss_clip": 1.03043914, "balance_loss_mlp": 1.02930117, "epoch": 0.06830001503081316, "flos": 23439498243840.0, "grad_norm": 2.608476546521454, "language_loss": 0.82385993, "learning_rate": 3.984632918162291e-06, "loss": 0.84596151, "num_input_tokens_seen": 24305855, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 0.921875, "step": 1136, "time_per_iteration": 2.488626003265381 }, { "auxiliary_loss_clip": 0.01125287, "auxiliary_loss_mlp": 0.01097232, "balance_loss_clip": 1.0357914, "balance_loss_mlp": 1.02988625, "epoch": 0.06836013828348114, "flos": 34348389972480.0, "grad_norm": 2.1092199946668257, "language_loss": 0.874327, "learning_rate": 3.984584694120679e-06, "loss": 0.89655221, "num_input_tokens_seen": 24326535, "router_z_loss_clip": 0.61328125, "router_z_loss_mlp": 0.953125, "step": 1137, "time_per_iteration": 2.5108256340026855 }, { "auxiliary_loss_clip": 0.01122085, "auxiliary_loss_mlp": 0.01092169, "balance_loss_clip": 1.03289843, "balance_loss_mlp": 1.02879333, "epoch": 0.06842026153614911, "flos": 23147729078400.0, "grad_norm": 1.8941065411664983, "language_loss": 0.82052231, "learning_rate": 3.984536394823418e-06, "loss": 0.84266484, "num_input_tokens_seen": 24345810, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 0.93359375, "step": 1138, "time_per_iteration": 2.451204299926758 }, { "auxiliary_loss_clip": 0.01122444, "auxiliary_loss_mlp": 0.01096732, "balance_loss_clip": 1.03254938, "balance_loss_mlp": 1.02774584, "epoch": 0.06848038478881707, "flos": 24607796803200.0, "grad_norm": 1.9489130121631686, "language_loss": 0.8792212, "learning_rate": 3.984488020272336e-06, "loss": 0.9014129, "num_input_tokens_seen": 24366095, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 0.94921875, "step": 1139, "time_per_iteration": 2.4835870265960693 }, { "auxiliary_loss_clip": 0.01126947, "auxiliary_loss_mlp": 0.01094943, "balance_loss_clip": 1.03362191, "balance_loss_mlp": 1.03009427, "epoch": 0.06854050804148504, "flos": 40879382676480.0, "grad_norm": 1.7967079356159903, "language_loss": 0.77245498, "learning_rate": 3.984439570469271e-06, "loss": 0.7946738, "num_input_tokens_seen": 24388665, "router_z_loss_clip": 0.61328125, "router_z_loss_mlp": 0.96875, "step": 1140, "time_per_iteration": 2.5894415378570557 }, { "auxiliary_loss_clip": 0.01125202, "auxiliary_loss_mlp": 0.01098284, "balance_loss_clip": 1.02580476, "balance_loss_mlp": 1.03009641, "epoch": 0.06860063129415302, "flos": 31685005728000.0, "grad_norm": 2.184653874082324, "language_loss": 0.71900249, "learning_rate": 3.9843910454160574e-06, "loss": 0.74123734, "num_input_tokens_seen": 24407705, "router_z_loss_clip": 0.7265625, "router_z_loss_mlp": 0.94921875, "step": 1141, "time_per_iteration": 2.550922393798828 }, { "auxiliary_loss_clip": 0.0112521, "auxiliary_loss_mlp": 0.01079015, "balance_loss_clip": 1.01278222, "balance_loss_mlp": 1.02810967, "epoch": 0.06866075454682098, "flos": 26540798071680.0, "grad_norm": 2.038034663497038, "language_loss": 0.81297326, "learning_rate": 3.984342445114538e-06, "loss": 0.83501542, "num_input_tokens_seen": 24428390, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 0.96875, "step": 1142, "time_per_iteration": 2.5274693965911865 }, { "auxiliary_loss_clip": 0.01120549, "auxiliary_loss_mlp": 0.01084358, "balance_loss_clip": 1.02069974, "balance_loss_mlp": 1.02910841, "epoch": 0.06872087779948895, "flos": 29788453785600.0, "grad_norm": 3.283402850429473, "language_loss": 0.71786511, "learning_rate": 3.984293769566553e-06, "loss": 0.73991418, "num_input_tokens_seen": 24450810, "router_z_loss_clip": 0.63671875, "router_z_loss_mlp": 0.9140625, "step": 1143, "time_per_iteration": 2.4973251819610596 }, { "auxiliary_loss_clip": 0.01122981, "auxiliary_loss_mlp": 0.01082928, "balance_loss_clip": 1.02232218, "balance_loss_mlp": 1.03154778, "epoch": 0.06878100105215693, "flos": 26939960179200.0, "grad_norm": 1.6362535099889581, "language_loss": 0.76401097, "learning_rate": 3.98424501877395e-06, "loss": 0.78607011, "num_input_tokens_seen": 24469965, "router_z_loss_clip": 0.60546875, "router_z_loss_mlp": 0.9140625, "step": 1144, "time_per_iteration": 2.4799752235412598 }, { "auxiliary_loss_clip": 0.01129897, "auxiliary_loss_mlp": 0.01088304, "balance_loss_clip": 1.02040291, "balance_loss_mlp": 1.03165627, "epoch": 0.06884112430482489, "flos": 10669980228480.0, "grad_norm": 2.386005081791177, "language_loss": 0.95492917, "learning_rate": 3.984196192738577e-06, "loss": 0.97711122, "num_input_tokens_seen": 24486370, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 0.984375, "step": 1145, "time_per_iteration": 2.3852670192718506 }, { "auxiliary_loss_clip": 0.01136948, "auxiliary_loss_mlp": 0.01095834, "balance_loss_clip": 1.0257864, "balance_loss_mlp": 1.03430319, "epoch": 0.06890124755749286, "flos": 20192610579840.0, "grad_norm": 2.5621384087053105, "language_loss": 0.8596679, "learning_rate": 3.984147291462285e-06, "loss": 0.88199568, "num_input_tokens_seen": 24503780, "router_z_loss_clip": 0.69921875, "router_z_loss_mlp": 1.03125, "step": 1146, "time_per_iteration": 2.4385712146759033 }, { "auxiliary_loss_clip": 0.01123646, "auxiliary_loss_mlp": 0.01086992, "balance_loss_clip": 1.02471733, "balance_loss_mlp": 1.03171659, "epoch": 0.06896137081016084, "flos": 20448174798720.0, "grad_norm": 1.9064338641776337, "language_loss": 0.87195885, "learning_rate": 3.98409831494693e-06, "loss": 0.8940652, "num_input_tokens_seen": 24522320, "router_z_loss_clip": 0.62109375, "router_z_loss_mlp": 0.91796875, "step": 1147, "time_per_iteration": 2.4222569465637207 }, { "auxiliary_loss_clip": 0.01125913, "auxiliary_loss_mlp": 0.01085966, "balance_loss_clip": 1.02085328, "balance_loss_mlp": 1.0294975, "epoch": 0.0690214940628288, "flos": 18367735392000.0, "grad_norm": 1.9764152505946622, "language_loss": 0.88885844, "learning_rate": 3.984049263194367e-06, "loss": 0.91097713, "num_input_tokens_seen": 24540445, "router_z_loss_clip": 0.65234375, "router_z_loss_mlp": 0.96484375, "step": 1148, "time_per_iteration": 2.421541929244995 }, { "auxiliary_loss_clip": 0.01123515, "auxiliary_loss_mlp": 0.01082554, "balance_loss_clip": 1.02056539, "balance_loss_mlp": 1.029423, "epoch": 0.06908161731549677, "flos": 20556999106560.0, "grad_norm": 2.392153570835658, "language_loss": 0.72474724, "learning_rate": 3.9840001362064575e-06, "loss": 0.74680793, "num_input_tokens_seen": 24557105, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.94140625, "step": 1149, "time_per_iteration": 2.432215452194214 }, { "auxiliary_loss_clip": 0.0112754, "auxiliary_loss_mlp": 0.01082993, "balance_loss_clip": 1.02057528, "balance_loss_mlp": 1.02859437, "epoch": 0.06914174056816474, "flos": 27562426542720.0, "grad_norm": 1.885564513768512, "language_loss": 0.8711127, "learning_rate": 3.983950933985064e-06, "loss": 0.89321804, "num_input_tokens_seen": 24578240, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.98828125, "step": 1150, "time_per_iteration": 2.491286277770996 }, { "auxiliary_loss_clip": 0.01129628, "auxiliary_loss_mlp": 0.01093329, "balance_loss_clip": 1.02719188, "balance_loss_mlp": 1.03058386, "epoch": 0.06920186382083271, "flos": 15303129269760.0, "grad_norm": 3.514703244721381, "language_loss": 0.87550616, "learning_rate": 3.983901656532052e-06, "loss": 0.89773583, "num_input_tokens_seen": 24593585, "router_z_loss_clip": 0.66015625, "router_z_loss_mlp": 0.98828125, "step": 1151, "time_per_iteration": 2.3895370960235596 }, { "auxiliary_loss_clip": 0.01125898, "auxiliary_loss_mlp": 0.01088363, "balance_loss_clip": 1.02399015, "balance_loss_mlp": 1.02974057, "epoch": 0.06926198707350067, "flos": 25190078325120.0, "grad_norm": 1.7310559545872284, "language_loss": 0.87590617, "learning_rate": 3.983852303849291e-06, "loss": 0.89804876, "num_input_tokens_seen": 24613110, "router_z_loss_clip": 0.64453125, "router_z_loss_mlp": 0.9609375, "step": 1152, "time_per_iteration": 2.463397741317749 }, { "auxiliary_loss_clip": 0.01122164, "auxiliary_loss_mlp": 0.01088429, "balance_loss_clip": 1.02837181, "balance_loss_mlp": 1.02923846, "epoch": 0.06932211032616864, "flos": 13255438584960.0, "grad_norm": 2.233448985568123, "language_loss": 0.93598485, "learning_rate": 3.983802875938651e-06, "loss": 0.95809078, "num_input_tokens_seen": 24628795, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.9296875, "step": 1153, "time_per_iteration": 2.383981227874756 }, { "auxiliary_loss_clip": 0.01124264, "auxiliary_loss_mlp": 0.01086272, "balance_loss_clip": 1.02342486, "balance_loss_mlp": 1.02828693, "epoch": 0.06938223357883662, "flos": 24826213468800.0, "grad_norm": 1.977841163568739, "language_loss": 0.83504844, "learning_rate": 3.983753372802008e-06, "loss": 0.85715377, "num_input_tokens_seen": 24645480, "router_z_loss_clip": 0.62890625, "router_z_loss_mlp": 0.95703125, "step": 1154, "time_per_iteration": 2.445939540863037 }, { "auxiliary_loss_clip": 0.0111804, "auxiliary_loss_mlp": 0.0108937, "balance_loss_clip": 1.02471066, "balance_loss_mlp": 1.02664399, "epoch": 0.06944235683150458, "flos": 27266852039040.0, "grad_norm": 1.8378016692937635, "language_loss": 0.77644992, "learning_rate": 3.983703794441237e-06, "loss": 0.79852396, "num_input_tokens_seen": 24664630, "router_z_loss_clip": 0.64453125, "router_z_loss_mlp": 0.9140625, "step": 1155, "time_per_iteration": 2.468337059020996 }, { "auxiliary_loss_clip": 0.01120091, "auxiliary_loss_mlp": 0.01086791, "balance_loss_clip": 1.02341962, "balance_loss_mlp": 1.02557087, "epoch": 0.06950248008417255, "flos": 25806993782400.0, "grad_norm": 1.7401925724173481, "language_loss": 0.73281115, "learning_rate": 3.98365414085822e-06, "loss": 0.75487995, "num_input_tokens_seen": 24684210, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.9453125, "step": 1156, "time_per_iteration": 2.45001482963562 }, { "auxiliary_loss_clip": 0.01122873, "auxiliary_loss_mlp": 0.01090399, "balance_loss_clip": 1.02435744, "balance_loss_mlp": 1.02761257, "epoch": 0.06956260333684053, "flos": 22270501457280.0, "grad_norm": 1.8370850115205077, "language_loss": 0.77475703, "learning_rate": 3.98360441205484e-06, "loss": 0.79688978, "num_input_tokens_seen": 24702490, "router_z_loss_clip": 0.66015625, "router_z_loss_mlp": 0.953125, "step": 1157, "time_per_iteration": 2.4102835655212402 }, { "auxiliary_loss_clip": 0.01122157, "auxiliary_loss_mlp": 0.01085671, "balance_loss_clip": 1.02358639, "balance_loss_mlp": 1.02596426, "epoch": 0.0696227265895085, "flos": 29680048414080.0, "grad_norm": 1.9458102866366156, "language_loss": 0.74436134, "learning_rate": 3.983554608032982e-06, "loss": 0.76643968, "num_input_tokens_seen": 24724340, "router_z_loss_clip": 0.62109375, "router_z_loss_mlp": 0.9609375, "step": 1158, "time_per_iteration": 2.4733190536499023 }, { "auxiliary_loss_clip": 0.01127856, "auxiliary_loss_mlp": 0.01082402, "balance_loss_clip": 1.01693273, "balance_loss_mlp": 1.02864301, "epoch": 0.06968284984217646, "flos": 25522276711680.0, "grad_norm": 2.071886089028901, "language_loss": 0.82614207, "learning_rate": 3.983504728794533e-06, "loss": 0.84824467, "num_input_tokens_seen": 24745550, "router_z_loss_clip": 0.65625, "router_z_loss_mlp": 0.9921875, "step": 1159, "time_per_iteration": 2.4483911991119385 }, { "auxiliary_loss_clip": 0.0112646, "auxiliary_loss_mlp": 0.01089671, "balance_loss_clip": 1.01881254, "balance_loss_mlp": 1.02979612, "epoch": 0.06974297309484444, "flos": 20697315327360.0, "grad_norm": 2.7241934538980273, "language_loss": 0.87037098, "learning_rate": 3.983454774341387e-06, "loss": 0.89253217, "num_input_tokens_seen": 24762575, "router_z_loss_clip": 0.70703125, "router_z_loss_mlp": 0.96875, "step": 1160, "time_per_iteration": 2.3994834423065186 }, { "auxiliary_loss_clip": 0.01127559, "auxiliary_loss_mlp": 0.01095395, "balance_loss_clip": 1.02539492, "balance_loss_mlp": 1.02781618, "epoch": 0.0698030963475124, "flos": 26503999632000.0, "grad_norm": 1.625587204700965, "language_loss": 0.78050566, "learning_rate": 3.983404744675437e-06, "loss": 0.80273521, "num_input_tokens_seen": 24782605, "router_z_loss_clip": 0.69921875, "router_z_loss_mlp": 1.0, "step": 1161, "time_per_iteration": 2.4453136920928955 }, { "auxiliary_loss_clip": 0.01121993, "auxiliary_loss_mlp": 0.01096327, "balance_loss_clip": 1.02737641, "balance_loss_mlp": 1.0261029, "epoch": 0.06986321960018037, "flos": 23039288795520.0, "grad_norm": 1.7730575241492539, "language_loss": 0.85159492, "learning_rate": 3.9833546397985794e-06, "loss": 0.87377816, "num_input_tokens_seen": 24802910, "router_z_loss_clip": 0.6875, "router_z_loss_mlp": 0.9609375, "step": 1162, "time_per_iteration": 3.8944971561431885 }, { "auxiliary_loss_clip": 0.01120592, "auxiliary_loss_mlp": 0.010764, "balance_loss_clip": 1.01326632, "balance_loss_mlp": 1.02559292, "epoch": 0.06992334285284833, "flos": 28583566254720.0, "grad_norm": 1.8005549986175502, "language_loss": 0.81445593, "learning_rate": 3.983304459712716e-06, "loss": 0.83642584, "num_input_tokens_seen": 24823305, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.94921875, "step": 1163, "time_per_iteration": 2.4830455780029297 }, { "auxiliary_loss_clip": 0.01123889, "auxiliary_loss_mlp": 0.01081741, "balance_loss_clip": 1.01979947, "balance_loss_mlp": 1.02658606, "epoch": 0.06998346610551631, "flos": 20594286305280.0, "grad_norm": 2.137255250669476, "language_loss": 0.8066082, "learning_rate": 3.983254204419749e-06, "loss": 0.82866442, "num_input_tokens_seen": 24842155, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.97265625, "step": 1164, "time_per_iteration": 5.3419575691223145 }, { "auxiliary_loss_clip": 0.01123909, "auxiliary_loss_mlp": 0.01085924, "balance_loss_clip": 1.02093148, "balance_loss_mlp": 1.02699661, "epoch": 0.07004358935818428, "flos": 22527706510080.0, "grad_norm": 1.7896258245151857, "language_loss": 0.75779712, "learning_rate": 3.983203873921583e-06, "loss": 0.77989542, "num_input_tokens_seen": 24862080, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 0.96875, "step": 1165, "time_per_iteration": 2.423755645751953 }, { "auxiliary_loss_clip": 0.01123987, "auxiliary_loss_mlp": 0.01089922, "balance_loss_clip": 1.02111435, "balance_loss_mlp": 1.02588546, "epoch": 0.07010371261085224, "flos": 28948722831360.0, "grad_norm": 1.8631089368546352, "language_loss": 0.82915854, "learning_rate": 3.983153468220128e-06, "loss": 0.85129762, "num_input_tokens_seen": 24886165, "router_z_loss_clip": 0.6875, "router_z_loss_mlp": 0.98046875, "step": 1166, "time_per_iteration": 3.87127685546875 }, { "auxiliary_loss_clip": 0.0112414, "auxiliary_loss_mlp": 0.01092909, "balance_loss_clip": 1.02495956, "balance_loss_mlp": 1.02779317, "epoch": 0.07016383586352022, "flos": 23658054554880.0, "grad_norm": 2.0286047740789, "language_loss": 0.87652385, "learning_rate": 3.983102987317295e-06, "loss": 0.8986944, "num_input_tokens_seen": 24905775, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 0.96484375, "step": 1167, "time_per_iteration": 2.425036907196045 }, { "auxiliary_loss_clip": 0.01126049, "auxiliary_loss_mlp": 0.01091101, "balance_loss_clip": 1.02067208, "balance_loss_mlp": 1.02578366, "epoch": 0.07022395911618819, "flos": 19791109411200.0, "grad_norm": 1.989751081251553, "language_loss": 0.93672723, "learning_rate": 3.983052431214997e-06, "loss": 0.95889866, "num_input_tokens_seen": 24924295, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 1.0, "step": 1168, "time_per_iteration": 2.398099660873413 }, { "auxiliary_loss_clip": 0.01128506, "auxiliary_loss_mlp": 0.01105523, "balance_loss_clip": 1.03042114, "balance_loss_mlp": 1.02679992, "epoch": 0.07028408236885615, "flos": 21688080289920.0, "grad_norm": 1.9694278304428756, "language_loss": 0.91400796, "learning_rate": 3.983001799915153e-06, "loss": 0.93634826, "num_input_tokens_seen": 24943210, "router_z_loss_clip": 0.75, "router_z_loss_mlp": 1.015625, "step": 1169, "time_per_iteration": 2.396488666534424 }, { "auxiliary_loss_clip": 0.01127363, "auxiliary_loss_mlp": 0.01089321, "balance_loss_clip": 1.01994097, "balance_loss_mlp": 1.0284791, "epoch": 0.07034420562152413, "flos": 25629076160640.0, "grad_norm": 2.4510198917683526, "language_loss": 0.8710537, "learning_rate": 3.982951093419681e-06, "loss": 0.89322054, "num_input_tokens_seen": 24960360, "router_z_loss_clip": 0.69140625, "router_z_loss_mlp": 0.98828125, "step": 1170, "time_per_iteration": 2.4446558952331543 }, { "auxiliary_loss_clip": 0.01121907, "auxiliary_loss_mlp": 0.01095745, "balance_loss_clip": 1.02688932, "balance_loss_mlp": 1.02577257, "epoch": 0.0704043288741921, "flos": 20809491125760.0, "grad_norm": 1.974348525507159, "language_loss": 0.78108561, "learning_rate": 3.982900311730506e-06, "loss": 0.80326217, "num_input_tokens_seen": 24978290, "router_z_loss_clip": 0.6875, "router_z_loss_mlp": 0.9609375, "step": 1171, "time_per_iteration": 2.4116785526275635 }, { "auxiliary_loss_clip": 0.01122969, "auxiliary_loss_mlp": 0.01089955, "balance_loss_clip": 1.02524829, "balance_loss_mlp": 1.02575743, "epoch": 0.07046445212686006, "flos": 25591998430080.0, "grad_norm": 1.7534652989738828, "language_loss": 0.91614544, "learning_rate": 3.9828494548495514e-06, "loss": 0.93827468, "num_input_tokens_seen": 24997055, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 0.97265625, "step": 1172, "time_per_iteration": 2.4500577449798584 }, { "auxiliary_loss_clip": 0.01127021, "auxiliary_loss_mlp": 0.01093403, "balance_loss_clip": 1.02297401, "balance_loss_mlp": 1.02703094, "epoch": 0.07052457537952803, "flos": 25555793483520.0, "grad_norm": 1.6369875259069804, "language_loss": 0.84426713, "learning_rate": 3.982798522778748e-06, "loss": 0.86647129, "num_input_tokens_seen": 25017490, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 1.0, "step": 1173, "time_per_iteration": 2.4377236366271973 }, { "auxiliary_loss_clip": 0.01123274, "auxiliary_loss_mlp": 0.01090843, "balance_loss_clip": 1.02270269, "balance_loss_mlp": 1.0266397, "epoch": 0.070584698632196, "flos": 17967525943680.0, "grad_norm": 1.9760185715190408, "language_loss": 0.83913708, "learning_rate": 3.9827475155200245e-06, "loss": 0.8612783, "num_input_tokens_seen": 25035660, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 0.96875, "step": 1174, "time_per_iteration": 2.4259960651397705 }, { "auxiliary_loss_clip": 0.01125288, "auxiliary_loss_mlp": 0.01099835, "balance_loss_clip": 1.03031182, "balance_loss_mlp": 1.02695394, "epoch": 0.07064482188486397, "flos": 25369811337600.0, "grad_norm": 1.8712729103564836, "language_loss": 0.87261462, "learning_rate": 3.982696433075317e-06, "loss": 0.89486587, "num_input_tokens_seen": 25054785, "router_z_loss_clip": 0.6953125, "router_z_loss_mlp": 0.984375, "step": 1175, "time_per_iteration": 2.427386999130249 }, { "auxiliary_loss_clip": 0.01124963, "auxiliary_loss_mlp": 0.01092704, "balance_loss_clip": 1.02160811, "balance_loss_mlp": 1.02870667, "epoch": 0.07070494513753194, "flos": 24898693184640.0, "grad_norm": 1.7003345106287127, "language_loss": 0.86938488, "learning_rate": 3.982645275446563e-06, "loss": 0.89156163, "num_input_tokens_seen": 25075180, "router_z_loss_clip": 0.7109375, "router_z_loss_mlp": 0.9609375, "step": 1176, "time_per_iteration": 2.447356939315796 }, { "auxiliary_loss_clip": 0.01120673, "auxiliary_loss_mlp": 0.01099006, "balance_loss_clip": 1.03072345, "balance_loss_mlp": 1.02573276, "epoch": 0.07076506839019991, "flos": 22337569912320.0, "grad_norm": 2.622261797986799, "language_loss": 0.77637994, "learning_rate": 3.982594042635701e-06, "loss": 0.79857677, "num_input_tokens_seen": 25093035, "router_z_loss_clip": 0.68359375, "router_z_loss_mlp": 0.9453125, "step": 1177, "time_per_iteration": 2.3983569145202637 }, { "auxiliary_loss_clip": 0.01129956, "auxiliary_loss_mlp": 0.01099751, "balance_loss_clip": 1.0238384, "balance_loss_mlp": 1.02857423, "epoch": 0.07082519164286788, "flos": 18659818759680.0, "grad_norm": 1.760069954688353, "language_loss": 0.88014412, "learning_rate": 3.982542734644673e-06, "loss": 0.9024412, "num_input_tokens_seen": 25112520, "router_z_loss_clip": 0.7578125, "router_z_loss_mlp": 1.015625, "step": 1178, "time_per_iteration": 2.430652379989624 }, { "auxiliary_loss_clip": 0.01035933, "auxiliary_loss_mlp": 0.01042589, "balance_loss_clip": 1.03057301, "balance_loss_mlp": 1.00861716, "epoch": 0.07088531489553584, "flos": 63650676942720.0, "grad_norm": 0.8984989359649695, "language_loss": 0.63537645, "learning_rate": 3.982491351475427e-06, "loss": 0.65616167, "num_input_tokens_seen": 25177760, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.2734375, "step": 1179, "time_per_iteration": 3.1636335849761963 }, { "auxiliary_loss_clip": 0.01125684, "auxiliary_loss_mlp": 0.01095091, "balance_loss_clip": 1.02518678, "balance_loss_mlp": 1.02717006, "epoch": 0.07094543814820382, "flos": 21571819862400.0, "grad_norm": 2.0676158963514344, "language_loss": 0.88718003, "learning_rate": 3.98243989312991e-06, "loss": 0.90938783, "num_input_tokens_seen": 25195260, "router_z_loss_clip": 0.69921875, "router_z_loss_mlp": 0.984375, "step": 1180, "time_per_iteration": 2.415634870529175 }, { "auxiliary_loss_clip": 0.01122677, "auxiliary_loss_mlp": 0.01089986, "balance_loss_clip": 1.02642345, "balance_loss_mlp": 1.02745593, "epoch": 0.07100556140087179, "flos": 22088883231360.0, "grad_norm": 1.9755949973321099, "language_loss": 0.9062252, "learning_rate": 3.982388359610074e-06, "loss": 0.92835188, "num_input_tokens_seen": 25212740, "router_z_loss_clip": 0.63671875, "router_z_loss_mlp": 0.953125, "step": 1181, "time_per_iteration": 2.3916428089141846 }, { "auxiliary_loss_clip": 0.01117525, "auxiliary_loss_mlp": 0.01080919, "balance_loss_clip": 1.02231574, "balance_loss_mlp": 1.02573502, "epoch": 0.07106568465353975, "flos": 47920491388800.0, "grad_norm": 1.8169800846786288, "language_loss": 0.86468613, "learning_rate": 3.9823367509178725e-06, "loss": 0.88667059, "num_input_tokens_seen": 25236420, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.9140625, "step": 1182, "time_per_iteration": 2.6625585556030273 }, { "auxiliary_loss_clip": 0.01118918, "auxiliary_loss_mlp": 0.01092169, "balance_loss_clip": 1.02874947, "balance_loss_mlp": 1.02779865, "epoch": 0.07112580790620772, "flos": 23439672800640.0, "grad_norm": 2.175270210512498, "language_loss": 0.83351719, "learning_rate": 3.982285067055262e-06, "loss": 0.85562801, "num_input_tokens_seen": 25255120, "router_z_loss_clip": 0.63671875, "router_z_loss_mlp": 0.91015625, "step": 1183, "time_per_iteration": 2.415926694869995 }, { "auxiliary_loss_clip": 0.01123528, "auxiliary_loss_mlp": 0.01092604, "balance_loss_clip": 1.0248456, "balance_loss_mlp": 1.02660775, "epoch": 0.0711859311588757, "flos": 31867531649280.0, "grad_norm": 1.9336690096561036, "language_loss": 0.82249904, "learning_rate": 3.982233308024204e-06, "loss": 0.84466034, "num_input_tokens_seen": 25275150, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 0.96875, "step": 1184, "time_per_iteration": 2.4871761798858643 }, { "auxiliary_loss_clip": 0.01116557, "auxiliary_loss_mlp": 0.01102872, "balance_loss_clip": 1.03845167, "balance_loss_mlp": 1.02501965, "epoch": 0.07124605441154366, "flos": 19609281717120.0, "grad_norm": 1.8042516016519636, "language_loss": 0.79406917, "learning_rate": 3.98218147382666e-06, "loss": 0.81626344, "num_input_tokens_seen": 25293680, "router_z_loss_clip": 0.64453125, "router_z_loss_mlp": 0.9140625, "step": 1185, "time_per_iteration": 2.3854079246520996 }, { "auxiliary_loss_clip": 0.01116748, "auxiliary_loss_mlp": 0.01096321, "balance_loss_clip": 1.03581023, "balance_loss_mlp": 1.02474868, "epoch": 0.07130617766421163, "flos": 14683560549120.0, "grad_norm": 2.262057352365454, "language_loss": 0.69308913, "learning_rate": 3.982129564464596e-06, "loss": 0.71521986, "num_input_tokens_seen": 25310050, "router_z_loss_clip": 0.60546875, "router_z_loss_mlp": 0.921875, "step": 1186, "time_per_iteration": 2.3720531463623047 }, { "auxiliary_loss_clip": 0.01112848, "auxiliary_loss_mlp": 0.01087575, "balance_loss_clip": 1.02396536, "balance_loss_mlp": 1.02491748, "epoch": 0.07136630091687961, "flos": 26066712453120.0, "grad_norm": 1.7667968910349536, "language_loss": 0.71992087, "learning_rate": 3.98207757993998e-06, "loss": 0.74192506, "num_input_tokens_seen": 25331020, "router_z_loss_clip": 0.63671875, "router_z_loss_mlp": 0.87890625, "step": 1187, "time_per_iteration": 2.4268622398376465 }, { "auxiliary_loss_clip": 0.01116705, "auxiliary_loss_mlp": 0.01081683, "balance_loss_clip": 1.02472436, "balance_loss_mlp": 1.0262754, "epoch": 0.07142642416954757, "flos": 15668285846400.0, "grad_norm": 2.7678316515069654, "language_loss": 0.8119632, "learning_rate": 3.9820255202547845e-06, "loss": 0.83394706, "num_input_tokens_seen": 25347875, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.90625, "step": 1188, "time_per_iteration": 2.4028849601745605 }, { "auxiliary_loss_clip": 0.01115977, "auxiliary_loss_mlp": 0.0108913, "balance_loss_clip": 1.02346921, "balance_loss_mlp": 1.02768111, "epoch": 0.07148654742221554, "flos": 19754310971520.0, "grad_norm": 2.0267495769136916, "language_loss": 0.87510866, "learning_rate": 3.981973385410981e-06, "loss": 0.8971597, "num_input_tokens_seen": 25366715, "router_z_loss_clip": 0.65625, "router_z_loss_mlp": 0.8828125, "step": 1189, "time_per_iteration": 2.384150981903076 }, { "auxiliary_loss_clip": 0.01114505, "auxiliary_loss_mlp": 0.0107604, "balance_loss_clip": 1.01865232, "balance_loss_mlp": 1.0250957, "epoch": 0.07154667067488352, "flos": 23470850511360.0, "grad_norm": 1.6568544516657202, "language_loss": 0.79731643, "learning_rate": 3.9819211754105494e-06, "loss": 0.81922185, "num_input_tokens_seen": 25385450, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.89453125, "step": 1190, "time_per_iteration": 2.417663812637329 }, { "auxiliary_loss_clip": 0.0112034, "auxiliary_loss_mlp": 0.01089419, "balance_loss_clip": 1.02547491, "balance_loss_mlp": 1.0268836, "epoch": 0.07160679392755148, "flos": 18331949381760.0, "grad_norm": 2.164297692622935, "language_loss": 0.7927351, "learning_rate": 3.981868890255468e-06, "loss": 0.81483269, "num_input_tokens_seen": 25403940, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 0.93359375, "step": 1191, "time_per_iteration": 2.3940494060516357 }, { "auxiliary_loss_clip": 0.01116035, "auxiliary_loss_mlp": 0.01083933, "balance_loss_clip": 1.0195601, "balance_loss_mlp": 1.02545905, "epoch": 0.07166691718021945, "flos": 17746106901120.0, "grad_norm": 3.704578111152643, "language_loss": 0.77571988, "learning_rate": 3.981816529947719e-06, "loss": 0.79771954, "num_input_tokens_seen": 25420410, "router_z_loss_clip": 0.64453125, "router_z_loss_mlp": 0.90625, "step": 1192, "time_per_iteration": 2.3752057552337646 }, { "auxiliary_loss_clip": 0.01115085, "auxiliary_loss_mlp": 0.01076688, "balance_loss_clip": 1.01872897, "balance_loss_mlp": 1.02501416, "epoch": 0.07172704043288743, "flos": 22450932696960.0, "grad_norm": 2.235966015789626, "language_loss": 0.80903035, "learning_rate": 3.9817640944892896e-06, "loss": 0.83094811, "num_input_tokens_seen": 25439415, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.8984375, "step": 1193, "time_per_iteration": 2.41412091255188 }, { "auxiliary_loss_clip": 0.01115787, "auxiliary_loss_mlp": 0.01075892, "balance_loss_clip": 1.01845717, "balance_loss_mlp": 1.02688646, "epoch": 0.07178716368555539, "flos": 23221081578240.0, "grad_norm": 2.0258026916721956, "language_loss": 0.89008176, "learning_rate": 3.981711583882166e-06, "loss": 0.91199857, "num_input_tokens_seen": 25458715, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.890625, "step": 1194, "time_per_iteration": 2.401995897293091 }, { "auxiliary_loss_clip": 0.01115104, "auxiliary_loss_mlp": 0.01072062, "balance_loss_clip": 1.01834655, "balance_loss_mlp": 1.02682126, "epoch": 0.07184728693822336, "flos": 25149788749440.0, "grad_norm": 2.0168134213511455, "language_loss": 0.83536619, "learning_rate": 3.981658998128341e-06, "loss": 0.85723782, "num_input_tokens_seen": 25477985, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.8828125, "step": 1195, "time_per_iteration": 2.4544694423675537 }, { "auxiliary_loss_clip": 0.01112442, "auxiliary_loss_mlp": 0.010746, "balance_loss_clip": 1.01945353, "balance_loss_mlp": 1.02524376, "epoch": 0.07190741019089132, "flos": 22710127697280.0, "grad_norm": 1.75718719432, "language_loss": 0.80885106, "learning_rate": 3.981606337229808e-06, "loss": 0.8307215, "num_input_tokens_seen": 25497110, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.87109375, "step": 1196, "time_per_iteration": 2.4209485054016113 }, { "auxiliary_loss_clip": 0.01112327, "auxiliary_loss_mlp": 0.01081033, "balance_loss_clip": 1.02319264, "balance_loss_mlp": 1.02397537, "epoch": 0.0719675334435593, "flos": 29348548254720.0, "grad_norm": 2.242594260065792, "language_loss": 0.75135398, "learning_rate": 3.9815536011885655e-06, "loss": 0.77328753, "num_input_tokens_seen": 25516555, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.8828125, "step": 1197, "time_per_iteration": 2.451167583465576 }, { "auxiliary_loss_clip": 0.01113114, "auxiliary_loss_mlp": 0.01081348, "balance_loss_clip": 1.02367496, "balance_loss_mlp": 1.02495384, "epoch": 0.07202765669622727, "flos": 17638818693120.0, "grad_norm": 2.0360962175838435, "language_loss": 0.87672806, "learning_rate": 3.98150079000661e-06, "loss": 0.89867264, "num_input_tokens_seen": 25533895, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.8828125, "step": 1198, "time_per_iteration": 2.3775522708892822 }, { "auxiliary_loss_clip": 0.01112577, "auxiliary_loss_mlp": 0.01076277, "balance_loss_clip": 1.01998663, "balance_loss_mlp": 1.02645516, "epoch": 0.07208777994889523, "flos": 21432969918720.0, "grad_norm": 2.091016387007963, "language_loss": 0.86180282, "learning_rate": 3.981447903685947e-06, "loss": 0.88369143, "num_input_tokens_seen": 25554195, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.86328125, "step": 1199, "time_per_iteration": 2.4240610599517822 }, { "auxiliary_loss_clip": 0.01112717, "auxiliary_loss_mlp": 0.01075554, "balance_loss_clip": 1.02369833, "balance_loss_mlp": 1.0278995, "epoch": 0.07214790320156321, "flos": 26939715799680.0, "grad_norm": 2.0905177417523686, "language_loss": 0.78654313, "learning_rate": 3.981394942228581e-06, "loss": 0.8084259, "num_input_tokens_seen": 25574155, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.84765625, "step": 1200, "time_per_iteration": 2.464064121246338 }, { "auxiliary_loss_clip": 0.0110905, "auxiliary_loss_mlp": 0.01073413, "balance_loss_clip": 1.0208416, "balance_loss_mlp": 1.02513266, "epoch": 0.07220802645423118, "flos": 23878775370240.0, "grad_norm": 1.9300602262812165, "language_loss": 0.84572875, "learning_rate": 3.98134190563652e-06, "loss": 0.86755347, "num_input_tokens_seen": 25592735, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.83984375, "step": 1201, "time_per_iteration": 2.4499619007110596 }, { "auxiliary_loss_clip": 0.01114972, "auxiliary_loss_mlp": 0.01073777, "balance_loss_clip": 1.02146769, "balance_loss_mlp": 1.02601802, "epoch": 0.07226814970689914, "flos": 19242658863360.0, "grad_norm": 2.5938350055687387, "language_loss": 0.72377568, "learning_rate": 3.981288793911775e-06, "loss": 0.74566323, "num_input_tokens_seen": 25611510, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.890625, "step": 1202, "time_per_iteration": 3.8819968700408936 }, { "auxiliary_loss_clip": 0.01113223, "auxiliary_loss_mlp": 0.01079972, "balance_loss_clip": 1.02437234, "balance_loss_mlp": 1.02635658, "epoch": 0.07232827295956712, "flos": 19171017020160.0, "grad_norm": 6.548318522025175, "language_loss": 0.89312029, "learning_rate": 3.98123560705636e-06, "loss": 0.91505218, "num_input_tokens_seen": 25629560, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.87109375, "step": 1203, "time_per_iteration": 2.3944551944732666 }, { "auxiliary_loss_clip": 0.01111437, "auxiliary_loss_mlp": 0.01074299, "balance_loss_clip": 1.02134633, "balance_loss_mlp": 1.02428722, "epoch": 0.07238839621223508, "flos": 17638783781760.0, "grad_norm": 1.9967232649731343, "language_loss": 0.81829417, "learning_rate": 3.981182345072293e-06, "loss": 0.84015155, "num_input_tokens_seen": 25648330, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.87109375, "step": 1204, "time_per_iteration": 5.206207990646362 }, { "auxiliary_loss_clip": 0.01113144, "auxiliary_loss_mlp": 0.01073809, "balance_loss_clip": 1.02049828, "balance_loss_mlp": 1.02529883, "epoch": 0.07244851946490305, "flos": 28291168684800.0, "grad_norm": 3.040804559294911, "language_loss": 0.84102583, "learning_rate": 3.981129007961593e-06, "loss": 0.86289537, "num_input_tokens_seen": 25669470, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.87890625, "step": 1205, "time_per_iteration": 3.8415613174438477 }, { "auxiliary_loss_clip": 0.01112206, "auxiliary_loss_mlp": 0.01068996, "balance_loss_clip": 1.01792717, "balance_loss_mlp": 1.0258503, "epoch": 0.07250864271757101, "flos": 22563736899840.0, "grad_norm": 1.8308996784701876, "language_loss": 0.78907037, "learning_rate": 3.981075595726283e-06, "loss": 0.81088239, "num_input_tokens_seen": 25690470, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.86328125, "step": 1206, "time_per_iteration": 2.4230120182037354 }, { "auxiliary_loss_clip": 0.01109095, "auxiliary_loss_mlp": 0.01069985, "balance_loss_clip": 1.01879692, "balance_loss_mlp": 1.0238272, "epoch": 0.072568765970239, "flos": 21761328055680.0, "grad_norm": 1.8639185009385169, "language_loss": 0.79852414, "learning_rate": 3.981022108368387e-06, "loss": 0.82031488, "num_input_tokens_seen": 25709205, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.8515625, "step": 1207, "time_per_iteration": 2.4163947105407715 }, { "auxiliary_loss_clip": 0.01106544, "auxiliary_loss_mlp": 0.01074992, "balance_loss_clip": 1.02742767, "balance_loss_mlp": 1.0255847, "epoch": 0.07262888922290696, "flos": 25518541196160.0, "grad_norm": 2.1194911961140206, "language_loss": 0.82011855, "learning_rate": 3.9809685458899345e-06, "loss": 0.84193391, "num_input_tokens_seen": 25728485, "router_z_loss_clip": 0.47460938, "router_z_loss_mlp": 0.8125, "step": 1208, "time_per_iteration": 2.433619260787964 }, { "auxiliary_loss_clip": 0.01105386, "auxiliary_loss_mlp": 0.01063465, "balance_loss_clip": 1.01687789, "balance_loss_mlp": 1.02401137, "epoch": 0.07268901247557492, "flos": 21245626229760.0, "grad_norm": 1.862190047866094, "language_loss": 0.80501086, "learning_rate": 3.980914908292955e-06, "loss": 0.82669938, "num_input_tokens_seen": 25747730, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 0.8125, "step": 1209, "time_per_iteration": 2.4045615196228027 }, { "auxiliary_loss_clip": 0.01109715, "auxiliary_loss_mlp": 0.01067112, "balance_loss_clip": 1.02071548, "balance_loss_mlp": 1.02487123, "epoch": 0.0727491357282429, "flos": 25478251620480.0, "grad_norm": 2.7676543826676516, "language_loss": 0.84162784, "learning_rate": 3.980861195579486e-06, "loss": 0.86339611, "num_input_tokens_seen": 25768050, "router_z_loss_clip": 0.46289062, "router_z_loss_mlp": 0.84765625, "step": 1210, "time_per_iteration": 2.424144983291626 }, { "auxiliary_loss_clip": 0.0110776, "auxiliary_loss_mlp": 0.01065589, "balance_loss_clip": 1.01943123, "balance_loss_mlp": 1.02581787, "epoch": 0.07280925898091087, "flos": 24461021980800.0, "grad_norm": 1.8249884414728295, "language_loss": 0.87354827, "learning_rate": 3.98080740775156e-06, "loss": 0.89528167, "num_input_tokens_seen": 25787985, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.8203125, "step": 1211, "time_per_iteration": 2.4516186714172363 }, { "auxiliary_loss_clip": 0.01105759, "auxiliary_loss_mlp": 0.01074731, "balance_loss_clip": 1.0216347, "balance_loss_mlp": 1.02405953, "epoch": 0.07286938223357883, "flos": 18287435531520.0, "grad_norm": 2.3175766095114683, "language_loss": 0.94591117, "learning_rate": 3.98075354481122e-06, "loss": 0.96771604, "num_input_tokens_seen": 25803620, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.81640625, "step": 1212, "time_per_iteration": 2.3611464500427246 }, { "auxiliary_loss_clip": 0.01105698, "auxiliary_loss_mlp": 0.01072169, "balance_loss_clip": 1.02145743, "balance_loss_mlp": 1.02371264, "epoch": 0.07292950548624681, "flos": 21213750291840.0, "grad_norm": 1.8241637870983252, "language_loss": 0.75461155, "learning_rate": 3.9806996067605055e-06, "loss": 0.77639019, "num_input_tokens_seen": 25823315, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.8203125, "step": 1213, "time_per_iteration": 2.4256136417388916 }, { "auxiliary_loss_clip": 0.01109816, "auxiliary_loss_mlp": 0.01068884, "balance_loss_clip": 1.01740909, "balance_loss_mlp": 1.02486157, "epoch": 0.07298962873891478, "flos": 24640929550080.0, "grad_norm": 1.6581661244144796, "language_loss": 0.87027365, "learning_rate": 3.980645593601465e-06, "loss": 0.89206064, "num_input_tokens_seen": 25842605, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.84765625, "step": 1214, "time_per_iteration": 2.4261586666107178 }, { "auxiliary_loss_clip": 0.01111435, "auxiliary_loss_mlp": 0.0106732, "balance_loss_clip": 1.01434326, "balance_loss_mlp": 1.02481461, "epoch": 0.07304975199158274, "flos": 27051542484480.0, "grad_norm": 2.29191356886576, "language_loss": 0.87316763, "learning_rate": 3.980591505336144e-06, "loss": 0.89495516, "num_input_tokens_seen": 25863030, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.8671875, "step": 1215, "time_per_iteration": 2.4961042404174805 }, { "auxiliary_loss_clip": 0.01109385, "auxiliary_loss_mlp": 0.01074217, "balance_loss_clip": 1.01823676, "balance_loss_mlp": 1.02366066, "epoch": 0.07310987524425071, "flos": 33548075809920.0, "grad_norm": 1.6020325449163855, "language_loss": 0.83655798, "learning_rate": 3.980537341966595e-06, "loss": 0.85839403, "num_input_tokens_seen": 25888015, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.85546875, "step": 1216, "time_per_iteration": 2.53962779045105 }, { "auxiliary_loss_clip": 0.0111208, "auxiliary_loss_mlp": 0.01068369, "balance_loss_clip": 1.01422441, "balance_loss_mlp": 1.02654088, "epoch": 0.07316999849691869, "flos": 28109690104320.0, "grad_norm": 2.0949301455298226, "language_loss": 0.79516041, "learning_rate": 3.980483103494872e-06, "loss": 0.81696492, "num_input_tokens_seen": 25908660, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.85546875, "step": 1217, "time_per_iteration": 2.4651968479156494 }, { "auxiliary_loss_clip": 0.011084, "auxiliary_loss_mlp": 0.01076036, "balance_loss_clip": 1.02527654, "balance_loss_mlp": 1.02599311, "epoch": 0.07323012174958665, "flos": 14391721560960.0, "grad_norm": 1.8419408599874212, "language_loss": 0.88053071, "learning_rate": 3.98042878992303e-06, "loss": 0.90237504, "num_input_tokens_seen": 25927215, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.82421875, "step": 1218, "time_per_iteration": 2.389381170272827 }, { "auxiliary_loss_clip": 0.01108902, "auxiliary_loss_mlp": 0.0107483, "balance_loss_clip": 1.02161479, "balance_loss_mlp": 1.0239656, "epoch": 0.07329024500225462, "flos": 21615356194560.0, "grad_norm": 2.905376318049401, "language_loss": 0.8919282, "learning_rate": 3.9803744012531305e-06, "loss": 0.91376549, "num_input_tokens_seen": 25945500, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.8515625, "step": 1219, "time_per_iteration": 2.4132585525512695 }, { "auxiliary_loss_clip": 0.01105741, "auxiliary_loss_mlp": 0.01075677, "balance_loss_clip": 1.02150822, "balance_loss_mlp": 1.02352095, "epoch": 0.0733503682549226, "flos": 13223318267520.0, "grad_norm": 2.085097348374203, "language_loss": 0.87277043, "learning_rate": 3.980319937487235e-06, "loss": 0.89458454, "num_input_tokens_seen": 25963105, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.8203125, "step": 1220, "time_per_iteration": 2.3701112270355225 }, { "auxiliary_loss_clip": 0.01110239, "auxiliary_loss_mlp": 0.01076356, "balance_loss_clip": 1.01951706, "balance_loss_mlp": 1.02584934, "epoch": 0.07341049150759056, "flos": 20885915825280.0, "grad_norm": 2.7781244427758502, "language_loss": 0.81141949, "learning_rate": 3.98026539862741e-06, "loss": 0.83328545, "num_input_tokens_seen": 25981690, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.84375, "step": 1221, "time_per_iteration": 2.3969979286193848 }, { "auxiliary_loss_clip": 0.01109046, "auxiliary_loss_mlp": 0.0107691, "balance_loss_clip": 1.02324164, "balance_loss_mlp": 1.0254637, "epoch": 0.07347061476025853, "flos": 15412721627520.0, "grad_norm": 2.0837882954351112, "language_loss": 0.95044965, "learning_rate": 3.980210784675722e-06, "loss": 0.97230923, "num_input_tokens_seen": 25999890, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.8359375, "step": 1222, "time_per_iteration": 2.386704444885254 }, { "auxiliary_loss_clip": 0.01114441, "auxiliary_loss_mlp": 0.01075608, "balance_loss_clip": 1.02058101, "balance_loss_mlp": 1.02711535, "epoch": 0.0735307380129265, "flos": 11108070368640.0, "grad_norm": 2.369027230856239, "language_loss": 0.94117868, "learning_rate": 3.980156095634242e-06, "loss": 0.96307921, "num_input_tokens_seen": 26016445, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.875, "step": 1223, "time_per_iteration": 2.3978323936462402 }, { "auxiliary_loss_clip": 0.01112689, "auxiliary_loss_mlp": 0.01076749, "balance_loss_clip": 1.02296162, "balance_loss_mlp": 1.02690721, "epoch": 0.07359086126559447, "flos": 23731267409280.0, "grad_norm": 1.9497707885845108, "language_loss": 0.85614836, "learning_rate": 3.980101331505045e-06, "loss": 0.87804276, "num_input_tokens_seen": 26036080, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.859375, "step": 1224, "time_per_iteration": 2.4262356758117676 }, { "auxiliary_loss_clip": 0.01111227, "auxiliary_loss_mlp": 0.01085937, "balance_loss_clip": 1.02013373, "balance_loss_mlp": 1.02544844, "epoch": 0.07365098451826244, "flos": 20992296337920.0, "grad_norm": 1.9775315444808272, "language_loss": 0.86926222, "learning_rate": 3.9800464922902076e-06, "loss": 0.89123386, "num_input_tokens_seen": 26055805, "router_z_loss_clip": 0.65625, "router_z_loss_mlp": 0.859375, "step": 1225, "time_per_iteration": 2.4234533309936523 }, { "auxiliary_loss_clip": 0.01113064, "auxiliary_loss_mlp": 0.01075632, "balance_loss_clip": 1.02000856, "balance_loss_mlp": 1.02610576, "epoch": 0.0737111077709304, "flos": 19932682440960.0, "grad_norm": 1.8867982280421667, "language_loss": 0.93087101, "learning_rate": 3.979991577991808e-06, "loss": 0.95275795, "num_input_tokens_seen": 26073905, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.8671875, "step": 1226, "time_per_iteration": 2.382312059402466 }, { "auxiliary_loss_clip": 0.01121495, "auxiliary_loss_mlp": 0.01077993, "balance_loss_clip": 1.01981854, "balance_loss_mlp": 1.02823472, "epoch": 0.07377123102359838, "flos": 16580601250560.0, "grad_norm": 3.9106377283634415, "language_loss": 0.81725794, "learning_rate": 3.97993658861193e-06, "loss": 0.83925283, "num_input_tokens_seen": 26091700, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 0.9296875, "step": 1227, "time_per_iteration": 2.396059989929199 }, { "auxiliary_loss_clip": 0.01109074, "auxiliary_loss_mlp": 0.01078504, "balance_loss_clip": 1.01956725, "balance_loss_mlp": 1.02651048, "epoch": 0.07383135427626634, "flos": 28327338720000.0, "grad_norm": 1.6325203291660428, "language_loss": 0.87493557, "learning_rate": 3.9798815241526575e-06, "loss": 0.89681137, "num_input_tokens_seen": 26114105, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 0.828125, "step": 1228, "time_per_iteration": 2.465291976928711 }, { "auxiliary_loss_clip": 0.01109299, "auxiliary_loss_mlp": 0.01073676, "balance_loss_clip": 1.01771939, "balance_loss_mlp": 1.02482212, "epoch": 0.07389147752893431, "flos": 20046149959680.0, "grad_norm": 2.2455498681705266, "language_loss": 0.83000445, "learning_rate": 3.97982638461608e-06, "loss": 0.85183418, "num_input_tokens_seen": 26131165, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.84375, "step": 1229, "time_per_iteration": 2.4005115032196045 }, { "auxiliary_loss_clip": 0.01113006, "auxiliary_loss_mlp": 0.01078398, "balance_loss_clip": 1.01755404, "balance_loss_mlp": 1.02557695, "epoch": 0.07395160078160229, "flos": 18113148691200.0, "grad_norm": 2.103421717651121, "language_loss": 0.82628107, "learning_rate": 3.979771170004287e-06, "loss": 0.84819508, "num_input_tokens_seen": 26150040, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.875, "step": 1230, "time_per_iteration": 2.4553723335266113 }, { "auxiliary_loss_clip": 0.01104364, "auxiliary_loss_mlp": 0.01067144, "balance_loss_clip": 1.01521623, "balance_loss_mlp": 1.02369094, "epoch": 0.07401172403427025, "flos": 23585784307200.0, "grad_norm": 1.773209059859953, "language_loss": 0.84134996, "learning_rate": 3.979715880319372e-06, "loss": 0.86306506, "num_input_tokens_seen": 26169380, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.8046875, "step": 1231, "time_per_iteration": 2.4403328895568848 }, { "auxiliary_loss_clip": 0.01111281, "auxiliary_loss_mlp": 0.01080462, "balance_loss_clip": 1.02283573, "balance_loss_mlp": 1.02384484, "epoch": 0.07407184728693822, "flos": 26358691086720.0, "grad_norm": 2.375642219981619, "language_loss": 0.98807317, "learning_rate": 3.979660515563434e-06, "loss": 1.00999057, "num_input_tokens_seen": 26189420, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.875, "step": 1232, "time_per_iteration": 2.454683542251587 }, { "auxiliary_loss_clip": 0.0110602, "auxiliary_loss_mlp": 0.01077715, "balance_loss_clip": 1.02233052, "balance_loss_mlp": 1.02257073, "epoch": 0.0741319705396062, "flos": 22199348373120.0, "grad_norm": 1.641293591834297, "language_loss": 0.83461791, "learning_rate": 3.979605075738569e-06, "loss": 0.85645527, "num_input_tokens_seen": 26209300, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.8359375, "step": 1233, "time_per_iteration": 2.413576364517212 }, { "auxiliary_loss_clip": 0.01110332, "auxiliary_loss_mlp": 0.01076212, "balance_loss_clip": 1.01570177, "balance_loss_mlp": 1.02388155, "epoch": 0.07419209379227416, "flos": 39198978161280.0, "grad_norm": 2.0025818126493484, "language_loss": 0.73032165, "learning_rate": 3.979549560846883e-06, "loss": 0.75218701, "num_input_tokens_seen": 26228110, "router_z_loss_clip": 0.60546875, "router_z_loss_mlp": 0.8671875, "step": 1234, "time_per_iteration": 2.5814261436462402 }, { "auxiliary_loss_clip": 0.01108463, "auxiliary_loss_mlp": 0.01074801, "balance_loss_clip": 1.01767635, "balance_loss_mlp": 1.02383804, "epoch": 0.07425221704494213, "flos": 22780617465600.0, "grad_norm": 2.004035315832205, "language_loss": 0.79149365, "learning_rate": 3.979493970890478e-06, "loss": 0.81332636, "num_input_tokens_seen": 26247020, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.84765625, "step": 1235, "time_per_iteration": 2.411708116531372 }, { "auxiliary_loss_clip": 0.01108472, "auxiliary_loss_mlp": 0.01066522, "balance_loss_clip": 1.01163781, "balance_loss_mlp": 1.02327132, "epoch": 0.0743123402976101, "flos": 22271897911680.0, "grad_norm": 1.935686887854904, "language_loss": 0.84948778, "learning_rate": 3.979438305871464e-06, "loss": 0.8712377, "num_input_tokens_seen": 26265750, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.8515625, "step": 1236, "time_per_iteration": 2.4159562587738037 }, { "auxiliary_loss_clip": 0.01111564, "auxiliary_loss_mlp": 0.01078823, "balance_loss_clip": 1.01988626, "balance_loss_mlp": 1.02411819, "epoch": 0.07437246355027807, "flos": 29313739762560.0, "grad_norm": 2.279897298121074, "language_loss": 0.78992748, "learning_rate": 3.979382565791951e-06, "loss": 0.81183136, "num_input_tokens_seen": 26287905, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 0.875, "step": 1237, "time_per_iteration": 2.489786386489868 }, { "auxiliary_loss_clip": 0.01110402, "auxiliary_loss_mlp": 0.01075822, "balance_loss_clip": 1.01969862, "balance_loss_mlp": 1.02342427, "epoch": 0.07443258680294604, "flos": 31943293032960.0, "grad_norm": 1.5839640980228518, "language_loss": 0.79452771, "learning_rate": 3.979326750654053e-06, "loss": 0.81638992, "num_input_tokens_seen": 26311795, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.8671875, "step": 1238, "time_per_iteration": 2.5114190578460693 }, { "auxiliary_loss_clip": 0.0111158, "auxiliary_loss_mlp": 0.01084705, "balance_loss_clip": 1.02221525, "balance_loss_mlp": 1.02460039, "epoch": 0.074492710055614, "flos": 22674167130240.0, "grad_norm": 2.0712338448934906, "language_loss": 0.88884497, "learning_rate": 3.9792708604598854e-06, "loss": 0.91080785, "num_input_tokens_seen": 26330330, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.87109375, "step": 1239, "time_per_iteration": 2.3974335193634033 }, { "auxiliary_loss_clip": 0.01112152, "auxiliary_loss_mlp": 0.01082096, "balance_loss_clip": 1.01686513, "balance_loss_mlp": 1.02490366, "epoch": 0.07455283330828198, "flos": 21283925857920.0, "grad_norm": 1.9958091208716677, "language_loss": 0.91966289, "learning_rate": 3.979214895211569e-06, "loss": 0.94160545, "num_input_tokens_seen": 26348865, "router_z_loss_clip": 0.65234375, "router_z_loss_mlp": 0.875, "step": 1240, "time_per_iteration": 2.414065361022949 }, { "auxiliary_loss_clip": 0.01109898, "auxiliary_loss_mlp": 0.01080189, "balance_loss_clip": 1.02218163, "balance_loss_mlp": 1.02561247, "epoch": 0.07461295656094995, "flos": 24387285456000.0, "grad_norm": 1.66041345755512, "language_loss": 0.91123086, "learning_rate": 3.979158854911225e-06, "loss": 0.93313169, "num_input_tokens_seen": 26368210, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 0.84375, "step": 1241, "time_per_iteration": 3.871889591217041 }, { "auxiliary_loss_clip": 0.01031941, "auxiliary_loss_mlp": 0.01019023, "balance_loss_clip": 1.00652981, "balance_loss_mlp": 1.00984669, "epoch": 0.07467307981361791, "flos": 62106608753280.0, "grad_norm": 0.9086997194473594, "language_loss": 0.63194132, "learning_rate": 3.979102739560979e-06, "loss": 0.65245092, "num_input_tokens_seen": 26424890, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.22070312, "step": 1242, "time_per_iteration": 3.0794472694396973 }, { "auxiliary_loss_clip": 0.01120338, "auxiliary_loss_mlp": 0.01082743, "balance_loss_clip": 1.01903737, "balance_loss_mlp": 1.02761149, "epoch": 0.07473320306628589, "flos": 24861999479040.0, "grad_norm": 2.705871379581421, "language_loss": 0.66069138, "learning_rate": 3.9790465491629595e-06, "loss": 0.68272221, "num_input_tokens_seen": 26446405, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 0.9296875, "step": 1243, "time_per_iteration": 3.8724122047424316 }, { "auxiliary_loss_clip": 0.01108328, "auxiliary_loss_mlp": 0.01071227, "balance_loss_clip": 1.01634264, "balance_loss_mlp": 1.02450335, "epoch": 0.07479332631895386, "flos": 24896354123520.0, "grad_norm": 1.800178602083535, "language_loss": 0.78938186, "learning_rate": 3.978990283719296e-06, "loss": 0.81117737, "num_input_tokens_seen": 26466070, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.8359375, "step": 1244, "time_per_iteration": 3.9459540843963623 }, { "auxiliary_loss_clip": 0.01116793, "auxiliary_loss_mlp": 0.01084247, "balance_loss_clip": 1.02387989, "balance_loss_mlp": 1.02684224, "epoch": 0.07485344957162182, "flos": 17814467076480.0, "grad_norm": 2.757158346905854, "language_loss": 0.72362882, "learning_rate": 3.978933943232123e-06, "loss": 0.7456392, "num_input_tokens_seen": 26479350, "router_z_loss_clip": 0.60546875, "router_z_loss_mlp": 0.8984375, "step": 1245, "time_per_iteration": 2.3565220832824707 }, { "auxiliary_loss_clip": 0.01111562, "auxiliary_loss_mlp": 0.01085486, "balance_loss_clip": 1.02514184, "balance_loss_mlp": 1.02552235, "epoch": 0.0749135728242898, "flos": 25009018680960.0, "grad_norm": 1.9986136969655295, "language_loss": 0.90960586, "learning_rate": 3.978877527703576e-06, "loss": 0.93157637, "num_input_tokens_seen": 26498255, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.86328125, "step": 1246, "time_per_iteration": 3.7871716022491455 }, { "auxiliary_loss_clip": 0.01121845, "auxiliary_loss_mlp": 0.01089216, "balance_loss_clip": 1.02717948, "balance_loss_mlp": 1.02655077, "epoch": 0.07497369607695777, "flos": 17821100234880.0, "grad_norm": 3.1104411953872164, "language_loss": 0.92641199, "learning_rate": 3.9788210371357945e-06, "loss": 0.94852263, "num_input_tokens_seen": 26515375, "router_z_loss_clip": 0.62109375, "router_z_loss_mlp": 0.953125, "step": 1247, "time_per_iteration": 2.3781256675720215 }, { "auxiliary_loss_clip": 0.01109982, "auxiliary_loss_mlp": 0.01076662, "balance_loss_clip": 1.0206337, "balance_loss_mlp": 1.0251646, "epoch": 0.07503381932962573, "flos": 15120219323520.0, "grad_norm": 2.321477048376059, "language_loss": 0.6819427, "learning_rate": 3.978764471530921e-06, "loss": 0.70380914, "num_input_tokens_seen": 26533595, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.84765625, "step": 1248, "time_per_iteration": 2.3754279613494873 }, { "auxiliary_loss_clip": 0.0110812, "auxiliary_loss_mlp": 0.01078073, "balance_loss_clip": 1.02323639, "balance_loss_mlp": 1.02528, "epoch": 0.0750939425822937, "flos": 12816091635840.0, "grad_norm": 2.4957260738878477, "language_loss": 0.76485664, "learning_rate": 3.978707830891102e-06, "loss": 0.78671861, "num_input_tokens_seen": 26549405, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.828125, "step": 1249, "time_per_iteration": 2.377328395843506 }, { "auxiliary_loss_clip": 0.01114947, "auxiliary_loss_mlp": 0.01083832, "balance_loss_clip": 1.02391791, "balance_loss_mlp": 1.02607584, "epoch": 0.07515406583496168, "flos": 24205702141440.0, "grad_norm": 2.5642893747076525, "language_loss": 0.84517086, "learning_rate": 3.978651115218482e-06, "loss": 0.86715871, "num_input_tokens_seen": 26567200, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.88671875, "step": 1250, "time_per_iteration": 2.415754556655884 }, { "auxiliary_loss_clip": 0.01109544, "auxiliary_loss_mlp": 0.01077668, "balance_loss_clip": 1.0204953, "balance_loss_mlp": 1.02603602, "epoch": 0.07521418908762964, "flos": 26686944489600.0, "grad_norm": 2.3179918389725236, "language_loss": 0.70440054, "learning_rate": 3.978594324515215e-06, "loss": 0.72627264, "num_input_tokens_seen": 26586190, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.8359375, "step": 1251, "time_per_iteration": 2.4797544479370117 }, { "auxiliary_loss_clip": 0.01031316, "auxiliary_loss_mlp": 0.01013148, "balance_loss_clip": 1.00318182, "balance_loss_mlp": 1.00989604, "epoch": 0.0752743123402976, "flos": 59091788096640.0, "grad_norm": 0.9691640070571871, "language_loss": 0.70728672, "learning_rate": 3.9785374587834515e-06, "loss": 0.72773135, "num_input_tokens_seen": 26650710, "router_z_loss_clip": 0.09960938, "router_z_loss_mlp": 0.21484375, "step": 1252, "time_per_iteration": 3.060161590576172 }, { "auxiliary_loss_clip": 0.01114069, "auxiliary_loss_mlp": 0.01074152, "balance_loss_clip": 1.01807642, "balance_loss_mlp": 1.02649546, "epoch": 0.07533443559296558, "flos": 23475912658560.0, "grad_norm": 3.4173303279108067, "language_loss": 0.81790322, "learning_rate": 3.97848051802535e-06, "loss": 0.83978546, "num_input_tokens_seen": 26669000, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.875, "step": 1253, "time_per_iteration": 2.4284822940826416 }, { "auxiliary_loss_clip": 0.01114018, "auxiliary_loss_mlp": 0.01076051, "balance_loss_clip": 1.01508713, "balance_loss_mlp": 1.02835727, "epoch": 0.07539455884563355, "flos": 20878270237440.0, "grad_norm": 2.3738502271501973, "language_loss": 0.97283733, "learning_rate": 3.978423502243069e-06, "loss": 0.9947381, "num_input_tokens_seen": 26683075, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.85546875, "step": 1254, "time_per_iteration": 2.3995022773742676 }, { "auxiliary_loss_clip": 0.01109642, "auxiliary_loss_mlp": 0.01073235, "balance_loss_clip": 1.0222131, "balance_loss_mlp": 1.02569199, "epoch": 0.07545468209830151, "flos": 27671669786880.0, "grad_norm": 1.765312967636275, "language_loss": 0.90268552, "learning_rate": 3.97836641143877e-06, "loss": 0.92451429, "num_input_tokens_seen": 26701875, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.83984375, "step": 1255, "time_per_iteration": 2.472668409347534 }, { "auxiliary_loss_clip": 0.01109255, "auxiliary_loss_mlp": 0.01090003, "balance_loss_clip": 1.02791905, "balance_loss_mlp": 1.02452517, "epoch": 0.0755148053509695, "flos": 14136122430720.0, "grad_norm": 1.7580557517016258, "language_loss": 0.81169355, "learning_rate": 3.978309245614618e-06, "loss": 0.83368611, "num_input_tokens_seen": 26719050, "router_z_loss_clip": 0.62109375, "router_z_loss_mlp": 0.84765625, "step": 1256, "time_per_iteration": 2.3847219944000244 }, { "auxiliary_loss_clip": 0.01027411, "auxiliary_loss_mlp": 0.01026582, "balance_loss_clip": 1.01389813, "balance_loss_mlp": 1.00679183, "epoch": 0.07557492860363746, "flos": 58232506780800.0, "grad_norm": 0.7958810759124006, "language_loss": 0.58199626, "learning_rate": 3.9782520047727825e-06, "loss": 0.6025362, "num_input_tokens_seen": 26780650, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.20605469, "step": 1257, "time_per_iteration": 3.1301071643829346 }, { "auxiliary_loss_clip": 0.01110813, "auxiliary_loss_mlp": 0.01086886, "balance_loss_clip": 1.0254221, "balance_loss_mlp": 1.02576709, "epoch": 0.07563505185630542, "flos": 24643233699840.0, "grad_norm": 2.10426766113452, "language_loss": 0.92514712, "learning_rate": 3.978194688915432e-06, "loss": 0.94712406, "num_input_tokens_seen": 26798725, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.8515625, "step": 1258, "time_per_iteration": 2.415243148803711 }, { "auxiliary_loss_clip": 0.01105832, "auxiliary_loss_mlp": 0.01078805, "balance_loss_clip": 1.02330077, "balance_loss_mlp": 1.02439606, "epoch": 0.07569517510897339, "flos": 15522104517120.0, "grad_norm": 2.5227415239909043, "language_loss": 0.82888377, "learning_rate": 3.978137298044741e-06, "loss": 0.85073018, "num_input_tokens_seen": 26817005, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.8125, "step": 1259, "time_per_iteration": 2.3969428539276123 }, { "auxiliary_loss_clip": 0.0111098, "auxiliary_loss_mlp": 0.01080769, "balance_loss_clip": 1.0237391, "balance_loss_mlp": 1.0241338, "epoch": 0.07575529836164137, "flos": 22927462110720.0, "grad_norm": 2.247022111498786, "language_loss": 0.7849946, "learning_rate": 3.978079832162885e-06, "loss": 0.80691212, "num_input_tokens_seen": 26836655, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.8671875, "step": 1260, "time_per_iteration": 2.4059765338897705 }, { "auxiliary_loss_clip": 0.01112623, "auxiliary_loss_mlp": 0.01083664, "balance_loss_clip": 1.0251081, "balance_loss_mlp": 1.02541876, "epoch": 0.07581542161430933, "flos": 19499410068480.0, "grad_norm": 1.7031442565244528, "language_loss": 0.87311423, "learning_rate": 3.978022291272044e-06, "loss": 0.89507705, "num_input_tokens_seen": 26854925, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.87109375, "step": 1261, "time_per_iteration": 2.4009759426116943 }, { "auxiliary_loss_clip": 0.01116196, "auxiliary_loss_mlp": 0.01082479, "balance_loss_clip": 1.02974129, "balance_loss_mlp": 1.02728581, "epoch": 0.0758755448669773, "flos": 24972290064000.0, "grad_norm": 1.8298346067670568, "language_loss": 0.84464765, "learning_rate": 3.977964675374399e-06, "loss": 0.86663449, "num_input_tokens_seen": 26876170, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.890625, "step": 1262, "time_per_iteration": 2.4443039894104004 }, { "auxiliary_loss_clip": 0.01111299, "auxiliary_loss_mlp": 0.01080911, "balance_loss_clip": 1.02464437, "balance_loss_mlp": 1.02586579, "epoch": 0.07593566811964528, "flos": 22746856314240.0, "grad_norm": 2.8306592051255066, "language_loss": 0.86415261, "learning_rate": 3.977906984472136e-06, "loss": 0.88607466, "num_input_tokens_seen": 26895005, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.85546875, "step": 1263, "time_per_iteration": 2.423081159591675 }, { "auxiliary_loss_clip": 0.0111389, "auxiliary_loss_mlp": 0.01078518, "balance_loss_clip": 1.02301455, "balance_loss_mlp": 1.02697253, "epoch": 0.07599579137231324, "flos": 23111279752320.0, "grad_norm": 1.8937584893590937, "language_loss": 0.78919494, "learning_rate": 3.977849218567442e-06, "loss": 0.81111908, "num_input_tokens_seen": 26913930, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.8671875, "step": 1264, "time_per_iteration": 2.4227380752563477 }, { "auxiliary_loss_clip": 0.01115976, "auxiliary_loss_mlp": 0.01084785, "balance_loss_clip": 1.02642047, "balance_loss_mlp": 1.0276103, "epoch": 0.07605591462498121, "flos": 14501174273280.0, "grad_norm": 5.836269019396336, "language_loss": 0.84655112, "learning_rate": 3.977791377662507e-06, "loss": 0.8685587, "num_input_tokens_seen": 26931485, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.8828125, "step": 1265, "time_per_iteration": 2.374069929122925 }, { "auxiliary_loss_clip": 0.0111927, "auxiliary_loss_mlp": 0.0108751, "balance_loss_clip": 1.02766705, "balance_loss_mlp": 1.02913237, "epoch": 0.07611603787764919, "flos": 23513060211840.0, "grad_norm": 2.801887092896207, "language_loss": 0.68210065, "learning_rate": 3.977733461759524e-06, "loss": 0.70416844, "num_input_tokens_seen": 26951670, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.90234375, "step": 1266, "time_per_iteration": 2.4233310222625732 }, { "auxiliary_loss_clip": 0.01118561, "auxiliary_loss_mlp": 0.01083002, "balance_loss_clip": 1.02563834, "balance_loss_mlp": 1.02844703, "epoch": 0.07617616113031715, "flos": 21506112950400.0, "grad_norm": 2.138864087768607, "language_loss": 0.84271103, "learning_rate": 3.977675470860691e-06, "loss": 0.86472666, "num_input_tokens_seen": 26970335, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.90234375, "step": 1267, "time_per_iteration": 2.4329049587249756 }, { "auxiliary_loss_clip": 0.01114862, "auxiliary_loss_mlp": 0.0108758, "balance_loss_clip": 1.03214765, "balance_loss_mlp": 1.02739859, "epoch": 0.07623628438298512, "flos": 14572327357440.0, "grad_norm": 2.771044179461256, "language_loss": 0.76195407, "learning_rate": 3.977617404968205e-06, "loss": 0.78397858, "num_input_tokens_seen": 26986025, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.875, "step": 1268, "time_per_iteration": 2.371276378631592 }, { "auxiliary_loss_clip": 0.01118232, "auxiliary_loss_mlp": 0.01085413, "balance_loss_clip": 1.02945626, "balance_loss_mlp": 1.02741373, "epoch": 0.07629640763565308, "flos": 14719521116160.0, "grad_norm": 2.19354742090999, "language_loss": 0.85878783, "learning_rate": 3.977559264084269e-06, "loss": 0.88082427, "num_input_tokens_seen": 27004045, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.91015625, "step": 1269, "time_per_iteration": 2.4100465774536133 }, { "auxiliary_loss_clip": 0.01117596, "auxiliary_loss_mlp": 0.01087871, "balance_loss_clip": 1.02993524, "balance_loss_mlp": 1.02872252, "epoch": 0.07635653088832106, "flos": 14902047037440.0, "grad_norm": 2.5239472678751866, "language_loss": 0.92882013, "learning_rate": 3.977501048211088e-06, "loss": 0.95087475, "num_input_tokens_seen": 27022070, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.890625, "step": 1270, "time_per_iteration": 2.390667200088501 }, { "auxiliary_loss_clip": 0.0111819, "auxiliary_loss_mlp": 0.01088361, "balance_loss_clip": 1.02823234, "balance_loss_mlp": 1.02841234, "epoch": 0.07641665414098903, "flos": 26650355518080.0, "grad_norm": 2.1356163807142554, "language_loss": 0.74612987, "learning_rate": 3.977442757350869e-06, "loss": 0.76819539, "num_input_tokens_seen": 27041755, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.8984375, "step": 1271, "time_per_iteration": 2.4645323753356934 }, { "auxiliary_loss_clip": 0.01106961, "auxiliary_loss_mlp": 0.01072809, "balance_loss_clip": 1.01928401, "balance_loss_mlp": 1.02486467, "epoch": 0.07647677739365699, "flos": 25191614424960.0, "grad_norm": 2.00499679941534, "language_loss": 0.83985341, "learning_rate": 3.977384391505823e-06, "loss": 0.86165106, "num_input_tokens_seen": 27061540, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 0.8203125, "step": 1272, "time_per_iteration": 2.453439950942993 }, { "auxiliary_loss_clip": 0.01112914, "auxiliary_loss_mlp": 0.01075436, "balance_loss_clip": 1.01936054, "balance_loss_mlp": 1.02516222, "epoch": 0.07653690064632497, "flos": 20557103840640.0, "grad_norm": 1.6337958630292642, "language_loss": 0.83530927, "learning_rate": 3.977325950678162e-06, "loss": 0.85719275, "num_input_tokens_seen": 27081395, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.87890625, "step": 1273, "time_per_iteration": 2.404423475265503 }, { "auxiliary_loss_clip": 0.01116261, "auxiliary_loss_mlp": 0.01080123, "balance_loss_clip": 1.02066171, "balance_loss_mlp": 1.02459669, "epoch": 0.07659702389899294, "flos": 22268336952960.0, "grad_norm": 1.7880935307707355, "language_loss": 0.83093613, "learning_rate": 3.977267434870103e-06, "loss": 0.85289991, "num_input_tokens_seen": 27101175, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.91796875, "step": 1274, "time_per_iteration": 2.426405429840088 }, { "auxiliary_loss_clip": 0.0111479, "auxiliary_loss_mlp": 0.0107794, "balance_loss_clip": 1.02069569, "balance_loss_mlp": 1.0269407, "epoch": 0.0766571471516609, "flos": 32634713064960.0, "grad_norm": 1.6413403518417475, "language_loss": 0.74679124, "learning_rate": 3.977208844083865e-06, "loss": 0.7687186, "num_input_tokens_seen": 27124505, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.87890625, "step": 1275, "time_per_iteration": 2.490630865097046 }, { "auxiliary_loss_clip": 0.01114042, "auxiliary_loss_mlp": 0.01087374, "balance_loss_clip": 1.02521873, "balance_loss_mlp": 1.02556527, "epoch": 0.07671727040432888, "flos": 15266505386880.0, "grad_norm": 2.124158014918776, "language_loss": 0.83142871, "learning_rate": 3.9771501783216685e-06, "loss": 0.85344291, "num_input_tokens_seen": 27140960, "router_z_loss_clip": 0.62109375, "router_z_loss_mlp": 0.8828125, "step": 1276, "time_per_iteration": 2.3857274055480957 }, { "auxiliary_loss_clip": 0.0111615, "auxiliary_loss_mlp": 0.01076865, "balance_loss_clip": 1.01792765, "balance_loss_mlp": 1.02478075, "epoch": 0.07677739365699685, "flos": 28182833136000.0, "grad_norm": 2.33514294870614, "language_loss": 0.6354655, "learning_rate": 3.97709143758574e-06, "loss": 0.6573956, "num_input_tokens_seen": 27160985, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 0.9140625, "step": 1277, "time_per_iteration": 2.4477226734161377 }, { "auxiliary_loss_clip": 0.01117647, "auxiliary_loss_mlp": 0.01082188, "balance_loss_clip": 1.02167702, "balance_loss_mlp": 1.02578163, "epoch": 0.07683751690966481, "flos": 18295150942080.0, "grad_norm": 2.7361114192969347, "language_loss": 0.78426409, "learning_rate": 3.977032621878305e-06, "loss": 0.80626243, "num_input_tokens_seen": 27178390, "router_z_loss_clip": 0.60546875, "router_z_loss_mlp": 0.91796875, "step": 1278, "time_per_iteration": 2.3961613178253174 }, { "auxiliary_loss_clip": 0.01111011, "auxiliary_loss_mlp": 0.01078639, "balance_loss_clip": 1.02032232, "balance_loss_mlp": 1.02468765, "epoch": 0.07689764016233278, "flos": 21980024012160.0, "grad_norm": 2.1504266101209026, "language_loss": 0.90395373, "learning_rate": 3.976973731201596e-06, "loss": 0.92585027, "num_input_tokens_seen": 27197505, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 0.86328125, "step": 1279, "time_per_iteration": 2.408963918685913 }, { "auxiliary_loss_clip": 0.01114736, "auxiliary_loss_mlp": 0.01081161, "balance_loss_clip": 1.02448916, "balance_loss_mlp": 1.02696049, "epoch": 0.07695776341500075, "flos": 22234924915200.0, "grad_norm": 2.903596671162025, "language_loss": 0.85071397, "learning_rate": 3.976914765557845e-06, "loss": 0.87267292, "num_input_tokens_seen": 27214260, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.87890625, "step": 1280, "time_per_iteration": 3.9086408615112305 }, { "auxiliary_loss_clip": 0.01116047, "auxiliary_loss_mlp": 0.01084506, "balance_loss_clip": 1.02199268, "balance_loss_mlp": 1.02780628, "epoch": 0.07701788666766872, "flos": 16142825312640.0, "grad_norm": 2.0221543100916013, "language_loss": 0.77778512, "learning_rate": 3.9768557249492875e-06, "loss": 0.79979062, "num_input_tokens_seen": 27232525, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.8828125, "step": 1281, "time_per_iteration": 2.4197499752044678 }, { "auxiliary_loss_clip": 0.01119853, "auxiliary_loss_mlp": 0.01084451, "balance_loss_clip": 1.02625275, "balance_loss_mlp": 1.02659404, "epoch": 0.07707800992033668, "flos": 19462053047040.0, "grad_norm": 1.8473089572523251, "language_loss": 0.78039145, "learning_rate": 3.9767966093781634e-06, "loss": 0.80243444, "num_input_tokens_seen": 27249800, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.9296875, "step": 1282, "time_per_iteration": 3.84993839263916 }, { "auxiliary_loss_clip": 0.01115278, "auxiliary_loss_mlp": 0.01084561, "balance_loss_clip": 1.0307976, "balance_loss_mlp": 1.02454948, "epoch": 0.07713813317300466, "flos": 18989259148800.0, "grad_norm": 2.4401431012498196, "language_loss": 0.85270166, "learning_rate": 3.976737418846713e-06, "loss": 0.87470013, "num_input_tokens_seen": 27268895, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.90625, "step": 1283, "time_per_iteration": 3.811868906021118 }, { "auxiliary_loss_clip": 0.01112127, "auxiliary_loss_mlp": 0.01088686, "balance_loss_clip": 1.02974927, "balance_loss_mlp": 1.02479434, "epoch": 0.07719825642567263, "flos": 18112974134400.0, "grad_norm": 2.2766967210165423, "language_loss": 0.77546883, "learning_rate": 3.976678153357181e-06, "loss": 0.79747695, "num_input_tokens_seen": 27288180, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 0.875, "step": 1284, "time_per_iteration": 2.3963985443115234 }, { "auxiliary_loss_clip": 0.01113136, "auxiliary_loss_mlp": 0.01080375, "balance_loss_clip": 1.02406073, "balance_loss_mlp": 1.02427959, "epoch": 0.0772583796783406, "flos": 42192780312960.0, "grad_norm": 2.1345674308908213, "language_loss": 0.78020114, "learning_rate": 3.976618812911817e-06, "loss": 0.80213624, "num_input_tokens_seen": 27311815, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.88671875, "step": 1285, "time_per_iteration": 3.9257423877716064 }, { "auxiliary_loss_clip": 0.01112151, "auxiliary_loss_mlp": 0.01081314, "balance_loss_clip": 1.02573895, "balance_loss_mlp": 1.02636755, "epoch": 0.07731850293100857, "flos": 24752546766720.0, "grad_norm": 1.9899012172378996, "language_loss": 0.86305422, "learning_rate": 3.9765593975128685e-06, "loss": 0.8849889, "num_input_tokens_seen": 27331890, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.859375, "step": 1286, "time_per_iteration": 2.4472496509552 }, { "auxiliary_loss_clip": 0.01115797, "auxiliary_loss_mlp": 0.01078622, "balance_loss_clip": 1.02040064, "balance_loss_mlp": 1.02593493, "epoch": 0.07737862618367654, "flos": 17564942522880.0, "grad_norm": 2.7013698388651735, "language_loss": 0.82241994, "learning_rate": 3.97649990716259e-06, "loss": 0.84436411, "num_input_tokens_seen": 27348320, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 0.8984375, "step": 1287, "time_per_iteration": 2.3523247241973877 }, { "auxiliary_loss_clip": 0.01113506, "auxiliary_loss_mlp": 0.01078623, "balance_loss_clip": 1.01923311, "balance_loss_mlp": 1.0250392, "epoch": 0.0774387494363445, "flos": 25626038872320.0, "grad_norm": 1.840611530581988, "language_loss": 0.86045706, "learning_rate": 3.976440341863237e-06, "loss": 0.88237834, "num_input_tokens_seen": 27367670, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.88671875, "step": 1288, "time_per_iteration": 2.4548630714416504 }, { "auxiliary_loss_clip": 0.0111625, "auxiliary_loss_mlp": 0.01086655, "balance_loss_clip": 1.02561998, "balance_loss_mlp": 1.02570438, "epoch": 0.07749887268901248, "flos": 12239046817920.0, "grad_norm": 2.220347960724826, "language_loss": 0.88918692, "learning_rate": 3.976380701617068e-06, "loss": 0.91121596, "num_input_tokens_seen": 27385485, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.90625, "step": 1289, "time_per_iteration": 2.36944580078125 }, { "auxiliary_loss_clip": 0.01114819, "auxiliary_loss_mlp": 0.0107578, "balance_loss_clip": 1.01746273, "balance_loss_mlp": 1.02522171, "epoch": 0.07755899594168045, "flos": 25080590701440.0, "grad_norm": 1.844656224417422, "language_loss": 0.87233949, "learning_rate": 3.976320986426344e-06, "loss": 0.89424545, "num_input_tokens_seen": 27405110, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 0.8984375, "step": 1290, "time_per_iteration": 2.450631618499756 }, { "auxiliary_loss_clip": 0.01109913, "auxiliary_loss_mlp": 0.01088995, "balance_loss_clip": 1.02924693, "balance_loss_mlp": 1.02527738, "epoch": 0.07761911919434841, "flos": 14245540231680.0, "grad_norm": 2.0480529724921737, "language_loss": 0.94071054, "learning_rate": 3.9762611962933315e-06, "loss": 0.96269959, "num_input_tokens_seen": 27422855, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.84765625, "step": 1291, "time_per_iteration": 2.3772730827331543 }, { "auxiliary_loss_clip": 0.01037002, "auxiliary_loss_mlp": 0.01070855, "balance_loss_clip": 1.06127095, "balance_loss_mlp": 1.01347315, "epoch": 0.07767924244701638, "flos": 67233463597440.0, "grad_norm": 0.9379739157002828, "language_loss": 0.65196186, "learning_rate": 3.9762013312202955e-06, "loss": 0.67304045, "num_input_tokens_seen": 27487190, "router_z_loss_clip": 0.09570312, "router_z_loss_mlp": 0.23535156, "step": 1292, "time_per_iteration": 3.140533924102783 }, { "auxiliary_loss_clip": 0.01111043, "auxiliary_loss_mlp": 0.0108101, "balance_loss_clip": 1.02264476, "balance_loss_mlp": 1.02411091, "epoch": 0.07773936569968436, "flos": 28549316344320.0, "grad_norm": 1.6000556653965194, "language_loss": 0.90237826, "learning_rate": 3.9761413912095075e-06, "loss": 0.92429888, "num_input_tokens_seen": 27510465, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 0.8671875, "step": 1293, "time_per_iteration": 2.4698164463043213 }, { "auxiliary_loss_clip": 0.01113567, "auxiliary_loss_mlp": 0.0108832, "balance_loss_clip": 1.02516317, "balance_loss_mlp": 1.02611494, "epoch": 0.07779948895235232, "flos": 27489039131520.0, "grad_norm": 2.3333879269140048, "language_loss": 0.88249624, "learning_rate": 3.976081376263239e-06, "loss": 0.90451515, "num_input_tokens_seen": 27528645, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.875, "step": 1294, "time_per_iteration": 2.50152325630188 }, { "auxiliary_loss_clip": 0.01117477, "auxiliary_loss_mlp": 0.01092092, "balance_loss_clip": 1.02714658, "balance_loss_mlp": 1.02734089, "epoch": 0.07785961220502029, "flos": 18222322112640.0, "grad_norm": 2.292976268986861, "language_loss": 0.82917792, "learning_rate": 3.976021286383768e-06, "loss": 0.85127366, "num_input_tokens_seen": 27546165, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 0.8984375, "step": 1295, "time_per_iteration": 2.3899710178375244 }, { "auxiliary_loss_clip": 0.01111525, "auxiliary_loss_mlp": 0.01088247, "balance_loss_clip": 1.02911878, "balance_loss_mlp": 1.02379036, "epoch": 0.07791973545768827, "flos": 24607063664640.0, "grad_norm": 1.977872379709689, "language_loss": 0.91423941, "learning_rate": 3.975961121573371e-06, "loss": 0.93623716, "num_input_tokens_seen": 27566520, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 0.875, "step": 1296, "time_per_iteration": 2.435633420944214 }, { "auxiliary_loss_clip": 0.0111575, "auxiliary_loss_mlp": 0.01091681, "balance_loss_clip": 1.02830935, "balance_loss_mlp": 1.02601528, "epoch": 0.07797985871035623, "flos": 14281221507840.0, "grad_norm": 2.6504782850883415, "language_loss": 0.99314928, "learning_rate": 3.9759008818343305e-06, "loss": 1.0152235, "num_input_tokens_seen": 27581960, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.8984375, "step": 1297, "time_per_iteration": 2.3925867080688477 }, { "auxiliary_loss_clip": 0.0111716, "auxiliary_loss_mlp": 0.01092027, "balance_loss_clip": 1.02958512, "balance_loss_mlp": 1.02448416, "epoch": 0.0780399819630242, "flos": 26609367715200.0, "grad_norm": 2.308454576673817, "language_loss": 0.78110254, "learning_rate": 3.97584056716893e-06, "loss": 0.80319446, "num_input_tokens_seen": 27601415, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.921875, "step": 1298, "time_per_iteration": 2.4545648097991943 }, { "auxiliary_loss_clip": 0.01113427, "auxiliary_loss_mlp": 0.01093542, "balance_loss_clip": 1.0345093, "balance_loss_mlp": 1.02643466, "epoch": 0.07810010521569218, "flos": 21833458657920.0, "grad_norm": 1.6949417476983086, "language_loss": 0.82703048, "learning_rate": 3.9757801775794575e-06, "loss": 0.84910017, "num_input_tokens_seen": 27621490, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 0.8671875, "step": 1299, "time_per_iteration": 2.4164834022521973 }, { "auxiliary_loss_clip": 0.01107761, "auxiliary_loss_mlp": 0.01077304, "balance_loss_clip": 1.02737927, "balance_loss_mlp": 1.02589202, "epoch": 0.07816022846836014, "flos": 25080101942400.0, "grad_norm": 3.0651608150289977, "language_loss": 0.89050609, "learning_rate": 3.975719713068202e-06, "loss": 0.91235673, "num_input_tokens_seen": 27640600, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.81640625, "step": 1300, "time_per_iteration": 2.431771755218506 }, { "auxiliary_loss_clip": 0.01115908, "auxiliary_loss_mlp": 0.01097212, "balance_loss_clip": 1.03429329, "balance_loss_mlp": 1.0273807, "epoch": 0.0782203517210281, "flos": 40915901825280.0, "grad_norm": 2.2082953876317792, "language_loss": 0.74573475, "learning_rate": 3.975659173637458e-06, "loss": 0.76786602, "num_input_tokens_seen": 27663070, "router_z_loss_clip": 0.62890625, "router_z_loss_mlp": 0.8828125, "step": 1301, "time_per_iteration": 2.5685629844665527 }, { "auxiliary_loss_clip": 0.01118665, "auxiliary_loss_mlp": 0.01084393, "balance_loss_clip": 1.02462149, "balance_loss_mlp": 1.02612078, "epoch": 0.07828047497369607, "flos": 41170418703360.0, "grad_norm": 3.978183644179768, "language_loss": 0.73210657, "learning_rate": 3.97559855928952e-06, "loss": 0.75413716, "num_input_tokens_seen": 27686425, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.921875, "step": 1302, "time_per_iteration": 2.5999114513397217 }, { "auxiliary_loss_clip": 0.01113395, "auxiliary_loss_mlp": 0.01079394, "balance_loss_clip": 1.0244621, "balance_loss_mlp": 1.02644074, "epoch": 0.07834059822636405, "flos": 23507160192000.0, "grad_norm": 2.008387485909996, "language_loss": 0.84534049, "learning_rate": 3.9755378700266864e-06, "loss": 0.86726838, "num_input_tokens_seen": 27704900, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.8671875, "step": 1303, "time_per_iteration": 2.4097769260406494 }, { "auxiliary_loss_clip": 0.0111355, "auxiliary_loss_mlp": 0.01082052, "balance_loss_clip": 1.02661943, "balance_loss_mlp": 1.02537799, "epoch": 0.07840072147903202, "flos": 20192854959360.0, "grad_norm": 2.123858019636386, "language_loss": 0.77221608, "learning_rate": 3.9754771058512585e-06, "loss": 0.79417211, "num_input_tokens_seen": 27724890, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.87890625, "step": 1304, "time_per_iteration": 2.410893678665161 }, { "auxiliary_loss_clip": 0.0111136, "auxiliary_loss_mlp": 0.01075512, "balance_loss_clip": 1.02065158, "balance_loss_mlp": 1.02583385, "epoch": 0.07846084473169998, "flos": 21359757064320.0, "grad_norm": 1.6187257570693327, "language_loss": 0.78066891, "learning_rate": 3.975416266765542e-06, "loss": 0.80253768, "num_input_tokens_seen": 27743115, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.8515625, "step": 1305, "time_per_iteration": 2.3891444206237793 }, { "auxiliary_loss_clip": 0.01117743, "auxiliary_loss_mlp": 0.01082784, "balance_loss_clip": 1.0204618, "balance_loss_mlp": 1.02722454, "epoch": 0.07852096798436796, "flos": 25409786711040.0, "grad_norm": 1.9346441081910533, "language_loss": 0.89066184, "learning_rate": 3.975355352771841e-06, "loss": 0.9126671, "num_input_tokens_seen": 27763570, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.90625, "step": 1306, "time_per_iteration": 2.4632389545440674 }, { "auxiliary_loss_clip": 0.0111108, "auxiliary_loss_mlp": 0.01072467, "balance_loss_clip": 1.01848888, "balance_loss_mlp": 1.02584505, "epoch": 0.07858109123703592, "flos": 24570335047680.0, "grad_norm": 2.179967687732444, "language_loss": 0.92156303, "learning_rate": 3.975294363872468e-06, "loss": 0.94339848, "num_input_tokens_seen": 27780030, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.8515625, "step": 1307, "time_per_iteration": 2.4139764308929443 }, { "auxiliary_loss_clip": 0.01108242, "auxiliary_loss_mlp": 0.01078333, "balance_loss_clip": 1.02058756, "balance_loss_mlp": 1.02415991, "epoch": 0.07864121448970389, "flos": 20697978643200.0, "grad_norm": 1.936468025474089, "language_loss": 0.85044777, "learning_rate": 3.975233300069735e-06, "loss": 0.8723135, "num_input_tokens_seen": 27796225, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.83984375, "step": 1308, "time_per_iteration": 2.421154737472534 }, { "auxiliary_loss_clip": 0.01109996, "auxiliary_loss_mlp": 0.01066194, "balance_loss_clip": 1.01703167, "balance_loss_mlp": 1.02503967, "epoch": 0.07870133774237187, "flos": 22965412625280.0, "grad_norm": 1.4516141332667416, "language_loss": 0.7930882, "learning_rate": 3.975172161365958e-06, "loss": 0.81485015, "num_input_tokens_seen": 27815975, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 0.8515625, "step": 1309, "time_per_iteration": 2.4105026721954346 }, { "auxiliary_loss_clip": 0.01112374, "auxiliary_loss_mlp": 0.01074677, "balance_loss_clip": 1.02091324, "balance_loss_mlp": 1.02549911, "epoch": 0.07876146099503983, "flos": 18841855921920.0, "grad_norm": 2.479661519581678, "language_loss": 0.83342427, "learning_rate": 3.975110947763453e-06, "loss": 0.8552947, "num_input_tokens_seen": 27832255, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.8671875, "step": 1310, "time_per_iteration": 2.3910391330718994 }, { "auxiliary_loss_clip": 0.01109369, "auxiliary_loss_mlp": 0.01073696, "balance_loss_clip": 1.0241046, "balance_loss_mlp": 1.02740824, "epoch": 0.0788215842477078, "flos": 23804654820480.0, "grad_norm": 1.7860338378880796, "language_loss": 0.7527582, "learning_rate": 3.9750496592645435e-06, "loss": 0.77458882, "num_input_tokens_seen": 27852180, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.8203125, "step": 1311, "time_per_iteration": 2.4259324073791504 }, { "auxiliary_loss_clip": 0.01107963, "auxiliary_loss_mlp": 0.01075911, "balance_loss_clip": 1.02608192, "balance_loss_mlp": 1.0258348, "epoch": 0.07888170750037576, "flos": 21578837045760.0, "grad_norm": 1.7606789402917775, "language_loss": 0.88043088, "learning_rate": 3.974988295871553e-06, "loss": 0.90226966, "num_input_tokens_seen": 27871435, "router_z_loss_clip": 0.49804688, "router_z_loss_mlp": 0.8203125, "step": 1312, "time_per_iteration": 2.4214870929718018 }, { "auxiliary_loss_clip": 0.01107496, "auxiliary_loss_mlp": 0.01075993, "balance_loss_clip": 1.02740359, "balance_loss_mlp": 1.02531779, "epoch": 0.07894183075304374, "flos": 19863833506560.0, "grad_norm": 1.7732417231147266, "language_loss": 0.84008086, "learning_rate": 3.9749268575868085e-06, "loss": 0.86191571, "num_input_tokens_seen": 27890625, "router_z_loss_clip": 0.48632812, "router_z_loss_mlp": 0.8203125, "step": 1313, "time_per_iteration": 2.388854503631592 }, { "auxiliary_loss_clip": 0.01114652, "auxiliary_loss_mlp": 0.0107273, "balance_loss_clip": 1.01615298, "balance_loss_mlp": 1.02576399, "epoch": 0.07900195400571171, "flos": 16142546021760.0, "grad_norm": 2.4525097879860853, "language_loss": 0.7790637, "learning_rate": 3.97486534441264e-06, "loss": 0.80093753, "num_input_tokens_seen": 27906530, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.890625, "step": 1314, "time_per_iteration": 2.43091082572937 }, { "auxiliary_loss_clip": 0.01106784, "auxiliary_loss_mlp": 0.01072151, "balance_loss_clip": 1.01984191, "balance_loss_mlp": 1.02299643, "epoch": 0.07906207725837967, "flos": 23729347284480.0, "grad_norm": 1.5957687776262106, "language_loss": 0.81942004, "learning_rate": 3.974803756351379e-06, "loss": 0.84120941, "num_input_tokens_seen": 27926725, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.83984375, "step": 1315, "time_per_iteration": 2.473597764968872 }, { "auxiliary_loss_clip": 0.01110859, "auxiliary_loss_mlp": 0.01079685, "balance_loss_clip": 1.02134371, "balance_loss_mlp": 1.02446198, "epoch": 0.07912220051104765, "flos": 24314770828800.0, "grad_norm": 2.3508693143268804, "language_loss": 0.7562601, "learning_rate": 3.974742093405362e-06, "loss": 0.77816558, "num_input_tokens_seen": 27947875, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 0.8671875, "step": 1316, "time_per_iteration": 2.4428412914276123 }, { "auxiliary_loss_clip": 0.01108821, "auxiliary_loss_mlp": 0.01075307, "balance_loss_clip": 1.01985073, "balance_loss_mlp": 1.02337098, "epoch": 0.07918232376371562, "flos": 18879038386560.0, "grad_norm": 3.7468830690884083, "language_loss": 0.6882624, "learning_rate": 3.974680355576927e-06, "loss": 0.71010375, "num_input_tokens_seen": 27965040, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.85546875, "step": 1317, "time_per_iteration": 2.401154041290283 }, { "auxiliary_loss_clip": 0.01115937, "auxiliary_loss_mlp": 0.01082899, "balance_loss_clip": 1.0198375, "balance_loss_mlp": 1.02634323, "epoch": 0.07924244701638358, "flos": 27375187587840.0, "grad_norm": 2.9065938412832373, "language_loss": 0.77951217, "learning_rate": 3.974618542868415e-06, "loss": 0.80150056, "num_input_tokens_seen": 27985330, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.8984375, "step": 1318, "time_per_iteration": 2.4679880142211914 }, { "auxiliary_loss_clip": 0.01108994, "auxiliary_loss_mlp": 0.01073667, "balance_loss_clip": 1.02147758, "balance_loss_mlp": 1.02551961, "epoch": 0.07930257026905156, "flos": 25119134709120.0, "grad_norm": 1.5806039296746688, "language_loss": 0.92131197, "learning_rate": 3.97455665528217e-06, "loss": 0.9431386, "num_input_tokens_seen": 28007615, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.83203125, "step": 1319, "time_per_iteration": 2.4307196140289307 }, { "auxiliary_loss_clip": 0.0110781, "auxiliary_loss_mlp": 0.01071557, "balance_loss_clip": 1.01819885, "balance_loss_mlp": 1.02368414, "epoch": 0.07936269352171953, "flos": 21833423746560.0, "grad_norm": 1.8222302199754363, "language_loss": 0.82644433, "learning_rate": 3.974494692820539e-06, "loss": 0.84823799, "num_input_tokens_seen": 28027765, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 0.83984375, "step": 1320, "time_per_iteration": 3.8100790977478027 }, { "auxiliary_loss_clip": 0.01110221, "auxiliary_loss_mlp": 0.01077873, "balance_loss_clip": 1.02284575, "balance_loss_mlp": 1.0259999, "epoch": 0.07942281677438749, "flos": 16939124668800.0, "grad_norm": 2.203952441167756, "language_loss": 0.71933568, "learning_rate": 3.974432655485872e-06, "loss": 0.74121666, "num_input_tokens_seen": 28044225, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.84375, "step": 1321, "time_per_iteration": 2.352989673614502 }, { "auxiliary_loss_clip": 0.0110665, "auxiliary_loss_mlp": 0.01075883, "balance_loss_clip": 1.01956868, "balance_loss_mlp": 1.02319884, "epoch": 0.07948294002705546, "flos": 18986012392320.0, "grad_norm": 2.003780916195859, "language_loss": 0.87183511, "learning_rate": 3.9743705432805195e-06, "loss": 0.89366043, "num_input_tokens_seen": 28062915, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.8359375, "step": 1322, "time_per_iteration": 3.8031857013702393 }, { "auxiliary_loss_clip": 0.01107297, "auxiliary_loss_mlp": 0.0107431, "balance_loss_clip": 1.02092791, "balance_loss_mlp": 1.02308059, "epoch": 0.07954306327972344, "flos": 21652364102400.0, "grad_norm": 1.9804893736623392, "language_loss": 0.93101752, "learning_rate": 3.974308356206838e-06, "loss": 0.95283353, "num_input_tokens_seen": 28082175, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 0.83984375, "step": 1323, "time_per_iteration": 3.7943084239959717 }, { "auxiliary_loss_clip": 0.0110532, "auxiliary_loss_mlp": 0.01070733, "balance_loss_clip": 1.01937795, "balance_loss_mlp": 1.02309442, "epoch": 0.0796031865323914, "flos": 23219196364800.0, "grad_norm": 1.701213951488955, "language_loss": 0.84239727, "learning_rate": 3.974246094267187e-06, "loss": 0.8641578, "num_input_tokens_seen": 28102645, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.8203125, "step": 1324, "time_per_iteration": 4.064820289611816 }, { "auxiliary_loss_clip": 0.01109688, "auxiliary_loss_mlp": 0.01073908, "balance_loss_clip": 1.0191915, "balance_loss_mlp": 1.02427816, "epoch": 0.07966330978505937, "flos": 23293421648640.0, "grad_norm": 2.06694175959185, "language_loss": 0.82039601, "learning_rate": 3.974183757463925e-06, "loss": 0.84223199, "num_input_tokens_seen": 28122805, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.8515625, "step": 1325, "time_per_iteration": 2.572401762008667 }, { "auxiliary_loss_clip": 0.0110625, "auxiliary_loss_mlp": 0.0108306, "balance_loss_clip": 1.02476692, "balance_loss_mlp": 1.02346456, "epoch": 0.07972343303772735, "flos": 18362952535680.0, "grad_norm": 2.8761018157466154, "language_loss": 0.90512717, "learning_rate": 3.974121345799418e-06, "loss": 0.92702031, "num_input_tokens_seen": 28140530, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 0.828125, "step": 1326, "time_per_iteration": 2.4042999744415283 }, { "auxiliary_loss_clip": 0.01104156, "auxiliary_loss_mlp": 0.01065993, "balance_loss_clip": 1.01489997, "balance_loss_mlp": 1.02272522, "epoch": 0.07978355629039531, "flos": 21761432789760.0, "grad_norm": 1.85635247404814, "language_loss": 0.85363793, "learning_rate": 3.974058859276032e-06, "loss": 0.87533939, "num_input_tokens_seen": 28159640, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.8125, "step": 1327, "time_per_iteration": 2.3813905715942383 }, { "auxiliary_loss_clip": 0.01109646, "auxiliary_loss_mlp": 0.01082696, "balance_loss_clip": 1.02225661, "balance_loss_mlp": 1.02393973, "epoch": 0.07984367954306328, "flos": 18550331136000.0, "grad_norm": 2.3710652503006786, "language_loss": 0.82645732, "learning_rate": 3.9739962978961354e-06, "loss": 0.84838068, "num_input_tokens_seen": 28177050, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.859375, "step": 1328, "time_per_iteration": 2.3928399085998535 }, { "auxiliary_loss_clip": 0.01108894, "auxiliary_loss_mlp": 0.01075048, "balance_loss_clip": 1.01503801, "balance_loss_mlp": 1.02365673, "epoch": 0.07990380279573125, "flos": 16903268835840.0, "grad_norm": 3.316858033803759, "language_loss": 0.77969515, "learning_rate": 3.973933661662101e-06, "loss": 0.80153453, "num_input_tokens_seen": 28193245, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.8515625, "step": 1329, "time_per_iteration": 2.377809762954712 }, { "auxiliary_loss_clip": 0.01108532, "auxiliary_loss_mlp": 0.01069511, "balance_loss_clip": 1.01584268, "balance_loss_mlp": 1.02538419, "epoch": 0.07996392604839922, "flos": 24097192035840.0, "grad_norm": 1.6127542204939962, "language_loss": 0.83123344, "learning_rate": 3.973870950576305e-06, "loss": 0.85301387, "num_input_tokens_seen": 28213570, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 0.83203125, "step": 1330, "time_per_iteration": 2.5272607803344727 }, { "auxiliary_loss_clip": 0.01109376, "auxiliary_loss_mlp": 0.01078341, "balance_loss_clip": 1.02381492, "balance_loss_mlp": 1.02487612, "epoch": 0.08002404930106718, "flos": 14277974751360.0, "grad_norm": 1.8084030049542563, "language_loss": 0.91104257, "learning_rate": 3.9738081646411255e-06, "loss": 0.9329198, "num_input_tokens_seen": 28229980, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.84375, "step": 1331, "time_per_iteration": 2.372652530670166 }, { "auxiliary_loss_clip": 0.01113275, "auxiliary_loss_mlp": 0.01074205, "balance_loss_clip": 1.01974988, "balance_loss_mlp": 1.02541745, "epoch": 0.08008417255373516, "flos": 40404633742080.0, "grad_norm": 1.80529961066992, "language_loss": 0.75606763, "learning_rate": 3.973745303858942e-06, "loss": 0.77794242, "num_input_tokens_seen": 28253840, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.87890625, "step": 1332, "time_per_iteration": 2.570234775543213 }, { "auxiliary_loss_clip": 0.01107836, "auxiliary_loss_mlp": 0.01066879, "balance_loss_clip": 1.01726389, "balance_loss_mlp": 1.02548635, "epoch": 0.08014429580640313, "flos": 18477921242880.0, "grad_norm": 1.7278731775416323, "language_loss": 0.83549148, "learning_rate": 3.973682368232138e-06, "loss": 0.85723865, "num_input_tokens_seen": 28271675, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.82421875, "step": 1333, "time_per_iteration": 2.3747293949127197 }, { "auxiliary_loss_clip": 0.01108425, "auxiliary_loss_mlp": 0.01070924, "balance_loss_clip": 1.01768541, "balance_loss_mlp": 1.02411902, "epoch": 0.0802044190590711, "flos": 22052398993920.0, "grad_norm": 2.498379040975243, "language_loss": 0.79254526, "learning_rate": 3.9736193577631015e-06, "loss": 0.81433874, "num_input_tokens_seen": 28291850, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.84375, "step": 1334, "time_per_iteration": 2.4315414428710938 }, { "auxiliary_loss_clip": 0.0110698, "auxiliary_loss_mlp": 0.01074646, "balance_loss_clip": 1.01988173, "balance_loss_mlp": 1.02452981, "epoch": 0.08026454231173906, "flos": 24570963452160.0, "grad_norm": 1.937995151572117, "language_loss": 0.82187104, "learning_rate": 3.973556272454221e-06, "loss": 0.8436873, "num_input_tokens_seen": 28310780, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.82421875, "step": 1335, "time_per_iteration": 2.4134137630462646 }, { "auxiliary_loss_clip": 0.01036358, "auxiliary_loss_mlp": 0.01014533, "balance_loss_clip": 1.00466263, "balance_loss_mlp": 1.01556134, "epoch": 0.08032466556440704, "flos": 52579195545600.0, "grad_norm": 0.7459005631939178, "language_loss": 0.56162834, "learning_rate": 3.973493112307889e-06, "loss": 0.58213723, "num_input_tokens_seen": 28369985, "router_z_loss_clip": 0.09863281, "router_z_loss_mlp": 0.20800781, "step": 1336, "time_per_iteration": 3.086026191711426 }, { "auxiliary_loss_clip": 0.01110818, "auxiliary_loss_mlp": 0.01073064, "balance_loss_clip": 1.0180608, "balance_loss_mlp": 1.02682829, "epoch": 0.080384788817075, "flos": 23841453260160.0, "grad_norm": 1.8846504309521057, "language_loss": 0.69978154, "learning_rate": 3.9734298773265005e-06, "loss": 0.72162032, "num_input_tokens_seen": 28388670, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.83984375, "step": 1337, "time_per_iteration": 2.412435293197632 }, { "auxiliary_loss_clip": 0.01108605, "auxiliary_loss_mlp": 0.01078672, "balance_loss_clip": 1.02750707, "balance_loss_mlp": 1.02578926, "epoch": 0.08044491206974297, "flos": 25299565948800.0, "grad_norm": 1.802117850198766, "language_loss": 0.89150703, "learning_rate": 3.973366567512453e-06, "loss": 0.91337979, "num_input_tokens_seen": 28411845, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.828125, "step": 1338, "time_per_iteration": 2.507984161376953 }, { "auxiliary_loss_clip": 0.01108053, "auxiliary_loss_mlp": 0.01086142, "balance_loss_clip": 1.03149617, "balance_loss_mlp": 1.0237062, "epoch": 0.08050503532241095, "flos": 22375625160960.0, "grad_norm": 2.2070546301110285, "language_loss": 0.89671904, "learning_rate": 3.973303182868147e-06, "loss": 0.91866106, "num_input_tokens_seen": 28427875, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.84375, "step": 1339, "time_per_iteration": 2.3635315895080566 }, { "auxiliary_loss_clip": 0.01104289, "auxiliary_loss_mlp": 0.01065787, "balance_loss_clip": 1.01827073, "balance_loss_mlp": 1.02461195, "epoch": 0.08056515857507891, "flos": 18368433619200.0, "grad_norm": 2.026810562942507, "language_loss": 0.91961873, "learning_rate": 3.973239723395988e-06, "loss": 0.94131953, "num_input_tokens_seen": 28446615, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.796875, "step": 1340, "time_per_iteration": 2.395108461380005 }, { "auxiliary_loss_clip": 0.01028235, "auxiliary_loss_mlp": 0.01020038, "balance_loss_clip": 1.00959539, "balance_loss_mlp": 1.00784612, "epoch": 0.08062528182774688, "flos": 51345329719680.0, "grad_norm": 0.9265618169854715, "language_loss": 0.64822388, "learning_rate": 3.97317618909838e-06, "loss": 0.66870654, "num_input_tokens_seen": 28505290, "router_z_loss_clip": 0.10449219, "router_z_loss_mlp": 0.203125, "step": 1341, "time_per_iteration": 2.9663238525390625 }, { "auxiliary_loss_clip": 0.01115499, "auxiliary_loss_mlp": 0.01083281, "balance_loss_clip": 1.02615595, "balance_loss_mlp": 1.02748251, "epoch": 0.08068540508041486, "flos": 17598843319680.0, "grad_norm": 1.8554481480279617, "language_loss": 0.91549879, "learning_rate": 3.973112579977733e-06, "loss": 0.93748653, "num_input_tokens_seen": 28522735, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.8828125, "step": 1342, "time_per_iteration": 2.396362781524658 }, { "auxiliary_loss_clip": 0.01116091, "auxiliary_loss_mlp": 0.01074496, "balance_loss_clip": 1.01815748, "balance_loss_mlp": 1.02835691, "epoch": 0.08074552833308282, "flos": 10560422782080.0, "grad_norm": 2.4758569954282668, "language_loss": 0.78880423, "learning_rate": 3.973048896036459e-06, "loss": 0.81071007, "num_input_tokens_seen": 28539460, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.87890625, "step": 1343, "time_per_iteration": 2.3771536350250244 }, { "auxiliary_loss_clip": 0.01027631, "auxiliary_loss_mlp": 0.01013065, "balance_loss_clip": 1.00276518, "balance_loss_mlp": 1.00715125, "epoch": 0.08080565158575079, "flos": 60837026739840.0, "grad_norm": 0.8118396813334285, "language_loss": 0.57663286, "learning_rate": 3.972985137276974e-06, "loss": 0.59703982, "num_input_tokens_seen": 28599855, "router_z_loss_clip": 0.10302734, "router_z_loss_mlp": 0.20507812, "step": 1344, "time_per_iteration": 2.960801124572754 }, { "auxiliary_loss_clip": 0.01118731, "auxiliary_loss_mlp": 0.01085935, "balance_loss_clip": 1.02973962, "balance_loss_mlp": 1.03091156, "epoch": 0.08086577483841875, "flos": 18331390800000.0, "grad_norm": 2.253586511031173, "language_loss": 0.89439678, "learning_rate": 3.972921303701695e-06, "loss": 0.91644335, "num_input_tokens_seen": 28617585, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.87890625, "step": 1345, "time_per_iteration": 2.3916232585906982 }, { "auxiliary_loss_clip": 0.0111543, "auxiliary_loss_mlp": 0.01078627, "balance_loss_clip": 1.02681899, "balance_loss_mlp": 1.03055668, "epoch": 0.08092589809108673, "flos": 21542527365120.0, "grad_norm": 1.6045428472178436, "language_loss": 0.89417076, "learning_rate": 3.972857395313042e-06, "loss": 0.91611147, "num_input_tokens_seen": 28636355, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.84765625, "step": 1346, "time_per_iteration": 2.424100160598755 }, { "auxiliary_loss_clip": 0.01114454, "auxiliary_loss_mlp": 0.01079451, "balance_loss_clip": 1.02897811, "balance_loss_mlp": 1.0300796, "epoch": 0.0809860213437547, "flos": 22126903568640.0, "grad_norm": 2.0283170538473114, "language_loss": 0.94470823, "learning_rate": 3.972793412113439e-06, "loss": 0.96664733, "num_input_tokens_seen": 28656260, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.84375, "step": 1347, "time_per_iteration": 2.4152109622955322 }, { "auxiliary_loss_clip": 0.01117346, "auxiliary_loss_mlp": 0.01086466, "balance_loss_clip": 1.0314157, "balance_loss_mlp": 1.03346086, "epoch": 0.08104614459642266, "flos": 21724424881920.0, "grad_norm": 1.7080277963871962, "language_loss": 0.91198003, "learning_rate": 3.972729354105312e-06, "loss": 0.93401814, "num_input_tokens_seen": 28675865, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.83984375, "step": 1348, "time_per_iteration": 2.4430036544799805 }, { "auxiliary_loss_clip": 0.01113425, "auxiliary_loss_mlp": 0.01083193, "balance_loss_clip": 1.03145576, "balance_loss_mlp": 1.03117883, "epoch": 0.08110626784909064, "flos": 23950731415680.0, "grad_norm": 1.8472960606832112, "language_loss": 0.78028381, "learning_rate": 3.97266522129109e-06, "loss": 0.80224997, "num_input_tokens_seen": 28696255, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.8203125, "step": 1349, "time_per_iteration": 2.4375977516174316 }, { "auxiliary_loss_clip": 0.01117057, "auxiliary_loss_mlp": 0.0109268, "balance_loss_clip": 1.0311923, "balance_loss_mlp": 1.03009534, "epoch": 0.0811663911017586, "flos": 19024696045440.0, "grad_norm": 1.9444947526387593, "language_loss": 0.89922661, "learning_rate": 3.972601013673205e-06, "loss": 0.92132401, "num_input_tokens_seen": 28713905, "router_z_loss_clip": 0.61328125, "router_z_loss_mlp": 0.8671875, "step": 1350, "time_per_iteration": 2.4147348403930664 }, { "auxiliary_loss_clip": 0.01112617, "auxiliary_loss_mlp": 0.01095476, "balance_loss_clip": 1.0372777, "balance_loss_mlp": 1.02918196, "epoch": 0.08122651435442657, "flos": 15340381557120.0, "grad_norm": 1.94539313352845, "language_loss": 0.8486737, "learning_rate": 3.972536731254092e-06, "loss": 0.8707546, "num_input_tokens_seen": 28732075, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 0.8359375, "step": 1351, "time_per_iteration": 2.4005260467529297 }, { "auxiliary_loss_clip": 0.01113119, "auxiliary_loss_mlp": 0.01089045, "balance_loss_clip": 1.02853477, "balance_loss_mlp": 1.02726603, "epoch": 0.08128663760709455, "flos": 23220453173760.0, "grad_norm": 1.9763791643624964, "language_loss": 0.77385086, "learning_rate": 3.972472374036189e-06, "loss": 0.79587245, "num_input_tokens_seen": 28751150, "router_z_loss_clip": 0.60546875, "router_z_loss_mlp": 0.859375, "step": 1352, "time_per_iteration": 2.426219940185547 }, { "auxiliary_loss_clip": 0.0111365, "auxiliary_loss_mlp": 0.01078439, "balance_loss_clip": 1.01988328, "balance_loss_mlp": 1.02704501, "epoch": 0.08134676085976252, "flos": 22964539841280.0, "grad_norm": 1.7220478412127043, "language_loss": 0.85082364, "learning_rate": 3.972407942021935e-06, "loss": 0.87274456, "num_input_tokens_seen": 28773360, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.8671875, "step": 1353, "time_per_iteration": 2.443525791168213 }, { "auxiliary_loss_clip": 0.01024492, "auxiliary_loss_mlp": 0.01023138, "balance_loss_clip": 1.0150317, "balance_loss_mlp": 1.00437808, "epoch": 0.08140688411243048, "flos": 64319369679360.0, "grad_norm": 0.8590377884044433, "language_loss": 0.59925461, "learning_rate": 3.972343435213775e-06, "loss": 0.61973089, "num_input_tokens_seen": 28833390, "router_z_loss_clip": 0.08105469, "router_z_loss_mlp": 0.20117188, "step": 1354, "time_per_iteration": 3.0278260707855225 }, { "auxiliary_loss_clip": 0.01111919, "auxiliary_loss_mlp": 0.01077298, "balance_loss_clip": 1.01936281, "balance_loss_mlp": 1.02710986, "epoch": 0.08146700736509845, "flos": 22490768424960.0, "grad_norm": 1.7397507454779344, "language_loss": 0.84593832, "learning_rate": 3.972278853614154e-06, "loss": 0.86783051, "num_input_tokens_seen": 28852430, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.84765625, "step": 1355, "time_per_iteration": 2.4200022220611572 }, { "auxiliary_loss_clip": 0.01110562, "auxiliary_loss_mlp": 0.01076307, "balance_loss_clip": 1.01922917, "balance_loss_mlp": 1.02485931, "epoch": 0.08152713061776642, "flos": 20446813255680.0, "grad_norm": 1.7823100838189185, "language_loss": 0.74108452, "learning_rate": 3.972214197225521e-06, "loss": 0.76295322, "num_input_tokens_seen": 28870685, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.85546875, "step": 1356, "time_per_iteration": 2.402315378189087 }, { "auxiliary_loss_clip": 0.01115896, "auxiliary_loss_mlp": 0.01083975, "balance_loss_clip": 1.02484739, "balance_loss_mlp": 1.02906394, "epoch": 0.08158725387043439, "flos": 23549090601600.0, "grad_norm": 1.9158472627661638, "language_loss": 0.72575223, "learning_rate": 3.972149466050329e-06, "loss": 0.74775088, "num_input_tokens_seen": 28889860, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.8671875, "step": 1357, "time_per_iteration": 2.4154152870178223 }, { "auxiliary_loss_clip": 0.01117843, "auxiliary_loss_mlp": 0.01088367, "balance_loss_clip": 1.02799928, "balance_loss_mlp": 1.03011346, "epoch": 0.08164737712310235, "flos": 22016263870080.0, "grad_norm": 2.5200883897254838, "language_loss": 0.87415963, "learning_rate": 3.97208466009103e-06, "loss": 0.89622176, "num_input_tokens_seen": 28905865, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.875, "step": 1358, "time_per_iteration": 2.4459500312805176 }, { "auxiliary_loss_clip": 0.01118647, "auxiliary_loss_mlp": 0.01076745, "balance_loss_clip": 1.01692605, "balance_loss_mlp": 1.03133488, "epoch": 0.08170750037577033, "flos": 23366704325760.0, "grad_norm": 1.9807527731988828, "language_loss": 1.0362227, "learning_rate": 3.972019779350084e-06, "loss": 1.05817676, "num_input_tokens_seen": 28925250, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.87109375, "step": 1359, "time_per_iteration": 2.438546657562256 }, { "auxiliary_loss_clip": 0.01117796, "auxiliary_loss_mlp": 0.01086262, "balance_loss_clip": 1.02620423, "balance_loss_mlp": 1.03030992, "epoch": 0.0817676236284383, "flos": 28396850970240.0, "grad_norm": 1.8875304057098072, "language_loss": 0.85893643, "learning_rate": 3.971954823829951e-06, "loss": 0.88097703, "num_input_tokens_seen": 28943445, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.875, "step": 1360, "time_per_iteration": 3.9236767292022705 }, { "auxiliary_loss_clip": 0.01117644, "auxiliary_loss_mlp": 0.01093077, "balance_loss_clip": 1.03361583, "balance_loss_mlp": 1.02963698, "epoch": 0.08182774688110626, "flos": 19207885282560.0, "grad_norm": 2.1343271935604355, "language_loss": 0.76211262, "learning_rate": 3.971889793533093e-06, "loss": 0.78421992, "num_input_tokens_seen": 28962695, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.8828125, "step": 1361, "time_per_iteration": 3.830503225326538 }, { "auxiliary_loss_clip": 0.01115325, "auxiliary_loss_mlp": 0.01088676, "balance_loss_clip": 1.02995372, "balance_loss_mlp": 1.0295558, "epoch": 0.08188787013377424, "flos": 22782991438080.0, "grad_norm": 2.9674175338757527, "language_loss": 0.79178232, "learning_rate": 3.971824688461976e-06, "loss": 0.81382239, "num_input_tokens_seen": 28982120, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.85546875, "step": 1362, "time_per_iteration": 3.9383528232574463 }, { "auxiliary_loss_clip": 0.01114144, "auxiliary_loss_mlp": 0.01088155, "balance_loss_clip": 1.03300905, "balance_loss_mlp": 1.0316236, "epoch": 0.08194799338644221, "flos": 16467273377280.0, "grad_norm": 2.044144902953067, "language_loss": 0.75931215, "learning_rate": 3.971759508619069e-06, "loss": 0.78133518, "num_input_tokens_seen": 28998100, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.82421875, "step": 1363, "time_per_iteration": 2.3868229389190674 }, { "auxiliary_loss_clip": 0.01116961, "auxiliary_loss_mlp": 0.01097824, "balance_loss_clip": 1.04010272, "balance_loss_mlp": 1.03175497, "epoch": 0.08200811663911017, "flos": 23912536521600.0, "grad_norm": 2.1876829620482177, "language_loss": 0.80381334, "learning_rate": 3.971694254006844e-06, "loss": 0.82596123, "num_input_tokens_seen": 29017095, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.8515625, "step": 1364, "time_per_iteration": 3.8066349029541016 }, { "auxiliary_loss_clip": 0.01113448, "auxiliary_loss_mlp": 0.01083449, "balance_loss_clip": 1.02670503, "balance_loss_mlp": 1.02880073, "epoch": 0.08206823989177814, "flos": 17895534986880.0, "grad_norm": 1.76667309511773, "language_loss": 0.83329529, "learning_rate": 3.971628924627776e-06, "loss": 0.85526419, "num_input_tokens_seen": 29037240, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.84765625, "step": 1365, "time_per_iteration": 2.4012794494628906 }, { "auxiliary_loss_clip": 0.01110978, "auxiliary_loss_mlp": 0.01083732, "balance_loss_clip": 1.02949214, "balance_loss_mlp": 1.02886939, "epoch": 0.08212836314444612, "flos": 22087172574720.0, "grad_norm": 1.8620937434407556, "language_loss": 0.83406579, "learning_rate": 3.97156352048434e-06, "loss": 0.85601294, "num_input_tokens_seen": 29056250, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.8203125, "step": 1366, "time_per_iteration": 2.430553436279297 }, { "auxiliary_loss_clip": 0.01114399, "auxiliary_loss_mlp": 0.01082446, "balance_loss_clip": 1.02660871, "balance_loss_mlp": 1.0268085, "epoch": 0.08218848639711408, "flos": 17596678815360.0, "grad_norm": 1.858159419660863, "language_loss": 0.84323168, "learning_rate": 3.97149804157902e-06, "loss": 0.86520016, "num_input_tokens_seen": 29073380, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.875, "step": 1367, "time_per_iteration": 2.366088390350342 }, { "auxiliary_loss_clip": 0.01115816, "auxiliary_loss_mlp": 0.01082889, "balance_loss_clip": 1.02698016, "balance_loss_mlp": 1.0285399, "epoch": 0.08224860964978205, "flos": 17856886245120.0, "grad_norm": 2.0715979750697713, "language_loss": 0.86516553, "learning_rate": 3.9714324879142946e-06, "loss": 0.88715255, "num_input_tokens_seen": 29091330, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.87109375, "step": 1368, "time_per_iteration": 2.4284470081329346 }, { "auxiliary_loss_clip": 0.01106765, "auxiliary_loss_mlp": 0.01074191, "balance_loss_clip": 1.02183437, "balance_loss_mlp": 1.02654362, "epoch": 0.08230873290245003, "flos": 25226388005760.0, "grad_norm": 1.7006888295315814, "language_loss": 0.82558376, "learning_rate": 3.971366859492653e-06, "loss": 0.84739339, "num_input_tokens_seen": 29110375, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.80078125, "step": 1369, "time_per_iteration": 2.443692684173584 }, { "auxiliary_loss_clip": 0.01112208, "auxiliary_loss_mlp": 0.01078383, "balance_loss_clip": 1.0241667, "balance_loss_mlp": 1.02871931, "epoch": 0.08236885615511799, "flos": 31758567696000.0, "grad_norm": 2.4183339853649835, "language_loss": 0.77598745, "learning_rate": 3.971301156316582e-06, "loss": 0.79789335, "num_input_tokens_seen": 29129395, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.8359375, "step": 1370, "time_per_iteration": 2.5017385482788086 }, { "auxiliary_loss_clip": 0.01115027, "auxiliary_loss_mlp": 0.01081006, "balance_loss_clip": 1.02507341, "balance_loss_mlp": 1.0285244, "epoch": 0.08242897940778596, "flos": 23184702074880.0, "grad_norm": 1.4787990402702074, "language_loss": 0.76676124, "learning_rate": 3.971235378388573e-06, "loss": 0.78872156, "num_input_tokens_seen": 29148650, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.86328125, "step": 1371, "time_per_iteration": 2.411972761154175 }, { "auxiliary_loss_clip": 0.01114813, "auxiliary_loss_mlp": 0.01075451, "balance_loss_clip": 1.01644278, "balance_loss_mlp": 1.02628553, "epoch": 0.08248910266045394, "flos": 34490172470400.0, "grad_norm": 2.0184619927737, "language_loss": 0.73864537, "learning_rate": 3.971169525711122e-06, "loss": 0.760548, "num_input_tokens_seen": 29170785, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 0.88671875, "step": 1372, "time_per_iteration": 2.5271806716918945 }, { "auxiliary_loss_clip": 0.01114102, "auxiliary_loss_mlp": 0.01090021, "balance_loss_clip": 1.02493286, "balance_loss_mlp": 1.02688265, "epoch": 0.0825492259131219, "flos": 13435590533760.0, "grad_norm": 2.942985855534107, "language_loss": 0.91760528, "learning_rate": 3.9711035982867246e-06, "loss": 0.93964648, "num_input_tokens_seen": 29185210, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 0.875, "step": 1373, "time_per_iteration": 2.4015536308288574 }, { "auxiliary_loss_clip": 0.01114623, "auxiliary_loss_mlp": 0.01078616, "balance_loss_clip": 1.02006078, "balance_loss_mlp": 1.02902246, "epoch": 0.08260934916578987, "flos": 25811252968320.0, "grad_norm": 1.8169075735936528, "language_loss": 0.85061991, "learning_rate": 3.971037596117882e-06, "loss": 0.87255228, "num_input_tokens_seen": 29205210, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.85546875, "step": 1374, "time_per_iteration": 2.4519412517547607 }, { "auxiliary_loss_clip": 0.01032341, "auxiliary_loss_mlp": 0.01029272, "balance_loss_clip": 1.02135658, "balance_loss_mlp": 1.0102607, "epoch": 0.08266947241845783, "flos": 63456909563520.0, "grad_norm": 0.8509303171829613, "language_loss": 0.60715562, "learning_rate": 3.970971519207095e-06, "loss": 0.62777174, "num_input_tokens_seen": 29265350, "router_z_loss_clip": 0.07910156, "router_z_loss_mlp": 0.22070312, "step": 1375, "time_per_iteration": 3.0209405422210693 }, { "auxiliary_loss_clip": 0.01031259, "auxiliary_loss_mlp": 0.01018319, "balance_loss_clip": 1.01016533, "balance_loss_mlp": 1.00869417, "epoch": 0.08272959567112581, "flos": 69990346062720.0, "grad_norm": 0.9198688518618698, "language_loss": 0.62376827, "learning_rate": 3.970905367556871e-06, "loss": 0.64426404, "num_input_tokens_seen": 29321475, "router_z_loss_clip": 0.08154297, "router_z_loss_mlp": 0.2265625, "step": 1376, "time_per_iteration": 3.005570411682129 }, { "auxiliary_loss_clip": 0.01115597, "auxiliary_loss_mlp": 0.01086386, "balance_loss_clip": 1.02148843, "balance_loss_mlp": 1.02857971, "epoch": 0.08278971892379378, "flos": 20412144408960.0, "grad_norm": 1.5735052034043406, "language_loss": 0.84243321, "learning_rate": 3.970839141169718e-06, "loss": 0.86445308, "num_input_tokens_seen": 29341405, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 0.8671875, "step": 1377, "time_per_iteration": 2.4132397174835205 }, { "auxiliary_loss_clip": 0.01114045, "auxiliary_loss_mlp": 0.01081381, "balance_loss_clip": 1.0204885, "balance_loss_mlp": 1.02717841, "epoch": 0.08284984217646174, "flos": 26249028906240.0, "grad_norm": 1.8402358781554387, "language_loss": 0.86907488, "learning_rate": 3.970772840048147e-06, "loss": 0.89102912, "num_input_tokens_seen": 29361955, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.8671875, "step": 1378, "time_per_iteration": 2.470933198928833 }, { "auxiliary_loss_clip": 0.01114856, "auxiliary_loss_mlp": 0.01092253, "balance_loss_clip": 1.03014469, "balance_loss_mlp": 1.02781117, "epoch": 0.08290996542912972, "flos": 27193569361920.0, "grad_norm": 1.8556355902745458, "language_loss": 0.89778137, "learning_rate": 3.970706464194672e-06, "loss": 0.9198525, "num_input_tokens_seen": 29382395, "router_z_loss_clip": 0.62109375, "router_z_loss_mlp": 0.87109375, "step": 1379, "time_per_iteration": 2.5366737842559814 }, { "auxiliary_loss_clip": 0.01112084, "auxiliary_loss_mlp": 0.01081845, "balance_loss_clip": 1.02567363, "balance_loss_mlp": 1.02736938, "epoch": 0.08297008868179769, "flos": 38616661728000.0, "grad_norm": 1.9638152864727898, "language_loss": 0.80275232, "learning_rate": 3.970640013611812e-06, "loss": 0.82469153, "num_input_tokens_seen": 29404460, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.84765625, "step": 1380, "time_per_iteration": 2.5874640941619873 }, { "auxiliary_loss_clip": 0.01112256, "auxiliary_loss_mlp": 0.01090402, "balance_loss_clip": 1.03277659, "balance_loss_mlp": 1.02845407, "epoch": 0.08303021193446565, "flos": 19973705155200.0, "grad_norm": 2.10247974081279, "language_loss": 0.88980794, "learning_rate": 3.970573488302083e-06, "loss": 0.91183454, "num_input_tokens_seen": 29422675, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.8359375, "step": 1381, "time_per_iteration": 2.390225410461426 }, { "auxiliary_loss_clip": 0.01123764, "auxiliary_loss_mlp": 0.01095449, "balance_loss_clip": 1.02807164, "balance_loss_mlp": 1.03204083, "epoch": 0.08309033518713363, "flos": 13661792432640.0, "grad_norm": 2.6700346984088696, "language_loss": 0.90960169, "learning_rate": 3.970506888268011e-06, "loss": 0.93179387, "num_input_tokens_seen": 29439840, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 0.91796875, "step": 1382, "time_per_iteration": 2.3958740234375 }, { "auxiliary_loss_clip": 0.01119549, "auxiliary_loss_mlp": 0.01102995, "balance_loss_clip": 1.03766894, "balance_loss_mlp": 1.03030193, "epoch": 0.0831504584398016, "flos": 17967560855040.0, "grad_norm": 2.0165986644569442, "language_loss": 0.7833159, "learning_rate": 3.970440213512121e-06, "loss": 0.80554134, "num_input_tokens_seen": 29457360, "router_z_loss_clip": 0.65234375, "router_z_loss_mlp": 0.890625, "step": 1383, "time_per_iteration": 2.375577688217163 }, { "auxiliary_loss_clip": 0.0112312, "auxiliary_loss_mlp": 0.01093332, "balance_loss_clip": 1.02929306, "balance_loss_mlp": 1.03103316, "epoch": 0.08321058169246956, "flos": 22600290960000.0, "grad_norm": 1.8004299875189334, "language_loss": 0.85283303, "learning_rate": 3.97037346403694e-06, "loss": 0.8749975, "num_input_tokens_seen": 29477040, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 0.921875, "step": 1384, "time_per_iteration": 2.4694762229919434 }, { "auxiliary_loss_clip": 0.01125951, "auxiliary_loss_mlp": 0.01092494, "balance_loss_clip": 1.02773952, "balance_loss_mlp": 1.03245091, "epoch": 0.08327070494513754, "flos": 22849501311360.0, "grad_norm": 2.403872375738541, "language_loss": 0.88780528, "learning_rate": 3.970306639845e-06, "loss": 0.90998971, "num_input_tokens_seen": 29492010, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 0.9375, "step": 1385, "time_per_iteration": 2.400082588195801 }, { "auxiliary_loss_clip": 0.01121121, "auxiliary_loss_mlp": 0.01093875, "balance_loss_clip": 1.02392304, "balance_loss_mlp": 1.03065276, "epoch": 0.0833308281978055, "flos": 22781909185920.0, "grad_norm": 2.0184492473530082, "language_loss": 0.71468121, "learning_rate": 3.970239740938835e-06, "loss": 0.73683113, "num_input_tokens_seen": 29511850, "router_z_loss_clip": 0.69921875, "router_z_loss_mlp": 0.90625, "step": 1386, "time_per_iteration": 2.4170522689819336 }, { "auxiliary_loss_clip": 0.01114436, "auxiliary_loss_mlp": 0.01079458, "balance_loss_clip": 1.01932871, "balance_loss_mlp": 1.0261935, "epoch": 0.08339095145047347, "flos": 20811585807360.0, "grad_norm": 1.611814450062574, "language_loss": 0.83610451, "learning_rate": 3.97017276732098e-06, "loss": 0.85804343, "num_input_tokens_seen": 29531415, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.8828125, "step": 1387, "time_per_iteration": 2.397003650665283 }, { "auxiliary_loss_clip": 0.01115976, "auxiliary_loss_mlp": 0.01085902, "balance_loss_clip": 1.02138579, "balance_loss_mlp": 1.02646101, "epoch": 0.08345107470314143, "flos": 18514335657600.0, "grad_norm": 1.7992570829050043, "language_loss": 0.79475051, "learning_rate": 3.970105718993978e-06, "loss": 0.81676924, "num_input_tokens_seen": 29549525, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 0.89453125, "step": 1388, "time_per_iteration": 2.399960994720459 }, { "auxiliary_loss_clip": 0.01112153, "auxiliary_loss_mlp": 0.01081116, "balance_loss_clip": 1.021631, "balance_loss_mlp": 1.02725863, "epoch": 0.08351119795580941, "flos": 18806558670720.0, "grad_norm": 2.041565377541685, "language_loss": 0.81683552, "learning_rate": 3.970038595960369e-06, "loss": 0.83876818, "num_input_tokens_seen": 29568705, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.84765625, "step": 1389, "time_per_iteration": 2.3746466636657715 }, { "auxiliary_loss_clip": 0.01118415, "auxiliary_loss_mlp": 0.01075261, "balance_loss_clip": 1.01460695, "balance_loss_mlp": 1.0302484, "epoch": 0.08357132120847738, "flos": 18440843512320.0, "grad_norm": 2.2653982118689875, "language_loss": 0.90585464, "learning_rate": 3.969971398222699e-06, "loss": 0.92779136, "num_input_tokens_seen": 29585855, "router_z_loss_clip": 0.60546875, "router_z_loss_mlp": 0.8828125, "step": 1390, "time_per_iteration": 2.369727373123169 }, { "auxiliary_loss_clip": 0.01112828, "auxiliary_loss_mlp": 0.01083222, "balance_loss_clip": 1.02411819, "balance_loss_mlp": 1.02617037, "epoch": 0.08363144446114534, "flos": 25921124616960.0, "grad_norm": 1.6385066683570457, "language_loss": 0.88485849, "learning_rate": 3.969904125783517e-06, "loss": 0.90681899, "num_input_tokens_seen": 29607280, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 0.8671875, "step": 1391, "time_per_iteration": 2.4261651039123535 }, { "auxiliary_loss_clip": 0.01120452, "auxiliary_loss_mlp": 0.01089992, "balance_loss_clip": 1.0265727, "balance_loss_mlp": 1.03008628, "epoch": 0.08369156771381332, "flos": 18040319861760.0, "grad_norm": 2.1010120146315083, "language_loss": 0.91791081, "learning_rate": 3.969836778645371e-06, "loss": 0.9400152, "num_input_tokens_seen": 29624130, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.90625, "step": 1392, "time_per_iteration": 2.418532133102417 }, { "auxiliary_loss_clip": 0.01112852, "auxiliary_loss_mlp": 0.01081653, "balance_loss_clip": 1.02698362, "balance_loss_mlp": 1.02702689, "epoch": 0.08375169096648129, "flos": 22673992573440.0, "grad_norm": 2.412700474191262, "language_loss": 0.82848752, "learning_rate": 3.969769356810819e-06, "loss": 0.85043252, "num_input_tokens_seen": 29643210, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.859375, "step": 1393, "time_per_iteration": 2.411720037460327 }, { "auxiliary_loss_clip": 0.01109279, "auxiliary_loss_mlp": 0.01084619, "balance_loss_clip": 1.02825701, "balance_loss_mlp": 1.02592278, "epoch": 0.08381181421914925, "flos": 26102044615680.0, "grad_norm": 1.7798438229829463, "language_loss": 0.8641988, "learning_rate": 3.969701860282415e-06, "loss": 0.88613778, "num_input_tokens_seen": 29663920, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.8359375, "step": 1394, "time_per_iteration": 2.447401285171509 }, { "auxiliary_loss_clip": 0.01115185, "auxiliary_loss_mlp": 0.01088497, "balance_loss_clip": 1.02781987, "balance_loss_mlp": 1.02782297, "epoch": 0.08387193747181723, "flos": 20628780595200.0, "grad_norm": 2.370356933459279, "language_loss": 0.84495461, "learning_rate": 3.969634289062719e-06, "loss": 0.8669914, "num_input_tokens_seen": 29683825, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.875, "step": 1395, "time_per_iteration": 2.4002926349639893 }, { "auxiliary_loss_clip": 0.01116037, "auxiliary_loss_mlp": 0.01094812, "balance_loss_clip": 1.03110683, "balance_loss_mlp": 1.02817655, "epoch": 0.0839320607244852, "flos": 13442363337600.0, "grad_norm": 2.1400180442785484, "language_loss": 0.85569286, "learning_rate": 3.969566643154293e-06, "loss": 0.8778013, "num_input_tokens_seen": 29698775, "router_z_loss_clip": 0.63671875, "router_z_loss_mlp": 0.87890625, "step": 1396, "time_per_iteration": 2.368790864944458 }, { "auxiliary_loss_clip": 0.01109521, "auxiliary_loss_mlp": 0.01091867, "balance_loss_clip": 1.02384639, "balance_loss_mlp": 1.02685153, "epoch": 0.08399218397715316, "flos": 23476122126720.0, "grad_norm": 1.9649349017174513, "language_loss": 0.78599036, "learning_rate": 3.969498922559703e-06, "loss": 0.8080042, "num_input_tokens_seen": 29719430, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 0.828125, "step": 1397, "time_per_iteration": 2.4267778396606445 }, { "auxiliary_loss_clip": 0.01113542, "auxiliary_loss_mlp": 0.01091288, "balance_loss_clip": 1.0262953, "balance_loss_mlp": 1.02805257, "epoch": 0.08405230722982113, "flos": 25919553605760.0, "grad_norm": 2.1799131720060716, "language_loss": 0.79999208, "learning_rate": 3.969431127281516e-06, "loss": 0.82204044, "num_input_tokens_seen": 29739685, "router_z_loss_clip": 0.65234375, "router_z_loss_mlp": 0.85546875, "step": 1398, "time_per_iteration": 2.432393789291382 }, { "auxiliary_loss_clip": 0.01110378, "auxiliary_loss_mlp": 0.01088126, "balance_loss_clip": 1.03023756, "balance_loss_mlp": 1.02603745, "epoch": 0.0841124304824891, "flos": 17966478602880.0, "grad_norm": 2.2132381604744102, "language_loss": 0.96883905, "learning_rate": 3.969363257322304e-06, "loss": 0.9908241, "num_input_tokens_seen": 29756165, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.84375, "step": 1399, "time_per_iteration": 3.844805955886841 }, { "auxiliary_loss_clip": 0.01113905, "auxiliary_loss_mlp": 0.01084044, "balance_loss_clip": 1.02243638, "balance_loss_mlp": 1.02659392, "epoch": 0.08417255373515707, "flos": 25628482667520.0, "grad_norm": 2.293454677068318, "language_loss": 0.84955823, "learning_rate": 3.96929531268464e-06, "loss": 0.87153769, "num_input_tokens_seen": 29776425, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.875, "step": 1400, "time_per_iteration": 2.4963936805725098 }, { "auxiliary_loss_clip": 0.01112735, "auxiliary_loss_mlp": 0.01079858, "balance_loss_clip": 1.01870322, "balance_loss_mlp": 1.02786112, "epoch": 0.08423267698782504, "flos": 26248540147200.0, "grad_norm": 1.848027880186848, "language_loss": 0.8897863, "learning_rate": 3.969227293371099e-06, "loss": 0.91171229, "num_input_tokens_seen": 29796440, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.84765625, "step": 1401, "time_per_iteration": 3.905409097671509 }, { "auxiliary_loss_clip": 0.01116566, "auxiliary_loss_mlp": 0.01093457, "balance_loss_clip": 1.03149176, "balance_loss_mlp": 1.02777958, "epoch": 0.08429280024049302, "flos": 20118699498240.0, "grad_norm": 1.7897395412274912, "language_loss": 0.89774215, "learning_rate": 3.969159199384263e-06, "loss": 0.91984236, "num_input_tokens_seen": 29814755, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.88671875, "step": 1402, "time_per_iteration": 3.7939095497131348 }, { "auxiliary_loss_clip": 0.01113293, "auxiliary_loss_mlp": 0.01080029, "balance_loss_clip": 1.020329, "balance_loss_mlp": 1.02711666, "epoch": 0.08435292349316098, "flos": 42922849086720.0, "grad_norm": 2.0808049087504883, "language_loss": 0.90966964, "learning_rate": 3.9690910307267125e-06, "loss": 0.93160284, "num_input_tokens_seen": 29834785, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.86328125, "step": 1403, "time_per_iteration": 2.590278387069702 }, { "auxiliary_loss_clip": 0.01117132, "auxiliary_loss_mlp": 0.01081346, "balance_loss_clip": 1.02164555, "balance_loss_mlp": 1.02769399, "epoch": 0.08441304674582895, "flos": 22856169381120.0, "grad_norm": 2.1862286044609727, "language_loss": 0.82894778, "learning_rate": 3.969022787401033e-06, "loss": 0.8509326, "num_input_tokens_seen": 29854695, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.89453125, "step": 1404, "time_per_iteration": 3.798909902572632 }, { "auxiliary_loss_clip": 0.01119545, "auxiliary_loss_mlp": 0.01089754, "balance_loss_clip": 1.02604866, "balance_loss_mlp": 1.02864075, "epoch": 0.08447316999849692, "flos": 18696512465280.0, "grad_norm": 1.9164886258212148, "language_loss": 0.85926306, "learning_rate": 3.968954469409811e-06, "loss": 0.881356, "num_input_tokens_seen": 29872180, "router_z_loss_clip": 0.63671875, "router_z_loss_mlp": 0.90625, "step": 1405, "time_per_iteration": 2.3572118282318115 }, { "auxiliary_loss_clip": 0.01117086, "auxiliary_loss_mlp": 0.01078609, "balance_loss_clip": 1.01580918, "balance_loss_mlp": 1.02815735, "epoch": 0.08453329325116489, "flos": 25482790097280.0, "grad_norm": 1.456029417431269, "language_loss": 0.82221991, "learning_rate": 3.968886076755639e-06, "loss": 0.84417683, "num_input_tokens_seen": 29893205, "router_z_loss_clip": 0.62890625, "router_z_loss_mlp": 0.890625, "step": 1406, "time_per_iteration": 2.4397239685058594 }, { "auxiliary_loss_clip": 0.01117915, "auxiliary_loss_mlp": 0.01087035, "balance_loss_clip": 1.02769232, "balance_loss_mlp": 1.03029108, "epoch": 0.08459341650383286, "flos": 20919083483520.0, "grad_norm": 1.8756244038628709, "language_loss": 0.81468219, "learning_rate": 3.96881760944111e-06, "loss": 0.83673167, "num_input_tokens_seen": 29911970, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.875, "step": 1407, "time_per_iteration": 2.3878774642944336 }, { "auxiliary_loss_clip": 0.01112223, "auxiliary_loss_mlp": 0.01083123, "balance_loss_clip": 1.02661812, "balance_loss_mlp": 1.0285027, "epoch": 0.08465353975650082, "flos": 13042223712000.0, "grad_norm": 2.064055858098722, "language_loss": 0.93716156, "learning_rate": 3.968749067468819e-06, "loss": 0.95911503, "num_input_tokens_seen": 29929925, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.8359375, "step": 1408, "time_per_iteration": 2.3763062953948975 }, { "auxiliary_loss_clip": 0.01048802, "auxiliary_loss_mlp": 0.01009303, "balance_loss_clip": 1.00076771, "balance_loss_mlp": 1.02741575, "epoch": 0.0847136630091688, "flos": 60874174293120.0, "grad_norm": 0.899121895159353, "language_loss": 0.61954868, "learning_rate": 3.968680450841368e-06, "loss": 0.6401298, "num_input_tokens_seen": 29985950, "router_z_loss_clip": 0.08544922, "router_z_loss_mlp": 0.21386719, "step": 1409, "time_per_iteration": 3.1043546199798584 }, { "auxiliary_loss_clip": 0.011052, "auxiliary_loss_mlp": 0.0107871, "balance_loss_clip": 1.02497053, "balance_loss_mlp": 1.02605641, "epoch": 0.08477378626183676, "flos": 22045661101440.0, "grad_norm": 2.3133840908624625, "language_loss": 0.89147848, "learning_rate": 3.968611759561355e-06, "loss": 0.91331756, "num_input_tokens_seen": 30004330, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.79296875, "step": 1410, "time_per_iteration": 2.4078266620635986 }, { "auxiliary_loss_clip": 0.01113556, "auxiliary_loss_mlp": 0.01084476, "balance_loss_clip": 1.02100849, "balance_loss_mlp": 1.02655125, "epoch": 0.08483390951450473, "flos": 16689146267520.0, "grad_norm": 2.017730094235175, "language_loss": 0.76440203, "learning_rate": 3.968542993631388e-06, "loss": 0.78638232, "num_input_tokens_seen": 30022555, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.87109375, "step": 1411, "time_per_iteration": 2.376321792602539 }, { "auxiliary_loss_clip": 0.01034838, "auxiliary_loss_mlp": 0.01048379, "balance_loss_clip": 1.03874671, "balance_loss_mlp": 1.01362562, "epoch": 0.08489403276717271, "flos": 51581341710720.0, "grad_norm": 0.9844933613043992, "language_loss": 0.56860018, "learning_rate": 3.968474153054073e-06, "loss": 0.58943236, "num_input_tokens_seen": 30077220, "router_z_loss_clip": 0.09619141, "router_z_loss_mlp": 0.21289062, "step": 1412, "time_per_iteration": 2.970231056213379 }, { "auxiliary_loss_clip": 0.01110336, "auxiliary_loss_mlp": 0.01076913, "balance_loss_clip": 1.02260101, "balance_loss_mlp": 1.02662587, "epoch": 0.08495415601984067, "flos": 17091380574720.0, "grad_norm": 2.318960863034292, "language_loss": 0.91942322, "learning_rate": 3.96840523783202e-06, "loss": 0.94129574, "num_input_tokens_seen": 30094600, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.8359375, "step": 1413, "time_per_iteration": 2.365710496902466 }, { "auxiliary_loss_clip": 0.01108886, "auxiliary_loss_mlp": 0.01079732, "balance_loss_clip": 1.02401352, "balance_loss_mlp": 1.02638221, "epoch": 0.08501427927250864, "flos": 23147310142080.0, "grad_norm": 1.7790083794186835, "language_loss": 0.90110755, "learning_rate": 3.968336247967844e-06, "loss": 0.92299372, "num_input_tokens_seen": 30114475, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.82421875, "step": 1414, "time_per_iteration": 2.426056146621704 }, { "auxiliary_loss_clip": 0.01118333, "auxiliary_loss_mlp": 0.0108492, "balance_loss_clip": 1.03380275, "balance_loss_mlp": 1.03100491, "epoch": 0.08507440252517662, "flos": 19062437091840.0, "grad_norm": 1.6916776567164322, "language_loss": 0.79130346, "learning_rate": 3.96826718346416e-06, "loss": 0.81333601, "num_input_tokens_seen": 30133350, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.875, "step": 1415, "time_per_iteration": 2.3966903686523438 }, { "auxiliary_loss_clip": 0.01109801, "auxiliary_loss_mlp": 0.01082812, "balance_loss_clip": 1.03272045, "balance_loss_mlp": 1.02818453, "epoch": 0.08513452577784458, "flos": 60180137775360.0, "grad_norm": 1.6403556611559988, "language_loss": 0.72788024, "learning_rate": 3.968198044323587e-06, "loss": 0.7498064, "num_input_tokens_seen": 30159005, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.81640625, "step": 1416, "time_per_iteration": 2.780937671661377 }, { "auxiliary_loss_clip": 0.0112044, "auxiliary_loss_mlp": 0.01098129, "balance_loss_clip": 1.04224336, "balance_loss_mlp": 1.0331502, "epoch": 0.08519464903051255, "flos": 27307246348800.0, "grad_norm": 1.8788234481105044, "language_loss": 0.77030414, "learning_rate": 3.968128830548748e-06, "loss": 0.79248983, "num_input_tokens_seen": 30179450, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.875, "step": 1417, "time_per_iteration": 2.44214129447937 }, { "auxiliary_loss_clip": 0.01115767, "auxiliary_loss_mlp": 0.01088158, "balance_loss_clip": 1.03415632, "balance_loss_mlp": 1.03219712, "epoch": 0.08525477228318051, "flos": 20265404497920.0, "grad_norm": 2.226908673252253, "language_loss": 0.85176468, "learning_rate": 3.968059542142265e-06, "loss": 0.87380391, "num_input_tokens_seen": 30197235, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.8359375, "step": 1418, "time_per_iteration": 2.4115822315216064 }, { "auxiliary_loss_clip": 0.01032178, "auxiliary_loss_mlp": 0.0104884, "balance_loss_clip": 1.04082918, "balance_loss_mlp": 1.01130629, "epoch": 0.08531489553584849, "flos": 67611923268480.0, "grad_norm": 0.9103125170411256, "language_loss": 0.56791902, "learning_rate": 3.9679901791067685e-06, "loss": 0.58872914, "num_input_tokens_seen": 30257410, "router_z_loss_clip": 0.08007812, "router_z_loss_mlp": 0.20898438, "step": 1419, "time_per_iteration": 2.9400699138641357 }, { "auxiliary_loss_clip": 0.01116261, "auxiliary_loss_mlp": 0.01087167, "balance_loss_clip": 1.03481054, "balance_loss_mlp": 1.0308677, "epoch": 0.08537501878851646, "flos": 27525732837120.0, "grad_norm": 2.158622550947699, "language_loss": 0.73245436, "learning_rate": 3.967920741444886e-06, "loss": 0.75448865, "num_input_tokens_seen": 30277865, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.8515625, "step": 1420, "time_per_iteration": 2.505561351776123 }, { "auxiliary_loss_clip": 0.01113341, "auxiliary_loss_mlp": 0.01077893, "balance_loss_clip": 1.02517867, "balance_loss_mlp": 1.03103662, "epoch": 0.08543514204118442, "flos": 22783131083520.0, "grad_norm": 1.6820846281710935, "language_loss": 0.90052062, "learning_rate": 3.967851229159252e-06, "loss": 0.9224329, "num_input_tokens_seen": 30298545, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.8203125, "step": 1421, "time_per_iteration": 2.476797342300415 }, { "auxiliary_loss_clip": 0.01035795, "auxiliary_loss_mlp": 0.01012584, "balance_loss_clip": 1.00371516, "balance_loss_mlp": 1.01580596, "epoch": 0.0854952652938524, "flos": 60987362520960.0, "grad_norm": 0.8083987011504955, "language_loss": 0.63656533, "learning_rate": 3.967781642252502e-06, "loss": 0.65704906, "num_input_tokens_seen": 30361725, "router_z_loss_clip": 0.08886719, "router_z_loss_mlp": 0.19921875, "step": 1422, "time_per_iteration": 3.1240267753601074 }, { "auxiliary_loss_clip": 0.01112471, "auxiliary_loss_mlp": 0.01092175, "balance_loss_clip": 1.03595614, "balance_loss_mlp": 1.03040361, "epoch": 0.08555538854652037, "flos": 28036791452160.0, "grad_norm": 1.872987604068758, "language_loss": 0.85021901, "learning_rate": 3.967711980727276e-06, "loss": 0.87226552, "num_input_tokens_seen": 30382180, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.8203125, "step": 1423, "time_per_iteration": 2.452277183532715 }, { "auxiliary_loss_clip": 0.01113182, "auxiliary_loss_mlp": 0.01089304, "balance_loss_clip": 1.0351119, "balance_loss_mlp": 1.02935719, "epoch": 0.08561551179918833, "flos": 23508277355520.0, "grad_norm": 1.7501794811926428, "language_loss": 0.77323192, "learning_rate": 3.967642244586213e-06, "loss": 0.79525679, "num_input_tokens_seen": 30402980, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.8359375, "step": 1424, "time_per_iteration": 2.471280097961426 }, { "auxiliary_loss_clip": 0.01111512, "auxiliary_loss_mlp": 0.01081251, "balance_loss_clip": 1.02760649, "balance_loss_mlp": 1.02971339, "epoch": 0.08567563505185631, "flos": 17926084293120.0, "grad_norm": 1.7599352281880454, "language_loss": 0.78676742, "learning_rate": 3.96757243383196e-06, "loss": 0.80869502, "num_input_tokens_seen": 30420800, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 0.81640625, "step": 1425, "time_per_iteration": 2.3701725006103516 }, { "auxiliary_loss_clip": 0.01111266, "auxiliary_loss_mlp": 0.01080377, "balance_loss_clip": 1.02430069, "balance_loss_mlp": 1.02844954, "epoch": 0.08573575830452428, "flos": 19718490049920.0, "grad_norm": 1.878361620876402, "language_loss": 0.95259249, "learning_rate": 3.9675025484671624e-06, "loss": 0.97450894, "num_input_tokens_seen": 30439620, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.828125, "step": 1426, "time_per_iteration": 2.4057061672210693 }, { "auxiliary_loss_clip": 0.01116925, "auxiliary_loss_mlp": 0.01080495, "balance_loss_clip": 1.02069998, "balance_loss_mlp": 1.03035212, "epoch": 0.08579588155719224, "flos": 17930587858560.0, "grad_norm": 2.2873133884261785, "language_loss": 0.77411801, "learning_rate": 3.967432588494471e-06, "loss": 0.79609221, "num_input_tokens_seen": 30457300, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.8671875, "step": 1427, "time_per_iteration": 2.3804595470428467 }, { "auxiliary_loss_clip": 0.01111194, "auxiliary_loss_mlp": 0.01078663, "balance_loss_clip": 1.02852321, "balance_loss_mlp": 1.02917266, "epoch": 0.08585600480986022, "flos": 16032429993600.0, "grad_norm": 2.5994789225577244, "language_loss": 0.84816945, "learning_rate": 3.96736255391654e-06, "loss": 0.87006807, "num_input_tokens_seen": 30471580, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.8203125, "step": 1428, "time_per_iteration": 2.38411283493042 }, { "auxiliary_loss_clip": 0.01113934, "auxiliary_loss_mlp": 0.01080702, "balance_loss_clip": 1.02407813, "balance_loss_mlp": 1.02999592, "epoch": 0.08591612806252819, "flos": 28656185616000.0, "grad_norm": 2.057324733047282, "language_loss": 0.82921427, "learning_rate": 3.967292444736023e-06, "loss": 0.85116065, "num_input_tokens_seen": 30492720, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.83984375, "step": 1429, "time_per_iteration": 2.4644107818603516 }, { "auxiliary_loss_clip": 0.01116789, "auxiliary_loss_mlp": 0.01086597, "balance_loss_clip": 1.02804112, "balance_loss_mlp": 1.03144622, "epoch": 0.08597625131519615, "flos": 20958081338880.0, "grad_norm": 1.9420518808743064, "language_loss": 0.89817762, "learning_rate": 3.967222260955578e-06, "loss": 0.92021149, "num_input_tokens_seen": 30509535, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.8515625, "step": 1430, "time_per_iteration": 2.4637722969055176 }, { "auxiliary_loss_clip": 0.01116165, "auxiliary_loss_mlp": 0.01070056, "balance_loss_clip": 1.01619685, "balance_loss_mlp": 1.03415608, "epoch": 0.08603637456786412, "flos": 23255296577280.0, "grad_norm": 1.571123833435201, "language_loss": 0.83976698, "learning_rate": 3.96715200257787e-06, "loss": 0.86162913, "num_input_tokens_seen": 30529490, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.8203125, "step": 1431, "time_per_iteration": 2.4456446170806885 }, { "auxiliary_loss_clip": 0.01116124, "auxiliary_loss_mlp": 0.01072626, "balance_loss_clip": 1.02117491, "balance_loss_mlp": 1.03247929, "epoch": 0.0860964978205321, "flos": 28692914232960.0, "grad_norm": 1.7188953589434028, "language_loss": 0.79189759, "learning_rate": 3.967081669605559e-06, "loss": 0.81378508, "num_input_tokens_seen": 30550205, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.83984375, "step": 1432, "time_per_iteration": 2.4799957275390625 }, { "auxiliary_loss_clip": 0.01116294, "auxiliary_loss_mlp": 0.01078601, "balance_loss_clip": 1.02440858, "balance_loss_mlp": 1.03194785, "epoch": 0.08615662107320006, "flos": 19317372906240.0, "grad_norm": 1.96902837000507, "language_loss": 0.76014602, "learning_rate": 3.967011262041315e-06, "loss": 0.7820949, "num_input_tokens_seen": 30568830, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.84375, "step": 1433, "time_per_iteration": 2.407606601715088 }, { "auxiliary_loss_clip": 0.0112356, "auxiliary_loss_mlp": 0.01081192, "balance_loss_clip": 1.02156353, "balance_loss_mlp": 1.03532553, "epoch": 0.08621674432586802, "flos": 15850776856320.0, "grad_norm": 2.6097833625342957, "language_loss": 0.89278162, "learning_rate": 3.9669407798878065e-06, "loss": 0.91482919, "num_input_tokens_seen": 30585730, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.8828125, "step": 1434, "time_per_iteration": 2.4276256561279297 }, { "auxiliary_loss_clip": 0.01119496, "auxiliary_loss_mlp": 0.01081947, "balance_loss_clip": 1.0290184, "balance_loss_mlp": 1.03286588, "epoch": 0.086276867578536, "flos": 14099777838720.0, "grad_norm": 2.450471605172046, "language_loss": 0.81907034, "learning_rate": 3.966870223147707e-06, "loss": 0.84108478, "num_input_tokens_seen": 30603180, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.8671875, "step": 1435, "time_per_iteration": 2.394925832748413 }, { "auxiliary_loss_clip": 0.01045552, "auxiliary_loss_mlp": 0.010149, "balance_loss_clip": 1.00855815, "balance_loss_mlp": 1.02510858, "epoch": 0.08633699083120397, "flos": 70181250710400.0, "grad_norm": 0.9024021432673198, "language_loss": 0.57986838, "learning_rate": 3.96679959182369e-06, "loss": 0.60047293, "num_input_tokens_seen": 30668895, "router_z_loss_clip": 0.06347656, "router_z_loss_mlp": 0.20507812, "step": 1436, "time_per_iteration": 3.1369192600250244 }, { "auxiliary_loss_clip": 0.01119287, "auxiliary_loss_mlp": 0.01094713, "balance_loss_clip": 1.03568101, "balance_loss_mlp": 1.03313851, "epoch": 0.08639711408387193, "flos": 30297592275840.0, "grad_norm": 2.1584659396296706, "language_loss": 0.72806776, "learning_rate": 3.966728885918437e-06, "loss": 0.75020778, "num_input_tokens_seen": 30688955, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 0.859375, "step": 1437, "time_per_iteration": 2.47381591796875 }, { "auxiliary_loss_clip": 0.01111881, "auxiliary_loss_mlp": 0.0107538, "balance_loss_clip": 1.02516866, "balance_loss_mlp": 1.03080988, "epoch": 0.08645723733653991, "flos": 20296791676800.0, "grad_norm": 1.7511859793835034, "language_loss": 0.75384074, "learning_rate": 3.966658105434627e-06, "loss": 0.77571332, "num_input_tokens_seen": 30706095, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.8125, "step": 1438, "time_per_iteration": 2.4227497577667236 }, { "auxiliary_loss_clip": 0.01110918, "auxiliary_loss_mlp": 0.01072271, "balance_loss_clip": 1.02244115, "balance_loss_mlp": 1.03024793, "epoch": 0.08651736058920788, "flos": 32889195031680.0, "grad_norm": 2.006054621899296, "language_loss": 0.66755843, "learning_rate": 3.966587250374945e-06, "loss": 0.68939036, "num_input_tokens_seen": 30729025, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.80859375, "step": 1439, "time_per_iteration": 3.968137264251709 }, { "auxiliary_loss_clip": 0.01109768, "auxiliary_loss_mlp": 0.01089778, "balance_loss_clip": 1.03758812, "balance_loss_mlp": 1.0281558, "epoch": 0.08657748384187584, "flos": 22636286438400.0, "grad_norm": 2.1781412637823956, "language_loss": 0.90426373, "learning_rate": 3.966516320742077e-06, "loss": 0.92625922, "num_input_tokens_seen": 30746155, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.81640625, "step": 1440, "time_per_iteration": 2.426774501800537 }, { "auxiliary_loss_clip": 0.0111726, "auxiliary_loss_mlp": 0.01104429, "balance_loss_clip": 1.04463363, "balance_loss_mlp": 1.02877986, "epoch": 0.08663760709454381, "flos": 23657286504960.0, "grad_norm": 2.1808064570110806, "language_loss": 0.86329901, "learning_rate": 3.9664453165387124e-06, "loss": 0.88551581, "num_input_tokens_seen": 30761410, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.8828125, "step": 1441, "time_per_iteration": 3.8498198986053467 }, { "auxiliary_loss_clip": 0.01025273, "auxiliary_loss_mlp": 0.01014786, "balance_loss_clip": 1.00782466, "balance_loss_mlp": 1.00432384, "epoch": 0.08669773034721179, "flos": 62683688482560.0, "grad_norm": 0.8589133435335505, "language_loss": 0.60599625, "learning_rate": 3.966374237767545e-06, "loss": 0.62639678, "num_input_tokens_seen": 30823010, "router_z_loss_clip": 0.06982422, "router_z_loss_mlp": 0.20898438, "step": 1442, "time_per_iteration": 4.606550455093384 }, { "auxiliary_loss_clip": 0.01115005, "auxiliary_loss_mlp": 0.0108462, "balance_loss_clip": 1.03266871, "balance_loss_mlp": 1.02724648, "epoch": 0.08675785359987975, "flos": 20666451818880.0, "grad_norm": 2.183011345817947, "language_loss": 0.82634366, "learning_rate": 3.96630308443127e-06, "loss": 0.84833992, "num_input_tokens_seen": 30841980, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.87890625, "step": 1443, "time_per_iteration": 3.9023399353027344 }, { "auxiliary_loss_clip": 0.01112582, "auxiliary_loss_mlp": 0.01079389, "balance_loss_clip": 1.02512515, "balance_loss_mlp": 1.02741575, "epoch": 0.08681797685254772, "flos": 26939960179200.0, "grad_norm": 1.7510366184191377, "language_loss": 0.8502841, "learning_rate": 3.966231856532584e-06, "loss": 0.87220383, "num_input_tokens_seen": 30863280, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.8515625, "step": 1444, "time_per_iteration": 2.4551033973693848 }, { "auxiliary_loss_clip": 0.01115627, "auxiliary_loss_mlp": 0.01073469, "balance_loss_clip": 1.02037346, "balance_loss_mlp": 1.02859306, "epoch": 0.0868781001052157, "flos": 17711856990720.0, "grad_norm": 2.0176007667047724, "language_loss": 0.91051841, "learning_rate": 3.966160554074189e-06, "loss": 0.93240941, "num_input_tokens_seen": 30881710, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.87109375, "step": 1445, "time_per_iteration": 2.3741347789764404 }, { "auxiliary_loss_clip": 0.01116336, "auxiliary_loss_mlp": 0.01070498, "balance_loss_clip": 1.01835549, "balance_loss_mlp": 1.03191006, "epoch": 0.08693822335788366, "flos": 19895639621760.0, "grad_norm": 2.147681418847971, "language_loss": 0.84657621, "learning_rate": 3.96608917705879e-06, "loss": 0.86844456, "num_input_tokens_seen": 30900225, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.84375, "step": 1446, "time_per_iteration": 2.423619031906128 }, { "auxiliary_loss_clip": 0.01035002, "auxiliary_loss_mlp": 0.01007121, "balance_loss_clip": 1.00063586, "balance_loss_mlp": 1.01290298, "epoch": 0.08699834661055163, "flos": 67020878995200.0, "grad_norm": 0.7314113702234439, "language_loss": 0.54856455, "learning_rate": 3.966017725489091e-06, "loss": 0.56898582, "num_input_tokens_seen": 30959580, "router_z_loss_clip": 0.06494141, "router_z_loss_mlp": 0.22070312, "step": 1447, "time_per_iteration": 3.0771048069000244 }, { "auxiliary_loss_clip": 0.0111426, "auxiliary_loss_mlp": 0.01072662, "balance_loss_clip": 1.0185647, "balance_loss_mlp": 1.03261232, "epoch": 0.0870584698632196, "flos": 13479650536320.0, "grad_norm": 2.072212645881914, "language_loss": 0.87208784, "learning_rate": 3.965946199367804e-06, "loss": 0.89395702, "num_input_tokens_seen": 30976775, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.81640625, "step": 1448, "time_per_iteration": 2.3829267024993896 }, { "auxiliary_loss_clip": 0.01123087, "auxiliary_loss_mlp": 0.01074617, "balance_loss_clip": 1.01801658, "balance_loss_mlp": 1.03458977, "epoch": 0.08711859311588757, "flos": 16106096695680.0, "grad_norm": 2.5894806760174016, "language_loss": 0.85140091, "learning_rate": 3.965874598697638e-06, "loss": 0.87337792, "num_input_tokens_seen": 30990495, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.8828125, "step": 1449, "time_per_iteration": 2.3910915851593018 }, { "auxiliary_loss_clip": 0.01117386, "auxiliary_loss_mlp": 0.01079108, "balance_loss_clip": 1.02322221, "balance_loss_mlp": 1.03371429, "epoch": 0.08717871636855554, "flos": 38470829512320.0, "grad_norm": 1.514910957918278, "language_loss": 0.73106921, "learning_rate": 3.965802923481313e-06, "loss": 0.75303411, "num_input_tokens_seen": 31014080, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.8359375, "step": 1450, "time_per_iteration": 2.5526249408721924 }, { "auxiliary_loss_clip": 0.01116501, "auxiliary_loss_mlp": 0.01080257, "balance_loss_clip": 1.02596951, "balance_loss_mlp": 1.03247833, "epoch": 0.0872388396212235, "flos": 17599681192320.0, "grad_norm": 1.8398225043867875, "language_loss": 0.8475877, "learning_rate": 3.965731173721542e-06, "loss": 0.86955529, "num_input_tokens_seen": 31031210, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.83984375, "step": 1451, "time_per_iteration": 2.4064066410064697 }, { "auxiliary_loss_clip": 0.01115896, "auxiliary_loss_mlp": 0.01081916, "balance_loss_clip": 1.02867699, "balance_loss_mlp": 1.03330588, "epoch": 0.08729896287389148, "flos": 25258368677760.0, "grad_norm": 1.7512759878795952, "language_loss": 0.76147288, "learning_rate": 3.965659349421049e-06, "loss": 0.78345096, "num_input_tokens_seen": 31049710, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.82421875, "step": 1452, "time_per_iteration": 2.4591469764709473 }, { "auxiliary_loss_clip": 0.01117849, "auxiliary_loss_mlp": 0.01088864, "balance_loss_clip": 1.03562462, "balance_loss_mlp": 1.03127027, "epoch": 0.08735908612655945, "flos": 15631557229440.0, "grad_norm": 2.5387254400196277, "language_loss": 0.83638072, "learning_rate": 3.965587450582556e-06, "loss": 0.85844791, "num_input_tokens_seen": 31066160, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.8671875, "step": 1453, "time_per_iteration": 2.4037959575653076 }, { "auxiliary_loss_clip": 0.01114814, "auxiliary_loss_mlp": 0.0107522, "balance_loss_clip": 1.02684498, "balance_loss_mlp": 1.0319078, "epoch": 0.08741920937922741, "flos": 20338617352320.0, "grad_norm": 2.5070264808703895, "language_loss": 0.73180056, "learning_rate": 3.96551547720879e-06, "loss": 0.75370097, "num_input_tokens_seen": 31085270, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.828125, "step": 1454, "time_per_iteration": 2.453380823135376 }, { "auxiliary_loss_clip": 0.01029206, "auxiliary_loss_mlp": 0.01018032, "balance_loss_clip": 1.01121342, "balance_loss_mlp": 1.00741255, "epoch": 0.08747933263189539, "flos": 62816252515200.0, "grad_norm": 0.7899442217902768, "language_loss": 0.58746308, "learning_rate": 3.96544342930248e-06, "loss": 0.60793549, "num_input_tokens_seen": 31148445, "router_z_loss_clip": 0.06835938, "router_z_loss_mlp": 0.21875, "step": 1455, "time_per_iteration": 3.0455079078674316 }, { "auxiliary_loss_clip": 0.01111902, "auxiliary_loss_mlp": 0.01078802, "balance_loss_clip": 1.02134299, "balance_loss_mlp": 1.02725852, "epoch": 0.08753945588456336, "flos": 33034503576960.0, "grad_norm": 1.712183080070253, "language_loss": 0.79740429, "learning_rate": 3.965371306866359e-06, "loss": 0.81931138, "num_input_tokens_seen": 31168770, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.84765625, "step": 1456, "time_per_iteration": 2.514287233352661 }, { "auxiliary_loss_clip": 0.01108973, "auxiliary_loss_mlp": 0.01077824, "balance_loss_clip": 1.02050805, "balance_loss_mlp": 1.02620721, "epoch": 0.08759957913723132, "flos": 35545911206400.0, "grad_norm": 1.8340154401536264, "language_loss": 0.74541891, "learning_rate": 3.96529910990316e-06, "loss": 0.7672869, "num_input_tokens_seen": 31189270, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.828125, "step": 1457, "time_per_iteration": 2.5370287895202637 }, { "auxiliary_loss_clip": 0.01106001, "auxiliary_loss_mlp": 0.01066732, "balance_loss_clip": 1.0150907, "balance_loss_mlp": 1.02479792, "epoch": 0.0876597023898993, "flos": 23910092726400.0, "grad_norm": 1.509206461076867, "language_loss": 0.88228375, "learning_rate": 3.965226838415622e-06, "loss": 0.90401107, "num_input_tokens_seen": 31210385, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.8125, "step": 1458, "time_per_iteration": 2.52462100982666 }, { "auxiliary_loss_clip": 0.01111115, "auxiliary_loss_mlp": 0.01074795, "balance_loss_clip": 1.02107942, "balance_loss_mlp": 1.02947068, "epoch": 0.08771982564256726, "flos": 18113043957120.0, "grad_norm": 1.594725376203298, "language_loss": 0.81962031, "learning_rate": 3.965154492406486e-06, "loss": 0.84147942, "num_input_tokens_seen": 31229745, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.81640625, "step": 1459, "time_per_iteration": 2.4338138103485107 }, { "auxiliary_loss_clip": 0.01113952, "auxiliary_loss_mlp": 0.01083841, "balance_loss_clip": 1.02750266, "balance_loss_mlp": 1.02884984, "epoch": 0.08777994889523523, "flos": 17711054029440.0, "grad_norm": 2.12412159317749, "language_loss": 0.86771858, "learning_rate": 3.9650820718784945e-06, "loss": 0.88969648, "num_input_tokens_seen": 31248280, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.8515625, "step": 1460, "time_per_iteration": 2.3847978115081787 }, { "auxiliary_loss_clip": 0.01111219, "auxiliary_loss_mlp": 0.01077052, "balance_loss_clip": 1.02357459, "balance_loss_mlp": 1.02895474, "epoch": 0.0878400721479032, "flos": 12819198746880.0, "grad_norm": 2.5102168231036037, "language_loss": 0.83369768, "learning_rate": 3.965009576834394e-06, "loss": 0.85558033, "num_input_tokens_seen": 31262190, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 0.82421875, "step": 1461, "time_per_iteration": 2.3685266971588135 }, { "auxiliary_loss_clip": 0.01115651, "auxiliary_loss_mlp": 0.01082455, "balance_loss_clip": 1.02954984, "balance_loss_mlp": 1.03143322, "epoch": 0.08790019540057117, "flos": 26391579454080.0, "grad_norm": 1.6498477376973795, "language_loss": 0.7713989, "learning_rate": 3.964937007276932e-06, "loss": 0.79338002, "num_input_tokens_seen": 31283690, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.84375, "step": 1462, "time_per_iteration": 2.486179828643799 }, { "auxiliary_loss_clip": 0.01116504, "auxiliary_loss_mlp": 0.01086724, "balance_loss_clip": 1.02683353, "balance_loss_mlp": 1.02971745, "epoch": 0.08796031865323914, "flos": 19133066505600.0, "grad_norm": 2.9582952116501096, "language_loss": 0.78336728, "learning_rate": 3.9648643632088634e-06, "loss": 0.80539954, "num_input_tokens_seen": 31302505, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.8671875, "step": 1463, "time_per_iteration": 2.407639503479004 }, { "auxiliary_loss_clip": 0.01114469, "auxiliary_loss_mlp": 0.01082092, "balance_loss_clip": 1.02460885, "balance_loss_mlp": 1.02797174, "epoch": 0.0880204419059071, "flos": 26063186405760.0, "grad_norm": 1.8364709504617904, "language_loss": 0.86058027, "learning_rate": 3.964791644632941e-06, "loss": 0.88254589, "num_input_tokens_seen": 31323070, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.8671875, "step": 1464, "time_per_iteration": 2.4386796951293945 }, { "auxiliary_loss_clip": 0.01113357, "auxiliary_loss_mlp": 0.01080794, "balance_loss_clip": 1.02705455, "balance_loss_mlp": 1.02911568, "epoch": 0.08808056515857508, "flos": 22376881969920.0, "grad_norm": 2.071622882381408, "language_loss": 0.80284196, "learning_rate": 3.964718851551923e-06, "loss": 0.82478344, "num_input_tokens_seen": 31341880, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.84375, "step": 1465, "time_per_iteration": 2.4034459590911865 }, { "auxiliary_loss_clip": 0.01116493, "auxiliary_loss_mlp": 0.01080247, "balance_loss_clip": 1.02204871, "balance_loss_mlp": 1.02890396, "epoch": 0.08814068841124305, "flos": 23184178404480.0, "grad_norm": 1.9811141977764644, "language_loss": 0.88102806, "learning_rate": 3.9646459839685675e-06, "loss": 0.90299541, "num_input_tokens_seen": 31361995, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 0.87890625, "step": 1466, "time_per_iteration": 2.428858757019043 }, { "auxiliary_loss_clip": 0.01111531, "auxiliary_loss_mlp": 0.01083564, "balance_loss_clip": 1.02424502, "balance_loss_mlp": 1.02694619, "epoch": 0.08820081166391101, "flos": 25154117758080.0, "grad_norm": 2.0553448563211307, "language_loss": 0.85068804, "learning_rate": 3.964573041885641e-06, "loss": 0.872639, "num_input_tokens_seen": 31381515, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.84375, "step": 1467, "time_per_iteration": 2.4491524696350098 }, { "auxiliary_loss_clip": 0.01112154, "auxiliary_loss_mlp": 0.01078099, "balance_loss_clip": 1.02125967, "balance_loss_mlp": 1.02750313, "epoch": 0.08826093491657899, "flos": 22230735552000.0, "grad_norm": 1.662114176694134, "language_loss": 0.78000438, "learning_rate": 3.964500025305907e-06, "loss": 0.80190694, "num_input_tokens_seen": 31400345, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.84375, "step": 1468, "time_per_iteration": 2.4559433460235596 }, { "auxiliary_loss_clip": 0.01111113, "auxiliary_loss_mlp": 0.01075941, "balance_loss_clip": 1.02358401, "balance_loss_mlp": 1.02803528, "epoch": 0.08832105816924696, "flos": 22125751493760.0, "grad_norm": 1.5428232121566852, "language_loss": 0.81364679, "learning_rate": 3.9644269342321355e-06, "loss": 0.83551735, "num_input_tokens_seen": 31419620, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.828125, "step": 1469, "time_per_iteration": 2.431121349334717 }, { "auxiliary_loss_clip": 0.01113366, "auxiliary_loss_mlp": 0.01079233, "balance_loss_clip": 1.02155972, "balance_loss_mlp": 1.02842462, "epoch": 0.08838118142191492, "flos": 17565536016000.0, "grad_norm": 2.117264508542001, "language_loss": 0.80071217, "learning_rate": 3.9643537686670974e-06, "loss": 0.82263815, "num_input_tokens_seen": 31437970, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.8515625, "step": 1470, "time_per_iteration": 2.396859884262085 }, { "auxiliary_loss_clip": 0.0111011, "auxiliary_loss_mlp": 0.01082593, "balance_loss_clip": 1.02432382, "balance_loss_mlp": 1.02704954, "epoch": 0.0884413046745829, "flos": 20776148910720.0, "grad_norm": 1.811908864242278, "language_loss": 0.86483073, "learning_rate": 3.964280528613569e-06, "loss": 0.88675773, "num_input_tokens_seen": 31457040, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 0.828125, "step": 1471, "time_per_iteration": 2.4096903800964355 }, { "auxiliary_loss_clip": 0.0110308, "auxiliary_loss_mlp": 0.01075301, "balance_loss_clip": 1.02654409, "balance_loss_mlp": 1.02547359, "epoch": 0.08850142792725087, "flos": 22124424862080.0, "grad_norm": 1.5157660656074357, "language_loss": 0.84710777, "learning_rate": 3.964207214074324e-06, "loss": 0.8688916, "num_input_tokens_seen": 31477520, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.7734375, "step": 1472, "time_per_iteration": 2.4593687057495117 }, { "auxiliary_loss_clip": 0.0110988, "auxiliary_loss_mlp": 0.01075374, "balance_loss_clip": 1.02053773, "balance_loss_mlp": 1.02671289, "epoch": 0.08856155117991883, "flos": 22417660304640.0, "grad_norm": 2.4906480891090883, "language_loss": 0.86922204, "learning_rate": 3.964133825052146e-06, "loss": 0.89107454, "num_input_tokens_seen": 31495575, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.83203125, "step": 1473, "time_per_iteration": 2.3947339057922363 }, { "auxiliary_loss_clip": 0.01111052, "auxiliary_loss_mlp": 0.01075839, "balance_loss_clip": 1.02250493, "balance_loss_mlp": 1.02588296, "epoch": 0.0886216744325868, "flos": 29935647544320.0, "grad_norm": 1.6252181455641697, "language_loss": 0.80898452, "learning_rate": 3.964060361549816e-06, "loss": 0.8308534, "num_input_tokens_seen": 31520020, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.8515625, "step": 1474, "time_per_iteration": 2.5051679611206055 }, { "auxiliary_loss_clip": 0.01107695, "auxiliary_loss_mlp": 0.01080868, "balance_loss_clip": 1.02333748, "balance_loss_mlp": 1.02549338, "epoch": 0.08868179768525478, "flos": 23981839303680.0, "grad_norm": 1.773515094833119, "language_loss": 0.81431776, "learning_rate": 3.963986823570121e-06, "loss": 0.8362034, "num_input_tokens_seen": 31539265, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.82421875, "step": 1475, "time_per_iteration": 2.412865400314331 }, { "auxiliary_loss_clip": 0.0110928, "auxiliary_loss_mlp": 0.0107432, "balance_loss_clip": 1.02103305, "balance_loss_mlp": 1.02490354, "epoch": 0.08874192093792274, "flos": 43175934599040.0, "grad_norm": 1.556744656878787, "language_loss": 0.76169324, "learning_rate": 3.963913211115848e-06, "loss": 0.78352916, "num_input_tokens_seen": 31563425, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.84375, "step": 1476, "time_per_iteration": 2.580961227416992 }, { "auxiliary_loss_clip": 0.01110321, "auxiliary_loss_mlp": 0.01077995, "balance_loss_clip": 1.02296758, "balance_loss_mlp": 1.0258162, "epoch": 0.0888020441905907, "flos": 32851104871680.0, "grad_norm": 1.5813534333872663, "language_loss": 0.76812601, "learning_rate": 3.9638395241897895e-06, "loss": 0.79000914, "num_input_tokens_seen": 31584525, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.84375, "step": 1477, "time_per_iteration": 2.5251657962799072 }, { "auxiliary_loss_clip": 0.0111123, "auxiliary_loss_mlp": 0.0108111, "balance_loss_clip": 1.02529621, "balance_loss_mlp": 1.0262146, "epoch": 0.08886216744325869, "flos": 23148217837440.0, "grad_norm": 1.871240919147204, "language_loss": 0.89508778, "learning_rate": 3.963765762794739e-06, "loss": 0.9170112, "num_input_tokens_seen": 31603325, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.8515625, "step": 1478, "time_per_iteration": 2.4318556785583496 }, { "auxiliary_loss_clip": 0.01108157, "auxiliary_loss_mlp": 0.01075162, "balance_loss_clip": 1.02008748, "balance_loss_mlp": 1.02472031, "epoch": 0.08892229069592665, "flos": 23330464467840.0, "grad_norm": 1.5450972500791056, "language_loss": 0.79439819, "learning_rate": 3.963691926933495e-06, "loss": 0.81623137, "num_input_tokens_seen": 31624820, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.8359375, "step": 1479, "time_per_iteration": 5.375801086425781 }, { "auxiliary_loss_clip": 0.01107266, "auxiliary_loss_mlp": 0.01069251, "balance_loss_clip": 1.01715624, "balance_loss_mlp": 1.02508116, "epoch": 0.08898241394859462, "flos": 26212579580160.0, "grad_norm": 2.4087970391462044, "language_loss": 0.80752939, "learning_rate": 3.9636180166088555e-06, "loss": 0.82929456, "num_input_tokens_seen": 31646080, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.8203125, "step": 1480, "time_per_iteration": 2.4887309074401855 }, { "auxiliary_loss_clip": 0.01110635, "auxiliary_loss_mlp": 0.01082888, "balance_loss_clip": 1.0250001, "balance_loss_mlp": 1.02557898, "epoch": 0.0890425372012626, "flos": 23549474626560.0, "grad_norm": 1.6490382780217814, "language_loss": 0.69447935, "learning_rate": 3.963544031823624e-06, "loss": 0.71641457, "num_input_tokens_seen": 31665770, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.8515625, "step": 1481, "time_per_iteration": 3.8958072662353516 }, { "auxiliary_loss_clip": 0.01109083, "auxiliary_loss_mlp": 0.01069186, "balance_loss_clip": 1.02095389, "balance_loss_mlp": 1.02680755, "epoch": 0.08910266045393056, "flos": 23001687394560.0, "grad_norm": 1.9861187832442893, "language_loss": 0.98841119, "learning_rate": 3.9634699725806065e-06, "loss": 1.01019382, "num_input_tokens_seen": 31683805, "router_z_loss_clip": 0.48242188, "router_z_loss_mlp": 0.8203125, "step": 1482, "time_per_iteration": 3.8417856693267822 }, { "auxiliary_loss_clip": 0.01114455, "auxiliary_loss_mlp": 0.0108084, "balance_loss_clip": 1.02404857, "balance_loss_mlp": 1.02737927, "epoch": 0.08916278370659853, "flos": 31935298331520.0, "grad_norm": 1.864300427496658, "language_loss": 0.80082321, "learning_rate": 3.96339583888261e-06, "loss": 0.82277614, "num_input_tokens_seen": 31704630, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.87109375, "step": 1483, "time_per_iteration": 2.519275665283203 }, { "auxiliary_loss_clip": 0.01111005, "auxiliary_loss_mlp": 0.01083248, "balance_loss_clip": 1.02819729, "balance_loss_mlp": 1.02736831, "epoch": 0.08922290695926649, "flos": 17529435803520.0, "grad_norm": 2.394137763470711, "language_loss": 0.87980139, "learning_rate": 3.963321630732448e-06, "loss": 0.90174389, "num_input_tokens_seen": 31723255, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.8359375, "step": 1484, "time_per_iteration": 2.386676549911499 }, { "auxiliary_loss_clip": 0.01116719, "auxiliary_loss_mlp": 0.01077153, "balance_loss_clip": 1.02095723, "balance_loss_mlp": 1.02943611, "epoch": 0.08928303021193447, "flos": 32123689361280.0, "grad_norm": 1.650437815783227, "language_loss": 0.81977254, "learning_rate": 3.963247348132932e-06, "loss": 0.84171122, "num_input_tokens_seen": 31747045, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.87109375, "step": 1485, "time_per_iteration": 2.481862783432007 }, { "auxiliary_loss_clip": 0.01107643, "auxiliary_loss_mlp": 0.01067612, "balance_loss_clip": 1.02197814, "balance_loss_mlp": 1.02539802, "epoch": 0.08934315346460243, "flos": 22124180482560.0, "grad_norm": 1.6348100812736432, "language_loss": 0.84867632, "learning_rate": 3.96317299108688e-06, "loss": 0.87042892, "num_input_tokens_seen": 31766615, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.82421875, "step": 1486, "time_per_iteration": 2.427866220474243 }, { "auxiliary_loss_clip": 0.0110823, "auxiliary_loss_mlp": 0.01079714, "balance_loss_clip": 1.02537894, "balance_loss_mlp": 1.02566624, "epoch": 0.0894032767172704, "flos": 22564470038400.0, "grad_norm": 1.6325281266127587, "language_loss": 0.78536284, "learning_rate": 3.963098559597111e-06, "loss": 0.80724233, "num_input_tokens_seen": 31785855, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.82421875, "step": 1487, "time_per_iteration": 2.405565023422241 }, { "auxiliary_loss_clip": 0.01108807, "auxiliary_loss_mlp": 0.01078202, "balance_loss_clip": 1.02603602, "balance_loss_mlp": 1.02558923, "epoch": 0.08946339996993838, "flos": 20192366200320.0, "grad_norm": 2.0813114479462085, "language_loss": 0.85506034, "learning_rate": 3.963024053666449e-06, "loss": 0.87693036, "num_input_tokens_seen": 31804210, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.83203125, "step": 1488, "time_per_iteration": 2.400275945663452 }, { "auxiliary_loss_clip": 0.01105438, "auxiliary_loss_mlp": 0.01067989, "balance_loss_clip": 1.02121127, "balance_loss_mlp": 1.02490723, "epoch": 0.08952352322260634, "flos": 48358372060800.0, "grad_norm": 1.777939991273865, "language_loss": 0.74371445, "learning_rate": 3.962949473297718e-06, "loss": 0.76544875, "num_input_tokens_seen": 31826150, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 0.8046875, "step": 1489, "time_per_iteration": 2.6290080547332764 }, { "auxiliary_loss_clip": 0.011064, "auxiliary_loss_mlp": 0.01078983, "balance_loss_clip": 1.02505255, "balance_loss_mlp": 1.02541518, "epoch": 0.08958364647527431, "flos": 31791805176960.0, "grad_norm": 1.805984216123992, "language_loss": 0.91613555, "learning_rate": 3.962874818493745e-06, "loss": 0.93798935, "num_input_tokens_seen": 31848060, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.80859375, "step": 1490, "time_per_iteration": 2.4992287158966064 }, { "auxiliary_loss_clip": 0.01110975, "auxiliary_loss_mlp": 0.01081261, "balance_loss_clip": 1.02809381, "balance_loss_mlp": 1.02654743, "epoch": 0.08964376972794229, "flos": 23367053439360.0, "grad_norm": 2.066606195312171, "language_loss": 0.77037829, "learning_rate": 3.9628000892573635e-06, "loss": 0.7923007, "num_input_tokens_seen": 31870040, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.84375, "step": 1491, "time_per_iteration": 2.424330949783325 }, { "auxiliary_loss_clip": 0.01106218, "auxiliary_loss_mlp": 0.0106472, "balance_loss_clip": 1.01758516, "balance_loss_mlp": 1.02610373, "epoch": 0.08970389298061025, "flos": 23293666028160.0, "grad_norm": 1.7139758131889415, "language_loss": 0.79328859, "learning_rate": 3.9627252855914055e-06, "loss": 0.81499797, "num_input_tokens_seen": 31890400, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 0.80078125, "step": 1492, "time_per_iteration": 2.4186363220214844 }, { "auxiliary_loss_clip": 0.0110577, "auxiliary_loss_mlp": 0.0107999, "balance_loss_clip": 1.02975535, "balance_loss_mlp": 1.02684021, "epoch": 0.08976401623327822, "flos": 33760417898880.0, "grad_norm": 2.0929453957468644, "language_loss": 0.73093432, "learning_rate": 3.962650407498707e-06, "loss": 0.75279194, "num_input_tokens_seen": 31913435, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.7890625, "step": 1493, "time_per_iteration": 2.498934507369995 }, { "auxiliary_loss_clip": 0.01107342, "auxiliary_loss_mlp": 0.01080879, "balance_loss_clip": 1.02930951, "balance_loss_mlp": 1.02570963, "epoch": 0.08982413948594618, "flos": 23910302194560.0, "grad_norm": 1.6719924955778092, "language_loss": 0.88979417, "learning_rate": 3.962575454982109e-06, "loss": 0.91167641, "num_input_tokens_seen": 31932435, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.81640625, "step": 1494, "time_per_iteration": 2.436227560043335 }, { "auxiliary_loss_clip": 0.01107432, "auxiliary_loss_mlp": 0.01076442, "balance_loss_clip": 1.02580237, "balance_loss_mlp": 1.0265584, "epoch": 0.08988426273861416, "flos": 16836584405760.0, "grad_norm": 1.6152033139692503, "language_loss": 0.84875274, "learning_rate": 3.962500428044454e-06, "loss": 0.8705914, "num_input_tokens_seen": 31950125, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.80859375, "step": 1495, "time_per_iteration": 2.370147466659546 }, { "auxiliary_loss_clip": 0.01111222, "auxiliary_loss_mlp": 0.01084687, "balance_loss_clip": 1.03054237, "balance_loss_mlp": 1.02658439, "epoch": 0.08994438599128213, "flos": 14792489591040.0, "grad_norm": 2.5087939534055232, "language_loss": 0.72740614, "learning_rate": 3.962425326688585e-06, "loss": 0.74936521, "num_input_tokens_seen": 31968050, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.84765625, "step": 1496, "time_per_iteration": 2.4009199142456055 }, { "auxiliary_loss_clip": 0.01108157, "auxiliary_loss_mlp": 0.01073697, "balance_loss_clip": 1.02210319, "balance_loss_mlp": 1.02644873, "epoch": 0.09000450924395009, "flos": 17383359208320.0, "grad_norm": 1.5683690833251411, "language_loss": 0.82072341, "learning_rate": 3.962350150917351e-06, "loss": 0.84254199, "num_input_tokens_seen": 31985675, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.81640625, "step": 1497, "time_per_iteration": 2.3679754734039307 }, { "auxiliary_loss_clip": 0.01114222, "auxiliary_loss_mlp": 0.01091999, "balance_loss_clip": 1.0322994, "balance_loss_mlp": 1.02717352, "epoch": 0.09006463249661807, "flos": 24279159375360.0, "grad_norm": 2.078008327489936, "language_loss": 0.85090685, "learning_rate": 3.9622749007336035e-06, "loss": 0.87296909, "num_input_tokens_seen": 32005180, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.87109375, "step": 1498, "time_per_iteration": 2.4333441257476807 }, { "auxiliary_loss_clip": 0.01113483, "auxiliary_loss_mlp": 0.01088362, "balance_loss_clip": 1.02928185, "balance_loss_mlp": 1.027282, "epoch": 0.09012475574928604, "flos": 13661094205440.0, "grad_norm": 2.2239430335605586, "language_loss": 0.81338161, "learning_rate": 3.962199576140195e-06, "loss": 0.8354001, "num_input_tokens_seen": 32022970, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 0.86328125, "step": 1499, "time_per_iteration": 2.382538080215454 }, { "auxiliary_loss_clip": 0.01108616, "auxiliary_loss_mlp": 0.01087125, "balance_loss_clip": 1.03312302, "balance_loss_mlp": 1.02713943, "epoch": 0.090184879001954, "flos": 23326728952320.0, "grad_norm": 1.6609470057309854, "language_loss": 0.94312239, "learning_rate": 3.962124177139981e-06, "loss": 0.96507972, "num_input_tokens_seen": 32043055, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.8125, "step": 1500, "time_per_iteration": 2.4417648315429688 }, { "auxiliary_loss_clip": 0.01117478, "auxiliary_loss_mlp": 0.01081727, "balance_loss_clip": 1.02560329, "balance_loss_mlp": 1.02830207, "epoch": 0.09024500225462198, "flos": 23001582660480.0, "grad_norm": 2.384421745382371, "language_loss": 0.77172077, "learning_rate": 3.962048703735822e-06, "loss": 0.79371285, "num_input_tokens_seen": 32061900, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.890625, "step": 1501, "time_per_iteration": 2.412130355834961 }, { "auxiliary_loss_clip": 0.01031203, "auxiliary_loss_mlp": 0.01012354, "balance_loss_clip": 1.00381875, "balance_loss_mlp": 1.0109694, "epoch": 0.09030512550728995, "flos": 62185966007040.0, "grad_norm": 0.7434971847226125, "language_loss": 0.58397663, "learning_rate": 3.96197315593058e-06, "loss": 0.6044122, "num_input_tokens_seen": 32122745, "router_z_loss_clip": 0.08544922, "router_z_loss_mlp": 0.20214844, "step": 1502, "time_per_iteration": 3.039884328842163 }, { "auxiliary_loss_clip": 0.01113158, "auxiliary_loss_mlp": 0.01077862, "balance_loss_clip": 1.02810419, "balance_loss_mlp": 1.02844501, "epoch": 0.09036524875995791, "flos": 38799152737920.0, "grad_norm": 2.656786334468031, "language_loss": 0.73868883, "learning_rate": 3.961897533727119e-06, "loss": 0.76059902, "num_input_tokens_seen": 32145125, "router_z_loss_clip": 0.49804688, "router_z_loss_mlp": 0.84765625, "step": 1503, "time_per_iteration": 2.5744569301605225 }, { "auxiliary_loss_clip": 0.01120002, "auxiliary_loss_mlp": 0.01089592, "balance_loss_clip": 1.03146553, "balance_loss_mlp": 1.03034616, "epoch": 0.09042537201262588, "flos": 21688987985280.0, "grad_norm": 1.8825656797466777, "language_loss": 0.87943518, "learning_rate": 3.961821837128306e-06, "loss": 0.9015311, "num_input_tokens_seen": 32166255, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.8984375, "step": 1504, "time_per_iteration": 2.447361469268799 }, { "auxiliary_loss_clip": 0.01122538, "auxiliary_loss_mlp": 0.01099923, "balance_loss_clip": 1.03388131, "balance_loss_mlp": 1.03130198, "epoch": 0.09048549526529386, "flos": 22266102625920.0, "grad_norm": 1.8164687052282962, "language_loss": 0.75615788, "learning_rate": 3.961746066137014e-06, "loss": 0.77838242, "num_input_tokens_seen": 32184010, "router_z_loss_clip": 0.66015625, "router_z_loss_mlp": 0.9140625, "step": 1505, "time_per_iteration": 2.3914897441864014 }, { "auxiliary_loss_clip": 0.01113895, "auxiliary_loss_mlp": 0.01087988, "balance_loss_clip": 1.02685773, "balance_loss_mlp": 1.02978277, "epoch": 0.09054561851796182, "flos": 14610068403840.0, "grad_norm": 1.9622807901576116, "language_loss": 0.84377092, "learning_rate": 3.961670220756114e-06, "loss": 0.86578977, "num_input_tokens_seen": 32201635, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.83984375, "step": 1506, "time_per_iteration": 2.4064269065856934 }, { "auxiliary_loss_clip": 0.01115243, "auxiliary_loss_mlp": 0.01084887, "balance_loss_clip": 1.02814388, "balance_loss_mlp": 1.03013754, "epoch": 0.09060574177062979, "flos": 27634941169920.0, "grad_norm": 2.5462207629751776, "language_loss": 0.79067171, "learning_rate": 3.961594300988482e-06, "loss": 0.81267309, "num_input_tokens_seen": 32221940, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.8515625, "step": 1507, "time_per_iteration": 2.4762706756591797 }, { "auxiliary_loss_clip": 0.0103263, "auxiliary_loss_mlp": 0.01007821, "balance_loss_clip": 1.00004888, "balance_loss_mlp": 1.01230764, "epoch": 0.09066586502329776, "flos": 66082657495680.0, "grad_norm": 0.7336673025900186, "language_loss": 0.57814103, "learning_rate": 3.961518306836998e-06, "loss": 0.59854555, "num_input_tokens_seen": 32276495, "router_z_loss_clip": 0.07763672, "router_z_loss_mlp": 0.203125, "step": 1508, "time_per_iteration": 2.869333028793335 }, { "auxiliary_loss_clip": 0.01116022, "auxiliary_loss_mlp": 0.01096671, "balance_loss_clip": 1.03751945, "balance_loss_mlp": 1.02970862, "epoch": 0.09072598827596573, "flos": 18915452801280.0, "grad_norm": 1.7032013109541906, "language_loss": 0.86847401, "learning_rate": 3.961442238304543e-06, "loss": 0.89060092, "num_input_tokens_seen": 32294130, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.86328125, "step": 1509, "time_per_iteration": 2.4741878509521484 }, { "auxiliary_loss_clip": 0.01124513, "auxiliary_loss_mlp": 0.01096673, "balance_loss_clip": 1.03339696, "balance_loss_mlp": 1.0302484, "epoch": 0.0907861115286337, "flos": 24820732385280.0, "grad_norm": 2.1688221227733795, "language_loss": 0.87194145, "learning_rate": 3.961366095394002e-06, "loss": 0.89415336, "num_input_tokens_seen": 32313555, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.9453125, "step": 1510, "time_per_iteration": 2.433610439300537 }, { "auxiliary_loss_clip": 0.01118004, "auxiliary_loss_mlp": 0.01099033, "balance_loss_clip": 1.03714001, "balance_loss_mlp": 1.02932215, "epoch": 0.09084623478130167, "flos": 21651770609280.0, "grad_norm": 2.0422380703842347, "language_loss": 0.89015245, "learning_rate": 3.961289878108262e-06, "loss": 0.91232282, "num_input_tokens_seen": 32331430, "router_z_loss_clip": 0.62109375, "router_z_loss_mlp": 0.88671875, "step": 1511, "time_per_iteration": 2.410454273223877 }, { "auxiliary_loss_clip": 0.01113964, "auxiliary_loss_mlp": 0.01092176, "balance_loss_clip": 1.03238022, "balance_loss_mlp": 1.02987075, "epoch": 0.09090635803396964, "flos": 27637943546880.0, "grad_norm": 1.428868263474219, "language_loss": 0.86297405, "learning_rate": 3.9612135864502135e-06, "loss": 0.8850354, "num_input_tokens_seen": 32353705, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.83984375, "step": 1512, "time_per_iteration": 2.466679334640503 }, { "auxiliary_loss_clip": 0.01111912, "auxiliary_loss_mlp": 0.0107679, "balance_loss_clip": 1.02245474, "balance_loss_mlp": 1.02744186, "epoch": 0.0909664812866376, "flos": 17668355569920.0, "grad_norm": 2.816549002711642, "language_loss": 0.90307796, "learning_rate": 3.961137220422749e-06, "loss": 0.92496502, "num_input_tokens_seen": 32370520, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.84375, "step": 1513, "time_per_iteration": 2.363409996032715 }, { "auxiliary_loss_clip": 0.01112741, "auxiliary_loss_mlp": 0.01080415, "balance_loss_clip": 1.02290881, "balance_loss_mlp": 1.02823675, "epoch": 0.09102660453930557, "flos": 23950312479360.0, "grad_norm": 1.819754572719603, "language_loss": 0.88657814, "learning_rate": 3.961060780028764e-06, "loss": 0.90850973, "num_input_tokens_seen": 32389105, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.84375, "step": 1514, "time_per_iteration": 2.4353113174438477 }, { "auxiliary_loss_clip": 0.01114721, "auxiliary_loss_mlp": 0.01097078, "balance_loss_clip": 1.0397861, "balance_loss_mlp": 1.02902222, "epoch": 0.09108672779197355, "flos": 25811741727360.0, "grad_norm": 1.752178218617194, "language_loss": 0.92032456, "learning_rate": 3.960984265271159e-06, "loss": 0.94244254, "num_input_tokens_seen": 32408065, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.85546875, "step": 1515, "time_per_iteration": 2.4596993923187256 }, { "auxiliary_loss_clip": 0.01113694, "auxiliary_loss_mlp": 0.01082629, "balance_loss_clip": 1.02295291, "balance_loss_mlp": 1.02768445, "epoch": 0.09114685104464151, "flos": 29638292561280.0, "grad_norm": 1.9227267279839466, "language_loss": 0.87477815, "learning_rate": 3.9609076761528335e-06, "loss": 0.89674139, "num_input_tokens_seen": 32427225, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.859375, "step": 1516, "time_per_iteration": 2.463984727859497 }, { "auxiliary_loss_clip": 0.01118362, "auxiliary_loss_mlp": 0.01085374, "balance_loss_clip": 1.0278194, "balance_loss_mlp": 1.03084636, "epoch": 0.09120697429730948, "flos": 33728227758720.0, "grad_norm": 1.4721470117606972, "language_loss": 0.82415593, "learning_rate": 3.960831012676692e-06, "loss": 0.84619325, "num_input_tokens_seen": 32450510, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.875, "step": 1517, "time_per_iteration": 2.518859624862671 }, { "auxiliary_loss_clip": 0.01120377, "auxiliary_loss_mlp": 0.0109069, "balance_loss_clip": 1.02607906, "balance_loss_mlp": 1.03049028, "epoch": 0.09126709754997746, "flos": 18400519025280.0, "grad_norm": 1.7662413571937445, "language_loss": 0.79791492, "learning_rate": 3.960754274845642e-06, "loss": 0.82002568, "num_input_tokens_seen": 32468425, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 0.8984375, "step": 1518, "time_per_iteration": 3.8782687187194824 }, { "auxiliary_loss_clip": 0.01113453, "auxiliary_loss_mlp": 0.01088132, "balance_loss_clip": 1.02557123, "balance_loss_mlp": 1.02892709, "epoch": 0.09132722080264542, "flos": 22090838267520.0, "grad_norm": 1.6890513801790352, "language_loss": 0.89372605, "learning_rate": 3.960677462662594e-06, "loss": 0.91574192, "num_input_tokens_seen": 32487510, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.84765625, "step": 1519, "time_per_iteration": 3.8329615592956543 }, { "auxiliary_loss_clip": 0.011151, "auxiliary_loss_mlp": 0.01080916, "balance_loss_clip": 1.0199281, "balance_loss_mlp": 1.02956319, "epoch": 0.09138734405531339, "flos": 21032062243200.0, "grad_norm": 2.143659652590676, "language_loss": 0.76293993, "learning_rate": 3.96060057613046e-06, "loss": 0.78490007, "num_input_tokens_seen": 32507250, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.85546875, "step": 1520, "time_per_iteration": 3.941753625869751 }, { "auxiliary_loss_clip": 0.0111784, "auxiliary_loss_mlp": 0.01081193, "balance_loss_clip": 1.0172019, "balance_loss_mlp": 1.03070045, "epoch": 0.09144746730798137, "flos": 20082913488000.0, "grad_norm": 2.2890452716644507, "language_loss": 0.88765359, "learning_rate": 3.960523615252156e-06, "loss": 0.90964389, "num_input_tokens_seen": 32526045, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 0.87109375, "step": 1521, "time_per_iteration": 2.4317142963409424 }, { "auxiliary_loss_clip": 0.01119535, "auxiliary_loss_mlp": 0.01087612, "balance_loss_clip": 1.0281024, "balance_loss_mlp": 1.030599, "epoch": 0.09150759056064933, "flos": 22777265975040.0, "grad_norm": 1.925555600767937, "language_loss": 0.86278105, "learning_rate": 3.960446580030599e-06, "loss": 0.88485247, "num_input_tokens_seen": 32546575, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.890625, "step": 1522, "time_per_iteration": 3.969216823577881 }, { "auxiliary_loss_clip": 0.01110378, "auxiliary_loss_mlp": 0.01087944, "balance_loss_clip": 1.02967477, "balance_loss_mlp": 1.02874708, "epoch": 0.0915677138133173, "flos": 27562950213120.0, "grad_norm": 1.7411318178844923, "language_loss": 0.82919931, "learning_rate": 3.960369470468711e-06, "loss": 0.85118252, "num_input_tokens_seen": 32568795, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 0.81640625, "step": 1523, "time_per_iteration": 2.4459054470062256 }, { "auxiliary_loss_clip": 0.0111452, "auxiliary_loss_mlp": 0.01089493, "balance_loss_clip": 1.02964997, "balance_loss_mlp": 1.02889049, "epoch": 0.09162783706598528, "flos": 17673836653440.0, "grad_norm": 2.271721925238458, "language_loss": 0.76478308, "learning_rate": 3.960292286569418e-06, "loss": 0.78682315, "num_input_tokens_seen": 32587010, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.85546875, "step": 1524, "time_per_iteration": 2.380953311920166 }, { "auxiliary_loss_clip": 0.01112174, "auxiliary_loss_mlp": 0.01079847, "balance_loss_clip": 1.02424788, "balance_loss_mlp": 1.02752495, "epoch": 0.09168796031865324, "flos": 18477223015680.0, "grad_norm": 2.346109390527503, "language_loss": 0.8828454, "learning_rate": 3.960215028335644e-06, "loss": 0.90476561, "num_input_tokens_seen": 32602375, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.84375, "step": 1525, "time_per_iteration": 2.3737895488739014 }, { "auxiliary_loss_clip": 0.01114648, "auxiliary_loss_mlp": 0.01081909, "balance_loss_clip": 1.02301931, "balance_loss_mlp": 1.02992296, "epoch": 0.0917480835713212, "flos": 29386324212480.0, "grad_norm": 1.9602787547021547, "language_loss": 0.76971221, "learning_rate": 3.96013769577032e-06, "loss": 0.79167777, "num_input_tokens_seen": 32621460, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 0.84765625, "step": 1526, "time_per_iteration": 2.4519925117492676 }, { "auxiliary_loss_clip": 0.01112, "auxiliary_loss_mlp": 0.01080243, "balance_loss_clip": 1.02566838, "balance_loss_mlp": 1.02896392, "epoch": 0.09180820682398917, "flos": 19828222053120.0, "grad_norm": 1.8519425670605048, "language_loss": 0.79425347, "learning_rate": 3.960060288876378e-06, "loss": 0.81617594, "num_input_tokens_seen": 32640440, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.828125, "step": 1527, "time_per_iteration": 2.388181686401367 }, { "auxiliary_loss_clip": 0.01112667, "auxiliary_loss_mlp": 0.01081673, "balance_loss_clip": 1.02254558, "balance_loss_mlp": 1.02794886, "epoch": 0.09186833007665715, "flos": 23840720121600.0, "grad_norm": 1.9131397859192008, "language_loss": 0.82477582, "learning_rate": 3.959982807656753e-06, "loss": 0.84671915, "num_input_tokens_seen": 32660020, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 0.84765625, "step": 1528, "time_per_iteration": 2.4230844974517822 }, { "auxiliary_loss_clip": 0.01116468, "auxiliary_loss_mlp": 0.01080869, "balance_loss_clip": 1.02317202, "balance_loss_mlp": 1.03010392, "epoch": 0.09192845332932512, "flos": 12931898215680.0, "grad_norm": 2.5570562671691874, "language_loss": 0.79772675, "learning_rate": 3.959905252114384e-06, "loss": 0.81970012, "num_input_tokens_seen": 32678170, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.86328125, "step": 1529, "time_per_iteration": 2.4137628078460693 }, { "auxiliary_loss_clip": 0.01113831, "auxiliary_loss_mlp": 0.01083875, "balance_loss_clip": 1.02577209, "balance_loss_mlp": 1.02598858, "epoch": 0.09198857658199308, "flos": 24567123202560.0, "grad_norm": 2.047433573079776, "language_loss": 0.85047191, "learning_rate": 3.959827622252211e-06, "loss": 0.87244898, "num_input_tokens_seen": 32697540, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.87890625, "step": 1530, "time_per_iteration": 2.42779278755188 }, { "auxiliary_loss_clip": 0.01112382, "auxiliary_loss_mlp": 0.01083326, "balance_loss_clip": 1.02667737, "balance_loss_mlp": 1.02954578, "epoch": 0.09204869983466106, "flos": 20265893256960.0, "grad_norm": 1.9283982258582641, "language_loss": 0.86324906, "learning_rate": 3.959749918073179e-06, "loss": 0.8852061, "num_input_tokens_seen": 32716805, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.828125, "step": 1531, "time_per_iteration": 2.397167682647705 }, { "auxiliary_loss_clip": 0.01111363, "auxiliary_loss_mlp": 0.01079901, "balance_loss_clip": 1.02349126, "balance_loss_mlp": 1.02722526, "epoch": 0.09210882308732903, "flos": 20884624104960.0, "grad_norm": 1.7640482213845297, "language_loss": 0.82682145, "learning_rate": 3.959672139580233e-06, "loss": 0.84873402, "num_input_tokens_seen": 32736385, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.84375, "step": 1532, "time_per_iteration": 2.4126298427581787 }, { "auxiliary_loss_clip": 0.01113209, "auxiliary_loss_mlp": 0.01083749, "balance_loss_clip": 1.02714825, "balance_loss_mlp": 1.02861822, "epoch": 0.09216894633999699, "flos": 30955006776960.0, "grad_norm": 1.825428949142385, "language_loss": 0.86100858, "learning_rate": 3.9595942867763235e-06, "loss": 0.88297808, "num_input_tokens_seen": 32757140, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.84765625, "step": 1533, "time_per_iteration": 2.4915108680725098 }, { "auxiliary_loss_clip": 0.01111485, "auxiliary_loss_mlp": 0.01081142, "balance_loss_clip": 1.02930975, "balance_loss_mlp": 1.02756143, "epoch": 0.09222906959266497, "flos": 13150733817600.0, "grad_norm": 1.9602486106635217, "language_loss": 0.92837369, "learning_rate": 3.959516359664402e-06, "loss": 0.95029998, "num_input_tokens_seen": 32774860, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.83984375, "step": 1534, "time_per_iteration": 2.403778076171875 }, { "auxiliary_loss_clip": 0.01112943, "auxiliary_loss_mlp": 0.01096766, "balance_loss_clip": 1.03699422, "balance_loss_mlp": 1.02815974, "epoch": 0.09228919284533293, "flos": 25993290130560.0, "grad_norm": 2.368152291143002, "language_loss": 0.7839613, "learning_rate": 3.959438358247424e-06, "loss": 0.80605841, "num_input_tokens_seen": 32795250, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.84765625, "step": 1535, "time_per_iteration": 2.4370861053466797 }, { "auxiliary_loss_clip": 0.01104872, "auxiliary_loss_mlp": 0.01078993, "balance_loss_clip": 1.02470541, "balance_loss_mlp": 1.02602029, "epoch": 0.0923493160980009, "flos": 18659818759680.0, "grad_norm": 1.7222424341723932, "language_loss": 0.83470994, "learning_rate": 3.959360282528346e-06, "loss": 0.85654861, "num_input_tokens_seen": 32813805, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.7890625, "step": 1536, "time_per_iteration": 2.412506580352783 }, { "auxiliary_loss_clip": 0.0110547, "auxiliary_loss_mlp": 0.01074689, "balance_loss_clip": 1.02273798, "balance_loss_mlp": 1.02472973, "epoch": 0.09240943935066886, "flos": 21139559919360.0, "grad_norm": 1.92284312716687, "language_loss": 0.91189051, "learning_rate": 3.959282132510131e-06, "loss": 0.93369216, "num_input_tokens_seen": 32830960, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.8046875, "step": 1537, "time_per_iteration": 2.38391375541687 }, { "auxiliary_loss_clip": 0.01111628, "auxiliary_loss_mlp": 0.0108499, "balance_loss_clip": 1.0272212, "balance_loss_mlp": 1.02669752, "epoch": 0.09246956260333684, "flos": 20591458485120.0, "grad_norm": 1.9652504614992454, "language_loss": 0.83112502, "learning_rate": 3.959203908195741e-06, "loss": 0.85309124, "num_input_tokens_seen": 32848275, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.84765625, "step": 1538, "time_per_iteration": 2.4389352798461914 }, { "auxiliary_loss_clip": 0.01029558, "auxiliary_loss_mlp": 0.01007044, "balance_loss_clip": 0.99998689, "balance_loss_mlp": 1.00782847, "epoch": 0.09252968585600481, "flos": 67555153664640.0, "grad_norm": 0.7462228114839802, "language_loss": 0.57460278, "learning_rate": 3.959125609588142e-06, "loss": 0.5949688, "num_input_tokens_seen": 32917730, "router_z_loss_clip": 0.07080078, "router_z_loss_mlp": 0.21679688, "step": 1539, "time_per_iteration": 3.1154885292053223 }, { "auxiliary_loss_clip": 0.01112289, "auxiliary_loss_mlp": 0.01081833, "balance_loss_clip": 1.02308667, "balance_loss_mlp": 1.0286262, "epoch": 0.09258980910867277, "flos": 17382905360640.0, "grad_norm": 2.20583550041661, "language_loss": 0.70783305, "learning_rate": 3.959047236690304e-06, "loss": 0.72977436, "num_input_tokens_seen": 32934910, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.8359375, "step": 1540, "time_per_iteration": 2.381819725036621 }, { "auxiliary_loss_clip": 0.01111804, "auxiliary_loss_mlp": 0.01070341, "balance_loss_clip": 1.01486063, "balance_loss_mlp": 1.02877617, "epoch": 0.09264993236134075, "flos": 19864880847360.0, "grad_norm": 1.7652616938640926, "language_loss": 0.85139537, "learning_rate": 3.958968789505198e-06, "loss": 0.87321687, "num_input_tokens_seen": 32953840, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.83203125, "step": 1541, "time_per_iteration": 2.3839406967163086 }, { "auxiliary_loss_clip": 0.01027101, "auxiliary_loss_mlp": 0.01010737, "balance_loss_clip": 1.00291705, "balance_loss_mlp": 1.00495458, "epoch": 0.09271005561400872, "flos": 62281558909440.0, "grad_norm": 0.8929756684182153, "language_loss": 0.62018621, "learning_rate": 3.9588902680358e-06, "loss": 0.64056462, "num_input_tokens_seen": 33011410, "router_z_loss_clip": 0.078125, "router_z_loss_mlp": 0.22167969, "step": 1542, "time_per_iteration": 3.051844358444214 }, { "auxiliary_loss_clip": 0.01111346, "auxiliary_loss_mlp": 0.01075436, "balance_loss_clip": 1.02329421, "balance_loss_mlp": 1.0284003, "epoch": 0.09277017886667668, "flos": 23328788722560.0, "grad_norm": 1.5237704694615548, "language_loss": 0.84647286, "learning_rate": 3.958811672285086e-06, "loss": 0.86834067, "num_input_tokens_seen": 33031675, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.83203125, "step": 1543, "time_per_iteration": 2.426017999649048 }, { "auxiliary_loss_clip": 0.01107356, "auxiliary_loss_mlp": 0.01074721, "balance_loss_clip": 1.0221262, "balance_loss_mlp": 1.02798557, "epoch": 0.09283030211934466, "flos": 54743183435520.0, "grad_norm": 1.5796454495499592, "language_loss": 0.74377328, "learning_rate": 3.958733002256038e-06, "loss": 0.76559407, "num_input_tokens_seen": 33056355, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.796875, "step": 1544, "time_per_iteration": 2.690892219543457 }, { "auxiliary_loss_clip": 0.0111193, "auxiliary_loss_mlp": 0.01073564, "balance_loss_clip": 1.02135026, "balance_loss_mlp": 1.02825546, "epoch": 0.09289042537201263, "flos": 30333517931520.0, "grad_norm": 1.660205508839919, "language_loss": 0.79370642, "learning_rate": 3.958654257951637e-06, "loss": 0.81556141, "num_input_tokens_seen": 33079520, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.8359375, "step": 1545, "time_per_iteration": 2.4560794830322266 }, { "auxiliary_loss_clip": 0.01107708, "auxiliary_loss_mlp": 0.01077919, "balance_loss_clip": 1.02425075, "balance_loss_mlp": 1.02750731, "epoch": 0.09295054862468059, "flos": 17745932344320.0, "grad_norm": 3.434896832588203, "language_loss": 0.79653955, "learning_rate": 3.9585754393748706e-06, "loss": 0.81839579, "num_input_tokens_seen": 33096135, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.8046875, "step": 1546, "time_per_iteration": 2.3745367527008057 }, { "auxiliary_loss_clip": 0.01107977, "auxiliary_loss_mlp": 0.01068469, "balance_loss_clip": 1.01673222, "balance_loss_mlp": 1.0264827, "epoch": 0.09301067187734856, "flos": 23656937391360.0, "grad_norm": 1.730644398449994, "language_loss": 0.86076963, "learning_rate": 3.9584965465287275e-06, "loss": 0.88253415, "num_input_tokens_seen": 33115245, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.81640625, "step": 1547, "time_per_iteration": 2.415879726409912 }, { "auxiliary_loss_clip": 0.01111751, "auxiliary_loss_mlp": 0.01083771, "balance_loss_clip": 1.03067517, "balance_loss_mlp": 1.02696967, "epoch": 0.09307079513001654, "flos": 27526465975680.0, "grad_norm": 1.8871385391519144, "language_loss": 0.7104708, "learning_rate": 3.958417579416199e-06, "loss": 0.73242599, "num_input_tokens_seen": 33136640, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.8515625, "step": 1548, "time_per_iteration": 2.447303533554077 }, { "auxiliary_loss_clip": 0.01110969, "auxiliary_loss_mlp": 0.01077747, "balance_loss_clip": 1.02171826, "balance_loss_mlp": 1.02755284, "epoch": 0.0931309183826845, "flos": 20626406622720.0, "grad_norm": 2.5382528167317915, "language_loss": 0.85762775, "learning_rate": 3.9583385380402795e-06, "loss": 0.87951493, "num_input_tokens_seen": 33155060, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.8359375, "step": 1549, "time_per_iteration": 2.414156436920166 }, { "auxiliary_loss_clip": 0.01104786, "auxiliary_loss_mlp": 0.01072112, "balance_loss_clip": 1.02223539, "balance_loss_mlp": 1.02682269, "epoch": 0.09319104163535247, "flos": 29019701358720.0, "grad_norm": 1.5537684547248862, "language_loss": 0.77314007, "learning_rate": 3.958259422403966e-06, "loss": 0.794909, "num_input_tokens_seen": 33175420, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.78125, "step": 1550, "time_per_iteration": 2.5018465518951416 }, { "auxiliary_loss_clip": 0.01106761, "auxiliary_loss_mlp": 0.01082223, "balance_loss_clip": 1.0286746, "balance_loss_mlp": 1.02446139, "epoch": 0.09325116488802045, "flos": 25300368910080.0, "grad_norm": 2.112278640344617, "language_loss": 0.85835373, "learning_rate": 3.95818023251026e-06, "loss": 0.8802436, "num_input_tokens_seen": 33194120, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 0.8203125, "step": 1551, "time_per_iteration": 2.4166924953460693 }, { "auxiliary_loss_clip": 0.0102262, "auxiliary_loss_mlp": 0.01026248, "balance_loss_clip": 1.02045465, "balance_loss_mlp": 1.00239635, "epoch": 0.09331128814068841, "flos": 61532880514560.0, "grad_norm": 0.7650003354319651, "language_loss": 0.6194067, "learning_rate": 3.958100968362163e-06, "loss": 0.63989538, "num_input_tokens_seen": 33261080, "router_z_loss_clip": 0.05786133, "router_z_loss_mlp": 0.20214844, "step": 1552, "time_per_iteration": 3.142827033996582 }, { "auxiliary_loss_clip": 0.01025153, "auxiliary_loss_mlp": 0.01007158, "balance_loss_clip": 1.00119793, "balance_loss_mlp": 1.00498664, "epoch": 0.09337141139335638, "flos": 53290515052800.0, "grad_norm": 0.8331778187860939, "language_loss": 0.59030706, "learning_rate": 3.958021629962681e-06, "loss": 0.61063015, "num_input_tokens_seen": 33330235, "router_z_loss_clip": 0.05957031, "router_z_loss_mlp": 0.20117188, "step": 1553, "time_per_iteration": 3.1626241207122803 }, { "auxiliary_loss_clip": 0.01109075, "auxiliary_loss_mlp": 0.0107384, "balance_loss_clip": 1.02134061, "balance_loss_mlp": 1.02737617, "epoch": 0.09343153464602436, "flos": 23475738101760.0, "grad_norm": 1.9319505666546912, "language_loss": 0.9002043, "learning_rate": 3.957942217314823e-06, "loss": 0.92203349, "num_input_tokens_seen": 33349035, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.81640625, "step": 1554, "time_per_iteration": 2.427488327026367 }, { "auxiliary_loss_clip": 0.01104856, "auxiliary_loss_mlp": 0.01069639, "balance_loss_clip": 1.02298045, "balance_loss_mlp": 1.02935243, "epoch": 0.09349165789869232, "flos": 19352495600640.0, "grad_norm": 1.758496786718638, "language_loss": 0.83159781, "learning_rate": 3.957862730421599e-06, "loss": 0.85334265, "num_input_tokens_seen": 33368060, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 0.75390625, "step": 1555, "time_per_iteration": 2.4667415618896484 }, { "auxiliary_loss_clip": 0.01031897, "auxiliary_loss_mlp": 0.0100935, "balance_loss_clip": 1.00310326, "balance_loss_mlp": 1.01125431, "epoch": 0.09355178115136029, "flos": 67499572913280.0, "grad_norm": 0.9059725369078745, "language_loss": 0.59695101, "learning_rate": 3.957783169286024e-06, "loss": 0.61736351, "num_input_tokens_seen": 33430825, "router_z_loss_clip": 0.0625, "router_z_loss_mlp": 0.20703125, "step": 1556, "time_per_iteration": 3.074983596801758 }, { "auxiliary_loss_clip": 0.01105572, "auxiliary_loss_mlp": 0.01084063, "balance_loss_clip": 1.03325653, "balance_loss_mlp": 1.02782059, "epoch": 0.09361190440402825, "flos": 37340132353920.0, "grad_norm": 1.638913048941182, "language_loss": 0.8656038, "learning_rate": 3.9577035339111155e-06, "loss": 0.88750023, "num_input_tokens_seen": 33454855, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.77734375, "step": 1557, "time_per_iteration": 4.161519527435303 }, { "auxiliary_loss_clip": 0.01106419, "auxiliary_loss_mlp": 0.01076303, "balance_loss_clip": 1.02377987, "balance_loss_mlp": 1.02708817, "epoch": 0.09367202765669623, "flos": 24898553539200.0, "grad_norm": 1.8434474701957655, "language_loss": 0.79396212, "learning_rate": 3.957623824299893e-06, "loss": 0.81578934, "num_input_tokens_seen": 33476000, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.79296875, "step": 1558, "time_per_iteration": 2.4767630100250244 }, { "auxiliary_loss_clip": 0.011109, "auxiliary_loss_mlp": 0.01088082, "balance_loss_clip": 1.03489101, "balance_loss_mlp": 1.03006721, "epoch": 0.0937321509093642, "flos": 15704665349760.0, "grad_norm": 2.0589546542043986, "language_loss": 0.82168341, "learning_rate": 3.957544040455379e-06, "loss": 0.84367323, "num_input_tokens_seen": 33493845, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.80859375, "step": 1559, "time_per_iteration": 3.7921829223632812 }, { "auxiliary_loss_clip": 0.01104497, "auxiliary_loss_mlp": 0.01079965, "balance_loss_clip": 1.03275776, "balance_loss_mlp": 1.02828133, "epoch": 0.09379227416203216, "flos": 20482704000000.0, "grad_norm": 1.8961158718188131, "language_loss": 0.78434062, "learning_rate": 3.957464182380599e-06, "loss": 0.80618531, "num_input_tokens_seen": 33510850, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.76171875, "step": 1560, "time_per_iteration": 3.8214142322540283 }, { "auxiliary_loss_clip": 0.01109541, "auxiliary_loss_mlp": 0.01083402, "balance_loss_clip": 1.03509879, "balance_loss_mlp": 1.02703166, "epoch": 0.09385239741470014, "flos": 24351359800320.0, "grad_norm": 2.131541226283718, "language_loss": 0.83193266, "learning_rate": 3.95738425007858e-06, "loss": 0.85386217, "num_input_tokens_seen": 33530430, "router_z_loss_clip": 0.48242188, "router_z_loss_mlp": 0.82421875, "step": 1561, "time_per_iteration": 3.8965706825256348 }, { "auxiliary_loss_clip": 0.01104066, "auxiliary_loss_mlp": 0.01089738, "balance_loss_clip": 1.04119647, "balance_loss_mlp": 1.02517021, "epoch": 0.0939125206673681, "flos": 33290102707200.0, "grad_norm": 1.9692995827690527, "language_loss": 0.64528763, "learning_rate": 3.957304243552354e-06, "loss": 0.66722572, "num_input_tokens_seen": 33551975, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.7890625, "step": 1562, "time_per_iteration": 2.4954569339752197 }, { "auxiliary_loss_clip": 0.01101187, "auxiliary_loss_mlp": 0.01076462, "balance_loss_clip": 1.03242636, "balance_loss_mlp": 1.02564025, "epoch": 0.09397264392003607, "flos": 19243915672320.0, "grad_norm": 1.8985637387716423, "language_loss": 0.87626088, "learning_rate": 3.957224162804956e-06, "loss": 0.89803737, "num_input_tokens_seen": 33569850, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.7578125, "step": 1563, "time_per_iteration": 2.398174524307251 }, { "auxiliary_loss_clip": 0.01100595, "auxiliary_loss_mlp": 0.01075822, "balance_loss_clip": 1.0278995, "balance_loss_mlp": 1.02481222, "epoch": 0.09403276717270405, "flos": 19316919058560.0, "grad_norm": 1.8995493509656518, "language_loss": 0.79113925, "learning_rate": 3.9571440078394205e-06, "loss": 0.8129034, "num_input_tokens_seen": 33590510, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.7578125, "step": 1564, "time_per_iteration": 2.3925321102142334 }, { "auxiliary_loss_clip": 0.01103273, "auxiliary_loss_mlp": 0.01081313, "balance_loss_clip": 1.03370118, "balance_loss_mlp": 1.02530456, "epoch": 0.09409289042537201, "flos": 23582432816640.0, "grad_norm": 1.7752293655588647, "language_loss": 0.82152903, "learning_rate": 3.9570637786587895e-06, "loss": 0.84337491, "num_input_tokens_seen": 33608810, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.78125, "step": 1565, "time_per_iteration": 2.4311835765838623 }, { "auxiliary_loss_clip": 0.0110452, "auxiliary_loss_mlp": 0.01074704, "balance_loss_clip": 1.02387357, "balance_loss_mlp": 1.02522397, "epoch": 0.09415301367803998, "flos": 20077572049920.0, "grad_norm": 1.8521367298484126, "language_loss": 0.7796939, "learning_rate": 3.956983475266103e-06, "loss": 0.80148613, "num_input_tokens_seen": 33627265, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.7890625, "step": 1566, "time_per_iteration": 2.3932371139526367 }, { "auxiliary_loss_clip": 0.01107134, "auxiliary_loss_mlp": 0.01072801, "balance_loss_clip": 1.02368712, "balance_loss_mlp": 1.02791309, "epoch": 0.09421313693070796, "flos": 21061215095040.0, "grad_norm": 2.00594193303136, "language_loss": 0.80909908, "learning_rate": 3.956903097664407e-06, "loss": 0.8308984, "num_input_tokens_seen": 33644810, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 0.79296875, "step": 1567, "time_per_iteration": 2.4117980003356934 }, { "auxiliary_loss_clip": 0.01111898, "auxiliary_loss_mlp": 0.01076306, "balance_loss_clip": 1.0257374, "balance_loss_mlp": 1.03119612, "epoch": 0.09427326018337592, "flos": 24315015208320.0, "grad_norm": 2.132844068242884, "language_loss": 0.84984076, "learning_rate": 3.956822645856749e-06, "loss": 0.87172282, "num_input_tokens_seen": 33665665, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.8046875, "step": 1568, "time_per_iteration": 2.43041729927063 }, { "auxiliary_loss_clip": 0.01110439, "auxiliary_loss_mlp": 0.01075039, "balance_loss_clip": 1.02342153, "balance_loss_mlp": 1.03023338, "epoch": 0.09433338343604389, "flos": 20262925791360.0, "grad_norm": 2.183146245202465, "language_loss": 0.78571796, "learning_rate": 3.9567421198461814e-06, "loss": 0.80757272, "num_input_tokens_seen": 33684760, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.80078125, "step": 1569, "time_per_iteration": 2.4096498489379883 }, { "auxiliary_loss_clip": 0.01108764, "auxiliary_loss_mlp": 0.01076066, "balance_loss_clip": 1.02795291, "balance_loss_mlp": 1.03015351, "epoch": 0.09439350668871185, "flos": 12742355111040.0, "grad_norm": 2.634157246885169, "language_loss": 0.87898397, "learning_rate": 3.956661519635756e-06, "loss": 0.9008323, "num_input_tokens_seen": 33700750, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.78515625, "step": 1570, "time_per_iteration": 2.359407424926758 }, { "auxiliary_loss_clip": 0.01112552, "auxiliary_loss_mlp": 0.01069505, "balance_loss_clip": 1.01919878, "balance_loss_mlp": 1.03211808, "epoch": 0.09445362994137983, "flos": 25960960344960.0, "grad_norm": 1.5666366674499692, "language_loss": 0.78847367, "learning_rate": 3.95658084522853e-06, "loss": 0.81029427, "num_input_tokens_seen": 33724430, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.8046875, "step": 1571, "time_per_iteration": 2.4857497215270996 }, { "auxiliary_loss_clip": 0.01108073, "auxiliary_loss_mlp": 0.01080379, "balance_loss_clip": 1.03054905, "balance_loss_mlp": 1.0318594, "epoch": 0.0945137531940478, "flos": 19714440332160.0, "grad_norm": 1.6296477687990505, "language_loss": 0.81438005, "learning_rate": 3.956500096627561e-06, "loss": 0.83626461, "num_input_tokens_seen": 33743455, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.76171875, "step": 1572, "time_per_iteration": 2.4068503379821777 }, { "auxiliary_loss_clip": 0.01108048, "auxiliary_loss_mlp": 0.01071217, "balance_loss_clip": 1.02293742, "balance_loss_mlp": 1.03071332, "epoch": 0.09457387644671576, "flos": 23616089233920.0, "grad_norm": 1.789363533905425, "language_loss": 0.89132404, "learning_rate": 3.956419273835913e-06, "loss": 0.91311669, "num_input_tokens_seen": 33763435, "router_z_loss_clip": 0.48242188, "router_z_loss_mlp": 0.7734375, "step": 1573, "time_per_iteration": 2.4506731033325195 }, { "auxiliary_loss_clip": 0.01113773, "auxiliary_loss_mlp": 0.01090894, "balance_loss_clip": 1.03581882, "balance_loss_mlp": 1.03181887, "epoch": 0.09463399969938374, "flos": 26906059382400.0, "grad_norm": 2.0179844319136917, "language_loss": 0.83331752, "learning_rate": 3.95633837685665e-06, "loss": 0.8553642, "num_input_tokens_seen": 33784325, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.8203125, "step": 1574, "time_per_iteration": 2.470216989517212 }, { "auxiliary_loss_clip": 0.01107549, "auxiliary_loss_mlp": 0.0107776, "balance_loss_clip": 1.02905083, "balance_loss_mlp": 1.03025961, "epoch": 0.0946941229520517, "flos": 23658438579840.0, "grad_norm": 1.71571141830514, "language_loss": 0.8295446, "learning_rate": 3.95625740569284e-06, "loss": 0.85139763, "num_input_tokens_seen": 33802510, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.7734375, "step": 1575, "time_per_iteration": 2.4480130672454834 }, { "auxiliary_loss_clip": 0.01103318, "auxiliary_loss_mlp": 0.0107547, "balance_loss_clip": 1.0231843, "balance_loss_mlp": 1.02626145, "epoch": 0.09475424620471967, "flos": 24132908223360.0, "grad_norm": 1.8923284697200395, "language_loss": 0.8921082, "learning_rate": 3.956176360347553e-06, "loss": 0.91389608, "num_input_tokens_seen": 33819980, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.76953125, "step": 1576, "time_per_iteration": 2.425128221511841 }, { "auxiliary_loss_clip": 0.0102364, "auxiliary_loss_mlp": 0.01008138, "balance_loss_clip": 1.00079513, "balance_loss_mlp": 1.00480807, "epoch": 0.09481436945738765, "flos": 68422815573120.0, "grad_norm": 0.9813990260561815, "language_loss": 0.65956259, "learning_rate": 3.956095240823862e-06, "loss": 0.67988038, "num_input_tokens_seen": 33878925, "router_z_loss_clip": 0.07324219, "router_z_loss_mlp": 0.1875, "step": 1577, "time_per_iteration": 2.9893174171447754 }, { "auxiliary_loss_clip": 0.01107265, "auxiliary_loss_mlp": 0.01075863, "balance_loss_clip": 1.02734518, "balance_loss_mlp": 1.02733207, "epoch": 0.09487449271005562, "flos": 16653150789120.0, "grad_norm": 2.031646104724104, "language_loss": 0.8257004, "learning_rate": 3.956014047124844e-06, "loss": 0.84753168, "num_input_tokens_seen": 33897600, "router_z_loss_clip": 0.48632812, "router_z_loss_mlp": 0.796875, "step": 1578, "time_per_iteration": 2.3976097106933594 }, { "auxiliary_loss_clip": 0.01105996, "auxiliary_loss_mlp": 0.01075616, "balance_loss_clip": 1.02559578, "balance_loss_mlp": 1.02519178, "epoch": 0.09493461596272358, "flos": 24274655809920.0, "grad_norm": 1.6811323280272412, "language_loss": 0.79995799, "learning_rate": 3.955932779253578e-06, "loss": 0.82177413, "num_input_tokens_seen": 33917365, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.80859375, "step": 1579, "time_per_iteration": 2.469198703765869 }, { "auxiliary_loss_clip": 0.01108438, "auxiliary_loss_mlp": 0.01081956, "balance_loss_clip": 1.02709603, "balance_loss_mlp": 1.02677846, "epoch": 0.09499473921539155, "flos": 21869139934080.0, "grad_norm": 1.8988950420069364, "language_loss": 0.75379193, "learning_rate": 3.955851437213144e-06, "loss": 0.77569592, "num_input_tokens_seen": 33936680, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.8203125, "step": 1580, "time_per_iteration": 2.401291847229004 }, { "auxiliary_loss_clip": 0.01104649, "auxiliary_loss_mlp": 0.01077328, "balance_loss_clip": 1.02444661, "balance_loss_mlp": 1.02620959, "epoch": 0.09505486246805953, "flos": 33545736748800.0, "grad_norm": 1.6746147552912378, "language_loss": 0.79085857, "learning_rate": 3.955770021006627e-06, "loss": 0.81267834, "num_input_tokens_seen": 33960685, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.78515625, "step": 1581, "time_per_iteration": 2.5123448371887207 }, { "auxiliary_loss_clip": 0.01105023, "auxiliary_loss_mlp": 0.01078223, "balance_loss_clip": 1.02324378, "balance_loss_mlp": 1.02631831, "epoch": 0.09511498572072749, "flos": 21214273962240.0, "grad_norm": 2.1456370893664896, "language_loss": 0.89219153, "learning_rate": 3.955688530637116e-06, "loss": 0.914024, "num_input_tokens_seen": 33980015, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.78515625, "step": 1582, "time_per_iteration": 2.4266607761383057 }, { "auxiliary_loss_clip": 0.01107021, "auxiliary_loss_mlp": 0.01078792, "balance_loss_clip": 1.02319264, "balance_loss_mlp": 1.02714324, "epoch": 0.09517510897339546, "flos": 14610382606080.0, "grad_norm": 1.900717047310488, "language_loss": 0.69393158, "learning_rate": 3.955606966107699e-06, "loss": 0.71578968, "num_input_tokens_seen": 33997705, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.80078125, "step": 1583, "time_per_iteration": 2.4320664405822754 }, { "auxiliary_loss_clip": 0.0111246, "auxiliary_loss_mlp": 0.01081714, "balance_loss_clip": 1.020679, "balance_loss_mlp": 1.02934122, "epoch": 0.09523523222606343, "flos": 27816140459520.0, "grad_norm": 1.6806659134654878, "language_loss": 0.72713912, "learning_rate": 3.95552532742147e-06, "loss": 0.74908078, "num_input_tokens_seen": 34017465, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.828125, "step": 1584, "time_per_iteration": 2.44708251953125 }, { "auxiliary_loss_clip": 0.01108131, "auxiliary_loss_mlp": 0.01082578, "balance_loss_clip": 1.03010249, "balance_loss_mlp": 1.02794814, "epoch": 0.0952953554787314, "flos": 20705170383360.0, "grad_norm": 1.503810785159523, "language_loss": 0.83231986, "learning_rate": 3.955443614581525e-06, "loss": 0.85422695, "num_input_tokens_seen": 34038550, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.8046875, "step": 1585, "time_per_iteration": 2.465524435043335 }, { "auxiliary_loss_clip": 0.01113662, "auxiliary_loss_mlp": 0.01081864, "balance_loss_clip": 1.02564538, "balance_loss_mlp": 1.02906871, "epoch": 0.09535547873139937, "flos": 24786552297600.0, "grad_norm": 1.7422501134792872, "language_loss": 0.75063449, "learning_rate": 3.955361827590961e-06, "loss": 0.7725898, "num_input_tokens_seen": 34058665, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.84765625, "step": 1586, "time_per_iteration": 2.4342799186706543 }, { "auxiliary_loss_clip": 0.01030788, "auxiliary_loss_mlp": 0.01010919, "balance_loss_clip": 1.00371873, "balance_loss_mlp": 1.0112958, "epoch": 0.09541560198406734, "flos": 71909208230400.0, "grad_norm": 0.8497229837703482, "language_loss": 0.55547488, "learning_rate": 3.955279966452883e-06, "loss": 0.57589197, "num_input_tokens_seen": 34109655, "router_z_loss_clip": 0.07177734, "router_z_loss_mlp": 0.1953125, "step": 1587, "time_per_iteration": 2.825852155685425 }, { "auxiliary_loss_clip": 0.01109955, "auxiliary_loss_mlp": 0.01083323, "balance_loss_clip": 1.02874851, "balance_loss_mlp": 1.02728915, "epoch": 0.09547572523673531, "flos": 28981436641920.0, "grad_norm": 1.7844032546801347, "language_loss": 0.8340835, "learning_rate": 3.955198031170391e-06, "loss": 0.85601628, "num_input_tokens_seen": 34131115, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.828125, "step": 1588, "time_per_iteration": 2.456507921218872 }, { "auxiliary_loss_clip": 0.01103965, "auxiliary_loss_mlp": 0.01073024, "balance_loss_clip": 1.02252698, "balance_loss_mlp": 1.02584124, "epoch": 0.09553584848940327, "flos": 24132768577920.0, "grad_norm": 1.4960330355428664, "language_loss": 0.83697015, "learning_rate": 3.955116021746594e-06, "loss": 0.85874009, "num_input_tokens_seen": 34151925, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.78125, "step": 1589, "time_per_iteration": 2.4288554191589355 }, { "auxiliary_loss_clip": 0.01105287, "auxiliary_loss_mlp": 0.01078504, "balance_loss_clip": 1.02412093, "balance_loss_mlp": 1.02553964, "epoch": 0.09559597174207124, "flos": 42849706055040.0, "grad_norm": 1.750539087268331, "language_loss": 0.67070806, "learning_rate": 3.955033938184601e-06, "loss": 0.69254601, "num_input_tokens_seen": 34175395, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.796875, "step": 1590, "time_per_iteration": 2.5858876705169678 }, { "auxiliary_loss_clip": 0.01104613, "auxiliary_loss_mlp": 0.01086164, "balance_loss_clip": 1.0318048, "balance_loss_mlp": 1.02535272, "epoch": 0.09565609499473922, "flos": 32669486645760.0, "grad_norm": 1.6314640431535699, "language_loss": 0.85342956, "learning_rate": 3.954951780487526e-06, "loss": 0.87533736, "num_input_tokens_seen": 34197760, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.79296875, "step": 1591, "time_per_iteration": 2.4922778606414795 }, { "auxiliary_loss_clip": 0.01108017, "auxiliary_loss_mlp": 0.0109249, "balance_loss_clip": 1.03670049, "balance_loss_mlp": 1.02587175, "epoch": 0.09571621824740718, "flos": 18477432483840.0, "grad_norm": 2.3849426279399784, "language_loss": 0.77764428, "learning_rate": 3.9548695486584835e-06, "loss": 0.79964936, "num_input_tokens_seen": 34215330, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.8203125, "step": 1592, "time_per_iteration": 2.3681893348693848 }, { "auxiliary_loss_clip": 0.01105275, "auxiliary_loss_mlp": 0.01074097, "balance_loss_clip": 1.02305174, "balance_loss_mlp": 1.02530873, "epoch": 0.09577634150007515, "flos": 29386219478400.0, "grad_norm": 2.3232916184256145, "language_loss": 0.76257885, "learning_rate": 3.954787242700592e-06, "loss": 0.78437251, "num_input_tokens_seen": 34237745, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.80078125, "step": 1593, "time_per_iteration": 2.4658350944519043 }, { "auxiliary_loss_clip": 0.01103093, "auxiliary_loss_mlp": 0.01081601, "balance_loss_clip": 1.02705038, "balance_loss_mlp": 1.02460766, "epoch": 0.09583646475274313, "flos": 22746716668800.0, "grad_norm": 2.1606517441671773, "language_loss": 0.72042239, "learning_rate": 3.954704862616971e-06, "loss": 0.74226928, "num_input_tokens_seen": 34256565, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.78515625, "step": 1594, "time_per_iteration": 2.39479923248291 }, { "auxiliary_loss_clip": 0.01107826, "auxiliary_loss_mlp": 0.01081718, "balance_loss_clip": 1.02506983, "balance_loss_mlp": 1.02684975, "epoch": 0.0958965880054111, "flos": 23217346062720.0, "grad_norm": 2.1700569724748457, "language_loss": 0.84671354, "learning_rate": 3.954622408410747e-06, "loss": 0.86860907, "num_input_tokens_seen": 34275970, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.8125, "step": 1595, "time_per_iteration": 2.4211251735687256 }, { "auxiliary_loss_clip": 0.011075, "auxiliary_loss_mlp": 0.01077701, "balance_loss_clip": 1.02360368, "balance_loss_mlp": 1.02667379, "epoch": 0.09595671125807906, "flos": 21323377560960.0, "grad_norm": 2.0185559424295265, "language_loss": 0.87429786, "learning_rate": 3.954539880085045e-06, "loss": 0.89614993, "num_input_tokens_seen": 34295490, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.80859375, "step": 1596, "time_per_iteration": 2.3998987674713135 }, { "auxiliary_loss_clip": 0.01110241, "auxiliary_loss_mlp": 0.01078782, "balance_loss_clip": 1.02299201, "balance_loss_mlp": 1.02714634, "epoch": 0.09601683451074704, "flos": 39601910695680.0, "grad_norm": 1.5286096168479322, "language_loss": 0.70642501, "learning_rate": 3.9544572776429945e-06, "loss": 0.72831523, "num_input_tokens_seen": 34319990, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.83203125, "step": 1597, "time_per_iteration": 4.014523029327393 }, { "auxiliary_loss_clip": 0.01109605, "auxiliary_loss_mlp": 0.01075291, "balance_loss_clip": 1.0210743, "balance_loss_mlp": 1.0256654, "epoch": 0.096076957763415, "flos": 23731581611520.0, "grad_norm": 2.2648826640474438, "language_loss": 0.77086252, "learning_rate": 3.954374601087729e-06, "loss": 0.79271144, "num_input_tokens_seen": 34339225, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.83984375, "step": 1598, "time_per_iteration": 3.853623867034912 }, { "auxiliary_loss_clip": 0.01106219, "auxiliary_loss_mlp": 0.01078902, "balance_loss_clip": 1.02287364, "balance_loss_mlp": 1.02560878, "epoch": 0.09613708101608297, "flos": 34676678286720.0, "grad_norm": 1.666917606142713, "language_loss": 0.71481001, "learning_rate": 3.954291850422382e-06, "loss": 0.73666126, "num_input_tokens_seen": 34361020, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.8046875, "step": 1599, "time_per_iteration": 3.916212797164917 }, { "auxiliary_loss_clip": 0.01109655, "auxiliary_loss_mlp": 0.01073854, "balance_loss_clip": 1.02297568, "balance_loss_mlp": 1.02742863, "epoch": 0.09619720426875093, "flos": 20739001357440.0, "grad_norm": 2.035548287324398, "language_loss": 0.86479074, "learning_rate": 3.954209025650093e-06, "loss": 0.88662589, "num_input_tokens_seen": 34378630, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.8203125, "step": 1600, "time_per_iteration": 2.4057278633117676 }, { "auxiliary_loss_clip": 0.01109067, "auxiliary_loss_mlp": 0.01078084, "balance_loss_clip": 1.02563214, "balance_loss_mlp": 1.02641964, "epoch": 0.09625732752141891, "flos": 13041874598400.0, "grad_norm": 2.10188966121092, "language_loss": 0.83140117, "learning_rate": 3.954126126774001e-06, "loss": 0.85327268, "num_input_tokens_seen": 34397110, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.828125, "step": 1601, "time_per_iteration": 3.9549529552459717 }, { "auxiliary_loss_clip": 0.01110201, "auxiliary_loss_mlp": 0.01083674, "balance_loss_clip": 1.02645397, "balance_loss_mlp": 1.02718687, "epoch": 0.09631745077408688, "flos": 22272526316160.0, "grad_norm": 2.172260091146376, "language_loss": 0.84739375, "learning_rate": 3.954043153797251e-06, "loss": 0.86933249, "num_input_tokens_seen": 34414165, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.83203125, "step": 1602, "time_per_iteration": 2.4098377227783203 }, { "auxiliary_loss_clip": 0.01107633, "auxiliary_loss_mlp": 0.01072204, "balance_loss_clip": 1.01848888, "balance_loss_mlp": 1.02790439, "epoch": 0.09637757402675484, "flos": 24753105348480.0, "grad_norm": 2.1743481600705854, "language_loss": 0.65381384, "learning_rate": 3.953960106722989e-06, "loss": 0.67561227, "num_input_tokens_seen": 34434445, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.796875, "step": 1603, "time_per_iteration": 2.4325709342956543 }, { "auxiliary_loss_clip": 0.01111576, "auxiliary_loss_mlp": 0.01078123, "balance_loss_clip": 1.02066398, "balance_loss_mlp": 1.02754831, "epoch": 0.09643769727942282, "flos": 22524739044480.0, "grad_norm": 2.3389055380757733, "language_loss": 0.74557132, "learning_rate": 3.953876985554364e-06, "loss": 0.76746833, "num_input_tokens_seen": 34453095, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.83984375, "step": 1604, "time_per_iteration": 2.4298739433288574 }, { "auxiliary_loss_clip": 0.01106201, "auxiliary_loss_mlp": 0.01076743, "balance_loss_clip": 1.02348018, "balance_loss_mlp": 1.02613711, "epoch": 0.09649782053209079, "flos": 30919674614400.0, "grad_norm": 1.9121618101168492, "language_loss": 0.81045002, "learning_rate": 3.953793790294527e-06, "loss": 0.83227944, "num_input_tokens_seen": 34473680, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.80078125, "step": 1605, "time_per_iteration": 2.4829559326171875 }, { "auxiliary_loss_clip": 0.01109094, "auxiliary_loss_mlp": 0.01077648, "balance_loss_clip": 1.01999855, "balance_loss_mlp": 1.0262568, "epoch": 0.09655794378475875, "flos": 25336469122560.0, "grad_norm": 1.8284130742423268, "language_loss": 0.77764148, "learning_rate": 3.953710520946634e-06, "loss": 0.79950893, "num_input_tokens_seen": 34492610, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.828125, "step": 1606, "time_per_iteration": 2.4197540283203125 }, { "auxiliary_loss_clip": 0.01109385, "auxiliary_loss_mlp": 0.01069583, "balance_loss_clip": 1.01622534, "balance_loss_mlp": 1.02695251, "epoch": 0.09661806703742673, "flos": 22344971120640.0, "grad_norm": 2.232074573932912, "language_loss": 0.77761221, "learning_rate": 3.953627177513843e-06, "loss": 0.79940194, "num_input_tokens_seen": 34511855, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 0.828125, "step": 1607, "time_per_iteration": 2.4170305728912354 }, { "auxiliary_loss_clip": 0.01108011, "auxiliary_loss_mlp": 0.01075646, "balance_loss_clip": 1.02011871, "balance_loss_mlp": 1.02582693, "epoch": 0.0966781902900947, "flos": 17456606974080.0, "grad_norm": 1.7884519847317093, "language_loss": 0.89080453, "learning_rate": 3.953543759999312e-06, "loss": 0.91264111, "num_input_tokens_seen": 34528905, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.8203125, "step": 1608, "time_per_iteration": 2.357280731201172 }, { "auxiliary_loss_clip": 0.01113112, "auxiliary_loss_mlp": 0.01093932, "balance_loss_clip": 1.03337407, "balance_loss_mlp": 1.0284512, "epoch": 0.09673831354276266, "flos": 36902496061440.0, "grad_norm": 2.23730631740532, "language_loss": 0.74107963, "learning_rate": 3.953460268406207e-06, "loss": 0.7631501, "num_input_tokens_seen": 34548480, "router_z_loss_clip": 0.60546875, "router_z_loss_mlp": 0.84765625, "step": 1609, "time_per_iteration": 2.530562400817871 }, { "auxiliary_loss_clip": 0.01107853, "auxiliary_loss_mlp": 0.01082921, "balance_loss_clip": 1.02398443, "balance_loss_mlp": 1.0248208, "epoch": 0.09679843679543064, "flos": 20700422438400.0, "grad_norm": 2.6421030902365703, "language_loss": 0.88088429, "learning_rate": 3.953376702737693e-06, "loss": 0.9027921, "num_input_tokens_seen": 34565410, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 0.828125, "step": 1610, "time_per_iteration": 2.383683204650879 }, { "auxiliary_loss_clip": 0.01106756, "auxiliary_loss_mlp": 0.01077454, "balance_loss_clip": 1.02199757, "balance_loss_mlp": 1.02533531, "epoch": 0.0968585600480986, "flos": 23513269680000.0, "grad_norm": 2.0644212487621867, "language_loss": 0.70081913, "learning_rate": 3.953293062996939e-06, "loss": 0.72266126, "num_input_tokens_seen": 34584840, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.81640625, "step": 1611, "time_per_iteration": 2.421774387359619 }, { "auxiliary_loss_clip": 0.01106464, "auxiliary_loss_mlp": 0.01078324, "balance_loss_clip": 1.02353513, "balance_loss_mlp": 1.02658069, "epoch": 0.09691868330076657, "flos": 20120026129920.0, "grad_norm": 1.703965667846029, "language_loss": 0.83754063, "learning_rate": 3.953209349187115e-06, "loss": 0.85938853, "num_input_tokens_seen": 34603360, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.80078125, "step": 1612, "time_per_iteration": 2.428283452987671 }, { "auxiliary_loss_clip": 0.01109063, "auxiliary_loss_mlp": 0.01080526, "balance_loss_clip": 1.02161276, "balance_loss_mlp": 1.02682436, "epoch": 0.09697880655343454, "flos": 16543767899520.0, "grad_norm": 3.3711158908387078, "language_loss": 0.83217096, "learning_rate": 3.953125561311398e-06, "loss": 0.85406685, "num_input_tokens_seen": 34620760, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 0.82421875, "step": 1613, "time_per_iteration": 2.4338459968566895 }, { "auxiliary_loss_clip": 0.01108099, "auxiliary_loss_mlp": 0.01082104, "balance_loss_clip": 1.02371502, "balance_loss_mlp": 1.02677321, "epoch": 0.09703892980610251, "flos": 26102987222400.0, "grad_norm": 1.8159604347520433, "language_loss": 0.86943018, "learning_rate": 3.953041699372964e-06, "loss": 0.89133227, "num_input_tokens_seen": 34640695, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.8125, "step": 1614, "time_per_iteration": 2.4420080184936523 }, { "auxiliary_loss_clip": 0.01027149, "auxiliary_loss_mlp": 0.01009179, "balance_loss_clip": 1.00097728, "balance_loss_mlp": 1.00765395, "epoch": 0.09709905305877048, "flos": 60440273516160.0, "grad_norm": 0.7114388910725121, "language_loss": 0.54640126, "learning_rate": 3.952957763374992e-06, "loss": 0.56676459, "num_input_tokens_seen": 34702395, "router_z_loss_clip": 0.08203125, "router_z_loss_mlp": 0.1953125, "step": 1615, "time_per_iteration": 3.001404047012329 }, { "auxiliary_loss_clip": 0.01024632, "auxiliary_loss_mlp": 0.01008046, "balance_loss_clip": 0.99993986, "balance_loss_mlp": 1.00553751, "epoch": 0.09715917631143844, "flos": 57636503228160.0, "grad_norm": 0.7627212241250961, "language_loss": 0.58397746, "learning_rate": 3.952873753320666e-06, "loss": 0.60430431, "num_input_tokens_seen": 34768910, "router_z_loss_clip": 0.08105469, "router_z_loss_mlp": 0.19140625, "step": 1616, "time_per_iteration": 3.171184778213501 }, { "auxiliary_loss_clip": 0.01108775, "auxiliary_loss_mlp": 0.01082695, "balance_loss_clip": 1.02542663, "balance_loss_mlp": 1.02615321, "epoch": 0.09721929956410642, "flos": 20557173663360.0, "grad_norm": 1.7329379376361493, "language_loss": 0.71487916, "learning_rate": 3.952789669213172e-06, "loss": 0.73679388, "num_input_tokens_seen": 34787680, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.828125, "step": 1617, "time_per_iteration": 2.410926580429077 }, { "auxiliary_loss_clip": 0.01106126, "auxiliary_loss_mlp": 0.01085642, "balance_loss_clip": 1.02985239, "balance_loss_mlp": 1.02493536, "epoch": 0.09727942281677439, "flos": 27343137093120.0, "grad_norm": 1.6878754135616512, "language_loss": 0.82465041, "learning_rate": 3.952705511055698e-06, "loss": 0.84656811, "num_input_tokens_seen": 34808330, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.8125, "step": 1618, "time_per_iteration": 2.445171594619751 }, { "auxiliary_loss_clip": 0.0110414, "auxiliary_loss_mlp": 0.01069679, "balance_loss_clip": 1.02070832, "balance_loss_mlp": 1.02560842, "epoch": 0.09733954606944235, "flos": 24898867741440.0, "grad_norm": 1.599561037233907, "language_loss": 0.94515753, "learning_rate": 3.952621278851435e-06, "loss": 0.9668957, "num_input_tokens_seen": 34830020, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 0.78515625, "step": 1619, "time_per_iteration": 2.4506914615631104 }, { "auxiliary_loss_clip": 0.01102778, "auxiliary_loss_mlp": 0.01076802, "balance_loss_clip": 1.01898563, "balance_loss_mlp": 1.02645719, "epoch": 0.09739966932211033, "flos": 31502584540800.0, "grad_norm": 1.7504334102373715, "language_loss": 0.90421933, "learning_rate": 3.9525369726035784e-06, "loss": 0.92601514, "num_input_tokens_seen": 34850330, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.765625, "step": 1620, "time_per_iteration": 2.5068349838256836 }, { "auxiliary_loss_clip": 0.01105957, "auxiliary_loss_mlp": 0.01080507, "balance_loss_clip": 1.02829373, "balance_loss_mlp": 1.02688682, "epoch": 0.0974597925747783, "flos": 23877623295360.0, "grad_norm": 3.0041560534363776, "language_loss": 0.79666996, "learning_rate": 3.952452592315324e-06, "loss": 0.81853455, "num_input_tokens_seen": 34871640, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.7890625, "step": 1621, "time_per_iteration": 2.4402284622192383 }, { "auxiliary_loss_clip": 0.01105235, "auxiliary_loss_mlp": 0.01090211, "balance_loss_clip": 1.03592336, "balance_loss_mlp": 1.02590334, "epoch": 0.09751991582744626, "flos": 17018621568000.0, "grad_norm": 1.985980220000075, "language_loss": 0.78992647, "learning_rate": 3.952368137989871e-06, "loss": 0.81188095, "num_input_tokens_seen": 34888100, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.79296875, "step": 1622, "time_per_iteration": 2.370020866394043 }, { "auxiliary_loss_clip": 0.01107671, "auxiliary_loss_mlp": 0.01080646, "balance_loss_clip": 1.02569056, "balance_loss_mlp": 1.02691627, "epoch": 0.09758003908011423, "flos": 28401564003840.0, "grad_norm": 3.103647140341387, "language_loss": 0.86719602, "learning_rate": 3.9522836096304225e-06, "loss": 0.88907921, "num_input_tokens_seen": 34910485, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.80859375, "step": 1623, "time_per_iteration": 2.490595817565918 }, { "auxiliary_loss_clip": 0.0110253, "auxiliary_loss_mlp": 0.0108473, "balance_loss_clip": 1.03275466, "balance_loss_mlp": 1.02556264, "epoch": 0.09764016233278221, "flos": 18143488529280.0, "grad_norm": 1.9044894173336737, "language_loss": 0.82347345, "learning_rate": 3.952199007240184e-06, "loss": 0.84534609, "num_input_tokens_seen": 34928615, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.76953125, "step": 1624, "time_per_iteration": 2.4219815731048584 }, { "auxiliary_loss_clip": 0.0110418, "auxiliary_loss_mlp": 0.01069204, "balance_loss_clip": 1.01939845, "balance_loss_mlp": 1.02603126, "epoch": 0.09770028558545017, "flos": 15265004198400.0, "grad_norm": 2.104520799773898, "language_loss": 0.88625598, "learning_rate": 3.952114330822364e-06, "loss": 0.90798992, "num_input_tokens_seen": 34946045, "router_z_loss_clip": 0.49804688, "router_z_loss_mlp": 0.78125, "step": 1625, "time_per_iteration": 2.4146273136138916 }, { "auxiliary_loss_clip": 0.0110774, "auxiliary_loss_mlp": 0.01070435, "balance_loss_clip": 1.01936615, "balance_loss_mlp": 1.0270443, "epoch": 0.09776040883811814, "flos": 23471444004480.0, "grad_norm": 2.7032420062029074, "language_loss": 0.87488401, "learning_rate": 3.952029580380172e-06, "loss": 0.89666575, "num_input_tokens_seen": 34962865, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.8046875, "step": 1626, "time_per_iteration": 2.4074630737304688 }, { "auxiliary_loss_clip": 0.01109466, "auxiliary_loss_mlp": 0.01074659, "balance_loss_clip": 1.01779664, "balance_loss_mlp": 1.02637446, "epoch": 0.09782053209078612, "flos": 24498309179520.0, "grad_norm": 1.8556351402213684, "language_loss": 0.84202832, "learning_rate": 3.9519447559168234e-06, "loss": 0.86386955, "num_input_tokens_seen": 34983505, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.828125, "step": 1627, "time_per_iteration": 2.432111978530884 }, { "auxiliary_loss_clip": 0.01105382, "auxiliary_loss_mlp": 0.01070005, "balance_loss_clip": 1.02131987, "balance_loss_mlp": 1.02688503, "epoch": 0.09788065534345408, "flos": 21579081425280.0, "grad_norm": 1.6867048831332374, "language_loss": 0.86580735, "learning_rate": 3.951859857435534e-06, "loss": 0.88756126, "num_input_tokens_seen": 35001825, "router_z_loss_clip": 0.48632812, "router_z_loss_mlp": 0.78515625, "step": 1628, "time_per_iteration": 2.392963171005249 }, { "auxiliary_loss_clip": 0.01105305, "auxiliary_loss_mlp": 0.01075369, "balance_loss_clip": 1.02415705, "balance_loss_mlp": 1.02563691, "epoch": 0.09794077859612205, "flos": 23841313614720.0, "grad_norm": 1.5817848201246871, "language_loss": 0.77773905, "learning_rate": 3.951774884939523e-06, "loss": 0.79954582, "num_input_tokens_seen": 35023075, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.796875, "step": 1629, "time_per_iteration": 2.4805216789245605 }, { "auxiliary_loss_clip": 0.01105192, "auxiliary_loss_mlp": 0.01078042, "balance_loss_clip": 1.02353954, "balance_loss_mlp": 1.02616382, "epoch": 0.09800090184879003, "flos": 23658752782080.0, "grad_norm": 1.6901381584867556, "language_loss": 0.80345124, "learning_rate": 3.951689838432013e-06, "loss": 0.82528359, "num_input_tokens_seen": 35043480, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.7890625, "step": 1630, "time_per_iteration": 2.4625566005706787 }, { "auxiliary_loss_clip": 0.01103643, "auxiliary_loss_mlp": 0.01079806, "balance_loss_clip": 1.02651906, "balance_loss_mlp": 1.02607346, "epoch": 0.09806102510145799, "flos": 17054826514560.0, "grad_norm": 2.101403614883873, "language_loss": 0.89114201, "learning_rate": 3.951604717916228e-06, "loss": 0.9129765, "num_input_tokens_seen": 35061490, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 0.7734375, "step": 1631, "time_per_iteration": 2.384974241256714 }, { "auxiliary_loss_clip": 0.01103291, "auxiliary_loss_mlp": 0.01071925, "balance_loss_clip": 1.02049804, "balance_loss_mlp": 1.02565217, "epoch": 0.09812114835412596, "flos": 23877344004480.0, "grad_norm": 2.2983287885926877, "language_loss": 0.84923869, "learning_rate": 3.9515195233953975e-06, "loss": 0.87099087, "num_input_tokens_seen": 35079670, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.7734375, "step": 1632, "time_per_iteration": 2.4236254692077637 }, { "auxiliary_loss_clip": 0.01105379, "auxiliary_loss_mlp": 0.01090306, "balance_loss_clip": 1.03570819, "balance_loss_mlp": 1.0269289, "epoch": 0.09818127160679392, "flos": 20594425950720.0, "grad_norm": 1.8463381223431157, "language_loss": 0.80762249, "learning_rate": 3.951434254872751e-06, "loss": 0.82957935, "num_input_tokens_seen": 35099205, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.78515625, "step": 1633, "time_per_iteration": 2.4258577823638916 }, { "auxiliary_loss_clip": 0.01103628, "auxiliary_loss_mlp": 0.01072191, "balance_loss_clip": 1.02124059, "balance_loss_mlp": 1.02655423, "epoch": 0.0982413948594619, "flos": 15486423240960.0, "grad_norm": 2.860326656266267, "language_loss": 0.75749266, "learning_rate": 3.951348912351521e-06, "loss": 0.77925086, "num_input_tokens_seen": 35115270, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.7734375, "step": 1634, "time_per_iteration": 2.3652942180633545 }, { "auxiliary_loss_clip": 0.01108616, "auxiliary_loss_mlp": 0.01084466, "balance_loss_clip": 1.03067935, "balance_loss_mlp": 1.02506614, "epoch": 0.09830151811212987, "flos": 24206784393600.0, "grad_norm": 2.5944991562321227, "language_loss": 0.76422465, "learning_rate": 3.951263495834947e-06, "loss": 0.78615546, "num_input_tokens_seen": 35134065, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.8359375, "step": 1635, "time_per_iteration": 2.4179203510284424 }, { "auxiliary_loss_clip": 0.01107572, "auxiliary_loss_mlp": 0.01081586, "balance_loss_clip": 1.02858508, "balance_loss_mlp": 1.0257231, "epoch": 0.09836164136479783, "flos": 20593553166720.0, "grad_norm": 2.03612435648842, "language_loss": 0.79732376, "learning_rate": 3.951178005326264e-06, "loss": 0.8192153, "num_input_tokens_seen": 35154870, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.8203125, "step": 1636, "time_per_iteration": 2.463616132736206 }, { "auxiliary_loss_clip": 0.01104361, "auxiliary_loss_mlp": 0.01069919, "balance_loss_clip": 1.02051902, "balance_loss_mlp": 1.02598381, "epoch": 0.09842176461746581, "flos": 19933241022720.0, "grad_norm": 1.9874216757811056, "language_loss": 0.72334164, "learning_rate": 3.951092440828715e-06, "loss": 0.7450844, "num_input_tokens_seen": 35171850, "router_z_loss_clip": 0.49414062, "router_z_loss_mlp": 0.78515625, "step": 1637, "time_per_iteration": 3.864670753479004 }, { "auxiliary_loss_clip": 0.01103426, "auxiliary_loss_mlp": 0.01077499, "balance_loss_clip": 1.02559495, "balance_loss_mlp": 1.02377057, "epoch": 0.09848188787013377, "flos": 21213610646400.0, "grad_norm": 2.1222354235221528, "language_loss": 0.79913712, "learning_rate": 3.951006802345545e-06, "loss": 0.8209464, "num_input_tokens_seen": 35188795, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.796875, "step": 1638, "time_per_iteration": 3.946472406387329 }, { "auxiliary_loss_clip": 0.01098065, "auxiliary_loss_mlp": 0.01070684, "balance_loss_clip": 1.02311969, "balance_loss_mlp": 1.02258277, "epoch": 0.09854201112280174, "flos": 30152912135040.0, "grad_norm": 1.4919503165803094, "language_loss": 0.74175179, "learning_rate": 3.950921089880003e-06, "loss": 0.76343936, "num_input_tokens_seen": 35212100, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.75390625, "step": 1639, "time_per_iteration": 3.848884105682373 }, { "auxiliary_loss_clip": 0.01102439, "auxiliary_loss_mlp": 0.01062317, "balance_loss_clip": 1.01513398, "balance_loss_mlp": 1.02497363, "epoch": 0.09860213437546972, "flos": 21794740093440.0, "grad_norm": 2.290369854834682, "language_loss": 0.90147913, "learning_rate": 3.950835303435337e-06, "loss": 0.9231267, "num_input_tokens_seen": 35230390, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.7734375, "step": 1640, "time_per_iteration": 2.4076266288757324 }, { "auxiliary_loss_clip": 0.01105975, "auxiliary_loss_mlp": 0.01064786, "balance_loss_clip": 1.01819932, "balance_loss_mlp": 1.02790391, "epoch": 0.09866225762813768, "flos": 21834471087360.0, "grad_norm": 1.7458191478006757, "language_loss": 0.84006023, "learning_rate": 3.950749443014801e-06, "loss": 0.86176777, "num_input_tokens_seen": 35250405, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 0.78125, "step": 1641, "time_per_iteration": 3.8483307361602783 }, { "auxiliary_loss_clip": 0.01103361, "auxiliary_loss_mlp": 0.01070837, "balance_loss_clip": 1.02370191, "balance_loss_mlp": 1.02643633, "epoch": 0.09872238088080565, "flos": 17598982965120.0, "grad_norm": 2.80792561788103, "language_loss": 0.88879156, "learning_rate": 3.95066350862165e-06, "loss": 0.91053355, "num_input_tokens_seen": 35262820, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 0.76953125, "step": 1642, "time_per_iteration": 2.3969016075134277 }, { "auxiliary_loss_clip": 0.01103133, "auxiliary_loss_mlp": 0.01071077, "balance_loss_clip": 1.02332211, "balance_loss_mlp": 1.02518606, "epoch": 0.09878250413347361, "flos": 27634906258560.0, "grad_norm": 1.7195390827569323, "language_loss": 0.82884479, "learning_rate": 3.950577500259144e-06, "loss": 0.85058689, "num_input_tokens_seen": 35284490, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.78125, "step": 1643, "time_per_iteration": 2.4703660011291504 }, { "auxiliary_loss_clip": 0.01102265, "auxiliary_loss_mlp": 0.01071512, "balance_loss_clip": 1.02156353, "balance_loss_mlp": 1.02477062, "epoch": 0.0988426273861416, "flos": 16543802810880.0, "grad_norm": 1.7163951577795258, "language_loss": 0.85234201, "learning_rate": 3.950491417930543e-06, "loss": 0.87407976, "num_input_tokens_seen": 35302815, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.7734375, "step": 1644, "time_per_iteration": 2.3618063926696777 }, { "auxiliary_loss_clip": 0.01099057, "auxiliary_loss_mlp": 0.01064712, "balance_loss_clip": 1.01807773, "balance_loss_mlp": 1.02407956, "epoch": 0.09890275063880956, "flos": 21214204139520.0, "grad_norm": 1.6231948390854696, "language_loss": 0.70650393, "learning_rate": 3.9504052616391124e-06, "loss": 0.72814161, "num_input_tokens_seen": 35321175, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 0.75, "step": 1645, "time_per_iteration": 2.398422956466675 }, { "auxiliary_loss_clip": 0.01029536, "auxiliary_loss_mlp": 0.01014002, "balance_loss_clip": 1.00618172, "balance_loss_mlp": 1.01121688, "epoch": 0.09896287389147752, "flos": 59376225876480.0, "grad_norm": 0.8510179972440951, "language_loss": 0.61036646, "learning_rate": 3.950319031388119e-06, "loss": 0.6308018, "num_input_tokens_seen": 35381740, "router_z_loss_clip": 0.078125, "router_z_loss_mlp": 0.18359375, "step": 1646, "time_per_iteration": 2.9981093406677246 }, { "auxiliary_loss_clip": 0.01100304, "auxiliary_loss_mlp": 0.01069714, "balance_loss_clip": 1.01928806, "balance_loss_mlp": 1.02481687, "epoch": 0.0990229971441455, "flos": 29641399672320.0, "grad_norm": 1.8296214726173763, "language_loss": 0.74999666, "learning_rate": 3.950232727180833e-06, "loss": 0.77169681, "num_input_tokens_seen": 35403760, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.7578125, "step": 1647, "time_per_iteration": 2.463477373123169 }, { "auxiliary_loss_clip": 0.01103714, "auxiliary_loss_mlp": 0.01071137, "balance_loss_clip": 1.01966262, "balance_loss_mlp": 1.02644491, "epoch": 0.09908312039681347, "flos": 21833807771520.0, "grad_norm": 1.9046592328881813, "language_loss": 0.8663798, "learning_rate": 3.950146349020525e-06, "loss": 0.88812834, "num_input_tokens_seen": 35424050, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.7734375, "step": 1648, "time_per_iteration": 2.4102609157562256 }, { "auxiliary_loss_clip": 0.01025869, "auxiliary_loss_mlp": 0.01010326, "balance_loss_clip": 1.00207663, "balance_loss_mlp": 1.00691056, "epoch": 0.09914324364948143, "flos": 57560951312640.0, "grad_norm": 0.7325394945353588, "language_loss": 0.5575493, "learning_rate": 3.950059896910473e-06, "loss": 0.57791126, "num_input_tokens_seen": 35481690, "router_z_loss_clip": 0.08251953, "router_z_loss_mlp": 0.18945312, "step": 1649, "time_per_iteration": 2.9665117263793945 }, { "auxiliary_loss_clip": 0.01098228, "auxiliary_loss_mlp": 0.0106882, "balance_loss_clip": 1.01975393, "balance_loss_mlp": 1.02328897, "epoch": 0.09920336690214941, "flos": 34122711744000.0, "grad_norm": 2.1235813948399356, "language_loss": 0.93026525, "learning_rate": 3.949973370853954e-06, "loss": 0.95193571, "num_input_tokens_seen": 35498635, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 0.75, "step": 1650, "time_per_iteration": 2.4819529056549072 }, { "auxiliary_loss_clip": 0.01024179, "auxiliary_loss_mlp": 0.01014586, "balance_loss_clip": 1.00614572, "balance_loss_mlp": 1.00518847, "epoch": 0.09926349015481738, "flos": 71212514716800.0, "grad_norm": 0.8072824449091115, "language_loss": 0.63901246, "learning_rate": 3.94988677085425e-06, "loss": 0.65940011, "num_input_tokens_seen": 35565720, "router_z_loss_clip": 0.08447266, "router_z_loss_mlp": 0.18945312, "step": 1651, "time_per_iteration": 3.217590808868408 }, { "auxiliary_loss_clip": 0.01100447, "auxiliary_loss_mlp": 0.01075628, "balance_loss_clip": 1.02467823, "balance_loss_mlp": 1.0248729, "epoch": 0.09932361340748534, "flos": 23147589432960.0, "grad_norm": 1.6063869520628244, "language_loss": 0.89663959, "learning_rate": 3.949800096914643e-06, "loss": 0.91840041, "num_input_tokens_seen": 35586000, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.7578125, "step": 1652, "time_per_iteration": 2.4003946781158447 }, { "auxiliary_loss_clip": 0.0110586, "auxiliary_loss_mlp": 0.01077704, "balance_loss_clip": 1.02565694, "balance_loss_mlp": 1.0269829, "epoch": 0.09938373666015332, "flos": 19827628560000.0, "grad_norm": 1.8222551001494267, "language_loss": 0.83790624, "learning_rate": 3.949713349038422e-06, "loss": 0.85974193, "num_input_tokens_seen": 35604355, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.7890625, "step": 1653, "time_per_iteration": 2.4091849327087402 }, { "auxiliary_loss_clip": 0.01108187, "auxiliary_loss_mlp": 0.01083704, "balance_loss_clip": 1.02736545, "balance_loss_mlp": 1.02973485, "epoch": 0.09944385991282129, "flos": 22089581458560.0, "grad_norm": 1.833242030229131, "language_loss": 0.81321424, "learning_rate": 3.949626527228875e-06, "loss": 0.83513319, "num_input_tokens_seen": 35625495, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.78515625, "step": 1654, "time_per_iteration": 2.5021731853485107 }, { "auxiliary_loss_clip": 0.01102695, "auxiliary_loss_mlp": 0.0107381, "balance_loss_clip": 1.02581692, "balance_loss_mlp": 1.02767527, "epoch": 0.09950398316548925, "flos": 19827838028160.0, "grad_norm": 1.566558572440024, "language_loss": 0.83031702, "learning_rate": 3.949539631489295e-06, "loss": 0.85208207, "num_input_tokens_seen": 35645030, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.75, "step": 1655, "time_per_iteration": 2.412677526473999 }, { "auxiliary_loss_clip": 0.01104579, "auxiliary_loss_mlp": 0.01072711, "balance_loss_clip": 1.01930523, "balance_loss_mlp": 1.02801538, "epoch": 0.09956410641815722, "flos": 25002699724800.0, "grad_norm": 1.7761133133741258, "language_loss": 0.83285224, "learning_rate": 3.9494526618229765e-06, "loss": 0.85462517, "num_input_tokens_seen": 35664305, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 0.765625, "step": 1656, "time_per_iteration": 2.4364984035491943 }, { "auxiliary_loss_clip": 0.01107359, "auxiliary_loss_mlp": 0.01077246, "balance_loss_clip": 1.02770281, "balance_loss_mlp": 1.0314548, "epoch": 0.0996242296708252, "flos": 19316709590400.0, "grad_norm": 1.573705895283084, "language_loss": 0.90835059, "learning_rate": 3.949365618233217e-06, "loss": 0.9301967, "num_input_tokens_seen": 35684060, "router_z_loss_clip": 0.49414062, "router_z_loss_mlp": 0.7578125, "step": 1657, "time_per_iteration": 2.4293630123138428 }, { "auxiliary_loss_clip": 0.0111191, "auxiliary_loss_mlp": 0.01082596, "balance_loss_clip": 1.02358711, "balance_loss_mlp": 1.02992654, "epoch": 0.09968435292349316, "flos": 21870536388480.0, "grad_norm": 1.9942871250625316, "language_loss": 0.86624467, "learning_rate": 3.9492785007233195e-06, "loss": 0.88818973, "num_input_tokens_seen": 35703250, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 0.8203125, "step": 1658, "time_per_iteration": 2.4032602310180664 }, { "auxiliary_loss_clip": 0.01023088, "auxiliary_loss_mlp": 0.01012921, "balance_loss_clip": 1.00605428, "balance_loss_mlp": 1.0052948, "epoch": 0.09974447617616113, "flos": 65381636839680.0, "grad_norm": 0.9035744283793647, "language_loss": 0.61019313, "learning_rate": 3.949191309296585e-06, "loss": 0.63055325, "num_input_tokens_seen": 35762165, "router_z_loss_clip": 0.06884766, "router_z_loss_mlp": 0.17773438, "step": 1659, "time_per_iteration": 3.042620897293091 }, { "auxiliary_loss_clip": 0.01105649, "auxiliary_loss_mlp": 0.01080048, "balance_loss_clip": 1.0253787, "balance_loss_mlp": 1.02740419, "epoch": 0.0998045994288291, "flos": 23658682959360.0, "grad_norm": 1.7876464630642381, "language_loss": 0.8787992, "learning_rate": 3.949104043956321e-06, "loss": 0.9006561, "num_input_tokens_seen": 35781520, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.78515625, "step": 1660, "time_per_iteration": 2.43302059173584 }, { "auxiliary_loss_clip": 0.01104995, "auxiliary_loss_mlp": 0.01076383, "balance_loss_clip": 1.02590942, "balance_loss_mlp": 1.02911949, "epoch": 0.09986472268149707, "flos": 19608688224000.0, "grad_norm": 2.043085972937327, "language_loss": 0.8189894, "learning_rate": 3.949016704705836e-06, "loss": 0.84080315, "num_input_tokens_seen": 35799565, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.7578125, "step": 1661, "time_per_iteration": 2.4227871894836426 }, { "auxiliary_loss_clip": 0.01109526, "auxiliary_loss_mlp": 0.01080195, "balance_loss_clip": 1.02416635, "balance_loss_mlp": 1.02767491, "epoch": 0.09992484593416504, "flos": 26212125732480.0, "grad_norm": 1.7513440960401827, "language_loss": 0.85802221, "learning_rate": 3.948929291548443e-06, "loss": 0.87991941, "num_input_tokens_seen": 35821085, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.8203125, "step": 1662, "time_per_iteration": 2.4701945781707764 }, { "auxiliary_loss_clip": 0.01108106, "auxiliary_loss_mlp": 0.01076639, "balance_loss_clip": 1.02354312, "balance_loss_mlp": 1.02962053, "epoch": 0.09998496918683301, "flos": 17492672275200.0, "grad_norm": 1.9401008221420013, "language_loss": 0.91297233, "learning_rate": 3.9488418044874546e-06, "loss": 0.9348197, "num_input_tokens_seen": 35839840, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.78515625, "step": 1663, "time_per_iteration": 2.4012911319732666 }, { "auxiliary_loss_clip": 0.01105768, "auxiliary_loss_mlp": 0.0108855, "balance_loss_clip": 1.03268862, "balance_loss_mlp": 1.02755368, "epoch": 0.10004509243950098, "flos": 22783794399360.0, "grad_norm": 1.6317062477092048, "language_loss": 0.72309226, "learning_rate": 3.948754243526191e-06, "loss": 0.74503541, "num_input_tokens_seen": 35861545, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.78125, "step": 1664, "time_per_iteration": 2.441054344177246 }, { "auxiliary_loss_clip": 0.01107504, "auxiliary_loss_mlp": 0.01076544, "balance_loss_clip": 1.02273309, "balance_loss_mlp": 1.0297699, "epoch": 0.10010521569216894, "flos": 16252452581760.0, "grad_norm": 1.9524252939564413, "language_loss": 0.823089, "learning_rate": 3.94866660866797e-06, "loss": 0.84492946, "num_input_tokens_seen": 35878295, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.77734375, "step": 1665, "time_per_iteration": 2.3911240100860596 }, { "auxiliary_loss_clip": 0.01106564, "auxiliary_loss_mlp": 0.01074543, "balance_loss_clip": 1.02018332, "balance_loss_mlp": 1.02831328, "epoch": 0.10016533894483691, "flos": 23401512817920.0, "grad_norm": 1.6224831224515477, "language_loss": 0.71278971, "learning_rate": 3.9485788999161165e-06, "loss": 0.73460078, "num_input_tokens_seen": 35898990, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.78125, "step": 1666, "time_per_iteration": 2.420421838760376 }, { "auxiliary_loss_clip": 0.01108363, "auxiliary_loss_mlp": 0.01082547, "balance_loss_clip": 1.02804422, "balance_loss_mlp": 1.03002882, "epoch": 0.10022546219750489, "flos": 19353158916480.0, "grad_norm": 2.0743367313092698, "language_loss": 0.8221736, "learning_rate": 3.948491117273956e-06, "loss": 0.84408271, "num_input_tokens_seen": 35916225, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.78125, "step": 1667, "time_per_iteration": 2.406050443649292 }, { "auxiliary_loss_clip": 0.01105174, "auxiliary_loss_mlp": 0.01079178, "balance_loss_clip": 1.02202892, "balance_loss_mlp": 1.02810073, "epoch": 0.10028558545017285, "flos": 27084640320000.0, "grad_norm": 2.962510225553142, "language_loss": 0.81216514, "learning_rate": 3.948403260744817e-06, "loss": 0.83400869, "num_input_tokens_seen": 35934630, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.76953125, "step": 1668, "time_per_iteration": 2.440906524658203 }, { "auxiliary_loss_clip": 0.01105594, "auxiliary_loss_mlp": 0.01070959, "balance_loss_clip": 1.01667142, "balance_loss_mlp": 1.0277009, "epoch": 0.10034570870284082, "flos": 25845991637760.0, "grad_norm": 1.7095814976337, "language_loss": 0.80399078, "learning_rate": 3.948315330332031e-06, "loss": 0.82575631, "num_input_tokens_seen": 35953855, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.77734375, "step": 1669, "time_per_iteration": 2.447714328765869 }, { "auxiliary_loss_clip": 0.01111113, "auxiliary_loss_mlp": 0.01082516, "balance_loss_clip": 1.02701187, "balance_loss_mlp": 1.03042197, "epoch": 0.1004058319555088, "flos": 26248400501760.0, "grad_norm": 2.1808067693062987, "language_loss": 0.87943649, "learning_rate": 3.948227326038933e-06, "loss": 0.90137285, "num_input_tokens_seen": 35974555, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.8046875, "step": 1670, "time_per_iteration": 2.4442763328552246 }, { "auxiliary_loss_clip": 0.01099476, "auxiliary_loss_mlp": 0.01067475, "balance_loss_clip": 1.01874185, "balance_loss_mlp": 1.02605903, "epoch": 0.10046595520817676, "flos": 25373302473600.0, "grad_norm": 1.4678212646482667, "language_loss": 0.78069031, "learning_rate": 3.9481392478688586e-06, "loss": 0.80235982, "num_input_tokens_seen": 35996830, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.734375, "step": 1671, "time_per_iteration": 2.459282636642456 }, { "auxiliary_loss_clip": 0.01026201, "auxiliary_loss_mlp": 0.01023446, "balance_loss_clip": 1.01643682, "balance_loss_mlp": 1.00930071, "epoch": 0.10052607846084473, "flos": 67458934224000.0, "grad_norm": 0.7830368120380419, "language_loss": 0.60795546, "learning_rate": 3.948051095825149e-06, "loss": 0.62845194, "num_input_tokens_seen": 36054465, "router_z_loss_clip": 0.0703125, "router_z_loss_mlp": 0.16894531, "step": 1672, "time_per_iteration": 3.003394365310669 }, { "auxiliary_loss_clip": 0.0111145, "auxiliary_loss_mlp": 0.01082955, "balance_loss_clip": 1.02826142, "balance_loss_mlp": 1.03193748, "epoch": 0.10058620171351271, "flos": 21359442862080.0, "grad_norm": 2.0430421435278627, "language_loss": 0.7931211, "learning_rate": 3.947962869911147e-06, "loss": 0.81506515, "num_input_tokens_seen": 36073480, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.796875, "step": 1673, "time_per_iteration": 2.4184505939483643 }, { "auxiliary_loss_clip": 0.01110096, "auxiliary_loss_mlp": 0.01072848, "balance_loss_clip": 1.0224936, "balance_loss_mlp": 1.03155243, "epoch": 0.10064632496618067, "flos": 16799192472960.0, "grad_norm": 2.1285189967702527, "language_loss": 0.77331603, "learning_rate": 3.947874570130197e-06, "loss": 0.79514539, "num_input_tokens_seen": 36091830, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.78515625, "step": 1674, "time_per_iteration": 2.3782689571380615 }, { "auxiliary_loss_clip": 0.0111011, "auxiliary_loss_mlp": 0.01084497, "balance_loss_clip": 1.03171098, "balance_loss_mlp": 1.03081131, "epoch": 0.10070644821884864, "flos": 23623280974080.0, "grad_norm": 1.7656979150418646, "language_loss": 0.81593972, "learning_rate": 3.947786196485649e-06, "loss": 0.8378858, "num_input_tokens_seen": 36111400, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.79296875, "step": 1675, "time_per_iteration": 2.4475479125976562 }, { "auxiliary_loss_clip": 0.01106209, "auxiliary_loss_mlp": 0.01086164, "balance_loss_clip": 1.02975428, "balance_loss_mlp": 1.03001475, "epoch": 0.1007665714715166, "flos": 24461406005760.0, "grad_norm": 1.998815326205301, "language_loss": 0.83522958, "learning_rate": 3.947697748980853e-06, "loss": 0.8571533, "num_input_tokens_seen": 36129345, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.76171875, "step": 1676, "time_per_iteration": 3.846339702606201 }, { "auxiliary_loss_clip": 0.01111108, "auxiliary_loss_mlp": 0.0107867, "balance_loss_clip": 1.0241437, "balance_loss_mlp": 1.03246546, "epoch": 0.10082669472418458, "flos": 16798214954880.0, "grad_norm": 2.3958481830996132, "language_loss": 0.87914789, "learning_rate": 3.947609227619163e-06, "loss": 0.90104556, "num_input_tokens_seen": 36146255, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.78515625, "step": 1677, "time_per_iteration": 3.8398962020874023 }, { "auxiliary_loss_clip": 0.0110919, "auxiliary_loss_mlp": 0.01085804, "balance_loss_clip": 1.03318465, "balance_loss_mlp": 1.03134227, "epoch": 0.10088681797685255, "flos": 13552653922560.0, "grad_norm": 1.7430008580850598, "language_loss": 0.88445795, "learning_rate": 3.947520632403936e-06, "loss": 0.90640783, "num_input_tokens_seen": 36164050, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.77734375, "step": 1678, "time_per_iteration": 3.9414219856262207 }, { "auxiliary_loss_clip": 0.01107858, "auxiliary_loss_mlp": 0.01087762, "balance_loss_clip": 1.03333139, "balance_loss_mlp": 1.03052711, "epoch": 0.10094694122952051, "flos": 25264513077120.0, "grad_norm": 2.5474008514443587, "language_loss": 0.9212302, "learning_rate": 3.947431963338532e-06, "loss": 0.9431864, "num_input_tokens_seen": 36183530, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.7734375, "step": 1679, "time_per_iteration": 2.499066114425659 }, { "auxiliary_loss_clip": 0.0102739, "auxiliary_loss_mlp": 0.01011976, "balance_loss_clip": 1.00482333, "balance_loss_mlp": 1.01003671, "epoch": 0.10100706448218849, "flos": 69850762980480.0, "grad_norm": 0.7906405769344667, "language_loss": 0.53072375, "learning_rate": 3.947343220426312e-06, "loss": 0.55111742, "num_input_tokens_seen": 36248550, "router_z_loss_clip": 0.07128906, "router_z_loss_mlp": 0.17382812, "step": 1680, "time_per_iteration": 4.543371677398682 }, { "auxiliary_loss_clip": 0.01102655, "auxiliary_loss_mlp": 0.01085409, "balance_loss_clip": 1.03419662, "balance_loss_mlp": 1.02732265, "epoch": 0.10106718773485646, "flos": 20006244408960.0, "grad_norm": 1.7271896323681721, "language_loss": 0.78886855, "learning_rate": 3.947254403670641e-06, "loss": 0.81074917, "num_input_tokens_seen": 36266065, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.75390625, "step": 1681, "time_per_iteration": 2.4518518447875977 }, { "auxiliary_loss_clip": 0.01107325, "auxiliary_loss_mlp": 0.01082165, "balance_loss_clip": 1.02225041, "balance_loss_mlp": 1.02527046, "epoch": 0.10112731098752442, "flos": 13478987220480.0, "grad_norm": 2.650385534651113, "language_loss": 0.9594028, "learning_rate": 3.947165513074889e-06, "loss": 0.98129779, "num_input_tokens_seen": 36280960, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.8203125, "step": 1682, "time_per_iteration": 2.4166274070739746 }, { "auxiliary_loss_clip": 0.01103846, "auxiliary_loss_mlp": 0.0109433, "balance_loss_clip": 1.04101944, "balance_loss_mlp": 1.0250783, "epoch": 0.1011874342401924, "flos": 18514894239360.0, "grad_norm": 2.728925762999583, "language_loss": 0.89122677, "learning_rate": 3.947076548642425e-06, "loss": 0.91320854, "num_input_tokens_seen": 36299010, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.7890625, "step": 1683, "time_per_iteration": 2.3865911960601807 }, { "auxiliary_loss_clip": 0.01100901, "auxiliary_loss_mlp": 0.01090019, "balance_loss_clip": 1.03675628, "balance_loss_mlp": 1.02476954, "epoch": 0.10124755749286037, "flos": 20701853804160.0, "grad_norm": 1.7808197296605388, "language_loss": 0.77214837, "learning_rate": 3.946987510376624e-06, "loss": 0.79405755, "num_input_tokens_seen": 36318400, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.76171875, "step": 1684, "time_per_iteration": 2.394023895263672 }, { "auxiliary_loss_clip": 0.01023718, "auxiliary_loss_mlp": 0.01014093, "balance_loss_clip": 1.0078944, "balance_loss_mlp": 1.00592101, "epoch": 0.10130768074552833, "flos": 56106015557760.0, "grad_norm": 0.7676853116422544, "language_loss": 0.61160648, "learning_rate": 3.9468983982808615e-06, "loss": 0.63198459, "num_input_tokens_seen": 36381815, "router_z_loss_clip": 0.06201172, "router_z_loss_mlp": 0.17773438, "step": 1685, "time_per_iteration": 3.095723867416382 }, { "auxiliary_loss_clip": 0.01108052, "auxiliary_loss_mlp": 0.01085198, "balance_loss_clip": 1.03031373, "balance_loss_mlp": 1.029971, "epoch": 0.1013678039981963, "flos": 33400916962560.0, "grad_norm": 2.111615690317149, "language_loss": 0.63476104, "learning_rate": 3.946809212358516e-06, "loss": 0.65669346, "num_input_tokens_seen": 36404320, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.78125, "step": 1686, "time_per_iteration": 2.5104753971099854 }, { "auxiliary_loss_clip": 0.01108642, "auxiliary_loss_mlp": 0.01079564, "balance_loss_clip": 1.0259676, "balance_loss_mlp": 1.03217244, "epoch": 0.10142792725086427, "flos": 31903980975360.0, "grad_norm": 10.601570834435808, "language_loss": 0.82949173, "learning_rate": 3.946719952612972e-06, "loss": 0.85137379, "num_input_tokens_seen": 36427510, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 0.765625, "step": 1687, "time_per_iteration": 2.498149871826172 }, { "auxiliary_loss_clip": 0.01119154, "auxiliary_loss_mlp": 0.01081824, "balance_loss_clip": 1.02605772, "balance_loss_mlp": 1.03715932, "epoch": 0.10148805050353224, "flos": 28474637212800.0, "grad_norm": 1.7171874218684928, "language_loss": 0.74394321, "learning_rate": 3.94663061904761e-06, "loss": 0.765953, "num_input_tokens_seen": 36448230, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.8203125, "step": 1688, "time_per_iteration": 2.472393035888672 }, { "auxiliary_loss_clip": 0.01112984, "auxiliary_loss_mlp": 0.0108364, "balance_loss_clip": 1.02968574, "balance_loss_mlp": 1.03614771, "epoch": 0.1015481737562002, "flos": 25147903536000.0, "grad_norm": 2.016415756707416, "language_loss": 0.89538038, "learning_rate": 3.94654121166582e-06, "loss": 0.9173466, "num_input_tokens_seen": 36464395, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.76953125, "step": 1689, "time_per_iteration": 2.453439950942993 }, { "auxiliary_loss_clip": 0.01110808, "auxiliary_loss_mlp": 0.01085457, "balance_loss_clip": 1.03174126, "balance_loss_mlp": 1.0336895, "epoch": 0.10160829700886818, "flos": 30881479720320.0, "grad_norm": 1.7074489867464142, "language_loss": 0.89970905, "learning_rate": 3.946451730470993e-06, "loss": 0.92167169, "num_input_tokens_seen": 36486475, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.76953125, "step": 1690, "time_per_iteration": 2.482327699661255 }, { "auxiliary_loss_clip": 0.01117787, "auxiliary_loss_mlp": 0.01085114, "balance_loss_clip": 1.02901411, "balance_loss_mlp": 1.0359695, "epoch": 0.10166842026153615, "flos": 20410992334080.0, "grad_norm": 1.8633947966149866, "language_loss": 0.85577786, "learning_rate": 3.946362175466521e-06, "loss": 0.87780678, "num_input_tokens_seen": 36505310, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.81640625, "step": 1691, "time_per_iteration": 2.422517776489258 }, { "auxiliary_loss_clip": 0.01119182, "auxiliary_loss_mlp": 0.01083795, "balance_loss_clip": 1.02745676, "balance_loss_mlp": 1.03834581, "epoch": 0.10172854351420411, "flos": 33475491360000.0, "grad_norm": 1.6245598726369788, "language_loss": 0.68487811, "learning_rate": 3.946272546655801e-06, "loss": 0.70690787, "num_input_tokens_seen": 36529820, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.80859375, "step": 1692, "time_per_iteration": 2.5340418815612793 }, { "auxiliary_loss_clip": 0.0111507, "auxiliary_loss_mlp": 0.0109333, "balance_loss_clip": 1.03956699, "balance_loss_mlp": 1.03449917, "epoch": 0.1017886667668721, "flos": 23549195335680.0, "grad_norm": 1.8222298802354826, "language_loss": 0.7816118, "learning_rate": 3.94618284404223e-06, "loss": 0.80369586, "num_input_tokens_seen": 36549000, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.8046875, "step": 1693, "time_per_iteration": 2.4445548057556152 }, { "auxiliary_loss_clip": 0.0111351, "auxiliary_loss_mlp": 0.01099128, "balance_loss_clip": 1.0438621, "balance_loss_mlp": 1.033234, "epoch": 0.10184879001954006, "flos": 23294922837120.0, "grad_norm": 1.7935220108264887, "language_loss": 0.88515556, "learning_rate": 3.9460930676292105e-06, "loss": 0.90728188, "num_input_tokens_seen": 36567515, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.80078125, "step": 1694, "time_per_iteration": 2.4354090690612793 }, { "auxiliary_loss_clip": 0.01118306, "auxiliary_loss_mlp": 0.01088675, "balance_loss_clip": 1.02806878, "balance_loss_mlp": 1.03197527, "epoch": 0.10190891327220802, "flos": 18332123938560.0, "grad_norm": 1.8072700694683024, "language_loss": 0.81191146, "learning_rate": 3.946003217420147e-06, "loss": 0.83398128, "num_input_tokens_seen": 36586190, "router_z_loss_clip": 0.60546875, "router_z_loss_mlp": 0.86328125, "step": 1695, "time_per_iteration": 2.406522512435913 }, { "auxiliary_loss_clip": 0.01112203, "auxiliary_loss_mlp": 0.01087565, "balance_loss_clip": 1.02724528, "balance_loss_mlp": 1.02998638, "epoch": 0.10196903652487599, "flos": 26464268638080.0, "grad_norm": 1.6456914367748285, "language_loss": 0.87992936, "learning_rate": 3.945913293418447e-06, "loss": 0.90192705, "num_input_tokens_seen": 36607495, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.8203125, "step": 1696, "time_per_iteration": 2.447374105453491 }, { "auxiliary_loss_clip": 0.01105355, "auxiliary_loss_mlp": 0.01083273, "balance_loss_clip": 1.02831733, "balance_loss_mlp": 1.02866495, "epoch": 0.10202915977754397, "flos": 21868511529600.0, "grad_norm": 1.9037190669795645, "language_loss": 0.83369887, "learning_rate": 3.945823295627519e-06, "loss": 0.8555851, "num_input_tokens_seen": 36628555, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.765625, "step": 1697, "time_per_iteration": 2.4468212127685547 }, { "auxiliary_loss_clip": 0.01107335, "auxiliary_loss_mlp": 0.01087715, "balance_loss_clip": 1.02935028, "balance_loss_mlp": 1.02713656, "epoch": 0.10208928303021193, "flos": 22308661440000.0, "grad_norm": 1.8315455295002698, "language_loss": 0.82827985, "learning_rate": 3.9457332240507775e-06, "loss": 0.85023034, "num_input_tokens_seen": 36646250, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.80078125, "step": 1698, "time_per_iteration": 2.391206741333008 }, { "auxiliary_loss_clip": 0.0110694, "auxiliary_loss_mlp": 0.01074481, "balance_loss_clip": 1.01802373, "balance_loss_mlp": 1.02673221, "epoch": 0.1021494062828799, "flos": 22124529596160.0, "grad_norm": 2.2395514219476897, "language_loss": 0.78339553, "learning_rate": 3.945643078691637e-06, "loss": 0.80520976, "num_input_tokens_seen": 36666675, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.80078125, "step": 1699, "time_per_iteration": 2.4337379932403564 }, { "auxiliary_loss_clip": 0.01104368, "auxiliary_loss_mlp": 0.01075843, "balance_loss_clip": 1.02222311, "balance_loss_mlp": 1.02649045, "epoch": 0.10220952953554788, "flos": 19645696131840.0, "grad_norm": 1.8037904448179025, "language_loss": 0.81814075, "learning_rate": 3.945552859553516e-06, "loss": 0.83994281, "num_input_tokens_seen": 36685225, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 0.78125, "step": 1700, "time_per_iteration": 2.3783936500549316 }, { "auxiliary_loss_clip": 0.0110729, "auxiliary_loss_mlp": 0.0107458, "balance_loss_clip": 1.01912379, "balance_loss_mlp": 1.02595687, "epoch": 0.10226965278821584, "flos": 29786044901760.0, "grad_norm": 2.6901443094678332, "language_loss": 0.78627449, "learning_rate": 3.945462566639836e-06, "loss": 0.80809319, "num_input_tokens_seen": 36705985, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.8125, "step": 1701, "time_per_iteration": 2.4773969650268555 }, { "auxiliary_loss_clip": 0.01111137, "auxiliary_loss_mlp": 0.01084947, "balance_loss_clip": 1.02586699, "balance_loss_mlp": 1.02863884, "epoch": 0.10232977604088381, "flos": 27015581917440.0, "grad_norm": 1.8262557935007746, "language_loss": 0.79048133, "learning_rate": 3.945372199954019e-06, "loss": 0.81244218, "num_input_tokens_seen": 36725815, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 0.828125, "step": 1702, "time_per_iteration": 2.461958169937134 }, { "auxiliary_loss_clip": 0.01105537, "auxiliary_loss_mlp": 0.0107428, "balance_loss_clip": 1.02066016, "balance_loss_mlp": 1.02783585, "epoch": 0.10238989929355179, "flos": 20776463112960.0, "grad_norm": 2.4703235872792173, "language_loss": 0.96282721, "learning_rate": 3.945281759499494e-06, "loss": 0.9846254, "num_input_tokens_seen": 36742345, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 0.77734375, "step": 1703, "time_per_iteration": 2.4080567359924316 }, { "auxiliary_loss_clip": 0.0103261, "auxiliary_loss_mlp": 0.01031895, "balance_loss_clip": 1.0239321, "balance_loss_mlp": 1.01406717, "epoch": 0.10245002254621975, "flos": 57695297690880.0, "grad_norm": 0.9258191526817878, "language_loss": 0.55187511, "learning_rate": 3.94519124527969e-06, "loss": 0.57252014, "num_input_tokens_seen": 36798775, "router_z_loss_clip": 0.07958984, "router_z_loss_mlp": 0.18554688, "step": 1704, "time_per_iteration": 2.939100980758667 }, { "auxiliary_loss_clip": 0.01115181, "auxiliary_loss_mlp": 0.01084044, "balance_loss_clip": 1.02575064, "balance_loss_mlp": 1.03422976, "epoch": 0.10251014579888772, "flos": 16799192472960.0, "grad_norm": 5.143255582877381, "language_loss": 0.86511385, "learning_rate": 3.945100657298039e-06, "loss": 0.88710612, "num_input_tokens_seen": 36816295, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 0.80859375, "step": 1705, "time_per_iteration": 2.387413740158081 }, { "auxiliary_loss_clip": 0.01040487, "auxiliary_loss_mlp": 0.01015939, "balance_loss_clip": 1.00802302, "balance_loss_mlp": 1.02154326, "epoch": 0.1025702690515557, "flos": 68562328832640.0, "grad_norm": 0.7745995925100884, "language_loss": 0.60470951, "learning_rate": 3.9450099955579765e-06, "loss": 0.62527382, "num_input_tokens_seen": 36882030, "router_z_loss_clip": 0.07910156, "router_z_loss_mlp": 0.18945312, "step": 1706, "time_per_iteration": 3.0883145332336426 }, { "auxiliary_loss_clip": 0.01113822, "auxiliary_loss_mlp": 0.01091965, "balance_loss_clip": 1.03591275, "balance_loss_mlp": 1.03332329, "epoch": 0.10263039230422366, "flos": 14865737356800.0, "grad_norm": 2.071868684070822, "language_loss": 0.88570905, "learning_rate": 3.94491926006294e-06, "loss": 0.90776688, "num_input_tokens_seen": 36899245, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.8046875, "step": 1707, "time_per_iteration": 2.407362222671509 }, { "auxiliary_loss_clip": 0.01112264, "auxiliary_loss_mlp": 0.01094323, "balance_loss_clip": 1.045614, "balance_loss_mlp": 1.03464532, "epoch": 0.10269051555689163, "flos": 25336434211200.0, "grad_norm": 1.6415747229778852, "language_loss": 0.74190348, "learning_rate": 3.944828450816369e-06, "loss": 0.7639693, "num_input_tokens_seen": 36920950, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.77734375, "step": 1708, "time_per_iteration": 2.447716236114502 }, { "auxiliary_loss_clip": 0.01112415, "auxiliary_loss_mlp": 0.01111042, "balance_loss_clip": 1.05611062, "balance_loss_mlp": 1.03229928, "epoch": 0.10275063880955959, "flos": 21067778430720.0, "grad_norm": 1.6774154183617775, "language_loss": 0.92570019, "learning_rate": 3.944737567821709e-06, "loss": 0.94793481, "num_input_tokens_seen": 36938900, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.80078125, "step": 1709, "time_per_iteration": 2.4175167083740234 }, { "auxiliary_loss_clip": 0.01105327, "auxiliary_loss_mlp": 0.01103785, "balance_loss_clip": 1.05045044, "balance_loss_mlp": 1.0304563, "epoch": 0.10281076206222757, "flos": 30365638248960.0, "grad_norm": 1.8243466391873682, "language_loss": 0.89839363, "learning_rate": 3.944646611082406e-06, "loss": 0.92048472, "num_input_tokens_seen": 36957010, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.75, "step": 1710, "time_per_iteration": 2.471731424331665 }, { "auxiliary_loss_clip": 0.01104399, "auxiliary_loss_mlp": 0.01097068, "balance_loss_clip": 1.04456818, "balance_loss_mlp": 1.02980089, "epoch": 0.10287088531489554, "flos": 22417241368320.0, "grad_norm": 1.642893144258983, "language_loss": 0.80988669, "learning_rate": 3.944555580601908e-06, "loss": 0.83190137, "num_input_tokens_seen": 36977690, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.74609375, "step": 1711, "time_per_iteration": 2.443575143814087 }, { "auxiliary_loss_clip": 0.0110543, "auxiliary_loss_mlp": 0.01099183, "balance_loss_clip": 1.04148567, "balance_loss_mlp": 1.02597201, "epoch": 0.1029310085675635, "flos": 25114910434560.0, "grad_norm": 1.7403236661141095, "language_loss": 0.74764544, "learning_rate": 3.944464476383668e-06, "loss": 0.76969159, "num_input_tokens_seen": 36997300, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.796875, "step": 1712, "time_per_iteration": 2.4352235794067383 }, { "auxiliary_loss_clip": 0.01105168, "auxiliary_loss_mlp": 0.01101998, "balance_loss_clip": 1.04546881, "balance_loss_mlp": 1.02839458, "epoch": 0.10299113182023148, "flos": 19864601556480.0, "grad_norm": 1.970794318926384, "language_loss": 0.88408291, "learning_rate": 3.94437329843114e-06, "loss": 0.90615463, "num_input_tokens_seen": 37016110, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.765625, "step": 1713, "time_per_iteration": 2.424957036972046 }, { "auxiliary_loss_clip": 0.01102188, "auxiliary_loss_mlp": 0.0108087, "balance_loss_clip": 1.03011107, "balance_loss_mlp": 1.02580845, "epoch": 0.10305125507289944, "flos": 20446603787520.0, "grad_norm": 1.8000194211968905, "language_loss": 0.74432325, "learning_rate": 3.944282046747782e-06, "loss": 0.76615381, "num_input_tokens_seen": 37036405, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.765625, "step": 1714, "time_per_iteration": 2.427103281021118 }, { "auxiliary_loss_clip": 0.01106951, "auxiliary_loss_mlp": 0.01083233, "balance_loss_clip": 1.02770531, "balance_loss_mlp": 1.02668977, "epoch": 0.10311137832556741, "flos": 26249552576640.0, "grad_norm": 2.155616730845291, "language_loss": 0.92984933, "learning_rate": 3.944190721337053e-06, "loss": 0.95175111, "num_input_tokens_seen": 37057580, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.80078125, "step": 1715, "time_per_iteration": 3.8688347339630127 }, { "auxiliary_loss_clip": 0.01104741, "auxiliary_loss_mlp": 0.01080761, "balance_loss_clip": 1.0251621, "balance_loss_mlp": 1.02737999, "epoch": 0.10317150157823539, "flos": 35297468904960.0, "grad_norm": 1.7389587491597964, "language_loss": 0.77931786, "learning_rate": 3.944099322202418e-06, "loss": 0.80117285, "num_input_tokens_seen": 37079120, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.7734375, "step": 1716, "time_per_iteration": 2.542492151260376 }, { "auxiliary_loss_clip": 0.01112112, "auxiliary_loss_mlp": 0.01087352, "balance_loss_clip": 1.02851033, "balance_loss_mlp": 1.03216183, "epoch": 0.10323162483090335, "flos": 25738738341120.0, "grad_norm": 2.0683965101923794, "language_loss": 0.87261111, "learning_rate": 3.944007849347342e-06, "loss": 0.89460576, "num_input_tokens_seen": 37099710, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.796875, "step": 1717, "time_per_iteration": 3.933896780014038 }, { "auxiliary_loss_clip": 0.01111102, "auxiliary_loss_mlp": 0.01077799, "balance_loss_clip": 1.02301013, "balance_loss_mlp": 1.03202367, "epoch": 0.10329174808357132, "flos": 16288936819200.0, "grad_norm": 1.984285108379828, "language_loss": 0.85239971, "learning_rate": 3.943916302775292e-06, "loss": 0.87428874, "num_input_tokens_seen": 37117775, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.7890625, "step": 1718, "time_per_iteration": 3.857229709625244 }, { "auxiliary_loss_clip": 0.01111893, "auxiliary_loss_mlp": 0.01072621, "balance_loss_clip": 1.02097917, "balance_loss_mlp": 1.03422248, "epoch": 0.10335187133623928, "flos": 36685615495680.0, "grad_norm": 1.775685402296659, "language_loss": 0.7450586, "learning_rate": 3.943824682489742e-06, "loss": 0.76690376, "num_input_tokens_seen": 37140280, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.77734375, "step": 1719, "time_per_iteration": 3.985074520111084 }, { "auxiliary_loss_clip": 0.01111863, "auxiliary_loss_mlp": 0.01076509, "balance_loss_clip": 1.02422357, "balance_loss_mlp": 1.03332925, "epoch": 0.10341199458890726, "flos": 14974771132800.0, "grad_norm": 1.7378824337936356, "language_loss": 0.94711894, "learning_rate": 3.9437329884941665e-06, "loss": 0.9690026, "num_input_tokens_seen": 37158350, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.78515625, "step": 1720, "time_per_iteration": 2.39813232421875 }, { "auxiliary_loss_clip": 0.0111473, "auxiliary_loss_mlp": 0.01080621, "balance_loss_clip": 1.02576137, "balance_loss_mlp": 1.03586745, "epoch": 0.10347211784157523, "flos": 21030561054720.0, "grad_norm": 1.7121834479711338, "language_loss": 0.81945658, "learning_rate": 3.943641220792039e-06, "loss": 0.8414101, "num_input_tokens_seen": 37177120, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.7890625, "step": 1721, "time_per_iteration": 2.451406955718994 }, { "auxiliary_loss_clip": 0.01114688, "auxiliary_loss_mlp": 0.0109068, "balance_loss_clip": 1.03541446, "balance_loss_mlp": 1.03259182, "epoch": 0.1035322410942432, "flos": 19791074499840.0, "grad_norm": 1.6744972544471992, "language_loss": 0.82205379, "learning_rate": 3.9435493793868434e-06, "loss": 0.84410751, "num_input_tokens_seen": 37195895, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.8203125, "step": 1722, "time_per_iteration": 2.4130895137786865 }, { "auxiliary_loss_clip": 0.01041872, "auxiliary_loss_mlp": 0.01020019, "balance_loss_clip": 1.01343882, "balance_loss_mlp": 1.02324462, "epoch": 0.10359236434691117, "flos": 52696014554880.0, "grad_norm": 1.3863620069761562, "language_loss": 0.67233062, "learning_rate": 3.943457464282059e-06, "loss": 0.69294953, "num_input_tokens_seen": 37247270, "router_z_loss_clip": 0.06591797, "router_z_loss_mlp": 0.18652344, "step": 1723, "time_per_iteration": 2.8234076499938965 }, { "auxiliary_loss_clip": 0.0110913, "auxiliary_loss_mlp": 0.01094734, "balance_loss_clip": 1.04204345, "balance_loss_mlp": 1.03068423, "epoch": 0.10365248759957914, "flos": 18404429097600.0, "grad_norm": 2.600924163459291, "language_loss": 0.80372667, "learning_rate": 3.9433654754811745e-06, "loss": 0.82576525, "num_input_tokens_seen": 37265595, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.78515625, "step": 1724, "time_per_iteration": 2.3935773372650146 }, { "auxiliary_loss_clip": 0.01116829, "auxiliary_loss_mlp": 0.01089709, "balance_loss_clip": 1.03763819, "balance_loss_mlp": 1.03290534, "epoch": 0.1037126108522471, "flos": 47551878587520.0, "grad_norm": 2.2011352645150275, "language_loss": 0.76597255, "learning_rate": 3.943273412987676e-06, "loss": 0.7880379, "num_input_tokens_seen": 37286660, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.8359375, "step": 1725, "time_per_iteration": 2.6224093437194824 }, { "auxiliary_loss_clip": 0.01104009, "auxiliary_loss_mlp": 0.01077861, "balance_loss_clip": 1.03005838, "balance_loss_mlp": 1.029616, "epoch": 0.10377273410491508, "flos": 22815670337280.0, "grad_norm": 1.9498744616912764, "language_loss": 0.77007508, "learning_rate": 3.943181276805054e-06, "loss": 0.79189378, "num_input_tokens_seen": 37304915, "router_z_loss_clip": 0.47851562, "router_z_loss_mlp": 0.74609375, "step": 1726, "time_per_iteration": 2.4405667781829834 }, { "auxiliary_loss_clip": 0.01103183, "auxiliary_loss_mlp": 0.01086102, "balance_loss_clip": 1.03529525, "balance_loss_mlp": 1.02632451, "epoch": 0.10383285735758305, "flos": 26137551335040.0, "grad_norm": 1.8901683432943996, "language_loss": 0.76808071, "learning_rate": 3.9430890669368035e-06, "loss": 0.78997362, "num_input_tokens_seen": 37325265, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.765625, "step": 1727, "time_per_iteration": 2.490493059158325 }, { "auxiliary_loss_clip": 0.01100129, "auxiliary_loss_mlp": 0.01074938, "balance_loss_clip": 1.02315354, "balance_loss_mlp": 1.02456856, "epoch": 0.10389298061025101, "flos": 17090856904320.0, "grad_norm": 2.2252696229426134, "language_loss": 0.86654198, "learning_rate": 3.942996783386422e-06, "loss": 0.88829267, "num_input_tokens_seen": 37341650, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.7578125, "step": 1728, "time_per_iteration": 2.4056386947631836 }, { "auxiliary_loss_clip": 0.0110147, "auxiliary_loss_mlp": 0.01085257, "balance_loss_clip": 1.03099298, "balance_loss_mlp": 1.02684975, "epoch": 0.10395310386291898, "flos": 20775485594880.0, "grad_norm": 2.1085349470638364, "language_loss": 0.72606266, "learning_rate": 3.942904426157406e-06, "loss": 0.74792993, "num_input_tokens_seen": 37360270, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.7421875, "step": 1729, "time_per_iteration": 2.426069498062134 }, { "auxiliary_loss_clip": 0.0110546, "auxiliary_loss_mlp": 0.01081508, "balance_loss_clip": 1.02645743, "balance_loss_mlp": 1.02849364, "epoch": 0.10401322711558696, "flos": 12819792240000.0, "grad_norm": 2.3180731850441143, "language_loss": 0.85371125, "learning_rate": 3.9428119952532605e-06, "loss": 0.87558097, "num_input_tokens_seen": 37375225, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.76953125, "step": 1730, "time_per_iteration": 2.344053030014038 }, { "auxiliary_loss_clip": 0.01102588, "auxiliary_loss_mlp": 0.01069443, "balance_loss_clip": 1.02204537, "balance_loss_mlp": 1.02747202, "epoch": 0.10407335036825492, "flos": 23183584911360.0, "grad_norm": 1.6471010996759012, "language_loss": 0.77910888, "learning_rate": 3.942719490677489e-06, "loss": 0.80082923, "num_input_tokens_seen": 37395165, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.75, "step": 1731, "time_per_iteration": 2.453585147857666 }, { "auxiliary_loss_clip": 0.0110041, "auxiliary_loss_mlp": 0.01068512, "balance_loss_clip": 1.02290297, "balance_loss_mlp": 1.02684975, "epoch": 0.10413347362092289, "flos": 26102987222400.0, "grad_norm": 1.8566926893903308, "language_loss": 0.855335, "learning_rate": 3.9426269124336e-06, "loss": 0.87702429, "num_input_tokens_seen": 37414845, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 0.734375, "step": 1732, "time_per_iteration": 2.450479030609131 }, { "auxiliary_loss_clip": 0.0110461, "auxiliary_loss_mlp": 0.01065566, "balance_loss_clip": 1.02019525, "balance_loss_mlp": 1.02991307, "epoch": 0.10419359687359087, "flos": 12640233784320.0, "grad_norm": 2.1366048150256103, "language_loss": 0.85656518, "learning_rate": 3.942534260525104e-06, "loss": 0.87826693, "num_input_tokens_seen": 37432490, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.74609375, "step": 1733, "time_per_iteration": 2.3979127407073975 }, { "auxiliary_loss_clip": 0.01108081, "auxiliary_loss_mlp": 0.0107207, "balance_loss_clip": 1.02095342, "balance_loss_mlp": 1.02968049, "epoch": 0.10425372012625883, "flos": 12124427224320.0, "grad_norm": 2.1451602273606136, "language_loss": 0.79590678, "learning_rate": 3.942441534955514e-06, "loss": 0.81770831, "num_input_tokens_seen": 37449435, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.78515625, "step": 1734, "time_per_iteration": 2.3754119873046875 }, { "auxiliary_loss_clip": 0.01105388, "auxiliary_loss_mlp": 0.01068708, "balance_loss_clip": 1.02119088, "balance_loss_mlp": 1.03000927, "epoch": 0.1043138433789268, "flos": 25336399299840.0, "grad_norm": 1.651585380082517, "language_loss": 0.76703095, "learning_rate": 3.9423487357283465e-06, "loss": 0.78877187, "num_input_tokens_seen": 37469105, "router_z_loss_clip": 0.47460938, "router_z_loss_mlp": 0.75390625, "step": 1735, "time_per_iteration": 2.441539764404297 }, { "auxiliary_loss_clip": 0.0110919, "auxiliary_loss_mlp": 0.010779, "balance_loss_clip": 1.02623487, "balance_loss_mlp": 1.03130734, "epoch": 0.10437396663159478, "flos": 29165917599360.0, "grad_norm": 1.593149538973987, "language_loss": 0.80877233, "learning_rate": 3.94225586284712e-06, "loss": 0.83064324, "num_input_tokens_seen": 37490540, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.77734375, "step": 1736, "time_per_iteration": 2.4682459831237793 }, { "auxiliary_loss_clip": 0.01108014, "auxiliary_loss_mlp": 0.01086474, "balance_loss_clip": 1.0309937, "balance_loss_mlp": 1.03250098, "epoch": 0.10443408988426274, "flos": 25079822651520.0, "grad_norm": 1.758983206856964, "language_loss": 0.72942877, "learning_rate": 3.942162916315356e-06, "loss": 0.75137365, "num_input_tokens_seen": 37511905, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.75390625, "step": 1737, "time_per_iteration": 2.4726333618164062 }, { "auxiliary_loss_clip": 0.0111101, "auxiliary_loss_mlp": 0.01083496, "balance_loss_clip": 1.02355814, "balance_loss_mlp": 1.02849495, "epoch": 0.1044942131369307, "flos": 26758481598720.0, "grad_norm": 2.106372038071898, "language_loss": 0.83633351, "learning_rate": 3.942069896136581e-06, "loss": 0.85827851, "num_input_tokens_seen": 37533635, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.82421875, "step": 1738, "time_per_iteration": 2.4630205631256104 }, { "auxiliary_loss_clip": 0.01110793, "auxiliary_loss_mlp": 0.01091925, "balance_loss_clip": 1.03093743, "balance_loss_mlp": 1.02955198, "epoch": 0.10455433638959867, "flos": 18441576650880.0, "grad_norm": 1.7722587618717178, "language_loss": 0.77094793, "learning_rate": 3.9419768023143196e-06, "loss": 0.79297507, "num_input_tokens_seen": 37552035, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.8125, "step": 1739, "time_per_iteration": 2.411649465560913 }, { "auxiliary_loss_clip": 0.01105986, "auxiliary_loss_mlp": 0.01084402, "balance_loss_clip": 1.03063858, "balance_loss_mlp": 1.02936029, "epoch": 0.10461445964226665, "flos": 23217939555840.0, "grad_norm": 2.262531061026446, "language_loss": 0.78922421, "learning_rate": 3.941883634852104e-06, "loss": 0.81112808, "num_input_tokens_seen": 37571540, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.765625, "step": 1740, "time_per_iteration": 2.430528402328491 }, { "auxiliary_loss_clip": 0.01103, "auxiliary_loss_mlp": 0.01080446, "balance_loss_clip": 1.03047299, "balance_loss_mlp": 1.02723455, "epoch": 0.10467458289493461, "flos": 24344307705600.0, "grad_norm": 2.1227922078655226, "language_loss": 0.8819254, "learning_rate": 3.941790393753467e-06, "loss": 0.9037599, "num_input_tokens_seen": 37588265, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.7578125, "step": 1741, "time_per_iteration": 2.4366374015808105 }, { "auxiliary_loss_clip": 0.01104656, "auxiliary_loss_mlp": 0.01082952, "balance_loss_clip": 1.02623248, "balance_loss_mlp": 1.02602303, "epoch": 0.10473470614760258, "flos": 21286893323520.0, "grad_norm": 2.8649686200634403, "language_loss": 0.78106952, "learning_rate": 3.941697079021942e-06, "loss": 0.80294561, "num_input_tokens_seen": 37606860, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.7890625, "step": 1742, "time_per_iteration": 2.4112653732299805 }, { "auxiliary_loss_clip": 0.01103632, "auxiliary_loss_mlp": 0.01090508, "balance_loss_clip": 1.0376029, "balance_loss_mlp": 1.02753162, "epoch": 0.10479482940027056, "flos": 21686195076480.0, "grad_norm": 1.9432176912097687, "language_loss": 0.88935453, "learning_rate": 3.94160369066107e-06, "loss": 0.91129589, "num_input_tokens_seen": 37625210, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.76171875, "step": 1743, "time_per_iteration": 2.458625555038452 }, { "auxiliary_loss_clip": 0.01102449, "auxiliary_loss_mlp": 0.01074502, "balance_loss_clip": 1.0164237, "balance_loss_mlp": 1.02577698, "epoch": 0.10485495265293852, "flos": 21572797380480.0, "grad_norm": 2.1551523198318656, "language_loss": 0.77628118, "learning_rate": 3.941510228674391e-06, "loss": 0.79805064, "num_input_tokens_seen": 37644110, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.76953125, "step": 1744, "time_per_iteration": 2.39650559425354 }, { "auxiliary_loss_clip": 0.01100994, "auxiliary_loss_mlp": 0.01067006, "balance_loss_clip": 1.01972735, "balance_loss_mlp": 1.02608323, "epoch": 0.10491507590560649, "flos": 37960399301760.0, "grad_norm": 1.9718095596353262, "language_loss": 0.83021355, "learning_rate": 3.941416693065451e-06, "loss": 0.85189354, "num_input_tokens_seen": 37665800, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.75, "step": 1745, "time_per_iteration": 2.558744430541992 }, { "auxiliary_loss_clip": 0.01101938, "auxiliary_loss_mlp": 0.01080051, "balance_loss_clip": 1.02731323, "balance_loss_mlp": 1.02485788, "epoch": 0.10497519915827447, "flos": 26395070590080.0, "grad_norm": 1.9358127498644706, "language_loss": 0.86234146, "learning_rate": 3.941323083837794e-06, "loss": 0.88416135, "num_input_tokens_seen": 37685095, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.7734375, "step": 1746, "time_per_iteration": 2.4348719120025635 }, { "auxiliary_loss_clip": 0.01103265, "auxiliary_loss_mlp": 0.01071581, "balance_loss_clip": 1.02261019, "balance_loss_mlp": 1.02674568, "epoch": 0.10503532241094243, "flos": 40660581985920.0, "grad_norm": 1.725517830137955, "language_loss": 0.72621816, "learning_rate": 3.941229400994971e-06, "loss": 0.74796665, "num_input_tokens_seen": 37707445, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.765625, "step": 1747, "time_per_iteration": 2.5897576808929443 }, { "auxiliary_loss_clip": 0.01108677, "auxiliary_loss_mlp": 0.01085365, "balance_loss_clip": 1.03069568, "balance_loss_mlp": 1.02755857, "epoch": 0.1050954456636104, "flos": 29788104672000.0, "grad_norm": 2.170442113125856, "language_loss": 0.86797142, "learning_rate": 3.941135644540535e-06, "loss": 0.88991189, "num_input_tokens_seen": 37728325, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.8125, "step": 1748, "time_per_iteration": 2.4490978717803955 }, { "auxiliary_loss_clip": 0.01101727, "auxiliary_loss_mlp": 0.01077528, "balance_loss_clip": 1.02254891, "balance_loss_mlp": 1.02556896, "epoch": 0.10515556891627838, "flos": 23947694127360.0, "grad_norm": 1.78316055913918, "language_loss": 0.7382955, "learning_rate": 3.941041814478041e-06, "loss": 0.76008803, "num_input_tokens_seen": 37748910, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.76171875, "step": 1749, "time_per_iteration": 2.454747200012207 }, { "auxiliary_loss_clip": 0.01097965, "auxiliary_loss_mlp": 0.01072972, "balance_loss_clip": 1.02023411, "balance_loss_mlp": 1.02401686, "epoch": 0.10521569216894634, "flos": 18258631793280.0, "grad_norm": 3.2437011177144996, "language_loss": 0.83870012, "learning_rate": 3.940947910811047e-06, "loss": 0.8604095, "num_input_tokens_seen": 37765745, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.73828125, "step": 1750, "time_per_iteration": 2.3645286560058594 }, { "auxiliary_loss_clip": 0.01104413, "auxiliary_loss_mlp": 0.0107844, "balance_loss_clip": 1.02234054, "balance_loss_mlp": 1.02754259, "epoch": 0.10527581542161431, "flos": 15630056040960.0, "grad_norm": 2.4064263129332835, "language_loss": 0.95029044, "learning_rate": 3.940853933543114e-06, "loss": 0.97211897, "num_input_tokens_seen": 37780520, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.765625, "step": 1751, "time_per_iteration": 2.392998456954956 }, { "auxiliary_loss_clip": 0.01102059, "auxiliary_loss_mlp": 0.01077734, "balance_loss_clip": 1.02761889, "balance_loss_mlp": 1.025594, "epoch": 0.10533593867428227, "flos": 18295569878400.0, "grad_norm": 1.9450340763664844, "language_loss": 0.80375457, "learning_rate": 3.940759882677805e-06, "loss": 0.82555246, "num_input_tokens_seen": 37799515, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.765625, "step": 1752, "time_per_iteration": 2.38059663772583 }, { "auxiliary_loss_clip": 0.01101173, "auxiliary_loss_mlp": 0.0107712, "balance_loss_clip": 1.02338016, "balance_loss_mlp": 1.02617431, "epoch": 0.10539606192695025, "flos": 29021935685760.0, "grad_norm": 1.7824144705570768, "language_loss": 0.77227348, "learning_rate": 3.940665758218686e-06, "loss": 0.79405642, "num_input_tokens_seen": 37818695, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.75, "step": 1753, "time_per_iteration": 2.46714186668396 }, { "auxiliary_loss_clip": 0.01109271, "auxiliary_loss_mlp": 0.01092933, "balance_loss_clip": 1.03111124, "balance_loss_mlp": 1.02777219, "epoch": 0.10545618517961822, "flos": 19968433539840.0, "grad_norm": 1.8498816916365897, "language_loss": 0.85656959, "learning_rate": 3.940571560169328e-06, "loss": 0.87859166, "num_input_tokens_seen": 37837860, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.81640625, "step": 1754, "time_per_iteration": 2.383728265762329 }, { "auxiliary_loss_clip": 0.0110962, "auxiliary_loss_mlp": 0.01087991, "balance_loss_clip": 1.02969813, "balance_loss_mlp": 1.02937365, "epoch": 0.10551630843228618, "flos": 16142511110400.0, "grad_norm": 2.4053003192181555, "language_loss": 0.71640629, "learning_rate": 3.940477288533302e-06, "loss": 0.73838246, "num_input_tokens_seen": 37856260, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 0.80078125, "step": 1755, "time_per_iteration": 3.890162229537964 }, { "auxiliary_loss_clip": 0.01108059, "auxiliary_loss_mlp": 0.01085541, "balance_loss_clip": 1.02860713, "balance_loss_mlp": 1.02744937, "epoch": 0.10557643168495416, "flos": 23439009484800.0, "grad_norm": 2.3103513893585466, "language_loss": 0.79771608, "learning_rate": 3.940382943314182e-06, "loss": 0.81965208, "num_input_tokens_seen": 37876960, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.8046875, "step": 1756, "time_per_iteration": 2.418246269226074 }, { "auxiliary_loss_clip": 0.01105716, "auxiliary_loss_mlp": 0.01082168, "balance_loss_clip": 1.02807117, "balance_loss_mlp": 1.0261395, "epoch": 0.10563655493762213, "flos": 21797951938560.0, "grad_norm": 1.7831141851609307, "language_loss": 0.81864619, "learning_rate": 3.940288524515547e-06, "loss": 0.84052503, "num_input_tokens_seen": 37897070, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.796875, "step": 1757, "time_per_iteration": 3.865386962890625 }, { "auxiliary_loss_clip": 0.01104999, "auxiliary_loss_mlp": 0.01080074, "balance_loss_clip": 1.02199554, "balance_loss_mlp": 1.02627003, "epoch": 0.10569667819029009, "flos": 53798782625280.0, "grad_norm": 1.640322340769754, "language_loss": 0.80951768, "learning_rate": 3.940194032140976e-06, "loss": 0.83136839, "num_input_tokens_seen": 37923635, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 0.7890625, "step": 1758, "time_per_iteration": 4.0562357902526855 }, { "auxiliary_loss_clip": 0.01106725, "auxiliary_loss_mlp": 0.01078501, "balance_loss_clip": 1.02337837, "balance_loss_mlp": 1.02605987, "epoch": 0.10575680144295807, "flos": 22924529556480.0, "grad_norm": 1.7387165765084127, "language_loss": 0.93444455, "learning_rate": 3.940099466194054e-06, "loss": 0.9562968, "num_input_tokens_seen": 37942650, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.80859375, "step": 1759, "time_per_iteration": 2.4169657230377197 }, { "auxiliary_loss_clip": 0.011073, "auxiliary_loss_mlp": 0.0108322, "balance_loss_clip": 1.0261898, "balance_loss_mlp": 1.02522445, "epoch": 0.10581692469562604, "flos": 14135808228480.0, "grad_norm": 2.900651299958461, "language_loss": 0.7968415, "learning_rate": 3.940004826678365e-06, "loss": 0.81874669, "num_input_tokens_seen": 37960660, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.8203125, "step": 1760, "time_per_iteration": 2.3788199424743652 }, { "auxiliary_loss_clip": 0.01104948, "auxiliary_loss_mlp": 0.01076679, "balance_loss_clip": 1.01759946, "balance_loss_mlp": 1.02429938, "epoch": 0.105877047948294, "flos": 25957469208960.0, "grad_norm": 2.090556124116569, "language_loss": 0.92227876, "learning_rate": 3.939910113597498e-06, "loss": 0.94409502, "num_input_tokens_seen": 37978625, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 0.80859375, "step": 1761, "time_per_iteration": 2.4436850547790527 }, { "auxiliary_loss_clip": 0.0110351, "auxiliary_loss_mlp": 0.01090403, "balance_loss_clip": 1.03380227, "balance_loss_mlp": 1.02528715, "epoch": 0.10593717120096197, "flos": 30663447079680.0, "grad_norm": 1.905793487116791, "language_loss": 0.80323756, "learning_rate": 3.9398153269550464e-06, "loss": 0.82517666, "num_input_tokens_seen": 38000005, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.78125, "step": 1762, "time_per_iteration": 2.4932267665863037 }, { "auxiliary_loss_clip": 0.01029151, "auxiliary_loss_mlp": 0.01023788, "balance_loss_clip": 1.0159204, "balance_loss_mlp": 1.01038504, "epoch": 0.10599729445362994, "flos": 66432905055360.0, "grad_norm": 0.7872640798473728, "language_loss": 0.60670274, "learning_rate": 3.939720466754602e-06, "loss": 0.62723213, "num_input_tokens_seen": 38066165, "router_z_loss_clip": 0.07861328, "router_z_loss_mlp": 0.1875, "step": 1763, "time_per_iteration": 3.200045347213745 }, { "auxiliary_loss_clip": 0.01102062, "auxiliary_loss_mlp": 0.01076692, "balance_loss_clip": 1.02304816, "balance_loss_mlp": 1.02437401, "epoch": 0.10605741770629791, "flos": 23947135545600.0, "grad_norm": 1.7498856337444955, "language_loss": 0.82034898, "learning_rate": 3.939625532999763e-06, "loss": 0.8421365, "num_input_tokens_seen": 38086150, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 0.77734375, "step": 1764, "time_per_iteration": 2.4080810546875 }, { "auxiliary_loss_clip": 0.01101561, "auxiliary_loss_mlp": 0.01082916, "balance_loss_clip": 1.02564776, "balance_loss_mlp": 1.02468348, "epoch": 0.10611754095896588, "flos": 19386605865600.0, "grad_norm": 1.6836110639743302, "language_loss": 0.81733233, "learning_rate": 3.9395305256941314e-06, "loss": 0.83917707, "num_input_tokens_seen": 38104205, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.765625, "step": 1765, "time_per_iteration": 2.3842990398406982 }, { "auxiliary_loss_clip": 0.01100262, "auxiliary_loss_mlp": 0.01086566, "balance_loss_clip": 1.02963209, "balance_loss_mlp": 1.02377963, "epoch": 0.10617766421163385, "flos": 22236635571840.0, "grad_norm": 1.9145436044122668, "language_loss": 0.7815975, "learning_rate": 3.939435444841306e-06, "loss": 0.80346572, "num_input_tokens_seen": 38122005, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.765625, "step": 1766, "time_per_iteration": 2.391209602355957 }, { "auxiliary_loss_clip": 0.0110268, "auxiliary_loss_mlp": 0.01084346, "balance_loss_clip": 1.03144097, "balance_loss_mlp": 1.02505374, "epoch": 0.10623778746430182, "flos": 28403100103680.0, "grad_norm": 1.6850992759018937, "language_loss": 0.78949219, "learning_rate": 3.939340290444895e-06, "loss": 0.81136245, "num_input_tokens_seen": 38143365, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.7734375, "step": 1767, "time_per_iteration": 2.449960708618164 }, { "auxiliary_loss_clip": 0.01024884, "auxiliary_loss_mlp": 0.01018669, "balance_loss_clip": 1.00989532, "balance_loss_mlp": 1.00626087, "epoch": 0.10629791071696978, "flos": 64231282719360.0, "grad_norm": 0.7268573682080434, "language_loss": 0.58080262, "learning_rate": 3.939245062508506e-06, "loss": 0.60123819, "num_input_tokens_seen": 38210035, "router_z_loss_clip": 0.08789062, "router_z_loss_mlp": 0.18554688, "step": 1768, "time_per_iteration": 3.1312811374664307 }, { "auxiliary_loss_clip": 0.01102663, "auxiliary_loss_mlp": 0.01080966, "balance_loss_clip": 1.02636826, "balance_loss_mlp": 1.02605498, "epoch": 0.10635803396963776, "flos": 22746472289280.0, "grad_norm": 1.8171339001980529, "language_loss": 0.8795433, "learning_rate": 3.939149761035749e-06, "loss": 0.90137959, "num_input_tokens_seen": 38231230, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.765625, "step": 1769, "time_per_iteration": 2.4456729888916016 }, { "auxiliary_loss_clip": 0.01106284, "auxiliary_loss_mlp": 0.0108993, "balance_loss_clip": 1.02922893, "balance_loss_mlp": 1.02607942, "epoch": 0.10641815722230573, "flos": 31394214080640.0, "grad_norm": 1.9790553190782525, "language_loss": 0.63328922, "learning_rate": 3.9390543860302395e-06, "loss": 0.65525138, "num_input_tokens_seen": 38253890, "router_z_loss_clip": 0.60546875, "router_z_loss_mlp": 0.80078125, "step": 1770, "time_per_iteration": 2.4703547954559326 }, { "auxiliary_loss_clip": 0.01026773, "auxiliary_loss_mlp": 0.01018848, "balance_loss_clip": 1.01212454, "balance_loss_mlp": 1.00814915, "epoch": 0.1064782804749737, "flos": 58550077307520.0, "grad_norm": 0.8862819607823016, "language_loss": 0.57190061, "learning_rate": 3.9389589374955925e-06, "loss": 0.5923568, "num_input_tokens_seen": 38304290, "router_z_loss_clip": 0.06738281, "router_z_loss_mlp": 0.18554688, "step": 1771, "time_per_iteration": 2.936384439468384 }, { "auxiliary_loss_clip": 0.01102323, "auxiliary_loss_mlp": 0.01091041, "balance_loss_clip": 1.03372538, "balance_loss_mlp": 1.02577662, "epoch": 0.10653840372764166, "flos": 23986691982720.0, "grad_norm": 1.754576749718091, "language_loss": 0.9086321, "learning_rate": 3.938863415435429e-06, "loss": 0.93056571, "num_input_tokens_seen": 38324725, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.765625, "step": 1772, "time_per_iteration": 2.5058155059814453 }, { "auxiliary_loss_clip": 0.01107295, "auxiliary_loss_mlp": 0.01092642, "balance_loss_clip": 1.03051054, "balance_loss_mlp": 1.02517343, "epoch": 0.10659852698030964, "flos": 18293719576320.0, "grad_norm": 2.7009871763587276, "language_loss": 0.80025184, "learning_rate": 3.93876781985337e-06, "loss": 0.8222512, "num_input_tokens_seen": 38340735, "router_z_loss_clip": 0.62109375, "router_z_loss_mlp": 0.8203125, "step": 1773, "time_per_iteration": 2.407862663269043 }, { "auxiliary_loss_clip": 0.01104662, "auxiliary_loss_mlp": 0.01085684, "balance_loss_clip": 1.02822495, "balance_loss_mlp": 1.02659631, "epoch": 0.1066586502329776, "flos": 32159230992000.0, "grad_norm": 2.021912391354111, "language_loss": 0.8612777, "learning_rate": 3.938672150753041e-06, "loss": 0.88318115, "num_input_tokens_seen": 38361315, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.78125, "step": 1774, "time_per_iteration": 2.516582489013672 }, { "auxiliary_loss_clip": 0.01108602, "auxiliary_loss_mlp": 0.01079221, "balance_loss_clip": 1.02326417, "balance_loss_mlp": 1.0266552, "epoch": 0.10671877348564557, "flos": 17784197061120.0, "grad_norm": 2.738987551332105, "language_loss": 0.80211937, "learning_rate": 3.9385764081380704e-06, "loss": 0.82399762, "num_input_tokens_seen": 38377425, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.8203125, "step": 1775, "time_per_iteration": 2.428105592727661 }, { "auxiliary_loss_clip": 0.01025724, "auxiliary_loss_mlp": 0.01022127, "balance_loss_clip": 1.01492631, "balance_loss_mlp": 1.0059725, "epoch": 0.10677889673831355, "flos": 63506695029120.0, "grad_norm": 0.8426283761754608, "language_loss": 0.57612145, "learning_rate": 3.9384805920120876e-06, "loss": 0.59659994, "num_input_tokens_seen": 38440275, "router_z_loss_clip": 0.07177734, "router_z_loss_mlp": 0.19726562, "step": 1776, "time_per_iteration": 3.0867412090301514 }, { "auxiliary_loss_clip": 0.01104121, "auxiliary_loss_mlp": 0.0108124, "balance_loss_clip": 1.02392364, "balance_loss_mlp": 1.0284574, "epoch": 0.10683901999098151, "flos": 22016612983680.0, "grad_norm": 1.5670006323144838, "language_loss": 0.84922236, "learning_rate": 3.938384702378727e-06, "loss": 0.87107599, "num_input_tokens_seen": 38461820, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.75390625, "step": 1777, "time_per_iteration": 2.4592878818511963 }, { "auxiliary_loss_clip": 0.01105015, "auxiliary_loss_mlp": 0.01077855, "balance_loss_clip": 1.02275634, "balance_loss_mlp": 1.02937341, "epoch": 0.10689914324364948, "flos": 25041872136960.0, "grad_norm": 2.253557687486277, "language_loss": 0.89016473, "learning_rate": 3.938288739241625e-06, "loss": 0.91199344, "num_input_tokens_seen": 38482235, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.7578125, "step": 1778, "time_per_iteration": 2.4878475666046143 }, { "auxiliary_loss_clip": 0.01110961, "auxiliary_loss_mlp": 0.01091808, "balance_loss_clip": 1.03527892, "balance_loss_mlp": 1.03283834, "epoch": 0.10695926649631746, "flos": 16434210453120.0, "grad_norm": 2.3985440398511293, "language_loss": 0.86333096, "learning_rate": 3.938192702604417e-06, "loss": 0.88535857, "num_input_tokens_seen": 38500690, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.78125, "step": 1779, "time_per_iteration": 2.4623663425445557 }, { "auxiliary_loss_clip": 0.01105912, "auxiliary_loss_mlp": 0.01076924, "balance_loss_clip": 1.02525902, "balance_loss_mlp": 1.03076243, "epoch": 0.10701938974898542, "flos": 16978366903680.0, "grad_norm": 1.9241107119409386, "language_loss": 0.68562496, "learning_rate": 3.9380965924707495e-06, "loss": 0.70745331, "num_input_tokens_seen": 38518405, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.75, "step": 1780, "time_per_iteration": 2.430586099624634 }, { "auxiliary_loss_clip": 0.01108171, "auxiliary_loss_mlp": 0.01081465, "balance_loss_clip": 1.02619958, "balance_loss_mlp": 1.03187001, "epoch": 0.10707951300165339, "flos": 15887191271040.0, "grad_norm": 2.2635731432969513, "language_loss": 0.94581503, "learning_rate": 3.938000408844265e-06, "loss": 0.96771145, "num_input_tokens_seen": 38535060, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.76171875, "step": 1781, "time_per_iteration": 2.4462666511535645 }, { "auxiliary_loss_clip": 0.01112088, "auxiliary_loss_mlp": 0.01091707, "balance_loss_clip": 1.0363698, "balance_loss_mlp": 1.03361332, "epoch": 0.10713963625432135, "flos": 14246273370240.0, "grad_norm": 1.9007409248390699, "language_loss": 0.80847782, "learning_rate": 3.9379041517286105e-06, "loss": 0.8305158, "num_input_tokens_seen": 38552855, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.78515625, "step": 1782, "time_per_iteration": 2.431711196899414 }, { "auxiliary_loss_clip": 0.01112488, "auxiliary_loss_mlp": 0.01092532, "balance_loss_clip": 1.029351, "balance_loss_mlp": 1.03159404, "epoch": 0.10719975950698933, "flos": 16756040165760.0, "grad_norm": 2.5355711094729965, "language_loss": 0.81853294, "learning_rate": 3.937807821127436e-06, "loss": 0.84058315, "num_input_tokens_seen": 38570075, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.80859375, "step": 1783, "time_per_iteration": 2.466233730316162 }, { "auxiliary_loss_clip": 0.0110944, "auxiliary_loss_mlp": 0.01086229, "balance_loss_clip": 1.03072476, "balance_loss_mlp": 1.02974701, "epoch": 0.1072598827596573, "flos": 22709534204160.0, "grad_norm": 2.2699161573974274, "language_loss": 0.89580917, "learning_rate": 3.937711417044395e-06, "loss": 0.91776586, "num_input_tokens_seen": 38587970, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.796875, "step": 1784, "time_per_iteration": 2.4875853061676025 }, { "auxiliary_loss_clip": 0.01109738, "auxiliary_loss_mlp": 0.01101137, "balance_loss_clip": 1.0390532, "balance_loss_mlp": 1.03087699, "epoch": 0.10732000601232526, "flos": 23257146879360.0, "grad_norm": 2.4666659234288852, "language_loss": 1.04631567, "learning_rate": 3.937614939483143e-06, "loss": 1.06842446, "num_input_tokens_seen": 38605840, "router_z_loss_clip": 0.62109375, "router_z_loss_mlp": 0.7890625, "step": 1785, "time_per_iteration": 2.463488817214966 }, { "auxiliary_loss_clip": 0.01107039, "auxiliary_loss_mlp": 0.01091889, "balance_loss_clip": 1.03803027, "balance_loss_mlp": 1.03054619, "epoch": 0.10738012926499324, "flos": 24205911609600.0, "grad_norm": 1.361971881963877, "language_loss": 0.85996342, "learning_rate": 3.937518388447339e-06, "loss": 0.8819527, "num_input_tokens_seen": 38627070, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.765625, "step": 1786, "time_per_iteration": 2.4628236293792725 }, { "auxiliary_loss_clip": 0.01108516, "auxiliary_loss_mlp": 0.01089435, "balance_loss_clip": 1.02954459, "balance_loss_mlp": 1.02760863, "epoch": 0.1074402525176612, "flos": 20922016037760.0, "grad_norm": 1.875708549236498, "language_loss": 0.80072117, "learning_rate": 3.937421763940642e-06, "loss": 0.82270074, "num_input_tokens_seen": 38645840, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.80859375, "step": 1787, "time_per_iteration": 2.4292001724243164 }, { "auxiliary_loss_clip": 0.01108543, "auxiliary_loss_mlp": 0.01093566, "balance_loss_clip": 1.03176785, "balance_loss_mlp": 1.02777445, "epoch": 0.10750037577032917, "flos": 16945967295360.0, "grad_norm": 1.7860093625280076, "language_loss": 0.86416298, "learning_rate": 3.937325065966719e-06, "loss": 0.8861841, "num_input_tokens_seen": 38664770, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.80859375, "step": 1788, "time_per_iteration": 2.399644136428833 }, { "auxiliary_loss_clip": 0.01103189, "auxiliary_loss_mlp": 0.01085771, "balance_loss_clip": 1.03060114, "balance_loss_mlp": 1.02529407, "epoch": 0.10756049902299715, "flos": 20265509232000.0, "grad_norm": 2.053981299260295, "language_loss": 0.80566937, "learning_rate": 3.9372282945292335e-06, "loss": 0.82755888, "num_input_tokens_seen": 38683865, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.78125, "step": 1789, "time_per_iteration": 2.430898666381836 }, { "auxiliary_loss_clip": 0.0110825, "auxiliary_loss_mlp": 0.01097361, "balance_loss_clip": 1.03241563, "balance_loss_mlp": 1.02742267, "epoch": 0.10762062227566511, "flos": 23585400282240.0, "grad_norm": 2.5226822089913132, "language_loss": 0.78892046, "learning_rate": 3.937131449631859e-06, "loss": 0.81097662, "num_input_tokens_seen": 38702485, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 0.80859375, "step": 1790, "time_per_iteration": 2.4127004146575928 }, { "auxiliary_loss_clip": 0.01106577, "auxiliary_loss_mlp": 0.01104172, "balance_loss_clip": 1.03889334, "balance_loss_mlp": 1.02771962, "epoch": 0.10768074552833308, "flos": 24309638858880.0, "grad_norm": 2.1810887277185493, "language_loss": 0.8067987, "learning_rate": 3.9370345312782645e-06, "loss": 0.82890618, "num_input_tokens_seen": 38722475, "router_z_loss_clip": 0.65625, "router_z_loss_mlp": 0.7890625, "step": 1791, "time_per_iteration": 2.4471685886383057 }, { "auxiliary_loss_clip": 0.01099337, "auxiliary_loss_mlp": 0.01071904, "balance_loss_clip": 1.01885617, "balance_loss_mlp": 1.02518487, "epoch": 0.10774086878100106, "flos": 25298832810240.0, "grad_norm": 1.8081157187861745, "language_loss": 0.73336768, "learning_rate": 3.936937539472126e-06, "loss": 0.7550801, "num_input_tokens_seen": 38743285, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.7421875, "step": 1792, "time_per_iteration": 2.4370737075805664 }, { "auxiliary_loss_clip": 0.01105259, "auxiliary_loss_mlp": 0.01084122, "balance_loss_clip": 1.01989162, "balance_loss_mlp": 1.02604699, "epoch": 0.10780099203366902, "flos": 22052957575680.0, "grad_norm": 1.7162053470356557, "language_loss": 0.78707123, "learning_rate": 3.9368404742171236e-06, "loss": 0.80896509, "num_input_tokens_seen": 38763035, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 0.7890625, "step": 1793, "time_per_iteration": 2.429750680923462 }, { "auxiliary_loss_clip": 0.01103944, "auxiliary_loss_mlp": 0.01080391, "balance_loss_clip": 1.02774811, "balance_loss_mlp": 1.0294826, "epoch": 0.10786111528633699, "flos": 22746367555200.0, "grad_norm": 2.0704457453356135, "language_loss": 0.86846381, "learning_rate": 3.936743335516936e-06, "loss": 0.89030719, "num_input_tokens_seen": 38784900, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.74609375, "step": 1794, "time_per_iteration": 2.4511470794677734 }, { "auxiliary_loss_clip": 0.01113186, "auxiliary_loss_mlp": 0.01085709, "balance_loss_clip": 1.0249362, "balance_loss_mlp": 1.03084242, "epoch": 0.10792123853900495, "flos": 20849990169600.0, "grad_norm": 1.6622183139610038, "language_loss": 0.77430236, "learning_rate": 3.936646123375246e-06, "loss": 0.79629123, "num_input_tokens_seen": 38804695, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.82421875, "step": 1795, "time_per_iteration": 3.8470277786254883 }, { "auxiliary_loss_clip": 0.01111658, "auxiliary_loss_mlp": 0.0108933, "balance_loss_clip": 1.02624404, "balance_loss_mlp": 1.02906227, "epoch": 0.10798136179167293, "flos": 17747747735040.0, "grad_norm": 2.43360129851574, "language_loss": 0.85200572, "learning_rate": 3.936548837795741e-06, "loss": 0.87401557, "num_input_tokens_seen": 38822395, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.82421875, "step": 1796, "time_per_iteration": 2.391935348510742 }, { "auxiliary_loss_clip": 0.01112981, "auxiliary_loss_mlp": 0.01093153, "balance_loss_clip": 1.03331017, "balance_loss_mlp": 1.03083777, "epoch": 0.1080414850443409, "flos": 13588789046400.0, "grad_norm": 2.3434917788046796, "language_loss": 0.77301693, "learning_rate": 3.936451478782111e-06, "loss": 0.79507822, "num_input_tokens_seen": 38839865, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.8203125, "step": 1797, "time_per_iteration": 6.6293559074401855 }, { "auxiliary_loss_clip": 0.0111112, "auxiliary_loss_mlp": 0.01081353, "balance_loss_clip": 1.02563453, "balance_loss_mlp": 1.02958536, "epoch": 0.10810160829700886, "flos": 16252487493120.0, "grad_norm": 2.120633774622022, "language_loss": 0.83814836, "learning_rate": 3.936354046338046e-06, "loss": 0.86007309, "num_input_tokens_seen": 38857300, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.81640625, "step": 1798, "time_per_iteration": 2.382735013961792 }, { "auxiliary_loss_clip": 0.01110223, "auxiliary_loss_mlp": 0.01084513, "balance_loss_clip": 1.02905643, "balance_loss_mlp": 1.02816665, "epoch": 0.10816173154967684, "flos": 15157122497280.0, "grad_norm": 2.490878376981136, "language_loss": 0.89662349, "learning_rate": 3.936256540467242e-06, "loss": 0.91857082, "num_input_tokens_seen": 38874960, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.8203125, "step": 1799, "time_per_iteration": 2.3897128105163574 }, { "auxiliary_loss_clip": 0.0110706, "auxiliary_loss_mlp": 0.01087878, "balance_loss_clip": 1.03144479, "balance_loss_mlp": 1.02830184, "epoch": 0.10822185480234481, "flos": 17784371617920.0, "grad_norm": 2.005832505660918, "language_loss": 0.8002528, "learning_rate": 3.9361589611733955e-06, "loss": 0.82220221, "num_input_tokens_seen": 38893610, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.78515625, "step": 1800, "time_per_iteration": 2.392831563949585 }, { "auxiliary_loss_clip": 0.01104785, "auxiliary_loss_mlp": 0.01083678, "balance_loss_clip": 1.0259093, "balance_loss_mlp": 1.0265398, "epoch": 0.10828197805501277, "flos": 25555479281280.0, "grad_norm": 1.585579711025779, "language_loss": 0.7437495, "learning_rate": 3.9360613084602075e-06, "loss": 0.76563412, "num_input_tokens_seen": 38913485, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.78515625, "step": 1801, "time_per_iteration": 2.45788836479187 }, { "auxiliary_loss_clip": 0.01109108, "auxiliary_loss_mlp": 0.01082036, "balance_loss_clip": 1.02548313, "balance_loss_mlp": 1.02760434, "epoch": 0.10834210130768075, "flos": 28983217121280.0, "grad_norm": 2.0517478025170517, "language_loss": 0.68840164, "learning_rate": 3.935963582331381e-06, "loss": 0.71031308, "num_input_tokens_seen": 38935650, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.8125, "step": 1802, "time_per_iteration": 2.458113670349121 }, { "auxiliary_loss_clip": 0.01103922, "auxiliary_loss_mlp": 0.01092844, "balance_loss_clip": 1.03433609, "balance_loss_mlp": 1.02603364, "epoch": 0.10840222456034872, "flos": 20263239993600.0, "grad_norm": 1.68790812593393, "language_loss": 0.83659798, "learning_rate": 3.935865782790621e-06, "loss": 0.85856569, "num_input_tokens_seen": 38954130, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.78125, "step": 1803, "time_per_iteration": 2.4181807041168213 }, { "auxiliary_loss_clip": 0.01101154, "auxiliary_loss_mlp": 0.01079098, "balance_loss_clip": 1.02402294, "balance_loss_mlp": 1.02442694, "epoch": 0.10846234781301668, "flos": 19862087938560.0, "grad_norm": 1.6038514852205523, "language_loss": 0.92416739, "learning_rate": 3.9357679098416365e-06, "loss": 0.94596988, "num_input_tokens_seen": 38972905, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.765625, "step": 1804, "time_per_iteration": 2.3894455432891846 }, { "auxiliary_loss_clip": 0.01103263, "auxiliary_loss_mlp": 0.01076114, "balance_loss_clip": 1.01891756, "balance_loss_mlp": 1.02494025, "epoch": 0.10852247106568465, "flos": 26467829596800.0, "grad_norm": 1.9022023289830055, "language_loss": 0.78340149, "learning_rate": 3.935669963488139e-06, "loss": 0.80519527, "num_input_tokens_seen": 38993255, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.78125, "step": 1805, "time_per_iteration": 2.4441189765930176 }, { "auxiliary_loss_clip": 0.0109866, "auxiliary_loss_mlp": 0.01071766, "balance_loss_clip": 1.02238917, "balance_loss_mlp": 1.02457011, "epoch": 0.10858259431835263, "flos": 30080188039680.0, "grad_norm": 1.7681877395895296, "language_loss": 0.87335479, "learning_rate": 3.935571943733843e-06, "loss": 0.89505905, "num_input_tokens_seen": 39012610, "router_z_loss_clip": 0.49414062, "router_z_loss_mlp": 0.7421875, "step": 1806, "time_per_iteration": 2.4484691619873047 }, { "auxiliary_loss_clip": 0.01105445, "auxiliary_loss_mlp": 0.0108059, "balance_loss_clip": 1.02589726, "balance_loss_mlp": 1.02533329, "epoch": 0.10864271757102059, "flos": 19062157800960.0, "grad_norm": 2.097358810791154, "language_loss": 0.82492584, "learning_rate": 3.9354738505824635e-06, "loss": 0.84678614, "num_input_tokens_seen": 39030120, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.80078125, "step": 1807, "time_per_iteration": 2.3942699432373047 }, { "auxiliary_loss_clip": 0.01100054, "auxiliary_loss_mlp": 0.0107129, "balance_loss_clip": 1.02222371, "balance_loss_mlp": 1.02492642, "epoch": 0.10870284082368856, "flos": 24713967847680.0, "grad_norm": 1.7698734313011755, "language_loss": 0.80686378, "learning_rate": 3.9353756840377225e-06, "loss": 0.82857722, "num_input_tokens_seen": 39049875, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 0.75390625, "step": 1808, "time_per_iteration": 2.4194867610931396 }, { "auxiliary_loss_clip": 0.01103773, "auxiliary_loss_mlp": 0.01075431, "balance_loss_clip": 1.02428973, "balance_loss_mlp": 1.02712774, "epoch": 0.10876296407635654, "flos": 20626720824960.0, "grad_norm": 1.728962795467947, "language_loss": 0.80291831, "learning_rate": 3.935277444103342e-06, "loss": 0.82471037, "num_input_tokens_seen": 39068935, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.765625, "step": 1809, "time_per_iteration": 2.416457414627075 }, { "auxiliary_loss_clip": 0.01101859, "auxiliary_loss_mlp": 0.01074955, "balance_loss_clip": 1.02579308, "balance_loss_mlp": 1.02672076, "epoch": 0.1088230873290245, "flos": 21578767223040.0, "grad_norm": 2.060769033204955, "language_loss": 0.88350594, "learning_rate": 3.935179130783046e-06, "loss": 0.90527409, "num_input_tokens_seen": 39087370, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 0.75, "step": 1810, "time_per_iteration": 2.3957698345184326 }, { "auxiliary_loss_clip": 0.01111289, "auxiliary_loss_mlp": 0.0108179, "balance_loss_clip": 1.02595186, "balance_loss_mlp": 1.02955151, "epoch": 0.10888321058169247, "flos": 26467829596800.0, "grad_norm": 2.4125826690210683, "language_loss": 0.65641016, "learning_rate": 3.935080744080564e-06, "loss": 0.67834091, "num_input_tokens_seen": 39106635, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.81640625, "step": 1811, "time_per_iteration": 2.448526620864868 }, { "auxiliary_loss_clip": 0.01103683, "auxiliary_loss_mlp": 0.01084385, "balance_loss_clip": 1.03255248, "balance_loss_mlp": 1.02673995, "epoch": 0.10894333383436045, "flos": 25847423003520.0, "grad_norm": 1.8778586847031504, "language_loss": 0.76418281, "learning_rate": 3.934982283999626e-06, "loss": 0.78606355, "num_input_tokens_seen": 39126335, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.76953125, "step": 1812, "time_per_iteration": 2.432039737701416 }, { "auxiliary_loss_clip": 0.01102738, "auxiliary_loss_mlp": 0.01072442, "balance_loss_clip": 1.0257597, "balance_loss_mlp": 1.02623367, "epoch": 0.10900345708702841, "flos": 19536068862720.0, "grad_norm": 2.7185472842113785, "language_loss": 0.74803507, "learning_rate": 3.934883750543966e-06, "loss": 0.76978689, "num_input_tokens_seen": 39144820, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 0.765625, "step": 1813, "time_per_iteration": 2.381136417388916 }, { "auxiliary_loss_clip": 0.01100547, "auxiliary_loss_mlp": 0.01074623, "balance_loss_clip": 1.02879846, "balance_loss_mlp": 1.02725816, "epoch": 0.10906358033969638, "flos": 23622163810560.0, "grad_norm": 1.8292889523661957, "language_loss": 0.84670591, "learning_rate": 3.93478514371732e-06, "loss": 0.86845762, "num_input_tokens_seen": 39165945, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.734375, "step": 1814, "time_per_iteration": 2.5078063011169434 }, { "auxiliary_loss_clip": 0.01104288, "auxiliary_loss_mlp": 0.01076016, "balance_loss_clip": 1.02795124, "balance_loss_mlp": 1.02705216, "epoch": 0.10912370359236434, "flos": 21213680469120.0, "grad_norm": 2.4664526514741794, "language_loss": 0.86996579, "learning_rate": 3.934686463523429e-06, "loss": 0.89176887, "num_input_tokens_seen": 39183520, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.7734375, "step": 1815, "time_per_iteration": 2.4300730228424072 }, { "auxiliary_loss_clip": 0.01100588, "auxiliary_loss_mlp": 0.0107573, "balance_loss_clip": 1.02559042, "balance_loss_mlp": 1.02637064, "epoch": 0.10918382684503232, "flos": 13552339720320.0, "grad_norm": 2.1726986316347983, "language_loss": 0.75033742, "learning_rate": 3.9345877099660315e-06, "loss": 0.77210057, "num_input_tokens_seen": 39201190, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.7421875, "step": 1816, "time_per_iteration": 2.393874406814575 }, { "auxiliary_loss_clip": 0.01104792, "auxiliary_loss_mlp": 0.01078006, "balance_loss_clip": 1.02567279, "balance_loss_mlp": 1.02782249, "epoch": 0.10924395009770028, "flos": 27963089838720.0, "grad_norm": 2.5136614161580138, "language_loss": 0.7655099, "learning_rate": 3.9344888830488744e-06, "loss": 0.7873379, "num_input_tokens_seen": 39221210, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.76953125, "step": 1817, "time_per_iteration": 2.4470858573913574 }, { "auxiliary_loss_clip": 0.01100335, "auxiliary_loss_mlp": 0.010728, "balance_loss_clip": 1.02285075, "balance_loss_mlp": 1.02527452, "epoch": 0.10930407335036825, "flos": 25592557011840.0, "grad_norm": 1.8053648135546518, "language_loss": 0.69606841, "learning_rate": 3.934389982775706e-06, "loss": 0.71779972, "num_input_tokens_seen": 39242025, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.75, "step": 1818, "time_per_iteration": 2.4431748390197754 }, { "auxiliary_loss_clip": 0.0110365, "auxiliary_loss_mlp": 0.01086034, "balance_loss_clip": 1.03539348, "balance_loss_mlp": 1.02706945, "epoch": 0.10936419660303623, "flos": 18405197147520.0, "grad_norm": 2.047310951877223, "language_loss": 0.75600147, "learning_rate": 3.934291009150275e-06, "loss": 0.77789837, "num_input_tokens_seen": 39259870, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.765625, "step": 1819, "time_per_iteration": 2.38191294670105 }, { "auxiliary_loss_clip": 0.01099158, "auxiliary_loss_mlp": 0.01071375, "balance_loss_clip": 1.02378702, "balance_loss_mlp": 1.02459538, "epoch": 0.1094243198557042, "flos": 23838974553600.0, "grad_norm": 2.279546498192579, "language_loss": 0.7643944, "learning_rate": 3.934191962176335e-06, "loss": 0.78609967, "num_input_tokens_seen": 39278500, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.74609375, "step": 1820, "time_per_iteration": 2.446497678756714 }, { "auxiliary_loss_clip": 0.01098986, "auxiliary_loss_mlp": 0.0107538, "balance_loss_clip": 1.02445364, "balance_loss_mlp": 1.02423286, "epoch": 0.10948444310837216, "flos": 14643166239360.0, "grad_norm": 2.3469975415300772, "language_loss": 0.843799, "learning_rate": 3.934092841857642e-06, "loss": 0.86554271, "num_input_tokens_seen": 39294800, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.74609375, "step": 1821, "time_per_iteration": 2.3553030490875244 }, { "auxiliary_loss_clip": 0.01096828, "auxiliary_loss_mlp": 0.01064266, "balance_loss_clip": 1.01682115, "balance_loss_mlp": 1.0230161, "epoch": 0.10954456636104014, "flos": 27817571825280.0, "grad_norm": 1.8614427716209658, "language_loss": 0.78595269, "learning_rate": 3.933993648197955e-06, "loss": 0.80756366, "num_input_tokens_seen": 39314625, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.7421875, "step": 1822, "time_per_iteration": 2.4424335956573486 }, { "auxiliary_loss_clip": 0.01096434, "auxiliary_loss_mlp": 0.0106469, "balance_loss_clip": 1.01767373, "balance_loss_mlp": 1.02380204, "epoch": 0.1096046896137081, "flos": 33619508184960.0, "grad_norm": 1.7277554588408512, "language_loss": 0.81826091, "learning_rate": 3.933894381201034e-06, "loss": 0.83987218, "num_input_tokens_seen": 39336465, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 0.7265625, "step": 1823, "time_per_iteration": 2.485976219177246 }, { "auxiliary_loss_clip": 0.01097882, "auxiliary_loss_mlp": 0.01064804, "balance_loss_clip": 1.01998162, "balance_loss_mlp": 1.02550673, "epoch": 0.10966481286637607, "flos": 26978783477760.0, "grad_norm": 1.5270541878701929, "language_loss": 0.80978745, "learning_rate": 3.933795040870645e-06, "loss": 0.8314144, "num_input_tokens_seen": 39357930, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.7265625, "step": 1824, "time_per_iteration": 2.44006609916687 }, { "auxiliary_loss_clip": 0.01096874, "auxiliary_loss_mlp": 0.01068259, "balance_loss_clip": 1.02169585, "balance_loss_mlp": 1.02444458, "epoch": 0.10972493611904403, "flos": 23035518368640.0, "grad_norm": 1.8924730153705107, "language_loss": 0.8929252, "learning_rate": 3.933695627210554e-06, "loss": 0.91457653, "num_input_tokens_seen": 39376380, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 0.7265625, "step": 1825, "time_per_iteration": 2.404723882675171 }, { "auxiliary_loss_clip": 0.01095236, "auxiliary_loss_mlp": 0.01070128, "balance_loss_clip": 1.02354097, "balance_loss_mlp": 1.02390122, "epoch": 0.10978505937171201, "flos": 38103194229120.0, "grad_norm": 1.9663823765769586, "language_loss": 0.77945828, "learning_rate": 3.933596140224532e-06, "loss": 0.80111194, "num_input_tokens_seen": 39399935, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 0.7109375, "step": 1826, "time_per_iteration": 2.5327086448669434 }, { "auxiliary_loss_clip": 0.01028265, "auxiliary_loss_mlp": 0.01012016, "balance_loss_clip": 1.00524461, "balance_loss_mlp": 1.00821185, "epoch": 0.10984518262437998, "flos": 59846645802240.0, "grad_norm": 0.8678077399200091, "language_loss": 0.55114448, "learning_rate": 3.93349657991635e-06, "loss": 0.57154727, "num_input_tokens_seen": 39460685, "router_z_loss_clip": 0.06787109, "router_z_loss_mlp": 0.20117188, "step": 1827, "time_per_iteration": 3.0208489894866943 }, { "auxiliary_loss_clip": 0.01028251, "auxiliary_loss_mlp": 0.01009823, "balance_loss_clip": 1.00324261, "balance_loss_mlp": 1.00909328, "epoch": 0.10990530587704794, "flos": 66716295494400.0, "grad_norm": 0.7487388422310868, "language_loss": 0.55467093, "learning_rate": 3.933396946289784e-06, "loss": 0.57505167, "num_input_tokens_seen": 39524765, "router_z_loss_clip": 0.06591797, "router_z_loss_mlp": 0.19140625, "step": 1828, "time_per_iteration": 3.0644068717956543 }, { "auxiliary_loss_clip": 0.01103204, "auxiliary_loss_mlp": 0.01074077, "balance_loss_clip": 1.02439022, "balance_loss_mlp": 1.0261848, "epoch": 0.10996542912971592, "flos": 25446026568960.0, "grad_norm": 2.357445326420788, "language_loss": 0.86911714, "learning_rate": 3.933297239348612e-06, "loss": 0.89088994, "num_input_tokens_seen": 39543640, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.76953125, "step": 1829, "time_per_iteration": 2.4586398601531982 }, { "auxiliary_loss_clip": 0.01101747, "auxiliary_loss_mlp": 0.01088642, "balance_loss_clip": 1.04150617, "balance_loss_mlp": 1.02706814, "epoch": 0.11002555238238389, "flos": 44016503425920.0, "grad_norm": 1.778390409450466, "language_loss": 0.90687287, "learning_rate": 3.933197459096614e-06, "loss": 0.92877674, "num_input_tokens_seen": 39567525, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.74609375, "step": 1830, "time_per_iteration": 2.5847115516662598 }, { "auxiliary_loss_clip": 0.01026855, "auxiliary_loss_mlp": 0.01016094, "balance_loss_clip": 1.01013315, "balance_loss_mlp": 1.00797558, "epoch": 0.11008567563505185, "flos": 54061781097600.0, "grad_norm": 0.7792404114377771, "language_loss": 0.55553281, "learning_rate": 3.9330976055375756e-06, "loss": 0.57596231, "num_input_tokens_seen": 39628470, "router_z_loss_clip": 0.05957031, "router_z_loss_mlp": 0.18945312, "step": 1831, "time_per_iteration": 3.023843288421631 }, { "auxiliary_loss_clip": 0.01103685, "auxiliary_loss_mlp": 0.01082983, "balance_loss_clip": 1.03799391, "balance_loss_mlp": 1.02574992, "epoch": 0.11014579888771983, "flos": 24242011822080.0, "grad_norm": 2.0010826920105207, "language_loss": 0.93781084, "learning_rate": 3.932997678675282e-06, "loss": 0.95967752, "num_input_tokens_seen": 39646670, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.77734375, "step": 1832, "time_per_iteration": 2.437107801437378 }, { "auxiliary_loss_clip": 0.01021277, "auxiliary_loss_mlp": 0.01016282, "balance_loss_clip": 1.00932062, "balance_loss_mlp": 1.00323629, "epoch": 0.1102059221403878, "flos": 57740684325120.0, "grad_norm": 0.7346182357216329, "language_loss": 0.60079539, "learning_rate": 3.932897678513523e-06, "loss": 0.621171, "num_input_tokens_seen": 39712915, "router_z_loss_clip": 0.06982422, "router_z_loss_mlp": 0.1796875, "step": 1833, "time_per_iteration": 3.0854196548461914 }, { "auxiliary_loss_clip": 0.01095056, "auxiliary_loss_mlp": 0.01083554, "balance_loss_clip": 1.03744376, "balance_loss_mlp": 1.02258062, "epoch": 0.11026604539305576, "flos": 16795107843840.0, "grad_norm": 3.1958613139616094, "language_loss": 0.84289914, "learning_rate": 3.93279760505609e-06, "loss": 0.86468518, "num_input_tokens_seen": 39730650, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.7265625, "step": 1834, "time_per_iteration": 3.814936876296997 }, { "auxiliary_loss_clip": 0.01104762, "auxiliary_loss_mlp": 0.01082172, "balance_loss_clip": 1.03160393, "balance_loss_mlp": 1.0299623, "epoch": 0.11032616864572373, "flos": 23986936362240.0, "grad_norm": 2.4030834456948242, "language_loss": 0.93920034, "learning_rate": 3.932697458306779e-06, "loss": 0.9610697, "num_input_tokens_seen": 39751065, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.75, "step": 1835, "time_per_iteration": 2.431258201599121 }, { "auxiliary_loss_clip": 0.01101725, "auxiliary_loss_mlp": 0.01077761, "balance_loss_clip": 1.02962446, "balance_loss_mlp": 1.02722776, "epoch": 0.1103862918983917, "flos": 19682110546560.0, "grad_norm": 2.1275767017001606, "language_loss": 0.67645168, "learning_rate": 3.932597238269386e-06, "loss": 0.6982466, "num_input_tokens_seen": 39769245, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.74609375, "step": 1836, "time_per_iteration": 3.8317441940307617 }, { "auxiliary_loss_clip": 0.01100204, "auxiliary_loss_mlp": 0.01072603, "balance_loss_clip": 1.02692163, "balance_loss_mlp": 1.02717841, "epoch": 0.11044641515105967, "flos": 32159510282880.0, "grad_norm": 2.145617527556642, "language_loss": 0.75259566, "learning_rate": 3.932496944947711e-06, "loss": 0.7743237, "num_input_tokens_seen": 39790830, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.73046875, "step": 1837, "time_per_iteration": 4.054734468460083 }, { "auxiliary_loss_clip": 0.01104348, "auxiliary_loss_mlp": 0.01065398, "balance_loss_clip": 1.02143335, "balance_loss_mlp": 1.02987385, "epoch": 0.11050653840372764, "flos": 16688343306240.0, "grad_norm": 2.1565137843797135, "language_loss": 0.80223441, "learning_rate": 3.93239657834556e-06, "loss": 0.82393193, "num_input_tokens_seen": 39809475, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 0.74609375, "step": 1838, "time_per_iteration": 2.3821582794189453 }, { "auxiliary_loss_clip": 0.01104585, "auxiliary_loss_mlp": 0.01076735, "balance_loss_clip": 1.02993393, "balance_loss_mlp": 1.03211856, "epoch": 0.11056666165639562, "flos": 21207989917440.0, "grad_norm": 2.0813863834827484, "language_loss": 0.72923666, "learning_rate": 3.932296138466736e-06, "loss": 0.75104982, "num_input_tokens_seen": 39826355, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 0.72265625, "step": 1839, "time_per_iteration": 2.420905590057373 }, { "auxiliary_loss_clip": 0.01110475, "auxiliary_loss_mlp": 0.01065261, "balance_loss_clip": 1.01755381, "balance_loss_mlp": 1.03310728, "epoch": 0.11062678490906358, "flos": 19164663152640.0, "grad_norm": 2.4166196259860775, "language_loss": 0.81130153, "learning_rate": 3.93219562531505e-06, "loss": 0.83305889, "num_input_tokens_seen": 39845335, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.7734375, "step": 1840, "time_per_iteration": 2.407621145248413 }, { "auxiliary_loss_clip": 0.01105426, "auxiliary_loss_mlp": 0.01061593, "balance_loss_clip": 1.01839209, "balance_loss_mlp": 1.03393877, "epoch": 0.11068690816173155, "flos": 24894259441920.0, "grad_norm": 2.0426768268797812, "language_loss": 0.89381814, "learning_rate": 3.932095038894311e-06, "loss": 0.91548836, "num_input_tokens_seen": 39865065, "router_z_loss_clip": 0.43164062, "router_z_loss_mlp": 0.71484375, "step": 1841, "time_per_iteration": 2.4570319652557373 }, { "auxiliary_loss_clip": 0.01105203, "auxiliary_loss_mlp": 0.01058487, "balance_loss_clip": 1.01416469, "balance_loss_mlp": 1.03446412, "epoch": 0.11074703141439952, "flos": 16471427829120.0, "grad_norm": 1.8489573155321657, "language_loss": 0.93145823, "learning_rate": 3.931994379208334e-06, "loss": 0.95309508, "num_input_tokens_seen": 39882780, "router_z_loss_clip": 0.44335938, "router_z_loss_mlp": 0.703125, "step": 1842, "time_per_iteration": 2.3950109481811523 }, { "auxiliary_loss_clip": 0.01107564, "auxiliary_loss_mlp": 0.01063946, "balance_loss_clip": 1.02043509, "balance_loss_mlp": 1.0356915, "epoch": 0.11080715466706749, "flos": 19171401045120.0, "grad_norm": 1.7782385739968325, "language_loss": 0.8774333, "learning_rate": 3.931893646260937e-06, "loss": 0.89914834, "num_input_tokens_seen": 39900295, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 0.71875, "step": 1843, "time_per_iteration": 2.424804210662842 }, { "auxiliary_loss_clip": 0.011083, "auxiliary_loss_mlp": 0.01073619, "balance_loss_clip": 1.0271517, "balance_loss_mlp": 1.03437555, "epoch": 0.11086727791973545, "flos": 27703580636160.0, "grad_norm": 1.673495552347332, "language_loss": 0.76399124, "learning_rate": 3.931792840055941e-06, "loss": 0.78581047, "num_input_tokens_seen": 39922075, "router_z_loss_clip": 0.46289062, "router_z_loss_mlp": 0.73828125, "step": 1844, "time_per_iteration": 2.5054471492767334 }, { "auxiliary_loss_clip": 0.01107511, "auxiliary_loss_mlp": 0.01067027, "balance_loss_clip": 1.02103651, "balance_loss_mlp": 1.03420258, "epoch": 0.11092740117240343, "flos": 18513986544000.0, "grad_norm": 2.1790993874468616, "language_loss": 0.78027058, "learning_rate": 3.931691960597165e-06, "loss": 0.80201602, "num_input_tokens_seen": 39940115, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.73046875, "step": 1845, "time_per_iteration": 2.4181275367736816 }, { "auxiliary_loss_clip": 0.01104626, "auxiliary_loss_mlp": 0.01070398, "balance_loss_clip": 1.02362001, "balance_loss_mlp": 1.03229225, "epoch": 0.1109875244250714, "flos": 20521387653120.0, "grad_norm": 1.5254687551044002, "language_loss": 0.78269351, "learning_rate": 3.9315910078884375e-06, "loss": 0.80444372, "num_input_tokens_seen": 39959920, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 0.7265625, "step": 1846, "time_per_iteration": 2.4165163040161133 }, { "auxiliary_loss_clip": 0.0110536, "auxiliary_loss_mlp": 0.01072041, "balance_loss_clip": 1.02299798, "balance_loss_mlp": 1.03062904, "epoch": 0.11104764767773936, "flos": 14097787891200.0, "grad_norm": 2.573279062157269, "language_loss": 0.89371645, "learning_rate": 3.931489981933584e-06, "loss": 0.91549039, "num_input_tokens_seen": 39974755, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 0.75, "step": 1847, "time_per_iteration": 2.3723669052124023 }, { "auxiliary_loss_clip": 0.01101728, "auxiliary_loss_mlp": 0.01069096, "balance_loss_clip": 1.02374923, "balance_loss_mlp": 1.02788234, "epoch": 0.11110777093040733, "flos": 20593483344000.0, "grad_norm": 1.8875165049366052, "language_loss": 0.7923075, "learning_rate": 3.931388882736438e-06, "loss": 0.81401581, "num_input_tokens_seen": 39993355, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.73828125, "step": 1848, "time_per_iteration": 2.4484691619873047 }, { "auxiliary_loss_clip": 0.01097339, "auxiliary_loss_mlp": 0.01067607, "balance_loss_clip": 1.02385712, "balance_loss_mlp": 1.02787519, "epoch": 0.11116789418307531, "flos": 21869035200000.0, "grad_norm": 1.8282645621155653, "language_loss": 0.79645181, "learning_rate": 3.931287710300832e-06, "loss": 0.81810129, "num_input_tokens_seen": 40012410, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.6953125, "step": 1849, "time_per_iteration": 2.40321683883667 }, { "auxiliary_loss_clip": 0.01100239, "auxiliary_loss_mlp": 0.01074533, "balance_loss_clip": 1.02611017, "balance_loss_mlp": 1.02469242, "epoch": 0.11122801743574327, "flos": 15522209251200.0, "grad_norm": 2.4313512144389353, "language_loss": 0.74787056, "learning_rate": 3.931186464630601e-06, "loss": 0.76961827, "num_input_tokens_seen": 40029315, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.75390625, "step": 1850, "time_per_iteration": 2.3929550647735596 }, { "auxiliary_loss_clip": 0.01099535, "auxiliary_loss_mlp": 0.01070649, "balance_loss_clip": 1.02441978, "balance_loss_mlp": 1.02484059, "epoch": 0.11128814068841124, "flos": 14391407358720.0, "grad_norm": 2.8999611090764046, "language_loss": 0.84610397, "learning_rate": 3.931085145729588e-06, "loss": 0.86780584, "num_input_tokens_seen": 40045765, "router_z_loss_clip": 0.46289062, "router_z_loss_mlp": 0.74609375, "step": 1851, "time_per_iteration": 2.3492844104766846 }, { "auxiliary_loss_clip": 0.01095434, "auxiliary_loss_mlp": 0.0106692, "balance_loss_clip": 1.02309883, "balance_loss_mlp": 1.02511406, "epoch": 0.11134826394107922, "flos": 16653011143680.0, "grad_norm": 2.334413820695967, "language_loss": 0.9049499, "learning_rate": 3.930983753601631e-06, "loss": 0.92657351, "num_input_tokens_seen": 40061660, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.703125, "step": 1852, "time_per_iteration": 2.3694043159484863 }, { "auxiliary_loss_clip": 0.01098553, "auxiliary_loss_mlp": 0.01074721, "balance_loss_clip": 1.02596474, "balance_loss_mlp": 1.025208, "epoch": 0.11140838719374718, "flos": 16690053962880.0, "grad_norm": 1.9403445925353926, "language_loss": 0.74049991, "learning_rate": 3.930882288250578e-06, "loss": 0.76223266, "num_input_tokens_seen": 40080180, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.734375, "step": 1853, "time_per_iteration": 2.379237651824951 }, { "auxiliary_loss_clip": 0.01024428, "auxiliary_loss_mlp": 0.01033153, "balance_loss_clip": 1.02666807, "balance_loss_mlp": 1.00678837, "epoch": 0.11146851044641515, "flos": 60973397976960.0, "grad_norm": 0.8069928914208974, "language_loss": 0.53854942, "learning_rate": 3.930780749680273e-06, "loss": 0.55912519, "num_input_tokens_seen": 40138910, "router_z_loss_clip": 0.06494141, "router_z_loss_mlp": 0.17578125, "step": 1854, "time_per_iteration": 2.990219831466675 }, { "auxiliary_loss_clip": 0.01106473, "auxiliary_loss_mlp": 0.01068632, "balance_loss_clip": 1.01911259, "balance_loss_mlp": 1.02787971, "epoch": 0.11152863369908313, "flos": 22192924682880.0, "grad_norm": 3.8747024588761905, "language_loss": 0.87201589, "learning_rate": 3.9306791378945705e-06, "loss": 0.89376694, "num_input_tokens_seen": 40157745, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.78515625, "step": 1855, "time_per_iteration": 2.415235996246338 }, { "auxiliary_loss_clip": 0.01099131, "auxiliary_loss_mlp": 0.01073064, "balance_loss_clip": 1.02757335, "balance_loss_mlp": 1.02733386, "epoch": 0.11158875695175109, "flos": 19536487799040.0, "grad_norm": 2.4871528569716626, "language_loss": 0.84162831, "learning_rate": 3.9305774528973205e-06, "loss": 0.86335027, "num_input_tokens_seen": 40175375, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.71875, "step": 1856, "time_per_iteration": 2.4062297344207764 }, { "auxiliary_loss_clip": 0.01100841, "auxiliary_loss_mlp": 0.01068074, "balance_loss_clip": 1.02148652, "balance_loss_mlp": 1.02964246, "epoch": 0.11164888020441906, "flos": 25441662648960.0, "grad_norm": 1.605153216250298, "language_loss": 0.84211034, "learning_rate": 3.93047569469238e-06, "loss": 0.86379945, "num_input_tokens_seen": 40195715, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 0.7109375, "step": 1857, "time_per_iteration": 2.4475197792053223 }, { "auxiliary_loss_clip": 0.01106089, "auxiliary_loss_mlp": 0.01063847, "balance_loss_clip": 1.0191915, "balance_loss_mlp": 1.03082228, "epoch": 0.11170900345708702, "flos": 15631836520320.0, "grad_norm": 2.0495630928987607, "language_loss": 0.86022991, "learning_rate": 3.930373863283608e-06, "loss": 0.88192928, "num_input_tokens_seen": 40213975, "router_z_loss_clip": 0.44726562, "router_z_loss_mlp": 0.75390625, "step": 1858, "time_per_iteration": 2.403580904006958 }, { "auxiliary_loss_clip": 0.01104848, "auxiliary_loss_mlp": 0.01080399, "balance_loss_clip": 1.03149939, "balance_loss_mlp": 1.03090286, "epoch": 0.111769126709755, "flos": 23038311277440.0, "grad_norm": 2.2432384240293497, "language_loss": 0.94252133, "learning_rate": 3.930271958674866e-06, "loss": 0.96437383, "num_input_tokens_seen": 40233905, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.7421875, "step": 1859, "time_per_iteration": 2.4300200939178467 }, { "auxiliary_loss_clip": 0.01105106, "auxiliary_loss_mlp": 0.01082846, "balance_loss_clip": 1.0347805, "balance_loss_mlp": 1.0297296, "epoch": 0.11182924996242297, "flos": 20849641056000.0, "grad_norm": 2.044764732911658, "language_loss": 0.852723, "learning_rate": 3.930169980870018e-06, "loss": 0.87460256, "num_input_tokens_seen": 40252810, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.75390625, "step": 1860, "time_per_iteration": 2.4387805461883545 }, { "auxiliary_loss_clip": 0.01104898, "auxiliary_loss_mlp": 0.01078633, "balance_loss_clip": 1.03218937, "balance_loss_mlp": 1.03237712, "epoch": 0.11188937321509093, "flos": 17454407558400.0, "grad_norm": 1.8532894520242558, "language_loss": 0.78312159, "learning_rate": 3.930067929872931e-06, "loss": 0.80495703, "num_input_tokens_seen": 40272000, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 0.72265625, "step": 1861, "time_per_iteration": 2.412659168243408 }, { "auxiliary_loss_clip": 0.01102847, "auxiliary_loss_mlp": 0.01074804, "balance_loss_clip": 1.03210342, "balance_loss_mlp": 1.03050554, "epoch": 0.11194949646775891, "flos": 24094818063360.0, "grad_norm": 1.9076652727283643, "language_loss": 0.91204727, "learning_rate": 3.929965805687474e-06, "loss": 0.9338237, "num_input_tokens_seen": 40290660, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.72265625, "step": 1862, "time_per_iteration": 2.4636106491088867 }, { "auxiliary_loss_clip": 0.01104875, "auxiliary_loss_mlp": 0.0107759, "balance_loss_clip": 1.0315752, "balance_loss_mlp": 1.02988827, "epoch": 0.11200961972042688, "flos": 25152756215040.0, "grad_norm": 2.523365775097726, "language_loss": 0.89980108, "learning_rate": 3.92986360831752e-06, "loss": 0.92162573, "num_input_tokens_seen": 40307820, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.75, "step": 1863, "time_per_iteration": 2.4306387901306152 }, { "auxiliary_loss_clip": 0.01102974, "auxiliary_loss_mlp": 0.01069934, "balance_loss_clip": 1.01957941, "balance_loss_mlp": 1.027457, "epoch": 0.11206974297309484, "flos": 21287242437120.0, "grad_norm": 2.0566667389338744, "language_loss": 0.65794426, "learning_rate": 3.929761337766945e-06, "loss": 0.67967331, "num_input_tokens_seen": 40327430, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.75390625, "step": 1864, "time_per_iteration": 2.447702646255493 }, { "auxiliary_loss_clip": 0.01102412, "auxiliary_loss_mlp": 0.0107741, "balance_loss_clip": 1.03058505, "balance_loss_mlp": 1.0282352, "epoch": 0.11212986622576282, "flos": 18914998953600.0, "grad_norm": 2.0821785378618505, "language_loss": 0.76102626, "learning_rate": 3.929658994039627e-06, "loss": 0.78282446, "num_input_tokens_seen": 40344545, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 0.7421875, "step": 1865, "time_per_iteration": 2.405714511871338 }, { "auxiliary_loss_clip": 0.01098779, "auxiliary_loss_mlp": 0.01078346, "balance_loss_clip": 1.02529812, "balance_loss_mlp": 1.0248909, "epoch": 0.11218998947843078, "flos": 22053655802880.0, "grad_norm": 13.303501424570783, "language_loss": 0.8824231, "learning_rate": 3.929556577139446e-06, "loss": 0.90419436, "num_input_tokens_seen": 40362300, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.73828125, "step": 1866, "time_per_iteration": 2.4739158153533936 }, { "auxiliary_loss_clip": 0.01099612, "auxiliary_loss_mlp": 0.01077632, "balance_loss_clip": 1.02622902, "balance_loss_mlp": 1.0240593, "epoch": 0.11225011273109875, "flos": 24570544515840.0, "grad_norm": 2.8087754590914193, "language_loss": 0.83187294, "learning_rate": 3.929454087070286e-06, "loss": 0.85364538, "num_input_tokens_seen": 40384720, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.75390625, "step": 1867, "time_per_iteration": 2.453341484069824 }, { "auxiliary_loss_clip": 0.01098329, "auxiliary_loss_mlp": 0.01073689, "balance_loss_clip": 1.025648, "balance_loss_mlp": 1.02437031, "epoch": 0.11231023598376672, "flos": 28437419836800.0, "grad_norm": 2.9863625959634277, "language_loss": 0.89498687, "learning_rate": 3.929351523836035e-06, "loss": 0.91670704, "num_input_tokens_seen": 40404000, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.7421875, "step": 1868, "time_per_iteration": 2.494337558746338 }, { "auxiliary_loss_clip": 0.01096515, "auxiliary_loss_mlp": 0.01071824, "balance_loss_clip": 1.0233295, "balance_loss_mlp": 1.02432346, "epoch": 0.1123703592364347, "flos": 14425657269120.0, "grad_norm": 2.2671150608784596, "language_loss": 0.70128286, "learning_rate": 3.9292488874405795e-06, "loss": 0.72296625, "num_input_tokens_seen": 40418665, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.72265625, "step": 1869, "time_per_iteration": 2.403276205062866 }, { "auxiliary_loss_clip": 0.0110261, "auxiliary_loss_mlp": 0.01073153, "balance_loss_clip": 1.02091527, "balance_loss_mlp": 1.02499545, "epoch": 0.11243048248910266, "flos": 22235204206080.0, "grad_norm": 1.7999289715022688, "language_loss": 0.79256278, "learning_rate": 3.929146177887814e-06, "loss": 0.81432039, "num_input_tokens_seen": 40437870, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.77734375, "step": 1870, "time_per_iteration": 2.4106569290161133 }, { "auxiliary_loss_clip": 0.01100605, "auxiliary_loss_mlp": 0.01078561, "balance_loss_clip": 1.02658582, "balance_loss_mlp": 1.02378428, "epoch": 0.11249060574177062, "flos": 18583289326080.0, "grad_norm": 1.8754945887567247, "language_loss": 0.78173065, "learning_rate": 3.929043395181631e-06, "loss": 0.80352229, "num_input_tokens_seen": 40455570, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.76953125, "step": 1871, "time_per_iteration": 2.3714852333068848 }, { "auxiliary_loss_clip": 0.01101772, "auxiliary_loss_mlp": 0.01075277, "balance_loss_clip": 1.02675927, "balance_loss_mlp": 1.02688217, "epoch": 0.1125507289944386, "flos": 22855471153920.0, "grad_norm": 2.109045831003895, "language_loss": 0.84588909, "learning_rate": 3.928940539325929e-06, "loss": 0.86765969, "num_input_tokens_seen": 40473600, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.75, "step": 1872, "time_per_iteration": 2.426419258117676 }, { "auxiliary_loss_clip": 0.01100046, "auxiliary_loss_mlp": 0.01070506, "balance_loss_clip": 1.01977015, "balance_loss_mlp": 1.02552271, "epoch": 0.11261085224710657, "flos": 19675547210880.0, "grad_norm": 2.302261445487002, "language_loss": 0.84636939, "learning_rate": 3.9288376103246095e-06, "loss": 0.86807501, "num_input_tokens_seen": 40490025, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.7421875, "step": 1873, "time_per_iteration": 3.8020644187927246 }, { "auxiliary_loss_clip": 0.01104129, "auxiliary_loss_mlp": 0.01080662, "balance_loss_clip": 1.02148616, "balance_loss_mlp": 1.02533126, "epoch": 0.11267097549977453, "flos": 26062173976320.0, "grad_norm": 1.8693041633854317, "language_loss": 0.93537807, "learning_rate": 3.928734608181575e-06, "loss": 0.95722592, "num_input_tokens_seen": 40511580, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.7890625, "step": 1874, "time_per_iteration": 2.450397491455078 }, { "auxiliary_loss_clip": 0.01098637, "auxiliary_loss_mlp": 0.01069755, "balance_loss_clip": 1.01789856, "balance_loss_mlp": 1.02503562, "epoch": 0.11273109875244251, "flos": 21067010380800.0, "grad_norm": 1.5444510406006333, "language_loss": 0.76353008, "learning_rate": 3.928631532900729e-06, "loss": 0.78521401, "num_input_tokens_seen": 40530155, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.734375, "step": 1875, "time_per_iteration": 2.4390599727630615 }, { "auxiliary_loss_clip": 0.0109686, "auxiliary_loss_mlp": 0.01072392, "balance_loss_clip": 1.02878559, "balance_loss_mlp": 1.02631259, "epoch": 0.11279122200511048, "flos": 27087782342400.0, "grad_norm": 1.86897390320642, "language_loss": 0.74271333, "learning_rate": 3.928528384485984e-06, "loss": 0.76440585, "num_input_tokens_seen": 40549500, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.70703125, "step": 1876, "time_per_iteration": 3.879782199859619 }, { "auxiliary_loss_clip": 0.01099706, "auxiliary_loss_mlp": 0.01066436, "balance_loss_clip": 1.01941967, "balance_loss_mlp": 1.02761054, "epoch": 0.11285134525777844, "flos": 20187024762240.0, "grad_norm": 1.8917786796660516, "language_loss": 0.78641903, "learning_rate": 3.9284251629412475e-06, "loss": 0.80808043, "num_input_tokens_seen": 40567475, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 0.71875, "step": 1877, "time_per_iteration": 5.265125036239624 }, { "auxiliary_loss_clip": 0.01101101, "auxiliary_loss_mlp": 0.01069747, "balance_loss_clip": 1.01853502, "balance_loss_mlp": 1.02697456, "epoch": 0.11291146851044641, "flos": 12457638040320.0, "grad_norm": 2.1759545031813112, "language_loss": 0.90824407, "learning_rate": 3.928321868270436e-06, "loss": 0.92995256, "num_input_tokens_seen": 40583280, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.7421875, "step": 1878, "time_per_iteration": 2.5670459270477295 }, { "auxiliary_loss_clip": 0.01097836, "auxiliary_loss_mlp": 0.01067914, "balance_loss_clip": 1.0203259, "balance_loss_mlp": 1.02607751, "epoch": 0.11297159176311439, "flos": 23841173969280.0, "grad_norm": 1.9428841447663734, "language_loss": 0.84095526, "learning_rate": 3.928218500477466e-06, "loss": 0.86261284, "num_input_tokens_seen": 40603080, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.71875, "step": 1879, "time_per_iteration": 2.4591658115386963 }, { "auxiliary_loss_clip": 0.01100415, "auxiliary_loss_mlp": 0.01078289, "balance_loss_clip": 1.02478814, "balance_loss_mlp": 1.0252378, "epoch": 0.11303171501578235, "flos": 29929363499520.0, "grad_norm": 1.8849349429999354, "language_loss": 0.72397721, "learning_rate": 3.928115059566259e-06, "loss": 0.74576426, "num_input_tokens_seen": 40623255, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 0.75, "step": 1880, "time_per_iteration": 2.5262112617492676 }, { "auxiliary_loss_clip": 0.01096647, "auxiliary_loss_mlp": 0.01069077, "balance_loss_clip": 1.02127421, "balance_loss_mlp": 1.02491975, "epoch": 0.11309183826845032, "flos": 16179623752320.0, "grad_norm": 1.590189952746905, "language_loss": 0.73756194, "learning_rate": 3.928011545540734e-06, "loss": 0.75921917, "num_input_tokens_seen": 40641570, "router_z_loss_clip": 0.47851562, "router_z_loss_mlp": 0.71875, "step": 1881, "time_per_iteration": 2.40313720703125 }, { "auxiliary_loss_clip": 0.01099009, "auxiliary_loss_mlp": 0.01070966, "balance_loss_clip": 1.02261496, "balance_loss_mlp": 1.02470291, "epoch": 0.1131519615211183, "flos": 12019897013760.0, "grad_norm": 2.13825440647513, "language_loss": 0.76745653, "learning_rate": 3.927907958404819e-06, "loss": 0.78915632, "num_input_tokens_seen": 40658775, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.7421875, "step": 1882, "time_per_iteration": 2.4102942943573 }, { "auxiliary_loss_clip": 0.01097094, "auxiliary_loss_mlp": 0.01064997, "balance_loss_clip": 1.01936352, "balance_loss_mlp": 1.02425206, "epoch": 0.11321208477378626, "flos": 26248924172160.0, "grad_norm": 2.028690310013846, "language_loss": 0.82195008, "learning_rate": 3.92780429816244e-06, "loss": 0.84357095, "num_input_tokens_seen": 40679555, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.7265625, "step": 1883, "time_per_iteration": 2.461684226989746 }, { "auxiliary_loss_clip": 0.01098811, "auxiliary_loss_mlp": 0.01076146, "balance_loss_clip": 1.02283561, "balance_loss_mlp": 1.02347541, "epoch": 0.11327220802645423, "flos": 13625517663360.0, "grad_norm": 1.8184585007930745, "language_loss": 0.79082727, "learning_rate": 3.927700564817529e-06, "loss": 0.81257689, "num_input_tokens_seen": 40697295, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.75, "step": 1884, "time_per_iteration": 2.4012253284454346 }, { "auxiliary_loss_clip": 0.01034132, "auxiliary_loss_mlp": 0.01033136, "balance_loss_clip": 1.02719951, "balance_loss_mlp": 1.01575351, "epoch": 0.1133323312791222, "flos": 57188672818560.0, "grad_norm": 0.8346576611494941, "language_loss": 0.55341917, "learning_rate": 3.927596758374019e-06, "loss": 0.57409185, "num_input_tokens_seen": 40758095, "router_z_loss_clip": 0.05932617, "router_z_loss_mlp": 0.18359375, "step": 1885, "time_per_iteration": 2.9715123176574707 }, { "auxiliary_loss_clip": 0.01092564, "auxiliary_loss_mlp": 0.01066852, "balance_loss_clip": 1.02133799, "balance_loss_mlp": 1.02258849, "epoch": 0.11339245453179017, "flos": 24350591750400.0, "grad_norm": 2.1762888962388653, "language_loss": 0.924577, "learning_rate": 3.927492878835848e-06, "loss": 0.94617116, "num_input_tokens_seen": 40777140, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 0.69921875, "step": 1886, "time_per_iteration": 2.5209672451019287 }, { "auxiliary_loss_clip": 0.01096971, "auxiliary_loss_mlp": 0.01073669, "balance_loss_clip": 1.02577114, "balance_loss_mlp": 1.0243454, "epoch": 0.11345257778445814, "flos": 22669698476160.0, "grad_norm": 1.6569463502935136, "language_loss": 0.86806983, "learning_rate": 3.927388926206953e-06, "loss": 0.88977629, "num_input_tokens_seen": 40797505, "router_z_loss_clip": 0.47851562, "router_z_loss_mlp": 0.7265625, "step": 1887, "time_per_iteration": 2.4785964488983154 }, { "auxiliary_loss_clip": 0.01097146, "auxiliary_loss_mlp": 0.01070763, "balance_loss_clip": 1.02448606, "balance_loss_mlp": 1.02490258, "epoch": 0.11351270103712612, "flos": 20987408747520.0, "grad_norm": 2.460425572651278, "language_loss": 0.80043852, "learning_rate": 3.927284900491277e-06, "loss": 0.82211769, "num_input_tokens_seen": 40812970, "router_z_loss_clip": 0.46289062, "router_z_loss_mlp": 0.72265625, "step": 1888, "time_per_iteration": 2.492938756942749 }, { "auxiliary_loss_clip": 0.01099098, "auxiliary_loss_mlp": 0.01077062, "balance_loss_clip": 1.02666032, "balance_loss_mlp": 1.02547514, "epoch": 0.11357282428979408, "flos": 37346241841920.0, "grad_norm": 1.9772865667537864, "language_loss": 0.69375616, "learning_rate": 3.927180801692764e-06, "loss": 0.71551776, "num_input_tokens_seen": 40837745, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.734375, "step": 1889, "time_per_iteration": 2.601879596710205 }, { "auxiliary_loss_clip": 0.01096481, "auxiliary_loss_mlp": 0.01070475, "balance_loss_clip": 1.01985884, "balance_loss_mlp": 1.0249536, "epoch": 0.11363294754246205, "flos": 21756091351680.0, "grad_norm": 1.6606015685371962, "language_loss": 0.85298574, "learning_rate": 3.927076629815362e-06, "loss": 0.87465525, "num_input_tokens_seen": 40856490, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.71484375, "step": 1890, "time_per_iteration": 2.4335434436798096 }, { "auxiliary_loss_clip": 0.01095659, "auxiliary_loss_mlp": 0.01071225, "balance_loss_clip": 1.02623594, "balance_loss_mlp": 1.02563465, "epoch": 0.11369307079513001, "flos": 22600535339520.0, "grad_norm": 2.6521065676607343, "language_loss": 0.68373477, "learning_rate": 3.926972384863022e-06, "loss": 0.70540369, "num_input_tokens_seen": 40874070, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.69921875, "step": 1891, "time_per_iteration": 2.396352529525757 }, { "auxiliary_loss_clip": 0.01101738, "auxiliary_loss_mlp": 0.01070181, "balance_loss_clip": 1.02244902, "balance_loss_mlp": 1.02642405, "epoch": 0.11375319404779799, "flos": 21943190661120.0, "grad_norm": 2.0437717535521975, "language_loss": 0.90590245, "learning_rate": 3.9268680668396956e-06, "loss": 0.9276216, "num_input_tokens_seen": 40892425, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.75390625, "step": 1892, "time_per_iteration": 2.4096405506134033 }, { "auxiliary_loss_clip": 0.01103202, "auxiliary_loss_mlp": 0.0108169, "balance_loss_clip": 1.03004885, "balance_loss_mlp": 1.02683306, "epoch": 0.11381331730046595, "flos": 26394267628800.0, "grad_norm": 2.7223509585698085, "language_loss": 0.75923604, "learning_rate": 3.926763675749339e-06, "loss": 0.78108495, "num_input_tokens_seen": 40912190, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.765625, "step": 1893, "time_per_iteration": 2.4340016841888428 }, { "auxiliary_loss_clip": 0.01094557, "auxiliary_loss_mlp": 0.01073107, "balance_loss_clip": 1.02554297, "balance_loss_mlp": 1.02400064, "epoch": 0.11387344055313392, "flos": 23803607479680.0, "grad_norm": 1.8893833949054872, "language_loss": 0.81409949, "learning_rate": 3.92665921159591e-06, "loss": 0.83577621, "num_input_tokens_seen": 40928395, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.703125, "step": 1894, "time_per_iteration": 2.4373579025268555 }, { "auxiliary_loss_clip": 0.01103576, "auxiliary_loss_mlp": 0.01071936, "balance_loss_clip": 1.02193952, "balance_loss_mlp": 1.02791119, "epoch": 0.1139335638058019, "flos": 34521699294720.0, "grad_norm": 4.128387348864333, "language_loss": 0.84122884, "learning_rate": 3.926554674383371e-06, "loss": 0.86298394, "num_input_tokens_seen": 40946555, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.7578125, "step": 1895, "time_per_iteration": 2.4865076541900635 }, { "auxiliary_loss_clip": 0.01026502, "auxiliary_loss_mlp": 0.01017288, "balance_loss_clip": 1.00980127, "balance_loss_mlp": 1.0084796, "epoch": 0.11399368705846986, "flos": 70584148333440.0, "grad_norm": 0.8147085719504553, "language_loss": 0.63502467, "learning_rate": 3.926450064115686e-06, "loss": 0.6554625, "num_input_tokens_seen": 41004910, "router_z_loss_clip": 0.07470703, "router_z_loss_mlp": 0.1796875, "step": 1896, "time_per_iteration": 3.1232588291168213 }, { "auxiliary_loss_clip": 0.01101255, "auxiliary_loss_mlp": 0.01067352, "balance_loss_clip": 1.016927, "balance_loss_mlp": 1.02822733, "epoch": 0.11405381031113783, "flos": 21323203004160.0, "grad_norm": 1.9318551735230916, "language_loss": 0.86146086, "learning_rate": 3.926345380796821e-06, "loss": 0.88314694, "num_input_tokens_seen": 41026385, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.73046875, "step": 1897, "time_per_iteration": 2.425869941711426 }, { "auxiliary_loss_clip": 0.01104157, "auxiliary_loss_mlp": 0.01071224, "balance_loss_clip": 1.02303958, "balance_loss_mlp": 1.02957845, "epoch": 0.11411393356380581, "flos": 19718594784000.0, "grad_norm": 2.2199132149606893, "language_loss": 0.81340933, "learning_rate": 3.9262406244307465e-06, "loss": 0.83516318, "num_input_tokens_seen": 41045315, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.74609375, "step": 1898, "time_per_iteration": 2.443129062652588 }, { "auxiliary_loss_clip": 0.01104505, "auxiliary_loss_mlp": 0.01077304, "balance_loss_clip": 1.02661633, "balance_loss_mlp": 1.02875268, "epoch": 0.11417405681647377, "flos": 17529470714880.0, "grad_norm": 2.578378824415495, "language_loss": 0.76418275, "learning_rate": 3.926135795021435e-06, "loss": 0.78600085, "num_input_tokens_seen": 41063390, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.7578125, "step": 1899, "time_per_iteration": 2.3868911266326904 }, { "auxiliary_loss_clip": 0.01035545, "auxiliary_loss_mlp": 0.01018162, "balance_loss_clip": 1.01322711, "balance_loss_mlp": 1.01728582, "epoch": 0.11423418006914174, "flos": 59671416355200.0, "grad_norm": 0.9306140439127988, "language_loss": 0.63517708, "learning_rate": 3.92603089257286e-06, "loss": 0.65571415, "num_input_tokens_seen": 41124180, "router_z_loss_clip": 0.04931641, "router_z_loss_mlp": 0.18261719, "step": 1900, "time_per_iteration": 3.0043721199035645 }, { "auxiliary_loss_clip": 0.01105093, "auxiliary_loss_mlp": 0.01079259, "balance_loss_clip": 1.03095543, "balance_loss_mlp": 1.02963984, "epoch": 0.1142943033218097, "flos": 22962096046080.0, "grad_norm": 1.9113502691584607, "language_loss": 0.79754734, "learning_rate": 3.925925917089001e-06, "loss": 0.81939083, "num_input_tokens_seen": 41143485, "router_z_loss_clip": 0.48242188, "router_z_loss_mlp": 0.75390625, "step": 1901, "time_per_iteration": 2.4261326789855957 }, { "auxiliary_loss_clip": 0.01103142, "auxiliary_loss_mlp": 0.01073913, "balance_loss_clip": 1.02596688, "balance_loss_mlp": 1.02781212, "epoch": 0.11435442657447768, "flos": 18255385036800.0, "grad_norm": 2.0978125028630017, "language_loss": 0.86818624, "learning_rate": 3.925820868573839e-06, "loss": 0.88995683, "num_input_tokens_seen": 41161695, "router_z_loss_clip": 0.47851562, "router_z_loss_mlp": 0.75390625, "step": 1902, "time_per_iteration": 2.423414707183838 }, { "auxiliary_loss_clip": 0.01098273, "auxiliary_loss_mlp": 0.01075791, "balance_loss_clip": 1.02646255, "balance_loss_mlp": 1.02488196, "epoch": 0.11441454982714565, "flos": 24060044482560.0, "grad_norm": 3.69791193643829, "language_loss": 0.79799533, "learning_rate": 3.925715747031356e-06, "loss": 0.819736, "num_input_tokens_seen": 41181715, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 0.734375, "step": 1903, "time_per_iteration": 2.439527988433838 }, { "auxiliary_loss_clip": 0.01100175, "auxiliary_loss_mlp": 0.01073945, "balance_loss_clip": 1.02514064, "balance_loss_mlp": 1.02542341, "epoch": 0.11447467307981361, "flos": 25336538945280.0, "grad_norm": 5.006334007746795, "language_loss": 0.76973581, "learning_rate": 3.925610552465539e-06, "loss": 0.79147708, "num_input_tokens_seen": 41201770, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.75, "step": 1904, "time_per_iteration": 2.4378955364227295 }, { "auxiliary_loss_clip": 0.01098442, "auxiliary_loss_mlp": 0.01082149, "balance_loss_clip": 1.02762282, "balance_loss_mlp": 1.0257175, "epoch": 0.11453479633248159, "flos": 21724983463680.0, "grad_norm": 2.322709133589434, "language_loss": 0.94894862, "learning_rate": 3.9255052848803764e-06, "loss": 0.9707545, "num_input_tokens_seen": 41220590, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.7265625, "step": 1905, "time_per_iteration": 2.4019157886505127 }, { "auxiliary_loss_clip": 0.01106759, "auxiliary_loss_mlp": 0.01076048, "balance_loss_clip": 1.02273726, "balance_loss_mlp": 1.02536607, "epoch": 0.11459491958514956, "flos": 12968871212160.0, "grad_norm": 2.3173862953568123, "language_loss": 0.80984318, "learning_rate": 3.925399944279861e-06, "loss": 0.83167124, "num_input_tokens_seen": 41237250, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.8125, "step": 1906, "time_per_iteration": 2.4461934566497803 }, { "auxiliary_loss_clip": 0.01099596, "auxiliary_loss_mlp": 0.01078397, "balance_loss_clip": 1.02499104, "balance_loss_mlp": 1.02467704, "epoch": 0.11465504283781752, "flos": 22710162608640.0, "grad_norm": 2.0420346689084563, "language_loss": 0.85179579, "learning_rate": 3.925294530667986e-06, "loss": 0.87357569, "num_input_tokens_seen": 41256680, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.75, "step": 1907, "time_per_iteration": 2.4013493061065674 }, { "auxiliary_loss_clip": 0.01104586, "auxiliary_loss_mlp": 0.01094775, "balance_loss_clip": 1.04182243, "balance_loss_mlp": 1.02952981, "epoch": 0.1147151660904855, "flos": 23397428188800.0, "grad_norm": 2.3984462271129345, "language_loss": 0.86514479, "learning_rate": 3.92518904404875e-06, "loss": 0.88713837, "num_input_tokens_seen": 41270955, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.75, "step": 1908, "time_per_iteration": 2.466416120529175 }, { "auxiliary_loss_clip": 0.01037684, "auxiliary_loss_mlp": 0.01008842, "balance_loss_clip": 1.00393057, "balance_loss_mlp": 1.01954818, "epoch": 0.11477528934315347, "flos": 63009044242560.0, "grad_norm": 0.9278183448357621, "language_loss": 0.61119366, "learning_rate": 3.925083484426153e-06, "loss": 0.63165897, "num_input_tokens_seen": 41319180, "router_z_loss_clip": 0.04907227, "router_z_loss_mlp": 0.18164062, "step": 1909, "time_per_iteration": 2.7610809803009033 }, { "auxiliary_loss_clip": 0.01104264, "auxiliary_loss_mlp": 0.01075453, "balance_loss_clip": 1.02409756, "balance_loss_mlp": 1.02895105, "epoch": 0.11483541259582143, "flos": 16324687918080.0, "grad_norm": 1.842888055947887, "language_loss": 0.81009704, "learning_rate": 3.924977851804197e-06, "loss": 0.83189416, "num_input_tokens_seen": 41337480, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.75390625, "step": 1910, "time_per_iteration": 2.38220477104187 }, { "auxiliary_loss_clip": 0.01103611, "auxiliary_loss_mlp": 0.01065713, "balance_loss_clip": 1.01478636, "balance_loss_mlp": 1.02840471, "epoch": 0.1148955358484894, "flos": 21579325804800.0, "grad_norm": 2.040757855259629, "language_loss": 0.78414232, "learning_rate": 3.9248721461868875e-06, "loss": 0.80583549, "num_input_tokens_seen": 41354650, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.75, "step": 1911, "time_per_iteration": 2.3916642665863037 }, { "auxiliary_loss_clip": 0.01098167, "auxiliary_loss_mlp": 0.01070481, "balance_loss_clip": 1.02191544, "balance_loss_mlp": 1.02687752, "epoch": 0.11495565910115738, "flos": 27672437836800.0, "grad_norm": 1.6259707657283342, "language_loss": 0.80389506, "learning_rate": 3.9247663675782336e-06, "loss": 0.82558155, "num_input_tokens_seen": 41376935, "router_z_loss_clip": 0.48632812, "router_z_loss_mlp": 0.71484375, "step": 1912, "time_per_iteration": 2.457498550415039 }, { "auxiliary_loss_clip": 0.01103763, "auxiliary_loss_mlp": 0.01081574, "balance_loss_clip": 1.02969384, "balance_loss_mlp": 1.02839506, "epoch": 0.11501578235382534, "flos": 20631294213120.0, "grad_norm": 1.9691481128065385, "language_loss": 0.803545, "learning_rate": 3.924660515982246e-06, "loss": 0.82539833, "num_input_tokens_seen": 41396105, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.75390625, "step": 1913, "time_per_iteration": 3.8680005073547363 }, { "auxiliary_loss_clip": 0.01104216, "auxiliary_loss_mlp": 0.0106984, "balance_loss_clip": 1.0177691, "balance_loss_mlp": 1.02767372, "epoch": 0.1150759056064933, "flos": 19828012584960.0, "grad_norm": 4.113344882486828, "language_loss": 0.72709507, "learning_rate": 3.924554591402939e-06, "loss": 0.74883562, "num_input_tokens_seen": 41415600, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.765625, "step": 1914, "time_per_iteration": 2.395423173904419 }, { "auxiliary_loss_clip": 0.01039858, "auxiliary_loss_mlp": 0.01006677, "balance_loss_clip": 1.00071704, "balance_loss_mlp": 1.02014196, "epoch": 0.11513602885916129, "flos": 70041981830400.0, "grad_norm": 0.8619132263675272, "language_loss": 0.61130953, "learning_rate": 3.92444859384433e-06, "loss": 0.63177478, "num_input_tokens_seen": 41478760, "router_z_loss_clip": 0.05957031, "router_z_loss_mlp": 0.19726562, "step": 1915, "time_per_iteration": 3.123317241668701 }, { "auxiliary_loss_clip": 0.011031, "auxiliary_loss_mlp": 0.01085096, "balance_loss_clip": 1.030617, "balance_loss_mlp": 1.02827954, "epoch": 0.11519615211182925, "flos": 15740835384960.0, "grad_norm": 2.3587560287586697, "language_loss": 0.96246397, "learning_rate": 3.924342523310436e-06, "loss": 0.98434597, "num_input_tokens_seen": 41495720, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.75, "step": 1916, "time_per_iteration": 5.260532379150391 }, { "auxiliary_loss_clip": 0.01102699, "auxiliary_loss_mlp": 0.01082716, "balance_loss_clip": 1.02787995, "balance_loss_mlp": 1.02652657, "epoch": 0.11525627536449722, "flos": 20666591464320.0, "grad_norm": 1.8663706166075011, "language_loss": 0.74269754, "learning_rate": 3.9242363798052806e-06, "loss": 0.7645517, "num_input_tokens_seen": 41513585, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.76171875, "step": 1917, "time_per_iteration": 2.389037609100342 }, { "auxiliary_loss_clip": 0.01096504, "auxiliary_loss_mlp": 0.01071462, "balance_loss_clip": 1.01815176, "balance_loss_mlp": 1.02505946, "epoch": 0.1153163986171652, "flos": 20302237848960.0, "grad_norm": 1.878223501492166, "language_loss": 0.77126813, "learning_rate": 3.92413016333289e-06, "loss": 0.79294789, "num_input_tokens_seen": 41533390, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.71484375, "step": 1918, "time_per_iteration": 2.420306921005249 }, { "auxiliary_loss_clip": 0.01101986, "auxiliary_loss_mlp": 0.01073936, "balance_loss_clip": 1.02081609, "balance_loss_mlp": 1.02552521, "epoch": 0.11537652186983316, "flos": 17638364845440.0, "grad_norm": 1.9435062321325158, "language_loss": 0.89015043, "learning_rate": 3.92402387389729e-06, "loss": 0.91190958, "num_input_tokens_seen": 41551015, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.765625, "step": 1919, "time_per_iteration": 2.3609936237335205 }, { "auxiliary_loss_clip": 0.01099992, "auxiliary_loss_mlp": 0.01078531, "balance_loss_clip": 1.02443421, "balance_loss_mlp": 1.02426577, "epoch": 0.11543664512250112, "flos": 21068337012480.0, "grad_norm": 1.8801302635587271, "language_loss": 0.88996416, "learning_rate": 3.923917511502512e-06, "loss": 0.91174942, "num_input_tokens_seen": 41568055, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.7578125, "step": 1920, "time_per_iteration": 2.412687301635742 }, { "auxiliary_loss_clip": 0.01097556, "auxiliary_loss_mlp": 0.01073687, "balance_loss_clip": 1.0231421, "balance_loss_mlp": 1.02450156, "epoch": 0.11549676837516909, "flos": 22746437377920.0, "grad_norm": 1.9015909137237426, "language_loss": 0.8145293, "learning_rate": 3.923811076152589e-06, "loss": 0.83624172, "num_input_tokens_seen": 41587435, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.734375, "step": 1921, "time_per_iteration": 2.4050183296203613 }, { "auxiliary_loss_clip": 0.01105586, "auxiliary_loss_mlp": 0.01079158, "balance_loss_clip": 1.02150798, "balance_loss_mlp": 1.02614427, "epoch": 0.11555689162783707, "flos": 19168049554560.0, "grad_norm": 1.9489528284703288, "language_loss": 0.79503161, "learning_rate": 3.923704567851557e-06, "loss": 0.81687903, "num_input_tokens_seen": 41604975, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.79296875, "step": 1922, "time_per_iteration": 2.402738094329834 }, { "auxiliary_loss_clip": 0.01103091, "auxiliary_loss_mlp": 0.01084399, "balance_loss_clip": 1.03073144, "balance_loss_mlp": 1.02614999, "epoch": 0.11561701488050503, "flos": 24570893629440.0, "grad_norm": 1.8604942293116362, "language_loss": 0.85317189, "learning_rate": 3.923597986603456e-06, "loss": 0.87504685, "num_input_tokens_seen": 41626155, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 0.76953125, "step": 1923, "time_per_iteration": 2.4516875743865967 }, { "auxiliary_loss_clip": 0.01104156, "auxiliary_loss_mlp": 0.0107577, "balance_loss_clip": 1.01885962, "balance_loss_mlp": 1.02692676, "epoch": 0.115677138133173, "flos": 17091590042880.0, "grad_norm": 2.0027427251759735, "language_loss": 0.83251321, "learning_rate": 3.9234913324123264e-06, "loss": 0.85431242, "num_input_tokens_seen": 41644805, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.7734375, "step": 1924, "time_per_iteration": 2.420670509338379 }, { "auxiliary_loss_clip": 0.01024496, "auxiliary_loss_mlp": 0.01009615, "balance_loss_clip": 1.0039643, "balance_loss_mlp": 1.00571227, "epoch": 0.11573726138584098, "flos": 62700515758080.0, "grad_norm": 0.816670549206094, "language_loss": 0.61332083, "learning_rate": 3.923384605282212e-06, "loss": 0.63366193, "num_input_tokens_seen": 41709345, "router_z_loss_clip": 0.05639648, "router_z_loss_mlp": 0.1875, "step": 1925, "time_per_iteration": 3.0350961685180664 }, { "auxiliary_loss_clip": 0.01100762, "auxiliary_loss_mlp": 0.01078035, "balance_loss_clip": 1.02160156, "balance_loss_mlp": 1.02532911, "epoch": 0.11579738463850894, "flos": 22600046580480.0, "grad_norm": 1.6592144619966165, "language_loss": 0.76847106, "learning_rate": 3.923277805217161e-06, "loss": 0.79025906, "num_input_tokens_seen": 41730210, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.75390625, "step": 1926, "time_per_iteration": 2.4421029090881348 }, { "auxiliary_loss_clip": 0.01105084, "auxiliary_loss_mlp": 0.01081601, "balance_loss_clip": 1.02404654, "balance_loss_mlp": 1.02691829, "epoch": 0.11585750789117691, "flos": 21725053286400.0, "grad_norm": 2.81843715438092, "language_loss": 0.7573036, "learning_rate": 3.923170932221222e-06, "loss": 0.77917039, "num_input_tokens_seen": 41750270, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.78125, "step": 1927, "time_per_iteration": 2.4649441242218018 }, { "auxiliary_loss_clip": 0.01101324, "auxiliary_loss_mlp": 0.01075424, "balance_loss_clip": 1.01879978, "balance_loss_mlp": 1.02573895, "epoch": 0.11591763114384489, "flos": 26286316104960.0, "grad_norm": 1.6026120684954417, "language_loss": 0.88697374, "learning_rate": 3.92306398629845e-06, "loss": 0.90874124, "num_input_tokens_seen": 41772975, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.75390625, "step": 1928, "time_per_iteration": 2.4759366512298584 }, { "auxiliary_loss_clip": 0.01105063, "auxiliary_loss_mlp": 0.0109223, "balance_loss_clip": 1.03725076, "balance_loss_mlp": 1.02782726, "epoch": 0.11597775439651285, "flos": 22999418156160.0, "grad_norm": 1.969625707307952, "language_loss": 0.80208433, "learning_rate": 3.922956967452898e-06, "loss": 0.82405722, "num_input_tokens_seen": 41791765, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.7734375, "step": 1929, "time_per_iteration": 2.4990079402923584 }, { "auxiliary_loss_clip": 0.01100095, "auxiliary_loss_mlp": 0.01080886, "balance_loss_clip": 1.03248727, "balance_loss_mlp": 1.02691579, "epoch": 0.11603787764918082, "flos": 31940360478720.0, "grad_norm": 1.7275403423455795, "language_loss": 0.79079658, "learning_rate": 3.922849875688626e-06, "loss": 0.81260639, "num_input_tokens_seen": 41815615, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.73046875, "step": 1930, "time_per_iteration": 2.504427433013916 }, { "auxiliary_loss_clip": 0.01101347, "auxiliary_loss_mlp": 0.0107767, "balance_loss_clip": 1.02576673, "balance_loss_mlp": 1.0264256, "epoch": 0.1160980009018488, "flos": 22270606191360.0, "grad_norm": 1.9524965515627912, "language_loss": 0.74097633, "learning_rate": 3.922742711009693e-06, "loss": 0.76276648, "num_input_tokens_seen": 41834810, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.75, "step": 1931, "time_per_iteration": 2.4005420207977295 }, { "auxiliary_loss_clip": 0.01105348, "auxiliary_loss_mlp": 0.01079492, "balance_loss_clip": 1.02765942, "balance_loss_mlp": 1.02877855, "epoch": 0.11615812415451676, "flos": 22782537590400.0, "grad_norm": 1.7454332805812756, "language_loss": 0.84138036, "learning_rate": 3.922635473420164e-06, "loss": 0.8632288, "num_input_tokens_seen": 41854975, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.765625, "step": 1932, "time_per_iteration": 2.4524455070495605 }, { "auxiliary_loss_clip": 0.01024423, "auxiliary_loss_mlp": 0.01016575, "balance_loss_clip": 1.00985181, "balance_loss_mlp": 1.00607646, "epoch": 0.11621824740718473, "flos": 67142864885760.0, "grad_norm": 0.7795563159617596, "language_loss": 0.61040831, "learning_rate": 3.922528162924105e-06, "loss": 0.63081825, "num_input_tokens_seen": 41911105, "router_z_loss_clip": 0.06738281, "router_z_loss_mlp": 0.18359375, "step": 1933, "time_per_iteration": 2.9197466373443604 }, { "auxiliary_loss_clip": 0.0110241, "auxiliary_loss_mlp": 0.01074543, "balance_loss_clip": 1.02287745, "balance_loss_mlp": 1.02558279, "epoch": 0.11627837065985269, "flos": 20374892121600.0, "grad_norm": 1.9751206780424375, "language_loss": 0.88423049, "learning_rate": 3.922420779525586e-06, "loss": 0.90600002, "num_input_tokens_seen": 41931750, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.76953125, "step": 1934, "time_per_iteration": 2.421251058578491 }, { "auxiliary_loss_clip": 0.0110623, "auxiliary_loss_mlp": 0.01080654, "balance_loss_clip": 1.02801085, "balance_loss_mlp": 1.02898979, "epoch": 0.11633849391252067, "flos": 21724739084160.0, "grad_norm": 2.26167043084455, "language_loss": 0.69919413, "learning_rate": 3.9223133232286776e-06, "loss": 0.72106302, "num_input_tokens_seen": 41949400, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.7734375, "step": 1935, "time_per_iteration": 2.385678291320801 }, { "auxiliary_loss_clip": 0.01099681, "auxiliary_loss_mlp": 0.01077429, "balance_loss_clip": 1.02755141, "balance_loss_mlp": 1.02478611, "epoch": 0.11639861716518864, "flos": 18804394166400.0, "grad_norm": 2.087801591831859, "language_loss": 0.77685356, "learning_rate": 3.922205794037456e-06, "loss": 0.79862463, "num_input_tokens_seen": 41968100, "router_z_loss_clip": 0.49804688, "router_z_loss_mlp": 0.75, "step": 1936, "time_per_iteration": 2.394496202468872 }, { "auxiliary_loss_clip": 0.01100015, "auxiliary_loss_mlp": 0.01076216, "balance_loss_clip": 1.02376366, "balance_loss_mlp": 1.02456009, "epoch": 0.1164587404178566, "flos": 21213924848640.0, "grad_norm": 1.8307292919040705, "language_loss": 0.86297512, "learning_rate": 3.922098191955998e-06, "loss": 0.88473749, "num_input_tokens_seen": 41986375, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.75390625, "step": 1937, "time_per_iteration": 2.388935089111328 }, { "auxiliary_loss_clip": 0.01094732, "auxiliary_loss_mlp": 0.01067897, "balance_loss_clip": 1.01997471, "balance_loss_mlp": 1.02352262, "epoch": 0.11651886367052458, "flos": 27817397268480.0, "grad_norm": 1.9335887271623498, "language_loss": 0.7779693, "learning_rate": 3.921990516988384e-06, "loss": 0.79959559, "num_input_tokens_seen": 42006055, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.7109375, "step": 1938, "time_per_iteration": 2.473313808441162 }, { "auxiliary_loss_clip": 0.01104287, "auxiliary_loss_mlp": 0.01084428, "balance_loss_clip": 1.02837622, "balance_loss_mlp": 1.0257802, "epoch": 0.11657898692319255, "flos": 22888568989440.0, "grad_norm": 1.7367867235260621, "language_loss": 0.80767649, "learning_rate": 3.921882769138696e-06, "loss": 0.82956362, "num_input_tokens_seen": 42024995, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.78515625, "step": 1939, "time_per_iteration": 2.3945810794830322 }, { "auxiliary_loss_clip": 0.01099419, "auxiliary_loss_mlp": 0.0108171, "balance_loss_clip": 1.0319519, "balance_loss_mlp": 1.0249927, "epoch": 0.11663911017586051, "flos": 24314770828800.0, "grad_norm": 2.7873221709203815, "language_loss": 0.8819685, "learning_rate": 3.9217749484110215e-06, "loss": 0.90377975, "num_input_tokens_seen": 42042640, "router_z_loss_clip": 0.49804688, "router_z_loss_mlp": 0.7421875, "step": 1940, "time_per_iteration": 2.4464802742004395 }, { "auxiliary_loss_clip": 0.01094395, "auxiliary_loss_mlp": 0.01078484, "balance_loss_clip": 1.03428137, "balance_loss_mlp": 1.02451062, "epoch": 0.11669923342852849, "flos": 42338507794560.0, "grad_norm": 1.5274539315744171, "language_loss": 0.77823973, "learning_rate": 3.921667054809449e-06, "loss": 0.79996848, "num_input_tokens_seen": 42067005, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.69921875, "step": 1941, "time_per_iteration": 2.5917911529541016 }, { "auxiliary_loss_clip": 0.01099636, "auxiliary_loss_mlp": 0.01085479, "balance_loss_clip": 1.0362215, "balance_loss_mlp": 1.02487659, "epoch": 0.11675935668119646, "flos": 14641560316800.0, "grad_norm": 1.9065487899793008, "language_loss": 0.89950025, "learning_rate": 3.921559088338068e-06, "loss": 0.92135143, "num_input_tokens_seen": 42082295, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 0.75, "step": 1942, "time_per_iteration": 2.3784279823303223 }, { "auxiliary_loss_clip": 0.01097409, "auxiliary_loss_mlp": 0.01071666, "balance_loss_clip": 1.02534151, "balance_loss_mlp": 1.02689183, "epoch": 0.11681947993386442, "flos": 35115012806400.0, "grad_norm": 2.053962402357319, "language_loss": 0.69737881, "learning_rate": 3.921451049000975e-06, "loss": 0.7190696, "num_input_tokens_seen": 42105295, "router_z_loss_clip": 0.46289062, "router_z_loss_mlp": 0.703125, "step": 1943, "time_per_iteration": 2.524909496307373 }, { "auxiliary_loss_clip": 0.01097538, "auxiliary_loss_mlp": 0.01067658, "balance_loss_clip": 1.01818633, "balance_loss_mlp": 1.02541852, "epoch": 0.11687960318653239, "flos": 38981713570560.0, "grad_norm": 1.871225335178146, "language_loss": 0.7164948, "learning_rate": 3.921342936802265e-06, "loss": 0.73814678, "num_input_tokens_seen": 42125520, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.72265625, "step": 1944, "time_per_iteration": 2.56483793258667 }, { "auxiliary_loss_clip": 0.01102197, "auxiliary_loss_mlp": 0.01076505, "balance_loss_clip": 1.02689004, "balance_loss_mlp": 1.02652848, "epoch": 0.11693972643920036, "flos": 25993778889600.0, "grad_norm": 1.5659902023576129, "language_loss": 0.83571392, "learning_rate": 3.921234751746038e-06, "loss": 0.85750091, "num_input_tokens_seen": 42146335, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.7578125, "step": 1945, "time_per_iteration": 2.440305471420288 }, { "auxiliary_loss_clip": 0.01095979, "auxiliary_loss_mlp": 0.01075449, "balance_loss_clip": 1.02600121, "balance_loss_mlp": 1.02307653, "epoch": 0.11699984969186833, "flos": 27270866845440.0, "grad_norm": 2.0354172485559996, "language_loss": 0.78286022, "learning_rate": 3.9211264938363975e-06, "loss": 0.80457449, "num_input_tokens_seen": 42165320, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.73046875, "step": 1946, "time_per_iteration": 2.443709373474121 }, { "auxiliary_loss_clip": 0.01094886, "auxiliary_loss_mlp": 0.0106774, "balance_loss_clip": 1.02208304, "balance_loss_mlp": 1.02453065, "epoch": 0.1170599729445363, "flos": 15266959234560.0, "grad_norm": 1.9162103734297344, "language_loss": 0.70844942, "learning_rate": 3.921018163077448e-06, "loss": 0.73007572, "num_input_tokens_seen": 42182955, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.703125, "step": 1947, "time_per_iteration": 2.3867526054382324 }, { "auxiliary_loss_clip": 0.01098535, "auxiliary_loss_mlp": 0.01070628, "balance_loss_clip": 1.02010679, "balance_loss_mlp": 1.0261786, "epoch": 0.11712009619720427, "flos": 17163511176960.0, "grad_norm": 1.86112873725718, "language_loss": 0.87185425, "learning_rate": 3.920909759473295e-06, "loss": 0.89354587, "num_input_tokens_seen": 42200760, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.72265625, "step": 1948, "time_per_iteration": 2.383723020553589 }, { "auxiliary_loss_clip": 0.01025867, "auxiliary_loss_mlp": 0.01020997, "balance_loss_clip": 1.01515615, "balance_loss_mlp": 1.00658453, "epoch": 0.11718021944987224, "flos": 70937644515840.0, "grad_norm": 0.8347948317840764, "language_loss": 0.65263897, "learning_rate": 3.920801283028054e-06, "loss": 0.67310762, "num_input_tokens_seen": 42265745, "router_z_loss_clip": 0.05834961, "router_z_loss_mlp": 0.19335938, "step": 1949, "time_per_iteration": 3.0580947399139404 }, { "auxiliary_loss_clip": 0.01095745, "auxiliary_loss_mlp": 0.01069687, "balance_loss_clip": 1.02431607, "balance_loss_mlp": 1.02404857, "epoch": 0.1172403427025402, "flos": 27452240691840.0, "grad_norm": 1.6309491835037033, "language_loss": 0.73445642, "learning_rate": 3.920692733745835e-06, "loss": 0.75611079, "num_input_tokens_seen": 42286245, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.71875, "step": 1950, "time_per_iteration": 2.453089714050293 }, { "auxiliary_loss_clip": 0.01099548, "auxiliary_loss_mlp": 0.01074748, "balance_loss_clip": 1.026564, "balance_loss_mlp": 1.02574944, "epoch": 0.11730046595520818, "flos": 15667831998720.0, "grad_norm": 2.140938309932894, "language_loss": 0.79714155, "learning_rate": 3.920584111630755e-06, "loss": 0.81888449, "num_input_tokens_seen": 42302710, "router_z_loss_clip": 0.48242188, "router_z_loss_mlp": 0.73828125, "step": 1951, "time_per_iteration": 2.3543148040771484 }, { "auxiliary_loss_clip": 0.0109912, "auxiliary_loss_mlp": 0.01079958, "balance_loss_clip": 1.03141642, "balance_loss_mlp": 1.02570367, "epoch": 0.11736058920787615, "flos": 25628971426560.0, "grad_norm": 1.7707328433413079, "language_loss": 0.78388196, "learning_rate": 3.9204754166869325e-06, "loss": 0.80567276, "num_input_tokens_seen": 42324115, "router_z_loss_clip": 0.48632812, "router_z_loss_mlp": 0.734375, "step": 1952, "time_per_iteration": 2.4808359146118164 }, { "auxiliary_loss_clip": 0.01099645, "auxiliary_loss_mlp": 0.01075363, "balance_loss_clip": 1.02608204, "balance_loss_mlp": 1.02539825, "epoch": 0.11742071246054411, "flos": 21433214298240.0, "grad_norm": 1.8347422267290776, "language_loss": 0.74187744, "learning_rate": 3.920366648918491e-06, "loss": 0.76362753, "num_input_tokens_seen": 42342505, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 0.7421875, "step": 1953, "time_per_iteration": 4.628724098205566 }, { "auxiliary_loss_clip": 0.01103828, "auxiliary_loss_mlp": 0.01087097, "balance_loss_clip": 1.03519344, "balance_loss_mlp": 1.02541363, "epoch": 0.11748083571321208, "flos": 15996923274240.0, "grad_norm": 2.576622216994708, "language_loss": 0.83628595, "learning_rate": 3.920257808329552e-06, "loss": 0.85819519, "num_input_tokens_seen": 42360525, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.78515625, "step": 1954, "time_per_iteration": 2.4205286502838135 }, { "auxiliary_loss_clip": 0.0109953, "auxiliary_loss_mlp": 0.01078657, "balance_loss_clip": 1.02725363, "balance_loss_mlp": 1.02441096, "epoch": 0.11754095896588006, "flos": 16179134993280.0, "grad_norm": 2.1752875628262167, "language_loss": 0.87702525, "learning_rate": 3.920148894924246e-06, "loss": 0.89880705, "num_input_tokens_seen": 42377045, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.75, "step": 1955, "time_per_iteration": 5.291463613510132 }, { "auxiliary_loss_clip": 0.01096756, "auxiliary_loss_mlp": 0.0107593, "balance_loss_clip": 1.0284605, "balance_loss_mlp": 1.02308774, "epoch": 0.11760108221854802, "flos": 13260745111680.0, "grad_norm": 2.1927197636241345, "language_loss": 0.79162407, "learning_rate": 3.920039908706701e-06, "loss": 0.81335092, "num_input_tokens_seen": 42393960, "router_z_loss_clip": 0.47460938, "router_z_loss_mlp": 0.734375, "step": 1956, "time_per_iteration": 2.428910732269287 }, { "auxiliary_loss_clip": 0.01094802, "auxiliary_loss_mlp": 0.01083694, "balance_loss_clip": 1.03326809, "balance_loss_mlp": 1.02414382, "epoch": 0.11766120547121599, "flos": 24497296750080.0, "grad_norm": 1.9578382747614873, "language_loss": 0.82200289, "learning_rate": 3.91993084968105e-06, "loss": 0.84378785, "num_input_tokens_seen": 42413160, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.70703125, "step": 1957, "time_per_iteration": 3.9153811931610107 }, { "auxiliary_loss_clip": 0.01099358, "auxiliary_loss_mlp": 0.01068539, "balance_loss_clip": 1.02011645, "balance_loss_mlp": 1.02493358, "epoch": 0.11772132872388397, "flos": 17783079897600.0, "grad_norm": 2.008533326891821, "language_loss": 0.81396335, "learning_rate": 3.919821717851428e-06, "loss": 0.83564234, "num_input_tokens_seen": 42432590, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.74609375, "step": 1958, "time_per_iteration": 2.547597646713257 }, { "auxiliary_loss_clip": 0.01097261, "auxiliary_loss_mlp": 0.01065679, "balance_loss_clip": 1.01840091, "balance_loss_mlp": 1.02544034, "epoch": 0.11778145197655193, "flos": 13216405818240.0, "grad_norm": 1.8823968293413278, "language_loss": 0.79106855, "learning_rate": 3.919712513221976e-06, "loss": 0.81269795, "num_input_tokens_seen": 42450135, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.71875, "step": 1959, "time_per_iteration": 2.4653778076171875 }, { "auxiliary_loss_clip": 0.01095478, "auxiliary_loss_mlp": 0.01069631, "balance_loss_clip": 1.02173281, "balance_loss_mlp": 1.02389824, "epoch": 0.1178415752292199, "flos": 20229164640000.0, "grad_norm": 1.9011261186504385, "language_loss": 0.72540903, "learning_rate": 3.919603235796832e-06, "loss": 0.74706012, "num_input_tokens_seen": 42470050, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.71484375, "step": 1960, "time_per_iteration": 2.467146873474121 }, { "auxiliary_loss_clip": 0.01101117, "auxiliary_loss_mlp": 0.01078166, "balance_loss_clip": 1.02945721, "balance_loss_mlp": 1.02518308, "epoch": 0.11790169848188788, "flos": 13039360980480.0, "grad_norm": 3.2353078315259984, "language_loss": 0.83770704, "learning_rate": 3.9194938855801406e-06, "loss": 0.85949981, "num_input_tokens_seen": 42484335, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.7578125, "step": 1961, "time_per_iteration": 2.3909335136413574 }, { "auxiliary_loss_clip": 0.01094421, "auxiliary_loss_mlp": 0.01075894, "balance_loss_clip": 1.02811527, "balance_loss_mlp": 1.02373433, "epoch": 0.11796182173455584, "flos": 22264845816960.0, "grad_norm": 1.764938619687065, "language_loss": 0.94603556, "learning_rate": 3.919384462576049e-06, "loss": 0.96773869, "num_input_tokens_seen": 42502720, "router_z_loss_clip": 0.47851562, "router_z_loss_mlp": 0.70703125, "step": 1962, "time_per_iteration": 2.4402852058410645 }, { "auxiliary_loss_clip": 0.01098698, "auxiliary_loss_mlp": 0.01078368, "balance_loss_clip": 1.03151906, "balance_loss_mlp": 1.02517033, "epoch": 0.1180219449872238, "flos": 10634229129600.0, "grad_norm": 2.2805685847614856, "language_loss": 0.90378129, "learning_rate": 3.919274966788707e-06, "loss": 0.92555201, "num_input_tokens_seen": 42519460, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 0.734375, "step": 1963, "time_per_iteration": 2.397088050842285 }, { "auxiliary_loss_clip": 0.01103203, "auxiliary_loss_mlp": 0.01077506, "balance_loss_clip": 1.03015637, "balance_loss_mlp": 1.02593577, "epoch": 0.11808206823989177, "flos": 20922469885440.0, "grad_norm": 1.8935345525425222, "language_loss": 0.86051631, "learning_rate": 3.919165398222265e-06, "loss": 0.88232338, "num_input_tokens_seen": 42539420, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.7734375, "step": 1964, "time_per_iteration": 2.473780632019043 }, { "auxiliary_loss_clip": 0.01098289, "auxiliary_loss_mlp": 0.01071061, "balance_loss_clip": 1.02538013, "balance_loss_mlp": 1.02582359, "epoch": 0.11814219149255975, "flos": 20776707492480.0, "grad_norm": 2.0143657817907665, "language_loss": 0.85223615, "learning_rate": 3.919055756880879e-06, "loss": 0.87392962, "num_input_tokens_seen": 42558225, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.7265625, "step": 1965, "time_per_iteration": 2.431307792663574 }, { "auxiliary_loss_clip": 0.01098774, "auxiliary_loss_mlp": 0.01080247, "balance_loss_clip": 1.03172922, "balance_loss_mlp": 1.02437353, "epoch": 0.11820231474522772, "flos": 48758162572800.0, "grad_norm": 1.6564758975121856, "language_loss": 0.76597512, "learning_rate": 3.918946042768707e-06, "loss": 0.78776532, "num_input_tokens_seen": 42580790, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.74609375, "step": 1966, "time_per_iteration": 2.6666133403778076 }, { "auxiliary_loss_clip": 0.01102013, "auxiliary_loss_mlp": 0.01089877, "balance_loss_clip": 1.04131079, "balance_loss_mlp": 1.02631295, "epoch": 0.11826243799789568, "flos": 16689669937920.0, "grad_norm": 2.23096624147491, "language_loss": 0.74836242, "learning_rate": 3.918836255889908e-06, "loss": 0.77028131, "num_input_tokens_seen": 42597355, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.7578125, "step": 1967, "time_per_iteration": 2.401096820831299 }, { "auxiliary_loss_clip": 0.01095578, "auxiliary_loss_mlp": 0.01074592, "balance_loss_clip": 1.02581167, "balance_loss_mlp": 1.02498949, "epoch": 0.11832256125056366, "flos": 16908924476160.0, "grad_norm": 2.2520420532733962, "language_loss": 0.9109596, "learning_rate": 3.9187263962486456e-06, "loss": 0.93266129, "num_input_tokens_seen": 42616060, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.703125, "step": 1968, "time_per_iteration": 2.4904839992523193 }, { "auxiliary_loss_clip": 0.01100343, "auxiliary_loss_mlp": 0.01076785, "balance_loss_clip": 1.03046072, "balance_loss_mlp": 1.02577674, "epoch": 0.11838268450323162, "flos": 22819301118720.0, "grad_norm": 1.75858492264819, "language_loss": 0.69124985, "learning_rate": 3.918616463849087e-06, "loss": 0.71302116, "num_input_tokens_seen": 42636285, "router_z_loss_clip": 0.46289062, "router_z_loss_mlp": 0.74609375, "step": 1969, "time_per_iteration": 2.440962076187134 }, { "auxiliary_loss_clip": 0.01098371, "auxiliary_loss_mlp": 0.0107058, "balance_loss_clip": 1.02156162, "balance_loss_mlp": 1.02679706, "epoch": 0.11844280775589959, "flos": 33544479939840.0, "grad_norm": 2.1169594305573227, "language_loss": 0.83471286, "learning_rate": 3.918506458695399e-06, "loss": 0.85640246, "num_input_tokens_seen": 42658320, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 0.71484375, "step": 1970, "time_per_iteration": 2.54463529586792 }, { "auxiliary_loss_clip": 0.01027703, "auxiliary_loss_mlp": 0.0100801, "balance_loss_clip": 1.00195396, "balance_loss_mlp": 1.00787377, "epoch": 0.11850293100856757, "flos": 66347577959040.0, "grad_norm": 0.8082849103206722, "language_loss": 0.66209161, "learning_rate": 3.918396380791754e-06, "loss": 0.68244874, "num_input_tokens_seen": 42721500, "router_z_loss_clip": 0.06054688, "router_z_loss_mlp": 0.19921875, "step": 1971, "time_per_iteration": 3.0194623470306396 }, { "auxiliary_loss_clip": 0.01099981, "auxiliary_loss_mlp": 0.01068902, "balance_loss_clip": 1.02140939, "balance_loss_mlp": 1.02438641, "epoch": 0.11856305426123553, "flos": 24679892494080.0, "grad_norm": 2.826370033863772, "language_loss": 0.81743836, "learning_rate": 3.918286230142327e-06, "loss": 0.83912718, "num_input_tokens_seen": 42739825, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.7578125, "step": 1972, "time_per_iteration": 2.431324005126953 }, { "auxiliary_loss_clip": 0.01095262, "auxiliary_loss_mlp": 0.01076755, "balance_loss_clip": 1.02837968, "balance_loss_mlp": 1.02409101, "epoch": 0.1186231775139035, "flos": 24278949907200.0, "grad_norm": 2.212271199992865, "language_loss": 0.74296468, "learning_rate": 3.918176006751292e-06, "loss": 0.76468486, "num_input_tokens_seen": 42758695, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.7109375, "step": 1973, "time_per_iteration": 2.4557363986968994 }, { "auxiliary_loss_clip": 0.01093986, "auxiliary_loss_mlp": 0.01074244, "balance_loss_clip": 1.02453411, "balance_loss_mlp": 1.02348924, "epoch": 0.11868330076657148, "flos": 21756475376640.0, "grad_norm": 1.590519927574492, "language_loss": 0.74300444, "learning_rate": 3.918065710622832e-06, "loss": 0.76468676, "num_input_tokens_seen": 42778510, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.703125, "step": 1974, "time_per_iteration": 2.4105241298675537 }, { "auxiliary_loss_clip": 0.01098038, "auxiliary_loss_mlp": 0.01075033, "balance_loss_clip": 1.02193689, "balance_loss_mlp": 1.02419257, "epoch": 0.11874342401923944, "flos": 17192559294720.0, "grad_norm": 2.522444835456934, "language_loss": 0.80826199, "learning_rate": 3.917955341761128e-06, "loss": 0.82999265, "num_input_tokens_seen": 42793995, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.73828125, "step": 1975, "time_per_iteration": 2.3541975021362305 }, { "auxiliary_loss_clip": 0.01098601, "auxiliary_loss_mlp": 0.01072573, "balance_loss_clip": 1.02744079, "balance_loss_mlp": 1.02585196, "epoch": 0.11880354727190741, "flos": 15228729429120.0, "grad_norm": 2.9093295250655937, "language_loss": 0.78025293, "learning_rate": 3.917844900170364e-06, "loss": 0.80196464, "num_input_tokens_seen": 42809000, "router_z_loss_clip": 0.45117188, "router_z_loss_mlp": 0.7265625, "step": 1976, "time_per_iteration": 2.3779454231262207 }, { "auxiliary_loss_clip": 0.01099631, "auxiliary_loss_mlp": 0.01075919, "balance_loss_clip": 1.02363396, "balance_loss_mlp": 1.02605879, "epoch": 0.11886367052457537, "flos": 27308433335040.0, "grad_norm": 1.5022591072757743, "language_loss": 0.76572037, "learning_rate": 3.91773438585473e-06, "loss": 0.78747582, "num_input_tokens_seen": 42831585, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.734375, "step": 1977, "time_per_iteration": 2.451704740524292 }, { "auxiliary_loss_clip": 0.01102803, "auxiliary_loss_mlp": 0.01081234, "balance_loss_clip": 1.02749419, "balance_loss_mlp": 1.02580285, "epoch": 0.11892379377724335, "flos": 21797218800000.0, "grad_norm": 2.2810410563787453, "language_loss": 0.76764417, "learning_rate": 3.9176237988184165e-06, "loss": 0.7894845, "num_input_tokens_seen": 42848420, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.7734375, "step": 1978, "time_per_iteration": 2.420731544494629 }, { "auxiliary_loss_clip": 0.01098206, "auxiliary_loss_mlp": 0.01068732, "balance_loss_clip": 1.0235281, "balance_loss_mlp": 1.02554226, "epoch": 0.11898391702991132, "flos": 13990150569600.0, "grad_norm": 1.6703888744720572, "language_loss": 0.74642849, "learning_rate": 3.917513139065616e-06, "loss": 0.76809794, "num_input_tokens_seen": 42866645, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.7265625, "step": 1979, "time_per_iteration": 2.424344062805176 }, { "auxiliary_loss_clip": 0.01099319, "auxiliary_loss_mlp": 0.01070426, "balance_loss_clip": 1.02178907, "balance_loss_mlp": 1.02521837, "epoch": 0.11904404028257928, "flos": 32233176984960.0, "grad_norm": 1.6881998049047522, "language_loss": 1.00195682, "learning_rate": 3.917402406600525e-06, "loss": 1.02365422, "num_input_tokens_seen": 42888515, "router_z_loss_clip": 0.48632812, "router_z_loss_mlp": 0.7421875, "step": 1980, "time_per_iteration": 2.5174827575683594 }, { "auxiliary_loss_clip": 0.01104547, "auxiliary_loss_mlp": 0.01081701, "balance_loss_clip": 1.02946401, "balance_loss_mlp": 1.02820432, "epoch": 0.11910416353524726, "flos": 23585155902720.0, "grad_norm": 1.6600414445236873, "language_loss": 0.87702686, "learning_rate": 3.917291601427342e-06, "loss": 0.89888936, "num_input_tokens_seen": 42909035, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.765625, "step": 1981, "time_per_iteration": 2.4694221019744873 }, { "auxiliary_loss_clip": 0.0110244, "auxiliary_loss_mlp": 0.01071673, "balance_loss_clip": 1.01905417, "balance_loss_mlp": 1.02628779, "epoch": 0.11916428678791523, "flos": 25332000468480.0, "grad_norm": 1.8443650429948728, "language_loss": 0.87343848, "learning_rate": 3.91718072355027e-06, "loss": 0.89517963, "num_input_tokens_seen": 42927555, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.76171875, "step": 1982, "time_per_iteration": 2.4658966064453125 }, { "auxiliary_loss_clip": 0.01097523, "auxiliary_loss_mlp": 0.01065145, "balance_loss_clip": 1.01674581, "balance_loss_mlp": 1.02529323, "epoch": 0.11922441004058319, "flos": 19787513541120.0, "grad_norm": 1.7884512626553415, "language_loss": 0.86241335, "learning_rate": 3.917069772973513e-06, "loss": 0.88404, "num_input_tokens_seen": 42945300, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.71875, "step": 1983, "time_per_iteration": 2.503352403640747 }, { "auxiliary_loss_clip": 0.01104388, "auxiliary_loss_mlp": 0.0107926, "balance_loss_clip": 1.02263546, "balance_loss_mlp": 1.02738404, "epoch": 0.11928453329325117, "flos": 21535475270400.0, "grad_norm": 2.933758828510656, "language_loss": 0.80892384, "learning_rate": 3.916958749701277e-06, "loss": 0.83076036, "num_input_tokens_seen": 42961295, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.76953125, "step": 1984, "time_per_iteration": 2.4047012329101562 }, { "auxiliary_loss_clip": 0.01101427, "auxiliary_loss_mlp": 0.01069592, "balance_loss_clip": 1.02250385, "balance_loss_mlp": 1.02641273, "epoch": 0.11934465654591914, "flos": 20813924868480.0, "grad_norm": 1.8658768617307708, "language_loss": 0.85052133, "learning_rate": 3.9168476537377745e-06, "loss": 0.8722316, "num_input_tokens_seen": 42980330, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 0.75, "step": 1985, "time_per_iteration": 2.4365234375 }, { "auxiliary_loss_clip": 0.01099644, "auxiliary_loss_mlp": 0.01078192, "balance_loss_clip": 1.02821922, "balance_loss_mlp": 1.02602315, "epoch": 0.1194047797985871, "flos": 19059539448960.0, "grad_norm": 1.9833492736668135, "language_loss": 0.7667771, "learning_rate": 3.916736485087216e-06, "loss": 0.7885555, "num_input_tokens_seen": 42996125, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.734375, "step": 1986, "time_per_iteration": 2.3927929401397705 }, { "auxiliary_loss_clip": 0.01102733, "auxiliary_loss_mlp": 0.01079752, "balance_loss_clip": 1.02739501, "balance_loss_mlp": 1.02693653, "epoch": 0.11946490305125507, "flos": 27189798935040.0, "grad_norm": 2.0307746150811474, "language_loss": 0.75117689, "learning_rate": 3.916625243753819e-06, "loss": 0.77300179, "num_input_tokens_seen": 43014180, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.7578125, "step": 1987, "time_per_iteration": 2.4350292682647705 }, { "auxiliary_loss_clip": 0.01100072, "auxiliary_loss_mlp": 0.01079398, "balance_loss_clip": 1.02839994, "balance_loss_mlp": 1.02607214, "epoch": 0.11952502630392305, "flos": 21139769387520.0, "grad_norm": 1.8618209477601615, "language_loss": 0.75083005, "learning_rate": 3.916513929741799e-06, "loss": 0.77262479, "num_input_tokens_seen": 43032120, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.73828125, "step": 1988, "time_per_iteration": 2.4165165424346924 }, { "auxiliary_loss_clip": 0.01101005, "auxiliary_loss_mlp": 0.01086501, "balance_loss_clip": 1.02901793, "balance_loss_mlp": 1.02569008, "epoch": 0.11958514955659101, "flos": 22123237875840.0, "grad_norm": 2.036405786914528, "language_loss": 0.83034456, "learning_rate": 3.91640254305538e-06, "loss": 0.85221964, "num_input_tokens_seen": 43052215, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.75390625, "step": 1989, "time_per_iteration": 2.403749465942383 }, { "auxiliary_loss_clip": 0.01099889, "auxiliary_loss_mlp": 0.01086348, "balance_loss_clip": 1.0322032, "balance_loss_mlp": 1.02468252, "epoch": 0.11964527280925898, "flos": 17420471850240.0, "grad_norm": 2.370735707533791, "language_loss": 0.78127801, "learning_rate": 3.916291083698784e-06, "loss": 0.80314028, "num_input_tokens_seen": 43069720, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.75, "step": 1990, "time_per_iteration": 2.4581000804901123 }, { "auxiliary_loss_clip": 0.01026246, "auxiliary_loss_mlp": 0.01017792, "balance_loss_clip": 1.01149738, "balance_loss_mlp": 1.00718284, "epoch": 0.11970539606192696, "flos": 70676564302080.0, "grad_norm": 0.8745206723342456, "language_loss": 0.55380261, "learning_rate": 3.916179551676238e-06, "loss": 0.57424301, "num_input_tokens_seen": 43123130, "router_z_loss_clip": 0.06298828, "router_z_loss_mlp": 0.19140625, "step": 1991, "time_per_iteration": 3.0395352840423584 }, { "auxiliary_loss_clip": 0.01096793, "auxiliary_loss_mlp": 0.01064705, "balance_loss_clip": 1.0157572, "balance_loss_mlp": 1.02505863, "epoch": 0.11976551931459492, "flos": 21213959760000.0, "grad_norm": 2.65704373762825, "language_loss": 0.80542469, "learning_rate": 3.916067946991971e-06, "loss": 0.8270396, "num_input_tokens_seen": 43140015, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 0.71875, "step": 1992, "time_per_iteration": 3.9649112224578857 }, { "auxiliary_loss_clip": 0.01100403, "auxiliary_loss_mlp": 0.01076277, "balance_loss_clip": 1.02408671, "balance_loss_mlp": 1.02408791, "epoch": 0.11982564256726289, "flos": 25988262894720.0, "grad_norm": 1.9484625021473028, "language_loss": 0.80804837, "learning_rate": 3.915956269650216e-06, "loss": 0.82981521, "num_input_tokens_seen": 43160105, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.765625, "step": 1993, "time_per_iteration": 2.4504408836364746 }, { "auxiliary_loss_clip": 0.01098994, "auxiliary_loss_mlp": 0.01082082, "balance_loss_clip": 1.0281992, "balance_loss_mlp": 1.02334023, "epoch": 0.11988576581993086, "flos": 21649850484480.0, "grad_norm": 1.819952999341646, "language_loss": 0.84539223, "learning_rate": 3.915844519655208e-06, "loss": 0.86720294, "num_input_tokens_seen": 43179835, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.7578125, "step": 1994, "time_per_iteration": 3.8368923664093018 }, { "auxiliary_loss_clip": 0.01097663, "auxiliary_loss_mlp": 0.01073072, "balance_loss_clip": 1.02631783, "balance_loss_mlp": 1.02424538, "epoch": 0.11994588907259883, "flos": 17856467308800.0, "grad_norm": 2.053111139127266, "language_loss": 0.91079462, "learning_rate": 3.915732697011183e-06, "loss": 0.93250197, "num_input_tokens_seen": 43197210, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 0.734375, "step": 1995, "time_per_iteration": 3.7944447994232178 }, { "auxiliary_loss_clip": 0.01101169, "auxiliary_loss_mlp": 0.01085154, "balance_loss_clip": 1.03265452, "balance_loss_mlp": 1.02711642, "epoch": 0.1200060123252668, "flos": 24461580562560.0, "grad_norm": 2.1947525367916465, "language_loss": 0.76161927, "learning_rate": 3.9156208017223825e-06, "loss": 0.78348249, "num_input_tokens_seen": 43215050, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.7421875, "step": 1996, "time_per_iteration": 3.8567821979522705 }, { "auxiliary_loss_clip": 0.01102286, "auxiliary_loss_mlp": 0.01082867, "balance_loss_clip": 1.02822185, "balance_loss_mlp": 1.02760184, "epoch": 0.12006613557793476, "flos": 18731251134720.0, "grad_norm": 1.8991834544527353, "language_loss": 0.89453328, "learning_rate": 3.915508833793048e-06, "loss": 0.91638482, "num_input_tokens_seen": 43233900, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.74609375, "step": 1997, "time_per_iteration": 2.374429941177368 }, { "auxiliary_loss_clip": 0.01101032, "auxiliary_loss_mlp": 0.01083592, "balance_loss_clip": 1.02925682, "balance_loss_mlp": 1.02612877, "epoch": 0.12012625883060274, "flos": 22266800853120.0, "grad_norm": 1.7461969063410048, "language_loss": 0.80896258, "learning_rate": 3.915396793227428e-06, "loss": 0.83080888, "num_input_tokens_seen": 43252105, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.75, "step": 1998, "time_per_iteration": 2.4020864963531494 }, { "auxiliary_loss_clip": 0.01101154, "auxiliary_loss_mlp": 0.01080212, "balance_loss_clip": 1.02866566, "balance_loss_mlp": 1.02690041, "epoch": 0.1201863820832707, "flos": 21757906742400.0, "grad_norm": 1.5485039456893595, "language_loss": 0.73996842, "learning_rate": 3.915284680029769e-06, "loss": 0.76178205, "num_input_tokens_seen": 43270315, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.7421875, "step": 1999, "time_per_iteration": 2.4123663902282715 }, { "auxiliary_loss_clip": 0.01104924, "auxiliary_loss_mlp": 0.01077225, "balance_loss_clip": 1.02603638, "balance_loss_mlp": 1.02838016, "epoch": 0.12024650533593867, "flos": 21906915891840.0, "grad_norm": 2.0148322726718813, "language_loss": 0.77953172, "learning_rate": 3.915172494204323e-06, "loss": 0.80135322, "num_input_tokens_seen": 43289935, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.765625, "step": 2000, "time_per_iteration": 2.4312736988067627 }, { "auxiliary_loss_clip": 0.01100827, "auxiliary_loss_mlp": 0.01077749, "balance_loss_clip": 1.02386618, "balance_loss_mlp": 1.02605891, "epoch": 0.12030662858860665, "flos": 21688150112640.0, "grad_norm": 1.5357069174431184, "language_loss": 0.87022698, "learning_rate": 3.915060235755344e-06, "loss": 0.89201272, "num_input_tokens_seen": 43309325, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.75, "step": 2001, "time_per_iteration": 2.4093170166015625 }, { "auxiliary_loss_clip": 0.01102482, "auxiliary_loss_mlp": 0.01074914, "balance_loss_clip": 1.0234158, "balance_loss_mlp": 1.02663589, "epoch": 0.12036675184127461, "flos": 12932386974720.0, "grad_norm": 2.1521722000856673, "language_loss": 0.77409619, "learning_rate": 3.91494790468709e-06, "loss": 0.79587018, "num_input_tokens_seen": 43327010, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.7578125, "step": 2002, "time_per_iteration": 2.3760640621185303 }, { "auxiliary_loss_clip": 0.01104877, "auxiliary_loss_mlp": 0.01078146, "balance_loss_clip": 1.0212357, "balance_loss_mlp": 1.02616906, "epoch": 0.12042687509394258, "flos": 20849955258240.0, "grad_norm": 1.9742932210525594, "language_loss": 0.80461574, "learning_rate": 3.9148355010038185e-06, "loss": 0.82644594, "num_input_tokens_seen": 43345650, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.7890625, "step": 2003, "time_per_iteration": 2.3926310539245605 }, { "auxiliary_loss_clip": 0.01100392, "auxiliary_loss_mlp": 0.01072534, "balance_loss_clip": 1.02048755, "balance_loss_mlp": 1.02594829, "epoch": 0.12048699834661056, "flos": 23877378915840.0, "grad_norm": 1.7516043441517906, "language_loss": 0.73556221, "learning_rate": 3.914723024709793e-06, "loss": 0.75729144, "num_input_tokens_seen": 43365555, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.74609375, "step": 2004, "time_per_iteration": 2.4246857166290283 }, { "auxiliary_loss_clip": 0.01100664, "auxiliary_loss_mlp": 0.01077836, "balance_loss_clip": 1.022475, "balance_loss_mlp": 1.02415192, "epoch": 0.12054712159927852, "flos": 19755323400960.0, "grad_norm": 1.5278522372437715, "language_loss": 0.8042835, "learning_rate": 3.914610475809279e-06, "loss": 0.82606846, "num_input_tokens_seen": 43384990, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.765625, "step": 2005, "time_per_iteration": 2.388758897781372 }, { "auxiliary_loss_clip": 0.0102637, "auxiliary_loss_mlp": 0.01011751, "balance_loss_clip": 1.00521803, "balance_loss_mlp": 1.00709653, "epoch": 0.12060724485194649, "flos": 51670057075200.0, "grad_norm": 0.9355256713301027, "language_loss": 0.58146429, "learning_rate": 3.914497854306543e-06, "loss": 0.6018455, "num_input_tokens_seen": 43436335, "router_z_loss_clip": 0.06542969, "router_z_loss_mlp": 0.19335938, "step": 2006, "time_per_iteration": 2.8250136375427246 }, { "auxiliary_loss_clip": 0.01097827, "auxiliary_loss_mlp": 0.01072638, "balance_loss_clip": 1.02268934, "balance_loss_mlp": 1.02444279, "epoch": 0.12066736810461445, "flos": 18989398794240.0, "grad_norm": 1.9284007458093175, "language_loss": 0.78901267, "learning_rate": 3.9143851602058575e-06, "loss": 0.81071734, "num_input_tokens_seen": 43456495, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.734375, "step": 2007, "time_per_iteration": 2.413078546524048 }, { "auxiliary_loss_clip": 0.01100918, "auxiliary_loss_mlp": 0.01079794, "balance_loss_clip": 1.02507722, "balance_loss_mlp": 1.02491093, "epoch": 0.12072749135728243, "flos": 16471043804160.0, "grad_norm": 2.750731650383216, "language_loss": 0.8665058, "learning_rate": 3.914272393511494e-06, "loss": 0.88831294, "num_input_tokens_seen": 43473085, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.7578125, "step": 2008, "time_per_iteration": 2.42895770072937 }, { "auxiliary_loss_clip": 0.01097345, "auxiliary_loss_mlp": 0.01078803, "balance_loss_clip": 1.02804422, "balance_loss_mlp": 1.02364898, "epoch": 0.1207876146099504, "flos": 18076140783360.0, "grad_norm": 1.8207046029444316, "language_loss": 0.86446536, "learning_rate": 3.91415955422773e-06, "loss": 0.88622683, "num_input_tokens_seen": 43491135, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.73828125, "step": 2009, "time_per_iteration": 2.393139362335205 }, { "auxiliary_loss_clip": 0.01099216, "auxiliary_loss_mlp": 0.01082942, "balance_loss_clip": 1.03122866, "balance_loss_mlp": 1.02429438, "epoch": 0.12084773786261836, "flos": 21870501477120.0, "grad_norm": 1.947446039372563, "language_loss": 0.8604666, "learning_rate": 3.914046642358844e-06, "loss": 0.88228816, "num_input_tokens_seen": 43510440, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.75, "step": 2010, "time_per_iteration": 2.393658399581909 }, { "auxiliary_loss_clip": 0.01101846, "auxiliary_loss_mlp": 0.01081283, "balance_loss_clip": 1.02799702, "balance_loss_mlp": 1.0261147, "epoch": 0.12090786111528634, "flos": 18332054115840.0, "grad_norm": 1.7712058122923342, "language_loss": 0.85864633, "learning_rate": 3.9139336579091174e-06, "loss": 0.88047767, "num_input_tokens_seen": 43530145, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.7578125, "step": 2011, "time_per_iteration": 2.373638153076172 }, { "auxiliary_loss_clip": 0.01100197, "auxiliary_loss_mlp": 0.01084869, "balance_loss_clip": 1.03391862, "balance_loss_mlp": 1.02493131, "epoch": 0.1209679843679543, "flos": 21104786338560.0, "grad_norm": 1.9122767260484372, "language_loss": 0.98636723, "learning_rate": 3.913820600882834e-06, "loss": 1.00821793, "num_input_tokens_seen": 43549315, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.75, "step": 2012, "time_per_iteration": 2.404940605163574 }, { "auxiliary_loss_clip": 0.01098806, "auxiliary_loss_mlp": 0.0107676, "balance_loss_clip": 1.02457047, "balance_loss_mlp": 1.02681601, "epoch": 0.12102810762062227, "flos": 29239793769600.0, "grad_norm": 2.132585782696584, "language_loss": 0.81927824, "learning_rate": 3.913707471284283e-06, "loss": 0.84103394, "num_input_tokens_seen": 43569240, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.71875, "step": 2013, "time_per_iteration": 2.4583590030670166 }, { "auxiliary_loss_clip": 0.01100676, "auxiliary_loss_mlp": 0.01073223, "balance_loss_clip": 1.02065194, "balance_loss_mlp": 1.02498317, "epoch": 0.12108823087329025, "flos": 17929749985920.0, "grad_norm": 2.6129780929695943, "language_loss": 0.79474306, "learning_rate": 3.9135942691177515e-06, "loss": 0.81648207, "num_input_tokens_seen": 43587710, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.7578125, "step": 2014, "time_per_iteration": 2.3796372413635254 }, { "auxiliary_loss_clip": 0.01101031, "auxiliary_loss_mlp": 0.01077186, "balance_loss_clip": 1.02411413, "balance_loss_mlp": 1.02689409, "epoch": 0.12114835412595822, "flos": 22090733533440.0, "grad_norm": 6.252650995932334, "language_loss": 0.8912667, "learning_rate": 3.913480994387535e-06, "loss": 0.91304892, "num_input_tokens_seen": 43606000, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.7421875, "step": 2015, "time_per_iteration": 2.396007537841797 }, { "auxiliary_loss_clip": 0.01094611, "auxiliary_loss_mlp": 0.01072568, "balance_loss_clip": 1.02278614, "balance_loss_mlp": 1.02302539, "epoch": 0.12120847737862618, "flos": 20411306536320.0, "grad_norm": 2.158719660804708, "language_loss": 0.72802806, "learning_rate": 3.913367647097926e-06, "loss": 0.74969989, "num_input_tokens_seen": 43624815, "router_z_loss_clip": 0.49804688, "router_z_loss_mlp": 0.71484375, "step": 2016, "time_per_iteration": 2.389314651489258 }, { "auxiliary_loss_clip": 0.01103517, "auxiliary_loss_mlp": 0.0107349, "balance_loss_clip": 1.01681769, "balance_loss_mlp": 1.02737439, "epoch": 0.12126860063129415, "flos": 22307963212800.0, "grad_norm": 2.4298115875707773, "language_loss": 0.82641542, "learning_rate": 3.913254227253225e-06, "loss": 0.84818554, "num_input_tokens_seen": 43643960, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.76171875, "step": 2017, "time_per_iteration": 2.407562017440796 }, { "auxiliary_loss_clip": 0.01102521, "auxiliary_loss_mlp": 0.01069905, "balance_loss_clip": 1.01835895, "balance_loss_mlp": 1.02759886, "epoch": 0.12132872388396213, "flos": 13698416315520.0, "grad_norm": 2.5426549681984727, "language_loss": 0.72382724, "learning_rate": 3.913140734857731e-06, "loss": 0.74555147, "num_input_tokens_seen": 43662650, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.75, "step": 2018, "time_per_iteration": 2.3722572326660156 }, { "auxiliary_loss_clip": 0.01098996, "auxiliary_loss_mlp": 0.01073024, "balance_loss_clip": 1.02419543, "balance_loss_mlp": 1.02634835, "epoch": 0.12138884713663009, "flos": 26465804737920.0, "grad_norm": 1.6765216002555856, "language_loss": 0.73807836, "learning_rate": 3.91302716991575e-06, "loss": 0.75979853, "num_input_tokens_seen": 43684205, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.7265625, "step": 2019, "time_per_iteration": 2.4477133750915527 }, { "auxiliary_loss_clip": 0.01098821, "auxiliary_loss_mlp": 0.01073841, "balance_loss_clip": 1.02348661, "balance_loss_mlp": 1.02545166, "epoch": 0.12144897038929806, "flos": 26140379155200.0, "grad_norm": 1.6391196921058766, "language_loss": 0.94323379, "learning_rate": 3.912913532431586e-06, "loss": 0.96496046, "num_input_tokens_seen": 43706320, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.734375, "step": 2020, "time_per_iteration": 2.4295859336853027 }, { "auxiliary_loss_clip": 0.01101166, "auxiliary_loss_mlp": 0.01068424, "balance_loss_clip": 1.02155137, "balance_loss_mlp": 1.02704287, "epoch": 0.12150909364196603, "flos": 24716376731520.0, "grad_norm": 3.016657971970353, "language_loss": 0.80003452, "learning_rate": 3.912799822409549e-06, "loss": 0.82173038, "num_input_tokens_seen": 43724805, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 0.7421875, "step": 2021, "time_per_iteration": 2.4687986373901367 }, { "auxiliary_loss_clip": 0.01097673, "auxiliary_loss_mlp": 0.01071001, "balance_loss_clip": 1.02634466, "balance_loss_mlp": 1.02644575, "epoch": 0.121569216894634, "flos": 25185958784640.0, "grad_norm": 1.992559760619375, "language_loss": 0.82123077, "learning_rate": 3.912686039853952e-06, "loss": 0.84291744, "num_input_tokens_seen": 43742320, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.7109375, "step": 2022, "time_per_iteration": 2.4219000339508057 }, { "auxiliary_loss_clip": 0.01100362, "auxiliary_loss_mlp": 0.01071895, "balance_loss_clip": 1.0240922, "balance_loss_mlp": 1.02610743, "epoch": 0.12162934014730196, "flos": 13443236121600.0, "grad_norm": 1.7304771420679124, "language_loss": 0.87428689, "learning_rate": 3.912572184769108e-06, "loss": 0.8960095, "num_input_tokens_seen": 43760665, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.7421875, "step": 2023, "time_per_iteration": 2.380903482437134 }, { "auxiliary_loss_clip": 0.0109664, "auxiliary_loss_mlp": 0.01072186, "balance_loss_clip": 1.02314281, "balance_loss_mlp": 1.02475595, "epoch": 0.12168946339996994, "flos": 16945199245440.0, "grad_norm": 2.24343236522217, "language_loss": 0.88681155, "learning_rate": 3.912458257159335e-06, "loss": 0.90849984, "num_input_tokens_seen": 43779020, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 0.71875, "step": 2024, "time_per_iteration": 2.3560869693756104 }, { "auxiliary_loss_clip": 0.01096249, "auxiliary_loss_mlp": 0.01074245, "balance_loss_clip": 1.02524996, "balance_loss_mlp": 1.02344918, "epoch": 0.12174958665263791, "flos": 29820399546240.0, "grad_norm": 1.9362419442833214, "language_loss": 0.73329562, "learning_rate": 3.912344257028954e-06, "loss": 0.75500059, "num_input_tokens_seen": 43798850, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.7265625, "step": 2025, "time_per_iteration": 2.4567461013793945 }, { "auxiliary_loss_clip": 0.01098404, "auxiliary_loss_mlp": 0.0106188, "balance_loss_clip": 1.01662803, "balance_loss_mlp": 1.0265224, "epoch": 0.12180970990530587, "flos": 24640824816000.0, "grad_norm": 1.7373914318618429, "language_loss": 0.77455968, "learning_rate": 3.912230184382286e-06, "loss": 0.79616255, "num_input_tokens_seen": 43820130, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.71875, "step": 2026, "time_per_iteration": 2.4456303119659424 }, { "auxiliary_loss_clip": 0.0109705, "auxiliary_loss_mlp": 0.01073019, "balance_loss_clip": 1.02335608, "balance_loss_mlp": 1.02471697, "epoch": 0.12186983315797385, "flos": 20520654514560.0, "grad_norm": 2.0902975911179387, "language_loss": 0.91106635, "learning_rate": 3.912116039223659e-06, "loss": 0.93276703, "num_input_tokens_seen": 43838485, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.72265625, "step": 2027, "time_per_iteration": 2.3931589126586914 }, { "auxiliary_loss_clip": 0.01094783, "auxiliary_loss_mlp": 0.01059222, "balance_loss_clip": 1.01554406, "balance_loss_mlp": 1.02400589, "epoch": 0.12192995641064182, "flos": 27817117977600.0, "grad_norm": 1.727760463443916, "language_loss": 0.77637988, "learning_rate": 3.912001821557399e-06, "loss": 0.79791999, "num_input_tokens_seen": 43859080, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.70703125, "step": 2028, "time_per_iteration": 2.442688465118408 }, { "auxiliary_loss_clip": 0.01097496, "auxiliary_loss_mlp": 0.01071325, "balance_loss_clip": 1.02337885, "balance_loss_mlp": 1.02546561, "epoch": 0.12199007966330978, "flos": 22016054401920.0, "grad_norm": 2.158207453716294, "language_loss": 0.78875971, "learning_rate": 3.911887531387839e-06, "loss": 0.81044793, "num_input_tokens_seen": 43879030, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.71875, "step": 2029, "time_per_iteration": 2.424067735671997 }, { "auxiliary_loss_clip": 0.0109384, "auxiliary_loss_mlp": 0.01062802, "balance_loss_clip": 1.01731169, "balance_loss_mlp": 1.02459514, "epoch": 0.12205020291597775, "flos": 23294084964480.0, "grad_norm": 1.821161545617599, "language_loss": 0.81320238, "learning_rate": 3.911773168719313e-06, "loss": 0.83476877, "num_input_tokens_seen": 43898505, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 0.69140625, "step": 2030, "time_per_iteration": 2.52874493598938 }, { "auxiliary_loss_clip": 0.01094847, "auxiliary_loss_mlp": 0.01065589, "balance_loss_clip": 1.02040911, "balance_loss_mlp": 1.02542949, "epoch": 0.12211032616864573, "flos": 26030402772480.0, "grad_norm": 2.1154043574589516, "language_loss": 0.77602094, "learning_rate": 3.911658733556155e-06, "loss": 0.7976253, "num_input_tokens_seen": 43917945, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.6953125, "step": 2031, "time_per_iteration": 2.462101936340332 }, { "auxiliary_loss_clip": 0.01094894, "auxiliary_loss_mlp": 0.0106596, "balance_loss_clip": 1.0229013, "balance_loss_mlp": 1.02499247, "epoch": 0.12217044942131369, "flos": 20409944993280.0, "grad_norm": 1.777755454066369, "language_loss": 0.76476181, "learning_rate": 3.911544225902707e-06, "loss": 0.78637034, "num_input_tokens_seen": 43937385, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.6953125, "step": 2032, "time_per_iteration": 3.8741612434387207 }, { "auxiliary_loss_clip": 0.01090749, "auxiliary_loss_mlp": 0.01055686, "balance_loss_clip": 1.01880288, "balance_loss_mlp": 1.02480173, "epoch": 0.12223057267398166, "flos": 22856029735680.0, "grad_norm": 1.5465644651536612, "language_loss": 0.90926754, "learning_rate": 3.911429645763311e-06, "loss": 0.93073189, "num_input_tokens_seen": 43958130, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.66015625, "step": 2033, "time_per_iteration": 2.4857101440429688 }, { "auxiliary_loss_clip": 0.0109937, "auxiliary_loss_mlp": 0.01065909, "balance_loss_clip": 1.02001321, "balance_loss_mlp": 1.02723336, "epoch": 0.12229069592664964, "flos": 20046533984640.0, "grad_norm": 2.2788883366632278, "language_loss": 0.67856073, "learning_rate": 3.911314993142311e-06, "loss": 0.70021349, "num_input_tokens_seen": 43976800, "router_z_loss_clip": 0.45898438, "router_z_loss_mlp": 0.72265625, "step": 2034, "time_per_iteration": 3.8004846572875977 }, { "auxiliary_loss_clip": 0.01095609, "auxiliary_loss_mlp": 0.01061323, "balance_loss_clip": 1.01933777, "balance_loss_mlp": 1.02516866, "epoch": 0.1223508191793176, "flos": 22273119809280.0, "grad_norm": 2.112112280464022, "language_loss": 0.78143543, "learning_rate": 3.911200268044055e-06, "loss": 0.80300474, "num_input_tokens_seen": 43996620, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 0.703125, "step": 2035, "time_per_iteration": 5.227928876876831 }, { "auxiliary_loss_clip": 0.01097873, "auxiliary_loss_mlp": 0.01070365, "balance_loss_clip": 1.02473176, "balance_loss_mlp": 1.026021, "epoch": 0.12241094243198557, "flos": 21284973198720.0, "grad_norm": 2.1026026932902924, "language_loss": 0.73358333, "learning_rate": 3.911085470472892e-06, "loss": 0.75526571, "num_input_tokens_seen": 44016175, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.71875, "step": 2036, "time_per_iteration": 2.43646240234375 }, { "auxiliary_loss_clip": 0.0109354, "auxiliary_loss_mlp": 0.01063849, "balance_loss_clip": 1.01919365, "balance_loss_mlp": 1.0246433, "epoch": 0.12247106568465355, "flos": 17381473994880.0, "grad_norm": 1.6312314502071057, "language_loss": 0.84441555, "learning_rate": 3.910970600433178e-06, "loss": 0.86598945, "num_input_tokens_seen": 44035060, "router_z_loss_clip": 0.44726562, "router_z_loss_mlp": 0.6875, "step": 2037, "time_per_iteration": 2.403738260269165 }, { "auxiliary_loss_clip": 0.01097735, "auxiliary_loss_mlp": 0.01068362, "balance_loss_clip": 1.02234674, "balance_loss_mlp": 1.0257194, "epoch": 0.12253118893732151, "flos": 27044420567040.0, "grad_norm": 2.7378280757705764, "language_loss": 0.83843458, "learning_rate": 3.910855657929267e-06, "loss": 0.8600955, "num_input_tokens_seen": 44053330, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.72265625, "step": 2038, "time_per_iteration": 2.4489896297454834 }, { "auxiliary_loss_clip": 0.0102724, "auxiliary_loss_mlp": 0.01014643, "balance_loss_clip": 1.00932586, "balance_loss_mlp": 1.01020432, "epoch": 0.12259131218998948, "flos": 53858762208000.0, "grad_norm": 0.8288294661885447, "language_loss": 0.58852047, "learning_rate": 3.910740642965518e-06, "loss": 0.60893929, "num_input_tokens_seen": 44107575, "router_z_loss_clip": 0.05322266, "router_z_loss_mlp": 0.16992188, "step": 2039, "time_per_iteration": 2.899082660675049 }, { "auxiliary_loss_clip": 0.01095161, "auxiliary_loss_mlp": 0.01065488, "balance_loss_clip": 1.01856744, "balance_loss_mlp": 1.0251404, "epoch": 0.12265143544265744, "flos": 17891031421440.0, "grad_norm": 3.475170532777588, "language_loss": 0.83230424, "learning_rate": 3.910625555546292e-06, "loss": 0.85391068, "num_input_tokens_seen": 44126075, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 0.69921875, "step": 2040, "time_per_iteration": 2.396820306777954 }, { "auxiliary_loss_clip": 0.01094307, "auxiliary_loss_mlp": 0.01059766, "balance_loss_clip": 1.01580119, "balance_loss_mlp": 1.02418733, "epoch": 0.12271155869532542, "flos": 21798824722560.0, "grad_norm": 1.8250902053084934, "language_loss": 0.84842402, "learning_rate": 3.910510395675953e-06, "loss": 0.86996472, "num_input_tokens_seen": 44145605, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 0.69921875, "step": 2041, "time_per_iteration": 2.4108901023864746 }, { "auxiliary_loss_clip": 0.01100113, "auxiliary_loss_mlp": 0.01074342, "balance_loss_clip": 1.02332067, "balance_loss_mlp": 1.02571213, "epoch": 0.12277168194799339, "flos": 19827733294080.0, "grad_norm": 2.04895345969584, "language_loss": 0.70323157, "learning_rate": 3.9103951633588694e-06, "loss": 0.72497612, "num_input_tokens_seen": 44164770, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.7421875, "step": 2042, "time_per_iteration": 2.396517515182495 }, { "auxiliary_loss_clip": 0.01102181, "auxiliary_loss_mlp": 0.01074845, "balance_loss_clip": 1.02532506, "balance_loss_mlp": 1.02647924, "epoch": 0.12283180520066135, "flos": 23219929503360.0, "grad_norm": 1.84579155283626, "language_loss": 0.8289305, "learning_rate": 3.910279858599409e-06, "loss": 0.85070086, "num_input_tokens_seen": 44184025, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.7578125, "step": 2043, "time_per_iteration": 2.4528863430023193 }, { "auxiliary_loss_clip": 0.01104281, "auxiliary_loss_mlp": 0.01072384, "balance_loss_clip": 1.01983666, "balance_loss_mlp": 1.02717531, "epoch": 0.12289192845332933, "flos": 18587478689280.0, "grad_norm": 1.5994070044506536, "language_loss": 0.81927812, "learning_rate": 3.910164481401946e-06, "loss": 0.84104484, "num_input_tokens_seen": 44202950, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.7734375, "step": 2044, "time_per_iteration": 2.430948257446289 }, { "auxiliary_loss_clip": 0.01100648, "auxiliary_loss_mlp": 0.01070459, "balance_loss_clip": 1.02391958, "balance_loss_mlp": 1.03021646, "epoch": 0.1229520517059973, "flos": 25768519597440.0, "grad_norm": 2.0812957377747, "language_loss": 0.79213905, "learning_rate": 3.910049031770853e-06, "loss": 0.8138501, "num_input_tokens_seen": 44221115, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 0.703125, "step": 2045, "time_per_iteration": 2.4407637119293213 }, { "auxiliary_loss_clip": 0.01104067, "auxiliary_loss_mlp": 0.01084356, "balance_loss_clip": 1.02770782, "balance_loss_mlp": 1.02807474, "epoch": 0.12301217495866526, "flos": 20886090382080.0, "grad_norm": 2.6035637219847394, "language_loss": 0.69962198, "learning_rate": 3.90993350971051e-06, "loss": 0.72150624, "num_input_tokens_seen": 44240575, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.76171875, "step": 2046, "time_per_iteration": 2.403661012649536 }, { "auxiliary_loss_clip": 0.01102554, "auxiliary_loss_mlp": 0.01072368, "balance_loss_clip": 1.02525663, "balance_loss_mlp": 1.02933848, "epoch": 0.12307229821133324, "flos": 22377824576640.0, "grad_norm": 3.3185583231086673, "language_loss": 0.74473155, "learning_rate": 3.909817915225297e-06, "loss": 0.7664808, "num_input_tokens_seen": 44257145, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 0.734375, "step": 2047, "time_per_iteration": 2.439704179763794 }, { "auxiliary_loss_clip": 0.01106395, "auxiliary_loss_mlp": 0.01066882, "balance_loss_clip": 1.01702857, "balance_loss_mlp": 1.03078938, "epoch": 0.1231324214640012, "flos": 23366285389440.0, "grad_norm": 1.6507888626790412, "language_loss": 0.79207921, "learning_rate": 3.909702248319597e-06, "loss": 0.81381202, "num_input_tokens_seen": 44278035, "router_z_loss_clip": 0.49804688, "router_z_loss_mlp": 0.7578125, "step": 2048, "time_per_iteration": 2.4446001052856445 }, { "auxiliary_loss_clip": 0.01101361, "auxiliary_loss_mlp": 0.01075816, "balance_loss_clip": 1.02856112, "balance_loss_mlp": 1.02796519, "epoch": 0.12319254471666917, "flos": 23766075901440.0, "grad_norm": 2.1154955643251507, "language_loss": 0.8764677, "learning_rate": 3.909586508997797e-06, "loss": 0.89823949, "num_input_tokens_seen": 44296980, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.734375, "step": 2049, "time_per_iteration": 2.439011573791504 }, { "auxiliary_loss_clip": 0.01104118, "auxiliary_loss_mlp": 0.01073127, "balance_loss_clip": 1.02220082, "balance_loss_mlp": 1.02865362, "epoch": 0.12325266796933713, "flos": 23549020778880.0, "grad_norm": 1.4742929209807694, "language_loss": 0.78183985, "learning_rate": 3.909470697264285e-06, "loss": 0.80361229, "num_input_tokens_seen": 44318005, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.75390625, "step": 2050, "time_per_iteration": 2.433151960372925 }, { "auxiliary_loss_clip": 0.01101641, "auxiliary_loss_mlp": 0.01072036, "balance_loss_clip": 1.02325547, "balance_loss_mlp": 1.02693617, "epoch": 0.12331279122200511, "flos": 24422896909440.0, "grad_norm": 2.0343430057138967, "language_loss": 0.83939666, "learning_rate": 3.909354813123452e-06, "loss": 0.8611334, "num_input_tokens_seen": 44335260, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.74609375, "step": 2051, "time_per_iteration": 2.451732635498047 }, { "auxiliary_loss_clip": 0.01101874, "auxiliary_loss_mlp": 0.01070648, "balance_loss_clip": 1.02451372, "balance_loss_mlp": 1.02879846, "epoch": 0.12337291447467308, "flos": 25483104299520.0, "grad_norm": 1.558967632639031, "language_loss": 0.81091416, "learning_rate": 3.909238856579693e-06, "loss": 0.8326394, "num_input_tokens_seen": 44355315, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.73046875, "step": 2052, "time_per_iteration": 2.447704315185547 }, { "auxiliary_loss_clip": 0.01101289, "auxiliary_loss_mlp": 0.0107629, "balance_loss_clip": 1.02541161, "balance_loss_mlp": 1.02740407, "epoch": 0.12343303772734104, "flos": 23548881133440.0, "grad_norm": 2.200160358247917, "language_loss": 0.76149035, "learning_rate": 3.909122827637406e-06, "loss": 0.78326607, "num_input_tokens_seen": 44373020, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.73828125, "step": 2053, "time_per_iteration": 2.4194579124450684 }, { "auxiliary_loss_clip": 0.01100909, "auxiliary_loss_mlp": 0.01068813, "balance_loss_clip": 1.01872158, "balance_loss_mlp": 1.02509356, "epoch": 0.12349316098000902, "flos": 47555299900800.0, "grad_norm": 1.4602745343085053, "language_loss": 0.7615273, "learning_rate": 3.909006726300991e-06, "loss": 0.78322452, "num_input_tokens_seen": 44397525, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.7578125, "step": 2054, "time_per_iteration": 2.62043833732605 }, { "auxiliary_loss_clip": 0.01094362, "auxiliary_loss_mlp": 0.010621, "balance_loss_clip": 1.01725316, "balance_loss_mlp": 1.02342665, "epoch": 0.12355328423267699, "flos": 25044804691200.0, "grad_norm": 1.654600054283012, "language_loss": 0.86399066, "learning_rate": 3.908890552574849e-06, "loss": 0.88555527, "num_input_tokens_seen": 44415890, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.7109375, "step": 2055, "time_per_iteration": 2.4431726932525635 }, { "auxiliary_loss_clip": 0.0110285, "auxiliary_loss_mlp": 0.01076264, "balance_loss_clip": 1.02548099, "balance_loss_mlp": 1.02694726, "epoch": 0.12361340748534495, "flos": 27707909644800.0, "grad_norm": 1.8772361462237437, "language_loss": 0.80354464, "learning_rate": 3.908774306463384e-06, "loss": 0.82533586, "num_input_tokens_seen": 44436625, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.7578125, "step": 2056, "time_per_iteration": 2.453562021255493 }, { "auxiliary_loss_clip": 0.01099322, "auxiliary_loss_mlp": 0.01071278, "balance_loss_clip": 1.01610827, "balance_loss_mlp": 1.02515936, "epoch": 0.12367353073801293, "flos": 26139401637120.0, "grad_norm": 2.133033260603137, "language_loss": 0.84793949, "learning_rate": 3.908657987971009e-06, "loss": 0.86964554, "num_input_tokens_seen": 44455265, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.7421875, "step": 2057, "time_per_iteration": 2.4447827339172363 }, { "auxiliary_loss_clip": 0.01102722, "auxiliary_loss_mlp": 0.01075825, "balance_loss_clip": 1.02399325, "balance_loss_mlp": 1.02526152, "epoch": 0.1237336539906809, "flos": 25154850896640.0, "grad_norm": 1.5515261900462065, "language_loss": 0.80114263, "learning_rate": 3.90854159710213e-06, "loss": 0.82292807, "num_input_tokens_seen": 44475815, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.7734375, "step": 2058, "time_per_iteration": 2.4445579051971436 }, { "auxiliary_loss_clip": 0.01102928, "auxiliary_loss_mlp": 0.01077654, "balance_loss_clip": 1.02369976, "balance_loss_mlp": 1.02615619, "epoch": 0.12379377724334886, "flos": 15303687851520.0, "grad_norm": 1.8414191723691204, "language_loss": 0.85353434, "learning_rate": 3.9084251338611624e-06, "loss": 0.8753401, "num_input_tokens_seen": 44494045, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.765625, "step": 2059, "time_per_iteration": 2.396862745285034 }, { "auxiliary_loss_clip": 0.01104923, "auxiliary_loss_mlp": 0.01083039, "balance_loss_clip": 1.02696323, "balance_loss_mlp": 1.02706861, "epoch": 0.12385390049601683, "flos": 21315871618560.0, "grad_norm": 2.5997271222125455, "language_loss": 0.8355009, "learning_rate": 3.908308598252523e-06, "loss": 0.85738051, "num_input_tokens_seen": 44509120, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.77734375, "step": 2060, "time_per_iteration": 2.385915756225586 }, { "auxiliary_loss_clip": 0.01100441, "auxiliary_loss_mlp": 0.01074287, "balance_loss_clip": 1.02340877, "balance_loss_mlp": 1.02541351, "epoch": 0.1239140237486848, "flos": 15115576112640.0, "grad_norm": 1.8742015410889903, "language_loss": 0.88349009, "learning_rate": 3.9081919902806306e-06, "loss": 0.90523732, "num_input_tokens_seen": 44525780, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.75, "step": 2061, "time_per_iteration": 2.385793685913086 }, { "auxiliary_loss_clip": 0.01097072, "auxiliary_loss_mlp": 0.01072707, "balance_loss_clip": 1.01970649, "balance_loss_mlp": 1.02607954, "epoch": 0.12397414700135277, "flos": 21975834648960.0, "grad_norm": 1.76867728680214, "language_loss": 0.86641526, "learning_rate": 3.908075309949906e-06, "loss": 0.88811308, "num_input_tokens_seen": 44543125, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.7109375, "step": 2062, "time_per_iteration": 2.377290725708008 }, { "auxiliary_loss_clip": 0.01097417, "auxiliary_loss_mlp": 0.0107754, "balance_loss_clip": 1.02618504, "balance_loss_mlp": 1.02531052, "epoch": 0.12403427025402074, "flos": 13400223459840.0, "grad_norm": 1.711058523759988, "language_loss": 0.80737555, "learning_rate": 3.907958557264774e-06, "loss": 0.82912517, "num_input_tokens_seen": 44560275, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.71875, "step": 2063, "time_per_iteration": 2.3697285652160645 }, { "auxiliary_loss_clip": 0.01101941, "auxiliary_loss_mlp": 0.01068925, "balance_loss_clip": 1.01811814, "balance_loss_mlp": 1.0266819, "epoch": 0.12409439350668872, "flos": 15303478383360.0, "grad_norm": 1.9201969062870314, "language_loss": 0.81433487, "learning_rate": 3.907841732229663e-06, "loss": 0.83604348, "num_input_tokens_seen": 44577640, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.75, "step": 2064, "time_per_iteration": 2.356920003890991 }, { "auxiliary_loss_clip": 0.0109826, "auxiliary_loss_mlp": 0.01076059, "balance_loss_clip": 1.02627718, "balance_loss_mlp": 1.02642202, "epoch": 0.12415451675935668, "flos": 25008215719680.0, "grad_norm": 2.3321194103369125, "language_loss": 0.93810534, "learning_rate": 3.907724834849002e-06, "loss": 0.95984852, "num_input_tokens_seen": 44594860, "router_z_loss_clip": 0.49804688, "router_z_loss_mlp": 0.71875, "step": 2065, "time_per_iteration": 2.4324593544006348 }, { "auxiliary_loss_clip": 0.0110207, "auxiliary_loss_mlp": 0.01070047, "balance_loss_clip": 1.01757085, "balance_loss_mlp": 1.02560151, "epoch": 0.12421464001202465, "flos": 23658543313920.0, "grad_norm": 1.5702033567887026, "language_loss": 0.83040226, "learning_rate": 3.907607865127225e-06, "loss": 0.8521235, "num_input_tokens_seen": 44614780, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.765625, "step": 2066, "time_per_iteration": 2.417567729949951 }, { "auxiliary_loss_clip": 0.01031145, "auxiliary_loss_mlp": 0.01019319, "balance_loss_clip": 1.01245248, "balance_loss_mlp": 1.01137173, "epoch": 0.12427476326469263, "flos": 65729440604160.0, "grad_norm": 0.9655542385750268, "language_loss": 0.63411349, "learning_rate": 3.907490823068766e-06, "loss": 0.65461808, "num_input_tokens_seen": 44671240, "router_z_loss_clip": 0.06884766, "router_z_loss_mlp": 0.19726562, "step": 2067, "time_per_iteration": 2.9953720569610596 }, { "auxiliary_loss_clip": 0.01099981, "auxiliary_loss_mlp": 0.01071899, "balance_loss_clip": 1.02459669, "balance_loss_mlp": 1.02608716, "epoch": 0.12433488651736059, "flos": 24534269746560.0, "grad_norm": 1.798849508054672, "language_loss": 0.95304084, "learning_rate": 3.907373708678063e-06, "loss": 0.97475964, "num_input_tokens_seen": 44691050, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.73828125, "step": 2068, "time_per_iteration": 2.410287857055664 }, { "auxiliary_loss_clip": 0.01100036, "auxiliary_loss_mlp": 0.01064338, "balance_loss_clip": 1.02228117, "balance_loss_mlp": 1.0278132, "epoch": 0.12439500977002856, "flos": 21030630877440.0, "grad_norm": 1.8968719470765192, "language_loss": 0.82948923, "learning_rate": 3.9072565219595596e-06, "loss": 0.85113299, "num_input_tokens_seen": 44709850, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.72265625, "step": 2069, "time_per_iteration": 2.4397928714752197 }, { "auxiliary_loss_clip": 0.01101485, "auxiliary_loss_mlp": 0.01075235, "balance_loss_clip": 1.02578688, "balance_loss_mlp": 1.02767277, "epoch": 0.12445513302269653, "flos": 26829495037440.0, "grad_norm": 1.712623774292522, "language_loss": 0.78713715, "learning_rate": 3.907139262917696e-06, "loss": 0.80890441, "num_input_tokens_seen": 44731475, "router_z_loss_clip": 0.49414062, "router_z_loss_mlp": 0.73828125, "step": 2070, "time_per_iteration": 2.4469263553619385 }, { "auxiliary_loss_clip": 0.01100706, "auxiliary_loss_mlp": 0.01076139, "balance_loss_clip": 1.02425909, "balance_loss_mlp": 1.0269618, "epoch": 0.1245152562753645, "flos": 18367944860160.0, "grad_norm": 2.495067701395955, "language_loss": 0.83349824, "learning_rate": 3.907021931556922e-06, "loss": 0.85526669, "num_input_tokens_seen": 44749685, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.73828125, "step": 2071, "time_per_iteration": 3.8380000591278076 }, { "auxiliary_loss_clip": 0.01096935, "auxiliary_loss_mlp": 0.01081845, "balance_loss_clip": 1.03323174, "balance_loss_mlp": 1.0257802, "epoch": 0.12457537952803246, "flos": 33106634179200.0, "grad_norm": 1.74452817456631, "language_loss": 0.80133927, "learning_rate": 3.906904527881684e-06, "loss": 0.82312709, "num_input_tokens_seen": 44772165, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.7109375, "step": 2072, "time_per_iteration": 2.5089497566223145 }, { "auxiliary_loss_clip": 0.01098201, "auxiliary_loss_mlp": 0.01071654, "balance_loss_clip": 1.02461421, "balance_loss_mlp": 1.0265168, "epoch": 0.12463550278070043, "flos": 22269209736960.0, "grad_norm": 1.8763199360852083, "language_loss": 0.77721858, "learning_rate": 3.9067870518964355e-06, "loss": 0.79891711, "num_input_tokens_seen": 44790580, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 0.71875, "step": 2073, "time_per_iteration": 3.8386478424072266 }, { "auxiliary_loss_clip": 0.01095482, "auxiliary_loss_mlp": 0.01070278, "balance_loss_clip": 1.02516913, "balance_loss_mlp": 1.02402532, "epoch": 0.12469562603336841, "flos": 14678288933760.0, "grad_norm": 2.0039301053183087, "language_loss": 0.91479802, "learning_rate": 3.906669503605631e-06, "loss": 0.93645567, "num_input_tokens_seen": 44806730, "router_z_loss_clip": 0.45117188, "router_z_loss_mlp": 0.71484375, "step": 2074, "time_per_iteration": 3.768892526626587 }, { "auxiliary_loss_clip": 0.01100886, "auxiliary_loss_mlp": 0.01072452, "balance_loss_clip": 1.02126396, "balance_loss_mlp": 1.02616143, "epoch": 0.12475574928603637, "flos": 24643617724800.0, "grad_norm": 3.2533629162476467, "language_loss": 0.86380816, "learning_rate": 3.906551883013728e-06, "loss": 0.88554156, "num_input_tokens_seen": 44825550, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.75, "step": 2075, "time_per_iteration": 3.8505442142486572 }, { "auxiliary_loss_clip": 0.01099339, "auxiliary_loss_mlp": 0.01079183, "balance_loss_clip": 1.03049755, "balance_loss_mlp": 1.02490568, "epoch": 0.12481587253870434, "flos": 21761886637440.0, "grad_norm": 1.9032532322010431, "language_loss": 0.74654835, "learning_rate": 3.9064341901251865e-06, "loss": 0.76833355, "num_input_tokens_seen": 44844155, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.74609375, "step": 2076, "time_per_iteration": 2.401890516281128 }, { "auxiliary_loss_clip": 0.0109499, "auxiliary_loss_mlp": 0.01069416, "balance_loss_clip": 1.02476048, "balance_loss_mlp": 1.02540016, "epoch": 0.12487599579137232, "flos": 21431503641600.0, "grad_norm": 1.9187105930682862, "language_loss": 0.779791, "learning_rate": 3.906316424944469e-06, "loss": 0.80143499, "num_input_tokens_seen": 44863780, "router_z_loss_clip": 0.44726562, "router_z_loss_mlp": 0.6953125, "step": 2077, "time_per_iteration": 2.3947150707244873 }, { "auxiliary_loss_clip": 0.01097937, "auxiliary_loss_mlp": 0.01077504, "balance_loss_clip": 1.02784157, "balance_loss_mlp": 1.02377677, "epoch": 0.12493611904404028, "flos": 16106690188800.0, "grad_norm": 2.002542384901918, "language_loss": 0.84979808, "learning_rate": 3.906198587476043e-06, "loss": 0.87155259, "num_input_tokens_seen": 44881480, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.7421875, "step": 2078, "time_per_iteration": 2.3877267837524414 }, { "auxiliary_loss_clip": 0.01102501, "auxiliary_loss_mlp": 0.0106665, "balance_loss_clip": 1.02044487, "balance_loss_mlp": 1.02737248, "epoch": 0.12499624229670825, "flos": 21579186159360.0, "grad_norm": 1.6538365073071768, "language_loss": 0.771119, "learning_rate": 3.906080677724374e-06, "loss": 0.79281044, "num_input_tokens_seen": 44900390, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.75, "step": 2079, "time_per_iteration": 2.485616683959961 }, { "auxiliary_loss_clip": 0.01101932, "auxiliary_loss_mlp": 0.0107151, "balance_loss_clip": 1.02444601, "balance_loss_mlp": 1.02710176, "epoch": 0.1250563655493762, "flos": 25697960006400.0, "grad_norm": 2.917230522348832, "language_loss": 0.86151254, "learning_rate": 3.905962695693935e-06, "loss": 0.8832469, "num_input_tokens_seen": 44920375, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.75, "step": 2080, "time_per_iteration": 2.4205875396728516 }, { "auxiliary_loss_clip": 0.01098276, "auxiliary_loss_mlp": 0.01067822, "balance_loss_clip": 1.02087712, "balance_loss_mlp": 1.02595782, "epoch": 0.12511648880204418, "flos": 16908575362560.0, "grad_norm": 2.169931869684115, "language_loss": 0.87034643, "learning_rate": 3.9058446413892e-06, "loss": 0.89200735, "num_input_tokens_seen": 44938415, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 0.72265625, "step": 2081, "time_per_iteration": 2.3674209117889404 }, { "auxiliary_loss_clip": 0.01097382, "auxiliary_loss_mlp": 0.0106888, "balance_loss_clip": 1.02257872, "balance_loss_mlp": 1.02497804, "epoch": 0.12517661205471217, "flos": 17566513534080.0, "grad_norm": 2.038809405738405, "language_loss": 0.78053987, "learning_rate": 3.905726514814646e-06, "loss": 0.80220252, "num_input_tokens_seen": 44957135, "router_z_loss_clip": 0.46289062, "router_z_loss_mlp": 0.72265625, "step": 2082, "time_per_iteration": 2.364903450012207 }, { "auxiliary_loss_clip": 0.01109731, "auxiliary_loss_mlp": 0.01073246, "balance_loss_clip": 1.01891017, "balance_loss_mlp": 1.02954555, "epoch": 0.12523673530738014, "flos": 16032883841280.0, "grad_norm": 2.5628209157990898, "language_loss": 0.82240182, "learning_rate": 3.9056083159747495e-06, "loss": 0.84423161, "num_input_tokens_seen": 44974480, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.80078125, "step": 2083, "time_per_iteration": 2.401794910430908 }, { "auxiliary_loss_clip": 0.01100905, "auxiliary_loss_mlp": 0.01065215, "balance_loss_clip": 1.01457548, "balance_loss_mlp": 1.02602339, "epoch": 0.1252968585600481, "flos": 18806733227520.0, "grad_norm": 2.259297114636559, "language_loss": 0.91939288, "learning_rate": 3.9054900448739966e-06, "loss": 0.94105411, "num_input_tokens_seen": 44990310, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.75, "step": 2084, "time_per_iteration": 2.3839058876037598 }, { "auxiliary_loss_clip": 0.01102933, "auxiliary_loss_mlp": 0.01070343, "balance_loss_clip": 1.02330339, "balance_loss_mlp": 1.02845871, "epoch": 0.12535698181271607, "flos": 27270343175040.0, "grad_norm": 1.9295354144979746, "language_loss": 0.8259905, "learning_rate": 3.905371701516869e-06, "loss": 0.84772325, "num_input_tokens_seen": 45010720, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 0.74609375, "step": 2085, "time_per_iteration": 2.4521946907043457 }, { "auxiliary_loss_clip": 0.01098976, "auxiliary_loss_mlp": 0.0107354, "balance_loss_clip": 1.02452087, "balance_loss_mlp": 1.02685952, "epoch": 0.12541710506538403, "flos": 22053027398400.0, "grad_norm": 1.7945688715036465, "language_loss": 0.90490174, "learning_rate": 3.905253285907856e-06, "loss": 0.92662692, "num_input_tokens_seen": 45030360, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 0.72265625, "step": 2086, "time_per_iteration": 2.4161536693573 }, { "auxiliary_loss_clip": 0.01095198, "auxiliary_loss_mlp": 0.01067396, "balance_loss_clip": 1.02352715, "balance_loss_mlp": 1.02654982, "epoch": 0.125477228318052, "flos": 12602388003840.0, "grad_norm": 3.906863675830979, "language_loss": 0.88276929, "learning_rate": 3.905134798051447e-06, "loss": 0.90439522, "num_input_tokens_seen": 45045085, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.6875, "step": 2087, "time_per_iteration": 2.374539375305176 }, { "auxiliary_loss_clip": 0.01100024, "auxiliary_loss_mlp": 0.01069471, "balance_loss_clip": 1.0197612, "balance_loss_mlp": 1.02631855, "epoch": 0.12553735157071996, "flos": 23877413827200.0, "grad_norm": 1.8864615784360474, "language_loss": 0.75776803, "learning_rate": 3.905016237952136e-06, "loss": 0.77946299, "num_input_tokens_seen": 45065145, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.73828125, "step": 2088, "time_per_iteration": 2.4770774841308594 }, { "auxiliary_loss_clip": 0.01028493, "auxiliary_loss_mlp": 0.01009165, "balance_loss_clip": 1.00215578, "balance_loss_mlp": 1.00877571, "epoch": 0.12559747482338796, "flos": 69917482321920.0, "grad_norm": 0.7543294700646492, "language_loss": 0.61870003, "learning_rate": 3.904897605614418e-06, "loss": 0.63907659, "num_input_tokens_seen": 45126230, "router_z_loss_clip": 0.0703125, "router_z_loss_mlp": 0.19726562, "step": 2089, "time_per_iteration": 3.0809853076934814 }, { "auxiliary_loss_clip": 0.01096597, "auxiliary_loss_mlp": 0.01066002, "balance_loss_clip": 1.01793671, "balance_loss_mlp": 1.02545571, "epoch": 0.12565759807605592, "flos": 24278426236800.0, "grad_norm": 2.480972761368767, "language_loss": 0.80346167, "learning_rate": 3.904778901042793e-06, "loss": 0.82508767, "num_input_tokens_seen": 45145545, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.7109375, "step": 2090, "time_per_iteration": 2.4240455627441406 }, { "auxiliary_loss_clip": 0.01026659, "auxiliary_loss_mlp": 0.01007729, "balance_loss_clip": 1.00033796, "balance_loss_mlp": 1.0068574, "epoch": 0.12571772132872389, "flos": 56448375016320.0, "grad_norm": 0.7659677819536875, "language_loss": 0.59696865, "learning_rate": 3.90466012424176e-06, "loss": 0.61731255, "num_input_tokens_seen": 45206845, "router_z_loss_clip": 0.07373047, "router_z_loss_mlp": 0.19824219, "step": 2091, "time_per_iteration": 2.939903974533081 }, { "auxiliary_loss_clip": 0.01101397, "auxiliary_loss_mlp": 0.01070255, "balance_loss_clip": 1.0233345, "balance_loss_mlp": 1.02838159, "epoch": 0.12577784458139185, "flos": 41244225050880.0, "grad_norm": 2.294099353437529, "language_loss": 0.67198324, "learning_rate": 3.904541275215825e-06, "loss": 0.69369972, "num_input_tokens_seen": 45228495, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 0.73046875, "step": 2092, "time_per_iteration": 2.5664381980895996 }, { "auxiliary_loss_clip": 0.0110318, "auxiliary_loss_mlp": 0.01075065, "balance_loss_clip": 1.02556968, "balance_loss_mlp": 1.02850103, "epoch": 0.12583796783405982, "flos": 19754485528320.0, "grad_norm": 2.0937845431267648, "language_loss": 0.83230537, "learning_rate": 3.904422353969493e-06, "loss": 0.85408783, "num_input_tokens_seen": 45245720, "router_z_loss_clip": 0.49414062, "router_z_loss_mlp": 0.74609375, "step": 2093, "time_per_iteration": 2.409550189971924 }, { "auxiliary_loss_clip": 0.01100343, "auxiliary_loss_mlp": 0.0107023, "balance_loss_clip": 1.02252293, "balance_loss_mlp": 1.02747798, "epoch": 0.12589809108672778, "flos": 22600989187200.0, "grad_norm": 4.010100292576967, "language_loss": 0.77535617, "learning_rate": 3.904303360507276e-06, "loss": 0.79706192, "num_input_tokens_seen": 45265650, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.7265625, "step": 2094, "time_per_iteration": 2.412766695022583 }, { "auxiliary_loss_clip": 0.01095888, "auxiliary_loss_mlp": 0.01065553, "balance_loss_clip": 1.02061129, "balance_loss_mlp": 1.02553749, "epoch": 0.12595821433939577, "flos": 45221111665920.0, "grad_norm": 1.6178884577198958, "language_loss": 0.78845894, "learning_rate": 3.9041842948336835e-06, "loss": 0.81007338, "num_input_tokens_seen": 45287790, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.703125, "step": 2095, "time_per_iteration": 2.613774061203003 }, { "auxiliary_loss_clip": 0.0110077, "auxiliary_loss_mlp": 0.0106413, "balance_loss_clip": 1.01709008, "balance_loss_mlp": 1.02578056, "epoch": 0.12601833759206374, "flos": 14318927642880.0, "grad_norm": 2.298427310107454, "language_loss": 0.86728024, "learning_rate": 3.904065156953232e-06, "loss": 0.88892925, "num_input_tokens_seen": 45305720, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 0.75, "step": 2096, "time_per_iteration": 2.3652350902557373 }, { "auxiliary_loss_clip": 0.01099921, "auxiliary_loss_mlp": 0.010802, "balance_loss_clip": 1.02924979, "balance_loss_mlp": 1.02683473, "epoch": 0.1260784608447317, "flos": 21287172614400.0, "grad_norm": 2.041857833389885, "language_loss": 0.77012503, "learning_rate": 3.903945946870439e-06, "loss": 0.79192626, "num_input_tokens_seen": 45325290, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.73046875, "step": 2097, "time_per_iteration": 2.422727346420288 }, { "auxiliary_loss_clip": 0.01099006, "auxiliary_loss_mlp": 0.01074696, "balance_loss_clip": 1.0268451, "balance_loss_mlp": 1.02711439, "epoch": 0.12613858409739967, "flos": 26250076247040.0, "grad_norm": 1.8806725712123253, "language_loss": 0.88966548, "learning_rate": 3.9038266645898246e-06, "loss": 0.91140246, "num_input_tokens_seen": 45344465, "router_z_loss_clip": 0.47851562, "router_z_loss_mlp": 0.71875, "step": 2098, "time_per_iteration": 2.4385225772857666 }, { "auxiliary_loss_clip": 0.01103848, "auxiliary_loss_mlp": 0.01077606, "balance_loss_clip": 1.02293706, "balance_loss_mlp": 1.0277276, "epoch": 0.12619870735006763, "flos": 21578906868480.0, "grad_norm": 1.80885434019419, "language_loss": 0.71467018, "learning_rate": 3.903707310115912e-06, "loss": 0.73648471, "num_input_tokens_seen": 45362465, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.76171875, "step": 2099, "time_per_iteration": 2.4187145233154297 }, { "auxiliary_loss_clip": 0.01100844, "auxiliary_loss_mlp": 0.01076761, "balance_loss_clip": 1.02621675, "balance_loss_mlp": 1.0267241, "epoch": 0.1262588306027356, "flos": 23365936275840.0, "grad_norm": 2.0459697087326916, "language_loss": 0.83882058, "learning_rate": 3.903587883453228e-06, "loss": 0.86059666, "num_input_tokens_seen": 45382700, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.7421875, "step": 2100, "time_per_iteration": 2.4331295490264893 }, { "auxiliary_loss_clip": 0.01103684, "auxiliary_loss_mlp": 0.01076921, "balance_loss_clip": 1.02549434, "balance_loss_mlp": 1.02695155, "epoch": 0.12631895385540357, "flos": 23948113063680.0, "grad_norm": 1.937344077331392, "language_loss": 0.83285069, "learning_rate": 3.903468384606302e-06, "loss": 0.8546567, "num_input_tokens_seen": 45401005, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.765625, "step": 2101, "time_per_iteration": 2.4277701377868652 }, { "auxiliary_loss_clip": 0.01026852, "auxiliary_loss_mlp": 0.01007164, "balance_loss_clip": 1.00058401, "balance_loss_mlp": 1.00643969, "epoch": 0.12637907710807156, "flos": 70278868471680.0, "grad_norm": 0.709846291096711, "language_loss": 0.57129288, "learning_rate": 3.903348813579662e-06, "loss": 0.59163308, "num_input_tokens_seen": 45466555, "router_z_loss_clip": 0.06591797, "router_z_loss_mlp": 0.20410156, "step": 2102, "time_per_iteration": 3.0667166709899902 }, { "auxiliary_loss_clip": 0.01103495, "auxiliary_loss_mlp": 0.0107421, "balance_loss_clip": 1.02550149, "balance_loss_mlp": 1.02853513, "epoch": 0.12643920036073952, "flos": 18914126169600.0, "grad_norm": 2.213121025184689, "language_loss": 0.95580673, "learning_rate": 3.903229170377845e-06, "loss": 0.97758371, "num_input_tokens_seen": 45485165, "router_z_loss_clip": 0.48632812, "router_z_loss_mlp": 0.75, "step": 2103, "time_per_iteration": 2.3907690048217773 }, { "auxiliary_loss_clip": 0.0109308, "auxiliary_loss_mlp": 0.01059057, "balance_loss_clip": 1.01652336, "balance_loss_mlp": 1.0243901, "epoch": 0.1264993236134075, "flos": 27781227233280.0, "grad_norm": 1.6982247439133973, "language_loss": 0.79580879, "learning_rate": 3.903109455005387e-06, "loss": 0.81733012, "num_input_tokens_seen": 45504630, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.6875, "step": 2104, "time_per_iteration": 2.449740409851074 }, { "auxiliary_loss_clip": 0.01103013, "auxiliary_loss_mlp": 0.01075021, "balance_loss_clip": 1.0258112, "balance_loss_mlp": 1.02943814, "epoch": 0.12655944686607545, "flos": 24753524284800.0, "grad_norm": 1.7608372495154618, "language_loss": 0.84060472, "learning_rate": 3.902989667466828e-06, "loss": 0.86238503, "num_input_tokens_seen": 45524885, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 0.734375, "step": 2105, "time_per_iteration": 2.453557014465332 }, { "auxiliary_loss_clip": 0.01102965, "auxiliary_loss_mlp": 0.01076518, "balance_loss_clip": 1.02640271, "balance_loss_mlp": 1.02763617, "epoch": 0.12661957011874342, "flos": 24131930705280.0, "grad_norm": 1.7849529316768553, "language_loss": 0.84455574, "learning_rate": 3.90286980776671e-06, "loss": 0.86635053, "num_input_tokens_seen": 45545000, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.75390625, "step": 2106, "time_per_iteration": 2.4228100776672363 }, { "auxiliary_loss_clip": 0.01098242, "auxiliary_loss_mlp": 0.01076602, "balance_loss_clip": 1.02813125, "balance_loss_mlp": 1.02697301, "epoch": 0.12667969337141138, "flos": 24568519656960.0, "grad_norm": 1.7737080614408403, "language_loss": 0.75314415, "learning_rate": 3.902749875909578e-06, "loss": 0.77489257, "num_input_tokens_seen": 45564210, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.7109375, "step": 2107, "time_per_iteration": 2.4417243003845215 }, { "auxiliary_loss_clip": 0.01098884, "auxiliary_loss_mlp": 0.01066212, "balance_loss_clip": 1.02010202, "balance_loss_mlp": 1.02894425, "epoch": 0.12673981662407935, "flos": 22960699591680.0, "grad_norm": 1.978240378070436, "language_loss": 0.8083272, "learning_rate": 3.90262987189998e-06, "loss": 0.82997811, "num_input_tokens_seen": 45583030, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.69921875, "step": 2108, "time_per_iteration": 2.41288423538208 }, { "auxiliary_loss_clip": 0.01097878, "auxiliary_loss_mlp": 0.01057533, "balance_loss_clip": 1.01263857, "balance_loss_mlp": 1.02517676, "epoch": 0.12679993987674734, "flos": 17273906496000.0, "grad_norm": 1.7686333410104331, "language_loss": 0.78784013, "learning_rate": 3.902509795742467e-06, "loss": 0.80939424, "num_input_tokens_seen": 45602265, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.7265625, "step": 2109, "time_per_iteration": 2.397465944290161 }, { "auxiliary_loss_clip": 0.0109429, "auxiliary_loss_mlp": 0.01066094, "balance_loss_clip": 1.02205801, "balance_loss_mlp": 1.02429247, "epoch": 0.1268600631294153, "flos": 17274115964160.0, "grad_norm": 1.6423418252906739, "language_loss": 0.85298687, "learning_rate": 3.902389647441592e-06, "loss": 0.87459069, "num_input_tokens_seen": 45620595, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.69921875, "step": 2110, "time_per_iteration": 2.4018049240112305 }, { "auxiliary_loss_clip": 0.01097519, "auxiliary_loss_mlp": 0.01068986, "balance_loss_clip": 1.02223182, "balance_loss_mlp": 1.02688062, "epoch": 0.12692018638208327, "flos": 24059904837120.0, "grad_norm": 2.0053995889375873, "language_loss": 0.8123129, "learning_rate": 3.90226942700191e-06, "loss": 0.83397794, "num_input_tokens_seen": 45641140, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 0.7109375, "step": 2111, "time_per_iteration": 3.853485584259033 }, { "auxiliary_loss_clip": 0.01105653, "auxiliary_loss_mlp": 0.01080175, "balance_loss_clip": 1.02750802, "balance_loss_mlp": 1.02796817, "epoch": 0.12698030963475124, "flos": 31830558652800.0, "grad_norm": 1.9923017631396902, "language_loss": 0.79077971, "learning_rate": 3.902149134427982e-06, "loss": 0.81263793, "num_input_tokens_seen": 45662315, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.77734375, "step": 2112, "time_per_iteration": 2.465670347213745 }, { "auxiliary_loss_clip": 0.01097859, "auxiliary_loss_mlp": 0.0106057, "balance_loss_clip": 1.01934743, "balance_loss_mlp": 1.02654886, "epoch": 0.1270404328874192, "flos": 25186691923200.0, "grad_norm": 1.7615828064080397, "language_loss": 0.86774397, "learning_rate": 3.902028769724367e-06, "loss": 0.8893283, "num_input_tokens_seen": 45680335, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.7109375, "step": 2113, "time_per_iteration": 3.850452423095703 }, { "auxiliary_loss_clip": 0.01095139, "auxiliary_loss_mlp": 0.01067165, "balance_loss_clip": 1.02083993, "balance_loss_mlp": 1.02468586, "epoch": 0.12710055614008717, "flos": 15996434515200.0, "grad_norm": 1.8993008356207108, "language_loss": 0.75400722, "learning_rate": 3.9019083328956315e-06, "loss": 0.77563024, "num_input_tokens_seen": 45696240, "router_z_loss_clip": 0.46289062, "router_z_loss_mlp": 0.70703125, "step": 2114, "time_per_iteration": 5.231007814407349 }, { "auxiliary_loss_clip": 0.01099663, "auxiliary_loss_mlp": 0.01067381, "balance_loss_clip": 1.02241492, "balance_loss_mlp": 1.02786088, "epoch": 0.12716067939275516, "flos": 15084747515520.0, "grad_norm": 3.732960096500092, "language_loss": 0.85291243, "learning_rate": 3.901787823946341e-06, "loss": 0.87458289, "num_input_tokens_seen": 45713695, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.71875, "step": 2115, "time_per_iteration": 2.377838134765625 }, { "auxiliary_loss_clip": 0.01101968, "auxiliary_loss_mlp": 0.01074416, "balance_loss_clip": 1.02563524, "balance_loss_mlp": 1.02876568, "epoch": 0.12722080264542313, "flos": 28365463791360.0, "grad_norm": 1.8239713692432853, "language_loss": 0.8873347, "learning_rate": 3.901667242881065e-06, "loss": 0.90909851, "num_input_tokens_seen": 45736655, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.734375, "step": 2116, "time_per_iteration": 2.472651958465576 }, { "auxiliary_loss_clip": 0.01094542, "auxiliary_loss_mlp": 0.01063708, "balance_loss_clip": 1.01998186, "balance_loss_mlp": 1.02571571, "epoch": 0.1272809258980911, "flos": 32378520441600.0, "grad_norm": 1.8977255185679933, "language_loss": 0.72030169, "learning_rate": 3.9015465897043775e-06, "loss": 0.74188417, "num_input_tokens_seen": 45758195, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.6875, "step": 2117, "time_per_iteration": 2.506897449493408 }, { "auxiliary_loss_clip": 0.01097908, "auxiliary_loss_mlp": 0.01064841, "balance_loss_clip": 1.02087629, "balance_loss_mlp": 1.02690625, "epoch": 0.12734104915075906, "flos": 16033477334400.0, "grad_norm": 2.1123390325401674, "language_loss": 0.88026881, "learning_rate": 3.901425864420852e-06, "loss": 0.9018963, "num_input_tokens_seen": 45774280, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 0.7109375, "step": 2118, "time_per_iteration": 2.3871867656707764 }, { "auxiliary_loss_clip": 0.01094619, "auxiliary_loss_mlp": 0.01072641, "balance_loss_clip": 1.02707911, "balance_loss_mlp": 1.02455306, "epoch": 0.12740117240342702, "flos": 18259330020480.0, "grad_norm": 1.777095038175777, "language_loss": 0.89981472, "learning_rate": 3.901305067035068e-06, "loss": 0.92148739, "num_input_tokens_seen": 45792760, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.703125, "step": 2119, "time_per_iteration": 2.392143964767456 }, { "auxiliary_loss_clip": 0.01098346, "auxiliary_loss_mlp": 0.01069956, "balance_loss_clip": 1.02572966, "balance_loss_mlp": 1.02686262, "epoch": 0.127461295656095, "flos": 12121215379200.0, "grad_norm": 2.338813219628736, "language_loss": 0.89659369, "learning_rate": 3.901184197551605e-06, "loss": 0.91827679, "num_input_tokens_seen": 45804300, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.71484375, "step": 2120, "time_per_iteration": 2.338932514190674 }, { "auxiliary_loss_clip": 0.01099091, "auxiliary_loss_mlp": 0.01064823, "balance_loss_clip": 1.01883233, "balance_loss_mlp": 1.02729821, "epoch": 0.12752141890876295, "flos": 23147973457920.0, "grad_norm": 1.7869999541678403, "language_loss": 0.7796129, "learning_rate": 3.901063255975046e-06, "loss": 0.80125207, "num_input_tokens_seen": 45823780, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.71875, "step": 2121, "time_per_iteration": 2.4427952766418457 }, { "auxiliary_loss_clip": 0.01096737, "auxiliary_loss_mlp": 0.01072248, "balance_loss_clip": 1.02339578, "balance_loss_mlp": 1.02585292, "epoch": 0.12758154216143094, "flos": 21614937258240.0, "grad_norm": 2.2278393657309, "language_loss": 0.84214371, "learning_rate": 3.900942242309978e-06, "loss": 0.86383355, "num_input_tokens_seen": 45840495, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.7109375, "step": 2122, "time_per_iteration": 2.390829086303711 }, { "auxiliary_loss_clip": 0.01096579, "auxiliary_loss_mlp": 0.01065645, "balance_loss_clip": 1.02137053, "balance_loss_mlp": 1.02663898, "epoch": 0.1276416654140989, "flos": 15923954799360.0, "grad_norm": 1.8299033224575254, "language_loss": 0.8157838, "learning_rate": 3.90082115656099e-06, "loss": 0.83740604, "num_input_tokens_seen": 45857735, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.69921875, "step": 2123, "time_per_iteration": 2.38773775100708 }, { "auxiliary_loss_clip": 0.011, "auxiliary_loss_mlp": 0.01070653, "balance_loss_clip": 1.02695119, "balance_loss_mlp": 1.02825522, "epoch": 0.12770178866676687, "flos": 22381595003520.0, "grad_norm": 1.626708526647291, "language_loss": 0.81140411, "learning_rate": 3.900699998732673e-06, "loss": 0.83311069, "num_input_tokens_seen": 45876485, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.71875, "step": 2124, "time_per_iteration": 2.4497182369232178 }, { "auxiliary_loss_clip": 0.01098105, "auxiliary_loss_mlp": 0.01075566, "balance_loss_clip": 1.02921736, "balance_loss_mlp": 1.02514982, "epoch": 0.12776191191943484, "flos": 21651421495680.0, "grad_norm": 1.920609281260963, "language_loss": 0.76993394, "learning_rate": 3.900578768829623e-06, "loss": 0.79167068, "num_input_tokens_seen": 45894645, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 0.7265625, "step": 2125, "time_per_iteration": 2.389789581298828 }, { "auxiliary_loss_clip": 0.01098346, "auxiliary_loss_mlp": 0.01065025, "balance_loss_clip": 1.0198921, "balance_loss_mlp": 1.0265789, "epoch": 0.1278220351721028, "flos": 25734479155200.0, "grad_norm": 2.1006449814837818, "language_loss": 0.7968539, "learning_rate": 3.900457466856434e-06, "loss": 0.81848764, "num_input_tokens_seen": 45913755, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.71875, "step": 2126, "time_per_iteration": 2.4288575649261475 }, { "auxiliary_loss_clip": 0.01097925, "auxiliary_loss_mlp": 0.0107224, "balance_loss_clip": 1.02713084, "balance_loss_mlp": 1.02741504, "epoch": 0.12788215842477077, "flos": 41241676521600.0, "grad_norm": 1.471003300986322, "language_loss": 0.71418953, "learning_rate": 3.9003360928177085e-06, "loss": 0.7358911, "num_input_tokens_seen": 45936095, "router_z_loss_clip": 0.45117188, "router_z_loss_mlp": 0.70703125, "step": 2127, "time_per_iteration": 2.591374158859253 }, { "auxiliary_loss_clip": 0.01035868, "auxiliary_loss_mlp": 0.01010503, "balance_loss_clip": 1.00416136, "balance_loss_mlp": 1.01463032, "epoch": 0.12794228167743876, "flos": 70873822817280.0, "grad_norm": 0.852645922483622, "language_loss": 0.62999916, "learning_rate": 3.900214646718047e-06, "loss": 0.65046287, "num_input_tokens_seen": 46004655, "router_z_loss_clip": 0.06347656, "router_z_loss_mlp": 0.21289062, "step": 2128, "time_per_iteration": 3.0717384815216064 }, { "auxiliary_loss_clip": 0.01098301, "auxiliary_loss_mlp": 0.01067938, "balance_loss_clip": 1.02177978, "balance_loss_mlp": 1.02620006, "epoch": 0.12800240493010673, "flos": 16288797173760.0, "grad_norm": 2.755545132945294, "language_loss": 0.8059653, "learning_rate": 3.900093128562056e-06, "loss": 0.82762766, "num_input_tokens_seen": 46023610, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.71875, "step": 2129, "time_per_iteration": 2.473583459854126 }, { "auxiliary_loss_clip": 0.01103813, "auxiliary_loss_mlp": 0.01071043, "balance_loss_clip": 1.01854289, "balance_loss_mlp": 1.02782166, "epoch": 0.1280625281827747, "flos": 20630491251840.0, "grad_norm": 2.4974660448896677, "language_loss": 0.8254205, "learning_rate": 3.899971538354343e-06, "loss": 0.84716904, "num_input_tokens_seen": 46041725, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.7578125, "step": 2130, "time_per_iteration": 2.421613931655884 }, { "auxiliary_loss_clip": 0.01097476, "auxiliary_loss_mlp": 0.01071713, "balance_loss_clip": 1.02278936, "balance_loss_mlp": 1.02525949, "epoch": 0.12812265143544266, "flos": 22637124311040.0, "grad_norm": 1.7984073623287196, "language_loss": 0.72785008, "learning_rate": 3.899849876099518e-06, "loss": 0.749542, "num_input_tokens_seen": 46061095, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 0.72265625, "step": 2131, "time_per_iteration": 2.433311700820923 }, { "auxiliary_loss_clip": 0.01099444, "auxiliary_loss_mlp": 0.01066751, "balance_loss_clip": 1.0191623, "balance_loss_mlp": 1.02744675, "epoch": 0.12818277468811062, "flos": 34713267258240.0, "grad_norm": 2.6532069950252097, "language_loss": 0.75361168, "learning_rate": 3.899728141802197e-06, "loss": 0.77527362, "num_input_tokens_seen": 46082670, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.71875, "step": 2132, "time_per_iteration": 2.512260675430298 }, { "auxiliary_loss_clip": 0.01097022, "auxiliary_loss_mlp": 0.01066395, "balance_loss_clip": 1.01923525, "balance_loss_mlp": 1.02742743, "epoch": 0.1282428979407786, "flos": 23111000461440.0, "grad_norm": 2.130297013143634, "language_loss": 0.8366586, "learning_rate": 3.8996063354669935e-06, "loss": 0.85829276, "num_input_tokens_seen": 46102410, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.6953125, "step": 2133, "time_per_iteration": 2.459123134613037 }, { "auxiliary_loss_clip": 0.0110779, "auxiliary_loss_mlp": 0.01078903, "balance_loss_clip": 1.02707124, "balance_loss_mlp": 1.02944136, "epoch": 0.12830302119344655, "flos": 20885461977600.0, "grad_norm": 2.5477247497376783, "language_loss": 0.82169557, "learning_rate": 3.899484457098528e-06, "loss": 0.84356248, "num_input_tokens_seen": 46121145, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.78515625, "step": 2134, "time_per_iteration": 2.4053640365600586 }, { "auxiliary_loss_clip": 0.0110127, "auxiliary_loss_mlp": 0.01077543, "balance_loss_clip": 1.02759445, "balance_loss_mlp": 1.02866387, "epoch": 0.12836314444611455, "flos": 21396695149440.0, "grad_norm": 1.7665007290235353, "language_loss": 0.85576051, "learning_rate": 3.899362506701421e-06, "loss": 0.87754869, "num_input_tokens_seen": 46140740, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.7265625, "step": 2135, "time_per_iteration": 2.4040963649749756 }, { "auxiliary_loss_clip": 0.0109949, "auxiliary_loss_mlp": 0.01073129, "balance_loss_clip": 1.02484977, "balance_loss_mlp": 1.02775538, "epoch": 0.1284232676987825, "flos": 13661617875840.0, "grad_norm": 2.178735363994609, "language_loss": 0.80063188, "learning_rate": 3.899240484280298e-06, "loss": 0.82235807, "num_input_tokens_seen": 46156805, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.71875, "step": 2136, "time_per_iteration": 2.4106063842773438 }, { "auxiliary_loss_clip": 0.01029612, "auxiliary_loss_mlp": 0.01024085, "balance_loss_clip": 1.01564538, "balance_loss_mlp": 1.00863743, "epoch": 0.12848339095145048, "flos": 59991709968000.0, "grad_norm": 0.902831602181296, "language_loss": 0.59206522, "learning_rate": 3.899118389839785e-06, "loss": 0.61260223, "num_input_tokens_seen": 46222085, "router_z_loss_clip": 0.08447266, "router_z_loss_mlp": 0.20996094, "step": 2137, "time_per_iteration": 3.139983892440796 }, { "auxiliary_loss_clip": 0.01102136, "auxiliary_loss_mlp": 0.01073839, "balance_loss_clip": 1.02513027, "balance_loss_mlp": 1.02843142, "epoch": 0.12854351420411844, "flos": 13880523300480.0, "grad_norm": 2.503153097920799, "language_loss": 0.85267538, "learning_rate": 3.898996223384512e-06, "loss": 0.87443519, "num_input_tokens_seen": 46239970, "router_z_loss_clip": 0.48632812, "router_z_loss_mlp": 0.73828125, "step": 2138, "time_per_iteration": 2.374898910522461 }, { "auxiliary_loss_clip": 0.01106013, "auxiliary_loss_mlp": 0.0108006, "balance_loss_clip": 1.03034997, "balance_loss_mlp": 1.02957308, "epoch": 0.1286036374567864, "flos": 22636845020160.0, "grad_norm": 3.723604201752595, "language_loss": 0.81704056, "learning_rate": 3.898873984919113e-06, "loss": 0.83890128, "num_input_tokens_seen": 46257740, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.765625, "step": 2139, "time_per_iteration": 2.4395737648010254 }, { "auxiliary_loss_clip": 0.01104622, "auxiliary_loss_mlp": 0.01087183, "balance_loss_clip": 1.03661466, "balance_loss_mlp": 1.02929151, "epoch": 0.12866376070945437, "flos": 16323884956800.0, "grad_norm": 1.7946937930524334, "language_loss": 0.86959648, "learning_rate": 3.8987516744482215e-06, "loss": 0.89151454, "num_input_tokens_seen": 46275445, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.75390625, "step": 2140, "time_per_iteration": 2.397221326828003 }, { "auxiliary_loss_clip": 0.01098999, "auxiliary_loss_mlp": 0.0107144, "balance_loss_clip": 1.02797592, "balance_loss_mlp": 1.02807331, "epoch": 0.12872388396212234, "flos": 11873750595840.0, "grad_norm": 2.005436340285127, "language_loss": 0.8735441, "learning_rate": 3.898629291976476e-06, "loss": 0.89524841, "num_input_tokens_seen": 46291710, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 0.7109375, "step": 2141, "time_per_iteration": 2.399759292602539 }, { "auxiliary_loss_clip": 0.01105523, "auxiliary_loss_mlp": 0.01082706, "balance_loss_clip": 1.03282905, "balance_loss_mlp": 1.02897584, "epoch": 0.12878400721479033, "flos": 28365428880000.0, "grad_norm": 1.9538901436228513, "language_loss": 0.70832264, "learning_rate": 3.898506837508518e-06, "loss": 0.73020494, "num_input_tokens_seen": 46311335, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.765625, "step": 2142, "time_per_iteration": 2.4588537216186523 }, { "auxiliary_loss_clip": 0.01106471, "auxiliary_loss_mlp": 0.01081504, "balance_loss_clip": 1.03145981, "balance_loss_mlp": 1.02978516, "epoch": 0.1288441304674583, "flos": 25884430911360.0, "grad_norm": 2.273505944389879, "language_loss": 0.86100835, "learning_rate": 3.89838431104899e-06, "loss": 0.88288808, "num_input_tokens_seen": 46330985, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.765625, "step": 2143, "time_per_iteration": 2.4309492111206055 }, { "auxiliary_loss_clip": 0.01106111, "auxiliary_loss_mlp": 0.01085694, "balance_loss_clip": 1.03476739, "balance_loss_mlp": 1.03029275, "epoch": 0.12890425372012626, "flos": 20812737882240.0, "grad_norm": 1.7238903002284929, "language_loss": 0.83865738, "learning_rate": 3.898261712602539e-06, "loss": 0.86057538, "num_input_tokens_seen": 46351295, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.7578125, "step": 2144, "time_per_iteration": 2.4402389526367188 }, { "auxiliary_loss_clip": 0.01102883, "auxiliary_loss_mlp": 0.01090767, "balance_loss_clip": 1.03879213, "balance_loss_mlp": 1.02760077, "epoch": 0.12896437697279423, "flos": 22564749329280.0, "grad_norm": 2.0313800768567836, "language_loss": 0.81231457, "learning_rate": 3.898139042173813e-06, "loss": 0.83425105, "num_input_tokens_seen": 46368600, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.75390625, "step": 2145, "time_per_iteration": 2.3943943977355957 }, { "auxiliary_loss_clip": 0.01104524, "auxiliary_loss_mlp": 0.010879, "balance_loss_clip": 1.0311799, "balance_loss_mlp": 1.02722168, "epoch": 0.1290245002254622, "flos": 17492811920640.0, "grad_norm": 1.9097048490765307, "language_loss": 0.84887475, "learning_rate": 3.898016299767465e-06, "loss": 0.87079901, "num_input_tokens_seen": 46387370, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.7734375, "step": 2146, "time_per_iteration": 2.4047343730926514 }, { "auxiliary_loss_clip": 0.01100938, "auxiliary_loss_mlp": 0.01089772, "balance_loss_clip": 1.03672397, "balance_loss_mlp": 1.02816844, "epoch": 0.12908462347813016, "flos": 36314593810560.0, "grad_norm": 3.559626873553877, "language_loss": 0.73351383, "learning_rate": 3.897893485388149e-06, "loss": 0.75542092, "num_input_tokens_seen": 46409570, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.7265625, "step": 2147, "time_per_iteration": 2.5256307125091553 }, { "auxiliary_loss_clip": 0.01102351, "auxiliary_loss_mlp": 0.01072922, "balance_loss_clip": 1.02333093, "balance_loss_mlp": 1.02784872, "epoch": 0.12914474673079815, "flos": 22527601776000.0, "grad_norm": 2.133264318744652, "language_loss": 0.73575222, "learning_rate": 3.897770599040521e-06, "loss": 0.75750494, "num_input_tokens_seen": 46429320, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.74609375, "step": 2148, "time_per_iteration": 2.42390775680542 }, { "auxiliary_loss_clip": 0.0110007, "auxiliary_loss_mlp": 0.0106908, "balance_loss_clip": 1.02153933, "balance_loss_mlp": 1.02845812, "epoch": 0.12920486998346611, "flos": 21470780787840.0, "grad_norm": 1.6048687876951306, "language_loss": 0.8061527, "learning_rate": 3.897647640729242e-06, "loss": 0.82784426, "num_input_tokens_seen": 46450155, "router_z_loss_clip": 0.47460938, "router_z_loss_mlp": 0.71484375, "step": 2149, "time_per_iteration": 2.4270668029785156 }, { "auxiliary_loss_clip": 0.01106387, "auxiliary_loss_mlp": 0.01078877, "balance_loss_clip": 1.02482748, "balance_loss_mlp": 1.03030229, "epoch": 0.12926499323613408, "flos": 27307316171520.0, "grad_norm": 2.037643000542415, "language_loss": 0.78389686, "learning_rate": 3.897524610458975e-06, "loss": 0.80574954, "num_input_tokens_seen": 46470280, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.76171875, "step": 2150, "time_per_iteration": 3.9015908241271973 }, { "auxiliary_loss_clip": 0.01100793, "auxiliary_loss_mlp": 0.0108682, "balance_loss_clip": 1.03467822, "balance_loss_mlp": 1.02674079, "epoch": 0.12932511648880204, "flos": 22090035306240.0, "grad_norm": 2.1873923320628923, "language_loss": 0.73836011, "learning_rate": 3.8974015082343835e-06, "loss": 0.76023626, "num_input_tokens_seen": 46487605, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.7421875, "step": 2151, "time_per_iteration": 2.441594123840332 }, { "auxiliary_loss_clip": 0.01101921, "auxiliary_loss_mlp": 0.01077632, "balance_loss_clip": 1.0254184, "balance_loss_mlp": 1.02878642, "epoch": 0.12938523974147, "flos": 20301749089920.0, "grad_norm": 1.983095765708907, "language_loss": 0.86479634, "learning_rate": 3.897278334060137e-06, "loss": 0.88659191, "num_input_tokens_seen": 46505100, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.734375, "step": 2152, "time_per_iteration": 2.384993553161621 }, { "auxiliary_loss_clip": 0.0110231, "auxiliary_loss_mlp": 0.01084994, "balance_loss_clip": 1.03247035, "balance_loss_mlp": 1.02654552, "epoch": 0.12944536299413797, "flos": 19498956220800.0, "grad_norm": 1.668044403525724, "language_loss": 0.80230546, "learning_rate": 3.897155087940906e-06, "loss": 0.82417846, "num_input_tokens_seen": 46524020, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.7578125, "step": 2153, "time_per_iteration": 5.246873617172241 }, { "auxiliary_loss_clip": 0.01102653, "auxiliary_loss_mlp": 0.01079191, "balance_loss_clip": 1.02421165, "balance_loss_mlp": 1.0266118, "epoch": 0.12950548624680594, "flos": 27706722658560.0, "grad_norm": 1.712649903689431, "language_loss": 0.823228, "learning_rate": 3.897031769881364e-06, "loss": 0.84504646, "num_input_tokens_seen": 46544640, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.76171875, "step": 2154, "time_per_iteration": 3.856257677078247 }, { "auxiliary_loss_clip": 0.01104203, "auxiliary_loss_mlp": 0.01077627, "balance_loss_clip": 1.02534175, "balance_loss_mlp": 1.02897525, "epoch": 0.12956560949947393, "flos": 17564802877440.0, "grad_norm": 1.92020694841077, "language_loss": 0.85300857, "learning_rate": 3.896908379886188e-06, "loss": 0.87482691, "num_input_tokens_seen": 46561395, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.75390625, "step": 2155, "time_per_iteration": 2.434171676635742 }, { "auxiliary_loss_clip": 0.01107911, "auxiliary_loss_mlp": 0.01080043, "balance_loss_clip": 1.02442026, "balance_loss_mlp": 1.03027892, "epoch": 0.1296257327521419, "flos": 20739664673280.0, "grad_norm": 2.4936167385423955, "language_loss": 0.7763865, "learning_rate": 3.896784917960055e-06, "loss": 0.79826605, "num_input_tokens_seen": 46579395, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.77734375, "step": 2156, "time_per_iteration": 2.4074456691741943 }, { "auxiliary_loss_clip": 0.01098624, "auxiliary_loss_mlp": 0.01072313, "balance_loss_clip": 1.02288842, "balance_loss_mlp": 1.02656102, "epoch": 0.12968585600480986, "flos": 16394898395520.0, "grad_norm": 1.9300415881305968, "language_loss": 0.88281727, "learning_rate": 3.896661384107648e-06, "loss": 0.90452671, "num_input_tokens_seen": 46597090, "router_z_loss_clip": 0.49414062, "router_z_loss_mlp": 0.71875, "step": 2157, "time_per_iteration": 2.372838258743286 }, { "auxiliary_loss_clip": 0.01105243, "auxiliary_loss_mlp": 0.0108774, "balance_loss_clip": 1.02770567, "balance_loss_mlp": 1.02739358, "epoch": 0.12974597925747783, "flos": 28328281326720.0, "grad_norm": 2.482528184085084, "language_loss": 0.83481377, "learning_rate": 3.896537778333651e-06, "loss": 0.85674363, "num_input_tokens_seen": 46617355, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.78125, "step": 2158, "time_per_iteration": 2.482666492462158 }, { "auxiliary_loss_clip": 0.01105827, "auxiliary_loss_mlp": 0.01090516, "balance_loss_clip": 1.03412986, "balance_loss_mlp": 1.02813244, "epoch": 0.1298061025101458, "flos": 9682357288320.0, "grad_norm": 2.577298082869026, "language_loss": 0.77145863, "learning_rate": 3.896414100642752e-06, "loss": 0.79342204, "num_input_tokens_seen": 46633130, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.77734375, "step": 2159, "time_per_iteration": 2.361314058303833 }, { "auxiliary_loss_clip": 0.01100475, "auxiliary_loss_mlp": 0.010783, "balance_loss_clip": 1.02310634, "balance_loss_mlp": 1.02536941, "epoch": 0.12986622576281376, "flos": 27708293669760.0, "grad_norm": 2.138159615476322, "language_loss": 0.84459567, "learning_rate": 3.89629035103964e-06, "loss": 0.86638349, "num_input_tokens_seen": 46650575, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.75, "step": 2160, "time_per_iteration": 2.45601749420166 }, { "auxiliary_loss_clip": 0.01098214, "auxiliary_loss_mlp": 0.01079133, "balance_loss_clip": 1.02646589, "balance_loss_mlp": 1.02615285, "epoch": 0.12992634901548175, "flos": 18801845637120.0, "grad_norm": 2.0795491496572556, "language_loss": 0.83479977, "learning_rate": 3.896166529529008e-06, "loss": 0.85657322, "num_input_tokens_seen": 46668780, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.71875, "step": 2161, "time_per_iteration": 2.37758469581604 }, { "auxiliary_loss_clip": 0.01105233, "auxiliary_loss_mlp": 0.01076326, "balance_loss_clip": 1.01886749, "balance_loss_mlp": 1.02896333, "epoch": 0.12998647226814972, "flos": 29126430984960.0, "grad_norm": 1.9394543476831771, "language_loss": 0.84819478, "learning_rate": 3.896042636115551e-06, "loss": 0.87001038, "num_input_tokens_seen": 46687550, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.765625, "step": 2162, "time_per_iteration": 2.462890863418579 }, { "auxiliary_loss_clip": 0.01105679, "auxiliary_loss_mlp": 0.01069682, "balance_loss_clip": 1.01484585, "balance_loss_mlp": 1.02604949, "epoch": 0.13004659552081768, "flos": 19572657834240.0, "grad_norm": 4.3031566290862875, "language_loss": 0.74970555, "learning_rate": 3.895918670803968e-06, "loss": 0.77145922, "num_input_tokens_seen": 46706730, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.796875, "step": 2163, "time_per_iteration": 2.3761167526245117 }, { "auxiliary_loss_clip": 0.01105477, "auxiliary_loss_mlp": 0.01093623, "balance_loss_clip": 1.03041792, "balance_loss_mlp": 1.02618325, "epoch": 0.13010671877348565, "flos": 22489651261440.0, "grad_norm": 1.9680069335514843, "language_loss": 0.83958429, "learning_rate": 3.895794633598958e-06, "loss": 0.86157537, "num_input_tokens_seen": 46724250, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.79296875, "step": 2164, "time_per_iteration": 2.3981714248657227 }, { "auxiliary_loss_clip": 0.01103049, "auxiliary_loss_mlp": 0.01077544, "balance_loss_clip": 1.02561641, "balance_loss_mlp": 1.02615595, "epoch": 0.1301668420261536, "flos": 23877099624960.0, "grad_norm": 1.9581646387317124, "language_loss": 0.74384749, "learning_rate": 3.8956705245052256e-06, "loss": 0.76565349, "num_input_tokens_seen": 46744105, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.76953125, "step": 2165, "time_per_iteration": 2.3984694480895996 }, { "auxiliary_loss_clip": 0.01106716, "auxiliary_loss_mlp": 0.01080568, "balance_loss_clip": 1.02177358, "balance_loss_mlp": 1.02712548, "epoch": 0.13022696527882158, "flos": 23148916064640.0, "grad_norm": 1.757578930794073, "language_loss": 0.76846951, "learning_rate": 3.8955463435274765e-06, "loss": 0.79034233, "num_input_tokens_seen": 46764250, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.796875, "step": 2166, "time_per_iteration": 2.4149773120880127 }, { "auxiliary_loss_clip": 0.01103432, "auxiliary_loss_mlp": 0.01079853, "balance_loss_clip": 1.02539849, "balance_loss_mlp": 1.02670789, "epoch": 0.13028708853148954, "flos": 26907281280000.0, "grad_norm": 1.5480991107658018, "language_loss": 0.84858656, "learning_rate": 3.895422090670421e-06, "loss": 0.87041938, "num_input_tokens_seen": 46786865, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.765625, "step": 2167, "time_per_iteration": 2.4516680240631104 }, { "auxiliary_loss_clip": 0.01103725, "auxiliary_loss_mlp": 0.01088183, "balance_loss_clip": 1.02953172, "balance_loss_mlp": 1.02715278, "epoch": 0.13034721178415754, "flos": 21250409086080.0, "grad_norm": 1.593509813888936, "language_loss": 0.8556115, "learning_rate": 3.89529776593877e-06, "loss": 0.87753057, "num_input_tokens_seen": 46807030, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.765625, "step": 2168, "time_per_iteration": 2.411665678024292 }, { "auxiliary_loss_clip": 0.01103219, "auxiliary_loss_mlp": 0.01088589, "balance_loss_clip": 1.03234661, "balance_loss_mlp": 1.02483761, "epoch": 0.1304073350368255, "flos": 18766338917760.0, "grad_norm": 2.5277730479883607, "language_loss": 0.81934512, "learning_rate": 3.8951733693372375e-06, "loss": 0.84126323, "num_input_tokens_seen": 46826280, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.78125, "step": 2169, "time_per_iteration": 2.4268789291381836 }, { "auxiliary_loss_clip": 0.0110484, "auxiliary_loss_mlp": 0.01073994, "balance_loss_clip": 1.01942003, "balance_loss_mlp": 1.02718449, "epoch": 0.13046745828949347, "flos": 28363438932480.0, "grad_norm": 1.9945692413261011, "language_loss": 0.70927185, "learning_rate": 3.8950489008705406e-06, "loss": 0.73106027, "num_input_tokens_seen": 46846505, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.77734375, "step": 2170, "time_per_iteration": 2.444035291671753 }, { "auxiliary_loss_clip": 0.0110173, "auxiliary_loss_mlp": 0.0108378, "balance_loss_clip": 1.03106558, "balance_loss_mlp": 1.02672482, "epoch": 0.13052758154216143, "flos": 29603798271360.0, "grad_norm": 1.6597732886289804, "language_loss": 0.68830538, "learning_rate": 3.8949243605434e-06, "loss": 0.71016043, "num_input_tokens_seen": 46867380, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.75, "step": 2171, "time_per_iteration": 2.462592124938965 }, { "auxiliary_loss_clip": 0.01103603, "auxiliary_loss_mlp": 0.01083711, "balance_loss_clip": 1.02839756, "balance_loss_mlp": 1.02755022, "epoch": 0.1305877047948294, "flos": 19389852622080.0, "grad_norm": 1.797303949574266, "language_loss": 0.74787533, "learning_rate": 3.894799748360537e-06, "loss": 0.76974845, "num_input_tokens_seen": 46886810, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.76171875, "step": 2172, "time_per_iteration": 2.4035677909851074 }, { "auxiliary_loss_clip": 0.01096918, "auxiliary_loss_mlp": 0.01066742, "balance_loss_clip": 1.01805747, "balance_loss_mlp": 1.02596641, "epoch": 0.13064782804749736, "flos": 16872579884160.0, "grad_norm": 1.6062312006489599, "language_loss": 0.77107453, "learning_rate": 3.894675064326678e-06, "loss": 0.79271108, "num_input_tokens_seen": 46905620, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.7109375, "step": 2173, "time_per_iteration": 2.383920431137085 }, { "auxiliary_loss_clip": 0.01104374, "auxiliary_loss_mlp": 0.01084473, "balance_loss_clip": 1.02367592, "balance_loss_mlp": 1.02684903, "epoch": 0.13070795130016533, "flos": 24497925154560.0, "grad_norm": 2.148086685359615, "language_loss": 0.73761266, "learning_rate": 3.894550308446551e-06, "loss": 0.7595011, "num_input_tokens_seen": 46925120, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.77734375, "step": 2174, "time_per_iteration": 2.4106554985046387 }, { "auxiliary_loss_clip": 0.01026133, "auxiliary_loss_mlp": 0.01023231, "balance_loss_clip": 1.01588738, "balance_loss_mlp": 1.00703239, "epoch": 0.13076807455283332, "flos": 71051042211840.0, "grad_norm": 1.3271502743681594, "language_loss": 0.59081382, "learning_rate": 3.894425480724886e-06, "loss": 0.6113075, "num_input_tokens_seen": 46988195, "router_z_loss_clip": 0.07324219, "router_z_loss_mlp": 0.19140625, "step": 2175, "time_per_iteration": 3.186306953430176 }, { "auxiliary_loss_clip": 0.01101312, "auxiliary_loss_mlp": 0.01073086, "balance_loss_clip": 1.01982284, "balance_loss_mlp": 1.02684224, "epoch": 0.13082819780550128, "flos": 20263519284480.0, "grad_norm": 2.0233089495626286, "language_loss": 0.82500839, "learning_rate": 3.894300581166417e-06, "loss": 0.84675241, "num_input_tokens_seen": 47004720, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 0.74609375, "step": 2176, "time_per_iteration": 2.392143964767456 }, { "auxiliary_loss_clip": 0.01103056, "auxiliary_loss_mlp": 0.01079865, "balance_loss_clip": 1.0249573, "balance_loss_mlp": 1.02847028, "epoch": 0.13088832105816925, "flos": 34202034086400.0, "grad_norm": 1.8140135000813216, "language_loss": 0.76003402, "learning_rate": 3.894175609775881e-06, "loss": 0.78186327, "num_input_tokens_seen": 47024255, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.74609375, "step": 2177, "time_per_iteration": 2.49130916595459 }, { "auxiliary_loss_clip": 0.01098207, "auxiliary_loss_mlp": 0.01067108, "balance_loss_clip": 1.0163244, "balance_loss_mlp": 1.02541482, "epoch": 0.13094844431083721, "flos": 17893998887040.0, "grad_norm": 1.720955826433338, "language_loss": 0.83740467, "learning_rate": 3.894050566558015e-06, "loss": 0.85905778, "num_input_tokens_seen": 47042465, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.7265625, "step": 2178, "time_per_iteration": 2.4031450748443604 }, { "auxiliary_loss_clip": 0.01101447, "auxiliary_loss_mlp": 0.01073703, "balance_loss_clip": 1.02313411, "balance_loss_mlp": 1.02576351, "epoch": 0.13100856756350518, "flos": 17310355822080.0, "grad_norm": 1.9384048704673258, "language_loss": 0.76314843, "learning_rate": 3.893925451517562e-06, "loss": 0.78489995, "num_input_tokens_seen": 47060370, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.7578125, "step": 2179, "time_per_iteration": 2.3660812377929688 }, { "auxiliary_loss_clip": 0.01098221, "auxiliary_loss_mlp": 0.01072138, "balance_loss_clip": 1.02371478, "balance_loss_mlp": 1.02575374, "epoch": 0.13106869081617314, "flos": 22199453107200.0, "grad_norm": 3.884586288962993, "language_loss": 0.86351562, "learning_rate": 3.893800264659266e-06, "loss": 0.88521922, "num_input_tokens_seen": 47081415, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.7265625, "step": 2180, "time_per_iteration": 2.4578819274902344 }, { "auxiliary_loss_clip": 0.0110077, "auxiliary_loss_mlp": 0.01077793, "balance_loss_clip": 1.02624714, "balance_loss_mlp": 1.02796233, "epoch": 0.13112881406884114, "flos": 21762026282880.0, "grad_norm": 4.332106160440889, "language_loss": 0.91461575, "learning_rate": 3.8936750059878746e-06, "loss": 0.93640137, "num_input_tokens_seen": 47099860, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.7265625, "step": 2181, "time_per_iteration": 2.451793909072876 }, { "auxiliary_loss_clip": 0.01101838, "auxiliary_loss_mlp": 0.01077466, "balance_loss_clip": 1.02448988, "balance_loss_mlp": 1.02778435, "epoch": 0.1311889373215091, "flos": 23329975708800.0, "grad_norm": 2.05400055651519, "language_loss": 0.70575041, "learning_rate": 3.893549675508137e-06, "loss": 0.72754347, "num_input_tokens_seen": 47118540, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.7421875, "step": 2182, "time_per_iteration": 2.4025819301605225 }, { "auxiliary_loss_clip": 0.01105745, "auxiliary_loss_mlp": 0.01071517, "balance_loss_clip": 1.01856422, "balance_loss_mlp": 1.02907538, "epoch": 0.13124906057417707, "flos": 21466381956480.0, "grad_norm": 1.7763946611080843, "language_loss": 0.80281943, "learning_rate": 3.893424273224806e-06, "loss": 0.82459199, "num_input_tokens_seen": 47136710, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.765625, "step": 2183, "time_per_iteration": 2.4590823650360107 }, { "auxiliary_loss_clip": 0.01099295, "auxiliary_loss_mlp": 0.01075092, "balance_loss_clip": 1.02547693, "balance_loss_mlp": 1.02656484, "epoch": 0.13130918382684503, "flos": 23254284147840.0, "grad_norm": 1.551847241482306, "language_loss": 0.86526358, "learning_rate": 3.893298799142636e-06, "loss": 0.88700747, "num_input_tokens_seen": 47157155, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.7265625, "step": 2184, "time_per_iteration": 2.417428731918335 }, { "auxiliary_loss_clip": 0.0110309, "auxiliary_loss_mlp": 0.01075245, "balance_loss_clip": 1.0226016, "balance_loss_mlp": 1.02787352, "epoch": 0.131369307079513, "flos": 20849222119680.0, "grad_norm": 1.9253980271838569, "language_loss": 0.83943623, "learning_rate": 3.893173253266387e-06, "loss": 0.86121953, "num_input_tokens_seen": 47176820, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.75390625, "step": 2185, "time_per_iteration": 2.4021122455596924 }, { "auxiliary_loss_clip": 0.01102327, "auxiliary_loss_mlp": 0.01083768, "balance_loss_clip": 1.0331037, "balance_loss_mlp": 1.026546, "epoch": 0.13142943033218096, "flos": 17857375004160.0, "grad_norm": 2.0324346868977017, "language_loss": 0.7581706, "learning_rate": 3.893047635600818e-06, "loss": 0.78003156, "num_input_tokens_seen": 47195855, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.7578125, "step": 2186, "time_per_iteration": 2.3760697841644287 }, { "auxiliary_loss_clip": 0.01098425, "auxiliary_loss_mlp": 0.0107473, "balance_loss_clip": 1.02270675, "balance_loss_mlp": 1.02595532, "epoch": 0.13148955358484893, "flos": 20994984512640.0, "grad_norm": 2.1810867133613843, "language_loss": 0.82840842, "learning_rate": 3.892921946150693e-06, "loss": 0.85013998, "num_input_tokens_seen": 47214535, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.72265625, "step": 2187, "time_per_iteration": 2.390354633331299 }, { "auxiliary_loss_clip": 0.01026024, "auxiliary_loss_mlp": 0.01025985, "balance_loss_clip": 1.01845086, "balance_loss_mlp": 1.00727439, "epoch": 0.13154967683751692, "flos": 70169206291200.0, "grad_norm": 0.8526040588713295, "language_loss": 0.59173119, "learning_rate": 3.892796184920778e-06, "loss": 0.61225128, "num_input_tokens_seen": 47270300, "router_z_loss_clip": 0.07519531, "router_z_loss_mlp": 0.1875, "step": 2188, "time_per_iteration": 3.058379650115967 }, { "auxiliary_loss_clip": 0.01098561, "auxiliary_loss_mlp": 0.01072022, "balance_loss_clip": 1.02233529, "balance_loss_mlp": 1.02687764, "epoch": 0.1316098000901849, "flos": 20375101589760.0, "grad_norm": 2.1819189748443946, "language_loss": 0.75803828, "learning_rate": 3.892670351915842e-06, "loss": 0.77974403, "num_input_tokens_seen": 47290720, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.71875, "step": 2189, "time_per_iteration": 2.373556137084961 }, { "auxiliary_loss_clip": 0.01096778, "auxiliary_loss_mlp": 0.01076097, "balance_loss_clip": 1.02605259, "balance_loss_mlp": 1.02622294, "epoch": 0.13166992334285285, "flos": 23220034237440.0, "grad_norm": 1.8170873819931301, "language_loss": 0.73954928, "learning_rate": 3.892544447140657e-06, "loss": 0.76127803, "num_input_tokens_seen": 47311820, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.703125, "step": 2190, "time_per_iteration": 3.866952419281006 }, { "auxiliary_loss_clip": 0.01101772, "auxiliary_loss_mlp": 0.01077132, "balance_loss_clip": 1.02997327, "balance_loss_mlp": 1.02823138, "epoch": 0.13173004659552082, "flos": 23329836063360.0, "grad_norm": 1.7045394228997708, "language_loss": 0.76233184, "learning_rate": 3.892418470599996e-06, "loss": 0.78412086, "num_input_tokens_seen": 47331605, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 0.734375, "step": 2191, "time_per_iteration": 2.405099630355835 }, { "auxiliary_loss_clip": 0.01100217, "auxiliary_loss_mlp": 0.01079395, "balance_loss_clip": 1.03080559, "balance_loss_mlp": 1.0269568, "epoch": 0.13179016984818878, "flos": 21250443997440.0, "grad_norm": 1.8611697006616352, "language_loss": 0.80638373, "learning_rate": 3.892292422298637e-06, "loss": 0.82817996, "num_input_tokens_seen": 47350455, "router_z_loss_clip": 0.48632812, "router_z_loss_mlp": 0.734375, "step": 2192, "time_per_iteration": 3.939382553100586 }, { "auxiliary_loss_clip": 0.01100907, "auxiliary_loss_mlp": 0.01076242, "balance_loss_clip": 1.0283196, "balance_loss_mlp": 1.02775931, "epoch": 0.13185029310085675, "flos": 17777913016320.0, "grad_norm": 1.862093829668189, "language_loss": 0.86937112, "learning_rate": 3.892166302241361e-06, "loss": 0.89114261, "num_input_tokens_seen": 47368225, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.73046875, "step": 2193, "time_per_iteration": 3.8463470935821533 }, { "auxiliary_loss_clip": 0.01025657, "auxiliary_loss_mlp": 0.01015601, "balance_loss_clip": 1.00787592, "balance_loss_mlp": 1.00635481, "epoch": 0.1319104163535247, "flos": 69848319185280.0, "grad_norm": 0.7642247581550348, "language_loss": 0.54157043, "learning_rate": 3.8920401104329475e-06, "loss": 0.56198299, "num_input_tokens_seen": 47427125, "router_z_loss_clip": 0.07714844, "router_z_loss_mlp": 0.19335938, "step": 2194, "time_per_iteration": 2.992501974105835 }, { "auxiliary_loss_clip": 0.01100398, "auxiliary_loss_mlp": 0.01077809, "balance_loss_clip": 1.02771759, "balance_loss_mlp": 1.02622354, "epoch": 0.1319705396061927, "flos": 25191893715840.0, "grad_norm": 1.9800781632050022, "language_loss": 0.73985565, "learning_rate": 3.891913846878185e-06, "loss": 0.76163775, "num_input_tokens_seen": 47450275, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.7421875, "step": 2195, "time_per_iteration": 2.475264549255371 }, { "auxiliary_loss_clip": 0.01103576, "auxiliary_loss_mlp": 0.01073103, "balance_loss_clip": 1.02146173, "balance_loss_mlp": 1.02726948, "epoch": 0.13203066285886067, "flos": 20739420293760.0, "grad_norm": 1.5897912860088113, "language_loss": 0.79767597, "learning_rate": 3.891787511581859e-06, "loss": 0.81944275, "num_input_tokens_seen": 47469155, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.76171875, "step": 2196, "time_per_iteration": 2.398730516433716 }, { "auxiliary_loss_clip": 0.01101923, "auxiliary_loss_mlp": 0.01068002, "balance_loss_clip": 1.01838756, "balance_loss_mlp": 1.02693677, "epoch": 0.13209078611152864, "flos": 22053306689280.0, "grad_norm": 2.087994542189287, "language_loss": 0.77902067, "learning_rate": 3.89166110454876e-06, "loss": 0.80071986, "num_input_tokens_seen": 47488405, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.75, "step": 2197, "time_per_iteration": 2.390437602996826 }, { "auxiliary_loss_clip": 0.01102531, "auxiliary_loss_mlp": 0.0107752, "balance_loss_clip": 1.02416193, "balance_loss_mlp": 1.0267837, "epoch": 0.1321509093641966, "flos": 16284153962880.0, "grad_norm": 2.241333516925777, "language_loss": 0.8223244, "learning_rate": 3.891534625783685e-06, "loss": 0.84412491, "num_input_tokens_seen": 47505650, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.7578125, "step": 2198, "time_per_iteration": 2.380690336227417 }, { "auxiliary_loss_clip": 0.01102473, "auxiliary_loss_mlp": 0.01072234, "balance_loss_clip": 1.0224762, "balance_loss_mlp": 1.02763891, "epoch": 0.13221103261686457, "flos": 16982067507840.0, "grad_norm": 2.280050670343438, "language_loss": 0.85262024, "learning_rate": 3.891408075291425e-06, "loss": 0.8743673, "num_input_tokens_seen": 47521540, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.75, "step": 2199, "time_per_iteration": 2.3616855144500732 }, { "auxiliary_loss_clip": 0.01103017, "auxiliary_loss_mlp": 0.01075078, "balance_loss_clip": 1.02469993, "balance_loss_mlp": 1.02802777, "epoch": 0.13227115586953253, "flos": 34232373924480.0, "grad_norm": 1.8971044008347986, "language_loss": 0.70888042, "learning_rate": 3.8912814530767826e-06, "loss": 0.73066139, "num_input_tokens_seen": 47543625, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.75, "step": 2200, "time_per_iteration": 2.5392343997955322 }, { "auxiliary_loss_clip": 0.01099209, "auxiliary_loss_mlp": 0.01072467, "balance_loss_clip": 1.02340031, "balance_loss_mlp": 1.02655399, "epoch": 0.13233127912220052, "flos": 20703599372160.0, "grad_norm": 1.6772671422531187, "language_loss": 0.8695336, "learning_rate": 3.891154759144557e-06, "loss": 0.89125037, "num_input_tokens_seen": 47563740, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 0.7265625, "step": 2201, "time_per_iteration": 2.399847984313965 }, { "auxiliary_loss_clip": 0.01105383, "auxiliary_loss_mlp": 0.01074412, "balance_loss_clip": 1.02207947, "balance_loss_mlp": 1.02817249, "epoch": 0.1323914023748685, "flos": 25804061228160.0, "grad_norm": 1.7626929257480006, "language_loss": 0.89090186, "learning_rate": 3.891027993499554e-06, "loss": 0.91269982, "num_input_tokens_seen": 47582655, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.7734375, "step": 2202, "time_per_iteration": 2.4358041286468506 }, { "auxiliary_loss_clip": 0.01101246, "auxiliary_loss_mlp": 0.01072023, "balance_loss_clip": 1.02102518, "balance_loss_mlp": 1.02812624, "epoch": 0.13245152562753645, "flos": 21250478908800.0, "grad_norm": 2.162875163457208, "language_loss": 0.74754173, "learning_rate": 3.89090115614658e-06, "loss": 0.76927441, "num_input_tokens_seen": 47600875, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.73046875, "step": 2203, "time_per_iteration": 2.426265239715576 }, { "auxiliary_loss_clip": 0.01103502, "auxiliary_loss_mlp": 0.01072753, "balance_loss_clip": 1.02335298, "balance_loss_mlp": 1.0271883, "epoch": 0.13251164888020442, "flos": 26609856474240.0, "grad_norm": 22.091661392112414, "language_loss": 0.76117986, "learning_rate": 3.890774247090444e-06, "loss": 0.78294241, "num_input_tokens_seen": 47619250, "router_z_loss_clip": 0.49414062, "router_z_loss_mlp": 0.76171875, "step": 2204, "time_per_iteration": 2.428243398666382 }, { "auxiliary_loss_clip": 0.01102814, "auxiliary_loss_mlp": 0.01074333, "balance_loss_clip": 1.02471769, "balance_loss_mlp": 1.02933431, "epoch": 0.13257177213287238, "flos": 29825217313920.0, "grad_norm": 1.863317482381587, "language_loss": 0.80477309, "learning_rate": 3.89064726633596e-06, "loss": 0.82654446, "num_input_tokens_seen": 47639445, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.734375, "step": 2205, "time_per_iteration": 2.4821231365203857 }, { "auxiliary_loss_clip": 0.01100521, "auxiliary_loss_mlp": 0.01073936, "balance_loss_clip": 1.02308106, "balance_loss_mlp": 1.02849793, "epoch": 0.13263189538554035, "flos": 21287382082560.0, "grad_norm": 2.059719799170582, "language_loss": 0.81315291, "learning_rate": 3.890520213887941e-06, "loss": 0.83489746, "num_input_tokens_seen": 47658740, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.71875, "step": 2206, "time_per_iteration": 2.3967065811157227 }, { "auxiliary_loss_clip": 0.0110418, "auxiliary_loss_mlp": 0.01078341, "balance_loss_clip": 1.02965617, "balance_loss_mlp": 1.0296495, "epoch": 0.13269201863820831, "flos": 16873138465920.0, "grad_norm": 2.161020196177023, "language_loss": 0.77088231, "learning_rate": 3.890393089751208e-06, "loss": 0.7927075, "num_input_tokens_seen": 47676880, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.74609375, "step": 2207, "time_per_iteration": 2.3862597942352295 }, { "auxiliary_loss_clip": 0.01099054, "auxiliary_loss_mlp": 0.01069336, "balance_loss_clip": 1.01740837, "balance_loss_mlp": 1.02708483, "epoch": 0.1327521418908763, "flos": 23767786558080.0, "grad_norm": 1.603926206442705, "language_loss": 0.86015475, "learning_rate": 3.890265893930578e-06, "loss": 0.88183862, "num_input_tokens_seen": 47696635, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.71875, "step": 2208, "time_per_iteration": 2.411311388015747 }, { "auxiliary_loss_clip": 0.01096619, "auxiliary_loss_mlp": 0.01076314, "balance_loss_clip": 1.02791524, "balance_loss_mlp": 1.02822804, "epoch": 0.13281226514354427, "flos": 26504383656960.0, "grad_norm": 1.6122563237262262, "language_loss": 0.86913729, "learning_rate": 3.890138626430876e-06, "loss": 0.89086658, "num_input_tokens_seen": 47717760, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.68359375, "step": 2209, "time_per_iteration": 2.4462203979492188 }, { "auxiliary_loss_clip": 0.01098641, "auxiliary_loss_mlp": 0.01073012, "balance_loss_clip": 1.02511382, "balance_loss_mlp": 1.02627707, "epoch": 0.13287238839621224, "flos": 24497610952320.0, "grad_norm": 1.8509664704096933, "language_loss": 0.84244275, "learning_rate": 3.890011287256929e-06, "loss": 0.86415929, "num_input_tokens_seen": 47737685, "router_z_loss_clip": 0.47851562, "router_z_loss_mlp": 0.7265625, "step": 2210, "time_per_iteration": 2.405325412750244 }, { "auxiliary_loss_clip": 0.01028678, "auxiliary_loss_mlp": 0.01013493, "balance_loss_clip": 1.0045284, "balance_loss_mlp": 1.00929117, "epoch": 0.1329325116488802, "flos": 67691071054080.0, "grad_norm": 0.7644311733127371, "language_loss": 0.58120221, "learning_rate": 3.889883876413563e-06, "loss": 0.60162395, "num_input_tokens_seen": 47802415, "router_z_loss_clip": 0.08984375, "router_z_loss_mlp": 0.19433594, "step": 2211, "time_per_iteration": 3.157275915145874 }, { "auxiliary_loss_clip": 0.01030107, "auxiliary_loss_mlp": 0.01008513, "balance_loss_clip": 1.00016809, "balance_loss_mlp": 1.00954723, "epoch": 0.13299263490154817, "flos": 72258303715200.0, "grad_norm": 0.8247129396851514, "language_loss": 0.55416054, "learning_rate": 3.889756393905611e-06, "loss": 0.57454669, "num_input_tokens_seen": 47871485, "router_z_loss_clip": 0.08349609, "router_z_loss_mlp": 0.20507812, "step": 2212, "time_per_iteration": 3.1067214012145996 }, { "auxiliary_loss_clip": 0.01106708, "auxiliary_loss_mlp": 0.0107704, "balance_loss_clip": 1.02511203, "balance_loss_mlp": 1.02860498, "epoch": 0.13305275815421613, "flos": 17930308567680.0, "grad_norm": 2.147444456482937, "language_loss": 0.77006203, "learning_rate": 3.889628839737908e-06, "loss": 0.7918995, "num_input_tokens_seen": 47888315, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.78125, "step": 2213, "time_per_iteration": 2.363706588745117 }, { "auxiliary_loss_clip": 0.01093354, "auxiliary_loss_mlp": 0.0106555, "balance_loss_clip": 1.01963019, "balance_loss_mlp": 1.02437508, "epoch": 0.13311288140688413, "flos": 22339943884800.0, "grad_norm": 1.6570479582484574, "language_loss": 0.80528772, "learning_rate": 3.889501213915291e-06, "loss": 0.82687676, "num_input_tokens_seen": 47906600, "router_z_loss_clip": 0.45898438, "router_z_loss_mlp": 0.69140625, "step": 2214, "time_per_iteration": 2.430022716522217 }, { "auxiliary_loss_clip": 0.01100087, "auxiliary_loss_mlp": 0.0107587, "balance_loss_clip": 1.02699411, "balance_loss_mlp": 1.02724886, "epoch": 0.1331730046595521, "flos": 31867531649280.0, "grad_norm": 2.6049125947989578, "language_loss": 0.71706915, "learning_rate": 3.889373516442597e-06, "loss": 0.73882866, "num_input_tokens_seen": 47927630, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.7265625, "step": 2215, "time_per_iteration": 2.602797031402588 }, { "auxiliary_loss_clip": 0.01100546, "auxiliary_loss_mlp": 0.01079967, "balance_loss_clip": 1.02856398, "balance_loss_mlp": 1.02581954, "epoch": 0.13323312791222006, "flos": 22565447556480.0, "grad_norm": 1.6504324730825204, "language_loss": 0.8263526, "learning_rate": 3.889245747324671e-06, "loss": 0.8481577, "num_input_tokens_seen": 47947935, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.74609375, "step": 2216, "time_per_iteration": 2.4838573932647705 }, { "auxiliary_loss_clip": 0.01099081, "auxiliary_loss_mlp": 0.01080751, "balance_loss_clip": 1.03218544, "balance_loss_mlp": 1.02684927, "epoch": 0.13329325116488802, "flos": 15084433313280.0, "grad_norm": 2.154954400830231, "language_loss": 0.89480364, "learning_rate": 3.889117906566356e-06, "loss": 0.91660196, "num_input_tokens_seen": 47965515, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.71875, "step": 2217, "time_per_iteration": 2.4196934700012207 }, { "auxiliary_loss_clip": 0.01099481, "auxiliary_loss_mlp": 0.01069494, "balance_loss_clip": 1.02114272, "balance_loss_mlp": 1.02821517, "epoch": 0.133353374417556, "flos": 27452450160000.0, "grad_norm": 2.3070235340726657, "language_loss": 0.75504792, "learning_rate": 3.888989994172501e-06, "loss": 0.77673769, "num_input_tokens_seen": 47985675, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.7109375, "step": 2218, "time_per_iteration": 2.4511611461639404 }, { "auxiliary_loss_clip": 0.01100319, "auxiliary_loss_mlp": 0.01072347, "balance_loss_clip": 1.01979923, "balance_loss_mlp": 1.02819741, "epoch": 0.13341349767022395, "flos": 24093631077120.0, "grad_norm": 1.6865516504875226, "language_loss": 0.88894379, "learning_rate": 3.8888620101479565e-06, "loss": 0.9106704, "num_input_tokens_seen": 48004985, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.72265625, "step": 2219, "time_per_iteration": 2.419753313064575 }, { "auxiliary_loss_clip": 0.01104086, "auxiliary_loss_mlp": 0.01074202, "balance_loss_clip": 1.02556419, "balance_loss_mlp": 1.03176022, "epoch": 0.13347362092289192, "flos": 24132209996160.0, "grad_norm": 1.9515259778377105, "language_loss": 0.78933638, "learning_rate": 3.888733954497574e-06, "loss": 0.81111932, "num_input_tokens_seen": 48024965, "router_z_loss_clip": 0.48632812, "router_z_loss_mlp": 0.72265625, "step": 2220, "time_per_iteration": 2.4327468872070312 }, { "auxiliary_loss_clip": 0.01101113, "auxiliary_loss_mlp": 0.01074012, "balance_loss_clip": 1.02616096, "balance_loss_mlp": 1.02821457, "epoch": 0.1335337441755599, "flos": 18435711542400.0, "grad_norm": 2.1120706156893796, "language_loss": 0.81201619, "learning_rate": 3.888605827226212e-06, "loss": 0.83376741, "num_input_tokens_seen": 48040890, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.73046875, "step": 2221, "time_per_iteration": 2.369741678237915 }, { "auxiliary_loss_clip": 0.01030732, "auxiliary_loss_mlp": 0.01028313, "balance_loss_clip": 1.02020633, "balance_loss_mlp": 1.0107193, "epoch": 0.13359386742822787, "flos": 50609395837440.0, "grad_norm": 0.9849485220337436, "language_loss": 0.69057494, "learning_rate": 3.8884776283387275e-06, "loss": 0.71116537, "num_input_tokens_seen": 48091855, "router_z_loss_clip": 0.08105469, "router_z_loss_mlp": 0.20019531, "step": 2222, "time_per_iteration": 2.845627784729004 }, { "auxiliary_loss_clip": 0.01102037, "auxiliary_loss_mlp": 0.01078514, "balance_loss_clip": 1.03171277, "balance_loss_mlp": 1.03042388, "epoch": 0.13365399068089584, "flos": 22777615088640.0, "grad_norm": 1.75699260935051, "language_loss": 0.68620056, "learning_rate": 3.888349357839982e-06, "loss": 0.70800608, "num_input_tokens_seen": 48111350, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 0.71484375, "step": 2223, "time_per_iteration": 2.4104342460632324 }, { "auxiliary_loss_clip": 0.01098854, "auxiliary_loss_mlp": 0.01069809, "balance_loss_clip": 1.02074289, "balance_loss_mlp": 1.02735221, "epoch": 0.1337141139335638, "flos": 12530781072000.0, "grad_norm": 1.9775686222207347, "language_loss": 0.8480376, "learning_rate": 3.88822101573484e-06, "loss": 0.86972427, "num_input_tokens_seen": 48129840, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 0.71484375, "step": 2224, "time_per_iteration": 2.38739275932312 }, { "auxiliary_loss_clip": 0.01100708, "auxiliary_loss_mlp": 0.01071646, "balance_loss_clip": 1.02279413, "balance_loss_mlp": 1.0275836, "epoch": 0.13377423718623177, "flos": 23037857429760.0, "grad_norm": 1.9255327745820277, "language_loss": 0.68654615, "learning_rate": 3.888092602028167e-06, "loss": 0.70826972, "num_input_tokens_seen": 48149240, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.73046875, "step": 2225, "time_per_iteration": 2.4010989665985107 }, { "auxiliary_loss_clip": 0.01099627, "auxiliary_loss_mlp": 0.0106855, "balance_loss_clip": 1.02313101, "balance_loss_mlp": 1.02782238, "epoch": 0.13383436043889974, "flos": 16215479585280.0, "grad_norm": 2.0355513272661234, "language_loss": 0.92166185, "learning_rate": 3.887964116724835e-06, "loss": 0.94334352, "num_input_tokens_seen": 48166330, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 0.71875, "step": 2226, "time_per_iteration": 2.3766438961029053 }, { "auxiliary_loss_clip": 0.01098751, "auxiliary_loss_mlp": 0.01085356, "balance_loss_clip": 1.03769648, "balance_loss_mlp": 1.02690625, "epoch": 0.1338944836915677, "flos": 24278530970880.0, "grad_norm": 2.3197974944567985, "language_loss": 0.76496994, "learning_rate": 3.887835559829712e-06, "loss": 0.78681099, "num_input_tokens_seen": 48187600, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.71875, "step": 2227, "time_per_iteration": 2.5395162105560303 }, { "auxiliary_loss_clip": 0.0109695, "auxiliary_loss_mlp": 0.01069372, "balance_loss_clip": 1.01923299, "balance_loss_mlp": 1.02516258, "epoch": 0.1339546069442357, "flos": 17597900712960.0, "grad_norm": 2.045375160076701, "language_loss": 0.86295033, "learning_rate": 3.8877069313476764e-06, "loss": 0.88461351, "num_input_tokens_seen": 48204400, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.71875, "step": 2228, "time_per_iteration": 2.364319324493408 }, { "auxiliary_loss_clip": 0.01094923, "auxiliary_loss_mlp": 0.01067807, "balance_loss_clip": 1.02460504, "balance_loss_mlp": 1.02542627, "epoch": 0.13401473019690366, "flos": 18989049680640.0, "grad_norm": 1.8602125548642618, "language_loss": 0.83485156, "learning_rate": 3.8875782312836054e-06, "loss": 0.85647893, "num_input_tokens_seen": 48222180, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.6953125, "step": 2229, "time_per_iteration": 2.375342607498169 }, { "auxiliary_loss_clip": 0.01097343, "auxiliary_loss_mlp": 0.01076618, "balance_loss_clip": 1.03084183, "balance_loss_mlp": 1.0256443, "epoch": 0.13407485344957162, "flos": 26942578531200.0, "grad_norm": 1.617592929277372, "language_loss": 0.76911587, "learning_rate": 3.887449459642378e-06, "loss": 0.79085553, "num_input_tokens_seen": 48243245, "router_z_loss_clip": 0.45898438, "router_z_loss_mlp": 0.71875, "step": 2230, "time_per_iteration": 3.856142997741699 }, { "auxiliary_loss_clip": 0.01097907, "auxiliary_loss_mlp": 0.01075188, "balance_loss_clip": 1.02967405, "balance_loss_mlp": 1.02619624, "epoch": 0.1341349767022396, "flos": 20338338061440.0, "grad_norm": 1.7600423149187243, "language_loss": 0.82524061, "learning_rate": 3.8873206164288785e-06, "loss": 0.84697163, "num_input_tokens_seen": 48262600, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 0.71875, "step": 2231, "time_per_iteration": 2.4314239025115967 }, { "auxiliary_loss_clip": 0.01099182, "auxiliary_loss_mlp": 0.01074762, "balance_loss_clip": 1.02421737, "balance_loss_mlp": 1.02655923, "epoch": 0.13419509995490755, "flos": 29860724033280.0, "grad_norm": 1.5567977257933459, "language_loss": 0.74192995, "learning_rate": 3.887191701647992e-06, "loss": 0.76366937, "num_input_tokens_seen": 48285075, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.7265625, "step": 2232, "time_per_iteration": 3.860792398452759 }, { "auxiliary_loss_clip": 0.01102289, "auxiliary_loss_mlp": 0.010757, "balance_loss_clip": 1.02386832, "balance_loss_mlp": 1.02843082, "epoch": 0.13425522320757552, "flos": 26941775569920.0, "grad_norm": 2.5913909199948186, "language_loss": 0.68207562, "learning_rate": 3.8870627153046066e-06, "loss": 0.70385551, "num_input_tokens_seen": 48301285, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.73828125, "step": 2233, "time_per_iteration": 3.8814749717712402 }, { "auxiliary_loss_clip": 0.01098815, "auxiliary_loss_mlp": 0.01063877, "balance_loss_clip": 1.01583529, "balance_loss_mlp": 1.02544498, "epoch": 0.1343153464602435, "flos": 15776411927040.0, "grad_norm": 2.5458297067228233, "language_loss": 0.8360821, "learning_rate": 3.886933657403615e-06, "loss": 0.85770905, "num_input_tokens_seen": 48317835, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.734375, "step": 2234, "time_per_iteration": 2.418581008911133 }, { "auxiliary_loss_clip": 0.01099129, "auxiliary_loss_mlp": 0.01070572, "balance_loss_clip": 1.0197413, "balance_loss_mlp": 1.02733564, "epoch": 0.13437546971291148, "flos": 24313653665280.0, "grad_norm": 1.9024392479551717, "language_loss": 0.82850933, "learning_rate": 3.886804527949909e-06, "loss": 0.85020638, "num_input_tokens_seen": 48335670, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.71875, "step": 2235, "time_per_iteration": 2.4185612201690674 }, { "auxiliary_loss_clip": 0.01098267, "auxiliary_loss_mlp": 0.0107459, "balance_loss_clip": 1.02435517, "balance_loss_mlp": 1.02520013, "epoch": 0.13443559296557944, "flos": 26649482734080.0, "grad_norm": 1.5479806939711218, "language_loss": 0.87836885, "learning_rate": 3.8866753269483864e-06, "loss": 0.90009737, "num_input_tokens_seen": 48357805, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.734375, "step": 2236, "time_per_iteration": 2.4468624591827393 }, { "auxiliary_loss_clip": 0.01100489, "auxiliary_loss_mlp": 0.01079573, "balance_loss_clip": 1.02638197, "balance_loss_mlp": 1.0286299, "epoch": 0.1344957162182474, "flos": 21795193941120.0, "grad_norm": 1.497363164959225, "language_loss": 0.78802741, "learning_rate": 3.886546054403946e-06, "loss": 0.80982798, "num_input_tokens_seen": 48377845, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.71875, "step": 2237, "time_per_iteration": 2.4397060871124268 }, { "auxiliary_loss_clip": 0.01104362, "auxiliary_loss_mlp": 0.01081905, "balance_loss_clip": 1.0286901, "balance_loss_mlp": 1.02831817, "epoch": 0.13455583947091537, "flos": 19864531733760.0, "grad_norm": 1.9403822991825106, "language_loss": 0.81926149, "learning_rate": 3.886416710321491e-06, "loss": 0.84112418, "num_input_tokens_seen": 48394735, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.76171875, "step": 2238, "time_per_iteration": 2.518111228942871 }, { "auxiliary_loss_clip": 0.01097606, "auxiliary_loss_mlp": 0.01073574, "balance_loss_clip": 1.02162218, "balance_loss_mlp": 1.02651429, "epoch": 0.13461596272358334, "flos": 30845519153280.0, "grad_norm": 1.9232825845611043, "language_loss": 0.70116866, "learning_rate": 3.886287294705924e-06, "loss": 0.72288048, "num_input_tokens_seen": 48414200, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.7109375, "step": 2239, "time_per_iteration": 2.4701101779937744 }, { "auxiliary_loss_clip": 0.01100847, "auxiliary_loss_mlp": 0.01071233, "balance_loss_clip": 1.02083182, "balance_loss_mlp": 1.02680802, "epoch": 0.1346760859762513, "flos": 12493633518720.0, "grad_norm": 2.533619918983608, "language_loss": 0.85951328, "learning_rate": 3.8861578075621555e-06, "loss": 0.88123411, "num_input_tokens_seen": 48431065, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.7421875, "step": 2240, "time_per_iteration": 2.3721256256103516 }, { "auxiliary_loss_clip": 0.01104244, "auxiliary_loss_mlp": 0.01079318, "balance_loss_clip": 1.02359986, "balance_loss_mlp": 1.02625155, "epoch": 0.1347362092289193, "flos": 21834924935040.0, "grad_norm": 1.759400868063721, "language_loss": 0.80302268, "learning_rate": 3.886028248895093e-06, "loss": 0.82485831, "num_input_tokens_seen": 48450335, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.78125, "step": 2241, "time_per_iteration": 2.4087882041931152 }, { "auxiliary_loss_clip": 0.01097081, "auxiliary_loss_mlp": 0.01065632, "balance_loss_clip": 1.01835346, "balance_loss_mlp": 1.02690816, "epoch": 0.13479633248158726, "flos": 23508451912320.0, "grad_norm": 1.8099688119627833, "language_loss": 0.85164362, "learning_rate": 3.88589861870965e-06, "loss": 0.87327075, "num_input_tokens_seen": 48468555, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.703125, "step": 2242, "time_per_iteration": 2.4149866104125977 }, { "auxiliary_loss_clip": 0.01103398, "auxiliary_loss_mlp": 0.01083021, "balance_loss_clip": 1.03257179, "balance_loss_mlp": 1.02953422, "epoch": 0.13485645573425523, "flos": 29343241728000.0, "grad_norm": 2.581427263831077, "language_loss": 0.68314719, "learning_rate": 3.885768917010744e-06, "loss": 0.70501137, "num_input_tokens_seen": 48488515, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.7421875, "step": 2243, "time_per_iteration": 2.4642670154571533 }, { "auxiliary_loss_clip": 0.01096104, "auxiliary_loss_mlp": 0.01070603, "balance_loss_clip": 1.02492189, "balance_loss_mlp": 1.02598977, "epoch": 0.1349165789869232, "flos": 28035883756800.0, "grad_norm": 1.4770541881657482, "language_loss": 0.74721408, "learning_rate": 3.8856391438032895e-06, "loss": 0.7688812, "num_input_tokens_seen": 48510515, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.69921875, "step": 2244, "time_per_iteration": 2.4423115253448486 }, { "auxiliary_loss_clip": 0.01099045, "auxiliary_loss_mlp": 0.01077254, "balance_loss_clip": 1.02575552, "balance_loss_mlp": 1.02527559, "epoch": 0.13497670223959116, "flos": 22852713156480.0, "grad_norm": 1.5933513245835889, "language_loss": 0.87576103, "learning_rate": 3.88550929909221e-06, "loss": 0.89752406, "num_input_tokens_seen": 48529940, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.73828125, "step": 2245, "time_per_iteration": 2.406095266342163 }, { "auxiliary_loss_clip": 0.010964, "auxiliary_loss_mlp": 0.01070214, "balance_loss_clip": 1.02264965, "balance_loss_mlp": 1.02639711, "epoch": 0.13503682549225912, "flos": 16503757614720.0, "grad_norm": 1.6263544111086021, "language_loss": 0.80819583, "learning_rate": 3.88537938288243e-06, "loss": 0.82986194, "num_input_tokens_seen": 48548190, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.703125, "step": 2246, "time_per_iteration": 2.403489112854004 }, { "auxiliary_loss_clip": 0.01026472, "auxiliary_loss_mlp": 0.01015146, "balance_loss_clip": 1.00718296, "balance_loss_mlp": 1.00655138, "epoch": 0.1350969487449271, "flos": 70753023912960.0, "grad_norm": 0.757451107081713, "language_loss": 0.60601932, "learning_rate": 3.885249395178874e-06, "loss": 0.62643546, "num_input_tokens_seen": 48613165, "router_z_loss_clip": 0.07958984, "router_z_loss_mlp": 0.19921875, "step": 2247, "time_per_iteration": 3.1073782444000244 }, { "auxiliary_loss_clip": 0.01103046, "auxiliary_loss_mlp": 0.01076854, "balance_loss_clip": 1.02134991, "balance_loss_mlp": 1.02588606, "epoch": 0.13515707199759508, "flos": 23074865337600.0, "grad_norm": 1.7590742050169108, "language_loss": 0.82550848, "learning_rate": 3.885119335986473e-06, "loss": 0.8473075, "num_input_tokens_seen": 48631705, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.7734375, "step": 2248, "time_per_iteration": 2.4199070930480957 }, { "auxiliary_loss_clip": 0.01097229, "auxiliary_loss_mlp": 0.01066393, "balance_loss_clip": 1.0205214, "balance_loss_mlp": 1.02659726, "epoch": 0.13521719525026304, "flos": 23185225745280.0, "grad_norm": 1.7847406000295978, "language_loss": 0.78590763, "learning_rate": 3.884989205310157e-06, "loss": 0.80754375, "num_input_tokens_seen": 48649740, "router_z_loss_clip": 0.45898438, "router_z_loss_mlp": 0.70703125, "step": 2249, "time_per_iteration": 2.424166679382324 }, { "auxiliary_loss_clip": 0.01097915, "auxiliary_loss_mlp": 0.01072005, "balance_loss_clip": 1.02208006, "balance_loss_mlp": 1.02584088, "epoch": 0.135277318502931, "flos": 24789764142720.0, "grad_norm": 1.4772714333685675, "language_loss": 0.85804892, "learning_rate": 3.884859003154862e-06, "loss": 0.87974811, "num_input_tokens_seen": 48671565, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.71875, "step": 2250, "time_per_iteration": 2.459350824356079 }, { "auxiliary_loss_clip": 0.01100116, "auxiliary_loss_mlp": 0.01076168, "balance_loss_clip": 1.02388275, "balance_loss_mlp": 1.02707946, "epoch": 0.13533744175559898, "flos": 21907439562240.0, "grad_norm": 1.952702536190691, "language_loss": 0.84368861, "learning_rate": 3.884728729525524e-06, "loss": 0.86545146, "num_input_tokens_seen": 48690425, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.73046875, "step": 2251, "time_per_iteration": 2.4180047512054443 }, { "auxiliary_loss_clip": 0.01098556, "auxiliary_loss_mlp": 0.01082246, "balance_loss_clip": 1.03410912, "balance_loss_mlp": 1.0249269, "epoch": 0.13539756500826694, "flos": 21210678092160.0, "grad_norm": 1.7143504194189876, "language_loss": 0.86527205, "learning_rate": 3.884598384427084e-06, "loss": 0.88708001, "num_input_tokens_seen": 48707505, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.734375, "step": 2252, "time_per_iteration": 2.398433208465576 }, { "auxiliary_loss_clip": 0.01028553, "auxiliary_loss_mlp": 0.0101601, "balance_loss_clip": 1.00728357, "balance_loss_mlp": 1.00885916, "epoch": 0.1354576882609349, "flos": 63238981656960.0, "grad_norm": 0.7733023201636623, "language_loss": 0.62033314, "learning_rate": 3.884467967864485e-06, "loss": 0.64077878, "num_input_tokens_seen": 48775895, "router_z_loss_clip": 0.08740234, "router_z_loss_mlp": 0.19726562, "step": 2253, "time_per_iteration": 3.136885166168213 }, { "auxiliary_loss_clip": 0.01097439, "auxiliary_loss_mlp": 0.01076398, "balance_loss_clip": 1.02907228, "balance_loss_mlp": 1.02739191, "epoch": 0.1355178115136029, "flos": 25481882401920.0, "grad_norm": 1.6281869850150181, "language_loss": 0.91105425, "learning_rate": 3.884337479842671e-06, "loss": 0.9327926, "num_input_tokens_seen": 48798370, "router_z_loss_clip": 0.47460938, "router_z_loss_mlp": 0.69921875, "step": 2254, "time_per_iteration": 2.457720994949341 }, { "auxiliary_loss_clip": 0.01101147, "auxiliary_loss_mlp": 0.0108143, "balance_loss_clip": 1.0323391, "balance_loss_mlp": 1.02672029, "epoch": 0.13557793476627086, "flos": 21615879864960.0, "grad_norm": 1.8199792501807002, "language_loss": 0.87098551, "learning_rate": 3.884206920366591e-06, "loss": 0.8928113, "num_input_tokens_seen": 48817955, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 0.7421875, "step": 2255, "time_per_iteration": 2.409785270690918 }, { "auxiliary_loss_clip": 0.01094607, "auxiliary_loss_mlp": 0.01066914, "balance_loss_clip": 1.02163863, "balance_loss_mlp": 1.02453744, "epoch": 0.13563805801893883, "flos": 24927322366080.0, "grad_norm": 2.41359802652145, "language_loss": 0.77321422, "learning_rate": 3.884076289441196e-06, "loss": 0.79482943, "num_input_tokens_seen": 48836330, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.69921875, "step": 2256, "time_per_iteration": 2.4274182319641113 }, { "auxiliary_loss_clip": 0.010961, "auxiliary_loss_mlp": 0.01068414, "balance_loss_clip": 1.01841819, "balance_loss_mlp": 1.02437472, "epoch": 0.1356981812716068, "flos": 14749581663360.0, "grad_norm": 2.1700475601124296, "language_loss": 0.84923184, "learning_rate": 3.88394558707144e-06, "loss": 0.87087703, "num_input_tokens_seen": 48851890, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.71875, "step": 2257, "time_per_iteration": 2.373764753341675 }, { "auxiliary_loss_clip": 0.01103694, "auxiliary_loss_mlp": 0.01079532, "balance_loss_clip": 1.02600682, "balance_loss_mlp": 1.02561045, "epoch": 0.13575830452427476, "flos": 11107791077760.0, "grad_norm": 2.1731821464517953, "language_loss": 0.84647334, "learning_rate": 3.883814813262277e-06, "loss": 0.86830562, "num_input_tokens_seen": 48865510, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 0.78125, "step": 2258, "time_per_iteration": 2.370626926422119 }, { "auxiliary_loss_clip": 0.01099121, "auxiliary_loss_mlp": 0.01071684, "balance_loss_clip": 1.02104437, "balance_loss_mlp": 1.02624071, "epoch": 0.13581842777694272, "flos": 17959531242240.0, "grad_norm": 2.6373067891920203, "language_loss": 0.85858792, "learning_rate": 3.883683968018669e-06, "loss": 0.88029593, "num_input_tokens_seen": 48882360, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.73046875, "step": 2259, "time_per_iteration": 2.395463705062866 }, { "auxiliary_loss_clip": 0.01097073, "auxiliary_loss_mlp": 0.01075163, "balance_loss_clip": 1.0283848, "balance_loss_mlp": 1.02525616, "epoch": 0.1358785510296107, "flos": 22856029735680.0, "grad_norm": 2.0032252664893138, "language_loss": 0.75659031, "learning_rate": 3.8835530513455755e-06, "loss": 0.77831268, "num_input_tokens_seen": 48902700, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 0.71875, "step": 2260, "time_per_iteration": 2.3941452503204346 }, { "auxiliary_loss_clip": 0.01097665, "auxiliary_loss_mlp": 0.01071203, "balance_loss_clip": 1.02318597, "balance_loss_mlp": 1.02639341, "epoch": 0.13593867428227868, "flos": 25738214670720.0, "grad_norm": 2.292928812605111, "language_loss": 0.77429187, "learning_rate": 3.883422063247961e-06, "loss": 0.79598057, "num_input_tokens_seen": 48922525, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.71484375, "step": 2261, "time_per_iteration": 2.4400699138641357 }, { "auxiliary_loss_clip": 0.01098733, "auxiliary_loss_mlp": 0.01065961, "balance_loss_clip": 1.0187062, "balance_loss_mlp": 1.025599, "epoch": 0.13599879753494665, "flos": 31247858194560.0, "grad_norm": 2.604008987517394, "language_loss": 0.65523297, "learning_rate": 3.883291003730794e-06, "loss": 0.67687988, "num_input_tokens_seen": 48942510, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.734375, "step": 2262, "time_per_iteration": 2.57812762260437 }, { "auxiliary_loss_clip": 0.01097153, "auxiliary_loss_mlp": 0.01061891, "balance_loss_clip": 1.01711595, "balance_loss_mlp": 1.0269351, "epoch": 0.1360589207876146, "flos": 23913898064640.0, "grad_norm": 2.5597430599023268, "language_loss": 0.85676247, "learning_rate": 3.883159872799043e-06, "loss": 0.87835294, "num_input_tokens_seen": 48962625, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.703125, "step": 2263, "time_per_iteration": 2.4191956520080566 }, { "auxiliary_loss_clip": 0.01099959, "auxiliary_loss_mlp": 0.01076651, "balance_loss_clip": 1.02605867, "balance_loss_mlp": 1.02628708, "epoch": 0.13611904404028258, "flos": 19973181484800.0, "grad_norm": 1.813451160050589, "language_loss": 0.90408027, "learning_rate": 3.8830286704576815e-06, "loss": 0.92584634, "num_input_tokens_seen": 48982525, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.73828125, "step": 2264, "time_per_iteration": 2.3880631923675537 }, { "auxiliary_loss_clip": 0.01101171, "auxiliary_loss_mlp": 0.01070633, "balance_loss_clip": 1.01980257, "balance_loss_mlp": 1.02616441, "epoch": 0.13617916729295054, "flos": 15339753152640.0, "grad_norm": 6.156833009868393, "language_loss": 0.74880558, "learning_rate": 3.882897396711683e-06, "loss": 0.77052367, "num_input_tokens_seen": 48997605, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.75, "step": 2265, "time_per_iteration": 2.3641650676727295 }, { "auxiliary_loss_clip": 0.01095415, "auxiliary_loss_mlp": 0.01066369, "balance_loss_clip": 1.02023554, "balance_loss_mlp": 1.02640224, "epoch": 0.1362392905456185, "flos": 27450285655680.0, "grad_norm": 1.9548746074036174, "language_loss": 0.6882602, "learning_rate": 3.882766051566027e-06, "loss": 0.70987803, "num_input_tokens_seen": 49018535, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.69140625, "step": 2266, "time_per_iteration": 2.4353957176208496 }, { "auxiliary_loss_clip": 0.010956, "auxiliary_loss_mlp": 0.01077713, "balance_loss_clip": 1.02888465, "balance_loss_mlp": 1.02594626, "epoch": 0.1362994137982865, "flos": 25007866606080.0, "grad_norm": 1.539464861056712, "language_loss": 0.78371, "learning_rate": 3.882634635025694e-06, "loss": 0.80544311, "num_input_tokens_seen": 49038865, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.6953125, "step": 2267, "time_per_iteration": 2.422356367111206 }, { "auxiliary_loss_clip": 0.01096752, "auxiliary_loss_mlp": 0.01074279, "balance_loss_clip": 1.02852607, "balance_loss_mlp": 1.02502263, "epoch": 0.13635953705095447, "flos": 20301993469440.0, "grad_norm": 1.819110101552072, "language_loss": 0.83485019, "learning_rate": 3.882503147095667e-06, "loss": 0.85656047, "num_input_tokens_seen": 49058010, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.71875, "step": 2268, "time_per_iteration": 2.3829476833343506 }, { "auxiliary_loss_clip": 0.0109508, "auxiliary_loss_mlp": 0.01060979, "balance_loss_clip": 1.01548886, "balance_loss_mlp": 1.02592158, "epoch": 0.13641966030362243, "flos": 31357066527360.0, "grad_norm": 1.7807704464336256, "language_loss": 0.7826739, "learning_rate": 3.882371587780931e-06, "loss": 0.8042345, "num_input_tokens_seen": 49080330, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.69140625, "step": 2269, "time_per_iteration": 3.877122640609741 }, { "auxiliary_loss_clip": 0.0110139, "auxiliary_loss_mlp": 0.01068464, "balance_loss_clip": 1.02104235, "balance_loss_mlp": 1.02821445, "epoch": 0.1364797835562904, "flos": 20477257827840.0, "grad_norm": 3.152170079273674, "language_loss": 0.82435924, "learning_rate": 3.882239957086477e-06, "loss": 0.84605777, "num_input_tokens_seen": 49097035, "router_z_loss_clip": 0.47460938, "router_z_loss_mlp": 0.734375, "step": 2270, "time_per_iteration": 2.3710389137268066 }, { "auxiliary_loss_clip": 0.01098724, "auxiliary_loss_mlp": 0.01076521, "balance_loss_clip": 1.02833652, "balance_loss_mlp": 1.02480543, "epoch": 0.13653990680895836, "flos": 13077520963200.0, "grad_norm": 2.448385458643169, "language_loss": 0.77454352, "learning_rate": 3.882108255017295e-06, "loss": 0.79629594, "num_input_tokens_seen": 49113945, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.73828125, "step": 2271, "time_per_iteration": 3.860752820968628 }, { "auxiliary_loss_clip": 0.01099024, "auxiliary_loss_mlp": 0.01076941, "balance_loss_clip": 1.03009224, "balance_loss_mlp": 1.02736855, "epoch": 0.13660003006162633, "flos": 16945757827200.0, "grad_norm": 1.8984415281285782, "language_loss": 0.82231283, "learning_rate": 3.881976481578379e-06, "loss": 0.84407252, "num_input_tokens_seen": 49132855, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 0.71484375, "step": 2272, "time_per_iteration": 5.248999118804932 }, { "auxiliary_loss_clip": 0.01023918, "auxiliary_loss_mlp": 0.01008438, "balance_loss_clip": 1.00138044, "balance_loss_mlp": 1.00426769, "epoch": 0.1366601533142943, "flos": 68679357310080.0, "grad_norm": 0.6972934234469925, "language_loss": 0.60881793, "learning_rate": 3.8818446367747255e-06, "loss": 0.62914151, "num_input_tokens_seen": 49198310, "router_z_loss_clip": 0.07080078, "router_z_loss_mlp": 0.19726562, "step": 2273, "time_per_iteration": 3.117666006088257 }, { "auxiliary_loss_clip": 0.01095842, "auxiliary_loss_mlp": 0.01070711, "balance_loss_clip": 1.0235517, "balance_loss_mlp": 1.02516508, "epoch": 0.13672027656696228, "flos": 19243252356480.0, "grad_norm": 1.6499385615182312, "language_loss": 0.79421479, "learning_rate": 3.881712720611336e-06, "loss": 0.8158803, "num_input_tokens_seen": 49217250, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.70703125, "step": 2274, "time_per_iteration": 2.3902363777160645 }, { "auxiliary_loss_clip": 0.01097658, "auxiliary_loss_mlp": 0.0107179, "balance_loss_clip": 1.02637148, "balance_loss_mlp": 1.02565575, "epoch": 0.13678039981963025, "flos": 24533780987520.0, "grad_norm": 1.7358002679608784, "language_loss": 0.79964995, "learning_rate": 3.881580733093211e-06, "loss": 0.82134438, "num_input_tokens_seen": 49236615, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.71875, "step": 2275, "time_per_iteration": 2.432495594024658 }, { "auxiliary_loss_clip": 0.01095109, "auxiliary_loss_mlp": 0.01071957, "balance_loss_clip": 1.02618098, "balance_loss_mlp": 1.02461791, "epoch": 0.13684052307229821, "flos": 15668425491840.0, "grad_norm": 2.394219708841684, "language_loss": 0.83864653, "learning_rate": 3.881448674225356e-06, "loss": 0.86031723, "num_input_tokens_seen": 49253935, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.703125, "step": 2276, "time_per_iteration": 2.379607915878296 }, { "auxiliary_loss_clip": 0.01103667, "auxiliary_loss_mlp": 0.01082731, "balance_loss_clip": 1.02877665, "balance_loss_mlp": 1.02644181, "epoch": 0.13690064632496618, "flos": 28363473843840.0, "grad_norm": 2.577271704825388, "language_loss": 0.73588455, "learning_rate": 3.881316544012779e-06, "loss": 0.75774848, "num_input_tokens_seen": 49273605, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.7734375, "step": 2277, "time_per_iteration": 2.4821271896362305 }, { "auxiliary_loss_clip": 0.01100044, "auxiliary_loss_mlp": 0.01090852, "balance_loss_clip": 1.04123688, "balance_loss_mlp": 1.02697539, "epoch": 0.13696076957763414, "flos": 23403642410880.0, "grad_norm": 2.823581413939763, "language_loss": 0.82793057, "learning_rate": 3.88118434246049e-06, "loss": 0.84983957, "num_input_tokens_seen": 49291785, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.73046875, "step": 2278, "time_per_iteration": 2.4047176837921143 }, { "auxiliary_loss_clip": 0.01097557, "auxiliary_loss_mlp": 0.01078271, "balance_loss_clip": 1.03199434, "balance_loss_mlp": 1.02652359, "epoch": 0.1370208928303021, "flos": 37195068188160.0, "grad_norm": 2.0781161923109384, "language_loss": 0.77574086, "learning_rate": 3.881052069573502e-06, "loss": 0.79749906, "num_input_tokens_seen": 49311405, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 0.7109375, "step": 2279, "time_per_iteration": 2.556952953338623 }, { "auxiliary_loss_clip": 0.01100715, "auxiliary_loss_mlp": 0.01073409, "balance_loss_clip": 1.02613068, "balance_loss_mlp": 1.02540874, "epoch": 0.13708101608297008, "flos": 26975187607680.0, "grad_norm": 10.713343639648075, "language_loss": 0.78060412, "learning_rate": 3.880919725356831e-06, "loss": 0.8023454, "num_input_tokens_seen": 49331835, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.75390625, "step": 2280, "time_per_iteration": 2.43609881401062 }, { "auxiliary_loss_clip": 0.0109492, "auxiliary_loss_mlp": 0.01072273, "balance_loss_clip": 1.0250659, "balance_loss_mlp": 1.02497327, "epoch": 0.13714113933563807, "flos": 32555635102080.0, "grad_norm": 1.727473074379076, "language_loss": 0.81443465, "learning_rate": 3.880787309815496e-06, "loss": 0.8361066, "num_input_tokens_seen": 49352290, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.69921875, "step": 2281, "time_per_iteration": 2.5257480144500732 }, { "auxiliary_loss_clip": 0.01103161, "auxiliary_loss_mlp": 0.01079185, "balance_loss_clip": 1.02928364, "balance_loss_mlp": 1.02799702, "epoch": 0.13720126258830603, "flos": 16100510878080.0, "grad_norm": 1.670013957744066, "language_loss": 0.85952735, "learning_rate": 3.880654822954518e-06, "loss": 0.88135087, "num_input_tokens_seen": 49370285, "router_z_loss_clip": 0.49804688, "router_z_loss_mlp": 0.75, "step": 2282, "time_per_iteration": 2.3663246631622314 }, { "auxiliary_loss_clip": 0.01099191, "auxiliary_loss_mlp": 0.01074217, "balance_loss_clip": 1.02698636, "balance_loss_mlp": 1.02656865, "epoch": 0.137261385840974, "flos": 18952530531840.0, "grad_norm": 1.5193065495591382, "language_loss": 0.75200641, "learning_rate": 3.8805222647789195e-06, "loss": 0.77374053, "num_input_tokens_seen": 49389610, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.7265625, "step": 2283, "time_per_iteration": 2.402902126312256 }, { "auxiliary_loss_clip": 0.01102513, "auxiliary_loss_mlp": 0.01073634, "balance_loss_clip": 1.02499628, "balance_loss_mlp": 1.02980363, "epoch": 0.13732150909364196, "flos": 23294224609920.0, "grad_norm": 2.645519710891511, "language_loss": 0.87478697, "learning_rate": 3.880389635293729e-06, "loss": 0.89654839, "num_input_tokens_seen": 49408390, "router_z_loss_clip": 0.48632812, "router_z_loss_mlp": 0.7265625, "step": 2284, "time_per_iteration": 2.4220385551452637 }, { "auxiliary_loss_clip": 0.01106307, "auxiliary_loss_mlp": 0.01086514, "balance_loss_clip": 1.03158212, "balance_loss_mlp": 1.0280261, "epoch": 0.13738163234630993, "flos": 29349979620480.0, "grad_norm": 1.7791836999671387, "language_loss": 0.76967597, "learning_rate": 3.880256934503974e-06, "loss": 0.79160422, "num_input_tokens_seen": 49427725, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.78125, "step": 2285, "time_per_iteration": 2.4919214248657227 }, { "auxiliary_loss_clip": 0.01099447, "auxiliary_loss_mlp": 0.01076976, "balance_loss_clip": 1.02609742, "balance_loss_mlp": 1.02654481, "epoch": 0.1374417555989779, "flos": 26650111138560.0, "grad_norm": 1.612541184902368, "language_loss": 0.77256465, "learning_rate": 3.880124162414689e-06, "loss": 0.79432893, "num_input_tokens_seen": 49449000, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.7265625, "step": 2286, "time_per_iteration": 2.44081974029541 }, { "auxiliary_loss_clip": 0.01102861, "auxiliary_loss_mlp": 0.01072635, "balance_loss_clip": 1.0188719, "balance_loss_mlp": 1.02679563, "epoch": 0.1375018788516459, "flos": 28402122585600.0, "grad_norm": 2.3770547945236316, "language_loss": 0.88716841, "learning_rate": 3.879991319030908e-06, "loss": 0.90892333, "num_input_tokens_seen": 49468360, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.76171875, "step": 2287, "time_per_iteration": 2.458857774734497 }, { "auxiliary_loss_clip": 0.01097854, "auxiliary_loss_mlp": 0.01075133, "balance_loss_clip": 1.02506518, "balance_loss_mlp": 1.02482677, "epoch": 0.13756200210431385, "flos": 37412297867520.0, "grad_norm": 2.601686648892162, "language_loss": 0.70701313, "learning_rate": 3.879858404357666e-06, "loss": 0.72874302, "num_input_tokens_seen": 49493450, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.73046875, "step": 2288, "time_per_iteration": 2.527089834213257 }, { "auxiliary_loss_clip": 0.01102251, "auxiliary_loss_mlp": 0.01078883, "balance_loss_clip": 1.026407, "balance_loss_mlp": 1.02810645, "epoch": 0.13762212535698182, "flos": 22709918229120.0, "grad_norm": 2.190929540402221, "language_loss": 0.89871919, "learning_rate": 3.879725418400005e-06, "loss": 0.9205305, "num_input_tokens_seen": 49511220, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.7421875, "step": 2289, "time_per_iteration": 2.410564422607422 }, { "auxiliary_loss_clip": 0.0109637, "auxiliary_loss_mlp": 0.01072824, "balance_loss_clip": 1.023543, "balance_loss_mlp": 1.02444482, "epoch": 0.13768224860964978, "flos": 23950975795200.0, "grad_norm": 1.7688432432085246, "language_loss": 0.76291549, "learning_rate": 3.879592361162969e-06, "loss": 0.78460747, "num_input_tokens_seen": 49529820, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 0.71875, "step": 2290, "time_per_iteration": 2.4063212871551514 }, { "auxiliary_loss_clip": 0.01025214, "auxiliary_loss_mlp": 0.01010472, "balance_loss_clip": 1.00389171, "balance_loss_mlp": 1.00469732, "epoch": 0.13774237186231775, "flos": 63586750510080.0, "grad_norm": 0.7058457857720316, "language_loss": 0.5168618, "learning_rate": 3.8794592326516015e-06, "loss": 0.53721869, "num_input_tokens_seen": 49595325, "router_z_loss_clip": 0.06591797, "router_z_loss_mlp": 0.20507812, "step": 2291, "time_per_iteration": 3.065821409225464 }, { "auxiliary_loss_clip": 0.01098689, "auxiliary_loss_mlp": 0.01067315, "balance_loss_clip": 1.01688957, "balance_loss_mlp": 1.02532411, "epoch": 0.1378024951149857, "flos": 24278321502720.0, "grad_norm": 1.997706831738451, "language_loss": 0.7277174, "learning_rate": 3.879326032870952e-06, "loss": 0.74937743, "num_input_tokens_seen": 49615850, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.734375, "step": 2292, "time_per_iteration": 2.4066214561462402 }, { "auxiliary_loss_clip": 0.01099581, "auxiliary_loss_mlp": 0.01078393, "balance_loss_clip": 1.02524972, "balance_loss_mlp": 1.0276202, "epoch": 0.13786261836765368, "flos": 14020839521280.0, "grad_norm": 2.7158987104013512, "language_loss": 0.82024562, "learning_rate": 3.879192761826071e-06, "loss": 0.84202534, "num_input_tokens_seen": 49631860, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.71875, "step": 2293, "time_per_iteration": 2.3778576850891113 }, { "auxiliary_loss_clip": 0.0110099, "auxiliary_loss_mlp": 0.01081193, "balance_loss_clip": 1.02616608, "balance_loss_mlp": 1.02490544, "epoch": 0.13792274162032167, "flos": 28877360279040.0, "grad_norm": 2.349710677917006, "language_loss": 0.80887091, "learning_rate": 3.879059419522011e-06, "loss": 0.83069271, "num_input_tokens_seen": 49652145, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.76171875, "step": 2294, "time_per_iteration": 2.466806173324585 }, { "auxiliary_loss_clip": 0.01099237, "auxiliary_loss_mlp": 0.01068466, "balance_loss_clip": 1.0192802, "balance_loss_mlp": 1.02621043, "epoch": 0.13798286487298964, "flos": 21140118501120.0, "grad_norm": 2.1096307442438134, "language_loss": 0.8194865, "learning_rate": 3.878926005963831e-06, "loss": 0.84116352, "num_input_tokens_seen": 49669880, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 0.73046875, "step": 2295, "time_per_iteration": 2.4102723598480225 }, { "auxiliary_loss_clip": 0.01098162, "auxiliary_loss_mlp": 0.0108087, "balance_loss_clip": 1.02603388, "balance_loss_mlp": 1.02447736, "epoch": 0.1380429881256576, "flos": 22486509239040.0, "grad_norm": 1.6183791427254255, "language_loss": 0.80288756, "learning_rate": 3.878792521156588e-06, "loss": 0.82467788, "num_input_tokens_seen": 49687255, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.734375, "step": 2296, "time_per_iteration": 2.4178006649017334 }, { "auxiliary_loss_clip": 0.01100757, "auxiliary_loss_mlp": 0.01072975, "balance_loss_clip": 1.02400398, "balance_loss_mlp": 1.02733612, "epoch": 0.13810311137832557, "flos": 21392715254400.0, "grad_norm": 1.8487846683645697, "language_loss": 0.800394, "learning_rate": 3.8786589651053446e-06, "loss": 0.82213134, "num_input_tokens_seen": 49706650, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.734375, "step": 2297, "time_per_iteration": 2.413954019546509 }, { "auxiliary_loss_clip": 0.0110143, "auxiliary_loss_mlp": 0.01079807, "balance_loss_clip": 1.0299058, "balance_loss_mlp": 1.02815509, "epoch": 0.13816323463099353, "flos": 25988786565120.0, "grad_norm": 2.0851136503108894, "language_loss": 0.70543957, "learning_rate": 3.878525337815164e-06, "loss": 0.72725189, "num_input_tokens_seen": 49725715, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.734375, "step": 2298, "time_per_iteration": 2.4297866821289062 }, { "auxiliary_loss_clip": 0.01105569, "auxiliary_loss_mlp": 0.01086277, "balance_loss_clip": 1.03444457, "balance_loss_mlp": 1.02774286, "epoch": 0.1382233578836615, "flos": 19243322179200.0, "grad_norm": 2.1961745269843256, "language_loss": 0.89040697, "learning_rate": 3.878391639291116e-06, "loss": 0.91232544, "num_input_tokens_seen": 49744710, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.78125, "step": 2299, "time_per_iteration": 2.3947176933288574 }, { "auxiliary_loss_clip": 0.01096682, "auxiliary_loss_mlp": 0.01079389, "balance_loss_clip": 1.0280571, "balance_loss_mlp": 1.02380145, "epoch": 0.1382834811363295, "flos": 25665106550400.0, "grad_norm": 1.7743553231500588, "language_loss": 0.78206468, "learning_rate": 3.878257869538267e-06, "loss": 0.80382538, "num_input_tokens_seen": 49764300, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.73046875, "step": 2300, "time_per_iteration": 2.44600248336792 }, { "auxiliary_loss_clip": 0.01099037, "auxiliary_loss_mlp": 0.01080589, "balance_loss_clip": 1.03202343, "balance_loss_mlp": 1.02796245, "epoch": 0.13834360438899745, "flos": 19783394000640.0, "grad_norm": 2.3544058251236164, "language_loss": 0.84997076, "learning_rate": 3.878124028561692e-06, "loss": 0.87176704, "num_input_tokens_seen": 49778380, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.7109375, "step": 2301, "time_per_iteration": 2.36763072013855 }, { "auxiliary_loss_clip": 0.01097462, "auxiliary_loss_mlp": 0.01075698, "balance_loss_clip": 1.02698886, "balance_loss_mlp": 1.0255233, "epoch": 0.13840372764166542, "flos": 26650634808960.0, "grad_norm": 3.0487476019763236, "language_loss": 0.87715501, "learning_rate": 3.877990116366466e-06, "loss": 0.89888656, "num_input_tokens_seen": 49797460, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.71875, "step": 2302, "time_per_iteration": 2.4854891300201416 }, { "auxiliary_loss_clip": 0.01025184, "auxiliary_loss_mlp": 0.01014759, "balance_loss_clip": 1.00932288, "balance_loss_mlp": 1.00514877, "epoch": 0.13846385089433338, "flos": 70507444343040.0, "grad_norm": 0.7706338764867829, "language_loss": 0.65793359, "learning_rate": 3.877856132957667e-06, "loss": 0.67833304, "num_input_tokens_seen": 49868005, "router_z_loss_clip": 0.05444336, "router_z_loss_mlp": 0.20117188, "step": 2303, "time_per_iteration": 3.156560182571411 }, { "auxiliary_loss_clip": 0.01098113, "auxiliary_loss_mlp": 0.01068781, "balance_loss_clip": 1.01966667, "balance_loss_mlp": 1.02554488, "epoch": 0.13852397414700135, "flos": 17347747754880.0, "grad_norm": 3.2309340948674956, "language_loss": 0.79546964, "learning_rate": 3.877722078340374e-06, "loss": 0.81713855, "num_input_tokens_seen": 49885825, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 0.7265625, "step": 2304, "time_per_iteration": 2.3472673892974854 }, { "auxiliary_loss_clip": 0.01102174, "auxiliary_loss_mlp": 0.01069077, "balance_loss_clip": 1.01791239, "balance_loss_mlp": 1.02756572, "epoch": 0.13858409739966931, "flos": 21542701921920.0, "grad_norm": 1.6291551231975308, "language_loss": 0.79094279, "learning_rate": 3.877587952519672e-06, "loss": 0.81265527, "num_input_tokens_seen": 49905975, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.74609375, "step": 2305, "time_per_iteration": 2.406261920928955 }, { "auxiliary_loss_clip": 0.01096838, "auxiliary_loss_mlp": 0.01072228, "balance_loss_clip": 1.0230664, "balance_loss_mlp": 1.02455163, "epoch": 0.13864422065233728, "flos": 21578837045760.0, "grad_norm": 1.6828507129835466, "language_loss": 0.89839327, "learning_rate": 3.877453755500647e-06, "loss": 0.92008394, "num_input_tokens_seen": 49925800, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 0.72265625, "step": 2306, "time_per_iteration": 2.4221625328063965 }, { "auxiliary_loss_clip": 0.01025438, "auxiliary_loss_mlp": 0.01012873, "balance_loss_clip": 1.00653148, "balance_loss_mlp": 1.00638831, "epoch": 0.13870434390500527, "flos": 53368861743360.0, "grad_norm": 0.8895551455015643, "language_loss": 0.59281766, "learning_rate": 3.877319487288387e-06, "loss": 0.61320078, "num_input_tokens_seen": 49977620, "router_z_loss_clip": 0.06347656, "router_z_loss_mlp": 0.19042969, "step": 2307, "time_per_iteration": 3.0858495235443115 }, { "auxiliary_loss_clip": 0.01101974, "auxiliary_loss_mlp": 0.01079981, "balance_loss_clip": 1.02624106, "balance_loss_mlp": 1.02611279, "epoch": 0.13876446715767324, "flos": 22564784240640.0, "grad_norm": 1.761154039198625, "language_loss": 0.81950474, "learning_rate": 3.877185147887984e-06, "loss": 0.84132427, "num_input_tokens_seen": 49996650, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.7578125, "step": 2308, "time_per_iteration": 2.41827654838562 }, { "auxiliary_loss_clip": 0.01095422, "auxiliary_loss_mlp": 0.01066286, "balance_loss_clip": 1.01648068, "balance_loss_mlp": 1.02429426, "epoch": 0.1388245904103412, "flos": 20704157953920.0, "grad_norm": 2.0389684100065417, "language_loss": 0.80894417, "learning_rate": 3.877050737304533e-06, "loss": 0.83056128, "num_input_tokens_seen": 50015640, "router_z_loss_clip": 0.49804688, "router_z_loss_mlp": 0.7109375, "step": 2309, "time_per_iteration": 3.811041831970215 }, { "auxiliary_loss_clip": 0.01103964, "auxiliary_loss_mlp": 0.01067559, "balance_loss_clip": 1.01579809, "balance_loss_mlp": 1.02735353, "epoch": 0.13888471366300917, "flos": 20553787261440.0, "grad_norm": 1.9623723176277659, "language_loss": 0.69932956, "learning_rate": 3.876916255543129e-06, "loss": 0.72104478, "num_input_tokens_seen": 50033500, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.765625, "step": 2310, "time_per_iteration": 3.8001632690429688 }, { "auxiliary_loss_clip": 0.01097431, "auxiliary_loss_mlp": 0.01074428, "balance_loss_clip": 1.02552867, "balance_loss_mlp": 1.02534401, "epoch": 0.13894483691567713, "flos": 13837370993280.0, "grad_norm": 1.881604913232477, "language_loss": 0.85570848, "learning_rate": 3.8767817026088725e-06, "loss": 0.8774271, "num_input_tokens_seen": 50050075, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 0.71875, "step": 2311, "time_per_iteration": 2.357818603515625 }, { "auxiliary_loss_clip": 0.01103614, "auxiliary_loss_mlp": 0.01083903, "balance_loss_clip": 1.02684927, "balance_loss_mlp": 1.02703547, "epoch": 0.1390049601683451, "flos": 28030123382400.0, "grad_norm": 2.235906768874491, "language_loss": 0.83865625, "learning_rate": 3.876647078506866e-06, "loss": 0.86053145, "num_input_tokens_seen": 50070080, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.765625, "step": 2312, "time_per_iteration": 3.8730225563049316 }, { "auxiliary_loss_clip": 0.01103618, "auxiliary_loss_mlp": 0.01080113, "balance_loss_clip": 1.02773273, "balance_loss_mlp": 1.02786899, "epoch": 0.13906508342101306, "flos": 26755758512640.0, "grad_norm": 1.672791128299684, "language_loss": 0.88396561, "learning_rate": 3.876512383242215e-06, "loss": 0.90580297, "num_input_tokens_seen": 50090040, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.7578125, "step": 2313, "time_per_iteration": 2.4380085468292236 }, { "auxiliary_loss_clip": 0.01099907, "auxiliary_loss_mlp": 0.01082395, "balance_loss_clip": 1.03056276, "balance_loss_mlp": 1.02565956, "epoch": 0.13912520667368106, "flos": 24533955544320.0, "grad_norm": 1.8049775069403353, "language_loss": 0.81867188, "learning_rate": 3.876377616820024e-06, "loss": 0.84049487, "num_input_tokens_seen": 50110595, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.7421875, "step": 2314, "time_per_iteration": 2.4388504028320312 }, { "auxiliary_loss_clip": 0.01100529, "auxiliary_loss_mlp": 0.01074857, "balance_loss_clip": 1.02311993, "balance_loss_mlp": 1.02689791, "epoch": 0.13918532992634902, "flos": 19382416502400.0, "grad_norm": 2.329975764577655, "language_loss": 0.88713676, "learning_rate": 3.876242779245409e-06, "loss": 0.9088906, "num_input_tokens_seen": 50125430, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.734375, "step": 2315, "time_per_iteration": 2.3557851314544678 }, { "auxiliary_loss_clip": 0.01098722, "auxiliary_loss_mlp": 0.01075573, "balance_loss_clip": 1.02636361, "balance_loss_mlp": 1.02541173, "epoch": 0.139245453179017, "flos": 21322714245120.0, "grad_norm": 4.266012427524393, "language_loss": 0.79235256, "learning_rate": 3.876107870523477e-06, "loss": 0.8140955, "num_input_tokens_seen": 50144120, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 0.734375, "step": 2316, "time_per_iteration": 2.399505376815796 }, { "auxiliary_loss_clip": 0.01099451, "auxiliary_loss_mlp": 0.01077135, "balance_loss_clip": 1.02554083, "balance_loss_mlp": 1.02741385, "epoch": 0.13930557643168495, "flos": 19499584625280.0, "grad_norm": 1.6705724494023786, "language_loss": 0.78873736, "learning_rate": 3.875972890659349e-06, "loss": 0.81050324, "num_input_tokens_seen": 50162500, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.71875, "step": 2317, "time_per_iteration": 2.4020607471466064 }, { "auxiliary_loss_clip": 0.01100838, "auxiliary_loss_mlp": 0.01066524, "balance_loss_clip": 1.0191021, "balance_loss_mlp": 1.02721941, "epoch": 0.13936569968435292, "flos": 25409647065600.0, "grad_norm": 2.0362172739515687, "language_loss": 0.82685184, "learning_rate": 3.875837839658139e-06, "loss": 0.84852552, "num_input_tokens_seen": 50182415, "router_z_loss_clip": 0.47460938, "router_z_loss_mlp": 0.734375, "step": 2318, "time_per_iteration": 2.433493137359619 }, { "auxiliary_loss_clip": 0.01020803, "auxiliary_loss_mlp": 0.01015984, "balance_loss_clip": 1.01021445, "balance_loss_mlp": 1.00301051, "epoch": 0.13942582293702088, "flos": 70767372481920.0, "grad_norm": 0.8638124709776664, "language_loss": 0.59128332, "learning_rate": 3.87570271752497e-06, "loss": 0.61165118, "num_input_tokens_seen": 50245160, "router_z_loss_clip": 0.05761719, "router_z_loss_mlp": 0.17773438, "step": 2319, "time_per_iteration": 3.064567804336548 }, { "auxiliary_loss_clip": 0.01099857, "auxiliary_loss_mlp": 0.01070451, "balance_loss_clip": 1.02174187, "balance_loss_mlp": 1.0253098, "epoch": 0.13948594618968888, "flos": 35589412627200.0, "grad_norm": 2.179385314317167, "language_loss": 0.68977374, "learning_rate": 3.875567524264967e-06, "loss": 0.7114768, "num_input_tokens_seen": 50268215, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.74609375, "step": 2320, "time_per_iteration": 2.5087361335754395 }, { "auxiliary_loss_clip": 0.01096609, "auxiliary_loss_mlp": 0.01067206, "balance_loss_clip": 1.02126241, "balance_loss_mlp": 1.02520037, "epoch": 0.13954606944235684, "flos": 21104157934080.0, "grad_norm": 1.7929222526692348, "language_loss": 0.72406977, "learning_rate": 3.875432259883256e-06, "loss": 0.74570793, "num_input_tokens_seen": 50288575, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.71484375, "step": 2321, "time_per_iteration": 2.3905608654022217 }, { "auxiliary_loss_clip": 0.01098514, "auxiliary_loss_mlp": 0.01072776, "balance_loss_clip": 1.0253073, "balance_loss_mlp": 1.02546048, "epoch": 0.1396061926950248, "flos": 25043303502720.0, "grad_norm": 1.823688269110296, "language_loss": 0.87624943, "learning_rate": 3.875296924384965e-06, "loss": 0.89796227, "num_input_tokens_seen": 50308735, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.73046875, "step": 2322, "time_per_iteration": 2.444798469543457 }, { "auxiliary_loss_clip": 0.01093817, "auxiliary_loss_mlp": 0.01067472, "balance_loss_clip": 1.02527153, "balance_loss_mlp": 1.02503991, "epoch": 0.13966631594769277, "flos": 37632495012480.0, "grad_norm": 1.679641955649151, "language_loss": 0.68735874, "learning_rate": 3.875161517775226e-06, "loss": 0.70897162, "num_input_tokens_seen": 50331025, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.6875, "step": 2323, "time_per_iteration": 2.5323569774627686 }, { "auxiliary_loss_clip": 0.01106393, "auxiliary_loss_mlp": 0.0106864, "balance_loss_clip": 1.01878691, "balance_loss_mlp": 1.02811384, "epoch": 0.13972643920036074, "flos": 16690053962880.0, "grad_norm": 2.001042786560693, "language_loss": 0.92948622, "learning_rate": 3.875026040059175e-06, "loss": 0.95123661, "num_input_tokens_seen": 50349725, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.78125, "step": 2324, "time_per_iteration": 2.3752689361572266 }, { "auxiliary_loss_clip": 0.01099865, "auxiliary_loss_mlp": 0.01078059, "balance_loss_clip": 1.02706146, "balance_loss_mlp": 1.02509928, "epoch": 0.1397865624530287, "flos": 23329940797440.0, "grad_norm": 3.020303081202028, "language_loss": 0.73696572, "learning_rate": 3.8748904912419485e-06, "loss": 0.75874496, "num_input_tokens_seen": 50367965, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.75, "step": 2325, "time_per_iteration": 2.39174485206604 }, { "auxiliary_loss_clip": 0.01098923, "auxiliary_loss_mlp": 0.01075597, "balance_loss_clip": 1.02738833, "balance_loss_mlp": 1.02781367, "epoch": 0.13984668570569667, "flos": 22777370709120.0, "grad_norm": 1.8625049111790701, "language_loss": 0.83402061, "learning_rate": 3.874754871328688e-06, "loss": 0.85576576, "num_input_tokens_seen": 50385605, "router_z_loss_clip": 0.48242188, "router_z_loss_mlp": 0.7109375, "step": 2326, "time_per_iteration": 2.4044384956359863 }, { "auxiliary_loss_clip": 0.01096732, "auxiliary_loss_mlp": 0.01071019, "balance_loss_clip": 1.02586281, "balance_loss_mlp": 1.0266856, "epoch": 0.13990680895836466, "flos": 19463519324160.0, "grad_norm": 1.797039911333627, "language_loss": 0.90511805, "learning_rate": 3.874619180324534e-06, "loss": 0.9267956, "num_input_tokens_seen": 50403985, "router_z_loss_clip": 0.45117188, "router_z_loss_mlp": 0.69921875, "step": 2327, "time_per_iteration": 2.3899290561676025 }, { "auxiliary_loss_clip": 0.01096659, "auxiliary_loss_mlp": 0.01070155, "balance_loss_clip": 1.02347279, "balance_loss_mlp": 1.02642465, "epoch": 0.13996693221103262, "flos": 20302237848960.0, "grad_norm": 1.9157399043675598, "language_loss": 0.86463577, "learning_rate": 3.874483418234632e-06, "loss": 0.8863039, "num_input_tokens_seen": 50421590, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 0.703125, "step": 2328, "time_per_iteration": 2.3760221004486084 }, { "auxiliary_loss_clip": 0.01097616, "auxiliary_loss_mlp": 0.01068771, "balance_loss_clip": 1.02168345, "balance_loss_mlp": 1.02462769, "epoch": 0.1400270554637006, "flos": 26616419809920.0, "grad_norm": 1.5674418082209531, "language_loss": 0.75268614, "learning_rate": 3.874347585064131e-06, "loss": 0.77435005, "num_input_tokens_seen": 50443945, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 0.73046875, "step": 2329, "time_per_iteration": 2.4418728351593018 }, { "auxiliary_loss_clip": 0.01098106, "auxiliary_loss_mlp": 0.01065353, "balance_loss_clip": 1.017097, "balance_loss_mlp": 1.02543378, "epoch": 0.14008717871636855, "flos": 19390446115200.0, "grad_norm": 1.8090363478201983, "language_loss": 0.80433857, "learning_rate": 3.874211680818183e-06, "loss": 0.82597315, "num_input_tokens_seen": 50462065, "router_z_loss_clip": 0.48242188, "router_z_loss_mlp": 0.7265625, "step": 2330, "time_per_iteration": 2.384094715118408 }, { "auxiliary_loss_clip": 0.01095153, "auxiliary_loss_mlp": 0.01069231, "balance_loss_clip": 1.02355027, "balance_loss_mlp": 1.0243454, "epoch": 0.14014730196903652, "flos": 15303373649280.0, "grad_norm": 3.1018054971729954, "language_loss": 0.74080718, "learning_rate": 3.87407570550194e-06, "loss": 0.76245099, "num_input_tokens_seen": 50479565, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.7109375, "step": 2331, "time_per_iteration": 2.363744020462036 }, { "auxiliary_loss_clip": 0.01092016, "auxiliary_loss_mlp": 0.01071282, "balance_loss_clip": 1.02977347, "balance_loss_mlp": 1.02559328, "epoch": 0.14020742522170448, "flos": 14938810565760.0, "grad_norm": 1.60475532013002, "language_loss": 0.74623108, "learning_rate": 3.873939659120557e-06, "loss": 0.76786405, "num_input_tokens_seen": 50497305, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.6640625, "step": 2332, "time_per_iteration": 2.436201810836792 }, { "auxiliary_loss_clip": 0.01022133, "auxiliary_loss_mlp": 0.01012344, "balance_loss_clip": 1.00652611, "balance_loss_mlp": 1.00351977, "epoch": 0.14026754847437245, "flos": 48822017316480.0, "grad_norm": 0.8487859216876602, "language_loss": 0.56244385, "learning_rate": 3.873803541679196e-06, "loss": 0.58278859, "num_input_tokens_seen": 50549735, "router_z_loss_clip": 0.05810547, "router_z_loss_mlp": 0.18554688, "step": 2333, "time_per_iteration": 2.8638577461242676 }, { "auxiliary_loss_clip": 0.01096487, "auxiliary_loss_mlp": 0.01075171, "balance_loss_clip": 1.02875125, "balance_loss_mlp": 1.02506387, "epoch": 0.14032767172704044, "flos": 25772150378880.0, "grad_norm": 1.5914010635479372, "language_loss": 0.8360281, "learning_rate": 3.873667353183016e-06, "loss": 0.85774463, "num_input_tokens_seen": 50570100, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 0.71484375, "step": 2334, "time_per_iteration": 2.4287009239196777 }, { "auxiliary_loss_clip": 0.01094514, "auxiliary_loss_mlp": 0.01071489, "balance_loss_clip": 1.02914596, "balance_loss_mlp": 1.02508903, "epoch": 0.1403877949797084, "flos": 21215216568960.0, "grad_norm": 1.6071457497797799, "language_loss": 0.81850404, "learning_rate": 3.8735310936371825e-06, "loss": 0.84016401, "num_input_tokens_seen": 50589185, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.6953125, "step": 2335, "time_per_iteration": 2.417670249938965 }, { "auxiliary_loss_clip": 0.01097257, "auxiliary_loss_mlp": 0.01077199, "balance_loss_clip": 1.02741706, "balance_loss_mlp": 1.02569258, "epoch": 0.14044791823237637, "flos": 22746856314240.0, "grad_norm": 1.6337788034437608, "language_loss": 0.83876276, "learning_rate": 3.873394763046862e-06, "loss": 0.86050731, "num_input_tokens_seen": 50609645, "router_z_loss_clip": 0.49804688, "router_z_loss_mlp": 0.71484375, "step": 2336, "time_per_iteration": 2.4388797283172607 }, { "auxiliary_loss_clip": 0.01096473, "auxiliary_loss_mlp": 0.01077751, "balance_loss_clip": 1.03276134, "balance_loss_mlp": 1.02563655, "epoch": 0.14050804148504434, "flos": 22963387766400.0, "grad_norm": 1.6690502699344472, "language_loss": 0.81863654, "learning_rate": 3.873258361417225e-06, "loss": 0.84037876, "num_input_tokens_seen": 50628385, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.70703125, "step": 2337, "time_per_iteration": 2.399803400039673 }, { "auxiliary_loss_clip": 0.01096852, "auxiliary_loss_mlp": 0.01080153, "balance_loss_clip": 1.03435278, "balance_loss_mlp": 1.0252521, "epoch": 0.1405681647377123, "flos": 22199243639040.0, "grad_norm": 3.618651933775152, "language_loss": 0.81271327, "learning_rate": 3.873121888753442e-06, "loss": 0.83448327, "num_input_tokens_seen": 50647260, "router_z_loss_clip": 0.45898438, "router_z_loss_mlp": 0.71484375, "step": 2338, "time_per_iteration": 2.4098405838012695 }, { "auxiliary_loss_clip": 0.01099849, "auxiliary_loss_mlp": 0.01087712, "balance_loss_clip": 1.03735852, "balance_loss_mlp": 1.02764118, "epoch": 0.14062828799038027, "flos": 23731651434240.0, "grad_norm": 2.298099449912599, "language_loss": 0.83314776, "learning_rate": 3.87298534506069e-06, "loss": 0.85502338, "num_input_tokens_seen": 50666130, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.72265625, "step": 2339, "time_per_iteration": 2.41626238822937 }, { "auxiliary_loss_clip": 0.01095892, "auxiliary_loss_mlp": 0.01074312, "balance_loss_clip": 1.02879786, "balance_loss_mlp": 1.02549624, "epoch": 0.14068841124304826, "flos": 39200933197440.0, "grad_norm": 1.7317404394200744, "language_loss": 0.67206317, "learning_rate": 3.872848730344146e-06, "loss": 0.69376522, "num_input_tokens_seen": 50687440, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 0.703125, "step": 2340, "time_per_iteration": 2.556863784790039 }, { "auxiliary_loss_clip": 0.01093781, "auxiliary_loss_mlp": 0.01067871, "balance_loss_clip": 1.02552819, "balance_loss_mlp": 1.02686203, "epoch": 0.14074853449571623, "flos": 20191283948160.0, "grad_norm": 2.5793686109431158, "language_loss": 0.81785798, "learning_rate": 3.87271204460899e-06, "loss": 0.83947456, "num_input_tokens_seen": 50704030, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 0.66796875, "step": 2341, "time_per_iteration": 2.4176712036132812 }, { "auxiliary_loss_clip": 0.01093383, "auxiliary_loss_mlp": 0.01070137, "balance_loss_clip": 1.02390778, "balance_loss_mlp": 1.02490532, "epoch": 0.1408086577483842, "flos": 18404882945280.0, "grad_norm": 1.8573064688021232, "language_loss": 0.83149493, "learning_rate": 3.8725752878604066e-06, "loss": 0.8531301, "num_input_tokens_seen": 50723305, "router_z_loss_clip": 0.46289062, "router_z_loss_mlp": 0.68359375, "step": 2342, "time_per_iteration": 2.3970870971679688 }, { "auxiliary_loss_clip": 0.01090687, "auxiliary_loss_mlp": 0.0106508, "balance_loss_clip": 1.02297568, "balance_loss_mlp": 1.02588344, "epoch": 0.14086878100105216, "flos": 25263430824960.0, "grad_norm": 2.3744095516360826, "language_loss": 0.80615437, "learning_rate": 3.87243846010358e-06, "loss": 0.82771206, "num_input_tokens_seen": 50743270, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.6484375, "step": 2343, "time_per_iteration": 2.413259506225586 }, { "auxiliary_loss_clip": 0.01021274, "auxiliary_loss_mlp": 0.01014568, "balance_loss_clip": 1.00846434, "balance_loss_mlp": 1.00476551, "epoch": 0.14092890425372012, "flos": 65975194730880.0, "grad_norm": 0.8386368192521199, "language_loss": 0.61639196, "learning_rate": 3.872301561343699e-06, "loss": 0.63675034, "num_input_tokens_seen": 50802710, "router_z_loss_clip": 0.06103516, "router_z_loss_mlp": 0.16503906, "step": 2344, "time_per_iteration": 2.957770347595215 }, { "auxiliary_loss_clip": 0.01091914, "auxiliary_loss_mlp": 0.01057357, "balance_loss_clip": 1.01525259, "balance_loss_mlp": 1.02395976, "epoch": 0.1409890275063881, "flos": 23693875476480.0, "grad_norm": 1.7217596055612878, "language_loss": 0.66969234, "learning_rate": 3.872164591585956e-06, "loss": 0.691185, "num_input_tokens_seen": 50822625, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.6796875, "step": 2345, "time_per_iteration": 2.398142099380493 }, { "auxiliary_loss_clip": 0.01099441, "auxiliary_loss_mlp": 0.01064012, "balance_loss_clip": 1.01682913, "balance_loss_mlp": 1.02614999, "epoch": 0.14104915075905605, "flos": 23622024165120.0, "grad_norm": 2.849842550456899, "language_loss": 0.77279401, "learning_rate": 3.8720275508355435e-06, "loss": 0.79442847, "num_input_tokens_seen": 50842330, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.73046875, "step": 2346, "time_per_iteration": 2.4196252822875977 }, { "auxiliary_loss_clip": 0.01097379, "auxiliary_loss_mlp": 0.01069531, "balance_loss_clip": 1.0225383, "balance_loss_mlp": 1.02661347, "epoch": 0.14110927401172405, "flos": 20594111748480.0, "grad_norm": 1.8907999068220207, "language_loss": 0.7887587, "learning_rate": 3.8718904390976585e-06, "loss": 0.81042778, "num_input_tokens_seen": 50861035, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 0.7109375, "step": 2347, "time_per_iteration": 2.380185127258301 }, { "auxiliary_loss_clip": 0.01096654, "auxiliary_loss_mlp": 0.01067499, "balance_loss_clip": 1.02100706, "balance_loss_mlp": 1.02524769, "epoch": 0.141169397264392, "flos": 28546802726400.0, "grad_norm": 2.0447175396747257, "language_loss": 0.78037524, "learning_rate": 3.8717532563775e-06, "loss": 0.80201674, "num_input_tokens_seen": 50880105, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 0.71484375, "step": 2348, "time_per_iteration": 3.87050461769104 }, { "auxiliary_loss_clip": 0.01094176, "auxiliary_loss_mlp": 0.01060945, "balance_loss_clip": 1.01559746, "balance_loss_mlp": 1.02532113, "epoch": 0.14122952051705998, "flos": 17091310752000.0, "grad_norm": 1.62365409890781, "language_loss": 0.88682103, "learning_rate": 3.871616002680272e-06, "loss": 0.90837222, "num_input_tokens_seen": 50897720, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.6875, "step": 2349, "time_per_iteration": 3.808807611465454 }, { "auxiliary_loss_clip": 0.01094341, "auxiliary_loss_mlp": 0.01063383, "balance_loss_clip": 1.01810718, "balance_loss_mlp": 1.02666235, "epoch": 0.14128964376972794, "flos": 28945615720320.0, "grad_norm": 1.609767530616754, "language_loss": 0.90336883, "learning_rate": 3.871478678011177e-06, "loss": 0.92494613, "num_input_tokens_seen": 50918385, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.6796875, "step": 2350, "time_per_iteration": 2.4457435607910156 }, { "auxiliary_loss_clip": 0.01099722, "auxiliary_loss_mlp": 0.01070965, "balance_loss_clip": 1.02263784, "balance_loss_mlp": 1.02827561, "epoch": 0.1413497670223959, "flos": 18988770389760.0, "grad_norm": 1.7840272272770248, "language_loss": 0.82614648, "learning_rate": 3.871341282375423e-06, "loss": 0.84785342, "num_input_tokens_seen": 50938270, "router_z_loss_clip": 0.48242188, "router_z_loss_mlp": 0.71484375, "step": 2351, "time_per_iteration": 3.763456106185913 }, { "auxiliary_loss_clip": 0.01098834, "auxiliary_loss_mlp": 0.01072854, "balance_loss_clip": 1.02726793, "balance_loss_mlp": 1.02765095, "epoch": 0.14140989027506387, "flos": 29860933501440.0, "grad_norm": 2.0538363623413867, "language_loss": 0.86355698, "learning_rate": 3.871203815778219e-06, "loss": 0.88527393, "num_input_tokens_seen": 50958155, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 0.7109375, "step": 2352, "time_per_iteration": 3.950869083404541 }, { "auxiliary_loss_clip": 0.01021319, "auxiliary_loss_mlp": 0.01018357, "balance_loss_clip": 1.01215804, "balance_loss_mlp": 1.00365102, "epoch": 0.14147001352773186, "flos": 62076303826560.0, "grad_norm": 1.0290291430437577, "language_loss": 0.62007737, "learning_rate": 3.87106627822478e-06, "loss": 0.6404742, "num_input_tokens_seen": 51020705, "router_z_loss_clip": 0.06176758, "router_z_loss_mlp": 0.17675781, "step": 2353, "time_per_iteration": 2.9648189544677734 }, { "auxiliary_loss_clip": 0.01092072, "auxiliary_loss_mlp": 0.0106999, "balance_loss_clip": 1.02743196, "balance_loss_mlp": 1.02592862, "epoch": 0.14153013678039983, "flos": 22016438426880.0, "grad_norm": 2.114365724356601, "language_loss": 0.88850433, "learning_rate": 3.8709286697203196e-06, "loss": 0.9101249, "num_input_tokens_seen": 51039995, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.6640625, "step": 2354, "time_per_iteration": 2.4362807273864746 }, { "auxiliary_loss_clip": 0.01095519, "auxiliary_loss_mlp": 0.01065752, "balance_loss_clip": 1.01880741, "balance_loss_mlp": 1.02513134, "epoch": 0.1415902600330678, "flos": 19719048631680.0, "grad_norm": 2.4738266875671293, "language_loss": 0.76584566, "learning_rate": 3.870790990270057e-06, "loss": 0.78745842, "num_input_tokens_seen": 51059075, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 0.703125, "step": 2355, "time_per_iteration": 2.393272876739502 }, { "auxiliary_loss_clip": 0.01021171, "auxiliary_loss_mlp": 0.01005469, "balance_loss_clip": 0.99903131, "balance_loss_mlp": 1.0038414, "epoch": 0.14165038328573576, "flos": 65897862336000.0, "grad_norm": 0.6905474049446052, "language_loss": 0.52014923, "learning_rate": 3.870653239879212e-06, "loss": 0.54041564, "num_input_tokens_seen": 51120380, "router_z_loss_clip": 0.06445312, "router_z_loss_mlp": 0.17382812, "step": 2356, "time_per_iteration": 2.9643349647521973 }, { "auxiliary_loss_clip": 0.01095071, "auxiliary_loss_mlp": 0.01067043, "balance_loss_clip": 1.02305484, "balance_loss_mlp": 1.02627206, "epoch": 0.14171050653840372, "flos": 12129349726080.0, "grad_norm": 3.44367815537643, "language_loss": 0.72895277, "learning_rate": 3.8705154185530095e-06, "loss": 0.75057387, "num_input_tokens_seen": 51136950, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 0.6875, "step": 2357, "time_per_iteration": 2.405557870864868 }, { "auxiliary_loss_clip": 0.01097603, "auxiliary_loss_mlp": 0.01068252, "balance_loss_clip": 1.02228522, "balance_loss_mlp": 1.02484655, "epoch": 0.1417706297910717, "flos": 20411446181760.0, "grad_norm": 1.9540989193968346, "language_loss": 0.84554189, "learning_rate": 3.870377526296674e-06, "loss": 0.86720049, "num_input_tokens_seen": 51155175, "router_z_loss_clip": 0.45898438, "router_z_loss_mlp": 0.7265625, "step": 2358, "time_per_iteration": 2.4183664321899414 }, { "auxiliary_loss_clip": 0.0109973, "auxiliary_loss_mlp": 0.0106999, "balance_loss_clip": 1.02340364, "balance_loss_mlp": 1.02662396, "epoch": 0.14183075304373965, "flos": 22379570144640.0, "grad_norm": 2.1284243629906143, "language_loss": 0.73992664, "learning_rate": 3.870239563115436e-06, "loss": 0.76162386, "num_input_tokens_seen": 51174500, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 0.73046875, "step": 2359, "time_per_iteration": 2.397797107696533 }, { "auxiliary_loss_clip": 0.01096407, "auxiliary_loss_mlp": 0.0106192, "balance_loss_clip": 1.02038801, "balance_loss_mlp": 1.02701902, "epoch": 0.14189087629640765, "flos": 21579814563840.0, "grad_norm": 2.0206623757932056, "language_loss": 0.78634334, "learning_rate": 3.870101529014526e-06, "loss": 0.8079266, "num_input_tokens_seen": 51194270, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.6953125, "step": 2360, "time_per_iteration": 2.4997646808624268 }, { "auxiliary_loss_clip": 0.01095137, "auxiliary_loss_mlp": 0.01065257, "balance_loss_clip": 1.01785898, "balance_loss_mlp": 1.02696097, "epoch": 0.1419509995490756, "flos": 20007605952000.0, "grad_norm": 2.0651959770895125, "language_loss": 0.84261692, "learning_rate": 3.869963423999178e-06, "loss": 0.86422086, "num_input_tokens_seen": 51211850, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.6796875, "step": 2361, "time_per_iteration": 2.389340400695801 }, { "auxiliary_loss_clip": 0.0109568, "auxiliary_loss_mlp": 0.01064148, "balance_loss_clip": 1.0196352, "balance_loss_mlp": 1.02720356, "epoch": 0.14201112280174358, "flos": 31940116099200.0, "grad_norm": 1.8583325644120083, "language_loss": 0.7663433, "learning_rate": 3.86982524807463e-06, "loss": 0.78794158, "num_input_tokens_seen": 51233545, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.68359375, "step": 2362, "time_per_iteration": 2.5709047317504883 }, { "auxiliary_loss_clip": 0.0109715, "auxiliary_loss_mlp": 0.01063262, "balance_loss_clip": 1.01753342, "balance_loss_mlp": 1.02890205, "epoch": 0.14207124605441154, "flos": 41462536982400.0, "grad_norm": 1.8974042724551525, "language_loss": 0.75918436, "learning_rate": 3.869687001246122e-06, "loss": 0.78078848, "num_input_tokens_seen": 51257615, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.68359375, "step": 2363, "time_per_iteration": 2.5720465183258057 }, { "auxiliary_loss_clip": 0.01093469, "auxiliary_loss_mlp": 0.01062822, "balance_loss_clip": 1.02055085, "balance_loss_mlp": 1.02511299, "epoch": 0.1421313693070795, "flos": 31903736595840.0, "grad_norm": 1.949011831226733, "language_loss": 0.74746877, "learning_rate": 3.8695486835188946e-06, "loss": 0.76903164, "num_input_tokens_seen": 51279645, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.68359375, "step": 2364, "time_per_iteration": 2.5085959434509277 }, { "auxiliary_loss_clip": 0.01092967, "auxiliary_loss_mlp": 0.0105875, "balance_loss_clip": 1.01941097, "balance_loss_mlp": 1.02714086, "epoch": 0.14219149255974747, "flos": 26869924258560.0, "grad_norm": 1.8162505330910113, "language_loss": 0.92309189, "learning_rate": 3.869410294898195e-06, "loss": 0.94460905, "num_input_tokens_seen": 51299775, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.65625, "step": 2365, "time_per_iteration": 2.4344723224639893 }, { "auxiliary_loss_clip": 0.01097296, "auxiliary_loss_mlp": 0.01068953, "balance_loss_clip": 1.0199821, "balance_loss_mlp": 1.02694428, "epoch": 0.14225161581241544, "flos": 27453183298560.0, "grad_norm": 1.6909320163029966, "language_loss": 0.6732766, "learning_rate": 3.869271835389268e-06, "loss": 0.69493914, "num_input_tokens_seen": 51319430, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 0.703125, "step": 2366, "time_per_iteration": 2.461634874343872 }, { "auxiliary_loss_clip": 0.01093072, "auxiliary_loss_mlp": 0.01061414, "balance_loss_clip": 1.01713991, "balance_loss_mlp": 1.02502322, "epoch": 0.14231173906508343, "flos": 10560667161600.0, "grad_norm": 1.8002024462817556, "language_loss": 0.83267176, "learning_rate": 3.8691333049973665e-06, "loss": 0.85421658, "num_input_tokens_seen": 51336045, "router_z_loss_clip": 0.44335938, "router_z_loss_mlp": 0.6796875, "step": 2367, "time_per_iteration": 2.4098618030548096 }, { "auxiliary_loss_clip": 0.01098377, "auxiliary_loss_mlp": 0.01081762, "balance_loss_clip": 1.03379166, "balance_loss_mlp": 1.02722013, "epoch": 0.1423718623177514, "flos": 28359773239680.0, "grad_norm": 1.8686043352427915, "language_loss": 0.84684205, "learning_rate": 3.868994703727742e-06, "loss": 0.8686434, "num_input_tokens_seen": 51357030, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.7109375, "step": 2368, "time_per_iteration": 2.4541900157928467 }, { "auxiliary_loss_clip": 0.01094129, "auxiliary_loss_mlp": 0.01074922, "balance_loss_clip": 1.02819169, "balance_loss_mlp": 1.02500486, "epoch": 0.14243198557041936, "flos": 19353228739200.0, "grad_norm": 2.28753091294977, "language_loss": 0.89359874, "learning_rate": 3.868856031585652e-06, "loss": 0.91528922, "num_input_tokens_seen": 51374890, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 0.69140625, "step": 2369, "time_per_iteration": 2.374861717224121 }, { "auxiliary_loss_clip": 0.01097487, "auxiliary_loss_mlp": 0.01072857, "balance_loss_clip": 1.02736664, "balance_loss_mlp": 1.02430117, "epoch": 0.14249210882308733, "flos": 28805508967680.0, "grad_norm": 1.478099951806457, "language_loss": 0.76854205, "learning_rate": 3.868717288576354e-06, "loss": 0.79024547, "num_input_tokens_seen": 51398100, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 0.734375, "step": 2370, "time_per_iteration": 2.4790923595428467 }, { "auxiliary_loss_clip": 0.01095028, "auxiliary_loss_mlp": 0.01063451, "balance_loss_clip": 1.02017832, "balance_loss_mlp": 1.02471995, "epoch": 0.1425522320757553, "flos": 21833947416960.0, "grad_norm": 1.5897874487141483, "language_loss": 0.84612048, "learning_rate": 3.868578474705109e-06, "loss": 0.86770523, "num_input_tokens_seen": 51418745, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.703125, "step": 2371, "time_per_iteration": 2.3977749347686768 }, { "auxiliary_loss_clip": 0.01099014, "auxiliary_loss_mlp": 0.01070489, "balance_loss_clip": 1.02457011, "balance_loss_mlp": 1.02698374, "epoch": 0.14261235532842326, "flos": 17310495467520.0, "grad_norm": 2.185422774035855, "language_loss": 0.84390569, "learning_rate": 3.868439589977181e-06, "loss": 0.86560076, "num_input_tokens_seen": 51437455, "router_z_loss_clip": 0.45898438, "router_z_loss_mlp": 0.71875, "step": 2372, "time_per_iteration": 2.388528347015381 }, { "auxiliary_loss_clip": 0.01095989, "auxiliary_loss_mlp": 0.01074076, "balance_loss_clip": 1.02839494, "balance_loss_mlp": 1.02698219, "epoch": 0.14267247858109125, "flos": 18805755709440.0, "grad_norm": 2.3927984742990107, "language_loss": 0.86746681, "learning_rate": 3.868300634397836e-06, "loss": 0.88916743, "num_input_tokens_seen": 51455710, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.6875, "step": 2373, "time_per_iteration": 2.379897117614746 }, { "auxiliary_loss_clip": 0.01092757, "auxiliary_loss_mlp": 0.0107488, "balance_loss_clip": 1.03117752, "balance_loss_mlp": 1.0239073, "epoch": 0.14273260183375922, "flos": 11358258238080.0, "grad_norm": 1.9870723132583892, "language_loss": 0.87381363, "learning_rate": 3.8681616079723445e-06, "loss": 0.89548993, "num_input_tokens_seen": 51471270, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.6875, "step": 2374, "time_per_iteration": 2.3837428092956543 }, { "auxiliary_loss_clip": 0.01098982, "auxiliary_loss_mlp": 0.01066802, "balance_loss_clip": 1.01871264, "balance_loss_mlp": 1.02655721, "epoch": 0.14279272508642718, "flos": 27566336615040.0, "grad_norm": 1.7779997785714894, "language_loss": 0.80318689, "learning_rate": 3.868022510705977e-06, "loss": 0.82484472, "num_input_tokens_seen": 51492705, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.72265625, "step": 2375, "time_per_iteration": 2.462386131286621 }, { "auxiliary_loss_clip": 0.01096006, "auxiliary_loss_mlp": 0.01076411, "balance_loss_clip": 1.031183, "balance_loss_mlp": 1.02670288, "epoch": 0.14285284833909515, "flos": 16251649620480.0, "grad_norm": 2.201692122505234, "language_loss": 0.79072428, "learning_rate": 3.867883342604009e-06, "loss": 0.81244844, "num_input_tokens_seen": 51510780, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.6953125, "step": 2376, "time_per_iteration": 2.3885726928710938 }, { "auxiliary_loss_clip": 0.01094216, "auxiliary_loss_mlp": 0.01066541, "balance_loss_clip": 1.01940584, "balance_loss_mlp": 1.02493668, "epoch": 0.1429129715917631, "flos": 19754590262400.0, "grad_norm": 1.74923658613742, "language_loss": 0.9503786, "learning_rate": 3.867744103671717e-06, "loss": 0.97198617, "num_input_tokens_seen": 51531400, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.6953125, "step": 2377, "time_per_iteration": 2.4360718727111816 }, { "auxiliary_loss_clip": 0.01095697, "auxiliary_loss_mlp": 0.01063613, "balance_loss_clip": 1.0174315, "balance_loss_mlp": 1.0253408, "epoch": 0.14297309484443108, "flos": 21136173517440.0, "grad_norm": 1.936924493031441, "language_loss": 0.93021452, "learning_rate": 3.867604793914382e-06, "loss": 0.95180768, "num_input_tokens_seen": 51548215, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.703125, "step": 2378, "time_per_iteration": 2.403214693069458 }, { "auxiliary_loss_clip": 0.01095994, "auxiliary_loss_mlp": 0.01056035, "balance_loss_clip": 1.01276231, "balance_loss_mlp": 1.02547026, "epoch": 0.14303321809709904, "flos": 23585539927680.0, "grad_norm": 1.6473067291678978, "language_loss": 0.76051378, "learning_rate": 3.8674654133372864e-06, "loss": 0.78203404, "num_input_tokens_seen": 51566820, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.703125, "step": 2379, "time_per_iteration": 2.405268430709839 }, { "auxiliary_loss_clip": 0.01095794, "auxiliary_loss_mlp": 0.01065766, "balance_loss_clip": 1.0201329, "balance_loss_mlp": 1.02556729, "epoch": 0.14309334134976703, "flos": 15887365827840.0, "grad_norm": 1.9383256615331015, "language_loss": 0.81375277, "learning_rate": 3.867325961945714e-06, "loss": 0.83536839, "num_input_tokens_seen": 51585075, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.703125, "step": 2380, "time_per_iteration": 2.386484384536743 }, { "auxiliary_loss_clip": 0.01097447, "auxiliary_loss_mlp": 0.01067901, "balance_loss_clip": 1.02341223, "balance_loss_mlp": 1.02776682, "epoch": 0.143153464602435, "flos": 16324687918080.0, "grad_norm": 2.0694957076498848, "language_loss": 0.90646529, "learning_rate": 3.867186439744955e-06, "loss": 0.92811877, "num_input_tokens_seen": 51603185, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.6953125, "step": 2381, "time_per_iteration": 2.3853209018707275 }, { "auxiliary_loss_clip": 0.01093604, "auxiliary_loss_mlp": 0.01065668, "balance_loss_clip": 1.02072573, "balance_loss_mlp": 1.02596939, "epoch": 0.14321358785510296, "flos": 17091136195200.0, "grad_norm": 2.11987394464071, "language_loss": 0.7805512, "learning_rate": 3.867046846740299e-06, "loss": 0.80214387, "num_input_tokens_seen": 51620880, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.67578125, "step": 2382, "time_per_iteration": 2.452282667160034 }, { "auxiliary_loss_clip": 0.01096066, "auxiliary_loss_mlp": 0.0106552, "balance_loss_clip": 1.02205586, "balance_loss_mlp": 1.02564335, "epoch": 0.14327371110777093, "flos": 26321718090240.0, "grad_norm": 2.099294418313181, "language_loss": 0.79235601, "learning_rate": 3.866907182937039e-06, "loss": 0.81397188, "num_input_tokens_seen": 51640170, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.703125, "step": 2383, "time_per_iteration": 2.4532601833343506 }, { "auxiliary_loss_clip": 0.01097324, "auxiliary_loss_mlp": 0.01069447, "balance_loss_clip": 1.02076149, "balance_loss_mlp": 1.02640915, "epoch": 0.1433338343604389, "flos": 18075512378880.0, "grad_norm": 2.117007521699034, "language_loss": 0.90278947, "learning_rate": 3.866767448340471e-06, "loss": 0.92445719, "num_input_tokens_seen": 51656580, "router_z_loss_clip": 0.48632812, "router_z_loss_mlp": 0.7109375, "step": 2384, "time_per_iteration": 2.372790813446045 }, { "auxiliary_loss_clip": 0.01098633, "auxiliary_loss_mlp": 0.01069744, "balance_loss_clip": 1.02213192, "balance_loss_mlp": 1.02633286, "epoch": 0.14339395761310686, "flos": 15521895048960.0, "grad_norm": 2.0509814875596635, "language_loss": 0.83245885, "learning_rate": 3.866627642955895e-06, "loss": 0.85414255, "num_input_tokens_seen": 51674645, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.72265625, "step": 2385, "time_per_iteration": 2.35886287689209 }, { "auxiliary_loss_clip": 0.01093618, "auxiliary_loss_mlp": 0.01070739, "balance_loss_clip": 1.02617836, "balance_loss_mlp": 1.02287889, "epoch": 0.14345408086577485, "flos": 28547500953600.0, "grad_norm": 1.8978318070862894, "language_loss": 0.7692064, "learning_rate": 3.866487766788612e-06, "loss": 0.79084992, "num_input_tokens_seen": 51695770, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.70703125, "step": 2386, "time_per_iteration": 2.4738097190856934 }, { "auxiliary_loss_clip": 0.01095182, "auxiliary_loss_mlp": 0.01061859, "balance_loss_clip": 1.01720345, "balance_loss_mlp": 1.02543402, "epoch": 0.14351420411844282, "flos": 20229024994560.0, "grad_norm": 2.093115263853797, "language_loss": 0.79714584, "learning_rate": 3.866347819843925e-06, "loss": 0.81871629, "num_input_tokens_seen": 51714165, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.6953125, "step": 2387, "time_per_iteration": 2.3994054794311523 }, { "auxiliary_loss_clip": 0.01095333, "auxiliary_loss_mlp": 0.01068445, "balance_loss_clip": 1.02438569, "balance_loss_mlp": 1.02590656, "epoch": 0.14357432737111078, "flos": 19864008063360.0, "grad_norm": 1.9367702883266693, "language_loss": 0.84171307, "learning_rate": 3.866207802127143e-06, "loss": 0.86335087, "num_input_tokens_seen": 51734440, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.6953125, "step": 2388, "time_per_iteration": 4.096855878829956 }, { "auxiliary_loss_clip": 0.01096327, "auxiliary_loss_mlp": 0.01071522, "balance_loss_clip": 1.0276053, "balance_loss_mlp": 1.02511561, "epoch": 0.14363445062377875, "flos": 28255557231360.0, "grad_norm": 2.205635866942516, "language_loss": 0.84409058, "learning_rate": 3.866067713643573e-06, "loss": 0.86576909, "num_input_tokens_seen": 51753730, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 0.7109375, "step": 2389, "time_per_iteration": 4.014849901199341 }, { "auxiliary_loss_clip": 0.01099884, "auxiliary_loss_mlp": 0.01080092, "balance_loss_clip": 1.03264642, "balance_loss_mlp": 1.02659345, "epoch": 0.1436945738764467, "flos": 18185698229760.0, "grad_norm": 1.8211244725591191, "language_loss": 0.85047162, "learning_rate": 3.8659275543985285e-06, "loss": 0.87227136, "num_input_tokens_seen": 51771195, "router_z_loss_clip": 0.47460938, "router_z_loss_mlp": 0.734375, "step": 2390, "time_per_iteration": 2.44083833694458 }, { "auxiliary_loss_clip": 0.01097296, "auxiliary_loss_mlp": 0.01065515, "balance_loss_clip": 1.02164602, "balance_loss_mlp": 1.02640152, "epoch": 0.14375469712911468, "flos": 27306687767040.0, "grad_norm": 1.640918566031728, "language_loss": 0.76235723, "learning_rate": 3.865787324397324e-06, "loss": 0.78398538, "num_input_tokens_seen": 51792290, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.7109375, "step": 2391, "time_per_iteration": 3.837477922439575 }, { "auxiliary_loss_clip": 0.01026283, "auxiliary_loss_mlp": 0.01016957, "balance_loss_clip": 1.0124985, "balance_loss_mlp": 1.00838256, "epoch": 0.14381482038178264, "flos": 56888559838080.0, "grad_norm": 0.8765737300721885, "language_loss": 0.6193682, "learning_rate": 3.865647023645277e-06, "loss": 0.63980055, "num_input_tokens_seen": 51843675, "router_z_loss_clip": 0.04467773, "router_z_loss_mlp": 0.1796875, "step": 2392, "time_per_iteration": 4.335782527923584 }, { "auxiliary_loss_clip": 0.01098616, "auxiliary_loss_mlp": 0.0107467, "balance_loss_clip": 1.02760637, "balance_loss_mlp": 1.02492285, "epoch": 0.14387494363445064, "flos": 14281326241920.0, "grad_norm": 2.2812503661024732, "language_loss": 0.79401863, "learning_rate": 3.865506652147709e-06, "loss": 0.81575143, "num_input_tokens_seen": 51860285, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 0.73828125, "step": 2393, "time_per_iteration": 2.5161001682281494 }, { "auxiliary_loss_clip": 0.01096954, "auxiliary_loss_mlp": 0.01068664, "balance_loss_clip": 1.02212477, "balance_loss_mlp": 1.0256474, "epoch": 0.1439350668871186, "flos": 26760262078080.0, "grad_norm": 1.775653920820062, "language_loss": 0.78532535, "learning_rate": 3.865366209909941e-06, "loss": 0.80698156, "num_input_tokens_seen": 51880105, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 0.7109375, "step": 2394, "time_per_iteration": 2.609265089035034 }, { "auxiliary_loss_clip": 0.0109491, "auxiliary_loss_mlp": 0.01070361, "balance_loss_clip": 1.02191448, "balance_loss_mlp": 1.0252701, "epoch": 0.14399519013978657, "flos": 40698392855040.0, "grad_norm": 1.6166734686881281, "language_loss": 0.883066, "learning_rate": 3.8652256969372994e-06, "loss": 0.90471876, "num_input_tokens_seen": 51905175, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.6953125, "step": 2395, "time_per_iteration": 2.7156260013580322 }, { "auxiliary_loss_clip": 0.01095757, "auxiliary_loss_mlp": 0.01066755, "balance_loss_clip": 1.02300513, "balance_loss_mlp": 1.02708554, "epoch": 0.14405531339245453, "flos": 20556510347520.0, "grad_norm": 1.5522994844666484, "language_loss": 0.83613944, "learning_rate": 3.865085113235113e-06, "loss": 0.85776454, "num_input_tokens_seen": 51924490, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.6875, "step": 2396, "time_per_iteration": 2.3968915939331055 }, { "auxiliary_loss_clip": 0.01096141, "auxiliary_loss_mlp": 0.0106436, "balance_loss_clip": 1.0203476, "balance_loss_mlp": 1.02688456, "epoch": 0.1441154366451225, "flos": 19571924695680.0, "grad_norm": 2.2130350559590872, "language_loss": 0.8561278, "learning_rate": 3.864944458808712e-06, "loss": 0.87773287, "num_input_tokens_seen": 51940490, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.69140625, "step": 2397, "time_per_iteration": 2.391301155090332 }, { "auxiliary_loss_clip": 0.0109947, "auxiliary_loss_mlp": 0.01072523, "balance_loss_clip": 1.02398133, "balance_loss_mlp": 1.02649617, "epoch": 0.14417555989779046, "flos": 18514719682560.0, "grad_norm": 1.8560700660300093, "language_loss": 0.81835431, "learning_rate": 3.86480373366343e-06, "loss": 0.84007424, "num_input_tokens_seen": 51957910, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.7265625, "step": 2398, "time_per_iteration": 2.400973320007324 }, { "auxiliary_loss_clip": 0.01095864, "auxiliary_loss_mlp": 0.01076245, "balance_loss_clip": 1.03230476, "balance_loss_mlp": 1.02710867, "epoch": 0.14423568315045843, "flos": 26030472595200.0, "grad_norm": 2.2714374278289355, "language_loss": 0.67254782, "learning_rate": 3.864662937804603e-06, "loss": 0.69426894, "num_input_tokens_seen": 51978010, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 0.6875, "step": 2399, "time_per_iteration": 2.454521656036377 }, { "auxiliary_loss_clip": 0.01097593, "auxiliary_loss_mlp": 0.01068732, "balance_loss_clip": 1.02126265, "balance_loss_mlp": 1.02788377, "epoch": 0.14429580640312642, "flos": 21287661373440.0, "grad_norm": 1.7685658290954376, "language_loss": 0.84367967, "learning_rate": 3.864522071237571e-06, "loss": 0.86534292, "num_input_tokens_seen": 51998515, "router_z_loss_clip": 0.47460938, "router_z_loss_mlp": 0.6953125, "step": 2400, "time_per_iteration": 2.4384162425994873 }, { "auxiliary_loss_clip": 0.01100274, "auxiliary_loss_mlp": 0.01075201, "balance_loss_clip": 1.02246308, "balance_loss_mlp": 1.02673566, "epoch": 0.14435592965579438, "flos": 25626737099520.0, "grad_norm": 1.517662881564376, "language_loss": 0.76824874, "learning_rate": 3.864381133967676e-06, "loss": 0.79000354, "num_input_tokens_seen": 52019270, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.734375, "step": 2401, "time_per_iteration": 2.475950002670288 }, { "auxiliary_loss_clip": 0.01095731, "auxiliary_loss_mlp": 0.01065891, "balance_loss_clip": 1.02490687, "balance_loss_mlp": 1.0267818, "epoch": 0.14441605290846235, "flos": 22963981259520.0, "grad_norm": 1.562949438128985, "language_loss": 0.82712728, "learning_rate": 3.86424012600026e-06, "loss": 0.8487435, "num_input_tokens_seen": 52039315, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.69140625, "step": 2402, "time_per_iteration": 2.4332962036132812 }, { "auxiliary_loss_clip": 0.01097803, "auxiliary_loss_mlp": 0.01065153, "balance_loss_clip": 1.01696825, "balance_loss_mlp": 1.02635479, "epoch": 0.14447617616113032, "flos": 17346700414080.0, "grad_norm": 2.617185989089007, "language_loss": 0.8596493, "learning_rate": 3.864099047340673e-06, "loss": 0.88127881, "num_input_tokens_seen": 52056555, "router_z_loss_clip": 0.48242188, "router_z_loss_mlp": 0.7109375, "step": 2403, "time_per_iteration": 2.3692800998687744 }, { "auxiliary_loss_clip": 0.01096001, "auxiliary_loss_mlp": 0.01066793, "balance_loss_clip": 1.02294803, "balance_loss_mlp": 1.02428436, "epoch": 0.14453629941379828, "flos": 24059066964480.0, "grad_norm": 1.6510926838423856, "language_loss": 0.71366775, "learning_rate": 3.863957897994262e-06, "loss": 0.73529565, "num_input_tokens_seen": 52075800, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.71875, "step": 2404, "time_per_iteration": 2.4166979789733887 }, { "auxiliary_loss_clip": 0.01097102, "auxiliary_loss_mlp": 0.01067374, "balance_loss_clip": 1.02290916, "balance_loss_mlp": 1.0269078, "epoch": 0.14459642266646625, "flos": 14428659646080.0, "grad_norm": 2.6499992253887146, "language_loss": 0.7554329, "learning_rate": 3.863816677966381e-06, "loss": 0.77707767, "num_input_tokens_seen": 52092585, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.703125, "step": 2405, "time_per_iteration": 2.3652732372283936 }, { "auxiliary_loss_clip": 0.01096215, "auxiliary_loss_mlp": 0.01067121, "balance_loss_clip": 1.02501619, "balance_loss_mlp": 1.02687156, "epoch": 0.14465654591913424, "flos": 9866314575360.0, "grad_norm": 2.115247133715662, "language_loss": 0.75162393, "learning_rate": 3.863675387262386e-06, "loss": 0.77325732, "num_input_tokens_seen": 52108990, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.69140625, "step": 2406, "time_per_iteration": 2.4027485847473145 }, { "auxiliary_loss_clip": 0.01096694, "auxiliary_loss_mlp": 0.01071727, "balance_loss_clip": 1.02313709, "balance_loss_mlp": 1.02650082, "epoch": 0.1447166691718022, "flos": 24971766393600.0, "grad_norm": 2.3874762183075284, "language_loss": 0.78308189, "learning_rate": 3.8635340258876325e-06, "loss": 0.80476612, "num_input_tokens_seen": 52125385, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.703125, "step": 2407, "time_per_iteration": 2.435927391052246 }, { "auxiliary_loss_clip": 0.01095514, "auxiliary_loss_mlp": 0.01067331, "balance_loss_clip": 1.02343774, "balance_loss_mlp": 1.02509701, "epoch": 0.14477679242447017, "flos": 21906950803200.0, "grad_norm": 1.6790895437525375, "language_loss": 0.80228579, "learning_rate": 3.8633925938474826e-06, "loss": 0.82391429, "num_input_tokens_seen": 52144985, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.703125, "step": 2408, "time_per_iteration": 2.409522533416748 }, { "auxiliary_loss_clip": 0.01098997, "auxiliary_loss_mlp": 0.0107782, "balance_loss_clip": 1.03235388, "balance_loss_mlp": 1.02783346, "epoch": 0.14483691567713813, "flos": 20739699584640.0, "grad_norm": 1.9210554877929276, "language_loss": 0.84147936, "learning_rate": 3.863251091147299e-06, "loss": 0.86324751, "num_input_tokens_seen": 52163885, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 0.7109375, "step": 2409, "time_per_iteration": 2.391932487487793 }, { "auxiliary_loss_clip": 0.01097086, "auxiliary_loss_mlp": 0.01067829, "balance_loss_clip": 1.0230298, "balance_loss_mlp": 1.02634716, "epoch": 0.1448970389298061, "flos": 35406258301440.0, "grad_norm": 1.8574315057108421, "language_loss": 0.76954699, "learning_rate": 3.863109517792446e-06, "loss": 0.79119611, "num_input_tokens_seen": 52184325, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.70703125, "step": 2410, "time_per_iteration": 2.5106914043426514 }, { "auxiliary_loss_clip": 0.01095143, "auxiliary_loss_mlp": 0.01070289, "balance_loss_clip": 1.02162755, "balance_loss_mlp": 1.02483618, "epoch": 0.14495716218247406, "flos": 15413454766080.0, "grad_norm": 1.655506284675812, "language_loss": 0.83072007, "learning_rate": 3.8629678737882945e-06, "loss": 0.85237432, "num_input_tokens_seen": 52202740, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.703125, "step": 2411, "time_per_iteration": 2.367459297180176 }, { "auxiliary_loss_clip": 0.01096868, "auxiliary_loss_mlp": 0.01067704, "balance_loss_clip": 1.02109289, "balance_loss_mlp": 1.02751815, "epoch": 0.14501728543514203, "flos": 33691813344000.0, "grad_norm": 2.067179888233775, "language_loss": 0.733679, "learning_rate": 3.862826159140214e-06, "loss": 0.75532472, "num_input_tokens_seen": 52223100, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 0.6953125, "step": 2412, "time_per_iteration": 2.5224807262420654 }, { "auxiliary_loss_clip": 0.01097725, "auxiliary_loss_mlp": 0.01064822, "balance_loss_clip": 1.02207375, "balance_loss_mlp": 1.02774489, "epoch": 0.14507740868781002, "flos": 15595212637440.0, "grad_norm": 1.8269151586610555, "language_loss": 0.78736496, "learning_rate": 3.862684373853579e-06, "loss": 0.80899042, "num_input_tokens_seen": 52239690, "router_z_loss_clip": 0.42773438, "router_z_loss_mlp": 0.69921875, "step": 2413, "time_per_iteration": 2.365117073059082 }, { "auxiliary_loss_clip": 0.01024629, "auxiliary_loss_mlp": 0.0100548, "balance_loss_clip": 1.00006819, "balance_loss_mlp": 1.00705779, "epoch": 0.145137531940478, "flos": 66672026023680.0, "grad_norm": 0.9179057056763439, "language_loss": 0.59007514, "learning_rate": 3.8625425179337656e-06, "loss": 0.61037624, "num_input_tokens_seen": 52296705, "router_z_loss_clip": 0.05419922, "router_z_loss_mlp": 0.17578125, "step": 2414, "time_per_iteration": 2.9444878101348877 }, { "auxiliary_loss_clip": 0.01023324, "auxiliary_loss_mlp": 0.01010993, "balance_loss_clip": 1.00584292, "balance_loss_mlp": 1.00568306, "epoch": 0.14519765519314595, "flos": 67518041022720.0, "grad_norm": 0.8526647312986252, "language_loss": 0.62459564, "learning_rate": 3.862400591386154e-06, "loss": 0.64493877, "num_input_tokens_seen": 52361830, "router_z_loss_clip": 0.05151367, "router_z_loss_mlp": 0.17675781, "step": 2415, "time_per_iteration": 3.0272014141082764 }, { "auxiliary_loss_clip": 0.01097141, "auxiliary_loss_mlp": 0.01063959, "balance_loss_clip": 1.02144909, "balance_loss_mlp": 1.02709723, "epoch": 0.14525777844581392, "flos": 17198040378240.0, "grad_norm": 2.540199992625111, "language_loss": 0.74670166, "learning_rate": 3.8622585942161245e-06, "loss": 0.76831269, "num_input_tokens_seen": 52379420, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.69921875, "step": 2416, "time_per_iteration": 2.3867125511169434 }, { "auxiliary_loss_clip": 0.01022402, "auxiliary_loss_mlp": 0.0100817, "balance_loss_clip": 1.00225699, "balance_loss_mlp": 1.00475931, "epoch": 0.14531790169848188, "flos": 65401152289920.0, "grad_norm": 0.7370501876049392, "language_loss": 0.60554767, "learning_rate": 3.8621165264290635e-06, "loss": 0.62585342, "num_input_tokens_seen": 52446290, "router_z_loss_clip": 0.05908203, "router_z_loss_mlp": 0.17675781, "step": 2417, "time_per_iteration": 3.0694992542266846 }, { "auxiliary_loss_clip": 0.01099198, "auxiliary_loss_mlp": 0.0107905, "balance_loss_clip": 1.03403711, "balance_loss_mlp": 1.02587605, "epoch": 0.14537802495114985, "flos": 32561081274240.0, "grad_norm": 2.5084461822423028, "language_loss": 0.81821817, "learning_rate": 3.861974388030356e-06, "loss": 0.84000069, "num_input_tokens_seen": 52467295, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.734375, "step": 2418, "time_per_iteration": 2.4783363342285156 }, { "auxiliary_loss_clip": 0.01094115, "auxiliary_loss_mlp": 0.0106488, "balance_loss_clip": 1.02215552, "balance_loss_mlp": 1.0263499, "epoch": 0.1454381482038178, "flos": 20225743326720.0, "grad_norm": 2.022417785769775, "language_loss": 0.7369898, "learning_rate": 3.861832179025394e-06, "loss": 0.75857973, "num_input_tokens_seen": 52487295, "router_z_loss_clip": 0.42773438, "router_z_loss_mlp": 0.6796875, "step": 2419, "time_per_iteration": 2.4065279960632324 }, { "auxiliary_loss_clip": 0.01096083, "auxiliary_loss_mlp": 0.010687, "balance_loss_clip": 1.02354336, "balance_loss_mlp": 1.02679014, "epoch": 0.1454982714564858, "flos": 22892025214080.0, "grad_norm": 2.17103110613656, "language_loss": 0.92823929, "learning_rate": 3.861689899419569e-06, "loss": 0.9498871, "num_input_tokens_seen": 52504220, "router_z_loss_clip": 0.45117188, "router_z_loss_mlp": 0.6953125, "step": 2420, "time_per_iteration": 2.431934118270874 }, { "auxiliary_loss_clip": 0.01100121, "auxiliary_loss_mlp": 0.01066137, "balance_loss_clip": 1.02021754, "balance_loss_mlp": 1.02856088, "epoch": 0.14555839470915377, "flos": 20228815526400.0, "grad_norm": 1.9537472857288558, "language_loss": 0.83854508, "learning_rate": 3.861547549218276e-06, "loss": 0.86020762, "num_input_tokens_seen": 52521900, "router_z_loss_clip": 0.45898438, "router_z_loss_mlp": 0.71484375, "step": 2421, "time_per_iteration": 2.3996660709381104 }, { "auxiliary_loss_clip": 0.01099295, "auxiliary_loss_mlp": 0.01065234, "balance_loss_clip": 1.02293825, "balance_loss_mlp": 1.02757502, "epoch": 0.14561851796182174, "flos": 22235204206080.0, "grad_norm": 2.046972095534364, "language_loss": 0.8280127, "learning_rate": 3.861405128426914e-06, "loss": 0.84965795, "num_input_tokens_seen": 52540495, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.71875, "step": 2422, "time_per_iteration": 2.4426403045654297 }, { "auxiliary_loss_clip": 0.01022218, "auxiliary_loss_mlp": 0.01008294, "balance_loss_clip": 1.00226223, "balance_loss_mlp": 1.00443721, "epoch": 0.1456786412144897, "flos": 52633624222080.0, "grad_norm": 0.9043288996944128, "language_loss": 0.63476884, "learning_rate": 3.861262637050883e-06, "loss": 0.655074, "num_input_tokens_seen": 52603305, "router_z_loss_clip": 0.06030273, "router_z_loss_mlp": 0.17773438, "step": 2423, "time_per_iteration": 3.0056581497192383 }, { "auxiliary_loss_clip": 0.0109746, "auxiliary_loss_mlp": 0.01060649, "balance_loss_clip": 1.02073765, "balance_loss_mlp": 1.02752936, "epoch": 0.14573876446715767, "flos": 23220557907840.0, "grad_norm": 1.685930955669371, "language_loss": 0.83249277, "learning_rate": 3.861120075095585e-06, "loss": 0.85407388, "num_input_tokens_seen": 52623435, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.69921875, "step": 2424, "time_per_iteration": 2.433666229248047 }, { "auxiliary_loss_clip": 0.01096648, "auxiliary_loss_mlp": 0.01061929, "balance_loss_clip": 1.01889491, "balance_loss_mlp": 1.02763188, "epoch": 0.14579888771982563, "flos": 18113393070720.0, "grad_norm": 1.890295413044119, "language_loss": 0.80257285, "learning_rate": 3.860977442566429e-06, "loss": 0.82415861, "num_input_tokens_seen": 52642255, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.69140625, "step": 2425, "time_per_iteration": 2.476473331451416 }, { "auxiliary_loss_clip": 0.01096166, "auxiliary_loss_mlp": 0.01072671, "balance_loss_clip": 1.02911174, "balance_loss_mlp": 1.02712667, "epoch": 0.14585901097249362, "flos": 23000046560640.0, "grad_norm": 2.0627694127846348, "language_loss": 0.84775686, "learning_rate": 3.860834739468821e-06, "loss": 0.86944532, "num_input_tokens_seen": 52658700, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 0.69140625, "step": 2426, "time_per_iteration": 2.4028544425964355 }, { "auxiliary_loss_clip": 0.01098722, "auxiliary_loss_mlp": 0.01060839, "balance_loss_clip": 1.01811409, "balance_loss_mlp": 1.02875614, "epoch": 0.1459191342251616, "flos": 21907579207680.0, "grad_norm": 2.245460876957568, "language_loss": 0.89481699, "learning_rate": 3.860691965808173e-06, "loss": 0.91641259, "num_input_tokens_seen": 52678140, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.69921875, "step": 2427, "time_per_iteration": 2.4195988178253174 }, { "auxiliary_loss_clip": 0.01104713, "auxiliary_loss_mlp": 0.01070712, "balance_loss_clip": 1.022241, "balance_loss_mlp": 1.02835882, "epoch": 0.14597925747782955, "flos": 14974631487360.0, "grad_norm": 1.8941693416139154, "language_loss": 0.69453251, "learning_rate": 3.8605491215899e-06, "loss": 0.71628678, "num_input_tokens_seen": 52696825, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.765625, "step": 2428, "time_per_iteration": 3.9255177974700928 }, { "auxiliary_loss_clip": 0.01095757, "auxiliary_loss_mlp": 0.01061316, "balance_loss_clip": 1.01782846, "balance_loss_mlp": 1.02532625, "epoch": 0.14603938073049752, "flos": 21067848253440.0, "grad_norm": 1.7454189941142433, "language_loss": 0.85234916, "learning_rate": 3.860406206819417e-06, "loss": 0.8739199, "num_input_tokens_seen": 52715125, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.703125, "step": 2429, "time_per_iteration": 3.895122766494751 }, { "auxiliary_loss_clip": 0.0109403, "auxiliary_loss_mlp": 0.01063588, "balance_loss_clip": 1.02050531, "balance_loss_mlp": 1.02502084, "epoch": 0.14609950398316549, "flos": 19863763683840.0, "grad_norm": 1.7741341144285783, "language_loss": 0.80833554, "learning_rate": 3.860263221502145e-06, "loss": 0.82991171, "num_input_tokens_seen": 52734015, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.6875, "step": 2430, "time_per_iteration": 3.8319296836853027 }, { "auxiliary_loss_clip": 0.01099297, "auxiliary_loss_mlp": 0.01067194, "balance_loss_clip": 1.02439785, "balance_loss_mlp": 1.02754045, "epoch": 0.14615962723583345, "flos": 22417765038720.0, "grad_norm": 2.0607443172223965, "language_loss": 0.8595466, "learning_rate": 3.860120165643504e-06, "loss": 0.88121152, "num_input_tokens_seen": 52753025, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.71875, "step": 2431, "time_per_iteration": 2.4585623741149902 }, { "auxiliary_loss_clip": 0.01099714, "auxiliary_loss_mlp": 0.01068439, "balance_loss_clip": 1.02337813, "balance_loss_mlp": 1.02770352, "epoch": 0.14621975048850142, "flos": 22345145677440.0, "grad_norm": 2.1014633246690595, "language_loss": 0.80919695, "learning_rate": 3.859977039248921e-06, "loss": 0.83087844, "num_input_tokens_seen": 52773420, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.71875, "step": 2432, "time_per_iteration": 3.8450517654418945 }, { "auxiliary_loss_clip": 0.01094827, "auxiliary_loss_mlp": 0.01066237, "balance_loss_clip": 1.0201745, "balance_loss_mlp": 1.02483547, "epoch": 0.1462798737411694, "flos": 24388018594560.0, "grad_norm": 1.9275260119718027, "language_loss": 0.83050263, "learning_rate": 3.859833842323822e-06, "loss": 0.85211325, "num_input_tokens_seen": 52792870, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.703125, "step": 2433, "time_per_iteration": 2.414325714111328 }, { "auxiliary_loss_clip": 0.01093178, "auxiliary_loss_mlp": 0.01064778, "balance_loss_clip": 1.0227927, "balance_loss_mlp": 1.02549481, "epoch": 0.14633999699383737, "flos": 19243671292800.0, "grad_norm": 2.1521960261138635, "language_loss": 0.79915607, "learning_rate": 3.859690574873638e-06, "loss": 0.82073563, "num_input_tokens_seen": 52811615, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 0.67578125, "step": 2434, "time_per_iteration": 2.437758684158325 }, { "auxiliary_loss_clip": 0.01022091, "auxiliary_loss_mlp": 0.01009071, "balance_loss_clip": 1.00313425, "balance_loss_mlp": 1.00463533, "epoch": 0.14640012024650534, "flos": 62657468184960.0, "grad_norm": 0.8692019039032973, "language_loss": 0.58499002, "learning_rate": 3.8595472369038e-06, "loss": 0.60530162, "num_input_tokens_seen": 52873230, "router_z_loss_clip": 0.05932617, "router_z_loss_mlp": 0.17480469, "step": 2435, "time_per_iteration": 3.0096054077148438 }, { "auxiliary_loss_clip": 0.01090595, "auxiliary_loss_mlp": 0.01062331, "balance_loss_clip": 1.01865304, "balance_loss_mlp": 1.02371049, "epoch": 0.1464602434991733, "flos": 12275426321280.0, "grad_norm": 2.300555102014229, "language_loss": 0.90603876, "learning_rate": 3.859403828419744e-06, "loss": 0.92756808, "num_input_tokens_seen": 52889325, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.66796875, "step": 2436, "time_per_iteration": 2.387996196746826 }, { "auxiliary_loss_clip": 0.01096664, "auxiliary_loss_mlp": 0.01064815, "balance_loss_clip": 1.02104187, "balance_loss_mlp": 1.0266304, "epoch": 0.14652036675184127, "flos": 20921282899200.0, "grad_norm": 1.992716338542552, "language_loss": 0.7706126, "learning_rate": 3.85926034942691e-06, "loss": 0.79222739, "num_input_tokens_seen": 52909705, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.69921875, "step": 2437, "time_per_iteration": 2.415969133377075 }, { "auxiliary_loss_clip": 0.01098862, "auxiliary_loss_mlp": 0.01072569, "balance_loss_clip": 1.02269185, "balance_loss_mlp": 1.02669942, "epoch": 0.14658049000450923, "flos": 27702603118080.0, "grad_norm": 2.0972067205844414, "language_loss": 0.75825679, "learning_rate": 3.859116799930736e-06, "loss": 0.77997112, "num_input_tokens_seen": 52930300, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.71875, "step": 2438, "time_per_iteration": 2.4462890625 }, { "auxiliary_loss_clip": 0.01097581, "auxiliary_loss_mlp": 0.01068824, "balance_loss_clip": 1.02531266, "balance_loss_mlp": 1.02776372, "epoch": 0.14664061325717723, "flos": 24935351978880.0, "grad_norm": 1.9075747824541653, "language_loss": 0.76332027, "learning_rate": 3.858973179936668e-06, "loss": 0.78498435, "num_input_tokens_seen": 52949955, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 0.69921875, "step": 2439, "time_per_iteration": 2.4122400283813477 }, { "auxiliary_loss_clip": 0.01097006, "auxiliary_loss_mlp": 0.01071461, "balance_loss_clip": 1.0265671, "balance_loss_mlp": 1.02731824, "epoch": 0.1467007365098452, "flos": 40296053813760.0, "grad_norm": 1.9181926336963941, "language_loss": 0.76293385, "learning_rate": 3.85882948945015e-06, "loss": 0.7846185, "num_input_tokens_seen": 52972905, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.6953125, "step": 2440, "time_per_iteration": 2.553365468978882 }, { "auxiliary_loss_clip": 0.01095462, "auxiliary_loss_mlp": 0.01069096, "balance_loss_clip": 1.02999556, "balance_loss_mlp": 1.02738667, "epoch": 0.14676085976251316, "flos": 26539890376320.0, "grad_norm": 1.5567723605499801, "language_loss": 0.8403334, "learning_rate": 3.85868572847663e-06, "loss": 0.86197901, "num_input_tokens_seen": 52994850, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.6796875, "step": 2441, "time_per_iteration": 2.443652868270874 }, { "auxiliary_loss_clip": 0.0110462, "auxiliary_loss_mlp": 0.01074934, "balance_loss_clip": 1.02488971, "balance_loss_mlp": 1.0287286, "epoch": 0.14682098301518112, "flos": 23548985867520.0, "grad_norm": 1.8919922846489026, "language_loss": 0.75999266, "learning_rate": 3.858541897021563e-06, "loss": 0.78178823, "num_input_tokens_seen": 53014740, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.7578125, "step": 2442, "time_per_iteration": 2.3993821144104004 }, { "auxiliary_loss_clip": 0.01101684, "auxiliary_loss_mlp": 0.01071346, "balance_loss_clip": 1.02566516, "balance_loss_mlp": 1.02824354, "epoch": 0.1468811062678491, "flos": 11650411428480.0, "grad_norm": 3.5926877895002503, "language_loss": 0.85209274, "learning_rate": 3.8583979950904e-06, "loss": 0.87382305, "num_input_tokens_seen": 53029780, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.734375, "step": 2443, "time_per_iteration": 2.3490679264068604 }, { "auxiliary_loss_clip": 0.01098999, "auxiliary_loss_mlp": 0.01077543, "balance_loss_clip": 1.03035998, "balance_loss_mlp": 1.02698636, "epoch": 0.14694122952051705, "flos": 23001512837760.0, "grad_norm": 1.638147347253433, "language_loss": 0.84472048, "learning_rate": 3.858254022688599e-06, "loss": 0.86648583, "num_input_tokens_seen": 53048620, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.71875, "step": 2444, "time_per_iteration": 2.4034340381622314 }, { "auxiliary_loss_clip": 0.01099359, "auxiliary_loss_mlp": 0.01061886, "balance_loss_clip": 1.01825547, "balance_loss_mlp": 1.02699971, "epoch": 0.14700135277318502, "flos": 26501835127680.0, "grad_norm": 1.5957139567683911, "language_loss": 0.73091751, "learning_rate": 3.85810997982162e-06, "loss": 0.75252998, "num_input_tokens_seen": 53070055, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 0.72265625, "step": 2445, "time_per_iteration": 2.4295976161956787 }, { "auxiliary_loss_clip": 0.01024787, "auxiliary_loss_mlp": 0.01038191, "balance_loss_clip": 1.03320765, "balance_loss_mlp": 1.00679374, "epoch": 0.147061476025853, "flos": 59446366531200.0, "grad_norm": 0.8460990111409177, "language_loss": 0.63223791, "learning_rate": 3.857965866494923e-06, "loss": 0.65286767, "num_input_tokens_seen": 53126945, "router_z_loss_clip": 0.04980469, "router_z_loss_mlp": 0.1796875, "step": 2446, "time_per_iteration": 2.911855697631836 }, { "auxiliary_loss_clip": 0.01101688, "auxiliary_loss_mlp": 0.01074412, "balance_loss_clip": 1.02765775, "balance_loss_mlp": 1.02844119, "epoch": 0.14712159927852098, "flos": 28329607958400.0, "grad_norm": 1.6272868934955003, "language_loss": 0.77166736, "learning_rate": 3.857821682713975e-06, "loss": 0.7934283, "num_input_tokens_seen": 53149130, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 0.734375, "step": 2447, "time_per_iteration": 2.4587817192077637 }, { "auxiliary_loss_clip": 0.01099291, "auxiliary_loss_mlp": 0.0106766, "balance_loss_clip": 1.02159727, "balance_loss_mlp": 1.02678084, "epoch": 0.14718172253118894, "flos": 27088585303680.0, "grad_norm": 1.8810746481606395, "language_loss": 0.87475824, "learning_rate": 3.857677428484242e-06, "loss": 0.89642775, "num_input_tokens_seen": 53167120, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.7265625, "step": 2448, "time_per_iteration": 2.43615460395813 }, { "auxiliary_loss_clip": 0.01023392, "auxiliary_loss_mlp": 0.01006158, "balance_loss_clip": 1.00062668, "balance_loss_mlp": 1.00605989, "epoch": 0.1472418457838569, "flos": 66703587759360.0, "grad_norm": 0.7686752387624525, "language_loss": 0.56934094, "learning_rate": 3.857533103811195e-06, "loss": 0.58963645, "num_input_tokens_seen": 53227945, "router_z_loss_clip": 0.05541992, "router_z_loss_mlp": 0.17382812, "step": 2449, "time_per_iteration": 2.974536418914795 }, { "auxiliary_loss_clip": 0.01097666, "auxiliary_loss_mlp": 0.01067246, "balance_loss_clip": 1.02108848, "balance_loss_mlp": 1.02722335, "epoch": 0.14730196903652487, "flos": 19572553100160.0, "grad_norm": 1.7620447302795383, "language_loss": 0.86746895, "learning_rate": 3.857388708700307e-06, "loss": 0.88911808, "num_input_tokens_seen": 53244615, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.703125, "step": 2450, "time_per_iteration": 2.3912692070007324 }, { "auxiliary_loss_clip": 0.01099883, "auxiliary_loss_mlp": 0.01079596, "balance_loss_clip": 1.03060055, "balance_loss_mlp": 1.02773046, "epoch": 0.14736209228919284, "flos": 16070101217280.0, "grad_norm": 1.8948975742558394, "language_loss": 0.77137697, "learning_rate": 3.857244243157052e-06, "loss": 0.79317176, "num_input_tokens_seen": 53262205, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.71875, "step": 2451, "time_per_iteration": 2.36212420463562 }, { "auxiliary_loss_clip": 0.01095651, "auxiliary_loss_mlp": 0.01069599, "balance_loss_clip": 1.02916348, "balance_loss_mlp": 1.02660644, "epoch": 0.1474222155418608, "flos": 23038346188800.0, "grad_norm": 1.5699881294284104, "language_loss": 0.83476424, "learning_rate": 3.85709970718691e-06, "loss": 0.85641676, "num_input_tokens_seen": 53282445, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.69140625, "step": 2452, "time_per_iteration": 2.408055543899536 }, { "auxiliary_loss_clip": 0.01097725, "auxiliary_loss_mlp": 0.01074171, "balance_loss_clip": 1.02953935, "balance_loss_mlp": 1.02688432, "epoch": 0.1474823387945288, "flos": 17017713872640.0, "grad_norm": 1.9270339623963557, "language_loss": 0.75618696, "learning_rate": 3.856955100795361e-06, "loss": 0.77790588, "num_input_tokens_seen": 53299060, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.70703125, "step": 2453, "time_per_iteration": 2.351170539855957 }, { "auxiliary_loss_clip": 0.01101718, "auxiliary_loss_mlp": 0.01073018, "balance_loss_clip": 1.02390349, "balance_loss_mlp": 1.02754736, "epoch": 0.14754246204719676, "flos": 17894068709760.0, "grad_norm": 1.9578791037843046, "language_loss": 0.78440231, "learning_rate": 3.856810423987889e-06, "loss": 0.80614966, "num_input_tokens_seen": 53315970, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 0.7421875, "step": 2454, "time_per_iteration": 2.388763666152954 }, { "auxiliary_loss_clip": 0.01099601, "auxiliary_loss_mlp": 0.01072266, "balance_loss_clip": 1.02789593, "balance_loss_mlp": 1.0262301, "epoch": 0.14760258529986472, "flos": 13078254101760.0, "grad_norm": 1.902442925207913, "language_loss": 0.84432828, "learning_rate": 3.856665676769979e-06, "loss": 0.86604702, "num_input_tokens_seen": 53332940, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.734375, "step": 2455, "time_per_iteration": 2.365283489227295 }, { "auxiliary_loss_clip": 0.01101424, "auxiliary_loss_mlp": 0.01073822, "balance_loss_clip": 1.02730632, "balance_loss_mlp": 1.0260464, "epoch": 0.1476627085525327, "flos": 30805194666240.0, "grad_norm": 1.9731022189781138, "language_loss": 0.86136854, "learning_rate": 3.85652085914712e-06, "loss": 0.88312101, "num_input_tokens_seen": 53353295, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 0.75390625, "step": 2456, "time_per_iteration": 2.467848300933838 }, { "auxiliary_loss_clip": 0.01097719, "auxiliary_loss_mlp": 0.01067471, "balance_loss_clip": 1.02391195, "balance_loss_mlp": 1.02812886, "epoch": 0.14772283180520066, "flos": 21688359580800.0, "grad_norm": 1.6333746364611896, "language_loss": 0.85959697, "learning_rate": 3.856375971124805e-06, "loss": 0.88124883, "num_input_tokens_seen": 53373410, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 0.6953125, "step": 2457, "time_per_iteration": 2.3873815536499023 }, { "auxiliary_loss_clip": 0.01093992, "auxiliary_loss_mlp": 0.01064289, "balance_loss_clip": 1.01853633, "balance_loss_mlp": 1.02582431, "epoch": 0.14778295505786862, "flos": 18769411117440.0, "grad_norm": 2.2157566975761385, "language_loss": 0.77142531, "learning_rate": 3.856231012708527e-06, "loss": 0.79300809, "num_input_tokens_seen": 53391430, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.6796875, "step": 2458, "time_per_iteration": 2.3928511142730713 }, { "auxiliary_loss_clip": 0.01106035, "auxiliary_loss_mlp": 0.01074209, "balance_loss_clip": 1.02304411, "balance_loss_mlp": 1.02870023, "epoch": 0.1478430783105366, "flos": 22892444150400.0, "grad_norm": 1.9505714856299643, "language_loss": 0.85122252, "learning_rate": 3.856085983903782e-06, "loss": 0.87302494, "num_input_tokens_seen": 53409960, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.7734375, "step": 2459, "time_per_iteration": 2.4292595386505127 }, { "auxiliary_loss_clip": 0.01093187, "auxiliary_loss_mlp": 0.0106228, "balance_loss_clip": 1.02012777, "balance_loss_mlp": 1.02594709, "epoch": 0.14790320156320458, "flos": 15084433313280.0, "grad_norm": 2.023544266713236, "language_loss": 0.76772064, "learning_rate": 3.855940884716071e-06, "loss": 0.78927529, "num_input_tokens_seen": 53426160, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.671875, "step": 2460, "time_per_iteration": 2.379777193069458 }, { "auxiliary_loss_clip": 0.01099786, "auxiliary_loss_mlp": 0.0107731, "balance_loss_clip": 1.03162885, "balance_loss_mlp": 1.02636433, "epoch": 0.14796332481587254, "flos": 26503580695680.0, "grad_norm": 1.6709260765579441, "language_loss": 0.82930833, "learning_rate": 3.855795715150896e-06, "loss": 0.85107929, "num_input_tokens_seen": 53448530, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.734375, "step": 2461, "time_per_iteration": 2.4474596977233887 }, { "auxiliary_loss_clip": 0.01097535, "auxiliary_loss_mlp": 0.0107697, "balance_loss_clip": 1.02876163, "balance_loss_mlp": 1.02629185, "epoch": 0.1480234480685405, "flos": 17562324170880.0, "grad_norm": 2.6040231332709873, "language_loss": 0.68725038, "learning_rate": 3.855650475213761e-06, "loss": 0.70899546, "num_input_tokens_seen": 53465915, "router_z_loss_clip": 0.48242188, "router_z_loss_mlp": 0.7109375, "step": 2462, "time_per_iteration": 2.3660292625427246 }, { "auxiliary_loss_clip": 0.01097023, "auxiliary_loss_mlp": 0.01068424, "balance_loss_clip": 1.02457881, "balance_loss_mlp": 1.0268321, "epoch": 0.14808357132120847, "flos": 53580121580160.0, "grad_norm": 2.2729480708219922, "language_loss": 0.68763828, "learning_rate": 3.8555051649101745e-06, "loss": 0.70929277, "num_input_tokens_seen": 53496055, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.703125, "step": 2463, "time_per_iteration": 2.7324020862579346 }, { "auxiliary_loss_clip": 0.01098132, "auxiliary_loss_mlp": 0.01077114, "balance_loss_clip": 1.02990699, "balance_loss_mlp": 1.02659583, "epoch": 0.14814369457387644, "flos": 19828152230400.0, "grad_norm": 1.6701447464541475, "language_loss": 0.78808916, "learning_rate": 3.855359784245646e-06, "loss": 0.80984163, "num_input_tokens_seen": 53513790, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 0.71484375, "step": 2464, "time_per_iteration": 2.391052007675171 }, { "auxiliary_loss_clip": 0.01098688, "auxiliary_loss_mlp": 0.01075794, "balance_loss_clip": 1.03476238, "balance_loss_mlp": 1.02810156, "epoch": 0.1482038178265444, "flos": 23913828241920.0, "grad_norm": 1.6285918339432055, "language_loss": 0.81520182, "learning_rate": 3.855214333225688e-06, "loss": 0.83694661, "num_input_tokens_seen": 53533410, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.70703125, "step": 2465, "time_per_iteration": 2.416149139404297 }, { "auxiliary_loss_clip": 0.01102655, "auxiliary_loss_mlp": 0.01082639, "balance_loss_clip": 1.03288102, "balance_loss_mlp": 1.0280782, "epoch": 0.1482639410792124, "flos": 24169357549440.0, "grad_norm": 1.5797553479385578, "language_loss": 0.781461, "learning_rate": 3.855068811855817e-06, "loss": 0.80331397, "num_input_tokens_seen": 53554775, "router_z_loss_clip": 0.49804688, "router_z_loss_mlp": 0.74609375, "step": 2466, "time_per_iteration": 2.4337644577026367 }, { "auxiliary_loss_clip": 0.01023507, "auxiliary_loss_mlp": 0.01021416, "balance_loss_clip": 1.01674354, "balance_loss_mlp": 1.00532091, "epoch": 0.14832406433188036, "flos": 66188025578880.0, "grad_norm": 0.7930868803681371, "language_loss": 0.60151768, "learning_rate": 3.854923220141551e-06, "loss": 0.62196696, "num_input_tokens_seen": 53609675, "router_z_loss_clip": 0.04663086, "router_z_loss_mlp": 0.18164062, "step": 2467, "time_per_iteration": 3.0348620414733887 }, { "auxiliary_loss_clip": 0.0109461, "auxiliary_loss_mlp": 0.01068471, "balance_loss_clip": 1.02436399, "balance_loss_mlp": 1.02526033, "epoch": 0.14838418758454833, "flos": 25410066001920.0, "grad_norm": 2.0654667690431916, "language_loss": 0.8914237, "learning_rate": 3.85477755808841e-06, "loss": 0.91305459, "num_input_tokens_seen": 53626950, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.69140625, "step": 2468, "time_per_iteration": 3.8651061058044434 }, { "auxiliary_loss_clip": 0.0109801, "auxiliary_loss_mlp": 0.01070615, "balance_loss_clip": 1.01933122, "balance_loss_mlp": 1.0260278, "epoch": 0.1484443108372163, "flos": 23288918083200.0, "grad_norm": 2.1231896995533526, "language_loss": 0.78280437, "learning_rate": 3.854631825701919e-06, "loss": 0.80449057, "num_input_tokens_seen": 53644200, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.71875, "step": 2469, "time_per_iteration": 2.404529571533203 }, { "auxiliary_loss_clip": 0.01097149, "auxiliary_loss_mlp": 0.01068847, "balance_loss_clip": 1.02268934, "balance_loss_mlp": 1.026824, "epoch": 0.14850443408988426, "flos": 14646797020800.0, "grad_norm": 1.897251362223043, "language_loss": 0.77403468, "learning_rate": 3.854486022987603e-06, "loss": 0.79569459, "num_input_tokens_seen": 53659650, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.703125, "step": 2470, "time_per_iteration": 3.8456714153289795 }, { "auxiliary_loss_clip": 0.01094599, "auxiliary_loss_mlp": 0.01067411, "balance_loss_clip": 1.02180183, "balance_loss_mlp": 1.02603734, "epoch": 0.14856455734255222, "flos": 23547240299520.0, "grad_norm": 1.6641754455455549, "language_loss": 0.7405529, "learning_rate": 3.8543401499509905e-06, "loss": 0.76217306, "num_input_tokens_seen": 53680275, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.68359375, "step": 2471, "time_per_iteration": 3.8998091220855713 }, { "auxiliary_loss_clip": 0.01099473, "auxiliary_loss_mlp": 0.0107561, "balance_loss_clip": 1.02551806, "balance_loss_mlp": 1.02659035, "epoch": 0.1486246805952202, "flos": 18076315340160.0, "grad_norm": 1.8348858223601816, "language_loss": 0.91545963, "learning_rate": 3.854194206597615e-06, "loss": 0.93721044, "num_input_tokens_seen": 53698270, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.7265625, "step": 2472, "time_per_iteration": 2.4353387355804443 }, { "auxiliary_loss_clip": 0.01098581, "auxiliary_loss_mlp": 0.01072525, "balance_loss_clip": 1.02321994, "balance_loss_mlp": 1.02615833, "epoch": 0.14868480384788818, "flos": 19352635246080.0, "grad_norm": 2.654187182109426, "language_loss": 0.83452082, "learning_rate": 3.854048192933008e-06, "loss": 0.85623193, "num_input_tokens_seen": 53716845, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 0.72265625, "step": 2473, "time_per_iteration": 2.3959057331085205 }, { "auxiliary_loss_clip": 0.01098773, "auxiliary_loss_mlp": 0.01078367, "balance_loss_clip": 1.03406882, "balance_loss_mlp": 1.02574706, "epoch": 0.14874492710055615, "flos": 22199103993600.0, "grad_norm": 2.1473805610447743, "language_loss": 0.79949987, "learning_rate": 3.853902108962709e-06, "loss": 0.8212713, "num_input_tokens_seen": 53734970, "router_z_loss_clip": 0.44335938, "router_z_loss_mlp": 0.73046875, "step": 2474, "time_per_iteration": 2.396589517593384 }, { "auxiliary_loss_clip": 0.01099843, "auxiliary_loss_mlp": 0.01084397, "balance_loss_clip": 1.03180194, "balance_loss_mlp": 1.02602172, "epoch": 0.1488050503532241, "flos": 21102447277440.0, "grad_norm": 1.8112436225554511, "language_loss": 0.83936793, "learning_rate": 3.853755954692255e-06, "loss": 0.86121035, "num_input_tokens_seen": 53753415, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.73828125, "step": 2475, "time_per_iteration": 2.4137630462646484 }, { "auxiliary_loss_clip": 0.01098062, "auxiliary_loss_mlp": 0.01074189, "balance_loss_clip": 1.02350092, "balance_loss_mlp": 1.0284934, "epoch": 0.14886517360589208, "flos": 12785751797760.0, "grad_norm": 1.7990510082692523, "language_loss": 0.82762033, "learning_rate": 3.85360973012719e-06, "loss": 0.84934282, "num_input_tokens_seen": 53770305, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.6953125, "step": 2476, "time_per_iteration": 2.4145026206970215 }, { "auxiliary_loss_clip": 0.01093167, "auxiliary_loss_mlp": 0.01071103, "balance_loss_clip": 1.02618492, "balance_loss_mlp": 1.02607024, "epoch": 0.14892529685856004, "flos": 29021586572160.0, "grad_norm": 1.644229651248555, "language_loss": 0.79497421, "learning_rate": 3.853463435273058e-06, "loss": 0.81661701, "num_input_tokens_seen": 53788895, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.671875, "step": 2477, "time_per_iteration": 2.4482643604278564 }, { "auxiliary_loss_clip": 0.01023411, "auxiliary_loss_mlp": 0.01073949, "balance_loss_clip": 1.06829834, "balance_loss_mlp": 1.00721741, "epoch": 0.148985420111228, "flos": 61923105313920.0, "grad_norm": 0.8666176257297775, "language_loss": 0.60206616, "learning_rate": 3.853317070135407e-06, "loss": 0.62303978, "num_input_tokens_seen": 53850260, "router_z_loss_clip": 0.05639648, "router_z_loss_mlp": 0.16210938, "step": 2478, "time_per_iteration": 3.043100595474243 }, { "auxiliary_loss_clip": 0.01097796, "auxiliary_loss_mlp": 0.01071474, "balance_loss_clip": 1.02524483, "balance_loss_mlp": 1.02678895, "epoch": 0.149045543363896, "flos": 23913967887360.0, "grad_norm": 2.6932144822480364, "language_loss": 0.72641551, "learning_rate": 3.853170634719787e-06, "loss": 0.74810821, "num_input_tokens_seen": 53867520, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.7109375, "step": 2479, "time_per_iteration": 2.445418119430542 }, { "auxiliary_loss_clip": 0.01095998, "auxiliary_loss_mlp": 0.01074929, "balance_loss_clip": 1.03141725, "balance_loss_mlp": 1.0252521, "epoch": 0.14910566661656396, "flos": 23653411344000.0, "grad_norm": 1.5988557082960875, "language_loss": 0.82706577, "learning_rate": 3.853024129031751e-06, "loss": 0.84877503, "num_input_tokens_seen": 53886620, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 0.70703125, "step": 2480, "time_per_iteration": 2.4252500534057617 }, { "auxiliary_loss_clip": 0.01098516, "auxiliary_loss_mlp": 0.01099534, "balance_loss_clip": 1.05287576, "balance_loss_mlp": 1.02562797, "epoch": 0.14916578986923193, "flos": 20514440292480.0, "grad_norm": 2.0008817013092943, "language_loss": 0.85698044, "learning_rate": 3.852877553076854e-06, "loss": 0.87896097, "num_input_tokens_seen": 53902230, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 0.7265625, "step": 2481, "time_per_iteration": 2.4969537258148193 }, { "auxiliary_loss_clip": 0.01099869, "auxiliary_loss_mlp": 0.01101485, "balance_loss_clip": 1.05151248, "balance_loss_mlp": 1.0264293, "epoch": 0.1492259131218999, "flos": 22490733513600.0, "grad_norm": 1.9794383528494857, "language_loss": 0.79101324, "learning_rate": 3.8527309068606546e-06, "loss": 0.81302673, "num_input_tokens_seen": 53919475, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.734375, "step": 2482, "time_per_iteration": 2.5329623222351074 }, { "auxiliary_loss_clip": 0.01102809, "auxiliary_loss_mlp": 0.0110767, "balance_loss_clip": 1.06303775, "balance_loss_mlp": 1.02724206, "epoch": 0.14928603637456786, "flos": 23184736986240.0, "grad_norm": 2.2893834210362014, "language_loss": 0.82406473, "learning_rate": 3.852584190388713e-06, "loss": 0.84616947, "num_input_tokens_seen": 53939150, "router_z_loss_clip": 0.44726562, "router_z_loss_mlp": 0.7578125, "step": 2483, "time_per_iteration": 2.4188201427459717 }, { "auxiliary_loss_clip": 0.01094475, "auxiliary_loss_mlp": 0.01106103, "balance_loss_clip": 1.06650198, "balance_loss_mlp": 1.02571392, "epoch": 0.14934615962723582, "flos": 21652154634240.0, "grad_norm": 1.5442775377177154, "language_loss": 0.71927452, "learning_rate": 3.852437403666595e-06, "loss": 0.74128032, "num_input_tokens_seen": 53958735, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.6875, "step": 2484, "time_per_iteration": 2.4173951148986816 }, { "auxiliary_loss_clip": 0.01099546, "auxiliary_loss_mlp": 0.01101821, "balance_loss_clip": 1.05285001, "balance_loss_mlp": 1.02653694, "epoch": 0.1494062828799038, "flos": 27009018581760.0, "grad_norm": 1.7672985154264305, "language_loss": 0.86294454, "learning_rate": 3.852290546699863e-06, "loss": 0.88495821, "num_input_tokens_seen": 53975065, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 0.73046875, "step": 2485, "time_per_iteration": 2.4422507286071777 }, { "auxiliary_loss_clip": 0.01099463, "auxiliary_loss_mlp": 0.01100674, "balance_loss_clip": 1.05446887, "balance_loss_mlp": 1.02831745, "epoch": 0.14946640613257178, "flos": 21213889937280.0, "grad_norm": 2.4061015982206544, "language_loss": 0.8727873, "learning_rate": 3.8521436194940894e-06, "loss": 0.89478868, "num_input_tokens_seen": 53993330, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.7109375, "step": 2486, "time_per_iteration": 2.4031965732574463 }, { "auxiliary_loss_clip": 0.01096548, "auxiliary_loss_mlp": 0.01091768, "balance_loss_clip": 1.05202317, "balance_loss_mlp": 1.02681255, "epoch": 0.14952652938523975, "flos": 13370023267200.0, "grad_norm": 2.480372794657575, "language_loss": 0.77013361, "learning_rate": 3.851996622054842e-06, "loss": 0.79201674, "num_input_tokens_seen": 54010515, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.6953125, "step": 2487, "time_per_iteration": 2.3689892292022705 }, { "auxiliary_loss_clip": 0.01095101, "auxiliary_loss_mlp": 0.0110138, "balance_loss_clip": 1.05615211, "balance_loss_mlp": 1.0265764, "epoch": 0.1495866526379077, "flos": 35516234684160.0, "grad_norm": 2.466388873361637, "language_loss": 0.72862023, "learning_rate": 3.8518495543877e-06, "loss": 0.75058508, "num_input_tokens_seen": 54031315, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.68359375, "step": 2488, "time_per_iteration": 2.517245054244995 }, { "auxiliary_loss_clip": 0.01100736, "auxiliary_loss_mlp": 0.01086923, "balance_loss_clip": 1.04133773, "balance_loss_mlp": 1.027812, "epoch": 0.14964677589057568, "flos": 17631976066560.0, "grad_norm": 2.65741708783581, "language_loss": 0.72282994, "learning_rate": 3.851702416498235e-06, "loss": 0.74470651, "num_input_tokens_seen": 54045965, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 0.73046875, "step": 2489, "time_per_iteration": 2.338932514190674 }, { "auxiliary_loss_clip": 0.0109884, "auxiliary_loss_mlp": 0.01078459, "balance_loss_clip": 1.03275418, "balance_loss_mlp": 1.02613616, "epoch": 0.14970689914324364, "flos": 20184476232960.0, "grad_norm": 2.6920391628997637, "language_loss": 0.84391975, "learning_rate": 3.8515552083920295e-06, "loss": 0.86569279, "num_input_tokens_seen": 54059960, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.7265625, "step": 2490, "time_per_iteration": 2.3517861366271973 }, { "auxiliary_loss_clip": 0.01099617, "auxiliary_loss_mlp": 0.01075755, "balance_loss_clip": 1.03088498, "balance_loss_mlp": 1.02728963, "epoch": 0.1497670223959116, "flos": 37227293239680.0, "grad_norm": 2.492486266060026, "language_loss": 0.81758022, "learning_rate": 3.851407930074666e-06, "loss": 0.83933401, "num_input_tokens_seen": 54079330, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.72265625, "step": 2491, "time_per_iteration": 2.521962881088257 }, { "auxiliary_loss_clip": 0.01098674, "auxiliary_loss_mlp": 0.01064337, "balance_loss_clip": 1.01813138, "balance_loss_mlp": 1.02630544, "epoch": 0.1498271456485796, "flos": 24454877581440.0, "grad_norm": 2.074963728153601, "language_loss": 0.91819882, "learning_rate": 3.851260581551727e-06, "loss": 0.93982893, "num_input_tokens_seen": 54097555, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.72265625, "step": 2492, "time_per_iteration": 2.412959575653076 }, { "auxiliary_loss_clip": 0.01096167, "auxiliary_loss_mlp": 0.01072636, "balance_loss_clip": 1.02862382, "balance_loss_mlp": 1.02655816, "epoch": 0.14988726890124757, "flos": 16252662049920.0, "grad_norm": 2.8398881463730405, "language_loss": 0.83351183, "learning_rate": 3.851113162828802e-06, "loss": 0.85519987, "num_input_tokens_seen": 54115600, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.6953125, "step": 2493, "time_per_iteration": 2.382564067840576 }, { "auxiliary_loss_clip": 0.01098172, "auxiliary_loss_mlp": 0.01066212, "balance_loss_clip": 1.02296281, "balance_loss_mlp": 1.02753878, "epoch": 0.14994739215391553, "flos": 20665544123520.0, "grad_norm": 1.73868949817116, "language_loss": 0.82704777, "learning_rate": 3.85096567391148e-06, "loss": 0.84869158, "num_input_tokens_seen": 54135220, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.70703125, "step": 2494, "time_per_iteration": 2.4043378829956055 }, { "auxiliary_loss_clip": 0.01091939, "auxiliary_loss_mlp": 0.01068915, "balance_loss_clip": 1.02194691, "balance_loss_mlp": 1.02556372, "epoch": 0.1500075154065835, "flos": 70649961845760.0, "grad_norm": 1.9611367575343288, "language_loss": 0.68126768, "learning_rate": 3.850818114805354e-06, "loss": 0.70287621, "num_input_tokens_seen": 54161065, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 0.6640625, "step": 2495, "time_per_iteration": 2.825629949569702 }, { "auxiliary_loss_clip": 0.01027743, "auxiliary_loss_mlp": 0.01016543, "balance_loss_clip": 1.0122273, "balance_loss_mlp": 1.01094174, "epoch": 0.15006763865925146, "flos": 68008955783040.0, "grad_norm": 0.9021422311471574, "language_loss": 0.59555101, "learning_rate": 3.850670485516019e-06, "loss": 0.61599392, "num_input_tokens_seen": 54225095, "router_z_loss_clip": 0.04321289, "router_z_loss_mlp": 0.16796875, "step": 2496, "time_per_iteration": 3.0454862117767334 }, { "auxiliary_loss_clip": 0.01094093, "auxiliary_loss_mlp": 0.01072911, "balance_loss_clip": 1.02722991, "balance_loss_mlp": 1.02446198, "epoch": 0.15012776191191943, "flos": 18915278244480.0, "grad_norm": 2.047809941665339, "language_loss": 0.68437803, "learning_rate": 3.850522786049075e-06, "loss": 0.70604801, "num_input_tokens_seen": 54243750, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.6953125, "step": 2497, "time_per_iteration": 2.3804757595062256 }, { "auxiliary_loss_clip": 0.01095766, "auxiliary_loss_mlp": 0.01082155, "balance_loss_clip": 1.04129052, "balance_loss_mlp": 1.02748537, "epoch": 0.1501878851645874, "flos": 23700054787200.0, "grad_norm": 1.4754583020147116, "language_loss": 0.76348162, "learning_rate": 3.850375016410121e-06, "loss": 0.78526086, "num_input_tokens_seen": 54266185, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.68359375, "step": 2498, "time_per_iteration": 2.4788625240325928 }, { "auxiliary_loss_clip": 0.01101004, "auxiliary_loss_mlp": 0.01088418, "balance_loss_clip": 1.04171205, "balance_loss_mlp": 1.02797115, "epoch": 0.15024800841725539, "flos": 20411481093120.0, "grad_norm": 2.271934057075471, "language_loss": 0.73164815, "learning_rate": 3.850227176604761e-06, "loss": 0.75354236, "num_input_tokens_seen": 54283940, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 0.73046875, "step": 2499, "time_per_iteration": 2.3873672485351562 }, { "auxiliary_loss_clip": 0.01094149, "auxiliary_loss_mlp": 0.01074328, "balance_loss_clip": 1.03031552, "balance_loss_mlp": 1.02604723, "epoch": 0.15030813166992335, "flos": 31829685868800.0, "grad_norm": 2.2560546851762195, "language_loss": 0.74265355, "learning_rate": 3.850079266638601e-06, "loss": 0.76433831, "num_input_tokens_seen": 54304830, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 0.6796875, "step": 2500, "time_per_iteration": 2.495368480682373 }, { "auxiliary_loss_clip": 0.01093771, "auxiliary_loss_mlp": 0.01077357, "balance_loss_clip": 1.03084135, "balance_loss_mlp": 1.02628386, "epoch": 0.15036825492259132, "flos": 35656515993600.0, "grad_norm": 2.215772391043712, "language_loss": 0.67097712, "learning_rate": 3.849931286517249e-06, "loss": 0.69268835, "num_input_tokens_seen": 54325595, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 0.67578125, "step": 2501, "time_per_iteration": 2.5124545097351074 }, { "auxiliary_loss_clip": 0.0109413, "auxiliary_loss_mlp": 0.01072506, "balance_loss_clip": 1.02773142, "balance_loss_mlp": 1.02629566, "epoch": 0.15042837817525928, "flos": 18837317445120.0, "grad_norm": 2.1300337737716935, "language_loss": 0.86138809, "learning_rate": 3.849783236246318e-06, "loss": 0.88305449, "num_input_tokens_seen": 54342180, "router_z_loss_clip": 0.44726562, "router_z_loss_mlp": 0.67578125, "step": 2502, "time_per_iteration": 2.375136137008667 }, { "auxiliary_loss_clip": 0.01091678, "auxiliary_loss_mlp": 0.01070329, "balance_loss_clip": 1.02805758, "balance_loss_mlp": 1.02404237, "epoch": 0.15048850142792725, "flos": 19534567674240.0, "grad_norm": 2.144282051528111, "language_loss": 0.79522771, "learning_rate": 3.849635115831421e-06, "loss": 0.81684774, "num_input_tokens_seen": 54360255, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.6796875, "step": 2503, "time_per_iteration": 2.38259220123291 }, { "auxiliary_loss_clip": 0.01092682, "auxiliary_loss_mlp": 0.01066483, "balance_loss_clip": 1.02156496, "balance_loss_mlp": 1.02475059, "epoch": 0.1505486246805952, "flos": 22016473338240.0, "grad_norm": 2.434187261231111, "language_loss": 0.87555397, "learning_rate": 3.849486925278176e-06, "loss": 0.89714551, "num_input_tokens_seen": 54378260, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.6796875, "step": 2504, "time_per_iteration": 2.419578790664673 }, { "auxiliary_loss_clip": 0.01092609, "auxiliary_loss_mlp": 0.01065769, "balance_loss_clip": 1.02542877, "balance_loss_mlp": 1.02682245, "epoch": 0.15060874793326318, "flos": 20742038645760.0, "grad_norm": 1.5612602549669992, "language_loss": 0.83585477, "learning_rate": 3.8493386645922e-06, "loss": 0.85743856, "num_input_tokens_seen": 54399745, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.65625, "step": 2505, "time_per_iteration": 2.45501708984375 }, { "auxiliary_loss_clip": 0.01093467, "auxiliary_loss_mlp": 0.01063675, "balance_loss_clip": 1.02042592, "balance_loss_mlp": 1.02515697, "epoch": 0.15066887118593117, "flos": 16470973981440.0, "grad_norm": 2.5012837575028084, "language_loss": 0.7779249, "learning_rate": 3.849190333779117e-06, "loss": 0.79949629, "num_input_tokens_seen": 54417105, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.68359375, "step": 2506, "time_per_iteration": 2.3906748294830322 }, { "auxiliary_loss_clip": 0.01097539, "auxiliary_loss_mlp": 0.01065713, "balance_loss_clip": 1.0229404, "balance_loss_mlp": 1.02731264, "epoch": 0.15072899443859913, "flos": 19858457157120.0, "grad_norm": 2.6371719139540675, "language_loss": 0.80366504, "learning_rate": 3.849041932844552e-06, "loss": 0.82529759, "num_input_tokens_seen": 54433920, "router_z_loss_clip": 0.42773438, "router_z_loss_mlp": 0.703125, "step": 2507, "time_per_iteration": 5.239079475402832 }, { "auxiliary_loss_clip": 0.01093988, "auxiliary_loss_mlp": 0.01056863, "balance_loss_clip": 1.01656985, "balance_loss_mlp": 1.0266223, "epoch": 0.1507891176912671, "flos": 20775206304000.0, "grad_norm": 2.465007941890069, "language_loss": 0.70571911, "learning_rate": 3.848893461794131e-06, "loss": 0.72722763, "num_input_tokens_seen": 54451540, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.67578125, "step": 2508, "time_per_iteration": 2.390188217163086 }, { "auxiliary_loss_clip": 0.01100176, "auxiliary_loss_mlp": 0.01066198, "balance_loss_clip": 1.02192402, "balance_loss_mlp": 1.0284121, "epoch": 0.15084924094393506, "flos": 23585505016320.0, "grad_norm": 1.7538201526249657, "language_loss": 0.79751909, "learning_rate": 3.8487449206334845e-06, "loss": 0.81918287, "num_input_tokens_seen": 54470800, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.71875, "step": 2509, "time_per_iteration": 2.4125492572784424 }, { "auxiliary_loss_clip": 0.01100566, "auxiliary_loss_mlp": 0.01072605, "balance_loss_clip": 1.02549314, "balance_loss_mlp": 1.02777267, "epoch": 0.15090936419660303, "flos": 18910460476800.0, "grad_norm": 3.9251387380211877, "language_loss": 0.84335577, "learning_rate": 3.848596309368246e-06, "loss": 0.86508751, "num_input_tokens_seen": 54486525, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 0.7265625, "step": 2510, "time_per_iteration": 3.75276255607605 }, { "auxiliary_loss_clip": 0.01098107, "auxiliary_loss_mlp": 0.01076069, "balance_loss_clip": 1.02972007, "balance_loss_mlp": 1.02757215, "epoch": 0.150969487449271, "flos": 17927341102080.0, "grad_norm": 1.7600087110432463, "language_loss": 0.75543225, "learning_rate": 3.8484476280040495e-06, "loss": 0.77717406, "num_input_tokens_seen": 54503795, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 0.703125, "step": 2511, "time_per_iteration": 3.790982723236084 }, { "auxiliary_loss_clip": 0.01093534, "auxiliary_loss_mlp": 0.01070108, "balance_loss_clip": 1.02876663, "balance_loss_mlp": 1.02660429, "epoch": 0.151029610701939, "flos": 24241941999360.0, "grad_norm": 2.155248054891697, "language_loss": 0.71194518, "learning_rate": 3.848298876546534e-06, "loss": 0.73358154, "num_input_tokens_seen": 54523025, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.66796875, "step": 2512, "time_per_iteration": 2.4149928092956543 }, { "auxiliary_loss_clip": 0.01098788, "auxiliary_loss_mlp": 0.01071113, "balance_loss_clip": 1.02793515, "balance_loss_mlp": 1.02983439, "epoch": 0.15108973395460695, "flos": 30261212772480.0, "grad_norm": 2.190306031423565, "language_loss": 0.75713086, "learning_rate": 3.84815005500134e-06, "loss": 0.77882993, "num_input_tokens_seen": 54545025, "router_z_loss_clip": 0.43164062, "router_z_loss_mlp": 0.69140625, "step": 2513, "time_per_iteration": 2.486478567123413 }, { "auxiliary_loss_clip": 0.01024724, "auxiliary_loss_mlp": 0.0103143, "balance_loss_clip": 1.02709091, "balance_loss_mlp": 1.00602555, "epoch": 0.15114985720727492, "flos": 60434443319040.0, "grad_norm": 0.8884858023483032, "language_loss": 0.64962852, "learning_rate": 3.84800116337411e-06, "loss": 0.6701901, "num_input_tokens_seen": 54604545, "router_z_loss_clip": 0.04345703, "router_z_loss_mlp": 0.1875, "step": 2514, "time_per_iteration": 2.9855165481567383 }, { "auxiliary_loss_clip": 0.01099107, "auxiliary_loss_mlp": 0.0107579, "balance_loss_clip": 1.02710509, "balance_loss_mlp": 1.02955222, "epoch": 0.15120998045994288, "flos": 20520654514560.0, "grad_norm": 2.0703687169247695, "language_loss": 0.75318503, "learning_rate": 3.8478522016704916e-06, "loss": 0.77493405, "num_input_tokens_seen": 54620590, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.6953125, "step": 2515, "time_per_iteration": 2.3908915519714355 }, { "auxiliary_loss_clip": 0.0109604, "auxiliary_loss_mlp": 0.01076667, "balance_loss_clip": 1.03151011, "balance_loss_mlp": 1.02622676, "epoch": 0.15127010371261085, "flos": 21177824636160.0, "grad_norm": 1.6872791922004229, "language_loss": 0.79323679, "learning_rate": 3.8477031698961325e-06, "loss": 0.81496382, "num_input_tokens_seen": 54640410, "router_z_loss_clip": 0.45117188, "router_z_loss_mlp": 0.69921875, "step": 2516, "time_per_iteration": 2.410939931869507 }, { "auxiliary_loss_clip": 0.01022552, "auxiliary_loss_mlp": 0.01005508, "balance_loss_clip": 1.00009632, "balance_loss_mlp": 1.00476396, "epoch": 0.1513302269652788, "flos": 65317500938880.0, "grad_norm": 0.7290160840294603, "language_loss": 0.54699725, "learning_rate": 3.8475540680566835e-06, "loss": 0.56727785, "num_input_tokens_seen": 54701430, "router_z_loss_clip": 0.05419922, "router_z_loss_mlp": 0.17773438, "step": 2517, "time_per_iteration": 3.055333137512207 }, { "auxiliary_loss_clip": 0.01097866, "auxiliary_loss_mlp": 0.01075375, "balance_loss_clip": 1.02845407, "balance_loss_mlp": 1.02681875, "epoch": 0.15139035021794678, "flos": 19134812073600.0, "grad_norm": 1.8817959344641373, "language_loss": 0.7996822, "learning_rate": 3.8474048961577995e-06, "loss": 0.82141459, "num_input_tokens_seen": 54720845, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 0.7109375, "step": 2518, "time_per_iteration": 2.421412944793701 }, { "auxiliary_loss_clip": 0.01100288, "auxiliary_loss_mlp": 0.01078742, "balance_loss_clip": 1.02934194, "balance_loss_mlp": 1.02605844, "epoch": 0.15145047347061477, "flos": 26577352131840.0, "grad_norm": 2.062727654951096, "language_loss": 0.72276735, "learning_rate": 3.847255654205137e-06, "loss": 0.74455762, "num_input_tokens_seen": 54740495, "router_z_loss_clip": 0.49414062, "router_z_loss_mlp": 0.7421875, "step": 2519, "time_per_iteration": 2.4406862258911133 }, { "auxiliary_loss_clip": 0.01098385, "auxiliary_loss_mlp": 0.01087486, "balance_loss_clip": 1.04008865, "balance_loss_mlp": 1.02673721, "epoch": 0.15151059672328274, "flos": 20301923646720.0, "grad_norm": 2.3345630663774353, "language_loss": 0.80948764, "learning_rate": 3.847106342204354e-06, "loss": 0.83134639, "num_input_tokens_seen": 54758415, "router_z_loss_clip": 0.47460938, "router_z_loss_mlp": 0.71875, "step": 2520, "time_per_iteration": 2.407841920852661 }, { "auxiliary_loss_clip": 0.01100066, "auxiliary_loss_mlp": 0.01083717, "balance_loss_clip": 1.0338881, "balance_loss_mlp": 1.02749395, "epoch": 0.1515707199759507, "flos": 27227330513280.0, "grad_norm": 1.872046612520587, "language_loss": 0.76894939, "learning_rate": 3.846956960161114e-06, "loss": 0.79078716, "num_input_tokens_seen": 54779355, "router_z_loss_clip": 0.49804688, "router_z_loss_mlp": 0.7265625, "step": 2521, "time_per_iteration": 2.457338333129883 }, { "auxiliary_loss_clip": 0.01100628, "auxiliary_loss_mlp": 0.01075213, "balance_loss_clip": 1.02419186, "balance_loss_mlp": 1.02622497, "epoch": 0.15163084322861867, "flos": 23586203243520.0, "grad_norm": 2.135534199209432, "language_loss": 0.8412987, "learning_rate": 3.84680750808108e-06, "loss": 0.86305714, "num_input_tokens_seen": 54799465, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.74609375, "step": 2522, "time_per_iteration": 2.427781105041504 }, { "auxiliary_loss_clip": 0.01022689, "auxiliary_loss_mlp": 0.01028625, "balance_loss_clip": 1.02304602, "balance_loss_mlp": 1.00400174, "epoch": 0.15169096648128663, "flos": 66886427882880.0, "grad_norm": 0.8399761433835947, "language_loss": 0.57936323, "learning_rate": 3.846657985969922e-06, "loss": 0.5998764, "num_input_tokens_seen": 54857665, "router_z_loss_clip": 0.0559082, "router_z_loss_mlp": 0.1875, "step": 2523, "time_per_iteration": 2.9662187099456787 }, { "auxiliary_loss_clip": 0.01096955, "auxiliary_loss_mlp": 0.01075797, "balance_loss_clip": 1.02634931, "balance_loss_mlp": 1.02684617, "epoch": 0.1517510897339546, "flos": 29094171022080.0, "grad_norm": 1.5002728619740164, "language_loss": 0.76528943, "learning_rate": 3.8465083938333066e-06, "loss": 0.78701699, "num_input_tokens_seen": 54879895, "router_z_loss_clip": 0.49414062, "router_z_loss_mlp": 0.703125, "step": 2524, "time_per_iteration": 2.4852726459503174 }, { "auxiliary_loss_clip": 0.0109892, "auxiliary_loss_mlp": 0.01085202, "balance_loss_clip": 1.03573036, "balance_loss_mlp": 1.02613258, "epoch": 0.1518112129866226, "flos": 18405546261120.0, "grad_norm": 1.7776221562925887, "language_loss": 0.76452565, "learning_rate": 3.8463587316769085e-06, "loss": 0.78636694, "num_input_tokens_seen": 54898245, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.7265625, "step": 2525, "time_per_iteration": 2.3836252689361572 }, { "auxiliary_loss_clip": 0.0110516, "auxiliary_loss_mlp": 0.01084778, "balance_loss_clip": 1.02951288, "balance_loss_mlp": 1.03054094, "epoch": 0.15187133623929056, "flos": 19424451646080.0, "grad_norm": 1.8969240245464323, "language_loss": 0.81745791, "learning_rate": 3.846208999506402e-06, "loss": 0.83935726, "num_input_tokens_seen": 54917060, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.74609375, "step": 2526, "time_per_iteration": 2.405791997909546 }, { "auxiliary_loss_clip": 0.01097633, "auxiliary_loss_mlp": 0.01065167, "balance_loss_clip": 1.02043939, "balance_loss_mlp": 1.02802122, "epoch": 0.15193145949195852, "flos": 17565256725120.0, "grad_norm": 1.8035210069683476, "language_loss": 0.86383271, "learning_rate": 3.846059197327466e-06, "loss": 0.88546067, "num_input_tokens_seen": 54936365, "router_z_loss_clip": 0.44726562, "router_z_loss_mlp": 0.6953125, "step": 2527, "time_per_iteration": 2.382150650024414 }, { "auxiliary_loss_clip": 0.01099964, "auxiliary_loss_mlp": 0.01074429, "balance_loss_clip": 1.02831864, "balance_loss_mlp": 1.02859974, "epoch": 0.15199158274462649, "flos": 36174731437440.0, "grad_norm": 1.5994046406836089, "language_loss": 0.71308672, "learning_rate": 3.845909325145779e-06, "loss": 0.73483062, "num_input_tokens_seen": 54961365, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.71484375, "step": 2528, "time_per_iteration": 2.5499250888824463 }, { "auxiliary_loss_clip": 0.01098732, "auxiliary_loss_mlp": 0.01074785, "balance_loss_clip": 1.02810311, "balance_loss_mlp": 1.02697706, "epoch": 0.15205170599729445, "flos": 23072980124160.0, "grad_norm": 1.7772628479897812, "language_loss": 0.8875289, "learning_rate": 3.845759382967026e-06, "loss": 0.90926409, "num_input_tokens_seen": 54980750, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 0.71875, "step": 2529, "time_per_iteration": 2.4078006744384766 }, { "auxiliary_loss_clip": 0.01099326, "auxiliary_loss_mlp": 0.01076162, "balance_loss_clip": 1.02914584, "balance_loss_mlp": 1.02898145, "epoch": 0.15211182924996242, "flos": 21907299916800.0, "grad_norm": 1.7981590331376214, "language_loss": 0.84938741, "learning_rate": 3.845609370796893e-06, "loss": 0.87114227, "num_input_tokens_seen": 54999675, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 0.703125, "step": 2530, "time_per_iteration": 2.425597667694092 }, { "auxiliary_loss_clip": 0.01098963, "auxiliary_loss_mlp": 0.01079499, "balance_loss_clip": 1.03591609, "balance_loss_mlp": 1.02711892, "epoch": 0.15217195250263038, "flos": 13880662945920.0, "grad_norm": 2.719446557287777, "language_loss": 0.82987899, "learning_rate": 3.845459288641066e-06, "loss": 0.85166359, "num_input_tokens_seen": 55018295, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 0.71875, "step": 2531, "time_per_iteration": 2.377437114715576 }, { "auxiliary_loss_clip": 0.01097254, "auxiliary_loss_mlp": 0.01078266, "balance_loss_clip": 1.03394413, "balance_loss_mlp": 1.02813625, "epoch": 0.15223207575529837, "flos": 24534165012480.0, "grad_norm": 1.7052328572682445, "language_loss": 0.80166292, "learning_rate": 3.8453091365052394e-06, "loss": 0.82341814, "num_input_tokens_seen": 55037975, "router_z_loss_clip": 0.44335938, "router_z_loss_mlp": 0.69140625, "step": 2532, "time_per_iteration": 2.4589648246765137 }, { "auxiliary_loss_clip": 0.01096223, "auxiliary_loss_mlp": 0.0107674, "balance_loss_clip": 1.0337292, "balance_loss_mlp": 1.02731657, "epoch": 0.15229219900796634, "flos": 25555618926720.0, "grad_norm": 1.8842302485160907, "language_loss": 0.89500463, "learning_rate": 3.845158914395105e-06, "loss": 0.91673428, "num_input_tokens_seen": 55057135, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.6875, "step": 2533, "time_per_iteration": 2.4251551628112793 }, { "auxiliary_loss_clip": 0.01099628, "auxiliary_loss_mlp": 0.01081829, "balance_loss_clip": 1.03500319, "balance_loss_mlp": 1.02836061, "epoch": 0.1523523222606343, "flos": 18216980674560.0, "grad_norm": 2.6803366057171853, "language_loss": 0.80958569, "learning_rate": 3.84500862231636e-06, "loss": 0.83140028, "num_input_tokens_seen": 55075525, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 0.7109375, "step": 2534, "time_per_iteration": 2.3794875144958496 }, { "auxiliary_loss_clip": 0.01100741, "auxiliary_loss_mlp": 0.01075901, "balance_loss_clip": 1.02714491, "balance_loss_mlp": 1.02767408, "epoch": 0.15241244551330227, "flos": 13259278834560.0, "grad_norm": 2.472772633228721, "language_loss": 0.79112452, "learning_rate": 3.844858260274702e-06, "loss": 0.81289101, "num_input_tokens_seen": 55090845, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.73046875, "step": 2535, "time_per_iteration": 2.389275312423706 }, { "auxiliary_loss_clip": 0.01102043, "auxiliary_loss_mlp": 0.01073849, "balance_loss_clip": 1.02680922, "balance_loss_mlp": 1.02880085, "epoch": 0.15247256876597023, "flos": 19714649800320.0, "grad_norm": 2.0403229171505113, "language_loss": 0.79499602, "learning_rate": 3.844707828275835e-06, "loss": 0.816755, "num_input_tokens_seen": 55108750, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 0.734375, "step": 2536, "time_per_iteration": 2.4379541873931885 }, { "auxiliary_loss_clip": 0.01097123, "auxiliary_loss_mlp": 0.01063776, "balance_loss_clip": 1.02021682, "balance_loss_mlp": 1.02906179, "epoch": 0.1525326920186382, "flos": 20374822298880.0, "grad_norm": 2.3188945937776513, "language_loss": 0.77777088, "learning_rate": 3.844557326325461e-06, "loss": 0.79937983, "num_input_tokens_seen": 55126750, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 0.6796875, "step": 2537, "time_per_iteration": 2.4232356548309326 }, { "auxiliary_loss_clip": 0.01099923, "auxiliary_loss_mlp": 0.01073122, "balance_loss_clip": 1.02415073, "balance_loss_mlp": 1.02896285, "epoch": 0.15259281527130616, "flos": 13589103248640.0, "grad_norm": 2.3592252554996143, "language_loss": 0.79057246, "learning_rate": 3.8444067544292896e-06, "loss": 0.81230295, "num_input_tokens_seen": 55144690, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 0.70703125, "step": 2538, "time_per_iteration": 2.400545597076416 }, { "auxiliary_loss_clip": 0.0109451, "auxiliary_loss_mlp": 0.01055938, "balance_loss_clip": 1.01433396, "balance_loss_mlp": 1.02702236, "epoch": 0.15265293852397416, "flos": 22859171758080.0, "grad_norm": 1.562961096304252, "language_loss": 0.91274017, "learning_rate": 3.844256112593029e-06, "loss": 0.93424469, "num_input_tokens_seen": 55166055, "router_z_loss_clip": 0.41601562, "router_z_loss_mlp": 0.67578125, "step": 2539, "time_per_iteration": 2.502378225326538 }, { "auxiliary_loss_clip": 0.01097021, "auxiliary_loss_mlp": 0.01067757, "balance_loss_clip": 1.02426958, "balance_loss_mlp": 1.02690458, "epoch": 0.15271306177664212, "flos": 29236931038080.0, "grad_norm": 2.06925833340324, "language_loss": 0.94694918, "learning_rate": 3.844105400822391e-06, "loss": 0.96859694, "num_input_tokens_seen": 55186285, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.703125, "step": 2540, "time_per_iteration": 2.459963798522949 }, { "auxiliary_loss_clip": 0.01094573, "auxiliary_loss_mlp": 0.01062582, "balance_loss_clip": 1.01823568, "balance_loss_mlp": 1.02596366, "epoch": 0.1527731850293101, "flos": 31244995463040.0, "grad_norm": 1.7116432977712255, "language_loss": 0.764305, "learning_rate": 3.843954619123092e-06, "loss": 0.78587657, "num_input_tokens_seen": 55207915, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.6875, "step": 2541, "time_per_iteration": 2.5024309158325195 }, { "auxiliary_loss_clip": 0.01095473, "auxiliary_loss_mlp": 0.01069237, "balance_loss_clip": 1.02555835, "balance_loss_mlp": 1.0263083, "epoch": 0.15283330828197805, "flos": 22381001510400.0, "grad_norm": 1.5473365294735217, "language_loss": 0.82768631, "learning_rate": 3.84380376750085e-06, "loss": 0.84933335, "num_input_tokens_seen": 55227860, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.69140625, "step": 2542, "time_per_iteration": 2.3986380100250244 }, { "auxiliary_loss_clip": 0.01097486, "auxiliary_loss_mlp": 0.01070007, "balance_loss_clip": 1.02477884, "balance_loss_mlp": 1.0267849, "epoch": 0.15289343153464602, "flos": 25518960132480.0, "grad_norm": 2.4328903536694884, "language_loss": 0.79360616, "learning_rate": 3.843652845961383e-06, "loss": 0.81528109, "num_input_tokens_seen": 55247330, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.70703125, "step": 2543, "time_per_iteration": 2.4708242416381836 }, { "auxiliary_loss_clip": 0.0109462, "auxiliary_loss_mlp": 0.01059643, "balance_loss_clip": 1.01725245, "balance_loss_mlp": 1.02587903, "epoch": 0.15295355478731398, "flos": 22708940711040.0, "grad_norm": 1.8985773567350754, "language_loss": 0.88735342, "learning_rate": 3.843501854510416e-06, "loss": 0.90889609, "num_input_tokens_seen": 55266195, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 0.6875, "step": 2544, "time_per_iteration": 2.3975024223327637 }, { "auxiliary_loss_clip": 0.01101356, "auxiliary_loss_mlp": 0.01066471, "balance_loss_clip": 1.0177145, "balance_loss_mlp": 1.02765787, "epoch": 0.15301367803998198, "flos": 23250967568640.0, "grad_norm": 1.956342701724328, "language_loss": 0.83747399, "learning_rate": 3.843350793153673e-06, "loss": 0.8591522, "num_input_tokens_seen": 55283305, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.734375, "step": 2545, "time_per_iteration": 2.4199275970458984 }, { "auxiliary_loss_clip": 0.01098117, "auxiliary_loss_mlp": 0.01067211, "balance_loss_clip": 1.02117181, "balance_loss_mlp": 1.02882612, "epoch": 0.15307380129264994, "flos": 25885059315840.0, "grad_norm": 2.3184733391423045, "language_loss": 0.72719479, "learning_rate": 3.843199661896884e-06, "loss": 0.74884808, "num_input_tokens_seen": 55303035, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.6953125, "step": 2546, "time_per_iteration": 3.8704402446746826 }, { "auxiliary_loss_clip": 0.01098132, "auxiliary_loss_mlp": 0.01069904, "balance_loss_clip": 1.02105236, "balance_loss_mlp": 1.02642655, "epoch": 0.1531339245453179, "flos": 46971482279040.0, "grad_norm": 1.6817684212124877, "language_loss": 0.80030125, "learning_rate": 3.843048460745779e-06, "loss": 0.82198161, "num_input_tokens_seen": 55327570, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.71875, "step": 2547, "time_per_iteration": 4.031328439712524 }, { "auxiliary_loss_clip": 0.01099036, "auxiliary_loss_mlp": 0.01075053, "balance_loss_clip": 1.02722621, "balance_loss_mlp": 1.02752709, "epoch": 0.15319404779798587, "flos": 35880588299520.0, "grad_norm": 1.978955515397656, "language_loss": 0.75919974, "learning_rate": 3.842897189706092e-06, "loss": 0.78094065, "num_input_tokens_seen": 55351090, "router_z_loss_clip": 0.47851562, "router_z_loss_mlp": 0.71875, "step": 2548, "time_per_iteration": 2.523813009262085 }, { "auxiliary_loss_clip": 0.01095194, "auxiliary_loss_mlp": 0.01067977, "balance_loss_clip": 1.02348804, "balance_loss_mlp": 1.02574301, "epoch": 0.15325417105065384, "flos": 25663500627840.0, "grad_norm": 1.4432942664705732, "language_loss": 0.81832165, "learning_rate": 3.842745848783558e-06, "loss": 0.83995336, "num_input_tokens_seen": 55371050, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.6953125, "step": 2549, "time_per_iteration": 2.45752215385437 }, { "auxiliary_loss_clip": 0.01098374, "auxiliary_loss_mlp": 0.01065502, "balance_loss_clip": 1.02032161, "balance_loss_mlp": 1.02622664, "epoch": 0.1533142943033218, "flos": 18769830053760.0, "grad_norm": 1.6653555769892043, "language_loss": 0.75929648, "learning_rate": 3.842594437983917e-06, "loss": 0.78093523, "num_input_tokens_seen": 55390375, "router_z_loss_clip": 0.45117188, "router_z_loss_mlp": 0.71875, "step": 2550, "time_per_iteration": 5.203188419342041 }, { "auxiliary_loss_clip": 0.01100453, "auxiliary_loss_mlp": 0.01068569, "balance_loss_clip": 1.01981199, "balance_loss_mlp": 1.02835107, "epoch": 0.15337441755598977, "flos": 23106392161920.0, "grad_norm": 2.279158254129376, "language_loss": 0.79346585, "learning_rate": 3.8424429573129115e-06, "loss": 0.8151561, "num_input_tokens_seen": 55408890, "router_z_loss_clip": 0.48632812, "router_z_loss_mlp": 0.72265625, "step": 2551, "time_per_iteration": 2.416642665863037 }, { "auxiliary_loss_clip": 0.01024596, "auxiliary_loss_mlp": 0.01046489, "balance_loss_clip": 1.03967071, "balance_loss_mlp": 1.00470138, "epoch": 0.15343454080865776, "flos": 59857712703360.0, "grad_norm": 1.0364194003386662, "language_loss": 0.56765676, "learning_rate": 3.842291406776283e-06, "loss": 0.58836764, "num_input_tokens_seen": 55463815, "router_z_loss_clip": 0.06835938, "router_z_loss_mlp": 0.19921875, "step": 2552, "time_per_iteration": 2.96636700630188 }, { "auxiliary_loss_clip": 0.01099776, "auxiliary_loss_mlp": 0.0107233, "balance_loss_clip": 1.02500379, "balance_loss_mlp": 1.0277127, "epoch": 0.15349466406132573, "flos": 11910095187840.0, "grad_norm": 2.4014388423634045, "language_loss": 0.9034788, "learning_rate": 3.84213978637978e-06, "loss": 0.92519987, "num_input_tokens_seen": 55481050, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.71875, "step": 2553, "time_per_iteration": 2.3910927772521973 }, { "auxiliary_loss_clip": 0.01101591, "auxiliary_loss_mlp": 0.01072463, "balance_loss_clip": 1.02895141, "balance_loss_mlp": 1.02820694, "epoch": 0.1535547873139937, "flos": 24095795581440.0, "grad_norm": 2.1363456086996937, "language_loss": 0.80224639, "learning_rate": 3.841988096129152e-06, "loss": 0.82398689, "num_input_tokens_seen": 55500050, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.734375, "step": 2554, "time_per_iteration": 2.427932024002075 }, { "auxiliary_loss_clip": 0.01103302, "auxiliary_loss_mlp": 0.010858, "balance_loss_clip": 1.03763938, "balance_loss_mlp": 1.02909565, "epoch": 0.15361491056666166, "flos": 17565501104640.0, "grad_norm": 2.3367972699744013, "language_loss": 0.81216413, "learning_rate": 3.841836336030151e-06, "loss": 0.83405519, "num_input_tokens_seen": 55518125, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.7421875, "step": 2555, "time_per_iteration": 2.3645737171173096 }, { "auxiliary_loss_clip": 0.01096604, "auxiliary_loss_mlp": 0.01079506, "balance_loss_clip": 1.03747249, "balance_loss_mlp": 1.02723527, "epoch": 0.15367503381932962, "flos": 25044874513920.0, "grad_norm": 1.5878386172858978, "language_loss": 0.79242194, "learning_rate": 3.8416845060885305e-06, "loss": 0.81418312, "num_input_tokens_seen": 55540960, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 0.6953125, "step": 2556, "time_per_iteration": 2.4415009021759033 }, { "auxiliary_loss_clip": 0.01094195, "auxiliary_loss_mlp": 0.01067409, "balance_loss_clip": 1.02678275, "balance_loss_mlp": 1.02621567, "epoch": 0.15373515707199759, "flos": 21506252595840.0, "grad_norm": 1.7834831595398932, "language_loss": 0.92716771, "learning_rate": 3.84153260631005e-06, "loss": 0.94878376, "num_input_tokens_seen": 55559210, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.6796875, "step": 2557, "time_per_iteration": 2.40327525138855 }, { "auxiliary_loss_clip": 0.010992, "auxiliary_loss_mlp": 0.01069665, "balance_loss_clip": 1.02112246, "balance_loss_mlp": 1.02693439, "epoch": 0.15379528032466555, "flos": 25993534510080.0, "grad_norm": 3.622183165240118, "language_loss": 0.72086704, "learning_rate": 3.841380636700468e-06, "loss": 0.74255562, "num_input_tokens_seen": 55578925, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.72265625, "step": 2558, "time_per_iteration": 2.4307589530944824 }, { "auxiliary_loss_clip": 0.01098714, "auxiliary_loss_mlp": 0.01072301, "balance_loss_clip": 1.02614355, "balance_loss_mlp": 1.02772546, "epoch": 0.15385540357733354, "flos": 19276420014720.0, "grad_norm": 1.9605255719640426, "language_loss": 0.93350697, "learning_rate": 3.841228597265548e-06, "loss": 0.95521712, "num_input_tokens_seen": 55597255, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.7109375, "step": 2559, "time_per_iteration": 2.3948278427124023 }, { "auxiliary_loss_clip": 0.01096545, "auxiliary_loss_mlp": 0.01067064, "balance_loss_clip": 1.02042961, "balance_loss_mlp": 1.02639329, "epoch": 0.1539155268300015, "flos": 28547850067200.0, "grad_norm": 2.453192870595979, "language_loss": 0.6731596, "learning_rate": 3.841076488011055e-06, "loss": 0.69479567, "num_input_tokens_seen": 55619515, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 0.703125, "step": 2560, "time_per_iteration": 2.446274995803833 }, { "auxiliary_loss_clip": 0.0109657, "auxiliary_loss_mlp": 0.01075842, "balance_loss_clip": 1.02691865, "balance_loss_mlp": 1.02596188, "epoch": 0.15397565008266947, "flos": 23546821363200.0, "grad_norm": 1.6250910385109087, "language_loss": 0.90009302, "learning_rate": 3.8409243089427574e-06, "loss": 0.92181712, "num_input_tokens_seen": 55640050, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.703125, "step": 2561, "time_per_iteration": 2.481677532196045 }, { "auxiliary_loss_clip": 0.01093649, "auxiliary_loss_mlp": 0.01059406, "balance_loss_clip": 1.01930368, "balance_loss_mlp": 1.02710235, "epoch": 0.15403577333533744, "flos": 17128842330240.0, "grad_norm": 1.9682293613477904, "language_loss": 0.84695685, "learning_rate": 3.840772060066425e-06, "loss": 0.86848736, "num_input_tokens_seen": 55658695, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 0.6640625, "step": 2562, "time_per_iteration": 2.3705010414123535 }, { "auxiliary_loss_clip": 0.01104933, "auxiliary_loss_mlp": 0.01076273, "balance_loss_clip": 1.02572799, "balance_loss_mlp": 1.02979231, "epoch": 0.1540958965880054, "flos": 17893545039360.0, "grad_norm": 1.776985284325818, "language_loss": 0.76830292, "learning_rate": 3.840619741387832e-06, "loss": 0.790115, "num_input_tokens_seen": 55676340, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.75, "step": 2563, "time_per_iteration": 2.3822598457336426 }, { "auxiliary_loss_clip": 0.01096761, "auxiliary_loss_mlp": 0.01061446, "balance_loss_clip": 1.01709998, "balance_loss_mlp": 1.025455, "epoch": 0.15415601984067337, "flos": 32159684839680.0, "grad_norm": 1.9113635556428057, "language_loss": 0.79161245, "learning_rate": 3.8404673529127534e-06, "loss": 0.81319451, "num_input_tokens_seen": 55698890, "router_z_loss_clip": 0.44335938, "router_z_loss_mlp": 0.7109375, "step": 2564, "time_per_iteration": 2.5182526111602783 }, { "auxiliary_loss_clip": 0.01096085, "auxiliary_loss_mlp": 0.0106898, "balance_loss_clip": 1.02584982, "balance_loss_mlp": 1.02684999, "epoch": 0.15421614309334136, "flos": 24023280954240.0, "grad_norm": 2.183516921216014, "language_loss": 0.72840607, "learning_rate": 3.840314894646969e-06, "loss": 0.75005674, "num_input_tokens_seen": 55718535, "router_z_loss_clip": 0.43164062, "router_z_loss_mlp": 0.69140625, "step": 2565, "time_per_iteration": 2.44225811958313 }, { "auxiliary_loss_clip": 0.01097313, "auxiliary_loss_mlp": 0.01072384, "balance_loss_clip": 1.02744198, "balance_loss_mlp": 1.02783966, "epoch": 0.15427626634600933, "flos": 24385225685760.0, "grad_norm": 2.034402558322712, "language_loss": 0.7373898, "learning_rate": 3.840162366596259e-06, "loss": 0.75908673, "num_input_tokens_seen": 55738970, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.6953125, "step": 2566, "time_per_iteration": 2.426607131958008 }, { "auxiliary_loss_clip": 0.01093254, "auxiliary_loss_mlp": 0.01058925, "balance_loss_clip": 1.01982427, "balance_loss_mlp": 1.02594876, "epoch": 0.1543363895986773, "flos": 23330394645120.0, "grad_norm": 1.820940303541678, "language_loss": 0.86442018, "learning_rate": 3.840009768766408e-06, "loss": 0.88594198, "num_input_tokens_seen": 55759585, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.671875, "step": 2567, "time_per_iteration": 2.411522626876831 }, { "auxiliary_loss_clip": 0.01097132, "auxiliary_loss_mlp": 0.01063508, "balance_loss_clip": 1.02257109, "balance_loss_mlp": 1.02830207, "epoch": 0.15439651285134526, "flos": 24273294266880.0, "grad_norm": 1.9144231075705551, "language_loss": 0.80204099, "learning_rate": 3.839857101163202e-06, "loss": 0.82364738, "num_input_tokens_seen": 55779250, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.6875, "step": 2568, "time_per_iteration": 2.4023633003234863 }, { "auxiliary_loss_clip": 0.0109785, "auxiliary_loss_mlp": 0.01064498, "balance_loss_clip": 1.02024746, "balance_loss_mlp": 1.02825642, "epoch": 0.15445663610401322, "flos": 22455052237440.0, "grad_norm": 1.8937998798109774, "language_loss": 0.71557164, "learning_rate": 3.83970436379243e-06, "loss": 0.73719513, "num_input_tokens_seen": 55800470, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.6953125, "step": 2569, "time_per_iteration": 2.4714925289154053 }, { "auxiliary_loss_clip": 0.01094335, "auxiliary_loss_mlp": 0.01065552, "balance_loss_clip": 1.02297068, "balance_loss_mlp": 1.02727473, "epoch": 0.1545167593566812, "flos": 22048558744320.0, "grad_norm": 1.7500689344432683, "language_loss": 0.78234422, "learning_rate": 3.839551556659884e-06, "loss": 0.80394304, "num_input_tokens_seen": 55817795, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.671875, "step": 2570, "time_per_iteration": 2.3910834789276123 }, { "auxiliary_loss_clip": 0.01096604, "auxiliary_loss_mlp": 0.01064898, "balance_loss_clip": 1.02360368, "balance_loss_mlp": 1.0279547, "epoch": 0.15457688260934915, "flos": 19317233260800.0, "grad_norm": 2.0716643593386217, "language_loss": 0.79481471, "learning_rate": 3.839398679771359e-06, "loss": 0.81642973, "num_input_tokens_seen": 55836125, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.6875, "step": 2571, "time_per_iteration": 2.3895623683929443 }, { "auxiliary_loss_clip": 0.01096258, "auxiliary_loss_mlp": 0.01070475, "balance_loss_clip": 1.02786994, "balance_loss_mlp": 1.02694273, "epoch": 0.15463700586201715, "flos": 24132838400640.0, "grad_norm": 2.186632878312773, "language_loss": 0.84615272, "learning_rate": 3.839245733132652e-06, "loss": 0.86782002, "num_input_tokens_seen": 55855280, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.69140625, "step": 2572, "time_per_iteration": 2.4396603107452393 }, { "auxiliary_loss_clip": 0.0109753, "auxiliary_loss_mlp": 0.01075659, "balance_loss_clip": 1.0338167, "balance_loss_mlp": 1.02662945, "epoch": 0.1546971291146851, "flos": 22419789897600.0, "grad_norm": 1.555799664525543, "language_loss": 0.91988909, "learning_rate": 3.839092716749563e-06, "loss": 0.94162095, "num_input_tokens_seen": 55875695, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.7109375, "step": 2573, "time_per_iteration": 2.4209787845611572 }, { "auxiliary_loss_clip": 0.01097817, "auxiliary_loss_mlp": 0.0106894, "balance_loss_clip": 1.0257144, "balance_loss_mlp": 1.02670562, "epoch": 0.15475725236735308, "flos": 17529261246720.0, "grad_norm": 3.0262617267426966, "language_loss": 0.72338963, "learning_rate": 3.838939630627893e-06, "loss": 0.74505711, "num_input_tokens_seen": 55894575, "router_z_loss_clip": 0.43164062, "router_z_loss_mlp": 0.7109375, "step": 2574, "time_per_iteration": 2.38788104057312 }, { "auxiliary_loss_clip": 0.01097943, "auxiliary_loss_mlp": 0.0106947, "balance_loss_clip": 1.023193, "balance_loss_mlp": 1.02711785, "epoch": 0.15481737562002104, "flos": 22560734522880.0, "grad_norm": 1.5998806438172986, "language_loss": 0.8411774, "learning_rate": 3.838786474773448e-06, "loss": 0.86285156, "num_input_tokens_seen": 55912855, "router_z_loss_clip": 0.46289062, "router_z_loss_mlp": 0.70703125, "step": 2575, "time_per_iteration": 2.4068105220794678 }, { "auxiliary_loss_clip": 0.0109555, "auxiliary_loss_mlp": 0.01077061, "balance_loss_clip": 1.03281069, "balance_loss_mlp": 1.02546036, "epoch": 0.154877498872689, "flos": 24899391411840.0, "grad_norm": 1.8159503566104984, "language_loss": 0.86419511, "learning_rate": 3.838633249192036e-06, "loss": 0.88592124, "num_input_tokens_seen": 55932375, "router_z_loss_clip": 0.44335938, "router_z_loss_mlp": 0.703125, "step": 2576, "time_per_iteration": 2.4594900608062744 }, { "auxiliary_loss_clip": 0.01098122, "auxiliary_loss_mlp": 0.01066833, "balance_loss_clip": 1.02391768, "balance_loss_mlp": 1.02649486, "epoch": 0.15493762212535697, "flos": 28146244164480.0, "grad_norm": 1.622452302752956, "language_loss": 0.84554434, "learning_rate": 3.838479953889465e-06, "loss": 0.86719388, "num_input_tokens_seen": 55953970, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.71875, "step": 2577, "time_per_iteration": 2.4852817058563232 }, { "auxiliary_loss_clip": 0.01097698, "auxiliary_loss_mlp": 0.01081294, "balance_loss_clip": 1.03375316, "balance_loss_mlp": 1.02753055, "epoch": 0.15499774537802496, "flos": 25409891445120.0, "grad_norm": 2.1785750619947892, "language_loss": 0.7869215, "learning_rate": 3.8383265888715525e-06, "loss": 0.80871141, "num_input_tokens_seen": 55973120, "router_z_loss_clip": 0.47460938, "router_z_loss_mlp": 0.69921875, "step": 2578, "time_per_iteration": 2.4510843753814697 }, { "auxiliary_loss_clip": 0.01097543, "auxiliary_loss_mlp": 0.01076175, "balance_loss_clip": 1.02670348, "balance_loss_mlp": 1.02702403, "epoch": 0.15505786863069293, "flos": 22090454242560.0, "grad_norm": 1.7573737442840023, "language_loss": 0.83660018, "learning_rate": 3.83817315414411e-06, "loss": 0.85833734, "num_input_tokens_seen": 55993260, "router_z_loss_clip": 0.49414062, "router_z_loss_mlp": 0.70703125, "step": 2579, "time_per_iteration": 2.430471658706665 }, { "auxiliary_loss_clip": 0.01098023, "auxiliary_loss_mlp": 0.01078904, "balance_loss_clip": 1.0338428, "balance_loss_mlp": 1.02726936, "epoch": 0.1551179918833609, "flos": 18916116117120.0, "grad_norm": 1.5607441086364207, "language_loss": 0.82495558, "learning_rate": 3.838019649712958e-06, "loss": 0.84672493, "num_input_tokens_seen": 56012130, "router_z_loss_clip": 0.45117188, "router_z_loss_mlp": 0.7109375, "step": 2580, "time_per_iteration": 2.3770833015441895 }, { "auxiliary_loss_clip": 0.01042176, "auxiliary_loss_mlp": 0.01008939, "balance_loss_clip": 1.0023582, "balance_loss_mlp": 1.01274061, "epoch": 0.15517811513602886, "flos": 66235821096960.0, "grad_norm": 0.8458406064497984, "language_loss": 0.58942306, "learning_rate": 3.8378660755839166e-06, "loss": 0.60993421, "num_input_tokens_seen": 56079045, "router_z_loss_clip": 0.06591797, "router_z_loss_mlp": 0.29492188, "step": 2581, "time_per_iteration": 3.1661159992218018 }, { "auxiliary_loss_clip": 0.01100546, "auxiliary_loss_mlp": 0.01076247, "balance_loss_clip": 1.02870584, "balance_loss_mlp": 1.02673602, "epoch": 0.15523823838869683, "flos": 24020034197760.0, "grad_norm": 1.7861447766695535, "language_loss": 0.86469716, "learning_rate": 3.8377124317628095e-06, "loss": 0.88646507, "num_input_tokens_seen": 56098745, "router_z_loss_clip": 0.47460938, "router_z_loss_mlp": 0.73828125, "step": 2582, "time_per_iteration": 2.434481382369995 }, { "auxiliary_loss_clip": 0.01099081, "auxiliary_loss_mlp": 0.0107589, "balance_loss_clip": 1.02491593, "balance_loss_mlp": 1.02664554, "epoch": 0.1552983616413648, "flos": 20484030631680.0, "grad_norm": 2.320409293554752, "language_loss": 0.80251288, "learning_rate": 3.8375587182554625e-06, "loss": 0.82426262, "num_input_tokens_seen": 56117655, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.7265625, "step": 2583, "time_per_iteration": 2.4756081104278564 }, { "auxiliary_loss_clip": 0.01100372, "auxiliary_loss_mlp": 0.01065919, "balance_loss_clip": 1.01983285, "balance_loss_mlp": 1.0280323, "epoch": 0.15535848489403276, "flos": 32122362729600.0, "grad_norm": 1.7238357292521107, "language_loss": 0.77733183, "learning_rate": 3.837404935067705e-06, "loss": 0.79899478, "num_input_tokens_seen": 56141960, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.72265625, "step": 2584, "time_per_iteration": 2.509094476699829 }, { "auxiliary_loss_clip": 0.01098478, "auxiliary_loss_mlp": 0.01071619, "balance_loss_clip": 1.02229059, "balance_loss_mlp": 1.02719021, "epoch": 0.15541860814670075, "flos": 19097455052160.0, "grad_norm": 1.8437334872258608, "language_loss": 0.77844614, "learning_rate": 3.837251082205368e-06, "loss": 0.80014712, "num_input_tokens_seen": 56161430, "router_z_loss_clip": 0.49414062, "router_z_loss_mlp": 0.7109375, "step": 2585, "time_per_iteration": 3.8687052726745605 }, { "auxiliary_loss_clip": 0.01096942, "auxiliary_loss_mlp": 0.01072587, "balance_loss_clip": 1.02754974, "balance_loss_mlp": 1.02739859, "epoch": 0.1554787313993687, "flos": 19171086842880.0, "grad_norm": 3.2636896408155587, "language_loss": 0.63598025, "learning_rate": 3.837097159674286e-06, "loss": 0.65767562, "num_input_tokens_seen": 56179390, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.6953125, "step": 2586, "time_per_iteration": 2.423546314239502 }, { "auxiliary_loss_clip": 0.01098077, "auxiliary_loss_mlp": 0.01073369, "balance_loss_clip": 1.02542281, "balance_loss_mlp": 1.02625418, "epoch": 0.15553885465203668, "flos": 16142895135360.0, "grad_norm": 1.7942491504368887, "language_loss": 0.83843333, "learning_rate": 3.836943167480296e-06, "loss": 0.86014783, "num_input_tokens_seen": 56198020, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.71875, "step": 2587, "time_per_iteration": 3.7980542182922363 }, { "auxiliary_loss_clip": 0.01099563, "auxiliary_loss_mlp": 0.01071335, "balance_loss_clip": 1.02427125, "balance_loss_mlp": 1.02707887, "epoch": 0.15559897790470464, "flos": 25336608768000.0, "grad_norm": 1.8160141890613328, "language_loss": 0.9063884, "learning_rate": 3.836789105629236e-06, "loss": 0.92809737, "num_input_tokens_seen": 56218165, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 0.7265625, "step": 2588, "time_per_iteration": 2.4508678913116455 }, { "auxiliary_loss_clip": 0.01097904, "auxiliary_loss_mlp": 0.01078093, "balance_loss_clip": 1.02828765, "balance_loss_mlp": 1.0269134, "epoch": 0.1556591011573726, "flos": 23147659255680.0, "grad_norm": 2.4615164940096617, "language_loss": 0.66682875, "learning_rate": 3.83663497412695e-06, "loss": 0.68858874, "num_input_tokens_seen": 56237160, "router_z_loss_clip": 0.49804688, "router_z_loss_mlp": 0.7109375, "step": 2589, "time_per_iteration": 3.8832924365997314 }, { "auxiliary_loss_clip": 0.01095306, "auxiliary_loss_mlp": 0.01077039, "balance_loss_clip": 1.02747214, "balance_loss_mlp": 1.02510238, "epoch": 0.15571922441004057, "flos": 25369811337600.0, "grad_norm": 2.003641474081145, "language_loss": 0.8438338, "learning_rate": 3.836480772979281e-06, "loss": 0.86555719, "num_input_tokens_seen": 56257610, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.703125, "step": 2590, "time_per_iteration": 3.873076915740967 }, { "auxiliary_loss_clip": 0.0109792, "auxiliary_loss_mlp": 0.0107168, "balance_loss_clip": 1.02394867, "balance_loss_mlp": 1.02709472, "epoch": 0.15577934766270854, "flos": 14500510957440.0, "grad_norm": 40.3573028737164, "language_loss": 0.81230736, "learning_rate": 3.836326502192077e-06, "loss": 0.83400339, "num_input_tokens_seen": 56275215, "router_z_loss_clip": 0.47851562, "router_z_loss_mlp": 0.7109375, "step": 2591, "time_per_iteration": 2.4202780723571777 }, { "auxiliary_loss_clip": 0.01096003, "auxiliary_loss_mlp": 0.01075992, "balance_loss_clip": 1.0332675, "balance_loss_mlp": 1.02646804, "epoch": 0.15583947091537653, "flos": 37413031006080.0, "grad_norm": 2.4946715793412793, "language_loss": 0.68006468, "learning_rate": 3.836172161771189e-06, "loss": 0.70178461, "num_input_tokens_seen": 56297130, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.6953125, "step": 2592, "time_per_iteration": 2.5637879371643066 }, { "auxiliary_loss_clip": 0.01100202, "auxiliary_loss_mlp": 0.01079081, "balance_loss_clip": 1.03020549, "balance_loss_mlp": 1.02854741, "epoch": 0.1558995941680445, "flos": 21833668126080.0, "grad_norm": 2.1995989804263982, "language_loss": 0.84717542, "learning_rate": 3.836017751722467e-06, "loss": 0.86896825, "num_input_tokens_seen": 56314995, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.71484375, "step": 2593, "time_per_iteration": 2.4200408458709717 }, { "auxiliary_loss_clip": 0.01095047, "auxiliary_loss_mlp": 0.01078224, "balance_loss_clip": 1.02963424, "balance_loss_mlp": 1.02588654, "epoch": 0.15595971742071246, "flos": 19791598170240.0, "grad_norm": 1.9928576033483099, "language_loss": 0.7483117, "learning_rate": 3.8358632720517695e-06, "loss": 0.77004445, "num_input_tokens_seen": 56334005, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.69140625, "step": 2594, "time_per_iteration": 2.417465925216675 }, { "auxiliary_loss_clip": 0.01093761, "auxiliary_loss_mlp": 0.01074304, "balance_loss_clip": 1.02600074, "balance_loss_mlp": 1.02505457, "epoch": 0.15601984067338043, "flos": 26720984931840.0, "grad_norm": 1.9794451969064943, "language_loss": 0.83469123, "learning_rate": 3.835708722764952e-06, "loss": 0.85637188, "num_input_tokens_seen": 56353795, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.6875, "step": 2595, "time_per_iteration": 2.5143094062805176 }, { "auxiliary_loss_clip": 0.01098907, "auxiliary_loss_mlp": 0.01082905, "balance_loss_clip": 1.03171659, "balance_loss_mlp": 1.02644813, "epoch": 0.1560799639260484, "flos": 18368293973760.0, "grad_norm": 2.5570768471939864, "language_loss": 0.88448894, "learning_rate": 3.835554103867876e-06, "loss": 0.90630698, "num_input_tokens_seen": 56373195, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.7265625, "step": 2596, "time_per_iteration": 2.5827157497406006 }, { "auxiliary_loss_clip": 0.01094468, "auxiliary_loss_mlp": 0.01067886, "balance_loss_clip": 1.02244306, "balance_loss_mlp": 1.02636695, "epoch": 0.15614008717871636, "flos": 22597951898880.0, "grad_norm": 2.269797878176051, "language_loss": 0.70113981, "learning_rate": 3.835399415366404e-06, "loss": 0.7227633, "num_input_tokens_seen": 56391525, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 0.6796875, "step": 2597, "time_per_iteration": 2.6604318618774414 }, { "auxiliary_loss_clip": 0.01093514, "auxiliary_loss_mlp": 0.01069119, "balance_loss_clip": 1.02448761, "balance_loss_mlp": 1.02565885, "epoch": 0.15620021043138435, "flos": 22745774062080.0, "grad_norm": 1.7647168158664732, "language_loss": 0.80912727, "learning_rate": 3.8352446572664035e-06, "loss": 0.83075362, "num_input_tokens_seen": 56410715, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.6796875, "step": 2598, "time_per_iteration": 2.4874603748321533 }, { "auxiliary_loss_clip": 0.01093876, "auxiliary_loss_mlp": 0.01064244, "balance_loss_clip": 1.02070904, "balance_loss_mlp": 1.02676916, "epoch": 0.15626033368405232, "flos": 13114109934720.0, "grad_norm": 1.960505862975146, "language_loss": 0.83797705, "learning_rate": 3.8350898295737405e-06, "loss": 0.85955822, "num_input_tokens_seen": 56429170, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 0.671875, "step": 2599, "time_per_iteration": 2.3877322673797607 }, { "auxiliary_loss_clip": 0.01104908, "auxiliary_loss_mlp": 0.01076945, "balance_loss_clip": 1.02594745, "balance_loss_mlp": 1.02978063, "epoch": 0.15632045693672028, "flos": 16471358006400.0, "grad_norm": 1.962282710291464, "language_loss": 0.84407914, "learning_rate": 3.834934932294287e-06, "loss": 0.86589766, "num_input_tokens_seen": 56445685, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.75, "step": 2600, "time_per_iteration": 2.3838112354278564 }, { "auxiliary_loss_clip": 0.01099673, "auxiliary_loss_mlp": 0.01082818, "balance_loss_clip": 1.03353667, "balance_loss_mlp": 1.02814841, "epoch": 0.15638058018938825, "flos": 20849291942400.0, "grad_norm": 1.829847356188403, "language_loss": 0.90162408, "learning_rate": 3.834779965433917e-06, "loss": 0.92344898, "num_input_tokens_seen": 56465900, "router_z_loss_clip": 0.49414062, "router_z_loss_mlp": 0.71484375, "step": 2601, "time_per_iteration": 2.4428181648254395 }, { "auxiliary_loss_clip": 0.01101377, "auxiliary_loss_mlp": 0.01089257, "balance_loss_clip": 1.03685224, "balance_loss_mlp": 1.02728939, "epoch": 0.1564407034420562, "flos": 21871129881600.0, "grad_norm": 1.896950716803312, "language_loss": 0.8066892, "learning_rate": 3.834624928998508e-06, "loss": 0.82859552, "num_input_tokens_seen": 56485020, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.7421875, "step": 2602, "time_per_iteration": 2.4765241146087646 }, { "auxiliary_loss_clip": 0.01098214, "auxiliary_loss_mlp": 0.01074248, "balance_loss_clip": 1.02718413, "balance_loss_mlp": 1.02735484, "epoch": 0.15650082669472418, "flos": 21833493569280.0, "grad_norm": 2.188821973294658, "language_loss": 0.75559092, "learning_rate": 3.8344698229939376e-06, "loss": 0.7773155, "num_input_tokens_seen": 56505205, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 0.70703125, "step": 2603, "time_per_iteration": 2.4652647972106934 }, { "auxiliary_loss_clip": 0.01097853, "auxiliary_loss_mlp": 0.01072791, "balance_loss_clip": 1.02672815, "balance_loss_mlp": 1.02665043, "epoch": 0.15656094994739214, "flos": 13799909237760.0, "grad_norm": 4.021296249830068, "language_loss": 0.90792179, "learning_rate": 3.8343146474260865e-06, "loss": 0.92962825, "num_input_tokens_seen": 56521495, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.7109375, "step": 2604, "time_per_iteration": 2.4414494037628174 }, { "auxiliary_loss_clip": 0.0109881, "auxiliary_loss_mlp": 0.01079018, "balance_loss_clip": 1.02904534, "balance_loss_mlp": 1.02626705, "epoch": 0.15662107320006013, "flos": 27306967057920.0, "grad_norm": 2.0282952524323554, "language_loss": 0.86962706, "learning_rate": 3.834159402300841e-06, "loss": 0.89140534, "num_input_tokens_seen": 56540665, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.7265625, "step": 2605, "time_per_iteration": 2.505866765975952 }, { "auxiliary_loss_clip": 0.01102049, "auxiliary_loss_mlp": 0.01073818, "balance_loss_clip": 1.02048373, "balance_loss_mlp": 1.02715468, "epoch": 0.1566811964527281, "flos": 26683942112640.0, "grad_norm": 2.088027107330147, "language_loss": 0.75297964, "learning_rate": 3.834004087624087e-06, "loss": 0.77473825, "num_input_tokens_seen": 56560805, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.75, "step": 2606, "time_per_iteration": 2.498234510421753 }, { "auxiliary_loss_clip": 0.0110058, "auxiliary_loss_mlp": 0.01070388, "balance_loss_clip": 1.02268052, "balance_loss_mlp": 1.03020382, "epoch": 0.15674131970539606, "flos": 16102605559680.0, "grad_norm": 2.2071101273572737, "language_loss": 0.78361607, "learning_rate": 3.8338487034017145e-06, "loss": 0.80532575, "num_input_tokens_seen": 56576335, "router_z_loss_clip": 0.47851562, "router_z_loss_mlp": 0.703125, "step": 2607, "time_per_iteration": 2.39552640914917 }, { "auxiliary_loss_clip": 0.01095934, "auxiliary_loss_mlp": 0.01076469, "balance_loss_clip": 1.02997768, "balance_loss_mlp": 1.02711856, "epoch": 0.15680144295806403, "flos": 19168747781760.0, "grad_norm": 1.726709307908285, "language_loss": 0.83852005, "learning_rate": 3.833693249639615e-06, "loss": 0.86024415, "num_input_tokens_seen": 56595880, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 0.6875, "step": 2608, "time_per_iteration": 2.3945152759552 }, { "auxiliary_loss_clip": 0.01101603, "auxiliary_loss_mlp": 0.0107635, "balance_loss_clip": 1.02232409, "balance_loss_mlp": 1.02746892, "epoch": 0.156861566210732, "flos": 20812388768640.0, "grad_norm": 1.719671566334231, "language_loss": 0.74420464, "learning_rate": 3.833537726343684e-06, "loss": 0.76598418, "num_input_tokens_seen": 56615130, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.7421875, "step": 2609, "time_per_iteration": 2.413965940475464 }, { "auxiliary_loss_clip": 0.01098867, "auxiliary_loss_mlp": 0.01075632, "balance_loss_clip": 1.02666104, "balance_loss_mlp": 1.02619791, "epoch": 0.15692168946339996, "flos": 20046883098240.0, "grad_norm": 1.851824119612525, "language_loss": 0.74521577, "learning_rate": 3.833382133519818e-06, "loss": 0.76696068, "num_input_tokens_seen": 56634005, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.7265625, "step": 2610, "time_per_iteration": 2.4074745178222656 }, { "auxiliary_loss_clip": 0.01098668, "auxiliary_loss_mlp": 0.010764, "balance_loss_clip": 1.02437758, "balance_loss_mlp": 1.02639592, "epoch": 0.15698181271606793, "flos": 21396939528960.0, "grad_norm": 1.8556924020672763, "language_loss": 0.744973, "learning_rate": 3.833226471173919e-06, "loss": 0.76672369, "num_input_tokens_seen": 56653480, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 0.72265625, "step": 2611, "time_per_iteration": 2.4300589561462402 }, { "auxiliary_loss_clip": 0.01096871, "auxiliary_loss_mlp": 0.01076701, "balance_loss_clip": 1.02765834, "balance_loss_mlp": 1.02564597, "epoch": 0.15704193596873592, "flos": 20844858199680.0, "grad_norm": 2.0675315764762137, "language_loss": 0.72220063, "learning_rate": 3.833070739311887e-06, "loss": 0.74393636, "num_input_tokens_seen": 56672270, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 0.7109375, "step": 2612, "time_per_iteration": 2.4165761470794678 }, { "auxiliary_loss_clip": 0.01098907, "auxiliary_loss_mlp": 0.01072652, "balance_loss_clip": 1.02451539, "balance_loss_mlp": 1.02693939, "epoch": 0.15710205922140388, "flos": 21761816814720.0, "grad_norm": 1.9217376755711606, "language_loss": 0.78542399, "learning_rate": 3.83291493793963e-06, "loss": 0.80713964, "num_input_tokens_seen": 56691510, "router_z_loss_clip": 0.48242188, "router_z_loss_mlp": 0.71875, "step": 2613, "time_per_iteration": 2.459002733230591 }, { "auxiliary_loss_clip": 0.01100126, "auxiliary_loss_mlp": 0.01087867, "balance_loss_clip": 1.03431821, "balance_loss_mlp": 1.02644718, "epoch": 0.15716218247407185, "flos": 25006644708480.0, "grad_norm": 1.7780168945260628, "language_loss": 0.68102163, "learning_rate": 3.832759067063055e-06, "loss": 0.7029016, "num_input_tokens_seen": 56712230, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 0.73828125, "step": 2614, "time_per_iteration": 2.4510717391967773 }, { "auxiliary_loss_clip": 0.01102961, "auxiliary_loss_mlp": 0.01080912, "balance_loss_clip": 1.02674294, "balance_loss_mlp": 1.02846146, "epoch": 0.1572223057267398, "flos": 20190795189120.0, "grad_norm": 2.3061352287113133, "language_loss": 0.77515626, "learning_rate": 3.832603126688072e-06, "loss": 0.79699504, "num_input_tokens_seen": 56727490, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.74609375, "step": 2615, "time_per_iteration": 2.424018383026123 }, { "auxiliary_loss_clip": 0.01095458, "auxiliary_loss_mlp": 0.01081897, "balance_loss_clip": 1.03421307, "balance_loss_mlp": 1.02550828, "epoch": 0.15728242897940778, "flos": 20958465363840.0, "grad_norm": 1.616655039620417, "language_loss": 0.74437958, "learning_rate": 3.832447116820594e-06, "loss": 0.7661531, "num_input_tokens_seen": 56747385, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.69921875, "step": 2616, "time_per_iteration": 2.482408285140991 }, { "auxiliary_loss_clip": 0.01098058, "auxiliary_loss_mlp": 0.01079693, "balance_loss_clip": 1.03105593, "balance_loss_mlp": 1.02611196, "epoch": 0.15734255223207574, "flos": 23037194113920.0, "grad_norm": 2.8715599155733815, "language_loss": 0.74388766, "learning_rate": 3.832291037466539e-06, "loss": 0.76566517, "num_input_tokens_seen": 56768055, "router_z_loss_clip": 0.48632812, "router_z_loss_mlp": 0.71875, "step": 2617, "time_per_iteration": 2.4293770790100098 }, { "auxiliary_loss_clip": 0.01097798, "auxiliary_loss_mlp": 0.01073883, "balance_loss_clip": 1.02634239, "balance_loss_mlp": 1.02641308, "epoch": 0.15740267548474374, "flos": 20550435770880.0, "grad_norm": 2.037959970366465, "language_loss": 0.76729262, "learning_rate": 3.8321348886318235e-06, "loss": 0.78900945, "num_input_tokens_seen": 56785110, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.71484375, "step": 2618, "time_per_iteration": 2.395024538040161 }, { "auxiliary_loss_clip": 0.01102677, "auxiliary_loss_mlp": 0.01073903, "balance_loss_clip": 1.02049708, "balance_loss_mlp": 1.02666175, "epoch": 0.1574627987374117, "flos": 22666032783360.0, "grad_norm": 1.879084371129764, "language_loss": 0.80936933, "learning_rate": 3.8319786703223695e-06, "loss": 0.83113503, "num_input_tokens_seen": 56804975, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 0.76171875, "step": 2619, "time_per_iteration": 2.4555461406707764 }, { "auxiliary_loss_clip": 0.01099022, "auxiliary_loss_mlp": 0.01071718, "balance_loss_clip": 1.02477384, "balance_loss_mlp": 1.02842295, "epoch": 0.15752292199007967, "flos": 16799716143360.0, "grad_norm": 2.748983273564901, "language_loss": 0.78752613, "learning_rate": 3.831822382544101e-06, "loss": 0.80923355, "num_input_tokens_seen": 56822470, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 0.70703125, "step": 2620, "time_per_iteration": 2.426804542541504 }, { "auxiliary_loss_clip": 0.01101798, "auxiliary_loss_mlp": 0.01078507, "balance_loss_clip": 1.02348006, "balance_loss_mlp": 1.02762103, "epoch": 0.15758304524274763, "flos": 29824693643520.0, "grad_norm": 1.7943109274642832, "language_loss": 0.72698474, "learning_rate": 3.831666025302944e-06, "loss": 0.74878782, "num_input_tokens_seen": 56842100, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.7421875, "step": 2621, "time_per_iteration": 2.5202910900115967 }, { "auxiliary_loss_clip": 0.0110304, "auxiliary_loss_mlp": 0.01080069, "balance_loss_clip": 1.02928591, "balance_loss_mlp": 1.02807271, "epoch": 0.1576431684954156, "flos": 53575478369280.0, "grad_norm": 1.900484935239961, "language_loss": 0.74166363, "learning_rate": 3.831509598604828e-06, "loss": 0.76349473, "num_input_tokens_seen": 56865920, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.75, "step": 2622, "time_per_iteration": 2.789544105529785 }, { "auxiliary_loss_clip": 0.01100957, "auxiliary_loss_mlp": 0.01067971, "balance_loss_clip": 1.02104998, "balance_loss_mlp": 1.02849829, "epoch": 0.15770329174808356, "flos": 20812563325440.0, "grad_norm": 1.8235500441917967, "language_loss": 0.89836836, "learning_rate": 3.831353102455684e-06, "loss": 0.92005765, "num_input_tokens_seen": 56885265, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 0.72265625, "step": 2623, "time_per_iteration": 2.437532663345337 }, { "auxiliary_loss_clip": 0.01096214, "auxiliary_loss_mlp": 0.01073358, "balance_loss_clip": 1.02670002, "balance_loss_mlp": 1.02628982, "epoch": 0.15776341500075153, "flos": 24972813734400.0, "grad_norm": 1.666040177322077, "language_loss": 0.83315217, "learning_rate": 3.831196536861448e-06, "loss": 0.85484785, "num_input_tokens_seen": 56906710, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 0.69921875, "step": 2624, "time_per_iteration": 2.446948289871216 }, { "auxiliary_loss_clip": 0.01101544, "auxiliary_loss_mlp": 0.0108034, "balance_loss_clip": 1.02402544, "balance_loss_mlp": 1.02624452, "epoch": 0.15782353825341952, "flos": 21906846069120.0, "grad_norm": 2.0986777501414937, "language_loss": 0.83725595, "learning_rate": 3.831039901828054e-06, "loss": 0.85907477, "num_input_tokens_seen": 56924275, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.75390625, "step": 2625, "time_per_iteration": 3.9176290035247803 }, { "auxiliary_loss_clip": 0.01100739, "auxiliary_loss_mlp": 0.01072548, "balance_loss_clip": 1.02233672, "balance_loss_mlp": 1.02728963, "epoch": 0.15788366150608749, "flos": 26175990608640.0, "grad_norm": 2.3956552405422746, "language_loss": 0.8162238, "learning_rate": 3.830883197361445e-06, "loss": 0.83795667, "num_input_tokens_seen": 56941525, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 0.734375, "step": 2626, "time_per_iteration": 2.4363486766815186 }, { "auxiliary_loss_clip": 0.01101219, "auxiliary_loss_mlp": 0.01067194, "balance_loss_clip": 1.01810384, "balance_loss_mlp": 1.02912283, "epoch": 0.15794378475875545, "flos": 27708572960640.0, "grad_norm": 1.7190558571701282, "language_loss": 0.76267004, "learning_rate": 3.830726423467561e-06, "loss": 0.78435415, "num_input_tokens_seen": 56962145, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 0.72265625, "step": 2627, "time_per_iteration": 3.8809659481048584 }, { "auxiliary_loss_clip": 0.01101071, "auxiliary_loss_mlp": 0.01081705, "balance_loss_clip": 1.02806103, "balance_loss_mlp": 1.02695453, "epoch": 0.15800390801142342, "flos": 12129349726080.0, "grad_norm": 2.124684347500375, "language_loss": 0.87497663, "learning_rate": 3.830569580152348e-06, "loss": 0.89680433, "num_input_tokens_seen": 56977505, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 0.7421875, "step": 2628, "time_per_iteration": 2.4015252590179443 }, { "auxiliary_loss_clip": 0.01096136, "auxiliary_loss_mlp": 0.01064989, "balance_loss_clip": 1.01928425, "balance_loss_mlp": 1.02456725, "epoch": 0.15806403126409138, "flos": 20703669194880.0, "grad_norm": 3.607243813166436, "language_loss": 0.7866255, "learning_rate": 3.830412667421752e-06, "loss": 0.80823678, "num_input_tokens_seen": 56996770, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.71875, "step": 2629, "time_per_iteration": 5.244709730148315 }, { "auxiliary_loss_clip": 0.01100004, "auxiliary_loss_mlp": 0.01098151, "balance_loss_clip": 1.04219413, "balance_loss_mlp": 1.02702641, "epoch": 0.15812415451675935, "flos": 17820751121280.0, "grad_norm": 2.5815700806940867, "language_loss": 0.76315856, "learning_rate": 3.8302556852817245e-06, "loss": 0.78514016, "num_input_tokens_seen": 57014970, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.73046875, "step": 2630, "time_per_iteration": 2.4157791137695312 }, { "auxiliary_loss_clip": 0.01100199, "auxiliary_loss_mlp": 0.01076289, "balance_loss_clip": 1.02495754, "balance_loss_mlp": 1.02617908, "epoch": 0.15818427776942734, "flos": 20083018222080.0, "grad_norm": 2.1344526814547753, "language_loss": 0.86044872, "learning_rate": 3.8300986337382184e-06, "loss": 0.88221359, "num_input_tokens_seen": 57034045, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.7421875, "step": 2631, "time_per_iteration": 2.472343683242798 }, { "auxiliary_loss_clip": 0.01099275, "auxiliary_loss_mlp": 0.01075192, "balance_loss_clip": 1.02226305, "balance_loss_mlp": 1.0258863, "epoch": 0.1582444010220953, "flos": 21213855025920.0, "grad_norm": 1.6551893914505127, "language_loss": 0.81332576, "learning_rate": 3.8299415127971895e-06, "loss": 0.83507037, "num_input_tokens_seen": 57053695, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.734375, "step": 2632, "time_per_iteration": 2.442218065261841 }, { "auxiliary_loss_clip": 0.01102179, "auxiliary_loss_mlp": 0.01083126, "balance_loss_clip": 1.02883828, "balance_loss_mlp": 1.0264976, "epoch": 0.15830452427476327, "flos": 17857375004160.0, "grad_norm": 1.8736910905682398, "language_loss": 0.84556162, "learning_rate": 3.829784322464594e-06, "loss": 0.86741471, "num_input_tokens_seen": 57071290, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 0.7578125, "step": 2633, "time_per_iteration": 2.387376308441162 }, { "auxiliary_loss_clip": 0.01101787, "auxiliary_loss_mlp": 0.01072932, "balance_loss_clip": 1.02365136, "balance_loss_mlp": 1.02760816, "epoch": 0.15836464752743123, "flos": 24533815898880.0, "grad_norm": 1.7359865077593086, "language_loss": 0.79173571, "learning_rate": 3.829627062746394e-06, "loss": 0.81348288, "num_input_tokens_seen": 57091465, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 0.7421875, "step": 2634, "time_per_iteration": 2.438629388809204 }, { "auxiliary_loss_clip": 0.01102009, "auxiliary_loss_mlp": 0.01083705, "balance_loss_clip": 1.02626967, "balance_loss_mlp": 1.02485025, "epoch": 0.1584247707800992, "flos": 20119781750400.0, "grad_norm": 2.039671432380225, "language_loss": 0.90512645, "learning_rate": 3.829469733648552e-06, "loss": 0.92698354, "num_input_tokens_seen": 57110075, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.7734375, "step": 2635, "time_per_iteration": 2.4059805870056152 }, { "auxiliary_loss_clip": 0.0110019, "auxiliary_loss_mlp": 0.01072048, "balance_loss_clip": 1.02114594, "balance_loss_mlp": 1.02440238, "epoch": 0.15848489403276717, "flos": 20374927032960.0, "grad_norm": 1.8074476311024006, "language_loss": 0.78704494, "learning_rate": 3.829312335177034e-06, "loss": 0.80876732, "num_input_tokens_seen": 57128945, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 0.7578125, "step": 2636, "time_per_iteration": 2.427769184112549 }, { "auxiliary_loss_clip": 0.01103503, "auxiliary_loss_mlp": 0.01078265, "balance_loss_clip": 1.02192688, "balance_loss_mlp": 1.02711892, "epoch": 0.15854501728543513, "flos": 39345368958720.0, "grad_norm": 2.3115574486844093, "language_loss": 0.74972999, "learning_rate": 3.82915486733781e-06, "loss": 0.77154768, "num_input_tokens_seen": 57152385, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.765625, "step": 2637, "time_per_iteration": 2.57893967628479 }, { "auxiliary_loss_clip": 0.01097539, "auxiliary_loss_mlp": 0.01075928, "balance_loss_clip": 1.02657592, "balance_loss_mlp": 1.02479827, "epoch": 0.15860514053810312, "flos": 24863046819840.0, "grad_norm": 1.9576004329653567, "language_loss": 0.79636997, "learning_rate": 3.82899733013685e-06, "loss": 0.81810462, "num_input_tokens_seen": 57172620, "router_z_loss_clip": 0.49414062, "router_z_loss_mlp": 0.7265625, "step": 2638, "time_per_iteration": 2.4371321201324463 }, { "auxiliary_loss_clip": 0.0109894, "auxiliary_loss_mlp": 0.01082489, "balance_loss_clip": 1.02927375, "balance_loss_mlp": 1.02595854, "epoch": 0.1586652637907711, "flos": 26176479367680.0, "grad_norm": 3.104627863596063, "language_loss": 0.76812345, "learning_rate": 3.828839723580128e-06, "loss": 0.78993773, "num_input_tokens_seen": 57194680, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.73046875, "step": 2639, "time_per_iteration": 2.4586029052734375 }, { "auxiliary_loss_clip": 0.01101195, "auxiliary_loss_mlp": 0.01085621, "balance_loss_clip": 1.03331232, "balance_loss_mlp": 1.02687442, "epoch": 0.15872538704343905, "flos": 19791039588480.0, "grad_norm": 1.8040361415833037, "language_loss": 0.83456886, "learning_rate": 3.82868204767362e-06, "loss": 0.85643703, "num_input_tokens_seen": 57214675, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.7421875, "step": 2640, "time_per_iteration": 2.41383695602417 }, { "auxiliary_loss_clip": 0.01098009, "auxiliary_loss_mlp": 0.0106839, "balance_loss_clip": 1.02318597, "balance_loss_mlp": 1.02549183, "epoch": 0.15878551029610702, "flos": 28474113542400.0, "grad_norm": 1.424396915022408, "language_loss": 0.6830225, "learning_rate": 3.828524302423306e-06, "loss": 0.70468652, "num_input_tokens_seen": 57235830, "router_z_loss_clip": 0.45117188, "router_z_loss_mlp": 0.7265625, "step": 2641, "time_per_iteration": 2.4977784156799316 }, { "auxiliary_loss_clip": 0.01105077, "auxiliary_loss_mlp": 0.01080039, "balance_loss_clip": 1.03056765, "balance_loss_mlp": 1.02823091, "epoch": 0.15884563354877498, "flos": 24205562496000.0, "grad_norm": 2.371099570402696, "language_loss": 0.78178591, "learning_rate": 3.828366487835167e-06, "loss": 0.80363709, "num_input_tokens_seen": 57255970, "router_z_loss_clip": 0.49414062, "router_z_loss_mlp": 0.76953125, "step": 2642, "time_per_iteration": 2.442441701889038 }, { "auxiliary_loss_clip": 0.01094477, "auxiliary_loss_mlp": 0.01069461, "balance_loss_clip": 1.02478147, "balance_loss_mlp": 1.02654958, "epoch": 0.15890575680144295, "flos": 23948706556800.0, "grad_norm": 1.8333506951786191, "language_loss": 0.71590292, "learning_rate": 3.828208603915186e-06, "loss": 0.73754227, "num_input_tokens_seen": 57274435, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.6796875, "step": 2643, "time_per_iteration": 2.4454565048217773 }, { "auxiliary_loss_clip": 0.01095029, "auxiliary_loss_mlp": 0.01061837, "balance_loss_clip": 1.01880217, "balance_loss_mlp": 1.02706289, "epoch": 0.15896588005411091, "flos": 21213959760000.0, "grad_norm": 2.058746810669545, "language_loss": 0.80580717, "learning_rate": 3.828050650669353e-06, "loss": 0.82737583, "num_input_tokens_seen": 57293115, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.6796875, "step": 2644, "time_per_iteration": 2.425658941268921 }, { "auxiliary_loss_clip": 0.01093061, "auxiliary_loss_mlp": 0.0105943, "balance_loss_clip": 1.01811218, "balance_loss_mlp": 1.02446723, "epoch": 0.1590260033067789, "flos": 24351255066240.0, "grad_norm": 2.0787927131270343, "language_loss": 0.83960801, "learning_rate": 3.827892628103657e-06, "loss": 0.86113292, "num_input_tokens_seen": 57312565, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.68359375, "step": 2645, "time_per_iteration": 2.439939022064209 }, { "auxiliary_loss_clip": 0.01095936, "auxiliary_loss_mlp": 0.01068466, "balance_loss_clip": 1.02273715, "balance_loss_mlp": 1.02590823, "epoch": 0.15908612655944687, "flos": 32047648686720.0, "grad_norm": 1.9431893168702854, "language_loss": 0.71780741, "learning_rate": 3.827734536224087e-06, "loss": 0.73945141, "num_input_tokens_seen": 57333360, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.69921875, "step": 2646, "time_per_iteration": 2.4849064350128174 }, { "auxiliary_loss_clip": 0.0109283, "auxiliary_loss_mlp": 0.01064616, "balance_loss_clip": 1.01855385, "balance_loss_mlp": 1.02574337, "epoch": 0.15914624981211484, "flos": 17784406529280.0, "grad_norm": 14.303537823184502, "language_loss": 0.65621138, "learning_rate": 3.827576375036642e-06, "loss": 0.67778587, "num_input_tokens_seen": 57350575, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.671875, "step": 2647, "time_per_iteration": 2.377608299255371 }, { "auxiliary_loss_clip": 0.01095679, "auxiliary_loss_mlp": 0.01061849, "balance_loss_clip": 1.01716948, "balance_loss_mlp": 1.02752876, "epoch": 0.1592063730647828, "flos": 17711542788480.0, "grad_norm": 2.1753340263319854, "language_loss": 0.91483533, "learning_rate": 3.827418144547318e-06, "loss": 0.93641061, "num_input_tokens_seen": 57367570, "router_z_loss_clip": 0.44726562, "router_z_loss_mlp": 0.6796875, "step": 2648, "time_per_iteration": 2.407356023788452 }, { "auxiliary_loss_clip": 0.01091264, "auxiliary_loss_mlp": 0.01063141, "balance_loss_clip": 1.02003467, "balance_loss_mlp": 1.02510774, "epoch": 0.15926649631745077, "flos": 18802648598400.0, "grad_norm": 1.7810082400528613, "language_loss": 0.92982066, "learning_rate": 3.827259844762114e-06, "loss": 0.9513647, "num_input_tokens_seen": 57383980, "router_z_loss_clip": 0.43164062, "router_z_loss_mlp": 0.6640625, "step": 2649, "time_per_iteration": 2.3983843326568604 }, { "auxiliary_loss_clip": 0.01099971, "auxiliary_loss_mlp": 0.01071355, "balance_loss_clip": 1.02543569, "balance_loss_mlp": 1.02592516, "epoch": 0.15932661957011873, "flos": 17565291636480.0, "grad_norm": 2.2682701147948516, "language_loss": 0.7460053, "learning_rate": 3.827101475687033e-06, "loss": 0.76771855, "num_input_tokens_seen": 57400840, "router_z_loss_clip": 0.45898438, "router_z_loss_mlp": 0.7421875, "step": 2650, "time_per_iteration": 2.375493049621582 }, { "auxiliary_loss_clip": 0.01091331, "auxiliary_loss_mlp": 0.01055527, "balance_loss_clip": 1.01919222, "balance_loss_mlp": 1.0263145, "epoch": 0.15938674282278673, "flos": 13333504118400.0, "grad_norm": 1.971952551312726, "language_loss": 0.73300397, "learning_rate": 3.826943037328082e-06, "loss": 0.75447249, "num_input_tokens_seen": 57419230, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.6484375, "step": 2651, "time_per_iteration": 2.3750319480895996 }, { "auxiliary_loss_clip": 0.01096327, "auxiliary_loss_mlp": 0.01066337, "balance_loss_clip": 1.02091849, "balance_loss_mlp": 1.02716303, "epoch": 0.1594468660754547, "flos": 22487835870720.0, "grad_norm": 1.8996291003317622, "language_loss": 0.81735992, "learning_rate": 3.8267845296912674e-06, "loss": 0.83898652, "num_input_tokens_seen": 57439315, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 0.69140625, "step": 2652, "time_per_iteration": 2.4049973487854004 }, { "auxiliary_loss_clip": 0.01089914, "auxiliary_loss_mlp": 0.01063354, "balance_loss_clip": 1.02232242, "balance_loss_mlp": 1.02589393, "epoch": 0.15950698932812266, "flos": 15006577248000.0, "grad_norm": 4.170650128181991, "language_loss": 0.72224247, "learning_rate": 3.826625952782601e-06, "loss": 0.74377519, "num_input_tokens_seen": 57454635, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.640625, "step": 2653, "time_per_iteration": 2.3639533519744873 }, { "auxiliary_loss_clip": 0.01093727, "auxiliary_loss_mlp": 0.01065346, "balance_loss_clip": 1.02343178, "balance_loss_mlp": 1.02631128, "epoch": 0.15956711258079062, "flos": 30153715096320.0, "grad_norm": 4.575782056338166, "language_loss": 0.80152208, "learning_rate": 3.826467306608095e-06, "loss": 0.82311285, "num_input_tokens_seen": 57476805, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.67578125, "step": 2654, "time_per_iteration": 2.4609246253967285 }, { "auxiliary_loss_clip": 0.01089253, "auxiliary_loss_mlp": 0.01060252, "balance_loss_clip": 1.01812387, "balance_loss_mlp": 1.02335906, "epoch": 0.1596272358334586, "flos": 21031643306880.0, "grad_norm": 1.7388208505243963, "language_loss": 0.83851707, "learning_rate": 3.826308591173765e-06, "loss": 0.86001211, "num_input_tokens_seen": 57496400, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.66015625, "step": 2655, "time_per_iteration": 2.403367280960083 }, { "auxiliary_loss_clip": 0.01091779, "auxiliary_loss_mlp": 0.01055803, "balance_loss_clip": 1.01620209, "balance_loss_mlp": 1.02450609, "epoch": 0.15968735908612655, "flos": 15267133791360.0, "grad_norm": 3.5150680980185114, "language_loss": 0.7570827, "learning_rate": 3.826149806485631e-06, "loss": 0.77855849, "num_input_tokens_seen": 57513700, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.671875, "step": 2656, "time_per_iteration": 2.3946263790130615 }, { "auxiliary_loss_clip": 0.01089642, "auxiliary_loss_mlp": 0.01054335, "balance_loss_clip": 1.01680779, "balance_loss_mlp": 1.02524066, "epoch": 0.15974748233879452, "flos": 52663791369600.0, "grad_norm": 1.7684650600710579, "language_loss": 0.79660714, "learning_rate": 3.825990952549713e-06, "loss": 0.81804693, "num_input_tokens_seen": 57536180, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.64453125, "step": 2657, "time_per_iteration": 2.674722671508789 }, { "auxiliary_loss_clip": 0.01092017, "auxiliary_loss_mlp": 0.01060128, "balance_loss_clip": 1.02262449, "balance_loss_mlp": 1.02665114, "epoch": 0.1598076055914625, "flos": 18732263564160.0, "grad_norm": 1.841096095247121, "language_loss": 0.76384759, "learning_rate": 3.825832029372035e-06, "loss": 0.78536898, "num_input_tokens_seen": 57555025, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.65234375, "step": 2658, "time_per_iteration": 2.3875274658203125 }, { "auxiliary_loss_clip": 0.01091214, "auxiliary_loss_mlp": 0.0106498, "balance_loss_clip": 1.02232742, "balance_loss_mlp": 1.02546239, "epoch": 0.15986772884413047, "flos": 34347831390720.0, "grad_norm": 1.739787938833924, "language_loss": 0.76943326, "learning_rate": 3.825673036958624e-06, "loss": 0.79099524, "num_input_tokens_seen": 57577660, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.65625, "step": 2659, "time_per_iteration": 2.501847267150879 }, { "auxiliary_loss_clip": 0.01090822, "auxiliary_loss_mlp": 0.01064629, "balance_loss_clip": 1.02431226, "balance_loss_mlp": 1.02535343, "epoch": 0.15992785209679844, "flos": 22053865271040.0, "grad_norm": 1.9660373099506043, "language_loss": 0.92782974, "learning_rate": 3.825513975315508e-06, "loss": 0.94938421, "num_input_tokens_seen": 57596335, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.65625, "step": 2660, "time_per_iteration": 2.396108627319336 }, { "auxiliary_loss_clip": 0.01094898, "auxiliary_loss_mlp": 0.01068581, "balance_loss_clip": 1.02695322, "balance_loss_mlp": 1.02805352, "epoch": 0.1599879753494664, "flos": 33065436908160.0, "grad_norm": 2.5568746089322074, "language_loss": 0.79423976, "learning_rate": 3.82535484444872e-06, "loss": 0.81587458, "num_input_tokens_seen": 57616830, "router_z_loss_clip": 0.41601562, "router_z_loss_mlp": 0.66796875, "step": 2661, "time_per_iteration": 2.5061111450195312 }, { "auxiliary_loss_clip": 0.01089635, "auxiliary_loss_mlp": 0.01056961, "balance_loss_clip": 1.0154283, "balance_loss_mlp": 1.02363503, "epoch": 0.16004809860213437, "flos": 28036756540800.0, "grad_norm": 1.7401096682432593, "language_loss": 0.75250876, "learning_rate": 3.825195644364292e-06, "loss": 0.77397478, "num_input_tokens_seen": 57635515, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.66015625, "step": 2662, "time_per_iteration": 2.466242551803589 }, { "auxiliary_loss_clip": 0.01092225, "auxiliary_loss_mlp": 0.01064052, "balance_loss_clip": 1.02383065, "balance_loss_mlp": 1.02566361, "epoch": 0.16010822185480234, "flos": 22779116277120.0, "grad_norm": 1.7482026492216716, "language_loss": 0.84508014, "learning_rate": 3.825036375068263e-06, "loss": 0.86664289, "num_input_tokens_seen": 57654250, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.6640625, "step": 2663, "time_per_iteration": 2.3994972705841064 }, { "auxiliary_loss_clip": 0.0109474, "auxiliary_loss_mlp": 0.0106391, "balance_loss_clip": 1.024333, "balance_loss_mlp": 1.0273664, "epoch": 0.16016834510747033, "flos": 20082983310720.0, "grad_norm": 1.988743138826804, "language_loss": 0.8231287, "learning_rate": 3.824877036566672e-06, "loss": 0.84471524, "num_input_tokens_seen": 57672645, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.671875, "step": 2664, "time_per_iteration": 3.961005926132202 }, { "auxiliary_loss_clip": 0.01088999, "auxiliary_loss_mlp": 0.01060717, "balance_loss_clip": 1.01992345, "balance_loss_mlp": 1.02447975, "epoch": 0.1602284683601383, "flos": 21172902134400.0, "grad_norm": 1.6194735878548034, "language_loss": 0.95316195, "learning_rate": 3.824717628865561e-06, "loss": 0.97465909, "num_input_tokens_seen": 57691055, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.64453125, "step": 2665, "time_per_iteration": 2.4029390811920166 }, { "auxiliary_loss_clip": 0.01092966, "auxiliary_loss_mlp": 0.01066202, "balance_loss_clip": 1.02359629, "balance_loss_mlp": 1.0243938, "epoch": 0.16028859161280626, "flos": 14646692286720.0, "grad_norm": 2.0840561325913423, "language_loss": 0.86957467, "learning_rate": 3.824558151970974e-06, "loss": 0.89116639, "num_input_tokens_seen": 57707235, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.6875, "step": 2666, "time_per_iteration": 3.8050687313079834 }, { "auxiliary_loss_clip": 0.0109248, "auxiliary_loss_mlp": 0.01064362, "balance_loss_clip": 1.0246172, "balance_loss_mlp": 1.02415252, "epoch": 0.16034871486547422, "flos": 20989433606400.0, "grad_norm": 1.895193303777103, "language_loss": 0.83652955, "learning_rate": 3.8243986058889595e-06, "loss": 0.85809797, "num_input_tokens_seen": 57724190, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.68359375, "step": 2667, "time_per_iteration": 2.399412155151367 }, { "auxiliary_loss_clip": 0.01091101, "auxiliary_loss_mlp": 0.01071254, "balance_loss_clip": 1.02700377, "balance_loss_mlp": 1.02534235, "epoch": 0.1604088381181422, "flos": 21396660238080.0, "grad_norm": 1.7256139119295297, "language_loss": 0.75124538, "learning_rate": 3.824238990625567e-06, "loss": 0.77286899, "num_input_tokens_seen": 57743620, "router_z_loss_clip": 0.44335938, "router_z_loss_mlp": 0.65625, "step": 2668, "time_per_iteration": 3.9083492755889893 }, { "auxiliary_loss_clip": 0.01090756, "auxiliary_loss_mlp": 0.01063395, "balance_loss_clip": 1.0208137, "balance_loss_mlp": 1.02353835, "epoch": 0.16046896137081015, "flos": 23875947550080.0, "grad_norm": 1.7063421584559342, "language_loss": 0.79000604, "learning_rate": 3.824079306186848e-06, "loss": 0.81154758, "num_input_tokens_seen": 57764810, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.671875, "step": 2669, "time_per_iteration": 3.7820682525634766 }, { "auxiliary_loss_clip": 0.01036176, "auxiliary_loss_mlp": 0.01065617, "balance_loss_clip": 1.05741537, "balance_loss_mlp": 1.01335704, "epoch": 0.16052908462347812, "flos": 59803842608640.0, "grad_norm": 0.8401215530012947, "language_loss": 0.55672359, "learning_rate": 3.823919552578861e-06, "loss": 0.5777415, "num_input_tokens_seen": 57824390, "router_z_loss_clip": 0.08203125, "router_z_loss_mlp": 0.22851562, "step": 2670, "time_per_iteration": 2.9293465614318848 }, { "auxiliary_loss_clip": 0.01092176, "auxiliary_loss_mlp": 0.01071234, "balance_loss_clip": 1.02767467, "balance_loss_mlp": 1.02424431, "epoch": 0.1605892078761461, "flos": 18295569878400.0, "grad_norm": 2.928476199966725, "language_loss": 0.8002286, "learning_rate": 3.82375972980766e-06, "loss": 0.8218627, "num_input_tokens_seen": 57843665, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 0.6796875, "step": 2671, "time_per_iteration": 2.392563819885254 }, { "auxiliary_loss_clip": 0.01093085, "auxiliary_loss_mlp": 0.01077763, "balance_loss_clip": 1.03539658, "balance_loss_mlp": 1.02554178, "epoch": 0.16064933112881408, "flos": 32159370637440.0, "grad_norm": 1.9396272889947388, "language_loss": 0.67251039, "learning_rate": 3.8235998378793086e-06, "loss": 0.69421887, "num_input_tokens_seen": 57863305, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 0.67578125, "step": 2672, "time_per_iteration": 2.477816343307495 }, { "auxiliary_loss_clip": 0.01091999, "auxiliary_loss_mlp": 0.0108174, "balance_loss_clip": 1.03324556, "balance_loss_mlp": 1.02331531, "epoch": 0.16070945438148204, "flos": 19827768205440.0, "grad_norm": 1.6955721135931157, "language_loss": 0.88085216, "learning_rate": 3.8234398767998675e-06, "loss": 0.90258956, "num_input_tokens_seen": 57883025, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.6875, "step": 2673, "time_per_iteration": 2.378833055496216 }, { "auxiliary_loss_clip": 0.01094984, "auxiliary_loss_mlp": 0.01088916, "balance_loss_clip": 1.04409325, "balance_loss_mlp": 1.02678657, "epoch": 0.16076957763415, "flos": 18912240956160.0, "grad_norm": 4.18110689604754, "language_loss": 0.75048435, "learning_rate": 3.823279846575403e-06, "loss": 0.77232331, "num_input_tokens_seen": 57901430, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.6796875, "step": 2674, "time_per_iteration": 2.3886334896087646 }, { "auxiliary_loss_clip": 0.01092921, "auxiliary_loss_mlp": 0.0108617, "balance_loss_clip": 1.04044092, "balance_loss_mlp": 1.02506042, "epoch": 0.16082970088681797, "flos": 16763406462720.0, "grad_norm": 1.5426172647324543, "language_loss": 0.86076713, "learning_rate": 3.823119747211986e-06, "loss": 0.88255799, "num_input_tokens_seen": 57919550, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.6796875, "step": 2675, "time_per_iteration": 2.3749051094055176 }, { "auxiliary_loss_clip": 0.01097307, "auxiliary_loss_mlp": 0.01083353, "balance_loss_clip": 1.03888774, "balance_loss_mlp": 1.02745247, "epoch": 0.16088982413948594, "flos": 35148878691840.0, "grad_norm": 1.871075866949604, "language_loss": 0.83937752, "learning_rate": 3.822959578715685e-06, "loss": 0.86118412, "num_input_tokens_seen": 57939890, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.69921875, "step": 2676, "time_per_iteration": 2.500680446624756 }, { "auxiliary_loss_clip": 0.01091704, "auxiliary_loss_mlp": 0.01083272, "balance_loss_clip": 1.04328895, "balance_loss_mlp": 1.02494872, "epoch": 0.1609499473921539, "flos": 18624102572160.0, "grad_norm": 1.7043712648772171, "language_loss": 0.74623424, "learning_rate": 3.822799341092573e-06, "loss": 0.76798403, "num_input_tokens_seen": 57957410, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.6640625, "step": 2677, "time_per_iteration": 2.3728644847869873 }, { "auxiliary_loss_clip": 0.01092592, "auxiliary_loss_mlp": 0.01081683, "balance_loss_clip": 1.0403173, "balance_loss_mlp": 1.0246048, "epoch": 0.1610100706448219, "flos": 33144340314240.0, "grad_norm": 1.8535376131159786, "language_loss": 0.7829237, "learning_rate": 3.822639034348728e-06, "loss": 0.8046664, "num_input_tokens_seen": 57977900, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.6796875, "step": 2678, "time_per_iteration": 2.491743803024292 }, { "auxiliary_loss_clip": 0.01096118, "auxiliary_loss_mlp": 0.01073535, "balance_loss_clip": 1.02599466, "balance_loss_mlp": 1.02593994, "epoch": 0.16107019389748986, "flos": 34675316743680.0, "grad_norm": 1.844125136316917, "language_loss": 0.71860206, "learning_rate": 3.822478658490228e-06, "loss": 0.74029863, "num_input_tokens_seen": 57998210, "router_z_loss_clip": 0.47460938, "router_z_loss_mlp": 0.703125, "step": 2679, "time_per_iteration": 2.490894317626953 }, { "auxiliary_loss_clip": 0.01033289, "auxiliary_loss_mlp": 0.01017011, "balance_loss_clip": 1.01066899, "balance_loss_mlp": 1.00841951, "epoch": 0.16113031715015783, "flos": 65710483735680.0, "grad_norm": 0.7798423525406406, "language_loss": 0.51964855, "learning_rate": 3.822318213523154e-06, "loss": 0.54015154, "num_input_tokens_seen": 58059420, "router_z_loss_clip": 0.06347656, "router_z_loss_mlp": 0.24804688, "step": 2680, "time_per_iteration": 3.0766689777374268 }, { "auxiliary_loss_clip": 0.01095016, "auxiliary_loss_mlp": 0.01070524, "balance_loss_clip": 1.01938295, "balance_loss_mlp": 1.02400446, "epoch": 0.1611904404028258, "flos": 20809456214400.0, "grad_norm": 1.6961277487523598, "language_loss": 0.8174001, "learning_rate": 3.8221576994535925e-06, "loss": 0.83905542, "num_input_tokens_seen": 58078370, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.7109375, "step": 2681, "time_per_iteration": 2.40366792678833 }, { "auxiliary_loss_clip": 0.01096598, "auxiliary_loss_mlp": 0.01073435, "balance_loss_clip": 1.02777815, "balance_loss_mlp": 1.02774262, "epoch": 0.16125056365549376, "flos": 27012195515520.0, "grad_norm": 1.76514982554947, "language_loss": 0.71643353, "learning_rate": 3.821997116287627e-06, "loss": 0.73813379, "num_input_tokens_seen": 58097395, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.6875, "step": 2682, "time_per_iteration": 2.462615966796875 }, { "auxiliary_loss_clip": 0.01098633, "auxiliary_loss_mlp": 0.01077098, "balance_loss_clip": 1.02853251, "balance_loss_mlp": 1.02876449, "epoch": 0.16131068690816172, "flos": 19275651964800.0, "grad_norm": 1.7792556206152477, "language_loss": 0.89227724, "learning_rate": 3.821836464031348e-06, "loss": 0.91403449, "num_input_tokens_seen": 58115630, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.69921875, "step": 2683, "time_per_iteration": 2.4210846424102783 }, { "auxiliary_loss_clip": 0.01095054, "auxiliary_loss_mlp": 0.0107333, "balance_loss_clip": 1.02569413, "balance_loss_mlp": 1.02488112, "epoch": 0.16137081016082971, "flos": 35336396937600.0, "grad_norm": 1.829267015230841, "language_loss": 0.75662398, "learning_rate": 3.821675742690849e-06, "loss": 0.7783078, "num_input_tokens_seen": 58138655, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.703125, "step": 2684, "time_per_iteration": 2.5550172328948975 }, { "auxiliary_loss_clip": 0.01100929, "auxiliary_loss_mlp": 0.01075257, "balance_loss_clip": 1.02630973, "balance_loss_mlp": 1.02815008, "epoch": 0.16143093341349768, "flos": 34233979847040.0, "grad_norm": 1.7119750818041055, "language_loss": 0.72189063, "learning_rate": 3.821514952272223e-06, "loss": 0.74365252, "num_input_tokens_seen": 58157440, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.7265625, "step": 2685, "time_per_iteration": 2.4954161643981934 }, { "auxiliary_loss_clip": 0.01094183, "auxiliary_loss_mlp": 0.01077578, "balance_loss_clip": 1.02970314, "balance_loss_mlp": 1.02591825, "epoch": 0.16149105666616564, "flos": 27998072887680.0, "grad_norm": 1.9662251542675646, "language_loss": 0.73176676, "learning_rate": 3.821354092781567e-06, "loss": 0.75348437, "num_input_tokens_seen": 58176660, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.68359375, "step": 2686, "time_per_iteration": 2.443716526031494 }, { "auxiliary_loss_clip": 0.01096042, "auxiliary_loss_mlp": 0.01076242, "balance_loss_clip": 1.02603102, "balance_loss_mlp": 1.02602363, "epoch": 0.1615511799188336, "flos": 19421344535040.0, "grad_norm": 1.8048812145806639, "language_loss": 0.83775449, "learning_rate": 3.821193164224981e-06, "loss": 0.85947728, "num_input_tokens_seen": 58195085, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.69921875, "step": 2687, "time_per_iteration": 2.4030957221984863 }, { "auxiliary_loss_clip": 0.01100241, "auxiliary_loss_mlp": 0.01070098, "balance_loss_clip": 1.02012563, "balance_loss_mlp": 1.02686024, "epoch": 0.16161130317150157, "flos": 22853865231360.0, "grad_norm": 1.594619486430567, "language_loss": 0.73399842, "learning_rate": 3.821032166608568e-06, "loss": 0.75570178, "num_input_tokens_seen": 58213540, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.734375, "step": 2688, "time_per_iteration": 2.4123518466949463 }, { "auxiliary_loss_clip": 0.01095559, "auxiliary_loss_mlp": 0.01084705, "balance_loss_clip": 1.03733099, "balance_loss_mlp": 1.02610958, "epoch": 0.16167142642416954, "flos": 26109201444480.0, "grad_norm": 1.606633674882936, "language_loss": 0.7699343, "learning_rate": 3.8208710999384325e-06, "loss": 0.7917369, "num_input_tokens_seen": 58236995, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.6953125, "step": 2689, "time_per_iteration": 2.487297773361206 }, { "auxiliary_loss_clip": 0.01097188, "auxiliary_loss_mlp": 0.01077935, "balance_loss_clip": 1.03299272, "balance_loss_mlp": 1.02742696, "epoch": 0.1617315496768375, "flos": 22778662429440.0, "grad_norm": 1.9566340475995379, "language_loss": 0.89395744, "learning_rate": 3.820709964220683e-06, "loss": 0.91570866, "num_input_tokens_seen": 58257230, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.69921875, "step": 2690, "time_per_iteration": 2.4246702194213867 }, { "auxiliary_loss_clip": 0.01093319, "auxiliary_loss_mlp": 0.01066033, "balance_loss_clip": 1.02457178, "balance_loss_mlp": 1.02544236, "epoch": 0.1617916729295055, "flos": 22016228958720.0, "grad_norm": 2.0138945528375545, "language_loss": 0.89286566, "learning_rate": 3.8205487594614284e-06, "loss": 0.91445923, "num_input_tokens_seen": 58277080, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.6796875, "step": 2691, "time_per_iteration": 2.4003124237060547 }, { "auxiliary_loss_clip": 0.01100069, "auxiliary_loss_mlp": 0.01079591, "balance_loss_clip": 1.02694786, "balance_loss_mlp": 1.02610683, "epoch": 0.16185179618217346, "flos": 23437194094080.0, "grad_norm": 2.0439464153757734, "language_loss": 0.83840066, "learning_rate": 3.820387485666784e-06, "loss": 0.86019731, "num_input_tokens_seen": 58294815, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.7421875, "step": 2692, "time_per_iteration": 2.4256579875946045 }, { "auxiliary_loss_clip": 0.01098212, "auxiliary_loss_mlp": 0.01074407, "balance_loss_clip": 1.02209759, "balance_loss_mlp": 1.02692103, "epoch": 0.16191191943484143, "flos": 25664931993600.0, "grad_norm": 2.009129307953413, "language_loss": 0.83523935, "learning_rate": 3.820226142842862e-06, "loss": 0.85696554, "num_input_tokens_seen": 58313215, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.7109375, "step": 2693, "time_per_iteration": 2.425150156021118 }, { "auxiliary_loss_clip": 0.01091878, "auxiliary_loss_mlp": 0.01067944, "balance_loss_clip": 1.02533901, "balance_loss_mlp": 1.02436769, "epoch": 0.1619720426875094, "flos": 23476226860800.0, "grad_norm": 1.6721366556593724, "language_loss": 0.86105806, "learning_rate": 3.820064730995783e-06, "loss": 0.88265622, "num_input_tokens_seen": 58333215, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.67578125, "step": 2694, "time_per_iteration": 2.4386985301971436 }, { "auxiliary_loss_clip": 0.0109783, "auxiliary_loss_mlp": 0.01068059, "balance_loss_clip": 1.02314138, "balance_loss_mlp": 1.02694428, "epoch": 0.16203216594017736, "flos": 24132524198400.0, "grad_norm": 3.6482335622896427, "language_loss": 0.70934826, "learning_rate": 3.819903250131667e-06, "loss": 0.73100716, "num_input_tokens_seen": 58351160, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.7109375, "step": 2695, "time_per_iteration": 2.4154298305511475 }, { "auxiliary_loss_clip": 0.01096584, "auxiliary_loss_mlp": 0.01067771, "balance_loss_clip": 1.02206576, "balance_loss_mlp": 1.02723885, "epoch": 0.16209228919284532, "flos": 22339943884800.0, "grad_norm": 1.9694572645686819, "language_loss": 0.84639549, "learning_rate": 3.819741700256637e-06, "loss": 0.86803901, "num_input_tokens_seen": 58368505, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.69140625, "step": 2696, "time_per_iteration": 2.398836851119995 }, { "auxiliary_loss_clip": 0.01100517, "auxiliary_loss_mlp": 0.01072032, "balance_loss_clip": 1.02084398, "balance_loss_mlp": 1.02690983, "epoch": 0.1621524124455133, "flos": 15814222796160.0, "grad_norm": 2.2551522185021042, "language_loss": 0.91134334, "learning_rate": 3.8195800813768194e-06, "loss": 0.93306887, "num_input_tokens_seen": 58385085, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 0.734375, "step": 2697, "time_per_iteration": 2.3839991092681885 }, { "auxiliary_loss_clip": 0.01089268, "auxiliary_loss_mlp": 0.01059873, "balance_loss_clip": 1.02048683, "balance_loss_mlp": 1.02399898, "epoch": 0.16221253569818128, "flos": 30185486300160.0, "grad_norm": 1.7710440362427138, "language_loss": 0.8216778, "learning_rate": 3.819418393498343e-06, "loss": 0.84316921, "num_input_tokens_seen": 58406985, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.65234375, "step": 2698, "time_per_iteration": 2.4925849437713623 }, { "auxiliary_loss_clip": 0.01093283, "auxiliary_loss_mlp": 0.01067636, "balance_loss_clip": 1.02674747, "balance_loss_mlp": 1.02655768, "epoch": 0.16227265895084925, "flos": 24604899160320.0, "grad_norm": 1.8181549553829663, "language_loss": 0.78505689, "learning_rate": 3.819256636627339e-06, "loss": 0.80666608, "num_input_tokens_seen": 58426205, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.66796875, "step": 2699, "time_per_iteration": 2.426504135131836 }, { "auxiliary_loss_clip": 0.0109156, "auxiliary_loss_mlp": 0.01058394, "balance_loss_clip": 1.01826787, "balance_loss_mlp": 1.02600098, "epoch": 0.1623327822035172, "flos": 19572308720640.0, "grad_norm": 1.8290886490304923, "language_loss": 0.88572347, "learning_rate": 3.81909481076994e-06, "loss": 0.90722299, "num_input_tokens_seen": 58443830, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.65625, "step": 2700, "time_per_iteration": 2.3980765342712402 }, { "auxiliary_loss_clip": 0.01093135, "auxiliary_loss_mlp": 0.01067851, "balance_loss_clip": 1.01938093, "balance_loss_mlp": 1.02603936, "epoch": 0.16239290545618518, "flos": 26467271015040.0, "grad_norm": 1.8836592541042572, "language_loss": 0.81904012, "learning_rate": 3.818932915932284e-06, "loss": 0.84065002, "num_input_tokens_seen": 58464405, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.671875, "step": 2701, "time_per_iteration": 2.4483413696289062 }, { "auxiliary_loss_clip": 0.0109367, "auxiliary_loss_mlp": 0.01064515, "balance_loss_clip": 1.02353108, "balance_loss_mlp": 1.0270319, "epoch": 0.16245302870885314, "flos": 15851021235840.0, "grad_norm": 2.0513919141649675, "language_loss": 0.75530303, "learning_rate": 3.818770952120511e-06, "loss": 0.77688491, "num_input_tokens_seen": 58483295, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.6640625, "step": 2702, "time_per_iteration": 2.3906679153442383 }, { "auxiliary_loss_clip": 0.01091956, "auxiliary_loss_mlp": 0.01065225, "balance_loss_clip": 1.02192795, "balance_loss_mlp": 1.02434945, "epoch": 0.1625131519615211, "flos": 14755656240000.0, "grad_norm": 2.094674775972685, "language_loss": 0.74669278, "learning_rate": 3.81860891934076e-06, "loss": 0.76826453, "num_input_tokens_seen": 58501205, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.67578125, "step": 2703, "time_per_iteration": 2.367050886154175 }, { "auxiliary_loss_clip": 0.0109315, "auxiliary_loss_mlp": 0.01064639, "balance_loss_clip": 1.02243876, "balance_loss_mlp": 1.02501714, "epoch": 0.1625732752141891, "flos": 28219247550720.0, "grad_norm": 1.8282801240020046, "language_loss": 0.72640473, "learning_rate": 3.818446817599176e-06, "loss": 0.74798262, "num_input_tokens_seen": 58522315, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.6796875, "step": 2704, "time_per_iteration": 4.011623859405518 }, { "auxiliary_loss_clip": 0.01047862, "auxiliary_loss_mlp": 0.01016953, "balance_loss_clip": 1.00746441, "balance_loss_mlp": 1.01578522, "epoch": 0.16263339846685707, "flos": 67324727491200.0, "grad_norm": 0.791335893067402, "language_loss": 0.534006, "learning_rate": 3.818284646901907e-06, "loss": 0.55465412, "num_input_tokens_seen": 58586695, "router_z_loss_clip": 0.09472656, "router_z_loss_mlp": 0.3203125, "step": 2705, "time_per_iteration": 3.0511341094970703 }, { "auxiliary_loss_clip": 0.01094152, "auxiliary_loss_mlp": 0.01074007, "balance_loss_clip": 1.03240299, "balance_loss_mlp": 1.02474213, "epoch": 0.16269352171952503, "flos": 14318299238400.0, "grad_norm": 2.277006740979859, "language_loss": 0.79052103, "learning_rate": 3.818122407255102e-06, "loss": 0.81220257, "num_input_tokens_seen": 58602435, "router_z_loss_clip": 0.41601562, "router_z_loss_mlp": 0.6953125, "step": 2706, "time_per_iteration": 3.846130132675171 }, { "auxiliary_loss_clip": 0.01095976, "auxiliary_loss_mlp": 0.01066499, "balance_loss_clip": 1.0270884, "balance_loss_mlp": 1.02802014, "epoch": 0.162753644972193, "flos": 28360087441920.0, "grad_norm": 2.296022043326329, "language_loss": 0.75572413, "learning_rate": 3.817960098664914e-06, "loss": 0.77734888, "num_input_tokens_seen": 58621275, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.6796875, "step": 2707, "time_per_iteration": 2.4815258979797363 }, { "auxiliary_loss_clip": 0.01093241, "auxiliary_loss_mlp": 0.01071643, "balance_loss_clip": 1.02884722, "balance_loss_mlp": 1.02569687, "epoch": 0.16281376822486096, "flos": 19936836892800.0, "grad_norm": 2.6570295027676174, "language_loss": 0.8553896, "learning_rate": 3.817797721137495e-06, "loss": 0.87703842, "num_input_tokens_seen": 58637550, "router_z_loss_clip": 0.42773438, "router_z_loss_mlp": 0.67578125, "step": 2708, "time_per_iteration": 3.861713409423828 }, { "auxiliary_loss_clip": 0.01096238, "auxiliary_loss_mlp": 0.01074846, "balance_loss_clip": 1.02909303, "balance_loss_mlp": 1.0269208, "epoch": 0.16287389147752893, "flos": 21250653465600.0, "grad_norm": 2.1182491484523673, "language_loss": 0.88173187, "learning_rate": 3.817635274679006e-06, "loss": 0.90344274, "num_input_tokens_seen": 58654135, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.6953125, "step": 2709, "time_per_iteration": 3.882869005203247 }, { "auxiliary_loss_clip": 0.01091914, "auxiliary_loss_mlp": 0.01070059, "balance_loss_clip": 1.02838314, "balance_loss_mlp": 1.02477336, "epoch": 0.1629340147301969, "flos": 19243671292800.0, "grad_norm": 1.592941797544818, "language_loss": 0.92725307, "learning_rate": 3.817472759295605e-06, "loss": 0.9488728, "num_input_tokens_seen": 58674320, "router_z_loss_clip": 0.41601562, "router_z_loss_mlp": 0.671875, "step": 2710, "time_per_iteration": 2.444967746734619 }, { "auxiliary_loss_clip": 0.01094023, "auxiliary_loss_mlp": 0.01069983, "balance_loss_clip": 1.02966619, "balance_loss_mlp": 1.0276798, "epoch": 0.16299413798286488, "flos": 21248803163520.0, "grad_norm": 2.016995762090991, "language_loss": 0.836146, "learning_rate": 3.817310174993453e-06, "loss": 0.85778606, "num_input_tokens_seen": 58691000, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.6640625, "step": 2711, "time_per_iteration": 2.3803019523620605 }, { "auxiliary_loss_clip": 0.01095846, "auxiliary_loss_mlp": 0.01063663, "balance_loss_clip": 1.01960301, "balance_loss_mlp": 1.02494705, "epoch": 0.16305426123553285, "flos": 18769585674240.0, "grad_norm": 2.089185651215556, "language_loss": 0.82852042, "learning_rate": 3.817147521778719e-06, "loss": 0.85011548, "num_input_tokens_seen": 58710230, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.7109375, "step": 2712, "time_per_iteration": 2.3931591510772705 }, { "auxiliary_loss_clip": 0.01093462, "auxiliary_loss_mlp": 0.01070878, "balance_loss_clip": 1.02419567, "balance_loss_mlp": 1.02365398, "epoch": 0.16311438448820081, "flos": 22086648904320.0, "grad_norm": 2.047178917649759, "language_loss": 0.7931484, "learning_rate": 3.816984799657568e-06, "loss": 0.8147918, "num_input_tokens_seen": 58728610, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 0.6953125, "step": 2713, "time_per_iteration": 2.424274444580078 }, { "auxiliary_loss_clip": 0.01092187, "auxiliary_loss_mlp": 0.01067558, "balance_loss_clip": 1.02867174, "balance_loss_mlp": 1.02757418, "epoch": 0.16317450774086878, "flos": 16466889352320.0, "grad_norm": 3.2163196711627666, "language_loss": 0.8069948, "learning_rate": 3.8168220086361715e-06, "loss": 0.82859224, "num_input_tokens_seen": 58744385, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.64453125, "step": 2714, "time_per_iteration": 2.376124858856201 }, { "auxiliary_loss_clip": 0.01093781, "auxiliary_loss_mlp": 0.01059861, "balance_loss_clip": 1.02166629, "balance_loss_mlp": 1.02637553, "epoch": 0.16323463099353674, "flos": 24351778736640.0, "grad_norm": 1.5659686658865972, "language_loss": 0.80030835, "learning_rate": 3.816659148720702e-06, "loss": 0.82184476, "num_input_tokens_seen": 58763905, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.67578125, "step": 2715, "time_per_iteration": 2.443765163421631 }, { "auxiliary_loss_clip": 0.0109333, "auxiliary_loss_mlp": 0.0106343, "balance_loss_clip": 1.02277958, "balance_loss_mlp": 1.02614319, "epoch": 0.1632947542462047, "flos": 24899600880000.0, "grad_norm": 1.9264488581064292, "language_loss": 0.83345109, "learning_rate": 3.816496219917336e-06, "loss": 0.85501873, "num_input_tokens_seen": 58785580, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.671875, "step": 2716, "time_per_iteration": 2.435058832168579 }, { "auxiliary_loss_clip": 0.01095454, "auxiliary_loss_mlp": 0.01072794, "balance_loss_clip": 1.0284481, "balance_loss_mlp": 1.02735472, "epoch": 0.1633548774988727, "flos": 24899112120960.0, "grad_norm": 1.9990177246392262, "language_loss": 0.87963068, "learning_rate": 3.816333222232251e-06, "loss": 0.90131319, "num_input_tokens_seen": 58806075, "router_z_loss_clip": 0.44335938, "router_z_loss_mlp": 0.6796875, "step": 2717, "time_per_iteration": 2.4388387203216553 }, { "auxiliary_loss_clip": 0.01092309, "auxiliary_loss_mlp": 0.01065505, "balance_loss_clip": 1.02402067, "balance_loss_mlp": 1.02613974, "epoch": 0.16341500075154067, "flos": 30440596671360.0, "grad_norm": 1.6857768496793424, "language_loss": 0.79198837, "learning_rate": 3.816170155671629e-06, "loss": 0.81356645, "num_input_tokens_seen": 58827405, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.6640625, "step": 2718, "time_per_iteration": 2.487579822540283 }, { "auxiliary_loss_clip": 0.01095227, "auxiliary_loss_mlp": 0.01066474, "balance_loss_clip": 1.02410674, "balance_loss_mlp": 1.02628195, "epoch": 0.16347512400420863, "flos": 22783410374400.0, "grad_norm": 1.9430913493701927, "language_loss": 0.76186013, "learning_rate": 3.816007020241652e-06, "loss": 0.78347719, "num_input_tokens_seen": 58847205, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 0.6875, "step": 2719, "time_per_iteration": 2.4324374198913574 }, { "auxiliary_loss_clip": 0.01093016, "auxiliary_loss_mlp": 0.01066867, "balance_loss_clip": 1.02459526, "balance_loss_mlp": 1.02502549, "epoch": 0.1635352472568766, "flos": 22632306543360.0, "grad_norm": 1.9287140311994697, "language_loss": 0.7308557, "learning_rate": 3.815843815948507e-06, "loss": 0.75245452, "num_input_tokens_seen": 58866865, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.6796875, "step": 2720, "time_per_iteration": 2.4097049236297607 }, { "auxiliary_loss_clip": 0.01091363, "auxiliary_loss_mlp": 0.01056681, "balance_loss_clip": 1.01598346, "balance_loss_mlp": 1.02644634, "epoch": 0.16359537050954456, "flos": 15522104517120.0, "grad_norm": 1.8715118993343218, "language_loss": 0.77581632, "learning_rate": 3.8156805427983824e-06, "loss": 0.79729676, "num_input_tokens_seen": 58885200, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.6484375, "step": 2721, "time_per_iteration": 2.399419069290161 }, { "auxiliary_loss_clip": 0.01096247, "auxiliary_loss_mlp": 0.01064944, "balance_loss_clip": 1.01888156, "balance_loss_mlp": 1.02543783, "epoch": 0.16365549376221253, "flos": 22089092699520.0, "grad_norm": 1.8081238244143578, "language_loss": 0.81009877, "learning_rate": 3.8155172007974695e-06, "loss": 0.8317107, "num_input_tokens_seen": 58906385, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.7109375, "step": 2722, "time_per_iteration": 2.4425203800201416 }, { "auxiliary_loss_clip": 0.01097422, "auxiliary_loss_mlp": 0.01078767, "balance_loss_clip": 1.02612448, "balance_loss_mlp": 1.02561784, "epoch": 0.1637156170148805, "flos": 24059276432640.0, "grad_norm": 2.2473288268122626, "language_loss": 0.86193991, "learning_rate": 3.8153537899519624e-06, "loss": 0.8837018, "num_input_tokens_seen": 58925040, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 0.71875, "step": 2723, "time_per_iteration": 2.4318931102752686 }, { "auxiliary_loss_clip": 0.01092102, "auxiliary_loss_mlp": 0.01061835, "balance_loss_clip": 1.01968253, "balance_loss_mlp": 1.02643514, "epoch": 0.1637757402675485, "flos": 26684221403520.0, "grad_norm": 1.7797690807441746, "language_loss": 0.72865003, "learning_rate": 3.815190310268058e-06, "loss": 0.75018942, "num_input_tokens_seen": 58944790, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.65625, "step": 2724, "time_per_iteration": 2.455733299255371 }, { "auxiliary_loss_clip": 0.01091655, "auxiliary_loss_mlp": 0.01065202, "balance_loss_clip": 1.02316856, "balance_loss_mlp": 1.02520752, "epoch": 0.16383586352021645, "flos": 16106026872960.0, "grad_norm": 1.9212610056570587, "language_loss": 0.72683203, "learning_rate": 3.815026761751955e-06, "loss": 0.74840057, "num_input_tokens_seen": 58962500, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.6640625, "step": 2725, "time_per_iteration": 2.394416093826294 }, { "auxiliary_loss_clip": 0.01090978, "auxiliary_loss_mlp": 0.01062947, "balance_loss_clip": 1.0201031, "balance_loss_mlp": 1.02531576, "epoch": 0.16389598677288442, "flos": 19165151911680.0, "grad_norm": 1.8530783611217188, "language_loss": 0.89870691, "learning_rate": 3.814863144409855e-06, "loss": 0.92024612, "num_input_tokens_seen": 58980355, "router_z_loss_clip": 0.42773438, "router_z_loss_mlp": 0.65625, "step": 2726, "time_per_iteration": 2.3902058601379395 }, { "auxiliary_loss_clip": 0.01098999, "auxiliary_loss_mlp": 0.01069757, "balance_loss_clip": 1.02653182, "balance_loss_mlp": 1.02823138, "epoch": 0.16395611002555238, "flos": 21505938393600.0, "grad_norm": 2.0577769687017278, "language_loss": 0.75856721, "learning_rate": 3.814699458247963e-06, "loss": 0.78025484, "num_input_tokens_seen": 58999505, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.70703125, "step": 2727, "time_per_iteration": 2.404475450515747 }, { "auxiliary_loss_clip": 0.01091338, "auxiliary_loss_mlp": 0.01066496, "balance_loss_clip": 1.02570248, "balance_loss_mlp": 1.0252198, "epoch": 0.16401623327822035, "flos": 21469838181120.0, "grad_norm": 1.5698083685685307, "language_loss": 0.84929699, "learning_rate": 3.8145357032724855e-06, "loss": 0.8708753, "num_input_tokens_seen": 59017930, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.66015625, "step": 2728, "time_per_iteration": 2.4072928428649902 }, { "auxiliary_loss_clip": 0.01097676, "auxiliary_loss_mlp": 0.01071508, "balance_loss_clip": 1.02594614, "balance_loss_mlp": 1.02689338, "epoch": 0.1640763565308883, "flos": 13625378017920.0, "grad_norm": 2.292422380052449, "language_loss": 0.87333423, "learning_rate": 3.814371879489633e-06, "loss": 0.89502603, "num_input_tokens_seen": 59035130, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.70703125, "step": 2729, "time_per_iteration": 2.381352186203003 }, { "auxiliary_loss_clip": 0.0109542, "auxiliary_loss_mlp": 0.01078385, "balance_loss_clip": 1.03244233, "balance_loss_mlp": 1.02495241, "epoch": 0.16413647978355628, "flos": 15450532496640.0, "grad_norm": 1.9198768415741962, "language_loss": 0.75433809, "learning_rate": 3.814207986905616e-06, "loss": 0.7760762, "num_input_tokens_seen": 59053080, "router_z_loss_clip": 0.45898438, "router_z_loss_mlp": 0.703125, "step": 2730, "time_per_iteration": 2.367783784866333 }, { "auxiliary_loss_clip": 0.01099118, "auxiliary_loss_mlp": 0.0107143, "balance_loss_clip": 1.02362716, "balance_loss_mlp": 1.02615309, "epoch": 0.16419660303622427, "flos": 45876955155840.0, "grad_norm": 1.5729551036239653, "language_loss": 0.76675683, "learning_rate": 3.814044025526651e-06, "loss": 0.78846228, "num_input_tokens_seen": 59075610, "router_z_loss_clip": 0.47851562, "router_z_loss_mlp": 0.73046875, "step": 2731, "time_per_iteration": 2.640993118286133 }, { "auxiliary_loss_clip": 0.01099817, "auxiliary_loss_mlp": 0.01072276, "balance_loss_clip": 1.02430594, "balance_loss_mlp": 1.02763271, "epoch": 0.16425672628889224, "flos": 18951832304640.0, "grad_norm": 2.259870318660352, "language_loss": 0.8128016, "learning_rate": 3.8138799953589548e-06, "loss": 0.8345226, "num_input_tokens_seen": 59094555, "router_z_loss_clip": 0.47851562, "router_z_loss_mlp": 0.72265625, "step": 2732, "time_per_iteration": 2.3673460483551025 }, { "auxiliary_loss_clip": 0.01095517, "auxiliary_loss_mlp": 0.01076865, "balance_loss_clip": 1.02748895, "balance_loss_mlp": 1.02508545, "epoch": 0.1643168495415602, "flos": 24311943008640.0, "grad_norm": 2.2321730246807596, "language_loss": 0.70915186, "learning_rate": 3.8137158964087473e-06, "loss": 0.73087573, "num_input_tokens_seen": 59113515, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 0.703125, "step": 2733, "time_per_iteration": 2.4428422451019287 }, { "auxiliary_loss_clip": 0.01095502, "auxiliary_loss_mlp": 0.01066617, "balance_loss_clip": 1.0217464, "balance_loss_mlp": 1.02665818, "epoch": 0.16437697279422817, "flos": 26427330552960.0, "grad_norm": 1.8790922844191156, "language_loss": 0.82300675, "learning_rate": 3.8135517286822508e-06, "loss": 0.84462798, "num_input_tokens_seen": 59133275, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.6875, "step": 2734, "time_per_iteration": 2.4976723194122314 }, { "auxiliary_loss_clip": 0.01094099, "auxiliary_loss_mlp": 0.01065247, "balance_loss_clip": 1.01951873, "balance_loss_mlp": 1.02500129, "epoch": 0.16443709604689613, "flos": 34530811159680.0, "grad_norm": 2.1005593638275877, "language_loss": 0.85012686, "learning_rate": 3.8133874921856914e-06, "loss": 0.87172031, "num_input_tokens_seen": 59154095, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.69140625, "step": 2735, "time_per_iteration": 2.5512137413024902 }, { "auxiliary_loss_clip": 0.01090602, "auxiliary_loss_mlp": 0.01062742, "balance_loss_clip": 1.02049398, "balance_loss_mlp": 1.02420712, "epoch": 0.1644972192995641, "flos": 23256937411200.0, "grad_norm": 2.3911799844136263, "language_loss": 0.80560231, "learning_rate": 3.813223186925296e-06, "loss": 0.82713568, "num_input_tokens_seen": 59173795, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.6640625, "step": 2736, "time_per_iteration": 2.391968011856079 }, { "auxiliary_loss_clip": 0.01098731, "auxiliary_loss_mlp": 0.01069731, "balance_loss_clip": 1.02598119, "balance_loss_mlp": 1.02872121, "epoch": 0.1645573425522321, "flos": 26978329630080.0, "grad_norm": 1.9642517434762736, "language_loss": 0.82283413, "learning_rate": 3.8130588129072964e-06, "loss": 0.84451872, "num_input_tokens_seen": 59191610, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.69921875, "step": 2737, "time_per_iteration": 2.4580259323120117 }, { "auxiliary_loss_clip": 0.01092928, "auxiliary_loss_mlp": 0.01061738, "balance_loss_clip": 1.01939499, "balance_loss_mlp": 1.02387822, "epoch": 0.16461746580490005, "flos": 28730480722560.0, "grad_norm": 1.768092522224841, "language_loss": 0.89176226, "learning_rate": 3.8128943701379246e-06, "loss": 0.91330886, "num_input_tokens_seen": 59213000, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 0.6875, "step": 2738, "time_per_iteration": 2.4888288974761963 }, { "auxiliary_loss_clip": 0.01093143, "auxiliary_loss_mlp": 0.01077231, "balance_loss_clip": 1.0315026, "balance_loss_mlp": 1.02412128, "epoch": 0.16467758905756802, "flos": 24929172668160.0, "grad_norm": 1.6955088842711679, "language_loss": 0.73338234, "learning_rate": 3.8127298586234167e-06, "loss": 0.75508618, "num_input_tokens_seen": 59232340, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.6875, "step": 2739, "time_per_iteration": 2.447483539581299 }, { "auxiliary_loss_clip": 0.01093085, "auxiliary_loss_mlp": 0.01070486, "balance_loss_clip": 1.02575827, "balance_loss_mlp": 1.02491307, "epoch": 0.16473771231023598, "flos": 24825375596160.0, "grad_norm": 13.003842748907921, "language_loss": 0.82771897, "learning_rate": 3.8125652783700104e-06, "loss": 0.84935462, "num_input_tokens_seen": 59253950, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.68359375, "step": 2740, "time_per_iteration": 2.4232521057128906 }, { "auxiliary_loss_clip": 0.01096083, "auxiliary_loss_mlp": 0.01074049, "balance_loss_clip": 1.02741444, "balance_loss_mlp": 1.02525282, "epoch": 0.16479783556290395, "flos": 39894482822400.0, "grad_norm": 1.9566560366360048, "language_loss": 0.71405935, "learning_rate": 3.8124006293839475e-06, "loss": 0.73576069, "num_input_tokens_seen": 59275545, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 0.70703125, "step": 2741, "time_per_iteration": 2.5660853385925293 }, { "auxiliary_loss_clip": 0.01092452, "auxiliary_loss_mlp": 0.01061293, "balance_loss_clip": 1.01725733, "balance_loss_mlp": 1.02567506, "epoch": 0.16485795881557191, "flos": 19896163292160.0, "grad_norm": 1.7787717677893216, "language_loss": 0.81296003, "learning_rate": 3.812235911671472e-06, "loss": 0.83449745, "num_input_tokens_seen": 59293480, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.66796875, "step": 2742, "time_per_iteration": 2.379420518875122 }, { "auxiliary_loss_clip": 0.01093336, "auxiliary_loss_mlp": 0.01068493, "balance_loss_clip": 1.02591193, "balance_loss_mlp": 1.02672958, "epoch": 0.16491808206823988, "flos": 20555148804480.0, "grad_norm": 1.7240146547881183, "language_loss": 0.86191559, "learning_rate": 3.8120711252388274e-06, "loss": 0.88353384, "num_input_tokens_seen": 59313435, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.6640625, "step": 2743, "time_per_iteration": 2.4186954498291016 }, { "auxiliary_loss_clip": 0.01089318, "auxiliary_loss_mlp": 0.01064581, "balance_loss_clip": 1.02204752, "balance_loss_mlp": 1.02435291, "epoch": 0.16497820532090787, "flos": 23799802141440.0, "grad_norm": 1.5270042900874738, "language_loss": 0.87049669, "learning_rate": 3.811906270092265e-06, "loss": 0.89203572, "num_input_tokens_seen": 59331535, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.6484375, "step": 2744, "time_per_iteration": 3.916055917739868 }, { "auxiliary_loss_clip": 0.01087792, "auxiliary_loss_mlp": 0.01065176, "balance_loss_clip": 1.02223635, "balance_loss_mlp": 1.02456534, "epoch": 0.16503832857357584, "flos": 25481498376960.0, "grad_norm": 2.3486178426489257, "language_loss": 0.83970082, "learning_rate": 3.811741346238036e-06, "loss": 0.86123049, "num_input_tokens_seen": 59350680, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.6328125, "step": 2745, "time_per_iteration": 2.4591195583343506 }, { "auxiliary_loss_clip": 0.01095044, "auxiliary_loss_mlp": 0.01069975, "balance_loss_clip": 1.02546191, "balance_loss_mlp": 1.02557325, "epoch": 0.1650984518262438, "flos": 17675093462400.0, "grad_norm": 1.9075349393238235, "language_loss": 0.78004473, "learning_rate": 3.8115763536823923e-06, "loss": 0.80169493, "num_input_tokens_seen": 59367020, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.6953125, "step": 2746, "time_per_iteration": 3.819058895111084 }, { "auxiliary_loss_clip": 0.01090119, "auxiliary_loss_mlp": 0.01071521, "balance_loss_clip": 1.02550578, "balance_loss_mlp": 1.0242486, "epoch": 0.16515857507891177, "flos": 18697315426560.0, "grad_norm": 1.5162653944322744, "language_loss": 0.81500322, "learning_rate": 3.811411292431592e-06, "loss": 0.83661962, "num_input_tokens_seen": 59386075, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.66015625, "step": 2747, "time_per_iteration": 3.82015323638916 }, { "auxiliary_loss_clip": 0.0109425, "auxiliary_loss_mlp": 0.01064368, "balance_loss_clip": 1.01942587, "balance_loss_mlp": 1.02555263, "epoch": 0.16521869833157973, "flos": 15009649447680.0, "grad_norm": 2.174902745316553, "language_loss": 0.71604735, "learning_rate": 3.8112461624918945e-06, "loss": 0.73763359, "num_input_tokens_seen": 59402690, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.6875, "step": 2748, "time_per_iteration": 3.751910924911499 }, { "auxiliary_loss_clip": 0.0109039, "auxiliary_loss_mlp": 0.01068612, "balance_loss_clip": 1.02469552, "balance_loss_mlp": 1.02422309, "epoch": 0.1652788215842477, "flos": 22120235498880.0, "grad_norm": 2.0742631307424975, "language_loss": 0.90456176, "learning_rate": 3.811080963869561e-06, "loss": 0.92615175, "num_input_tokens_seen": 59421130, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 0.66015625, "step": 2749, "time_per_iteration": 2.408053398132324 }, { "auxiliary_loss_clip": 0.01091405, "auxiliary_loss_mlp": 0.01059867, "balance_loss_clip": 1.01621294, "balance_loss_mlp": 1.02384591, "epoch": 0.16533894483691566, "flos": 18332089027200.0, "grad_norm": 2.081618946180685, "language_loss": 0.80768466, "learning_rate": 3.8109156965708557e-06, "loss": 0.82919741, "num_input_tokens_seen": 59438970, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.67578125, "step": 2750, "time_per_iteration": 2.401859998703003 }, { "auxiliary_loss_clip": 0.01089643, "auxiliary_loss_mlp": 0.01066797, "balance_loss_clip": 1.02385771, "balance_loss_mlp": 1.02401328, "epoch": 0.16539906808958366, "flos": 22381036421760.0, "grad_norm": 3.383606732749116, "language_loss": 0.96704721, "learning_rate": 3.8107503606020455e-06, "loss": 0.98861158, "num_input_tokens_seen": 59458510, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.65625, "step": 2751, "time_per_iteration": 2.4410171508789062 }, { "auxiliary_loss_clip": 0.01090882, "auxiliary_loss_mlp": 0.01063173, "balance_loss_clip": 1.02054405, "balance_loss_mlp": 1.02675104, "epoch": 0.16545919134225162, "flos": 22709988051840.0, "grad_norm": 1.8899083623984048, "language_loss": 0.72576171, "learning_rate": 3.8105849559693997e-06, "loss": 0.74730229, "num_input_tokens_seen": 59477110, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.640625, "step": 2752, "time_per_iteration": 2.4002676010131836 }, { "auxiliary_loss_clip": 0.01028962, "auxiliary_loss_mlp": 0.01017949, "balance_loss_clip": 1.01036727, "balance_loss_mlp": 1.00957465, "epoch": 0.1655193145949196, "flos": 67799720805120.0, "grad_norm": 0.7712634080014775, "language_loss": 0.5423367, "learning_rate": 3.810419482679192e-06, "loss": 0.56280577, "num_input_tokens_seen": 59541155, "router_z_loss_clip": 0.07568359, "router_z_loss_mlp": 0.19335938, "step": 2753, "time_per_iteration": 3.1309733390808105 }, { "auxiliary_loss_clip": 0.01088824, "auxiliary_loss_mlp": 0.01063184, "balance_loss_clip": 1.01809859, "balance_loss_mlp": 1.02385747, "epoch": 0.16557943784758755, "flos": 24279229198080.0, "grad_norm": 1.6258996643962718, "language_loss": 0.77334481, "learning_rate": 3.8102539407376954e-06, "loss": 0.79486489, "num_input_tokens_seen": 59561155, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.6484375, "step": 2754, "time_per_iteration": 2.4204041957855225 }, { "auxiliary_loss_clip": 0.01100078, "auxiliary_loss_mlp": 0.01077234, "balance_loss_clip": 1.02924025, "balance_loss_mlp": 1.02935445, "epoch": 0.16563956110025552, "flos": 20082599285760.0, "grad_norm": 2.1398829892761286, "language_loss": 0.89349544, "learning_rate": 3.810088330151188e-06, "loss": 0.91526854, "num_input_tokens_seen": 59580460, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.70703125, "step": 2755, "time_per_iteration": 2.390040636062622 }, { "auxiliary_loss_clip": 0.01091965, "auxiliary_loss_mlp": 0.01070943, "balance_loss_clip": 1.02671599, "balance_loss_mlp": 1.02611148, "epoch": 0.16569968435292348, "flos": 28033300316160.0, "grad_norm": 1.8221852890069372, "language_loss": 0.74945003, "learning_rate": 3.80992265092595e-06, "loss": 0.77107906, "num_input_tokens_seen": 59600025, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.66015625, "step": 2756, "time_per_iteration": 2.4387753009796143 }, { "auxiliary_loss_clip": 0.01090842, "auxiliary_loss_mlp": 0.01060902, "balance_loss_clip": 1.02034688, "balance_loss_mlp": 1.02738094, "epoch": 0.16575980760559147, "flos": 26249028906240.0, "grad_norm": 1.5835441168216, "language_loss": 0.76725578, "learning_rate": 3.8097569030682636e-06, "loss": 0.78877318, "num_input_tokens_seen": 59620600, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.63671875, "step": 2757, "time_per_iteration": 2.481924533843994 }, { "auxiliary_loss_clip": 0.0109289, "auxiliary_loss_mlp": 0.01062927, "balance_loss_clip": 1.02203822, "balance_loss_mlp": 1.02745128, "epoch": 0.16581993085825944, "flos": 26942718176640.0, "grad_norm": 2.582196966652395, "language_loss": 0.86347389, "learning_rate": 3.8095910865844137e-06, "loss": 0.88503206, "num_input_tokens_seen": 59641385, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.65234375, "step": 2758, "time_per_iteration": 2.4430432319641113 }, { "auxiliary_loss_clip": 0.01093548, "auxiliary_loss_mlp": 0.01066322, "balance_loss_clip": 1.02641082, "balance_loss_mlp": 1.02721334, "epoch": 0.1658800541109274, "flos": 21652538659200.0, "grad_norm": 2.3870797543162143, "language_loss": 0.81196606, "learning_rate": 3.809425201480689e-06, "loss": 0.83356476, "num_input_tokens_seen": 59659865, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.6640625, "step": 2759, "time_per_iteration": 2.4006094932556152 }, { "auxiliary_loss_clip": 0.01090035, "auxiliary_loss_mlp": 0.01068631, "balance_loss_clip": 1.02404726, "balance_loss_mlp": 1.02448022, "epoch": 0.16594017736359537, "flos": 16434559566720.0, "grad_norm": 2.1172870503428505, "language_loss": 0.77690071, "learning_rate": 3.8092592477633793e-06, "loss": 0.79848742, "num_input_tokens_seen": 59678780, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.65625, "step": 2760, "time_per_iteration": 2.4081296920776367 }, { "auxiliary_loss_clip": 0.01096653, "auxiliary_loss_mlp": 0.01069722, "balance_loss_clip": 1.02508974, "balance_loss_mlp": 1.0270468, "epoch": 0.16600030061626334, "flos": 22636216615680.0, "grad_norm": 1.8200067825628155, "language_loss": 0.75873214, "learning_rate": 3.8090932254387774e-06, "loss": 0.78039593, "num_input_tokens_seen": 59698795, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.6953125, "step": 2761, "time_per_iteration": 2.4225738048553467 }, { "auxiliary_loss_clip": 0.01091461, "auxiliary_loss_mlp": 0.0106588, "balance_loss_clip": 1.02384698, "balance_loss_mlp": 1.02562535, "epoch": 0.1660604238689313, "flos": 26395349880960.0, "grad_norm": 1.885966839764211, "language_loss": 0.90257275, "learning_rate": 3.8089271345131788e-06, "loss": 0.92414618, "num_input_tokens_seen": 59718795, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 0.66015625, "step": 2762, "time_per_iteration": 2.450077772140503 }, { "auxiliary_loss_clip": 0.01093222, "auxiliary_loss_mlp": 0.01069601, "balance_loss_clip": 1.02778196, "balance_loss_mlp": 1.02628696, "epoch": 0.16612054712159927, "flos": 23038869859200.0, "grad_norm": 1.7725303276699687, "language_loss": 0.9017812, "learning_rate": 3.8087609749928822e-06, "loss": 0.92340946, "num_input_tokens_seen": 59737555, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.671875, "step": 2763, "time_per_iteration": 2.4360148906707764 }, { "auxiliary_loss_clip": 0.01026298, "auxiliary_loss_mlp": 0.01011992, "balance_loss_clip": 1.00069058, "balance_loss_mlp": 1.0072217, "epoch": 0.16618067037426726, "flos": 59237864691840.0, "grad_norm": 0.7810601386425962, "language_loss": 0.59933674, "learning_rate": 3.8085947468841885e-06, "loss": 0.61971962, "num_input_tokens_seen": 59800915, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.19140625, "step": 2764, "time_per_iteration": 3.0757365226745605 }, { "auxiliary_loss_clip": 0.01093638, "auxiliary_loss_mlp": 0.01071255, "balance_loss_clip": 1.02400064, "balance_loss_mlp": 1.02620268, "epoch": 0.16624079362693522, "flos": 27197584168320.0, "grad_norm": 1.7044008716848251, "language_loss": 0.83426261, "learning_rate": 3.808428450193401e-06, "loss": 0.85591155, "num_input_tokens_seen": 59822910, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.671875, "step": 2765, "time_per_iteration": 2.4910948276519775 }, { "auxiliary_loss_clip": 0.0109763, "auxiliary_loss_mlp": 0.01067257, "balance_loss_clip": 1.0206219, "balance_loss_mlp": 1.02670562, "epoch": 0.1663009168796032, "flos": 10924322549760.0, "grad_norm": 2.477884376384354, "language_loss": 0.71547925, "learning_rate": 3.8082620849268244e-06, "loss": 0.73712802, "num_input_tokens_seen": 59838805, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 0.7109375, "step": 2766, "time_per_iteration": 2.3629746437072754 }, { "auxiliary_loss_clip": 0.01089683, "auxiliary_loss_mlp": 0.01062297, "balance_loss_clip": 1.01864231, "balance_loss_mlp": 1.02584291, "epoch": 0.16636104013227115, "flos": 17893475216640.0, "grad_norm": 2.792617140085992, "language_loss": 0.90197724, "learning_rate": 3.808095651090769e-06, "loss": 0.92349696, "num_input_tokens_seen": 59855345, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.63671875, "step": 2767, "time_per_iteration": 2.393547296524048 }, { "auxiliary_loss_clip": 0.01025557, "auxiliary_loss_mlp": 0.01017246, "balance_loss_clip": 1.00880623, "balance_loss_mlp": 1.00772238, "epoch": 0.16642116338493912, "flos": 66722335159680.0, "grad_norm": 0.6532341541765665, "language_loss": 0.52998656, "learning_rate": 3.8079291486915447e-06, "loss": 0.55041462, "num_input_tokens_seen": 59917710, "router_z_loss_clip": 0.08447266, "router_z_loss_mlp": 0.17773438, "step": 2768, "time_per_iteration": 3.1636197566986084 }, { "auxiliary_loss_clip": 0.01093826, "auxiliary_loss_mlp": 0.01067203, "balance_loss_clip": 1.02187955, "balance_loss_mlp": 1.02571464, "epoch": 0.16648128663760708, "flos": 19025045159040.0, "grad_norm": 2.2753863533059158, "language_loss": 0.88566965, "learning_rate": 3.8077625777354667e-06, "loss": 0.90727997, "num_input_tokens_seen": 59935105, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.6796875, "step": 2769, "time_per_iteration": 2.3921074867248535 }, { "auxiliary_loss_clip": 0.01023739, "auxiliary_loss_mlp": 0.01012871, "balance_loss_clip": 1.00462151, "balance_loss_mlp": 1.00586343, "epoch": 0.16654140989027508, "flos": 70131744535680.0, "grad_norm": 0.814591491707449, "language_loss": 0.57643324, "learning_rate": 3.80759593822885e-06, "loss": 0.59679937, "num_input_tokens_seen": 59984085, "router_z_loss_clip": 0.08251953, "router_z_loss_mlp": 0.17871094, "step": 2770, "time_per_iteration": 2.901458501815796 }, { "auxiliary_loss_clip": 0.01021962, "auxiliary_loss_mlp": 0.01013024, "balance_loss_clip": 1.00506115, "balance_loss_mlp": 1.0043304, "epoch": 0.16660153314294304, "flos": 70269407493120.0, "grad_norm": 0.8685549315047858, "language_loss": 0.56272244, "learning_rate": 3.807429230178015e-06, "loss": 0.5830723, "num_input_tokens_seen": 60043470, "router_z_loss_clip": 0.07958984, "router_z_loss_mlp": 0.17578125, "step": 2771, "time_per_iteration": 2.896737575531006 }, { "auxiliary_loss_clip": 0.01091314, "auxiliary_loss_mlp": 0.01075012, "balance_loss_clip": 1.02704263, "balance_loss_mlp": 1.02591109, "epoch": 0.166661656395611, "flos": 23073957642240.0, "grad_norm": 2.0332154926987855, "language_loss": 0.72690505, "learning_rate": 3.8072624535892817e-06, "loss": 0.7485683, "num_input_tokens_seen": 60063045, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.65625, "step": 2772, "time_per_iteration": 2.4210855960845947 }, { "auxiliary_loss_clip": 0.01089599, "auxiliary_loss_mlp": 0.01076538, "balance_loss_clip": 1.03080952, "balance_loss_mlp": 1.02465582, "epoch": 0.16672177964827897, "flos": 28365079766400.0, "grad_norm": 2.029207375792823, "language_loss": 0.8746177, "learning_rate": 3.807095608468975e-06, "loss": 0.8962791, "num_input_tokens_seen": 60081945, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.6484375, "step": 2773, "time_per_iteration": 2.439413070678711 }, { "auxiliary_loss_clip": 0.01091207, "auxiliary_loss_mlp": 0.01059769, "balance_loss_clip": 1.0193094, "balance_loss_mlp": 1.02734828, "epoch": 0.16678190290094694, "flos": 19090228400640.0, "grad_norm": 2.3064932494663783, "language_loss": 0.83679879, "learning_rate": 3.8069286948234224e-06, "loss": 0.85830855, "num_input_tokens_seen": 60096820, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.63671875, "step": 2774, "time_per_iteration": 2.4070403575897217 }, { "auxiliary_loss_clip": 0.01094846, "auxiliary_loss_mlp": 0.01060216, "balance_loss_clip": 1.01830184, "balance_loss_mlp": 1.02864218, "epoch": 0.1668420261536149, "flos": 21798021761280.0, "grad_norm": 2.0684483057720993, "language_loss": 0.85034013, "learning_rate": 3.806761712658952e-06, "loss": 0.87189078, "num_input_tokens_seen": 60116140, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 0.6640625, "step": 2775, "time_per_iteration": 2.40207576751709 }, { "auxiliary_loss_clip": 0.01091657, "auxiliary_loss_mlp": 0.01063227, "balance_loss_clip": 1.02155209, "balance_loss_mlp": 1.02725029, "epoch": 0.16690214940628287, "flos": 19061529396480.0, "grad_norm": 2.0821496837326947, "language_loss": 0.82717186, "learning_rate": 3.806594661981897e-06, "loss": 0.84872073, "num_input_tokens_seen": 60134235, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.64453125, "step": 2776, "time_per_iteration": 2.3937885761260986 }, { "auxiliary_loss_clip": 0.01087707, "auxiliary_loss_mlp": 0.01069525, "balance_loss_clip": 1.02868366, "balance_loss_mlp": 1.02592278, "epoch": 0.16696227265895086, "flos": 18587548512000.0, "grad_norm": 2.136259099147055, "language_loss": 0.8116473, "learning_rate": 3.8064275427985906e-06, "loss": 0.83321959, "num_input_tokens_seen": 60153275, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.6171875, "step": 2777, "time_per_iteration": 2.4002158641815186 }, { "auxiliary_loss_clip": 0.0109262, "auxiliary_loss_mlp": 0.01062301, "balance_loss_clip": 1.01976669, "balance_loss_mlp": 1.0271982, "epoch": 0.16702239591161883, "flos": 23293037623680.0, "grad_norm": 3.3325808203982556, "language_loss": 0.86160427, "learning_rate": 3.806260355115371e-06, "loss": 0.88315356, "num_input_tokens_seen": 60173215, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.65625, "step": 2778, "time_per_iteration": 2.4349942207336426 }, { "auxiliary_loss_clip": 0.01095508, "auxiliary_loss_mlp": 0.01068136, "balance_loss_clip": 1.02317047, "balance_loss_mlp": 1.02934504, "epoch": 0.1670825191642868, "flos": 24424502832000.0, "grad_norm": 2.04750349274943, "language_loss": 0.75941694, "learning_rate": 3.8060930989385778e-06, "loss": 0.78105336, "num_input_tokens_seen": 60190515, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.6640625, "step": 2779, "time_per_iteration": 2.468214750289917 }, { "auxiliary_loss_clip": 0.0109291, "auxiliary_loss_mlp": 0.01068536, "balance_loss_clip": 1.02490544, "balance_loss_mlp": 1.02726984, "epoch": 0.16714264241695476, "flos": 26796292467840.0, "grad_norm": 2.7949213821781527, "language_loss": 0.67962003, "learning_rate": 3.805925774274554e-06, "loss": 0.70123452, "num_input_tokens_seen": 60211655, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.65625, "step": 2780, "time_per_iteration": 2.4597458839416504 }, { "auxiliary_loss_clip": 0.01095954, "auxiliary_loss_mlp": 0.01065961, "balance_loss_clip": 1.01856315, "balance_loss_mlp": 1.02972507, "epoch": 0.16720276566962272, "flos": 21834226707840.0, "grad_norm": 2.037810793045841, "language_loss": 0.80807018, "learning_rate": 3.805758381129643e-06, "loss": 0.82968938, "num_input_tokens_seen": 60230860, "router_z_loss_clip": 0.47460938, "router_z_loss_mlp": 0.6640625, "step": 2781, "time_per_iteration": 2.4106314182281494 }, { "auxiliary_loss_clip": 0.01094679, "auxiliary_loss_mlp": 0.01060651, "balance_loss_clip": 1.0164485, "balance_loss_mlp": 1.02724838, "epoch": 0.1672628889222907, "flos": 21469349422080.0, "grad_norm": 1.4387507377797755, "language_loss": 0.76775312, "learning_rate": 3.805590919510193e-06, "loss": 0.7893064, "num_input_tokens_seen": 60250535, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.67578125, "step": 2782, "time_per_iteration": 2.443311929702759 }, { "auxiliary_loss_clip": 0.010983, "auxiliary_loss_mlp": 0.01073267, "balance_loss_clip": 1.02756262, "balance_loss_mlp": 1.02845621, "epoch": 0.16732301217495865, "flos": 30772690323840.0, "grad_norm": 2.189455448213355, "language_loss": 0.69594377, "learning_rate": 3.8054233894225547e-06, "loss": 0.71765947, "num_input_tokens_seen": 60269530, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.69921875, "step": 2783, "time_per_iteration": 3.917137622833252 }, { "auxiliary_loss_clip": 0.01093422, "auxiliary_loss_mlp": 0.01072136, "balance_loss_clip": 1.02669358, "balance_loss_mlp": 1.02789831, "epoch": 0.16738313542762664, "flos": 23473573597440.0, "grad_norm": 1.6714100616770964, "language_loss": 0.71307266, "learning_rate": 3.805255790873081e-06, "loss": 0.73472822, "num_input_tokens_seen": 60289900, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 0.65625, "step": 2784, "time_per_iteration": 2.445460319519043 }, { "auxiliary_loss_clip": 0.0109254, "auxiliary_loss_mlp": 0.0107582, "balance_loss_clip": 1.02582383, "balance_loss_mlp": 1.02506375, "epoch": 0.1674432586802946, "flos": 29787790469760.0, "grad_norm": 1.8435815162394726, "language_loss": 0.63103914, "learning_rate": 3.805088123868126e-06, "loss": 0.65272272, "num_input_tokens_seen": 60310025, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.67578125, "step": 2785, "time_per_iteration": 3.964520215988159 }, { "auxiliary_loss_clip": 0.01024513, "auxiliary_loss_mlp": 0.01012005, "balance_loss_clip": 1.00323153, "balance_loss_mlp": 1.00698185, "epoch": 0.16750338193296258, "flos": 66132547695360.0, "grad_norm": 0.7825227151365929, "language_loss": 0.58961439, "learning_rate": 3.8049203884140492e-06, "loss": 0.60997951, "num_input_tokens_seen": 60377800, "router_z_loss_clip": 0.08789062, "router_z_loss_mlp": 0.17578125, "step": 2786, "time_per_iteration": 3.0586705207824707 }, { "auxiliary_loss_clip": 0.0109328, "auxiliary_loss_mlp": 0.01069216, "balance_loss_clip": 1.02255785, "balance_loss_mlp": 1.02573967, "epoch": 0.16756350518563054, "flos": 25695760590720.0, "grad_norm": 2.2494316508196097, "language_loss": 0.78478754, "learning_rate": 3.80475258451721e-06, "loss": 0.80641246, "num_input_tokens_seen": 60398215, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 0.67578125, "step": 2787, "time_per_iteration": 5.32574725151062 }, { "auxiliary_loss_clip": 0.01095044, "auxiliary_loss_mlp": 0.01069856, "balance_loss_clip": 1.02489054, "balance_loss_mlp": 1.02803338, "epoch": 0.1676236284382985, "flos": 23835134304000.0, "grad_norm": 1.6677961699951511, "language_loss": 0.79439384, "learning_rate": 3.804584712183972e-06, "loss": 0.81604278, "num_input_tokens_seen": 60416910, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.671875, "step": 2788, "time_per_iteration": 2.4263904094696045 }, { "auxiliary_loss_clip": 0.01024282, "auxiliary_loss_mlp": 0.01018732, "balance_loss_clip": 1.0111984, "balance_loss_mlp": 1.00728643, "epoch": 0.16768375169096647, "flos": 59872167872640.0, "grad_norm": 0.867902365954242, "language_loss": 0.594733, "learning_rate": 3.8044167714207013e-06, "loss": 0.61516315, "num_input_tokens_seen": 60468660, "router_z_loss_clip": 0.07519531, "router_z_loss_mlp": 0.16992188, "step": 2789, "time_per_iteration": 2.900134563446045 }, { "auxiliary_loss_clip": 0.01091549, "auxiliary_loss_mlp": 0.0107826, "balance_loss_clip": 1.0288837, "balance_loss_mlp": 1.02462959, "epoch": 0.16774387494363446, "flos": 38434135806720.0, "grad_norm": 1.525805535185926, "language_loss": 0.71776664, "learning_rate": 3.804248762233765e-06, "loss": 0.73946476, "num_input_tokens_seen": 60492370, "router_z_loss_clip": 0.49414062, "router_z_loss_mlp": 0.66796875, "step": 2790, "time_per_iteration": 2.5661370754241943 }, { "auxiliary_loss_clip": 0.01090705, "auxiliary_loss_mlp": 0.01074322, "balance_loss_clip": 1.02890301, "balance_loss_mlp": 1.02492142, "epoch": 0.16780399819630243, "flos": 22636530817920.0, "grad_norm": 1.5365242704082918, "language_loss": 0.80947655, "learning_rate": 3.8040806846295356e-06, "loss": 0.83112681, "num_input_tokens_seen": 60512655, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 0.65625, "step": 2791, "time_per_iteration": 2.401360034942627 }, { "auxiliary_loss_clip": 0.01094044, "auxiliary_loss_mlp": 0.01077105, "balance_loss_clip": 1.03223491, "balance_loss_mlp": 1.02581787, "epoch": 0.1678641214489704, "flos": 32890102727040.0, "grad_norm": 2.461871611199629, "language_loss": 0.73667097, "learning_rate": 3.8039125386143853e-06, "loss": 0.75838244, "num_input_tokens_seen": 60533090, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.68359375, "step": 2792, "time_per_iteration": 2.5113272666931152 }, { "auxiliary_loss_clip": 0.01096819, "auxiliary_loss_mlp": 0.01065061, "balance_loss_clip": 1.0216217, "balance_loss_mlp": 1.02692533, "epoch": 0.16792424470163836, "flos": 19973879712000.0, "grad_norm": 1.9149757130230727, "language_loss": 0.73280507, "learning_rate": 3.803744324194691e-06, "loss": 0.75442392, "num_input_tokens_seen": 60553190, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 0.69921875, "step": 2793, "time_per_iteration": 2.3997018337249756 }, { "auxiliary_loss_clip": 0.01095139, "auxiliary_loss_mlp": 0.01060778, "balance_loss_clip": 1.01543045, "balance_loss_mlp": 1.02732563, "epoch": 0.16798436795430632, "flos": 19718839163520.0, "grad_norm": 2.8224003523386365, "language_loss": 0.79046798, "learning_rate": 3.803576041376831e-06, "loss": 0.81202716, "num_input_tokens_seen": 60571995, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.6796875, "step": 2794, "time_per_iteration": 2.4007251262664795 }, { "auxiliary_loss_clip": 0.01092973, "auxiliary_loss_mlp": 0.01068821, "balance_loss_clip": 1.02387929, "balance_loss_mlp": 1.02664173, "epoch": 0.1680444912069743, "flos": 28103755173120.0, "grad_norm": 5.099615268587083, "language_loss": 0.73345435, "learning_rate": 3.803407690167187e-06, "loss": 0.75507236, "num_input_tokens_seen": 60591275, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.6640625, "step": 2795, "time_per_iteration": 2.439584255218506 }, { "auxiliary_loss_clip": 0.01091668, "auxiliary_loss_mlp": 0.01063246, "balance_loss_clip": 1.01873302, "balance_loss_mlp": 1.02502477, "epoch": 0.16810461445964225, "flos": 18074290481280.0, "grad_norm": 1.775111465305368, "language_loss": 0.8533355, "learning_rate": 3.803239270572142e-06, "loss": 0.87488467, "num_input_tokens_seen": 60609235, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.6640625, "step": 2796, "time_per_iteration": 2.3776962757110596 }, { "auxiliary_loss_clip": 0.01095268, "auxiliary_loss_mlp": 0.01068788, "balance_loss_clip": 1.02127147, "balance_loss_mlp": 1.02758515, "epoch": 0.16816473771231025, "flos": 23877518561280.0, "grad_norm": 1.706767131724614, "language_loss": 0.83410811, "learning_rate": 3.8030707825980838e-06, "loss": 0.85574871, "num_input_tokens_seen": 60629880, "router_z_loss_clip": 0.47460938, "router_z_loss_mlp": 0.67578125, "step": 2797, "time_per_iteration": 2.420983076095581 }, { "auxiliary_loss_clip": 0.01088298, "auxiliary_loss_mlp": 0.01053304, "balance_loss_clip": 1.01515734, "balance_loss_mlp": 1.02574921, "epoch": 0.1682248609649782, "flos": 22782502679040.0, "grad_norm": 1.382584802726107, "language_loss": 0.76569343, "learning_rate": 3.802902226251401e-06, "loss": 0.78710943, "num_input_tokens_seen": 60651175, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.625, "step": 2798, "time_per_iteration": 2.4325733184814453 }, { "auxiliary_loss_clip": 0.01094637, "auxiliary_loss_mlp": 0.01064014, "balance_loss_clip": 1.01973939, "balance_loss_mlp": 1.0278883, "epoch": 0.16828498421764618, "flos": 20704053219840.0, "grad_norm": 1.5284822655969978, "language_loss": 0.81118733, "learning_rate": 3.8027336015384845e-06, "loss": 0.83277386, "num_input_tokens_seen": 60670210, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.66796875, "step": 2799, "time_per_iteration": 2.4079232215881348 }, { "auxiliary_loss_clip": 0.01095266, "auxiliary_loss_mlp": 0.01078377, "balance_loss_clip": 1.02978754, "balance_loss_mlp": 1.02584505, "epoch": 0.16834510747031414, "flos": 29419422048000.0, "grad_norm": 2.04990122905547, "language_loss": 0.72826451, "learning_rate": 3.8025649084657296e-06, "loss": 0.75000095, "num_input_tokens_seen": 60690895, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 0.6953125, "step": 2800, "time_per_iteration": 2.4630932807922363 }, { "auxiliary_loss_clip": 0.01092803, "auxiliary_loss_mlp": 0.01063212, "balance_loss_clip": 1.01781678, "balance_loss_mlp": 1.02688348, "epoch": 0.1684052307229821, "flos": 18144535870080.0, "grad_norm": 1.7474245568222944, "language_loss": 0.8524878, "learning_rate": 3.8023961470395326e-06, "loss": 0.87404794, "num_input_tokens_seen": 60708280, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.66015625, "step": 2801, "time_per_iteration": 2.3590078353881836 }, { "auxiliary_loss_clip": 0.01092891, "auxiliary_loss_mlp": 0.01067795, "balance_loss_clip": 1.02333045, "balance_loss_mlp": 1.02633595, "epoch": 0.16846535397565007, "flos": 16574177560320.0, "grad_norm": 2.2529465228494354, "language_loss": 0.86171067, "learning_rate": 3.8022273172662933e-06, "loss": 0.88331759, "num_input_tokens_seen": 60724150, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.6640625, "step": 2802, "time_per_iteration": 2.3782804012298584 }, { "auxiliary_loss_clip": 0.01097434, "auxiliary_loss_mlp": 0.01066881, "balance_loss_clip": 1.01962686, "balance_loss_mlp": 1.02692091, "epoch": 0.16852547722831807, "flos": 30407568658560.0, "grad_norm": 1.5950719908345647, "language_loss": 0.83168805, "learning_rate": 3.802058419152413e-06, "loss": 0.85333121, "num_input_tokens_seen": 60746485, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.703125, "step": 2803, "time_per_iteration": 2.4855194091796875 }, { "auxiliary_loss_clip": 0.01092785, "auxiliary_loss_mlp": 0.01064887, "balance_loss_clip": 1.0205884, "balance_loss_mlp": 1.02781117, "epoch": 0.16858560048098603, "flos": 33506110488960.0, "grad_norm": 2.356662869916808, "language_loss": 0.79995322, "learning_rate": 3.801889452704297e-06, "loss": 0.82152992, "num_input_tokens_seen": 60762875, "router_z_loss_clip": 0.44335938, "router_z_loss_mlp": 0.6484375, "step": 2804, "time_per_iteration": 2.5529115200042725 }, { "auxiliary_loss_clip": 0.01023339, "auxiliary_loss_mlp": 0.01008624, "balance_loss_clip": 1.00089884, "balance_loss_mlp": 1.00572586, "epoch": 0.168645723733654, "flos": 67367111748480.0, "grad_norm": 0.8294134436018508, "language_loss": 0.55504894, "learning_rate": 3.8017204179283526e-06, "loss": 0.57536858, "num_input_tokens_seen": 60825510, "router_z_loss_clip": 0.07714844, "router_z_loss_mlp": 0.17578125, "step": 2805, "time_per_iteration": 2.975346326828003 }, { "auxiliary_loss_clip": 0.0109067, "auxiliary_loss_mlp": 0.01060328, "balance_loss_clip": 1.01996338, "balance_loss_mlp": 1.02566147, "epoch": 0.16870584698632196, "flos": 21323552117760.0, "grad_norm": 1.8883964826912536, "language_loss": 0.73888111, "learning_rate": 3.8015513148309892e-06, "loss": 0.76039106, "num_input_tokens_seen": 60844440, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.6484375, "step": 2806, "time_per_iteration": 2.417182207107544 }, { "auxiliary_loss_clip": 0.01090625, "auxiliary_loss_mlp": 0.01063053, "balance_loss_clip": 1.02082884, "balance_loss_mlp": 1.02597785, "epoch": 0.16876597023898993, "flos": 20739699584640.0, "grad_norm": 1.7346409261006361, "language_loss": 0.71404123, "learning_rate": 3.80138214341862e-06, "loss": 0.735578, "num_input_tokens_seen": 60863210, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.6484375, "step": 2807, "time_per_iteration": 2.405721664428711 }, { "auxiliary_loss_clip": 0.01091665, "auxiliary_loss_mlp": 0.01066247, "balance_loss_clip": 1.01992249, "balance_loss_mlp": 1.02509046, "epoch": 0.1688260934916579, "flos": 20302447317120.0, "grad_norm": 2.377249928402245, "language_loss": 0.7285319, "learning_rate": 3.8012129036976587e-06, "loss": 0.75011098, "num_input_tokens_seen": 60882510, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 0.6640625, "step": 2808, "time_per_iteration": 2.411302328109741 }, { "auxiliary_loss_clip": 0.01091956, "auxiliary_loss_mlp": 0.01065891, "balance_loss_clip": 1.02106786, "balance_loss_mlp": 1.02544224, "epoch": 0.16888621674432586, "flos": 20339629781760.0, "grad_norm": 2.3646287421784695, "language_loss": 0.81897187, "learning_rate": 3.8010435956745236e-06, "loss": 0.84055036, "num_input_tokens_seen": 60901105, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.6640625, "step": 2809, "time_per_iteration": 2.400893211364746 }, { "auxiliary_loss_clip": 0.01095708, "auxiliary_loss_mlp": 0.01062693, "balance_loss_clip": 1.01944363, "balance_loss_mlp": 1.02665591, "epoch": 0.16894633999699385, "flos": 16244108766720.0, "grad_norm": 2.0378585544721473, "language_loss": 0.9052316, "learning_rate": 3.8008742193556358e-06, "loss": 0.92681563, "num_input_tokens_seen": 60915340, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.69140625, "step": 2810, "time_per_iteration": 2.3855783939361572 }, { "auxiliary_loss_clip": 0.010969, "auxiliary_loss_mlp": 0.010714, "balance_loss_clip": 1.02574301, "balance_loss_mlp": 1.02747667, "epoch": 0.16900646324966181, "flos": 19609142071680.0, "grad_norm": 1.8785568446537682, "language_loss": 0.94044548, "learning_rate": 3.800704774747416e-06, "loss": 0.96212852, "num_input_tokens_seen": 60933735, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.6953125, "step": 2811, "time_per_iteration": 2.393899917602539 }, { "auxiliary_loss_clip": 0.0109317, "auxiliary_loss_mlp": 0.01064237, "balance_loss_clip": 1.02079678, "balance_loss_mlp": 1.02594984, "epoch": 0.16906658650232978, "flos": 22016997008640.0, "grad_norm": 2.0360706760987233, "language_loss": 0.80971998, "learning_rate": 3.800535261856291e-06, "loss": 0.83129406, "num_input_tokens_seen": 60953105, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.671875, "step": 2812, "time_per_iteration": 2.423426866531372 }, { "auxiliary_loss_clip": 0.01092422, "auxiliary_loss_mlp": 0.01065571, "balance_loss_clip": 1.02346683, "balance_loss_mlp": 1.02685845, "epoch": 0.16912670975499774, "flos": 11762936340480.0, "grad_norm": 2.325365374136123, "language_loss": 0.77186131, "learning_rate": 3.8003656806886887e-06, "loss": 0.79344118, "num_input_tokens_seen": 60969150, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.65625, "step": 2813, "time_per_iteration": 2.361151933670044 }, { "auxiliary_loss_clip": 0.0109578, "auxiliary_loss_mlp": 0.01069475, "balance_loss_clip": 1.02398443, "balance_loss_mlp": 1.0260067, "epoch": 0.1691868330076657, "flos": 17160543711360.0, "grad_norm": 2.3636441629704663, "language_loss": 0.71642256, "learning_rate": 3.8001960312510396e-06, "loss": 0.73807508, "num_input_tokens_seen": 60982825, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 0.69921875, "step": 2814, "time_per_iteration": 2.3779735565185547 }, { "auxiliary_loss_clip": 0.01091392, "auxiliary_loss_mlp": 0.01061905, "balance_loss_clip": 1.01577103, "balance_loss_mlp": 1.0256629, "epoch": 0.16924695626033368, "flos": 22415530711680.0, "grad_norm": 3.1133592841790625, "language_loss": 0.6339041, "learning_rate": 3.800026313549776e-06, "loss": 0.65543711, "num_input_tokens_seen": 61000875, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.65625, "step": 2815, "time_per_iteration": 2.399751663208008 }, { "auxiliary_loss_clip": 0.0109106, "auxiliary_loss_mlp": 0.01060372, "balance_loss_clip": 1.02007937, "balance_loss_mlp": 1.02477145, "epoch": 0.16930707951300164, "flos": 25738459050240.0, "grad_norm": 1.5544173376299129, "language_loss": 0.83343053, "learning_rate": 3.7998565275913342e-06, "loss": 0.85494483, "num_input_tokens_seen": 61021940, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.6640625, "step": 2816, "time_per_iteration": 2.4539413452148438 }, { "auxiliary_loss_clip": 0.01093484, "auxiliary_loss_mlp": 0.01067237, "balance_loss_clip": 1.02212799, "balance_loss_mlp": 1.02522147, "epoch": 0.16936720276566963, "flos": 22745948618880.0, "grad_norm": 1.9986814531228778, "language_loss": 0.89108241, "learning_rate": 3.799686673382153e-06, "loss": 0.91268969, "num_input_tokens_seen": 61040285, "router_z_loss_clip": 0.45117188, "router_z_loss_mlp": 0.6796875, "step": 2817, "time_per_iteration": 2.4144058227539062 }, { "auxiliary_loss_clip": 0.01093106, "auxiliary_loss_mlp": 0.01068669, "balance_loss_clip": 1.02084184, "balance_loss_mlp": 1.02678072, "epoch": 0.1694273260183376, "flos": 19572937125120.0, "grad_norm": 1.6312176305661494, "language_loss": 0.82893324, "learning_rate": 3.799516750928672e-06, "loss": 0.85055101, "num_input_tokens_seen": 61059020, "router_z_loss_clip": 0.47851562, "router_z_loss_mlp": 0.6640625, "step": 2818, "time_per_iteration": 2.4073991775512695 }, { "auxiliary_loss_clip": 0.01091673, "auxiliary_loss_mlp": 0.01073756, "balance_loss_clip": 1.02635825, "balance_loss_mlp": 1.02404082, "epoch": 0.16948744927100556, "flos": 12457044547200.0, "grad_norm": 3.2134526476407332, "language_loss": 0.83228564, "learning_rate": 3.799346760237336e-06, "loss": 0.85393989, "num_input_tokens_seen": 61074245, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.67578125, "step": 2819, "time_per_iteration": 2.383348226547241 }, { "auxiliary_loss_clip": 0.01024571, "auxiliary_loss_mlp": 0.01022505, "balance_loss_clip": 1.01382637, "balance_loss_mlp": 1.00536227, "epoch": 0.16954757252367353, "flos": 71288731814400.0, "grad_norm": 0.9578316048820912, "language_loss": 0.61405838, "learning_rate": 3.7991767013145902e-06, "loss": 0.63452911, "num_input_tokens_seen": 61127080, "router_z_loss_clip": 0.08691406, "router_z_loss_mlp": 0.19140625, "step": 2820, "time_per_iteration": 2.918581247329712 }, { "auxiliary_loss_clip": 0.01093271, "auxiliary_loss_mlp": 0.01073771, "balance_loss_clip": 1.02828026, "balance_loss_mlp": 1.02458072, "epoch": 0.1696076957763415, "flos": 29605229637120.0, "grad_norm": 1.865177375554526, "language_loss": 0.80895323, "learning_rate": 3.7990065741668844e-06, "loss": 0.83062363, "num_input_tokens_seen": 61146955, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 0.6875, "step": 2821, "time_per_iteration": 2.476318597793579 }, { "auxiliary_loss_clip": 0.01092765, "auxiliary_loss_mlp": 0.01069327, "balance_loss_clip": 1.02228749, "balance_loss_mlp": 1.02621293, "epoch": 0.16966781902900946, "flos": 24387460012800.0, "grad_norm": 2.0926119293000447, "language_loss": 0.80663157, "learning_rate": 3.7988363788006685e-06, "loss": 0.82825243, "num_input_tokens_seen": 61166605, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 0.6640625, "step": 2822, "time_per_iteration": 3.856463670730591 }, { "auxiliary_loss_clip": 0.01090651, "auxiliary_loss_mlp": 0.01066435, "balance_loss_clip": 1.02118349, "balance_loss_mlp": 1.02448606, "epoch": 0.16972794228167745, "flos": 23037717784320.0, "grad_norm": 1.6736904869094582, "language_loss": 0.77210093, "learning_rate": 3.7986661152223967e-06, "loss": 0.79367185, "num_input_tokens_seen": 61186535, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.66015625, "step": 2823, "time_per_iteration": 2.4150795936584473 }, { "auxiliary_loss_clip": 0.01094506, "auxiliary_loss_mlp": 0.01073399, "balance_loss_clip": 1.02717018, "balance_loss_mlp": 1.0256331, "epoch": 0.16978806553434542, "flos": 35227153693440.0, "grad_norm": 1.751131267386468, "language_loss": 0.60404825, "learning_rate": 3.7984957834385257e-06, "loss": 0.6257273, "num_input_tokens_seen": 61208965, "router_z_loss_clip": 0.46289062, "router_z_loss_mlp": 0.6875, "step": 2824, "time_per_iteration": 2.5323829650878906 }, { "auxiliary_loss_clip": 0.010927, "auxiliary_loss_mlp": 0.01075737, "balance_loss_clip": 1.03062844, "balance_loss_mlp": 1.02600312, "epoch": 0.16984818878701338, "flos": 32012944928640.0, "grad_norm": 1.6840524864540858, "language_loss": 0.74705148, "learning_rate": 3.7983253834555144e-06, "loss": 0.76873583, "num_input_tokens_seen": 61230670, "router_z_loss_clip": 0.45117188, "router_z_loss_mlp": 0.6640625, "step": 2825, "time_per_iteration": 3.881988525390625 }, { "auxiliary_loss_clip": 0.01095974, "auxiliary_loss_mlp": 0.01083982, "balance_loss_clip": 1.03386664, "balance_loss_mlp": 1.0247283, "epoch": 0.16990831203968135, "flos": 22817555550720.0, "grad_norm": 1.9377285476898294, "language_loss": 0.87846041, "learning_rate": 3.7981549152798245e-06, "loss": 0.90025997, "num_input_tokens_seen": 61249510, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.7109375, "step": 2826, "time_per_iteration": 3.8416035175323486 }, { "auxiliary_loss_clip": 0.01095079, "auxiliary_loss_mlp": 0.01078206, "balance_loss_clip": 1.03145254, "balance_loss_mlp": 1.02447474, "epoch": 0.1699684352923493, "flos": 23038485834240.0, "grad_norm": 1.626655203869992, "language_loss": 0.83590013, "learning_rate": 3.7979843789179196e-06, "loss": 0.85763299, "num_input_tokens_seen": 61269440, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 0.70703125, "step": 2827, "time_per_iteration": 3.8422253131866455 }, { "auxiliary_loss_clip": 0.01095027, "auxiliary_loss_mlp": 0.01073493, "balance_loss_clip": 1.02333021, "balance_loss_mlp": 1.02551496, "epoch": 0.17002855854501728, "flos": 21433039741440.0, "grad_norm": 2.128925568326487, "language_loss": 0.75779223, "learning_rate": 3.797813774376267e-06, "loss": 0.77947748, "num_input_tokens_seen": 61288195, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 0.6953125, "step": 2828, "time_per_iteration": 2.4118363857269287 }, { "auxiliary_loss_clip": 0.01025202, "auxiliary_loss_mlp": 0.01015542, "balance_loss_clip": 1.00762641, "balance_loss_mlp": 1.00500631, "epoch": 0.17008868179768524, "flos": 71450099585280.0, "grad_norm": 0.7917697809161098, "language_loss": 0.56626511, "learning_rate": 3.797643101661336e-06, "loss": 0.58667254, "num_input_tokens_seen": 61350850, "router_z_loss_clip": 0.07910156, "router_z_loss_mlp": 0.20214844, "step": 2829, "time_per_iteration": 3.067166566848755 }, { "auxiliary_loss_clip": 0.01091603, "auxiliary_loss_mlp": 0.01066404, "balance_loss_clip": 1.02482343, "balance_loss_mlp": 1.02419639, "epoch": 0.17014880505035324, "flos": 24899147032320.0, "grad_norm": 1.7196391893583634, "language_loss": 0.85071129, "learning_rate": 3.7974723607795983e-06, "loss": 0.87229133, "num_input_tokens_seen": 61370765, "router_z_loss_clip": 0.41601562, "router_z_loss_mlp": 0.671875, "step": 2830, "time_per_iteration": 2.4402270317077637 }, { "auxiliary_loss_clip": 0.0109456, "auxiliary_loss_mlp": 0.01065927, "balance_loss_clip": 1.01924455, "balance_loss_mlp": 1.02663517, "epoch": 0.1702089283030212, "flos": 29861108058240.0, "grad_norm": 1.9270029980710022, "language_loss": 0.81109655, "learning_rate": 3.797301551737529e-06, "loss": 0.83270144, "num_input_tokens_seen": 61388935, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 0.6796875, "step": 2831, "time_per_iteration": 2.5193099975585938 }, { "auxiliary_loss_clip": 0.0109337, "auxiliary_loss_mlp": 0.01068804, "balance_loss_clip": 1.02276587, "balance_loss_mlp": 1.02478743, "epoch": 0.17026905155568917, "flos": 17743348903680.0, "grad_norm": 1.6812705303883722, "language_loss": 0.81523043, "learning_rate": 3.7971306745416044e-06, "loss": 0.83685213, "num_input_tokens_seen": 61407350, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.6875, "step": 2832, "time_per_iteration": 2.4307305812835693 }, { "auxiliary_loss_clip": 0.01092446, "auxiliary_loss_mlp": 0.0107651, "balance_loss_clip": 1.03087664, "balance_loss_mlp": 1.02484059, "epoch": 0.17032917480835713, "flos": 23147554521600.0, "grad_norm": 1.6856733648519924, "language_loss": 0.90962052, "learning_rate": 3.7969597291983046e-06, "loss": 0.93131006, "num_input_tokens_seen": 61429010, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.67578125, "step": 2833, "time_per_iteration": 2.421616792678833 }, { "auxiliary_loss_clip": 0.01092926, "auxiliary_loss_mlp": 0.01069102, "balance_loss_clip": 1.02399349, "balance_loss_mlp": 1.02570987, "epoch": 0.1703892980610251, "flos": 39201003020160.0, "grad_norm": 2.2873112119177934, "language_loss": 0.74799186, "learning_rate": 3.7967887157141115e-06, "loss": 0.76961219, "num_input_tokens_seen": 61450040, "router_z_loss_clip": 0.45117188, "router_z_loss_mlp": 0.671875, "step": 2834, "time_per_iteration": 2.5521669387817383 }, { "auxiliary_loss_clip": 0.01095933, "auxiliary_loss_mlp": 0.01067311, "balance_loss_clip": 1.02241635, "balance_loss_mlp": 1.02685952, "epoch": 0.17044942131369306, "flos": 23037997075200.0, "grad_norm": 2.2584313265656175, "language_loss": 0.888789, "learning_rate": 3.7966176340955106e-06, "loss": 0.91042137, "num_input_tokens_seen": 61468585, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.69140625, "step": 2835, "time_per_iteration": 2.406376838684082 }, { "auxiliary_loss_clip": 0.01097036, "auxiliary_loss_mlp": 0.01077636, "balance_loss_clip": 1.0244211, "balance_loss_mlp": 1.02696443, "epoch": 0.17050954456636103, "flos": 17054058464640.0, "grad_norm": 2.8093964956035515, "language_loss": 0.771348, "learning_rate": 3.796446484348989e-06, "loss": 0.79309475, "num_input_tokens_seen": 61486330, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 0.703125, "step": 2836, "time_per_iteration": 2.4281187057495117 }, { "auxiliary_loss_clip": 0.0109823, "auxiliary_loss_mlp": 0.01077617, "balance_loss_clip": 1.02592754, "balance_loss_mlp": 1.02751422, "epoch": 0.17056966781902902, "flos": 16836025824000.0, "grad_norm": 2.3312670562003843, "language_loss": 0.821051, "learning_rate": 3.796275266481036e-06, "loss": 0.8428095, "num_input_tokens_seen": 61503950, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.70703125, "step": 2837, "time_per_iteration": 2.458406448364258 }, { "auxiliary_loss_clip": 0.01091271, "auxiliary_loss_mlp": 0.01072088, "balance_loss_clip": 1.02829099, "balance_loss_mlp": 1.0270071, "epoch": 0.17062979107169698, "flos": 17711577699840.0, "grad_norm": 1.7072664403600089, "language_loss": 0.8567369, "learning_rate": 3.7961039804981456e-06, "loss": 0.87837046, "num_input_tokens_seen": 61523550, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.64453125, "step": 2838, "time_per_iteration": 2.469435453414917 }, { "auxiliary_loss_clip": 0.01090102, "auxiliary_loss_mlp": 0.01059607, "balance_loss_clip": 1.01816952, "balance_loss_mlp": 1.02592552, "epoch": 0.17068991432436495, "flos": 22524040817280.0, "grad_norm": 1.6544007514271024, "language_loss": 0.95071912, "learning_rate": 3.795932626406812e-06, "loss": 0.97221625, "num_input_tokens_seen": 61542720, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.640625, "step": 2839, "time_per_iteration": 2.43489670753479 }, { "auxiliary_loss_clip": 0.01092947, "auxiliary_loss_mlp": 0.01068756, "balance_loss_clip": 1.02088141, "balance_loss_mlp": 1.02543569, "epoch": 0.17075003757703291, "flos": 25881812559360.0, "grad_norm": 2.0410307943300716, "language_loss": 0.85349417, "learning_rate": 3.7957612042135336e-06, "loss": 0.87511122, "num_input_tokens_seen": 61563040, "router_z_loss_clip": 0.47851562, "router_z_loss_mlp": 0.67578125, "step": 2840, "time_per_iteration": 2.4728140830993652 }, { "auxiliary_loss_clip": 0.01091487, "auxiliary_loss_mlp": 0.01065869, "balance_loss_clip": 1.0204978, "balance_loss_mlp": 1.02458382, "epoch": 0.17081016082970088, "flos": 20119677016320.0, "grad_norm": 1.7944528870396306, "language_loss": 0.78457022, "learning_rate": 3.79558971392481e-06, "loss": 0.8061437, "num_input_tokens_seen": 61581890, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.66796875, "step": 2841, "time_per_iteration": 2.455179452896118 }, { "auxiliary_loss_clip": 0.01090766, "auxiliary_loss_mlp": 0.01066414, "balance_loss_clip": 1.01975513, "balance_loss_mlp": 1.02391315, "epoch": 0.17087028408236885, "flos": 24935317067520.0, "grad_norm": 1.8311591088719055, "language_loss": 0.78259361, "learning_rate": 3.7954181555471443e-06, "loss": 0.80416536, "num_input_tokens_seen": 61602095, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 0.66796875, "step": 2842, "time_per_iteration": 2.453768014907837 }, { "auxiliary_loss_clip": 0.01088348, "auxiliary_loss_mlp": 0.01064896, "balance_loss_clip": 1.01973987, "balance_loss_mlp": 1.02416992, "epoch": 0.17093040733503684, "flos": 19056990919680.0, "grad_norm": 2.347448659326227, "language_loss": 0.8680855, "learning_rate": 3.795246529087043e-06, "loss": 0.88961798, "num_input_tokens_seen": 61620400, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.640625, "step": 2843, "time_per_iteration": 2.430903196334839 }, { "auxiliary_loss_clip": 0.01089378, "auxiliary_loss_mlp": 0.01064965, "balance_loss_clip": 1.02264547, "balance_loss_mlp": 1.02520728, "epoch": 0.1709905305877048, "flos": 13078114456320.0, "grad_norm": 1.757724554712279, "language_loss": 0.70827973, "learning_rate": 3.7950748345510126e-06, "loss": 0.72982317, "num_input_tokens_seen": 61637680, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 0.640625, "step": 2844, "time_per_iteration": 2.3674488067626953 }, { "auxiliary_loss_clip": 0.01092279, "auxiliary_loss_mlp": 0.01060944, "balance_loss_clip": 1.0175997, "balance_loss_mlp": 1.02771211, "epoch": 0.17105065384037277, "flos": 19208304218880.0, "grad_norm": 1.6626378599259997, "language_loss": 0.79371041, "learning_rate": 3.7949030719455646e-06, "loss": 0.81524265, "num_input_tokens_seen": 61655630, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.6484375, "step": 2845, "time_per_iteration": 2.37955379486084 }, { "auxiliary_loss_clip": 0.01090959, "auxiliary_loss_mlp": 0.01055603, "balance_loss_clip": 1.01449919, "balance_loss_mlp": 1.02574313, "epoch": 0.17111077709304073, "flos": 18514196012160.0, "grad_norm": 2.3811232664417163, "language_loss": 0.80395436, "learning_rate": 3.7947312412772127e-06, "loss": 0.82542002, "num_input_tokens_seen": 61673475, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.65234375, "step": 2846, "time_per_iteration": 2.3786847591400146 }, { "auxiliary_loss_clip": 0.01089206, "auxiliary_loss_mlp": 0.01069906, "balance_loss_clip": 1.0286119, "balance_loss_mlp": 1.02437866, "epoch": 0.1711709003457087, "flos": 25081498396800.0, "grad_norm": 1.9916801296997746, "language_loss": 0.81481111, "learning_rate": 3.794559342552472e-06, "loss": 0.83640224, "num_input_tokens_seen": 61693370, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.6484375, "step": 2847, "time_per_iteration": 2.4318573474884033 }, { "auxiliary_loss_clip": 0.01088987, "auxiliary_loss_mlp": 0.01067246, "balance_loss_clip": 1.02640545, "balance_loss_mlp": 1.02371383, "epoch": 0.17123102359837666, "flos": 17565431281920.0, "grad_norm": 2.322745653189173, "language_loss": 0.89611942, "learning_rate": 3.7943873757778614e-06, "loss": 0.91768175, "num_input_tokens_seen": 61710820, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.65234375, "step": 2848, "time_per_iteration": 2.371429681777954 }, { "auxiliary_loss_clip": 0.01090049, "auxiliary_loss_mlp": 0.01058474, "balance_loss_clip": 1.01415241, "balance_loss_mlp": 1.02399111, "epoch": 0.17129114685104463, "flos": 26172534384000.0, "grad_norm": 2.014456980993039, "language_loss": 0.77098888, "learning_rate": 3.794215340959902e-06, "loss": 0.79247415, "num_input_tokens_seen": 61729855, "router_z_loss_clip": 0.44335938, "router_z_loss_mlp": 0.66015625, "step": 2849, "time_per_iteration": 2.4634225368499756 }, { "auxiliary_loss_clip": 0.01025478, "auxiliary_loss_mlp": 0.01025236, "balance_loss_clip": 1.01898992, "balance_loss_mlp": 1.00735235, "epoch": 0.17135127010371262, "flos": 69266212220160.0, "grad_norm": 0.8014429701659052, "language_loss": 0.57607508, "learning_rate": 3.7940432381051163e-06, "loss": 0.59658229, "num_input_tokens_seen": 61790290, "router_z_loss_clip": 0.0625, "router_z_loss_mlp": 0.18164062, "step": 2850, "time_per_iteration": 3.016446590423584 }, { "auxiliary_loss_clip": 0.01088152, "auxiliary_loss_mlp": 0.01061125, "balance_loss_clip": 1.02071261, "balance_loss_mlp": 1.02571094, "epoch": 0.1714113933563806, "flos": 23548985867520.0, "grad_norm": 2.08154463439929, "language_loss": 0.82437003, "learning_rate": 3.793871067220031e-06, "loss": 0.84586281, "num_input_tokens_seen": 61809265, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.625, "step": 2851, "time_per_iteration": 2.412357807159424 }, { "auxiliary_loss_clip": 0.01086579, "auxiliary_loss_mlp": 0.01054264, "balance_loss_clip": 1.01406705, "balance_loss_mlp": 1.02386236, "epoch": 0.17147151660904855, "flos": 21141375310080.0, "grad_norm": 2.943319179369485, "language_loss": 0.9570325, "learning_rate": 3.7936988283111764e-06, "loss": 0.97844088, "num_input_tokens_seen": 61828980, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.625, "step": 2852, "time_per_iteration": 2.443964719772339 }, { "auxiliary_loss_clip": 0.01090196, "auxiliary_loss_mlp": 0.01068709, "balance_loss_clip": 1.02686667, "balance_loss_mlp": 1.02409673, "epoch": 0.17153163986171652, "flos": 18623893104000.0, "grad_norm": 1.8171999551438784, "language_loss": 0.70537043, "learning_rate": 3.7935265213850817e-06, "loss": 0.72695947, "num_input_tokens_seen": 61847915, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.66015625, "step": 2853, "time_per_iteration": 2.401047706604004 }, { "auxiliary_loss_clip": 0.01094033, "auxiliary_loss_mlp": 0.01062486, "balance_loss_clip": 1.01971376, "balance_loss_mlp": 1.02656257, "epoch": 0.17159176311438448, "flos": 18222287201280.0, "grad_norm": 2.0516479640448533, "language_loss": 0.69866461, "learning_rate": 3.7933541464482815e-06, "loss": 0.7202298, "num_input_tokens_seen": 61865570, "router_z_loss_clip": 0.42773438, "router_z_loss_mlp": 0.67578125, "step": 2854, "time_per_iteration": 2.400339126586914 }, { "auxiliary_loss_clip": 0.01086326, "auxiliary_loss_mlp": 0.01058941, "balance_loss_clip": 1.01795685, "balance_loss_mlp": 1.02289236, "epoch": 0.17165188636705245, "flos": 20737988928000.0, "grad_norm": 1.6499920789907063, "language_loss": 0.90296578, "learning_rate": 3.7931817035073124e-06, "loss": 0.92441845, "num_input_tokens_seen": 61883340, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.6328125, "step": 2855, "time_per_iteration": 2.3948287963867188 }, { "auxiliary_loss_clip": 0.01092296, "auxiliary_loss_mlp": 0.01063254, "balance_loss_clip": 1.0206964, "balance_loss_mlp": 1.02629983, "epoch": 0.17171200961972044, "flos": 24898728096000.0, "grad_norm": 2.126830905089671, "language_loss": 0.86010611, "learning_rate": 3.7930091925687134e-06, "loss": 0.88166165, "num_input_tokens_seen": 61900610, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.66015625, "step": 2856, "time_per_iteration": 2.4358468055725098 }, { "auxiliary_loss_clip": 0.01090769, "auxiliary_loss_mlp": 0.01068782, "balance_loss_clip": 1.02715468, "balance_loss_mlp": 1.02634645, "epoch": 0.1717721328723884, "flos": 20156196165120.0, "grad_norm": 2.0773448047747047, "language_loss": 0.87851989, "learning_rate": 3.792836613639026e-06, "loss": 0.90011537, "num_input_tokens_seen": 61916795, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.64453125, "step": 2857, "time_per_iteration": 2.401097536087036 }, { "auxiliary_loss_clip": 0.01090152, "auxiliary_loss_mlp": 0.01069214, "balance_loss_clip": 1.02803922, "balance_loss_mlp": 1.02598619, "epoch": 0.17183225612505637, "flos": 23360699571840.0, "grad_norm": 2.056305273230322, "language_loss": 0.79866272, "learning_rate": 3.7926639667247947e-06, "loss": 0.82025635, "num_input_tokens_seen": 61936665, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 0.640625, "step": 2858, "time_per_iteration": 2.396958827972412 }, { "auxiliary_loss_clip": 0.01098051, "auxiliary_loss_mlp": 0.01070496, "balance_loss_clip": 1.02450514, "balance_loss_mlp": 1.02694249, "epoch": 0.17189237937772434, "flos": 18113253425280.0, "grad_norm": 1.763734713206636, "language_loss": 0.78848308, "learning_rate": 3.7924912518325663e-06, "loss": 0.81016856, "num_input_tokens_seen": 61954415, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.7109375, "step": 2859, "time_per_iteration": 2.3705272674560547 }, { "auxiliary_loss_clip": 0.01087962, "auxiliary_loss_mlp": 0.01055665, "balance_loss_clip": 1.01449049, "balance_loss_mlp": 1.02440035, "epoch": 0.1719525026303923, "flos": 23257286524800.0, "grad_norm": 1.8610753537587248, "language_loss": 0.776173, "learning_rate": 3.7923184689688902e-06, "loss": 0.79760921, "num_input_tokens_seen": 61973940, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 0.6328125, "step": 2860, "time_per_iteration": 2.4063658714294434 }, { "auxiliary_loss_clip": 0.0109193, "auxiliary_loss_mlp": 0.01061987, "balance_loss_clip": 1.01706851, "balance_loss_mlp": 1.02594209, "epoch": 0.17201262588306027, "flos": 20809456214400.0, "grad_norm": 2.1402244413536726, "language_loss": 0.82587159, "learning_rate": 3.792145618140317e-06, "loss": 0.84741068, "num_input_tokens_seen": 61991845, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.66015625, "step": 2861, "time_per_iteration": 2.4642581939697266 }, { "auxiliary_loss_clip": 0.01090158, "auxiliary_loss_mlp": 0.01064441, "balance_loss_clip": 1.02135921, "balance_loss_mlp": 1.02513468, "epoch": 0.17207274913572823, "flos": 20374822298880.0, "grad_norm": 2.332865779112885, "language_loss": 0.88171089, "learning_rate": 3.7919726993534038e-06, "loss": 0.90325689, "num_input_tokens_seen": 62009395, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.6484375, "step": 2862, "time_per_iteration": 3.8884477615356445 }, { "auxiliary_loss_clip": 0.01085168, "auxiliary_loss_mlp": 0.0105586, "balance_loss_clip": 1.01597285, "balance_loss_mlp": 1.02435887, "epoch": 0.17213287238839622, "flos": 26796501936000.0, "grad_norm": 1.842860367672586, "language_loss": 0.79576755, "learning_rate": 3.7917997126147054e-06, "loss": 0.81717777, "num_input_tokens_seen": 62029005, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.609375, "step": 2863, "time_per_iteration": 2.443203926086426 }, { "auxiliary_loss_clip": 0.01089886, "auxiliary_loss_mlp": 0.01059486, "balance_loss_clip": 1.01761961, "balance_loss_mlp": 1.02480996, "epoch": 0.1721929956410642, "flos": 26029634722560.0, "grad_norm": 1.8480385034423394, "language_loss": 0.74562615, "learning_rate": 3.7916266579307823e-06, "loss": 0.76711988, "num_input_tokens_seen": 62048730, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.65234375, "step": 2864, "time_per_iteration": 3.865826368331909 }, { "auxiliary_loss_clip": 0.01091071, "auxiliary_loss_mlp": 0.01066078, "balance_loss_clip": 1.02795434, "balance_loss_mlp": 1.02619588, "epoch": 0.17225311889373215, "flos": 22272002645760.0, "grad_norm": 1.6351578351136464, "language_loss": 0.74111384, "learning_rate": 3.7914535353081973e-06, "loss": 0.7626853, "num_input_tokens_seen": 62069000, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.6484375, "step": 2865, "time_per_iteration": 2.3896028995513916 }, { "auxiliary_loss_clip": 0.01089291, "auxiliary_loss_mlp": 0.010688, "balance_loss_clip": 1.02624273, "balance_loss_mlp": 1.02543771, "epoch": 0.17231324214640012, "flos": 21286718766720.0, "grad_norm": 2.043121400930538, "language_loss": 0.80341017, "learning_rate": 3.7912803447535145e-06, "loss": 0.82499111, "num_input_tokens_seen": 62086750, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.640625, "step": 2866, "time_per_iteration": 3.809830665588379 }, { "auxiliary_loss_clip": 0.01092415, "auxiliary_loss_mlp": 0.01063541, "balance_loss_clip": 1.02081633, "balance_loss_mlp": 1.0259279, "epoch": 0.17237336539906808, "flos": 19679771485440.0, "grad_norm": 2.001604475815061, "language_loss": 0.81287777, "learning_rate": 3.7911070862733016e-06, "loss": 0.83443731, "num_input_tokens_seen": 62106240, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.6640625, "step": 2867, "time_per_iteration": 3.8203063011169434 }, { "auxiliary_loss_clip": 0.0108853, "auxiliary_loss_mlp": 0.01060792, "balance_loss_clip": 1.0195694, "balance_loss_mlp": 1.02425981, "epoch": 0.17243348865173605, "flos": 17528702664960.0, "grad_norm": 2.130211316937863, "language_loss": 0.80520242, "learning_rate": 3.7909337598741276e-06, "loss": 0.82669562, "num_input_tokens_seen": 62124895, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.640625, "step": 2868, "time_per_iteration": 2.39717435836792 }, { "auxiliary_loss_clip": 0.01095225, "auxiliary_loss_mlp": 0.01063995, "balance_loss_clip": 1.02069783, "balance_loss_mlp": 1.02827501, "epoch": 0.17249361190440402, "flos": 18258876172800.0, "grad_norm": 2.253349580675669, "language_loss": 0.85268378, "learning_rate": 3.7907603655625674e-06, "loss": 0.87427604, "num_input_tokens_seen": 62143510, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.66796875, "step": 2869, "time_per_iteration": 2.4184956550598145 }, { "auxiliary_loss_clip": 0.01091667, "auxiliary_loss_mlp": 0.01072226, "balance_loss_clip": 1.02833319, "balance_loss_mlp": 1.02560723, "epoch": 0.172553735157072, "flos": 21173425804800.0, "grad_norm": 2.0158062909707986, "language_loss": 0.79092187, "learning_rate": 3.7905869033451932e-06, "loss": 0.81256074, "num_input_tokens_seen": 62162285, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.66015625, "step": 2870, "time_per_iteration": 2.423748731613159 }, { "auxiliary_loss_clip": 0.01086343, "auxiliary_loss_mlp": 0.01060011, "balance_loss_clip": 1.02143502, "balance_loss_mlp": 1.02345431, "epoch": 0.17261385840973997, "flos": 22272177202560.0, "grad_norm": 2.5248432630015873, "language_loss": 0.79267114, "learning_rate": 3.7904133732285857e-06, "loss": 0.81413472, "num_input_tokens_seen": 62180970, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.625, "step": 2871, "time_per_iteration": 2.4080138206481934 }, { "auxiliary_loss_clip": 0.01092304, "auxiliary_loss_mlp": 0.01068253, "balance_loss_clip": 1.02488458, "balance_loss_mlp": 1.0263412, "epoch": 0.17267398166240794, "flos": 27921159429120.0, "grad_norm": 2.4889666753116795, "language_loss": 0.76858968, "learning_rate": 3.7902397752193228e-06, "loss": 0.79019523, "num_input_tokens_seen": 62198965, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.66015625, "step": 2872, "time_per_iteration": 2.4459519386291504 }, { "auxiliary_loss_clip": 0.0108538, "auxiliary_loss_mlp": 0.01058044, "balance_loss_clip": 1.01691675, "balance_loss_mlp": 1.02305019, "epoch": 0.1727341049150759, "flos": 21944028533760.0, "grad_norm": 1.7133838906470993, "language_loss": 0.8329708, "learning_rate": 3.790066109323988e-06, "loss": 0.85440505, "num_input_tokens_seen": 62219890, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 0.625, "step": 2873, "time_per_iteration": 2.4300153255462646 }, { "auxiliary_loss_clip": 0.01088716, "auxiliary_loss_mlp": 0.01063228, "balance_loss_clip": 1.0189774, "balance_loss_mlp": 1.02495396, "epoch": 0.17279422816774387, "flos": 18107074114560.0, "grad_norm": 1.9751021867256773, "language_loss": 0.76727509, "learning_rate": 3.7898923755491678e-06, "loss": 0.78879452, "num_input_tokens_seen": 62237140, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.640625, "step": 2874, "time_per_iteration": 2.3731637001037598 }, { "auxiliary_loss_clip": 0.01090574, "auxiliary_loss_mlp": 0.01074873, "balance_loss_clip": 1.02530599, "balance_loss_mlp": 1.02485073, "epoch": 0.17285435142041183, "flos": 21834366353280.0, "grad_norm": 2.7186990155197464, "language_loss": 0.82897699, "learning_rate": 3.7897185739014487e-06, "loss": 0.85063148, "num_input_tokens_seen": 62255405, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 0.65625, "step": 2875, "time_per_iteration": 2.4122555255889893 }, { "auxiliary_loss_clip": 0.01095177, "auxiliary_loss_mlp": 0.01073274, "balance_loss_clip": 1.02430272, "balance_loss_mlp": 1.02667856, "epoch": 0.17291447467307983, "flos": 18367491012480.0, "grad_norm": 2.3318906605914753, "language_loss": 0.91021979, "learning_rate": 3.7895447043874217e-06, "loss": 0.93190426, "num_input_tokens_seen": 62271280, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.6875, "step": 2876, "time_per_iteration": 2.353543281555176 }, { "auxiliary_loss_clip": 0.0108946, "auxiliary_loss_mlp": 0.01064435, "balance_loss_clip": 1.02247381, "balance_loss_mlp": 1.02576244, "epoch": 0.1729745979257478, "flos": 18623648724480.0, "grad_norm": 1.725976610517751, "language_loss": 0.8641333, "learning_rate": 3.789370767013681e-06, "loss": 0.88567227, "num_input_tokens_seen": 62289140, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 0.63671875, "step": 2877, "time_per_iteration": 2.3814995288848877 }, { "auxiliary_loss_clip": 0.01093803, "auxiliary_loss_mlp": 0.01070975, "balance_loss_clip": 1.02364922, "balance_loss_mlp": 1.02731514, "epoch": 0.17303472117841576, "flos": 22997253651840.0, "grad_norm": 2.1444974179637284, "language_loss": 0.81142867, "learning_rate": 3.7891967617868204e-06, "loss": 0.83307648, "num_input_tokens_seen": 62307490, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.6640625, "step": 2878, "time_per_iteration": 2.3977184295654297 }, { "auxiliary_loss_clip": 0.01089788, "auxiliary_loss_mlp": 0.01066502, "balance_loss_clip": 1.02136946, "balance_loss_mlp": 1.0251019, "epoch": 0.17309484443108372, "flos": 25663256248320.0, "grad_norm": 1.6555079331606155, "language_loss": 0.7190547, "learning_rate": 3.78902268871344e-06, "loss": 0.74061757, "num_input_tokens_seen": 62328570, "router_z_loss_clip": 0.45117188, "router_z_loss_mlp": 0.6484375, "step": 2879, "time_per_iteration": 2.4285013675689697 }, { "auxiliary_loss_clip": 0.01090438, "auxiliary_loss_mlp": 0.01067652, "balance_loss_clip": 1.02039778, "balance_loss_mlp": 1.02314425, "epoch": 0.1731549676837517, "flos": 13552060429440.0, "grad_norm": 2.0589374673783594, "language_loss": 0.85347188, "learning_rate": 3.78884854780014e-06, "loss": 0.87505287, "num_input_tokens_seen": 62345735, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.671875, "step": 2880, "time_per_iteration": 2.3559110164642334 }, { "auxiliary_loss_clip": 0.01093063, "auxiliary_loss_mlp": 0.01065678, "balance_loss_clip": 1.01684988, "balance_loss_mlp": 1.02497661, "epoch": 0.17321509093641965, "flos": 22855959912960.0, "grad_norm": 1.9413701568488353, "language_loss": 0.83029056, "learning_rate": 3.7886743390535236e-06, "loss": 0.85187805, "num_input_tokens_seen": 62365525, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 0.6796875, "step": 2881, "time_per_iteration": 2.4011948108673096 }, { "auxiliary_loss_clip": 0.01091507, "auxiliary_loss_mlp": 0.01062467, "balance_loss_clip": 1.01926541, "balance_loss_mlp": 1.02626657, "epoch": 0.17327521418908762, "flos": 24351639091200.0, "grad_norm": 1.959092305494506, "language_loss": 0.78930008, "learning_rate": 3.788500062480197e-06, "loss": 0.81083977, "num_input_tokens_seen": 62385160, "router_z_loss_clip": 0.43164062, "router_z_loss_mlp": 0.65234375, "step": 2882, "time_per_iteration": 2.4336893558502197 }, { "auxiliary_loss_clip": 0.01091673, "auxiliary_loss_mlp": 0.01065719, "balance_loss_clip": 1.02080059, "balance_loss_mlp": 1.02672148, "epoch": 0.1733353374417556, "flos": 33104364940800.0, "grad_norm": 2.06214303358596, "language_loss": 0.77585185, "learning_rate": 3.788325718086769e-06, "loss": 0.79742575, "num_input_tokens_seen": 62405280, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.6484375, "step": 2883, "time_per_iteration": 2.531148910522461 }, { "auxiliary_loss_clip": 0.01089575, "auxiliary_loss_mlp": 0.01062584, "balance_loss_clip": 1.02107573, "balance_loss_mlp": 1.02517271, "epoch": 0.17339546069442358, "flos": 24387809126400.0, "grad_norm": 2.219275878996124, "language_loss": 0.87246788, "learning_rate": 3.7881513058798503e-06, "loss": 0.89398956, "num_input_tokens_seen": 62423665, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.64453125, "step": 2884, "time_per_iteration": 2.4422266483306885 }, { "auxiliary_loss_clip": 0.01093311, "auxiliary_loss_mlp": 0.01080063, "balance_loss_clip": 1.0334518, "balance_loss_mlp": 1.02620149, "epoch": 0.17345558394709154, "flos": 27452938919040.0, "grad_norm": 1.7148885603740223, "language_loss": 0.76168227, "learning_rate": 3.787976825866055e-06, "loss": 0.78341603, "num_input_tokens_seen": 62445170, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 0.671875, "step": 2885, "time_per_iteration": 2.439279079437256 }, { "auxiliary_loss_clip": 0.01086862, "auxiliary_loss_mlp": 0.01065454, "balance_loss_clip": 1.02525687, "balance_loss_mlp": 1.02570796, "epoch": 0.1735157071997595, "flos": 24680974746240.0, "grad_norm": 1.5842489108610747, "language_loss": 0.72438949, "learning_rate": 3.7878022780519998e-06, "loss": 0.74591267, "num_input_tokens_seen": 62466135, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.609375, "step": 2886, "time_per_iteration": 2.4342730045318604 }, { "auxiliary_loss_clip": 0.01091524, "auxiliary_loss_mlp": 0.01066169, "balance_loss_clip": 1.01939154, "balance_loss_mlp": 1.02485204, "epoch": 0.17357583045242747, "flos": 21687870821760.0, "grad_norm": 2.147681637425942, "language_loss": 0.71361846, "learning_rate": 3.7876276624443024e-06, "loss": 0.7351954, "num_input_tokens_seen": 62483910, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 0.6640625, "step": 2887, "time_per_iteration": 2.437958240509033 }, { "auxiliary_loss_clip": 0.01091698, "auxiliary_loss_mlp": 0.01073901, "balance_loss_clip": 1.02683735, "balance_loss_mlp": 1.02610159, "epoch": 0.17363595370509544, "flos": 15374875847040.0, "grad_norm": 1.9358917692345845, "language_loss": 0.8657006, "learning_rate": 3.787452979049585e-06, "loss": 0.88735664, "num_input_tokens_seen": 62501530, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 0.65625, "step": 2888, "time_per_iteration": 2.421739339828491 }, { "auxiliary_loss_clip": 0.01091755, "auxiliary_loss_mlp": 0.01065567, "balance_loss_clip": 1.01993394, "balance_loss_mlp": 1.02645934, "epoch": 0.1736960769577634, "flos": 23439812446080.0, "grad_norm": 2.2776633270140545, "language_loss": 0.80712545, "learning_rate": 3.7872782278744718e-06, "loss": 0.82869864, "num_input_tokens_seen": 62521295, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.65234375, "step": 2889, "time_per_iteration": 2.4313101768493652 }, { "auxiliary_loss_clip": 0.01086424, "auxiliary_loss_mlp": 0.01065279, "balance_loss_clip": 1.02243495, "balance_loss_mlp": 1.0244174, "epoch": 0.1737562002104314, "flos": 18586850284800.0, "grad_norm": 2.33987683777248, "language_loss": 0.8695032, "learning_rate": 3.7871034089255883e-06, "loss": 0.89102018, "num_input_tokens_seen": 62539615, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.6171875, "step": 2890, "time_per_iteration": 2.370270252227783 }, { "auxiliary_loss_clip": 0.01092103, "auxiliary_loss_mlp": 0.01063365, "balance_loss_clip": 1.01968646, "balance_loss_mlp": 1.02601039, "epoch": 0.17381632346309936, "flos": 15997132742400.0, "grad_norm": 2.0173457717488796, "language_loss": 0.84001702, "learning_rate": 3.7869285222095653e-06, "loss": 0.86157167, "num_input_tokens_seen": 62556820, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.66015625, "step": 2891, "time_per_iteration": 2.3745503425598145 }, { "auxiliary_loss_clip": 0.01092097, "auxiliary_loss_mlp": 0.01070298, "balance_loss_clip": 1.02411628, "balance_loss_mlp": 1.02544606, "epoch": 0.17387644671576732, "flos": 13369010837760.0, "grad_norm": 2.079217038439262, "language_loss": 0.83039296, "learning_rate": 3.7867535677330334e-06, "loss": 0.85201693, "num_input_tokens_seen": 62572450, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.6640625, "step": 2892, "time_per_iteration": 2.347869396209717 }, { "auxiliary_loss_clip": 0.0109466, "auxiliary_loss_mlp": 0.01077507, "balance_loss_clip": 1.02534091, "balance_loss_mlp": 1.02787924, "epoch": 0.1739365699684353, "flos": 26614290216960.0, "grad_norm": 1.9204135060563383, "language_loss": 0.76770782, "learning_rate": 3.786578545502627e-06, "loss": 0.78942955, "num_input_tokens_seen": 62592580, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 0.66796875, "step": 2893, "time_per_iteration": 2.4391705989837646 }, { "auxiliary_loss_clip": 0.01092058, "auxiliary_loss_mlp": 0.01072547, "balance_loss_clip": 1.02283621, "balance_loss_mlp": 1.02635539, "epoch": 0.17399669322110325, "flos": 23366843971200.0, "grad_norm": 2.305887795467626, "language_loss": 0.83406007, "learning_rate": 3.7864034555249828e-06, "loss": 0.8557061, "num_input_tokens_seen": 62611220, "router_z_loss_clip": 0.49804688, "router_z_loss_mlp": 0.65625, "step": 2894, "time_per_iteration": 2.400878429412842 }, { "auxiliary_loss_clip": 0.01089722, "auxiliary_loss_mlp": 0.01066506, "balance_loss_clip": 1.01984727, "balance_loss_mlp": 1.02459931, "epoch": 0.17405681647377122, "flos": 22053027398400.0, "grad_norm": 2.1427415387874214, "language_loss": 0.75733209, "learning_rate": 3.786228297806741e-06, "loss": 0.77889431, "num_input_tokens_seen": 62629185, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 0.65234375, "step": 2895, "time_per_iteration": 2.4011738300323486 }, { "auxiliary_loss_clip": 0.01025027, "auxiliary_loss_mlp": 0.01008268, "balance_loss_clip": 1.00154495, "balance_loss_mlp": 1.00728679, "epoch": 0.1741169397264392, "flos": 61454396044800.0, "grad_norm": 0.8902278389024223, "language_loss": 0.62882751, "learning_rate": 3.7860530723545435e-06, "loss": 0.64916044, "num_input_tokens_seen": 62691895, "router_z_loss_clip": 0.06738281, "router_z_loss_mlp": 0.17773438, "step": 2896, "time_per_iteration": 3.0961220264434814 }, { "auxiliary_loss_clip": 0.01088869, "auxiliary_loss_mlp": 0.01062283, "balance_loss_clip": 1.01896286, "balance_loss_mlp": 1.02338195, "epoch": 0.17417706297910718, "flos": 27016419790080.0, "grad_norm": 1.8453671968280196, "language_loss": 0.77417445, "learning_rate": 3.785877779175034e-06, "loss": 0.79568589, "num_input_tokens_seen": 62713790, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.65625, "step": 2897, "time_per_iteration": 2.4469895362854004 }, { "auxiliary_loss_clip": 0.01088662, "auxiliary_loss_mlp": 0.01071551, "balance_loss_clip": 1.02744365, "balance_loss_mlp": 1.02574635, "epoch": 0.17423718623177514, "flos": 33507506943360.0, "grad_norm": 2.125949905240734, "language_loss": 0.70511973, "learning_rate": 3.7857024182748606e-06, "loss": 0.72672188, "num_input_tokens_seen": 62736285, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.62890625, "step": 2898, "time_per_iteration": 2.4773776531219482 }, { "auxiliary_loss_clip": 0.01091993, "auxiliary_loss_mlp": 0.01072233, "balance_loss_clip": 1.02447808, "balance_loss_mlp": 1.02548468, "epoch": 0.1742973094844431, "flos": 27197409611520.0, "grad_norm": 2.309683355679883, "language_loss": 0.78572631, "learning_rate": 3.7855269896606717e-06, "loss": 0.80736858, "num_input_tokens_seen": 62756240, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.6640625, "step": 2899, "time_per_iteration": 2.435570478439331 }, { "auxiliary_loss_clip": 0.01087748, "auxiliary_loss_mlp": 0.01067096, "balance_loss_clip": 1.02277434, "balance_loss_mlp": 1.02474427, "epoch": 0.17435743273711107, "flos": 22709638938240.0, "grad_norm": 2.211621101480241, "language_loss": 0.74694598, "learning_rate": 3.785351493339121e-06, "loss": 0.76849449, "num_input_tokens_seen": 62775910, "router_z_loss_clip": 0.44335938, "router_z_loss_mlp": 0.6328125, "step": 2900, "time_per_iteration": 2.4122655391693115 }, { "auxiliary_loss_clip": 0.01093288, "auxiliary_loss_mlp": 0.01071154, "balance_loss_clip": 1.02687955, "balance_loss_mlp": 1.02783132, "epoch": 0.17441755598977904, "flos": 41644853435520.0, "grad_norm": 1.8231338749612178, "language_loss": 0.7149592, "learning_rate": 3.785175929316863e-06, "loss": 0.73660362, "num_input_tokens_seen": 62799385, "router_z_loss_clip": 0.44335938, "router_z_loss_mlp": 0.65625, "step": 2901, "time_per_iteration": 4.0488200187683105 }, { "auxiliary_loss_clip": 0.0109378, "auxiliary_loss_mlp": 0.01070795, "balance_loss_clip": 1.02695036, "balance_loss_mlp": 1.02655005, "epoch": 0.174477679242447, "flos": 26285862257280.0, "grad_norm": 1.696730553977846, "language_loss": 0.77205765, "learning_rate": 3.7850002976005543e-06, "loss": 0.79370344, "num_input_tokens_seen": 62819380, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.671875, "step": 2902, "time_per_iteration": 2.431731700897217 }, { "auxiliary_loss_clip": 0.01092196, "auxiliary_loss_mlp": 0.0108078, "balance_loss_clip": 1.03688669, "balance_loss_mlp": 1.02548313, "epoch": 0.174537802495115, "flos": 17857444826880.0, "grad_norm": 2.325862379307491, "language_loss": 0.83158028, "learning_rate": 3.7848245981968558e-06, "loss": 0.85331005, "num_input_tokens_seen": 62836205, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 0.66796875, "step": 2903, "time_per_iteration": 3.811264991760254 }, { "auxiliary_loss_clip": 0.01087772, "auxiliary_loss_mlp": 0.01065544, "balance_loss_clip": 1.02212834, "balance_loss_mlp": 1.02493596, "epoch": 0.17459792574778296, "flos": 16939927630080.0, "grad_norm": 1.778342493653196, "language_loss": 0.74906021, "learning_rate": 3.784648831112429e-06, "loss": 0.7705934, "num_input_tokens_seen": 62854045, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.62890625, "step": 2904, "time_per_iteration": 2.3795006275177 }, { "auxiliary_loss_clip": 0.01089669, "auxiliary_loss_mlp": 0.01060206, "balance_loss_clip": 1.01647973, "balance_loss_mlp": 1.02560961, "epoch": 0.17465804900045093, "flos": 25518855398400.0, "grad_norm": 3.361783911647681, "language_loss": 0.66569066, "learning_rate": 3.7844729963539406e-06, "loss": 0.6871894, "num_input_tokens_seen": 62873075, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.640625, "step": 2905, "time_per_iteration": 2.4227616786956787 }, { "auxiliary_loss_clip": 0.01095816, "auxiliary_loss_mlp": 0.01071252, "balance_loss_clip": 1.0260005, "balance_loss_mlp": 1.02737045, "epoch": 0.1747181722531189, "flos": 24128683948800.0, "grad_norm": 2.0351061643150565, "language_loss": 0.81126302, "learning_rate": 3.7842970939280566e-06, "loss": 0.83293366, "num_input_tokens_seen": 62892675, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.68359375, "step": 2906, "time_per_iteration": 3.9045958518981934 }, { "auxiliary_loss_clip": 0.01091531, "auxiliary_loss_mlp": 0.01075309, "balance_loss_clip": 1.03368115, "balance_loss_mlp": 1.02680981, "epoch": 0.17477829550578686, "flos": 17747852469120.0, "grad_norm": 1.7452977449841942, "language_loss": 0.83257782, "learning_rate": 3.784121123841449e-06, "loss": 0.85424626, "num_input_tokens_seen": 62910675, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.6484375, "step": 2907, "time_per_iteration": 3.767719030380249 }, { "auxiliary_loss_clip": 0.01088334, "auxiliary_loss_mlp": 0.01073022, "balance_loss_clip": 1.02994001, "balance_loss_mlp": 1.02476311, "epoch": 0.17483841875845482, "flos": 15376446858240.0, "grad_norm": 1.9210044062239655, "language_loss": 0.83407009, "learning_rate": 3.7839450861007886e-06, "loss": 0.85568357, "num_input_tokens_seen": 62928130, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.63671875, "step": 2908, "time_per_iteration": 2.384033203125 }, { "auxiliary_loss_clip": 0.01090446, "auxiliary_loss_mlp": 0.01073959, "balance_loss_clip": 1.02780151, "balance_loss_mlp": 1.02590287, "epoch": 0.17489854201112282, "flos": 17162359102080.0, "grad_norm": 2.5856077688131824, "language_loss": 0.83098608, "learning_rate": 3.7837689807127518e-06, "loss": 0.85263014, "num_input_tokens_seen": 62944290, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.6484375, "step": 2909, "time_per_iteration": 2.36572265625 }, { "auxiliary_loss_clip": 0.01092034, "auxiliary_loss_mlp": 0.01066829, "balance_loss_clip": 1.02424788, "balance_loss_mlp": 1.02703702, "epoch": 0.17495866526379078, "flos": 19754276060160.0, "grad_norm": 1.9003662677130055, "language_loss": 0.78629875, "learning_rate": 3.783592807684017e-06, "loss": 0.80788743, "num_input_tokens_seen": 62963505, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.6484375, "step": 2910, "time_per_iteration": 2.434605360031128 }, { "auxiliary_loss_clip": 0.0108969, "auxiliary_loss_mlp": 0.0106356, "balance_loss_clip": 1.0205493, "balance_loss_mlp": 1.02534711, "epoch": 0.17501878851645875, "flos": 28509899552640.0, "grad_norm": 1.83108964243515, "language_loss": 0.87899262, "learning_rate": 3.7834165670212645e-06, "loss": 0.90052509, "num_input_tokens_seen": 62985020, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.64453125, "step": 2911, "time_per_iteration": 2.4561171531677246 }, { "auxiliary_loss_clip": 0.01086353, "auxiliary_loss_mlp": 0.01062566, "balance_loss_clip": 1.02050877, "balance_loss_mlp": 1.02299929, "epoch": 0.1750789117691267, "flos": 17930238744960.0, "grad_norm": 5.3857464009912634, "language_loss": 0.92041099, "learning_rate": 3.7832402587311764e-06, "loss": 0.94190013, "num_input_tokens_seen": 63001745, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 0.6328125, "step": 2912, "time_per_iteration": 2.3590400218963623 }, { "auxiliary_loss_clip": 0.01091721, "auxiliary_loss_mlp": 0.01068017, "balance_loss_clip": 1.02317047, "balance_loss_mlp": 1.02568412, "epoch": 0.17513903502179468, "flos": 18258457236480.0, "grad_norm": 2.3782396807398385, "language_loss": 0.74478644, "learning_rate": 3.783063882820439e-06, "loss": 0.76638377, "num_input_tokens_seen": 63019750, "router_z_loss_clip": 0.44726562, "router_z_loss_mlp": 0.66015625, "step": 2913, "time_per_iteration": 2.3830840587615967 }, { "auxiliary_loss_clip": 0.01089038, "auxiliary_loss_mlp": 0.01062526, "balance_loss_clip": 1.01818037, "balance_loss_mlp": 1.02552581, "epoch": 0.17519915827446264, "flos": 20703669194880.0, "grad_norm": 2.2455793451257215, "language_loss": 0.70798314, "learning_rate": 3.782887439295741e-06, "loss": 0.72949874, "num_input_tokens_seen": 63039500, "router_z_loss_clip": 0.44335938, "router_z_loss_mlp": 0.6328125, "step": 2914, "time_per_iteration": 2.4031383991241455 }, { "auxiliary_loss_clip": 0.01088876, "auxiliary_loss_mlp": 0.01066116, "balance_loss_clip": 1.0230819, "balance_loss_mlp": 1.02584743, "epoch": 0.1752592815271306, "flos": 20522330259840.0, "grad_norm": 1.7611092682813587, "language_loss": 0.94816107, "learning_rate": 3.782710928163772e-06, "loss": 0.96971101, "num_input_tokens_seen": 63059785, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.62890625, "step": 2915, "time_per_iteration": 2.4056267738342285 }, { "auxiliary_loss_clip": 0.0108432, "auxiliary_loss_mlp": 0.01060703, "balance_loss_clip": 1.02083921, "balance_loss_mlp": 1.02452826, "epoch": 0.1753194047797986, "flos": 21798091584000.0, "grad_norm": 1.667851734575182, "language_loss": 0.82568735, "learning_rate": 3.782534349431226e-06, "loss": 0.84713757, "num_input_tokens_seen": 63079385, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.59765625, "step": 2916, "time_per_iteration": 2.429841995239258 }, { "auxiliary_loss_clip": 0.01089989, "auxiliary_loss_mlp": 0.01077711, "balance_loss_clip": 1.03336525, "balance_loss_mlp": 1.02554369, "epoch": 0.17537952803246656, "flos": 20667289691520.0, "grad_norm": 1.6106207047547538, "language_loss": 0.74966395, "learning_rate": 3.782357703104799e-06, "loss": 0.77134097, "num_input_tokens_seen": 63098970, "router_z_loss_clip": 0.44335938, "router_z_loss_mlp": 0.64453125, "step": 2917, "time_per_iteration": 2.377786159515381 }, { "auxiliary_loss_clip": 0.01086022, "auxiliary_loss_mlp": 0.01063227, "balance_loss_clip": 1.02343524, "balance_loss_mlp": 1.02559149, "epoch": 0.17543965128513453, "flos": 23293945319040.0, "grad_norm": 2.637318788006816, "language_loss": 0.78584701, "learning_rate": 3.7821809891911897e-06, "loss": 0.80733949, "num_input_tokens_seen": 63118750, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.60546875, "step": 2918, "time_per_iteration": 2.3992128372192383 }, { "auxiliary_loss_clip": 0.01090813, "auxiliary_loss_mlp": 0.01058827, "balance_loss_clip": 1.01622128, "balance_loss_mlp": 1.02608585, "epoch": 0.1754997745378025, "flos": 29094345578880.0, "grad_norm": 2.087091039587337, "language_loss": 0.75596237, "learning_rate": 3.782004207697098e-06, "loss": 0.77745879, "num_input_tokens_seen": 63136865, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.6484375, "step": 2919, "time_per_iteration": 2.4571003913879395 }, { "auxiliary_loss_clip": 0.01090791, "auxiliary_loss_mlp": 0.01067772, "balance_loss_clip": 1.02447534, "balance_loss_mlp": 1.02365923, "epoch": 0.17555989779047046, "flos": 30370560750720.0, "grad_norm": 1.6886228336384057, "language_loss": 0.75645745, "learning_rate": 3.781827358629228e-06, "loss": 0.77804309, "num_input_tokens_seen": 63158325, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.671875, "step": 2920, "time_per_iteration": 2.4785592555999756 }, { "auxiliary_loss_clip": 0.01084765, "auxiliary_loss_mlp": 0.01061525, "balance_loss_clip": 1.02383161, "balance_loss_mlp": 1.0235281, "epoch": 0.17562002104313842, "flos": 23286823401600.0, "grad_norm": 2.4105591010198006, "language_loss": 0.8089667, "learning_rate": 3.7816504419942873e-06, "loss": 0.83042955, "num_input_tokens_seen": 63173115, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.61328125, "step": 2921, "time_per_iteration": 2.374786853790283 }, { "auxiliary_loss_clip": 0.01091367, "auxiliary_loss_mlp": 0.01066499, "balance_loss_clip": 1.02351213, "balance_loss_mlp": 1.02525771, "epoch": 0.1756801442958064, "flos": 24789345206400.0, "grad_norm": 1.6960201527840404, "language_loss": 0.89165568, "learning_rate": 3.7814734577989823e-06, "loss": 0.91323435, "num_input_tokens_seen": 63192880, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.66015625, "step": 2922, "time_per_iteration": 2.439666986465454 }, { "auxiliary_loss_clip": 0.0108881, "auxiliary_loss_mlp": 0.0106564, "balance_loss_clip": 1.02031636, "balance_loss_mlp": 1.02369809, "epoch": 0.17574026754847438, "flos": 25770579367680.0, "grad_norm": 2.6182608140079617, "language_loss": 0.65145898, "learning_rate": 3.7812964060500253e-06, "loss": 0.67300355, "num_input_tokens_seen": 63214395, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.65234375, "step": 2923, "time_per_iteration": 2.448112726211548 }, { "auxiliary_loss_clip": 0.01091541, "auxiliary_loss_mlp": 0.01062206, "balance_loss_clip": 1.01821792, "balance_loss_mlp": 1.02611911, "epoch": 0.17580039080114235, "flos": 17455664367360.0, "grad_norm": 4.211161474440613, "language_loss": 0.83717859, "learning_rate": 3.78111928675413e-06, "loss": 0.85871601, "num_input_tokens_seen": 63231020, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 0.65234375, "step": 2924, "time_per_iteration": 2.3846991062164307 }, { "auxiliary_loss_clip": 0.01092443, "auxiliary_loss_mlp": 0.01064805, "balance_loss_clip": 1.01976728, "balance_loss_mlp": 1.02559555, "epoch": 0.1758605140538103, "flos": 14863817232000.0, "grad_norm": 3.912727636095897, "language_loss": 0.74349821, "learning_rate": 3.7809420999180126e-06, "loss": 0.76507074, "num_input_tokens_seen": 63246245, "router_z_loss_clip": 0.45117188, "router_z_loss_mlp": 0.66796875, "step": 2925, "time_per_iteration": 2.3375930786132812 }, { "auxiliary_loss_clip": 0.01087223, "auxiliary_loss_mlp": 0.01062899, "balance_loss_clip": 1.02384627, "balance_loss_mlp": 1.02541661, "epoch": 0.17592063730647828, "flos": 23003118760320.0, "grad_norm": 1.6388124988300197, "language_loss": 0.7239027, "learning_rate": 3.7807648455483934e-06, "loss": 0.74540389, "num_input_tokens_seen": 63267790, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.6171875, "step": 2926, "time_per_iteration": 2.433352470397949 }, { "auxiliary_loss_clip": 0.01092121, "auxiliary_loss_mlp": 0.01067222, "balance_loss_clip": 1.02301896, "balance_loss_mlp": 1.02733636, "epoch": 0.17598076055914624, "flos": 20740432723200.0, "grad_norm": 2.1085767856671405, "language_loss": 0.8708483, "learning_rate": 3.7805875236519918e-06, "loss": 0.89244175, "num_input_tokens_seen": 63286830, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.6484375, "step": 2927, "time_per_iteration": 2.3931381702423096 }, { "auxiliary_loss_clip": 0.01090264, "auxiliary_loss_mlp": 0.01059403, "balance_loss_clip": 1.01913428, "balance_loss_mlp": 1.02754247, "epoch": 0.1760408838118142, "flos": 34091080185600.0, "grad_norm": 2.277042296122084, "language_loss": 0.73026568, "learning_rate": 3.7804101342355336e-06, "loss": 0.75176233, "num_input_tokens_seen": 63308870, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.62890625, "step": 2928, "time_per_iteration": 2.519291877746582 }, { "auxiliary_loss_clip": 0.01087588, "auxiliary_loss_mlp": 0.01053662, "balance_loss_clip": 1.015944, "balance_loss_mlp": 1.02680743, "epoch": 0.1761010070644822, "flos": 24167297779200.0, "grad_norm": 1.98724188239387, "language_loss": 0.8368383, "learning_rate": 3.780232677305744e-06, "loss": 0.85825086, "num_input_tokens_seen": 63329005, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.609375, "step": 2929, "time_per_iteration": 2.4175868034362793 }, { "auxiliary_loss_clip": 0.01089639, "auxiliary_loss_mlp": 0.01060852, "balance_loss_clip": 1.02282453, "balance_loss_mlp": 1.02553225, "epoch": 0.17616113031715017, "flos": 26575536741120.0, "grad_norm": 1.5652907653744115, "language_loss": 0.80871391, "learning_rate": 3.7800551528693535e-06, "loss": 0.83021879, "num_input_tokens_seen": 63349390, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.640625, "step": 2930, "time_per_iteration": 2.4472079277038574 }, { "auxiliary_loss_clip": 0.01090927, "auxiliary_loss_mlp": 0.0105445, "balance_loss_clip": 1.01368046, "balance_loss_mlp": 1.02758789, "epoch": 0.17622125356981813, "flos": 25665490575360.0, "grad_norm": 2.966483888353363, "language_loss": 0.79167688, "learning_rate": 3.7798775609330927e-06, "loss": 0.81313062, "num_input_tokens_seen": 63368835, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.6328125, "step": 2931, "time_per_iteration": 2.4499385356903076 }, { "auxiliary_loss_clip": 0.01089269, "auxiliary_loss_mlp": 0.01062005, "balance_loss_clip": 1.02016222, "balance_loss_mlp": 1.02585924, "epoch": 0.1762813768224861, "flos": 16507597864320.0, "grad_norm": 2.5413528061024735, "language_loss": 0.77786922, "learning_rate": 3.779699901503696e-06, "loss": 0.79938197, "num_input_tokens_seen": 63385220, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.6328125, "step": 2932, "time_per_iteration": 2.38240122795105 }, { "auxiliary_loss_clip": 0.01095052, "auxiliary_loss_mlp": 0.01063066, "balance_loss_clip": 1.01872039, "balance_loss_mlp": 1.0274384, "epoch": 0.17634150007515406, "flos": 11211239036160.0, "grad_norm": 2.284665674463721, "language_loss": 0.91705918, "learning_rate": 3.7795221745879016e-06, "loss": 0.93864036, "num_input_tokens_seen": 63400865, "router_z_loss_clip": 0.44335938, "router_z_loss_mlp": 0.67578125, "step": 2933, "time_per_iteration": 2.4355058670043945 }, { "auxiliary_loss_clip": 0.01089914, "auxiliary_loss_mlp": 0.01065629, "balance_loss_clip": 1.0268141, "balance_loss_mlp": 1.02732658, "epoch": 0.17640162332782203, "flos": 23658787693440.0, "grad_norm": 1.7608745400203607, "language_loss": 0.89507747, "learning_rate": 3.779344380192448e-06, "loss": 0.91663289, "num_input_tokens_seen": 63421390, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.625, "step": 2934, "time_per_iteration": 2.446197032928467 }, { "auxiliary_loss_clip": 0.01084472, "auxiliary_loss_mlp": 0.01065314, "balance_loss_clip": 1.0243299, "balance_loss_mlp": 1.02436435, "epoch": 0.17646174658049, "flos": 53795012198400.0, "grad_norm": 1.598324683849239, "language_loss": 0.72590935, "learning_rate": 3.779166518324077e-06, "loss": 0.7474072, "num_input_tokens_seen": 63444715, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.6015625, "step": 2935, "time_per_iteration": 2.7311477661132812 }, { "auxiliary_loss_clip": 0.01094954, "auxiliary_loss_mlp": 0.01064281, "balance_loss_clip": 1.01843357, "balance_loss_mlp": 1.02664304, "epoch": 0.17652186983315798, "flos": 24242710049280.0, "grad_norm": 2.018301567654352, "language_loss": 0.72943503, "learning_rate": 3.7789885889895325e-06, "loss": 0.75102735, "num_input_tokens_seen": 63465525, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.68359375, "step": 2936, "time_per_iteration": 2.4620087146759033 }, { "auxiliary_loss_clip": 0.01092576, "auxiliary_loss_mlp": 0.010671, "balance_loss_clip": 1.02633047, "balance_loss_mlp": 1.0292021, "epoch": 0.17658199308582595, "flos": 27453043653120.0, "grad_norm": 3.402756259469312, "language_loss": 0.72766805, "learning_rate": 3.7788105921955634e-06, "loss": 0.74926484, "num_input_tokens_seen": 63485815, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.6328125, "step": 2937, "time_per_iteration": 2.471367359161377 }, { "auxiliary_loss_clip": 0.01092993, "auxiliary_loss_mlp": 0.01061944, "balance_loss_clip": 1.01879013, "balance_loss_mlp": 1.02625847, "epoch": 0.17664211633849392, "flos": 22417590481920.0, "grad_norm": 2.2535563903697224, "language_loss": 0.78291547, "learning_rate": 3.7786325279489184e-06, "loss": 0.80446482, "num_input_tokens_seen": 63503905, "router_z_loss_clip": 0.43164062, "router_z_loss_mlp": 0.66796875, "step": 2938, "time_per_iteration": 2.4107861518859863 }, { "auxiliary_loss_clip": 0.01090456, "auxiliary_loss_mlp": 0.01059775, "balance_loss_clip": 1.01874328, "balance_loss_mlp": 1.02509427, "epoch": 0.17670223959116188, "flos": 24714037670400.0, "grad_norm": 2.090379864234691, "language_loss": 0.7373234, "learning_rate": 3.7784543962563495e-06, "loss": 0.75882578, "num_input_tokens_seen": 63521985, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.65625, "step": 2939, "time_per_iteration": 2.414764404296875 }, { "auxiliary_loss_clip": 0.0108966, "auxiliary_loss_mlp": 0.01063165, "balance_loss_clip": 1.02265763, "balance_loss_mlp": 1.02658463, "epoch": 0.17676236284382985, "flos": 22525995853440.0, "grad_norm": 3.3454685088918388, "language_loss": 0.7668162, "learning_rate": 3.7782761971246115e-06, "loss": 0.78834438, "num_input_tokens_seen": 63539830, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.6328125, "step": 2940, "time_per_iteration": 2.409230947494507 }, { "auxiliary_loss_clip": 0.01091008, "auxiliary_loss_mlp": 0.0106132, "balance_loss_clip": 1.02055073, "balance_loss_mlp": 1.02596807, "epoch": 0.1768224860964978, "flos": 12384355363200.0, "grad_norm": 2.119221690239612, "language_loss": 0.88552141, "learning_rate": 3.7780979305604616e-06, "loss": 0.90704465, "num_input_tokens_seen": 63555495, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.6484375, "step": 2941, "time_per_iteration": 3.802500009536743 }, { "auxiliary_loss_clip": 0.01091089, "auxiliary_loss_mlp": 0.01060538, "balance_loss_clip": 1.01931596, "balance_loss_mlp": 1.0262109, "epoch": 0.1768826093491658, "flos": 24352197672960.0, "grad_norm": 1.9127458396190964, "language_loss": 0.78635901, "learning_rate": 3.7779195965706607e-06, "loss": 0.80787528, "num_input_tokens_seen": 63575290, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 0.6484375, "step": 2942, "time_per_iteration": 2.4250519275665283 }, { "auxiliary_loss_clip": 0.01091681, "auxiliary_loss_mlp": 0.01065552, "balance_loss_clip": 1.02015686, "balance_loss_mlp": 1.02609038, "epoch": 0.17694273260183377, "flos": 23585923952640.0, "grad_norm": 2.4871662247255313, "language_loss": 0.81709087, "learning_rate": 3.77774119516197e-06, "loss": 0.83866316, "num_input_tokens_seen": 63594670, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.65625, "step": 2943, "time_per_iteration": 3.832476854324341 }, { "auxiliary_loss_clip": 0.01090813, "auxiliary_loss_mlp": 0.01068929, "balance_loss_clip": 1.022843, "balance_loss_mlp": 1.02410316, "epoch": 0.17700285585450173, "flos": 26759773319040.0, "grad_norm": 1.6811395293676255, "language_loss": 0.82449412, "learning_rate": 3.777562726341155e-06, "loss": 0.84609151, "num_input_tokens_seen": 63614780, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.66796875, "step": 2944, "time_per_iteration": 2.4378952980041504 }, { "auxiliary_loss_clip": 0.01088668, "auxiliary_loss_mlp": 0.01063426, "balance_loss_clip": 1.02220368, "balance_loss_mlp": 1.02404368, "epoch": 0.1770629791071697, "flos": 42774712721280.0, "grad_norm": 1.8146150788316255, "language_loss": 0.74577105, "learning_rate": 3.7773841901149835e-06, "loss": 0.76729202, "num_input_tokens_seen": 63637190, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 0.6484375, "step": 2945, "time_per_iteration": 2.5761895179748535 }, { "auxiliary_loss_clip": 0.01088469, "auxiliary_loss_mlp": 0.01063855, "balance_loss_clip": 1.02082014, "balance_loss_mlp": 1.02520621, "epoch": 0.17712310235983766, "flos": 17344675555200.0, "grad_norm": 2.9731231695945817, "language_loss": 0.80659848, "learning_rate": 3.7772055864902256e-06, "loss": 0.82812172, "num_input_tokens_seen": 63652140, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.6328125, "step": 2946, "time_per_iteration": 3.7661190032958984 }, { "auxiliary_loss_clip": 0.01086977, "auxiliary_loss_mlp": 0.01057965, "balance_loss_clip": 1.01872134, "balance_loss_mlp": 1.02483714, "epoch": 0.17718322561250563, "flos": 23877344004480.0, "grad_norm": 1.6438459014118192, "language_loss": 0.77965319, "learning_rate": 3.7770269154736535e-06, "loss": 0.80110264, "num_input_tokens_seen": 63671700, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.62109375, "step": 2947, "time_per_iteration": 3.812652349472046 }, { "auxiliary_loss_clip": 0.01087952, "auxiliary_loss_mlp": 0.01064782, "balance_loss_clip": 1.02284384, "balance_loss_mlp": 1.02438092, "epoch": 0.1772433488651736, "flos": 36464859768960.0, "grad_norm": 1.991526034089991, "language_loss": 0.74439216, "learning_rate": 3.7768481770720424e-06, "loss": 0.76591945, "num_input_tokens_seen": 63691685, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.63671875, "step": 2948, "time_per_iteration": 2.5272905826568604 }, { "auxiliary_loss_clip": 0.01090615, "auxiliary_loss_mlp": 0.01065633, "balance_loss_clip": 1.02302742, "balance_loss_mlp": 1.02630484, "epoch": 0.1773034721178416, "flos": 26683592999040.0, "grad_norm": 1.9244414683092006, "language_loss": 0.82702535, "learning_rate": 3.776669371292171e-06, "loss": 0.84858781, "num_input_tokens_seen": 63711720, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.64453125, "step": 2949, "time_per_iteration": 2.443025588989258 }, { "auxiliary_loss_clip": 0.01028988, "auxiliary_loss_mlp": 0.01012618, "balance_loss_clip": 1.0064187, "balance_loss_mlp": 1.00949144, "epoch": 0.17736359537050955, "flos": 57114377712000.0, "grad_norm": 0.7857559367825304, "language_loss": 0.64979935, "learning_rate": 3.7764904981408186e-06, "loss": 0.67021549, "num_input_tokens_seen": 63776280, "router_z_loss_clip": 0.06176758, "router_z_loss_mlp": 0.1953125, "step": 2950, "time_per_iteration": 3.1087162494659424 }, { "auxiliary_loss_clip": 0.01088878, "auxiliary_loss_mlp": 0.01061852, "balance_loss_clip": 1.01712513, "balance_loss_mlp": 1.02598345, "epoch": 0.17742371862317752, "flos": 27196990675200.0, "grad_norm": 1.6928622531178008, "language_loss": 0.86279452, "learning_rate": 3.7763115576247686e-06, "loss": 0.88430178, "num_input_tokens_seen": 63797535, "router_z_loss_clip": 0.44726562, "router_z_loss_mlp": 0.62890625, "step": 2951, "time_per_iteration": 2.4450535774230957 }, { "auxiliary_loss_clip": 0.01092478, "auxiliary_loss_mlp": 0.01072355, "balance_loss_clip": 1.02631664, "balance_loss_mlp": 1.02539968, "epoch": 0.17748384187584548, "flos": 20958639920640.0, "grad_norm": 2.0502827851310466, "language_loss": 0.82256985, "learning_rate": 3.776132549750806e-06, "loss": 0.84421819, "num_input_tokens_seen": 63817045, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.671875, "step": 2952, "time_per_iteration": 2.3991332054138184 }, { "auxiliary_loss_clip": 0.0109136, "auxiliary_loss_mlp": 0.01069049, "balance_loss_clip": 1.02751613, "balance_loss_mlp": 1.02606821, "epoch": 0.17754396512851345, "flos": 25008809212800.0, "grad_norm": 2.109505861018574, "language_loss": 0.82029462, "learning_rate": 3.7759534745257194e-06, "loss": 0.84189868, "num_input_tokens_seen": 63837665, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.65625, "step": 2953, "time_per_iteration": 2.423686981201172 }, { "auxiliary_loss_clip": 0.01092565, "auxiliary_loss_mlp": 0.01068753, "balance_loss_clip": 1.02397823, "balance_loss_mlp": 1.02600431, "epoch": 0.1776040883811814, "flos": 32050197216000.0, "grad_norm": 1.9120741323726649, "language_loss": 0.90183246, "learning_rate": 3.7757743319562994e-06, "loss": 0.9234457, "num_input_tokens_seen": 63858455, "router_z_loss_clip": 0.44726562, "router_z_loss_mlp": 0.6640625, "step": 2954, "time_per_iteration": 2.4903390407562256 }, { "auxiliary_loss_clip": 0.01091948, "auxiliary_loss_mlp": 0.01066727, "balance_loss_clip": 1.02135563, "balance_loss_mlp": 1.02483177, "epoch": 0.17766421163384938, "flos": 21573216316800.0, "grad_norm": 1.8223407953926485, "language_loss": 0.86054671, "learning_rate": 3.7755951220493386e-06, "loss": 0.88213354, "num_input_tokens_seen": 63876935, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.671875, "step": 2955, "time_per_iteration": 2.4123291969299316 }, { "auxiliary_loss_clip": 0.01088498, "auxiliary_loss_mlp": 0.01068142, "balance_loss_clip": 1.02515531, "balance_loss_mlp": 1.02532196, "epoch": 0.17772433488651737, "flos": 22418218886400.0, "grad_norm": 2.024614206975815, "language_loss": 0.73179841, "learning_rate": 3.7754158448116327e-06, "loss": 0.7533648, "num_input_tokens_seen": 63896815, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.6328125, "step": 2956, "time_per_iteration": 2.417614459991455 }, { "auxiliary_loss_clip": 0.01089355, "auxiliary_loss_mlp": 0.01065999, "balance_loss_clip": 1.02379906, "balance_loss_mlp": 1.02462697, "epoch": 0.17778445813918534, "flos": 25628273199360.0, "grad_norm": 2.323884608757457, "language_loss": 0.84253979, "learning_rate": 3.7752365002499795e-06, "loss": 0.8640933, "num_input_tokens_seen": 63916140, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.6484375, "step": 2957, "time_per_iteration": 2.431140899658203 }, { "auxiliary_loss_clip": 0.01089182, "auxiliary_loss_mlp": 0.01071633, "balance_loss_clip": 1.02804971, "balance_loss_mlp": 1.02612519, "epoch": 0.1778445813918533, "flos": 25627714617600.0, "grad_norm": 1.5552382787778225, "language_loss": 0.75916815, "learning_rate": 3.7750570883711807e-06, "loss": 0.78077626, "num_input_tokens_seen": 63935220, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 0.62890625, "step": 2958, "time_per_iteration": 2.433487892150879 }, { "auxiliary_loss_clip": 0.01093099, "auxiliary_loss_mlp": 0.01069043, "balance_loss_clip": 1.02579391, "balance_loss_mlp": 1.0282073, "epoch": 0.17790470464452127, "flos": 22344447450240.0, "grad_norm": 2.1738270680083502, "language_loss": 0.82757676, "learning_rate": 3.7748776091820397e-06, "loss": 0.84919822, "num_input_tokens_seen": 63954550, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.6484375, "step": 2959, "time_per_iteration": 2.425863027572632 }, { "auxiliary_loss_clip": 0.0109603, "auxiliary_loss_mlp": 0.01069011, "balance_loss_clip": 1.01765597, "balance_loss_mlp": 1.02738857, "epoch": 0.17796482789718923, "flos": 18765012286080.0, "grad_norm": 2.329095013415127, "language_loss": 0.54422832, "learning_rate": 3.774698062689362e-06, "loss": 0.56587869, "num_input_tokens_seen": 63972425, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.6875, "step": 2960, "time_per_iteration": 2.396094560623169 }, { "auxiliary_loss_clip": 0.01095425, "auxiliary_loss_mlp": 0.01071421, "balance_loss_clip": 1.02731347, "balance_loss_mlp": 1.02805972, "epoch": 0.1780249511498572, "flos": 23439812446080.0, "grad_norm": 1.7721663445494245, "language_loss": 0.90405816, "learning_rate": 3.7745184488999548e-06, "loss": 0.92572659, "num_input_tokens_seen": 63992165, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.671875, "step": 2961, "time_per_iteration": 2.4950878620147705 }, { "auxiliary_loss_clip": 0.01095457, "auxiliary_loss_mlp": 0.01071767, "balance_loss_clip": 1.02370191, "balance_loss_mlp": 1.0271039, "epoch": 0.1780850744025252, "flos": 23366355212160.0, "grad_norm": 2.45789156140795, "language_loss": 0.8134445, "learning_rate": 3.774338767820631e-06, "loss": 0.83511674, "num_input_tokens_seen": 64013470, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.68359375, "step": 2962, "time_per_iteration": 2.4359676837921143 }, { "auxiliary_loss_clip": 0.01094236, "auxiliary_loss_mlp": 0.01070934, "balance_loss_clip": 1.02475214, "balance_loss_mlp": 1.02859378, "epoch": 0.17814519765519315, "flos": 13771140410880.0, "grad_norm": 5.2537481995066715, "language_loss": 0.7657671, "learning_rate": 3.774159019458203e-06, "loss": 0.78741872, "num_input_tokens_seen": 64030975, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 0.65625, "step": 2963, "time_per_iteration": 2.3878188133239746 }, { "auxiliary_loss_clip": 0.01096869, "auxiliary_loss_mlp": 0.01066645, "balance_loss_clip": 1.021083, "balance_loss_mlp": 1.02868414, "epoch": 0.17820532090786112, "flos": 21975450624000.0, "grad_norm": 1.7425773186894244, "language_loss": 0.80399209, "learning_rate": 3.7739792038194877e-06, "loss": 0.82562721, "num_input_tokens_seen": 64050075, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 0.6796875, "step": 2964, "time_per_iteration": 2.4181792736053467 }, { "auxiliary_loss_clip": 0.01094541, "auxiliary_loss_mlp": 0.01070385, "balance_loss_clip": 1.02611053, "balance_loss_mlp": 1.02925324, "epoch": 0.17826544416052909, "flos": 24789589585920.0, "grad_norm": 1.521725165563962, "language_loss": 0.82486302, "learning_rate": 3.7737993209113027e-06, "loss": 0.84651226, "num_input_tokens_seen": 64071920, "router_z_loss_clip": 0.44335938, "router_z_loss_mlp": 0.65234375, "step": 2965, "time_per_iteration": 2.4763569831848145 }, { "auxiliary_loss_clip": 0.01092371, "auxiliary_loss_mlp": 0.01065001, "balance_loss_clip": 1.02315831, "balance_loss_mlp": 1.02773428, "epoch": 0.17832556741319705, "flos": 13878777732480.0, "grad_norm": 2.2779903016109446, "language_loss": 0.97151577, "learning_rate": 3.7736193707404698e-06, "loss": 0.99308956, "num_input_tokens_seen": 64086835, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.6484375, "step": 2966, "time_per_iteration": 2.360368490219116 }, { "auxiliary_loss_clip": 0.01092327, "auxiliary_loss_mlp": 0.01069858, "balance_loss_clip": 1.02424836, "balance_loss_mlp": 1.0269804, "epoch": 0.17838569066586502, "flos": 36640403418240.0, "grad_norm": 2.039276864827948, "language_loss": 0.74423856, "learning_rate": 3.7734393533138127e-06, "loss": 0.76586044, "num_input_tokens_seen": 64107360, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.65234375, "step": 2967, "time_per_iteration": 2.5491385459899902 }, { "auxiliary_loss_clip": 0.01089274, "auxiliary_loss_mlp": 0.01065407, "balance_loss_clip": 1.02425599, "balance_loss_mlp": 1.02705884, "epoch": 0.17844581391853298, "flos": 18726468278400.0, "grad_norm": 1.9861208247607782, "language_loss": 0.78608954, "learning_rate": 3.773259268638157e-06, "loss": 0.80763638, "num_input_tokens_seen": 64124690, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 0.625, "step": 2968, "time_per_iteration": 2.3777480125427246 }, { "auxiliary_loss_clip": 0.01094463, "auxiliary_loss_mlp": 0.01065998, "balance_loss_clip": 1.02577662, "balance_loss_mlp": 1.03026736, "epoch": 0.17850593717120097, "flos": 27377107712640.0, "grad_norm": 1.7800022303088707, "language_loss": 0.76929861, "learning_rate": 3.7730791167203333e-06, "loss": 0.79090321, "num_input_tokens_seen": 64146315, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.640625, "step": 2969, "time_per_iteration": 2.477607488632202 }, { "auxiliary_loss_clip": 0.01031344, "auxiliary_loss_mlp": 0.01019625, "balance_loss_clip": 1.01411796, "balance_loss_mlp": 1.01111662, "epoch": 0.17856606042386894, "flos": 66992913129600.0, "grad_norm": 0.8496502416146613, "language_loss": 0.69044411, "learning_rate": 3.772898897567171e-06, "loss": 0.71095383, "num_input_tokens_seen": 64210875, "router_z_loss_clip": 0.05517578, "router_z_loss_mlp": 0.203125, "step": 2970, "time_per_iteration": 3.11696457862854 }, { "auxiliary_loss_clip": 0.0109688, "auxiliary_loss_mlp": 0.01062834, "balance_loss_clip": 1.01417327, "balance_loss_mlp": 1.0267489, "epoch": 0.1786261836765369, "flos": 36975499447680.0, "grad_norm": 1.799057981902391, "language_loss": 0.69209933, "learning_rate": 3.772718611185505e-06, "loss": 0.71369648, "num_input_tokens_seen": 64230740, "router_z_loss_clip": 0.48632812, "router_z_loss_mlp": 0.703125, "step": 2971, "time_per_iteration": 2.5535476207733154 }, { "auxiliary_loss_clip": 0.01092677, "auxiliary_loss_mlp": 0.0107197, "balance_loss_clip": 1.02252221, "balance_loss_mlp": 1.02562046, "epoch": 0.17868630692920487, "flos": 24824328255360.0, "grad_norm": 1.6689932514066017, "language_loss": 0.91488957, "learning_rate": 3.7725382575821717e-06, "loss": 0.93653607, "num_input_tokens_seen": 64252300, "router_z_loss_clip": 0.49414062, "router_z_loss_mlp": 0.671875, "step": 2972, "time_per_iteration": 2.446415424346924 }, { "auxiliary_loss_clip": 0.01093351, "auxiliary_loss_mlp": 0.01069817, "balance_loss_clip": 1.02501774, "balance_loss_mlp": 1.02773273, "epoch": 0.17874643018187283, "flos": 16981055078400.0, "grad_norm": 2.350224869438406, "language_loss": 0.8992734, "learning_rate": 3.77235783676401e-06, "loss": 0.92090511, "num_input_tokens_seen": 64270105, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.65625, "step": 2973, "time_per_iteration": 2.376587390899658 }, { "auxiliary_loss_clip": 0.01092104, "auxiliary_loss_mlp": 0.01065348, "balance_loss_clip": 1.02043009, "balance_loss_mlp": 1.0280683, "epoch": 0.1788065534345408, "flos": 21031189459200.0, "grad_norm": 1.8451531708926765, "language_loss": 0.78170973, "learning_rate": 3.7721773487378615e-06, "loss": 0.80328429, "num_input_tokens_seen": 64287250, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.640625, "step": 2974, "time_per_iteration": 2.418952465057373 }, { "auxiliary_loss_clip": 0.01095703, "auxiliary_loss_mlp": 0.01075507, "balance_loss_clip": 1.0274415, "balance_loss_mlp": 1.02945495, "epoch": 0.17886667668720876, "flos": 23986587248640.0, "grad_norm": 2.6096923945924995, "language_loss": 0.77947247, "learning_rate": 3.7719967935105705e-06, "loss": 0.80118459, "num_input_tokens_seen": 64307140, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 0.6640625, "step": 2975, "time_per_iteration": 2.415820837020874 }, { "auxiliary_loss_clip": 0.010915, "auxiliary_loss_mlp": 0.01062939, "balance_loss_clip": 1.0219785, "balance_loss_mlp": 1.02681684, "epoch": 0.17892679993987676, "flos": 25738284493440.0, "grad_norm": 1.5519971780822637, "language_loss": 0.74373519, "learning_rate": 3.7718161710889833e-06, "loss": 0.76527959, "num_input_tokens_seen": 64328760, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.6484375, "step": 2976, "time_per_iteration": 2.461374521255493 }, { "auxiliary_loss_clip": 0.01087426, "auxiliary_loss_mlp": 0.01061995, "balance_loss_clip": 1.02372909, "balance_loss_mlp": 1.02813351, "epoch": 0.17898692319254472, "flos": 25698588410880.0, "grad_norm": 1.7576418524856383, "language_loss": 0.78353024, "learning_rate": 3.7716354814799495e-06, "loss": 0.8050245, "num_input_tokens_seen": 64348800, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.59375, "step": 2977, "time_per_iteration": 2.4743964672088623 }, { "auxiliary_loss_clip": 0.01094827, "auxiliary_loss_mlp": 0.01083284, "balance_loss_clip": 1.03705442, "balance_loss_mlp": 1.0310663, "epoch": 0.1790470464452127, "flos": 19316779413120.0, "grad_norm": 2.003891850634519, "language_loss": 0.8177827, "learning_rate": 3.7714547246903203e-06, "loss": 0.83956379, "num_input_tokens_seen": 64367955, "router_z_loss_clip": 0.46289062, "router_z_loss_mlp": 0.63671875, "step": 2978, "time_per_iteration": 2.4579994678497314 }, { "auxiliary_loss_clip": 0.01096647, "auxiliary_loss_mlp": 0.01076371, "balance_loss_clip": 1.02914, "balance_loss_mlp": 1.02882838, "epoch": 0.17910716969788065, "flos": 30042970663680.0, "grad_norm": 1.4988797704832337, "language_loss": 0.77510154, "learning_rate": 3.7712739007269508e-06, "loss": 0.79683173, "num_input_tokens_seen": 64389805, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.6796875, "step": 2979, "time_per_iteration": 2.497911214828491 }, { "auxiliary_loss_clip": 0.01092926, "auxiliary_loss_mlp": 0.01071382, "balance_loss_clip": 1.03085089, "balance_loss_mlp": 1.02933168, "epoch": 0.17916729295054862, "flos": 19426685973120.0, "grad_norm": 2.034246159665966, "language_loss": 0.70731312, "learning_rate": 3.7710930095966976e-06, "loss": 0.72895616, "num_input_tokens_seen": 64408220, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.63671875, "step": 2980, "time_per_iteration": 2.4042341709136963 }, { "auxiliary_loss_clip": 0.01094946, "auxiliary_loss_mlp": 0.01066934, "balance_loss_clip": 1.02144325, "balance_loss_mlp": 1.02829623, "epoch": 0.17922741620321658, "flos": 14610661896960.0, "grad_norm": 1.9576351353776122, "language_loss": 0.72229278, "learning_rate": 3.7709120513064196e-06, "loss": 0.7439115, "num_input_tokens_seen": 64426380, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 0.6640625, "step": 2981, "time_per_iteration": 3.8065884113311768 }, { "auxiliary_loss_clip": 0.01093691, "auxiliary_loss_mlp": 0.0107369, "balance_loss_clip": 1.02924895, "balance_loss_mlp": 1.02799845, "epoch": 0.17928753945588458, "flos": 17164349049600.0, "grad_norm": 2.068811821014254, "language_loss": 0.8369323, "learning_rate": 3.7707310258629796e-06, "loss": 0.8586061, "num_input_tokens_seen": 64444355, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.66015625, "step": 2982, "time_per_iteration": 3.7967631816864014 }, { "auxiliary_loss_clip": 0.01088146, "auxiliary_loss_mlp": 0.01061244, "balance_loss_clip": 1.02145147, "balance_loss_mlp": 1.02565253, "epoch": 0.17934766270855254, "flos": 31394248992000.0, "grad_norm": 1.612073895844637, "language_loss": 0.84339297, "learning_rate": 3.7705499332732413e-06, "loss": 0.86488688, "num_input_tokens_seen": 64467800, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.625, "step": 2983, "time_per_iteration": 2.4897921085357666 }, { "auxiliary_loss_clip": 0.01093153, "auxiliary_loss_mlp": 0.01068123, "balance_loss_clip": 1.02411067, "balance_loss_mlp": 1.02636325, "epoch": 0.1794077859612205, "flos": 20813121907200.0, "grad_norm": 1.812398969735257, "language_loss": 0.88168931, "learning_rate": 3.7703687735440718e-06, "loss": 0.90330207, "num_input_tokens_seen": 64487230, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.671875, "step": 2984, "time_per_iteration": 2.3961474895477295 }, { "auxiliary_loss_clip": 0.01090928, "auxiliary_loss_mlp": 0.01063725, "balance_loss_clip": 1.01878297, "balance_loss_mlp": 1.02599335, "epoch": 0.17946790921388847, "flos": 28985172157440.0, "grad_norm": 1.3938349388431412, "language_loss": 0.90504766, "learning_rate": 3.7701875466823416e-06, "loss": 0.9265942, "num_input_tokens_seen": 64509165, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.6484375, "step": 2985, "time_per_iteration": 3.89841628074646 }, { "auxiliary_loss_clip": 0.01086745, "auxiliary_loss_mlp": 0.01057283, "balance_loss_clip": 1.01794398, "balance_loss_mlp": 1.0259825, "epoch": 0.17952803246655644, "flos": 20736452828160.0, "grad_norm": 2.210879630029339, "language_loss": 0.72706532, "learning_rate": 3.770006252694922e-06, "loss": 0.74850559, "num_input_tokens_seen": 64527940, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.609375, "step": 2986, "time_per_iteration": 3.8518753051757812 }, { "auxiliary_loss_clip": 0.01088958, "auxiliary_loss_mlp": 0.01061264, "balance_loss_clip": 1.0199461, "balance_loss_mlp": 1.02669168, "epoch": 0.1795881557192244, "flos": 28254754270080.0, "grad_norm": 2.4020663756793645, "language_loss": 0.79779857, "learning_rate": 3.769824891588688e-06, "loss": 0.81930083, "num_input_tokens_seen": 64545230, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.625, "step": 2987, "time_per_iteration": 2.443889617919922 }, { "auxiliary_loss_clip": 0.01093734, "auxiliary_loss_mlp": 0.01070551, "balance_loss_clip": 1.02315319, "balance_loss_mlp": 1.0276202, "epoch": 0.17964827897189237, "flos": 18551029363200.0, "grad_norm": 1.9215821726544553, "language_loss": 0.80457699, "learning_rate": 3.7696434633705164e-06, "loss": 0.82621986, "num_input_tokens_seen": 64563820, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.66015625, "step": 2988, "time_per_iteration": 2.3856709003448486 }, { "auxiliary_loss_clip": 0.01031617, "auxiliary_loss_mlp": 0.01010996, "balance_loss_clip": 1.00489283, "balance_loss_mlp": 1.00945616, "epoch": 0.17970840222456036, "flos": 58162261392000.0, "grad_norm": 0.7716294168185285, "language_loss": 0.62746322, "learning_rate": 3.7694619680472875e-06, "loss": 0.64788938, "num_input_tokens_seen": 64621315, "router_z_loss_clip": 0.06103516, "router_z_loss_mlp": 0.22167969, "step": 2989, "time_per_iteration": 2.973705768585205 }, { "auxiliary_loss_clip": 0.01090386, "auxiliary_loss_mlp": 0.01054903, "balance_loss_clip": 1.01542068, "balance_loss_mlp": 1.02716446, "epoch": 0.17976852547722832, "flos": 20299828965120.0, "grad_norm": 11.068882089740727, "language_loss": 0.7305361, "learning_rate": 3.7692804056258837e-06, "loss": 0.75198901, "num_input_tokens_seen": 64639885, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.6328125, "step": 2990, "time_per_iteration": 2.4334096908569336 }, { "auxiliary_loss_clip": 0.01092797, "auxiliary_loss_mlp": 0.01065512, "balance_loss_clip": 1.02531433, "balance_loss_mlp": 1.02941871, "epoch": 0.1798286487298963, "flos": 39668001632640.0, "grad_norm": 2.0823771517741494, "language_loss": 0.70580608, "learning_rate": 3.7690987761131893e-06, "loss": 0.72738922, "num_input_tokens_seen": 64661220, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.6328125, "step": 2991, "time_per_iteration": 2.5683889389038086 }, { "auxiliary_loss_clip": 0.01095467, "auxiliary_loss_mlp": 0.01068133, "balance_loss_clip": 1.02629066, "balance_loss_mlp": 1.03146887, "epoch": 0.17988877198256426, "flos": 25519134689280.0, "grad_norm": 1.493029457158248, "language_loss": 0.84229404, "learning_rate": 3.7689170795160924e-06, "loss": 0.86393005, "num_input_tokens_seen": 64682530, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.640625, "step": 2992, "time_per_iteration": 2.4548990726470947 }, { "auxiliary_loss_clip": 0.01091343, "auxiliary_loss_mlp": 0.01061855, "balance_loss_clip": 1.02451801, "balance_loss_mlp": 1.03033054, "epoch": 0.17994889523523222, "flos": 18806488848000.0, "grad_norm": 1.898143793989187, "language_loss": 0.83772552, "learning_rate": 3.7687353158414822e-06, "loss": 0.85925752, "num_input_tokens_seen": 64701025, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.609375, "step": 2993, "time_per_iteration": 2.382178544998169 }, { "auxiliary_loss_clip": 0.01092293, "auxiliary_loss_mlp": 0.01057194, "balance_loss_clip": 1.01651978, "balance_loss_mlp": 1.02893877, "epoch": 0.18000901848790019, "flos": 21103424795520.0, "grad_norm": 1.6764172116583114, "language_loss": 0.80338109, "learning_rate": 3.7685534850962517e-06, "loss": 0.82487595, "num_input_tokens_seen": 64719570, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.6328125, "step": 2994, "time_per_iteration": 2.455782890319824 }, { "auxiliary_loss_clip": 0.01094398, "auxiliary_loss_mlp": 0.01062825, "balance_loss_clip": 1.02241313, "balance_loss_mlp": 1.02984941, "epoch": 0.18006914174056818, "flos": 19645416840960.0, "grad_norm": 1.984292348680541, "language_loss": 0.82541978, "learning_rate": 3.768371587287296e-06, "loss": 0.84699202, "num_input_tokens_seen": 64738110, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.64453125, "step": 2995, "time_per_iteration": 2.4314863681793213 }, { "auxiliary_loss_clip": 0.01093516, "auxiliary_loss_mlp": 0.01068033, "balance_loss_clip": 1.03014851, "balance_loss_mlp": 1.02956164, "epoch": 0.18012926499323614, "flos": 19498886398080.0, "grad_norm": 1.5848528473635575, "language_loss": 0.85656714, "learning_rate": 3.768189622421512e-06, "loss": 0.87818265, "num_input_tokens_seen": 64756345, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.640625, "step": 2996, "time_per_iteration": 2.3924291133880615 }, { "auxiliary_loss_clip": 0.01087095, "auxiliary_loss_mlp": 0.01069513, "balance_loss_clip": 1.03279662, "balance_loss_mlp": 1.02600503, "epoch": 0.1801893882459041, "flos": 19463519324160.0, "grad_norm": 3.484491301737472, "language_loss": 0.89038014, "learning_rate": 3.7680075905058006e-06, "loss": 0.91194624, "num_input_tokens_seen": 64776375, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.609375, "step": 2997, "time_per_iteration": 2.467494010925293 }, { "auxiliary_loss_clip": 0.01093891, "auxiliary_loss_mlp": 0.01069424, "balance_loss_clip": 1.02557898, "balance_loss_mlp": 1.02730846, "epoch": 0.18024951149857207, "flos": 26869365676800.0, "grad_norm": 1.7501927550630594, "language_loss": 0.87256241, "learning_rate": 3.7678254915470643e-06, "loss": 0.8941955, "num_input_tokens_seen": 64796210, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.6640625, "step": 2998, "time_per_iteration": 2.4533181190490723 }, { "auxiliary_loss_clip": 0.01088712, "auxiliary_loss_mlp": 0.0106378, "balance_loss_clip": 1.02458358, "balance_loss_mlp": 1.02592683, "epoch": 0.18030963475124004, "flos": 30225322028160.0, "grad_norm": 1.6512911327092155, "language_loss": 0.85836935, "learning_rate": 3.7676433255522084e-06, "loss": 0.87989426, "num_input_tokens_seen": 64818590, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.62890625, "step": 2999, "time_per_iteration": 2.4637019634246826 }, { "auxiliary_loss_clip": 0.01089146, "auxiliary_loss_mlp": 0.01068798, "balance_loss_clip": 1.02650213, "balance_loss_mlp": 1.0255034, "epoch": 0.180369758003908, "flos": 22306462024320.0, "grad_norm": 1.6408172871852538, "language_loss": 0.76792109, "learning_rate": 3.76746109252814e-06, "loss": 0.78950053, "num_input_tokens_seen": 64838350, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.63671875, "step": 3000, "time_per_iteration": 2.422412872314453 }, { "auxiliary_loss_clip": 0.01088006, "auxiliary_loss_mlp": 0.01068695, "balance_loss_clip": 1.02978468, "balance_loss_mlp": 1.02558672, "epoch": 0.18042988125657597, "flos": 23730918295680.0, "grad_norm": 1.727632371656516, "language_loss": 0.73275137, "learning_rate": 3.76727879248177e-06, "loss": 0.75431836, "num_input_tokens_seen": 64858065, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.625, "step": 3001, "time_per_iteration": 2.412972927093506 }, { "auxiliary_loss_clip": 0.01091345, "auxiliary_loss_mlp": 0.01063401, "balance_loss_clip": 1.02298892, "balance_loss_mlp": 1.02721977, "epoch": 0.18049000450924396, "flos": 24092548824960.0, "grad_norm": 2.074750306761446, "language_loss": 0.89982629, "learning_rate": 3.767096425420011e-06, "loss": 0.92137372, "num_input_tokens_seen": 64877305, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.640625, "step": 3002, "time_per_iteration": 2.41715407371521 }, { "auxiliary_loss_clip": 0.01090443, "auxiliary_loss_mlp": 0.01059452, "balance_loss_clip": 1.02106702, "balance_loss_mlp": 1.02664602, "epoch": 0.18055012776191193, "flos": 22162096085760.0, "grad_norm": 2.321027853246211, "language_loss": 0.8299185, "learning_rate": 3.7669139913497788e-06, "loss": 0.85141742, "num_input_tokens_seen": 64896955, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.640625, "step": 3003, "time_per_iteration": 2.3861923217773438 }, { "auxiliary_loss_clip": 0.01090547, "auxiliary_loss_mlp": 0.0106701, "balance_loss_clip": 1.02717066, "balance_loss_mlp": 1.0262897, "epoch": 0.1806102510145799, "flos": 28912238593920.0, "grad_norm": 2.579181695905576, "language_loss": 0.70442009, "learning_rate": 3.7667314902779907e-06, "loss": 0.72599566, "num_input_tokens_seen": 64917080, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.640625, "step": 3004, "time_per_iteration": 2.445986032485962 }, { "auxiliary_loss_clip": 0.01090146, "auxiliary_loss_mlp": 0.01066086, "balance_loss_clip": 1.02634132, "balance_loss_mlp": 1.02550173, "epoch": 0.18067037426724786, "flos": 19024696045440.0, "grad_norm": 1.5999047364249168, "language_loss": 0.86246699, "learning_rate": 3.7665489222115677e-06, "loss": 0.88402927, "num_input_tokens_seen": 64935215, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.6484375, "step": 3005, "time_per_iteration": 2.3730576038360596 }, { "auxiliary_loss_clip": 0.01087625, "auxiliary_loss_mlp": 0.01059012, "balance_loss_clip": 1.02045989, "balance_loss_mlp": 1.02589869, "epoch": 0.18073049751991582, "flos": 27452415248640.0, "grad_norm": 1.7864600540557356, "language_loss": 0.84757876, "learning_rate": 3.766366287157432e-06, "loss": 0.86904514, "num_input_tokens_seen": 64956275, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.6171875, "step": 3006, "time_per_iteration": 2.51257061958313 }, { "auxiliary_loss_clip": 0.01088623, "auxiliary_loss_mlp": 0.01067121, "balance_loss_clip": 1.02334762, "balance_loss_mlp": 1.02553558, "epoch": 0.1807906207725838, "flos": 28727827459200.0, "grad_norm": 2.52094147332136, "language_loss": 0.78411305, "learning_rate": 3.7661835851225103e-06, "loss": 0.8056705, "num_input_tokens_seen": 64979390, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.62890625, "step": 3007, "time_per_iteration": 2.5056376457214355 }, { "auxiliary_loss_clip": 0.01040732, "auxiliary_loss_mlp": 0.01006524, "balance_loss_clip": 1.00104082, "balance_loss_mlp": 1.01898336, "epoch": 0.18085074402525175, "flos": 64462791144960.0, "grad_norm": 0.8091906867313444, "language_loss": 0.56967551, "learning_rate": 3.7660008161137294e-06, "loss": 0.59014809, "num_input_tokens_seen": 65043135, "router_z_loss_clip": 0.05493164, "router_z_loss_mlp": 0.21679688, "step": 3008, "time_per_iteration": 3.161937952041626 }, { "auxiliary_loss_clip": 0.01096332, "auxiliary_loss_mlp": 0.01069931, "balance_loss_clip": 1.02613401, "balance_loss_mlp": 1.03095663, "epoch": 0.18091086727791975, "flos": 23475842835840.0, "grad_norm": 1.7788124089352244, "language_loss": 0.69621408, "learning_rate": 3.765817980138021e-06, "loss": 0.71787679, "num_input_tokens_seen": 65062845, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.65625, "step": 3009, "time_per_iteration": 2.4074559211730957 }, { "auxiliary_loss_clip": 0.01092909, "auxiliary_loss_mlp": 0.01061818, "balance_loss_clip": 1.02109647, "balance_loss_mlp": 1.02826762, "epoch": 0.1809709905305877, "flos": 24169322638080.0, "grad_norm": 1.691234575417803, "language_loss": 0.77881837, "learning_rate": 3.7656350772023177e-06, "loss": 0.80036569, "num_input_tokens_seen": 65082110, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.6484375, "step": 3010, "time_per_iteration": 2.414494037628174 }, { "auxiliary_loss_clip": 0.01084425, "auxiliary_loss_mlp": 0.01054079, "balance_loss_clip": 1.01774454, "balance_loss_mlp": 1.0245012, "epoch": 0.18103111378325568, "flos": 21649885395840.0, "grad_norm": 1.5264142413068367, "language_loss": 0.68939781, "learning_rate": 3.7654521073135553e-06, "loss": 0.71078283, "num_input_tokens_seen": 65101985, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.59765625, "step": 3011, "time_per_iteration": 2.3955702781677246 }, { "auxiliary_loss_clip": 0.01085409, "auxiliary_loss_mlp": 0.01053434, "balance_loss_clip": 1.01564503, "balance_loss_mlp": 1.02373314, "epoch": 0.18109123703592364, "flos": 53684965992960.0, "grad_norm": 1.6117654015582308, "language_loss": 0.72109449, "learning_rate": 3.7652690704786723e-06, "loss": 0.7424829, "num_input_tokens_seen": 65129295, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.6171875, "step": 3012, "time_per_iteration": 2.755882501602173 }, { "auxiliary_loss_clip": 0.01085025, "auxiliary_loss_mlp": 0.0106104, "balance_loss_clip": 1.02000809, "balance_loss_mlp": 1.02561986, "epoch": 0.1811513602885916, "flos": 35844104062080.0, "grad_norm": 3.4148078725983937, "language_loss": 0.64396024, "learning_rate": 3.765085966704609e-06, "loss": 0.66542089, "num_input_tokens_seen": 65150625, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.59375, "step": 3013, "time_per_iteration": 2.5151026248931885 }, { "auxiliary_loss_clip": 0.0109226, "auxiliary_loss_mlp": 0.01067016, "balance_loss_clip": 1.02522123, "balance_loss_mlp": 1.02759814, "epoch": 0.18121148354125957, "flos": 23731441966080.0, "grad_norm": 2.2100178441995864, "language_loss": 0.77964616, "learning_rate": 3.764902795998309e-06, "loss": 0.80123895, "num_input_tokens_seen": 65170880, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.6484375, "step": 3014, "time_per_iteration": 2.441596269607544 }, { "auxiliary_loss_clip": 0.01091703, "auxiliary_loss_mlp": 0.01068425, "balance_loss_clip": 1.02419853, "balance_loss_mlp": 1.02573228, "epoch": 0.18127160679392756, "flos": 28727129232000.0, "grad_norm": 1.7915866865287473, "language_loss": 0.66944313, "learning_rate": 3.7647195583667184e-06, "loss": 0.69104439, "num_input_tokens_seen": 65192530, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.66015625, "step": 3015, "time_per_iteration": 2.451900005340576 }, { "auxiliary_loss_clip": 0.01085785, "auxiliary_loss_mlp": 0.01068173, "balance_loss_clip": 1.02745128, "balance_loss_mlp": 1.02382159, "epoch": 0.18133173004659553, "flos": 20484030631680.0, "grad_norm": 1.7734391708851638, "language_loss": 0.79522479, "learning_rate": 3.764536253816785e-06, "loss": 0.81676435, "num_input_tokens_seen": 65211675, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.62109375, "step": 3016, "time_per_iteration": 2.418789863586426 }, { "auxiliary_loss_clip": 0.01092395, "auxiliary_loss_mlp": 0.01064556, "balance_loss_clip": 1.01932764, "balance_loss_mlp": 1.02633011, "epoch": 0.1813918532992635, "flos": 22851107233920.0, "grad_norm": 1.66438416750243, "language_loss": 0.84907389, "learning_rate": 3.7643528823554602e-06, "loss": 0.8706435, "num_input_tokens_seen": 65231185, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.66015625, "step": 3017, "time_per_iteration": 2.3997199535369873 }, { "auxiliary_loss_clip": 0.01089102, "auxiliary_loss_mlp": 0.01052532, "balance_loss_clip": 1.01474237, "balance_loss_mlp": 1.02585483, "epoch": 0.18145197655193146, "flos": 36063637891200.0, "grad_norm": 1.9694538540421964, "language_loss": 0.69048989, "learning_rate": 3.764169443989697e-06, "loss": 0.71190619, "num_input_tokens_seen": 65251645, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.6328125, "step": 3018, "time_per_iteration": 2.5227150917053223 }, { "auxiliary_loss_clip": 0.01091851, "auxiliary_loss_mlp": 0.01053622, "balance_loss_clip": 1.01235199, "balance_loss_mlp": 1.02666652, "epoch": 0.18151209980459942, "flos": 24022827106560.0, "grad_norm": 1.886675963168429, "language_loss": 0.78554004, "learning_rate": 3.7639859387264518e-06, "loss": 0.8069948, "num_input_tokens_seen": 65271125, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 0.65234375, "step": 3019, "time_per_iteration": 2.4942259788513184 }, { "auxiliary_loss_clip": 0.01091267, "auxiliary_loss_mlp": 0.01063034, "balance_loss_clip": 1.01992834, "balance_loss_mlp": 1.02615094, "epoch": 0.1815722230572674, "flos": 23950487036160.0, "grad_norm": 1.996404597655128, "language_loss": 0.83049488, "learning_rate": 3.7638023665726834e-06, "loss": 0.85203791, "num_input_tokens_seen": 65290600, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.65234375, "step": 3020, "time_per_iteration": 3.8534700870513916 }, { "auxiliary_loss_clip": 0.01089978, "auxiliary_loss_mlp": 0.0105933, "balance_loss_clip": 1.01929998, "balance_loss_mlp": 1.02613926, "epoch": 0.18163234630993536, "flos": 24385400242560.0, "grad_norm": 2.835304414127524, "language_loss": 0.78807652, "learning_rate": 3.763618727535352e-06, "loss": 0.8095696, "num_input_tokens_seen": 65311040, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 0.63671875, "step": 3021, "time_per_iteration": 2.406572103500366 }, { "auxiliary_loss_clip": 0.01085675, "auxiliary_loss_mlp": 0.0106004, "balance_loss_clip": 1.01988983, "balance_loss_mlp": 1.0239979, "epoch": 0.18169246956260335, "flos": 24680171784960.0, "grad_norm": 1.6531190617352767, "language_loss": 0.85688484, "learning_rate": 3.763435021621422e-06, "loss": 0.87834197, "num_input_tokens_seen": 65332115, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.6171875, "step": 3022, "time_per_iteration": 3.8180861473083496 }, { "auxiliary_loss_clip": 0.01089579, "auxiliary_loss_mlp": 0.01062933, "balance_loss_clip": 1.0215677, "balance_loss_mlp": 1.02508712, "epoch": 0.1817525928152713, "flos": 24242151467520.0, "grad_norm": 1.8383173256231369, "language_loss": 0.71631002, "learning_rate": 3.763251248837859e-06, "loss": 0.73783517, "num_input_tokens_seen": 65352210, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.64453125, "step": 3023, "time_per_iteration": 2.419936180114746 }, { "auxiliary_loss_clip": 0.01089952, "auxiliary_loss_mlp": 0.01062988, "balance_loss_clip": 1.02491307, "balance_loss_mlp": 1.02630401, "epoch": 0.18181271606793928, "flos": 16471148538240.0, "grad_norm": 1.7389662313216923, "language_loss": 0.75945282, "learning_rate": 3.7630674091916317e-06, "loss": 0.7809822, "num_input_tokens_seen": 65370600, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.63671875, "step": 3024, "time_per_iteration": 3.7847676277160645 }, { "auxiliary_loss_clip": 0.01088344, "auxiliary_loss_mlp": 0.01061857, "balance_loss_clip": 1.02330494, "balance_loss_mlp": 1.02524614, "epoch": 0.18187283932060724, "flos": 18580252037760.0, "grad_norm": 2.0457418530485794, "language_loss": 0.90036964, "learning_rate": 3.7628835026897123e-06, "loss": 0.9218716, "num_input_tokens_seen": 65387270, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.6328125, "step": 3025, "time_per_iteration": 3.782660722732544 }, { "auxiliary_loss_clip": 0.01086843, "auxiliary_loss_mlp": 0.01059806, "balance_loss_clip": 1.02213573, "balance_loss_mlp": 1.02559924, "epoch": 0.1819329625732752, "flos": 20265788522880.0, "grad_norm": 2.075642825326295, "language_loss": 0.80575562, "learning_rate": 3.7626995293390735e-06, "loss": 0.82722211, "num_input_tokens_seen": 65406550, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.61328125, "step": 3026, "time_per_iteration": 2.403829574584961 }, { "auxiliary_loss_clip": 0.01086328, "auxiliary_loss_mlp": 0.01061823, "balance_loss_clip": 1.02443886, "balance_loss_mlp": 1.02452445, "epoch": 0.18199308582594317, "flos": 25914177256320.0, "grad_norm": 1.6754307782109832, "language_loss": 0.77353036, "learning_rate": 3.762515489146692e-06, "loss": 0.79501188, "num_input_tokens_seen": 65425955, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.6171875, "step": 3027, "time_per_iteration": 2.409081220626831 }, { "auxiliary_loss_clip": 0.01089802, "auxiliary_loss_mlp": 0.0106136, "balance_loss_clip": 1.0215199, "balance_loss_mlp": 1.02519083, "epoch": 0.18205320907861114, "flos": 15376621415040.0, "grad_norm": 2.3612617539968803, "language_loss": 0.86634833, "learning_rate": 3.762331382119546e-06, "loss": 0.88785994, "num_input_tokens_seen": 65442820, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.6484375, "step": 3028, "time_per_iteration": 2.36936354637146 }, { "auxiliary_loss_clip": 0.01085598, "auxiliary_loss_mlp": 0.0106174, "balance_loss_clip": 1.02104199, "balance_loss_mlp": 1.02404308, "epoch": 0.18211333233127913, "flos": 25623280874880.0, "grad_norm": 1.8104151260398915, "language_loss": 0.83967531, "learning_rate": 3.7621472082646183e-06, "loss": 0.86114872, "num_input_tokens_seen": 65461825, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.6171875, "step": 3029, "time_per_iteration": 2.4371275901794434 }, { "auxiliary_loss_clip": 0.01087404, "auxiliary_loss_mlp": 0.01067769, "balance_loss_clip": 1.02649844, "balance_loss_mlp": 1.02359748, "epoch": 0.1821734555839471, "flos": 14975120246400.0, "grad_norm": 2.033148541034215, "language_loss": 0.7999903, "learning_rate": 3.761962967588891e-06, "loss": 0.82154197, "num_input_tokens_seen": 65479480, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 0.640625, "step": 3030, "time_per_iteration": 2.368514060974121 }, { "auxiliary_loss_clip": 0.01089877, "auxiliary_loss_mlp": 0.01064616, "balance_loss_clip": 1.02384663, "balance_loss_mlp": 1.02467167, "epoch": 0.18223357883661506, "flos": 20192959693440.0, "grad_norm": 2.5875740927786097, "language_loss": 0.87399149, "learning_rate": 3.761778660099352e-06, "loss": 0.89553648, "num_input_tokens_seen": 65497775, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.65234375, "step": 3031, "time_per_iteration": 2.3956429958343506 }, { "auxiliary_loss_clip": 0.01089175, "auxiliary_loss_mlp": 0.01055962, "balance_loss_clip": 1.01860213, "balance_loss_mlp": 1.02454138, "epoch": 0.18229370208928303, "flos": 15231068490240.0, "grad_norm": 2.1014230955334545, "language_loss": 0.81965756, "learning_rate": 3.76159428580299e-06, "loss": 0.84110892, "num_input_tokens_seen": 65516505, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.6484375, "step": 3032, "time_per_iteration": 2.36570143699646 }, { "auxiliary_loss_clip": 0.01092341, "auxiliary_loss_mlp": 0.01061443, "balance_loss_clip": 1.01888514, "balance_loss_mlp": 1.02569842, "epoch": 0.182353825341951, "flos": 23839393489920.0, "grad_norm": 1.9896430044566205, "language_loss": 0.8289839, "learning_rate": 3.761409844706795e-06, "loss": 0.85052168, "num_input_tokens_seen": 65536160, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.6640625, "step": 3033, "time_per_iteration": 2.434062957763672 }, { "auxiliary_loss_clip": 0.01027891, "auxiliary_loss_mlp": 0.01008749, "balance_loss_clip": 1.0030508, "balance_loss_mlp": 1.00880003, "epoch": 0.18241394859461896, "flos": 61188114038400.0, "grad_norm": 0.8936893496221087, "language_loss": 0.63629818, "learning_rate": 3.7612253368177625e-06, "loss": 0.65666461, "num_input_tokens_seen": 65589375, "router_z_loss_clip": 0.05688477, "router_z_loss_mlp": 0.19140625, "step": 3034, "time_per_iteration": 2.9707212448120117 }, { "auxiliary_loss_clip": 0.01087839, "auxiliary_loss_mlp": 0.0106205, "balance_loss_clip": 1.02218652, "balance_loss_mlp": 1.02434611, "epoch": 0.18247407184728695, "flos": 18470904059520.0, "grad_norm": 1.9783715543198084, "language_loss": 0.82273233, "learning_rate": 3.7610407621428893e-06, "loss": 0.84423125, "num_input_tokens_seen": 65606720, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.6328125, "step": 3035, "time_per_iteration": 2.4014148712158203 }, { "auxiliary_loss_clip": 0.01086215, "auxiliary_loss_mlp": 0.01061388, "balance_loss_clip": 1.02340817, "balance_loss_mlp": 1.02401125, "epoch": 0.18253419509995492, "flos": 21794216423040.0, "grad_norm": 1.7856736682505145, "language_loss": 0.85678077, "learning_rate": 3.7608561206891735e-06, "loss": 0.8782568, "num_input_tokens_seen": 65625495, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.62109375, "step": 3036, "time_per_iteration": 2.4203968048095703 }, { "auxiliary_loss_clip": 0.01085164, "auxiliary_loss_mlp": 0.01062422, "balance_loss_clip": 1.02251101, "balance_loss_mlp": 1.02389228, "epoch": 0.18259431835262288, "flos": 20148934602240.0, "grad_norm": 2.058689885950488, "language_loss": 0.8164618, "learning_rate": 3.760671412463617e-06, "loss": 0.83793771, "num_input_tokens_seen": 65643515, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.61328125, "step": 3037, "time_per_iteration": 2.3856518268585205 }, { "auxiliary_loss_clip": 0.01091962, "auxiliary_loss_mlp": 0.01070076, "balance_loss_clip": 1.0237031, "balance_loss_mlp": 1.02707267, "epoch": 0.18265444160529085, "flos": 16980740876160.0, "grad_norm": 3.282182173943202, "language_loss": 0.83216107, "learning_rate": 3.7604866374732246e-06, "loss": 0.85378146, "num_input_tokens_seen": 65658155, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 0.6484375, "step": 3038, "time_per_iteration": 2.368680238723755 }, { "auxiliary_loss_clip": 0.01088557, "auxiliary_loss_mlp": 0.01063609, "balance_loss_clip": 1.02405512, "balance_loss_mlp": 1.02520299, "epoch": 0.1827145648579588, "flos": 34421812295040.0, "grad_norm": 1.674628729349638, "language_loss": 0.68174195, "learning_rate": 3.7603017957250023e-06, "loss": 0.70326364, "num_input_tokens_seen": 65679310, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.6328125, "step": 3039, "time_per_iteration": 2.512702226638794 }, { "auxiliary_loss_clip": 0.01090712, "auxiliary_loss_mlp": 0.01062382, "balance_loss_clip": 1.01910913, "balance_loss_mlp": 1.02620816, "epoch": 0.18277468811062678, "flos": 53285035835520.0, "grad_norm": 2.004027439386442, "language_loss": 0.75724918, "learning_rate": 3.7601168872259593e-06, "loss": 0.77878016, "num_input_tokens_seen": 65705235, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.64453125, "step": 3040, "time_per_iteration": 2.679309606552124 }, { "auxiliary_loss_clip": 0.01086328, "auxiliary_loss_mlp": 0.01060344, "balance_loss_clip": 1.01840568, "balance_loss_mlp": 1.02325082, "epoch": 0.18283481136329474, "flos": 31649289540480.0, "grad_norm": 2.5063584583952734, "language_loss": 0.61691666, "learning_rate": 3.7599319119831075e-06, "loss": 0.63838339, "num_input_tokens_seen": 65727575, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 0.6328125, "step": 3041, "time_per_iteration": 2.48581862449646 }, { "auxiliary_loss_clip": 0.01089594, "auxiliary_loss_mlp": 0.01064608, "balance_loss_clip": 1.02195537, "balance_loss_mlp": 1.02479601, "epoch": 0.18289493461596273, "flos": 53135782306560.0, "grad_norm": 1.7987248428715559, "language_loss": 0.61651057, "learning_rate": 3.7597468700034616e-06, "loss": 0.63805264, "num_input_tokens_seen": 65751370, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.6484375, "step": 3042, "time_per_iteration": 2.705409049987793 }, { "auxiliary_loss_clip": 0.01087403, "auxiliary_loss_mlp": 0.01063906, "balance_loss_clip": 1.02244544, "balance_loss_mlp": 1.02621293, "epoch": 0.1829550578686307, "flos": 25588297825920.0, "grad_norm": 1.6036165832240268, "language_loss": 0.88743252, "learning_rate": 3.7595617612940374e-06, "loss": 0.90894556, "num_input_tokens_seen": 65771040, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.61328125, "step": 3043, "time_per_iteration": 2.441075086593628 }, { "auxiliary_loss_clip": 0.01088491, "auxiliary_loss_mlp": 0.01074911, "balance_loss_clip": 1.02779973, "balance_loss_mlp": 1.02418399, "epoch": 0.18301518112129866, "flos": 22600325871360.0, "grad_norm": 1.8051722586035226, "language_loss": 0.72632241, "learning_rate": 3.7593765858618552e-06, "loss": 0.74795645, "num_input_tokens_seen": 65789345, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 0.64453125, "step": 3044, "time_per_iteration": 2.3988680839538574 }, { "auxiliary_loss_clip": 0.01093571, "auxiliary_loss_mlp": 0.01073375, "balance_loss_clip": 1.02557254, "balance_loss_mlp": 1.02622437, "epoch": 0.18307530437396663, "flos": 34019403431040.0, "grad_norm": 2.2116664182741674, "language_loss": 0.67320204, "learning_rate": 3.7591913437139365e-06, "loss": 0.69487149, "num_input_tokens_seen": 65810990, "router_z_loss_clip": 0.47851562, "router_z_loss_mlp": 0.671875, "step": 3045, "time_per_iteration": 2.4985058307647705 }, { "auxiliary_loss_clip": 0.01087529, "auxiliary_loss_mlp": 0.01064001, "balance_loss_clip": 1.02456629, "balance_loss_mlp": 1.02470303, "epoch": 0.1831354276266346, "flos": 21278933533440.0, "grad_norm": 2.3995119247398513, "language_loss": 0.81662399, "learning_rate": 3.7590060348573066e-06, "loss": 0.8381393, "num_input_tokens_seen": 65827230, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.62890625, "step": 3046, "time_per_iteration": 2.359522581100464 }, { "auxiliary_loss_clip": 0.01089467, "auxiliary_loss_mlp": 0.01070138, "balance_loss_clip": 1.02359879, "balance_loss_mlp": 1.0245229, "epoch": 0.18319555087930256, "flos": 21031887686400.0, "grad_norm": 1.9304203599580447, "language_loss": 0.81405413, "learning_rate": 3.7588206592989903e-06, "loss": 0.83565015, "num_input_tokens_seen": 65845900, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 0.6484375, "step": 3047, "time_per_iteration": 2.384692907333374 }, { "auxiliary_loss_clip": 0.01086714, "auxiliary_loss_mlp": 0.01059138, "balance_loss_clip": 1.01946473, "balance_loss_mlp": 1.02559137, "epoch": 0.18325567413197055, "flos": 34381627453440.0, "grad_norm": 1.3983823274136562, "language_loss": 0.81258094, "learning_rate": 3.7586352170460194e-06, "loss": 0.83403945, "num_input_tokens_seen": 65868730, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.609375, "step": 3048, "time_per_iteration": 2.5276217460632324 }, { "auxiliary_loss_clip": 0.01087679, "auxiliary_loss_mlp": 0.01062351, "balance_loss_clip": 1.02172399, "balance_loss_mlp": 1.0239706, "epoch": 0.18331579738463852, "flos": 20557418042880.0, "grad_norm": 3.0591990629794723, "language_loss": 0.88414109, "learning_rate": 3.758449708105424e-06, "loss": 0.90564144, "num_input_tokens_seen": 65888420, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.63671875, "step": 3049, "time_per_iteration": 2.396831750869751 }, { "auxiliary_loss_clip": 0.01093324, "auxiliary_loss_mlp": 0.01065109, "balance_loss_clip": 1.02035737, "balance_loss_mlp": 1.02563, "epoch": 0.18337592063730648, "flos": 19606907744640.0, "grad_norm": 2.2962946083706397, "language_loss": 0.79681325, "learning_rate": 3.75826413248424e-06, "loss": 0.81839752, "num_input_tokens_seen": 65905840, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.67578125, "step": 3050, "time_per_iteration": 2.347169876098633 }, { "auxiliary_loss_clip": 0.01084625, "auxiliary_loss_mlp": 0.0105858, "balance_loss_clip": 1.01933622, "balance_loss_mlp": 1.02322197, "epoch": 0.18343604388997445, "flos": 20849815612800.0, "grad_norm": 1.9239650577446885, "language_loss": 1.00769615, "learning_rate": 3.7580784901895035e-06, "loss": 1.02912831, "num_input_tokens_seen": 65922845, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.61328125, "step": 3051, "time_per_iteration": 2.4059386253356934 }, { "auxiliary_loss_clip": 0.01084276, "auxiliary_loss_mlp": 0.01053223, "balance_loss_clip": 1.01366949, "balance_loss_mlp": 1.02367806, "epoch": 0.1834961671426424, "flos": 24393080741760.0, "grad_norm": 1.4429815001713389, "language_loss": 0.88052905, "learning_rate": 3.7578927812282542e-06, "loss": 0.90190405, "num_input_tokens_seen": 65945555, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.60546875, "step": 3052, "time_per_iteration": 2.4632773399353027 }, { "auxiliary_loss_clip": 0.01084494, "auxiliary_loss_mlp": 0.01060615, "balance_loss_clip": 1.02084684, "balance_loss_mlp": 1.02281189, "epoch": 0.18355629039531038, "flos": 21250548731520.0, "grad_norm": 1.7459433360976353, "language_loss": 0.75150394, "learning_rate": 3.7577070056075356e-06, "loss": 0.77295506, "num_input_tokens_seen": 65963965, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.6171875, "step": 3053, "time_per_iteration": 2.396648645401001 }, { "auxiliary_loss_clip": 0.01087427, "auxiliary_loss_mlp": 0.01057118, "balance_loss_clip": 1.01842225, "balance_loss_mlp": 1.02511621, "epoch": 0.18361641364797834, "flos": 28655277920640.0, "grad_norm": 1.867360908459658, "language_loss": 0.6486389, "learning_rate": 3.7575211633343902e-06, "loss": 0.67008436, "num_input_tokens_seen": 65985965, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.625, "step": 3054, "time_per_iteration": 2.46193790435791 }, { "auxiliary_loss_clip": 0.01087426, "auxiliary_loss_mlp": 0.01062104, "balance_loss_clip": 1.02429104, "balance_loss_mlp": 1.02492166, "epoch": 0.18367653690064634, "flos": 20917896497280.0, "grad_norm": 2.0060789126217378, "language_loss": 0.80137599, "learning_rate": 3.7573352544158663e-06, "loss": 0.82287133, "num_input_tokens_seen": 66005645, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.625, "step": 3055, "time_per_iteration": 2.3853697776794434 }, { "auxiliary_loss_clip": 0.01086468, "auxiliary_loss_mlp": 0.01061232, "balance_loss_clip": 1.02246523, "balance_loss_mlp": 1.0258286, "epoch": 0.1837366601533143, "flos": 28764381519360.0, "grad_norm": 1.718147857592923, "language_loss": 0.71411479, "learning_rate": 3.757149278859014e-06, "loss": 0.73559183, "num_input_tokens_seen": 66025675, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.609375, "step": 3056, "time_per_iteration": 2.461408853530884 }, { "auxiliary_loss_clip": 0.01085411, "auxiliary_loss_mlp": 0.01059335, "balance_loss_clip": 1.02061629, "balance_loss_mlp": 1.0238626, "epoch": 0.18379678340598227, "flos": 21250374174720.0, "grad_norm": 1.5464326600052405, "language_loss": 0.8165704, "learning_rate": 3.7569632366708842e-06, "loss": 0.83801788, "num_input_tokens_seen": 66046125, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.6171875, "step": 3057, "time_per_iteration": 2.404388904571533 }, { "auxiliary_loss_clip": 0.0108944, "auxiliary_loss_mlp": 0.01076888, "balance_loss_clip": 1.03027689, "balance_loss_mlp": 1.02452409, "epoch": 0.18385690665865023, "flos": 20448558823680.0, "grad_norm": 1.9730440045246131, "language_loss": 0.83663309, "learning_rate": 3.756777127858533e-06, "loss": 0.85829639, "num_input_tokens_seen": 66064375, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 0.6484375, "step": 3058, "time_per_iteration": 2.4258644580841064 }, { "auxiliary_loss_clip": 0.01089294, "auxiliary_loss_mlp": 0.0106867, "balance_loss_clip": 1.0251348, "balance_loss_mlp": 1.024315, "epoch": 0.1839170299113182, "flos": 26139366725760.0, "grad_norm": 2.0982911237163413, "language_loss": 0.87007654, "learning_rate": 3.756590952429017e-06, "loss": 0.89165616, "num_input_tokens_seen": 66084590, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 0.6484375, "step": 3059, "time_per_iteration": 2.477228879928589 }, { "auxiliary_loss_clip": 0.01084699, "auxiliary_loss_mlp": 0.0106195, "balance_loss_clip": 1.02370763, "balance_loss_mlp": 1.02313328, "epoch": 0.18397715316398616, "flos": 31756717393920.0, "grad_norm": 1.7347866432020167, "language_loss": 0.74107969, "learning_rate": 3.756404710389396e-06, "loss": 0.76254618, "num_input_tokens_seen": 66107105, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.6171875, "step": 3060, "time_per_iteration": 3.891313314437866 }, { "auxiliary_loss_clip": 0.01087447, "auxiliary_loss_mlp": 0.01067679, "balance_loss_clip": 1.02502561, "balance_loss_mlp": 1.02473307, "epoch": 0.18403727641665413, "flos": 24610729357440.0, "grad_norm": 1.5953962559028254, "language_loss": 0.74338567, "learning_rate": 3.7562184017467323e-06, "loss": 0.76493704, "num_input_tokens_seen": 66129295, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.625, "step": 3061, "time_per_iteration": 2.430459976196289 }, { "auxiliary_loss_clip": 0.01086381, "auxiliary_loss_mlp": 0.01065581, "balance_loss_clip": 1.02445364, "balance_loss_mlp": 1.02450836, "epoch": 0.18409739966932212, "flos": 23438800016640.0, "grad_norm": 1.5891660054967958, "language_loss": 0.8270427, "learning_rate": 3.7560320265080906e-06, "loss": 0.84856236, "num_input_tokens_seen": 66146910, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.62109375, "step": 3062, "time_per_iteration": 3.820281744003296 }, { "auxiliary_loss_clip": 0.01089304, "auxiliary_loss_mlp": 0.01069158, "balance_loss_clip": 1.02714837, "balance_loss_mlp": 1.02480853, "epoch": 0.18415752292199009, "flos": 21871025147520.0, "grad_norm": 1.8823804651794063, "language_loss": 0.74726188, "learning_rate": 3.7558455846805383e-06, "loss": 0.76884645, "num_input_tokens_seen": 66165370, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 0.6484375, "step": 3063, "time_per_iteration": 2.376692056655884 }, { "auxiliary_loss_clip": 0.01084267, "auxiliary_loss_mlp": 0.01060645, "balance_loss_clip": 1.02395201, "balance_loss_mlp": 1.022686, "epoch": 0.18421764617465805, "flos": 25409507420160.0, "grad_norm": 1.7072448736369765, "language_loss": 0.67444527, "learning_rate": 3.7556590762711463e-06, "loss": 0.69589436, "num_input_tokens_seen": 66186210, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.6171875, "step": 3064, "time_per_iteration": 3.849440336227417 }, { "auxiliary_loss_clip": 0.01085706, "auxiliary_loss_mlp": 0.010659, "balance_loss_clip": 1.02553558, "balance_loss_mlp": 1.02343249, "epoch": 0.18427776942732602, "flos": 27196920852480.0, "grad_norm": 1.8284256729450834, "language_loss": 0.7099449, "learning_rate": 3.7554725012869853e-06, "loss": 0.73146093, "num_input_tokens_seen": 66204800, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.625, "step": 3065, "time_per_iteration": 3.81312894821167 }, { "auxiliary_loss_clip": 0.01090762, "auxiliary_loss_mlp": 0.0106585, "balance_loss_clip": 1.02217162, "balance_loss_mlp": 1.0258534, "epoch": 0.18433789267999398, "flos": 27851193331200.0, "grad_norm": 2.203137863668416, "language_loss": 0.75329155, "learning_rate": 3.7552858597351318e-06, "loss": 0.77485764, "num_input_tokens_seen": 66222195, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.6484375, "step": 3066, "time_per_iteration": 2.4305264949798584 }, { "auxiliary_loss_clip": 0.01086183, "auxiliary_loss_mlp": 0.01054678, "balance_loss_clip": 1.01634061, "balance_loss_mlp": 1.02320278, "epoch": 0.18439801593266195, "flos": 17856013461120.0, "grad_norm": 1.9777925812161226, "language_loss": 0.84158009, "learning_rate": 3.7550991516226622e-06, "loss": 0.86298871, "num_input_tokens_seen": 66239505, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.62890625, "step": 3067, "time_per_iteration": 2.37589168548584 }, { "auxiliary_loss_clip": 0.01031112, "auxiliary_loss_mlp": 0.01013274, "balance_loss_clip": 1.00688446, "balance_loss_mlp": 1.00956666, "epoch": 0.18445813918532994, "flos": 56386403619840.0, "grad_norm": 0.8184909765164737, "language_loss": 0.5988096, "learning_rate": 3.754912376956657e-06, "loss": 0.61925346, "num_input_tokens_seen": 66295695, "router_z_loss_clip": 0.06396484, "router_z_loss_mlp": 0.21484375, "step": 3068, "time_per_iteration": 2.8893840312957764 }, { "auxiliary_loss_clip": 0.0108591, "auxiliary_loss_mlp": 0.01060806, "balance_loss_clip": 1.02048934, "balance_loss_mlp": 1.02349842, "epoch": 0.1845182624379979, "flos": 20956196125440.0, "grad_norm": 1.8488439121799434, "language_loss": 0.77895951, "learning_rate": 3.7547255357441987e-06, "loss": 0.8004266, "num_input_tokens_seen": 66315315, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.625, "step": 3069, "time_per_iteration": 2.401163101196289 }, { "auxiliary_loss_clip": 0.01087544, "auxiliary_loss_mlp": 0.01067337, "balance_loss_clip": 1.0248034, "balance_loss_mlp": 1.0243454, "epoch": 0.18457838569066587, "flos": 20484135365760.0, "grad_norm": 1.7342326555161953, "language_loss": 0.86857545, "learning_rate": 3.7545386279923718e-06, "loss": 0.89012426, "num_input_tokens_seen": 66333675, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.6328125, "step": 3070, "time_per_iteration": 2.3808462619781494 }, { "auxiliary_loss_clip": 0.01087965, "auxiliary_loss_mlp": 0.01070171, "balance_loss_clip": 1.02582526, "balance_loss_mlp": 1.02421904, "epoch": 0.18463850894333383, "flos": 25008844124160.0, "grad_norm": 1.8366442257470255, "language_loss": 0.7953608, "learning_rate": 3.754351653708265e-06, "loss": 0.81694216, "num_input_tokens_seen": 66354075, "router_z_loss_clip": 0.44335938, "router_z_loss_mlp": 0.640625, "step": 3071, "time_per_iteration": 2.4279744625091553 }, { "auxiliary_loss_clip": 0.01092491, "auxiliary_loss_mlp": 0.01078996, "balance_loss_clip": 1.03527045, "balance_loss_mlp": 1.02664018, "epoch": 0.1846986321960018, "flos": 16799681232000.0, "grad_norm": 1.88570003969625, "language_loss": 0.8066628, "learning_rate": 3.7541646128989674e-06, "loss": 0.82837772, "num_input_tokens_seen": 66372520, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.66015625, "step": 3072, "time_per_iteration": 2.365452766418457 }, { "auxiliary_loss_clip": 0.01087126, "auxiliary_loss_mlp": 0.01062748, "balance_loss_clip": 1.0201664, "balance_loss_mlp": 1.02374315, "epoch": 0.18475875544866976, "flos": 20813261552640.0, "grad_norm": 1.9762396119604908, "language_loss": 0.87374622, "learning_rate": 3.7539775055715715e-06, "loss": 0.89524496, "num_input_tokens_seen": 66390745, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.6328125, "step": 3073, "time_per_iteration": 2.3982694149017334 }, { "auxiliary_loss_clip": 0.01088042, "auxiliary_loss_mlp": 0.01072552, "balance_loss_clip": 1.03209221, "balance_loss_mlp": 1.02439547, "epoch": 0.18481887870133773, "flos": 22600325871360.0, "grad_norm": 2.3955878251263933, "language_loss": 0.94812417, "learning_rate": 3.7537903317331732e-06, "loss": 0.96973008, "num_input_tokens_seen": 66410525, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.63671875, "step": 3074, "time_per_iteration": 2.3870174884796143 }, { "auxiliary_loss_clip": 0.01085305, "auxiliary_loss_mlp": 0.01064011, "balance_loss_clip": 1.02054715, "balance_loss_mlp": 1.0230577, "epoch": 0.18487900195400572, "flos": 29457582030720.0, "grad_norm": 5.43571544791032, "language_loss": 0.65834761, "learning_rate": 3.75360309139087e-06, "loss": 0.67984074, "num_input_tokens_seen": 66432535, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.62109375, "step": 3075, "time_per_iteration": 2.4557814598083496 }, { "auxiliary_loss_clip": 0.01084786, "auxiliary_loss_mlp": 0.01063901, "balance_loss_clip": 1.02503872, "balance_loss_mlp": 1.02424657, "epoch": 0.1849391252066737, "flos": 20627803077120.0, "grad_norm": 1.7349929174226795, "language_loss": 0.7393961, "learning_rate": 3.753415784551761e-06, "loss": 0.76088291, "num_input_tokens_seen": 66450620, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.609375, "step": 3076, "time_per_iteration": 2.3872392177581787 }, { "auxiliary_loss_clip": 0.01087814, "auxiliary_loss_mlp": 0.01064472, "balance_loss_clip": 1.02444196, "balance_loss_mlp": 1.02336967, "epoch": 0.18499924845934165, "flos": 14427682128000.0, "grad_norm": 2.56929167047946, "language_loss": 0.83118182, "learning_rate": 3.7532284112229507e-06, "loss": 0.8527047, "num_input_tokens_seen": 66467865, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 0.64453125, "step": 3077, "time_per_iteration": 2.3927054405212402 }, { "auxiliary_loss_clip": 0.01082005, "auxiliary_loss_mlp": 0.01059814, "balance_loss_clip": 1.02276385, "balance_loss_mlp": 1.02213991, "epoch": 0.18505937171200962, "flos": 23726659109760.0, "grad_norm": 1.695450509257729, "language_loss": 0.79981065, "learning_rate": 3.7530409714115424e-06, "loss": 0.8212288, "num_input_tokens_seen": 66486245, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.59765625, "step": 3078, "time_per_iteration": 2.40616774559021 }, { "auxiliary_loss_clip": 0.01086317, "auxiliary_loss_mlp": 0.01068974, "balance_loss_clip": 1.02458084, "balance_loss_mlp": 1.02352369, "epoch": 0.18511949496467758, "flos": 25956317134080.0, "grad_norm": 1.853565857077892, "language_loss": 0.79692447, "learning_rate": 3.7528534651246453e-06, "loss": 0.81847745, "num_input_tokens_seen": 66506510, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.62890625, "step": 3079, "time_per_iteration": 2.4430689811706543 }, { "auxiliary_loss_clip": 0.01083662, "auxiliary_loss_mlp": 0.01064207, "balance_loss_clip": 1.0260365, "balance_loss_mlp": 1.02200556, "epoch": 0.18517961821734555, "flos": 42411895205760.0, "grad_norm": 1.8645053860812602, "language_loss": 0.83218193, "learning_rate": 3.752665892369369e-06, "loss": 0.85366058, "num_input_tokens_seen": 66530960, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.6171875, "step": 3080, "time_per_iteration": 2.6198067665100098 }, { "auxiliary_loss_clip": 0.01087823, "auxiliary_loss_mlp": 0.01071168, "balance_loss_clip": 1.02455759, "balance_loss_mlp": 1.02336192, "epoch": 0.18523974147001354, "flos": 24096423985920.0, "grad_norm": 2.064912826032835, "language_loss": 0.75803852, "learning_rate": 3.7524782531528266e-06, "loss": 0.77962846, "num_input_tokens_seen": 66550275, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 0.64453125, "step": 3081, "time_per_iteration": 2.4019148349761963 }, { "auxiliary_loss_clip": 0.01088721, "auxiliary_loss_mlp": 0.01062737, "balance_loss_clip": 1.02070379, "balance_loss_mlp": 1.02513361, "epoch": 0.1852998647226815, "flos": 27374210069760.0, "grad_norm": 2.2895650575540523, "language_loss": 0.73344982, "learning_rate": 3.7522905474821334e-06, "loss": 0.75496441, "num_input_tokens_seen": 66569040, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 0.63671875, "step": 3082, "time_per_iteration": 2.4332594871520996 }, { "auxiliary_loss_clip": 0.01088038, "auxiliary_loss_mlp": 0.01065402, "balance_loss_clip": 1.02065074, "balance_loss_mlp": 1.02449393, "epoch": 0.18535998797534947, "flos": 18331774824960.0, "grad_norm": 1.9112992980851653, "language_loss": 0.72203958, "learning_rate": 3.752102775364407e-06, "loss": 0.74357402, "num_input_tokens_seen": 66587775, "router_z_loss_clip": 0.44726562, "router_z_loss_mlp": 0.6328125, "step": 3083, "time_per_iteration": 2.379075288772583 }, { "auxiliary_loss_clip": 0.01084877, "auxiliary_loss_mlp": 0.01056893, "balance_loss_clip": 1.01946127, "balance_loss_mlp": 1.02429295, "epoch": 0.18542011122801744, "flos": 37844522899200.0, "grad_norm": 2.328167418971216, "language_loss": 0.71095514, "learning_rate": 3.751914936806767e-06, "loss": 0.73237282, "num_input_tokens_seen": 66610800, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.60546875, "step": 3084, "time_per_iteration": 2.522299289703369 }, { "auxiliary_loss_clip": 0.01085694, "auxiliary_loss_mlp": 0.01056977, "balance_loss_clip": 1.01718545, "balance_loss_mlp": 1.0235219, "epoch": 0.1854802344806854, "flos": 25185120912000.0, "grad_norm": 1.57987008824658, "language_loss": 0.78688335, "learning_rate": 3.7517270318163377e-06, "loss": 0.80831003, "num_input_tokens_seen": 66630960, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.62109375, "step": 3085, "time_per_iteration": 2.498060941696167 }, { "auxiliary_loss_clip": 0.01085612, "auxiliary_loss_mlp": 0.01061127, "balance_loss_clip": 1.0194757, "balance_loss_mlp": 1.02344918, "epoch": 0.18554035773335337, "flos": 26683662821760.0, "grad_norm": 1.73068349005614, "language_loss": 0.75518072, "learning_rate": 3.751539060400244e-06, "loss": 0.7766481, "num_input_tokens_seen": 66650585, "router_z_loss_clip": 0.41601562, "router_z_loss_mlp": 0.62109375, "step": 3086, "time_per_iteration": 2.433401584625244 }, { "auxiliary_loss_clip": 0.01086411, "auxiliary_loss_mlp": 0.01067174, "balance_loss_clip": 1.02444935, "balance_loss_mlp": 1.02542579, "epoch": 0.18560048098602133, "flos": 22345774081920.0, "grad_norm": 3.179834886110542, "language_loss": 0.72181493, "learning_rate": 3.7513510225656132e-06, "loss": 0.7433508, "num_input_tokens_seen": 66670045, "router_z_loss_clip": 0.42773438, "router_z_loss_mlp": 0.609375, "step": 3087, "time_per_iteration": 2.400320291519165 }, { "auxiliary_loss_clip": 0.01084798, "auxiliary_loss_mlp": 0.01067658, "balance_loss_clip": 1.02047467, "balance_loss_mlp": 1.02305818, "epoch": 0.18566060423868933, "flos": 17747573178240.0, "grad_norm": 1.8982128534767901, "language_loss": 0.75159168, "learning_rate": 3.7511629183195764e-06, "loss": 0.77311623, "num_input_tokens_seen": 66688790, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.6171875, "step": 3088, "time_per_iteration": 2.356985330581665 }, { "auxiliary_loss_clip": 0.01083691, "auxiliary_loss_mlp": 0.01057141, "balance_loss_clip": 1.01927972, "balance_loss_mlp": 1.02326226, "epoch": 0.1857207274913573, "flos": 24676226801280.0, "grad_norm": 1.8268488291895435, "language_loss": 0.94516218, "learning_rate": 3.7509747476692663e-06, "loss": 0.96657044, "num_input_tokens_seen": 66708090, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.60546875, "step": 3089, "time_per_iteration": 2.418142080307007 }, { "auxiliary_loss_clip": 0.01082072, "auxiliary_loss_mlp": 0.01058493, "balance_loss_clip": 1.01691282, "balance_loss_mlp": 1.02210999, "epoch": 0.18578085074402526, "flos": 28146558366720.0, "grad_norm": 2.8117484639316372, "language_loss": 0.59140193, "learning_rate": 3.7507865106218176e-06, "loss": 0.61280763, "num_input_tokens_seen": 66727320, "router_z_loss_clip": 0.41601562, "router_z_loss_mlp": 0.6015625, "step": 3090, "time_per_iteration": 2.4544997215270996 }, { "auxiliary_loss_clip": 0.01082213, "auxiliary_loss_mlp": 0.01058162, "balance_loss_clip": 1.01631939, "balance_loss_mlp": 1.02273297, "epoch": 0.18584097399669322, "flos": 23950731415680.0, "grad_norm": 1.6377146111065106, "language_loss": 0.82996571, "learning_rate": 3.7505982071843695e-06, "loss": 0.85136944, "num_input_tokens_seen": 66747505, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.59375, "step": 3091, "time_per_iteration": 2.4049360752105713 }, { "auxiliary_loss_clip": 0.01083761, "auxiliary_loss_mlp": 0.01062331, "balance_loss_clip": 1.02141881, "balance_loss_mlp": 1.02143121, "epoch": 0.18590109724936119, "flos": 17200728552960.0, "grad_norm": 2.514709605222868, "language_loss": 0.86232072, "learning_rate": 3.7504098373640617e-06, "loss": 0.88378161, "num_input_tokens_seen": 66766425, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.625, "step": 3092, "time_per_iteration": 2.360720634460449 }, { "auxiliary_loss_clip": 0.01086163, "auxiliary_loss_mlp": 0.01063756, "balance_loss_clip": 1.022843, "balance_loss_mlp": 1.02246141, "epoch": 0.18596122050202915, "flos": 17233791477120.0, "grad_norm": 2.128200665200217, "language_loss": 0.94877237, "learning_rate": 3.750221401168038e-06, "loss": 0.97027159, "num_input_tokens_seen": 66781130, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.63671875, "step": 3093, "time_per_iteration": 2.3438549041748047 }, { "auxiliary_loss_clip": 0.01087073, "auxiliary_loss_mlp": 0.01064659, "balance_loss_clip": 1.02603519, "balance_loss_mlp": 1.02558815, "epoch": 0.18602134375469712, "flos": 19019878277760.0, "grad_norm": 4.839807206379661, "language_loss": 0.786358, "learning_rate": 3.750032898603443e-06, "loss": 0.80787528, "num_input_tokens_seen": 66797535, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.6171875, "step": 3094, "time_per_iteration": 2.371535301208496 }, { "auxiliary_loss_clip": 0.01082551, "auxiliary_loss_mlp": 0.01061438, "balance_loss_clip": 1.02374363, "balance_loss_mlp": 1.0226934, "epoch": 0.1860814670073651, "flos": 50948229248640.0, "grad_norm": 1.3978417674843782, "language_loss": 0.71490455, "learning_rate": 3.749844329677425e-06, "loss": 0.7363444, "num_input_tokens_seen": 66821720, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.59765625, "step": 3095, "time_per_iteration": 2.6391406059265137 }, { "auxiliary_loss_clip": 0.01085902, "auxiliary_loss_mlp": 0.0106689, "balance_loss_clip": 1.02447522, "balance_loss_mlp": 1.02333677, "epoch": 0.18614159026003307, "flos": 19389957356160.0, "grad_norm": 1.9132089499732208, "language_loss": 0.82299411, "learning_rate": 3.749655694397135e-06, "loss": 0.844522, "num_input_tokens_seen": 66839060, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 0.625, "step": 3096, "time_per_iteration": 2.383352279663086 }, { "auxiliary_loss_clip": 0.01085309, "auxiliary_loss_mlp": 0.01064978, "balance_loss_clip": 1.02425659, "balance_loss_mlp": 1.02406645, "epoch": 0.18620171351270104, "flos": 21797707559040.0, "grad_norm": 2.115629128419868, "language_loss": 0.76656556, "learning_rate": 3.7494669927697255e-06, "loss": 0.78806841, "num_input_tokens_seen": 66857760, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.609375, "step": 3097, "time_per_iteration": 2.4038069248199463 }, { "auxiliary_loss_clip": 0.01083918, "auxiliary_loss_mlp": 0.01057491, "balance_loss_clip": 1.01829481, "balance_loss_mlp": 1.0239265, "epoch": 0.186261836765369, "flos": 16361940205440.0, "grad_norm": 2.2039627540891016, "language_loss": 0.68909633, "learning_rate": 3.749278224802352e-06, "loss": 0.71051037, "num_input_tokens_seen": 66876460, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.6015625, "step": 3098, "time_per_iteration": 2.3690693378448486 }, { "auxiliary_loss_clip": 0.01084656, "auxiliary_loss_mlp": 0.01062663, "balance_loss_clip": 1.02012897, "balance_loss_mlp": 1.02260435, "epoch": 0.18632196001803697, "flos": 23368868830080.0, "grad_norm": 1.7973197256364408, "language_loss": 0.71624064, "learning_rate": 3.7490893905021733e-06, "loss": 0.73771381, "num_input_tokens_seen": 66897960, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.62109375, "step": 3099, "time_per_iteration": 2.4097647666931152 }, { "auxiliary_loss_clip": 0.01085175, "auxiliary_loss_mlp": 0.01068012, "balance_loss_clip": 1.02819586, "balance_loss_mlp": 1.02385616, "epoch": 0.18638208327070493, "flos": 22490908070400.0, "grad_norm": 2.200512923547313, "language_loss": 0.73407805, "learning_rate": 3.7489004898763494e-06, "loss": 0.75560987, "num_input_tokens_seen": 66917675, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.609375, "step": 3100, "time_per_iteration": 3.8744239807128906 }, { "auxiliary_loss_clip": 0.01085691, "auxiliary_loss_mlp": 0.01067252, "balance_loss_clip": 1.02676892, "balance_loss_mlp": 1.02420521, "epoch": 0.18644220652337293, "flos": 29164067297280.0, "grad_norm": 1.6518247994404798, "language_loss": 0.8157748, "learning_rate": 3.7487115229320444e-06, "loss": 0.83730423, "num_input_tokens_seen": 66936000, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.6171875, "step": 3101, "time_per_iteration": 3.854264259338379 }, { "auxiliary_loss_clip": 0.01081565, "auxiliary_loss_mlp": 0.01058071, "balance_loss_clip": 1.02235568, "balance_loss_mlp": 1.02295184, "epoch": 0.1865023297760409, "flos": 24242640226560.0, "grad_norm": 1.5926155058593665, "language_loss": 0.78555799, "learning_rate": 3.7485224896764222e-06, "loss": 0.80695438, "num_input_tokens_seen": 66955700, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.5859375, "step": 3102, "time_per_iteration": 2.4154343605041504 }, { "auxiliary_loss_clip": 0.01086823, "auxiliary_loss_mlp": 0.01060989, "balance_loss_clip": 1.02169788, "balance_loss_mlp": 1.02335382, "epoch": 0.18656245302870886, "flos": 19127899624320.0, "grad_norm": 2.0726202983989666, "language_loss": 0.78071654, "learning_rate": 3.7483333901166525e-06, "loss": 0.80219471, "num_input_tokens_seen": 66972815, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.6328125, "step": 3103, "time_per_iteration": 2.4057273864746094 }, { "auxiliary_loss_clip": 0.01086125, "auxiliary_loss_mlp": 0.01064113, "balance_loss_clip": 1.02527416, "balance_loss_mlp": 1.02468431, "epoch": 0.18662257628137682, "flos": 17785104756480.0, "grad_norm": 1.7082538635663989, "language_loss": 0.81054825, "learning_rate": 3.7481442242599054e-06, "loss": 0.83205068, "num_input_tokens_seen": 66992280, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.61328125, "step": 3104, "time_per_iteration": 3.9101979732513428 }, { "auxiliary_loss_clip": 0.01083837, "auxiliary_loss_mlp": 0.01056709, "balance_loss_clip": 1.02032626, "balance_loss_mlp": 1.0246985, "epoch": 0.1866826995340448, "flos": 24023246042880.0, "grad_norm": 1.8658067668836602, "language_loss": 0.86602473, "learning_rate": 3.747954992113354e-06, "loss": 0.88743025, "num_input_tokens_seen": 67012220, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.58984375, "step": 3105, "time_per_iteration": 3.792893171310425 }, { "auxiliary_loss_clip": 0.01089357, "auxiliary_loss_mlp": 0.01064058, "balance_loss_clip": 1.01921177, "balance_loss_mlp": 1.02397943, "epoch": 0.18674282278671275, "flos": 26140030041600.0, "grad_norm": 1.9035677490731786, "language_loss": 0.88316184, "learning_rate": 3.7477656936841742e-06, "loss": 0.90469599, "num_input_tokens_seen": 67032030, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.65625, "step": 3106, "time_per_iteration": 2.4368598461151123 }, { "auxiliary_loss_clip": 0.01086691, "auxiliary_loss_mlp": 0.01067201, "balance_loss_clip": 1.02640748, "balance_loss_mlp": 1.02462292, "epoch": 0.18680294603938072, "flos": 19201112478720.0, "grad_norm": 1.7679761986285105, "language_loss": 0.8015362, "learning_rate": 3.7475763289795445e-06, "loss": 0.82307518, "num_input_tokens_seen": 67048920, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.62109375, "step": 3107, "time_per_iteration": 2.384347677230835 }, { "auxiliary_loss_clip": 0.01084009, "auxiliary_loss_mlp": 0.01069022, "balance_loss_clip": 1.02419972, "balance_loss_mlp": 1.02211428, "epoch": 0.1868630692920487, "flos": 28543730526720.0, "grad_norm": 1.9526447864493444, "language_loss": 0.77368689, "learning_rate": 3.7473868980066446e-06, "loss": 0.79521716, "num_input_tokens_seen": 67068645, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.6171875, "step": 3108, "time_per_iteration": 2.458315372467041 }, { "auxiliary_loss_clip": 0.01085451, "auxiliary_loss_mlp": 0.01067036, "balance_loss_clip": 1.0259093, "balance_loss_mlp": 1.0231123, "epoch": 0.18692319254471668, "flos": 17237073144960.0, "grad_norm": 1.6279058158068134, "language_loss": 0.75597364, "learning_rate": 3.747197400772658e-06, "loss": 0.77749848, "num_input_tokens_seen": 67087075, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.625, "step": 3109, "time_per_iteration": 2.369593381881714 }, { "auxiliary_loss_clip": 0.0108214, "auxiliary_loss_mlp": 0.01058573, "balance_loss_clip": 1.01899552, "balance_loss_mlp": 1.02215719, "epoch": 0.18698331579738464, "flos": 23184073670400.0, "grad_norm": 1.469906943898414, "language_loss": 0.85757136, "learning_rate": 3.747007837284772e-06, "loss": 0.87897849, "num_input_tokens_seen": 67108040, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.59765625, "step": 3110, "time_per_iteration": 2.403210401535034 }, { "auxiliary_loss_clip": 0.01085738, "auxiliary_loss_mlp": 0.01066514, "balance_loss_clip": 1.02564907, "balance_loss_mlp": 1.02321792, "epoch": 0.1870434390500526, "flos": 25515643553280.0, "grad_norm": 1.512226120124199, "language_loss": 0.85980129, "learning_rate": 3.7468182075501737e-06, "loss": 0.88132381, "num_input_tokens_seen": 67127605, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.625, "step": 3111, "time_per_iteration": 2.4161765575408936 }, { "auxiliary_loss_clip": 0.01082834, "auxiliary_loss_mlp": 0.01059012, "balance_loss_clip": 1.02212882, "balance_loss_mlp": 1.02287769, "epoch": 0.18710356230272057, "flos": 19499794093440.0, "grad_norm": 1.8712439647147094, "language_loss": 0.785321, "learning_rate": 3.7466285115760536e-06, "loss": 0.80673957, "num_input_tokens_seen": 67145785, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.59765625, "step": 3112, "time_per_iteration": 2.3757436275482178 }, { "auxiliary_loss_clip": 0.01083439, "auxiliary_loss_mlp": 0.01060833, "balance_loss_clip": 1.02220929, "balance_loss_mlp": 1.02314901, "epoch": 0.18716368555538854, "flos": 26759633673600.0, "grad_norm": 1.9213064707305052, "language_loss": 0.66646767, "learning_rate": 3.7464387493696046e-06, "loss": 0.68791032, "num_input_tokens_seen": 67165930, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.6015625, "step": 3113, "time_per_iteration": 2.430325508117676 }, { "auxiliary_loss_clip": 0.01090215, "auxiliary_loss_mlp": 0.0106565, "balance_loss_clip": 1.02247238, "balance_loss_mlp": 1.02606773, "epoch": 0.1872238088080565, "flos": 25188716782080.0, "grad_norm": 2.096058527663695, "language_loss": 0.82670987, "learning_rate": 3.746248920938024e-06, "loss": 0.84826851, "num_input_tokens_seen": 67185830, "router_z_loss_clip": 0.43164062, "router_z_loss_mlp": 0.640625, "step": 3114, "time_per_iteration": 2.418806791305542 }, { "auxiliary_loss_clip": 0.01084763, "auxiliary_loss_mlp": 0.010654, "balance_loss_clip": 1.02241302, "balance_loss_mlp": 1.02269053, "epoch": 0.1872839320607245, "flos": 24133152602880.0, "grad_norm": 2.2421652752466756, "language_loss": 0.59772569, "learning_rate": 3.74605902628851e-06, "loss": 0.61922729, "num_input_tokens_seen": 67206930, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.6171875, "step": 3115, "time_per_iteration": 2.414844036102295 }, { "auxiliary_loss_clip": 0.01083003, "auxiliary_loss_mlp": 0.01061018, "balance_loss_clip": 1.02198923, "balance_loss_mlp": 1.02340877, "epoch": 0.18734405531339246, "flos": 21172867223040.0, "grad_norm": 1.7294891103731427, "language_loss": 0.72552228, "learning_rate": 3.745869065428261e-06, "loss": 0.74696255, "num_input_tokens_seen": 67226290, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.59375, "step": 3116, "time_per_iteration": 2.4110524654388428 }, { "auxiliary_loss_clip": 0.01079641, "auxiliary_loss_mlp": 0.01050808, "balance_loss_clip": 1.01602256, "balance_loss_mlp": 1.02195764, "epoch": 0.18740417856606043, "flos": 17236758942720.0, "grad_norm": 2.520459830212574, "language_loss": 0.81244212, "learning_rate": 3.7456790383644833e-06, "loss": 0.83374661, "num_input_tokens_seen": 67244410, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.578125, "step": 3117, "time_per_iteration": 2.3642938137054443 }, { "auxiliary_loss_clip": 0.01082681, "auxiliary_loss_mlp": 0.010602, "balance_loss_clip": 1.02336407, "balance_loss_mlp": 1.02306271, "epoch": 0.1874643018187284, "flos": 32556787176960.0, "grad_norm": 1.762504169801175, "language_loss": 0.85533237, "learning_rate": 3.745488945104381e-06, "loss": 0.8767612, "num_input_tokens_seen": 67264470, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.59765625, "step": 3118, "time_per_iteration": 2.54512619972229 }, { "auxiliary_loss_clip": 0.01084295, "auxiliary_loss_mlp": 0.01067272, "balance_loss_clip": 1.02893388, "balance_loss_mlp": 1.02388, "epoch": 0.18752442507139636, "flos": 23257042145280.0, "grad_norm": 1.8919257838984636, "language_loss": 0.78184927, "learning_rate": 3.7452987856551636e-06, "loss": 0.80336493, "num_input_tokens_seen": 67284315, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.60546875, "step": 3119, "time_per_iteration": 2.4746477603912354 }, { "auxiliary_loss_clip": 0.01084768, "auxiliary_loss_mlp": 0.0106419, "balance_loss_clip": 1.02342021, "balance_loss_mlp": 1.02265811, "epoch": 0.18758454832406432, "flos": 21759896689920.0, "grad_norm": 1.5586316851429034, "language_loss": 0.83838421, "learning_rate": 3.7451085600240406e-06, "loss": 0.85987377, "num_input_tokens_seen": 67302780, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.62109375, "step": 3120, "time_per_iteration": 2.4567296504974365 }, { "auxiliary_loss_clip": 0.01081718, "auxiliary_loss_mlp": 0.0105635, "balance_loss_clip": 1.01872778, "balance_loss_mlp": 1.02281499, "epoch": 0.1876446715767323, "flos": 29568919956480.0, "grad_norm": 1.6912308389007238, "language_loss": 0.87107766, "learning_rate": 3.7449182682182263e-06, "loss": 0.89245832, "num_input_tokens_seen": 67323405, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.58984375, "step": 3121, "time_per_iteration": 2.5234107971191406 }, { "auxiliary_loss_clip": 0.01082505, "auxiliary_loss_mlp": 0.01060612, "balance_loss_clip": 1.022084, "balance_loss_mlp": 1.02321005, "epoch": 0.18770479482940028, "flos": 30338580078720.0, "grad_norm": 1.7687768388054481, "language_loss": 0.73104614, "learning_rate": 3.744727910244937e-06, "loss": 0.75247729, "num_input_tokens_seen": 67345800, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.59375, "step": 3122, "time_per_iteration": 2.542243003845215 }, { "auxiliary_loss_clip": 0.01082469, "auxiliary_loss_mlp": 0.01063807, "balance_loss_clip": 1.02620816, "balance_loss_mlp": 1.02268994, "epoch": 0.18776491808206824, "flos": 14464480567680.0, "grad_norm": 2.016185090872165, "language_loss": 0.7269361, "learning_rate": 3.7445374861113905e-06, "loss": 0.74839884, "num_input_tokens_seen": 67363575, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.59765625, "step": 3123, "time_per_iteration": 2.410095453262329 }, { "auxiliary_loss_clip": 0.01079774, "auxiliary_loss_mlp": 0.01054255, "balance_loss_clip": 1.01811075, "balance_loss_mlp": 1.02104485, "epoch": 0.1878250413347362, "flos": 24497401484160.0, "grad_norm": 1.8618684487130834, "language_loss": 0.76249355, "learning_rate": 3.7443469958248066e-06, "loss": 0.78383386, "num_input_tokens_seen": 67381765, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.5859375, "step": 3124, "time_per_iteration": 2.4556119441986084 }, { "auxiliary_loss_clip": 0.01081597, "auxiliary_loss_mlp": 0.0106466, "balance_loss_clip": 1.02510655, "balance_loss_mlp": 1.02226102, "epoch": 0.18788516458740417, "flos": 39784611173760.0, "grad_norm": 1.7068599919495242, "language_loss": 0.8250612, "learning_rate": 3.7441564393924106e-06, "loss": 0.84652382, "num_input_tokens_seen": 67405000, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.59375, "step": 3125, "time_per_iteration": 2.5839710235595703 }, { "auxiliary_loss_clip": 0.01038893, "auxiliary_loss_mlp": 0.01010345, "balance_loss_clip": 1.00276339, "balance_loss_mlp": 1.01709127, "epoch": 0.18794528784007214, "flos": 64696151738880.0, "grad_norm": 0.95228770775427, "language_loss": 0.63650155, "learning_rate": 3.7439658168214273e-06, "loss": 0.65699393, "num_input_tokens_seen": 67467140, "router_z_loss_clip": 0.07568359, "router_z_loss_mlp": 0.21875, "step": 3126, "time_per_iteration": 3.1437361240386963 }, { "auxiliary_loss_clip": 0.01080757, "auxiliary_loss_mlp": 0.01058995, "balance_loss_clip": 1.02006078, "balance_loss_mlp": 1.02312803, "epoch": 0.1880054110927401, "flos": 28620783630720.0, "grad_norm": 1.523449309945607, "language_loss": 0.8248347, "learning_rate": 3.7437751281190857e-06, "loss": 0.84623218, "num_input_tokens_seen": 67487980, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.578125, "step": 3127, "time_per_iteration": 2.474926233291626 }, { "auxiliary_loss_clip": 0.01029125, "auxiliary_loss_mlp": 0.01014344, "balance_loss_clip": 1.00867009, "balance_loss_mlp": 1.00902891, "epoch": 0.1880655343454081, "flos": 64485625040640.0, "grad_norm": 0.7732827563800881, "language_loss": 0.61985981, "learning_rate": 3.7435843732926164e-06, "loss": 0.64029443, "num_input_tokens_seen": 67552500, "router_z_loss_clip": 0.05664062, "router_z_loss_mlp": 0.20117188, "step": 3128, "time_per_iteration": 3.1134345531463623 }, { "auxiliary_loss_clip": 0.01084577, "auxiliary_loss_mlp": 0.01052966, "balance_loss_clip": 1.0132457, "balance_loss_mlp": 1.02285147, "epoch": 0.18812565759807606, "flos": 32123095868160.0, "grad_norm": 1.9117802734220564, "language_loss": 0.74230522, "learning_rate": 3.7433935523492536e-06, "loss": 0.7636807, "num_input_tokens_seen": 67573295, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.6171875, "step": 3129, "time_per_iteration": 2.4954752922058105 }, { "auxiliary_loss_clip": 0.01083372, "auxiliary_loss_mlp": 0.01062488, "balance_loss_clip": 1.02052617, "balance_loss_mlp": 1.02337384, "epoch": 0.18818578085074403, "flos": 20623683536640.0, "grad_norm": 1.7216782591208217, "language_loss": 0.87057012, "learning_rate": 3.7432026652962314e-06, "loss": 0.89202869, "num_input_tokens_seen": 67590010, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 0.6015625, "step": 3130, "time_per_iteration": 2.3849294185638428 }, { "auxiliary_loss_clip": 0.01082307, "auxiliary_loss_mlp": 0.01059125, "balance_loss_clip": 1.0205245, "balance_loss_mlp": 1.0220325, "epoch": 0.188245904103412, "flos": 28839235207680.0, "grad_norm": 1.803559567871909, "language_loss": 0.78691256, "learning_rate": 3.7430117121407897e-06, "loss": 0.80832684, "num_input_tokens_seen": 67611110, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.6015625, "step": 3131, "time_per_iteration": 2.454728126525879 }, { "auxiliary_loss_clip": 0.01083681, "auxiliary_loss_mlp": 0.01055931, "balance_loss_clip": 1.01840413, "balance_loss_mlp": 1.02564323, "epoch": 0.18830602735607996, "flos": 29419142757120.0, "grad_norm": 1.831038243895863, "language_loss": 0.83357215, "learning_rate": 3.74282069289017e-06, "loss": 0.85496831, "num_input_tokens_seen": 67631990, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.578125, "step": 3132, "time_per_iteration": 2.5623202323913574 }, { "auxiliary_loss_clip": 0.01087748, "auxiliary_loss_mlp": 0.01063409, "balance_loss_clip": 1.02397418, "balance_loss_mlp": 1.0268743, "epoch": 0.18836615060874792, "flos": 28871774461440.0, "grad_norm": 2.006780157694938, "language_loss": 0.81906533, "learning_rate": 3.742629607551614e-06, "loss": 0.84057689, "num_input_tokens_seen": 67650490, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.609375, "step": 3133, "time_per_iteration": 2.4789841175079346 }, { "auxiliary_loss_clip": 0.01083065, "auxiliary_loss_mlp": 0.01058188, "balance_loss_clip": 1.01911139, "balance_loss_mlp": 1.02500594, "epoch": 0.18842627386141592, "flos": 22600570250880.0, "grad_norm": 1.6269043830474632, "language_loss": 0.84168696, "learning_rate": 3.7424384561323698e-06, "loss": 0.86309952, "num_input_tokens_seen": 67668860, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.58203125, "step": 3134, "time_per_iteration": 2.4466588497161865 }, { "auxiliary_loss_clip": 0.01080897, "auxiliary_loss_mlp": 0.01057515, "balance_loss_clip": 1.0204885, "balance_loss_mlp": 1.02342844, "epoch": 0.18848639711408388, "flos": 24572394817920.0, "grad_norm": 1.3686598160940613, "language_loss": 0.84467357, "learning_rate": 3.742247238639684e-06, "loss": 0.86605769, "num_input_tokens_seen": 67690220, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.57421875, "step": 3135, "time_per_iteration": 2.4523096084594727 }, { "auxiliary_loss_clip": 0.01085028, "auxiliary_loss_mlp": 0.0106195, "balance_loss_clip": 1.02261126, "balance_loss_mlp": 1.02433658, "epoch": 0.18854652036675185, "flos": 34165514937600.0, "grad_norm": 1.692767507986843, "language_loss": 0.80176973, "learning_rate": 3.7420559550808083e-06, "loss": 0.82323945, "num_input_tokens_seen": 67709820, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.60546875, "step": 3136, "time_per_iteration": 2.6902015209198 }, { "auxiliary_loss_clip": 0.01081822, "auxiliary_loss_mlp": 0.0105847, "balance_loss_clip": 1.01958394, "balance_loss_mlp": 1.02375031, "epoch": 0.1886066436194198, "flos": 24199278451200.0, "grad_norm": 2.0229625998800307, "language_loss": 0.83554947, "learning_rate": 3.741864605462996e-06, "loss": 0.85695243, "num_input_tokens_seen": 67729490, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.578125, "step": 3137, "time_per_iteration": 2.483642339706421 }, { "auxiliary_loss_clip": 0.01087958, "auxiliary_loss_mlp": 0.01056083, "balance_loss_clip": 1.01877069, "balance_loss_mlp": 1.02805257, "epoch": 0.18866676687208778, "flos": 21250059972480.0, "grad_norm": 1.6435975605610813, "language_loss": 0.82577038, "learning_rate": 3.741673189793504e-06, "loss": 0.84721082, "num_input_tokens_seen": 67749665, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.6015625, "step": 3138, "time_per_iteration": 2.4727394580841064 }, { "auxiliary_loss_clip": 0.01085725, "auxiliary_loss_mlp": 0.01060469, "balance_loss_clip": 1.02136886, "balance_loss_mlp": 1.0244664, "epoch": 0.18872689012475574, "flos": 37307069429760.0, "grad_norm": 1.7730291902610371, "language_loss": 0.65203702, "learning_rate": 3.7414817080795896e-06, "loss": 0.67349893, "num_input_tokens_seen": 67776230, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.61328125, "step": 3139, "time_per_iteration": 4.021524429321289 }, { "auxiliary_loss_clip": 0.01084121, "auxiliary_loss_mlp": 0.01067057, "balance_loss_clip": 1.02364135, "balance_loss_mlp": 1.02363467, "epoch": 0.1887870133774237, "flos": 21651246938880.0, "grad_norm": 1.898163828307695, "language_loss": 0.72688675, "learning_rate": 3.741290160328514e-06, "loss": 0.74839854, "num_input_tokens_seen": 67795080, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.60546875, "step": 3140, "time_per_iteration": 2.4232425689697266 }, { "auxiliary_loss_clip": 0.01085291, "auxiliary_loss_mlp": 0.01064285, "balance_loss_clip": 1.02198923, "balance_loss_mlp": 1.02415287, "epoch": 0.1888471366300917, "flos": 15923745331200.0, "grad_norm": 2.498201081769096, "language_loss": 0.89059693, "learning_rate": 3.7410985465475412e-06, "loss": 0.91209269, "num_input_tokens_seen": 67813110, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 0.609375, "step": 3141, "time_per_iteration": 2.43583345413208 }, { "auxiliary_loss_clip": 0.01086697, "auxiliary_loss_mlp": 0.01068103, "balance_loss_clip": 1.02304161, "balance_loss_mlp": 1.02307081, "epoch": 0.18890725988275966, "flos": 18550959540480.0, "grad_norm": 1.751968190766829, "language_loss": 0.78508341, "learning_rate": 3.7409068667439378e-06, "loss": 0.80663145, "num_input_tokens_seen": 67831070, "router_z_loss_clip": 0.45117188, "router_z_loss_mlp": 0.63671875, "step": 3142, "time_per_iteration": 3.8420066833496094 }, { "auxiliary_loss_clip": 0.01083101, "auxiliary_loss_mlp": 0.01057703, "balance_loss_clip": 1.01788735, "balance_loss_mlp": 1.02272296, "epoch": 0.18896738313542763, "flos": 28839584321280.0, "grad_norm": 1.6477785775240157, "language_loss": 0.80593175, "learning_rate": 3.740715120924971e-06, "loss": 0.82733977, "num_input_tokens_seen": 67852170, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.6015625, "step": 3143, "time_per_iteration": 2.463413715362549 }, { "auxiliary_loss_clip": 0.0108425, "auxiliary_loss_mlp": 0.01067495, "balance_loss_clip": 1.02565241, "balance_loss_mlp": 1.02313209, "epoch": 0.1890275063880956, "flos": 22411830107520.0, "grad_norm": 1.923034527201568, "language_loss": 0.7324152, "learning_rate": 3.740523309097912e-06, "loss": 0.75393265, "num_input_tokens_seen": 67869945, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.609375, "step": 3144, "time_per_iteration": 3.8124053478240967 }, { "auxiliary_loss_clip": 0.01089826, "auxiliary_loss_mlp": 0.01063362, "balance_loss_clip": 1.02185297, "balance_loss_mlp": 1.02646971, "epoch": 0.18908762964076356, "flos": 24242744960640.0, "grad_norm": 2.451809397390331, "language_loss": 0.77358723, "learning_rate": 3.7403314312700356e-06, "loss": 0.79511917, "num_input_tokens_seen": 67890240, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.6328125, "step": 3145, "time_per_iteration": 3.8632805347442627 }, { "auxiliary_loss_clip": 0.01082537, "auxiliary_loss_mlp": 0.01058896, "balance_loss_clip": 1.02163148, "balance_loss_mlp": 1.02348208, "epoch": 0.18914775289343153, "flos": 16981962773760.0, "grad_norm": 2.2480848927304735, "language_loss": 0.78846687, "learning_rate": 3.740139487448616e-06, "loss": 0.80988121, "num_input_tokens_seen": 67907825, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.58984375, "step": 3146, "time_per_iteration": 2.385925769805908 }, { "auxiliary_loss_clip": 0.01084327, "auxiliary_loss_mlp": 0.01063931, "balance_loss_clip": 1.02237451, "balance_loss_mlp": 1.02395523, "epoch": 0.1892078761460995, "flos": 21542701921920.0, "grad_norm": 1.6969528198107229, "language_loss": 0.80307269, "learning_rate": 3.7399474776409326e-06, "loss": 0.82455528, "num_input_tokens_seen": 67926670, "router_z_loss_clip": 0.41601562, "router_z_loss_mlp": 0.60546875, "step": 3147, "time_per_iteration": 2.4046342372894287 }, { "auxiliary_loss_clip": 0.01084084, "auxiliary_loss_mlp": 0.01061577, "balance_loss_clip": 1.02221406, "balance_loss_mlp": 1.02401185, "epoch": 0.18926799939876748, "flos": 23000465496960.0, "grad_norm": 2.925036254314726, "language_loss": 0.70171297, "learning_rate": 3.739755401854267e-06, "loss": 0.72316957, "num_input_tokens_seen": 67943645, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.6015625, "step": 3148, "time_per_iteration": 2.4274344444274902 }, { "auxiliary_loss_clip": 0.01083246, "auxiliary_loss_mlp": 0.01061048, "balance_loss_clip": 1.02077913, "balance_loss_mlp": 1.02347052, "epoch": 0.18932812265143545, "flos": 22271932823040.0, "grad_norm": 2.2448464111302906, "language_loss": 0.78012884, "learning_rate": 3.739563260095902e-06, "loss": 0.80157179, "num_input_tokens_seen": 67962345, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.59765625, "step": 3149, "time_per_iteration": 2.4002270698547363 }, { "auxiliary_loss_clip": 0.01082593, "auxiliary_loss_mlp": 0.01063673, "balance_loss_clip": 1.02609801, "balance_loss_mlp": 1.02527165, "epoch": 0.1893882459041034, "flos": 18623439256320.0, "grad_norm": 2.054983423126217, "language_loss": 0.83103019, "learning_rate": 3.7393710523731245e-06, "loss": 0.85249287, "num_input_tokens_seen": 67979760, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5703125, "step": 3150, "time_per_iteration": 2.3827619552612305 }, { "auxiliary_loss_clip": 0.01087494, "auxiliary_loss_mlp": 0.0106954, "balance_loss_clip": 1.0281502, "balance_loss_mlp": 1.02588153, "epoch": 0.18944836915677138, "flos": 22891885568640.0, "grad_norm": 1.9397348409952988, "language_loss": 0.87356341, "learning_rate": 3.7391787786932215e-06, "loss": 0.89513373, "num_input_tokens_seen": 67996895, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.6171875, "step": 3151, "time_per_iteration": 2.4008162021636963 }, { "auxiliary_loss_clip": 0.01083739, "auxiliary_loss_mlp": 0.01059437, "balance_loss_clip": 1.01859593, "balance_loss_mlp": 1.02279735, "epoch": 0.18950849240943934, "flos": 26795349861120.0, "grad_norm": 1.749166141950305, "language_loss": 0.75783861, "learning_rate": 3.7389864390634857e-06, "loss": 0.77927041, "num_input_tokens_seen": 68018365, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.609375, "step": 3152, "time_per_iteration": 2.4389827251434326 }, { "auxiliary_loss_clip": 0.01084824, "auxiliary_loss_mlp": 0.01071637, "balance_loss_clip": 1.02688563, "balance_loss_mlp": 1.02342331, "epoch": 0.1895686156621073, "flos": 24970125559680.0, "grad_norm": 1.8917255608633212, "language_loss": 0.77252138, "learning_rate": 3.738794033491209e-06, "loss": 0.79408598, "num_input_tokens_seen": 68037985, "router_z_loss_clip": 0.44726562, "router_z_loss_mlp": 0.6171875, "step": 3153, "time_per_iteration": 2.417081594467163 }, { "auxiliary_loss_clip": 0.01086025, "auxiliary_loss_mlp": 0.01060107, "balance_loss_clip": 1.0199573, "balance_loss_mlp": 1.02327836, "epoch": 0.1896287389147753, "flos": 21943469952000.0, "grad_norm": 1.6891459572157945, "language_loss": 0.81947798, "learning_rate": 3.7386015619836887e-06, "loss": 0.84093928, "num_input_tokens_seen": 68057975, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.62890625, "step": 3154, "time_per_iteration": 2.400468349456787 }, { "auxiliary_loss_clip": 0.01088576, "auxiliary_loss_mlp": 0.01069063, "balance_loss_clip": 1.02488387, "balance_loss_mlp": 1.0237596, "epoch": 0.18968886216744327, "flos": 18178297021440.0, "grad_norm": 2.68237654687604, "language_loss": 0.74585426, "learning_rate": 3.738409024548223e-06, "loss": 0.76743066, "num_input_tokens_seen": 68074175, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.6484375, "step": 3155, "time_per_iteration": 2.349619150161743 }, { "auxiliary_loss_clip": 0.01081762, "auxiliary_loss_mlp": 0.01063729, "balance_loss_clip": 1.02315044, "balance_loss_mlp": 1.02136016, "epoch": 0.18974898542011123, "flos": 20411446181760.0, "grad_norm": 1.6083172181443763, "language_loss": 0.75737143, "learning_rate": 3.7382164211921136e-06, "loss": 0.77882636, "num_input_tokens_seen": 68095230, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.6015625, "step": 3156, "time_per_iteration": 2.4029552936553955 }, { "auxiliary_loss_clip": 0.01087632, "auxiliary_loss_mlp": 0.0106095, "balance_loss_clip": 1.01781976, "balance_loss_mlp": 1.02517509, "epoch": 0.1898091086727792, "flos": 23983968896640.0, "grad_norm": 1.692904118807614, "language_loss": 0.69694459, "learning_rate": 3.7380237519226623e-06, "loss": 0.7184304, "num_input_tokens_seen": 68113805, "router_z_loss_clip": 0.43164062, "router_z_loss_mlp": 0.625, "step": 3157, "time_per_iteration": 2.414433002471924 }, { "auxiliary_loss_clip": 0.0108414, "auxiliary_loss_mlp": 0.01065181, "balance_loss_clip": 1.02164578, "balance_loss_mlp": 1.02156997, "epoch": 0.18986923192544716, "flos": 27635813953920.0, "grad_norm": 1.7326408635362065, "language_loss": 0.82271975, "learning_rate": 3.737831016747176e-06, "loss": 0.84421295, "num_input_tokens_seen": 68133190, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 0.625, "step": 3158, "time_per_iteration": 2.458814859390259 }, { "auxiliary_loss_clip": 0.01091774, "auxiliary_loss_mlp": 0.01069028, "balance_loss_clip": 1.01986611, "balance_loss_mlp": 1.02480114, "epoch": 0.18992935517811513, "flos": 25482964654080.0, "grad_norm": 1.6702818893640745, "language_loss": 0.74299872, "learning_rate": 3.737638215672964e-06, "loss": 0.76460677, "num_input_tokens_seen": 68152330, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 0.671875, "step": 3159, "time_per_iteration": 2.4202733039855957 }, { "auxiliary_loss_clip": 0.01086199, "auxiliary_loss_mlp": 0.01073915, "balance_loss_clip": 1.02968824, "balance_loss_mlp": 1.02293503, "epoch": 0.1899894784307831, "flos": 17419843445760.0, "grad_norm": 2.27787327405281, "language_loss": 0.87633908, "learning_rate": 3.7374453487073366e-06, "loss": 0.89794028, "num_input_tokens_seen": 68170185, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.6328125, "step": 3160, "time_per_iteration": 2.3778696060180664 }, { "auxiliary_loss_clip": 0.01080756, "auxiliary_loss_mlp": 0.0106487, "balance_loss_clip": 1.02445817, "balance_loss_mlp": 1.02168226, "epoch": 0.19004960168345109, "flos": 27490959256320.0, "grad_norm": 1.8789033275510132, "language_loss": 0.75447452, "learning_rate": 3.7372524158576074e-06, "loss": 0.77593076, "num_input_tokens_seen": 68191665, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.58984375, "step": 3161, "time_per_iteration": 2.443905830383301 }, { "auxiliary_loss_clip": 0.01084776, "auxiliary_loss_mlp": 0.01068556, "balance_loss_clip": 1.02890694, "balance_loss_mlp": 1.024212, "epoch": 0.19010972493611905, "flos": 38653145965440.0, "grad_norm": 1.545742370115131, "language_loss": 0.82422173, "learning_rate": 3.7370594171310926e-06, "loss": 0.84575498, "num_input_tokens_seen": 68214635, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.60546875, "step": 3162, "time_per_iteration": 2.57304048538208 }, { "auxiliary_loss_clip": 0.01084004, "auxiliary_loss_mlp": 0.01065535, "balance_loss_clip": 1.02178514, "balance_loss_mlp": 1.02208757, "epoch": 0.19016984818878702, "flos": 19243741115520.0, "grad_norm": 2.1445804524552994, "language_loss": 0.77616632, "learning_rate": 3.73686635253511e-06, "loss": 0.79766172, "num_input_tokens_seen": 68232150, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.6171875, "step": 3163, "time_per_iteration": 2.389211416244507 }, { "auxiliary_loss_clip": 0.01081251, "auxiliary_loss_mlp": 0.01062002, "balance_loss_clip": 1.02232957, "balance_loss_mlp": 1.022012, "epoch": 0.19022997144145498, "flos": 37595382370560.0, "grad_norm": 1.6088192648577622, "language_loss": 0.7616421, "learning_rate": 3.736673222076982e-06, "loss": 0.78307462, "num_input_tokens_seen": 68253370, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.59375, "step": 3164, "time_per_iteration": 2.5413148403167725 }, { "auxiliary_loss_clip": 0.01084558, "auxiliary_loss_mlp": 0.0106435, "balance_loss_clip": 1.02451026, "balance_loss_mlp": 1.02317524, "epoch": 0.19029009469412295, "flos": 61528762840320.0, "grad_norm": 1.8716228735454377, "language_loss": 0.6847474, "learning_rate": 3.7364800257640313e-06, "loss": 0.70623648, "num_input_tokens_seen": 68278895, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.61328125, "step": 3165, "time_per_iteration": 2.746187448501587 }, { "auxiliary_loss_clip": 0.01086401, "auxiliary_loss_mlp": 0.01065453, "balance_loss_clip": 1.02294326, "balance_loss_mlp": 1.02486587, "epoch": 0.1903502179467909, "flos": 13953980534400.0, "grad_norm": 2.2561514765800816, "language_loss": 0.75709558, "learning_rate": 3.7362867636035835e-06, "loss": 0.77861404, "num_input_tokens_seen": 68294880, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.6171875, "step": 3166, "time_per_iteration": 2.356621026992798 }, { "auxiliary_loss_clip": 0.01036869, "auxiliary_loss_mlp": 0.0101899, "balance_loss_clip": 1.01150417, "balance_loss_mlp": 1.01345515, "epoch": 0.1904103411994589, "flos": 66896168152320.0, "grad_norm": 0.7926581973361835, "language_loss": 0.50509405, "learning_rate": 3.736093435602968e-06, "loss": 0.52565265, "num_input_tokens_seen": 68359665, "router_z_loss_clip": 0.07470703, "router_z_loss_mlp": 0.234375, "step": 3167, "time_per_iteration": 3.049598455429077 }, { "auxiliary_loss_clip": 0.01085627, "auxiliary_loss_mlp": 0.01063135, "balance_loss_clip": 1.02331865, "balance_loss_mlp": 1.02472866, "epoch": 0.19047046445212687, "flos": 21907649030400.0, "grad_norm": 2.1419219771123976, "language_loss": 0.75736415, "learning_rate": 3.7359000417695156e-06, "loss": 0.77885181, "num_input_tokens_seen": 68378950, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.609375, "step": 3168, "time_per_iteration": 2.4138827323913574 }, { "auxiliary_loss_clip": 0.01032128, "auxiliary_loss_mlp": 0.01006693, "balance_loss_clip": 1.00001717, "balance_loss_mlp": 1.00868666, "epoch": 0.19053058770479483, "flos": 59252424595200.0, "grad_norm": 0.8623446191001776, "language_loss": 0.60113782, "learning_rate": 3.73570658211056e-06, "loss": 0.621526, "num_input_tokens_seen": 68434235, "router_z_loss_clip": 0.06689453, "router_z_loss_mlp": 0.234375, "step": 3169, "time_per_iteration": 2.9420535564422607 }, { "auxiliary_loss_clip": 0.01089063, "auxiliary_loss_mlp": 0.0106774, "balance_loss_clip": 1.02413309, "balance_loss_mlp": 1.02463043, "epoch": 0.1905907109574628, "flos": 23950172833920.0, "grad_norm": 1.5010922674037304, "language_loss": 0.80275166, "learning_rate": 3.735513056633436e-06, "loss": 0.82431972, "num_input_tokens_seen": 68453830, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 0.64453125, "step": 3170, "time_per_iteration": 2.4400479793548584 }, { "auxiliary_loss_clip": 0.01081967, "auxiliary_loss_mlp": 0.01061914, "balance_loss_clip": 1.02326584, "balance_loss_mlp": 1.02309084, "epoch": 0.19065083421013077, "flos": 20811306516480.0, "grad_norm": 1.8604138093733116, "language_loss": 0.79856074, "learning_rate": 3.7353194653454834e-06, "loss": 0.81999958, "num_input_tokens_seen": 68473005, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.5859375, "step": 3171, "time_per_iteration": 2.4013826847076416 }, { "auxiliary_loss_clip": 0.01086407, "auxiliary_loss_mlp": 0.01066253, "balance_loss_clip": 1.02224112, "balance_loss_mlp": 1.02432704, "epoch": 0.19071095746279873, "flos": 31283644204800.0, "grad_norm": 2.061955186713241, "language_loss": 0.81879348, "learning_rate": 3.7351258082540426e-06, "loss": 0.84031999, "num_input_tokens_seen": 68493470, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.62109375, "step": 3172, "time_per_iteration": 2.482989549636841 }, { "auxiliary_loss_clip": 0.01084953, "auxiliary_loss_mlp": 0.01067299, "balance_loss_clip": 1.02438331, "balance_loss_mlp": 1.02468586, "epoch": 0.1907710807154667, "flos": 14355237323520.0, "grad_norm": 1.5297332414669371, "language_loss": 0.82102787, "learning_rate": 3.7349320853664576e-06, "loss": 0.8425504, "num_input_tokens_seen": 68511290, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.6015625, "step": 3173, "time_per_iteration": 2.3886890411376953 }, { "auxiliary_loss_clip": 0.01085202, "auxiliary_loss_mlp": 0.01064, "balance_loss_clip": 1.02415991, "balance_loss_mlp": 1.02440619, "epoch": 0.1908312039681347, "flos": 26905815002880.0, "grad_norm": 1.4634268782381288, "language_loss": 0.80777693, "learning_rate": 3.7347382966900735e-06, "loss": 0.82926893, "num_input_tokens_seen": 68532575, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.609375, "step": 3174, "time_per_iteration": 2.466651201248169 }, { "auxiliary_loss_clip": 0.01086902, "auxiliary_loss_mlp": 0.01070027, "balance_loss_clip": 1.02816033, "balance_loss_mlp": 1.02515626, "epoch": 0.19089132722080265, "flos": 14494017444480.0, "grad_norm": 1.8229771460757147, "language_loss": 0.82319987, "learning_rate": 3.7345444422322395e-06, "loss": 0.84476912, "num_input_tokens_seen": 68548760, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.6171875, "step": 3175, "time_per_iteration": 2.3872625827789307 }, { "auxiliary_loss_clip": 0.01086228, "auxiliary_loss_mlp": 0.01065538, "balance_loss_clip": 1.02624691, "balance_loss_mlp": 1.02434468, "epoch": 0.19095145047347062, "flos": 13952060409600.0, "grad_norm": 2.0588505950047, "language_loss": 0.87643075, "learning_rate": 3.7343505220003067e-06, "loss": 0.89794838, "num_input_tokens_seen": 68563100, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.6171875, "step": 3176, "time_per_iteration": 2.383037567138672 }, { "auxiliary_loss_clip": 0.01088031, "auxiliary_loss_mlp": 0.01061442, "balance_loss_clip": 1.01838374, "balance_loss_mlp": 1.02613175, "epoch": 0.19101157372613858, "flos": 25300648200960.0, "grad_norm": 2.7567848265313644, "language_loss": 0.82828796, "learning_rate": 3.7341565360016285e-06, "loss": 0.84978265, "num_input_tokens_seen": 68581650, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.62109375, "step": 3177, "time_per_iteration": 2.4586124420166016 }, { "auxiliary_loss_clip": 0.01081206, "auxiliary_loss_mlp": 0.0105969, "balance_loss_clip": 1.02066112, "balance_loss_mlp": 1.02247441, "epoch": 0.19107169697880655, "flos": 20557173663360.0, "grad_norm": 2.095976919705923, "language_loss": 0.77960569, "learning_rate": 3.73396248424356e-06, "loss": 0.80101466, "num_input_tokens_seen": 68600360, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.5859375, "step": 3178, "time_per_iteration": 2.4105772972106934 }, { "auxiliary_loss_clip": 0.01085461, "auxiliary_loss_mlp": 0.01050358, "balance_loss_clip": 1.01337957, "balance_loss_mlp": 1.02576017, "epoch": 0.19113182023147451, "flos": 22162130997120.0, "grad_norm": 1.7153744981735082, "language_loss": 0.82693523, "learning_rate": 3.7337683667334606e-06, "loss": 0.84829342, "num_input_tokens_seen": 68617885, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.59765625, "step": 3179, "time_per_iteration": 3.856678009033203 }, { "auxiliary_loss_clip": 0.01086743, "auxiliary_loss_mlp": 0.01065372, "balance_loss_clip": 1.02634263, "balance_loss_mlp": 1.02481163, "epoch": 0.19119194348414248, "flos": 18580985176320.0, "grad_norm": 2.2561959521275896, "language_loss": 0.81299472, "learning_rate": 3.733574183478691e-06, "loss": 0.83451587, "num_input_tokens_seen": 68634550, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.6171875, "step": 3180, "time_per_iteration": 2.381539821624756 }, { "auxiliary_loss_clip": 0.0108433, "auxiliary_loss_mlp": 0.01065505, "balance_loss_clip": 1.02487862, "balance_loss_mlp": 1.02412152, "epoch": 0.19125206673681047, "flos": 19025603740800.0, "grad_norm": 2.232466889612744, "language_loss": 0.81384355, "learning_rate": 3.733379934486615e-06, "loss": 0.83534187, "num_input_tokens_seen": 68651895, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.6015625, "step": 3181, "time_per_iteration": 3.77317476272583 }, { "auxiliary_loss_clip": 0.01085552, "auxiliary_loss_mlp": 0.01059295, "balance_loss_clip": 1.02119613, "balance_loss_mlp": 1.02413929, "epoch": 0.19131218998947844, "flos": 21689057808000.0, "grad_norm": 1.643304310203256, "language_loss": 0.75847828, "learning_rate": 3.7331856197645973e-06, "loss": 0.77992678, "num_input_tokens_seen": 68671500, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.61328125, "step": 3182, "time_per_iteration": 2.423046350479126 }, { "auxiliary_loss_clip": 0.01084635, "auxiliary_loss_mlp": 0.01065519, "balance_loss_clip": 1.02567887, "balance_loss_mlp": 1.02418804, "epoch": 0.1913723132421464, "flos": 18441506828160.0, "grad_norm": 2.0691802698709703, "language_loss": 0.66590953, "learning_rate": 3.7329912393200084e-06, "loss": 0.68741107, "num_input_tokens_seen": 68690570, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.60546875, "step": 3183, "time_per_iteration": 3.8191378116607666 }, { "auxiliary_loss_clip": 0.01084228, "auxiliary_loss_mlp": 0.01062663, "balance_loss_clip": 1.02186954, "balance_loss_mlp": 1.02354097, "epoch": 0.19143243649481437, "flos": 27158935426560.0, "grad_norm": 1.555961764338503, "language_loss": 0.74122155, "learning_rate": 3.7327967931602173e-06, "loss": 0.76269042, "num_input_tokens_seen": 68709735, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.60546875, "step": 3184, "time_per_iteration": 3.8376667499542236 }, { "auxiliary_loss_clip": 0.0108341, "auxiliary_loss_mlp": 0.01067157, "balance_loss_clip": 1.02529025, "balance_loss_mlp": 1.02270222, "epoch": 0.19149255974748233, "flos": 21718071014400.0, "grad_norm": 1.7417618556848182, "language_loss": 0.89541709, "learning_rate": 3.732602281292598e-06, "loss": 0.91692269, "num_input_tokens_seen": 68727565, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.609375, "step": 3185, "time_per_iteration": 2.451084613800049 }, { "auxiliary_loss_clip": 0.01083045, "auxiliary_loss_mlp": 0.01062207, "balance_loss_clip": 1.02141356, "balance_loss_mlp": 1.02219248, "epoch": 0.1915526830001503, "flos": 22962270602880.0, "grad_norm": 1.8510129003467697, "language_loss": 0.74276721, "learning_rate": 3.7324077037245267e-06, "loss": 0.76421976, "num_input_tokens_seen": 68748110, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.609375, "step": 3186, "time_per_iteration": 2.4126322269439697 }, { "auxiliary_loss_clip": 0.01087031, "auxiliary_loss_mlp": 0.01070848, "balance_loss_clip": 1.02829027, "balance_loss_mlp": 1.02476883, "epoch": 0.1916128062528183, "flos": 26139541282560.0, "grad_norm": 1.8053096043223287, "language_loss": 0.85234058, "learning_rate": 3.7322130604633825e-06, "loss": 0.87391943, "num_input_tokens_seen": 68769765, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.62109375, "step": 3187, "time_per_iteration": 2.4239084720611572 }, { "auxiliary_loss_clip": 0.01028324, "auxiliary_loss_mlp": 0.01006342, "balance_loss_clip": 1.00040507, "balance_loss_mlp": 1.00807357, "epoch": 0.19167292950548626, "flos": 54922809847680.0, "grad_norm": 0.8544133304846683, "language_loss": 0.55913365, "learning_rate": 3.732018351516544e-06, "loss": 0.57948029, "num_input_tokens_seen": 68826815, "router_z_loss_clip": 0.05932617, "router_z_loss_mlp": 0.203125, "step": 3188, "time_per_iteration": 3.097135066986084 }, { "auxiliary_loss_clip": 0.01083165, "auxiliary_loss_mlp": 0.01071926, "balance_loss_clip": 1.02915382, "balance_loss_mlp": 1.02323508, "epoch": 0.19173305275815422, "flos": 29934286001280.0, "grad_norm": 1.7739328743056146, "language_loss": 0.71771872, "learning_rate": 3.731823576891397e-06, "loss": 0.73926967, "num_input_tokens_seen": 68847585, "router_z_loss_clip": 0.42773438, "router_z_loss_mlp": 0.59765625, "step": 3189, "time_per_iteration": 2.5172438621520996 }, { "auxiliary_loss_clip": 0.01079141, "auxiliary_loss_mlp": 0.01060974, "balance_loss_clip": 1.02232599, "balance_loss_mlp": 1.02189255, "epoch": 0.1917931760108222, "flos": 24751359780480.0, "grad_norm": 1.9957434834599905, "language_loss": 0.75773591, "learning_rate": 3.7316287365953266e-06, "loss": 0.77913707, "num_input_tokens_seen": 68866620, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.5703125, "step": 3190, "time_per_iteration": 2.434150218963623 }, { "auxiliary_loss_clip": 0.01081405, "auxiliary_loss_mlp": 0.0107479, "balance_loss_clip": 1.03394938, "balance_loss_mlp": 1.02324557, "epoch": 0.19185329926349015, "flos": 18842554149120.0, "grad_norm": 1.8505138907992102, "language_loss": 0.85646605, "learning_rate": 3.73143383063572e-06, "loss": 0.87802804, "num_input_tokens_seen": 68885515, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.58203125, "step": 3191, "time_per_iteration": 2.3662095069885254 }, { "auxiliary_loss_clip": 0.01079372, "auxiliary_loss_mlp": 0.01058965, "balance_loss_clip": 1.01891088, "balance_loss_mlp": 1.02209711, "epoch": 0.19191342251615812, "flos": 22085880854400.0, "grad_norm": 1.7640933763799815, "language_loss": 0.9113009, "learning_rate": 3.73123885901997e-06, "loss": 0.9326843, "num_input_tokens_seen": 68903225, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 0.57421875, "step": 3192, "time_per_iteration": 2.457148313522339 }, { "auxiliary_loss_clip": 0.01085946, "auxiliary_loss_mlp": 0.0106617, "balance_loss_clip": 1.02361178, "balance_loss_mlp": 1.02456343, "epoch": 0.19197354576882608, "flos": 22198056652800.0, "grad_norm": 1.7622438233253315, "language_loss": 0.76553321, "learning_rate": 3.7310438217554687e-06, "loss": 0.78705442, "num_input_tokens_seen": 68922860, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.61328125, "step": 3193, "time_per_iteration": 2.3855626583099365 }, { "auxiliary_loss_clip": 0.01084739, "auxiliary_loss_mlp": 0.01064267, "balance_loss_clip": 1.02371192, "balance_loss_mlp": 1.02173471, "epoch": 0.19203366902149407, "flos": 24895132225920.0, "grad_norm": 1.979192657184348, "language_loss": 0.76786828, "learning_rate": 3.730848718849612e-06, "loss": 0.78935832, "num_input_tokens_seen": 68943000, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.6328125, "step": 3194, "time_per_iteration": 2.4153823852539062 }, { "auxiliary_loss_clip": 0.01023382, "auxiliary_loss_mlp": 0.01010983, "balance_loss_clip": 1.00430715, "balance_loss_mlp": 1.00406742, "epoch": 0.19209379227416204, "flos": 68413633885440.0, "grad_norm": 0.7797867732910367, "language_loss": 0.68477917, "learning_rate": 3.7306535503097985e-06, "loss": 0.70512283, "num_input_tokens_seen": 69000255, "router_z_loss_clip": 0.06689453, "router_z_loss_mlp": 0.19335938, "step": 3195, "time_per_iteration": 2.969627857208252 }, { "auxiliary_loss_clip": 0.01081339, "auxiliary_loss_mlp": 0.01067672, "balance_loss_clip": 1.02787995, "balance_loss_mlp": 1.02209496, "epoch": 0.19215391552683, "flos": 22054074739200.0, "grad_norm": 1.808216375548017, "language_loss": 0.75313151, "learning_rate": 3.730458316143429e-06, "loss": 0.77462161, "num_input_tokens_seen": 69019665, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.59375, "step": 3196, "time_per_iteration": 2.415837526321411 }, { "auxiliary_loss_clip": 0.01085512, "auxiliary_loss_mlp": 0.01067906, "balance_loss_clip": 1.02549171, "balance_loss_mlp": 1.02533531, "epoch": 0.19221403877949797, "flos": 20301923646720.0, "grad_norm": 2.3731255235729223, "language_loss": 0.84916329, "learning_rate": 3.7302630163579068e-06, "loss": 0.8706975, "num_input_tokens_seen": 69039055, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 0.6015625, "step": 3197, "time_per_iteration": 2.389824628829956 }, { "auxiliary_loss_clip": 0.01083172, "auxiliary_loss_mlp": 0.01063367, "balance_loss_clip": 1.02259755, "balance_loss_mlp": 1.02283478, "epoch": 0.19227416203216594, "flos": 23184213315840.0, "grad_norm": 2.0908205694070374, "language_loss": 0.83494025, "learning_rate": 3.7300676509606373e-06, "loss": 0.85640568, "num_input_tokens_seen": 69056370, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.60546875, "step": 3198, "time_per_iteration": 2.423679828643799 }, { "auxiliary_loss_clip": 0.01085099, "auxiliary_loss_mlp": 0.01064407, "balance_loss_clip": 1.02308929, "balance_loss_mlp": 1.02354372, "epoch": 0.1923342852848339, "flos": 25775397135360.0, "grad_norm": 1.7667595151577624, "language_loss": 0.80170572, "learning_rate": 3.729872219959029e-06, "loss": 0.82320082, "num_input_tokens_seen": 69075915, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.61328125, "step": 3199, "time_per_iteration": 2.450272798538208 }, { "auxiliary_loss_clip": 0.01083197, "auxiliary_loss_mlp": 0.01054686, "balance_loss_clip": 1.01706314, "balance_loss_mlp": 1.02469397, "epoch": 0.19239440853750187, "flos": 17127410964480.0, "grad_norm": 3.1141269632375814, "language_loss": 0.8540597, "learning_rate": 3.7296767233604934e-06, "loss": 0.87543845, "num_input_tokens_seen": 69094145, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5859375, "step": 3200, "time_per_iteration": 2.3859081268310547 }, { "auxiliary_loss_clip": 0.01083398, "auxiliary_loss_mlp": 0.01060581, "balance_loss_clip": 1.02224326, "balance_loss_mlp": 1.02414691, "epoch": 0.19245453179016986, "flos": 16434175541760.0, "grad_norm": 2.105446510890205, "language_loss": 0.80863166, "learning_rate": 3.729481161172443e-06, "loss": 0.83007145, "num_input_tokens_seen": 69111110, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.59375, "step": 3201, "time_per_iteration": 2.3685388565063477 }, { "auxiliary_loss_clip": 0.01082682, "auxiliary_loss_mlp": 0.01057986, "balance_loss_clip": 1.01831293, "balance_loss_mlp": 1.022053, "epoch": 0.19251465504283782, "flos": 20229234462720.0, "grad_norm": 2.367856076724108, "language_loss": 0.71128309, "learning_rate": 3.7292855334022927e-06, "loss": 0.73268974, "num_input_tokens_seen": 69130280, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.60546875, "step": 3202, "time_per_iteration": 2.396688222885132 }, { "auxiliary_loss_clip": 0.01081677, "auxiliary_loss_mlp": 0.01058993, "balance_loss_clip": 1.02082276, "balance_loss_mlp": 1.02357626, "epoch": 0.1925747782955058, "flos": 19463344767360.0, "grad_norm": 1.7449268271163034, "language_loss": 0.92699647, "learning_rate": 3.7290898400574627e-06, "loss": 0.94840318, "num_input_tokens_seen": 69149570, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.58203125, "step": 3203, "time_per_iteration": 2.3831541538238525 }, { "auxiliary_loss_clip": 0.01083652, "auxiliary_loss_mlp": 0.01059027, "balance_loss_clip": 1.01873422, "balance_loss_mlp": 1.02430594, "epoch": 0.19263490154817375, "flos": 17784615997440.0, "grad_norm": 2.191808487051013, "language_loss": 0.84436929, "learning_rate": 3.7288940811453725e-06, "loss": 0.86579603, "num_input_tokens_seen": 69168190, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.59375, "step": 3204, "time_per_iteration": 2.389613628387451 }, { "auxiliary_loss_clip": 0.01082067, "auxiliary_loss_mlp": 0.01058398, "balance_loss_clip": 1.01972651, "balance_loss_mlp": 1.02440012, "epoch": 0.19269502480084172, "flos": 17456118215040.0, "grad_norm": 2.0943506648191383, "language_loss": 0.77907646, "learning_rate": 3.7286982566734454e-06, "loss": 0.80048108, "num_input_tokens_seen": 69186950, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.578125, "step": 3205, "time_per_iteration": 2.3694772720336914 }, { "auxiliary_loss_clip": 0.01084801, "auxiliary_loss_mlp": 0.01061508, "balance_loss_clip": 1.02245498, "balance_loss_mlp": 1.02421451, "epoch": 0.19275514805350968, "flos": 21505833659520.0, "grad_norm": 2.40647781046479, "language_loss": 0.86815929, "learning_rate": 3.728502366649107e-06, "loss": 0.88962233, "num_input_tokens_seen": 69204850, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.609375, "step": 3206, "time_per_iteration": 2.400550365447998 }, { "auxiliary_loss_clip": 0.01023745, "auxiliary_loss_mlp": 0.01011363, "balance_loss_clip": 1.00633216, "balance_loss_mlp": 1.00745964, "epoch": 0.19281527130617768, "flos": 47693379928320.0, "grad_norm": 0.8649513357461319, "language_loss": 0.60694253, "learning_rate": 3.728306411079786e-06, "loss": 0.62729359, "num_input_tokens_seen": 69259200, "router_z_loss_clip": 0.05029297, "router_z_loss_mlp": 0.16308594, "step": 3207, "time_per_iteration": 2.8528149127960205 }, { "auxiliary_loss_clip": 0.01083167, "auxiliary_loss_mlp": 0.01058959, "balance_loss_clip": 1.02002525, "balance_loss_mlp": 1.02385688, "epoch": 0.19287539455884564, "flos": 11800467918720.0, "grad_norm": 2.028885252293734, "language_loss": 0.78113526, "learning_rate": 3.7281103899729125e-06, "loss": 0.80255651, "num_input_tokens_seen": 69275835, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.59375, "step": 3208, "time_per_iteration": 2.400811195373535 }, { "auxiliary_loss_clip": 0.01085929, "auxiliary_loss_mlp": 0.01062822, "balance_loss_clip": 1.02317262, "balance_loss_mlp": 1.02446091, "epoch": 0.1929355178115136, "flos": 20630386517760.0, "grad_norm": 1.9321199987250708, "language_loss": 0.62718225, "learning_rate": 3.7279143033359195e-06, "loss": 0.64866972, "num_input_tokens_seen": 69294810, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.6171875, "step": 3209, "time_per_iteration": 2.410543918609619 }, { "auxiliary_loss_clip": 0.01085459, "auxiliary_loss_mlp": 0.01073398, "balance_loss_clip": 1.02960062, "balance_loss_mlp": 1.02472281, "epoch": 0.19299564106418157, "flos": 40806309467520.0, "grad_norm": 1.8046361334581782, "language_loss": 0.83063042, "learning_rate": 3.727718151176243e-06, "loss": 0.85221899, "num_input_tokens_seen": 69316065, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 0.609375, "step": 3210, "time_per_iteration": 2.5702404975891113 }, { "auxiliary_loss_clip": 0.01080992, "auxiliary_loss_mlp": 0.01060753, "balance_loss_clip": 1.02408397, "balance_loss_mlp": 1.02318263, "epoch": 0.19305576431684954, "flos": 11360702033280.0, "grad_norm": 1.9242553054528904, "language_loss": 0.84630072, "learning_rate": 3.7275219335013217e-06, "loss": 0.8677181, "num_input_tokens_seen": 69332900, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.578125, "step": 3211, "time_per_iteration": 2.3886914253234863 }, { "auxiliary_loss_clip": 0.01023072, "auxiliary_loss_mlp": 0.01027038, "balance_loss_clip": 1.02231717, "balance_loss_mlp": 1.00753307, "epoch": 0.1931158875695175, "flos": 54509299171200.0, "grad_norm": 0.9825845950165659, "language_loss": 0.63727278, "learning_rate": 3.7273256503185953e-06, "loss": 0.65777385, "num_input_tokens_seen": 69382535, "router_z_loss_clip": 0.04711914, "router_z_loss_mlp": 0.15527344, "step": 3212, "time_per_iteration": 2.906512498855591 }, { "auxiliary_loss_clip": 0.01084352, "auxiliary_loss_mlp": 0.01073715, "balance_loss_clip": 1.03747559, "balance_loss_mlp": 1.02555978, "epoch": 0.19317601082218547, "flos": 19827419091840.0, "grad_norm": 1.5503197983534085, "language_loss": 0.77502429, "learning_rate": 3.7271293016355074e-06, "loss": 0.79660499, "num_input_tokens_seen": 69400600, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.5859375, "step": 3213, "time_per_iteration": 2.5043790340423584 }, { "auxiliary_loss_clip": 0.01087032, "auxiliary_loss_mlp": 0.0106132, "balance_loss_clip": 1.02171826, "balance_loss_mlp": 1.0256449, "epoch": 0.19323613407485346, "flos": 13151222576640.0, "grad_norm": 1.9578775752113764, "language_loss": 0.72586149, "learning_rate": 3.726932887459503e-06, "loss": 0.74734497, "num_input_tokens_seen": 69417350, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.6171875, "step": 3214, "time_per_iteration": 2.381150245666504 }, { "auxiliary_loss_clip": 0.01084181, "auxiliary_loss_mlp": 0.010619, "balance_loss_clip": 1.02117848, "balance_loss_mlp": 1.02409434, "epoch": 0.19329625732752143, "flos": 14026390427520.0, "grad_norm": 2.3404290889292625, "language_loss": 0.77295029, "learning_rate": 3.72673640779803e-06, "loss": 0.79441112, "num_input_tokens_seen": 69431845, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.6015625, "step": 3215, "time_per_iteration": 2.3550455570220947 }, { "auxiliary_loss_clip": 0.01081176, "auxiliary_loss_mlp": 0.01060444, "balance_loss_clip": 1.02403808, "balance_loss_mlp": 1.02485204, "epoch": 0.1933563805801894, "flos": 23440580496000.0, "grad_norm": 2.02210099837734, "language_loss": 0.89340574, "learning_rate": 3.72653986265854e-06, "loss": 0.91482198, "num_input_tokens_seen": 69453275, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.5625, "step": 3216, "time_per_iteration": 2.44232439994812 }, { "auxiliary_loss_clip": 0.01084926, "auxiliary_loss_mlp": 0.01065443, "balance_loss_clip": 1.02596092, "balance_loss_mlp": 1.02718139, "epoch": 0.19341650383285736, "flos": 20484275011200.0, "grad_norm": 1.6429507209507557, "language_loss": 0.81823635, "learning_rate": 3.726343252048485e-06, "loss": 0.83974004, "num_input_tokens_seen": 69471830, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.578125, "step": 3217, "time_per_iteration": 2.4140210151672363 }, { "auxiliary_loss_clip": 0.01085421, "auxiliary_loss_mlp": 0.01066515, "balance_loss_clip": 1.02259851, "balance_loss_mlp": 1.02481484, "epoch": 0.19347662708552532, "flos": 17857514649600.0, "grad_norm": 2.5065899131835443, "language_loss": 0.66955304, "learning_rate": 3.7261465759753206e-06, "loss": 0.6910724, "num_input_tokens_seen": 69489320, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 0.60546875, "step": 3218, "time_per_iteration": 2.3839337825775146 }, { "auxiliary_loss_clip": 0.01086285, "auxiliary_loss_mlp": 0.01064765, "balance_loss_clip": 1.02273226, "balance_loss_mlp": 1.02641344, "epoch": 0.1935367503381933, "flos": 18186256811520.0, "grad_norm": 1.6347597427004064, "language_loss": 0.81344879, "learning_rate": 3.7259498344465053e-06, "loss": 0.83495927, "num_input_tokens_seen": 69506665, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 0.59765625, "step": 3219, "time_per_iteration": 3.8571202754974365 }, { "auxiliary_loss_clip": 0.01083486, "auxiliary_loss_mlp": 0.01061873, "balance_loss_clip": 1.02298713, "balance_loss_mlp": 1.0255233, "epoch": 0.19359687359086128, "flos": 15956319496320.0, "grad_norm": 1.8562803379847965, "language_loss": 0.8842082, "learning_rate": 3.7257530274694993e-06, "loss": 0.90566182, "num_input_tokens_seen": 69523835, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.578125, "step": 3220, "time_per_iteration": 3.7959933280944824 }, { "auxiliary_loss_clip": 0.01079554, "auxiliary_loss_mlp": 0.01056569, "balance_loss_clip": 1.02137828, "balance_loss_mlp": 1.02504492, "epoch": 0.19365699684352924, "flos": 21214134316800.0, "grad_norm": 1.933790202867839, "language_loss": 0.85026824, "learning_rate": 3.725556155051766e-06, "loss": 0.87162948, "num_input_tokens_seen": 69542620, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.546875, "step": 3221, "time_per_iteration": 2.410780191421509 }, { "auxiliary_loss_clip": 0.01082433, "auxiliary_loss_mlp": 0.0106338, "balance_loss_clip": 1.02728367, "balance_loss_mlp": 1.02499032, "epoch": 0.1937171200961972, "flos": 17310146353920.0, "grad_norm": 2.1849648010151865, "language_loss": 0.88021344, "learning_rate": 3.7253592172007702e-06, "loss": 0.90167159, "num_input_tokens_seen": 69561130, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.57421875, "step": 3222, "time_per_iteration": 3.8178672790527344 }, { "auxiliary_loss_clip": 0.01082276, "auxiliary_loss_mlp": 0.01060369, "balance_loss_clip": 1.01955175, "balance_loss_mlp": 1.02333605, "epoch": 0.19377724334886517, "flos": 22634924895360.0, "grad_norm": 1.9763383561560408, "language_loss": 0.79929829, "learning_rate": 3.72516221392398e-06, "loss": 0.82072473, "num_input_tokens_seen": 69580425, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.58984375, "step": 3223, "time_per_iteration": 2.4038100242614746 }, { "auxiliary_loss_clip": 0.01081392, "auxiliary_loss_mlp": 0.01053121, "balance_loss_clip": 1.01652336, "balance_loss_mlp": 1.02402794, "epoch": 0.19383736660153314, "flos": 15077136839040.0, "grad_norm": 1.9762995118129922, "language_loss": 0.77114952, "learning_rate": 3.7249651452288653e-06, "loss": 0.79249465, "num_input_tokens_seen": 69597085, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.57421875, "step": 3224, "time_per_iteration": 3.8199617862701416 }, { "auxiliary_loss_clip": 0.01082199, "auxiliary_loss_mlp": 0.01061036, "balance_loss_clip": 1.02167296, "balance_loss_mlp": 1.02512729, "epoch": 0.1938974898542011, "flos": 47118152367360.0, "grad_norm": 1.9528349696147782, "language_loss": 0.73254651, "learning_rate": 3.7247680111229e-06, "loss": 0.75397885, "num_input_tokens_seen": 69618885, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.5703125, "step": 3225, "time_per_iteration": 2.6062021255493164 }, { "auxiliary_loss_clip": 0.01083732, "auxiliary_loss_mlp": 0.01055309, "balance_loss_clip": 1.01565969, "balance_loss_mlp": 1.02460873, "epoch": 0.19395761310686907, "flos": 25811357702400.0, "grad_norm": 2.0575037467472184, "language_loss": 0.70927036, "learning_rate": 3.7245708116135585e-06, "loss": 0.73066068, "num_input_tokens_seen": 69638200, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.58984375, "step": 3226, "time_per_iteration": 2.4351251125335693 }, { "auxiliary_loss_clip": 0.01086455, "auxiliary_loss_mlp": 0.01060883, "balance_loss_clip": 1.02092433, "balance_loss_mlp": 1.02792096, "epoch": 0.19401773635953706, "flos": 23038485834240.0, "grad_norm": 1.5417084243803894, "language_loss": 0.77933848, "learning_rate": 3.7243735467083193e-06, "loss": 0.80081189, "num_input_tokens_seen": 69657550, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 0.5859375, "step": 3227, "time_per_iteration": 2.4073750972747803 }, { "auxiliary_loss_clip": 0.01083169, "auxiliary_loss_mlp": 0.01064298, "balance_loss_clip": 1.02669954, "balance_loss_mlp": 1.02405262, "epoch": 0.19407785961220503, "flos": 15919974904320.0, "grad_norm": 1.8953026924740666, "language_loss": 0.70795715, "learning_rate": 3.724176216414662e-06, "loss": 0.72943187, "num_input_tokens_seen": 69675005, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.58984375, "step": 3228, "time_per_iteration": 2.3855338096618652 }, { "auxiliary_loss_clip": 0.01083768, "auxiliary_loss_mlp": 0.01062785, "balance_loss_clip": 1.01979852, "balance_loss_mlp": 1.02526546, "epoch": 0.194137982864873, "flos": 25920531123840.0, "grad_norm": 1.915018809092652, "language_loss": 0.75412071, "learning_rate": 3.72397882074007e-06, "loss": 0.77558625, "num_input_tokens_seen": 69696455, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.5859375, "step": 3229, "time_per_iteration": 2.4520022869110107 }, { "auxiliary_loss_clip": 0.01083981, "auxiliary_loss_mlp": 0.01066145, "balance_loss_clip": 1.02885604, "balance_loss_mlp": 1.0254333, "epoch": 0.19419810611754096, "flos": 13260500732160.0, "grad_norm": 1.8344317906532686, "language_loss": 0.67613626, "learning_rate": 3.7237813596920285e-06, "loss": 0.6976375, "num_input_tokens_seen": 69714245, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.5859375, "step": 3230, "time_per_iteration": 2.389158010482788 }, { "auxiliary_loss_clip": 0.01081515, "auxiliary_loss_mlp": 0.01061162, "balance_loss_clip": 1.0223949, "balance_loss_mlp": 1.0238595, "epoch": 0.19425822937020892, "flos": 15704665349760.0, "grad_norm": 2.0465300869156025, "language_loss": 0.83470774, "learning_rate": 3.7235838332780254e-06, "loss": 0.85613453, "num_input_tokens_seen": 69731515, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.578125, "step": 3231, "time_per_iteration": 2.4355874061584473 }, { "auxiliary_loss_clip": 0.0108654, "auxiliary_loss_mlp": 0.01071218, "balance_loss_clip": 1.02985239, "balance_loss_mlp": 1.02692461, "epoch": 0.1943183526228769, "flos": 23104472037120.0, "grad_norm": 1.6758413739534268, "language_loss": 0.87863314, "learning_rate": 3.72338624150555e-06, "loss": 0.90021074, "num_input_tokens_seen": 69748885, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.59765625, "step": 3232, "time_per_iteration": 2.4273316860198975 }, { "auxiliary_loss_clip": 0.01083553, "auxiliary_loss_mlp": 0.01058245, "balance_loss_clip": 1.02012181, "balance_loss_mlp": 1.02533555, "epoch": 0.19437847587554485, "flos": 24711593875200.0, "grad_norm": 1.9562143056477546, "language_loss": 0.8675139, "learning_rate": 3.723188584382096e-06, "loss": 0.88893187, "num_input_tokens_seen": 69767540, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.58203125, "step": 3233, "time_per_iteration": 2.4416403770446777 }, { "auxiliary_loss_clip": 0.01087363, "auxiliary_loss_mlp": 0.01068991, "balance_loss_clip": 1.02538395, "balance_loss_mlp": 1.02430928, "epoch": 0.19443859912821285, "flos": 23114910533760.0, "grad_norm": 1.6508397711375022, "language_loss": 0.90070772, "learning_rate": 3.722990861915158e-06, "loss": 0.92227125, "num_input_tokens_seen": 69789340, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.6328125, "step": 3234, "time_per_iteration": 2.4311366081237793 }, { "auxiliary_loss_clip": 0.01086359, "auxiliary_loss_mlp": 0.01069979, "balance_loss_clip": 1.02751637, "balance_loss_mlp": 1.02408636, "epoch": 0.1944987223808808, "flos": 15083525617920.0, "grad_norm": 2.308764307626172, "language_loss": 0.80655551, "learning_rate": 3.722793074112234e-06, "loss": 0.82811892, "num_input_tokens_seen": 69806470, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.625, "step": 3235, "time_per_iteration": 2.3727498054504395 }, { "auxiliary_loss_clip": 0.01083305, "auxiliary_loss_mlp": 0.01058739, "balance_loss_clip": 1.02075875, "balance_loss_mlp": 1.02464449, "epoch": 0.19455884563354878, "flos": 17125979598720.0, "grad_norm": 1.9203238460183716, "language_loss": 0.81176734, "learning_rate": 3.7225952209808233e-06, "loss": 0.83318782, "num_input_tokens_seen": 69822655, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.5859375, "step": 3236, "time_per_iteration": 2.5392937660217285 }, { "auxiliary_loss_clip": 0.01083235, "auxiliary_loss_mlp": 0.0106356, "balance_loss_clip": 1.02348173, "balance_loss_mlp": 1.02295601, "epoch": 0.19461896888621674, "flos": 20192366200320.0, "grad_norm": 1.8909620424721945, "language_loss": 0.77735883, "learning_rate": 3.72239730252843e-06, "loss": 0.79882681, "num_input_tokens_seen": 69841895, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 0.6015625, "step": 3237, "time_per_iteration": 2.414395332336426 }, { "auxiliary_loss_clip": 0.01086879, "auxiliary_loss_mlp": 0.01063061, "balance_loss_clip": 1.02229166, "balance_loss_mlp": 1.02436304, "epoch": 0.1946790921388847, "flos": 25300194353280.0, "grad_norm": 1.6472097756194939, "language_loss": 0.7622779, "learning_rate": 3.7221993187625583e-06, "loss": 0.7837773, "num_input_tokens_seen": 69862220, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.625, "step": 3238, "time_per_iteration": 2.433539867401123 }, { "auxiliary_loss_clip": 0.01084108, "auxiliary_loss_mlp": 0.01066145, "balance_loss_clip": 1.02189469, "balance_loss_mlp": 1.02447987, "epoch": 0.19473921539155267, "flos": 20192366200320.0, "grad_norm": 3.0936659706921215, "language_loss": 0.75546062, "learning_rate": 3.7220012696907155e-06, "loss": 0.77696317, "num_input_tokens_seen": 69881830, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.59375, "step": 3239, "time_per_iteration": 2.458822727203369 }, { "auxiliary_loss_clip": 0.01083518, "auxiliary_loss_mlp": 0.01062987, "balance_loss_clip": 1.02278948, "balance_loss_mlp": 1.02388477, "epoch": 0.19479933864422067, "flos": 20886474407040.0, "grad_norm": 2.114702087426134, "language_loss": 0.75190568, "learning_rate": 3.721803155320412e-06, "loss": 0.77337074, "num_input_tokens_seen": 69900515, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.59765625, "step": 3240, "time_per_iteration": 2.4102861881256104 }, { "auxiliary_loss_clip": 0.01082747, "auxiliary_loss_mlp": 0.01056392, "balance_loss_clip": 1.01619446, "balance_loss_mlp": 1.02393258, "epoch": 0.19485946189688863, "flos": 23293945319040.0, "grad_norm": 1.8024648636142755, "language_loss": 0.68317902, "learning_rate": 3.7216049756591606e-06, "loss": 0.70457041, "num_input_tokens_seen": 69920060, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.58984375, "step": 3241, "time_per_iteration": 2.401703119277954 }, { "auxiliary_loss_clip": 0.01083875, "auxiliary_loss_mlp": 0.01062634, "balance_loss_clip": 1.0203625, "balance_loss_mlp": 1.02547669, "epoch": 0.1949195851495566, "flos": 23293910407680.0, "grad_norm": 1.3808958963573523, "language_loss": 0.84263456, "learning_rate": 3.7214067307144754e-06, "loss": 0.86409962, "num_input_tokens_seen": 69939820, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.5859375, "step": 3242, "time_per_iteration": 2.474928855895996 }, { "auxiliary_loss_clip": 0.01026446, "auxiliary_loss_mlp": 0.01008213, "balance_loss_clip": 1.00435114, "balance_loss_mlp": 1.01034582, "epoch": 0.19497970840222456, "flos": 64959536102400.0, "grad_norm": 0.8457895392540817, "language_loss": 0.57710612, "learning_rate": 3.721208420493875e-06, "loss": 0.59745264, "num_input_tokens_seen": 70002145, "router_z_loss_clip": 0.03857422, "router_z_loss_mlp": 0.16113281, "step": 3243, "time_per_iteration": 3.0627028942108154 }, { "auxiliary_loss_clip": 0.01086105, "auxiliary_loss_mlp": 0.01066338, "balance_loss_clip": 1.02416134, "balance_loss_mlp": 1.0247947, "epoch": 0.19503983165489253, "flos": 19643741095680.0, "grad_norm": 1.8690854211890973, "language_loss": 0.85135633, "learning_rate": 3.7210100450048784e-06, "loss": 0.87288076, "num_input_tokens_seen": 70020510, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.61328125, "step": 3244, "time_per_iteration": 2.4119060039520264 }, { "auxiliary_loss_clip": 0.01087512, "auxiliary_loss_mlp": 0.01076431, "balance_loss_clip": 1.03191853, "balance_loss_mlp": 1.02830219, "epoch": 0.1950999549075605, "flos": 21140921462400.0, "grad_norm": 1.7779362489232065, "language_loss": 0.7875042, "learning_rate": 3.7208116042550088e-06, "loss": 0.8091436, "num_input_tokens_seen": 70040760, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.59375, "step": 3245, "time_per_iteration": 2.409534215927124 }, { "auxiliary_loss_clip": 0.01087113, "auxiliary_loss_mlp": 0.01073467, "balance_loss_clip": 1.02828693, "balance_loss_mlp": 1.02603102, "epoch": 0.19516007816022846, "flos": 20883821143680.0, "grad_norm": 1.9074995762852822, "language_loss": 0.86473, "learning_rate": 3.7206130982517906e-06, "loss": 0.88633585, "num_input_tokens_seen": 70058720, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 0.609375, "step": 3246, "time_per_iteration": 2.4371230602264404 }, { "auxiliary_loss_clip": 0.0108779, "auxiliary_loss_mlp": 0.01073544, "balance_loss_clip": 1.02957916, "balance_loss_mlp": 1.02581203, "epoch": 0.19522020141289645, "flos": 16909552880640.0, "grad_norm": 1.9773480785277209, "language_loss": 0.79132938, "learning_rate": 3.7204145270027514e-06, "loss": 0.81294274, "num_input_tokens_seen": 70076470, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 0.62109375, "step": 3247, "time_per_iteration": 2.3712823390960693 }, { "auxiliary_loss_clip": 0.01085017, "auxiliary_loss_mlp": 0.01062961, "balance_loss_clip": 1.02188134, "balance_loss_mlp": 1.02411008, "epoch": 0.19528032466556441, "flos": 26723603283840.0, "grad_norm": 1.7209590045377845, "language_loss": 0.77312672, "learning_rate": 3.720215890515421e-06, "loss": 0.79460657, "num_input_tokens_seen": 70096220, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.609375, "step": 3248, "time_per_iteration": 2.449979305267334 }, { "auxiliary_loss_clip": 0.01084721, "auxiliary_loss_mlp": 0.01064659, "balance_loss_clip": 1.02255416, "balance_loss_mlp": 1.02447665, "epoch": 0.19534044791823238, "flos": 21031748040960.0, "grad_norm": 2.6796879430926683, "language_loss": 0.79201245, "learning_rate": 3.7200171887973316e-06, "loss": 0.81350631, "num_input_tokens_seen": 70114800, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.6015625, "step": 3249, "time_per_iteration": 2.399061918258667 }, { "auxiliary_loss_clip": 0.01086065, "auxiliary_loss_mlp": 0.01071364, "balance_loss_clip": 1.02897298, "balance_loss_mlp": 1.02537286, "epoch": 0.19540057117090034, "flos": 22343016084480.0, "grad_norm": 1.5513463413005752, "language_loss": 0.74515313, "learning_rate": 3.7198184218560176e-06, "loss": 0.76672739, "num_input_tokens_seen": 70134930, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 0.60546875, "step": 3250, "time_per_iteration": 2.4383411407470703 }, { "auxiliary_loss_clip": 0.01079614, "auxiliary_loss_mlp": 0.010733, "balance_loss_clip": 1.03322172, "balance_loss_mlp": 1.02247238, "epoch": 0.1954606944235683, "flos": 20300631926400.0, "grad_norm": 1.840475524506516, "language_loss": 0.81221056, "learning_rate": 3.719619589699017e-06, "loss": 0.83373964, "num_input_tokens_seen": 70152045, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 0.5703125, "step": 3251, "time_per_iteration": 2.3836634159088135 }, { "auxiliary_loss_clip": 0.01085229, "auxiliary_loss_mlp": 0.01065036, "balance_loss_clip": 1.02619755, "balance_loss_mlp": 1.02483153, "epoch": 0.19552081767623627, "flos": 17345932364160.0, "grad_norm": 2.1765413347089018, "language_loss": 0.86041772, "learning_rate": 3.7194206923338695e-06, "loss": 0.88192034, "num_input_tokens_seen": 70169240, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.60546875, "step": 3252, "time_per_iteration": 2.3689029216766357 }, { "auxiliary_loss_clip": 0.01087349, "auxiliary_loss_mlp": 0.01079276, "balance_loss_clip": 1.03168738, "balance_loss_mlp": 1.02476823, "epoch": 0.19558094092890424, "flos": 31976286134400.0, "grad_norm": 1.59396559352403, "language_loss": 0.74936748, "learning_rate": 3.719221729768117e-06, "loss": 0.77103382, "num_input_tokens_seen": 70192690, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 0.625, "step": 3253, "time_per_iteration": 2.5081100463867188 }, { "auxiliary_loss_clip": 0.01086354, "auxiliary_loss_mlp": 0.01066709, "balance_loss_clip": 1.02315032, "balance_loss_mlp": 1.02419031, "epoch": 0.19564106418157223, "flos": 22267918016640.0, "grad_norm": 1.8586726193090524, "language_loss": 0.78355157, "learning_rate": 3.7190227020093037e-06, "loss": 0.8050822, "num_input_tokens_seen": 70209685, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 0.62109375, "step": 3254, "time_per_iteration": 2.396371364593506 }, { "auxiliary_loss_clip": 0.01022588, "auxiliary_loss_mlp": 0.01010999, "balance_loss_clip": 1.00637412, "balance_loss_mlp": 1.00644422, "epoch": 0.1957011874342402, "flos": 54362000678400.0, "grad_norm": 0.7617728207790383, "language_loss": 0.55383474, "learning_rate": 3.7188236090649774e-06, "loss": 0.57417059, "num_input_tokens_seen": 70265050, "router_z_loss_clip": 0.04614258, "router_z_loss_mlp": 0.16210938, "step": 3255, "time_per_iteration": 3.0039737224578857 }, { "auxiliary_loss_clip": 0.01087756, "auxiliary_loss_mlp": 0.01068649, "balance_loss_clip": 1.02322984, "balance_loss_mlp": 1.02572203, "epoch": 0.19576131068690816, "flos": 16505817384960.0, "grad_norm": 2.362473325419355, "language_loss": 0.7297051, "learning_rate": 3.718624450942688e-06, "loss": 0.75126916, "num_input_tokens_seen": 70281830, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 0.6171875, "step": 3256, "time_per_iteration": 2.4022655487060547 }, { "auxiliary_loss_clip": 0.0108208, "auxiliary_loss_mlp": 0.01053381, "balance_loss_clip": 1.014328, "balance_loss_mlp": 1.02375531, "epoch": 0.19582143393957613, "flos": 14718822888960.0, "grad_norm": 2.1799805916113684, "language_loss": 0.81944227, "learning_rate": 3.718425227649987e-06, "loss": 0.84079689, "num_input_tokens_seen": 70297420, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.58203125, "step": 3257, "time_per_iteration": 2.3655078411102295 }, { "auxiliary_loss_clip": 0.01084968, "auxiliary_loss_mlp": 0.01058792, "balance_loss_clip": 1.01816547, "balance_loss_mlp": 1.0251615, "epoch": 0.1958815571922441, "flos": 24424363186560.0, "grad_norm": 1.7454857819960676, "language_loss": 0.76814157, "learning_rate": 3.7182259391944292e-06, "loss": 0.78957915, "num_input_tokens_seen": 70319210, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.6015625, "step": 3258, "time_per_iteration": 2.456507921218872 }, { "auxiliary_loss_clip": 0.01085862, "auxiliary_loss_mlp": 0.01061372, "balance_loss_clip": 1.02029228, "balance_loss_mlp": 1.02504611, "epoch": 0.19594168044491206, "flos": 24899112120960.0, "grad_norm": 1.7018844568082085, "language_loss": 0.7578097, "learning_rate": 3.7180265855835714e-06, "loss": 0.77928209, "num_input_tokens_seen": 70339045, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.609375, "step": 3259, "time_per_iteration": 3.8855929374694824 }, { "auxiliary_loss_clip": 0.01085227, "auxiliary_loss_mlp": 0.01061585, "balance_loss_clip": 1.02107787, "balance_loss_mlp": 1.02439713, "epoch": 0.19600180369758005, "flos": 12056206694400.0, "grad_norm": 2.1035003053582044, "language_loss": 0.78808713, "learning_rate": 3.7178271668249735e-06, "loss": 0.80955523, "num_input_tokens_seen": 70356505, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.609375, "step": 3260, "time_per_iteration": 3.84858775138855 }, { "auxiliary_loss_clip": 0.01084348, "auxiliary_loss_mlp": 0.01062177, "balance_loss_clip": 1.02324343, "balance_loss_mlp": 1.02443933, "epoch": 0.19606192695024802, "flos": 20849152296960.0, "grad_norm": 2.002539916446017, "language_loss": 0.84324801, "learning_rate": 3.7176276829261975e-06, "loss": 0.86471331, "num_input_tokens_seen": 70375410, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.59765625, "step": 3261, "time_per_iteration": 2.4095029830932617 }, { "auxiliary_loss_clip": 0.01083439, "auxiliary_loss_mlp": 0.01065347, "balance_loss_clip": 1.02376664, "balance_loss_mlp": 1.02535892, "epoch": 0.19612205020291598, "flos": 28474253187840.0, "grad_norm": 1.7084418057694921, "language_loss": 0.77505237, "learning_rate": 3.717428133894807e-06, "loss": 0.79654026, "num_input_tokens_seen": 70396315, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.58203125, "step": 3262, "time_per_iteration": 3.893674373626709 }, { "auxiliary_loss_clip": 0.01085646, "auxiliary_loss_mlp": 0.01057003, "balance_loss_clip": 1.01837969, "balance_loss_mlp": 1.02680981, "epoch": 0.19618217345558395, "flos": 25555444369920.0, "grad_norm": 1.508612408541025, "language_loss": 0.86876476, "learning_rate": 3.71722851973837e-06, "loss": 0.8901912, "num_input_tokens_seen": 70417945, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.58984375, "step": 3263, "time_per_iteration": 2.4454469680786133 }, { "auxiliary_loss_clip": 0.01081015, "auxiliary_loss_mlp": 0.01066584, "balance_loss_clip": 1.02848434, "balance_loss_mlp": 1.02335107, "epoch": 0.1962422967082519, "flos": 25263256268160.0, "grad_norm": 1.639808589862323, "language_loss": 0.75494194, "learning_rate": 3.717028840464455e-06, "loss": 0.77641797, "num_input_tokens_seen": 70438690, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.578125, "step": 3264, "time_per_iteration": 3.8143575191497803 }, { "auxiliary_loss_clip": 0.01080165, "auxiliary_loss_mlp": 0.01059377, "balance_loss_clip": 1.02285099, "balance_loss_mlp": 1.02497673, "epoch": 0.19630241996091988, "flos": 18806349202560.0, "grad_norm": 1.8375671917416017, "language_loss": 0.8084079, "learning_rate": 3.7168290960806344e-06, "loss": 0.82980335, "num_input_tokens_seen": 70455385, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.55078125, "step": 3265, "time_per_iteration": 2.3842501640319824 }, { "auxiliary_loss_clip": 0.01020701, "auxiliary_loss_mlp": 0.01013692, "balance_loss_clip": 1.00925708, "balance_loss_mlp": 1.0046134, "epoch": 0.19636254321358784, "flos": 62318287526400.0, "grad_norm": 0.7881727588794616, "language_loss": 0.53524613, "learning_rate": 3.716629286594483e-06, "loss": 0.55559003, "num_input_tokens_seen": 70514280, "router_z_loss_clip": 0.04443359, "router_z_loss_mlp": 0.16113281, "step": 3266, "time_per_iteration": 3.0483055114746094 }, { "auxiliary_loss_clip": 0.01085162, "auxiliary_loss_mlp": 0.01062716, "balance_loss_clip": 1.02077794, "balance_loss_mlp": 1.02371573, "epoch": 0.19642266646625584, "flos": 21068267189760.0, "grad_norm": 1.7849483017988634, "language_loss": 0.82185704, "learning_rate": 3.7164294120135767e-06, "loss": 0.84333581, "num_input_tokens_seen": 70531800, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.6171875, "step": 3267, "time_per_iteration": 2.384989023208618 }, { "auxiliary_loss_clip": 0.01078492, "auxiliary_loss_mlp": 0.01052755, "balance_loss_clip": 1.01675439, "balance_loss_mlp": 1.02344894, "epoch": 0.1964827897189238, "flos": 14537763244800.0, "grad_norm": 1.814344854024507, "language_loss": 0.88595629, "learning_rate": 3.7162294723454953e-06, "loss": 0.90726876, "num_input_tokens_seen": 70550615, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.55078125, "step": 3268, "time_per_iteration": 2.3979086875915527 }, { "auxiliary_loss_clip": 0.01083368, "auxiliary_loss_mlp": 0.01054692, "balance_loss_clip": 1.01952481, "balance_loss_mlp": 1.0262723, "epoch": 0.19654291297159177, "flos": 19243636381440.0, "grad_norm": 2.029877590060043, "language_loss": 0.72062612, "learning_rate": 3.7160294675978197e-06, "loss": 0.74200672, "num_input_tokens_seen": 70568690, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.5703125, "step": 3269, "time_per_iteration": 2.3872671127319336 }, { "auxiliary_loss_clip": 0.01084577, "auxiliary_loss_mlp": 0.01057571, "balance_loss_clip": 1.02095032, "balance_loss_mlp": 1.02571678, "epoch": 0.19660303622425973, "flos": 25774524351360.0, "grad_norm": 1.7691858619469076, "language_loss": 0.81743562, "learning_rate": 3.715829397778135e-06, "loss": 0.83885705, "num_input_tokens_seen": 70588665, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.58984375, "step": 3270, "time_per_iteration": 2.443452835083008 }, { "auxiliary_loss_clip": 0.01080909, "auxiliary_loss_mlp": 0.01055231, "balance_loss_clip": 1.01744175, "balance_loss_mlp": 1.0236578, "epoch": 0.1966631594769277, "flos": 20594041925760.0, "grad_norm": 1.9720930003833919, "language_loss": 0.85886109, "learning_rate": 3.715629262894028e-06, "loss": 0.88022244, "num_input_tokens_seen": 70606900, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5703125, "step": 3271, "time_per_iteration": 2.3985233306884766 }, { "auxiliary_loss_clip": 0.01078661, "auxiliary_loss_mlp": 0.01059644, "balance_loss_clip": 1.02106833, "balance_loss_mlp": 1.02459693, "epoch": 0.19672328272959566, "flos": 23622059076480.0, "grad_norm": 1.722700810482503, "language_loss": 0.81038868, "learning_rate": 3.715429062953087e-06, "loss": 0.83177179, "num_input_tokens_seen": 70625955, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.5390625, "step": 3272, "time_per_iteration": 2.460452079772949 }, { "auxiliary_loss_clip": 0.01080254, "auxiliary_loss_mlp": 0.01054354, "balance_loss_clip": 1.01914001, "balance_loss_mlp": 1.02412224, "epoch": 0.19678340598226365, "flos": 23109848386560.0, "grad_norm": 1.7585527412374538, "language_loss": 0.82201338, "learning_rate": 3.7152287979629043e-06, "loss": 0.84335953, "num_input_tokens_seen": 70646090, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.5625, "step": 3273, "time_per_iteration": 2.4052979946136475 }, { "auxiliary_loss_clip": 0.0108095, "auxiliary_loss_mlp": 0.01064836, "balance_loss_clip": 1.02780986, "balance_loss_mlp": 1.02452135, "epoch": 0.19684352923493162, "flos": 24533711164800.0, "grad_norm": 1.5992888757573545, "language_loss": 0.78954595, "learning_rate": 3.7150284679310735e-06, "loss": 0.8110038, "num_input_tokens_seen": 70666065, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.5625, "step": 3274, "time_per_iteration": 2.418731689453125 }, { "auxiliary_loss_clip": 0.01078858, "auxiliary_loss_mlp": 0.01055808, "balance_loss_clip": 1.01866221, "balance_loss_mlp": 1.02325594, "epoch": 0.19690365248759958, "flos": 21795438320640.0, "grad_norm": 2.4849995833887903, "language_loss": 0.83450687, "learning_rate": 3.7148280728651914e-06, "loss": 0.85585356, "num_input_tokens_seen": 70681580, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.5546875, "step": 3275, "time_per_iteration": 2.3808791637420654 }, { "auxiliary_loss_clip": 0.0107878, "auxiliary_loss_mlp": 0.01056237, "balance_loss_clip": 1.02006841, "balance_loss_mlp": 1.02297354, "epoch": 0.19696377574026755, "flos": 19055803933440.0, "grad_norm": 1.8051915476854958, "language_loss": 0.82332826, "learning_rate": 3.7146276127728563e-06, "loss": 0.8446784, "num_input_tokens_seen": 70697745, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.5546875, "step": 3276, "time_per_iteration": 2.367443323135376 }, { "auxiliary_loss_clip": 0.01079407, "auxiliary_loss_mlp": 0.01049442, "balance_loss_clip": 1.01148558, "balance_loss_mlp": 1.02362704, "epoch": 0.19702389899293551, "flos": 22819545498240.0, "grad_norm": 2.5247899354661456, "language_loss": 0.90818912, "learning_rate": 3.7144270876616713e-06, "loss": 0.92947757, "num_input_tokens_seen": 70715110, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.55859375, "step": 3277, "time_per_iteration": 2.400944948196411 }, { "auxiliary_loss_clip": 0.010814, "auxiliary_loss_mlp": 0.01064346, "balance_loss_clip": 1.02214634, "balance_loss_mlp": 1.02279365, "epoch": 0.19708402224560348, "flos": 22893107466240.0, "grad_norm": 1.8464015012354407, "language_loss": 0.64160037, "learning_rate": 3.714226497539239e-06, "loss": 0.6630578, "num_input_tokens_seen": 70734715, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.5859375, "step": 3278, "time_per_iteration": 2.389479160308838 }, { "auxiliary_loss_clip": 0.01081127, "auxiliary_loss_mlp": 0.01067398, "balance_loss_clip": 1.02860701, "balance_loss_mlp": 1.02465177, "epoch": 0.19714414549827144, "flos": 25661440857600.0, "grad_norm": 2.194794940533298, "language_loss": 0.76210839, "learning_rate": 3.714025842413166e-06, "loss": 0.78359365, "num_input_tokens_seen": 70752650, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.5625, "step": 3279, "time_per_iteration": 2.4656901359558105 }, { "auxiliary_loss_clip": 0.01080549, "auxiliary_loss_mlp": 0.01059672, "balance_loss_clip": 1.02517271, "balance_loss_mlp": 1.02327538, "epoch": 0.19720426875093944, "flos": 23914666114560.0, "grad_norm": 1.569702998998282, "language_loss": 0.84142232, "learning_rate": 3.713825122291061e-06, "loss": 0.86282456, "num_input_tokens_seen": 70772365, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.5703125, "step": 3280, "time_per_iteration": 2.425898313522339 }, { "auxiliary_loss_clip": 0.01080791, "auxiliary_loss_mlp": 0.01055744, "balance_loss_clip": 1.02057719, "balance_loss_mlp": 1.02253139, "epoch": 0.1972643920036074, "flos": 13881081882240.0, "grad_norm": 1.9898685995801606, "language_loss": 0.79225934, "learning_rate": 3.713624337180536e-06, "loss": 0.81362468, "num_input_tokens_seen": 70790340, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.58203125, "step": 3281, "time_per_iteration": 2.374589443206787 }, { "auxiliary_loss_clip": 0.01077823, "auxiliary_loss_mlp": 0.01060471, "balance_loss_clip": 1.02635348, "balance_loss_mlp": 1.02358842, "epoch": 0.19732451525627537, "flos": 19862611608960.0, "grad_norm": 1.7063324202471128, "language_loss": 0.81290251, "learning_rate": 3.7134234870892045e-06, "loss": 0.83428544, "num_input_tokens_seen": 70809295, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.54296875, "step": 3282, "time_per_iteration": 2.396775722503662 }, { "auxiliary_loss_clip": 0.01081887, "auxiliary_loss_mlp": 0.01053984, "balance_loss_clip": 1.01769662, "balance_loss_mlp": 1.02379394, "epoch": 0.19738463850894333, "flos": 24972255152640.0, "grad_norm": 2.021273994852195, "language_loss": 0.74001527, "learning_rate": 3.7132225720246826e-06, "loss": 0.761374, "num_input_tokens_seen": 70828765, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.58203125, "step": 3283, "time_per_iteration": 2.4460651874542236 }, { "auxiliary_loss_clip": 0.01078383, "auxiliary_loss_mlp": 0.01068565, "balance_loss_clip": 1.03294516, "balance_loss_mlp": 1.02197134, "epoch": 0.1974447617616113, "flos": 18367909948800.0, "grad_norm": 1.6573146809718862, "language_loss": 0.80710816, "learning_rate": 3.7130215919945886e-06, "loss": 0.82857764, "num_input_tokens_seen": 70846805, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.56640625, "step": 3284, "time_per_iteration": 2.359327554702759 }, { "auxiliary_loss_clip": 0.01081798, "auxiliary_loss_mlp": 0.01061094, "balance_loss_clip": 1.02356732, "balance_loss_mlp": 1.02321589, "epoch": 0.19750488501427926, "flos": 22891850657280.0, "grad_norm": 2.125667042280686, "language_loss": 0.87403202, "learning_rate": 3.7128205470065445e-06, "loss": 0.89546096, "num_input_tokens_seen": 70863805, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5859375, "step": 3285, "time_per_iteration": 2.3930392265319824 }, { "auxiliary_loss_clip": 0.01079454, "auxiliary_loss_mlp": 0.0106265, "balance_loss_clip": 1.02543306, "balance_loss_mlp": 1.02448475, "epoch": 0.19756500826694723, "flos": 21870431654400.0, "grad_norm": 1.9059839958766212, "language_loss": 0.90037012, "learning_rate": 3.712619437068174e-06, "loss": 0.9217912, "num_input_tokens_seen": 70882660, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.55078125, "step": 3286, "time_per_iteration": 2.423926591873169 }, { "auxiliary_loss_clip": 0.01084052, "auxiliary_loss_mlp": 0.01063933, "balance_loss_clip": 1.02416515, "balance_loss_mlp": 1.02584553, "epoch": 0.19762513151961522, "flos": 15158065104000.0, "grad_norm": 2.2421801372417947, "language_loss": 0.79497677, "learning_rate": 3.712418262187102e-06, "loss": 0.81645662, "num_input_tokens_seen": 70898765, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.58203125, "step": 3287, "time_per_iteration": 2.4905669689178467 }, { "auxiliary_loss_clip": 0.01082792, "auxiliary_loss_mlp": 0.01054977, "balance_loss_clip": 1.01821291, "balance_loss_mlp": 1.02466774, "epoch": 0.1976852547722832, "flos": 16978331992320.0, "grad_norm": 1.930721817429809, "language_loss": 0.82889926, "learning_rate": 3.7122170223709584e-06, "loss": 0.85027695, "num_input_tokens_seen": 70916370, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.578125, "step": 3288, "time_per_iteration": 2.3741817474365234 }, { "auxiliary_loss_clip": 0.0107636, "auxiliary_loss_mlp": 0.01054543, "balance_loss_clip": 1.0194, "balance_loss_mlp": 1.02198744, "epoch": 0.19774537802495115, "flos": 20301888735360.0, "grad_norm": 1.8539539368528961, "language_loss": 0.73779112, "learning_rate": 3.712015717627374e-06, "loss": 0.75910014, "num_input_tokens_seen": 70934870, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.54296875, "step": 3289, "time_per_iteration": 2.3878018856048584 }, { "auxiliary_loss_clip": 0.01080562, "auxiliary_loss_mlp": 0.01053991, "balance_loss_clip": 1.01720273, "balance_loss_mlp": 1.02362847, "epoch": 0.19780550127761912, "flos": 27234242962560.0, "grad_norm": 1.642155026436488, "language_loss": 0.81419319, "learning_rate": 3.7118143479639813e-06, "loss": 0.83553874, "num_input_tokens_seen": 70955140, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.5703125, "step": 3290, "time_per_iteration": 2.427053213119507 }, { "auxiliary_loss_clip": 0.01019524, "auxiliary_loss_mlp": 0.01004168, "balance_loss_clip": 0.99937612, "balance_loss_mlp": 1.00387287, "epoch": 0.19786562453028708, "flos": 63547368629760.0, "grad_norm": 0.8971938191030218, "language_loss": 0.60443646, "learning_rate": 3.711612913388418e-06, "loss": 0.62467337, "num_input_tokens_seen": 71012005, "router_z_loss_clip": 0.04785156, "router_z_loss_mlp": 0.15625, "step": 3291, "time_per_iteration": 3.0537054538726807 }, { "auxiliary_loss_clip": 0.01082555, "auxiliary_loss_mlp": 0.01060364, "balance_loss_clip": 1.01699543, "balance_loss_mlp": 1.02243686, "epoch": 0.19792574778295505, "flos": 26285443320960.0, "grad_norm": 1.7390938043985176, "language_loss": 0.83536983, "learning_rate": 3.7114114139083204e-06, "loss": 0.85679901, "num_input_tokens_seen": 71031140, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.6015625, "step": 3292, "time_per_iteration": 2.419462203979492 }, { "auxiliary_loss_clip": 0.01074734, "auxiliary_loss_mlp": 0.01058425, "balance_loss_clip": 1.02282953, "balance_loss_mlp": 1.0214808, "epoch": 0.19798587103562304, "flos": 19937081272320.0, "grad_norm": 1.6774547119094516, "language_loss": 0.84225357, "learning_rate": 3.7112098495313313e-06, "loss": 0.86358517, "num_input_tokens_seen": 71050250, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.53125, "step": 3293, "time_per_iteration": 2.396024703979492 }, { "auxiliary_loss_clip": 0.01087615, "auxiliary_loss_mlp": 0.0107031, "balance_loss_clip": 1.02539182, "balance_loss_mlp": 1.02606118, "epoch": 0.198045994288291, "flos": 20119258080000.0, "grad_norm": 1.855918326626777, "language_loss": 0.63563347, "learning_rate": 3.711008220265093e-06, "loss": 0.65721273, "num_input_tokens_seen": 71068665, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.6171875, "step": 3294, "time_per_iteration": 2.376526117324829 }, { "auxiliary_loss_clip": 0.01078704, "auxiliary_loss_mlp": 0.01057871, "balance_loss_clip": 1.0203675, "balance_loss_mlp": 1.02210283, "epoch": 0.19810611754095897, "flos": 17966688071040.0, "grad_norm": 1.7837364789569605, "language_loss": 0.89377952, "learning_rate": 3.710806526117251e-06, "loss": 0.91514528, "num_input_tokens_seen": 71085320, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5625, "step": 3295, "time_per_iteration": 2.366445779800415 }, { "auxiliary_loss_clip": 0.01080684, "auxiliary_loss_mlp": 0.01055031, "balance_loss_clip": 1.02026963, "balance_loss_mlp": 1.0234921, "epoch": 0.19816624079362694, "flos": 15084119111040.0, "grad_norm": 2.268540756841552, "language_loss": 0.83145887, "learning_rate": 3.7106047670954544e-06, "loss": 0.85281605, "num_input_tokens_seen": 71102020, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.5703125, "step": 3296, "time_per_iteration": 2.363415479660034 }, { "auxiliary_loss_clip": 0.01085123, "auxiliary_loss_mlp": 0.01066935, "balance_loss_clip": 1.01970458, "balance_loss_mlp": 1.024611, "epoch": 0.1982263640462949, "flos": 24899147032320.0, "grad_norm": 2.021485831276522, "language_loss": 0.69310403, "learning_rate": 3.710402943207354e-06, "loss": 0.71462464, "num_input_tokens_seen": 71123390, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 0.60546875, "step": 3297, "time_per_iteration": 2.45711612701416 }, { "auxiliary_loss_clip": 0.01075817, "auxiliary_loss_mlp": 0.01051969, "balance_loss_clip": 1.01599121, "balance_loss_mlp": 1.02189827, "epoch": 0.19828648729896287, "flos": 20375136501120.0, "grad_norm": 1.608012730119746, "language_loss": 0.82971895, "learning_rate": 3.7102010544606016e-06, "loss": 0.85099679, "num_input_tokens_seen": 71141800, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.5390625, "step": 3298, "time_per_iteration": 3.8261466026306152 }, { "auxiliary_loss_clip": 0.01083279, "auxiliary_loss_mlp": 0.01061285, "balance_loss_clip": 1.01741576, "balance_loss_mlp": 1.02311277, "epoch": 0.19834661055163083, "flos": 18879038386560.0, "grad_norm": 1.9040768396078307, "language_loss": 0.86584336, "learning_rate": 3.7099991008628544e-06, "loss": 0.88728899, "num_input_tokens_seen": 71159505, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 0.6015625, "step": 3299, "time_per_iteration": 2.3663084506988525 }, { "auxiliary_loss_clip": 0.0102024, "auxiliary_loss_mlp": 0.01009211, "balance_loss_clip": 1.00434709, "balance_loss_mlp": 1.0033977, "epoch": 0.19840673380429882, "flos": 60255897292800.0, "grad_norm": 0.7794681193137385, "language_loss": 0.53296947, "learning_rate": 3.7097970824217706e-06, "loss": 0.55326396, "num_input_tokens_seen": 71223265, "router_z_loss_clip": 0.04858398, "router_z_loss_mlp": 0.16796875, "step": 3300, "time_per_iteration": 4.40226674079895 }, { "auxiliary_loss_clip": 0.01078798, "auxiliary_loss_mlp": 0.01065848, "balance_loss_clip": 1.0246973, "balance_loss_mlp": 1.02202022, "epoch": 0.1984668570569668, "flos": 19900981059840.0, "grad_norm": 1.5823739569328903, "language_loss": 0.75206113, "learning_rate": 3.7095949991450093e-06, "loss": 0.77350754, "num_input_tokens_seen": 71242385, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.56640625, "step": 3301, "time_per_iteration": 2.3982300758361816 }, { "auxiliary_loss_clip": 0.01079684, "auxiliary_loss_mlp": 0.01053771, "balance_loss_clip": 1.01505184, "balance_loss_mlp": 1.0228337, "epoch": 0.19852698030963475, "flos": 15629916395520.0, "grad_norm": 2.2851310359430745, "language_loss": 0.89997405, "learning_rate": 3.709392851040235e-06, "loss": 0.92130864, "num_input_tokens_seen": 71258990, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.5703125, "step": 3302, "time_per_iteration": 3.843670606613159 }, { "auxiliary_loss_clip": 0.01082347, "auxiliary_loss_mlp": 0.01061965, "balance_loss_clip": 1.02191067, "balance_loss_mlp": 1.02377748, "epoch": 0.19858710356230272, "flos": 43141335575040.0, "grad_norm": 1.8194401724790086, "language_loss": 0.75640321, "learning_rate": 3.709190638115111e-06, "loss": 0.77784634, "num_input_tokens_seen": 71282770, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 0.5859375, "step": 3303, "time_per_iteration": 3.9413540363311768 }, { "auxiliary_loss_clip": 0.01081449, "auxiliary_loss_mlp": 0.01067234, "balance_loss_clip": 1.02539134, "balance_loss_mlp": 1.02398872, "epoch": 0.19864722681497068, "flos": 35142873937920.0, "grad_norm": 2.07927257164105, "language_loss": 0.76674622, "learning_rate": 3.7089883603773084e-06, "loss": 0.7882331, "num_input_tokens_seen": 71301410, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.57421875, "step": 3304, "time_per_iteration": 2.492401599884033 }, { "auxiliary_loss_clip": 0.01079154, "auxiliary_loss_mlp": 0.01052422, "balance_loss_clip": 1.01849544, "balance_loss_mlp": 1.02315044, "epoch": 0.19870735006763865, "flos": 19425219696000.0, "grad_norm": 1.6167147576218766, "language_loss": 0.8774724, "learning_rate": 3.7087860178344955e-06, "loss": 0.89878821, "num_input_tokens_seen": 71319670, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.55859375, "step": 3305, "time_per_iteration": 2.385627508163452 }, { "auxiliary_loss_clip": 0.01081246, "auxiliary_loss_mlp": 0.01060764, "balance_loss_clip": 1.0237143, "balance_loss_mlp": 1.02230334, "epoch": 0.19876747332030664, "flos": 23546332604160.0, "grad_norm": 1.5222421350462447, "language_loss": 0.70133382, "learning_rate": 3.7085836104943445e-06, "loss": 0.72275388, "num_input_tokens_seen": 71339850, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.58984375, "step": 3306, "time_per_iteration": 2.4195263385772705 }, { "auxiliary_loss_clip": 0.01078908, "auxiliary_loss_mlp": 0.01059314, "balance_loss_clip": 1.01971316, "balance_loss_mlp": 1.02187896, "epoch": 0.1988275965729746, "flos": 19828361698560.0, "grad_norm": 1.8611716202788697, "language_loss": 0.77642268, "learning_rate": 3.7083811383645332e-06, "loss": 0.79780495, "num_input_tokens_seen": 71359795, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.5703125, "step": 3307, "time_per_iteration": 2.4185261726379395 }, { "auxiliary_loss_clip": 0.01079714, "auxiliary_loss_mlp": 0.01055969, "balance_loss_clip": 1.02151775, "balance_loss_mlp": 1.02368641, "epoch": 0.19888771982564257, "flos": 23512501630080.0, "grad_norm": 1.7292401300168632, "language_loss": 0.77697599, "learning_rate": 3.708178601452737e-06, "loss": 0.79833281, "num_input_tokens_seen": 71378885, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.5625, "step": 3308, "time_per_iteration": 2.4124317169189453 }, { "auxiliary_loss_clip": 0.0108003, "auxiliary_loss_mlp": 0.01052888, "balance_loss_clip": 1.01433635, "balance_loss_mlp": 1.02214146, "epoch": 0.19894784307831054, "flos": 18149528194560.0, "grad_norm": 1.9707655771267316, "language_loss": 0.77673101, "learning_rate": 3.7079759997666374e-06, "loss": 0.79806024, "num_input_tokens_seen": 71397285, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.578125, "step": 3309, "time_per_iteration": 2.377485513687134 }, { "auxiliary_loss_clip": 0.01079918, "auxiliary_loss_mlp": 0.01065487, "balance_loss_clip": 1.02791262, "balance_loss_mlp": 1.02305245, "epoch": 0.1990079663309785, "flos": 24275004923520.0, "grad_norm": 1.5235879387182822, "language_loss": 0.89239913, "learning_rate": 3.707773333313917e-06, "loss": 0.91385317, "num_input_tokens_seen": 71415775, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5703125, "step": 3310, "time_per_iteration": 2.4250919818878174 }, { "auxiliary_loss_clip": 0.01077007, "auxiliary_loss_mlp": 0.01055742, "balance_loss_clip": 1.01904905, "balance_loss_mlp": 1.0223546, "epoch": 0.19906808958364647, "flos": 34896212115840.0, "grad_norm": 1.9906853138709864, "language_loss": 0.65763807, "learning_rate": 3.70757060210226e-06, "loss": 0.67896557, "num_input_tokens_seen": 71437315, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.546875, "step": 3311, "time_per_iteration": 2.505826711654663 }, { "auxiliary_loss_clip": 0.01082231, "auxiliary_loss_mlp": 0.01060287, "balance_loss_clip": 1.0223074, "balance_loss_mlp": 1.02294958, "epoch": 0.19912821283631443, "flos": 24023734801920.0, "grad_norm": 2.2549863836852344, "language_loss": 0.76776177, "learning_rate": 3.707367806139355e-06, "loss": 0.78918695, "num_input_tokens_seen": 71456320, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.59375, "step": 3312, "time_per_iteration": 2.396540641784668 }, { "auxiliary_loss_clip": 0.01080368, "auxiliary_loss_mlp": 0.01064157, "balance_loss_clip": 1.0273211, "balance_loss_mlp": 1.02369428, "epoch": 0.19918833608898243, "flos": 19858177866240.0, "grad_norm": 1.884056568472047, "language_loss": 0.85625732, "learning_rate": 3.7071649454328915e-06, "loss": 0.87770247, "num_input_tokens_seen": 71475360, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.56640625, "step": 3313, "time_per_iteration": 2.3903145790100098 }, { "auxiliary_loss_clip": 0.01081106, "auxiliary_loss_mlp": 0.01061317, "balance_loss_clip": 1.02481461, "balance_loss_mlp": 1.02377248, "epoch": 0.1992484593416504, "flos": 29094520135680.0, "grad_norm": 1.9680400677524739, "language_loss": 0.82680357, "learning_rate": 3.7069620199905625e-06, "loss": 0.84822786, "num_input_tokens_seen": 71496155, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.57421875, "step": 3314, "time_per_iteration": 2.4379985332489014 }, { "auxiliary_loss_clip": 0.01076469, "auxiliary_loss_mlp": 0.01050668, "balance_loss_clip": 1.01714659, "balance_loss_mlp": 1.02131641, "epoch": 0.19930858259431836, "flos": 23293875496320.0, "grad_norm": 1.4736603471429648, "language_loss": 0.88725781, "learning_rate": 3.7067590298200627e-06, "loss": 0.90852916, "num_input_tokens_seen": 71517295, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.55078125, "step": 3315, "time_per_iteration": 2.4478707313537598 }, { "auxiliary_loss_clip": 0.01081077, "auxiliary_loss_mlp": 0.01058332, "balance_loss_clip": 1.02094746, "balance_loss_mlp": 1.02407432, "epoch": 0.19936870584698632, "flos": 25377526748160.0, "grad_norm": 1.4023462681535126, "language_loss": 0.72763228, "learning_rate": 3.7065559749290892e-06, "loss": 0.74902642, "num_input_tokens_seen": 71540000, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5703125, "step": 3316, "time_per_iteration": 2.4666762351989746 }, { "auxiliary_loss_clip": 0.01037722, "auxiliary_loss_mlp": 0.01062013, "balance_loss_clip": 1.05590916, "balance_loss_mlp": 1.0161922, "epoch": 0.1994288290996543, "flos": 62164388920320.0, "grad_norm": 0.8580797455409654, "language_loss": 0.6646325, "learning_rate": 3.706352855325342e-06, "loss": 0.68562984, "num_input_tokens_seen": 71607880, "router_z_loss_clip": 0.06103516, "router_z_loss_mlp": 0.21484375, "step": 3317, "time_per_iteration": 3.1550674438476562 }, { "auxiliary_loss_clip": 0.0108279, "auxiliary_loss_mlp": 0.01067851, "balance_loss_clip": 1.0294416, "balance_loss_mlp": 1.02317476, "epoch": 0.19948895235232225, "flos": 19024835690880.0, "grad_norm": 2.0517361990808713, "language_loss": 0.75580907, "learning_rate": 3.7061496710165233e-06, "loss": 0.7773155, "num_input_tokens_seen": 71625695, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.59375, "step": 3318, "time_per_iteration": 2.355504274368286 }, { "auxiliary_loss_clip": 0.01079819, "auxiliary_loss_mlp": 0.01058064, "balance_loss_clip": 1.02325535, "balance_loss_mlp": 1.02436829, "epoch": 0.19954907560499022, "flos": 37814287795200.0, "grad_norm": 2.0248954144951203, "language_loss": 0.81221652, "learning_rate": 3.7059464220103385e-06, "loss": 0.8335954, "num_input_tokens_seen": 71648520, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.5546875, "step": 3319, "time_per_iteration": 2.527707815170288 }, { "auxiliary_loss_clip": 0.01080979, "auxiliary_loss_mlp": 0.01070214, "balance_loss_clip": 1.03116071, "balance_loss_mlp": 1.02323377, "epoch": 0.1996091988576582, "flos": 49563329414400.0, "grad_norm": 2.1429904359690575, "language_loss": 0.78197861, "learning_rate": 3.7057431083144945e-06, "loss": 0.80349052, "num_input_tokens_seen": 71672185, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.578125, "step": 3320, "time_per_iteration": 2.6357204914093018 }, { "auxiliary_loss_clip": 0.01080131, "auxiliary_loss_mlp": 0.01079927, "balance_loss_clip": 1.04371142, "balance_loss_mlp": 1.0231266, "epoch": 0.19966932211032618, "flos": 22634750338560.0, "grad_norm": 1.5569904265538717, "language_loss": 0.81730306, "learning_rate": 3.705539729936701e-06, "loss": 0.83890355, "num_input_tokens_seen": 71692890, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.5703125, "step": 3321, "time_per_iteration": 2.4090964794158936 }, { "auxiliary_loss_clip": 0.01046601, "auxiliary_loss_mlp": 0.01015904, "balance_loss_clip": 1.00908506, "balance_loss_mlp": 1.02618766, "epoch": 0.19972944536299414, "flos": 54079308466560.0, "grad_norm": 0.9221185868853592, "language_loss": 0.65278035, "learning_rate": 3.7053362868846696e-06, "loss": 0.67340541, "num_input_tokens_seen": 71745815, "router_z_loss_clip": 0.06835938, "router_z_loss_mlp": 0.20410156, "step": 3322, "time_per_iteration": 2.925981283187866 }, { "auxiliary_loss_clip": 0.01037951, "auxiliary_loss_mlp": 0.01009645, "balance_loss_clip": 1.00356567, "balance_loss_mlp": 1.01785469, "epoch": 0.1997895686156621, "flos": 69352204498560.0, "grad_norm": 0.7842056666234318, "language_loss": 0.57141411, "learning_rate": 3.7051327791661153e-06, "loss": 0.59189004, "num_input_tokens_seen": 71806915, "router_z_loss_clip": 0.06079102, "router_z_loss_mlp": 0.20117188, "step": 3323, "time_per_iteration": 3.2517261505126953 }, { "auxiliary_loss_clip": 0.01078904, "auxiliary_loss_mlp": 0.01085547, "balance_loss_clip": 1.04778123, "balance_loss_mlp": 1.02293754, "epoch": 0.19984969186833007, "flos": 18551064274560.0, "grad_norm": 1.778717345735008, "language_loss": 0.83088863, "learning_rate": 3.7049292067887555e-06, "loss": 0.8525331, "num_input_tokens_seen": 71824645, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.5625, "step": 3324, "time_per_iteration": 2.4766030311584473 }, { "auxiliary_loss_clip": 0.01078478, "auxiliary_loss_mlp": 0.01074439, "balance_loss_clip": 1.03784132, "balance_loss_mlp": 1.02332258, "epoch": 0.19990981512099804, "flos": 26428552450560.0, "grad_norm": 3.2817141480414027, "language_loss": 0.5563674, "learning_rate": 3.7047255697603092e-06, "loss": 0.5778966, "num_input_tokens_seen": 71845125, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.5546875, "step": 3325, "time_per_iteration": 2.5553600788116455 }, { "auxiliary_loss_clip": 0.01084037, "auxiliary_loss_mlp": 0.01084045, "balance_loss_clip": 1.04937935, "balance_loss_mlp": 1.02668869, "epoch": 0.19996993837366603, "flos": 16325071943040.0, "grad_norm": 1.798446405901701, "language_loss": 0.86941135, "learning_rate": 3.7045218680884984e-06, "loss": 0.89109224, "num_input_tokens_seen": 71863500, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.57421875, "step": 3326, "time_per_iteration": 2.4714813232421875 }, { "auxiliary_loss_clip": 0.01083148, "auxiliary_loss_mlp": 0.01075561, "balance_loss_clip": 1.04361272, "balance_loss_mlp": 1.0280726, "epoch": 0.200030061626334, "flos": 20843287188480.0, "grad_norm": 1.8319564819365488, "language_loss": 0.73640645, "learning_rate": 3.7043181017810476e-06, "loss": 0.75799352, "num_input_tokens_seen": 71881845, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.55078125, "step": 3327, "time_per_iteration": 2.4870636463165283 }, { "auxiliary_loss_clip": 0.01090861, "auxiliary_loss_mlp": 0.01068936, "balance_loss_clip": 1.02978802, "balance_loss_mlp": 1.03210163, "epoch": 0.20009018487900196, "flos": 23761677070080.0, "grad_norm": 1.627857748091465, "language_loss": 0.77914745, "learning_rate": 3.7041142708456833e-06, "loss": 0.80074537, "num_input_tokens_seen": 71900940, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.5859375, "step": 3328, "time_per_iteration": 2.502701759338379 }, { "auxiliary_loss_clip": 0.01086885, "auxiliary_loss_mlp": 0.01051384, "balance_loss_clip": 1.0206995, "balance_loss_mlp": 1.03333902, "epoch": 0.20015030813166992, "flos": 28110283597440.0, "grad_norm": 1.8589492427325165, "language_loss": 0.70258236, "learning_rate": 3.7039103752901353e-06, "loss": 0.72396505, "num_input_tokens_seen": 71921925, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.5390625, "step": 3329, "time_per_iteration": 2.566476583480835 }, { "auxiliary_loss_clip": 0.01098136, "auxiliary_loss_mlp": 0.01056349, "balance_loss_clip": 1.01810706, "balance_loss_mlp": 1.0375998, "epoch": 0.2002104313843379, "flos": 26065979314560.0, "grad_norm": 2.336731956622303, "language_loss": 0.82191348, "learning_rate": 3.7037064151221353e-06, "loss": 0.84345829, "num_input_tokens_seen": 71941855, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.60546875, "step": 3330, "time_per_iteration": 2.50862979888916 }, { "auxiliary_loss_clip": 0.01097339, "auxiliary_loss_mlp": 0.01056281, "balance_loss_clip": 1.0187062, "balance_loss_mlp": 1.03907907, "epoch": 0.20027055463700585, "flos": 22965517359360.0, "grad_norm": 2.439616203289545, "language_loss": 0.79377413, "learning_rate": 3.703502390349417e-06, "loss": 0.81531036, "num_input_tokens_seen": 71960915, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.58203125, "step": 3331, "time_per_iteration": 2.469548225402832 }, { "auxiliary_loss_clip": 0.01100469, "auxiliary_loss_mlp": 0.01062753, "balance_loss_clip": 1.02181649, "balance_loss_mlp": 1.0402782, "epoch": 0.20033067788967382, "flos": 17164698163200.0, "grad_norm": 1.9423214555842956, "language_loss": 0.80973554, "learning_rate": 3.7032983009797176e-06, "loss": 0.83136773, "num_input_tokens_seen": 71979220, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.6015625, "step": 3332, "time_per_iteration": 2.460225820541382 }, { "auxiliary_loss_clip": 0.01060067, "auxiliary_loss_mlp": 0.01052258, "balance_loss_clip": 1.04415143, "balance_loss_mlp": 1.03618073, "epoch": 0.2003908011423418, "flos": 60822747993600.0, "grad_norm": 0.9675365705878571, "language_loss": 0.62084913, "learning_rate": 3.703094147020776e-06, "loss": 0.64197242, "num_input_tokens_seen": 72033950, "router_z_loss_clip": 0.08105469, "router_z_loss_mlp": 0.23828125, "step": 3333, "time_per_iteration": 2.984750747680664 }, { "auxiliary_loss_clip": 0.0109666, "auxiliary_loss_mlp": 0.01069124, "balance_loss_clip": 1.03221703, "balance_loss_mlp": 1.03770781, "epoch": 0.20045092439500978, "flos": 24205108648320.0, "grad_norm": 2.1048184003341435, "language_loss": 0.83132589, "learning_rate": 3.7028899284803334e-06, "loss": 0.85298377, "num_input_tokens_seen": 72051395, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.58984375, "step": 3334, "time_per_iteration": 2.488971710205078 }, { "auxiliary_loss_clip": 0.01099543, "auxiliary_loss_mlp": 0.01072902, "balance_loss_clip": 1.03289545, "balance_loss_mlp": 1.03825688, "epoch": 0.20051104764767774, "flos": 29386324212480.0, "grad_norm": 2.219718965331155, "language_loss": 0.77080059, "learning_rate": 3.702685645366134e-06, "loss": 0.79252505, "num_input_tokens_seen": 72071305, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.61328125, "step": 3335, "time_per_iteration": 2.522547483444214 }, { "auxiliary_loss_clip": 0.01096788, "auxiliary_loss_mlp": 0.01079748, "balance_loss_clip": 1.04236364, "balance_loss_mlp": 1.03625655, "epoch": 0.2005711709003457, "flos": 23512676186880.0, "grad_norm": 1.95160214847437, "language_loss": 0.81367344, "learning_rate": 3.7024812976859243e-06, "loss": 0.83543873, "num_input_tokens_seen": 72090165, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.60546875, "step": 3336, "time_per_iteration": 2.451488733291626 }, { "auxiliary_loss_clip": 0.01094572, "auxiliary_loss_mlp": 0.01069689, "balance_loss_clip": 1.02741766, "balance_loss_mlp": 1.03294218, "epoch": 0.20063129415301367, "flos": 22522434894720.0, "grad_norm": 1.9683041105257713, "language_loss": 0.80090547, "learning_rate": 3.7022768854474532e-06, "loss": 0.82254803, "num_input_tokens_seen": 72107210, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.6171875, "step": 3337, "time_per_iteration": 2.4526665210723877 }, { "auxiliary_loss_clip": 0.01095834, "auxiliary_loss_mlp": 0.01080263, "balance_loss_clip": 1.03932619, "balance_loss_mlp": 1.03525698, "epoch": 0.20069141740568164, "flos": 25957050272640.0, "grad_norm": 1.9015099033290792, "language_loss": 0.70984638, "learning_rate": 3.7020724086584724e-06, "loss": 0.73160732, "num_input_tokens_seen": 72126315, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.60546875, "step": 3338, "time_per_iteration": 3.9459729194641113 }, { "auxiliary_loss_clip": 0.01091621, "auxiliary_loss_mlp": 0.01079449, "balance_loss_clip": 1.04301882, "balance_loss_mlp": 1.03196621, "epoch": 0.2007515406583496, "flos": 24789449940480.0, "grad_norm": 1.9913561375805733, "language_loss": 0.7141633, "learning_rate": 3.701867867326735e-06, "loss": 0.73587406, "num_input_tokens_seen": 72146470, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.59765625, "step": 3339, "time_per_iteration": 3.9836084842681885 }, { "auxiliary_loss_clip": 0.01092605, "auxiliary_loss_mlp": 0.01072698, "balance_loss_clip": 1.03443193, "balance_loss_mlp": 1.03063977, "epoch": 0.2008116639110176, "flos": 37924054709760.0, "grad_norm": 3.132017948914609, "language_loss": 0.68006003, "learning_rate": 3.7016632614599974e-06, "loss": 0.70171309, "num_input_tokens_seen": 72166600, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.62109375, "step": 3340, "time_per_iteration": 2.6603782176971436 }, { "auxiliary_loss_clip": 0.01087822, "auxiliary_loss_mlp": 0.01062716, "balance_loss_clip": 1.02509344, "balance_loss_mlp": 1.02844286, "epoch": 0.20087178716368556, "flos": 20739490116480.0, "grad_norm": 2.8267823543542225, "language_loss": 0.76469362, "learning_rate": 3.701458591066019e-06, "loss": 0.78619909, "num_input_tokens_seen": 72185160, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.59375, "step": 3341, "time_per_iteration": 3.8599376678466797 }, { "auxiliary_loss_clip": 0.01082547, "auxiliary_loss_mlp": 0.01054885, "balance_loss_clip": 1.02126777, "balance_loss_mlp": 1.02696514, "epoch": 0.20093191041635353, "flos": 23841139057920.0, "grad_norm": 2.2938589596073067, "language_loss": 0.73514783, "learning_rate": 3.70125385615256e-06, "loss": 0.75652212, "num_input_tokens_seen": 72205160, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.5546875, "step": 3342, "time_per_iteration": 3.9261491298675537 }, { "auxiliary_loss_clip": 0.01086174, "auxiliary_loss_mlp": 0.01071415, "balance_loss_clip": 1.0307405, "balance_loss_mlp": 1.02723813, "epoch": 0.2009920336690215, "flos": 21791179134720.0, "grad_norm": 2.2110705331751275, "language_loss": 0.7474966, "learning_rate": 3.701049056727384e-06, "loss": 0.76907253, "num_input_tokens_seen": 72223555, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.58984375, "step": 3343, "time_per_iteration": 2.4341864585876465 }, { "auxiliary_loss_clip": 0.01086399, "auxiliary_loss_mlp": 0.01066653, "balance_loss_clip": 1.02464414, "balance_loss_mlp": 1.0278883, "epoch": 0.20105215692168946, "flos": 26358027770880.0, "grad_norm": 1.871258141894508, "language_loss": 0.82603049, "learning_rate": 3.7008441927982574e-06, "loss": 0.84756106, "num_input_tokens_seen": 72242465, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 0.5859375, "step": 3344, "time_per_iteration": 2.6114580631256104 }, { "auxiliary_loss_clip": 0.01086493, "auxiliary_loss_mlp": 0.01062059, "balance_loss_clip": 1.02634394, "balance_loss_mlp": 1.02756429, "epoch": 0.20111228017435742, "flos": 18806279379840.0, "grad_norm": 2.7282885901765987, "language_loss": 0.85468316, "learning_rate": 3.700639264372948e-06, "loss": 0.87616867, "num_input_tokens_seen": 72260655, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.5859375, "step": 3345, "time_per_iteration": 2.4305505752563477 }, { "auxiliary_loss_clip": 0.01081648, "auxiliary_loss_mlp": 0.01052595, "balance_loss_clip": 1.01885843, "balance_loss_mlp": 1.02870226, "epoch": 0.20117240342702541, "flos": 19974019357440.0, "grad_norm": 1.806068562068603, "language_loss": 0.70239371, "learning_rate": 3.7004342714592283e-06, "loss": 0.72373611, "num_input_tokens_seen": 72279055, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.53125, "step": 3346, "time_per_iteration": 2.5411899089813232 }, { "auxiliary_loss_clip": 0.01086924, "auxiliary_loss_mlp": 0.01061974, "balance_loss_clip": 1.02792764, "balance_loss_mlp": 1.02876008, "epoch": 0.20123252667969338, "flos": 23141759235840.0, "grad_norm": 2.2839584858810187, "language_loss": 0.75294083, "learning_rate": 3.70022921406487e-06, "loss": 0.7744298, "num_input_tokens_seen": 72297895, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.58203125, "step": 3347, "time_per_iteration": 2.6223604679107666 }, { "auxiliary_loss_clip": 0.01085592, "auxiliary_loss_mlp": 0.01056815, "balance_loss_clip": 1.02288795, "balance_loss_mlp": 1.02920866, "epoch": 0.20129264993236134, "flos": 23220557907840.0, "grad_norm": 1.807354776460203, "language_loss": 0.87675911, "learning_rate": 3.70002409219765e-06, "loss": 0.89818317, "num_input_tokens_seen": 72318385, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.56640625, "step": 3348, "time_per_iteration": 2.559908390045166 }, { "auxiliary_loss_clip": 0.01081382, "auxiliary_loss_mlp": 0.01048923, "balance_loss_clip": 1.01223004, "balance_loss_mlp": 1.02684021, "epoch": 0.2013527731850293, "flos": 21870396743040.0, "grad_norm": 2.193181988173124, "language_loss": 0.72569764, "learning_rate": 3.699818905865346e-06, "loss": 0.74700069, "num_input_tokens_seen": 72338235, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.546875, "step": 3349, "time_per_iteration": 2.4944732189178467 }, { "auxiliary_loss_clip": 0.01087574, "auxiliary_loss_mlp": 0.01062808, "balance_loss_clip": 1.02373171, "balance_loss_mlp": 1.02862132, "epoch": 0.20141289643769728, "flos": 18039831102720.0, "grad_norm": 1.5460533535430128, "language_loss": 0.73305702, "learning_rate": 3.6996136550757377e-06, "loss": 0.75456083, "num_input_tokens_seen": 72357825, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.58984375, "step": 3350, "time_per_iteration": 2.486546754837036 }, { "auxiliary_loss_clip": 0.01085695, "auxiliary_loss_mlp": 0.0105594, "balance_loss_clip": 1.01722097, "balance_loss_mlp": 1.02670956, "epoch": 0.20147301969036524, "flos": 23950277568000.0, "grad_norm": 2.4536909489015697, "language_loss": 0.78826702, "learning_rate": 3.69940833983661e-06, "loss": 0.80968332, "num_input_tokens_seen": 72376335, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.58984375, "step": 3351, "time_per_iteration": 2.4541614055633545 }, { "auxiliary_loss_clip": 0.01085353, "auxiliary_loss_mlp": 0.01061253, "balance_loss_clip": 1.01955318, "balance_loss_mlp": 1.02586007, "epoch": 0.2015331429430332, "flos": 25587425041920.0, "grad_norm": 1.5496842764746228, "language_loss": 0.82215965, "learning_rate": 3.699202960155748e-06, "loss": 0.84362572, "num_input_tokens_seen": 72395440, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.59375, "step": 3352, "time_per_iteration": 2.511916160583496 }, { "auxiliary_loss_clip": 0.01082859, "auxiliary_loss_mlp": 0.01056045, "balance_loss_clip": 1.01832712, "balance_loss_mlp": 1.02495241, "epoch": 0.2015932661957012, "flos": 26723742929280.0, "grad_norm": 2.18908892610129, "language_loss": 0.81921756, "learning_rate": 3.6989975160409396e-06, "loss": 0.84060669, "num_input_tokens_seen": 72414670, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.578125, "step": 3353, "time_per_iteration": 2.467240810394287 }, { "auxiliary_loss_clip": 0.0108033, "auxiliary_loss_mlp": 0.01050417, "balance_loss_clip": 1.01801622, "balance_loss_mlp": 1.02495325, "epoch": 0.20165338944836916, "flos": 15632220545280.0, "grad_norm": 1.7512100636744692, "language_loss": 0.91404587, "learning_rate": 3.6987920074999747e-06, "loss": 0.93535334, "num_input_tokens_seen": 72432210, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.5546875, "step": 3354, "time_per_iteration": 2.455674886703491 }, { "auxiliary_loss_clip": 0.01051993, "auxiliary_loss_mlp": 0.01023476, "balance_loss_clip": 1.01470256, "balance_loss_mlp": 1.02950597, "epoch": 0.20171351270103713, "flos": 57909629727360.0, "grad_norm": 0.8480104990314864, "language_loss": 0.55999786, "learning_rate": 3.6985864345406465e-06, "loss": 0.58075255, "num_input_tokens_seen": 72489225, "router_z_loss_clip": 0.08789062, "router_z_loss_mlp": 0.22460938, "step": 3355, "time_per_iteration": 3.0621132850646973 }, { "auxiliary_loss_clip": 0.01079836, "auxiliary_loss_mlp": 0.01055364, "balance_loss_clip": 1.02132964, "balance_loss_mlp": 1.02500772, "epoch": 0.2017736359537051, "flos": 20813296464000.0, "grad_norm": 1.537189298249129, "language_loss": 0.85715771, "learning_rate": 3.698380797170751e-06, "loss": 0.87850964, "num_input_tokens_seen": 72508715, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.546875, "step": 3356, "time_per_iteration": 2.582019805908203 }, { "auxiliary_loss_clip": 0.01085362, "auxiliary_loss_mlp": 0.01060545, "balance_loss_clip": 1.01698613, "balance_loss_mlp": 1.02296638, "epoch": 0.20183375920637306, "flos": 17091101283840.0, "grad_norm": 2.473792910425018, "language_loss": 0.71650076, "learning_rate": 3.698175095398085e-06, "loss": 0.73795986, "num_input_tokens_seen": 72525135, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.625, "step": 3357, "time_per_iteration": 2.4947268962860107 }, { "auxiliary_loss_clip": 0.01083136, "auxiliary_loss_mlp": 0.01063642, "balance_loss_clip": 1.0217514, "balance_loss_mlp": 1.02314329, "epoch": 0.20189388245904102, "flos": 18660342430080.0, "grad_norm": 1.6807148264465332, "language_loss": 0.73630011, "learning_rate": 3.6979693292304493e-06, "loss": 0.75776792, "num_input_tokens_seen": 72543690, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 0.6015625, "step": 3358, "time_per_iteration": 2.5460891723632812 }, { "auxiliary_loss_clip": 0.01076505, "auxiliary_loss_mlp": 0.01048867, "balance_loss_clip": 1.01758647, "balance_loss_mlp": 1.02129185, "epoch": 0.20195400571170902, "flos": 16796678855040.0, "grad_norm": 1.6387256325639246, "language_loss": 0.8459956, "learning_rate": 3.6977634986756463e-06, "loss": 0.86724925, "num_input_tokens_seen": 72560725, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.55078125, "step": 3359, "time_per_iteration": 2.474761962890625 }, { "auxiliary_loss_clip": 0.0104536, "auxiliary_loss_mlp": 0.01022075, "balance_loss_clip": 1.01539886, "balance_loss_mlp": 1.02187252, "epoch": 0.20201412896437698, "flos": 67171703535360.0, "grad_norm": 0.7855790809797424, "language_loss": 0.5909586, "learning_rate": 3.697557603741482e-06, "loss": 0.61163294, "num_input_tokens_seen": 72621940, "router_z_loss_clip": 0.06689453, "router_z_loss_mlp": 0.234375, "step": 3360, "time_per_iteration": 3.1155338287353516 }, { "auxiliary_loss_clip": 0.01081928, "auxiliary_loss_mlp": 0.01066346, "balance_loss_clip": 1.02564836, "balance_loss_mlp": 1.02390838, "epoch": 0.20207425221704495, "flos": 21323936142720.0, "grad_norm": 2.9519421654197995, "language_loss": 0.63439697, "learning_rate": 3.697351644435763e-06, "loss": 0.65587974, "num_input_tokens_seen": 72639135, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.58203125, "step": 3361, "time_per_iteration": 2.6716678142547607 }, { "auxiliary_loss_clip": 0.01078481, "auxiliary_loss_mlp": 0.01070696, "balance_loss_clip": 1.03664935, "balance_loss_mlp": 1.02303696, "epoch": 0.2021343754697129, "flos": 22526100587520.0, "grad_norm": 1.883599790244205, "language_loss": 0.77750921, "learning_rate": 3.6971456207662993e-06, "loss": 0.79900098, "num_input_tokens_seen": 72658525, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.5546875, "step": 3362, "time_per_iteration": 2.570836305618286 }, { "auxiliary_loss_clip": 0.01078958, "auxiliary_loss_mlp": 0.01063137, "balance_loss_clip": 1.02830434, "balance_loss_mlp": 1.02364099, "epoch": 0.20219449872238088, "flos": 19061773776000.0, "grad_norm": 1.5128335746377586, "language_loss": 0.78733063, "learning_rate": 3.6969395327409035e-06, "loss": 0.80875158, "num_input_tokens_seen": 72678085, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.5546875, "step": 3363, "time_per_iteration": 2.4778952598571777 }, { "auxiliary_loss_clip": 0.01082348, "auxiliary_loss_mlp": 0.01076868, "balance_loss_clip": 1.04527712, "balance_loss_mlp": 1.02632153, "epoch": 0.20225462197504884, "flos": 24715887972480.0, "grad_norm": 1.5882351827976677, "language_loss": 0.76535451, "learning_rate": 3.696733380367391e-06, "loss": 0.78694665, "num_input_tokens_seen": 72698695, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.5625, "step": 3364, "time_per_iteration": 2.533872127532959 }, { "auxiliary_loss_clip": 0.01081601, "auxiliary_loss_mlp": 0.01073941, "balance_loss_clip": 1.03958511, "balance_loss_mlp": 1.02495337, "epoch": 0.2023147452277168, "flos": 22017206476800.0, "grad_norm": 2.0653837482382755, "language_loss": 0.74019152, "learning_rate": 3.6965271636535783e-06, "loss": 0.761747, "num_input_tokens_seen": 72717880, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.56640625, "step": 3365, "time_per_iteration": 2.513274669647217 }, { "auxiliary_loss_clip": 0.01083298, "auxiliary_loss_mlp": 0.01063548, "balance_loss_clip": 1.03093266, "balance_loss_mlp": 1.02705324, "epoch": 0.2023748684803848, "flos": 17744500978560.0, "grad_norm": 1.778994636792695, "language_loss": 0.87279904, "learning_rate": 3.696320882607286e-06, "loss": 0.89426756, "num_input_tokens_seen": 72736410, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.5625, "step": 3366, "time_per_iteration": 2.496013879776001 }, { "auxiliary_loss_clip": 0.01083525, "auxiliary_loss_mlp": 0.01066545, "balance_loss_clip": 1.03497887, "balance_loss_mlp": 1.02897644, "epoch": 0.20243499173305277, "flos": 31137602520960.0, "grad_norm": 1.636083371096517, "language_loss": 0.71423566, "learning_rate": 3.696114537236335e-06, "loss": 0.73573637, "num_input_tokens_seen": 72758295, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.546875, "step": 3367, "time_per_iteration": 2.5839383602142334 }, { "auxiliary_loss_clip": 0.01086378, "auxiliary_loss_mlp": 0.01063325, "balance_loss_clip": 1.023628, "balance_loss_mlp": 1.02739215, "epoch": 0.20249511498572073, "flos": 33837820116480.0, "grad_norm": 1.467762501803307, "language_loss": 0.69720852, "learning_rate": 3.6959081275485512e-06, "loss": 0.71870553, "num_input_tokens_seen": 72782495, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.58984375, "step": 3368, "time_per_iteration": 2.6633589267730713 }, { "auxiliary_loss_clip": 0.01083983, "auxiliary_loss_mlp": 0.01050347, "balance_loss_clip": 1.01806521, "balance_loss_mlp": 1.03054571, "epoch": 0.2025552382383887, "flos": 21214553253120.0, "grad_norm": 1.6350782677366928, "language_loss": 0.79033387, "learning_rate": 3.6957016535517615e-06, "loss": 0.81167716, "num_input_tokens_seen": 72801885, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.53515625, "step": 3369, "time_per_iteration": 2.5226757526397705 }, { "auxiliary_loss_clip": 0.0108531, "auxiliary_loss_mlp": 0.01061452, "balance_loss_clip": 1.0284071, "balance_loss_mlp": 1.02952981, "epoch": 0.20261536149105666, "flos": 14646517729920.0, "grad_norm": 2.4203761279325353, "language_loss": 0.68332911, "learning_rate": 3.695495115253795e-06, "loss": 0.70479667, "num_input_tokens_seen": 72816990, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.55859375, "step": 3370, "time_per_iteration": 2.449333429336548 }, { "auxiliary_loss_clip": 0.01042351, "auxiliary_loss_mlp": 0.01017429, "balance_loss_clip": 1.01132596, "balance_loss_mlp": 1.02023554, "epoch": 0.20267548474372463, "flos": 66780361572480.0, "grad_norm": 0.6826637093295438, "language_loss": 0.58161908, "learning_rate": 3.6952885126624834e-06, "loss": 0.60221684, "num_input_tokens_seen": 72879240, "router_z_loss_clip": 0.06103516, "router_z_loss_mlp": 0.22070312, "step": 3371, "time_per_iteration": 3.169339179992676 }, { "auxiliary_loss_clip": 0.01085223, "auxiliary_loss_mlp": 0.01054794, "balance_loss_clip": 1.020033, "balance_loss_mlp": 1.02940655, "epoch": 0.2027356079963926, "flos": 24679648114560.0, "grad_norm": 1.6696526973909118, "language_loss": 0.92558646, "learning_rate": 3.6950818457856617e-06, "loss": 0.94698668, "num_input_tokens_seen": 72899030, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.55859375, "step": 3372, "time_per_iteration": 2.482783079147339 }, { "auxiliary_loss_clip": 0.01086909, "auxiliary_loss_mlp": 0.01062907, "balance_loss_clip": 1.02504671, "balance_loss_mlp": 1.02960861, "epoch": 0.20279573124906058, "flos": 26391649276800.0, "grad_norm": 1.5921135880823727, "language_loss": 0.79852957, "learning_rate": 3.694875114631167e-06, "loss": 0.82002771, "num_input_tokens_seen": 72919190, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5703125, "step": 3373, "time_per_iteration": 2.5324020385742188 }, { "auxiliary_loss_clip": 0.01081092, "auxiliary_loss_mlp": 0.01067765, "balance_loss_clip": 1.03393364, "balance_loss_mlp": 1.02831602, "epoch": 0.20285585450172855, "flos": 33798647704320.0, "grad_norm": 1.9317379267857537, "language_loss": 0.73018241, "learning_rate": 3.6946683192068377e-06, "loss": 0.75167102, "num_input_tokens_seen": 72939720, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.52734375, "step": 3374, "time_per_iteration": 2.6139607429504395 }, { "auxiliary_loss_clip": 0.01030681, "auxiliary_loss_mlp": 0.01037284, "balance_loss_clip": 1.0304172, "balance_loss_mlp": 1.00983405, "epoch": 0.20291597775439651, "flos": 71161332796800.0, "grad_norm": 1.0195783722117788, "language_loss": 0.62570882, "learning_rate": 3.694461459520516e-06, "loss": 0.64638841, "num_input_tokens_seen": 73000015, "router_z_loss_clip": 0.06884766, "router_z_loss_mlp": 0.20898438, "step": 3375, "time_per_iteration": 3.0842885971069336 }, { "auxiliary_loss_clip": 0.01079746, "auxiliary_loss_mlp": 0.01063037, "balance_loss_clip": 1.02853751, "balance_loss_mlp": 1.02520347, "epoch": 0.20297610100706448, "flos": 19493440225920.0, "grad_norm": 1.4802459177987417, "language_loss": 0.82972914, "learning_rate": 3.6942545355800463e-06, "loss": 0.85115695, "num_input_tokens_seen": 73017675, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.54296875, "step": 3376, "time_per_iteration": 2.477184772491455 }, { "auxiliary_loss_clip": 0.01083451, "auxiliary_loss_mlp": 0.01068735, "balance_loss_clip": 1.03116035, "balance_loss_mlp": 1.02619326, "epoch": 0.20303622425973245, "flos": 25043128945920.0, "grad_norm": 2.0210227006813644, "language_loss": 0.82784545, "learning_rate": 3.6940475473932743e-06, "loss": 0.84936732, "num_input_tokens_seen": 73036135, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5703125, "step": 3377, "time_per_iteration": 3.9714958667755127 }, { "auxiliary_loss_clip": 0.01081356, "auxiliary_loss_mlp": 0.01057634, "balance_loss_clip": 1.02244341, "balance_loss_mlp": 1.02526879, "epoch": 0.2030963475124004, "flos": 21978941760000.0, "grad_norm": 1.7960219855218151, "language_loss": 0.78993821, "learning_rate": 3.69384049496805e-06, "loss": 0.81132805, "num_input_tokens_seen": 73054075, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.5625, "step": 3378, "time_per_iteration": 3.857881546020508 }, { "auxiliary_loss_clip": 0.01080033, "auxiliary_loss_mlp": 0.01057693, "balance_loss_clip": 1.02104831, "balance_loss_mlp": 1.02434254, "epoch": 0.2031564707650684, "flos": 19499375157120.0, "grad_norm": 1.8394034080071586, "language_loss": 0.81949568, "learning_rate": 3.6936333783122242e-06, "loss": 0.840873, "num_input_tokens_seen": 73073530, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.5546875, "step": 3379, "time_per_iteration": 2.443969488143921 }, { "auxiliary_loss_clip": 0.01079339, "auxiliary_loss_mlp": 0.01051389, "balance_loss_clip": 1.02051425, "balance_loss_mlp": 1.02664304, "epoch": 0.20321659401773637, "flos": 22745983530240.0, "grad_norm": 1.6701707804267938, "language_loss": 0.87904489, "learning_rate": 3.6934261974336505e-06, "loss": 0.90035212, "num_input_tokens_seen": 73092820, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.52734375, "step": 3380, "time_per_iteration": 3.8981990814208984 }, { "auxiliary_loss_clip": 0.01085424, "auxiliary_loss_mlp": 0.0105873, "balance_loss_clip": 1.02525663, "balance_loss_mlp": 1.03153336, "epoch": 0.20327671727040433, "flos": 22454738035200.0, "grad_norm": 1.9412546093708938, "language_loss": 0.76518202, "learning_rate": 3.693218952340186e-06, "loss": 0.78662354, "num_input_tokens_seen": 73113385, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.5390625, "step": 3381, "time_per_iteration": 4.111631870269775 }, { "auxiliary_loss_clip": 0.01090228, "auxiliary_loss_mlp": 0.01059008, "balance_loss_clip": 1.02164793, "balance_loss_mlp": 1.03140843, "epoch": 0.2033368405230723, "flos": 19534044003840.0, "grad_norm": 1.5254779028223935, "language_loss": 0.80480748, "learning_rate": 3.6930116430396895e-06, "loss": 0.82629985, "num_input_tokens_seen": 73131195, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.58984375, "step": 3382, "time_per_iteration": 2.483736753463745 }, { "auxiliary_loss_clip": 0.01091416, "auxiliary_loss_mlp": 0.0106371, "balance_loss_clip": 1.02484822, "balance_loss_mlp": 1.03214049, "epoch": 0.20339696377574026, "flos": 13808357786880.0, "grad_norm": 1.9243825332283717, "language_loss": 0.82234561, "learning_rate": 3.6928042695400214e-06, "loss": 0.84389687, "num_input_tokens_seen": 73148850, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.59375, "step": 3383, "time_per_iteration": 2.554927349090576 }, { "auxiliary_loss_clip": 0.01088875, "auxiliary_loss_mlp": 0.0105834, "balance_loss_clip": 1.02252972, "balance_loss_mlp": 1.03181481, "epoch": 0.20345708702840823, "flos": 20338372972800.0, "grad_norm": 2.0487493454658194, "language_loss": 0.76764667, "learning_rate": 3.6925968318490464e-06, "loss": 0.78911883, "num_input_tokens_seen": 73166775, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.5703125, "step": 3384, "time_per_iteration": 2.510700225830078 }, { "auxiliary_loss_clip": 0.01094593, "auxiliary_loss_mlp": 0.01065431, "balance_loss_clip": 1.02504253, "balance_loss_mlp": 1.03412771, "epoch": 0.2035172102810762, "flos": 20333066446080.0, "grad_norm": 2.1678412139901426, "language_loss": 0.79697943, "learning_rate": 3.6923893299746293e-06, "loss": 0.81857961, "num_input_tokens_seen": 73183215, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.60546875, "step": 3385, "time_per_iteration": 2.526949405670166 }, { "auxiliary_loss_clip": 0.01084574, "auxiliary_loss_mlp": 0.01072104, "balance_loss_clip": 1.0349102, "balance_loss_mlp": 1.02893019, "epoch": 0.2035773335337442, "flos": 23329870974720.0, "grad_norm": 1.5878476148511826, "language_loss": 0.70842159, "learning_rate": 3.692181763924639e-06, "loss": 0.72998834, "num_input_tokens_seen": 73203290, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.5546875, "step": 3386, "time_per_iteration": 2.549823045730591 }, { "auxiliary_loss_clip": 0.0108487, "auxiliary_loss_mlp": 0.01071434, "balance_loss_clip": 1.03507566, "balance_loss_mlp": 1.02828526, "epoch": 0.20363745678641215, "flos": 28329014465280.0, "grad_norm": 1.3784503027152957, "language_loss": 0.81966329, "learning_rate": 3.691974133706947e-06, "loss": 0.84122634, "num_input_tokens_seen": 73226185, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.56640625, "step": 3387, "time_per_iteration": 2.597156524658203 }, { "auxiliary_loss_clip": 0.01081003, "auxiliary_loss_mlp": 0.01056402, "balance_loss_clip": 1.02320182, "balance_loss_mlp": 1.0281738, "epoch": 0.20369758003908012, "flos": 18914684751360.0, "grad_norm": 2.441832893124233, "language_loss": 0.81840312, "learning_rate": 3.6917664393294262e-06, "loss": 0.83977717, "num_input_tokens_seen": 73243300, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.52734375, "step": 3388, "time_per_iteration": 2.5161049365997314 }, { "auxiliary_loss_clip": 0.01084263, "auxiliary_loss_mlp": 0.0105857, "balance_loss_clip": 1.02471447, "balance_loss_mlp": 1.02758086, "epoch": 0.20375770329174808, "flos": 19205546221440.0, "grad_norm": 1.8327658664859487, "language_loss": 0.7355597, "learning_rate": 3.6915586807999527e-06, "loss": 0.75698805, "num_input_tokens_seen": 73261490, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.56640625, "step": 3389, "time_per_iteration": 2.5361363887786865 }, { "auxiliary_loss_clip": 0.01080297, "auxiliary_loss_mlp": 0.01054647, "balance_loss_clip": 1.02236557, "balance_loss_mlp": 1.02617085, "epoch": 0.20381782654441605, "flos": 19389992267520.0, "grad_norm": 1.8813690128145175, "language_loss": 0.8893441, "learning_rate": 3.691350858126404e-06, "loss": 0.91069353, "num_input_tokens_seen": 73280180, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.5390625, "step": 3390, "time_per_iteration": 2.5815482139587402 }, { "auxiliary_loss_clip": 0.01080487, "auxiliary_loss_mlp": 0.01061363, "balance_loss_clip": 1.02648282, "balance_loss_mlp": 1.02442288, "epoch": 0.203877949797084, "flos": 24826527671040.0, "grad_norm": 2.590403192734352, "language_loss": 0.73297858, "learning_rate": 3.691142971316662e-06, "loss": 0.75439709, "num_input_tokens_seen": 73300680, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.5625, "step": 3391, "time_per_iteration": 2.4868462085723877 }, { "auxiliary_loss_clip": 0.01079446, "auxiliary_loss_mlp": 0.01061406, "balance_loss_clip": 1.02697802, "balance_loss_mlp": 1.0242393, "epoch": 0.20393807304975198, "flos": 18002753372160.0, "grad_norm": 2.441955708226163, "language_loss": 0.88948178, "learning_rate": 3.6909350203786086e-06, "loss": 0.91089034, "num_input_tokens_seen": 73316760, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.55078125, "step": 3392, "time_per_iteration": 2.483677864074707 }, { "auxiliary_loss_clip": 0.01082526, "auxiliary_loss_mlp": 0.01059011, "balance_loss_clip": 1.02162683, "balance_loss_mlp": 1.02419424, "epoch": 0.20399819630241997, "flos": 24205841786880.0, "grad_norm": 1.4008940769704208, "language_loss": 0.81670475, "learning_rate": 3.69072700532013e-06, "loss": 0.8381201, "num_input_tokens_seen": 73339385, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.5859375, "step": 3393, "time_per_iteration": 2.553766965866089 }, { "auxiliary_loss_clip": 0.01079435, "auxiliary_loss_mlp": 0.01055658, "balance_loss_clip": 1.0229708, "balance_loss_mlp": 1.0240674, "epoch": 0.20405831955508794, "flos": 20776079088000.0, "grad_norm": 1.6832163701048999, "language_loss": 0.87428629, "learning_rate": 3.6905189261491137e-06, "loss": 0.89563721, "num_input_tokens_seen": 73357235, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.5546875, "step": 3394, "time_per_iteration": 2.491151809692383 }, { "auxiliary_loss_clip": 0.01081471, "auxiliary_loss_mlp": 0.01054626, "balance_loss_clip": 1.01902962, "balance_loss_mlp": 1.02578402, "epoch": 0.2041184428077559, "flos": 15486004304640.0, "grad_norm": 2.297980616768581, "language_loss": 0.86004883, "learning_rate": 3.69031078287345e-06, "loss": 0.88140988, "num_input_tokens_seen": 73374435, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.5546875, "step": 3395, "time_per_iteration": 2.4670214653015137 }, { "auxiliary_loss_clip": 0.01084261, "auxiliary_loss_mlp": 0.01052498, "balance_loss_clip": 1.01575828, "balance_loss_mlp": 1.02555966, "epoch": 0.20417856606042387, "flos": 15587776517760.0, "grad_norm": 1.9271382792552922, "language_loss": 0.86298382, "learning_rate": 3.690102575501033e-06, "loss": 0.88435137, "num_input_tokens_seen": 73391025, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.5859375, "step": 3396, "time_per_iteration": 2.5540900230407715 }, { "auxiliary_loss_clip": 0.01079057, "auxiliary_loss_mlp": 0.01046979, "balance_loss_clip": 1.01362467, "balance_loss_mlp": 1.02473426, "epoch": 0.20423868931309183, "flos": 24278216768640.0, "grad_norm": 2.0306887647645886, "language_loss": 0.79616487, "learning_rate": 3.6898943040397556e-06, "loss": 0.81742525, "num_input_tokens_seen": 73409270, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.54296875, "step": 3397, "time_per_iteration": 2.503183364868164 }, { "auxiliary_loss_clip": 0.01079513, "auxiliary_loss_mlp": 0.01052085, "balance_loss_clip": 1.01882601, "balance_loss_mlp": 1.02461553, "epoch": 0.2042988125657598, "flos": 18614152834560.0, "grad_norm": 2.2973179795310585, "language_loss": 0.89579952, "learning_rate": 3.689685968497518e-06, "loss": 0.91711545, "num_input_tokens_seen": 73425225, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.546875, "step": 3398, "time_per_iteration": 2.4394702911376953 }, { "auxiliary_loss_clip": 0.01082428, "auxiliary_loss_mlp": 0.0106378, "balance_loss_clip": 1.0257287, "balance_loss_mlp": 1.02644777, "epoch": 0.2043589358184278, "flos": 17850462554880.0, "grad_norm": 1.916967727223325, "language_loss": 0.8019464, "learning_rate": 3.6894775688822186e-06, "loss": 0.82340842, "num_input_tokens_seen": 73440940, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.55859375, "step": 3399, "time_per_iteration": 2.4549760818481445 }, { "auxiliary_loss_clip": 0.01081591, "auxiliary_loss_mlp": 0.01052173, "balance_loss_clip": 1.01605308, "balance_loss_mlp": 1.02529454, "epoch": 0.20441905907109575, "flos": 21434121993600.0, "grad_norm": 2.112543226758495, "language_loss": 0.77452958, "learning_rate": 3.6892691052017603e-06, "loss": 0.7958672, "num_input_tokens_seen": 73458805, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.5625, "step": 3400, "time_per_iteration": 2.482046365737915 }, { "auxiliary_loss_clip": 0.01080747, "auxiliary_loss_mlp": 0.01043397, "balance_loss_clip": 1.01271224, "balance_loss_mlp": 1.02632558, "epoch": 0.20447918232376372, "flos": 27706513190400.0, "grad_norm": 1.6228678431976133, "language_loss": 0.80872917, "learning_rate": 3.6890605774640487e-06, "loss": 0.8299706, "num_input_tokens_seen": 73479380, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.546875, "step": 3401, "time_per_iteration": 2.5284976959228516 }, { "auxiliary_loss_clip": 0.01082363, "auxiliary_loss_mlp": 0.01053504, "balance_loss_clip": 1.01776505, "balance_loss_mlp": 1.02507091, "epoch": 0.20453930557643168, "flos": 30522746833920.0, "grad_norm": 1.5680828466368133, "language_loss": 0.71153963, "learning_rate": 3.688851985676991e-06, "loss": 0.73289835, "num_input_tokens_seen": 73505105, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.5703125, "step": 3402, "time_per_iteration": 2.660501003265381 }, { "auxiliary_loss_clip": 0.01081148, "auxiliary_loss_mlp": 0.01049397, "balance_loss_clip": 1.01718688, "balance_loss_mlp": 1.02489662, "epoch": 0.20459942882909965, "flos": 18986815353600.0, "grad_norm": 1.8242491661217968, "language_loss": 0.8280772, "learning_rate": 3.688643329848496e-06, "loss": 0.84938264, "num_input_tokens_seen": 73523700, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.5625, "step": 3403, "time_per_iteration": 2.4998619556427 }, { "auxiliary_loss_clip": 0.01080834, "auxiliary_loss_mlp": 0.01060881, "balance_loss_clip": 1.02561915, "balance_loss_mlp": 1.02483559, "epoch": 0.20465955208176762, "flos": 20338023859200.0, "grad_norm": 1.8483662171207136, "language_loss": 0.85711038, "learning_rate": 3.6884346099864772e-06, "loss": 0.87852752, "num_input_tokens_seen": 73542625, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.55859375, "step": 3404, "time_per_iteration": 2.475260019302368 }, { "auxiliary_loss_clip": 0.01079355, "auxiliary_loss_mlp": 0.01055475, "balance_loss_clip": 1.01823354, "balance_loss_mlp": 1.0216608, "epoch": 0.20471967533443558, "flos": 21250234529280.0, "grad_norm": 2.4795666752082184, "language_loss": 0.87503958, "learning_rate": 3.6882258260988487e-06, "loss": 0.89638788, "num_input_tokens_seen": 73561450, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.578125, "step": 3405, "time_per_iteration": 2.5046210289001465 }, { "auxiliary_loss_clip": 0.01077672, "auxiliary_loss_mlp": 0.01052746, "balance_loss_clip": 1.01772213, "balance_loss_mlp": 1.02222705, "epoch": 0.20477979858710357, "flos": 14500685514240.0, "grad_norm": 2.3004473795105325, "language_loss": 0.85956526, "learning_rate": 3.6880169781935276e-06, "loss": 0.88086951, "num_input_tokens_seen": 73577155, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.5546875, "step": 3406, "time_per_iteration": 2.5068047046661377 }, { "auxiliary_loss_clip": 0.01079999, "auxiliary_loss_mlp": 0.0105068, "balance_loss_clip": 1.01773071, "balance_loss_mlp": 1.02469444, "epoch": 0.20483992183977154, "flos": 11399525331840.0, "grad_norm": 1.8719592103356284, "language_loss": 0.69443166, "learning_rate": 3.6878080662784336e-06, "loss": 0.71573848, "num_input_tokens_seen": 73594900, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.5546875, "step": 3407, "time_per_iteration": 2.4473609924316406 }, { "auxiliary_loss_clip": 0.01078114, "auxiliary_loss_mlp": 0.01053787, "balance_loss_clip": 1.01811934, "balance_loss_mlp": 1.0219233, "epoch": 0.2049000450924395, "flos": 19059329980800.0, "grad_norm": 2.0956891094529464, "language_loss": 0.85687715, "learning_rate": 3.6875990903614886e-06, "loss": 0.87819618, "num_input_tokens_seen": 73613810, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.5625, "step": 3408, "time_per_iteration": 2.4824230670928955 }, { "auxiliary_loss_clip": 0.01083142, "auxiliary_loss_mlp": 0.0105932, "balance_loss_clip": 1.02291358, "balance_loss_mlp": 1.02445054, "epoch": 0.20496016834510747, "flos": 14573688900480.0, "grad_norm": 2.1162215923682965, "language_loss": 0.65972614, "learning_rate": 3.6873900504506166e-06, "loss": 0.68115079, "num_input_tokens_seen": 73631495, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.58984375, "step": 3409, "time_per_iteration": 2.4528567790985107 }, { "auxiliary_loss_clip": 0.01079508, "auxiliary_loss_mlp": 0.01057204, "balance_loss_clip": 1.02146482, "balance_loss_mlp": 1.02152753, "epoch": 0.20502029159777543, "flos": 22125576936960.0, "grad_norm": 1.5327772103257251, "language_loss": 0.81990111, "learning_rate": 3.687180946553745e-06, "loss": 0.84126818, "num_input_tokens_seen": 73652840, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.578125, "step": 3410, "time_per_iteration": 2.5217361450195312 }, { "auxiliary_loss_clip": 0.010765, "auxiliary_loss_mlp": 0.01053156, "balance_loss_clip": 1.01853764, "balance_loss_mlp": 1.02225101, "epoch": 0.2050804148504434, "flos": 25366913694720.0, "grad_norm": 2.1165403750549183, "language_loss": 0.78134012, "learning_rate": 3.686971778678803e-06, "loss": 0.80263668, "num_input_tokens_seen": 73672150, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.54296875, "step": 3411, "time_per_iteration": 2.555506944656372 }, { "auxiliary_loss_clip": 0.01077983, "auxiliary_loss_mlp": 0.01056447, "balance_loss_clip": 1.02287769, "balance_loss_mlp": 1.02289391, "epoch": 0.2051405381031114, "flos": 23619126522240.0, "grad_norm": 1.9862186391934864, "language_loss": 0.75212622, "learning_rate": 3.686762546833722e-06, "loss": 0.77347052, "num_input_tokens_seen": 73691940, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.5546875, "step": 3412, "time_per_iteration": 2.4741744995117188 }, { "auxiliary_loss_clip": 0.01080721, "auxiliary_loss_mlp": 0.01061385, "balance_loss_clip": 1.02495503, "balance_loss_mlp": 1.02154613, "epoch": 0.20520066135577936, "flos": 19564732955520.0, "grad_norm": 2.53559594632031, "language_loss": 0.80178857, "learning_rate": 3.6865532510264362e-06, "loss": 0.82320964, "num_input_tokens_seen": 73709080, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.59375, "step": 3413, "time_per_iteration": 2.3855199813842773 }, { "auxiliary_loss_clip": 0.01075017, "auxiliary_loss_mlp": 0.01054537, "balance_loss_clip": 1.02142096, "balance_loss_mlp": 1.02152658, "epoch": 0.20526078460844732, "flos": 17675372753280.0, "grad_norm": 2.2932244490416616, "language_loss": 0.85955203, "learning_rate": 3.6863438912648823e-06, "loss": 0.88084751, "num_input_tokens_seen": 73727670, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.53515625, "step": 3414, "time_per_iteration": 2.3977506160736084 }, { "auxiliary_loss_clip": 0.01078475, "auxiliary_loss_mlp": 0.01054598, "balance_loss_clip": 1.01761889, "balance_loss_mlp": 1.0209918, "epoch": 0.2053209078611153, "flos": 21499444880640.0, "grad_norm": 2.2484380593846636, "language_loss": 0.82257795, "learning_rate": 3.6861344675569986e-06, "loss": 0.84390867, "num_input_tokens_seen": 73747170, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.57421875, "step": 3415, "time_per_iteration": 2.475327730178833 }, { "auxiliary_loss_clip": 0.01078817, "auxiliary_loss_mlp": 0.01054044, "balance_loss_clip": 1.02083254, "balance_loss_mlp": 1.02266192, "epoch": 0.20538103111378325, "flos": 25662418375680.0, "grad_norm": 3.0297632718918157, "language_loss": 0.74498451, "learning_rate": 3.6859249799107275e-06, "loss": 0.76631314, "num_input_tokens_seen": 73767690, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.5625, "step": 3416, "time_per_iteration": 2.4483890533447266 }, { "auxiliary_loss_clip": 0.01079034, "auxiliary_loss_mlp": 0.0105512, "balance_loss_clip": 1.0203588, "balance_loss_mlp": 1.02199173, "epoch": 0.20544115436645122, "flos": 23147833812480.0, "grad_norm": 2.3860077446664283, "language_loss": 0.80287045, "learning_rate": 3.6857154283340115e-06, "loss": 0.82421196, "num_input_tokens_seen": 73786900, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.5703125, "step": 3417, "time_per_iteration": 3.812279462814331 }, { "auxiliary_loss_clip": 0.01080304, "auxiliary_loss_mlp": 0.01057842, "balance_loss_clip": 1.01971865, "balance_loss_mlp": 1.02265525, "epoch": 0.20550127761911918, "flos": 19389433685760.0, "grad_norm": 2.1240258662215625, "language_loss": 0.89830112, "learning_rate": 3.685505812834798e-06, "loss": 0.91968262, "num_input_tokens_seen": 73804515, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.578125, "step": 3418, "time_per_iteration": 2.4312901496887207 }, { "auxiliary_loss_clip": 0.01079762, "auxiliary_loss_mlp": 0.01062499, "balance_loss_clip": 1.02485275, "balance_loss_mlp": 1.02160537, "epoch": 0.20556140087178718, "flos": 22892025214080.0, "grad_norm": 2.2090514405943886, "language_loss": 0.63966012, "learning_rate": 3.685296133421035e-06, "loss": 0.66108268, "num_input_tokens_seen": 73822910, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.58203125, "step": 3419, "time_per_iteration": 5.221128225326538 }, { "auxiliary_loss_clip": 0.01083013, "auxiliary_loss_mlp": 0.01061027, "balance_loss_clip": 1.01837397, "balance_loss_mlp": 1.0244441, "epoch": 0.20562152412445514, "flos": 19788700527360.0, "grad_norm": 1.7941150173476457, "language_loss": 0.87209088, "learning_rate": 3.685086390100674e-06, "loss": 0.89353132, "num_input_tokens_seen": 73841160, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.5859375, "step": 3420, "time_per_iteration": 3.8003735542297363 }, { "auxiliary_loss_clip": 0.0107978, "auxiliary_loss_mlp": 0.0105492, "balance_loss_clip": 1.0182507, "balance_loss_mlp": 1.02312565, "epoch": 0.2056816473771231, "flos": 31500699327360.0, "grad_norm": 2.8645725146676906, "language_loss": 0.73578554, "learning_rate": 3.684876582881668e-06, "loss": 0.75713253, "num_input_tokens_seen": 73862795, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.5625, "step": 3421, "time_per_iteration": 2.541628360748291 }, { "auxiliary_loss_clip": 0.01077524, "auxiliary_loss_mlp": 0.01054199, "balance_loss_clip": 1.01772141, "balance_loss_mlp": 1.02217698, "epoch": 0.20574177062979107, "flos": 23257251613440.0, "grad_norm": 1.9289920939675889, "language_loss": 0.7282325, "learning_rate": 3.6846667117719732e-06, "loss": 0.74954975, "num_input_tokens_seen": 73881525, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.5546875, "step": 3422, "time_per_iteration": 2.5945487022399902 }, { "auxiliary_loss_clip": 0.01033335, "auxiliary_loss_mlp": 0.01038393, "balance_loss_clip": 1.032933, "balance_loss_mlp": 1.01216793, "epoch": 0.20580189388245904, "flos": 70309417777920.0, "grad_norm": 0.8325219443895382, "language_loss": 0.5571512, "learning_rate": 3.684456776779548e-06, "loss": 0.57786846, "num_input_tokens_seen": 73937775, "router_z_loss_clip": 0.0546875, "router_z_loss_mlp": 0.21191406, "step": 3423, "time_per_iteration": 3.1985268592834473 }, { "auxiliary_loss_clip": 0.010795, "auxiliary_loss_mlp": 0.01056185, "balance_loss_clip": 1.01734662, "balance_loss_mlp": 1.02165985, "epoch": 0.205862017135127, "flos": 30736520288640.0, "grad_norm": 2.0242679550889546, "language_loss": 0.73818767, "learning_rate": 3.684246777912353e-06, "loss": 0.75954449, "num_input_tokens_seen": 73958250, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.578125, "step": 3424, "time_per_iteration": 2.55841064453125 }, { "auxiliary_loss_clip": 0.010811, "auxiliary_loss_mlp": 0.01055413, "balance_loss_clip": 1.02005589, "balance_loss_mlp": 1.02420771, "epoch": 0.20592214038779497, "flos": 21323482295040.0, "grad_norm": 1.392352559189381, "language_loss": 0.7596736, "learning_rate": 3.684036715178351e-06, "loss": 0.7810387, "num_input_tokens_seen": 73977775, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.5703125, "step": 3425, "time_per_iteration": 2.4185333251953125 }, { "auxiliary_loss_clip": 0.0108057, "auxiliary_loss_mlp": 0.01058527, "balance_loss_clip": 1.02030873, "balance_loss_mlp": 1.02395225, "epoch": 0.20598226364046296, "flos": 22890593848320.0, "grad_norm": 1.7128955290262635, "language_loss": 0.89190334, "learning_rate": 3.683826588585508e-06, "loss": 0.91329432, "num_input_tokens_seen": 73996590, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.56640625, "step": 3426, "time_per_iteration": 2.431118965148926 }, { "auxiliary_loss_clip": 0.01080031, "auxiliary_loss_mlp": 0.01062443, "balance_loss_clip": 1.02474856, "balance_loss_mlp": 1.0242995, "epoch": 0.20604238689313092, "flos": 23877413827200.0, "grad_norm": 1.7816908951563362, "language_loss": 0.78802097, "learning_rate": 3.6836163981417926e-06, "loss": 0.80944562, "num_input_tokens_seen": 74015935, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.55859375, "step": 3427, "time_per_iteration": 2.438218116760254 }, { "auxiliary_loss_clip": 0.0108135, "auxiliary_loss_mlp": 0.0106142, "balance_loss_clip": 1.02227187, "balance_loss_mlp": 1.02267039, "epoch": 0.2061025101457989, "flos": 22490454222720.0, "grad_norm": 1.4752005978713252, "language_loss": 0.75699723, "learning_rate": 3.683406143855174e-06, "loss": 0.77842492, "num_input_tokens_seen": 74036575, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.5859375, "step": 3428, "time_per_iteration": 2.4123916625976562 }, { "auxiliary_loss_clip": 0.01080412, "auxiliary_loss_mlp": 0.0106419, "balance_loss_clip": 1.02301526, "balance_loss_mlp": 1.02141988, "epoch": 0.20616263339846685, "flos": 22777964202240.0, "grad_norm": 3.459563831983276, "language_loss": 0.74928808, "learning_rate": 3.6831958257336256e-06, "loss": 0.77073413, "num_input_tokens_seen": 74055365, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 0.58984375, "step": 3429, "time_per_iteration": 2.3799846172332764 }, { "auxiliary_loss_clip": 0.0108691, "auxiliary_loss_mlp": 0.0106323, "balance_loss_clip": 1.02174497, "balance_loss_mlp": 1.02726388, "epoch": 0.20622275665113482, "flos": 20881272614400.0, "grad_norm": 1.8368102908029464, "language_loss": 0.87156141, "learning_rate": 3.6829854437851237e-06, "loss": 0.89306277, "num_input_tokens_seen": 74074875, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.59765625, "step": 3430, "time_per_iteration": 2.4000511169433594 }, { "auxiliary_loss_clip": 0.01082276, "auxiliary_loss_mlp": 0.01064193, "balance_loss_clip": 1.02285171, "balance_loss_mlp": 1.02295256, "epoch": 0.20628287990380278, "flos": 19353403296000.0, "grad_norm": 1.4931025478692177, "language_loss": 0.71355724, "learning_rate": 3.6827749980176444e-06, "loss": 0.73502189, "num_input_tokens_seen": 74094505, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.59375, "step": 3431, "time_per_iteration": 2.4930005073547363 }, { "auxiliary_loss_clip": 0.01034123, "auxiliary_loss_mlp": 0.01027503, "balance_loss_clip": 1.02020741, "balance_loss_mlp": 1.01238775, "epoch": 0.20634300315647078, "flos": 71514759156480.0, "grad_norm": 0.8246060400423088, "language_loss": 0.6021865, "learning_rate": 3.6825644884391693e-06, "loss": 0.62280273, "num_input_tokens_seen": 74158500, "router_z_loss_clip": 0.07275391, "router_z_loss_mlp": 0.21679688, "step": 3432, "time_per_iteration": 3.208040475845337 }, { "auxiliary_loss_clip": 0.01082014, "auxiliary_loss_mlp": 0.01057958, "balance_loss_clip": 1.01995432, "balance_loss_mlp": 1.02340603, "epoch": 0.20640312640913874, "flos": 21722923693440.0, "grad_norm": 1.9261022170804483, "language_loss": 0.73669529, "learning_rate": 3.682353915057679e-06, "loss": 0.75809503, "num_input_tokens_seen": 74176685, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5859375, "step": 3433, "time_per_iteration": 2.531592845916748 }, { "auxiliary_loss_clip": 0.01082055, "auxiliary_loss_mlp": 0.01068704, "balance_loss_clip": 1.02993703, "balance_loss_mlp": 1.02340579, "epoch": 0.2064632496618067, "flos": 20553682527360.0, "grad_norm": 1.8957738984038566, "language_loss": 0.87363064, "learning_rate": 3.6821432778811604e-06, "loss": 0.89513826, "num_input_tokens_seen": 74194935, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.5859375, "step": 3434, "time_per_iteration": 2.405364513397217 }, { "auxiliary_loss_clip": 0.0108438, "auxiliary_loss_mlp": 0.01070398, "balance_loss_clip": 1.02893662, "balance_loss_mlp": 1.02298903, "epoch": 0.20652337291447467, "flos": 29822040380160.0, "grad_norm": 2.061814239983077, "language_loss": 0.7072596, "learning_rate": 3.6819325769176004e-06, "loss": 0.72880745, "num_input_tokens_seen": 74215400, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.6171875, "step": 3435, "time_per_iteration": 2.431727647781372 }, { "auxiliary_loss_clip": 0.0108102, "auxiliary_loss_mlp": 0.01059518, "balance_loss_clip": 1.02244377, "balance_loss_mlp": 1.02384281, "epoch": 0.20658349616714264, "flos": 26212439934720.0, "grad_norm": 1.5973930367228042, "language_loss": 0.90899295, "learning_rate": 3.681721812174988e-06, "loss": 0.93039829, "num_input_tokens_seen": 74234090, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.5703125, "step": 3436, "time_per_iteration": 2.4210095405578613 }, { "auxiliary_loss_clip": 0.01082341, "auxiliary_loss_mlp": 0.01066704, "balance_loss_clip": 1.02855754, "balance_loss_mlp": 1.02380693, "epoch": 0.2066436194198106, "flos": 25993185396480.0, "grad_norm": 2.9713264921302525, "language_loss": 0.79147828, "learning_rate": 3.6815109836613163e-06, "loss": 0.81296873, "num_input_tokens_seen": 74253345, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.5859375, "step": 3437, "time_per_iteration": 2.4167771339416504 }, { "auxiliary_loss_clip": 0.01080874, "auxiliary_loss_mlp": 0.01077791, "balance_loss_clip": 1.03964353, "balance_loss_mlp": 1.02237725, "epoch": 0.20670374267247857, "flos": 21360001443840.0, "grad_norm": 1.9327046772419387, "language_loss": 0.79931521, "learning_rate": 3.6813000913845795e-06, "loss": 0.82090181, "num_input_tokens_seen": 74271615, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.5859375, "step": 3438, "time_per_iteration": 2.445948600769043 }, { "auxiliary_loss_clip": 0.01031379, "auxiliary_loss_mlp": 0.01065617, "balance_loss_clip": 1.06018102, "balance_loss_mlp": 1.008479, "epoch": 0.20676386592514656, "flos": 66379977567360.0, "grad_norm": 0.8457568754135891, "language_loss": 0.67197919, "learning_rate": 3.6810891353527747e-06, "loss": 0.69294918, "num_input_tokens_seen": 74331390, "router_z_loss_clip": 0.05444336, "router_z_loss_mlp": 0.22851562, "step": 3439, "time_per_iteration": 2.999277114868164 }, { "auxiliary_loss_clip": 0.01082814, "auxiliary_loss_mlp": 0.01067948, "balance_loss_clip": 1.0284903, "balance_loss_mlp": 1.02290118, "epoch": 0.20682398917781453, "flos": 17273627205120.0, "grad_norm": 7.113654006740864, "language_loss": 0.8594929, "learning_rate": 3.6808781155739014e-06, "loss": 0.88100052, "num_input_tokens_seen": 74347335, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.59765625, "step": 3440, "time_per_iteration": 2.3838064670562744 }, { "auxiliary_loss_clip": 0.01082389, "auxiliary_loss_mlp": 0.01061164, "balance_loss_clip": 1.02482891, "balance_loss_mlp": 1.02429354, "epoch": 0.2068841124304825, "flos": 18076315340160.0, "grad_norm": 1.7907523385707482, "language_loss": 0.86460376, "learning_rate": 3.6806670320559614e-06, "loss": 0.88603926, "num_input_tokens_seen": 74366310, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.58203125, "step": 3441, "time_per_iteration": 2.3778467178344727 }, { "auxiliary_loss_clip": 0.01080851, "auxiliary_loss_mlp": 0.01066987, "balance_loss_clip": 1.02872038, "balance_loss_mlp": 1.02391016, "epoch": 0.20694423568315046, "flos": 27345720533760.0, "grad_norm": 1.8399187021927128, "language_loss": 0.87594664, "learning_rate": 3.680455884806959e-06, "loss": 0.89742506, "num_input_tokens_seen": 74387100, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.5703125, "step": 3442, "time_per_iteration": 2.487086057662964 }, { "auxiliary_loss_clip": 0.01083552, "auxiliary_loss_mlp": 0.01064763, "balance_loss_clip": 1.02437472, "balance_loss_mlp": 1.02508533, "epoch": 0.20700435893581842, "flos": 20228815526400.0, "grad_norm": 1.8898497726678651, "language_loss": 0.7408967, "learning_rate": 3.6802446738349014e-06, "loss": 0.76237988, "num_input_tokens_seen": 74404460, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.5859375, "step": 3443, "time_per_iteration": 2.393321990966797 }, { "auxiliary_loss_clip": 0.01081818, "auxiliary_loss_mlp": 0.01058934, "balance_loss_clip": 1.02195525, "balance_loss_mlp": 1.02354717, "epoch": 0.2070644821884864, "flos": 20630072315520.0, "grad_norm": 1.7833026802980583, "language_loss": 0.86340815, "learning_rate": 3.680033399147797e-06, "loss": 0.88481563, "num_input_tokens_seen": 74423790, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.58203125, "step": 3444, "time_per_iteration": 2.4629693031311035 }, { "auxiliary_loss_clip": 0.01029894, "auxiliary_loss_mlp": 0.01008196, "balance_loss_clip": 1.00128222, "balance_loss_mlp": 1.01031518, "epoch": 0.20712460544115438, "flos": 65937907532160.0, "grad_norm": 0.710896124302317, "language_loss": 0.57233882, "learning_rate": 3.6798220607536585e-06, "loss": 0.59271967, "num_input_tokens_seen": 74488130, "router_z_loss_clip": 0.06933594, "router_z_loss_mlp": 0.1953125, "step": 3445, "time_per_iteration": 2.9888806343078613 }, { "auxiliary_loss_clip": 0.0108351, "auxiliary_loss_mlp": 0.01060906, "balance_loss_clip": 1.02070904, "balance_loss_mlp": 1.02422142, "epoch": 0.20718472869382235, "flos": 19424765848320.0, "grad_norm": 1.5551783445421308, "language_loss": 0.79348969, "learning_rate": 3.6796106586604987e-06, "loss": 0.81493384, "num_input_tokens_seen": 74506720, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.59375, "step": 3446, "time_per_iteration": 2.541849136352539 }, { "auxiliary_loss_clip": 0.0108625, "auxiliary_loss_mlp": 0.01067196, "balance_loss_clip": 1.02237344, "balance_loss_mlp": 1.02435637, "epoch": 0.2072448519464903, "flos": 24497890243200.0, "grad_norm": 2.1721581253454754, "language_loss": 0.65285647, "learning_rate": 3.679399192876334e-06, "loss": 0.67439097, "num_input_tokens_seen": 74525330, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.6171875, "step": 3447, "time_per_iteration": 2.4382736682891846 }, { "auxiliary_loss_clip": 0.01084438, "auxiliary_loss_mlp": 0.01065234, "balance_loss_clip": 1.02320123, "balance_loss_mlp": 1.0251044, "epoch": 0.20730497519915828, "flos": 23074586046720.0, "grad_norm": 1.7109040521488625, "language_loss": 0.88112891, "learning_rate": 3.679187663409184e-06, "loss": 0.90262568, "num_input_tokens_seen": 74544535, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 0.59375, "step": 3448, "time_per_iteration": 2.4012832641601562 }, { "auxiliary_loss_clip": 0.01081615, "auxiliary_loss_mlp": 0.01065667, "balance_loss_clip": 1.02275133, "balance_loss_mlp": 1.02347136, "epoch": 0.20736509845182624, "flos": 21067987898880.0, "grad_norm": 1.9360048281608466, "language_loss": 0.77406693, "learning_rate": 3.6789760702670696e-06, "loss": 0.79553974, "num_input_tokens_seen": 74562300, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.58203125, "step": 3449, "time_per_iteration": 2.5030901432037354 }, { "auxiliary_loss_clip": 0.0108582, "auxiliary_loss_mlp": 0.01063154, "balance_loss_clip": 1.01916552, "balance_loss_mlp": 1.02312648, "epoch": 0.2074252217044942, "flos": 17632499736960.0, "grad_norm": 2.083926693713025, "language_loss": 0.78036571, "learning_rate": 3.6787644134580134e-06, "loss": 0.80185544, "num_input_tokens_seen": 74580080, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 0.625, "step": 3450, "time_per_iteration": 2.4779601097106934 }, { "auxiliary_loss_clip": 0.01083465, "auxiliary_loss_mlp": 0.01066219, "balance_loss_clip": 1.02254081, "balance_loss_mlp": 1.02321923, "epoch": 0.20748534495716217, "flos": 23545948579200.0, "grad_norm": 1.5662543310249561, "language_loss": 0.83037031, "learning_rate": 3.6785526929900436e-06, "loss": 0.85186714, "num_input_tokens_seen": 74598980, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.6015625, "step": 3451, "time_per_iteration": 2.4809937477111816 }, { "auxiliary_loss_clip": 0.01026974, "auxiliary_loss_mlp": 0.01067036, "balance_loss_clip": 1.06074131, "balance_loss_mlp": 1.00640106, "epoch": 0.20754546820983016, "flos": 52250313738240.0, "grad_norm": 0.834651532400257, "language_loss": 0.56653333, "learning_rate": 3.6783409088711875e-06, "loss": 0.58747351, "num_input_tokens_seen": 74655275, "router_z_loss_clip": 0.06298828, "router_z_loss_mlp": 0.20507812, "step": 3452, "time_per_iteration": 2.9846134185791016 }, { "auxiliary_loss_clip": 0.01086407, "auxiliary_loss_mlp": 0.01072722, "balance_loss_clip": 1.03090334, "balance_loss_mlp": 1.02404809, "epoch": 0.20760559146249813, "flos": 20411341447680.0, "grad_norm": 1.8188041860647663, "language_loss": 0.90183049, "learning_rate": 3.6781290611094755e-06, "loss": 0.92342174, "num_input_tokens_seen": 74674560, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.625, "step": 3453, "time_per_iteration": 2.4110021591186523 }, { "auxiliary_loss_clip": 0.01082842, "auxiliary_loss_mlp": 0.01064391, "balance_loss_clip": 1.02254844, "balance_loss_mlp": 1.0227809, "epoch": 0.2076657147151661, "flos": 23184876631680.0, "grad_norm": 1.5689014006818618, "language_loss": 0.81531161, "learning_rate": 3.6779171497129407e-06, "loss": 0.83678401, "num_input_tokens_seen": 74694500, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.6015625, "step": 3454, "time_per_iteration": 2.513597011566162 }, { "auxiliary_loss_clip": 0.0108189, "auxiliary_loss_mlp": 0.0106854, "balance_loss_clip": 1.02562428, "balance_loss_mlp": 1.02203548, "epoch": 0.20772583796783406, "flos": 18292323121920.0, "grad_norm": 4.5523196335609315, "language_loss": 0.79911673, "learning_rate": 3.6777051746896202e-06, "loss": 0.82062107, "num_input_tokens_seen": 74710485, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.59765625, "step": 3455, "time_per_iteration": 2.3937573432922363 }, { "auxiliary_loss_clip": 0.01082722, "auxiliary_loss_mlp": 0.01067554, "balance_loss_clip": 1.02664161, "balance_loss_mlp": 1.02329707, "epoch": 0.20778596122050202, "flos": 17601845696640.0, "grad_norm": 1.798640563179911, "language_loss": 0.82354736, "learning_rate": 3.6774931360475516e-06, "loss": 0.84505016, "num_input_tokens_seen": 74727450, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.59375, "step": 3456, "time_per_iteration": 2.500481367111206 }, { "auxiliary_loss_clip": 0.01084907, "auxiliary_loss_mlp": 0.01068578, "balance_loss_clip": 1.02420878, "balance_loss_mlp": 1.02357125, "epoch": 0.20784608447317, "flos": 23804445352320.0, "grad_norm": 1.6504554787600385, "language_loss": 0.79892361, "learning_rate": 3.6772810337947745e-06, "loss": 0.82045853, "num_input_tokens_seen": 74746725, "router_z_loss_clip": 0.44335938, "router_z_loss_mlp": 0.6171875, "step": 3457, "time_per_iteration": 3.891096830368042 }, { "auxiliary_loss_clip": 0.01086265, "auxiliary_loss_mlp": 0.01098262, "balance_loss_clip": 1.05253351, "balance_loss_mlp": 1.02315319, "epoch": 0.20790620772583795, "flos": 17638329934080.0, "grad_norm": 1.884268839282867, "language_loss": 0.85359001, "learning_rate": 3.677068867939333e-06, "loss": 0.87543529, "num_input_tokens_seen": 74765255, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 0.6328125, "step": 3458, "time_per_iteration": 5.299649715423584 }, { "auxiliary_loss_clip": 0.01081607, "auxiliary_loss_mlp": 0.01076643, "balance_loss_clip": 1.03756666, "balance_loss_mlp": 1.02312899, "epoch": 0.20796633097850595, "flos": 27672228368640.0, "grad_norm": 1.7777466724680218, "language_loss": 0.77338862, "learning_rate": 3.676856638489272e-06, "loss": 0.79497111, "num_input_tokens_seen": 74785710, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.5859375, "step": 3459, "time_per_iteration": 2.4948678016662598 }, { "auxiliary_loss_clip": 0.01079926, "auxiliary_loss_mlp": 0.01092869, "balance_loss_clip": 1.05481744, "balance_loss_mlp": 1.02231193, "epoch": 0.2080264542311739, "flos": 19244578988160.0, "grad_norm": 2.9488171572086737, "language_loss": 0.78610408, "learning_rate": 3.6766443454526382e-06, "loss": 0.807832, "num_input_tokens_seen": 74804490, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.578125, "step": 3460, "time_per_iteration": 3.8273770809173584 }, { "auxiliary_loss_clip": 0.01083825, "auxiliary_loss_mlp": 0.01087094, "balance_loss_clip": 1.04672968, "balance_loss_mlp": 1.0236944, "epoch": 0.20808657748384188, "flos": 27524720407680.0, "grad_norm": 2.1557182452839054, "language_loss": 0.76956534, "learning_rate": 3.6764319888374836e-06, "loss": 0.79127455, "num_input_tokens_seen": 74826340, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.6015625, "step": 3461, "time_per_iteration": 2.4912352561950684 }, { "auxiliary_loss_clip": 0.01083629, "auxiliary_loss_mlp": 0.01084344, "balance_loss_clip": 1.04264426, "balance_loss_mlp": 1.02197635, "epoch": 0.20814670073650984, "flos": 26905710268800.0, "grad_norm": 2.352920069315673, "language_loss": 0.90057266, "learning_rate": 3.6762195686518604e-06, "loss": 0.9222523, "num_input_tokens_seen": 74844960, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.6171875, "step": 3462, "time_per_iteration": 2.5406596660614014 }, { "auxiliary_loss_clip": 0.01029412, "auxiliary_loss_mlp": 0.01076399, "balance_loss_clip": 1.07196426, "balance_loss_mlp": 1.0110569, "epoch": 0.2082068239891778, "flos": 70172383224960.0, "grad_norm": 0.7836881169329297, "language_loss": 0.59117579, "learning_rate": 3.6760070849038226e-06, "loss": 0.61223388, "num_input_tokens_seen": 74909075, "router_z_loss_clip": 0.04443359, "router_z_loss_mlp": 0.18359375, "step": 3463, "time_per_iteration": 3.1784863471984863 }, { "auxiliary_loss_clip": 0.01081233, "auxiliary_loss_mlp": 0.01077884, "balance_loss_clip": 1.03322852, "balance_loss_mlp": 1.02097869, "epoch": 0.20826694724184577, "flos": 24606924019200.0, "grad_norm": 2.43066250151913, "language_loss": 0.69575334, "learning_rate": 3.675794537601429e-06, "loss": 0.71734452, "num_input_tokens_seen": 74928125, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.6015625, "step": 3464, "time_per_iteration": 2.463078498840332 }, { "auxiliary_loss_clip": 0.01085528, "auxiliary_loss_mlp": 0.01075422, "balance_loss_clip": 1.03140998, "balance_loss_mlp": 1.02391517, "epoch": 0.20832707049451377, "flos": 12892097399040.0, "grad_norm": 1.8719361988412462, "language_loss": 0.86314642, "learning_rate": 3.6755819267527373e-06, "loss": 0.88475591, "num_input_tokens_seen": 74945090, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 0.6171875, "step": 3465, "time_per_iteration": 2.4493515491485596 }, { "auxiliary_loss_clip": 0.01086221, "auxiliary_loss_mlp": 0.01065666, "balance_loss_clip": 1.02320361, "balance_loss_mlp": 1.02550006, "epoch": 0.20838719374718173, "flos": 22197777361920.0, "grad_norm": 2.2062954331755966, "language_loss": 0.83942378, "learning_rate": 3.6753692523658113e-06, "loss": 0.8609426, "num_input_tokens_seen": 74963630, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 0.60546875, "step": 3466, "time_per_iteration": 2.45220947265625 }, { "auxiliary_loss_clip": 0.01081548, "auxiliary_loss_mlp": 0.01059279, "balance_loss_clip": 1.02098918, "balance_loss_mlp": 1.02347159, "epoch": 0.2084473169998497, "flos": 15157750901760.0, "grad_norm": 1.7719595531401438, "language_loss": 0.83769011, "learning_rate": 3.675156514448716e-06, "loss": 0.85909843, "num_input_tokens_seen": 74981875, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.58203125, "step": 3467, "time_per_iteration": 2.497887134552002 }, { "auxiliary_loss_clip": 0.01080472, "auxiliary_loss_mlp": 0.01056439, "balance_loss_clip": 1.01998472, "balance_loss_mlp": 1.02457142, "epoch": 0.20850744025251766, "flos": 17455838924160.0, "grad_norm": 1.9984752380161444, "language_loss": 0.83218479, "learning_rate": 3.674943713009518e-06, "loss": 0.85355389, "num_input_tokens_seen": 74999155, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.55859375, "step": 3468, "time_per_iteration": 2.4471094608306885 }, { "auxiliary_loss_clip": 0.01083759, "auxiliary_loss_mlp": 0.01064324, "balance_loss_clip": 1.02107453, "balance_loss_mlp": 1.02384639, "epoch": 0.20856756350518563, "flos": 25697890183680.0, "grad_norm": 1.9447082890174219, "language_loss": 0.9133445, "learning_rate": 3.6747308480562856e-06, "loss": 0.9348253, "num_input_tokens_seen": 75017850, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.6015625, "step": 3469, "time_per_iteration": 2.5326671600341797 }, { "auxiliary_loss_clip": 0.01085035, "auxiliary_loss_mlp": 0.0106293, "balance_loss_clip": 1.02361465, "balance_loss_mlp": 1.02733493, "epoch": 0.2086276867578536, "flos": 37887535560960.0, "grad_norm": 1.8474981704439475, "language_loss": 0.78199339, "learning_rate": 3.674517919597092e-06, "loss": 0.80347306, "num_input_tokens_seen": 75039270, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.578125, "step": 3470, "time_per_iteration": 2.606663465499878 }, { "auxiliary_loss_clip": 0.01080488, "auxiliary_loss_mlp": 0.01063044, "balance_loss_clip": 1.02344298, "balance_loss_mlp": 1.02296996, "epoch": 0.20868781001052156, "flos": 25555863306240.0, "grad_norm": 2.328080468389899, "language_loss": 0.77165937, "learning_rate": 3.674304927640011e-06, "loss": 0.79309464, "num_input_tokens_seen": 75059350, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.57421875, "step": 3471, "time_per_iteration": 2.6040830612182617 }, { "auxiliary_loss_clip": 0.01085884, "auxiliary_loss_mlp": 0.01063108, "balance_loss_clip": 1.02009678, "balance_loss_mlp": 1.0238719, "epoch": 0.20874793326318955, "flos": 27527897341440.0, "grad_norm": 2.0834732381917487, "language_loss": 0.77362728, "learning_rate": 3.67409187219312e-06, "loss": 0.79511726, "num_input_tokens_seen": 75080150, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.62109375, "step": 3472, "time_per_iteration": 2.5315070152282715 }, { "auxiliary_loss_clip": 0.0108492, "auxiliary_loss_mlp": 0.01065155, "balance_loss_clip": 1.02698469, "balance_loss_mlp": 1.02654386, "epoch": 0.20880805651585752, "flos": 18547887340800.0, "grad_norm": 2.001030060739717, "language_loss": 0.86416578, "learning_rate": 3.6738787532644966e-06, "loss": 0.88566649, "num_input_tokens_seen": 75097920, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.58203125, "step": 3473, "time_per_iteration": 2.480185031890869 }, { "auxiliary_loss_clip": 0.01023624, "auxiliary_loss_mlp": 0.01015565, "balance_loss_clip": 1.00981879, "balance_loss_mlp": 1.00638008, "epoch": 0.20886817976852548, "flos": 65943318792960.0, "grad_norm": 0.9007945072550463, "language_loss": 0.63710821, "learning_rate": 3.6736655708622235e-06, "loss": 0.65750003, "num_input_tokens_seen": 75152410, "router_z_loss_clip": 0.05737305, "router_z_loss_mlp": 0.171875, "step": 3474, "time_per_iteration": 3.014744997024536 }, { "auxiliary_loss_clip": 0.01087789, "auxiliary_loss_mlp": 0.01065561, "balance_loss_clip": 1.02369452, "balance_loss_mlp": 1.02705932, "epoch": 0.20892830302119345, "flos": 36537688598400.0, "grad_norm": 1.9005096448107774, "language_loss": 0.72705245, "learning_rate": 3.6734523249943844e-06, "loss": 0.74858594, "num_input_tokens_seen": 75173265, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.60546875, "step": 3475, "time_per_iteration": 2.5894439220428467 }, { "auxiliary_loss_clip": 0.010851, "auxiliary_loss_mlp": 0.0106005, "balance_loss_clip": 1.02080584, "balance_loss_mlp": 1.02672887, "epoch": 0.2089884262738614, "flos": 20955777189120.0, "grad_norm": 1.6112843232057985, "language_loss": 0.72309959, "learning_rate": 3.673239015669065e-06, "loss": 0.74455106, "num_input_tokens_seen": 75193640, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.5859375, "step": 3476, "time_per_iteration": 2.4683215618133545 }, { "auxiliary_loss_clip": 0.01083568, "auxiliary_loss_mlp": 0.01056778, "balance_loss_clip": 1.01937032, "balance_loss_mlp": 1.02644646, "epoch": 0.20904854952652938, "flos": 22782921615360.0, "grad_norm": 1.8187398228568794, "language_loss": 0.90713066, "learning_rate": 3.6730256428943544e-06, "loss": 0.92853415, "num_input_tokens_seen": 75212545, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5703125, "step": 3477, "time_per_iteration": 2.5477263927459717 }, { "auxiliary_loss_clip": 0.01083927, "auxiliary_loss_mlp": 0.01059135, "balance_loss_clip": 1.02110767, "balance_loss_mlp": 1.02620232, "epoch": 0.20910867277919734, "flos": 27302184201600.0, "grad_norm": 2.2941911775548673, "language_loss": 0.70504916, "learning_rate": 3.672812206678344e-06, "loss": 0.72647977, "num_input_tokens_seen": 75230865, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.578125, "step": 3478, "time_per_iteration": 2.4730052947998047 }, { "auxiliary_loss_clip": 0.01082967, "auxiliary_loss_mlp": 0.01068742, "balance_loss_clip": 1.03078604, "balance_loss_mlp": 1.02595866, "epoch": 0.20916879603186533, "flos": 14318369061120.0, "grad_norm": 1.9265775073012146, "language_loss": 0.86240149, "learning_rate": 3.672598707029127e-06, "loss": 0.88391852, "num_input_tokens_seen": 75248285, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5703125, "step": 3479, "time_per_iteration": 2.4959194660186768 }, { "auxiliary_loss_clip": 0.01084455, "auxiliary_loss_mlp": 0.0107245, "balance_loss_clip": 1.03439832, "balance_loss_mlp": 1.02585673, "epoch": 0.2092289192845333, "flos": 22271932823040.0, "grad_norm": 2.6016377706739755, "language_loss": 0.7616297, "learning_rate": 3.6723851439548003e-06, "loss": 0.78319877, "num_input_tokens_seen": 75266310, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.5859375, "step": 3480, "time_per_iteration": 2.47739577293396 }, { "auxiliary_loss_clip": 0.01078733, "auxiliary_loss_mlp": 0.01075163, "balance_loss_clip": 1.04080653, "balance_loss_mlp": 1.02333844, "epoch": 0.20928904253720126, "flos": 14829811701120.0, "grad_norm": 1.92079381505119, "language_loss": 0.77507699, "learning_rate": 3.67217151746346e-06, "loss": 0.79661596, "num_input_tokens_seen": 75284175, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.5546875, "step": 3481, "time_per_iteration": 2.4941303730010986 }, { "auxiliary_loss_clip": 0.01082629, "auxiliary_loss_mlp": 0.01075945, "balance_loss_clip": 1.04056406, "balance_loss_mlp": 1.02446556, "epoch": 0.20934916578986923, "flos": 23258019663360.0, "grad_norm": 1.859210068912591, "language_loss": 0.86723137, "learning_rate": 3.671957827563209e-06, "loss": 0.88881713, "num_input_tokens_seen": 75303465, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.58203125, "step": 3482, "time_per_iteration": 2.460158348083496 }, { "auxiliary_loss_clip": 0.010816, "auxiliary_loss_mlp": 0.01060279, "balance_loss_clip": 1.02301466, "balance_loss_mlp": 1.02432871, "epoch": 0.2094092890425372, "flos": 32013049662720.0, "grad_norm": 2.049512072197987, "language_loss": 0.72962904, "learning_rate": 3.6717440742621494e-06, "loss": 0.75104785, "num_input_tokens_seen": 75325290, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.5703125, "step": 3483, "time_per_iteration": 2.554166793823242 }, { "auxiliary_loss_clip": 0.0108243, "auxiliary_loss_mlp": 0.01086651, "balance_loss_clip": 1.04495144, "balance_loss_mlp": 1.02306306, "epoch": 0.20946941229520516, "flos": 20009630810880.0, "grad_norm": 1.618577042073715, "language_loss": 0.77365232, "learning_rate": 3.6715302575683865e-06, "loss": 0.7953431, "num_input_tokens_seen": 75343895, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.59375, "step": 3484, "time_per_iteration": 2.447955369949341 }, { "auxiliary_loss_clip": 0.0108159, "auxiliary_loss_mlp": 0.01063806, "balance_loss_clip": 1.02756608, "balance_loss_mlp": 1.02442789, "epoch": 0.20952953554787315, "flos": 30738684792960.0, "grad_norm": 1.648411291324392, "language_loss": 0.71616703, "learning_rate": 3.6713163774900292e-06, "loss": 0.73762107, "num_input_tokens_seen": 75367100, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.5703125, "step": 3485, "time_per_iteration": 2.5829174518585205 }, { "auxiliary_loss_clip": 0.0108349, "auxiliary_loss_mlp": 0.01062726, "balance_loss_clip": 1.02414954, "balance_loss_mlp": 1.02446544, "epoch": 0.20958965880054112, "flos": 27048086259840.0, "grad_norm": 1.88693161329905, "language_loss": 0.84866309, "learning_rate": 3.6711024340351875e-06, "loss": 0.87012523, "num_input_tokens_seen": 75389925, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.58984375, "step": 3486, "time_per_iteration": 2.488337993621826 }, { "auxiliary_loss_clip": 0.01080647, "auxiliary_loss_mlp": 0.01065614, "balance_loss_clip": 1.02811074, "balance_loss_mlp": 1.02222359, "epoch": 0.20964978205320908, "flos": 34202697402240.0, "grad_norm": 1.706761308043827, "language_loss": 0.88384086, "learning_rate": 3.6708884272119737e-06, "loss": 0.90530348, "num_input_tokens_seen": 75408575, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5859375, "step": 3487, "time_per_iteration": 2.6209545135498047 }, { "auxiliary_loss_clip": 0.01077501, "auxiliary_loss_mlp": 0.01056964, "balance_loss_clip": 1.02036667, "balance_loss_mlp": 1.02181244, "epoch": 0.20970990530587705, "flos": 23476261772160.0, "grad_norm": 3.0979526579659535, "language_loss": 0.74752271, "learning_rate": 3.670674357028504e-06, "loss": 0.76886737, "num_input_tokens_seen": 75427155, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.55859375, "step": 3488, "time_per_iteration": 2.489739179611206 }, { "auxiliary_loss_clip": 0.01080814, "auxiliary_loss_mlp": 0.0105861, "balance_loss_clip": 1.02096391, "balance_loss_mlp": 1.02325106, "epoch": 0.209770028558545, "flos": 18550470781440.0, "grad_norm": 4.540321485983759, "language_loss": 0.8176775, "learning_rate": 3.6704602234928945e-06, "loss": 0.83907175, "num_input_tokens_seen": 75444450, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.57421875, "step": 3489, "time_per_iteration": 2.512094497680664 }, { "auxiliary_loss_clip": 0.01080209, "auxiliary_loss_mlp": 0.01061789, "balance_loss_clip": 1.02223492, "balance_loss_mlp": 1.02297974, "epoch": 0.20983015181121298, "flos": 21615914776320.0, "grad_norm": 1.7954753571760769, "language_loss": 0.74746668, "learning_rate": 3.670246026613266e-06, "loss": 0.76888669, "num_input_tokens_seen": 75462625, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.5703125, "step": 3490, "time_per_iteration": 2.516413688659668 }, { "auxiliary_loss_clip": 0.01076208, "auxiliary_loss_mlp": 0.01058779, "balance_loss_clip": 1.02349317, "balance_loss_mlp": 1.02282238, "epoch": 0.20989027506388094, "flos": 16613873642880.0, "grad_norm": 1.810607671683018, "language_loss": 0.72204244, "learning_rate": 3.6700317663977415e-06, "loss": 0.74339229, "num_input_tokens_seen": 75480640, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.53515625, "step": 3491, "time_per_iteration": 2.4933903217315674 }, { "auxiliary_loss_clip": 0.01081335, "auxiliary_loss_mlp": 0.01058512, "balance_loss_clip": 1.01790929, "balance_loss_mlp": 1.02358115, "epoch": 0.20995039831654894, "flos": 23215844874240.0, "grad_norm": 2.2710123969800744, "language_loss": 0.81423163, "learning_rate": 3.669817442854444e-06, "loss": 0.83563012, "num_input_tokens_seen": 75494900, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.578125, "step": 3492, "time_per_iteration": 2.430368185043335 }, { "auxiliary_loss_clip": 0.0107906, "auxiliary_loss_mlp": 0.01058445, "balance_loss_clip": 1.02072716, "balance_loss_mlp": 1.02192986, "epoch": 0.2100105215692169, "flos": 18146595640320.0, "grad_norm": 1.8259088592094537, "language_loss": 0.88271517, "learning_rate": 3.669603055991502e-06, "loss": 0.90409029, "num_input_tokens_seen": 75513370, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.5703125, "step": 3493, "time_per_iteration": 2.5071730613708496 }, { "auxiliary_loss_clip": 0.01074479, "auxiliary_loss_mlp": 0.01057447, "balance_loss_clip": 1.02204204, "balance_loss_mlp": 1.02051008, "epoch": 0.21007064482188487, "flos": 15960683416320.0, "grad_norm": 1.5520883966444965, "language_loss": 0.71691978, "learning_rate": 3.6693886058170455e-06, "loss": 0.73823905, "num_input_tokens_seen": 75532480, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.5390625, "step": 3494, "time_per_iteration": 2.46099853515625 }, { "auxiliary_loss_clip": 0.01083311, "auxiliary_loss_mlp": 0.01058053, "balance_loss_clip": 1.02217078, "balance_loss_mlp": 1.02450395, "epoch": 0.21013076807455283, "flos": 32232932605440.0, "grad_norm": 1.6824633031720868, "language_loss": 0.80413532, "learning_rate": 3.6691740923392053e-06, "loss": 0.82554895, "num_input_tokens_seen": 75552745, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.5859375, "step": 3495, "time_per_iteration": 2.6416337490081787 }, { "auxiliary_loss_clip": 0.01080842, "auxiliary_loss_mlp": 0.010568, "balance_loss_clip": 1.02122819, "balance_loss_mlp": 1.02310038, "epoch": 0.2101908913272208, "flos": 23695481399040.0, "grad_norm": 1.524301454157636, "language_loss": 0.78966159, "learning_rate": 3.668959515566116e-06, "loss": 0.81103802, "num_input_tokens_seen": 75574355, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.578125, "step": 3496, "time_per_iteration": 4.051529884338379 }, { "auxiliary_loss_clip": 0.01082032, "auxiliary_loss_mlp": 0.0106195, "balance_loss_clip": 1.02292109, "balance_loss_mlp": 1.02404821, "epoch": 0.21025101457988876, "flos": 20374752476160.0, "grad_norm": 1.7153110884139955, "language_loss": 0.8350966, "learning_rate": 3.668744875505915e-06, "loss": 0.85653639, "num_input_tokens_seen": 75592215, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.578125, "step": 3497, "time_per_iteration": 2.4276392459869385 }, { "auxiliary_loss_clip": 0.01080588, "auxiliary_loss_mlp": 0.01054306, "balance_loss_clip": 1.01549101, "balance_loss_mlp": 1.02335525, "epoch": 0.21031113783255675, "flos": 25774454528640.0, "grad_norm": 2.3097452453200105, "language_loss": 0.69245642, "learning_rate": 3.668530172166741e-06, "loss": 0.71380532, "num_input_tokens_seen": 75610740, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.5703125, "step": 3498, "time_per_iteration": 5.447112083435059 }, { "auxiliary_loss_clip": 0.01080769, "auxiliary_loss_mlp": 0.01059955, "balance_loss_clip": 1.02412105, "balance_loss_mlp": 1.02241492, "epoch": 0.21037126108522472, "flos": 22017101742720.0, "grad_norm": 1.6364546343620845, "language_loss": 0.82991219, "learning_rate": 3.6683154055567352e-06, "loss": 0.85131943, "num_input_tokens_seen": 75631005, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.58203125, "step": 3499, "time_per_iteration": 2.4871790409088135 }, { "auxiliary_loss_clip": 0.01079005, "auxiliary_loss_mlp": 0.01046455, "balance_loss_clip": 1.01281428, "balance_loss_mlp": 1.02237368, "epoch": 0.21043138433789269, "flos": 25333327100160.0, "grad_norm": 1.6090654867025471, "language_loss": 0.79425931, "learning_rate": 3.668100575684043e-06, "loss": 0.81551385, "num_input_tokens_seen": 75650655, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.56640625, "step": 3500, "time_per_iteration": 3.939704656600952 }, { "auxiliary_loss_clip": 0.01077402, "auxiliary_loss_mlp": 0.01050442, "balance_loss_clip": 1.01467896, "balance_loss_mlp": 1.02084911, "epoch": 0.21049150759056065, "flos": 25555479281280.0, "grad_norm": 1.6052125454853359, "language_loss": 0.75827396, "learning_rate": 3.6678856825568094e-06, "loss": 0.77955246, "num_input_tokens_seen": 75669895, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.56640625, "step": 3501, "time_per_iteration": 2.524071216583252 }, { "auxiliary_loss_clip": 0.01075696, "auxiliary_loss_mlp": 0.01054475, "balance_loss_clip": 1.02042866, "balance_loss_mlp": 1.02074742, "epoch": 0.21055163084322862, "flos": 24494538752640.0, "grad_norm": 1.5169545182945614, "language_loss": 0.76563883, "learning_rate": 3.667670726183183e-06, "loss": 0.78694057, "num_input_tokens_seen": 75689535, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.55078125, "step": 3502, "time_per_iteration": 2.4781298637390137 }, { "auxiliary_loss_clip": 0.01076829, "auxiliary_loss_mlp": 0.01049451, "balance_loss_clip": 1.01409316, "balance_loss_mlp": 1.02208924, "epoch": 0.21061175409589658, "flos": 25737865557120.0, "grad_norm": 1.9403607815419364, "language_loss": 0.78484476, "learning_rate": 3.667455706571316e-06, "loss": 0.80610752, "num_input_tokens_seen": 75709265, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.546875, "step": 3503, "time_per_iteration": 2.4953548908233643 }, { "auxiliary_loss_clip": 0.01081256, "auxiliary_loss_mlp": 0.01068273, "balance_loss_clip": 1.02693176, "balance_loss_mlp": 1.02182102, "epoch": 0.21067187734856455, "flos": 18988176896640.0, "grad_norm": 2.7018364083388544, "language_loss": 0.81515193, "learning_rate": 3.6672406237293617e-06, "loss": 0.83664727, "num_input_tokens_seen": 75727050, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.59375, "step": 3504, "time_per_iteration": 2.428044080734253 }, { "auxiliary_loss_clip": 0.01081311, "auxiliary_loss_mlp": 0.01067906, "balance_loss_clip": 1.02885282, "balance_loss_mlp": 1.02261972, "epoch": 0.21073200060123254, "flos": 24680206696320.0, "grad_norm": 1.5373218428548072, "language_loss": 0.77947247, "learning_rate": 3.6670254776654754e-06, "loss": 0.80096471, "num_input_tokens_seen": 75747175, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.58984375, "step": 3505, "time_per_iteration": 2.495490550994873 }, { "auxiliary_loss_clip": 0.01073975, "auxiliary_loss_mlp": 0.01060924, "balance_loss_clip": 1.02623439, "balance_loss_mlp": 1.01998496, "epoch": 0.2107921238539005, "flos": 28548059535360.0, "grad_norm": 1.8592482758563134, "language_loss": 0.6498819, "learning_rate": 3.6668102683878163e-06, "loss": 0.67123085, "num_input_tokens_seen": 75767690, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.5390625, "step": 3506, "time_per_iteration": 2.545870065689087 }, { "auxiliary_loss_clip": 0.01078284, "auxiliary_loss_mlp": 0.01060732, "balance_loss_clip": 1.02465916, "balance_loss_mlp": 1.02273762, "epoch": 0.21085224710656847, "flos": 25884640379520.0, "grad_norm": 1.6441995037273525, "language_loss": 0.83744764, "learning_rate": 3.6665949959045443e-06, "loss": 0.8588379, "num_input_tokens_seen": 75787255, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.5546875, "step": 3507, "time_per_iteration": 2.5157885551452637 }, { "auxiliary_loss_clip": 0.01077584, "auxiliary_loss_mlp": 0.01052375, "balance_loss_clip": 1.01930594, "balance_loss_mlp": 1.02296734, "epoch": 0.21091237035923643, "flos": 14975399537280.0, "grad_norm": 1.9802373392745583, "language_loss": 0.77682662, "learning_rate": 3.666379660223824e-06, "loss": 0.79812622, "num_input_tokens_seen": 75805890, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.546875, "step": 3508, "time_per_iteration": 2.3967137336730957 }, { "auxiliary_loss_clip": 0.01081239, "auxiliary_loss_mlp": 0.01061688, "balance_loss_clip": 1.02351749, "balance_loss_mlp": 1.02317226, "epoch": 0.2109724936119044, "flos": 16361591091840.0, "grad_norm": 2.824364816398747, "language_loss": 0.87341201, "learning_rate": 3.6661642613538192e-06, "loss": 0.89484131, "num_input_tokens_seen": 75821620, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.58203125, "step": 3509, "time_per_iteration": 2.3933122158050537 }, { "auxiliary_loss_clip": 0.01080193, "auxiliary_loss_mlp": 0.01057155, "balance_loss_clip": 1.01996183, "balance_loss_mlp": 1.02307642, "epoch": 0.21103261686457236, "flos": 31501188086400.0, "grad_norm": 1.8902045118338804, "language_loss": 0.69544005, "learning_rate": 3.6659487993026987e-06, "loss": 0.71681356, "num_input_tokens_seen": 75842490, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.5703125, "step": 3510, "time_per_iteration": 2.577608346939087 }, { "auxiliary_loss_clip": 0.01077392, "auxiliary_loss_mlp": 0.01055244, "balance_loss_clip": 1.01957703, "balance_loss_mlp": 1.0203526, "epoch": 0.21109274011724033, "flos": 27342857802240.0, "grad_norm": 1.740796558637469, "language_loss": 0.74011064, "learning_rate": 3.6657332740786327e-06, "loss": 0.76143706, "num_input_tokens_seen": 75865985, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.5703125, "step": 3511, "time_per_iteration": 2.503680944442749 }, { "auxiliary_loss_clip": 0.01080123, "auxiliary_loss_mlp": 0.01058537, "balance_loss_clip": 1.01805365, "balance_loss_mlp": 1.0220145, "epoch": 0.21115286336990832, "flos": 17819459400960.0, "grad_norm": 2.465041930275446, "language_loss": 0.72048414, "learning_rate": 3.665517685689794e-06, "loss": 0.74187076, "num_input_tokens_seen": 75882745, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.578125, "step": 3512, "time_per_iteration": 2.3319976329803467 }, { "auxiliary_loss_clip": 0.01078772, "auxiliary_loss_mlp": 0.0105647, "balance_loss_clip": 1.01813281, "balance_loss_mlp": 1.02205968, "epoch": 0.2112129866225763, "flos": 27196781207040.0, "grad_norm": 1.5815788880746124, "language_loss": 0.74717116, "learning_rate": 3.6653020341443584e-06, "loss": 0.76852357, "num_input_tokens_seen": 75904305, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.5703125, "step": 3513, "time_per_iteration": 2.449000597000122 }, { "auxiliary_loss_clip": 0.01077166, "auxiliary_loss_mlp": 0.01047945, "balance_loss_clip": 1.01301718, "balance_loss_mlp": 1.02311826, "epoch": 0.21127310987524425, "flos": 23730185157120.0, "grad_norm": 2.0908572438768807, "language_loss": 0.76032579, "learning_rate": 3.665086319450502e-06, "loss": 0.78157687, "num_input_tokens_seen": 75923710, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.5390625, "step": 3514, "time_per_iteration": 2.39233660697937 }, { "auxiliary_loss_clip": 0.01080771, "auxiliary_loss_mlp": 0.01056532, "balance_loss_clip": 1.01662076, "balance_loss_mlp": 1.02309346, "epoch": 0.21133323312791222, "flos": 18331530445440.0, "grad_norm": 1.8918228637923897, "language_loss": 0.79036659, "learning_rate": 3.6648705416164062e-06, "loss": 0.81173968, "num_input_tokens_seen": 75942625, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.578125, "step": 3515, "time_per_iteration": 2.3582193851470947 }, { "auxiliary_loss_clip": 0.01078668, "auxiliary_loss_mlp": 0.01056974, "balance_loss_clip": 1.02218843, "balance_loss_mlp": 1.02252889, "epoch": 0.21139335638058018, "flos": 17930238744960.0, "grad_norm": 2.1427093922967884, "language_loss": 0.70468462, "learning_rate": 3.6646547006502518e-06, "loss": 0.72604102, "num_input_tokens_seen": 75959930, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.5625, "step": 3516, "time_per_iteration": 2.335310459136963 }, { "auxiliary_loss_clip": 0.01082461, "auxiliary_loss_mlp": 0.01065132, "balance_loss_clip": 1.02209759, "balance_loss_mlp": 1.02331352, "epoch": 0.21145347963324815, "flos": 24570928540800.0, "grad_norm": 1.6965930147025639, "language_loss": 0.86300814, "learning_rate": 3.664438796560225e-06, "loss": 0.88448411, "num_input_tokens_seen": 75980335, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.58984375, "step": 3517, "time_per_iteration": 2.418478488922119 }, { "auxiliary_loss_clip": 0.01077144, "auxiliary_loss_mlp": 0.01048735, "balance_loss_clip": 1.01130319, "balance_loss_mlp": 1.02088678, "epoch": 0.21151360288591614, "flos": 35844488087040.0, "grad_norm": 2.4284584833747163, "language_loss": 0.65358388, "learning_rate": 3.664222829354512e-06, "loss": 0.67484266, "num_input_tokens_seen": 76002095, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5625, "step": 3518, "time_per_iteration": 2.526299238204956 }, { "auxiliary_loss_clip": 0.01077193, "auxiliary_loss_mlp": 0.01053728, "balance_loss_clip": 1.01717877, "balance_loss_mlp": 1.02094662, "epoch": 0.2115737261385841, "flos": 24640510613760.0, "grad_norm": 1.9842426761824759, "language_loss": 0.91318178, "learning_rate": 3.664006799041303e-06, "loss": 0.93449104, "num_input_tokens_seen": 76020425, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.5625, "step": 3519, "time_per_iteration": 2.446359395980835 }, { "auxiliary_loss_clip": 0.01082936, "auxiliary_loss_mlp": 0.01059468, "balance_loss_clip": 1.01862645, "balance_loss_mlp": 1.02432239, "epoch": 0.21163384939125207, "flos": 25225759601280.0, "grad_norm": 1.5747457148479167, "language_loss": 0.82521361, "learning_rate": 3.6637907056287886e-06, "loss": 0.84663767, "num_input_tokens_seen": 76041210, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.5859375, "step": 3520, "time_per_iteration": 2.435229778289795 }, { "auxiliary_loss_clip": 0.01077297, "auxiliary_loss_mlp": 0.01051894, "balance_loss_clip": 1.01725137, "balance_loss_mlp": 1.02200842, "epoch": 0.21169397264392004, "flos": 26066328428160.0, "grad_norm": 1.5068059781286331, "language_loss": 0.77638853, "learning_rate": 3.6635745491251642e-06, "loss": 0.79768044, "num_input_tokens_seen": 76062685, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.5546875, "step": 3521, "time_per_iteration": 2.4477579593658447 }, { "auxiliary_loss_clip": 0.0107936, "auxiliary_loss_mlp": 0.01047629, "balance_loss_clip": 1.01327276, "balance_loss_mlp": 1.02280521, "epoch": 0.211754095896588, "flos": 23107264945920.0, "grad_norm": 1.9471875393671179, "language_loss": 0.76783288, "learning_rate": 3.663358329538626e-06, "loss": 0.78910285, "num_input_tokens_seen": 76082300, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.56640625, "step": 3522, "time_per_iteration": 2.378406524658203 }, { "auxiliary_loss_clip": 0.0107858, "auxiliary_loss_mlp": 0.01066391, "balance_loss_clip": 1.02864909, "balance_loss_mlp": 1.02222884, "epoch": 0.21181421914925597, "flos": 27921264163200.0, "grad_norm": 1.821971492874629, "language_loss": 0.71163213, "learning_rate": 3.663142046877374e-06, "loss": 0.73308188, "num_input_tokens_seen": 76101135, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.5625, "step": 3523, "time_per_iteration": 2.4742448329925537 }, { "auxiliary_loss_clip": 0.01080308, "auxiliary_loss_mlp": 0.0107032, "balance_loss_clip": 1.03422308, "balance_loss_mlp": 1.02364218, "epoch": 0.21187434240192393, "flos": 17127690255360.0, "grad_norm": 2.6182169141902, "language_loss": 0.793571, "learning_rate": 3.6629257011496085e-06, "loss": 0.8150773, "num_input_tokens_seen": 76119320, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.5703125, "step": 3524, "time_per_iteration": 2.386579751968384 }, { "auxiliary_loss_clip": 0.01077842, "auxiliary_loss_mlp": 0.01053302, "balance_loss_clip": 1.01479781, "balance_loss_mlp": 1.02127016, "epoch": 0.21193446565459192, "flos": 22346193018240.0, "grad_norm": 1.7680579400449392, "language_loss": 0.83069462, "learning_rate": 3.6627092923635338e-06, "loss": 0.85200608, "num_input_tokens_seen": 76137445, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.56640625, "step": 3525, "time_per_iteration": 2.421335220336914 }, { "auxiliary_loss_clip": 0.01081224, "auxiliary_loss_mlp": 0.0105613, "balance_loss_clip": 1.01898408, "balance_loss_mlp": 1.02399957, "epoch": 0.2119945889072599, "flos": 27198072927360.0, "grad_norm": 1.7354535220365397, "language_loss": 0.76685011, "learning_rate": 3.662492820527356e-06, "loss": 0.78822368, "num_input_tokens_seen": 76159500, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.5703125, "step": 3526, "time_per_iteration": 2.43612003326416 }, { "auxiliary_loss_clip": 0.0108017, "auxiliary_loss_mlp": 0.01053496, "balance_loss_clip": 1.01434767, "balance_loss_mlp": 1.02221513, "epoch": 0.21205471215992786, "flos": 20990934794880.0, "grad_norm": 2.105942925841992, "language_loss": 0.77934325, "learning_rate": 3.662276285649284e-06, "loss": 0.80067992, "num_input_tokens_seen": 76177990, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.578125, "step": 3527, "time_per_iteration": 2.446697473526001 }, { "auxiliary_loss_clip": 0.01075422, "auxiliary_loss_mlp": 0.01061918, "balance_loss_clip": 1.0229125, "balance_loss_mlp": 1.01974666, "epoch": 0.21211483541259582, "flos": 20776602758400.0, "grad_norm": 1.9034721732058224, "language_loss": 0.7951569, "learning_rate": 3.662059687737528e-06, "loss": 0.81653029, "num_input_tokens_seen": 76197125, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.55859375, "step": 3528, "time_per_iteration": 2.4107720851898193 }, { "auxiliary_loss_clip": 0.0107714, "auxiliary_loss_mlp": 0.01059639, "balance_loss_clip": 1.0217067, "balance_loss_mlp": 1.02140939, "epoch": 0.21217495866526379, "flos": 18988979857920.0, "grad_norm": 1.7807344436843526, "language_loss": 0.82401979, "learning_rate": 3.6618430268003024e-06, "loss": 0.84538758, "num_input_tokens_seen": 76216215, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.55859375, "step": 3529, "time_per_iteration": 2.3664729595184326 }, { "auxiliary_loss_clip": 0.01078866, "auxiliary_loss_mlp": 0.01065973, "balance_loss_clip": 1.02699125, "balance_loss_mlp": 1.02165914, "epoch": 0.21223508191793175, "flos": 20666277262080.0, "grad_norm": 1.9949807434699562, "language_loss": 0.78806585, "learning_rate": 3.6616263028458235e-06, "loss": 0.80951422, "num_input_tokens_seen": 76237010, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.5703125, "step": 3530, "time_per_iteration": 2.424936532974243 }, { "auxiliary_loss_clip": 0.01078064, "auxiliary_loss_mlp": 0.0104916, "balance_loss_clip": 1.01566267, "balance_loss_mlp": 1.02271843, "epoch": 0.21229520517059972, "flos": 21615391105920.0, "grad_norm": 3.6414356103505092, "language_loss": 0.85269988, "learning_rate": 3.661409515882308e-06, "loss": 0.87397206, "num_input_tokens_seen": 76255965, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.5546875, "step": 3531, "time_per_iteration": 2.4150054454803467 }, { "auxiliary_loss_clip": 0.0108037, "auxiliary_loss_mlp": 0.01053778, "balance_loss_clip": 1.01486778, "balance_loss_mlp": 1.02400529, "epoch": 0.2123553284232677, "flos": 13990185480960.0, "grad_norm": 2.6036539394578764, "language_loss": 0.74962109, "learning_rate": 3.661192665917977e-06, "loss": 0.77096254, "num_input_tokens_seen": 76272150, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.5625, "step": 3532, "time_per_iteration": 2.381605625152588 }, { "auxiliary_loss_clip": 0.01074943, "auxiliary_loss_mlp": 0.0105145, "balance_loss_clip": 1.01807165, "balance_loss_mlp": 1.0208236, "epoch": 0.21241545167593567, "flos": 18295779346560.0, "grad_norm": 1.6524261742824184, "language_loss": 0.75960129, "learning_rate": 3.660975752961054e-06, "loss": 0.78086519, "num_input_tokens_seen": 76291425, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.5390625, "step": 3533, "time_per_iteration": 2.3671417236328125 }, { "auxiliary_loss_clip": 0.01078039, "auxiliary_loss_mlp": 0.01052455, "balance_loss_clip": 1.01454663, "balance_loss_mlp": 1.02113557, "epoch": 0.21247557492860364, "flos": 34711731158400.0, "grad_norm": 1.679169777724469, "language_loss": 0.72003299, "learning_rate": 3.6607587770197634e-06, "loss": 0.7413379, "num_input_tokens_seen": 76313975, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5703125, "step": 3534, "time_per_iteration": 2.576613187789917 }, { "auxiliary_loss_clip": 0.01076444, "auxiliary_loss_mlp": 0.01050297, "balance_loss_clip": 1.01498747, "balance_loss_mlp": 1.02218151, "epoch": 0.2125356981812716, "flos": 22052748107520.0, "grad_norm": 2.3397176477962316, "language_loss": 0.73893762, "learning_rate": 3.6605417381023346e-06, "loss": 0.76020503, "num_input_tokens_seen": 76330955, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.54296875, "step": 3535, "time_per_iteration": 2.394279956817627 }, { "auxiliary_loss_clip": 0.01073877, "auxiliary_loss_mlp": 0.01055251, "balance_loss_clip": 1.02144361, "balance_loss_mlp": 1.02090681, "epoch": 0.21259582143393957, "flos": 28547082017280.0, "grad_norm": 1.9853459743827278, "language_loss": 0.72891223, "learning_rate": 3.660324636216996e-06, "loss": 0.75020349, "num_input_tokens_seen": 76352680, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.53125, "step": 3536, "time_per_iteration": 4.000921010971069 }, { "auxiliary_loss_clip": 0.01078003, "auxiliary_loss_mlp": 0.01052241, "balance_loss_clip": 1.015095, "balance_loss_mlp": 1.0221746, "epoch": 0.21265594468660753, "flos": 20119851573120.0, "grad_norm": 1.9679903315656282, "language_loss": 0.88912189, "learning_rate": 3.660107471371981e-06, "loss": 0.91042435, "num_input_tokens_seen": 76370750, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.55859375, "step": 3537, "time_per_iteration": 3.9771268367767334 }, { "auxiliary_loss_clip": 0.01073461, "auxiliary_loss_mlp": 0.01054259, "balance_loss_clip": 1.02166724, "balance_loss_mlp": 1.02073753, "epoch": 0.21271606793927553, "flos": 23075039894400.0, "grad_norm": 1.667033295500522, "language_loss": 0.82401311, "learning_rate": 3.659890243575524e-06, "loss": 0.8452903, "num_input_tokens_seen": 76390610, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.52734375, "step": 3538, "time_per_iteration": 3.8022384643554688 }, { "auxiliary_loss_clip": 0.01074949, "auxiliary_loss_mlp": 0.01051749, "balance_loss_clip": 1.01891899, "balance_loss_mlp": 1.02188754, "epoch": 0.2127761911919435, "flos": 26387215534080.0, "grad_norm": 1.6003455254754893, "language_loss": 0.88531435, "learning_rate": 3.659672952835863e-06, "loss": 0.90658134, "num_input_tokens_seen": 76408860, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.53125, "step": 3539, "time_per_iteration": 4.895423889160156 }, { "auxiliary_loss_clip": 0.01075665, "auxiliary_loss_mlp": 0.01054287, "balance_loss_clip": 1.01830924, "balance_loss_mlp": 1.02130723, "epoch": 0.21283631444461146, "flos": 20227279426560.0, "grad_norm": 2.896414643848383, "language_loss": 0.60012078, "learning_rate": 3.659455599161237e-06, "loss": 0.62142026, "num_input_tokens_seen": 76424980, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.54296875, "step": 3540, "time_per_iteration": 4.092722415924072 }, { "auxiliary_loss_clip": 0.01076839, "auxiliary_loss_mlp": 0.01056121, "balance_loss_clip": 1.02112126, "balance_loss_mlp": 1.02111316, "epoch": 0.21289643769727942, "flos": 13516134773760.0, "grad_norm": 2.226847074772495, "language_loss": 0.78767765, "learning_rate": 3.659238182559888e-06, "loss": 0.80900729, "num_input_tokens_seen": 76443135, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.55859375, "step": 3541, "time_per_iteration": 3.605156421661377 }, { "auxiliary_loss_clip": 0.01075684, "auxiliary_loss_mlp": 0.01052774, "balance_loss_clip": 1.01980042, "balance_loss_mlp": 1.02143431, "epoch": 0.2129565609499474, "flos": 24825864355200.0, "grad_norm": 4.605673856277093, "language_loss": 0.71839589, "learning_rate": 3.6590207030400615e-06, "loss": 0.73968053, "num_input_tokens_seen": 76462470, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.54296875, "step": 3542, "time_per_iteration": 3.5339195728302 }, { "auxiliary_loss_clip": 0.01073294, "auxiliary_loss_mlp": 0.0105435, "balance_loss_clip": 1.02209163, "balance_loss_mlp": 1.02043009, "epoch": 0.21301668420261535, "flos": 23658124377600.0, "grad_norm": 1.7624784051349671, "language_loss": 0.78376412, "learning_rate": 3.658803160610004e-06, "loss": 0.8050406, "num_input_tokens_seen": 76481995, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.53125, "step": 3543, "time_per_iteration": 2.5844695568084717 }, { "auxiliary_loss_clip": 0.01075783, "auxiliary_loss_mlp": 0.01053865, "balance_loss_clip": 1.01784003, "balance_loss_mlp": 1.02200592, "epoch": 0.21307680745528332, "flos": 16361870382720.0, "grad_norm": 2.011890661585965, "language_loss": 0.68564641, "learning_rate": 3.6585855552779634e-06, "loss": 0.70694286, "num_input_tokens_seen": 76500245, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.53515625, "step": 3544, "time_per_iteration": 2.4860470294952393 }, { "auxiliary_loss_clip": 0.01077432, "auxiliary_loss_mlp": 0.01050087, "balance_loss_clip": 1.01442003, "balance_loss_mlp": 1.02228761, "epoch": 0.2131369307079513, "flos": 19098048545280.0, "grad_norm": 1.8838346716196614, "language_loss": 0.7238847, "learning_rate": 3.6583678870521934e-06, "loss": 0.74515986, "num_input_tokens_seen": 76519535, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.55078125, "step": 3545, "time_per_iteration": 2.502211570739746 }, { "auxiliary_loss_clip": 0.01077857, "auxiliary_loss_mlp": 0.01057475, "balance_loss_clip": 1.02056777, "balance_loss_mlp": 1.02163672, "epoch": 0.21319705396061928, "flos": 30370979687040.0, "grad_norm": 1.6056765603748655, "language_loss": 0.73789716, "learning_rate": 3.658150155940946e-06, "loss": 0.75925046, "num_input_tokens_seen": 76542065, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.5625, "step": 3546, "time_per_iteration": 2.5712828636169434 }, { "auxiliary_loss_clip": 0.01077349, "auxiliary_loss_mlp": 0.01058524, "balance_loss_clip": 1.02316618, "balance_loss_mlp": 1.02269018, "epoch": 0.21325717721328724, "flos": 21755288390400.0, "grad_norm": 1.7571312181466094, "language_loss": 0.81322825, "learning_rate": 3.657932361952479e-06, "loss": 0.83458698, "num_input_tokens_seen": 76560540, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.546875, "step": 3547, "time_per_iteration": 2.5686182975769043 }, { "auxiliary_loss_clip": 0.01078799, "auxiliary_loss_mlp": 0.01059895, "balance_loss_clip": 1.02043629, "balance_loss_mlp": 1.02174842, "epoch": 0.2133173004659552, "flos": 28729607938560.0, "grad_norm": 2.7259620468008565, "language_loss": 0.77250826, "learning_rate": 3.6577145050950504e-06, "loss": 0.79389518, "num_input_tokens_seen": 76581760, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.5703125, "step": 3548, "time_per_iteration": 2.5431466102600098 }, { "auxiliary_loss_clip": 0.01082711, "auxiliary_loss_mlp": 0.01056197, "balance_loss_clip": 1.01623821, "balance_loss_mlp": 1.02413034, "epoch": 0.21337742371862317, "flos": 16836130558080.0, "grad_norm": 1.9597723203961, "language_loss": 0.75296259, "learning_rate": 3.657496585376922e-06, "loss": 0.7743516, "num_input_tokens_seen": 76599940, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 0.5859375, "step": 3549, "time_per_iteration": 2.4662280082702637 }, { "auxiliary_loss_clip": 0.01079935, "auxiliary_loss_mlp": 0.01058668, "balance_loss_clip": 1.02228498, "balance_loss_mlp": 1.02219725, "epoch": 0.21343754697129114, "flos": 24423804604800.0, "grad_norm": 1.7310685694299006, "language_loss": 0.81919748, "learning_rate": 3.657278602806357e-06, "loss": 0.8405835, "num_input_tokens_seen": 76619580, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.578125, "step": 3550, "time_per_iteration": 2.5224063396453857 }, { "auxiliary_loss_clip": 0.01074352, "auxiliary_loss_mlp": 0.01049389, "balance_loss_clip": 1.01715434, "balance_loss_mlp": 1.02145123, "epoch": 0.21349767022395913, "flos": 19276908773760.0, "grad_norm": 1.6590566137264122, "language_loss": 0.88705981, "learning_rate": 3.657060557391621e-06, "loss": 0.90829718, "num_input_tokens_seen": 76638195, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.53125, "step": 3551, "time_per_iteration": 2.5013210773468018 }, { "auxiliary_loss_clip": 0.01078325, "auxiliary_loss_mlp": 0.01052822, "balance_loss_clip": 1.01784635, "balance_loss_mlp": 1.0217371, "epoch": 0.2135577934766271, "flos": 17346595680000.0, "grad_norm": 2.0666097222841664, "language_loss": 0.84808642, "learning_rate": 3.656842449140983e-06, "loss": 0.86939788, "num_input_tokens_seen": 76656695, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.56640625, "step": 3552, "time_per_iteration": 2.4512176513671875 }, { "auxiliary_loss_clip": 0.01077283, "auxiliary_loss_mlp": 0.01058128, "balance_loss_clip": 1.0203383, "balance_loss_mlp": 1.0218184, "epoch": 0.21361791672929506, "flos": 24056169321600.0, "grad_norm": 1.7440992287025727, "language_loss": 0.78001273, "learning_rate": 3.656624278062713e-06, "loss": 0.80136687, "num_input_tokens_seen": 76677430, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.5546875, "step": 3553, "time_per_iteration": 2.526937246322632 }, { "auxiliary_loss_clip": 0.01076171, "auxiliary_loss_mlp": 0.01060145, "balance_loss_clip": 1.02612209, "balance_loss_mlp": 1.02222276, "epoch": 0.21367803998196302, "flos": 22161258213120.0, "grad_norm": 1.6218052140369443, "language_loss": 0.73949873, "learning_rate": 3.6564060441650843e-06, "loss": 0.76086187, "num_input_tokens_seen": 76697615, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.5390625, "step": 3554, "time_per_iteration": 2.5060367584228516 }, { "auxiliary_loss_clip": 0.01076968, "auxiliary_loss_mlp": 0.01054896, "balance_loss_clip": 1.02102828, "balance_loss_mlp": 1.02224636, "epoch": 0.213738163234631, "flos": 20885811091200.0, "grad_norm": 1.9929622491852619, "language_loss": 0.69887239, "learning_rate": 3.6561877474563724e-06, "loss": 0.720191, "num_input_tokens_seen": 76715685, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.546875, "step": 3555, "time_per_iteration": 2.5676944255828857 }, { "auxiliary_loss_clip": 0.01078282, "auxiliary_loss_mlp": 0.01056024, "balance_loss_clip": 1.01928389, "balance_loss_mlp": 1.02157593, "epoch": 0.21379828648729896, "flos": 28401843294720.0, "grad_norm": 2.0491694938709406, "language_loss": 0.6664049, "learning_rate": 3.6559693879448553e-06, "loss": 0.68774796, "num_input_tokens_seen": 76735405, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.56640625, "step": 3556, "time_per_iteration": 2.480003595352173 }, { "auxiliary_loss_clip": 0.01077046, "auxiliary_loss_mlp": 0.01061459, "balance_loss_clip": 1.02271557, "balance_loss_mlp": 1.02118921, "epoch": 0.21385840973996692, "flos": 25478600734080.0, "grad_norm": 1.9109193340671864, "language_loss": 0.74595028, "learning_rate": 3.6557509656388125e-06, "loss": 0.7673353, "num_input_tokens_seen": 76754395, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.55859375, "step": 3557, "time_per_iteration": 2.4668121337890625 }, { "auxiliary_loss_clip": 0.01079985, "auxiliary_loss_mlp": 0.01055189, "balance_loss_clip": 1.01661325, "balance_loss_mlp": 1.02180529, "epoch": 0.2139185329926349, "flos": 28073031310080.0, "grad_norm": 1.7917205395012528, "language_loss": 0.6936155, "learning_rate": 3.655532480546528e-06, "loss": 0.71496725, "num_input_tokens_seen": 76777210, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.58203125, "step": 3558, "time_per_iteration": 2.4951870441436768 }, { "auxiliary_loss_clip": 0.01079654, "auxiliary_loss_mlp": 0.01053426, "balance_loss_clip": 1.01587486, "balance_loss_mlp": 1.0208106, "epoch": 0.21397865624530288, "flos": 19607710705920.0, "grad_norm": 1.864253143308329, "language_loss": 0.816131, "learning_rate": 3.655313932676286e-06, "loss": 0.83746177, "num_input_tokens_seen": 76795830, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.58984375, "step": 3559, "time_per_iteration": 2.4470503330230713 }, { "auxiliary_loss_clip": 0.01075063, "auxiliary_loss_mlp": 0.01063641, "balance_loss_clip": 1.02895117, "balance_loss_mlp": 1.02080607, "epoch": 0.21403877949797084, "flos": 24680311430400.0, "grad_norm": 2.3465789422942445, "language_loss": 0.69249439, "learning_rate": 3.655095322036373e-06, "loss": 0.71388137, "num_input_tokens_seen": 76814700, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.54296875, "step": 3560, "time_per_iteration": 2.4751124382019043 }, { "auxiliary_loss_clip": 0.0108045, "auxiliary_loss_mlp": 0.01051752, "balance_loss_clip": 1.01699042, "balance_loss_mlp": 1.02241111, "epoch": 0.2140989027506388, "flos": 19860237636480.0, "grad_norm": 2.0518011038148596, "language_loss": 0.75555611, "learning_rate": 3.65487664863508e-06, "loss": 0.77687812, "num_input_tokens_seen": 76833400, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.578125, "step": 3561, "time_per_iteration": 2.4217565059661865 }, { "auxiliary_loss_clip": 0.01079073, "auxiliary_loss_mlp": 0.01061642, "balance_loss_clip": 1.02363825, "balance_loss_mlp": 1.02318048, "epoch": 0.21415902600330677, "flos": 19134323314560.0, "grad_norm": 2.528207343473672, "language_loss": 0.79578805, "learning_rate": 3.654657912480698e-06, "loss": 0.81719518, "num_input_tokens_seen": 76850645, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.5546875, "step": 3562, "time_per_iteration": 2.4195830821990967 }, { "auxiliary_loss_clip": 0.01078583, "auxiliary_loss_mlp": 0.010548, "balance_loss_clip": 1.01877487, "balance_loss_mlp": 1.02211988, "epoch": 0.21421914925597474, "flos": 22271548798080.0, "grad_norm": 1.4919946865586253, "language_loss": 0.85701585, "learning_rate": 3.6544391135815237e-06, "loss": 0.87834966, "num_input_tokens_seen": 76870135, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.5625, "step": 3563, "time_per_iteration": 2.4329442977905273 }, { "auxiliary_loss_clip": 0.01077155, "auxiliary_loss_mlp": 0.01049622, "balance_loss_clip": 1.01433599, "balance_loss_mlp": 1.02217293, "epoch": 0.2142792725086427, "flos": 33873710860800.0, "grad_norm": 1.5750206757641079, "language_loss": 0.78234357, "learning_rate": 3.6542202519458507e-06, "loss": 0.8036114, "num_input_tokens_seen": 76893905, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.55078125, "step": 3564, "time_per_iteration": 2.5424301624298096 }, { "auxiliary_loss_clip": 0.01078593, "auxiliary_loss_mlp": 0.01053941, "balance_loss_clip": 1.01853585, "balance_loss_mlp": 1.02208638, "epoch": 0.2143393957613107, "flos": 19859329941120.0, "grad_norm": 1.6790447528623056, "language_loss": 0.90101826, "learning_rate": 3.654001327581981e-06, "loss": 0.92234361, "num_input_tokens_seen": 76914205, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.5625, "step": 3565, "time_per_iteration": 2.4943928718566895 }, { "auxiliary_loss_clip": 0.01032553, "auxiliary_loss_mlp": 0.01020719, "balance_loss_clip": 1.01542652, "balance_loss_mlp": 1.01147461, "epoch": 0.21439951901397866, "flos": 68526891936000.0, "grad_norm": 0.883926374346326, "language_loss": 0.52250433, "learning_rate": 3.653782340498215e-06, "loss": 0.543037, "num_input_tokens_seen": 76975650, "router_z_loss_clip": 0.05297852, "router_z_loss_mlp": 0.2109375, "step": 3566, "time_per_iteration": 3.0206620693206787 }, { "auxiliary_loss_clip": 0.01076713, "auxiliary_loss_mlp": 0.01048933, "balance_loss_clip": 1.01367116, "balance_loss_mlp": 1.02275467, "epoch": 0.21445964226664663, "flos": 19681970901120.0, "grad_norm": 1.8683246294094795, "language_loss": 0.69002378, "learning_rate": 3.6535632907028566e-06, "loss": 0.71128023, "num_input_tokens_seen": 76992615, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.5390625, "step": 3567, "time_per_iteration": 2.400717258453369 }, { "auxiliary_loss_clip": 0.01077823, "auxiliary_loss_mlp": 0.01051263, "balance_loss_clip": 1.01688278, "balance_loss_mlp": 1.02386391, "epoch": 0.2145197655193146, "flos": 31105796405760.0, "grad_norm": 1.6801751952706347, "language_loss": 0.7498889, "learning_rate": 3.6533441782042126e-06, "loss": 0.77117974, "num_input_tokens_seen": 77017005, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.5390625, "step": 3568, "time_per_iteration": 2.532634735107422 }, { "auxiliary_loss_clip": 0.01082105, "auxiliary_loss_mlp": 0.01058109, "balance_loss_clip": 1.02091551, "balance_loss_mlp": 1.02528977, "epoch": 0.21457988877198256, "flos": 20119746839040.0, "grad_norm": 1.7178452630680678, "language_loss": 0.79574013, "learning_rate": 3.6531250030105917e-06, "loss": 0.81714225, "num_input_tokens_seen": 77034990, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.5703125, "step": 3569, "time_per_iteration": 2.4438276290893555 }, { "auxiliary_loss_clip": 0.01087303, "auxiliary_loss_mlp": 0.01060123, "balance_loss_clip": 1.01992536, "balance_loss_mlp": 1.02662635, "epoch": 0.21464001202465052, "flos": 18587059752960.0, "grad_norm": 2.198707820500583, "language_loss": 0.7209388, "learning_rate": 3.6529057651303053e-06, "loss": 0.74241304, "num_input_tokens_seen": 77052610, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.609375, "step": 3570, "time_per_iteration": 2.3739547729492188 }, { "auxiliary_loss_clip": 0.01082354, "auxiliary_loss_mlp": 0.01061694, "balance_loss_clip": 1.02373767, "balance_loss_mlp": 1.02446556, "epoch": 0.21470013527731852, "flos": 21834087062400.0, "grad_norm": 2.4370424525047976, "language_loss": 0.81743014, "learning_rate": 3.6526864645716666e-06, "loss": 0.83887058, "num_input_tokens_seen": 77072475, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.578125, "step": 3571, "time_per_iteration": 2.4606640338897705 }, { "auxiliary_loss_clip": 0.01082003, "auxiliary_loss_mlp": 0.01059801, "balance_loss_clip": 1.0197469, "balance_loss_mlp": 1.02480567, "epoch": 0.21476025852998648, "flos": 17602229721600.0, "grad_norm": 2.438234229883375, "language_loss": 0.85016525, "learning_rate": 3.652467101342991e-06, "loss": 0.87158328, "num_input_tokens_seen": 77089930, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 0.57421875, "step": 3572, "time_per_iteration": 2.3747923374176025 }, { "auxiliary_loss_clip": 0.01083394, "auxiliary_loss_mlp": 0.01066373, "balance_loss_clip": 1.02548444, "balance_loss_mlp": 1.02258468, "epoch": 0.21482038178265445, "flos": 24826946607360.0, "grad_norm": 2.350614727132885, "language_loss": 0.67205262, "learning_rate": 3.652247675452598e-06, "loss": 0.69355029, "num_input_tokens_seen": 77108970, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.609375, "step": 3573, "time_per_iteration": 2.455657958984375 }, { "auxiliary_loss_clip": 0.01076042, "auxiliary_loss_mlp": 0.01057024, "balance_loss_clip": 1.02054596, "balance_loss_mlp": 1.02249169, "epoch": 0.2148805050353224, "flos": 23257111968000.0, "grad_norm": 2.810532359432018, "language_loss": 0.77042526, "learning_rate": 3.652028186908807e-06, "loss": 0.79175591, "num_input_tokens_seen": 77126045, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.5390625, "step": 3574, "time_per_iteration": 2.4501664638519287 }, { "auxiliary_loss_clip": 0.01076666, "auxiliary_loss_mlp": 0.01058528, "balance_loss_clip": 1.0214541, "balance_loss_mlp": 1.02212727, "epoch": 0.21494062828799038, "flos": 21320130804480.0, "grad_norm": 2.214477791736336, "language_loss": 0.75522923, "learning_rate": 3.6518086357199416e-06, "loss": 0.77658117, "num_input_tokens_seen": 77144600, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.546875, "step": 3575, "time_per_iteration": 3.8628697395324707 }, { "auxiliary_loss_clip": 0.01079163, "auxiliary_loss_mlp": 0.01060893, "balance_loss_clip": 1.02322268, "balance_loss_mlp": 1.02405834, "epoch": 0.21500075154065834, "flos": 18842344680960.0, "grad_norm": 1.6358474657294553, "language_loss": 0.70347989, "learning_rate": 3.6515890218943277e-06, "loss": 0.72488046, "num_input_tokens_seen": 77162965, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.55078125, "step": 3576, "time_per_iteration": 2.398956060409546 }, { "auxiliary_loss_clip": 0.01079407, "auxiliary_loss_mlp": 0.0105865, "balance_loss_clip": 1.0186429, "balance_loss_mlp": 1.02227306, "epoch": 0.2150608747933263, "flos": 18441018069120.0, "grad_norm": 2.08734167681856, "language_loss": 0.90776616, "learning_rate": 3.651369345440292e-06, "loss": 0.92914677, "num_input_tokens_seen": 77179960, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.5703125, "step": 3577, "time_per_iteration": 5.174992322921753 }, { "auxiliary_loss_clip": 0.01028243, "auxiliary_loss_mlp": 0.01040673, "balance_loss_clip": 1.03452218, "balance_loss_mlp": 1.00926816, "epoch": 0.2151209980459943, "flos": 66595042742400.0, "grad_norm": 0.8360473313004207, "language_loss": 0.56211936, "learning_rate": 3.6511496063661654e-06, "loss": 0.58280855, "num_input_tokens_seen": 77239500, "router_z_loss_clip": 0.06152344, "router_z_loss_mlp": 0.18945312, "step": 3578, "time_per_iteration": 3.00282883644104 }, { "auxiliary_loss_clip": 0.01078035, "auxiliary_loss_mlp": 0.01058616, "balance_loss_clip": 1.02302027, "balance_loss_mlp": 1.02239394, "epoch": 0.21518112129866226, "flos": 21574926973440.0, "grad_norm": 1.6506500438371856, "language_loss": 0.89359319, "learning_rate": 3.6509298046802807e-06, "loss": 0.91495973, "num_input_tokens_seen": 77254680, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.5546875, "step": 3579, "time_per_iteration": 3.779792070388794 }, { "auxiliary_loss_clip": 0.01081093, "auxiliary_loss_mlp": 0.01061283, "balance_loss_clip": 1.0218966, "balance_loss_mlp": 1.02372634, "epoch": 0.21524124455133023, "flos": 20046603807360.0, "grad_norm": 1.8169005984089306, "language_loss": 0.79435802, "learning_rate": 3.650709940390972e-06, "loss": 0.81578183, "num_input_tokens_seen": 77274060, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.57421875, "step": 3580, "time_per_iteration": 2.3938746452331543 }, { "auxiliary_loss_clip": 0.01077655, "auxiliary_loss_mlp": 0.01053594, "balance_loss_clip": 1.01914263, "balance_loss_mlp": 1.02311492, "epoch": 0.2153013678039982, "flos": 23950661592960.0, "grad_norm": 1.667991509928619, "language_loss": 0.7543329, "learning_rate": 3.6504900135065775e-06, "loss": 0.77564538, "num_input_tokens_seen": 77293255, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.546875, "step": 3581, "time_per_iteration": 2.4758265018463135 }, { "auxiliary_loss_clip": 0.01080988, "auxiliary_loss_mlp": 0.0105889, "balance_loss_clip": 1.0206238, "balance_loss_mlp": 1.02541649, "epoch": 0.21536149105666616, "flos": 20593797546240.0, "grad_norm": 2.8453311658675013, "language_loss": 0.72998321, "learning_rate": 3.6502700240354357e-06, "loss": 0.75138199, "num_input_tokens_seen": 77312390, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.5546875, "step": 3582, "time_per_iteration": 2.3840277194976807 }, { "auxiliary_loss_clip": 0.0108119, "auxiliary_loss_mlp": 0.01066484, "balance_loss_clip": 1.02864742, "balance_loss_mlp": 1.02507997, "epoch": 0.21542161430933413, "flos": 12859209031680.0, "grad_norm": 2.63675628288287, "language_loss": 0.8717798, "learning_rate": 3.650049971985889e-06, "loss": 0.89325649, "num_input_tokens_seen": 77330985, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.5625, "step": 3583, "time_per_iteration": 2.4051544666290283 }, { "auxiliary_loss_clip": 0.01085134, "auxiliary_loss_mlp": 0.01069924, "balance_loss_clip": 1.03392315, "balance_loss_mlp": 1.02754772, "epoch": 0.21548173756200212, "flos": 26102742842880.0, "grad_norm": 2.192854479089798, "language_loss": 0.85377038, "learning_rate": 3.6498298573662824e-06, "loss": 0.87532103, "num_input_tokens_seen": 77350770, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.578125, "step": 3584, "time_per_iteration": 2.428769826889038 }, { "auxiliary_loss_clip": 0.01079161, "auxiliary_loss_mlp": 0.01058527, "balance_loss_clip": 1.02481508, "balance_loss_mlp": 1.02545786, "epoch": 0.21554186081467008, "flos": 22162689578880.0, "grad_norm": 1.7890503104416033, "language_loss": 0.91852522, "learning_rate": 3.6496096801849625e-06, "loss": 0.93990207, "num_input_tokens_seen": 77370510, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.5390625, "step": 3585, "time_per_iteration": 2.4998252391815186 }, { "auxiliary_loss_clip": 0.01077453, "auxiliary_loss_mlp": 0.01060302, "balance_loss_clip": 1.02809143, "balance_loss_mlp": 1.02400374, "epoch": 0.21560198406733805, "flos": 22965622093440.0, "grad_norm": 1.7529066461844183, "language_loss": 0.76169896, "learning_rate": 3.649389440450277e-06, "loss": 0.78307641, "num_input_tokens_seen": 77390645, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.53125, "step": 3586, "time_per_iteration": 2.3942840099334717 }, { "auxiliary_loss_clip": 0.01080528, "auxiliary_loss_mlp": 0.01060243, "balance_loss_clip": 1.02571964, "balance_loss_mlp": 1.02416503, "epoch": 0.215662107320006, "flos": 22782956526720.0, "grad_norm": 1.7268256844416523, "language_loss": 0.83908904, "learning_rate": 3.6491691381705804e-06, "loss": 0.86049676, "num_input_tokens_seen": 77409655, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.5625, "step": 3587, "time_per_iteration": 2.4388835430145264 }, { "auxiliary_loss_clip": 0.01078793, "auxiliary_loss_mlp": 0.0106185, "balance_loss_clip": 1.02670741, "balance_loss_mlp": 1.02412879, "epoch": 0.21572223057267398, "flos": 30882527061120.0, "grad_norm": 1.7517542120606797, "language_loss": 0.7648021, "learning_rate": 3.648948773354224e-06, "loss": 0.78620857, "num_input_tokens_seen": 77430560, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.546875, "step": 3588, "time_per_iteration": 2.4840939044952393 }, { "auxiliary_loss_clip": 0.01077395, "auxiliary_loss_mlp": 0.01064873, "balance_loss_clip": 1.03070712, "balance_loss_mlp": 1.02225709, "epoch": 0.21578235382534194, "flos": 26909166493440.0, "grad_norm": 1.9359750400478146, "language_loss": 0.82516342, "learning_rate": 3.6487283460095643e-06, "loss": 0.84658611, "num_input_tokens_seen": 77455000, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.55078125, "step": 3589, "time_per_iteration": 2.4737401008605957 }, { "auxiliary_loss_clip": 0.01075756, "auxiliary_loss_mlp": 0.0105846, "balance_loss_clip": 1.02651215, "balance_loss_mlp": 1.0216347, "epoch": 0.2158424770780099, "flos": 24424572654720.0, "grad_norm": 1.8140386543093965, "language_loss": 0.74956858, "learning_rate": 3.648507856144961e-06, "loss": 0.77091074, "num_input_tokens_seen": 77475075, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.5390625, "step": 3590, "time_per_iteration": 2.409327983856201 }, { "auxiliary_loss_clip": 0.01077362, "auxiliary_loss_mlp": 0.01061316, "balance_loss_clip": 1.02400386, "balance_loss_mlp": 1.02106369, "epoch": 0.2159026003306779, "flos": 23948881113600.0, "grad_norm": 1.706182936332368, "language_loss": 0.85513538, "learning_rate": 3.648287303768775e-06, "loss": 0.87652218, "num_input_tokens_seen": 77495945, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.5625, "step": 3591, "time_per_iteration": 2.4395828247070312 }, { "auxiliary_loss_clip": 0.01078061, "auxiliary_loss_mlp": 0.0106039, "balance_loss_clip": 1.02226698, "balance_loss_mlp": 1.0207262, "epoch": 0.21596272358334587, "flos": 30039758818560.0, "grad_norm": 1.841778292137882, "language_loss": 0.7077713, "learning_rate": 3.6480666888893686e-06, "loss": 0.72915578, "num_input_tokens_seen": 77517140, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.57421875, "step": 3592, "time_per_iteration": 2.519544839859009 }, { "auxiliary_loss_clip": 0.01077606, "auxiliary_loss_mlp": 0.01052168, "balance_loss_clip": 1.01726413, "balance_loss_mlp": 1.02181697, "epoch": 0.21602284683601383, "flos": 20375171412480.0, "grad_norm": 2.560179532515289, "language_loss": 0.86102891, "learning_rate": 3.647846011515108e-06, "loss": 0.8823266, "num_input_tokens_seen": 77536085, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.5546875, "step": 3593, "time_per_iteration": 2.483529806137085 }, { "auxiliary_loss_clip": 0.01078269, "auxiliary_loss_mlp": 0.01057896, "balance_loss_clip": 1.02108431, "balance_loss_mlp": 1.02242398, "epoch": 0.2160829700886818, "flos": 20776288556160.0, "grad_norm": 2.3936630583189085, "language_loss": 0.77533627, "learning_rate": 3.6476252716543625e-06, "loss": 0.79669791, "num_input_tokens_seen": 77553675, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.55859375, "step": 3594, "time_per_iteration": 2.3891115188598633 }, { "auxiliary_loss_clip": 0.01073026, "auxiliary_loss_mlp": 0.01049983, "balance_loss_clip": 1.01770091, "balance_loss_mlp": 1.02048504, "epoch": 0.21614309334134976, "flos": 22308661440000.0, "grad_norm": 1.5475976172832224, "language_loss": 0.81817615, "learning_rate": 3.6474044693155007e-06, "loss": 0.83940625, "num_input_tokens_seen": 77573360, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.5234375, "step": 3595, "time_per_iteration": 2.4194211959838867 }, { "auxiliary_loss_clip": 0.01080284, "auxiliary_loss_mlp": 0.01050928, "balance_loss_clip": 1.01487911, "balance_loss_mlp": 1.02340221, "epoch": 0.21620321659401773, "flos": 19608513667200.0, "grad_norm": 1.9403694035521886, "language_loss": 0.80626571, "learning_rate": 3.647183604506897e-06, "loss": 0.82757777, "num_input_tokens_seen": 77591865, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.5703125, "step": 3596, "time_per_iteration": 2.3840231895446777 }, { "auxiliary_loss_clip": 0.01076966, "auxiliary_loss_mlp": 0.01052329, "balance_loss_clip": 1.01747227, "balance_loss_mlp": 1.02330077, "epoch": 0.2162633398466857, "flos": 18843531667200.0, "grad_norm": 1.5411130393797947, "language_loss": 0.85201538, "learning_rate": 3.6469626772369253e-06, "loss": 0.87330836, "num_input_tokens_seen": 77611600, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.53515625, "step": 3597, "time_per_iteration": 2.3868656158447266 }, { "auxiliary_loss_clip": 0.01077778, "auxiliary_loss_mlp": 0.01055636, "balance_loss_clip": 1.01849079, "balance_loss_mlp": 1.02268863, "epoch": 0.21632346309935369, "flos": 18767875017600.0, "grad_norm": 2.8426852217499463, "language_loss": 0.8139987, "learning_rate": 3.6467416875139642e-06, "loss": 0.83533281, "num_input_tokens_seen": 77630665, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.55078125, "step": 3598, "time_per_iteration": 2.3663063049316406 }, { "auxiliary_loss_clip": 0.01078938, "auxiliary_loss_mlp": 0.0105749, "balance_loss_clip": 1.0190804, "balance_loss_mlp": 1.02199316, "epoch": 0.21638358635202165, "flos": 26322939987840.0, "grad_norm": 2.179651054724553, "language_loss": 0.83550125, "learning_rate": 3.6465206353463934e-06, "loss": 0.85686553, "num_input_tokens_seen": 77650835, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.5703125, "step": 3599, "time_per_iteration": 2.449903964996338 }, { "auxiliary_loss_clip": 0.01077025, "auxiliary_loss_mlp": 0.01054863, "balance_loss_clip": 1.01948118, "balance_loss_mlp": 1.0235368, "epoch": 0.21644370960468962, "flos": 20739804318720.0, "grad_norm": 2.102827965564334, "language_loss": 0.7794655, "learning_rate": 3.6462995207425947e-06, "loss": 0.80078435, "num_input_tokens_seen": 77669000, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.53125, "step": 3600, "time_per_iteration": 2.400702953338623 }, { "auxiliary_loss_clip": 0.01077151, "auxiliary_loss_mlp": 0.01061535, "balance_loss_clip": 1.02770364, "balance_loss_mlp": 1.02125096, "epoch": 0.21650383285735758, "flos": 23951080529280.0, "grad_norm": 1.921282567487024, "language_loss": 0.81958187, "learning_rate": 3.6460783437109533e-06, "loss": 0.84096873, "num_input_tokens_seen": 77688745, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.55859375, "step": 3601, "time_per_iteration": 2.449150323867798 }, { "auxiliary_loss_clip": 0.01079766, "auxiliary_loss_mlp": 0.01059277, "balance_loss_clip": 1.02384758, "balance_loss_mlp": 1.02421272, "epoch": 0.21656395611002555, "flos": 23694957728640.0, "grad_norm": 2.1117338382257107, "language_loss": 0.85284764, "learning_rate": 3.6458571042598565e-06, "loss": 0.87423807, "num_input_tokens_seen": 77708445, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.5546875, "step": 3602, "time_per_iteration": 2.471130132675171 }, { "auxiliary_loss_clip": 0.01078156, "auxiliary_loss_mlp": 0.01056977, "balance_loss_clip": 1.01701808, "balance_loss_mlp": 1.02177525, "epoch": 0.2166240793626935, "flos": 20665055364480.0, "grad_norm": 1.77972753948087, "language_loss": 0.76590347, "learning_rate": 3.645635802397693e-06, "loss": 0.78725481, "num_input_tokens_seen": 77728465, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 0.5625, "step": 3603, "time_per_iteration": 2.4325335025787354 }, { "auxiliary_loss_clip": 0.01076662, "auxiliary_loss_mlp": 0.01051139, "balance_loss_clip": 1.01580524, "balance_loss_mlp": 1.02152133, "epoch": 0.2166842026153615, "flos": 21579325804800.0, "grad_norm": 2.0257707522519426, "language_loss": 0.75924897, "learning_rate": 3.645414438132855e-06, "loss": 0.780527, "num_input_tokens_seen": 77746735, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.55078125, "step": 3604, "time_per_iteration": 2.421057939529419 }, { "auxiliary_loss_clip": 0.0107576, "auxiliary_loss_mlp": 0.01045172, "balance_loss_clip": 1.01122129, "balance_loss_mlp": 1.02177656, "epoch": 0.21674432586802947, "flos": 25628761958400.0, "grad_norm": 1.7708673281509733, "language_loss": 0.8166647, "learning_rate": 3.6451930114737366e-06, "loss": 0.83787405, "num_input_tokens_seen": 77768105, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.5390625, "step": 3605, "time_per_iteration": 2.4958317279815674 }, { "auxiliary_loss_clip": 0.01035933, "auxiliary_loss_mlp": 0.01009201, "balance_loss_clip": 0.99971181, "balance_loss_mlp": 1.00902617, "epoch": 0.21680444912069743, "flos": 56414893155840.0, "grad_norm": 0.7560061165901457, "language_loss": 0.58413363, "learning_rate": 3.6449715224287347e-06, "loss": 0.60458493, "num_input_tokens_seen": 77833750, "router_z_loss_clip": 0.09472656, "router_z_loss_mlp": 0.26953125, "step": 3606, "time_per_iteration": 3.1175642013549805 }, { "auxiliary_loss_clip": 0.01078942, "auxiliary_loss_mlp": 0.01059401, "balance_loss_clip": 1.01886988, "balance_loss_mlp": 1.02092862, "epoch": 0.2168645723733654, "flos": 23877797852160.0, "grad_norm": 2.075685618677511, "language_loss": 0.74941123, "learning_rate": 3.644749971006248e-06, "loss": 0.77079463, "num_input_tokens_seen": 77853780, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.578125, "step": 3607, "time_per_iteration": 2.4181196689605713 }, { "auxiliary_loss_clip": 0.01080918, "auxiliary_loss_mlp": 0.01057543, "balance_loss_clip": 1.01870489, "balance_loss_mlp": 1.02289867, "epoch": 0.21692469562603336, "flos": 16945234156800.0, "grad_norm": 2.0333566061137707, "language_loss": 0.78896642, "learning_rate": 3.6445283572146765e-06, "loss": 0.81035101, "num_input_tokens_seen": 77872575, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.578125, "step": 3608, "time_per_iteration": 2.3698537349700928 }, { "auxiliary_loss_clip": 0.01079013, "auxiliary_loss_mlp": 0.01057049, "balance_loss_clip": 1.01954579, "balance_loss_mlp": 1.02161407, "epoch": 0.21698481887870133, "flos": 25117877900160.0, "grad_norm": 1.797916668853597, "language_loss": 0.75989258, "learning_rate": 3.6443066810624255e-06, "loss": 0.78125316, "num_input_tokens_seen": 77892700, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.57421875, "step": 3609, "time_per_iteration": 2.435075044631958 }, { "auxiliary_loss_clip": 0.01078535, "auxiliary_loss_mlp": 0.0105451, "balance_loss_clip": 1.01784086, "balance_loss_mlp": 1.02209151, "epoch": 0.2170449421313693, "flos": 17893719596160.0, "grad_norm": 40.10880221878159, "language_loss": 0.90571076, "learning_rate": 3.6440849425579e-06, "loss": 0.92704117, "num_input_tokens_seen": 77911060, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.56640625, "step": 3610, "time_per_iteration": 2.3640034198760986 }, { "auxiliary_loss_clip": 0.01075983, "auxiliary_loss_mlp": 0.01052004, "balance_loss_clip": 1.01678967, "balance_loss_mlp": 1.02114391, "epoch": 0.2171050653840373, "flos": 22637333779200.0, "grad_norm": 1.5743607102706705, "language_loss": 0.79266703, "learning_rate": 3.6438631417095095e-06, "loss": 0.81394684, "num_input_tokens_seen": 77929930, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.546875, "step": 3611, "time_per_iteration": 2.4144301414489746 }, { "auxiliary_loss_clip": 0.01077189, "auxiliary_loss_mlp": 0.01063537, "balance_loss_clip": 1.02703524, "balance_loss_mlp": 1.02165937, "epoch": 0.21716518863670525, "flos": 19498991132160.0, "grad_norm": 1.8865436480322173, "language_loss": 0.6514371, "learning_rate": 3.6436412785256637e-06, "loss": 0.67284435, "num_input_tokens_seen": 77949060, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.5546875, "step": 3612, "time_per_iteration": 2.3780720233917236 }, { "auxiliary_loss_clip": 0.01077525, "auxiliary_loss_mlp": 0.01063038, "balance_loss_clip": 1.02539194, "balance_loss_mlp": 1.02107406, "epoch": 0.21722531188937322, "flos": 19791004677120.0, "grad_norm": 1.918481300859056, "language_loss": 0.77652746, "learning_rate": 3.643419353014776e-06, "loss": 0.7979331, "num_input_tokens_seen": 77967920, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.5625, "step": 3613, "time_per_iteration": 2.398261785507202 }, { "auxiliary_loss_clip": 0.0107591, "auxiliary_loss_mlp": 0.01065041, "balance_loss_clip": 1.0259409, "balance_loss_mlp": 1.02086699, "epoch": 0.21728543514204118, "flos": 13333539029760.0, "grad_norm": 1.9527688733194357, "language_loss": 0.73295522, "learning_rate": 3.643197365185261e-06, "loss": 0.75436473, "num_input_tokens_seen": 77985330, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.55078125, "step": 3614, "time_per_iteration": 3.81750750541687 }, { "auxiliary_loss_clip": 0.01076089, "auxiliary_loss_mlp": 0.01057502, "balance_loss_clip": 1.02154899, "balance_loss_mlp": 1.02087808, "epoch": 0.21734555839470915, "flos": 15230963756160.0, "grad_norm": 1.644551007125277, "language_loss": 0.74653929, "learning_rate": 3.6429753150455378e-06, "loss": 0.76787519, "num_input_tokens_seen": 78003105, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.55078125, "step": 3615, "time_per_iteration": 2.3857126235961914 }, { "auxiliary_loss_clip": 0.01081006, "auxiliary_loss_mlp": 0.01058644, "balance_loss_clip": 1.0186379, "balance_loss_mlp": 1.02186024, "epoch": 0.2174056816473771, "flos": 19972972016640.0, "grad_norm": 2.198393141482538, "language_loss": 0.9181639, "learning_rate": 3.6427532026040263e-06, "loss": 0.93956053, "num_input_tokens_seen": 78019655, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 0.59375, "step": 3616, "time_per_iteration": 3.7836973667144775 }, { "auxiliary_loss_clip": 0.01077658, "auxiliary_loss_mlp": 0.01057392, "balance_loss_clip": 1.01802945, "balance_loss_mlp": 1.02116609, "epoch": 0.21746580490004508, "flos": 16686458092800.0, "grad_norm": 3.025435472576452, "language_loss": 0.8354528, "learning_rate": 3.642531027869148e-06, "loss": 0.8568033, "num_input_tokens_seen": 78036025, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.56640625, "step": 3617, "time_per_iteration": 3.714735984802246 }, { "auxiliary_loss_clip": 0.01079559, "auxiliary_loss_mlp": 0.01059278, "balance_loss_clip": 1.02043986, "balance_loss_mlp": 1.02177596, "epoch": 0.21752592815271307, "flos": 25771207772160.0, "grad_norm": 2.4405616283302227, "language_loss": 0.76465499, "learning_rate": 3.642308790849329e-06, "loss": 0.78604329, "num_input_tokens_seen": 78055645, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.578125, "step": 3618, "time_per_iteration": 3.788785219192505 }, { "auxiliary_loss_clip": 0.01078525, "auxiliary_loss_mlp": 0.01060816, "balance_loss_clip": 1.02214408, "balance_loss_mlp": 1.02132678, "epoch": 0.21758605140538104, "flos": 11253902584320.0, "grad_norm": 1.9530326784205676, "language_loss": 0.71730661, "learning_rate": 3.642086491552996e-06, "loss": 0.73869997, "num_input_tokens_seen": 78071660, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.5703125, "step": 3619, "time_per_iteration": 2.3366973400115967 }, { "auxiliary_loss_clip": 0.01079288, "auxiliary_loss_mlp": 0.01059566, "balance_loss_clip": 1.02039349, "balance_loss_mlp": 1.02140534, "epoch": 0.217646174658049, "flos": 19241681345280.0, "grad_norm": 1.7006236640415535, "language_loss": 0.79327059, "learning_rate": 3.641864129988579e-06, "loss": 0.81465912, "num_input_tokens_seen": 78091265, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.578125, "step": 3620, "time_per_iteration": 2.422985076904297 }, { "auxiliary_loss_clip": 0.01073879, "auxiliary_loss_mlp": 0.01052213, "balance_loss_clip": 1.01439977, "balance_loss_mlp": 1.02006173, "epoch": 0.21770629791071697, "flos": 21943993622400.0, "grad_norm": 1.4742900372777883, "language_loss": 0.81288534, "learning_rate": 3.641641706164509e-06, "loss": 0.83414626, "num_input_tokens_seen": 78110095, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5390625, "step": 3621, "time_per_iteration": 2.426555871963501 }, { "auxiliary_loss_clip": 0.01075137, "auxiliary_loss_mlp": 0.0104448, "balance_loss_clip": 1.01069641, "balance_loss_mlp": 1.01961946, "epoch": 0.21776642116338493, "flos": 24935596358400.0, "grad_norm": 1.9743310511914032, "language_loss": 0.89041531, "learning_rate": 3.641419220089221e-06, "loss": 0.91161144, "num_input_tokens_seen": 78129475, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.5546875, "step": 3622, "time_per_iteration": 2.4637105464935303 }, { "auxiliary_loss_clip": 0.01079055, "auxiliary_loss_mlp": 0.01053948, "balance_loss_clip": 1.01379824, "balance_loss_mlp": 1.02222657, "epoch": 0.2178265444160529, "flos": 17820367096320.0, "grad_norm": 2.6543261005162537, "language_loss": 0.79788852, "learning_rate": 3.641196671771152e-06, "loss": 0.81921852, "num_input_tokens_seen": 78146880, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.5703125, "step": 3623, "time_per_iteration": 2.391129493713379 }, { "auxiliary_loss_clip": 0.01080691, "auxiliary_loss_mlp": 0.01067771, "balance_loss_clip": 1.02721643, "balance_loss_mlp": 1.02127624, "epoch": 0.2178866676687209, "flos": 17711926813440.0, "grad_norm": 1.9790541424059598, "language_loss": 0.85859627, "learning_rate": 3.640974061218741e-06, "loss": 0.88008088, "num_input_tokens_seen": 78165065, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.59375, "step": 3624, "time_per_iteration": 2.4465255737304688 }, { "auxiliary_loss_clip": 0.01080506, "auxiliary_loss_mlp": 0.01062274, "balance_loss_clip": 1.02264917, "balance_loss_mlp": 1.02290726, "epoch": 0.21794679092138886, "flos": 16944919954560.0, "grad_norm": 2.6435606316248506, "language_loss": 0.80112731, "learning_rate": 3.640751388440429e-06, "loss": 0.82255512, "num_input_tokens_seen": 78180005, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.57421875, "step": 3625, "time_per_iteration": 2.4003117084503174 }, { "auxiliary_loss_clip": 0.01038884, "auxiliary_loss_mlp": 0.01027948, "balance_loss_clip": 1.0205096, "balance_loss_mlp": 1.01311755, "epoch": 0.21800691417405682, "flos": 63715371425280.0, "grad_norm": 0.8215005596152994, "language_loss": 0.6072557, "learning_rate": 3.64052865344466e-06, "loss": 0.62792403, "num_input_tokens_seen": 78245350, "router_z_loss_clip": 0.07421875, "router_z_loss_mlp": 0.2578125, "step": 3626, "time_per_iteration": 3.1468913555145264 }, { "auxiliary_loss_clip": 0.01080367, "auxiliary_loss_mlp": 0.01059748, "balance_loss_clip": 1.02086186, "balance_loss_mlp": 1.02095318, "epoch": 0.21806703742672479, "flos": 21615321283200.0, "grad_norm": 2.2255081583196605, "language_loss": 0.92065489, "learning_rate": 3.6403058562398795e-06, "loss": 0.94205594, "num_input_tokens_seen": 78264165, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.59375, "step": 3627, "time_per_iteration": 2.4180872440338135 }, { "auxiliary_loss_clip": 0.01076967, "auxiliary_loss_mlp": 0.01051187, "balance_loss_clip": 1.01401734, "balance_loss_mlp": 1.02035201, "epoch": 0.21812716067939275, "flos": 19353857143680.0, "grad_norm": 4.361490247217413, "language_loss": 0.75762403, "learning_rate": 3.6400829968345365e-06, "loss": 0.77890563, "num_input_tokens_seen": 78283745, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.56640625, "step": 3628, "time_per_iteration": 2.4645495414733887 }, { "auxiliary_loss_clip": 0.0107711, "auxiliary_loss_mlp": 0.01062286, "balance_loss_clip": 1.02354252, "balance_loss_mlp": 1.02003717, "epoch": 0.21818728393206072, "flos": 23546995920000.0, "grad_norm": 3.481698463952475, "language_loss": 0.79471183, "learning_rate": 3.6398600752370826e-06, "loss": 0.81610572, "num_input_tokens_seen": 78302900, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.5703125, "step": 3629, "time_per_iteration": 2.4796934127807617 }, { "auxiliary_loss_clip": 0.0107795, "auxiliary_loss_mlp": 0.01058313, "balance_loss_clip": 1.02250242, "balance_loss_mlp": 1.02115571, "epoch": 0.21824740718472868, "flos": 30224379421440.0, "grad_norm": 1.7070116811665181, "language_loss": 0.73199368, "learning_rate": 3.63963709145597e-06, "loss": 0.75335622, "num_input_tokens_seen": 78326470, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.56640625, "step": 3630, "time_per_iteration": 2.53882098197937 }, { "auxiliary_loss_clip": 0.01075091, "auxiliary_loss_mlp": 0.01058275, "balance_loss_clip": 1.02375197, "balance_loss_mlp": 1.02105367, "epoch": 0.21830753043739667, "flos": 26133641262720.0, "grad_norm": 2.089826153725213, "language_loss": 0.78325498, "learning_rate": 3.6394140454996544e-06, "loss": 0.80458868, "num_input_tokens_seen": 78345810, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.5390625, "step": 3631, "time_per_iteration": 2.463078498840332 }, { "auxiliary_loss_clip": 0.01077848, "auxiliary_loss_mlp": 0.01062986, "balance_loss_clip": 1.02412391, "balance_loss_mlp": 1.02143013, "epoch": 0.21836765369006464, "flos": 21719781671040.0, "grad_norm": 2.8084006608511207, "language_loss": 0.76494604, "learning_rate": 3.639190937376594e-06, "loss": 0.78635436, "num_input_tokens_seen": 78364085, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.5625, "step": 3632, "time_per_iteration": 2.4561684131622314 }, { "auxiliary_loss_clip": 0.01073798, "auxiliary_loss_mlp": 0.01065278, "balance_loss_clip": 1.03003931, "balance_loss_mlp": 1.01897359, "epoch": 0.2184277769427326, "flos": 19936592513280.0, "grad_norm": 2.507264490092034, "language_loss": 0.85630119, "learning_rate": 3.638967767095249e-06, "loss": 0.87769192, "num_input_tokens_seen": 78381385, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.546875, "step": 3633, "time_per_iteration": 2.452582597732544 }, { "auxiliary_loss_clip": 0.01076631, "auxiliary_loss_mlp": 0.01076153, "balance_loss_clip": 1.03817308, "balance_loss_mlp": 1.02026665, "epoch": 0.21848790019540057, "flos": 20339175934080.0, "grad_norm": 1.9893652520767453, "language_loss": 0.81987011, "learning_rate": 3.6387445346640823e-06, "loss": 0.84139788, "num_input_tokens_seen": 78400500, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5625, "step": 3634, "time_per_iteration": 2.475949287414551 }, { "auxiliary_loss_clip": 0.01082452, "auxiliary_loss_mlp": 0.01068007, "balance_loss_clip": 1.03002667, "balance_loss_mlp": 1.02191794, "epoch": 0.21854802344806853, "flos": 15449904092160.0, "grad_norm": 1.7808670314219763, "language_loss": 0.76545846, "learning_rate": 3.638521240091558e-06, "loss": 0.78696311, "num_input_tokens_seen": 78418340, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.60546875, "step": 3635, "time_per_iteration": 2.391291618347168 }, { "auxiliary_loss_clip": 0.01077314, "auxiliary_loss_mlp": 0.01070069, "balance_loss_clip": 1.03237486, "balance_loss_mlp": 1.02086711, "epoch": 0.2186081467007365, "flos": 16319939973120.0, "grad_norm": 15.920512277141414, "language_loss": 0.89669275, "learning_rate": 3.6382978833861445e-06, "loss": 0.91816664, "num_input_tokens_seen": 78434375, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.56640625, "step": 3636, "time_per_iteration": 2.5335419178009033 }, { "auxiliary_loss_clip": 0.01078187, "auxiliary_loss_mlp": 0.01064782, "balance_loss_clip": 1.0287571, "balance_loss_mlp": 1.02099907, "epoch": 0.2186682699534045, "flos": 21688185024000.0, "grad_norm": 2.0684801242222712, "language_loss": 0.77663094, "learning_rate": 3.638074464556311e-06, "loss": 0.79806066, "num_input_tokens_seen": 78451735, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.5703125, "step": 3637, "time_per_iteration": 2.4054672718048096 }, { "auxiliary_loss_clip": 0.01081478, "auxiliary_loss_mlp": 0.01062945, "balance_loss_clip": 1.02162707, "balance_loss_mlp": 1.02222538, "epoch": 0.21872839320607246, "flos": 17738566047360.0, "grad_norm": 2.633944491478341, "language_loss": 0.93846357, "learning_rate": 3.63785098361053e-06, "loss": 0.95990783, "num_input_tokens_seen": 78462730, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.59375, "step": 3638, "time_per_iteration": 2.4505255222320557 }, { "auxiliary_loss_clip": 0.01075278, "auxiliary_loss_mlp": 0.01061854, "balance_loss_clip": 1.02268159, "balance_loss_mlp": 1.01991296, "epoch": 0.21878851645874042, "flos": 18651544767360.0, "grad_norm": 3.364062918374491, "language_loss": 0.92254508, "learning_rate": 3.637627440557275e-06, "loss": 0.94391638, "num_input_tokens_seen": 78476300, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.5546875, "step": 3639, "time_per_iteration": 2.3909716606140137 }, { "auxiliary_loss_clip": 0.01078507, "auxiliary_loss_mlp": 0.01056584, "balance_loss_clip": 1.02175117, "balance_loss_mlp": 1.02192044, "epoch": 0.2188486397114084, "flos": 25556107685760.0, "grad_norm": 1.752420769180744, "language_loss": 0.80976689, "learning_rate": 3.637403835405024e-06, "loss": 0.83111787, "num_input_tokens_seen": 78496135, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.56640625, "step": 3640, "time_per_iteration": 2.458373546600342 }, { "auxiliary_loss_clip": 0.01079664, "auxiliary_loss_mlp": 0.0106, "balance_loss_clip": 1.02016068, "balance_loss_mlp": 1.02291846, "epoch": 0.21890876296407635, "flos": 17891171066880.0, "grad_norm": 2.8004317026074608, "language_loss": 0.74107325, "learning_rate": 3.637180168162255e-06, "loss": 0.76246989, "num_input_tokens_seen": 78513855, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.56640625, "step": 3641, "time_per_iteration": 2.4070820808410645 }, { "auxiliary_loss_clip": 0.0107843, "auxiliary_loss_mlp": 0.01055139, "balance_loss_clip": 1.01672947, "balance_loss_mlp": 1.02261353, "epoch": 0.21896888621674432, "flos": 17748131760000.0, "grad_norm": 2.014286715398528, "language_loss": 0.82695532, "learning_rate": 3.63695643883745e-06, "loss": 0.84829104, "num_input_tokens_seen": 78531740, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.55859375, "step": 3642, "time_per_iteration": 2.3967037200927734 }, { "auxiliary_loss_clip": 0.0108207, "auxiliary_loss_mlp": 0.01058216, "balance_loss_clip": 1.01856732, "balance_loss_mlp": 1.0238905, "epoch": 0.21902900946941228, "flos": 23075039894400.0, "grad_norm": 1.6373162136761177, "language_loss": 0.72934932, "learning_rate": 3.6367326474390928e-06, "loss": 0.75075221, "num_input_tokens_seen": 78549600, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.58203125, "step": 3643, "time_per_iteration": 2.4296765327453613 }, { "auxiliary_loss_clip": 0.01082419, "auxiliary_loss_mlp": 0.0105708, "balance_loss_clip": 1.01876581, "balance_loss_mlp": 1.02311659, "epoch": 0.21908913272208028, "flos": 48176718923520.0, "grad_norm": 2.18705834746407, "language_loss": 0.6904155, "learning_rate": 3.6365087939756696e-06, "loss": 0.71181047, "num_input_tokens_seen": 78573350, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.59375, "step": 3644, "time_per_iteration": 2.702288866043091 }, { "auxiliary_loss_clip": 0.01079405, "auxiliary_loss_mlp": 0.01058477, "balance_loss_clip": 1.01837468, "balance_loss_mlp": 1.02139044, "epoch": 0.21914925597474824, "flos": 22235658053760.0, "grad_norm": 2.5026944135088063, "language_loss": 0.79165423, "learning_rate": 3.636284878455669e-06, "loss": 0.81303304, "num_input_tokens_seen": 78591005, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.58203125, "step": 3645, "time_per_iteration": 2.4037957191467285 }, { "auxiliary_loss_clip": 0.01076793, "auxiliary_loss_mlp": 0.01048189, "balance_loss_clip": 1.01466691, "balance_loss_mlp": 1.0221417, "epoch": 0.2192093792274162, "flos": 22124564507520.0, "grad_norm": 1.708625233330519, "language_loss": 0.83388233, "learning_rate": 3.636060900887582e-06, "loss": 0.85513216, "num_input_tokens_seen": 78610645, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.546875, "step": 3646, "time_per_iteration": 2.4480836391448975 }, { "auxiliary_loss_clip": 0.0107549, "auxiliary_loss_mlp": 0.01054942, "balance_loss_clip": 1.01801109, "balance_loss_mlp": 1.02124405, "epoch": 0.21926950248008417, "flos": 15668530225920.0, "grad_norm": 1.8864099734719884, "language_loss": 0.84112442, "learning_rate": 3.635836861279901e-06, "loss": 0.86242872, "num_input_tokens_seen": 78628340, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.5390625, "step": 3647, "time_per_iteration": 2.4078726768493652 }, { "auxiliary_loss_clip": 0.01075641, "auxiliary_loss_mlp": 0.01056213, "balance_loss_clip": 1.02097499, "balance_loss_mlp": 1.02032423, "epoch": 0.21932962573275214, "flos": 30261212772480.0, "grad_norm": 2.1014201236892314, "language_loss": 0.73299098, "learning_rate": 3.635612759641123e-06, "loss": 0.75430954, "num_input_tokens_seen": 78649355, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.5546875, "step": 3648, "time_per_iteration": 2.5395350456237793 }, { "auxiliary_loss_clip": 0.01077156, "auxiliary_loss_mlp": 0.01055339, "balance_loss_clip": 1.01824105, "balance_loss_mlp": 1.01967406, "epoch": 0.2193897489854201, "flos": 10779363118080.0, "grad_norm": 4.4825131658163295, "language_loss": 0.75344253, "learning_rate": 3.635388595979745e-06, "loss": 0.77476752, "num_input_tokens_seen": 78664915, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.57421875, "step": 3649, "time_per_iteration": 2.4040582180023193 }, { "auxiliary_loss_clip": 0.01072919, "auxiliary_loss_mlp": 0.01050725, "balance_loss_clip": 1.01779914, "balance_loss_mlp": 1.02040958, "epoch": 0.21944987223808807, "flos": 19132368278400.0, "grad_norm": 2.005674220891219, "language_loss": 0.87110919, "learning_rate": 3.635164370304267e-06, "loss": 0.89234567, "num_input_tokens_seen": 78681475, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.5234375, "step": 3650, "time_per_iteration": 2.4472267627716064 }, { "auxiliary_loss_clip": 0.01075728, "auxiliary_loss_mlp": 0.01052311, "balance_loss_clip": 1.01652408, "balance_loss_mlp": 1.01943946, "epoch": 0.21950999549075606, "flos": 22709988051840.0, "grad_norm": 2.1461400438689155, "language_loss": 0.86145568, "learning_rate": 3.6349400826231927e-06, "loss": 0.88273603, "num_input_tokens_seen": 78702300, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.5625, "step": 3651, "time_per_iteration": 2.4507038593292236 }, { "auxiliary_loss_clip": 0.01073065, "auxiliary_loss_mlp": 0.01047667, "balance_loss_clip": 1.01636219, "balance_loss_mlp": 1.01930022, "epoch": 0.21957011874342403, "flos": 10560562427520.0, "grad_norm": 2.709895609614865, "language_loss": 0.77428585, "learning_rate": 3.634715732945027e-06, "loss": 0.79549319, "num_input_tokens_seen": 78720230, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.5390625, "step": 3652, "time_per_iteration": 2.4118905067443848 }, { "auxiliary_loss_clip": 0.01027491, "auxiliary_loss_mlp": 0.01011525, "balance_loss_clip": 1.00601792, "balance_loss_mlp": 1.00899887, "epoch": 0.219630241996092, "flos": 65745047848320.0, "grad_norm": 0.7915258013978405, "language_loss": 0.51722836, "learning_rate": 3.6344913212782764e-06, "loss": 0.53761852, "num_input_tokens_seen": 78780200, "router_z_loss_clip": 0.05517578, "router_z_loss_mlp": 0.18554688, "step": 3653, "time_per_iteration": 4.465547800064087 }, { "auxiliary_loss_clip": 0.01074365, "auxiliary_loss_mlp": 0.01062134, "balance_loss_clip": 1.02758741, "balance_loss_mlp": 1.02105224, "epoch": 0.21969036524875996, "flos": 23695376664960.0, "grad_norm": 1.7426420820149264, "language_loss": 0.76678616, "learning_rate": 3.6342668476314514e-06, "loss": 0.78815114, "num_input_tokens_seen": 78800575, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.53515625, "step": 3654, "time_per_iteration": 2.457834243774414 }, { "auxiliary_loss_clip": 0.01078138, "auxiliary_loss_mlp": 0.01060251, "balance_loss_clip": 1.0223428, "balance_loss_mlp": 1.02176058, "epoch": 0.21975048850142792, "flos": 19640040491520.0, "grad_norm": 1.8295280314055467, "language_loss": 0.74236947, "learning_rate": 3.634042312013064e-06, "loss": 0.76375335, "num_input_tokens_seen": 78819585, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5625, "step": 3655, "time_per_iteration": 2.449251890182495 }, { "auxiliary_loss_clip": 0.01075444, "auxiliary_loss_mlp": 0.01045576, "balance_loss_clip": 1.01141024, "balance_loss_mlp": 1.02076614, "epoch": 0.21981061175409589, "flos": 22447651029120.0, "grad_norm": 1.6848249092387113, "language_loss": 0.81961161, "learning_rate": 3.6338177144316276e-06, "loss": 0.8408218, "num_input_tokens_seen": 78837330, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.546875, "step": 3656, "time_per_iteration": 4.727744102478027 }, { "auxiliary_loss_clip": 0.01075946, "auxiliary_loss_mlp": 0.01047495, "balance_loss_clip": 1.01335359, "balance_loss_mlp": 1.02166355, "epoch": 0.21987073500676388, "flos": 18150051864960.0, "grad_norm": 2.0473443216253995, "language_loss": 0.87136978, "learning_rate": 3.63359305489566e-06, "loss": 0.89260417, "num_input_tokens_seen": 78854955, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.54296875, "step": 3657, "time_per_iteration": 3.764265775680542 }, { "auxiliary_loss_clip": 0.01076045, "auxiliary_loss_mlp": 0.01052876, "balance_loss_clip": 1.01692247, "balance_loss_mlp": 1.0216521, "epoch": 0.21993085825943184, "flos": 25625096265600.0, "grad_norm": 1.6512844143681358, "language_loss": 0.8201158, "learning_rate": 3.6333683334136803e-06, "loss": 0.84140503, "num_input_tokens_seen": 78874965, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.54296875, "step": 3658, "time_per_iteration": 3.8667969703674316 }, { "auxiliary_loss_clip": 0.01032005, "auxiliary_loss_mlp": 0.01022006, "balance_loss_clip": 1.01759541, "balance_loss_mlp": 1.01492071, "epoch": 0.2199909815120998, "flos": 70919349096960.0, "grad_norm": 0.7813363199321396, "language_loss": 0.58273, "learning_rate": 3.6331435499942095e-06, "loss": 0.60327005, "num_input_tokens_seen": 78937740, "router_z_loss_clip": 0.04418945, "router_z_loss_mlp": 0.17089844, "step": 3659, "time_per_iteration": 3.1734459400177 }, { "auxiliary_loss_clip": 0.01077252, "auxiliary_loss_mlp": 0.01047175, "balance_loss_clip": 1.01486969, "balance_loss_mlp": 1.02369976, "epoch": 0.22005110476476777, "flos": 21542457542400.0, "grad_norm": 2.1069754495823334, "language_loss": 0.75718296, "learning_rate": 3.632918704645772e-06, "loss": 0.77842724, "num_input_tokens_seen": 78955055, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.53515625, "step": 3660, "time_per_iteration": 2.399705171585083 }, { "auxiliary_loss_clip": 0.01078227, "auxiliary_loss_mlp": 0.01052579, "balance_loss_clip": 1.01736474, "balance_loss_mlp": 1.02412689, "epoch": 0.22011122801743574, "flos": 22053411423360.0, "grad_norm": 2.542889185780689, "language_loss": 0.81714797, "learning_rate": 3.632693797376893e-06, "loss": 0.83845603, "num_input_tokens_seen": 78974895, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.54296875, "step": 3661, "time_per_iteration": 2.445868968963623 }, { "auxiliary_loss_clip": 0.01076514, "auxiliary_loss_mlp": 0.01045904, "balance_loss_clip": 1.01586378, "balance_loss_mlp": 1.0246768, "epoch": 0.2201713512701037, "flos": 26686385907840.0, "grad_norm": 1.7159015368158572, "language_loss": 0.74884677, "learning_rate": 3.632468828196102e-06, "loss": 0.77007103, "num_input_tokens_seen": 78994990, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.51953125, "step": 3662, "time_per_iteration": 2.46297025680542 }, { "auxiliary_loss_clip": 0.01075892, "auxiliary_loss_mlp": 0.01050662, "balance_loss_clip": 1.02211142, "balance_loss_mlp": 1.02454472, "epoch": 0.22023147452277167, "flos": 22161153479040.0, "grad_norm": 1.6700417600311428, "language_loss": 0.80186772, "learning_rate": 3.632243797111929e-06, "loss": 0.82313323, "num_input_tokens_seen": 79014405, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.515625, "step": 3663, "time_per_iteration": 2.4535279273986816 }, { "auxiliary_loss_clip": 0.01077557, "auxiliary_loss_mlp": 0.01056255, "balance_loss_clip": 1.02404428, "balance_loss_mlp": 1.02401364, "epoch": 0.22029159777543966, "flos": 22522330160640.0, "grad_norm": 2.949447933059579, "language_loss": 0.81599796, "learning_rate": 3.632018704132908e-06, "loss": 0.83733606, "num_input_tokens_seen": 79032375, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.53515625, "step": 3664, "time_per_iteration": 2.4290482997894287 }, { "auxiliary_loss_clip": 0.01080139, "auxiliary_loss_mlp": 0.01063695, "balance_loss_clip": 1.02871943, "balance_loss_mlp": 1.02469277, "epoch": 0.22035172102810763, "flos": 13041630218880.0, "grad_norm": 2.741169024872157, "language_loss": 0.78783846, "learning_rate": 3.6317935492675742e-06, "loss": 0.80927682, "num_input_tokens_seen": 79049635, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.5546875, "step": 3665, "time_per_iteration": 2.4008941650390625 }, { "auxiliary_loss_clip": 0.01074832, "auxiliary_loss_mlp": 0.01066725, "balance_loss_clip": 1.0359689, "balance_loss_mlp": 1.02334762, "epoch": 0.2204118442807756, "flos": 12165031002240.0, "grad_norm": 3.1557182905746712, "language_loss": 0.99302453, "learning_rate": 3.631568332524466e-06, "loss": 1.01444006, "num_input_tokens_seen": 79062890, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.515625, "step": 3666, "time_per_iteration": 2.373237133026123 }, { "auxiliary_loss_clip": 0.01073975, "auxiliary_loss_mlp": 0.0106226, "balance_loss_clip": 1.02971613, "balance_loss_mlp": 1.02168369, "epoch": 0.22047196753344356, "flos": 40107383493120.0, "grad_norm": 1.730850934356633, "language_loss": 0.81527555, "learning_rate": 3.631343053912122e-06, "loss": 0.83663797, "num_input_tokens_seen": 79085495, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.5234375, "step": 3667, "time_per_iteration": 2.592818260192871 }, { "auxiliary_loss_clip": 0.0107498, "auxiliary_loss_mlp": 0.01073154, "balance_loss_clip": 1.03929901, "balance_loss_mlp": 1.02133656, "epoch": 0.22053209078611152, "flos": 20700178058880.0, "grad_norm": 1.8766342092713673, "language_loss": 0.78284168, "learning_rate": 3.631117713439087e-06, "loss": 0.80432296, "num_input_tokens_seen": 79101820, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.5390625, "step": 3668, "time_per_iteration": 2.394470691680908 }, { "auxiliary_loss_clip": 0.01074389, "auxiliary_loss_mlp": 0.01063818, "balance_loss_clip": 1.03079724, "balance_loss_mlp": 1.02239394, "epoch": 0.2205922140387795, "flos": 24715189745280.0, "grad_norm": 1.6383201253323754, "language_loss": 0.73074746, "learning_rate": 3.630892311113904e-06, "loss": 0.75212955, "num_input_tokens_seen": 79123320, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.51953125, "step": 3669, "time_per_iteration": 2.495992422103882 }, { "auxiliary_loss_clip": 0.01072941, "auxiliary_loss_mlp": 0.01056893, "balance_loss_clip": 1.0259223, "balance_loss_mlp": 1.02150428, "epoch": 0.22065233729144745, "flos": 23476122126720.0, "grad_norm": 1.7998602960158632, "language_loss": 0.86199462, "learning_rate": 3.6306668469451215e-06, "loss": 0.88329291, "num_input_tokens_seen": 79141615, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.515625, "step": 3670, "time_per_iteration": 2.456986427307129 }, { "auxiliary_loss_clip": 0.01077433, "auxiliary_loss_mlp": 0.01055387, "balance_loss_clip": 1.02534676, "balance_loss_mlp": 1.02352405, "epoch": 0.22071246054411545, "flos": 35224116405120.0, "grad_norm": 1.6766895852134245, "language_loss": 0.77440917, "learning_rate": 3.6304413209412886e-06, "loss": 0.79573739, "num_input_tokens_seen": 79164910, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.5390625, "step": 3671, "time_per_iteration": 2.589750051498413 }, { "auxiliary_loss_clip": 0.01076145, "auxiliary_loss_mlp": 0.01051071, "balance_loss_clip": 1.01988554, "balance_loss_mlp": 1.02338886, "epoch": 0.2207725837967834, "flos": 18149318726400.0, "grad_norm": 2.0888457761017936, "language_loss": 0.82253331, "learning_rate": 3.6302157331109573e-06, "loss": 0.84380549, "num_input_tokens_seen": 79179685, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.52734375, "step": 3672, "time_per_iteration": 2.401921510696411 }, { "auxiliary_loss_clip": 0.01080346, "auxiliary_loss_mlp": 0.01053853, "balance_loss_clip": 1.0220958, "balance_loss_mlp": 1.02720535, "epoch": 0.22083270704945138, "flos": 20478793927680.0, "grad_norm": 1.9700190571663594, "language_loss": 0.74882901, "learning_rate": 3.629990083462682e-06, "loss": 0.77017099, "num_input_tokens_seen": 79196285, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.53125, "step": 3673, "time_per_iteration": 2.429598331451416 }, { "auxiliary_loss_clip": 0.01079708, "auxiliary_loss_mlp": 0.01053935, "balance_loss_clip": 1.01804078, "balance_loss_mlp": 1.0262903, "epoch": 0.22089283030211934, "flos": 34124527134720.0, "grad_norm": 2.033280219669827, "language_loss": 0.78043067, "learning_rate": 3.6297643720050203e-06, "loss": 0.80176711, "num_input_tokens_seen": 79216060, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.53515625, "step": 3674, "time_per_iteration": 2.550156354904175 }, { "auxiliary_loss_clip": 0.01083367, "auxiliary_loss_mlp": 0.01052607, "balance_loss_clip": 1.01581907, "balance_loss_mlp": 1.02907455, "epoch": 0.2209529535547873, "flos": 18076245517440.0, "grad_norm": 2.3671845016577424, "language_loss": 0.76841748, "learning_rate": 3.6295385987465293e-06, "loss": 0.78977722, "num_input_tokens_seen": 79235145, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.54296875, "step": 3675, "time_per_iteration": 2.4818050861358643 }, { "auxiliary_loss_clip": 0.01081156, "auxiliary_loss_mlp": 0.01054185, "balance_loss_clip": 1.02154565, "balance_loss_mlp": 1.02614045, "epoch": 0.22101307680745527, "flos": 27234103317120.0, "grad_norm": 1.8346643327463348, "language_loss": 0.81429446, "learning_rate": 3.629312763695772e-06, "loss": 0.83564794, "num_input_tokens_seen": 79256960, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.55078125, "step": 3676, "time_per_iteration": 2.477440595626831 }, { "auxiliary_loss_clip": 0.01086325, "auxiliary_loss_mlp": 0.01058877, "balance_loss_clip": 1.02149343, "balance_loss_mlp": 1.02979589, "epoch": 0.22107320006012326, "flos": 16542371445120.0, "grad_norm": 2.499892925672237, "language_loss": 0.76629579, "learning_rate": 3.6290868668613107e-06, "loss": 0.78774774, "num_input_tokens_seen": 79274860, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.56640625, "step": 3677, "time_per_iteration": 2.399937629699707 }, { "auxiliary_loss_clip": 0.01080738, "auxiliary_loss_mlp": 0.01051851, "balance_loss_clip": 1.01909268, "balance_loss_mlp": 1.02641773, "epoch": 0.22113332331279123, "flos": 22053376512000.0, "grad_norm": 1.7973742561768564, "language_loss": 0.85199714, "learning_rate": 3.628860908251712e-06, "loss": 0.87332302, "num_input_tokens_seen": 79294005, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.54296875, "step": 3678, "time_per_iteration": 2.426067590713501 }, { "auxiliary_loss_clip": 0.01084117, "auxiliary_loss_mlp": 0.01057499, "balance_loss_clip": 1.02207017, "balance_loss_mlp": 1.03067911, "epoch": 0.2211934465654592, "flos": 26611636953600.0, "grad_norm": 1.8345784904010092, "language_loss": 0.90666413, "learning_rate": 3.6286348878755452e-06, "loss": 0.92808032, "num_input_tokens_seen": 79314005, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.53515625, "step": 3679, "time_per_iteration": 2.501152276992798 }, { "auxiliary_loss_clip": 0.01085341, "auxiliary_loss_mlp": 0.0106454, "balance_loss_clip": 1.02811015, "balance_loss_mlp": 1.02970982, "epoch": 0.22125356981812716, "flos": 16359496410240.0, "grad_norm": 2.667651978536447, "language_loss": 0.87920356, "learning_rate": 3.6284088057413803e-06, "loss": 0.90070236, "num_input_tokens_seen": 79331030, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.5546875, "step": 3680, "time_per_iteration": 2.3931174278259277 }, { "auxiliary_loss_clip": 0.01080923, "auxiliary_loss_mlp": 0.0106248, "balance_loss_clip": 1.02848208, "balance_loss_mlp": 1.0294441, "epoch": 0.22131369307079513, "flos": 21650094864000.0, "grad_norm": 2.0484754987442377, "language_loss": 0.82039022, "learning_rate": 3.6281826618577894e-06, "loss": 0.84182429, "num_input_tokens_seen": 79348560, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.515625, "step": 3681, "time_per_iteration": 2.4510788917541504 }, { "auxiliary_loss_clip": 0.01078967, "auxiliary_loss_mlp": 0.01065651, "balance_loss_clip": 1.0328449, "balance_loss_mlp": 1.02690029, "epoch": 0.2213738163234631, "flos": 19608513667200.0, "grad_norm": 2.2740220521415027, "language_loss": 0.8120544, "learning_rate": 3.62795645623335e-06, "loss": 0.83350062, "num_input_tokens_seen": 79367175, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.51953125, "step": 3682, "time_per_iteration": 2.434425115585327 }, { "auxiliary_loss_clip": 0.01080444, "auxiliary_loss_mlp": 0.01063062, "balance_loss_clip": 1.02684617, "balance_loss_mlp": 1.026196, "epoch": 0.22143393957613106, "flos": 23622268544640.0, "grad_norm": 1.5774245557716333, "language_loss": 0.78728312, "learning_rate": 3.627730188876638e-06, "loss": 0.8087182, "num_input_tokens_seen": 79388435, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.54296875, "step": 3683, "time_per_iteration": 2.4775383472442627 }, { "auxiliary_loss_clip": 0.01080756, "auxiliary_loss_mlp": 0.0106646, "balance_loss_clip": 1.03026772, "balance_loss_mlp": 1.02497244, "epoch": 0.22149406282879905, "flos": 26176584101760.0, "grad_norm": 1.9483620246264541, "language_loss": 0.74037635, "learning_rate": 3.627503859796234e-06, "loss": 0.76184845, "num_input_tokens_seen": 79407910, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.55859375, "step": 3684, "time_per_iteration": 2.476248264312744 }, { "auxiliary_loss_clip": 0.01078932, "auxiliary_loss_mlp": 0.01060025, "balance_loss_clip": 1.02392864, "balance_loss_mlp": 1.02448726, "epoch": 0.221554186081467, "flos": 14537867978880.0, "grad_norm": 1.8216885217031387, "language_loss": 0.81001657, "learning_rate": 3.6272774690007207e-06, "loss": 0.83140618, "num_input_tokens_seen": 79424020, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.54296875, "step": 3685, "time_per_iteration": 2.4087557792663574 }, { "auxiliary_loss_clip": 0.01075306, "auxiliary_loss_mlp": 0.0106184, "balance_loss_clip": 1.02660131, "balance_loss_mlp": 1.02301884, "epoch": 0.22161430933413498, "flos": 22237124330880.0, "grad_norm": 1.6290538355393331, "language_loss": 0.88194227, "learning_rate": 3.6270510164986823e-06, "loss": 0.90331376, "num_input_tokens_seen": 79445605, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.5234375, "step": 3686, "time_per_iteration": 2.452080488204956 }, { "auxiliary_loss_clip": 0.01075388, "auxiliary_loss_mlp": 0.01054316, "balance_loss_clip": 1.01888716, "balance_loss_mlp": 1.02251756, "epoch": 0.22167443258680294, "flos": 23475423899520.0, "grad_norm": 1.897713983838019, "language_loss": 0.80359101, "learning_rate": 3.626824502298707e-06, "loss": 0.82488811, "num_input_tokens_seen": 79463850, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.52734375, "step": 3687, "time_per_iteration": 2.4678051471710205 }, { "auxiliary_loss_clip": 0.01081957, "auxiliary_loss_mlp": 0.01060926, "balance_loss_clip": 1.02163458, "balance_loss_mlp": 1.02371168, "epoch": 0.2217345558394709, "flos": 23220034237440.0, "grad_norm": 1.8461983255296257, "language_loss": 0.86874962, "learning_rate": 3.626597926409383e-06, "loss": 0.89017844, "num_input_tokens_seen": 79482845, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.58203125, "step": 3688, "time_per_iteration": 2.450288772583008 }, { "auxiliary_loss_clip": 0.01079312, "auxiliary_loss_mlp": 0.01056853, "balance_loss_clip": 1.01846802, "balance_loss_mlp": 1.0239861, "epoch": 0.22179467909213887, "flos": 20010049747200.0, "grad_norm": 1.776696639717824, "language_loss": 0.82819569, "learning_rate": 3.6263712888393027e-06, "loss": 0.84955734, "num_input_tokens_seen": 79501550, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.5546875, "step": 3689, "time_per_iteration": 2.433804512023926 }, { "auxiliary_loss_clip": 0.0107979, "auxiliary_loss_mlp": 0.01059216, "balance_loss_clip": 1.02359629, "balance_loss_mlp": 1.02598786, "epoch": 0.22185480234480687, "flos": 19682005812480.0, "grad_norm": 1.7629901017607228, "language_loss": 0.72167015, "learning_rate": 3.626144589597061e-06, "loss": 0.74306017, "num_input_tokens_seen": 79519680, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.5390625, "step": 3690, "time_per_iteration": 2.449148654937744 }, { "auxiliary_loss_clip": 0.01079787, "auxiliary_loss_mlp": 0.01058661, "balance_loss_clip": 1.02022815, "balance_loss_mlp": 1.02394783, "epoch": 0.22191492559747483, "flos": 21980233480320.0, "grad_norm": 1.8348451931997531, "language_loss": 0.74223411, "learning_rate": 3.6259178286912528e-06, "loss": 0.76361853, "num_input_tokens_seen": 79539000, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.5546875, "step": 3691, "time_per_iteration": 2.437488079071045 }, { "auxiliary_loss_clip": 0.01081643, "auxiliary_loss_mlp": 0.01057941, "balance_loss_clip": 1.0220356, "balance_loss_mlp": 1.02809024, "epoch": 0.2219750488501428, "flos": 23220941932800.0, "grad_norm": 2.118115705037344, "language_loss": 0.73467183, "learning_rate": 3.625691006130477e-06, "loss": 0.75606769, "num_input_tokens_seen": 79559695, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.53515625, "step": 3692, "time_per_iteration": 2.4711406230926514 }, { "auxiliary_loss_clip": 0.01082869, "auxiliary_loss_mlp": 0.01061484, "balance_loss_clip": 1.02309835, "balance_loss_mlp": 1.02626967, "epoch": 0.22203517210281076, "flos": 22452643353600.0, "grad_norm": 1.750287445627396, "language_loss": 0.88279527, "learning_rate": 3.6254641219233362e-06, "loss": 0.90423882, "num_input_tokens_seen": 79579095, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.56640625, "step": 3693, "time_per_iteration": 3.989405870437622 }, { "auxiliary_loss_clip": 0.0107885, "auxiliary_loss_mlp": 0.01050057, "balance_loss_clip": 1.01789463, "balance_loss_mlp": 1.02682745, "epoch": 0.22209529535547873, "flos": 17563650802560.0, "grad_norm": 2.15093843742596, "language_loss": 0.87533534, "learning_rate": 3.6252371760784325e-06, "loss": 0.89662439, "num_input_tokens_seen": 79596430, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.51953125, "step": 3694, "time_per_iteration": 2.4410159587860107 }, { "auxiliary_loss_clip": 0.01087041, "auxiliary_loss_mlp": 0.0105706, "balance_loss_clip": 1.01710129, "balance_loss_mlp": 1.0292145, "epoch": 0.2221554186081467, "flos": 21467987879040.0, "grad_norm": 2.676437127247946, "language_loss": 0.71308362, "learning_rate": 3.6250101686043725e-06, "loss": 0.73452461, "num_input_tokens_seen": 79615825, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.578125, "step": 3695, "time_per_iteration": 4.64707088470459 }, { "auxiliary_loss_clip": 0.01083555, "auxiliary_loss_mlp": 0.01041882, "balance_loss_clip": 1.01088798, "balance_loss_mlp": 1.03160286, "epoch": 0.22221554186081466, "flos": 27672193457280.0, "grad_norm": 1.6277341708000246, "language_loss": 0.73188239, "learning_rate": 3.6247830995097637e-06, "loss": 0.75313675, "num_input_tokens_seen": 79637875, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.51953125, "step": 3696, "time_per_iteration": 3.9229955673217773 }, { "auxiliary_loss_clip": 0.01083333, "auxiliary_loss_mlp": 0.01049662, "balance_loss_clip": 1.01468611, "balance_loss_mlp": 1.02864492, "epoch": 0.22227566511348265, "flos": 25957713588480.0, "grad_norm": 1.7378580175663, "language_loss": 0.88799453, "learning_rate": 3.624555968803217e-06, "loss": 0.90932453, "num_input_tokens_seen": 79656970, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.546875, "step": 3697, "time_per_iteration": 3.9732067584991455 }, { "auxiliary_loss_clip": 0.01076953, "auxiliary_loss_mlp": 0.01046602, "balance_loss_clip": 1.01641798, "balance_loss_mlp": 1.02610564, "epoch": 0.22233578836615062, "flos": 39202085272320.0, "grad_norm": 1.559396353572843, "language_loss": 0.67654616, "learning_rate": 3.624328776493346e-06, "loss": 0.69778168, "num_input_tokens_seen": 79680275, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.5078125, "step": 3698, "time_per_iteration": 2.6268131732940674 }, { "auxiliary_loss_clip": 0.01081389, "auxiliary_loss_mlp": 0.01058821, "balance_loss_clip": 1.02212787, "balance_loss_mlp": 1.02598524, "epoch": 0.22239591161881858, "flos": 36282298936320.0, "grad_norm": 1.7958569107260984, "language_loss": 0.8409971, "learning_rate": 3.6241015225887637e-06, "loss": 0.86239922, "num_input_tokens_seen": 79701255, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.5546875, "step": 3699, "time_per_iteration": 2.585388422012329 }, { "auxiliary_loss_clip": 0.01081372, "auxiliary_loss_mlp": 0.01049637, "balance_loss_clip": 1.01551962, "balance_loss_mlp": 1.02752662, "epoch": 0.22245603487148655, "flos": 19718559872640.0, "grad_norm": 1.9710811813034157, "language_loss": 0.81149131, "learning_rate": 3.62387420709809e-06, "loss": 0.83280134, "num_input_tokens_seen": 79721315, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.5390625, "step": 3700, "time_per_iteration": 2.4705026149749756 }, { "auxiliary_loss_clip": 0.01084336, "auxiliary_loss_mlp": 0.01067251, "balance_loss_clip": 1.02722013, "balance_loss_mlp": 1.02729952, "epoch": 0.2225161581241545, "flos": 46278700704000.0, "grad_norm": 2.2984912798984, "language_loss": 0.73602057, "learning_rate": 3.623646830029943e-06, "loss": 0.75753641, "num_input_tokens_seen": 79742705, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 0.5703125, "step": 3701, "time_per_iteration": 2.6548750400543213 }, { "auxiliary_loss_clip": 0.01076999, "auxiliary_loss_mlp": 0.01055713, "balance_loss_clip": 1.02319264, "balance_loss_mlp": 1.02383614, "epoch": 0.22257628137682248, "flos": 23695062462720.0, "grad_norm": 1.7224129199326386, "language_loss": 0.82523394, "learning_rate": 3.6234193913929454e-06, "loss": 0.84656107, "num_input_tokens_seen": 79763000, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.53125, "step": 3702, "time_per_iteration": 2.4736104011535645 }, { "auxiliary_loss_clip": 0.01072571, "auxiliary_loss_mlp": 0.01055312, "balance_loss_clip": 1.02279139, "balance_loss_mlp": 1.02236187, "epoch": 0.22263640462949044, "flos": 19352984359680.0, "grad_norm": 1.8400514293373422, "language_loss": 0.79894245, "learning_rate": 3.623191891195723e-06, "loss": 0.82022119, "num_input_tokens_seen": 79781335, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.50390625, "step": 3703, "time_per_iteration": 2.437183141708374 }, { "auxiliary_loss_clip": 0.0107706, "auxiliary_loss_mlp": 0.01062353, "balance_loss_clip": 1.02811599, "balance_loss_mlp": 1.02176452, "epoch": 0.22269652788215843, "flos": 20775031747200.0, "grad_norm": 1.9965288626340112, "language_loss": 0.76487577, "learning_rate": 3.6229643294469005e-06, "loss": 0.7862699, "num_input_tokens_seen": 79800150, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.5546875, "step": 3704, "time_per_iteration": 2.4091475009918213 }, { "auxiliary_loss_clip": 0.01072493, "auxiliary_loss_mlp": 0.01055056, "balance_loss_clip": 1.02387047, "balance_loss_mlp": 1.02273822, "epoch": 0.2227566511348264, "flos": 47957045448960.0, "grad_norm": 4.340216078379273, "language_loss": 0.65674013, "learning_rate": 3.6227367061551074e-06, "loss": 0.67801559, "num_input_tokens_seen": 79822390, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.49804688, "step": 3705, "time_per_iteration": 2.7136647701263428 }, { "auxiliary_loss_clip": 0.01031506, "auxiliary_loss_mlp": 0.01044999, "balance_loss_clip": 1.04023099, "balance_loss_mlp": 1.01675391, "epoch": 0.22281677438749437, "flos": 66216166001280.0, "grad_norm": 1.3418975660239376, "language_loss": 0.65370679, "learning_rate": 3.6225090213289766e-06, "loss": 0.67447186, "num_input_tokens_seen": 79873350, "router_z_loss_clip": 0.04760742, "router_z_loss_mlp": 0.14746094, "step": 3706, "time_per_iteration": 2.9096553325653076 }, { "auxiliary_loss_clip": 0.01076008, "auxiliary_loss_mlp": 0.01054494, "balance_loss_clip": 1.01982784, "balance_loss_mlp": 1.02248907, "epoch": 0.22287689764016233, "flos": 21870536388480.0, "grad_norm": 3.0578313577447576, "language_loss": 0.82510471, "learning_rate": 3.622281274977141e-06, "loss": 0.84640968, "num_input_tokens_seen": 79891715, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.53515625, "step": 3707, "time_per_iteration": 2.4361467361450195 }, { "auxiliary_loss_clip": 0.01075946, "auxiliary_loss_mlp": 0.01054954, "balance_loss_clip": 1.01973963, "balance_loss_mlp": 1.02404392, "epoch": 0.2229370208928303, "flos": 27671250850560.0, "grad_norm": 1.6828363891243179, "language_loss": 0.80490088, "learning_rate": 3.6220534671082367e-06, "loss": 0.8262099, "num_input_tokens_seen": 79911175, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.51953125, "step": 3708, "time_per_iteration": 2.446653127670288 }, { "auxiliary_loss_clip": 0.01080679, "auxiliary_loss_mlp": 0.01051792, "balance_loss_clip": 1.01726925, "balance_loss_mlp": 1.02725673, "epoch": 0.22299714414549826, "flos": 30153331071360.0, "grad_norm": 1.9305707872171725, "language_loss": 0.82840008, "learning_rate": 3.6218255977309024e-06, "loss": 0.84972477, "num_input_tokens_seen": 79931875, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.53515625, "step": 3709, "time_per_iteration": 2.520690441131592 }, { "auxiliary_loss_clip": 0.01080048, "auxiliary_loss_mlp": 0.01052721, "balance_loss_clip": 1.01762629, "balance_loss_mlp": 1.02552056, "epoch": 0.22305726739816625, "flos": 23142178172160.0, "grad_norm": 2.1311535448042935, "language_loss": 0.71046317, "learning_rate": 3.6215976668537787e-06, "loss": 0.73179084, "num_input_tokens_seen": 79952445, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.546875, "step": 3710, "time_per_iteration": 2.46634840965271 }, { "auxiliary_loss_clip": 0.01081765, "auxiliary_loss_mlp": 0.01048907, "balance_loss_clip": 1.01621962, "balance_loss_mlp": 1.02687514, "epoch": 0.22311739065083422, "flos": 19171051931520.0, "grad_norm": 4.247073522138863, "language_loss": 0.92473722, "learning_rate": 3.6213696744855096e-06, "loss": 0.94604397, "num_input_tokens_seen": 79971030, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.546875, "step": 3711, "time_per_iteration": 2.4315686225891113 }, { "auxiliary_loss_clip": 0.01083237, "auxiliary_loss_mlp": 0.0105734, "balance_loss_clip": 1.02238774, "balance_loss_mlp": 1.02987075, "epoch": 0.22317751390350218, "flos": 13617138936960.0, "grad_norm": 2.364382512654724, "language_loss": 0.92011476, "learning_rate": 3.6211416206347395e-06, "loss": 0.94152057, "num_input_tokens_seen": 79982085, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.53125, "step": 3712, "time_per_iteration": 2.373652935028076 }, { "auxiliary_loss_clip": 0.01083482, "auxiliary_loss_mlp": 0.01054788, "balance_loss_clip": 1.02000308, "balance_loss_mlp": 1.03161216, "epoch": 0.22323763715617015, "flos": 11028468735360.0, "grad_norm": 3.0535825222822024, "language_loss": 0.78459632, "learning_rate": 3.620913505310117e-06, "loss": 0.80597901, "num_input_tokens_seen": 79997460, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.51953125, "step": 3713, "time_per_iteration": 2.3980233669281006 }, { "auxiliary_loss_clip": 0.01082238, "auxiliary_loss_mlp": 0.01059236, "balance_loss_clip": 1.02418876, "balance_loss_mlp": 1.03058314, "epoch": 0.22329776040883811, "flos": 41350012070400.0, "grad_norm": 2.034392755089827, "language_loss": 0.64015257, "learning_rate": 3.6206853285202917e-06, "loss": 0.66156739, "num_input_tokens_seen": 80022450, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.515625, "step": 3714, "time_per_iteration": 2.645505428314209 }, { "auxiliary_loss_clip": 0.01083461, "auxiliary_loss_mlp": 0.01059178, "balance_loss_clip": 1.02802896, "balance_loss_mlp": 1.03081226, "epoch": 0.22335788366150608, "flos": 25118296836480.0, "grad_norm": 1.803924241902877, "language_loss": 0.80821711, "learning_rate": 3.6204570902739164e-06, "loss": 0.82964349, "num_input_tokens_seen": 80042100, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.52734375, "step": 3715, "time_per_iteration": 2.465149402618408 }, { "auxiliary_loss_clip": 0.01083669, "auxiliary_loss_mlp": 0.01067205, "balance_loss_clip": 1.03411222, "balance_loss_mlp": 1.03130484, "epoch": 0.22341800691417404, "flos": 16982416621440.0, "grad_norm": 1.543415868716006, "language_loss": 0.78222841, "learning_rate": 3.620228790579645e-06, "loss": 0.80373716, "num_input_tokens_seen": 80059690, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.5234375, "step": 3716, "time_per_iteration": 2.407839059829712 }, { "auxiliary_loss_clip": 0.0108078, "auxiliary_loss_mlp": 0.01065142, "balance_loss_clip": 1.03169155, "balance_loss_mlp": 1.0288465, "epoch": 0.22347813016684204, "flos": 14135878051200.0, "grad_norm": 2.347894321867998, "language_loss": 0.81116652, "learning_rate": 3.6200004294461367e-06, "loss": 0.83262575, "num_input_tokens_seen": 80076060, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.51953125, "step": 3717, "time_per_iteration": 2.395129680633545 }, { "auxiliary_loss_clip": 0.01080749, "auxiliary_loss_mlp": 0.01065121, "balance_loss_clip": 1.03188515, "balance_loss_mlp": 1.02755177, "epoch": 0.22353825341951, "flos": 23582118614400.0, "grad_norm": 9.571191063909529, "language_loss": 0.69043493, "learning_rate": 3.6197720068820497e-06, "loss": 0.71189362, "num_input_tokens_seen": 80094760, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.53125, "step": 3718, "time_per_iteration": 2.467946767807007 }, { "auxiliary_loss_clip": 0.01080045, "auxiliary_loss_mlp": 0.01058701, "balance_loss_clip": 1.02346301, "balance_loss_mlp": 1.02714634, "epoch": 0.22359837667217797, "flos": 29822948075520.0, "grad_norm": 1.786894299410668, "language_loss": 0.82468098, "learning_rate": 3.619543522896045e-06, "loss": 0.8460685, "num_input_tokens_seen": 80114475, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.53125, "step": 3719, "time_per_iteration": 2.4715735912323 }, { "auxiliary_loss_clip": 0.01080576, "auxiliary_loss_mlp": 0.01061185, "balance_loss_clip": 1.02556574, "balance_loss_mlp": 1.02506995, "epoch": 0.22365849992484593, "flos": 17602124987520.0, "grad_norm": 1.9461292135264183, "language_loss": 0.88537496, "learning_rate": 3.6193149774967885e-06, "loss": 0.90679252, "num_input_tokens_seen": 80132920, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.5546875, "step": 3720, "time_per_iteration": 2.3799004554748535 }, { "auxiliary_loss_clip": 0.01074996, "auxiliary_loss_mlp": 0.01053132, "balance_loss_clip": 1.02114856, "balance_loss_mlp": 1.0249964, "epoch": 0.2237186231775139, "flos": 22709848406400.0, "grad_norm": 1.5878534228818606, "language_loss": 0.75521302, "learning_rate": 3.619086370692945e-06, "loss": 0.77649432, "num_input_tokens_seen": 80152845, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.5, "step": 3721, "time_per_iteration": 2.434608221054077 }, { "auxiliary_loss_clip": 0.01076887, "auxiliary_loss_mlp": 0.01055351, "balance_loss_clip": 1.01851487, "balance_loss_mlp": 1.02208793, "epoch": 0.22377874643018186, "flos": 13370651671680.0, "grad_norm": 2.305769587653549, "language_loss": 0.80373895, "learning_rate": 3.6188577024931844e-06, "loss": 0.82506132, "num_input_tokens_seen": 80170680, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.546875, "step": 3722, "time_per_iteration": 2.369907855987549 }, { "auxiliary_loss_clip": 0.01071502, "auxiliary_loss_mlp": 0.01047296, "balance_loss_clip": 1.0150615, "balance_loss_mlp": 1.02160835, "epoch": 0.22383886968284986, "flos": 17893998887040.0, "grad_norm": 1.9277601432998899, "language_loss": 0.84049833, "learning_rate": 3.618628972906178e-06, "loss": 0.86168623, "num_input_tokens_seen": 80189030, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.5, "step": 3723, "time_per_iteration": 2.408546209335327 }, { "auxiliary_loss_clip": 0.01074041, "auxiliary_loss_mlp": 0.01057539, "balance_loss_clip": 1.02406478, "balance_loss_mlp": 1.02105391, "epoch": 0.22389899293551782, "flos": 23877972408960.0, "grad_norm": 4.3806221256252735, "language_loss": 0.86645722, "learning_rate": 3.6184001819405984e-06, "loss": 0.88777304, "num_input_tokens_seen": 80208365, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.52734375, "step": 3724, "time_per_iteration": 2.412684679031372 }, { "auxiliary_loss_clip": 0.01072975, "auxiliary_loss_mlp": 0.0104725, "balance_loss_clip": 1.01494443, "balance_loss_mlp": 1.02132654, "epoch": 0.2239591161881858, "flos": 27271181047680.0, "grad_norm": 1.7030504538746665, "language_loss": 0.81114405, "learning_rate": 3.618171329605121e-06, "loss": 0.83234632, "num_input_tokens_seen": 80228685, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.515625, "step": 3725, "time_per_iteration": 2.469985246658325 }, { "auxiliary_loss_clip": 0.01073994, "auxiliary_loss_mlp": 0.01048548, "balance_loss_clip": 1.01531267, "balance_loss_mlp": 1.02252865, "epoch": 0.22401923944085375, "flos": 22235762787840.0, "grad_norm": 1.7866748850788903, "language_loss": 0.78731221, "learning_rate": 3.6179424159084254e-06, "loss": 0.80853766, "num_input_tokens_seen": 80247635, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.515625, "step": 3726, "time_per_iteration": 2.417520523071289 }, { "auxiliary_loss_clip": 0.01078495, "auxiliary_loss_mlp": 0.01062286, "balance_loss_clip": 1.02409148, "balance_loss_mlp": 1.02169156, "epoch": 0.22407936269352172, "flos": 12052959937920.0, "grad_norm": 2.7890993205189543, "language_loss": 0.74366951, "learning_rate": 3.6177134408591914e-06, "loss": 0.76507735, "num_input_tokens_seen": 80260045, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.56640625, "step": 3727, "time_per_iteration": 2.4231162071228027 }, { "auxiliary_loss_clip": 0.01078904, "auxiliary_loss_mlp": 0.01056032, "balance_loss_clip": 1.01840925, "balance_loss_mlp": 1.02342677, "epoch": 0.22413948594618968, "flos": 19352565423360.0, "grad_norm": 2.2068934721115028, "language_loss": 0.88888526, "learning_rate": 3.6174844044661013e-06, "loss": 0.91023463, "num_input_tokens_seen": 80277680, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5546875, "step": 3728, "time_per_iteration": 2.4105615615844727 }, { "auxiliary_loss_clip": 0.0107589, "auxiliary_loss_mlp": 0.01054305, "balance_loss_clip": 1.01708794, "balance_loss_mlp": 1.02262068, "epoch": 0.22419960919885765, "flos": 24168868790400.0, "grad_norm": 2.848761672193175, "language_loss": 0.82694185, "learning_rate": 3.6172553067378406e-06, "loss": 0.84824377, "num_input_tokens_seen": 80294795, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.53125, "step": 3729, "time_per_iteration": 2.467771530151367 }, { "auxiliary_loss_clip": 0.01076086, "auxiliary_loss_mlp": 0.01052256, "balance_loss_clip": 1.02183366, "balance_loss_mlp": 1.02505112, "epoch": 0.22425973245152564, "flos": 27377805939840.0, "grad_norm": 1.6000956780699833, "language_loss": 0.88277751, "learning_rate": 3.6170261476830964e-06, "loss": 0.90406096, "num_input_tokens_seen": 80315425, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.51171875, "step": 3730, "time_per_iteration": 2.5001840591430664 }, { "auxiliary_loss_clip": 0.01077315, "auxiliary_loss_mlp": 0.01049955, "balance_loss_clip": 1.01776862, "balance_loss_mlp": 1.02575231, "epoch": 0.2243198557041936, "flos": 13734795818880.0, "grad_norm": 1.8286053836821556, "language_loss": 0.74970353, "learning_rate": 3.616796927310559e-06, "loss": 0.77097631, "num_input_tokens_seen": 80333905, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.515625, "step": 3731, "time_per_iteration": 2.4410037994384766 }, { "auxiliary_loss_clip": 0.01082913, "auxiliary_loss_mlp": 0.01055538, "balance_loss_clip": 1.01872563, "balance_loss_mlp": 1.02821922, "epoch": 0.22437997895686157, "flos": 19529854640640.0, "grad_norm": 1.8192810309320693, "language_loss": 0.7696228, "learning_rate": 3.6165676456289195e-06, "loss": 0.79100728, "num_input_tokens_seen": 80352165, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.546875, "step": 3732, "time_per_iteration": 2.400709629058838 }, { "auxiliary_loss_clip": 0.01081096, "auxiliary_loss_mlp": 0.01051521, "balance_loss_clip": 1.01695061, "balance_loss_mlp": 1.0275892, "epoch": 0.22444010220952954, "flos": 23695097374080.0, "grad_norm": 1.7034674376543364, "language_loss": 0.89063776, "learning_rate": 3.616338302646873e-06, "loss": 0.91196394, "num_input_tokens_seen": 80371305, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.53515625, "step": 3733, "time_per_iteration": 3.8822503089904785 }, { "auxiliary_loss_clip": 0.01075758, "auxiliary_loss_mlp": 0.01050312, "balance_loss_clip": 1.01614618, "balance_loss_mlp": 1.02302265, "epoch": 0.2245002254621975, "flos": 22381804471680.0, "grad_norm": 1.6439294889564866, "language_loss": 0.85971105, "learning_rate": 3.6161088983731166e-06, "loss": 0.88097173, "num_input_tokens_seen": 80391020, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.52734375, "step": 3734, "time_per_iteration": 2.390564441680908 }, { "auxiliary_loss_clip": 0.01077334, "auxiliary_loss_mlp": 0.01055745, "balance_loss_clip": 1.0205785, "balance_loss_mlp": 1.0244472, "epoch": 0.22456034871486547, "flos": 26941112254080.0, "grad_norm": 2.066821635884086, "language_loss": 0.7792089, "learning_rate": 3.6158794328163482e-06, "loss": 0.80053967, "num_input_tokens_seen": 80411365, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.53125, "step": 3735, "time_per_iteration": 4.120121479034424 }, { "auxiliary_loss_clip": 0.01071968, "auxiliary_loss_mlp": 0.01050941, "balance_loss_clip": 1.01902866, "balance_loss_mlp": 1.02257586, "epoch": 0.22462047196753343, "flos": 28982344337280.0, "grad_norm": 1.956759547756066, "language_loss": 0.8522706, "learning_rate": 3.6156499059852702e-06, "loss": 0.87349975, "num_input_tokens_seen": 80431075, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.49414062, "step": 3736, "time_per_iteration": 5.205406188964844 }, { "auxiliary_loss_clip": 0.0107658, "auxiliary_loss_mlp": 0.01052589, "balance_loss_clip": 1.01851869, "balance_loss_mlp": 1.02353334, "epoch": 0.22468059522020142, "flos": 20010294126720.0, "grad_norm": 1.9800575663822655, "language_loss": 0.8812409, "learning_rate": 3.615420317888586e-06, "loss": 0.90253258, "num_input_tokens_seen": 80449240, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.53125, "step": 3737, "time_per_iteration": 2.4491443634033203 }, { "auxiliary_loss_clip": 0.01076541, "auxiliary_loss_mlp": 0.01057589, "balance_loss_clip": 1.01891732, "balance_loss_mlp": 1.02225041, "epoch": 0.2247407184728694, "flos": 29312971712640.0, "grad_norm": 2.182586289678639, "language_loss": 0.8053754, "learning_rate": 3.6151906685350006e-06, "loss": 0.82671666, "num_input_tokens_seen": 80467900, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.54296875, "step": 3738, "time_per_iteration": 2.4488210678100586 }, { "auxiliary_loss_clip": 0.01075922, "auxiliary_loss_mlp": 0.01048868, "balance_loss_clip": 1.01482201, "balance_loss_mlp": 1.02160263, "epoch": 0.22480084172553735, "flos": 22309254933120.0, "grad_norm": 2.0400655902194704, "language_loss": 0.7797246, "learning_rate": 3.614960957933224e-06, "loss": 0.80097252, "num_input_tokens_seen": 80487100, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.54296875, "step": 3739, "time_per_iteration": 2.4318060874938965 }, { "auxiliary_loss_clip": 0.01074603, "auxiliary_loss_mlp": 0.01054272, "balance_loss_clip": 1.01815176, "balance_loss_mlp": 1.02145672, "epoch": 0.22486096497820532, "flos": 25590148128000.0, "grad_norm": 2.0028869255423, "language_loss": 0.76069522, "learning_rate": 3.6147311860919655e-06, "loss": 0.78198391, "num_input_tokens_seen": 80508625, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.53125, "step": 3740, "time_per_iteration": 2.4435157775878906 }, { "auxiliary_loss_clip": 0.01072964, "auxiliary_loss_mlp": 0.01050589, "balance_loss_clip": 1.01713896, "balance_loss_mlp": 1.02073121, "epoch": 0.22492108823087328, "flos": 17638853604480.0, "grad_norm": 2.003208018822855, "language_loss": 0.76900792, "learning_rate": 3.614501353019939e-06, "loss": 0.79024345, "num_input_tokens_seen": 80527345, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.51953125, "step": 3741, "time_per_iteration": 2.508251190185547 }, { "auxiliary_loss_clip": 0.01074664, "auxiliary_loss_mlp": 0.01046871, "balance_loss_clip": 1.01413596, "balance_loss_mlp": 1.02292728, "epoch": 0.22498121148354125, "flos": 16033721713920.0, "grad_norm": 1.7162476377977107, "language_loss": 0.89720774, "learning_rate": 3.6142714587258592e-06, "loss": 0.91842306, "num_input_tokens_seen": 80545545, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.515625, "step": 3742, "time_per_iteration": 2.449946403503418 }, { "auxiliary_loss_clip": 0.01075209, "auxiliary_loss_mlp": 0.01060337, "balance_loss_clip": 1.02278662, "balance_loss_mlp": 1.02274776, "epoch": 0.22504133473620924, "flos": 24022652549760.0, "grad_norm": 1.7928813574301303, "language_loss": 0.82633126, "learning_rate": 3.614041503218444e-06, "loss": 0.84768671, "num_input_tokens_seen": 80565040, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5234375, "step": 3743, "time_per_iteration": 2.424997329711914 }, { "auxiliary_loss_clip": 0.01074838, "auxiliary_loss_mlp": 0.01050078, "balance_loss_clip": 1.01472104, "balance_loss_mlp": 1.0217067, "epoch": 0.2251014579888772, "flos": 16763022437760.0, "grad_norm": 2.758491594045999, "language_loss": 0.64393544, "learning_rate": 3.6138114865064134e-06, "loss": 0.66518462, "num_input_tokens_seen": 80582815, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.53125, "step": 3744, "time_per_iteration": 2.356741189956665 }, { "auxiliary_loss_clip": 0.01073481, "auxiliary_loss_mlp": 0.01054688, "balance_loss_clip": 1.01787663, "balance_loss_mlp": 1.02020311, "epoch": 0.22516158124154517, "flos": 13990150569600.0, "grad_norm": 3.0169022852405827, "language_loss": 0.78888905, "learning_rate": 3.613581408598489e-06, "loss": 0.81017077, "num_input_tokens_seen": 80600865, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.53125, "step": 3745, "time_per_iteration": 2.3743629455566406 }, { "auxiliary_loss_clip": 0.01075923, "auxiliary_loss_mlp": 0.01057689, "balance_loss_clip": 1.02099681, "balance_loss_mlp": 1.02213264, "epoch": 0.22522170449421314, "flos": 14389207943040.0, "grad_norm": 1.9753480622855957, "language_loss": 0.81916124, "learning_rate": 3.6133512695033965e-06, "loss": 0.84049731, "num_input_tokens_seen": 80617455, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.5390625, "step": 3746, "time_per_iteration": 2.368948221206665 }, { "auxiliary_loss_clip": 0.01074352, "auxiliary_loss_mlp": 0.01058549, "balance_loss_clip": 1.0218569, "balance_loss_mlp": 1.01998925, "epoch": 0.2252818277468811, "flos": 23804410440960.0, "grad_norm": 2.642852693434061, "language_loss": 0.87288272, "learning_rate": 3.613121069229862e-06, "loss": 0.89421177, "num_input_tokens_seen": 80635125, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.54296875, "step": 3747, "time_per_iteration": 2.4143056869506836 }, { "auxiliary_loss_clip": 0.01075121, "auxiliary_loss_mlp": 0.01051494, "balance_loss_clip": 1.01740003, "balance_loss_mlp": 1.02112603, "epoch": 0.22534195099954907, "flos": 24716865490560.0, "grad_norm": 1.7737672003025267, "language_loss": 0.78601527, "learning_rate": 3.6128908077866145e-06, "loss": 0.80728143, "num_input_tokens_seen": 80656370, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.5390625, "step": 3748, "time_per_iteration": 2.450960159301758 }, { "auxiliary_loss_clip": 0.01078356, "auxiliary_loss_mlp": 0.01059888, "balance_loss_clip": 1.02331495, "balance_loss_mlp": 1.02280903, "epoch": 0.22540207425221703, "flos": 21031294193280.0, "grad_norm": 2.0841925296530346, "language_loss": 0.81185114, "learning_rate": 3.6126604851823864e-06, "loss": 0.83323359, "num_input_tokens_seen": 80676495, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.5546875, "step": 3749, "time_per_iteration": 2.4315288066864014 }, { "auxiliary_loss_clip": 0.01072968, "auxiliary_loss_mlp": 0.01053781, "balance_loss_clip": 1.01837623, "balance_loss_mlp": 1.02127957, "epoch": 0.22546219750488503, "flos": 19389363863040.0, "grad_norm": 1.7099897909663124, "language_loss": 0.80889589, "learning_rate": 3.6124301014259108e-06, "loss": 0.83016342, "num_input_tokens_seen": 80694755, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.515625, "step": 3750, "time_per_iteration": 2.402207374572754 }, { "auxiliary_loss_clip": 0.01077923, "auxiliary_loss_mlp": 0.01060427, "balance_loss_clip": 1.02137411, "balance_loss_mlp": 1.02212501, "epoch": 0.225522320757553, "flos": 25191439868160.0, "grad_norm": 1.99409046581492, "language_loss": 0.84323943, "learning_rate": 3.6121996565259244e-06, "loss": 0.86462289, "num_input_tokens_seen": 80713670, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.55859375, "step": 3751, "time_per_iteration": 2.462245464324951 }, { "auxiliary_loss_clip": 0.01077869, "auxiliary_loss_mlp": 0.01056406, "balance_loss_clip": 1.01718593, "balance_loss_mlp": 1.02190292, "epoch": 0.22558244401022096, "flos": 17162219456640.0, "grad_norm": 1.9290964690679395, "language_loss": 0.85115117, "learning_rate": 3.611969150491165e-06, "loss": 0.87249386, "num_input_tokens_seen": 80731450, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.55859375, "step": 3752, "time_per_iteration": 2.3641231060028076 }, { "auxiliary_loss_clip": 0.01074642, "auxiliary_loss_mlp": 0.01056794, "balance_loss_clip": 1.02157927, "balance_loss_mlp": 1.02121472, "epoch": 0.22564256726288892, "flos": 15230125883520.0, "grad_norm": 1.9468173262183865, "language_loss": 0.79427242, "learning_rate": 3.611738583330375e-06, "loss": 0.81558681, "num_input_tokens_seen": 80748415, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.53515625, "step": 3753, "time_per_iteration": 2.3877739906311035 }, { "auxiliary_loss_clip": 0.01074104, "auxiliary_loss_mlp": 0.01061103, "balance_loss_clip": 1.02283669, "balance_loss_mlp": 1.02077866, "epoch": 0.2257026905155569, "flos": 34567225574400.0, "grad_norm": 1.761120256880445, "language_loss": 0.80148923, "learning_rate": 3.611507955052295e-06, "loss": 0.82284135, "num_input_tokens_seen": 80770835, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.53125, "step": 3754, "time_per_iteration": 2.5627999305725098 }, { "auxiliary_loss_clip": 0.01076014, "auxiliary_loss_mlp": 0.01062639, "balance_loss_clip": 1.02265632, "balance_loss_mlp": 1.0229336, "epoch": 0.22576281376822485, "flos": 19937395474560.0, "grad_norm": 1.8309953566097916, "language_loss": 0.71505678, "learning_rate": 3.6112772656656727e-06, "loss": 0.73644328, "num_input_tokens_seen": 80787840, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 0.53125, "step": 3755, "time_per_iteration": 2.475804090499878 }, { "auxiliary_loss_clip": 0.01080936, "auxiliary_loss_mlp": 0.01063887, "balance_loss_clip": 1.02166355, "balance_loss_mlp": 1.02398503, "epoch": 0.22582293702089282, "flos": 24601023999360.0, "grad_norm": 2.3243880735699665, "language_loss": 0.79018605, "learning_rate": 3.6110465151792547e-06, "loss": 0.8116343, "num_input_tokens_seen": 80806335, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.5703125, "step": 3756, "time_per_iteration": 2.4097156524658203 }, { "auxiliary_loss_clip": 0.01078497, "auxiliary_loss_mlp": 0.01058682, "balance_loss_clip": 1.01822233, "balance_loss_mlp": 1.02217853, "epoch": 0.2258830602735608, "flos": 23034436116480.0, "grad_norm": 1.8030275551302184, "language_loss": 0.84123921, "learning_rate": 3.6108157036017916e-06, "loss": 0.86261106, "num_input_tokens_seen": 80825355, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.5625, "step": 3757, "time_per_iteration": 2.391437530517578 }, { "auxiliary_loss_clip": 0.01079585, "auxiliary_loss_mlp": 0.01057481, "balance_loss_clip": 1.01556683, "balance_loss_mlp": 1.02355886, "epoch": 0.22594318352622877, "flos": 22157487786240.0, "grad_norm": 1.7585797077820955, "language_loss": 0.74257922, "learning_rate": 3.6105848309420358e-06, "loss": 0.76394987, "num_input_tokens_seen": 80842570, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.5625, "step": 3758, "time_per_iteration": 2.3796911239624023 }, { "auxiliary_loss_clip": 0.01078886, "auxiliary_loss_mlp": 0.01065148, "balance_loss_clip": 1.02261448, "balance_loss_mlp": 1.02268088, "epoch": 0.22600330677889674, "flos": 20593273875840.0, "grad_norm": 2.2120709841464894, "language_loss": 0.78754312, "learning_rate": 3.6103538972087412e-06, "loss": 0.8089835, "num_input_tokens_seen": 80858745, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.5625, "step": 3759, "time_per_iteration": 2.366210460662842 }, { "auxiliary_loss_clip": 0.01077351, "auxiliary_loss_mlp": 0.01060623, "balance_loss_clip": 1.01825571, "balance_loss_mlp": 1.02076125, "epoch": 0.2260634300315647, "flos": 35658436118400.0, "grad_norm": 1.6400533011059164, "language_loss": 0.79728132, "learning_rate": 3.6101229024106655e-06, "loss": 0.81866109, "num_input_tokens_seen": 80880085, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 0.56640625, "step": 3760, "time_per_iteration": 2.527611255645752 }, { "auxiliary_loss_clip": 0.0102277, "auxiliary_loss_mlp": 0.01012322, "balance_loss_clip": 1.00726736, "balance_loss_mlp": 1.00644112, "epoch": 0.22612355328423267, "flos": 72087579699840.0, "grad_norm": 0.9653545928513323, "language_loss": 0.60138416, "learning_rate": 3.609891846556569e-06, "loss": 0.6217351, "num_input_tokens_seen": 80937660, "router_z_loss_clip": 0.05053711, "router_z_loss_mlp": 0.16308594, "step": 3761, "time_per_iteration": 2.967061758041382 }, { "auxiliary_loss_clip": 0.01081928, "auxiliary_loss_mlp": 0.01066662, "balance_loss_clip": 1.02169621, "balance_loss_mlp": 1.02362823, "epoch": 0.22618367653690064, "flos": 22782677235840.0, "grad_norm": 2.099608677852756, "language_loss": 0.79281557, "learning_rate": 3.609660729655211e-06, "loss": 0.81430149, "num_input_tokens_seen": 80956265, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 0.58203125, "step": 3762, "time_per_iteration": 2.4093644618988037 }, { "auxiliary_loss_clip": 0.0108189, "auxiliary_loss_mlp": 0.01066698, "balance_loss_clip": 1.02425909, "balance_loss_mlp": 1.02402282, "epoch": 0.22624379978956863, "flos": 20447232192000.0, "grad_norm": 2.1827158605893775, "language_loss": 0.80767208, "learning_rate": 3.6094295517153573e-06, "loss": 0.82915801, "num_input_tokens_seen": 80975185, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.578125, "step": 3763, "time_per_iteration": 2.389605760574341 }, { "auxiliary_loss_clip": 0.01081098, "auxiliary_loss_mlp": 0.0107746, "balance_loss_clip": 1.03158855, "balance_loss_mlp": 1.02301133, "epoch": 0.2263039230422366, "flos": 17493335591040.0, "grad_norm": 1.748025847668226, "language_loss": 0.92769641, "learning_rate": 3.6091983127457743e-06, "loss": 0.94928199, "num_input_tokens_seen": 80992830, "router_z_loss_clip": 0.45898438, "router_z_loss_mlp": 0.58203125, "step": 3764, "time_per_iteration": 2.4005534648895264 }, { "auxiliary_loss_clip": 0.01078018, "auxiliary_loss_mlp": 0.01071112, "balance_loss_clip": 1.03198719, "balance_loss_mlp": 1.02362823, "epoch": 0.22636404629490456, "flos": 28328490794880.0, "grad_norm": 2.4298426257038908, "language_loss": 0.76489854, "learning_rate": 3.6089670127552293e-06, "loss": 0.78638983, "num_input_tokens_seen": 81013675, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.54296875, "step": 3765, "time_per_iteration": 2.4509835243225098 }, { "auxiliary_loss_clip": 0.0107913, "auxiliary_loss_mlp": 0.01072647, "balance_loss_clip": 1.0311389, "balance_loss_mlp": 1.0232811, "epoch": 0.22642416954757252, "flos": 17488308355200.0, "grad_norm": 3.6673798661362746, "language_loss": 0.92181659, "learning_rate": 3.608735651752494e-06, "loss": 0.94333434, "num_input_tokens_seen": 81030345, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.55859375, "step": 3766, "time_per_iteration": 2.4195759296417236 }, { "auxiliary_loss_clip": 0.01074924, "auxiliary_loss_mlp": 0.01071165, "balance_loss_clip": 1.02956128, "balance_loss_mlp": 1.02158248, "epoch": 0.2264842928002405, "flos": 24383515029120.0, "grad_norm": 1.5377269365882533, "language_loss": 0.76294994, "learning_rate": 3.6085042297463417e-06, "loss": 0.78441083, "num_input_tokens_seen": 81051000, "router_z_loss_clip": 0.41601562, "router_z_loss_mlp": 0.53515625, "step": 3767, "time_per_iteration": 2.4248480796813965 }, { "auxiliary_loss_clip": 0.01077395, "auxiliary_loss_mlp": 0.01067837, "balance_loss_clip": 1.02473116, "balance_loss_mlp": 1.02135706, "epoch": 0.22654441605290845, "flos": 19829443950720.0, "grad_norm": 1.5689595315607066, "language_loss": 0.72895324, "learning_rate": 3.6082727467455477e-06, "loss": 0.75040555, "num_input_tokens_seen": 81071205, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.5625, "step": 3768, "time_per_iteration": 2.424954891204834 }, { "auxiliary_loss_clip": 0.01078614, "auxiliary_loss_mlp": 0.01073538, "balance_loss_clip": 1.03362703, "balance_loss_mlp": 1.02361655, "epoch": 0.22660453930557642, "flos": 27453322944000.0, "grad_norm": 1.7506208268526953, "language_loss": 0.80012155, "learning_rate": 3.6080412027588905e-06, "loss": 0.82164311, "num_input_tokens_seen": 81091880, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.55078125, "step": 3769, "time_per_iteration": 2.4324588775634766 }, { "auxiliary_loss_clip": 0.010791, "auxiliary_loss_mlp": 0.01064473, "balance_loss_clip": 1.02346492, "balance_loss_mlp": 1.02205062, "epoch": 0.2266646625582444, "flos": 23987006184960.0, "grad_norm": 1.7148365973901927, "language_loss": 0.70762956, "learning_rate": 3.6078095977951488e-06, "loss": 0.7290653, "num_input_tokens_seen": 81113290, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.5703125, "step": 3770, "time_per_iteration": 2.4687676429748535 }, { "auxiliary_loss_clip": 0.01080765, "auxiliary_loss_mlp": 0.01071736, "balance_loss_clip": 1.03230202, "balance_loss_mlp": 1.02441049, "epoch": 0.22672478581091238, "flos": 26026946547840.0, "grad_norm": 1.4973181902171637, "language_loss": 0.81425035, "learning_rate": 3.6075779318631067e-06, "loss": 0.83577543, "num_input_tokens_seen": 81133535, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.5625, "step": 3771, "time_per_iteration": 2.4268112182617188 }, { "auxiliary_loss_clip": 0.01077122, "auxiliary_loss_mlp": 0.01060338, "balance_loss_clip": 1.02197647, "balance_loss_mlp": 1.02422047, "epoch": 0.22678490906358034, "flos": 23840685210240.0, "grad_norm": 1.7325538981947635, "language_loss": 0.79400682, "learning_rate": 3.6073462049715486e-06, "loss": 0.81538135, "num_input_tokens_seen": 81154650, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.52734375, "step": 3772, "time_per_iteration": 3.8738229274749756 }, { "auxiliary_loss_clip": 0.0102236, "auxiliary_loss_mlp": 0.01016896, "balance_loss_clip": 1.01136458, "balance_loss_mlp": 1.00635552, "epoch": 0.2268450323162483, "flos": 65044409351040.0, "grad_norm": 0.6596604093680121, "language_loss": 0.54462755, "learning_rate": 3.607114417129261e-06, "loss": 0.56502008, "num_input_tokens_seen": 81221240, "router_z_loss_clip": 0.05541992, "router_z_loss_mlp": 0.16015625, "step": 3773, "time_per_iteration": 3.114429473876953 }, { "auxiliary_loss_clip": 0.01077899, "auxiliary_loss_mlp": 0.01058079, "balance_loss_clip": 1.02019453, "balance_loss_mlp": 1.02411282, "epoch": 0.22690515556891627, "flos": 22525053246720.0, "grad_norm": 1.7139651221697103, "language_loss": 0.71583951, "learning_rate": 3.6068825683450334e-06, "loss": 0.73719925, "num_input_tokens_seen": 81241520, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5390625, "step": 3774, "time_per_iteration": 4.066363573074341 }, { "auxiliary_loss_clip": 0.01079371, "auxiliary_loss_mlp": 0.01061415, "balance_loss_clip": 1.02207589, "balance_loss_mlp": 1.02534449, "epoch": 0.22696527882158424, "flos": 18222461758080.0, "grad_norm": 2.00572599222934, "language_loss": 0.7539171, "learning_rate": 3.606650658627658e-06, "loss": 0.775325, "num_input_tokens_seen": 81256825, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.5390625, "step": 3775, "time_per_iteration": 3.7913928031921387 }, { "auxiliary_loss_clip": 0.01080676, "auxiliary_loss_mlp": 0.01053977, "balance_loss_clip": 1.01606894, "balance_loss_mlp": 1.02507305, "epoch": 0.22702540207425223, "flos": 17018307365760.0, "grad_norm": 2.018726172943752, "language_loss": 0.84216273, "learning_rate": 3.606418687985928e-06, "loss": 0.86350924, "num_input_tokens_seen": 81275695, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5546875, "step": 3776, "time_per_iteration": 3.7953555583953857 }, { "auxiliary_loss_clip": 0.01080264, "auxiliary_loss_mlp": 0.0106147, "balance_loss_clip": 1.02117705, "balance_loss_mlp": 1.0243696, "epoch": 0.2270855253269202, "flos": 21324634369920.0, "grad_norm": 2.144313083575548, "language_loss": 0.84432113, "learning_rate": 3.606186656428641e-06, "loss": 0.86573851, "num_input_tokens_seen": 81294920, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.5546875, "step": 3777, "time_per_iteration": 2.4127323627471924 }, { "auxiliary_loss_clip": 0.01080335, "auxiliary_loss_mlp": 0.01064941, "balance_loss_clip": 1.02836788, "balance_loss_mlp": 1.02578938, "epoch": 0.22714564857958816, "flos": 23549334981120.0, "grad_norm": 1.8445656371394266, "language_loss": 0.74792439, "learning_rate": 3.6059545639645955e-06, "loss": 0.76937711, "num_input_tokens_seen": 81314275, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.546875, "step": 3778, "time_per_iteration": 2.4186604022979736 }, { "auxiliary_loss_clip": 0.01079663, "auxiliary_loss_mlp": 0.01065255, "balance_loss_clip": 1.02727509, "balance_loss_mlp": 1.02390528, "epoch": 0.22720577183225613, "flos": 25988821476480.0, "grad_norm": 2.479747366543403, "language_loss": 0.67030466, "learning_rate": 3.605722410602591e-06, "loss": 0.69175386, "num_input_tokens_seen": 81333890, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.55859375, "step": 3779, "time_per_iteration": 2.439324140548706 }, { "auxiliary_loss_clip": 0.01077739, "auxiliary_loss_mlp": 0.01066768, "balance_loss_clip": 1.03136313, "balance_loss_mlp": 1.02465129, "epoch": 0.2272658950849241, "flos": 20813017173120.0, "grad_norm": 1.735996973023303, "language_loss": 0.72194207, "learning_rate": 3.6054901963514323e-06, "loss": 0.7433871, "num_input_tokens_seen": 81353640, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.53125, "step": 3780, "time_per_iteration": 2.392807960510254 }, { "auxiliary_loss_clip": 0.01078731, "auxiliary_loss_mlp": 0.01068096, "balance_loss_clip": 1.02906656, "balance_loss_mlp": 1.02450752, "epoch": 0.22732601833759206, "flos": 23908347158400.0, "grad_norm": 2.6206620092282016, "language_loss": 0.90543306, "learning_rate": 3.6052579212199246e-06, "loss": 0.92690134, "num_input_tokens_seen": 81371595, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.54296875, "step": 3781, "time_per_iteration": 2.4279842376708984 }, { "auxiliary_loss_clip": 0.01077243, "auxiliary_loss_mlp": 0.0107685, "balance_loss_clip": 1.03860784, "balance_loss_mlp": 1.02314115, "epoch": 0.22738614159026002, "flos": 15923500951680.0, "grad_norm": 2.370614931144449, "language_loss": 0.76325476, "learning_rate": 3.6050255852168753e-06, "loss": 0.78479564, "num_input_tokens_seen": 81388435, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.5390625, "step": 3782, "time_per_iteration": 2.3630990982055664 }, { "auxiliary_loss_clip": 0.01072768, "auxiliary_loss_mlp": 0.01063222, "balance_loss_clip": 1.03053451, "balance_loss_mlp": 1.02091706, "epoch": 0.22744626484292801, "flos": 24204410421120.0, "grad_norm": 1.4296595950945508, "language_loss": 0.83800763, "learning_rate": 3.604793188351095e-06, "loss": 0.85936755, "num_input_tokens_seen": 81410195, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.51953125, "step": 3783, "time_per_iteration": 2.449126958847046 }, { "auxiliary_loss_clip": 0.01074811, "auxiliary_loss_mlp": 0.01079996, "balance_loss_clip": 1.04041862, "balance_loss_mlp": 1.02110767, "epoch": 0.22750638809559598, "flos": 24790427458560.0, "grad_norm": 11.897368688239734, "language_loss": 0.77707493, "learning_rate": 3.6045607306313964e-06, "loss": 0.79862297, "num_input_tokens_seen": 81430060, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.53515625, "step": 3784, "time_per_iteration": 2.4196720123291016 }, { "auxiliary_loss_clip": 0.01072876, "auxiliary_loss_mlp": 0.01063075, "balance_loss_clip": 1.02802753, "balance_loss_mlp": 1.02039123, "epoch": 0.22756651134826394, "flos": 22235413674240.0, "grad_norm": 1.570363928891433, "language_loss": 0.71642983, "learning_rate": 3.604328212066594e-06, "loss": 0.73778939, "num_input_tokens_seen": 81447375, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.52734375, "step": 3785, "time_per_iteration": 2.3873963356018066 }, { "auxiliary_loss_clip": 0.01022912, "auxiliary_loss_mlp": 0.01137434, "balance_loss_clip": 1.1314497, "balance_loss_mlp": 1.00831246, "epoch": 0.2276266346009319, "flos": 62704006894080.0, "grad_norm": 0.9088014436395508, "language_loss": 0.6195038, "learning_rate": 3.6040956326655047e-06, "loss": 0.64110726, "num_input_tokens_seen": 81505235, "router_z_loss_clip": 0.05981445, "router_z_loss_mlp": 0.14648438, "step": 3786, "time_per_iteration": 3.0614724159240723 }, { "auxiliary_loss_clip": 0.0108226, "auxiliary_loss_mlp": 0.01060798, "balance_loss_clip": 1.02217436, "balance_loss_mlp": 1.0263077, "epoch": 0.22768675785359987, "flos": 18613245139200.0, "grad_norm": 2.7531578060162145, "language_loss": 0.88978052, "learning_rate": 3.6038629924369486e-06, "loss": 0.91121113, "num_input_tokens_seen": 81518685, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.55859375, "step": 3787, "time_per_iteration": 2.4823479652404785 }, { "auxiliary_loss_clip": 0.01079054, "auxiliary_loss_mlp": 0.01055273, "balance_loss_clip": 1.02144182, "balance_loss_mlp": 1.02721858, "epoch": 0.22774688110626784, "flos": 26868981651840.0, "grad_norm": 1.3316552857065989, "language_loss": 0.74358332, "learning_rate": 3.6036302913897474e-06, "loss": 0.76492661, "num_input_tokens_seen": 81538940, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.51953125, "step": 3788, "time_per_iteration": 2.4530677795410156 }, { "auxiliary_loss_clip": 0.01083123, "auxiliary_loss_mlp": 0.01061204, "balance_loss_clip": 1.02510715, "balance_loss_mlp": 1.02870059, "epoch": 0.2278070043589358, "flos": 15552863291520.0, "grad_norm": 2.2482902875850375, "language_loss": 0.69660223, "learning_rate": 3.6033975295327243e-06, "loss": 0.71804541, "num_input_tokens_seen": 81555525, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.546875, "step": 3789, "time_per_iteration": 2.5739314556121826 }, { "auxiliary_loss_clip": 0.01083261, "auxiliary_loss_mlp": 0.01075228, "balance_loss_clip": 1.0384872, "balance_loss_mlp": 1.03025842, "epoch": 0.2278671276116038, "flos": 22415775091200.0, "grad_norm": 1.9028082488207625, "language_loss": 0.7663188, "learning_rate": 3.6031647068747065e-06, "loss": 0.78790361, "num_input_tokens_seen": 81576305, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.53125, "step": 3790, "time_per_iteration": 2.5688347816467285 }, { "auxiliary_loss_clip": 0.01082448, "auxiliary_loss_mlp": 0.01078365, "balance_loss_clip": 1.04400921, "balance_loss_mlp": 1.0302856, "epoch": 0.22792725086427176, "flos": 20630316695040.0, "grad_norm": 1.9588276168581793, "language_loss": 0.92733955, "learning_rate": 3.602931823424522e-06, "loss": 0.94894767, "num_input_tokens_seen": 81594115, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.5234375, "step": 3791, "time_per_iteration": 2.4066824913024902 }, { "auxiliary_loss_clip": 0.01085036, "auxiliary_loss_mlp": 0.01082148, "balance_loss_clip": 1.04581261, "balance_loss_mlp": 1.0305692, "epoch": 0.22798737411693973, "flos": 31427661029760.0, "grad_norm": 1.9173723442049408, "language_loss": 0.84493244, "learning_rate": 3.6026988791910026e-06, "loss": 0.86660421, "num_input_tokens_seen": 81615355, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.54296875, "step": 3792, "time_per_iteration": 2.6513619422912598 }, { "auxiliary_loss_clip": 0.01052317, "auxiliary_loss_mlp": 0.01100871, "balance_loss_clip": 1.09550703, "balance_loss_mlp": 1.03480268, "epoch": 0.2280474973696077, "flos": 52394121095040.0, "grad_norm": 1.1981163944147672, "language_loss": 0.65805137, "learning_rate": 3.602465874182981e-06, "loss": 0.67958331, "num_input_tokens_seen": 81662075, "router_z_loss_clip": 0.05371094, "router_z_loss_mlp": 0.17578125, "step": 3793, "time_per_iteration": 2.80916166305542 }, { "auxiliary_loss_clip": 0.0108691, "auxiliary_loss_mlp": 0.0110086, "balance_loss_clip": 1.06173587, "balance_loss_mlp": 1.02894855, "epoch": 0.22810762062227566, "flos": 26394861121920.0, "grad_norm": 1.8279853037267464, "language_loss": 0.79174948, "learning_rate": 3.602232808409293e-06, "loss": 0.81362718, "num_input_tokens_seen": 81681625, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.578125, "step": 3794, "time_per_iteration": 2.4345741271972656 }, { "auxiliary_loss_clip": 0.01080164, "auxiliary_loss_mlp": 0.01068714, "balance_loss_clip": 1.03488255, "balance_loss_mlp": 1.02650511, "epoch": 0.22816774387494362, "flos": 25629076160640.0, "grad_norm": 1.9275908242812172, "language_loss": 0.8207283, "learning_rate": 3.6019996818787755e-06, "loss": 0.84221709, "num_input_tokens_seen": 81701170, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.53515625, "step": 3795, "time_per_iteration": 2.4776947498321533 }, { "auxiliary_loss_clip": 0.0107757, "auxiliary_loss_mlp": 0.01079634, "balance_loss_clip": 1.04401445, "balance_loss_mlp": 1.02432489, "epoch": 0.22822786712761162, "flos": 22450618494720.0, "grad_norm": 2.495370410469278, "language_loss": 0.78674644, "learning_rate": 3.6017664946002704e-06, "loss": 0.8083185, "num_input_tokens_seen": 81721265, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.53125, "step": 3796, "time_per_iteration": 2.4067163467407227 }, { "auxiliary_loss_clip": 0.01077288, "auxiliary_loss_mlp": 0.01066944, "balance_loss_clip": 1.03492391, "balance_loss_mlp": 1.02462089, "epoch": 0.22828799038027958, "flos": 12201759619200.0, "grad_norm": 2.4594310372463206, "language_loss": 0.97052014, "learning_rate": 3.6015332465826188e-06, "loss": 0.99196243, "num_input_tokens_seen": 81736565, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.52734375, "step": 3797, "time_per_iteration": 2.412616491317749 }, { "auxiliary_loss_clip": 0.0107689, "auxiliary_loss_mlp": 0.01064951, "balance_loss_clip": 1.03028536, "balance_loss_mlp": 1.0254519, "epoch": 0.22834811363294755, "flos": 22084763690880.0, "grad_norm": 2.1766701571844163, "language_loss": 0.82676709, "learning_rate": 3.601299937834666e-06, "loss": 0.84818554, "num_input_tokens_seen": 81756240, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.515625, "step": 3798, "time_per_iteration": 2.506627082824707 }, { "auxiliary_loss_clip": 0.0108, "auxiliary_loss_mlp": 0.01061674, "balance_loss_clip": 1.02419424, "balance_loss_mlp": 1.02669489, "epoch": 0.2284082368856155, "flos": 24859555683840.0, "grad_norm": 1.8327463962727781, "language_loss": 0.80652761, "learning_rate": 3.6010665683652596e-06, "loss": 0.8279444, "num_input_tokens_seen": 81775720, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.53125, "step": 3799, "time_per_iteration": 2.5439326763153076 }, { "auxiliary_loss_clip": 0.01082891, "auxiliary_loss_mlp": 0.01053404, "balance_loss_clip": 1.02004957, "balance_loss_mlp": 1.02948916, "epoch": 0.22846836013828348, "flos": 23291815726080.0, "grad_norm": 1.7479429438928942, "language_loss": 0.77442139, "learning_rate": 3.6008331381832484e-06, "loss": 0.79578435, "num_input_tokens_seen": 81795830, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.53515625, "step": 3800, "time_per_iteration": 2.574922800064087 }, { "auxiliary_loss_clip": 0.01083571, "auxiliary_loss_mlp": 0.01051859, "balance_loss_clip": 1.01964855, "balance_loss_mlp": 1.03085637, "epoch": 0.22852848339095144, "flos": 27415093138560.0, "grad_norm": 1.9179511185760247, "language_loss": 0.65726745, "learning_rate": 3.600599647297484e-06, "loss": 0.67862177, "num_input_tokens_seen": 81815745, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.52734375, "step": 3801, "time_per_iteration": 2.635030508041382 }, { "auxiliary_loss_clip": 0.01087872, "auxiliary_loss_mlp": 0.01054117, "balance_loss_clip": 1.02266932, "balance_loss_mlp": 1.03665161, "epoch": 0.2285886066436194, "flos": 26320007433600.0, "grad_norm": 1.5476234461322504, "language_loss": 0.81905222, "learning_rate": 3.60036609571682e-06, "loss": 0.8404721, "num_input_tokens_seen": 81835155, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.51171875, "step": 3802, "time_per_iteration": 2.5267748832702637 }, { "auxiliary_loss_clip": 0.01089294, "auxiliary_loss_mlp": 0.01071616, "balance_loss_clip": 1.03258693, "balance_loss_mlp": 1.03399134, "epoch": 0.2286487298962874, "flos": 29715171108480.0, "grad_norm": 2.0171383866546484, "language_loss": 0.79933929, "learning_rate": 3.600132483450114e-06, "loss": 0.82094842, "num_input_tokens_seen": 81855655, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.5546875, "step": 3803, "time_per_iteration": 2.535651445388794 }, { "auxiliary_loss_clip": 0.01091819, "auxiliary_loss_mlp": 0.0105943, "balance_loss_clip": 1.02302325, "balance_loss_mlp": 1.03698051, "epoch": 0.22870885314895537, "flos": 21286160184960.0, "grad_norm": 2.1772108885169725, "language_loss": 0.86666542, "learning_rate": 3.5998988105062235e-06, "loss": 0.88817799, "num_input_tokens_seen": 81876385, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.546875, "step": 3804, "time_per_iteration": 2.467409372329712 }, { "auxiliary_loss_clip": 0.01091565, "auxiliary_loss_mlp": 0.01064342, "balance_loss_clip": 1.02822185, "balance_loss_mlp": 1.03576231, "epoch": 0.22876897640162333, "flos": 14938566186240.0, "grad_norm": 2.046465337724459, "language_loss": 0.78884363, "learning_rate": 3.59966507689401e-06, "loss": 0.81040269, "num_input_tokens_seen": 81893225, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.55859375, "step": 3805, "time_per_iteration": 2.3974766731262207 }, { "auxiliary_loss_clip": 0.01091523, "auxiliary_loss_mlp": 0.01059099, "balance_loss_clip": 1.02166772, "balance_loss_mlp": 1.03579283, "epoch": 0.2288290996542913, "flos": 18112939223040.0, "grad_norm": 5.759009039180327, "language_loss": 0.81158751, "learning_rate": 3.5994312826223363e-06, "loss": 0.83309376, "num_input_tokens_seen": 81911350, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.55859375, "step": 3806, "time_per_iteration": 2.3993659019470215 }, { "auxiliary_loss_clip": 0.01093333, "auxiliary_loss_mlp": 0.01067699, "balance_loss_clip": 1.03365278, "balance_loss_mlp": 1.03891003, "epoch": 0.22888922290695926, "flos": 39853983778560.0, "grad_norm": 3.435165574401829, "language_loss": 0.72109228, "learning_rate": 3.5991974277000684e-06, "loss": 0.7427026, "num_input_tokens_seen": 81935420, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.546875, "step": 3807, "time_per_iteration": 2.6026110649108887 }, { "auxiliary_loss_clip": 0.01093624, "auxiliary_loss_mlp": 0.01059472, "balance_loss_clip": 1.02177811, "balance_loss_mlp": 1.03782499, "epoch": 0.22894934615962723, "flos": 23402664892800.0, "grad_norm": 2.2443196715142735, "language_loss": 0.6716392, "learning_rate": 3.5989635121360733e-06, "loss": 0.69317019, "num_input_tokens_seen": 81953845, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.5546875, "step": 3808, "time_per_iteration": 2.4397480487823486 }, { "auxiliary_loss_clip": 0.01090073, "auxiliary_loss_mlp": 0.01056719, "balance_loss_clip": 1.02150416, "balance_loss_mlp": 1.0351932, "epoch": 0.22900946941229522, "flos": 18842030478720.0, "grad_norm": 2.1241815276886, "language_loss": 0.76037002, "learning_rate": 3.598729535939222e-06, "loss": 0.78183794, "num_input_tokens_seen": 81972100, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.546875, "step": 3809, "time_per_iteration": 2.4108328819274902 }, { "auxiliary_loss_clip": 0.01087021, "auxiliary_loss_mlp": 0.01063797, "balance_loss_clip": 1.02946448, "balance_loss_mlp": 1.03298903, "epoch": 0.22906959266496318, "flos": 22928299983360.0, "grad_norm": 1.5071166635324513, "language_loss": 0.82967126, "learning_rate": 3.5984954991183862e-06, "loss": 0.85117948, "num_input_tokens_seen": 81992760, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.5390625, "step": 3810, "time_per_iteration": 2.4223828315734863 }, { "auxiliary_loss_clip": 0.01080442, "auxiliary_loss_mlp": 0.01055736, "balance_loss_clip": 1.02216661, "balance_loss_mlp": 1.02760851, "epoch": 0.22912971591763115, "flos": 19353508030080.0, "grad_norm": 2.2565356386753948, "language_loss": 0.79861641, "learning_rate": 3.598261401682441e-06, "loss": 0.81997824, "num_input_tokens_seen": 82009080, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.52734375, "step": 3811, "time_per_iteration": 3.8563482761383057 }, { "auxiliary_loss_clip": 0.01083473, "auxiliary_loss_mlp": 0.01066743, "balance_loss_clip": 1.02976441, "balance_loss_mlp": 1.02873874, "epoch": 0.22918983917029911, "flos": 19932647529600.0, "grad_norm": 1.6796878744160415, "language_loss": 0.83922154, "learning_rate": 3.5980272436402632e-06, "loss": 0.86072367, "num_input_tokens_seen": 82026705, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.546875, "step": 3812, "time_per_iteration": 2.393127202987671 }, { "auxiliary_loss_clip": 0.01083805, "auxiliary_loss_mlp": 0.01059066, "balance_loss_clip": 1.02258813, "balance_loss_mlp": 1.02780664, "epoch": 0.22924996242296708, "flos": 16689949228800.0, "grad_norm": 2.5196303163533735, "language_loss": 0.85880673, "learning_rate": 3.5977930250007324e-06, "loss": 0.88023543, "num_input_tokens_seen": 82043245, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.55859375, "step": 3813, "time_per_iteration": 2.473724126815796 }, { "auxiliary_loss_clip": 0.01077553, "auxiliary_loss_mlp": 0.01058769, "balance_loss_clip": 1.02226758, "balance_loss_mlp": 1.02354074, "epoch": 0.22931008567563504, "flos": 33034782867840.0, "grad_norm": 1.5347605163130804, "language_loss": 0.71178246, "learning_rate": 3.5975587457727298e-06, "loss": 0.73314559, "num_input_tokens_seen": 82066870, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.5390625, "step": 3814, "time_per_iteration": 3.988420248031616 }, { "auxiliary_loss_clip": 0.01073868, "auxiliary_loss_mlp": 0.01057143, "balance_loss_clip": 1.02085614, "balance_loss_mlp": 1.02183843, "epoch": 0.229370208928303, "flos": 23329591683840.0, "grad_norm": 2.674126582663077, "language_loss": 0.68373144, "learning_rate": 3.597324405965139e-06, "loss": 0.70504153, "num_input_tokens_seen": 82083180, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.51953125, "step": 3815, "time_per_iteration": 5.24737548828125 }, { "auxiliary_loss_clip": 0.01078775, "auxiliary_loss_mlp": 0.01077661, "balance_loss_clip": 1.03834581, "balance_loss_mlp": 1.02509201, "epoch": 0.229430332180971, "flos": 28616070597120.0, "grad_norm": 1.5676399597643875, "language_loss": 0.84253085, "learning_rate": 3.597090005586848e-06, "loss": 0.86409521, "num_input_tokens_seen": 82102950, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.5390625, "step": 3816, "time_per_iteration": 2.4860639572143555 }, { "auxiliary_loss_clip": 0.01076402, "auxiliary_loss_mlp": 0.01059769, "balance_loss_clip": 1.02271843, "balance_loss_mlp": 1.02305841, "epoch": 0.22949045543363897, "flos": 17237247701760.0, "grad_norm": 2.5632249904248177, "language_loss": 0.88786429, "learning_rate": 3.596855544646742e-06, "loss": 0.909226, "num_input_tokens_seen": 82119510, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.53515625, "step": 3817, "time_per_iteration": 2.395220994949341 }, { "auxiliary_loss_clip": 0.01082759, "auxiliary_loss_mlp": 0.01058699, "balance_loss_clip": 1.0239141, "balance_loss_mlp": 1.02735281, "epoch": 0.22955057868630693, "flos": 27488236170240.0, "grad_norm": 1.6143322631932495, "language_loss": 0.76532567, "learning_rate": 3.5966210231537154e-06, "loss": 0.78674024, "num_input_tokens_seen": 82140095, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.5546875, "step": 3818, "time_per_iteration": 2.4864964485168457 }, { "auxiliary_loss_clip": 0.0108143, "auxiliary_loss_mlp": 0.01057728, "balance_loss_clip": 1.02034402, "balance_loss_mlp": 1.02562475, "epoch": 0.2296107019389749, "flos": 23475319165440.0, "grad_norm": 1.6822920567235238, "language_loss": 0.76447725, "learning_rate": 3.596386441116659e-06, "loss": 0.78586888, "num_input_tokens_seen": 82159510, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.55859375, "step": 3819, "time_per_iteration": 2.433497905731201 }, { "auxiliary_loss_clip": 0.0108093, "auxiliary_loss_mlp": 0.01054437, "balance_loss_clip": 1.02110624, "balance_loss_mlp": 1.02717686, "epoch": 0.22967082519164286, "flos": 31283818761600.0, "grad_norm": 1.7059456985632468, "language_loss": 0.81796867, "learning_rate": 3.5961517985444684e-06, "loss": 0.83932233, "num_input_tokens_seen": 82179580, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.5390625, "step": 3820, "time_per_iteration": 2.4901673793792725 }, { "auxiliary_loss_clip": 0.01083998, "auxiliary_loss_mlp": 0.01057688, "balance_loss_clip": 1.0202086, "balance_loss_mlp": 1.02829921, "epoch": 0.22973094844431083, "flos": 14642188721280.0, "grad_norm": 2.2725665362968512, "language_loss": 0.71303183, "learning_rate": 3.595917095446042e-06, "loss": 0.73444867, "num_input_tokens_seen": 82195585, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.55859375, "step": 3821, "time_per_iteration": 2.3944218158721924 }, { "auxiliary_loss_clip": 0.01080394, "auxiliary_loss_mlp": 0.01047537, "balance_loss_clip": 1.01527905, "balance_loss_mlp": 1.02829242, "epoch": 0.2297910716969788, "flos": 22822652609280.0, "grad_norm": 1.4576916618847127, "language_loss": 0.84348977, "learning_rate": 3.5956823318302796e-06, "loss": 0.8647691, "num_input_tokens_seen": 82217530, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.51953125, "step": 3822, "time_per_iteration": 2.4481804370880127 }, { "auxiliary_loss_clip": 0.01082642, "auxiliary_loss_mlp": 0.01050669, "balance_loss_clip": 1.01566875, "balance_loss_mlp": 1.02915716, "epoch": 0.2298511949496468, "flos": 23037927252480.0, "grad_norm": 1.4446168406178788, "language_loss": 0.67887318, "learning_rate": 3.5954475077060833e-06, "loss": 0.70020628, "num_input_tokens_seen": 82237980, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.53515625, "step": 3823, "time_per_iteration": 2.4602413177490234 }, { "auxiliary_loss_clip": 0.01044774, "auxiliary_loss_mlp": 0.01018885, "balance_loss_clip": 1.01485598, "balance_loss_mlp": 1.02918291, "epoch": 0.22991131820231475, "flos": 66887684691840.0, "grad_norm": 0.8946945510217301, "language_loss": 0.56779897, "learning_rate": 3.595212623082357e-06, "loss": 0.58843565, "num_input_tokens_seen": 82301785, "router_z_loss_clip": 0.0402832, "router_z_loss_mlp": 0.15625, "step": 3824, "time_per_iteration": 3.1251044273376465 }, { "auxiliary_loss_clip": 0.01083086, "auxiliary_loss_mlp": 0.01062631, "balance_loss_clip": 1.02827525, "balance_loss_mlp": 1.03049159, "epoch": 0.22997144145498272, "flos": 17886492944640.0, "grad_norm": 2.196981468235083, "language_loss": 0.75365782, "learning_rate": 3.594977677968009e-06, "loss": 0.77511501, "num_input_tokens_seen": 82317355, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.5234375, "step": 3825, "time_per_iteration": 2.3993124961853027 }, { "auxiliary_loss_clip": 0.01086892, "auxiliary_loss_mlp": 0.01072959, "balance_loss_clip": 1.03559852, "balance_loss_mlp": 1.03190875, "epoch": 0.23003156470765068, "flos": 24675807864960.0, "grad_norm": 1.7754565771592172, "language_loss": 0.89169484, "learning_rate": 3.5947426723719473e-06, "loss": 0.91329336, "num_input_tokens_seen": 82336645, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.55078125, "step": 3826, "time_per_iteration": 2.472381830215454 }, { "auxiliary_loss_clip": 0.01085286, "auxiliary_loss_mlp": 0.01069287, "balance_loss_clip": 1.03183174, "balance_loss_mlp": 1.02884078, "epoch": 0.23009168796031865, "flos": 15813245278080.0, "grad_norm": 2.2254702792978898, "language_loss": 0.83982635, "learning_rate": 3.594507606303083e-06, "loss": 0.86137211, "num_input_tokens_seen": 82354225, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5625, "step": 3827, "time_per_iteration": 2.396425724029541 }, { "auxiliary_loss_clip": 0.01079954, "auxiliary_loss_mlp": 0.01064881, "balance_loss_clip": 1.03124022, "balance_loss_mlp": 1.02825069, "epoch": 0.2301518112129866, "flos": 16212023360640.0, "grad_norm": 1.7662975057166757, "language_loss": 0.88212711, "learning_rate": 3.5942724797703314e-06, "loss": 0.90357548, "num_input_tokens_seen": 82370240, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.515625, "step": 3828, "time_per_iteration": 2.3893747329711914 }, { "auxiliary_loss_clip": 0.01081232, "auxiliary_loss_mlp": 0.01082532, "balance_loss_clip": 1.04786634, "balance_loss_mlp": 1.02656078, "epoch": 0.2302119344656546, "flos": 20594391039360.0, "grad_norm": 1.9307601830250327, "language_loss": 0.72571397, "learning_rate": 3.594037292782607e-06, "loss": 0.74735165, "num_input_tokens_seen": 82389145, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.546875, "step": 3829, "time_per_iteration": 2.4291698932647705 }, { "auxiliary_loss_clip": 0.01077458, "auxiliary_loss_mlp": 0.01075462, "balance_loss_clip": 1.04329932, "balance_loss_mlp": 1.02562404, "epoch": 0.23027205771832257, "flos": 26795698974720.0, "grad_norm": 1.6759985216374191, "language_loss": 0.8572855, "learning_rate": 3.5938020453488293e-06, "loss": 0.87881464, "num_input_tokens_seen": 82409185, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.51953125, "step": 3830, "time_per_iteration": 2.479471206665039 }, { "auxiliary_loss_clip": 0.01076807, "auxiliary_loss_mlp": 0.01080235, "balance_loss_clip": 1.04595089, "balance_loss_mlp": 1.02388465, "epoch": 0.23033218097099054, "flos": 43871439260160.0, "grad_norm": 1.7005254020765854, "language_loss": 0.68420684, "learning_rate": 3.5935667374779177e-06, "loss": 0.70577729, "num_input_tokens_seen": 82432070, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.52734375, "step": 3831, "time_per_iteration": 2.6173925399780273 }, { "auxiliary_loss_clip": 0.01078386, "auxiliary_loss_mlp": 0.01085482, "balance_loss_clip": 1.04945683, "balance_loss_mlp": 1.02323794, "epoch": 0.2303923042236585, "flos": 26066468073600.0, "grad_norm": 3.3710234354371855, "language_loss": 0.77231652, "learning_rate": 3.5933313691787957e-06, "loss": 0.79395521, "num_input_tokens_seen": 82450625, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.55078125, "step": 3832, "time_per_iteration": 2.4454007148742676 }, { "auxiliary_loss_clip": 0.01077886, "auxiliary_loss_mlp": 0.01072717, "balance_loss_clip": 1.03628659, "balance_loss_mlp": 1.02339613, "epoch": 0.23045242747632647, "flos": 18295395321600.0, "grad_norm": 1.7456724105041774, "language_loss": 0.88826501, "learning_rate": 3.593095940460389e-06, "loss": 0.90977097, "num_input_tokens_seen": 82468575, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.54296875, "step": 3833, "time_per_iteration": 2.3832011222839355 }, { "auxiliary_loss_clip": 0.01077375, "auxiliary_loss_mlp": 0.01073637, "balance_loss_clip": 1.0368011, "balance_loss_mlp": 1.02321827, "epoch": 0.23051255072899443, "flos": 25519344157440.0, "grad_norm": 1.6209488467526094, "language_loss": 0.76802683, "learning_rate": 3.592860451331624e-06, "loss": 0.78953695, "num_input_tokens_seen": 82488655, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.54296875, "step": 3834, "time_per_iteration": 2.4406964778900146 }, { "auxiliary_loss_clip": 0.0107803, "auxiliary_loss_mlp": 0.01071765, "balance_loss_clip": 1.03287911, "balance_loss_mlp": 1.0253247, "epoch": 0.2305726739816624, "flos": 21214134316800.0, "grad_norm": 1.8089720173055888, "language_loss": 0.87640178, "learning_rate": 3.592624901801432e-06, "loss": 0.89789969, "num_input_tokens_seen": 82507220, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.52734375, "step": 3835, "time_per_iteration": 2.4242966175079346 }, { "auxiliary_loss_clip": 0.01083558, "auxiliary_loss_mlp": 0.01074649, "balance_loss_clip": 1.03530979, "balance_loss_mlp": 1.02573037, "epoch": 0.2306327972343304, "flos": 23330010620160.0, "grad_norm": 2.120321685173461, "language_loss": 0.84784555, "learning_rate": 3.5923892918787432e-06, "loss": 0.86942762, "num_input_tokens_seen": 82527920, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.578125, "step": 3836, "time_per_iteration": 2.4211487770080566 }, { "auxiliary_loss_clip": 0.0108259, "auxiliary_loss_mlp": 0.01052278, "balance_loss_clip": 1.01560903, "balance_loss_mlp": 1.02748346, "epoch": 0.23069292048699835, "flos": 20665718680320.0, "grad_norm": 1.58680414190843, "language_loss": 0.81130826, "learning_rate": 3.5921536215724934e-06, "loss": 0.83265698, "num_input_tokens_seen": 82549040, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.55078125, "step": 3837, "time_per_iteration": 2.469966173171997 }, { "auxiliary_loss_clip": 0.01027624, "auxiliary_loss_mlp": 0.01008995, "balance_loss_clip": 1.00482225, "balance_loss_mlp": 1.01326168, "epoch": 0.23075304373966632, "flos": 70451828680320.0, "grad_norm": 0.8979738606755284, "language_loss": 0.65471232, "learning_rate": 3.5919178908916184e-06, "loss": 0.67507851, "num_input_tokens_seen": 82604070, "router_z_loss_clip": 0.04174805, "router_z_loss_mlp": 0.14355469, "step": 3838, "time_per_iteration": 2.972496271133423 }, { "auxiliary_loss_clip": 0.0108291, "auxiliary_loss_mlp": 0.01059192, "balance_loss_clip": 1.02104473, "balance_loss_mlp": 1.02980888, "epoch": 0.23081316699233428, "flos": 16617050576640.0, "grad_norm": 1.8882657201820703, "language_loss": 0.76819861, "learning_rate": 3.591682099845058e-06, "loss": 0.78961968, "num_input_tokens_seen": 82619665, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.53125, "step": 3839, "time_per_iteration": 2.42195463180542 }, { "auxiliary_loss_clip": 0.01085871, "auxiliary_loss_mlp": 0.01056913, "balance_loss_clip": 1.01774132, "balance_loss_mlp": 1.0291419, "epoch": 0.23087329024500225, "flos": 13297229349120.0, "grad_norm": 2.0166163954711616, "language_loss": 0.70947504, "learning_rate": 3.591446248441752e-06, "loss": 0.73090291, "num_input_tokens_seen": 82637530, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.5703125, "step": 3840, "time_per_iteration": 2.3671774864196777 }, { "auxiliary_loss_clip": 0.01086988, "auxiliary_loss_mlp": 0.01061636, "balance_loss_clip": 1.01955509, "balance_loss_mlp": 1.03212357, "epoch": 0.23093341349767021, "flos": 17784755642880.0, "grad_norm": 2.392429705017942, "language_loss": 0.80433583, "learning_rate": 3.591210336690645e-06, "loss": 0.82582206, "num_input_tokens_seen": 82656130, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.546875, "step": 3841, "time_per_iteration": 2.404345989227295 }, { "auxiliary_loss_clip": 0.01081794, "auxiliary_loss_mlp": 0.01060668, "balance_loss_clip": 1.02426171, "balance_loss_mlp": 1.02805543, "epoch": 0.23099353675033818, "flos": 23986936362240.0, "grad_norm": 1.7891897988560148, "language_loss": 0.84517938, "learning_rate": 3.590974364600683e-06, "loss": 0.86660397, "num_input_tokens_seen": 82675295, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.5390625, "step": 3842, "time_per_iteration": 2.420278310775757 }, { "auxiliary_loss_clip": 0.01083304, "auxiliary_loss_mlp": 0.01060791, "balance_loss_clip": 1.02405024, "balance_loss_mlp": 1.02943003, "epoch": 0.23105366000300617, "flos": 35993601970560.0, "grad_norm": 1.4771713114810963, "language_loss": 0.67328256, "learning_rate": 3.5907383321808135e-06, "loss": 0.69472349, "num_input_tokens_seen": 82703260, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.5390625, "step": 3843, "time_per_iteration": 2.691020965576172 }, { "auxiliary_loss_clip": 0.01079702, "auxiliary_loss_mlp": 0.0107002, "balance_loss_clip": 1.03218281, "balance_loss_mlp": 1.02738094, "epoch": 0.23111378325567414, "flos": 31244087767680.0, "grad_norm": 1.7520987629700062, "language_loss": 0.79233742, "learning_rate": 3.590502239439987e-06, "loss": 0.81383467, "num_input_tokens_seen": 82725060, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5234375, "step": 3844, "time_per_iteration": 2.492827892303467 }, { "auxiliary_loss_clip": 0.01081066, "auxiliary_loss_mlp": 0.01066393, "balance_loss_clip": 1.02805495, "balance_loss_mlp": 1.0264852, "epoch": 0.2311739065083421, "flos": 19207221966720.0, "grad_norm": 1.7027254585797362, "language_loss": 0.79021549, "learning_rate": 3.590266086387156e-06, "loss": 0.81169003, "num_input_tokens_seen": 82742960, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.546875, "step": 3845, "time_per_iteration": 2.4231648445129395 }, { "auxiliary_loss_clip": 0.01074266, "auxiliary_loss_mlp": 0.01056544, "balance_loss_clip": 1.01970863, "balance_loss_mlp": 1.02345395, "epoch": 0.23123402976101007, "flos": 23359268206080.0, "grad_norm": 2.107521785294402, "language_loss": 0.77725959, "learning_rate": 3.590029873031276e-06, "loss": 0.79856771, "num_input_tokens_seen": 82760205, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.5078125, "step": 3846, "time_per_iteration": 2.4166083335876465 }, { "auxiliary_loss_clip": 0.0107782, "auxiliary_loss_mlp": 0.01063569, "balance_loss_clip": 1.02544558, "balance_loss_mlp": 1.0240407, "epoch": 0.23129415301367803, "flos": 13734516528000.0, "grad_norm": 15.412235081732916, "language_loss": 0.72215766, "learning_rate": 3.589793599381304e-06, "loss": 0.74357158, "num_input_tokens_seen": 82778590, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.5390625, "step": 3847, "time_per_iteration": 2.422774314880371 }, { "auxiliary_loss_clip": 0.01020206, "auxiliary_loss_mlp": 0.01011304, "balance_loss_clip": 1.00698876, "balance_loss_mlp": 1.00599849, "epoch": 0.231354276266346, "flos": 69733699591680.0, "grad_norm": 0.7991716046350351, "language_loss": 0.61126554, "learning_rate": 3.589557265446198e-06, "loss": 0.63158065, "num_input_tokens_seen": 82833925, "router_z_loss_clip": 0.04321289, "router_z_loss_mlp": 0.14257812, "step": 3848, "time_per_iteration": 2.9412503242492676 }, { "auxiliary_loss_clip": 0.01075455, "auxiliary_loss_mlp": 0.01065942, "balance_loss_clip": 1.0276283, "balance_loss_mlp": 1.0217917, "epoch": 0.231414399519014, "flos": 18835118029440.0, "grad_norm": 1.962019724401641, "language_loss": 0.79821956, "learning_rate": 3.589320871234923e-06, "loss": 0.8196336, "num_input_tokens_seen": 82850625, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.5390625, "step": 3849, "time_per_iteration": 2.4136805534362793 }, { "auxiliary_loss_clip": 0.01077863, "auxiliary_loss_mlp": 0.01066217, "balance_loss_clip": 1.02864218, "balance_loss_mlp": 1.02325392, "epoch": 0.23147452277168196, "flos": 36134057836800.0, "grad_norm": 1.908384855656575, "language_loss": 0.734909, "learning_rate": 3.5890844167564405e-06, "loss": 0.7563498, "num_input_tokens_seen": 82872105, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.546875, "step": 3850, "time_per_iteration": 3.988124370574951 }, { "auxiliary_loss_clip": 0.0107442, "auxiliary_loss_mlp": 0.01057031, "balance_loss_clip": 1.0183599, "balance_loss_mlp": 1.02175093, "epoch": 0.23153464602434992, "flos": 20811900009600.0, "grad_norm": 2.1193110614384936, "language_loss": 0.78200793, "learning_rate": 3.588847902019718e-06, "loss": 0.80332243, "num_input_tokens_seen": 82890595, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.5234375, "step": 3851, "time_per_iteration": 2.4412426948547363 }, { "auxiliary_loss_clip": 0.01075785, "auxiliary_loss_mlp": 0.01059049, "balance_loss_clip": 1.02218997, "balance_loss_mlp": 1.02336788, "epoch": 0.2315947692770179, "flos": 19938198435840.0, "grad_norm": 2.7730733839084567, "language_loss": 0.71329618, "learning_rate": 3.588611327033723e-06, "loss": 0.73464447, "num_input_tokens_seen": 82908910, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.5234375, "step": 3852, "time_per_iteration": 2.4036614894866943 }, { "auxiliary_loss_clip": 0.01079925, "auxiliary_loss_mlp": 0.0105848, "balance_loss_clip": 1.02319384, "balance_loss_mlp": 1.02525377, "epoch": 0.23165489252968585, "flos": 12854845111680.0, "grad_norm": 2.0900406131465563, "language_loss": 0.69817513, "learning_rate": 3.588374691807428e-06, "loss": 0.71955913, "num_input_tokens_seen": 82925405, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.546875, "step": 3853, "time_per_iteration": 2.4215612411499023 }, { "auxiliary_loss_clip": 0.01077925, "auxiliary_loss_mlp": 0.01061294, "balance_loss_clip": 1.02188349, "balance_loss_mlp": 1.02374065, "epoch": 0.23171501578235382, "flos": 30626962842240.0, "grad_norm": 1.577177893863858, "language_loss": 0.81854284, "learning_rate": 3.5881379963498053e-06, "loss": 0.83993506, "num_input_tokens_seen": 82945615, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.54296875, "step": 3854, "time_per_iteration": 3.972559690475464 }, { "auxiliary_loss_clip": 0.01083504, "auxiliary_loss_mlp": 0.01067245, "balance_loss_clip": 1.02640402, "balance_loss_mlp": 1.02523434, "epoch": 0.23177513903502178, "flos": 23841627816960.0, "grad_norm": 1.9540903752090786, "language_loss": 0.68880332, "learning_rate": 3.587901240669831e-06, "loss": 0.71031082, "num_input_tokens_seen": 82967570, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.58203125, "step": 3855, "time_per_iteration": 3.8932440280914307 }, { "auxiliary_loss_clip": 0.01081233, "auxiliary_loss_mlp": 0.0106835, "balance_loss_clip": 1.02848661, "balance_loss_mlp": 1.02511775, "epoch": 0.23183526228768978, "flos": 29568989779200.0, "grad_norm": 2.2537957095707366, "language_loss": 0.73104459, "learning_rate": 3.5876644247764815e-06, "loss": 0.75254041, "num_input_tokens_seen": 82987435, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.5625, "step": 3856, "time_per_iteration": 2.478543996810913 }, { "auxiliary_loss_clip": 0.01078323, "auxiliary_loss_mlp": 0.01058525, "balance_loss_clip": 1.02405012, "balance_loss_mlp": 1.02492869, "epoch": 0.23189538554035774, "flos": 34457284103040.0, "grad_norm": 1.5371372799892635, "language_loss": 0.7830472, "learning_rate": 3.5874275486787387e-06, "loss": 0.8044157, "num_input_tokens_seen": 83010505, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.53515625, "step": 3857, "time_per_iteration": 2.578979253768921 }, { "auxiliary_loss_clip": 0.01084473, "auxiliary_loss_mlp": 0.01071975, "balance_loss_clip": 1.03170657, "balance_loss_mlp": 1.02664995, "epoch": 0.2319555087930257, "flos": 18002858106240.0, "grad_norm": 2.1201502130445844, "language_loss": 0.92967904, "learning_rate": 3.587190612385584e-06, "loss": 0.95124352, "num_input_tokens_seen": 83026705, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.578125, "step": 3858, "time_per_iteration": 2.371002674102783 }, { "auxiliary_loss_clip": 0.01076029, "auxiliary_loss_mlp": 0.01054053, "balance_loss_clip": 1.02138996, "balance_loss_mlp": 1.02453637, "epoch": 0.23201563204569367, "flos": 23142876399360.0, "grad_norm": 1.69314717698541, "language_loss": 0.7776745, "learning_rate": 3.5869536159060026e-06, "loss": 0.79897529, "num_input_tokens_seen": 83046500, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.515625, "step": 3859, "time_per_iteration": 2.4574496746063232 }, { "auxiliary_loss_clip": 0.01080219, "auxiliary_loss_mlp": 0.01054932, "balance_loss_clip": 1.017524, "balance_loss_mlp": 1.02576613, "epoch": 0.23207575529836164, "flos": 20666940577920.0, "grad_norm": 2.1788548230399183, "language_loss": 0.85462558, "learning_rate": 3.58671655924898e-06, "loss": 0.87597704, "num_input_tokens_seen": 83065280, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.54296875, "step": 3860, "time_per_iteration": 2.397991418838501 }, { "auxiliary_loss_clip": 0.01078931, "auxiliary_loss_mlp": 0.01065067, "balance_loss_clip": 1.02882791, "balance_loss_mlp": 1.02547121, "epoch": 0.2321358785510296, "flos": 16471253272320.0, "grad_norm": 2.2026270961483836, "language_loss": 0.84940714, "learning_rate": 3.586479442423508e-06, "loss": 0.87084711, "num_input_tokens_seen": 83082310, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.53515625, "step": 3861, "time_per_iteration": 2.3635354042053223 }, { "auxiliary_loss_clip": 0.01078382, "auxiliary_loss_mlp": 0.01058052, "balance_loss_clip": 1.01890373, "balance_loss_mlp": 1.02474999, "epoch": 0.2321960018036976, "flos": 21615251460480.0, "grad_norm": 1.487606543572715, "language_loss": 0.87306678, "learning_rate": 3.586242265438576e-06, "loss": 0.89443111, "num_input_tokens_seen": 83102065, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.53515625, "step": 3862, "time_per_iteration": 2.4222333431243896 }, { "auxiliary_loss_clip": 0.01076676, "auxiliary_loss_mlp": 0.01054823, "balance_loss_clip": 1.02237463, "balance_loss_mlp": 1.02482557, "epoch": 0.23225612505636556, "flos": 22270431634560.0, "grad_norm": 1.4158006110965622, "language_loss": 0.76120085, "learning_rate": 3.5860050283031773e-06, "loss": 0.78251582, "num_input_tokens_seen": 83121445, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.51953125, "step": 3863, "time_per_iteration": 2.39215350151062 }, { "auxiliary_loss_clip": 0.01075587, "auxiliary_loss_mlp": 0.01056609, "balance_loss_clip": 1.02184796, "balance_loss_mlp": 1.0244441, "epoch": 0.23231624830903352, "flos": 17051475024000.0, "grad_norm": 1.627684964442918, "language_loss": 0.75652093, "learning_rate": 3.58576773102631e-06, "loss": 0.77784288, "num_input_tokens_seen": 83138175, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.51171875, "step": 3864, "time_per_iteration": 2.3792147636413574 }, { "auxiliary_loss_clip": 0.01076871, "auxiliary_loss_mlp": 0.01054122, "balance_loss_clip": 1.0180254, "balance_loss_mlp": 1.02295578, "epoch": 0.2323763715617015, "flos": 34638657949440.0, "grad_norm": 1.7801298398750258, "language_loss": 0.71828538, "learning_rate": 3.5855303736169714e-06, "loss": 0.73959529, "num_input_tokens_seen": 83161975, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.5390625, "step": 3865, "time_per_iteration": 2.493830919265747 }, { "auxiliary_loss_clip": 0.01082239, "auxiliary_loss_mlp": 0.01070906, "balance_loss_clip": 1.02732337, "balance_loss_mlp": 1.0243386, "epoch": 0.23243649481436945, "flos": 25550661513600.0, "grad_norm": 1.7526053152444734, "language_loss": 0.96896851, "learning_rate": 3.5852929560841617e-06, "loss": 0.99049997, "num_input_tokens_seen": 83180905, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 0.578125, "step": 3866, "time_per_iteration": 2.4490180015563965 }, { "auxiliary_loss_clip": 0.01075136, "auxiliary_loss_mlp": 0.01060131, "balance_loss_clip": 1.02470183, "balance_loss_mlp": 1.02199411, "epoch": 0.23249661806703742, "flos": 20482494531840.0, "grad_norm": 2.3110849101851745, "language_loss": 0.75179374, "learning_rate": 3.5850554784368846e-06, "loss": 0.77314633, "num_input_tokens_seen": 83196390, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.53125, "step": 3867, "time_per_iteration": 2.3598415851593018 }, { "auxiliary_loss_clip": 0.01077354, "auxiliary_loss_mlp": 0.01059715, "balance_loss_clip": 1.02216387, "balance_loss_mlp": 1.02394557, "epoch": 0.23255674131970538, "flos": 20375555437440.0, "grad_norm": 1.673926165795074, "language_loss": 0.83805048, "learning_rate": 3.584817940684145e-06, "loss": 0.85942113, "num_input_tokens_seen": 83216165, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.53125, "step": 3868, "time_per_iteration": 2.413832902908325 }, { "auxiliary_loss_clip": 0.0107445, "auxiliary_loss_mlp": 0.01050032, "balance_loss_clip": 1.01560462, "balance_loss_mlp": 1.02368855, "epoch": 0.23261686457237338, "flos": 17055140716800.0, "grad_norm": 1.8419555056956922, "language_loss": 0.75278968, "learning_rate": 3.58458034283495e-06, "loss": 0.7740345, "num_input_tokens_seen": 83233845, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.5078125, "step": 3869, "time_per_iteration": 2.3546109199523926 }, { "auxiliary_loss_clip": 0.0107661, "auxiliary_loss_mlp": 0.01058667, "balance_loss_clip": 1.0198288, "balance_loss_mlp": 1.02419043, "epoch": 0.23267698782504134, "flos": 29168570862720.0, "grad_norm": 2.1177823780160794, "language_loss": 0.81323314, "learning_rate": 3.5843426848983097e-06, "loss": 0.83458591, "num_input_tokens_seen": 83254930, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.5234375, "step": 3870, "time_per_iteration": 2.4818832874298096 }, { "auxiliary_loss_clip": 0.01079202, "auxiliary_loss_mlp": 0.01057979, "balance_loss_clip": 1.01663709, "balance_loss_mlp": 1.02382565, "epoch": 0.2327371110777093, "flos": 21173705095680.0, "grad_norm": 1.9144938696687324, "language_loss": 0.72030073, "learning_rate": 3.5841049668832357e-06, "loss": 0.74167252, "num_input_tokens_seen": 83272095, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.5546875, "step": 3871, "time_per_iteration": 2.3687267303466797 }, { "auxiliary_loss_clip": 0.01078578, "auxiliary_loss_mlp": 0.01057375, "balance_loss_clip": 1.01748729, "balance_loss_mlp": 1.02328372, "epoch": 0.23279723433037727, "flos": 24861964567680.0, "grad_norm": 2.349815006854031, "language_loss": 0.70717758, "learning_rate": 3.5838671887987433e-06, "loss": 0.72853708, "num_input_tokens_seen": 83290980, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.5546875, "step": 3872, "time_per_iteration": 2.4941580295562744 }, { "auxiliary_loss_clip": 0.0108243, "auxiliary_loss_mlp": 0.01065535, "balance_loss_clip": 1.02340651, "balance_loss_mlp": 1.0251205, "epoch": 0.23285735758304524, "flos": 38799082915200.0, "grad_norm": 2.194373677702728, "language_loss": 0.79815519, "learning_rate": 3.5836293506538474e-06, "loss": 0.8196348, "num_input_tokens_seen": 83315175, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.57421875, "step": 3873, "time_per_iteration": 2.542754888534546 }, { "auxiliary_loss_clip": 0.01021409, "auxiliary_loss_mlp": 0.01007128, "balance_loss_clip": 1.0027169, "balance_loss_mlp": 1.00707817, "epoch": 0.2329174808357132, "flos": 53941086927360.0, "grad_norm": 0.8628338971461693, "language_loss": 0.60627913, "learning_rate": 3.5833914524575687e-06, "loss": 0.6265645, "num_input_tokens_seen": 83372060, "router_z_loss_clip": 0.04418945, "router_z_loss_mlp": 0.14257812, "step": 3874, "time_per_iteration": 2.9577412605285645 }, { "auxiliary_loss_clip": 0.01077826, "auxiliary_loss_mlp": 0.01059301, "balance_loss_clip": 1.01996207, "balance_loss_mlp": 1.02446008, "epoch": 0.23297760408838117, "flos": 21214937278080.0, "grad_norm": 2.2134337132195085, "language_loss": 0.83059251, "learning_rate": 3.583153494218927e-06, "loss": 0.85196382, "num_input_tokens_seen": 83389795, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.53125, "step": 3875, "time_per_iteration": 2.4001595973968506 }, { "auxiliary_loss_clip": 0.01078438, "auxiliary_loss_mlp": 0.01053373, "balance_loss_clip": 1.01887381, "balance_loss_mlp": 1.02576542, "epoch": 0.23303772734104916, "flos": 28401738560640.0, "grad_norm": 1.5933742876896877, "language_loss": 0.62685668, "learning_rate": 3.5829154759469464e-06, "loss": 0.64817482, "num_input_tokens_seen": 83410005, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.52734375, "step": 3876, "time_per_iteration": 2.457732677459717 }, { "auxiliary_loss_clip": 0.01084608, "auxiliary_loss_mlp": 0.0105608, "balance_loss_clip": 1.01845789, "balance_loss_mlp": 1.02851391, "epoch": 0.23309785059371713, "flos": 24313618753920.0, "grad_norm": 1.6830564169295352, "language_loss": 0.71796131, "learning_rate": 3.5826773976506523e-06, "loss": 0.7393682, "num_input_tokens_seen": 83430250, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5625, "step": 3877, "time_per_iteration": 2.4071102142333984 }, { "auxiliary_loss_clip": 0.01080447, "auxiliary_loss_mlp": 0.01065147, "balance_loss_clip": 1.02695262, "balance_loss_mlp": 1.02436316, "epoch": 0.2331579738463851, "flos": 15992140417920.0, "grad_norm": 2.1846063140342293, "language_loss": 0.8306247, "learning_rate": 3.582439259339073e-06, "loss": 0.85208064, "num_input_tokens_seen": 83447950, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.55859375, "step": 3878, "time_per_iteration": 2.3781116008758545 }, { "auxiliary_loss_clip": 0.01085143, "auxiliary_loss_mlp": 0.01060649, "balance_loss_clip": 1.02095199, "balance_loss_mlp": 1.02827954, "epoch": 0.23321809709905306, "flos": 36425547711360.0, "grad_norm": 2.041269393547575, "language_loss": 0.76607305, "learning_rate": 3.5822010610212374e-06, "loss": 0.78753096, "num_input_tokens_seen": 83467785, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.5703125, "step": 3879, "time_per_iteration": 2.5186405181884766 }, { "auxiliary_loss_clip": 0.01080217, "auxiliary_loss_mlp": 0.0105105, "balance_loss_clip": 1.01886356, "balance_loss_mlp": 1.02479959, "epoch": 0.23327822035172102, "flos": 21323691763200.0, "grad_norm": 2.3035415892809765, "language_loss": 0.91065085, "learning_rate": 3.5819628027061795e-06, "loss": 0.9319635, "num_input_tokens_seen": 83485390, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.5546875, "step": 3880, "time_per_iteration": 2.420231819152832 }, { "auxiliary_loss_clip": 0.01081128, "auxiliary_loss_mlp": 0.01053846, "balance_loss_clip": 1.01896501, "balance_loss_mlp": 1.02666008, "epoch": 0.233338343604389, "flos": 19170877374720.0, "grad_norm": 1.7346536572233047, "language_loss": 0.7331928, "learning_rate": 3.5817244844029334e-06, "loss": 0.75454247, "num_input_tokens_seen": 83504890, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.546875, "step": 3881, "time_per_iteration": 2.3863301277160645 }, { "auxiliary_loss_clip": 0.01077543, "auxiliary_loss_mlp": 0.01060656, "balance_loss_clip": 1.02482128, "balance_loss_mlp": 1.02559638, "epoch": 0.23339846685705698, "flos": 26907106723200.0, "grad_norm": 1.5510522823000543, "language_loss": 0.68820971, "learning_rate": 3.581486106120537e-06, "loss": 0.70959175, "num_input_tokens_seen": 83526475, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.51953125, "step": 3882, "time_per_iteration": 2.4737913608551025 }, { "auxiliary_loss_clip": 0.01079139, "auxiliary_loss_mlp": 0.01054259, "balance_loss_clip": 1.01885366, "balance_loss_mlp": 1.02509236, "epoch": 0.23345859010972494, "flos": 32341791824640.0, "grad_norm": 2.827983133658746, "language_loss": 0.78419244, "learning_rate": 3.5812476678680287e-06, "loss": 0.80552638, "num_input_tokens_seen": 83546620, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.5390625, "step": 3883, "time_per_iteration": 2.467496871948242 }, { "auxiliary_loss_clip": 0.01019394, "auxiliary_loss_mlp": 0.01019819, "balance_loss_clip": 1.01555181, "balance_loss_mlp": 1.00523627, "epoch": 0.2335187133623929, "flos": 58480633013760.0, "grad_norm": 0.7979469821507169, "language_loss": 0.59082848, "learning_rate": 3.58100916965445e-06, "loss": 0.6112206, "num_input_tokens_seen": 83616160, "router_z_loss_clip": 0.04272461, "router_z_loss_mlp": 0.14160156, "step": 3884, "time_per_iteration": 3.167285203933716 }, { "auxiliary_loss_clip": 0.01077011, "auxiliary_loss_mlp": 0.01059213, "balance_loss_clip": 1.02273464, "balance_loss_mlp": 1.02404857, "epoch": 0.23357883661506088, "flos": 24501067176960.0, "grad_norm": 1.6105170347528268, "language_loss": 0.81751716, "learning_rate": 3.5807706114888455e-06, "loss": 0.83887941, "num_input_tokens_seen": 83636795, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.53125, "step": 3885, "time_per_iteration": 2.431936025619507 }, { "auxiliary_loss_clip": 0.01076049, "auxiliary_loss_mlp": 0.01054912, "balance_loss_clip": 1.01891112, "balance_loss_mlp": 1.02228153, "epoch": 0.23363895986772884, "flos": 18947642941440.0, "grad_norm": 2.4431487082985295, "language_loss": 0.88447118, "learning_rate": 3.580531993380261e-06, "loss": 0.90578079, "num_input_tokens_seen": 83654050, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.5390625, "step": 3886, "time_per_iteration": 2.3971192836761475 }, { "auxiliary_loss_clip": 0.01079711, "auxiliary_loss_mlp": 0.01051411, "balance_loss_clip": 1.01462352, "balance_loss_mlp": 1.02490902, "epoch": 0.2336990831203968, "flos": 31685459575680.0, "grad_norm": 1.8075117581229074, "language_loss": 0.74260265, "learning_rate": 3.5802933153377445e-06, "loss": 0.76391387, "num_input_tokens_seen": 83673720, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.546875, "step": 3887, "time_per_iteration": 2.467482566833496 }, { "auxiliary_loss_clip": 0.01077955, "auxiliary_loss_mlp": 0.01051728, "balance_loss_clip": 1.01658547, "balance_loss_mlp": 1.02382064, "epoch": 0.23375920637306477, "flos": 27708503137920.0, "grad_norm": 1.9829235660399014, "language_loss": 0.85209596, "learning_rate": 3.5800545773703475e-06, "loss": 0.87339282, "num_input_tokens_seen": 83693470, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.5390625, "step": 3888, "time_per_iteration": 2.444537878036499 }, { "auxiliary_loss_clip": 0.01077123, "auxiliary_loss_mlp": 0.0105708, "balance_loss_clip": 1.01862347, "balance_loss_mlp": 1.0232029, "epoch": 0.23381932962573276, "flos": 17674674526080.0, "grad_norm": 2.1854554441469594, "language_loss": 0.88621247, "learning_rate": 3.5798157794871225e-06, "loss": 0.90755451, "num_input_tokens_seen": 83711620, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.5390625, "step": 3889, "time_per_iteration": 2.3650519847869873 }, { "auxiliary_loss_clip": 0.01081605, "auxiliary_loss_mlp": 0.01054908, "balance_loss_clip": 1.017452, "balance_loss_mlp": 1.02556682, "epoch": 0.23387945287840073, "flos": 14390010904320.0, "grad_norm": 2.4739840690192554, "language_loss": 0.77979279, "learning_rate": 3.579576921697125e-06, "loss": 0.80115789, "num_input_tokens_seen": 83727890, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5625, "step": 3890, "time_per_iteration": 3.8126108646392822 }, { "auxiliary_loss_clip": 0.01077608, "auxiliary_loss_mlp": 0.01056098, "balance_loss_clip": 1.01964426, "balance_loss_mlp": 1.02353132, "epoch": 0.2339395761310687, "flos": 46096244605440.0, "grad_norm": 1.8256540184503642, "language_loss": 0.74825859, "learning_rate": 3.579338004009412e-06, "loss": 0.76959562, "num_input_tokens_seen": 83749370, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.5390625, "step": 3891, "time_per_iteration": 2.61108136177063 }, { "auxiliary_loss_clip": 0.01074359, "auxiliary_loss_mlp": 0.01051772, "balance_loss_clip": 1.01875091, "balance_loss_mlp": 1.02368879, "epoch": 0.23399969938373666, "flos": 22380966599040.0, "grad_norm": 1.644076662523735, "language_loss": 0.84570456, "learning_rate": 3.5790990264330433e-06, "loss": 0.86696583, "num_input_tokens_seen": 83769560, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.5078125, "step": 3892, "time_per_iteration": 2.403780698776245 }, { "auxiliary_loss_clip": 0.01078595, "auxiliary_loss_mlp": 0.01060694, "balance_loss_clip": 1.02388263, "balance_loss_mlp": 1.02363729, "epoch": 0.23405982263640462, "flos": 43506841265280.0, "grad_norm": 1.883219731303419, "language_loss": 0.65962172, "learning_rate": 3.578859988977082e-06, "loss": 0.6810146, "num_input_tokens_seen": 83795635, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.55078125, "step": 3893, "time_per_iteration": 2.6260251998901367 }, { "auxiliary_loss_clip": 0.01077034, "auxiliary_loss_mlp": 0.01059422, "balance_loss_clip": 1.02058387, "balance_loss_mlp": 1.02384555, "epoch": 0.2341199458890726, "flos": 22563597254400.0, "grad_norm": 2.150981373372674, "language_loss": 0.8097434, "learning_rate": 3.5786208916505916e-06, "loss": 0.83110797, "num_input_tokens_seen": 83814090, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.53125, "step": 3894, "time_per_iteration": 3.900834798812866 }, { "auxiliary_loss_clip": 0.0107881, "auxiliary_loss_mlp": 0.01060512, "balance_loss_clip": 1.02234161, "balance_loss_mlp": 1.02504623, "epoch": 0.23418006914174055, "flos": 25632672030720.0, "grad_norm": 1.4283242960616884, "language_loss": 0.83029962, "learning_rate": 3.5783817344626383e-06, "loss": 0.8516928, "num_input_tokens_seen": 83836870, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.5390625, "step": 3895, "time_per_iteration": 3.853341579437256 }, { "auxiliary_loss_clip": 0.01079886, "auxiliary_loss_mlp": 0.01064895, "balance_loss_clip": 1.02593708, "balance_loss_mlp": 1.02480507, "epoch": 0.23424019239440855, "flos": 13545287625600.0, "grad_norm": 2.186795249919968, "language_loss": 0.82349551, "learning_rate": 3.578142517422292e-06, "loss": 0.84494328, "num_input_tokens_seen": 83853275, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.55078125, "step": 3896, "time_per_iteration": 2.3750054836273193 }, { "auxiliary_loss_clip": 0.01080671, "auxiliary_loss_mlp": 0.01068503, "balance_loss_clip": 1.02689874, "balance_loss_mlp": 1.02411413, "epoch": 0.2343003156470765, "flos": 22418393443200.0, "grad_norm": 1.9027400253851017, "language_loss": 0.84629679, "learning_rate": 3.577903240538623e-06, "loss": 0.86778849, "num_input_tokens_seen": 83872340, "router_z_loss_clip": 0.41601562, "router_z_loss_mlp": 0.56640625, "step": 3897, "time_per_iteration": 2.3953840732574463 }, { "auxiliary_loss_clip": 0.01079571, "auxiliary_loss_mlp": 0.01062446, "balance_loss_clip": 1.02258277, "balance_loss_mlp": 1.02370369, "epoch": 0.23436043889974448, "flos": 14790010884480.0, "grad_norm": 1.7954760734954813, "language_loss": 0.80590415, "learning_rate": 3.577663903820705e-06, "loss": 0.82732427, "num_input_tokens_seen": 83888795, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.55859375, "step": 3898, "time_per_iteration": 2.37689208984375 }, { "auxiliary_loss_clip": 0.01075648, "auxiliary_loss_mlp": 0.01059063, "balance_loss_clip": 1.02208471, "balance_loss_mlp": 1.02273881, "epoch": 0.23442056215241244, "flos": 22964609664000.0, "grad_norm": 2.092190475990345, "language_loss": 0.75740743, "learning_rate": 3.577424507277614e-06, "loss": 0.77875447, "num_input_tokens_seen": 83906820, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.53125, "step": 3899, "time_per_iteration": 2.390730142593384 }, { "auxiliary_loss_clip": 0.0108134, "auxiliary_loss_mlp": 0.01062585, "balance_loss_clip": 1.02062297, "balance_loss_mlp": 1.02439356, "epoch": 0.2344806854050804, "flos": 23070885442560.0, "grad_norm": 1.7475747249663987, "language_loss": 0.76987612, "learning_rate": 3.5771850509184277e-06, "loss": 0.79131544, "num_input_tokens_seen": 83926370, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.5703125, "step": 3900, "time_per_iteration": 2.4205384254455566 }, { "auxiliary_loss_clip": 0.01077797, "auxiliary_loss_mlp": 0.01065451, "balance_loss_clip": 1.02747142, "balance_loss_mlp": 1.02340817, "epoch": 0.23454080865774837, "flos": 16326119283840.0, "grad_norm": 1.8934772353221367, "language_loss": 0.68097222, "learning_rate": 3.5769455347522256e-06, "loss": 0.70240474, "num_input_tokens_seen": 83944600, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.54296875, "step": 3901, "time_per_iteration": 2.3614702224731445 }, { "auxiliary_loss_clip": 0.01023192, "auxiliary_loss_mlp": 0.01030165, "balance_loss_clip": 1.02532554, "balance_loss_mlp": 1.00854576, "epoch": 0.23460093191041637, "flos": 67757860218240.0, "grad_norm": 0.7849834948752606, "language_loss": 0.58328414, "learning_rate": 3.576705958788091e-06, "loss": 0.6038177, "num_input_tokens_seen": 84005100, "router_z_loss_clip": 0.04833984, "router_z_loss_mlp": 0.14648438, "step": 3902, "time_per_iteration": 2.9888994693756104 }, { "auxiliary_loss_clip": 0.01076614, "auxiliary_loss_mlp": 0.01058702, "balance_loss_clip": 1.02119851, "balance_loss_mlp": 1.02271461, "epoch": 0.23466105516308433, "flos": 20076769088640.0, "grad_norm": 1.9035244787634606, "language_loss": 0.82970989, "learning_rate": 3.576466323035108e-06, "loss": 0.85106307, "num_input_tokens_seen": 84023775, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5390625, "step": 3903, "time_per_iteration": 2.372032403945923 }, { "auxiliary_loss_clip": 0.01078053, "auxiliary_loss_mlp": 0.01059313, "balance_loss_clip": 1.01990211, "balance_loss_mlp": 1.02333045, "epoch": 0.2347211784157523, "flos": 24534549037440.0, "grad_norm": 1.9246864911882018, "language_loss": 0.83908719, "learning_rate": 3.5762266275023645e-06, "loss": 0.86046088, "num_input_tokens_seen": 84042605, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.546875, "step": 3904, "time_per_iteration": 2.4618828296661377 }, { "auxiliary_loss_clip": 0.01077943, "auxiliary_loss_mlp": 0.0106214, "balance_loss_clip": 1.02451754, "balance_loss_mlp": 1.02366877, "epoch": 0.23478130166842026, "flos": 23803921681920.0, "grad_norm": 2.1011057547738314, "language_loss": 0.73126912, "learning_rate": 3.57598687219895e-06, "loss": 0.75266993, "num_input_tokens_seen": 84061520, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.54296875, "step": 3905, "time_per_iteration": 2.401921510696411 }, { "auxiliary_loss_clip": 0.0107669, "auxiliary_loss_mlp": 0.01054752, "balance_loss_clip": 1.01693845, "balance_loss_mlp": 1.02413869, "epoch": 0.23484142492108823, "flos": 24092583736320.0, "grad_norm": 1.7092114234134448, "language_loss": 0.72172213, "learning_rate": 3.5757470571339543e-06, "loss": 0.74303657, "num_input_tokens_seen": 84081800, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5234375, "step": 3906, "time_per_iteration": 2.4239165782928467 }, { "auxiliary_loss_clip": 0.0108305, "auxiliary_loss_mlp": 0.01067421, "balance_loss_clip": 1.02367139, "balance_loss_mlp": 1.02458882, "epoch": 0.2349015481737562, "flos": 29094555047040.0, "grad_norm": 2.1408830986576164, "language_loss": 0.75929606, "learning_rate": 3.575507182316473e-06, "loss": 0.78080076, "num_input_tokens_seen": 84102340, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.5859375, "step": 3907, "time_per_iteration": 2.4485673904418945 }, { "auxiliary_loss_clip": 0.01081038, "auxiliary_loss_mlp": 0.0107316, "balance_loss_clip": 1.03281951, "balance_loss_mlp": 1.02531314, "epoch": 0.23496167142642416, "flos": 18915313155840.0, "grad_norm": 1.8173964837277947, "language_loss": 0.75041509, "learning_rate": 3.575267247755601e-06, "loss": 0.77195716, "num_input_tokens_seen": 84120370, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.5546875, "step": 3908, "time_per_iteration": 2.3869290351867676 }, { "auxiliary_loss_clip": 0.01020545, "auxiliary_loss_mlp": 0.01024796, "balance_loss_clip": 1.02048075, "balance_loss_mlp": 1.00620461, "epoch": 0.23502179467909215, "flos": 55865255621760.0, "grad_norm": 1.0492259729787043, "language_loss": 0.73460305, "learning_rate": 3.5750272534604367e-06, "loss": 0.7550565, "num_input_tokens_seen": 84165515, "router_z_loss_clip": 0.04321289, "router_z_loss_mlp": 0.14355469, "step": 3909, "time_per_iteration": 2.7609524726867676 }, { "auxiliary_loss_clip": 0.01079723, "auxiliary_loss_mlp": 0.01059548, "balance_loss_clip": 1.02333236, "balance_loss_mlp": 1.02489233, "epoch": 0.23508191793176011, "flos": 23400709856640.0, "grad_norm": 1.5904369967128753, "language_loss": 0.89161479, "learning_rate": 3.5747871994400822e-06, "loss": 0.91300756, "num_input_tokens_seen": 84184540, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.546875, "step": 3910, "time_per_iteration": 2.405442714691162 }, { "auxiliary_loss_clip": 0.01082675, "auxiliary_loss_mlp": 0.01058027, "balance_loss_clip": 1.01911759, "balance_loss_mlp": 1.0265888, "epoch": 0.23514204118442808, "flos": 20046638718720.0, "grad_norm": 2.2365501359952353, "language_loss": 0.78027737, "learning_rate": 3.5745470857036386e-06, "loss": 0.80168432, "num_input_tokens_seen": 84202025, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.5625, "step": 3911, "time_per_iteration": 2.3970396518707275 }, { "auxiliary_loss_clip": 0.010775, "auxiliary_loss_mlp": 0.01062506, "balance_loss_clip": 1.02760148, "balance_loss_mlp": 1.02599573, "epoch": 0.23520216443709605, "flos": 21579500361600.0, "grad_norm": 1.6613118140115142, "language_loss": 0.82257295, "learning_rate": 3.5743069122602122e-06, "loss": 0.84397304, "num_input_tokens_seen": 84221895, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.515625, "step": 3912, "time_per_iteration": 2.4070284366607666 }, { "auxiliary_loss_clip": 0.01076327, "auxiliary_loss_mlp": 0.01054918, "balance_loss_clip": 1.01977539, "balance_loss_mlp": 1.02469397, "epoch": 0.235262287689764, "flos": 23184667163520.0, "grad_norm": 1.950961030018402, "language_loss": 0.73088837, "learning_rate": 3.574066679118909e-06, "loss": 0.75220084, "num_input_tokens_seen": 84240455, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.515625, "step": 3913, "time_per_iteration": 2.4060187339782715 }, { "auxiliary_loss_clip": 0.01082864, "auxiliary_loss_mlp": 0.01057797, "balance_loss_clip": 1.01666963, "balance_loss_mlp": 1.02515292, "epoch": 0.23532241094243198, "flos": 23184108581760.0, "grad_norm": 2.519771638224627, "language_loss": 0.77479368, "learning_rate": 3.57382638628884e-06, "loss": 0.79620028, "num_input_tokens_seen": 84261605, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.578125, "step": 3914, "time_per_iteration": 2.419952869415283 }, { "auxiliary_loss_clip": 0.01080778, "auxiliary_loss_mlp": 0.01061066, "balance_loss_clip": 1.02196503, "balance_loss_mlp": 1.02583456, "epoch": 0.23538253419509997, "flos": 17018377188480.0, "grad_norm": 3.1562011585268834, "language_loss": 0.91888511, "learning_rate": 3.5735860337791174e-06, "loss": 0.94030356, "num_input_tokens_seen": 84278675, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.546875, "step": 3915, "time_per_iteration": 2.3770058155059814 }, { "auxiliary_loss_clip": 0.01016867, "auxiliary_loss_mlp": 0.01010157, "balance_loss_clip": 1.00631893, "balance_loss_mlp": 1.0036037, "epoch": 0.23544265744776793, "flos": 63445807751040.0, "grad_norm": 0.8141721448585336, "language_loss": 0.59362686, "learning_rate": 3.573345621598854e-06, "loss": 0.61389709, "num_input_tokens_seen": 84329765, "router_z_loss_clip": 0.03833008, "router_z_loss_mlp": 0.1328125, "step": 3916, "time_per_iteration": 2.9524471759796143 }, { "auxiliary_loss_clip": 0.01017187, "auxiliary_loss_mlp": 0.0100862, "balance_loss_clip": 1.00423265, "balance_loss_mlp": 1.00402701, "epoch": 0.2355027807004359, "flos": 70511668617600.0, "grad_norm": 0.7612913765875053, "language_loss": 0.49457031, "learning_rate": 3.5731051497571675e-06, "loss": 0.51482838, "num_input_tokens_seen": 84393680, "router_z_loss_clip": 0.04394531, "router_z_loss_mlp": 0.13183594, "step": 3917, "time_per_iteration": 3.0565991401672363 }, { "auxiliary_loss_clip": 0.01084215, "auxiliary_loss_mlp": 0.0107448, "balance_loss_clip": 1.03537929, "balance_loss_mlp": 1.02581286, "epoch": 0.23556290395310386, "flos": 21433214298240.0, "grad_norm": 2.031603758792656, "language_loss": 0.78153861, "learning_rate": 3.5728646182631756e-06, "loss": 0.80312556, "num_input_tokens_seen": 84412640, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.5859375, "step": 3918, "time_per_iteration": 2.4157657623291016 }, { "auxiliary_loss_clip": 0.0108235, "auxiliary_loss_mlp": 0.01073531, "balance_loss_clip": 1.03762507, "balance_loss_mlp": 1.02570891, "epoch": 0.23562302720577183, "flos": 18185453850240.0, "grad_norm": 2.1384653607966895, "language_loss": 0.71288717, "learning_rate": 3.5726240271259995e-06, "loss": 0.73444599, "num_input_tokens_seen": 84431605, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.56640625, "step": 3919, "time_per_iteration": 2.3611302375793457 }, { "auxiliary_loss_clip": 0.01074906, "auxiliary_loss_mlp": 0.01063324, "balance_loss_clip": 1.02870536, "balance_loss_mlp": 1.02353549, "epoch": 0.2356831504584398, "flos": 33729065631360.0, "grad_norm": 1.6832214196775594, "language_loss": 0.71265924, "learning_rate": 3.5723833763547634e-06, "loss": 0.73404151, "num_input_tokens_seen": 84454210, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.515625, "step": 3920, "time_per_iteration": 2.517735004425049 }, { "auxiliary_loss_clip": 0.01076712, "auxiliary_loss_mlp": 0.01069378, "balance_loss_clip": 1.03340054, "balance_loss_mlp": 1.02455449, "epoch": 0.23574327371110776, "flos": 24931721197440.0, "grad_norm": 1.6399914833714748, "language_loss": 0.78651619, "learning_rate": 3.5721426659585916e-06, "loss": 0.80797708, "num_input_tokens_seen": 84475540, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.5234375, "step": 3921, "time_per_iteration": 2.420454502105713 }, { "auxiliary_loss_clip": 0.01078331, "auxiliary_loss_mlp": 0.01062016, "balance_loss_clip": 1.02503717, "balance_loss_mlp": 1.02490425, "epoch": 0.23580339696377575, "flos": 17821135146240.0, "grad_norm": 2.083858806669315, "language_loss": 0.76961392, "learning_rate": 3.571901895946612e-06, "loss": 0.79101735, "num_input_tokens_seen": 84494580, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.53515625, "step": 3922, "time_per_iteration": 2.4158968925476074 }, { "auxiliary_loss_clip": 0.01077404, "auxiliary_loss_mlp": 0.01060275, "balance_loss_clip": 1.02203238, "balance_loss_mlp": 1.02326322, "epoch": 0.23586352021644372, "flos": 26285408409600.0, "grad_norm": 2.3672797348494123, "language_loss": 0.81485039, "learning_rate": 3.571661066327956e-06, "loss": 0.83622718, "num_input_tokens_seen": 84513850, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.54296875, "step": 3923, "time_per_iteration": 2.439154863357544 }, { "auxiliary_loss_clip": 0.01077481, "auxiliary_loss_mlp": 0.01066302, "balance_loss_clip": 1.02736783, "balance_loss_mlp": 1.02342272, "epoch": 0.23592364346911168, "flos": 14245819522560.0, "grad_norm": 1.6869276354047356, "language_loss": 0.75625324, "learning_rate": 3.571420177111754e-06, "loss": 0.77769113, "num_input_tokens_seen": 84532315, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.54296875, "step": 3924, "time_per_iteration": 2.383049726486206 }, { "auxiliary_loss_clip": 0.01077398, "auxiliary_loss_mlp": 0.01060413, "balance_loss_clip": 1.02202809, "balance_loss_mlp": 1.02372408, "epoch": 0.23598376672177965, "flos": 18586955018880.0, "grad_norm": 2.3558105543786265, "language_loss": 0.83884943, "learning_rate": 3.5711792283071416e-06, "loss": 0.86022747, "num_input_tokens_seen": 84550970, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.5390625, "step": 3925, "time_per_iteration": 2.380774736404419 }, { "auxiliary_loss_clip": 0.0108122, "auxiliary_loss_mlp": 0.01063989, "balance_loss_clip": 1.02357745, "balance_loss_mlp": 1.02457261, "epoch": 0.2360438899744476, "flos": 22674411509760.0, "grad_norm": 2.0424717848448544, "language_loss": 0.61381578, "learning_rate": 3.5709382199232564e-06, "loss": 0.63526785, "num_input_tokens_seen": 84571655, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.56640625, "step": 3926, "time_per_iteration": 2.4361178874969482 }, { "auxiliary_loss_clip": 0.01072223, "auxiliary_loss_mlp": 0.0105259, "balance_loss_clip": 1.01818657, "balance_loss_mlp": 1.02197289, "epoch": 0.23610401322711558, "flos": 29568850133760.0, "grad_norm": 2.018462948724551, "language_loss": 0.74100161, "learning_rate": 3.570697151969235e-06, "loss": 0.76224977, "num_input_tokens_seen": 84593130, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.50390625, "step": 3927, "time_per_iteration": 2.4562835693359375 }, { "auxiliary_loss_clip": 0.0107568, "auxiliary_loss_mlp": 0.01054897, "balance_loss_clip": 1.02075505, "balance_loss_mlp": 1.0219357, "epoch": 0.23616413647978354, "flos": 17857549560960.0, "grad_norm": 2.0727158242587644, "language_loss": 0.76951468, "learning_rate": 3.570456024454221e-06, "loss": 0.79082048, "num_input_tokens_seen": 84612410, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.5390625, "step": 3928, "time_per_iteration": 2.402432918548584 }, { "auxiliary_loss_clip": 0.01078899, "auxiliary_loss_mlp": 0.01059278, "balance_loss_clip": 1.02184606, "balance_loss_mlp": 1.02482915, "epoch": 0.23622425973245154, "flos": 11034089464320.0, "grad_norm": 2.5642000241660816, "language_loss": 0.8430776, "learning_rate": 3.5702148373873576e-06, "loss": 0.8644594, "num_input_tokens_seen": 84627610, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5390625, "step": 3929, "time_per_iteration": 3.7743000984191895 }, { "auxiliary_loss_clip": 0.01085194, "auxiliary_loss_mlp": 0.01060572, "balance_loss_clip": 1.01613092, "balance_loss_mlp": 1.02695155, "epoch": 0.2362843829851195, "flos": 23402944183680.0, "grad_norm": 1.8025649725781583, "language_loss": 0.72718388, "learning_rate": 3.569973590777789e-06, "loss": 0.74864149, "num_input_tokens_seen": 84648415, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.58203125, "step": 3930, "time_per_iteration": 2.427443742752075 }, { "auxiliary_loss_clip": 0.01078782, "auxiliary_loss_mlp": 0.01052796, "balance_loss_clip": 1.01379025, "balance_loss_mlp": 1.02416706, "epoch": 0.23634450623778747, "flos": 39528313816320.0, "grad_norm": 1.7998679800413144, "language_loss": 0.7563386, "learning_rate": 3.569732284634665e-06, "loss": 0.77765429, "num_input_tokens_seen": 84670080, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.546875, "step": 3931, "time_per_iteration": 2.5272216796875 }, { "auxiliary_loss_clip": 0.01081307, "auxiliary_loss_mlp": 0.0105527, "balance_loss_clip": 1.01600206, "balance_loss_mlp": 1.0264225, "epoch": 0.23640462949045543, "flos": 24206016343680.0, "grad_norm": 2.053289642228667, "language_loss": 0.81657898, "learning_rate": 3.569490918967136e-06, "loss": 0.83794475, "num_input_tokens_seen": 84686465, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.546875, "step": 3932, "time_per_iteration": 2.4401259422302246 }, { "auxiliary_loss_clip": 0.01078027, "auxiliary_loss_mlp": 0.01054876, "balance_loss_clip": 1.01959026, "balance_loss_mlp": 1.02547073, "epoch": 0.2364647527431234, "flos": 26176409544960.0, "grad_norm": 1.4462524065467668, "language_loss": 0.87698877, "learning_rate": 3.5692494937843537e-06, "loss": 0.89831781, "num_input_tokens_seen": 84708825, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.52734375, "step": 3933, "time_per_iteration": 3.8063340187072754 }, { "auxiliary_loss_clip": 0.0108405, "auxiliary_loss_mlp": 0.01056515, "balance_loss_clip": 1.01827276, "balance_loss_mlp": 1.02805293, "epoch": 0.23652487599579136, "flos": 22635937324800.0, "grad_norm": 1.923574500297908, "language_loss": 0.83753073, "learning_rate": 3.5690080090954727e-06, "loss": 0.85893637, "num_input_tokens_seen": 84726165, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.5625, "step": 3934, "time_per_iteration": 3.9151978492736816 }, { "auxiliary_loss_clip": 0.01080195, "auxiliary_loss_mlp": 0.01059742, "balance_loss_clip": 1.02045071, "balance_loss_mlp": 1.0257659, "epoch": 0.23658499924845935, "flos": 21761188410240.0, "grad_norm": 1.6584110059564192, "language_loss": 0.80188096, "learning_rate": 3.5687664649096515e-06, "loss": 0.82328033, "num_input_tokens_seen": 84745815, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.546875, "step": 3935, "time_per_iteration": 2.443239212036133 }, { "auxiliary_loss_clip": 0.01078792, "auxiliary_loss_mlp": 0.01058136, "balance_loss_clip": 1.02280235, "balance_loss_mlp": 1.02650952, "epoch": 0.23664512250112732, "flos": 21797917027200.0, "grad_norm": 1.8551049702115125, "language_loss": 0.81177258, "learning_rate": 3.5685248612360487e-06, "loss": 0.83314186, "num_input_tokens_seen": 84765415, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.5234375, "step": 3936, "time_per_iteration": 2.390111207962036 }, { "auxiliary_loss_clip": 0.01082331, "auxiliary_loss_mlp": 0.01057716, "balance_loss_clip": 1.02083254, "balance_loss_mlp": 1.02752805, "epoch": 0.23670524575379528, "flos": 22636775197440.0, "grad_norm": 1.4832195847087861, "language_loss": 0.80785537, "learning_rate": 3.568283198083826e-06, "loss": 0.82925588, "num_input_tokens_seen": 84787080, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.546875, "step": 3937, "time_per_iteration": 2.45430064201355 }, { "auxiliary_loss_clip": 0.01073655, "auxiliary_loss_mlp": 0.01053941, "balance_loss_clip": 1.02080083, "balance_loss_mlp": 1.02344048, "epoch": 0.23676536900646325, "flos": 16724129316480.0, "grad_norm": 2.2110801907466677, "language_loss": 0.87530708, "learning_rate": 3.568041475462147e-06, "loss": 0.89658308, "num_input_tokens_seen": 84805395, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.5, "step": 3938, "time_per_iteration": 2.3888449668884277 }, { "auxiliary_loss_clip": 0.01075703, "auxiliary_loss_mlp": 0.01064944, "balance_loss_clip": 1.02651072, "balance_loss_mlp": 1.02338862, "epoch": 0.23682549225913122, "flos": 11135093627520.0, "grad_norm": 2.8506664392006242, "language_loss": 0.95270014, "learning_rate": 3.5677996933801785e-06, "loss": 0.97410661, "num_input_tokens_seen": 84818090, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.5234375, "step": 3939, "time_per_iteration": 2.4036147594451904 }, { "auxiliary_loss_clip": 0.01078983, "auxiliary_loss_mlp": 0.01062495, "balance_loss_clip": 1.02203536, "balance_loss_mlp": 1.02385592, "epoch": 0.23688561551179918, "flos": 22558290727680.0, "grad_norm": 1.6208198112365364, "language_loss": 0.83595556, "learning_rate": 3.567557851847088e-06, "loss": 0.85737038, "num_input_tokens_seen": 84837695, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.55078125, "step": 3940, "time_per_iteration": 2.398594617843628 }, { "auxiliary_loss_clip": 0.01082765, "auxiliary_loss_mlp": 0.0106947, "balance_loss_clip": 1.02810502, "balance_loss_mlp": 1.02408004, "epoch": 0.23694573876446715, "flos": 18513916721280.0, "grad_norm": 2.1103817979441066, "language_loss": 0.91125536, "learning_rate": 3.5673159508720464e-06, "loss": 0.93277764, "num_input_tokens_seen": 84854630, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.5859375, "step": 3941, "time_per_iteration": 2.431144952774048 }, { "auxiliary_loss_clip": 0.01078575, "auxiliary_loss_mlp": 0.01067426, "balance_loss_clip": 1.02751493, "balance_loss_mlp": 1.02310205, "epoch": 0.23700586201713514, "flos": 15334970296320.0, "grad_norm": 2.124074872017382, "language_loss": 0.86314225, "learning_rate": 3.5670739904642274e-06, "loss": 0.88460231, "num_input_tokens_seen": 84871805, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.5546875, "step": 3942, "time_per_iteration": 2.383678913116455 }, { "auxiliary_loss_clip": 0.01077178, "auxiliary_loss_mlp": 0.01057203, "balance_loss_clip": 1.01836419, "balance_loss_mlp": 1.02251041, "epoch": 0.2370659852698031, "flos": 23946576963840.0, "grad_norm": 1.8545072067336272, "language_loss": 0.82265413, "learning_rate": 3.5668319706328065e-06, "loss": 0.84399796, "num_input_tokens_seen": 84889815, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.546875, "step": 3943, "time_per_iteration": 2.5040805339813232 }, { "auxiliary_loss_clip": 0.0107849, "auxiliary_loss_mlp": 0.01059226, "balance_loss_clip": 1.01886177, "balance_loss_mlp": 1.02220345, "epoch": 0.23712610852247107, "flos": 15331863185280.0, "grad_norm": 2.473068128632784, "language_loss": 0.69051814, "learning_rate": 3.566589891386959e-06, "loss": 0.71189523, "num_input_tokens_seen": 84904380, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.5625, "step": 3944, "time_per_iteration": 2.4462296962738037 }, { "auxiliary_loss_clip": 0.01077164, "auxiliary_loss_mlp": 0.01062209, "balance_loss_clip": 1.02451491, "balance_loss_mlp": 1.0221628, "epoch": 0.23718623177513903, "flos": 19681551964800.0, "grad_norm": 1.8208133984438517, "language_loss": 0.77391869, "learning_rate": 3.566347752735866e-06, "loss": 0.7953124, "num_input_tokens_seen": 84922935, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.55078125, "step": 3945, "time_per_iteration": 2.3933048248291016 }, { "auxiliary_loss_clip": 0.01078977, "auxiliary_loss_mlp": 0.01051762, "balance_loss_clip": 1.01747787, "balance_loss_mlp": 1.02521038, "epoch": 0.237246355027807, "flos": 24972150418560.0, "grad_norm": 1.7113427036696485, "language_loss": 0.65392619, "learning_rate": 3.5661055546887094e-06, "loss": 0.6752336, "num_input_tokens_seen": 84943685, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.53515625, "step": 3946, "time_per_iteration": 2.4553017616271973 }, { "auxiliary_loss_clip": 0.01075482, "auxiliary_loss_mlp": 0.01056593, "balance_loss_clip": 1.0171231, "balance_loss_mlp": 1.02182102, "epoch": 0.23730647828047496, "flos": 15376516680960.0, "grad_norm": 2.166216705863622, "language_loss": 0.78668833, "learning_rate": 3.5658632972546734e-06, "loss": 0.80800909, "num_input_tokens_seen": 84959505, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.53515625, "step": 3947, "time_per_iteration": 2.3876068592071533 }, { "auxiliary_loss_clip": 0.01079457, "auxiliary_loss_mlp": 0.01056132, "balance_loss_clip": 1.01963031, "balance_loss_mlp": 1.02537441, "epoch": 0.23736660153314296, "flos": 28149316364160.0, "grad_norm": 1.5108802865678734, "language_loss": 0.81493664, "learning_rate": 3.565620980442944e-06, "loss": 0.83629251, "num_input_tokens_seen": 84982130, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.5390625, "step": 3948, "time_per_iteration": 2.492887258529663 }, { "auxiliary_loss_clip": 0.01077972, "auxiliary_loss_mlp": 0.01057591, "balance_loss_clip": 1.02170861, "balance_loss_mlp": 1.02406836, "epoch": 0.23742672478581092, "flos": 22085601563520.0, "grad_norm": 1.713528398090911, "language_loss": 0.81397402, "learning_rate": 3.5653786042627107e-06, "loss": 0.83532965, "num_input_tokens_seen": 85000640, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.5390625, "step": 3949, "time_per_iteration": 2.4455695152282715 }, { "auxiliary_loss_clip": 0.01077005, "auxiliary_loss_mlp": 0.01054845, "balance_loss_clip": 1.0169605, "balance_loss_mlp": 1.02282453, "epoch": 0.2374868480384789, "flos": 19536068862720.0, "grad_norm": 1.669113887673585, "language_loss": 0.74316537, "learning_rate": 3.565136168723163e-06, "loss": 0.76448393, "num_input_tokens_seen": 85018970, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.54296875, "step": 3950, "time_per_iteration": 2.395921468734741 }, { "auxiliary_loss_clip": 0.01074335, "auxiliary_loss_mlp": 0.01045858, "balance_loss_clip": 1.01352823, "balance_loss_mlp": 1.02419209, "epoch": 0.23754697129114685, "flos": 19421623825920.0, "grad_norm": 1.84496879742927, "language_loss": 0.73657465, "learning_rate": 3.564893673833495e-06, "loss": 0.75777662, "num_input_tokens_seen": 85035905, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.5, "step": 3951, "time_per_iteration": 2.462151288986206 }, { "auxiliary_loss_clip": 0.01078471, "auxiliary_loss_mlp": 0.01059743, "balance_loss_clip": 1.02088106, "balance_loss_mlp": 1.02376544, "epoch": 0.23760709454381482, "flos": 19499968650240.0, "grad_norm": 1.7823565503616736, "language_loss": 0.75906181, "learning_rate": 3.564651119602903e-06, "loss": 0.78044403, "num_input_tokens_seen": 85054560, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.546875, "step": 3952, "time_per_iteration": 2.3894307613372803 }, { "auxiliary_loss_clip": 0.01078169, "auxiliary_loss_mlp": 0.01059294, "balance_loss_clip": 1.02596354, "balance_loss_mlp": 1.02391243, "epoch": 0.23766721779648278, "flos": 27635360106240.0, "grad_norm": 2.32734103973321, "language_loss": 0.71840394, "learning_rate": 3.564408506040583e-06, "loss": 0.73977852, "num_input_tokens_seen": 85074425, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.54296875, "step": 3953, "time_per_iteration": 2.4721179008483887 }, { "auxiliary_loss_clip": 0.0107791, "auxiliary_loss_mlp": 0.01058801, "balance_loss_clip": 1.02220368, "balance_loss_mlp": 1.02347124, "epoch": 0.23772734104915075, "flos": 23403223474560.0, "grad_norm": 1.925823514329314, "language_loss": 0.83508605, "learning_rate": 3.5641658331557356e-06, "loss": 0.85645318, "num_input_tokens_seen": 85092865, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.546875, "step": 3954, "time_per_iteration": 2.434901714324951 }, { "auxiliary_loss_clip": 0.01078501, "auxiliary_loss_mlp": 0.01055641, "balance_loss_clip": 1.01801836, "balance_loss_mlp": 1.02425802, "epoch": 0.23778746430181874, "flos": 15704595527040.0, "grad_norm": 2.426720640614203, "language_loss": 0.68861401, "learning_rate": 3.5639231009575634e-06, "loss": 0.70995545, "num_input_tokens_seen": 85110175, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.54296875, "step": 3955, "time_per_iteration": 2.3716843128204346 }, { "auxiliary_loss_clip": 0.0107431, "auxiliary_loss_mlp": 0.01053697, "balance_loss_clip": 1.01786304, "balance_loss_mlp": 1.02260876, "epoch": 0.2378475875544867, "flos": 19425464075520.0, "grad_norm": 1.5806167571069345, "language_loss": 0.84614837, "learning_rate": 3.5636803094552704e-06, "loss": 0.86742842, "num_input_tokens_seen": 85129925, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.51953125, "step": 3956, "time_per_iteration": 2.4694573879241943 }, { "auxiliary_loss_clip": 0.01072267, "auxiliary_loss_mlp": 0.01044452, "balance_loss_clip": 1.01212215, "balance_loss_mlp": 1.02222812, "epoch": 0.23790771080715467, "flos": 22267603814400.0, "grad_norm": 1.9615500310051626, "language_loss": 0.86739689, "learning_rate": 3.5634374586580635e-06, "loss": 0.88856405, "num_input_tokens_seen": 85147755, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.5, "step": 3957, "time_per_iteration": 2.4048397541046143 }, { "auxiliary_loss_clip": 0.01075318, "auxiliary_loss_mlp": 0.01057061, "balance_loss_clip": 1.02274024, "balance_loss_mlp": 1.02256763, "epoch": 0.23796783405982264, "flos": 20046289605120.0, "grad_norm": 2.1378156335997094, "language_loss": 0.71441412, "learning_rate": 3.563194548575151e-06, "loss": 0.73573792, "num_input_tokens_seen": 85165270, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.52734375, "step": 3958, "time_per_iteration": 2.4352898597717285 }, { "auxiliary_loss_clip": 0.0107549, "auxiliary_loss_mlp": 0.01059731, "balance_loss_clip": 1.02179825, "balance_loss_mlp": 1.02172995, "epoch": 0.2380279573124906, "flos": 14245086384000.0, "grad_norm": 2.379183965493073, "language_loss": 0.69287455, "learning_rate": 3.562951579215745e-06, "loss": 0.71422672, "num_input_tokens_seen": 85181555, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.53515625, "step": 3959, "time_per_iteration": 2.3870909214019775 }, { "auxiliary_loss_clip": 0.01074025, "auxiliary_loss_mlp": 0.01053887, "balance_loss_clip": 1.01948357, "balance_loss_mlp": 1.02135372, "epoch": 0.23808808056515857, "flos": 21178103927040.0, "grad_norm": 1.8965668801310605, "language_loss": 0.74054658, "learning_rate": 3.5627085505890586e-06, "loss": 0.76182568, "num_input_tokens_seen": 85199455, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.52734375, "step": 3960, "time_per_iteration": 2.431682825088501 }, { "auxiliary_loss_clip": 0.01076553, "auxiliary_loss_mlp": 0.01052164, "balance_loss_clip": 1.01458871, "balance_loss_mlp": 1.02304292, "epoch": 0.23814820381782653, "flos": 22527217751040.0, "grad_norm": 1.7296033976366532, "language_loss": 0.76493096, "learning_rate": 3.562465462704307e-06, "loss": 0.78621805, "num_input_tokens_seen": 85219170, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.53515625, "step": 3961, "time_per_iteration": 2.4458205699920654 }, { "auxiliary_loss_clip": 0.01075924, "auxiliary_loss_mlp": 0.01062578, "balance_loss_clip": 1.02121258, "balance_loss_mlp": 1.02124274, "epoch": 0.23820832707049452, "flos": 22303389824640.0, "grad_norm": 2.0898375366064554, "language_loss": 0.67958164, "learning_rate": 3.5622223155707085e-06, "loss": 0.70096672, "num_input_tokens_seen": 85238480, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.546875, "step": 3962, "time_per_iteration": 2.403914213180542 }, { "auxiliary_loss_clip": 0.0107492, "auxiliary_loss_mlp": 0.0105338, "balance_loss_clip": 1.01752186, "balance_loss_mlp": 1.02208567, "epoch": 0.2382684503231625, "flos": 24863046819840.0, "grad_norm": 1.820191376082861, "language_loss": 0.75947493, "learning_rate": 3.561979109197483e-06, "loss": 0.7807579, "num_input_tokens_seen": 85259180, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.53125, "step": 3963, "time_per_iteration": 2.459411382675171 }, { "auxiliary_loss_clip": 0.0107913, "auxiliary_loss_mlp": 0.01053349, "balance_loss_clip": 1.01682317, "balance_loss_mlp": 1.02497435, "epoch": 0.23832857357583045, "flos": 21870536388480.0, "grad_norm": 2.488666506472448, "language_loss": 0.79385018, "learning_rate": 3.5617358435938538e-06, "loss": 0.81517494, "num_input_tokens_seen": 85278550, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.5390625, "step": 3964, "time_per_iteration": 2.425105333328247 }, { "auxiliary_loss_clip": 0.01073998, "auxiliary_loss_mlp": 0.0105491, "balance_loss_clip": 1.02088797, "balance_loss_mlp": 1.02191877, "epoch": 0.23838869682849842, "flos": 21286998057600.0, "grad_norm": 2.5090335301211457, "language_loss": 0.73518348, "learning_rate": 3.561492518769045e-06, "loss": 0.75647259, "num_input_tokens_seen": 85297345, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.5234375, "step": 3965, "time_per_iteration": 2.453462839126587 }, { "auxiliary_loss_clip": 0.0107433, "auxiliary_loss_mlp": 0.01052358, "balance_loss_clip": 1.01869369, "balance_loss_mlp": 1.02297115, "epoch": 0.23844882008116638, "flos": 16179658663680.0, "grad_norm": 1.7527664008769912, "language_loss": 0.80005145, "learning_rate": 3.561249134732282e-06, "loss": 0.82131839, "num_input_tokens_seen": 85315105, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.51171875, "step": 3966, "time_per_iteration": 2.4368093013763428 }, { "auxiliary_loss_clip": 0.0107707, "auxiliary_loss_mlp": 0.01056382, "balance_loss_clip": 1.02257383, "balance_loss_mlp": 1.02455652, "epoch": 0.23850894333383435, "flos": 21068651214720.0, "grad_norm": 1.532383176698599, "language_loss": 0.70064682, "learning_rate": 3.561005691492797e-06, "loss": 0.72198129, "num_input_tokens_seen": 85334735, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.5234375, "step": 3967, "time_per_iteration": 2.3868916034698486 }, { "auxiliary_loss_clip": 0.01074362, "auxiliary_loss_mlp": 0.01061574, "balance_loss_clip": 1.02411819, "balance_loss_mlp": 1.02198434, "epoch": 0.23856906658650234, "flos": 17200658730240.0, "grad_norm": 2.0387623600773086, "language_loss": 0.70291322, "learning_rate": 3.5607621890598185e-06, "loss": 0.72427255, "num_input_tokens_seen": 85352875, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5234375, "step": 3968, "time_per_iteration": 2.4130916595458984 }, { "auxiliary_loss_clip": 0.01074555, "auxiliary_loss_mlp": 0.01059565, "balance_loss_clip": 1.02375472, "balance_loss_mlp": 1.02082109, "epoch": 0.2386291898391703, "flos": 29493018927360.0, "grad_norm": 2.300239420982784, "language_loss": 0.78684676, "learning_rate": 3.5605186274425823e-06, "loss": 0.80818802, "num_input_tokens_seen": 85372205, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.5390625, "step": 3969, "time_per_iteration": 3.903502941131592 }, { "auxiliary_loss_clip": 0.01071671, "auxiliary_loss_mlp": 0.01047663, "balance_loss_clip": 1.01561987, "balance_loss_mlp": 1.02214098, "epoch": 0.23868931309183827, "flos": 21141375310080.0, "grad_norm": 3.2777187657664433, "language_loss": 0.78482622, "learning_rate": 3.5602750066503225e-06, "loss": 0.80601954, "num_input_tokens_seen": 85389705, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.49414062, "step": 3970, "time_per_iteration": 2.417163848876953 }, { "auxiliary_loss_clip": 0.0107495, "auxiliary_loss_mlp": 0.01062242, "balance_loss_clip": 1.02507257, "balance_loss_mlp": 1.02118909, "epoch": 0.23874943634450624, "flos": 25658403569280.0, "grad_norm": 2.0040316741195485, "language_loss": 0.86180788, "learning_rate": 3.5600313266922793e-06, "loss": 0.88317978, "num_input_tokens_seen": 85407855, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.5390625, "step": 3971, "time_per_iteration": 2.4627456665039062 }, { "auxiliary_loss_clip": 0.01020481, "auxiliary_loss_mlp": 0.01020322, "balance_loss_clip": 1.0164361, "balance_loss_mlp": 1.00798035, "epoch": 0.2388095595971742, "flos": 58983243079680.0, "grad_norm": 0.7530235810556672, "language_loss": 0.62846982, "learning_rate": 3.5597875875776915e-06, "loss": 0.64887786, "num_input_tokens_seen": 85470885, "router_z_loss_clip": 0.03881836, "router_z_loss_mlp": 0.125, "step": 3972, "time_per_iteration": 3.1184005737304688 }, { "auxiliary_loss_clip": 0.01073938, "auxiliary_loss_mlp": 0.01049, "balance_loss_clip": 1.01426256, "balance_loss_mlp": 1.02158201, "epoch": 0.23886968284984217, "flos": 16799401941120.0, "grad_norm": 3.0657404849008376, "language_loss": 0.84538603, "learning_rate": 3.5595437893158013e-06, "loss": 0.86661541, "num_input_tokens_seen": 85488460, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.5234375, "step": 3973, "time_per_iteration": 3.9098262786865234 }, { "auxiliary_loss_clip": 0.01075796, "auxiliary_loss_mlp": 0.01059883, "balance_loss_clip": 1.02507389, "balance_loss_mlp": 1.02222466, "epoch": 0.23892980610251013, "flos": 22381560092160.0, "grad_norm": 1.6517086498178086, "language_loss": 0.80565089, "learning_rate": 3.5592999319158546e-06, "loss": 0.82700765, "num_input_tokens_seen": 85508590, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.53515625, "step": 3974, "time_per_iteration": 3.838167428970337 }, { "auxiliary_loss_clip": 0.0107534, "auxiliary_loss_mlp": 0.01057331, "balance_loss_clip": 1.01870751, "balance_loss_mlp": 1.02172232, "epoch": 0.23898992935517813, "flos": 12822375680640.0, "grad_norm": 1.838919502493363, "language_loss": 0.8665818, "learning_rate": 3.5590560153870984e-06, "loss": 0.88790858, "num_input_tokens_seen": 85525970, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.53515625, "step": 3975, "time_per_iteration": 2.3493382930755615 }, { "auxiliary_loss_clip": 0.01073377, "auxiliary_loss_mlp": 0.01054481, "balance_loss_clip": 1.0178597, "balance_loss_mlp": 1.02103019, "epoch": 0.2390500526078461, "flos": 22344587095680.0, "grad_norm": 2.176846875396924, "language_loss": 0.85229903, "learning_rate": 3.5588120397387816e-06, "loss": 0.87357754, "num_input_tokens_seen": 85543700, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.5234375, "step": 3976, "time_per_iteration": 2.426551103591919 }, { "auxiliary_loss_clip": 0.01072137, "auxiliary_loss_mlp": 0.01053747, "balance_loss_clip": 1.01588643, "balance_loss_mlp": 1.02019858, "epoch": 0.23911017586051406, "flos": 22634121934080.0, "grad_norm": 6.509190957931099, "language_loss": 0.75725096, "learning_rate": 3.5585680049801566e-06, "loss": 0.77850986, "num_input_tokens_seen": 85562765, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.51953125, "step": 3977, "time_per_iteration": 2.4058144092559814 }, { "auxiliary_loss_clip": 0.01076447, "auxiliary_loss_mlp": 0.01062771, "balance_loss_clip": 1.02696013, "balance_loss_mlp": 1.02269435, "epoch": 0.23917029911318202, "flos": 23652329091840.0, "grad_norm": 1.7344880242107745, "language_loss": 0.73472315, "learning_rate": 3.5583239111204764e-06, "loss": 0.75611532, "num_input_tokens_seen": 85581755, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.5390625, "step": 3978, "time_per_iteration": 2.453629732131958 }, { "auxiliary_loss_clip": 0.0107818, "auxiliary_loss_mlp": 0.01057078, "balance_loss_clip": 1.01747656, "balance_loss_mlp": 1.02360213, "epoch": 0.23923042236585, "flos": 22782502679040.0, "grad_norm": 2.8917501261807246, "language_loss": 0.80336416, "learning_rate": 3.558079758168997e-06, "loss": 0.82471675, "num_input_tokens_seen": 85599455, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.546875, "step": 3979, "time_per_iteration": 2.4246766567230225 }, { "auxiliary_loss_clip": 0.01074087, "auxiliary_loss_mlp": 0.01068471, "balance_loss_clip": 1.03039598, "balance_loss_mlp": 1.02157116, "epoch": 0.23929054561851795, "flos": 28146453632640.0, "grad_norm": 1.7209041263400275, "language_loss": 0.83129001, "learning_rate": 3.557835546134977e-06, "loss": 0.85271561, "num_input_tokens_seen": 85619970, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.5234375, "step": 3980, "time_per_iteration": 2.482715368270874 }, { "auxiliary_loss_clip": 0.01073134, "auxiliary_loss_mlp": 0.01055505, "balance_loss_clip": 1.01742935, "balance_loss_mlp": 1.02136803, "epoch": 0.23935066887118592, "flos": 21685531760640.0, "grad_norm": 1.8140468471541995, "language_loss": 0.84734297, "learning_rate": 3.5575912750276775e-06, "loss": 0.86862934, "num_input_tokens_seen": 85638850, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.515625, "step": 3981, "time_per_iteration": 2.4528872966766357 }, { "auxiliary_loss_clip": 0.01077341, "auxiliary_loss_mlp": 0.01058149, "balance_loss_clip": 1.02047849, "balance_loss_mlp": 1.02256787, "epoch": 0.2394107921238539, "flos": 32120966275200.0, "grad_norm": 2.5778757783985577, "language_loss": 0.77666903, "learning_rate": 3.5573469448563607e-06, "loss": 0.79802394, "num_input_tokens_seen": 85656285, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.546875, "step": 3982, "time_per_iteration": 2.5151655673980713 }, { "auxiliary_loss_clip": 0.01075816, "auxiliary_loss_mlp": 0.01057734, "balance_loss_clip": 1.02187538, "balance_loss_mlp": 1.02244854, "epoch": 0.23947091537652188, "flos": 17018237543040.0, "grad_norm": 3.242368148338704, "language_loss": 0.78947818, "learning_rate": 3.5571025556302915e-06, "loss": 0.81081372, "num_input_tokens_seen": 85673020, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.53515625, "step": 3983, "time_per_iteration": 2.4055511951446533 }, { "auxiliary_loss_clip": 0.0107449, "auxiliary_loss_mlp": 0.01059133, "balance_loss_clip": 1.01872063, "balance_loss_mlp": 1.02171326, "epoch": 0.23953103862918984, "flos": 20592575648640.0, "grad_norm": 1.8207216195305846, "language_loss": 0.74724627, "learning_rate": 3.556858107358737e-06, "loss": 0.76858246, "num_input_tokens_seen": 85692565, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.52734375, "step": 3984, "time_per_iteration": 2.467677116394043 }, { "auxiliary_loss_clip": 0.01078624, "auxiliary_loss_mlp": 0.01059093, "balance_loss_clip": 1.02092195, "balance_loss_mlp": 1.02335751, "epoch": 0.2395911618818578, "flos": 20703354992640.0, "grad_norm": 2.0409626954330964, "language_loss": 0.80851054, "learning_rate": 3.5566136000509674e-06, "loss": 0.82988769, "num_input_tokens_seen": 85709730, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.5546875, "step": 3985, "time_per_iteration": 2.40859055519104 }, { "auxiliary_loss_clip": 0.01076842, "auxiliary_loss_mlp": 0.01060171, "balance_loss_clip": 1.02195275, "balance_loss_mlp": 1.02363205, "epoch": 0.23965128513452577, "flos": 27052275623040.0, "grad_norm": 1.947394766236021, "language_loss": 0.75091326, "learning_rate": 3.556369033716254e-06, "loss": 0.77228338, "num_input_tokens_seen": 85730045, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.53125, "step": 3986, "time_per_iteration": 2.463433265686035 }, { "auxiliary_loss_clip": 0.01080054, "auxiliary_loss_mlp": 0.01062253, "balance_loss_clip": 1.02305675, "balance_loss_mlp": 1.0243268, "epoch": 0.23971140838719374, "flos": 23143330247040.0, "grad_norm": 1.955056923679609, "language_loss": 0.89247888, "learning_rate": 3.556124408363871e-06, "loss": 0.91390193, "num_input_tokens_seen": 85747590, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.55859375, "step": 3987, "time_per_iteration": 2.4156103134155273 }, { "auxiliary_loss_clip": 0.010722, "auxiliary_loss_mlp": 0.01050684, "balance_loss_clip": 1.01768708, "balance_loss_mlp": 1.02254272, "epoch": 0.23977153163986173, "flos": 18033756526080.0, "grad_norm": 2.3098642636749864, "language_loss": 0.85421145, "learning_rate": 3.5558797240030945e-06, "loss": 0.8754403, "num_input_tokens_seen": 85763460, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.49609375, "step": 3988, "time_per_iteration": 2.3934428691864014 }, { "auxiliary_loss_clip": 0.0107641, "auxiliary_loss_mlp": 0.01057907, "balance_loss_clip": 1.02003407, "balance_loss_mlp": 1.02280843, "epoch": 0.2398316548925297, "flos": 18112415552640.0, "grad_norm": 1.7469935438475088, "language_loss": 0.86585778, "learning_rate": 3.5556349806432035e-06, "loss": 0.88720095, "num_input_tokens_seen": 85782050, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.53515625, "step": 3989, "time_per_iteration": 2.424023151397705 }, { "auxiliary_loss_clip": 0.01074542, "auxiliary_loss_mlp": 0.01051432, "balance_loss_clip": 1.01569307, "balance_loss_mlp": 1.02212703, "epoch": 0.23989177814519766, "flos": 12566916195840.0, "grad_norm": 2.0359668325166655, "language_loss": 0.86223918, "learning_rate": 3.555390178293477e-06, "loss": 0.88349891, "num_input_tokens_seen": 85797400, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.5234375, "step": 3990, "time_per_iteration": 2.372601270675659 }, { "auxiliary_loss_clip": 0.01073526, "auxiliary_loss_mlp": 0.01055634, "balance_loss_clip": 1.02187443, "balance_loss_mlp": 1.02186215, "epoch": 0.23995190139786562, "flos": 25263430824960.0, "grad_norm": 1.5013292090190815, "language_loss": 0.77250898, "learning_rate": 3.5551453169631994e-06, "loss": 0.79380059, "num_input_tokens_seen": 85818995, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.515625, "step": 3991, "time_per_iteration": 2.4849021434783936 }, { "auxiliary_loss_clip": 0.01019178, "auxiliary_loss_mlp": 0.01011908, "balance_loss_clip": 1.00752103, "balance_loss_mlp": 1.00590038, "epoch": 0.2400120246505336, "flos": 61957425047040.0, "grad_norm": 0.8880633501765254, "language_loss": 0.63784277, "learning_rate": 3.554900396661656e-06, "loss": 0.65815365, "num_input_tokens_seen": 85876695, "router_z_loss_clip": 0.04394531, "router_z_loss_mlp": 0.1328125, "step": 3992, "time_per_iteration": 3.004757881164551 }, { "auxiliary_loss_clip": 0.01019162, "auxiliary_loss_mlp": 0.01009359, "balance_loss_clip": 1.00423312, "balance_loss_mlp": 1.00566673, "epoch": 0.24007214790320155, "flos": 66705333327360.0, "grad_norm": 0.7573507134863463, "language_loss": 0.62977213, "learning_rate": 3.5546554173981334e-06, "loss": 0.65005732, "num_input_tokens_seen": 85940990, "router_z_loss_clip": 0.05126953, "router_z_loss_mlp": 0.13476562, "step": 3993, "time_per_iteration": 3.1419780254364014 }, { "auxiliary_loss_clip": 0.01081069, "auxiliary_loss_mlp": 0.01059175, "balance_loss_clip": 1.02281642, "balance_loss_mlp": 1.02498007, "epoch": 0.24013227115586952, "flos": 25807971300480.0, "grad_norm": 1.847983505272801, "language_loss": 0.7845881, "learning_rate": 3.5544103791819218e-06, "loss": 0.80599058, "num_input_tokens_seen": 85961165, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.5625, "step": 3994, "time_per_iteration": 2.457596778869629 }, { "auxiliary_loss_clip": 0.01077279, "auxiliary_loss_mlp": 0.01065415, "balance_loss_clip": 1.02505112, "balance_loss_mlp": 1.02334154, "epoch": 0.2401923944085375, "flos": 25556282242560.0, "grad_norm": 2.077118922438508, "language_loss": 0.79705656, "learning_rate": 3.5541652820223124e-06, "loss": 0.81848347, "num_input_tokens_seen": 85982710, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.5390625, "step": 3995, "time_per_iteration": 2.4557464122772217 }, { "auxiliary_loss_clip": 0.01021171, "auxiliary_loss_mlp": 0.01007207, "balance_loss_clip": 1.0028199, "balance_loss_mlp": 1.00831294, "epoch": 0.24025251766120548, "flos": 54937751909760.0, "grad_norm": 0.9164335503466496, "language_loss": 0.63519537, "learning_rate": 3.5539201259286006e-06, "loss": 0.65547907, "num_input_tokens_seen": 86046935, "router_z_loss_clip": 0.04394531, "router_z_loss_mlp": 0.12890625, "step": 3996, "time_per_iteration": 3.11061692237854 }, { "auxiliary_loss_clip": 0.01080885, "auxiliary_loss_mlp": 0.01068858, "balance_loss_clip": 1.03063965, "balance_loss_mlp": 1.02498055, "epoch": 0.24031264091387344, "flos": 20630037404160.0, "grad_norm": 2.687051859787606, "language_loss": 0.71523559, "learning_rate": 3.5536749109100808e-06, "loss": 0.73673302, "num_input_tokens_seen": 86064355, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.55859375, "step": 3997, "time_per_iteration": 2.390627145767212 }, { "auxiliary_loss_clip": 0.0107919, "auxiliary_loss_mlp": 0.01063947, "balance_loss_clip": 1.02987719, "balance_loss_mlp": 1.02494764, "epoch": 0.2403727641665414, "flos": 20885217598080.0, "grad_norm": 1.831951281247983, "language_loss": 0.88359123, "learning_rate": 3.5534296369760535e-06, "loss": 0.90502262, "num_input_tokens_seen": 86081340, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.54296875, "step": 3998, "time_per_iteration": 2.396756410598755 }, { "auxiliary_loss_clip": 0.01079247, "auxiliary_loss_mlp": 0.01058418, "balance_loss_clip": 1.02067673, "balance_loss_mlp": 1.02250886, "epoch": 0.24043288741920937, "flos": 22818952005120.0, "grad_norm": 1.7233867684568847, "language_loss": 0.77454561, "learning_rate": 3.5531843041358183e-06, "loss": 0.79592222, "num_input_tokens_seen": 86102260, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.56640625, "step": 3999, "time_per_iteration": 2.3993148803710938 }, { "auxiliary_loss_clip": 0.01077512, "auxiliary_loss_mlp": 0.01069598, "balance_loss_clip": 1.03583789, "balance_loss_mlp": 1.0239985, "epoch": 0.24049301067187734, "flos": 27958551361920.0, "grad_norm": 2.154910441416475, "language_loss": 0.74891686, "learning_rate": 3.552938912398679e-06, "loss": 0.77038801, "num_input_tokens_seen": 86123400, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.53515625, "step": 4000, "time_per_iteration": 2.5048508644104004 }, { "auxiliary_loss_clip": 0.01079859, "auxiliary_loss_mlp": 0.01075256, "balance_loss_clip": 1.03672743, "balance_loss_mlp": 1.02325892, "epoch": 0.24055313392454533, "flos": 27450250744320.0, "grad_norm": 1.7317595373730905, "language_loss": 0.67870224, "learning_rate": 3.5526934617739397e-06, "loss": 0.70025337, "num_input_tokens_seen": 86144060, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.56640625, "step": 4001, "time_per_iteration": 2.4317779541015625 }, { "auxiliary_loss_clip": 0.010764, "auxiliary_loss_mlp": 0.01067419, "balance_loss_clip": 1.02970147, "balance_loss_mlp": 1.02202022, "epoch": 0.2406132571772133, "flos": 25555444369920.0, "grad_norm": 1.7614096401265154, "language_loss": 0.84261513, "learning_rate": 3.5524479522709095e-06, "loss": 0.86405337, "num_input_tokens_seen": 86163005, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.54296875, "step": 4002, "time_per_iteration": 2.493609666824341 }, { "auxiliary_loss_clip": 0.0107564, "auxiliary_loss_mlp": 0.01064927, "balance_loss_clip": 1.0281148, "balance_loss_mlp": 1.02226675, "epoch": 0.24067338042988126, "flos": 24790217990400.0, "grad_norm": 2.109105661742295, "language_loss": 0.84948802, "learning_rate": 3.552202383898897e-06, "loss": 0.87089366, "num_input_tokens_seen": 86182580, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.53515625, "step": 4003, "time_per_iteration": 2.4183051586151123 }, { "auxiliary_loss_clip": 0.01078542, "auxiliary_loss_mlp": 0.01066714, "balance_loss_clip": 1.0267787, "balance_loss_mlp": 1.02382565, "epoch": 0.24073350368254923, "flos": 21176882029440.0, "grad_norm": 2.137383665741016, "language_loss": 0.88617098, "learning_rate": 3.551956756667215e-06, "loss": 0.90762353, "num_input_tokens_seen": 86200665, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.546875, "step": 4004, "time_per_iteration": 2.4606592655181885 }, { "auxiliary_loss_clip": 0.01081638, "auxiliary_loss_mlp": 0.01067394, "balance_loss_clip": 1.02922344, "balance_loss_mlp": 1.0248735, "epoch": 0.2407936269352172, "flos": 22493142397440.0, "grad_norm": 2.0558894859289034, "language_loss": 0.79546916, "learning_rate": 3.551711070585177e-06, "loss": 0.8169595, "num_input_tokens_seen": 86221640, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.56640625, "step": 4005, "time_per_iteration": 2.4592299461364746 }, { "auxiliary_loss_clip": 0.01077054, "auxiliary_loss_mlp": 0.01053745, "balance_loss_clip": 1.01745737, "balance_loss_mlp": 1.02484846, "epoch": 0.24085375018788516, "flos": 18550156579200.0, "grad_norm": 1.840116936072851, "language_loss": 0.79304707, "learning_rate": 3.5514653256620995e-06, "loss": 0.81435502, "num_input_tokens_seen": 86240795, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.51953125, "step": 4006, "time_per_iteration": 2.4487011432647705 }, { "auxiliary_loss_clip": 0.01083178, "auxiliary_loss_mlp": 0.01071087, "balance_loss_clip": 1.02988863, "balance_loss_mlp": 1.02474582, "epoch": 0.24091387344055312, "flos": 24169392460800.0, "grad_norm": 1.7138987185899188, "language_loss": 0.72930193, "learning_rate": 3.551219521907302e-06, "loss": 0.7508446, "num_input_tokens_seen": 86262000, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 0.58203125, "step": 4007, "time_per_iteration": 2.4413487911224365 }, { "auxiliary_loss_clip": 0.01077413, "auxiliary_loss_mlp": 0.01051977, "balance_loss_clip": 1.01647663, "balance_loss_mlp": 1.02386427, "epoch": 0.24097399669322112, "flos": 11035520830080.0, "grad_norm": 1.8450255246582303, "language_loss": 0.77505124, "learning_rate": 3.5509736593301042e-06, "loss": 0.79634517, "num_input_tokens_seen": 86279680, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.5390625, "step": 4008, "time_per_iteration": 2.465357542037964 }, { "auxiliary_loss_clip": 0.0108049, "auxiliary_loss_mlp": 0.01054006, "balance_loss_clip": 1.01759934, "balance_loss_mlp": 1.02672732, "epoch": 0.24103411994588908, "flos": 17164139581440.0, "grad_norm": 3.0711405382426578, "language_loss": 0.76561511, "learning_rate": 3.5507277379398295e-06, "loss": 0.78696012, "num_input_tokens_seen": 86297180, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.5390625, "step": 4009, "time_per_iteration": 3.776369571685791 }, { "auxiliary_loss_clip": 0.01080384, "auxiliary_loss_mlp": 0.01054209, "balance_loss_clip": 1.0169208, "balance_loss_mlp": 1.02711654, "epoch": 0.24109424319855705, "flos": 20666905666560.0, "grad_norm": 1.9726961828837983, "language_loss": 0.81644273, "learning_rate": 3.550481757745804e-06, "loss": 0.8377887, "num_input_tokens_seen": 86317660, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.53125, "step": 4010, "time_per_iteration": 2.40700364112854 }, { "auxiliary_loss_clip": 0.01082261, "auxiliary_loss_mlp": 0.01068792, "balance_loss_clip": 1.02554309, "balance_loss_mlp": 1.02668428, "epoch": 0.241154366451225, "flos": 28180598808960.0, "grad_norm": 2.0270255832860293, "language_loss": 0.71875942, "learning_rate": 3.5502357187573555e-06, "loss": 0.7402699, "num_input_tokens_seen": 86338325, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 0.5546875, "step": 4011, "time_per_iteration": 2.444082021713257 }, { "auxiliary_loss_clip": 0.01081837, "auxiliary_loss_mlp": 0.01053045, "balance_loss_clip": 1.0184505, "balance_loss_mlp": 1.02667844, "epoch": 0.24121448970389298, "flos": 21688638871680.0, "grad_norm": 1.6089169072599032, "language_loss": 0.70895493, "learning_rate": 3.5499896209838118e-06, "loss": 0.7303037, "num_input_tokens_seen": 86357615, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.55078125, "step": 4012, "time_per_iteration": 3.9555392265319824 }, { "auxiliary_loss_clip": 0.01081763, "auxiliary_loss_mlp": 0.01057022, "balance_loss_clip": 1.02009082, "balance_loss_mlp": 1.02704, "epoch": 0.24127461295656094, "flos": 39674634791040.0, "grad_norm": 2.359561900469933, "language_loss": 0.75559062, "learning_rate": 3.5497434644345073e-06, "loss": 0.77697849, "num_input_tokens_seen": 86380355, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.546875, "step": 4013, "time_per_iteration": 5.497684955596924 }, { "auxiliary_loss_clip": 0.01081478, "auxiliary_loss_mlp": 0.0105799, "balance_loss_clip": 1.02372968, "balance_loss_mlp": 1.02782941, "epoch": 0.2413347362092289, "flos": 19134846984960.0, "grad_norm": 2.076783381083543, "language_loss": 0.89921892, "learning_rate": 3.5494972491187753e-06, "loss": 0.92061365, "num_input_tokens_seen": 86399125, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.5390625, "step": 4014, "time_per_iteration": 2.399038791656494 }, { "auxiliary_loss_clip": 0.01085017, "auxiliary_loss_mlp": 0.01060062, "balance_loss_clip": 1.02248681, "balance_loss_mlp": 1.02813566, "epoch": 0.2413948594618969, "flos": 26938319345280.0, "grad_norm": 2.0197882489369507, "language_loss": 0.96836126, "learning_rate": 3.549250975045952e-06, "loss": 0.98981202, "num_input_tokens_seen": 86418625, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5703125, "step": 4015, "time_per_iteration": 2.447695255279541 }, { "auxiliary_loss_clip": 0.0107916, "auxiliary_loss_mlp": 0.01057615, "balance_loss_clip": 1.02278161, "balance_loss_mlp": 1.02442491, "epoch": 0.24145498271456486, "flos": 25226946587520.0, "grad_norm": 1.7531597012689384, "language_loss": 0.84258318, "learning_rate": 3.5490046422253768e-06, "loss": 0.86395097, "num_input_tokens_seen": 86438375, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.546875, "step": 4016, "time_per_iteration": 2.444931983947754 }, { "auxiliary_loss_clip": 0.01077259, "auxiliary_loss_mlp": 0.01060783, "balance_loss_clip": 1.02642643, "balance_loss_mlp": 1.02584529, "epoch": 0.24151510596723283, "flos": 40660163049600.0, "grad_norm": 1.987612800760057, "language_loss": 0.70728099, "learning_rate": 3.54875825066639e-06, "loss": 0.72866142, "num_input_tokens_seen": 86463230, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.515625, "step": 4017, "time_per_iteration": 2.559148073196411 }, { "auxiliary_loss_clip": 0.01079824, "auxiliary_loss_mlp": 0.01068088, "balance_loss_clip": 1.03001308, "balance_loss_mlp": 1.02367389, "epoch": 0.2415752292199008, "flos": 18145792679040.0, "grad_norm": 1.779031024214066, "language_loss": 0.85864198, "learning_rate": 3.5485118003783353e-06, "loss": 0.88012111, "num_input_tokens_seen": 86481230, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.5625, "step": 4018, "time_per_iteration": 2.429260015487671 }, { "auxiliary_loss_clip": 0.01017905, "auxiliary_loss_mlp": 0.01019244, "balance_loss_clip": 1.01521432, "balance_loss_mlp": 1.00593841, "epoch": 0.24163535247256876, "flos": 67285275788160.0, "grad_norm": 0.83927061487824, "language_loss": 0.60754818, "learning_rate": 3.548265291370558e-06, "loss": 0.62791967, "num_input_tokens_seen": 86541260, "router_z_loss_clip": 0.0402832, "router_z_loss_mlp": 0.12011719, "step": 4019, "time_per_iteration": 3.0853142738342285 }, { "auxiliary_loss_clip": 0.01076133, "auxiliary_loss_mlp": 0.01056494, "balance_loss_clip": 1.02285337, "balance_loss_mlp": 1.02239799, "epoch": 0.24169547572523672, "flos": 24928963200000.0, "grad_norm": 2.24034955991038, "language_loss": 0.74860072, "learning_rate": 3.5480187236524055e-06, "loss": 0.76992702, "num_input_tokens_seen": 86559580, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.53515625, "step": 4020, "time_per_iteration": 2.4534990787506104 }, { "auxiliary_loss_clip": 0.01078018, "auxiliary_loss_mlp": 0.01055379, "balance_loss_clip": 1.02152395, "balance_loss_mlp": 1.02515209, "epoch": 0.24175559897790472, "flos": 18727480707840.0, "grad_norm": 2.1000586244176924, "language_loss": 0.83267361, "learning_rate": 3.5477720972332285e-06, "loss": 0.8540076, "num_input_tokens_seen": 86577560, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.52734375, "step": 4021, "time_per_iteration": 2.4020185470581055 }, { "auxiliary_loss_clip": 0.01079737, "auxiliary_loss_mlp": 0.01064526, "balance_loss_clip": 1.02509165, "balance_loss_mlp": 1.02375758, "epoch": 0.24181572223057268, "flos": 23038171632000.0, "grad_norm": 2.323561537512663, "language_loss": 0.78040266, "learning_rate": 3.547525412122378e-06, "loss": 0.80184525, "num_input_tokens_seen": 86595350, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.55859375, "step": 4022, "time_per_iteration": 2.470710515975952 }, { "auxiliary_loss_clip": 0.01080567, "auxiliary_loss_mlp": 0.01062884, "balance_loss_clip": 1.02430773, "balance_loss_mlp": 1.02369666, "epoch": 0.24187584548324065, "flos": 20375101589760.0, "grad_norm": 1.7775226075205641, "language_loss": 0.77044034, "learning_rate": 3.5472786683292083e-06, "loss": 0.79187489, "num_input_tokens_seen": 86614805, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.56640625, "step": 4023, "time_per_iteration": 2.391624689102173 }, { "auxiliary_loss_clip": 0.01079005, "auxiliary_loss_mlp": 0.0105863, "balance_loss_clip": 1.02208042, "balance_loss_mlp": 1.02612901, "epoch": 0.2419359687359086, "flos": 21396450769920.0, "grad_norm": 1.845963525267873, "language_loss": 0.84466577, "learning_rate": 3.5470318658630766e-06, "loss": 0.86604208, "num_input_tokens_seen": 86633700, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.52734375, "step": 4024, "time_per_iteration": 2.4090967178344727 }, { "auxiliary_loss_clip": 0.01076221, "auxiliary_loss_mlp": 0.01055553, "balance_loss_clip": 1.01936054, "balance_loss_mlp": 1.02317858, "epoch": 0.24199609198857658, "flos": 18368398707840.0, "grad_norm": 1.6746360328574112, "language_loss": 0.87281585, "learning_rate": 3.5467850047333424e-06, "loss": 0.89413351, "num_input_tokens_seen": 86650905, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.53125, "step": 4025, "time_per_iteration": 2.3693811893463135 }, { "auxiliary_loss_clip": 0.0107815, "auxiliary_loss_mlp": 0.01058617, "balance_loss_clip": 1.02368915, "balance_loss_mlp": 1.02392602, "epoch": 0.24205621524124454, "flos": 19462856008320.0, "grad_norm": 1.8871058157001273, "language_loss": 0.73747367, "learning_rate": 3.546538084949365e-06, "loss": 0.7588414, "num_input_tokens_seen": 86669185, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.54296875, "step": 4026, "time_per_iteration": 2.455371379852295 }, { "auxiliary_loss_clip": 0.01077656, "auxiliary_loss_mlp": 0.01055538, "balance_loss_clip": 1.02242172, "balance_loss_mlp": 1.02483821, "epoch": 0.2421163384939125, "flos": 14975434448640.0, "grad_norm": 1.8684419400818408, "language_loss": 0.66084582, "learning_rate": 3.546291106520509e-06, "loss": 0.68217778, "num_input_tokens_seen": 86686805, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.52734375, "step": 4027, "time_per_iteration": 2.3672025203704834 }, { "auxiliary_loss_clip": 0.01080366, "auxiliary_loss_mlp": 0.01055988, "balance_loss_clip": 1.01929498, "balance_loss_mlp": 1.02544177, "epoch": 0.2421764617465805, "flos": 18661040657280.0, "grad_norm": 2.150020879041981, "language_loss": 0.72415972, "learning_rate": 3.5460440694561388e-06, "loss": 0.74552321, "num_input_tokens_seen": 86705520, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.546875, "step": 4028, "time_per_iteration": 2.435603380203247 }, { "auxiliary_loss_clip": 0.01022265, "auxiliary_loss_mlp": 0.01007043, "balance_loss_clip": 1.00306118, "balance_loss_mlp": 1.00923181, "epoch": 0.24223658499924847, "flos": 64343877454080.0, "grad_norm": 0.9370061142320416, "language_loss": 0.55335116, "learning_rate": 3.545796973765623e-06, "loss": 0.57364428, "num_input_tokens_seen": 86767320, "router_z_loss_clip": 0.03979492, "router_z_loss_mlp": 0.13085938, "step": 4029, "time_per_iteration": 2.997204065322876 }, { "auxiliary_loss_clip": 0.01080181, "auxiliary_loss_mlp": 0.01059426, "balance_loss_clip": 1.0187757, "balance_loss_mlp": 1.02542043, "epoch": 0.24229670825191643, "flos": 25774070503680.0, "grad_norm": 2.1093250819254914, "language_loss": 0.75381267, "learning_rate": 3.54554981945833e-06, "loss": 0.77520871, "num_input_tokens_seen": 86788110, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.546875, "step": 4030, "time_per_iteration": 2.4757838249206543 }, { "auxiliary_loss_clip": 0.01078186, "auxiliary_loss_mlp": 0.01057806, "balance_loss_clip": 1.0222578, "balance_loss_mlp": 1.02560997, "epoch": 0.2423568315045844, "flos": 20666067793920.0, "grad_norm": 1.8835533094366022, "language_loss": 0.78362024, "learning_rate": 3.5453026065436343e-06, "loss": 0.80498016, "num_input_tokens_seen": 86807640, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.5234375, "step": 4031, "time_per_iteration": 2.415560722351074 }, { "auxiliary_loss_clip": 0.01082176, "auxiliary_loss_mlp": 0.01061099, "balance_loss_clip": 1.02204585, "balance_loss_mlp": 1.02561474, "epoch": 0.24241695475725236, "flos": 22415775091200.0, "grad_norm": 2.5950219368272496, "language_loss": 0.67922282, "learning_rate": 3.5450553350309083e-06, "loss": 0.70065558, "num_input_tokens_seen": 86826795, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.5625, "step": 4032, "time_per_iteration": 2.4404025077819824 }, { "auxiliary_loss_clip": 0.0107995, "auxiliary_loss_mlp": 0.01055751, "balance_loss_clip": 1.02041721, "balance_loss_mlp": 1.02571726, "epoch": 0.24247707800992033, "flos": 17128039368960.0, "grad_norm": 2.0965458164921387, "language_loss": 0.82925487, "learning_rate": 3.5448080049295286e-06, "loss": 0.85061193, "num_input_tokens_seen": 86843175, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.54296875, "step": 4033, "time_per_iteration": 2.366330146789551 }, { "auxiliary_loss_clip": 0.01077019, "auxiliary_loss_mlp": 0.01049303, "balance_loss_clip": 1.01530421, "balance_loss_mlp": 1.02631962, "epoch": 0.2425372012625883, "flos": 31612386366720.0, "grad_norm": 1.8414824625200181, "language_loss": 0.70649588, "learning_rate": 3.5445606162488754e-06, "loss": 0.72775906, "num_input_tokens_seen": 86863185, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.5078125, "step": 4034, "time_per_iteration": 2.512877941131592 }, { "auxiliary_loss_clip": 0.01079243, "auxiliary_loss_mlp": 0.01057042, "balance_loss_clip": 1.02120733, "balance_loss_mlp": 1.02503729, "epoch": 0.24259732451525629, "flos": 16325106854400.0, "grad_norm": 2.49452827183652, "language_loss": 0.97683889, "learning_rate": 3.5443131689983283e-06, "loss": 0.99820173, "num_input_tokens_seen": 86880040, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.54296875, "step": 4035, "time_per_iteration": 2.3536953926086426 }, { "auxiliary_loss_clip": 0.01073839, "auxiliary_loss_mlp": 0.01051327, "balance_loss_clip": 1.01943851, "balance_loss_mlp": 1.02462959, "epoch": 0.24265744776792425, "flos": 22855540976640.0, "grad_norm": 1.4985255712496286, "language_loss": 0.79332823, "learning_rate": 3.5440656631872715e-06, "loss": 0.81457984, "num_input_tokens_seen": 86900610, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.4921875, "step": 4036, "time_per_iteration": 2.446871519088745 }, { "auxiliary_loss_clip": 0.01078473, "auxiliary_loss_mlp": 0.01058909, "balance_loss_clip": 1.02111983, "balance_loss_mlp": 1.02519464, "epoch": 0.24271757102059222, "flos": 21870501477120.0, "grad_norm": 1.579150100521124, "language_loss": 0.75897521, "learning_rate": 3.5438180988250898e-06, "loss": 0.78034902, "num_input_tokens_seen": 86919385, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.53125, "step": 4037, "time_per_iteration": 2.3892405033111572 }, { "auxiliary_loss_clip": 0.01077016, "auxiliary_loss_mlp": 0.01060729, "balance_loss_clip": 1.02353573, "balance_loss_mlp": 1.02291155, "epoch": 0.24277769427326018, "flos": 19207571080320.0, "grad_norm": 2.7699569039793683, "language_loss": 0.78764749, "learning_rate": 3.543570475921171e-06, "loss": 0.80902499, "num_input_tokens_seen": 86938885, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.54296875, "step": 4038, "time_per_iteration": 2.4123494625091553 }, { "auxiliary_loss_clip": 0.01076612, "auxiliary_loss_mlp": 0.01061571, "balance_loss_clip": 1.0248065, "balance_loss_mlp": 1.02333963, "epoch": 0.24283781752592815, "flos": 19498886398080.0, "grad_norm": 1.9446176555507715, "language_loss": 0.73951364, "learning_rate": 3.543322794484905e-06, "loss": 0.76089549, "num_input_tokens_seen": 86957705, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.53125, "step": 4039, "time_per_iteration": 2.3720524311065674 }, { "auxiliary_loss_clip": 0.01077891, "auxiliary_loss_mlp": 0.0106295, "balance_loss_clip": 1.02811766, "balance_loss_mlp": 1.02352035, "epoch": 0.2428979407785961, "flos": 19901155616640.0, "grad_norm": 2.168588271105371, "language_loss": 0.79834509, "learning_rate": 3.5430750545256843e-06, "loss": 0.81975353, "num_input_tokens_seen": 86975845, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.546875, "step": 4040, "time_per_iteration": 2.441763401031494 }, { "auxiliary_loss_clip": 0.01071998, "auxiliary_loss_mlp": 0.01050446, "balance_loss_clip": 1.01756835, "balance_loss_mlp": 1.0214678, "epoch": 0.2429580640312641, "flos": 24714770808960.0, "grad_norm": 1.7118062181089024, "language_loss": 0.81460315, "learning_rate": 3.5428272560529027e-06, "loss": 0.83582759, "num_input_tokens_seen": 86994800, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.50390625, "step": 4041, "time_per_iteration": 2.416722059249878 }, { "auxiliary_loss_clip": 0.01077696, "auxiliary_loss_mlp": 0.01059254, "balance_loss_clip": 1.02258515, "balance_loss_mlp": 1.02502942, "epoch": 0.24301818728393207, "flos": 25629145983360.0, "grad_norm": 1.9311063059099964, "language_loss": 0.78507715, "learning_rate": 3.542579399075957e-06, "loss": 0.80644667, "num_input_tokens_seen": 87016845, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.5234375, "step": 4042, "time_per_iteration": 2.492285966873169 }, { "auxiliary_loss_clip": 0.01075065, "auxiliary_loss_mlp": 0.0104853, "balance_loss_clip": 1.01636708, "balance_loss_mlp": 1.02317119, "epoch": 0.24307831053660003, "flos": 26140169687040.0, "grad_norm": 2.034610150191903, "language_loss": 0.82251257, "learning_rate": 3.542331483604246e-06, "loss": 0.84374857, "num_input_tokens_seen": 87036270, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.51953125, "step": 4043, "time_per_iteration": 2.464989185333252 }, { "auxiliary_loss_clip": 0.01078247, "auxiliary_loss_mlp": 0.01056528, "balance_loss_clip": 1.02055049, "balance_loss_mlp": 1.02341557, "epoch": 0.243138433789268, "flos": 14971629110400.0, "grad_norm": 2.4000180548885983, "language_loss": 0.75167894, "learning_rate": 3.5420835096471706e-06, "loss": 0.7730267, "num_input_tokens_seen": 87049920, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.546875, "step": 4044, "time_per_iteration": 2.4083938598632812 }, { "auxiliary_loss_clip": 0.01077344, "auxiliary_loss_mlp": 0.01054739, "balance_loss_clip": 1.02057397, "balance_loss_mlp": 1.02513921, "epoch": 0.24319855704193596, "flos": 25190532172800.0, "grad_norm": 1.7637474235695692, "language_loss": 0.84580266, "learning_rate": 3.5418354772141337e-06, "loss": 0.86712348, "num_input_tokens_seen": 87068230, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.51953125, "step": 4045, "time_per_iteration": 2.503157377243042 }, { "auxiliary_loss_clip": 0.01078927, "auxiliary_loss_mlp": 0.0105616, "balance_loss_clip": 1.02106452, "balance_loss_mlp": 1.02618515, "epoch": 0.24325868029460393, "flos": 22126135518720.0, "grad_norm": 1.5663968267718156, "language_loss": 0.87577933, "learning_rate": 3.541587386314541e-06, "loss": 0.89713013, "num_input_tokens_seen": 87086435, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.52734375, "step": 4046, "time_per_iteration": 2.4831786155700684 }, { "auxiliary_loss_clip": 0.01074936, "auxiliary_loss_mlp": 0.01054695, "balance_loss_clip": 1.01898038, "balance_loss_mlp": 1.02477837, "epoch": 0.2433188035472719, "flos": 23581106184960.0, "grad_norm": 1.802450615123667, "language_loss": 0.74445379, "learning_rate": 3.5413392369578e-06, "loss": 0.76575005, "num_input_tokens_seen": 87105340, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.50390625, "step": 4047, "time_per_iteration": 2.4012045860290527 }, { "auxiliary_loss_clip": 0.01077826, "auxiliary_loss_mlp": 0.01054701, "balance_loss_clip": 1.01779377, "balance_loss_mlp": 1.0248189, "epoch": 0.2433789267999399, "flos": 24461650385280.0, "grad_norm": 3.85577516962487, "language_loss": 0.75266147, "learning_rate": 3.5410910291533213e-06, "loss": 0.7739867, "num_input_tokens_seen": 87125780, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.53125, "step": 4048, "time_per_iteration": 3.854748010635376 }, { "auxiliary_loss_clip": 0.01076663, "auxiliary_loss_mlp": 0.01050025, "balance_loss_clip": 1.01829147, "balance_loss_mlp": 1.02549636, "epoch": 0.24343905005260785, "flos": 16726957136640.0, "grad_norm": 1.898110146032987, "language_loss": 0.74694324, "learning_rate": 3.5408427629105155e-06, "loss": 0.76821017, "num_input_tokens_seen": 87144470, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.51171875, "step": 4049, "time_per_iteration": 2.3780853748321533 }, { "auxiliary_loss_clip": 0.01074201, "auxiliary_loss_mlp": 0.01046884, "balance_loss_clip": 1.01505554, "balance_loss_mlp": 1.02330399, "epoch": 0.24349917330527582, "flos": 20042833380480.0, "grad_norm": 1.6514405730114698, "language_loss": 0.75455326, "learning_rate": 3.5405944382387985e-06, "loss": 0.77576411, "num_input_tokens_seen": 87162830, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.5078125, "step": 4050, "time_per_iteration": 2.404536485671997 }, { "auxiliary_loss_clip": 0.01074891, "auxiliary_loss_mlp": 0.01049501, "balance_loss_clip": 1.0165987, "balance_loss_mlp": 1.0237174, "epoch": 0.24355929655794378, "flos": 17419599066240.0, "grad_norm": 2.837810372424419, "language_loss": 0.78151459, "learning_rate": 3.5403460551475854e-06, "loss": 0.80275846, "num_input_tokens_seen": 87180905, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.51171875, "step": 4051, "time_per_iteration": 2.360166072845459 }, { "auxiliary_loss_clip": 0.01076745, "auxiliary_loss_mlp": 0.01053023, "balance_loss_clip": 1.01814246, "balance_loss_mlp": 1.02445269, "epoch": 0.24361941981061175, "flos": 25409751799680.0, "grad_norm": 2.225092795586866, "language_loss": 0.71622491, "learning_rate": 3.540097613646296e-06, "loss": 0.73752266, "num_input_tokens_seen": 87202290, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.5234375, "step": 4052, "time_per_iteration": 5.361985206604004 }, { "auxiliary_loss_clip": 0.01079084, "auxiliary_loss_mlp": 0.0105512, "balance_loss_clip": 1.01892769, "balance_loss_mlp": 1.02556324, "epoch": 0.2436795430632797, "flos": 22819685143680.0, "grad_norm": 4.859315187168331, "language_loss": 0.82454562, "learning_rate": 3.539849113744351e-06, "loss": 0.84588766, "num_input_tokens_seen": 87221650, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.53515625, "step": 4053, "time_per_iteration": 3.8306734561920166 }, { "auxiliary_loss_clip": 0.01079796, "auxiliary_loss_mlp": 0.01056215, "balance_loss_clip": 1.01873565, "balance_loss_mlp": 1.02462983, "epoch": 0.2437396663159477, "flos": 15156913029120.0, "grad_norm": 1.6102297878755432, "language_loss": 0.78477401, "learning_rate": 3.539600555451172e-06, "loss": 0.8061341, "num_input_tokens_seen": 87238515, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.55078125, "step": 4054, "time_per_iteration": 2.3878581523895264 }, { "auxiliary_loss_clip": 0.01076121, "auxiliary_loss_mlp": 0.01060595, "balance_loss_clip": 1.02514243, "balance_loss_mlp": 1.02334356, "epoch": 0.24379978956861567, "flos": 22090035306240.0, "grad_norm": 1.5859927161887852, "language_loss": 0.85046244, "learning_rate": 3.5393519387761866e-06, "loss": 0.87182963, "num_input_tokens_seen": 87256290, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.52734375, "step": 4055, "time_per_iteration": 2.395505666732788 }, { "auxiliary_loss_clip": 0.01078852, "auxiliary_loss_mlp": 0.01055163, "balance_loss_clip": 1.01744509, "balance_loss_mlp": 1.02343678, "epoch": 0.24385991282128364, "flos": 31466414505600.0, "grad_norm": 4.145669128228798, "language_loss": 0.57510948, "learning_rate": 3.5391032637288217e-06, "loss": 0.59644961, "num_input_tokens_seen": 87277085, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5546875, "step": 4056, "time_per_iteration": 2.571068048477173 }, { "auxiliary_loss_clip": 0.01078051, "auxiliary_loss_mlp": 0.01064581, "balance_loss_clip": 1.02838922, "balance_loss_mlp": 1.0230974, "epoch": 0.2439200360739516, "flos": 23837752656000.0, "grad_norm": 2.4295222152856066, "language_loss": 0.8144123, "learning_rate": 3.538854530318506e-06, "loss": 0.83583862, "num_input_tokens_seen": 87293020, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.55078125, "step": 4057, "time_per_iteration": 2.3896138668060303 }, { "auxiliary_loss_clip": 0.01073432, "auxiliary_loss_mlp": 0.01058698, "balance_loss_clip": 1.02374601, "balance_loss_mlp": 1.02191103, "epoch": 0.24398015932661957, "flos": 19169027072640.0, "grad_norm": 1.8453101238365188, "language_loss": 0.80755377, "learning_rate": 3.538605738554673e-06, "loss": 0.82887506, "num_input_tokens_seen": 87311445, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.515625, "step": 4058, "time_per_iteration": 2.4148991107940674 }, { "auxiliary_loss_clip": 0.01077334, "auxiliary_loss_mlp": 0.01055419, "balance_loss_clip": 1.0191555, "balance_loss_mlp": 1.02241826, "epoch": 0.24404028257928753, "flos": 25261371054720.0, "grad_norm": 1.7857767011827075, "language_loss": 0.8669045, "learning_rate": 3.538356888446756e-06, "loss": 0.88823211, "num_input_tokens_seen": 87332055, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.55078125, "step": 4059, "time_per_iteration": 2.4296374320983887 }, { "auxiliary_loss_clip": 0.01073479, "auxiliary_loss_mlp": 0.01052023, "balance_loss_clip": 1.01964605, "balance_loss_mlp": 1.02305579, "epoch": 0.2441004058319555, "flos": 26466433142400.0, "grad_norm": 2.1923685084732787, "language_loss": 0.75336993, "learning_rate": 3.5381079800041913e-06, "loss": 0.77462494, "num_input_tokens_seen": 87351295, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.50390625, "step": 4060, "time_per_iteration": 2.4816532135009766 }, { "auxiliary_loss_clip": 0.01081117, "auxiliary_loss_mlp": 0.01062244, "balance_loss_clip": 1.02049661, "balance_loss_mlp": 1.0231719, "epoch": 0.2441605290846235, "flos": 26759319471360.0, "grad_norm": 2.126842377447476, "language_loss": 0.75674349, "learning_rate": 3.5378590132364182e-06, "loss": 0.77817714, "num_input_tokens_seen": 87370650, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.578125, "step": 4061, "time_per_iteration": 2.435314655303955 }, { "auxiliary_loss_clip": 0.01074519, "auxiliary_loss_mlp": 0.01050087, "balance_loss_clip": 1.01692331, "balance_loss_mlp": 1.02293003, "epoch": 0.24422065233729146, "flos": 21104786338560.0, "grad_norm": 1.8316670769531485, "language_loss": 0.77502209, "learning_rate": 3.5376099881528768e-06, "loss": 0.79626811, "num_input_tokens_seen": 87389020, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.515625, "step": 4062, "time_per_iteration": 2.4098353385925293 }, { "auxiliary_loss_clip": 0.01072214, "auxiliary_loss_mlp": 0.01054627, "balance_loss_clip": 1.02112937, "balance_loss_mlp": 1.02218962, "epoch": 0.24428077558995942, "flos": 25262034370560.0, "grad_norm": 1.814163256189854, "language_loss": 0.86056113, "learning_rate": 3.537360904763011e-06, "loss": 0.8818295, "num_input_tokens_seen": 87409695, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.5, "step": 4063, "time_per_iteration": 2.4213056564331055 }, { "auxiliary_loss_clip": 0.01080323, "auxiliary_loss_mlp": 0.0105313, "balance_loss_clip": 1.01665139, "balance_loss_mlp": 1.02515435, "epoch": 0.24434089884262739, "flos": 20484240099840.0, "grad_norm": 2.365248376969329, "language_loss": 0.71151829, "learning_rate": 3.5371117630762656e-06, "loss": 0.73285282, "num_input_tokens_seen": 87428250, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.5546875, "step": 4064, "time_per_iteration": 2.419502019882202 }, { "auxiliary_loss_clip": 0.01080727, "auxiliary_loss_mlp": 0.01056408, "balance_loss_clip": 1.01697361, "balance_loss_mlp": 1.02358961, "epoch": 0.24440102209529535, "flos": 23620802267520.0, "grad_norm": 1.8645364614035302, "language_loss": 0.71372253, "learning_rate": 3.536862563102088e-06, "loss": 0.73509383, "num_input_tokens_seen": 87449380, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.5703125, "step": 4065, "time_per_iteration": 2.4062345027923584 }, { "auxiliary_loss_clip": 0.01083289, "auxiliary_loss_mlp": 0.01055913, "balance_loss_clip": 1.01628804, "balance_loss_mlp": 1.02581334, "epoch": 0.24446114534796332, "flos": 20553787261440.0, "grad_norm": 1.901042979799193, "language_loss": 0.86078542, "learning_rate": 3.5366133048499282e-06, "loss": 0.88217747, "num_input_tokens_seen": 87465365, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.578125, "step": 4066, "time_per_iteration": 2.391258478164673 }, { "auxiliary_loss_clip": 0.01018473, "auxiliary_loss_mlp": 0.01017425, "balance_loss_clip": 1.01222765, "balance_loss_mlp": 1.00539827, "epoch": 0.24452126860063128, "flos": 60386717623680.0, "grad_norm": 0.7496188756551327, "language_loss": 0.52401447, "learning_rate": 3.5363639883292374e-06, "loss": 0.54437339, "num_input_tokens_seen": 87522525, "router_z_loss_clip": 0.05200195, "router_z_loss_mlp": 0.13085938, "step": 4067, "time_per_iteration": 2.942322254180908 }, { "auxiliary_loss_clip": 0.01080322, "auxiliary_loss_mlp": 0.01062183, "balance_loss_clip": 1.02286816, "balance_loss_mlp": 1.02500129, "epoch": 0.24458139185329927, "flos": 15120777905280.0, "grad_norm": 3.9295171125283317, "language_loss": 0.7506758, "learning_rate": 3.5361146135494706e-06, "loss": 0.77210093, "num_input_tokens_seen": 87539170, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.5546875, "step": 4068, "time_per_iteration": 2.413534641265869 }, { "auxiliary_loss_clip": 0.01080881, "auxiliary_loss_mlp": 0.01057166, "balance_loss_clip": 1.01954353, "balance_loss_mlp": 1.02718222, "epoch": 0.24464151510596724, "flos": 27997549217280.0, "grad_norm": 2.480080020663797, "language_loss": 0.78916574, "learning_rate": 3.5358651805200835e-06, "loss": 0.81054622, "num_input_tokens_seen": 87558875, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.5390625, "step": 4069, "time_per_iteration": 2.467576503753662 }, { "auxiliary_loss_clip": 0.01080329, "auxiliary_loss_mlp": 0.01070378, "balance_loss_clip": 1.03392386, "balance_loss_mlp": 1.02777433, "epoch": 0.2447016383586352, "flos": 19791842549760.0, "grad_norm": 1.9403531348174947, "language_loss": 0.82166547, "learning_rate": 3.5356156892505347e-06, "loss": 0.84317255, "num_input_tokens_seen": 87576485, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.52734375, "step": 4070, "time_per_iteration": 2.4292781352996826 }, { "auxiliary_loss_clip": 0.01077136, "auxiliary_loss_mlp": 0.01060688, "balance_loss_clip": 1.02452016, "balance_loss_mlp": 1.02433181, "epoch": 0.24476176161130317, "flos": 26066153871360.0, "grad_norm": 1.5008924857237538, "language_loss": 0.84870189, "learning_rate": 3.5353661397502854e-06, "loss": 0.87008011, "num_input_tokens_seen": 87598620, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.52734375, "step": 4071, "time_per_iteration": 2.464749574661255 }, { "auxiliary_loss_clip": 0.01082924, "auxiliary_loss_mlp": 0.01074001, "balance_loss_clip": 1.03449476, "balance_loss_mlp": 1.02630389, "epoch": 0.24482188486397113, "flos": 18842554149120.0, "grad_norm": 1.8849275454239685, "language_loss": 0.81087017, "learning_rate": 3.535116532028798e-06, "loss": 0.83243942, "num_input_tokens_seen": 87616595, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.5625, "step": 4072, "time_per_iteration": 2.4048304557800293 }, { "auxiliary_loss_clip": 0.01078391, "auxiliary_loss_mlp": 0.01074576, "balance_loss_clip": 1.04062521, "balance_loss_mlp": 1.02600002, "epoch": 0.2448820081166391, "flos": 21250723288320.0, "grad_norm": 3.295255207109717, "language_loss": 0.71148777, "learning_rate": 3.5348668660955382e-06, "loss": 0.73301744, "num_input_tokens_seen": 87635755, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.5234375, "step": 4073, "time_per_iteration": 2.4344606399536133 }, { "auxiliary_loss_clip": 0.01078822, "auxiliary_loss_mlp": 0.01064011, "balance_loss_clip": 1.02877283, "balance_loss_mlp": 1.0254252, "epoch": 0.2449421313693071, "flos": 23949474606720.0, "grad_norm": 2.203530145134386, "language_loss": 0.6982922, "learning_rate": 3.5346171419599728e-06, "loss": 0.71972054, "num_input_tokens_seen": 87652885, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.53515625, "step": 4074, "time_per_iteration": 2.4109864234924316 }, { "auxiliary_loss_clip": 0.0101987, "auxiliary_loss_mlp": 0.01006861, "balance_loss_clip": 1.00206864, "balance_loss_mlp": 1.00734973, "epoch": 0.24500225462197506, "flos": 60684631188480.0, "grad_norm": 0.9064427775436178, "language_loss": 0.68771493, "learning_rate": 3.5343673596315718e-06, "loss": 0.70798224, "num_input_tokens_seen": 87713220, "router_z_loss_clip": 0.04785156, "router_z_loss_mlp": 0.125, "step": 4075, "time_per_iteration": 3.102020263671875 }, { "auxiliary_loss_clip": 0.01075938, "auxiliary_loss_mlp": 0.01072903, "balance_loss_clip": 1.03516102, "balance_loss_mlp": 1.02394795, "epoch": 0.24506237787464302, "flos": 26283069348480.0, "grad_norm": 1.925283636322416, "language_loss": 0.81347185, "learning_rate": 3.5341175191198063e-06, "loss": 0.83496022, "num_input_tokens_seen": 87732680, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.51953125, "step": 4076, "time_per_iteration": 2.456791400909424 }, { "auxiliary_loss_clip": 0.01079452, "auxiliary_loss_mlp": 0.01084592, "balance_loss_clip": 1.04401362, "balance_loss_mlp": 1.02329731, "epoch": 0.245122501127311, "flos": 20551413288960.0, "grad_norm": 2.069191218410067, "language_loss": 0.84023339, "learning_rate": 3.533867620434151e-06, "loss": 0.86187387, "num_input_tokens_seen": 87751880, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.5625, "step": 4077, "time_per_iteration": 2.3764889240264893 }, { "auxiliary_loss_clip": 0.01076783, "auxiliary_loss_mlp": 0.01082822, "balance_loss_clip": 1.04515219, "balance_loss_mlp": 1.02233672, "epoch": 0.24518262437997895, "flos": 29131318575360.0, "grad_norm": 1.9341679867177062, "language_loss": 0.64506721, "learning_rate": 3.533617663584082e-06, "loss": 0.66666329, "num_input_tokens_seen": 87771795, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.54296875, "step": 4078, "time_per_iteration": 2.4667975902557373 }, { "auxiliary_loss_clip": 0.01073985, "auxiliary_loss_mlp": 0.0106914, "balance_loss_clip": 1.03554678, "balance_loss_mlp": 1.02268958, "epoch": 0.24524274763264692, "flos": 23475807924480.0, "grad_norm": 1.4981269244582167, "language_loss": 0.76422048, "learning_rate": 3.5333676485790765e-06, "loss": 0.78565168, "num_input_tokens_seen": 87793640, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.51171875, "step": 4079, "time_per_iteration": 2.406325101852417 }, { "auxiliary_loss_clip": 0.01073614, "auxiliary_loss_mlp": 0.01076204, "balance_loss_clip": 1.0379374, "balance_loss_mlp": 1.0211364, "epoch": 0.24530287088531488, "flos": 17200239793920.0, "grad_norm": 1.8352484979806343, "language_loss": 0.76581204, "learning_rate": 3.5331175754286173e-06, "loss": 0.78731024, "num_input_tokens_seen": 87812390, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.5234375, "step": 4080, "time_per_iteration": 2.3974366188049316 }, { "auxiliary_loss_clip": 0.01073728, "auxiliary_loss_mlp": 0.0106845, "balance_loss_clip": 1.03502345, "balance_loss_mlp": 1.02232099, "epoch": 0.24536299413798288, "flos": 14866540318080.0, "grad_norm": 1.7905302539273775, "language_loss": 0.84700578, "learning_rate": 3.532867444142186e-06, "loss": 0.86842752, "num_input_tokens_seen": 87830640, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.515625, "step": 4081, "time_per_iteration": 2.3796255588531494 }, { "auxiliary_loss_clip": 0.01075695, "auxiliary_loss_mlp": 0.01057281, "balance_loss_clip": 1.02240014, "balance_loss_mlp": 1.02428603, "epoch": 0.24542311739065084, "flos": 35260600642560.0, "grad_norm": 2.217441464887436, "language_loss": 0.7490865, "learning_rate": 3.532617254729267e-06, "loss": 0.77041626, "num_input_tokens_seen": 87850450, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.515625, "step": 4082, "time_per_iteration": 2.5373880863189697 }, { "auxiliary_loss_clip": 0.0107565, "auxiliary_loss_mlp": 0.0105755, "balance_loss_clip": 1.02455258, "balance_loss_mlp": 1.02363372, "epoch": 0.2454832406433188, "flos": 21502167966720.0, "grad_norm": 1.5032267305197065, "language_loss": 0.7280035, "learning_rate": 3.5323670071993485e-06, "loss": 0.74933541, "num_input_tokens_seen": 87868810, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.51953125, "step": 4083, "time_per_iteration": 2.4061079025268555 }, { "auxiliary_loss_clip": 0.01076867, "auxiliary_loss_mlp": 0.01068816, "balance_loss_clip": 1.02992964, "balance_loss_mlp": 1.02271867, "epoch": 0.24554336389598677, "flos": 14755795885440.0, "grad_norm": 2.0395061278324946, "language_loss": 0.7722435, "learning_rate": 3.532116701561919e-06, "loss": 0.79370034, "num_input_tokens_seen": 87885685, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.5390625, "step": 4084, "time_per_iteration": 2.3770596981048584 }, { "auxiliary_loss_clip": 0.01077131, "auxiliary_loss_mlp": 0.01047083, "balance_loss_clip": 1.01313186, "balance_loss_mlp": 1.02434993, "epoch": 0.24560348714865474, "flos": 14975504271360.0, "grad_norm": 2.9894793932898804, "language_loss": 0.86884457, "learning_rate": 3.531866337826471e-06, "loss": 0.89008677, "num_input_tokens_seen": 87903715, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.52734375, "step": 4085, "time_per_iteration": 2.41023850440979 }, { "auxiliary_loss_clip": 0.01082699, "auxiliary_loss_mlp": 0.01063516, "balance_loss_clip": 1.02498722, "balance_loss_mlp": 1.02895534, "epoch": 0.2456636104013227, "flos": 22674202041600.0, "grad_norm": 1.832318984215963, "language_loss": 0.8073504, "learning_rate": 3.5316159160024982e-06, "loss": 0.82881248, "num_input_tokens_seen": 87923375, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.5390625, "step": 4086, "time_per_iteration": 2.4116299152374268 }, { "auxiliary_loss_clip": 0.01077715, "auxiliary_loss_mlp": 0.01051663, "balance_loss_clip": 1.01752186, "balance_loss_mlp": 1.02709103, "epoch": 0.2457237336539907, "flos": 27416629238400.0, "grad_norm": 1.541543970042599, "language_loss": 0.7648536, "learning_rate": 3.531365436099496e-06, "loss": 0.78614736, "num_input_tokens_seen": 87943115, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.5078125, "step": 4087, "time_per_iteration": 2.4473538398742676 }, { "auxiliary_loss_clip": 0.01084066, "auxiliary_loss_mlp": 0.01057891, "balance_loss_clip": 1.02005363, "balance_loss_mlp": 1.02905893, "epoch": 0.24578385690665866, "flos": 20411341447680.0, "grad_norm": 2.7007281656740476, "language_loss": 0.80600071, "learning_rate": 3.5311148981269635e-06, "loss": 0.82742029, "num_input_tokens_seen": 87959505, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.546875, "step": 4088, "time_per_iteration": 3.862217903137207 }, { "auxiliary_loss_clip": 0.01075357, "auxiliary_loss_mlp": 0.01047965, "balance_loss_clip": 1.01480079, "balance_loss_mlp": 1.02610743, "epoch": 0.24584398015932662, "flos": 23914247178240.0, "grad_norm": 1.506149924242245, "language_loss": 0.78525472, "learning_rate": 3.5308643020944e-06, "loss": 0.80648792, "num_input_tokens_seen": 87979725, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.49414062, "step": 4089, "time_per_iteration": 2.4388489723205566 }, { "auxiliary_loss_clip": 0.01082393, "auxiliary_loss_mlp": 0.01056873, "balance_loss_clip": 1.01841617, "balance_loss_mlp": 1.0282172, "epoch": 0.2459041034119946, "flos": 41494866768000.0, "grad_norm": 1.9237033947739768, "language_loss": 0.83088112, "learning_rate": 3.530613648011309e-06, "loss": 0.85227376, "num_input_tokens_seen": 87998270, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.5390625, "step": 4090, "time_per_iteration": 2.5869576930999756 }, { "auxiliary_loss_clip": 0.01082128, "auxiliary_loss_mlp": 0.01065169, "balance_loss_clip": 1.02370811, "balance_loss_mlp": 1.02802157, "epoch": 0.24596422666466256, "flos": 19935824463360.0, "grad_norm": 1.6647342848811633, "language_loss": 0.75150383, "learning_rate": 3.5303629358871946e-06, "loss": 0.77297676, "num_input_tokens_seen": 88016760, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.54296875, "step": 4091, "time_per_iteration": 3.8456361293792725 }, { "auxiliary_loss_clip": 0.01081312, "auxiliary_loss_mlp": 0.01064705, "balance_loss_clip": 1.02672505, "balance_loss_mlp": 1.02817321, "epoch": 0.24602434991733052, "flos": 21543295415040.0, "grad_norm": 2.3545421574834777, "language_loss": 0.78910434, "learning_rate": 3.5301121657315653e-06, "loss": 0.81056452, "num_input_tokens_seen": 88036465, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.53125, "step": 4092, "time_per_iteration": 3.804647445678711 }, { "auxiliary_loss_clip": 0.01081915, "auxiliary_loss_mlp": 0.01058369, "balance_loss_clip": 1.02105665, "balance_loss_mlp": 1.02526999, "epoch": 0.24608447316999849, "flos": 23183968936320.0, "grad_norm": 3.349861538877092, "language_loss": 0.83658987, "learning_rate": 3.5298613375539287e-06, "loss": 0.85799271, "num_input_tokens_seen": 88053270, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.56640625, "step": 4093, "time_per_iteration": 3.78472638130188 }, { "auxiliary_loss_clip": 0.01082956, "auxiliary_loss_mlp": 0.0106724, "balance_loss_clip": 1.02864003, "balance_loss_mlp": 1.02630091, "epoch": 0.24614459642266648, "flos": 19641052920960.0, "grad_norm": 1.8889598990676102, "language_loss": 0.89378333, "learning_rate": 3.529610451363797e-06, "loss": 0.91528523, "num_input_tokens_seen": 88072305, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.56640625, "step": 4094, "time_per_iteration": 2.4280807971954346 }, { "auxiliary_loss_clip": 0.01027151, "auxiliary_loss_mlp": 0.01005687, "balance_loss_clip": 0.99963087, "balance_loss_mlp": 1.01376557, "epoch": 0.24620471967533444, "flos": 61736913699840.0, "grad_norm": 0.7478623501955826, "language_loss": 0.57607472, "learning_rate": 3.5293595071706833e-06, "loss": 0.59640312, "num_input_tokens_seen": 88137995, "router_z_loss_clip": 0.06054688, "router_z_loss_mlp": 0.13378906, "step": 4095, "time_per_iteration": 3.112612724304199 }, { "auxiliary_loss_clip": 0.01019396, "auxiliary_loss_mlp": 0.01006798, "balance_loss_clip": 1.00040829, "balance_loss_mlp": 1.00658584, "epoch": 0.2462648429280024, "flos": 69151103867520.0, "grad_norm": 0.6510964876685286, "language_loss": 0.56325638, "learning_rate": 3.5291085049841042e-06, "loss": 0.58351827, "num_input_tokens_seen": 88208490, "router_z_loss_clip": 0.06396484, "router_z_loss_mlp": 0.12792969, "step": 4096, "time_per_iteration": 3.164193630218506 }, { "auxiliary_loss_clip": 0.01075914, "auxiliary_loss_mlp": 0.01049103, "balance_loss_clip": 1.01398408, "balance_loss_mlp": 1.02262449, "epoch": 0.24632496618067037, "flos": 29458350080640.0, "grad_norm": 1.708619716713177, "language_loss": 0.78881979, "learning_rate": 3.5288574448135773e-06, "loss": 0.81007004, "num_input_tokens_seen": 88228050, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.53125, "step": 4097, "time_per_iteration": 2.4562301635742188 }, { "auxiliary_loss_clip": 0.01080699, "auxiliary_loss_mlp": 0.01070734, "balance_loss_clip": 1.02896285, "balance_loss_mlp": 1.02435327, "epoch": 0.24638508943333834, "flos": 24315294499200.0, "grad_norm": 1.8512357099687724, "language_loss": 0.77553487, "learning_rate": 3.5286063266686235e-06, "loss": 0.79704916, "num_input_tokens_seen": 88248090, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.5625, "step": 4098, "time_per_iteration": 2.441605806350708 }, { "auxiliary_loss_clip": 0.01080359, "auxiliary_loss_mlp": 0.01060187, "balance_loss_clip": 1.02442384, "balance_loss_mlp": 1.02522182, "epoch": 0.2464452126860063, "flos": 26612090801280.0, "grad_norm": 1.899677103919735, "language_loss": 0.7021755, "learning_rate": 3.528355150558764e-06, "loss": 0.72358096, "num_input_tokens_seen": 88267545, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.5546875, "step": 4099, "time_per_iteration": 2.4292664527893066 }, { "auxiliary_loss_clip": 0.0107624, "auxiliary_loss_mlp": 0.01056977, "balance_loss_clip": 1.0230267, "balance_loss_mlp": 1.02600443, "epoch": 0.24650533593867427, "flos": 31211059754880.0, "grad_norm": 2.2452633704015854, "language_loss": 0.67956233, "learning_rate": 3.5281039164935237e-06, "loss": 0.70089447, "num_input_tokens_seen": 88289785, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.50390625, "step": 4100, "time_per_iteration": 2.508368492126465 }, { "auxiliary_loss_clip": 0.01020668, "auxiliary_loss_mlp": 0.01009206, "balance_loss_clip": 1.00398421, "balance_loss_mlp": 1.00767159, "epoch": 0.24656545919134226, "flos": 68490791723520.0, "grad_norm": 0.7191769112386236, "language_loss": 0.61504519, "learning_rate": 3.5278526244824304e-06, "loss": 0.63534391, "num_input_tokens_seen": 88357320, "router_z_loss_clip": 0.05224609, "router_z_loss_mlp": 0.13085938, "step": 4101, "time_per_iteration": 3.119555950164795 }, { "auxiliary_loss_clip": 0.01078061, "auxiliary_loss_mlp": 0.0106332, "balance_loss_clip": 1.02529216, "balance_loss_mlp": 1.02574253, "epoch": 0.24662558244401023, "flos": 20083157867520.0, "grad_norm": 2.1960740621879773, "language_loss": 0.7417109, "learning_rate": 3.527601274535012e-06, "loss": 0.7631247, "num_input_tokens_seen": 88377040, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.5234375, "step": 4102, "time_per_iteration": 2.412973165512085 }, { "auxiliary_loss_clip": 0.01080593, "auxiliary_loss_mlp": 0.01064842, "balance_loss_clip": 1.02581286, "balance_loss_mlp": 1.02676773, "epoch": 0.2466857056966782, "flos": 30700036051200.0, "grad_norm": 2.256476779632115, "language_loss": 0.7643671, "learning_rate": 3.5273498666608004e-06, "loss": 0.78582144, "num_input_tokens_seen": 88395085, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.5390625, "step": 4103, "time_per_iteration": 2.4640629291534424 }, { "auxiliary_loss_clip": 0.01082929, "auxiliary_loss_mlp": 0.01056949, "balance_loss_clip": 1.01734722, "balance_loss_mlp": 1.02812374, "epoch": 0.24674582894934616, "flos": 22527427219200.0, "grad_norm": 2.5089721577435586, "language_loss": 0.80122852, "learning_rate": 3.5270984008693288e-06, "loss": 0.82262731, "num_input_tokens_seen": 88413205, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.546875, "step": 4104, "time_per_iteration": 2.428025484085083 }, { "auxiliary_loss_clip": 0.01087706, "auxiliary_loss_mlp": 0.01054585, "balance_loss_clip": 1.014817, "balance_loss_mlp": 1.03403044, "epoch": 0.24680595220201412, "flos": 20703250258560.0, "grad_norm": 1.8701561339186086, "language_loss": 0.85148472, "learning_rate": 3.526846877170133e-06, "loss": 0.87290764, "num_input_tokens_seen": 88431525, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.5390625, "step": 4105, "time_per_iteration": 2.4216604232788086 }, { "auxiliary_loss_clip": 0.01087152, "auxiliary_loss_mlp": 0.01056883, "balance_loss_clip": 1.0183543, "balance_loss_mlp": 1.03238368, "epoch": 0.2468660754546821, "flos": 21830211901440.0, "grad_norm": 1.8284901216229983, "language_loss": 0.78235996, "learning_rate": 3.52659529557275e-06, "loss": 0.80380028, "num_input_tokens_seen": 88451210, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.546875, "step": 4106, "time_per_iteration": 2.445884943008423 }, { "auxiliary_loss_clip": 0.01086203, "auxiliary_loss_mlp": 0.01061523, "balance_loss_clip": 1.0216831, "balance_loss_mlp": 1.03107238, "epoch": 0.24692619870735008, "flos": 15266819589120.0, "grad_norm": 2.5019238857695614, "language_loss": 0.74762279, "learning_rate": 3.5263436560867205e-06, "loss": 0.76910007, "num_input_tokens_seen": 88467790, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.55078125, "step": 4107, "time_per_iteration": 2.388078212738037 }, { "auxiliary_loss_clip": 0.01087629, "auxiliary_loss_mlp": 0.01064677, "balance_loss_clip": 1.0255053, "balance_loss_mlp": 1.03245938, "epoch": 0.24698632196001805, "flos": 29678791605120.0, "grad_norm": 1.62429019251575, "language_loss": 0.67317533, "learning_rate": 3.526091958721587e-06, "loss": 0.69469845, "num_input_tokens_seen": 88490330, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.55078125, "step": 4108, "time_per_iteration": 2.542109251022339 }, { "auxiliary_loss_clip": 0.01086527, "auxiliary_loss_mlp": 0.01070262, "balance_loss_clip": 1.02825236, "balance_loss_mlp": 1.0300622, "epoch": 0.247046445212686, "flos": 39163925289600.0, "grad_norm": 2.1495606279939428, "language_loss": 0.74320567, "learning_rate": 3.5258402034868936e-06, "loss": 0.76477355, "num_input_tokens_seen": 88512435, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 0.56640625, "step": 4109, "time_per_iteration": 2.5963244438171387 }, { "auxiliary_loss_clip": 0.01083853, "auxiliary_loss_mlp": 0.01062558, "balance_loss_clip": 1.02197909, "balance_loss_mlp": 1.02764714, "epoch": 0.24710656846535398, "flos": 22997847144960.0, "grad_norm": 1.9904973572082112, "language_loss": 0.80545163, "learning_rate": 3.5255883903921866e-06, "loss": 0.82691574, "num_input_tokens_seen": 88529780, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.5625, "step": 4110, "time_per_iteration": 2.4469001293182373 }, { "auxiliary_loss_clip": 0.01084, "auxiliary_loss_mlp": 0.01067321, "balance_loss_clip": 1.02569366, "balance_loss_mlp": 1.02796149, "epoch": 0.24716669171802194, "flos": 26431589738880.0, "grad_norm": 2.0165848248137452, "language_loss": 0.8284595, "learning_rate": 3.5253365194470144e-06, "loss": 0.84997267, "num_input_tokens_seen": 88547200, "router_z_loss_clip": 0.41601562, "router_z_loss_mlp": 0.5625, "step": 4111, "time_per_iteration": 2.417048692703247 }, { "auxiliary_loss_clip": 0.01081531, "auxiliary_loss_mlp": 0.01058163, "balance_loss_clip": 1.01922953, "balance_loss_mlp": 1.02667725, "epoch": 0.2472268149706899, "flos": 23328788722560.0, "grad_norm": 2.2985209595084077, "language_loss": 0.76875716, "learning_rate": 3.5250845906609294e-06, "loss": 0.7901541, "num_input_tokens_seen": 88566415, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.55078125, "step": 4112, "time_per_iteration": 2.437804698944092 }, { "auxiliary_loss_clip": 0.01080643, "auxiliary_loss_mlp": 0.01068369, "balance_loss_clip": 1.02988863, "balance_loss_mlp": 1.02494001, "epoch": 0.24728693822335787, "flos": 23767612001280.0, "grad_norm": 1.868060448676148, "language_loss": 0.84033823, "learning_rate": 3.5248326040434835e-06, "loss": 0.86182833, "num_input_tokens_seen": 88585225, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.5546875, "step": 4113, "time_per_iteration": 2.4234378337860107 }, { "auxiliary_loss_clip": 0.01078447, "auxiliary_loss_mlp": 0.0105939, "balance_loss_clip": 1.01749957, "balance_loss_mlp": 1.02360177, "epoch": 0.24734706147602586, "flos": 19316500122240.0, "grad_norm": 2.1282645429704092, "language_loss": 0.89845634, "learning_rate": 3.5245805596042322e-06, "loss": 0.91983473, "num_input_tokens_seen": 88603280, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.55078125, "step": 4114, "time_per_iteration": 2.4025440216064453 }, { "auxiliary_loss_clip": 0.01079366, "auxiliary_loss_mlp": 0.0106359, "balance_loss_clip": 1.02413142, "balance_loss_mlp": 1.02330959, "epoch": 0.24740718472869383, "flos": 28035709200000.0, "grad_norm": 1.7229729758228118, "language_loss": 0.76758415, "learning_rate": 3.524328457352734e-06, "loss": 0.78901374, "num_input_tokens_seen": 88624925, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.5625, "step": 4115, "time_per_iteration": 2.4383676052093506 }, { "auxiliary_loss_clip": 0.01019029, "auxiliary_loss_mlp": 0.01020626, "balance_loss_clip": 1.01461792, "balance_loss_mlp": 1.00556958, "epoch": 0.2474673079813618, "flos": 68103953326080.0, "grad_norm": 0.6968632919127377, "language_loss": 0.58345836, "learning_rate": 3.5240762972985475e-06, "loss": 0.60385495, "num_input_tokens_seen": 88691475, "router_z_loss_clip": 0.06005859, "router_z_loss_mlp": 0.13476562, "step": 4116, "time_per_iteration": 3.1503050327301025 }, { "auxiliary_loss_clip": 0.01080053, "auxiliary_loss_mlp": 0.0106698, "balance_loss_clip": 1.02313423, "balance_loss_mlp": 1.02346921, "epoch": 0.24752743123402976, "flos": 29460793875840.0, "grad_norm": 1.466037532565512, "language_loss": 0.84507513, "learning_rate": 3.523824079451235e-06, "loss": 0.86654544, "num_input_tokens_seen": 88713425, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.56640625, "step": 4117, "time_per_iteration": 2.4562833309173584 }, { "auxiliary_loss_clip": 0.01019571, "auxiliary_loss_mlp": 0.01009339, "balance_loss_clip": 1.00368834, "balance_loss_mlp": 1.0062201, "epoch": 0.24758755448669773, "flos": 58347545310720.0, "grad_norm": 0.9060165209000494, "language_loss": 0.63467872, "learning_rate": 3.5235718038203602e-06, "loss": 0.6549679, "num_input_tokens_seen": 88769995, "router_z_loss_clip": 0.05639648, "router_z_loss_mlp": 0.1328125, "step": 4118, "time_per_iteration": 2.913499355316162 }, { "auxiliary_loss_clip": 0.01078727, "auxiliary_loss_mlp": 0.01063533, "balance_loss_clip": 1.02374148, "balance_loss_mlp": 1.02500451, "epoch": 0.2476476777393657, "flos": 20483402227200.0, "grad_norm": 1.5908131387188509, "language_loss": 0.80583549, "learning_rate": 3.523319470415491e-06, "loss": 0.82725811, "num_input_tokens_seen": 88789970, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.5390625, "step": 4119, "time_per_iteration": 2.3861520290374756 }, { "auxiliary_loss_clip": 0.01079377, "auxiliary_loss_mlp": 0.01060273, "balance_loss_clip": 1.02403378, "balance_loss_mlp": 1.025141, "epoch": 0.24770780099203366, "flos": 20484798681600.0, "grad_norm": 1.509772473337409, "language_loss": 0.76656902, "learning_rate": 3.5230670792461943e-06, "loss": 0.78796548, "num_input_tokens_seen": 88810000, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.54296875, "step": 4120, "time_per_iteration": 2.4449617862701416 }, { "auxiliary_loss_clip": 0.01083638, "auxiliary_loss_mlp": 0.01067152, "balance_loss_clip": 1.02261591, "balance_loss_mlp": 1.02677441, "epoch": 0.24776792424470165, "flos": 15152653843200.0, "grad_norm": 11.134287198808924, "language_loss": 0.90499443, "learning_rate": 3.522814630322041e-06, "loss": 0.92650229, "num_input_tokens_seen": 88827515, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 0.5703125, "step": 4121, "time_per_iteration": 2.365471839904785 }, { "auxiliary_loss_clip": 0.01085718, "auxiliary_loss_mlp": 0.01066251, "balance_loss_clip": 1.02157116, "balance_loss_mlp": 1.02824259, "epoch": 0.2478280474973696, "flos": 21724389970560.0, "grad_norm": 5.729419427803232, "language_loss": 0.71628761, "learning_rate": 3.5225621236526045e-06, "loss": 0.73780727, "num_input_tokens_seen": 88845025, "router_z_loss_clip": 0.44726562, "router_z_loss_mlp": 0.57421875, "step": 4122, "time_per_iteration": 2.4486865997314453 }, { "auxiliary_loss_clip": 0.01086587, "auxiliary_loss_mlp": 0.01066552, "balance_loss_clip": 1.02408934, "balance_loss_mlp": 1.02828908, "epoch": 0.24788817075003758, "flos": 20411166890880.0, "grad_norm": 2.248662068703609, "language_loss": 0.81648588, "learning_rate": 3.5223095592474596e-06, "loss": 0.83801723, "num_input_tokens_seen": 88861740, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 0.58203125, "step": 4123, "time_per_iteration": 2.4198250770568848 }, { "auxiliary_loss_clip": 0.01084221, "auxiliary_loss_mlp": 0.01065897, "balance_loss_clip": 1.02636755, "balance_loss_mlp": 1.02788866, "epoch": 0.24794829400270554, "flos": 22593553067520.0, "grad_norm": 1.891888499671389, "language_loss": 0.7606473, "learning_rate": 3.5220569371161846e-06, "loss": 0.78214848, "num_input_tokens_seen": 88879740, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.5625, "step": 4124, "time_per_iteration": 2.463684558868408 }, { "auxiliary_loss_clip": 0.01085033, "auxiliary_loss_mlp": 0.01056914, "balance_loss_clip": 1.01888657, "balance_loss_mlp": 1.03119683, "epoch": 0.2480084172553735, "flos": 39674495145600.0, "grad_norm": 1.7910322761784299, "language_loss": 0.75037318, "learning_rate": 3.521804257268357e-06, "loss": 0.77179265, "num_input_tokens_seen": 88904095, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.5390625, "step": 4125, "time_per_iteration": 2.5674262046813965 }, { "auxiliary_loss_clip": 0.01087655, "auxiliary_loss_mlp": 0.01069042, "balance_loss_clip": 1.02722323, "balance_loss_mlp": 1.02826595, "epoch": 0.24806854050804147, "flos": 22052643373440.0, "grad_norm": 1.9917358192151446, "language_loss": 0.72184068, "learning_rate": 3.5215515197135595e-06, "loss": 0.74340761, "num_input_tokens_seen": 88920740, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.59375, "step": 4126, "time_per_iteration": 2.490355968475342 }, { "auxiliary_loss_clip": 0.0108877, "auxiliary_loss_mlp": 0.01063158, "balance_loss_clip": 1.02126813, "balance_loss_mlp": 1.03041625, "epoch": 0.24812866376070947, "flos": 15485864659200.0, "grad_norm": 2.1266833865996544, "language_loss": 0.83596849, "learning_rate": 3.5212987244613764e-06, "loss": 0.8574878, "num_input_tokens_seen": 88938510, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.5859375, "step": 4127, "time_per_iteration": 3.8470637798309326 }, { "auxiliary_loss_clip": 0.01087437, "auxiliary_loss_mlp": 0.01060213, "balance_loss_clip": 1.01918089, "balance_loss_mlp": 1.03049409, "epoch": 0.24818878701337743, "flos": 14756529024000.0, "grad_norm": 2.278020913814939, "language_loss": 0.86644912, "learning_rate": 3.5210458715213927e-06, "loss": 0.88792562, "num_input_tokens_seen": 88955235, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.5703125, "step": 4128, "time_per_iteration": 2.4354209899902344 }, { "auxiliary_loss_clip": 0.01084819, "auxiliary_loss_mlp": 0.01059737, "balance_loss_clip": 1.02046907, "balance_loss_mlp": 1.02823496, "epoch": 0.2482489102660454, "flos": 27088271101440.0, "grad_norm": 2.4797213969908225, "language_loss": 0.67561781, "learning_rate": 3.5207929609031973e-06, "loss": 0.69706333, "num_input_tokens_seen": 88975210, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.56640625, "step": 4129, "time_per_iteration": 2.452214241027832 }, { "auxiliary_loss_clip": 0.0108129, "auxiliary_loss_mlp": 0.01063552, "balance_loss_clip": 1.0235697, "balance_loss_mlp": 1.02740574, "epoch": 0.24830903351871336, "flos": 26466363319680.0, "grad_norm": 1.7706383951366746, "language_loss": 0.76961535, "learning_rate": 3.5205399926163806e-06, "loss": 0.79106379, "num_input_tokens_seen": 88996120, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 0.5390625, "step": 4130, "time_per_iteration": 3.9370250701904297 }, { "auxiliary_loss_clip": 0.01083123, "auxiliary_loss_mlp": 0.01063436, "balance_loss_clip": 1.02233255, "balance_loss_mlp": 1.02564478, "epoch": 0.24836915677138133, "flos": 10227805459200.0, "grad_norm": 2.516836443065612, "language_loss": 0.79092675, "learning_rate": 3.520286966670535e-06, "loss": 0.81239235, "num_input_tokens_seen": 89008685, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.57421875, "step": 4131, "time_per_iteration": 3.715996503829956 }, { "auxiliary_loss_clip": 0.0107723, "auxiliary_loss_mlp": 0.01057675, "balance_loss_clip": 1.02176881, "balance_loss_mlp": 1.02322292, "epoch": 0.2484292800240493, "flos": 30079140698880.0, "grad_norm": 1.5122974581930986, "language_loss": 0.84776473, "learning_rate": 3.520033883075255e-06, "loss": 0.8691138, "num_input_tokens_seen": 89031160, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.5390625, "step": 4132, "time_per_iteration": 3.8950085639953613 }, { "auxiliary_loss_clip": 0.01078606, "auxiliary_loss_mlp": 0.0106058, "balance_loss_clip": 1.01792717, "balance_loss_mlp": 1.02287126, "epoch": 0.24848940327671726, "flos": 13442118958080.0, "grad_norm": 1.7468271133996423, "language_loss": 0.72457433, "learning_rate": 3.5197807418401386e-06, "loss": 0.7459662, "num_input_tokens_seen": 89047235, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 0.5546875, "step": 4133, "time_per_iteration": 2.3726863861083984 }, { "auxiliary_loss_clip": 0.01084237, "auxiliary_loss_mlp": 0.01070962, "balance_loss_clip": 1.01922512, "balance_loss_mlp": 1.02450764, "epoch": 0.24854952652938525, "flos": 19969341235200.0, "grad_norm": 2.5180858646530684, "language_loss": 0.6394037, "learning_rate": 3.5195275429747834e-06, "loss": 0.66095561, "num_input_tokens_seen": 89064790, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 0.59375, "step": 4134, "time_per_iteration": 2.3851380348205566 }, { "auxiliary_loss_clip": 0.01078761, "auxiliary_loss_mlp": 0.01055928, "balance_loss_clip": 1.0173521, "balance_loss_mlp": 1.02212906, "epoch": 0.24860964978205322, "flos": 18149213992320.0, "grad_norm": 2.7729579342113273, "language_loss": 0.79950857, "learning_rate": 3.5192742864887914e-06, "loss": 0.8208555, "num_input_tokens_seen": 89083250, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.56640625, "step": 4135, "time_per_iteration": 2.36698317527771 }, { "auxiliary_loss_clip": 0.01078939, "auxiliary_loss_mlp": 0.01057331, "balance_loss_clip": 1.01882672, "balance_loss_mlp": 1.02425933, "epoch": 0.24866977303472118, "flos": 11727848557440.0, "grad_norm": 2.5513006750789025, "language_loss": 0.84207606, "learning_rate": 3.5190209723917662e-06, "loss": 0.86343873, "num_input_tokens_seen": 89100905, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.546875, "step": 4136, "time_per_iteration": 2.3955652713775635 }, { "auxiliary_loss_clip": 0.0108012, "auxiliary_loss_mlp": 0.0106016, "balance_loss_clip": 1.02072513, "balance_loss_mlp": 1.02413154, "epoch": 0.24872989628738915, "flos": 34822161388800.0, "grad_norm": 1.8330838453202505, "language_loss": 0.7264992, "learning_rate": 3.518767600693314e-06, "loss": 0.74790198, "num_input_tokens_seen": 89122630, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.55859375, "step": 4137, "time_per_iteration": 2.484415292739868 }, { "auxiliary_loss_clip": 0.01077692, "auxiliary_loss_mlp": 0.01054987, "balance_loss_clip": 1.01660144, "balance_loss_mlp": 1.02179027, "epoch": 0.2487900195400571, "flos": 13698486138240.0, "grad_norm": 2.529355796803447, "language_loss": 0.70562118, "learning_rate": 3.518514171403042e-06, "loss": 0.72694802, "num_input_tokens_seen": 89141050, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.55859375, "step": 4138, "time_per_iteration": 2.391927719116211 }, { "auxiliary_loss_clip": 0.0107762, "auxiliary_loss_mlp": 0.01053332, "balance_loss_clip": 1.01706862, "balance_loss_mlp": 1.02353692, "epoch": 0.24885014279272508, "flos": 25336643679360.0, "grad_norm": 1.873073594662746, "language_loss": 0.85325813, "learning_rate": 3.51826068453056e-06, "loss": 0.87456775, "num_input_tokens_seen": 89160810, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.54296875, "step": 4139, "time_per_iteration": 2.414858818054199 }, { "auxiliary_loss_clip": 0.01079862, "auxiliary_loss_mlp": 0.01060154, "balance_loss_clip": 1.01828778, "balance_loss_mlp": 1.02373469, "epoch": 0.24891026604539307, "flos": 20630386517760.0, "grad_norm": 1.4989153600431897, "language_loss": 0.80510312, "learning_rate": 3.518007140085481e-06, "loss": 0.82650328, "num_input_tokens_seen": 89180610, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.5625, "step": 4140, "time_per_iteration": 2.4023523330688477 }, { "auxiliary_loss_clip": 0.01018482, "auxiliary_loss_mlp": 0.01007378, "balance_loss_clip": 1.00175142, "balance_loss_mlp": 1.00529659, "epoch": 0.24897038929806103, "flos": 66957162030720.0, "grad_norm": 0.9456506251621527, "language_loss": 0.61130095, "learning_rate": 3.51775353807742e-06, "loss": 0.63155955, "num_input_tokens_seen": 89241880, "router_z_loss_clip": 0.05615234, "router_z_loss_mlp": 0.13183594, "step": 4141, "time_per_iteration": 3.0560102462768555 }, { "auxiliary_loss_clip": 0.01084525, "auxiliary_loss_mlp": 0.01062217, "balance_loss_clip": 1.02290154, "balance_loss_mlp": 1.02794456, "epoch": 0.249030512550729, "flos": 36391088332800.0, "grad_norm": 1.7733729439896249, "language_loss": 0.73407769, "learning_rate": 3.5174998785159913e-06, "loss": 0.75554514, "num_input_tokens_seen": 89263340, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.56640625, "step": 4142, "time_per_iteration": 2.542952060699463 }, { "auxiliary_loss_clip": 0.01080373, "auxiliary_loss_mlp": 0.01061381, "balance_loss_clip": 1.02213693, "balance_loss_mlp": 1.02570081, "epoch": 0.24909063580339696, "flos": 20153612724480.0, "grad_norm": 1.9388110043515512, "language_loss": 0.83226115, "learning_rate": 3.5172461614108157e-06, "loss": 0.8536787, "num_input_tokens_seen": 89282870, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.546875, "step": 4143, "time_per_iteration": 2.386887550354004 }, { "auxiliary_loss_clip": 0.01074951, "auxiliary_loss_mlp": 0.01049364, "balance_loss_clip": 1.01527011, "balance_loss_mlp": 1.02360392, "epoch": 0.24915075905606493, "flos": 26395349880960.0, "grad_norm": 1.9773641439897351, "language_loss": 0.60292566, "learning_rate": 3.5169923867715137e-06, "loss": 0.62416881, "num_input_tokens_seen": 89303830, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.51171875, "step": 4144, "time_per_iteration": 2.538722515106201 }, { "auxiliary_loss_clip": 0.01079047, "auxiliary_loss_mlp": 0.01054867, "balance_loss_clip": 1.0203439, "balance_loss_mlp": 1.02563858, "epoch": 0.2492108823087329, "flos": 27525977216640.0, "grad_norm": 1.8067875286259487, "language_loss": 0.79835415, "learning_rate": 3.516738554607708e-06, "loss": 0.81969333, "num_input_tokens_seen": 89324350, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.53515625, "step": 4145, "time_per_iteration": 2.4495434761047363 }, { "auxiliary_loss_clip": 0.01085811, "auxiliary_loss_mlp": 0.01066715, "balance_loss_clip": 1.02253544, "balance_loss_mlp": 1.02613711, "epoch": 0.24927100556140086, "flos": 16690437987840.0, "grad_norm": 2.2430370491417073, "language_loss": 0.68957031, "learning_rate": 3.5164846649290253e-06, "loss": 0.71109557, "num_input_tokens_seen": 89342875, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 0.59765625, "step": 4146, "time_per_iteration": 2.408151149749756 }, { "auxiliary_loss_clip": 0.01020467, "auxiliary_loss_mlp": 0.01007011, "balance_loss_clip": 1.0011934, "balance_loss_mlp": 1.0065248, "epoch": 0.24933112881406885, "flos": 62769225047040.0, "grad_norm": 0.9379186542297425, "language_loss": 0.67292118, "learning_rate": 3.5162307177450915e-06, "loss": 0.69319594, "num_input_tokens_seen": 89404925, "router_z_loss_clip": 0.05810547, "router_z_loss_mlp": 0.13867188, "step": 4147, "time_per_iteration": 3.120908737182617 }, { "auxiliary_loss_clip": 0.01079976, "auxiliary_loss_mlp": 0.01056397, "balance_loss_clip": 1.01846504, "balance_loss_mlp": 1.02688241, "epoch": 0.24939125206673682, "flos": 26650669720320.0, "grad_norm": 1.7050143163163691, "language_loss": 0.90759265, "learning_rate": 3.5159767130655366e-06, "loss": 0.92895633, "num_input_tokens_seen": 89425090, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.53125, "step": 4148, "time_per_iteration": 2.4532992839813232 }, { "auxiliary_loss_clip": 0.01082735, "auxiliary_loss_mlp": 0.01062415, "balance_loss_clip": 1.01947618, "balance_loss_mlp": 1.02610159, "epoch": 0.24945137531940478, "flos": 20703285169920.0, "grad_norm": 1.8315167876952603, "language_loss": 0.70225871, "learning_rate": 3.5157226508999935e-06, "loss": 0.72371024, "num_input_tokens_seen": 89442615, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 0.56640625, "step": 4149, "time_per_iteration": 2.4139347076416016 }, { "auxiliary_loss_clip": 0.01075764, "auxiliary_loss_mlp": 0.01055298, "balance_loss_clip": 1.01989293, "balance_loss_mlp": 1.0234375, "epoch": 0.24951149857207275, "flos": 23767542178560.0, "grad_norm": 1.5483054578651156, "language_loss": 0.72304076, "learning_rate": 3.515468531258095e-06, "loss": 0.74435139, "num_input_tokens_seen": 89463025, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.5234375, "step": 4150, "time_per_iteration": 2.418304443359375 }, { "auxiliary_loss_clip": 0.01078297, "auxiliary_loss_mlp": 0.01063491, "balance_loss_clip": 1.02303135, "balance_loss_mlp": 1.02362752, "epoch": 0.2495716218247407, "flos": 15664096483200.0, "grad_norm": 1.750338162435281, "language_loss": 0.74243599, "learning_rate": 3.515214354149478e-06, "loss": 0.76385385, "num_input_tokens_seen": 89480225, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.546875, "step": 4151, "time_per_iteration": 2.3468077182769775 }, { "auxiliary_loss_clip": 0.01083698, "auxiliary_loss_mlp": 0.01073497, "balance_loss_clip": 1.03139234, "balance_loss_mlp": 1.02571785, "epoch": 0.24963174507740868, "flos": 24051595933440.0, "grad_norm": 2.8058552693259378, "language_loss": 0.66364336, "learning_rate": 3.514960119583781e-06, "loss": 0.68521529, "num_input_tokens_seen": 89496985, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.578125, "step": 4152, "time_per_iteration": 2.4043893814086914 }, { "auxiliary_loss_clip": 0.01075103, "auxiliary_loss_mlp": 0.01058093, "balance_loss_clip": 1.02421331, "balance_loss_mlp": 1.0234642, "epoch": 0.24969186833007664, "flos": 21798405786240.0, "grad_norm": 1.8896346355671247, "language_loss": 0.78764451, "learning_rate": 3.514705827570645e-06, "loss": 0.80897653, "num_input_tokens_seen": 89514420, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.515625, "step": 4153, "time_per_iteration": 2.3852810859680176 }, { "auxiliary_loss_clip": 0.01074078, "auxiliary_loss_mlp": 0.01058192, "balance_loss_clip": 1.02300191, "balance_loss_mlp": 1.0224632, "epoch": 0.24975199158274464, "flos": 19937116183680.0, "grad_norm": 2.226582114660258, "language_loss": 0.78315294, "learning_rate": 3.514451478119711e-06, "loss": 0.80447567, "num_input_tokens_seen": 89532925, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.515625, "step": 4154, "time_per_iteration": 2.3911995887756348 }, { "auxiliary_loss_clip": 0.01079533, "auxiliary_loss_mlp": 0.01061862, "balance_loss_clip": 1.01801717, "balance_loss_mlp": 1.0221777, "epoch": 0.2498121148354126, "flos": 25337202261120.0, "grad_norm": 2.1221823890438634, "language_loss": 0.73134553, "learning_rate": 3.5141970712406258e-06, "loss": 0.75275946, "num_input_tokens_seen": 89552855, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 0.57421875, "step": 4155, "time_per_iteration": 2.398071527481079 }, { "auxiliary_loss_clip": 0.01077804, "auxiliary_loss_mlp": 0.01062904, "balance_loss_clip": 1.02237296, "balance_loss_mlp": 1.02203465, "epoch": 0.24987223808808057, "flos": 20557732245120.0, "grad_norm": 1.6500431000907563, "language_loss": 0.76256275, "learning_rate": 3.513942606943036e-06, "loss": 0.78396982, "num_input_tokens_seen": 89572830, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.55859375, "step": 4156, "time_per_iteration": 2.3931994438171387 }, { "auxiliary_loss_clip": 0.01074532, "auxiliary_loss_mlp": 0.01054426, "balance_loss_clip": 1.01794839, "balance_loss_mlp": 1.02064347, "epoch": 0.24993236134074853, "flos": 19748201483520.0, "grad_norm": 2.001688407110972, "language_loss": 0.78130579, "learning_rate": 3.513688085236591e-06, "loss": 0.80259538, "num_input_tokens_seen": 89590345, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.5390625, "step": 4157, "time_per_iteration": 2.374446392059326 }, { "auxiliary_loss_clip": 0.01077262, "auxiliary_loss_mlp": 0.01056289, "balance_loss_clip": 1.01873851, "balance_loss_mlp": 1.02270269, "epoch": 0.2499924845934165, "flos": 18769306383360.0, "grad_norm": 1.531215933859981, "language_loss": 0.82760155, "learning_rate": 3.513433506130942e-06, "loss": 0.84893709, "num_input_tokens_seen": 89610295, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.54296875, "step": 4158, "time_per_iteration": 2.388986110687256 }, { "auxiliary_loss_clip": 0.01077669, "auxiliary_loss_mlp": 0.01052939, "balance_loss_clip": 1.01481581, "balance_loss_mlp": 1.0225122, "epoch": 0.25005260784608446, "flos": 16871288163840.0, "grad_norm": 1.8322834356388111, "language_loss": 0.77446276, "learning_rate": 3.5131788696357427e-06, "loss": 0.79576886, "num_input_tokens_seen": 89627795, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.55078125, "step": 4159, "time_per_iteration": 2.345208168029785 }, { "auxiliary_loss_clip": 0.01078562, "auxiliary_loss_mlp": 0.0105665, "balance_loss_clip": 1.01766884, "balance_loss_mlp": 1.02234149, "epoch": 0.2501127310987524, "flos": 22123901191680.0, "grad_norm": 1.874001779434463, "language_loss": 0.72067797, "learning_rate": 3.512924175760649e-06, "loss": 0.74203002, "num_input_tokens_seen": 89648090, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.5625, "step": 4160, "time_per_iteration": 2.3951282501220703 }, { "auxiliary_loss_clip": 0.01031434, "auxiliary_loss_mlp": 0.01009431, "balance_loss_clip": 1.00280273, "balance_loss_mlp": 1.01480794, "epoch": 0.2501728543514204, "flos": 69454393781760.0, "grad_norm": 0.7553054937286073, "language_loss": 0.56776309, "learning_rate": 3.5126694245153186e-06, "loss": 0.58817172, "num_input_tokens_seen": 89710345, "router_z_loss_clip": 0.06640625, "router_z_loss_mlp": 0.16601562, "step": 4161, "time_per_iteration": 3.0608177185058594 }, { "auxiliary_loss_clip": 0.01085247, "auxiliary_loss_mlp": 0.01061679, "balance_loss_clip": 1.02005148, "balance_loss_mlp": 1.02610159, "epoch": 0.25023297760408836, "flos": 16289041553280.0, "grad_norm": 1.846200933393925, "language_loss": 0.83402824, "learning_rate": 3.5124146159094125e-06, "loss": 0.85549748, "num_input_tokens_seen": 89729390, "router_z_loss_clip": 0.41601562, "router_z_loss_mlp": 0.59375, "step": 4162, "time_per_iteration": 2.411557197570801 }, { "auxiliary_loss_clip": 0.01081511, "auxiliary_loss_mlp": 0.01062081, "balance_loss_clip": 1.02155006, "balance_loss_mlp": 1.02367759, "epoch": 0.2502931008567563, "flos": 12237231427200.0, "grad_norm": 2.3183482785020955, "language_loss": 0.89269984, "learning_rate": 3.5121597499525927e-06, "loss": 0.91413581, "num_input_tokens_seen": 89742805, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.578125, "step": 4163, "time_per_iteration": 2.3530325889587402 }, { "auxiliary_loss_clip": 0.01078885, "auxiliary_loss_mlp": 0.01061278, "balance_loss_clip": 1.02627826, "balance_loss_mlp": 1.02362609, "epoch": 0.25035322410942434, "flos": 23180861825280.0, "grad_norm": 1.6610205674405634, "language_loss": 0.84875709, "learning_rate": 3.5119048266545232e-06, "loss": 0.87015873, "num_input_tokens_seen": 89761145, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.55078125, "step": 4164, "time_per_iteration": 2.421299934387207 }, { "auxiliary_loss_clip": 0.01075523, "auxiliary_loss_mlp": 0.01061645, "balance_loss_clip": 1.02815902, "balance_loss_mlp": 1.02427268, "epoch": 0.2504133473620923, "flos": 20916639688320.0, "grad_norm": 1.7701499616047314, "language_loss": 0.7506597, "learning_rate": 3.5116498460248716e-06, "loss": 0.77203137, "num_input_tokens_seen": 89780905, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.515625, "step": 4165, "time_per_iteration": 2.400285482406616 }, { "auxiliary_loss_clip": 0.01080834, "auxiliary_loss_mlp": 0.01075113, "balance_loss_clip": 1.03386688, "balance_loss_mlp": 1.02321649, "epoch": 0.2504734706147603, "flos": 20775520506240.0, "grad_norm": 2.701473086840755, "language_loss": 0.75123852, "learning_rate": 3.5113948080733062e-06, "loss": 0.77279794, "num_input_tokens_seen": 89799230, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 0.578125, "step": 4166, "time_per_iteration": 3.8191535472869873 }, { "auxiliary_loss_clip": 0.01073291, "auxiliary_loss_mlp": 0.010693, "balance_loss_clip": 1.03377521, "balance_loss_mlp": 1.02124238, "epoch": 0.25053359386742824, "flos": 24348322512000.0, "grad_norm": 1.7467253704422765, "language_loss": 0.83047152, "learning_rate": 3.5111397128094973e-06, "loss": 0.85189748, "num_input_tokens_seen": 89818240, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.5234375, "step": 4167, "time_per_iteration": 2.40356183052063 }, { "auxiliary_loss_clip": 0.01073703, "auxiliary_loss_mlp": 0.01065158, "balance_loss_clip": 1.02825117, "balance_loss_mlp": 1.02098083, "epoch": 0.2505937171200962, "flos": 21213296444160.0, "grad_norm": 2.46109888360696, "language_loss": 0.81836879, "learning_rate": 3.51088456024312e-06, "loss": 0.83975744, "num_input_tokens_seen": 89834485, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.52734375, "step": 4168, "time_per_iteration": 2.4010775089263916 }, { "auxiliary_loss_clip": 0.01078181, "auxiliary_loss_mlp": 0.01065821, "balance_loss_clip": 1.02421689, "balance_loss_mlp": 1.02225661, "epoch": 0.25065384037276417, "flos": 41425633808640.0, "grad_norm": 2.1546978891637503, "language_loss": 0.7214148, "learning_rate": 3.510629350383849e-06, "loss": 0.74285477, "num_input_tokens_seen": 89855645, "router_z_loss_clip": 0.41601562, "router_z_loss_mlp": 0.55859375, "step": 4169, "time_per_iteration": 2.5698513984680176 }, { "auxiliary_loss_clip": 0.01071056, "auxiliary_loss_mlp": 0.01061456, "balance_loss_clip": 1.02729082, "balance_loss_mlp": 1.02061069, "epoch": 0.25071396362543213, "flos": 26101241654400.0, "grad_norm": 1.8729608955481363, "language_loss": 0.79113519, "learning_rate": 3.510374083241361e-06, "loss": 0.8124603, "num_input_tokens_seen": 89874895, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.5078125, "step": 4170, "time_per_iteration": 3.9379851818084717 }, { "auxiliary_loss_clip": 0.01073945, "auxiliary_loss_mlp": 0.01062969, "balance_loss_clip": 1.02818394, "balance_loss_mlp": 1.02061081, "epoch": 0.2507740868781001, "flos": 19097978722560.0, "grad_norm": 2.3545331759156354, "language_loss": 0.78458071, "learning_rate": 3.5101187588253368e-06, "loss": 0.80594987, "num_input_tokens_seen": 89891700, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.53125, "step": 4171, "time_per_iteration": 3.7507925033569336 }, { "auxiliary_loss_clip": 0.01019914, "auxiliary_loss_mlp": 0.01016879, "balance_loss_clip": 1.01168168, "balance_loss_mlp": 1.00511932, "epoch": 0.25083421013076806, "flos": 64338570927360.0, "grad_norm": 0.8608151312430881, "language_loss": 0.60186553, "learning_rate": 3.509863377145458e-06, "loss": 0.62223351, "num_input_tokens_seen": 89955775, "router_z_loss_clip": 0.05200195, "router_z_loss_mlp": 0.1484375, "step": 4172, "time_per_iteration": 4.454200983047485 }, { "auxiliary_loss_clip": 0.01073981, "auxiliary_loss_mlp": 0.01065744, "balance_loss_clip": 1.02962375, "balance_loss_mlp": 1.02106094, "epoch": 0.25089433338343603, "flos": 24278461148160.0, "grad_norm": 2.1862998908383515, "language_loss": 0.80328882, "learning_rate": 3.509607938211409e-06, "loss": 0.82468605, "num_input_tokens_seen": 89977150, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.52734375, "step": 4173, "time_per_iteration": 2.397447109222412 }, { "auxiliary_loss_clip": 0.01076815, "auxiliary_loss_mlp": 0.01054859, "balance_loss_clip": 1.0186193, "balance_loss_mlp": 1.0239414, "epoch": 0.250954456636104, "flos": 14720568456960.0, "grad_norm": 1.9552884421112848, "language_loss": 0.85587633, "learning_rate": 3.509352442032875e-06, "loss": 0.87719309, "num_input_tokens_seen": 89994925, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.53125, "step": 4174, "time_per_iteration": 2.3786256313323975 }, { "auxiliary_loss_clip": 0.01080544, "auxiliary_loss_mlp": 0.01056319, "balance_loss_clip": 1.02031755, "balance_loss_mlp": 1.02575529, "epoch": 0.25101457988877196, "flos": 22272491404800.0, "grad_norm": 2.2616243253516215, "language_loss": 0.72302437, "learning_rate": 3.509096888619545e-06, "loss": 0.74439299, "num_input_tokens_seen": 90013235, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.546875, "step": 4175, "time_per_iteration": 2.3921878337860107 }, { "auxiliary_loss_clip": 0.01079127, "auxiliary_loss_mlp": 0.01052396, "balance_loss_clip": 1.01546526, "balance_loss_mlp": 1.02446485, "epoch": 0.2510747031414399, "flos": 25187843998080.0, "grad_norm": 2.084760146076791, "language_loss": 0.82263529, "learning_rate": 3.50884127798111e-06, "loss": 0.84395051, "num_input_tokens_seen": 90032150, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.546875, "step": 4176, "time_per_iteration": 2.432888984680176 }, { "auxiliary_loss_clip": 0.01081941, "auxiliary_loss_mlp": 0.01051184, "balance_loss_clip": 1.01434827, "balance_loss_mlp": 1.02807522, "epoch": 0.25113482639410795, "flos": 20703145524480.0, "grad_norm": 2.1438036750250977, "language_loss": 0.85197479, "learning_rate": 3.5085856101272623e-06, "loss": 0.8733061, "num_input_tokens_seen": 90049085, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.5390625, "step": 4177, "time_per_iteration": 2.3856217861175537 }, { "auxiliary_loss_clip": 0.01085234, "auxiliary_loss_mlp": 0.01054694, "balance_loss_clip": 1.01978946, "balance_loss_mlp": 1.03099251, "epoch": 0.2511949496467759, "flos": 21505868570880.0, "grad_norm": 2.314804170315218, "language_loss": 0.84918904, "learning_rate": 3.508329885067698e-06, "loss": 0.87058842, "num_input_tokens_seen": 90067695, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.54296875, "step": 4178, "time_per_iteration": 2.4285356998443604 }, { "auxiliary_loss_clip": 0.01081923, "auxiliary_loss_mlp": 0.01052578, "balance_loss_clip": 1.01905608, "balance_loss_mlp": 1.0304656, "epoch": 0.2512550728994439, "flos": 20701015931520.0, "grad_norm": 2.326151775682438, "language_loss": 0.76979637, "learning_rate": 3.508074102812112e-06, "loss": 0.79114139, "num_input_tokens_seen": 90083890, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.515625, "step": 4179, "time_per_iteration": 2.3736279010772705 }, { "auxiliary_loss_clip": 0.01085636, "auxiliary_loss_mlp": 0.01061521, "balance_loss_clip": 1.02704549, "balance_loss_mlp": 1.03042579, "epoch": 0.25131519615211184, "flos": 18477641952000.0, "grad_norm": 2.2080230806160204, "language_loss": 0.73433203, "learning_rate": 3.507818263370206e-06, "loss": 0.75580359, "num_input_tokens_seen": 90100995, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.5546875, "step": 4180, "time_per_iteration": 2.398108959197998 }, { "auxiliary_loss_clip": 0.01084017, "auxiliary_loss_mlp": 0.01056374, "balance_loss_clip": 1.02306676, "balance_loss_mlp": 1.03106308, "epoch": 0.2513753194047798, "flos": 20483925897600.0, "grad_norm": 2.0996582655720677, "language_loss": 0.87329769, "learning_rate": 3.5075623667516796e-06, "loss": 0.8947016, "num_input_tokens_seen": 90120365, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.53125, "step": 4181, "time_per_iteration": 2.403031349182129 }, { "auxiliary_loss_clip": 0.01081248, "auxiliary_loss_mlp": 0.01061628, "balance_loss_clip": 1.02891707, "balance_loss_mlp": 1.02904284, "epoch": 0.25143544265744777, "flos": 37668560313600.0, "grad_norm": 4.843528119638757, "language_loss": 0.69405603, "learning_rate": 3.507306412966238e-06, "loss": 0.71548474, "num_input_tokens_seen": 90142610, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.5234375, "step": 4182, "time_per_iteration": 2.556149959564209 }, { "auxiliary_loss_clip": 0.01048826, "auxiliary_loss_mlp": 0.01010968, "balance_loss_clip": 1.00596154, "balance_loss_mlp": 1.03313959, "epoch": 0.25149556591011574, "flos": 69364283829120.0, "grad_norm": 0.8628053291772059, "language_loss": 0.70236641, "learning_rate": 3.5070504020235853e-06, "loss": 0.72296435, "num_input_tokens_seen": 90200555, "router_z_loss_clip": 0.05004883, "router_z_loss_mlp": 0.15625, "step": 4183, "time_per_iteration": 3.039799451828003 }, { "auxiliary_loss_clip": 0.01077612, "auxiliary_loss_mlp": 0.01063695, "balance_loss_clip": 1.02731252, "balance_loss_mlp": 1.02335048, "epoch": 0.2515556891627837, "flos": 13989557076480.0, "grad_norm": 2.286502126723049, "language_loss": 0.75399435, "learning_rate": 3.506794333933431e-06, "loss": 0.77540737, "num_input_tokens_seen": 90218120, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.54296875, "step": 4184, "time_per_iteration": 2.3901941776275635 }, { "auxiliary_loss_clip": 0.01076076, "auxiliary_loss_mlp": 0.01068189, "balance_loss_clip": 1.03306985, "balance_loss_mlp": 1.02456498, "epoch": 0.25161581241545167, "flos": 22162445199360.0, "grad_norm": 2.003941475333182, "language_loss": 0.85097128, "learning_rate": 3.506538208705484e-06, "loss": 0.87241399, "num_input_tokens_seen": 90236790, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.515625, "step": 4185, "time_per_iteration": 2.396497964859009 }, { "auxiliary_loss_clip": 0.01027217, "auxiliary_loss_mlp": 0.01044661, "balance_loss_clip": 1.03970206, "balance_loss_mlp": 1.010777, "epoch": 0.25167593566811963, "flos": 69355486166400.0, "grad_norm": 1.4341687882489893, "language_loss": 0.61573112, "learning_rate": 3.5062820263494574e-06, "loss": 0.63644981, "num_input_tokens_seen": 90297070, "router_z_loss_clip": 0.04956055, "router_z_loss_mlp": 0.1640625, "step": 4186, "time_per_iteration": 2.9387497901916504 }, { "auxiliary_loss_clip": 0.01075147, "auxiliary_loss_mlp": 0.01070569, "balance_loss_clip": 1.03490126, "balance_loss_mlp": 1.02181792, "epoch": 0.2517360589207876, "flos": 13260605466240.0, "grad_norm": 4.6285064656986155, "language_loss": 0.81753963, "learning_rate": 3.5060257868750656e-06, "loss": 0.83899677, "num_input_tokens_seen": 90315255, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.53125, "step": 4187, "time_per_iteration": 2.3818259239196777 }, { "auxiliary_loss_clip": 0.01075801, "auxiliary_loss_mlp": 0.01068053, "balance_loss_clip": 1.03388786, "balance_loss_mlp": 1.02251029, "epoch": 0.25179618217345556, "flos": 20375764905600.0, "grad_norm": 1.455657455157059, "language_loss": 0.80829567, "learning_rate": 3.5057694902920244e-06, "loss": 0.82973415, "num_input_tokens_seen": 90334990, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.53125, "step": 4188, "time_per_iteration": 2.4196341037750244 }, { "auxiliary_loss_clip": 0.01075298, "auxiliary_loss_mlp": 0.01060683, "balance_loss_clip": 1.02716136, "balance_loss_mlp": 1.02477348, "epoch": 0.25185630542612353, "flos": 27663709996800.0, "grad_norm": 2.3083509965040374, "language_loss": 0.75245166, "learning_rate": 3.5055131366100534e-06, "loss": 0.77381152, "num_input_tokens_seen": 90351825, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.50390625, "step": 4189, "time_per_iteration": 2.4350953102111816 }, { "auxiliary_loss_clip": 0.01078248, "auxiliary_loss_mlp": 0.01048704, "balance_loss_clip": 1.0193783, "balance_loss_mlp": 1.02839398, "epoch": 0.25191642867879155, "flos": 20995368537600.0, "grad_norm": 2.181154716852697, "language_loss": 0.86537719, "learning_rate": 3.5052567258388745e-06, "loss": 0.88664675, "num_input_tokens_seen": 90369860, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.49804688, "step": 4190, "time_per_iteration": 2.448875665664673 }, { "auxiliary_loss_clip": 0.01083641, "auxiliary_loss_mlp": 0.01058783, "balance_loss_clip": 1.0222578, "balance_loss_mlp": 1.03057444, "epoch": 0.2519765519314595, "flos": 21104611781760.0, "grad_norm": 2.171910836275233, "language_loss": 0.76840949, "learning_rate": 3.5050002579882082e-06, "loss": 0.78983366, "num_input_tokens_seen": 90389245, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.53125, "step": 4191, "time_per_iteration": 2.409207582473755 }, { "auxiliary_loss_clip": 0.01044479, "auxiliary_loss_mlp": 0.01047885, "balance_loss_clip": 1.04325962, "balance_loss_mlp": 1.02838373, "epoch": 0.2520366751841275, "flos": 62741503560960.0, "grad_norm": 0.7682086639001763, "language_loss": 0.57276005, "learning_rate": 3.5047437330677823e-06, "loss": 0.59368366, "num_input_tokens_seen": 90456735, "router_z_loss_clip": 0.04614258, "router_z_loss_mlp": 0.16015625, "step": 4192, "time_per_iteration": 3.1230616569519043 }, { "auxiliary_loss_clip": 0.0108982, "auxiliary_loss_mlp": 0.01051207, "balance_loss_clip": 1.01848364, "balance_loss_mlp": 1.03732848, "epoch": 0.25209679843679544, "flos": 22229792945280.0, "grad_norm": 2.001803912591678, "language_loss": 0.77661043, "learning_rate": 3.504487151087323e-06, "loss": 0.79802072, "num_input_tokens_seen": 90474165, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.5234375, "step": 4193, "time_per_iteration": 2.4132697582244873 }, { "auxiliary_loss_clip": 0.01094692, "auxiliary_loss_mlp": 0.01059853, "balance_loss_clip": 1.02366102, "balance_loss_mlp": 1.03993523, "epoch": 0.2521569216894634, "flos": 12165833963520.0, "grad_norm": 3.2985968236305334, "language_loss": 0.86110628, "learning_rate": 3.5042305120565598e-06, "loss": 0.88265175, "num_input_tokens_seen": 90491660, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.546875, "step": 4194, "time_per_iteration": 2.5010087490081787 }, { "auxiliary_loss_clip": 0.01093195, "auxiliary_loss_mlp": 0.01075285, "balance_loss_clip": 1.0414772, "balance_loss_mlp": 1.03754926, "epoch": 0.2522170449421314, "flos": 23698553598720.0, "grad_norm": 1.4335774638891599, "language_loss": 0.88889956, "learning_rate": 3.5039738159852253e-06, "loss": 0.91058433, "num_input_tokens_seen": 90514025, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.5546875, "step": 4195, "time_per_iteration": 2.485008716583252 }, { "auxiliary_loss_clip": 0.01095704, "auxiliary_loss_mlp": 0.01071143, "balance_loss_clip": 1.03442621, "balance_loss_mlp": 1.04045689, "epoch": 0.25227716819479934, "flos": 20954520380160.0, "grad_norm": 2.0259413998903066, "language_loss": 0.87445676, "learning_rate": 3.503717062883053e-06, "loss": 0.8961252, "num_input_tokens_seen": 90533530, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.5546875, "step": 4196, "time_per_iteration": 2.4339165687561035 }, { "auxiliary_loss_clip": 0.01093425, "auxiliary_loss_mlp": 0.01092019, "balance_loss_clip": 1.05749631, "balance_loss_mlp": 1.03880382, "epoch": 0.2523372914474673, "flos": 23330220088320.0, "grad_norm": 1.6780720249747834, "language_loss": 0.84735811, "learning_rate": 3.5034602527597786e-06, "loss": 0.86921251, "num_input_tokens_seen": 90554025, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.546875, "step": 4197, "time_per_iteration": 2.4338364601135254 }, { "auxiliary_loss_clip": 0.01093963, "auxiliary_loss_mlp": 0.01084517, "balance_loss_clip": 1.04698944, "balance_loss_mlp": 1.03811133, "epoch": 0.25239741470013527, "flos": 36969005934720.0, "grad_norm": 3.0854388323660116, "language_loss": 0.74601471, "learning_rate": 3.5032033856251405e-06, "loss": 0.7677995, "num_input_tokens_seen": 90576930, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.55859375, "step": 4198, "time_per_iteration": 2.5597028732299805 }, { "auxiliary_loss_clip": 0.01090204, "auxiliary_loss_mlp": 0.0108388, "balance_loss_clip": 1.04661489, "balance_loss_mlp": 1.03447425, "epoch": 0.25245753795280323, "flos": 18514754593920.0, "grad_norm": 2.2318758492946555, "language_loss": 0.787907, "learning_rate": 3.50294646148888e-06, "loss": 0.80964786, "num_input_tokens_seen": 90595710, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.55859375, "step": 4199, "time_per_iteration": 2.397740602493286 }, { "auxiliary_loss_clip": 0.01088522, "auxiliary_loss_mlp": 0.01087506, "balance_loss_clip": 1.05443776, "balance_loss_mlp": 1.03438842, "epoch": 0.2525176612054712, "flos": 32343467569920.0, "grad_norm": 1.9674569056283773, "language_loss": 0.74132341, "learning_rate": 3.502689480360739e-06, "loss": 0.7630837, "num_input_tokens_seen": 90617945, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.54296875, "step": 4200, "time_per_iteration": 2.5268242359161377 }, { "auxiliary_loss_clip": 0.01086461, "auxiliary_loss_mlp": 0.0107585, "balance_loss_clip": 1.04280555, "balance_loss_mlp": 1.03273678, "epoch": 0.25257778445813917, "flos": 45256513651200.0, "grad_norm": 1.749173320480223, "language_loss": 0.83707917, "learning_rate": 3.5024324422504616e-06, "loss": 0.8587023, "num_input_tokens_seen": 90640855, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.53515625, "step": 4201, "time_per_iteration": 2.6112546920776367 }, { "auxiliary_loss_clip": 0.01084894, "auxiliary_loss_mlp": 0.01073967, "balance_loss_clip": 1.03837109, "balance_loss_mlp": 1.02960825, "epoch": 0.25263790771080713, "flos": 23366669414400.0, "grad_norm": 1.9906087121825033, "language_loss": 0.76071811, "learning_rate": 3.5021753471677965e-06, "loss": 0.78230673, "num_input_tokens_seen": 90661350, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.5546875, "step": 4202, "time_per_iteration": 2.4609363079071045 }, { "auxiliary_loss_clip": 0.01077461, "auxiliary_loss_mlp": 0.01059709, "balance_loss_clip": 1.02556777, "balance_loss_mlp": 1.0254724, "epoch": 0.25269803096347515, "flos": 18514056366720.0, "grad_norm": 2.1953459030125866, "language_loss": 0.74941266, "learning_rate": 3.501918195122491e-06, "loss": 0.77078438, "num_input_tokens_seen": 90680540, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.51953125, "step": 4203, "time_per_iteration": 2.3874549865722656 }, { "auxiliary_loss_clip": 0.01078476, "auxiliary_loss_mlp": 0.01065233, "balance_loss_clip": 1.02894533, "balance_loss_mlp": 1.02459693, "epoch": 0.2527581542161431, "flos": 24609332903040.0, "grad_norm": 1.51174367499265, "language_loss": 0.78882349, "learning_rate": 3.501660986124297e-06, "loss": 0.81026053, "num_input_tokens_seen": 90703460, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.5390625, "step": 4204, "time_per_iteration": 2.5023257732391357 }, { "auxiliary_loss_clip": 0.01075051, "auxiliary_loss_mlp": 0.01054723, "balance_loss_clip": 1.02182126, "balance_loss_mlp": 1.02212751, "epoch": 0.2528182774688111, "flos": 12640443252480.0, "grad_norm": 2.0982503706454714, "language_loss": 0.74243271, "learning_rate": 3.5014037201829684e-06, "loss": 0.76373047, "num_input_tokens_seen": 90718815, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.53125, "step": 4205, "time_per_iteration": 2.359304666519165 }, { "auxiliary_loss_clip": 0.010724, "auxiliary_loss_mlp": 0.01047904, "balance_loss_clip": 1.01736259, "balance_loss_mlp": 1.02320468, "epoch": 0.25287840072147905, "flos": 46935032952960.0, "grad_norm": 1.6262159650039363, "language_loss": 0.77150667, "learning_rate": 3.50114639730826e-06, "loss": 0.79270971, "num_input_tokens_seen": 90742125, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.4921875, "step": 4206, "time_per_iteration": 4.134872198104858 }, { "auxiliary_loss_clip": 0.01075341, "auxiliary_loss_mlp": 0.01051361, "balance_loss_clip": 1.01514506, "balance_loss_mlp": 1.02315211, "epoch": 0.252938523974147, "flos": 18878724184320.0, "grad_norm": 1.7281346679705978, "language_loss": 0.80136329, "learning_rate": 3.5008890175099296e-06, "loss": 0.82263029, "num_input_tokens_seen": 90760785, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.5234375, "step": 4207, "time_per_iteration": 2.400001049041748 }, { "auxiliary_loss_clip": 0.01076288, "auxiliary_loss_mlp": 0.01056464, "balance_loss_clip": 1.02171493, "balance_loss_mlp": 1.02475381, "epoch": 0.252998647226815, "flos": 21433633234560.0, "grad_norm": 2.5193797022491413, "language_loss": 0.77810645, "learning_rate": 3.5006315807977375e-06, "loss": 0.79943395, "num_input_tokens_seen": 90780045, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.515625, "step": 4208, "time_per_iteration": 2.3930201530456543 }, { "auxiliary_loss_clip": 0.01074465, "auxiliary_loss_mlp": 0.01047178, "balance_loss_clip": 1.0157783, "balance_loss_mlp": 1.02351665, "epoch": 0.25305877047948294, "flos": 25441138978560.0, "grad_norm": 3.2356188731244564, "language_loss": 0.71822143, "learning_rate": 3.5003740871814456e-06, "loss": 0.73943794, "num_input_tokens_seen": 90797980, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.51171875, "step": 4209, "time_per_iteration": 2.4395506381988525 }, { "auxiliary_loss_clip": 0.01025824, "auxiliary_loss_mlp": 0.01006672, "balance_loss_clip": 1.00238085, "balance_loss_mlp": 1.01089144, "epoch": 0.2531188937321509, "flos": 60182335324800.0, "grad_norm": 0.7627831080885793, "language_loss": 0.5519464, "learning_rate": 3.5001165366708175e-06, "loss": 0.57227135, "num_input_tokens_seen": 90864865, "router_z_loss_clip": 0.04296875, "router_z_loss_mlp": 0.1484375, "step": 4210, "time_per_iteration": 4.598435640335083 }, { "auxiliary_loss_clip": 0.01080071, "auxiliary_loss_mlp": 0.01050382, "balance_loss_clip": 1.01829123, "balance_loss_mlp": 1.02632594, "epoch": 0.25317901698481887, "flos": 19681377408000.0, "grad_norm": 2.2624597791804386, "language_loss": 0.81174088, "learning_rate": 3.4998589292756204e-06, "loss": 0.83304536, "num_input_tokens_seen": 90882885, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.5390625, "step": 4211, "time_per_iteration": 3.7780921459198 }, { "auxiliary_loss_clip": 0.01079596, "auxiliary_loss_mlp": 0.01052049, "balance_loss_clip": 1.01995826, "balance_loss_mlp": 1.02891338, "epoch": 0.25323914023748684, "flos": 24423246023040.0, "grad_norm": 2.5516090590190235, "language_loss": 0.79461837, "learning_rate": 3.499601265005622e-06, "loss": 0.8159349, "num_input_tokens_seen": 90902985, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.5078125, "step": 4212, "time_per_iteration": 3.8367197513580322 }, { "auxiliary_loss_clip": 0.01079682, "auxiliary_loss_mlp": 0.01052, "balance_loss_clip": 1.01683331, "balance_loss_mlp": 1.02612257, "epoch": 0.2532992634901548, "flos": 25446270948480.0, "grad_norm": 8.9593650125193, "language_loss": 0.55548352, "learning_rate": 3.4993435438705938e-06, "loss": 0.57680035, "num_input_tokens_seen": 90923550, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.5390625, "step": 4213, "time_per_iteration": 2.430173397064209 }, { "auxiliary_loss_clip": 0.01080654, "auxiliary_loss_mlp": 0.01054722, "balance_loss_clip": 1.01762366, "balance_loss_mlp": 1.02645326, "epoch": 0.25335938674282277, "flos": 18879527145600.0, "grad_norm": 2.5672288647055144, "language_loss": 0.66811097, "learning_rate": 3.499085765880308e-06, "loss": 0.68946475, "num_input_tokens_seen": 90943260, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.54296875, "step": 4214, "time_per_iteration": 2.4070982933044434 }, { "auxiliary_loss_clip": 0.01024137, "auxiliary_loss_mlp": 0.01007202, "balance_loss_clip": 1.00310123, "balance_loss_mlp": 1.00936604, "epoch": 0.25341950999549073, "flos": 53059809588480.0, "grad_norm": 0.8368001895756312, "language_loss": 0.58142912, "learning_rate": 3.4988279310445396e-06, "loss": 0.60174251, "num_input_tokens_seen": 90996295, "router_z_loss_clip": 0.04101562, "router_z_loss_mlp": 0.1484375, "step": 4215, "time_per_iteration": 2.8031153678894043 }, { "auxiliary_loss_clip": 0.01077465, "auxiliary_loss_mlp": 0.01052948, "balance_loss_clip": 1.01639807, "balance_loss_mlp": 1.02462792, "epoch": 0.2534796332481587, "flos": 39018686567040.0, "grad_norm": 2.0872360687700238, "language_loss": 0.84890956, "learning_rate": 3.498570039373066e-06, "loss": 0.87021369, "num_input_tokens_seen": 91017545, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.52734375, "step": 4216, "time_per_iteration": 2.538862705230713 }, { "auxiliary_loss_clip": 0.01079177, "auxiliary_loss_mlp": 0.01053292, "balance_loss_clip": 1.01750529, "balance_loss_mlp": 1.02618313, "epoch": 0.2535397565008267, "flos": 23585854129920.0, "grad_norm": 2.388306225907289, "language_loss": 0.81676215, "learning_rate": 3.498312090875666e-06, "loss": 0.83808684, "num_input_tokens_seen": 91037715, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.53125, "step": 4217, "time_per_iteration": 2.427696704864502 }, { "auxiliary_loss_clip": 0.01076765, "auxiliary_loss_mlp": 0.01050875, "balance_loss_clip": 1.01473069, "balance_loss_mlp": 1.02397919, "epoch": 0.2535998797534947, "flos": 19280364998400.0, "grad_norm": 2.4135125028091076, "language_loss": 0.77307284, "learning_rate": 3.4980540855621218e-06, "loss": 0.79434919, "num_input_tokens_seen": 91055295, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.52734375, "step": 4218, "time_per_iteration": 2.3871657848358154 }, { "auxiliary_loss_clip": 0.01077302, "auxiliary_loss_mlp": 0.01051308, "balance_loss_clip": 1.01611733, "balance_loss_mlp": 1.02251899, "epoch": 0.25366000300616265, "flos": 24023246042880.0, "grad_norm": 2.2065231473986207, "language_loss": 0.7571975, "learning_rate": 3.4977960234422167e-06, "loss": 0.77848351, "num_input_tokens_seen": 91075485, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.546875, "step": 4219, "time_per_iteration": 2.406975507736206 }, { "auxiliary_loss_clip": 0.01078842, "auxiliary_loss_mlp": 0.01056208, "balance_loss_clip": 1.02051711, "balance_loss_mlp": 1.02452946, "epoch": 0.2537201262588306, "flos": 16288448060160.0, "grad_norm": 2.3959429438860083, "language_loss": 0.83107066, "learning_rate": 3.497537904525736e-06, "loss": 0.85242116, "num_input_tokens_seen": 91093620, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.54296875, "step": 4220, "time_per_iteration": 2.3811001777648926 }, { "auxiliary_loss_clip": 0.01078409, "auxiliary_loss_mlp": 0.01051849, "balance_loss_clip": 1.01527548, "balance_loss_mlp": 1.02407897, "epoch": 0.2537802495114986, "flos": 23293561294080.0, "grad_norm": 2.112156310622191, "language_loss": 0.72528195, "learning_rate": 3.497279728822468e-06, "loss": 0.74658453, "num_input_tokens_seen": 91114110, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.54296875, "step": 4221, "time_per_iteration": 2.410367250442505 }, { "auxiliary_loss_clip": 0.01076133, "auxiliary_loss_mlp": 0.01054893, "balance_loss_clip": 1.01696074, "balance_loss_mlp": 1.02340126, "epoch": 0.25384037276416654, "flos": 17638190288640.0, "grad_norm": 2.2467507118204004, "language_loss": 0.63598317, "learning_rate": 3.497021496342202e-06, "loss": 0.65729344, "num_input_tokens_seen": 91133135, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.52734375, "step": 4222, "time_per_iteration": 2.3821916580200195 }, { "auxiliary_loss_clip": 0.01078024, "auxiliary_loss_mlp": 0.01060016, "balance_loss_clip": 1.02043819, "balance_loss_mlp": 1.02270341, "epoch": 0.2539004960168345, "flos": 21505973304960.0, "grad_norm": 1.7966814899444752, "language_loss": 0.76058519, "learning_rate": 3.496763207094731e-06, "loss": 0.78196555, "num_input_tokens_seen": 91151805, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.5546875, "step": 4223, "time_per_iteration": 2.377962350845337 }, { "auxiliary_loss_clip": 0.01074196, "auxiliary_loss_mlp": 0.01055211, "balance_loss_clip": 1.02030659, "balance_loss_mlp": 1.02262318, "epoch": 0.2539606192695025, "flos": 23949788808960.0, "grad_norm": 1.8184141238450242, "language_loss": 0.81672072, "learning_rate": 3.49650486108985e-06, "loss": 0.83801478, "num_input_tokens_seen": 91172270, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.515625, "step": 4224, "time_per_iteration": 2.459200382232666 }, { "auxiliary_loss_clip": 0.01073418, "auxiliary_loss_mlp": 0.0105391, "balance_loss_clip": 1.01979196, "balance_loss_mlp": 1.0220263, "epoch": 0.25402074252217044, "flos": 24168659322240.0, "grad_norm": 1.605796869984486, "language_loss": 0.78215897, "learning_rate": 3.496246458337354e-06, "loss": 0.80343223, "num_input_tokens_seen": 91192080, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.515625, "step": 4225, "time_per_iteration": 2.418851852416992 }, { "auxiliary_loss_clip": 0.01075765, "auxiliary_loss_mlp": 0.01066223, "balance_loss_clip": 1.02938747, "balance_loss_mlp": 1.02152991, "epoch": 0.2540808657748384, "flos": 22302831242880.0, "grad_norm": 1.6278799069999246, "language_loss": 0.86017859, "learning_rate": 3.4959879988470426e-06, "loss": 0.88159847, "num_input_tokens_seen": 91211450, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.54296875, "step": 4226, "time_per_iteration": 2.416271924972534 }, { "auxiliary_loss_clip": 0.01073384, "auxiliary_loss_mlp": 0.01061288, "balance_loss_clip": 1.02473795, "balance_loss_mlp": 1.02136099, "epoch": 0.25414098902750637, "flos": 27598317287040.0, "grad_norm": 1.818038049451049, "language_loss": 0.72167826, "learning_rate": 3.4957294826287164e-06, "loss": 0.74302495, "num_input_tokens_seen": 91231835, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.51953125, "step": 4227, "time_per_iteration": 2.4422335624694824 }, { "auxiliary_loss_clip": 0.01028033, "auxiliary_loss_mlp": 0.01022101, "balance_loss_clip": 1.01780951, "balance_loss_mlp": 1.01336694, "epoch": 0.25420111228017434, "flos": 58167847209600.0, "grad_norm": 0.9872389784882549, "language_loss": 0.61832708, "learning_rate": 3.4954709096921785e-06, "loss": 0.6388284, "num_input_tokens_seen": 91288755, "router_z_loss_clip": 0.04296875, "router_z_loss_mlp": 0.14648438, "step": 4228, "time_per_iteration": 2.8637194633483887 }, { "auxiliary_loss_clip": 0.01075, "auxiliary_loss_mlp": 0.01057377, "balance_loss_clip": 1.01965928, "balance_loss_mlp": 1.02197385, "epoch": 0.2542612355328423, "flos": 11463870700800.0, "grad_norm": 2.6015135021238214, "language_loss": 0.88437462, "learning_rate": 3.4952122800472336e-06, "loss": 0.90569836, "num_input_tokens_seen": 91302485, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.53125, "step": 4229, "time_per_iteration": 2.3503832817077637 }, { "auxiliary_loss_clip": 0.01076023, "auxiliary_loss_mlp": 0.01059523, "balance_loss_clip": 1.02023208, "balance_loss_mlp": 1.02237415, "epoch": 0.2543213587855103, "flos": 22964784220800.0, "grad_norm": 2.6268718272387352, "language_loss": 0.78569907, "learning_rate": 3.4949535937036892e-06, "loss": 0.80705452, "num_input_tokens_seen": 91321120, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.5390625, "step": 4230, "time_per_iteration": 2.4031782150268555 }, { "auxiliary_loss_clip": 0.01073883, "auxiliary_loss_mlp": 0.01052579, "balance_loss_clip": 1.01614916, "balance_loss_mlp": 1.02172852, "epoch": 0.2543814820381783, "flos": 18252382659840.0, "grad_norm": 2.236641616598609, "language_loss": 0.77245843, "learning_rate": 3.4946948506713544e-06, "loss": 0.79372311, "num_input_tokens_seen": 91338575, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.51953125, "step": 4231, "time_per_iteration": 2.3790180683135986 }, { "auxiliary_loss_clip": 0.01073682, "auxiliary_loss_mlp": 0.01053055, "balance_loss_clip": 1.0175786, "balance_loss_mlp": 1.02079129, "epoch": 0.25444160529084625, "flos": 15631801608960.0, "grad_norm": 2.8088845554065474, "language_loss": 0.75599998, "learning_rate": 3.4944360509600416e-06, "loss": 0.77726734, "num_input_tokens_seen": 91357355, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.52734375, "step": 4232, "time_per_iteration": 2.3696837425231934 }, { "auxiliary_loss_clip": 0.01074432, "auxiliary_loss_mlp": 0.01055277, "balance_loss_clip": 1.01763058, "balance_loss_mlp": 1.02224135, "epoch": 0.2545017285435142, "flos": 24600639974400.0, "grad_norm": 2.339636349626308, "language_loss": 0.87682462, "learning_rate": 3.4941771945795637e-06, "loss": 0.89812171, "num_input_tokens_seen": 91376515, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5234375, "step": 4233, "time_per_iteration": 2.4111387729644775 }, { "auxiliary_loss_clip": 0.01070348, "auxiliary_loss_mlp": 0.01049752, "balance_loss_clip": 1.01806605, "balance_loss_mlp": 1.02030301, "epoch": 0.2545618517961822, "flos": 24677972369280.0, "grad_norm": 1.8053586146770977, "language_loss": 0.75730693, "learning_rate": 3.493918281539737e-06, "loss": 0.77850789, "num_input_tokens_seen": 91397595, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.5, "step": 4234, "time_per_iteration": 2.4393951892852783 }, { "auxiliary_loss_clip": 0.01074865, "auxiliary_loss_mlp": 0.01051418, "balance_loss_clip": 1.0175631, "balance_loss_mlp": 1.02133608, "epoch": 0.25462197504885015, "flos": 23913967887360.0, "grad_norm": 1.5268444188308696, "language_loss": 0.76347482, "learning_rate": 3.493659311850379e-06, "loss": 0.78473771, "num_input_tokens_seen": 91417775, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.53515625, "step": 4235, "time_per_iteration": 2.4027724266052246 }, { "auxiliary_loss_clip": 0.01079846, "auxiliary_loss_mlp": 0.01058806, "balance_loss_clip": 1.01803708, "balance_loss_mlp": 1.02221537, "epoch": 0.2546820983015181, "flos": 24788262954240.0, "grad_norm": 2.3929245568302573, "language_loss": 0.66879594, "learning_rate": 3.4934002855213106e-06, "loss": 0.69018245, "num_input_tokens_seen": 91437665, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.57421875, "step": 4236, "time_per_iteration": 2.428398609161377 }, { "auxiliary_loss_clip": 0.01072863, "auxiliary_loss_mlp": 0.0104618, "balance_loss_clip": 1.01518595, "balance_loss_mlp": 1.02180004, "epoch": 0.2547422215541861, "flos": 18733136348160.0, "grad_norm": 1.611765520439152, "language_loss": 0.68003571, "learning_rate": 3.493141202562354e-06, "loss": 0.70122612, "num_input_tokens_seen": 91456705, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.51171875, "step": 4237, "time_per_iteration": 2.3625876903533936 }, { "auxiliary_loss_clip": 0.01073095, "auxiliary_loss_mlp": 0.01054144, "balance_loss_clip": 1.01942992, "balance_loss_mlp": 1.02093077, "epoch": 0.25480234480685404, "flos": 21031398927360.0, "grad_norm": 2.0056956362019123, "language_loss": 0.76623166, "learning_rate": 3.492882062983333e-06, "loss": 0.78750402, "num_input_tokens_seen": 91475535, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.5234375, "step": 4238, "time_per_iteration": 2.4061968326568604 }, { "auxiliary_loss_clip": 0.01073569, "auxiliary_loss_mlp": 0.01049756, "balance_loss_clip": 1.01389802, "balance_loss_mlp": 1.02122903, "epoch": 0.254862468059522, "flos": 25081009637760.0, "grad_norm": 2.9001407981046787, "language_loss": 0.8179217, "learning_rate": 3.492622866794074e-06, "loss": 0.83915496, "num_input_tokens_seen": 91499140, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.5234375, "step": 4239, "time_per_iteration": 2.4281363487243652 }, { "auxiliary_loss_clip": 0.01073132, "auxiliary_loss_mlp": 0.01044332, "balance_loss_clip": 1.01187181, "balance_loss_mlp": 1.02209997, "epoch": 0.25492259131219, "flos": 20557348220160.0, "grad_norm": 1.9015545876714797, "language_loss": 0.79197937, "learning_rate": 3.492363614004407e-06, "loss": 0.81315398, "num_input_tokens_seen": 91518335, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.51171875, "step": 4240, "time_per_iteration": 2.3879575729370117 }, { "auxiliary_loss_clip": 0.01076192, "auxiliary_loss_mlp": 0.01052943, "balance_loss_clip": 1.01491547, "balance_loss_mlp": 1.02124119, "epoch": 0.25498271456485794, "flos": 25041418289280.0, "grad_norm": 1.8488801357307783, "language_loss": 0.84481716, "learning_rate": 3.492104304624162e-06, "loss": 0.86610842, "num_input_tokens_seen": 91537655, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.55078125, "step": 4241, "time_per_iteration": 2.3948469161987305 }, { "auxiliary_loss_clip": 0.01074975, "auxiliary_loss_mlp": 0.01057039, "balance_loss_clip": 1.02191997, "balance_loss_mlp": 1.02221251, "epoch": 0.2550428378175259, "flos": 26177177594880.0, "grad_norm": 3.899051064071679, "language_loss": 0.74997211, "learning_rate": 3.4918449386631725e-06, "loss": 0.77129221, "num_input_tokens_seen": 91557545, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.52734375, "step": 4242, "time_per_iteration": 2.4260313510894775 }, { "auxiliary_loss_clip": 0.01073218, "auxiliary_loss_mlp": 0.01051268, "balance_loss_clip": 1.01785398, "balance_loss_mlp": 1.02031958, "epoch": 0.2551029610701939, "flos": 15266295918720.0, "grad_norm": 2.6260894762084375, "language_loss": 0.75479132, "learning_rate": 3.491585516131273e-06, "loss": 0.7760362, "num_input_tokens_seen": 91574405, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.53125, "step": 4243, "time_per_iteration": 2.33795166015625 }, { "auxiliary_loss_clip": 0.01071864, "auxiliary_loss_mlp": 0.01050879, "balance_loss_clip": 1.01716638, "balance_loss_mlp": 1.01989067, "epoch": 0.2551630843228619, "flos": 18111263477760.0, "grad_norm": 1.6914865599684927, "language_loss": 0.82899624, "learning_rate": 3.491326037038301e-06, "loss": 0.85022366, "num_input_tokens_seen": 91593755, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.51953125, "step": 4244, "time_per_iteration": 2.376638412475586 }, { "auxiliary_loss_clip": 0.01024, "auxiliary_loss_mlp": 0.01031852, "balance_loss_clip": 1.02717936, "balance_loss_mlp": 1.00986123, "epoch": 0.25522320757552985, "flos": 70516381651200.0, "grad_norm": 0.703523570983836, "language_loss": 0.57700312, "learning_rate": 3.4910665013940967e-06, "loss": 0.5975616, "num_input_tokens_seen": 91660335, "router_z_loss_clip": 0.04663086, "router_z_loss_mlp": 0.14160156, "step": 4245, "time_per_iteration": 3.171160936355591 }, { "auxiliary_loss_clip": 0.01074919, "auxiliary_loss_mlp": 0.01059321, "balance_loss_clip": 1.022843, "balance_loss_mlp": 1.02157497, "epoch": 0.2552833308281978, "flos": 22891990302720.0, "grad_norm": 2.2263416549745956, "language_loss": 0.67205805, "learning_rate": 3.4908069092085015e-06, "loss": 0.69340044, "num_input_tokens_seen": 91678500, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.53125, "step": 4246, "time_per_iteration": 3.8074207305908203 }, { "auxiliary_loss_clip": 0.0107087, "auxiliary_loss_mlp": 0.010523, "balance_loss_clip": 1.0207696, "balance_loss_mlp": 1.02052116, "epoch": 0.2553434540808658, "flos": 22052538639360.0, "grad_norm": 1.806780632926359, "language_loss": 0.82441473, "learning_rate": 3.4905472604913585e-06, "loss": 0.8456465, "num_input_tokens_seen": 91696430, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.50390625, "step": 4247, "time_per_iteration": 2.3782925605773926 }, { "auxiliary_loss_clip": 0.01078697, "auxiliary_loss_mlp": 0.01063497, "balance_loss_clip": 1.02120209, "balance_loss_mlp": 1.02155399, "epoch": 0.25540357733353375, "flos": 16543279140480.0, "grad_norm": 2.4428008515100044, "language_loss": 0.85376298, "learning_rate": 3.490287555252514e-06, "loss": 0.87518489, "num_input_tokens_seen": 91713270, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 0.5703125, "step": 4248, "time_per_iteration": 2.3580310344696045 }, { "auxiliary_loss_clip": 0.01075163, "auxiliary_loss_mlp": 0.01050073, "balance_loss_clip": 1.01609826, "balance_loss_mlp": 1.02235413, "epoch": 0.2554637005862017, "flos": 17564104650240.0, "grad_norm": 2.3830630995979254, "language_loss": 0.85323852, "learning_rate": 3.4900277935018166e-06, "loss": 0.87449086, "num_input_tokens_seen": 91728865, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.52734375, "step": 4249, "time_per_iteration": 2.356660842895508 }, { "auxiliary_loss_clip": 0.01020356, "auxiliary_loss_mlp": 0.01010562, "balance_loss_clip": 1.00612736, "balance_loss_mlp": 1.00685477, "epoch": 0.2555238238388697, "flos": 72241650996480.0, "grad_norm": 0.7670595583513616, "language_loss": 0.56323379, "learning_rate": 3.489767975249115e-06, "loss": 0.58354294, "num_input_tokens_seen": 91787470, "router_z_loss_clip": 0.04443359, "router_z_loss_mlp": 0.13476562, "step": 4250, "time_per_iteration": 5.815124273300171 }, { "auxiliary_loss_clip": 0.01075943, "auxiliary_loss_mlp": 0.01053674, "balance_loss_clip": 1.01767266, "balance_loss_mlp": 1.02132988, "epoch": 0.25558394709153764, "flos": 24388262974080.0, "grad_norm": 3.308284214456833, "language_loss": 0.82193714, "learning_rate": 3.4895081005042632e-06, "loss": 0.84323335, "num_input_tokens_seen": 91805640, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.546875, "step": 4251, "time_per_iteration": 3.852959632873535 }, { "auxiliary_loss_clip": 0.01018219, "auxiliary_loss_mlp": 0.01016391, "balance_loss_clip": 1.01155066, "balance_loss_mlp": 1.00466299, "epoch": 0.2556440703442056, "flos": 69227772946560.0, "grad_norm": 0.8050938264999051, "language_loss": 0.66093105, "learning_rate": 3.4892481692771146e-06, "loss": 0.68127716, "num_input_tokens_seen": 91869695, "router_z_loss_clip": 0.04833984, "router_z_loss_mlp": 0.13574219, "step": 4252, "time_per_iteration": 3.0473873615264893 }, { "auxiliary_loss_clip": 0.01075939, "auxiliary_loss_mlp": 0.01057233, "balance_loss_clip": 1.02540445, "balance_loss_mlp": 1.02315128, "epoch": 0.2557041935968736, "flos": 24862732617600.0, "grad_norm": 1.975591373014621, "language_loss": 0.75268579, "learning_rate": 3.4889881815775267e-06, "loss": 0.77401751, "num_input_tokens_seen": 91889920, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.52734375, "step": 4253, "time_per_iteration": 2.439805030822754 }, { "auxiliary_loss_clip": 0.01079401, "auxiliary_loss_mlp": 0.0106021, "balance_loss_clip": 1.02408922, "balance_loss_mlp": 1.02637506, "epoch": 0.25576431684954154, "flos": 22491012804480.0, "grad_norm": 2.0975399868434788, "language_loss": 0.73943925, "learning_rate": 3.488728137415357e-06, "loss": 0.76083541, "num_input_tokens_seen": 91908665, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.53125, "step": 4254, "time_per_iteration": 2.421982765197754 }, { "auxiliary_loss_clip": 0.01082292, "auxiliary_loss_mlp": 0.01053551, "balance_loss_clip": 1.01571441, "balance_loss_mlp": 1.02788973, "epoch": 0.2558244401022095, "flos": 19825778257920.0, "grad_norm": 1.8300949877272896, "language_loss": 0.82018912, "learning_rate": 3.4884680368004675e-06, "loss": 0.84154749, "num_input_tokens_seen": 91927855, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.54296875, "step": 4255, "time_per_iteration": 2.3822181224823 }, { "auxiliary_loss_clip": 0.01083759, "auxiliary_loss_mlp": 0.01057442, "balance_loss_clip": 1.02201259, "balance_loss_mlp": 1.03045225, "epoch": 0.2558845633548775, "flos": 23219405832960.0, "grad_norm": 2.5111253924531516, "language_loss": 0.86731452, "learning_rate": 3.488207879742721e-06, "loss": 0.88872647, "num_input_tokens_seen": 91948500, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.53515625, "step": 4256, "time_per_iteration": 2.4200334548950195 }, { "auxiliary_loss_clip": 0.010839, "auxiliary_loss_mlp": 0.01061762, "balance_loss_clip": 1.02139759, "balance_loss_mlp": 1.02821481, "epoch": 0.2559446866075455, "flos": 16836898608000.0, "grad_norm": 1.8178068337876152, "language_loss": 0.76714194, "learning_rate": 3.4879476662519826e-06, "loss": 0.78859854, "num_input_tokens_seen": 91968375, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.5546875, "step": 4257, "time_per_iteration": 2.434225082397461 }, { "auxiliary_loss_clip": 0.01027668, "auxiliary_loss_mlp": 0.01031387, "balance_loss_clip": 1.0271672, "balance_loss_mlp": 1.01349211, "epoch": 0.25600480986021346, "flos": 57590627834880.0, "grad_norm": 0.8153965182966899, "language_loss": 0.65376866, "learning_rate": 3.4876873963381196e-06, "loss": 0.6743592, "num_input_tokens_seen": 92028490, "router_z_loss_clip": 0.04223633, "router_z_loss_mlp": 0.14160156, "step": 4258, "time_per_iteration": 2.953946113586426 }, { "auxiliary_loss_clip": 0.01080788, "auxiliary_loss_mlp": 0.01049387, "balance_loss_clip": 1.01574588, "balance_loss_mlp": 1.02996993, "epoch": 0.2560649331128814, "flos": 27818270052480.0, "grad_norm": 2.015320111075788, "language_loss": 0.78186166, "learning_rate": 3.4874270700110013e-06, "loss": 0.80316341, "num_input_tokens_seen": 92048060, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.5078125, "step": 4259, "time_per_iteration": 2.456165075302124 }, { "auxiliary_loss_clip": 0.01029476, "auxiliary_loss_mlp": 0.01030862, "balance_loss_clip": 1.02606964, "balance_loss_mlp": 1.01584721, "epoch": 0.2561250563655494, "flos": 70946896026240.0, "grad_norm": 0.7905459937992877, "language_loss": 0.58512831, "learning_rate": 3.4871666872804994e-06, "loss": 0.60573167, "num_input_tokens_seen": 92118180, "router_z_loss_clip": 0.04785156, "router_z_loss_mlp": 0.13671875, "step": 4260, "time_per_iteration": 3.121680736541748 }, { "auxiliary_loss_clip": 0.01082973, "auxiliary_loss_mlp": 0.01059966, "balance_loss_clip": 1.02551413, "balance_loss_mlp": 1.02874196, "epoch": 0.25618517961821735, "flos": 27011217997440.0, "grad_norm": 1.9425785878902948, "language_loss": 0.78015232, "learning_rate": 3.4869062481564875e-06, "loss": 0.80158162, "num_input_tokens_seen": 92137570, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.54296875, "step": 4261, "time_per_iteration": 2.455806255340576 }, { "auxiliary_loss_clip": 0.0107899, "auxiliary_loss_mlp": 0.01057792, "balance_loss_clip": 1.02260137, "balance_loss_mlp": 1.02658391, "epoch": 0.2562453028708853, "flos": 23067394306560.0, "grad_norm": 1.9787884227091022, "language_loss": 0.84256053, "learning_rate": 3.486645752648842e-06, "loss": 0.86392844, "num_input_tokens_seen": 92157625, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.5234375, "step": 4262, "time_per_iteration": 2.4539594650268555 }, { "auxiliary_loss_clip": 0.01082447, "auxiliary_loss_mlp": 0.0107765, "balance_loss_clip": 1.03573573, "balance_loss_mlp": 1.02535319, "epoch": 0.2563054261235533, "flos": 15120079678080.0, "grad_norm": 2.1563108248189176, "language_loss": 0.75761569, "learning_rate": 3.4863852007674405e-06, "loss": 0.77921665, "num_input_tokens_seen": 92175350, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 0.5703125, "step": 4263, "time_per_iteration": 2.4302217960357666 }, { "auxiliary_loss_clip": 0.01079607, "auxiliary_loss_mlp": 0.01071016, "balance_loss_clip": 1.03434718, "balance_loss_mlp": 1.02694726, "epoch": 0.25636554937622125, "flos": 27853637126400.0, "grad_norm": 1.831008434595662, "language_loss": 0.83279991, "learning_rate": 3.486124592522163e-06, "loss": 0.8543061, "num_input_tokens_seen": 92196070, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.52734375, "step": 4264, "time_per_iteration": 2.4578843116760254 }, { "auxiliary_loss_clip": 0.01078659, "auxiliary_loss_mlp": 0.01082939, "balance_loss_clip": 1.04421949, "balance_loss_mlp": 1.02547669, "epoch": 0.2564256726288892, "flos": 28905430878720.0, "grad_norm": 29.95908713009559, "language_loss": 0.76675332, "learning_rate": 3.4858639279228924e-06, "loss": 0.7883693, "num_input_tokens_seen": 92216310, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.53125, "step": 4265, "time_per_iteration": 2.481652021408081 }, { "auxiliary_loss_clip": 0.01074275, "auxiliary_loss_mlp": 0.01073804, "balance_loss_clip": 1.04161799, "balance_loss_mlp": 1.02186966, "epoch": 0.2564857958815572, "flos": 18513951632640.0, "grad_norm": 1.8090801899114544, "language_loss": 0.83472002, "learning_rate": 3.485603206979513e-06, "loss": 0.85620081, "num_input_tokens_seen": 92234510, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.5234375, "step": 4266, "time_per_iteration": 2.4632465839385986 }, { "auxiliary_loss_clip": 0.01073356, "auxiliary_loss_mlp": 0.01068437, "balance_loss_clip": 1.0355587, "balance_loss_mlp": 1.02273524, "epoch": 0.25654591913422514, "flos": 25807203250560.0, "grad_norm": 1.6783862094478106, "language_loss": 0.80796039, "learning_rate": 3.4853424297019103e-06, "loss": 0.82937831, "num_input_tokens_seen": 92254070, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.5078125, "step": 4267, "time_per_iteration": 2.4630885124206543 }, { "auxiliary_loss_clip": 0.01073657, "auxiliary_loss_mlp": 0.01079358, "balance_loss_clip": 1.04173529, "balance_loss_mlp": 1.02242923, "epoch": 0.2566060423868931, "flos": 19098642038400.0, "grad_norm": 1.783321124905324, "language_loss": 0.80595112, "learning_rate": 3.4850815960999736e-06, "loss": 0.82748127, "num_input_tokens_seen": 92275060, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.515625, "step": 4268, "time_per_iteration": 2.453575372695923 }, { "auxiliary_loss_clip": 0.01077456, "auxiliary_loss_mlp": 0.01083816, "balance_loss_clip": 1.04819655, "balance_loss_mlp": 1.02462232, "epoch": 0.25666616563956113, "flos": 23841523082880.0, "grad_norm": 2.799405429202468, "language_loss": 0.69599712, "learning_rate": 3.484820706183595e-06, "loss": 0.71760982, "num_input_tokens_seen": 92293610, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.52734375, "step": 4269, "time_per_iteration": 2.4554498195648193 }, { "auxiliary_loss_clip": 0.01081965, "auxiliary_loss_mlp": 0.01072945, "balance_loss_clip": 1.03520322, "balance_loss_mlp": 1.02710819, "epoch": 0.2567262888922291, "flos": 14603574890880.0, "grad_norm": 3.2057594178216293, "language_loss": 0.82185954, "learning_rate": 3.484559759962666e-06, "loss": 0.84340858, "num_input_tokens_seen": 92308305, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.55078125, "step": 4270, "time_per_iteration": 2.4059133529663086 }, { "auxiliary_loss_clip": 0.01086131, "auxiliary_loss_mlp": 0.0107765, "balance_loss_clip": 1.0361414, "balance_loss_mlp": 1.02836537, "epoch": 0.25678641214489706, "flos": 32921839019520.0, "grad_norm": 4.0736051116778755, "language_loss": 0.70268995, "learning_rate": 3.4842987574470816e-06, "loss": 0.72432774, "num_input_tokens_seen": 92329875, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.578125, "step": 4271, "time_per_iteration": 2.5596084594726562 }, { "auxiliary_loss_clip": 0.01080911, "auxiliary_loss_mlp": 0.0106466, "balance_loss_clip": 1.02901649, "balance_loss_mlp": 1.02565157, "epoch": 0.256846535397565, "flos": 24097750617600.0, "grad_norm": 1.5163567847736132, "language_loss": 0.88257134, "learning_rate": 3.4840376986467403e-06, "loss": 0.90402704, "num_input_tokens_seen": 92348780, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.5546875, "step": 4272, "time_per_iteration": 2.467536687850952 }, { "auxiliary_loss_clip": 0.01085235, "auxiliary_loss_mlp": 0.01057126, "balance_loss_clip": 1.02014732, "balance_loss_mlp": 1.0310595, "epoch": 0.256906658650233, "flos": 19717442709120.0, "grad_norm": 1.9756020959708118, "language_loss": 0.83407664, "learning_rate": 3.483776583571541e-06, "loss": 0.85550022, "num_input_tokens_seen": 92368175, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.5390625, "step": 4273, "time_per_iteration": 2.3916962146759033 }, { "auxiliary_loss_clip": 0.0108074, "auxiliary_loss_mlp": 0.01050553, "balance_loss_clip": 1.0176754, "balance_loss_mlp": 1.02970731, "epoch": 0.25696678190290095, "flos": 22925018315520.0, "grad_norm": 1.5446316796228075, "language_loss": 0.78278291, "learning_rate": 3.4835154122313846e-06, "loss": 0.80409586, "num_input_tokens_seen": 92387755, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.51171875, "step": 4274, "time_per_iteration": 2.4484987258911133 }, { "auxiliary_loss_clip": 0.01081847, "auxiliary_loss_mlp": 0.01048253, "balance_loss_clip": 1.01427817, "balance_loss_mlp": 1.03109252, "epoch": 0.2570269051555689, "flos": 27306617944320.0, "grad_norm": 1.708233099023805, "language_loss": 0.85230851, "learning_rate": 3.4832541846361743e-06, "loss": 0.87360954, "num_input_tokens_seen": 92409850, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.5078125, "step": 4275, "time_per_iteration": 2.4452574253082275 }, { "auxiliary_loss_clip": 0.01085576, "auxiliary_loss_mlp": 0.01052944, "balance_loss_clip": 1.01587033, "balance_loss_mlp": 1.0306251, "epoch": 0.2570870284082369, "flos": 27562182163200.0, "grad_norm": 1.9089706259764554, "language_loss": 0.80175793, "learning_rate": 3.4829929007958175e-06, "loss": 0.82314312, "num_input_tokens_seen": 92431250, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.546875, "step": 4276, "time_per_iteration": 2.4899728298187256 }, { "auxiliary_loss_clip": 0.01085685, "auxiliary_loss_mlp": 0.010584, "balance_loss_clip": 1.0233283, "balance_loss_mlp": 1.03219938, "epoch": 0.25714715166090485, "flos": 28729573027200.0, "grad_norm": 1.697217589466997, "language_loss": 0.81146967, "learning_rate": 3.4827315607202214e-06, "loss": 0.83291054, "num_input_tokens_seen": 92452065, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.53515625, "step": 4277, "time_per_iteration": 2.4556496143341064 }, { "auxiliary_loss_clip": 0.01082347, "auxiliary_loss_mlp": 0.01056501, "balance_loss_clip": 1.02250218, "balance_loss_mlp": 1.02949142, "epoch": 0.2572072749135728, "flos": 20115243273600.0, "grad_norm": 2.023292266868303, "language_loss": 0.80322701, "learning_rate": 3.482470164419295e-06, "loss": 0.82461548, "num_input_tokens_seen": 92470025, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.52734375, "step": 4278, "time_per_iteration": 2.4086005687713623 }, { "auxiliary_loss_clip": 0.01088526, "auxiliary_loss_mlp": 0.0105833, "balance_loss_clip": 1.02495158, "balance_loss_mlp": 1.03265941, "epoch": 0.2572673981662408, "flos": 26029669633920.0, "grad_norm": 1.7604429142794675, "language_loss": 0.76211679, "learning_rate": 3.482208711902952e-06, "loss": 0.78358543, "num_input_tokens_seen": 92489825, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.55859375, "step": 4279, "time_per_iteration": 2.455392837524414 }, { "auxiliary_loss_clip": 0.01084845, "auxiliary_loss_mlp": 0.01063502, "balance_loss_clip": 1.02843034, "balance_loss_mlp": 1.02919197, "epoch": 0.25732752141890874, "flos": 16105712670720.0, "grad_norm": 2.1840913083891245, "language_loss": 0.88097709, "learning_rate": 3.4819472031811065e-06, "loss": 0.90246058, "num_input_tokens_seen": 92507270, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.5546875, "step": 4280, "time_per_iteration": 2.403960704803467 }, { "auxiliary_loss_clip": 0.01082884, "auxiliary_loss_mlp": 0.01065631, "balance_loss_clip": 1.03134632, "balance_loss_mlp": 1.02858973, "epoch": 0.2573876446715767, "flos": 22523447324160.0, "grad_norm": 2.358089911094378, "language_loss": 0.81389737, "learning_rate": 3.4816856382636744e-06, "loss": 0.83538258, "num_input_tokens_seen": 92526300, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.54296875, "step": 4281, "time_per_iteration": 2.4034228324890137 }, { "auxiliary_loss_clip": 0.01080557, "auxiliary_loss_mlp": 0.01062731, "balance_loss_clip": 1.02892327, "balance_loss_mlp": 1.02781153, "epoch": 0.2574477679242447, "flos": 23949718986240.0, "grad_norm": 2.064327533783522, "language_loss": 0.87976706, "learning_rate": 3.4814240171605737e-06, "loss": 0.90119994, "num_input_tokens_seen": 92546465, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.52734375, "step": 4282, "time_per_iteration": 2.445082187652588 }, { "auxiliary_loss_clip": 0.01082973, "auxiliary_loss_mlp": 0.0106525, "balance_loss_clip": 1.03125167, "balance_loss_mlp": 1.02799535, "epoch": 0.2575078911769127, "flos": 21980617505280.0, "grad_norm": 1.4812189318027709, "language_loss": 0.71907818, "learning_rate": 3.4811623398817267e-06, "loss": 0.74056041, "num_input_tokens_seen": 92567260, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.55078125, "step": 4283, "time_per_iteration": 2.411618947982788 }, { "auxiliary_loss_clip": 0.01075335, "auxiliary_loss_mlp": 0.01060008, "balance_loss_clip": 1.03068256, "balance_loss_mlp": 1.02601779, "epoch": 0.25756801442958066, "flos": 21944307824640.0, "grad_norm": 1.7486191456999747, "language_loss": 0.81591856, "learning_rate": 3.4809006064370553e-06, "loss": 0.83727199, "num_input_tokens_seen": 92585425, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4921875, "step": 4284, "time_per_iteration": 2.4249141216278076 }, { "auxiliary_loss_clip": 0.01077024, "auxiliary_loss_mlp": 0.01053497, "balance_loss_clip": 1.02197838, "balance_loss_mlp": 1.02500796, "epoch": 0.2576281376822486, "flos": 35260530819840.0, "grad_norm": 5.0290470536187115, "language_loss": 0.72834831, "learning_rate": 3.4806388168364835e-06, "loss": 0.74965358, "num_input_tokens_seen": 92604770, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.51953125, "step": 4285, "time_per_iteration": 3.9808921813964844 }, { "auxiliary_loss_clip": 0.01077469, "auxiliary_loss_mlp": 0.01054593, "balance_loss_clip": 1.02486229, "balance_loss_mlp": 1.02561808, "epoch": 0.2576882609349166, "flos": 14131549042560.0, "grad_norm": 1.7757241573689395, "language_loss": 0.61003494, "learning_rate": 3.4803769710899402e-06, "loss": 0.63135558, "num_input_tokens_seen": 92622635, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.51953125, "step": 4286, "time_per_iteration": 2.3627216815948486 }, { "auxiliary_loss_clip": 0.01079658, "auxiliary_loss_mlp": 0.01064282, "balance_loss_clip": 1.02956831, "balance_loss_mlp": 1.02462351, "epoch": 0.25774838418758456, "flos": 23257216702080.0, "grad_norm": 1.5883821348488658, "language_loss": 0.66645157, "learning_rate": 3.480115069207354e-06, "loss": 0.68789101, "num_input_tokens_seen": 92642960, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.55078125, "step": 4287, "time_per_iteration": 2.453338623046875 }, { "auxiliary_loss_clip": 0.01080199, "auxiliary_loss_mlp": 0.01057085, "balance_loss_clip": 1.01970077, "balance_loss_mlp": 1.02458453, "epoch": 0.2578085074402525, "flos": 22600640073600.0, "grad_norm": 2.5709339843878287, "language_loss": 0.72757232, "learning_rate": 3.4798531111986557e-06, "loss": 0.74894512, "num_input_tokens_seen": 92662455, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.5546875, "step": 4288, "time_per_iteration": 2.3893046379089355 }, { "auxiliary_loss_clip": 0.01074467, "auxiliary_loss_mlp": 0.01051579, "balance_loss_clip": 1.0191896, "balance_loss_mlp": 1.0236181, "epoch": 0.2578686306929205, "flos": 24570684161280.0, "grad_norm": 1.4564573970265737, "language_loss": 0.78285396, "learning_rate": 3.4795910970737786e-06, "loss": 0.8041144, "num_input_tokens_seen": 92683520, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.5078125, "step": 4289, "time_per_iteration": 3.878025531768799 }, { "auxiliary_loss_clip": 0.0107451, "auxiliary_loss_mlp": 0.01049946, "balance_loss_clip": 1.01701999, "balance_loss_mlp": 1.02231395, "epoch": 0.25792875394558845, "flos": 18112974134400.0, "grad_norm": 2.0132656818653056, "language_loss": 0.86195374, "learning_rate": 3.4793290268426592e-06, "loss": 0.88319826, "num_input_tokens_seen": 92701450, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.5234375, "step": 4290, "time_per_iteration": 2.3555593490600586 }, { "auxiliary_loss_clip": 0.01079415, "auxiliary_loss_mlp": 0.01064891, "balance_loss_clip": 1.02617216, "balance_loss_mlp": 1.02396917, "epoch": 0.2579888771982564, "flos": 17711926813440.0, "grad_norm": 2.156421465686289, "language_loss": 0.74150693, "learning_rate": 3.4790669005152354e-06, "loss": 0.76294994, "num_input_tokens_seen": 92720355, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.5546875, "step": 4291, "time_per_iteration": 3.763063907623291 }, { "auxiliary_loss_clip": 0.01080982, "auxiliary_loss_mlp": 0.01055242, "balance_loss_clip": 1.0183351, "balance_loss_mlp": 1.02642679, "epoch": 0.2580490004509244, "flos": 16433966073600.0, "grad_norm": 2.501981961390261, "language_loss": 0.83413827, "learning_rate": 3.4788047181014458e-06, "loss": 0.85550046, "num_input_tokens_seen": 92736755, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.546875, "step": 4292, "time_per_iteration": 2.4080584049224854 }, { "auxiliary_loss_clip": 0.01082294, "auxiliary_loss_mlp": 0.01056374, "balance_loss_clip": 1.01934719, "balance_loss_mlp": 1.02717042, "epoch": 0.25810912370359235, "flos": 33833840221440.0, "grad_norm": 2.480162463713019, "language_loss": 0.69386786, "learning_rate": 3.4785424796112337e-06, "loss": 0.71525455, "num_input_tokens_seen": 92757655, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.55078125, "step": 4293, "time_per_iteration": 2.5038866996765137 }, { "auxiliary_loss_clip": 0.01076025, "auxiliary_loss_mlp": 0.01046063, "balance_loss_clip": 1.0139246, "balance_loss_mlp": 1.02382469, "epoch": 0.2581692469562603, "flos": 25191020931840.0, "grad_norm": 1.8233350376311408, "language_loss": 0.77085698, "learning_rate": 3.478280185054542e-06, "loss": 0.7920779, "num_input_tokens_seen": 92776100, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.5234375, "step": 4294, "time_per_iteration": 2.4974441528320312 }, { "auxiliary_loss_clip": 0.01078371, "auxiliary_loss_mlp": 0.01053634, "balance_loss_clip": 1.01787138, "balance_loss_mlp": 1.0257678, "epoch": 0.2582293702089283, "flos": 34930811139840.0, "grad_norm": 2.6860045759973494, "language_loss": 0.82166612, "learning_rate": 3.478017834441318e-06, "loss": 0.84298623, "num_input_tokens_seen": 92798880, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.52734375, "step": 4295, "time_per_iteration": 2.5160601139068604 }, { "auxiliary_loss_clip": 0.01081424, "auxiliary_loss_mlp": 0.01054695, "balance_loss_clip": 1.01845574, "balance_loss_mlp": 1.02506804, "epoch": 0.2582894934615963, "flos": 26832532325760.0, "grad_norm": 2.099819476058287, "language_loss": 0.73788536, "learning_rate": 3.4777554277815096e-06, "loss": 0.75924653, "num_input_tokens_seen": 92817750, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.5625, "step": 4296, "time_per_iteration": 2.4435131549835205 }, { "auxiliary_loss_clip": 0.01084763, "auxiliary_loss_mlp": 0.01054907, "balance_loss_clip": 1.01702225, "balance_loss_mlp": 1.02905989, "epoch": 0.25834961671426426, "flos": 23514072641280.0, "grad_norm": 1.613740084083968, "language_loss": 0.87267774, "learning_rate": 3.477492965085067e-06, "loss": 0.89407444, "num_input_tokens_seen": 92837995, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5546875, "step": 4297, "time_per_iteration": 2.430133581161499 }, { "auxiliary_loss_clip": 0.01078622, "auxiliary_loss_mlp": 0.01054891, "balance_loss_clip": 1.02065432, "balance_loss_mlp": 1.02508116, "epoch": 0.25840973996693223, "flos": 22450059912960.0, "grad_norm": 11.69624748768489, "language_loss": 0.86250389, "learning_rate": 3.477230446361943e-06, "loss": 0.88383907, "num_input_tokens_seen": 92857245, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.53515625, "step": 4298, "time_per_iteration": 2.409881591796875 }, { "auxiliary_loss_clip": 0.01078353, "auxiliary_loss_mlp": 0.01054223, "balance_loss_clip": 1.02067792, "balance_loss_mlp": 1.02547216, "epoch": 0.2584698632196002, "flos": 11290072619520.0, "grad_norm": 2.4091422014281187, "language_loss": 0.85478818, "learning_rate": 3.4769678716220927e-06, "loss": 0.87611401, "num_input_tokens_seen": 92873265, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.53125, "step": 4299, "time_per_iteration": 2.384657621383667 }, { "auxiliary_loss_clip": 0.0107658, "auxiliary_loss_mlp": 0.01044062, "balance_loss_clip": 1.0153321, "balance_loss_mlp": 1.0256474, "epoch": 0.25852998647226816, "flos": 17929051758720.0, "grad_norm": 2.3671988414562306, "language_loss": 0.84647644, "learning_rate": 3.4767052408754726e-06, "loss": 0.86768281, "num_input_tokens_seen": 92890880, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.5078125, "step": 4300, "time_per_iteration": 2.3479833602905273 }, { "auxiliary_loss_clip": 0.01079464, "auxiliary_loss_mlp": 0.01056801, "balance_loss_clip": 1.02187228, "balance_loss_mlp": 1.02391315, "epoch": 0.2585901097249361, "flos": 33254700721920.0, "grad_norm": 1.975697322289114, "language_loss": 0.69756126, "learning_rate": 3.4764425541320417e-06, "loss": 0.71892387, "num_input_tokens_seen": 92910770, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.5546875, "step": 4301, "time_per_iteration": 2.517148971557617 }, { "auxiliary_loss_clip": 0.0108036, "auxiliary_loss_mlp": 0.01055845, "balance_loss_clip": 1.02113104, "balance_loss_mlp": 1.02428532, "epoch": 0.2586502329776041, "flos": 18440319841920.0, "grad_norm": 2.779357530837604, "language_loss": 0.84289163, "learning_rate": 3.4761798114017617e-06, "loss": 0.86425364, "num_input_tokens_seen": 92929520, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.5625, "step": 4302, "time_per_iteration": 2.3479785919189453 }, { "auxiliary_loss_clip": 0.01078988, "auxiliary_loss_mlp": 0.01055101, "balance_loss_clip": 1.02188993, "balance_loss_mlp": 1.02549314, "epoch": 0.25871035623027205, "flos": 17967141918720.0, "grad_norm": 1.9129157406621091, "language_loss": 0.93693447, "learning_rate": 3.475917012694595e-06, "loss": 0.95827538, "num_input_tokens_seen": 92947890, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.53125, "step": 4303, "time_per_iteration": 2.3682994842529297 }, { "auxiliary_loss_clip": 0.01077102, "auxiliary_loss_mlp": 0.01051381, "balance_loss_clip": 1.01723945, "balance_loss_mlp": 1.02414727, "epoch": 0.25877047948294, "flos": 27776618933760.0, "grad_norm": 2.181178004725439, "language_loss": 0.68600273, "learning_rate": 3.475654158020507e-06, "loss": 0.70728755, "num_input_tokens_seen": 92967690, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.53125, "step": 4304, "time_per_iteration": 2.420952796936035 }, { "auxiliary_loss_clip": 0.01076297, "auxiliary_loss_mlp": 0.01055128, "balance_loss_clip": 1.01936483, "balance_loss_mlp": 1.02266169, "epoch": 0.258830602735608, "flos": 27124615693440.0, "grad_norm": 3.0748785583992606, "language_loss": 0.74593812, "learning_rate": 3.4753912473894657e-06, "loss": 0.76725233, "num_input_tokens_seen": 92986830, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.53515625, "step": 4305, "time_per_iteration": 2.441922664642334 }, { "auxiliary_loss_clip": 0.01077771, "auxiliary_loss_mlp": 0.01058685, "balance_loss_clip": 1.02416205, "balance_loss_mlp": 1.02269244, "epoch": 0.25889072598827595, "flos": 17890612485120.0, "grad_norm": 2.057286559420386, "language_loss": 0.77135795, "learning_rate": 3.4751282808114403e-06, "loss": 0.79272246, "num_input_tokens_seen": 93002740, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.546875, "step": 4306, "time_per_iteration": 2.3484416007995605 }, { "auxiliary_loss_clip": 0.01028255, "auxiliary_loss_mlp": 0.010082, "balance_loss_clip": 1.00171506, "balance_loss_mlp": 1.01027, "epoch": 0.2589508492409439, "flos": 53932184530560.0, "grad_norm": 0.85194137655323, "language_loss": 0.57224, "learning_rate": 3.474865258296403e-06, "loss": 0.59260458, "num_input_tokens_seen": 93058645, "router_z_loss_clip": 0.06494141, "router_z_loss_mlp": 0.1796875, "step": 4307, "time_per_iteration": 2.961881399154663 }, { "auxiliary_loss_clip": 0.010739, "auxiliary_loss_mlp": 0.01052426, "balance_loss_clip": 1.02112174, "balance_loss_mlp": 1.02252841, "epoch": 0.2590109724936119, "flos": 22124739064320.0, "grad_norm": 1.9048236111702301, "language_loss": 0.73339623, "learning_rate": 3.474602179854327e-06, "loss": 0.75465947, "num_input_tokens_seen": 93077140, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.51171875, "step": 4308, "time_per_iteration": 2.3959856033325195 }, { "auxiliary_loss_clip": 0.01076503, "auxiliary_loss_mlp": 0.01057708, "balance_loss_clip": 1.02432907, "balance_loss_mlp": 1.02251339, "epoch": 0.2590710957462799, "flos": 13473610871040.0, "grad_norm": 1.8991488174520352, "language_loss": 0.85895729, "learning_rate": 3.4743390454951886e-06, "loss": 0.88029939, "num_input_tokens_seen": 93093580, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.5390625, "step": 4309, "time_per_iteration": 2.3701188564300537 }, { "auxiliary_loss_clip": 0.0107448, "auxiliary_loss_mlp": 0.01049921, "balance_loss_clip": 1.01806831, "balance_loss_mlp": 1.02249098, "epoch": 0.25913121899894787, "flos": 22306077999360.0, "grad_norm": 1.8632425074194043, "language_loss": 0.85792696, "learning_rate": 3.474075855228966e-06, "loss": 0.87917101, "num_input_tokens_seen": 93112345, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.51953125, "step": 4310, "time_per_iteration": 2.3835034370422363 }, { "auxiliary_loss_clip": 0.01077441, "auxiliary_loss_mlp": 0.0106008, "balance_loss_clip": 1.02517533, "balance_loss_mlp": 1.02280116, "epoch": 0.25919134225161583, "flos": 25810554741120.0, "grad_norm": 1.9857849584725553, "language_loss": 0.78891599, "learning_rate": 3.473812609065639e-06, "loss": 0.81029117, "num_input_tokens_seen": 93131545, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.546875, "step": 4311, "time_per_iteration": 2.4389965534210205 }, { "auxiliary_loss_clip": 0.01074712, "auxiliary_loss_mlp": 0.01060278, "balance_loss_clip": 1.02806783, "balance_loss_mlp": 1.02120852, "epoch": 0.2592514655042838, "flos": 31210920109440.0, "grad_norm": 1.8730700469754076, "language_loss": 0.73415285, "learning_rate": 3.4735493070151904e-06, "loss": 0.75550276, "num_input_tokens_seen": 93150730, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.5390625, "step": 4312, "time_per_iteration": 2.4606926441192627 }, { "auxiliary_loss_clip": 0.010753, "auxiliary_loss_mlp": 0.0106053, "balance_loss_clip": 1.0257926, "balance_loss_mlp": 1.02332914, "epoch": 0.25931158875695176, "flos": 18474115904640.0, "grad_norm": 2.9162153715125, "language_loss": 0.72596979, "learning_rate": 3.4732859490876044e-06, "loss": 0.74732804, "num_input_tokens_seen": 93167895, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.51953125, "step": 4313, "time_per_iteration": 2.372936248779297 }, { "auxiliary_loss_clip": 0.01073779, "auxiliary_loss_mlp": 0.01057046, "balance_loss_clip": 1.02667141, "balance_loss_mlp": 1.02237725, "epoch": 0.2593717120096197, "flos": 19206942675840.0, "grad_norm": 1.959846323404861, "language_loss": 0.81772751, "learning_rate": 3.473022535292867e-06, "loss": 0.83903575, "num_input_tokens_seen": 93187650, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.515625, "step": 4314, "time_per_iteration": 2.431692123413086 }, { "auxiliary_loss_clip": 0.01077686, "auxiliary_loss_mlp": 0.01062009, "balance_loss_clip": 1.02658057, "balance_loss_mlp": 1.02181053, "epoch": 0.2594318352622877, "flos": 31246775942400.0, "grad_norm": 1.9997072996560323, "language_loss": 0.67968279, "learning_rate": 3.472759065640968e-06, "loss": 0.70107973, "num_input_tokens_seen": 93207370, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.55859375, "step": 4315, "time_per_iteration": 2.4657020568847656 }, { "auxiliary_loss_clip": 0.01073769, "auxiliary_loss_mlp": 0.01052302, "balance_loss_clip": 1.02110469, "balance_loss_mlp": 1.02238154, "epoch": 0.25949195851495566, "flos": 22236042078720.0, "grad_norm": 3.3355826127461725, "language_loss": 0.80552888, "learning_rate": 3.4724955401418976e-06, "loss": 0.82678962, "num_input_tokens_seen": 93227925, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.515625, "step": 4316, "time_per_iteration": 2.3831307888031006 }, { "auxiliary_loss_clip": 0.01074309, "auxiliary_loss_mlp": 0.01055821, "balance_loss_clip": 1.02017689, "balance_loss_mlp": 1.02131653, "epoch": 0.2595520817676236, "flos": 28074427764480.0, "grad_norm": 1.6447468009322173, "language_loss": 0.78872555, "learning_rate": 3.4722319588056487e-06, "loss": 0.81002688, "num_input_tokens_seen": 93250020, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.53125, "step": 4317, "time_per_iteration": 2.4524190425872803 }, { "auxiliary_loss_clip": 0.01075491, "auxiliary_loss_mlp": 0.01061875, "balance_loss_clip": 1.02654147, "balance_loss_mlp": 1.0236156, "epoch": 0.2596122050202916, "flos": 20189992227840.0, "grad_norm": 2.215793343012506, "language_loss": 0.7896806, "learning_rate": 3.4719683216422163e-06, "loss": 0.81105429, "num_input_tokens_seen": 93269070, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.51953125, "step": 4318, "time_per_iteration": 2.3714840412139893 }, { "auxiliary_loss_clip": 0.01074131, "auxiliary_loss_mlp": 0.01049109, "balance_loss_clip": 1.0146575, "balance_loss_mlp": 1.02261424, "epoch": 0.25967232827295955, "flos": 22526868637440.0, "grad_norm": 1.741170785961475, "language_loss": 0.77653337, "learning_rate": 3.471704628661598e-06, "loss": 0.79776579, "num_input_tokens_seen": 93290250, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.515625, "step": 4319, "time_per_iteration": 2.447252035140991 }, { "auxiliary_loss_clip": 0.01070707, "auxiliary_loss_mlp": 0.01050252, "balance_loss_clip": 1.01963925, "balance_loss_mlp": 1.02034926, "epoch": 0.2597324515256275, "flos": 21067219848960.0, "grad_norm": 2.243173585820958, "language_loss": 0.77908164, "learning_rate": 3.4714408798737925e-06, "loss": 0.80029118, "num_input_tokens_seen": 93310090, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.50390625, "step": 4320, "time_per_iteration": 2.3880057334899902 }, { "auxiliary_loss_clip": 0.01075448, "auxiliary_loss_mlp": 0.0105213, "balance_loss_clip": 1.02012193, "balance_loss_mlp": 1.02245069, "epoch": 0.2597925747782955, "flos": 22049047503360.0, "grad_norm": 6.458865626837107, "language_loss": 0.72355795, "learning_rate": 3.471177075288801e-06, "loss": 0.74483371, "num_input_tokens_seen": 93329570, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.53125, "step": 4321, "time_per_iteration": 2.395052909851074 }, { "auxiliary_loss_clip": 0.0107745, "auxiliary_loss_mlp": 0.01055753, "balance_loss_clip": 1.01975167, "balance_loss_mlp": 1.02268028, "epoch": 0.2598526980309635, "flos": 19535929217280.0, "grad_norm": 2.5338826503220275, "language_loss": 0.76287216, "learning_rate": 3.4709132149166277e-06, "loss": 0.78420419, "num_input_tokens_seen": 93347920, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.546875, "step": 4322, "time_per_iteration": 2.376047134399414 }, { "auxiliary_loss_clip": 0.01074656, "auxiliary_loss_mlp": 0.01057366, "balance_loss_clip": 1.01992202, "balance_loss_mlp": 1.02162707, "epoch": 0.25991282128363147, "flos": 24494154727680.0, "grad_norm": 2.118315046902038, "language_loss": 0.7482574, "learning_rate": 3.470649298767278e-06, "loss": 0.76957762, "num_input_tokens_seen": 93367145, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.53125, "step": 4323, "time_per_iteration": 2.429680347442627 }, { "auxiliary_loss_clip": 0.01078464, "auxiliary_loss_mlp": 0.01053606, "balance_loss_clip": 1.01624537, "balance_loss_mlp": 1.02164388, "epoch": 0.25997294453629943, "flos": 24200465437440.0, "grad_norm": 1.830521610321595, "language_loss": 0.68545103, "learning_rate": 3.4703853268507597e-06, "loss": 0.70677173, "num_input_tokens_seen": 93386555, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.5703125, "step": 4324, "time_per_iteration": 3.7968106269836426 }, { "auxiliary_loss_clip": 0.01073215, "auxiliary_loss_mlp": 0.01051081, "balance_loss_clip": 1.01857305, "balance_loss_mlp": 1.02189898, "epoch": 0.2600330677889674, "flos": 31430104824960.0, "grad_norm": 2.0957276456776612, "language_loss": 0.72408247, "learning_rate": 3.470121299177082e-06, "loss": 0.74532539, "num_input_tokens_seen": 93405590, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.51171875, "step": 4325, "time_per_iteration": 2.4652199745178223 }, { "auxiliary_loss_clip": 0.01073369, "auxiliary_loss_mlp": 0.01055314, "balance_loss_clip": 1.02138686, "balance_loss_mlp": 1.02027774, "epoch": 0.26009319104163536, "flos": 32265262391040.0, "grad_norm": 2.472866592451999, "language_loss": 0.74793828, "learning_rate": 3.469857215756257e-06, "loss": 0.76922512, "num_input_tokens_seen": 93424750, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.53125, "step": 4326, "time_per_iteration": 2.461482048034668 }, { "auxiliary_loss_clip": 0.01070659, "auxiliary_loss_mlp": 0.01044288, "balance_loss_clip": 1.01472414, "balance_loss_mlp": 1.02038574, "epoch": 0.26015331429430333, "flos": 26285548055040.0, "grad_norm": 1.793825952644206, "language_loss": 0.88048708, "learning_rate": 3.4695930765982997e-06, "loss": 0.90163654, "num_input_tokens_seen": 93443465, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.50390625, "step": 4327, "time_per_iteration": 2.442293167114258 }, { "auxiliary_loss_clip": 0.0107843, "auxiliary_loss_mlp": 0.0106121, "balance_loss_clip": 1.02091765, "balance_loss_mlp": 1.02286947, "epoch": 0.2602134375469713, "flos": 21141270576000.0, "grad_norm": 2.058602752696256, "language_loss": 0.81928498, "learning_rate": 3.4693288817132255e-06, "loss": 0.84068143, "num_input_tokens_seen": 93462580, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.5546875, "step": 4328, "time_per_iteration": 2.384488821029663 }, { "auxiliary_loss_clip": 0.01071583, "auxiliary_loss_mlp": 0.01048263, "balance_loss_clip": 1.01710176, "balance_loss_mlp": 1.02037001, "epoch": 0.26027356079963926, "flos": 25920147098880.0, "grad_norm": 1.5834764506403771, "language_loss": 0.89003396, "learning_rate": 3.4690646311110525e-06, "loss": 0.91123241, "num_input_tokens_seen": 93482790, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.51171875, "step": 4329, "time_per_iteration": 3.944563627243042 }, { "auxiliary_loss_clip": 0.01072424, "auxiliary_loss_mlp": 0.01045391, "balance_loss_clip": 1.01493263, "balance_loss_mlp": 1.02175605, "epoch": 0.2603336840523072, "flos": 26358027770880.0, "grad_norm": 2.0251589679589266, "language_loss": 0.78649986, "learning_rate": 3.468800324801802e-06, "loss": 0.80767804, "num_input_tokens_seen": 93498795, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.5078125, "step": 4330, "time_per_iteration": 2.440662384033203 }, { "auxiliary_loss_clip": 0.01079927, "auxiliary_loss_mlp": 0.01054726, "balance_loss_clip": 1.02056062, "balance_loss_mlp": 1.02399313, "epoch": 0.2603938073049752, "flos": 23512536541440.0, "grad_norm": 1.5510334898907268, "language_loss": 0.76541388, "learning_rate": 3.4685359627954958e-06, "loss": 0.78676045, "num_input_tokens_seen": 93518335, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.5625, "step": 4331, "time_per_iteration": 3.794708728790283 }, { "auxiliary_loss_clip": 0.010733, "auxiliary_loss_mlp": 0.01054805, "balance_loss_clip": 1.0239892, "balance_loss_mlp": 1.02238512, "epoch": 0.26045393055764315, "flos": 25373127916800.0, "grad_norm": 1.4035966223016234, "language_loss": 0.7041899, "learning_rate": 3.4682715451021584e-06, "loss": 0.72547096, "num_input_tokens_seen": 93539170, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.5078125, "step": 4332, "time_per_iteration": 2.4303629398345947 }, { "auxiliary_loss_clip": 0.01075134, "auxiliary_loss_mlp": 0.01053804, "balance_loss_clip": 1.01844633, "balance_loss_mlp": 1.02149844, "epoch": 0.2605140538103111, "flos": 27634068385920.0, "grad_norm": 2.454597704683902, "language_loss": 0.81336772, "learning_rate": 3.4680070717318174e-06, "loss": 0.83465707, "num_input_tokens_seen": 93558480, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.53515625, "step": 4333, "time_per_iteration": 2.427786111831665 }, { "auxiliary_loss_clip": 0.01072847, "auxiliary_loss_mlp": 0.0104803, "balance_loss_clip": 1.01577163, "balance_loss_mlp": 1.02106941, "epoch": 0.2605741770629791, "flos": 13769045729280.0, "grad_norm": 1.9362733487493158, "language_loss": 0.81186533, "learning_rate": 3.467742542694501e-06, "loss": 0.83307409, "num_input_tokens_seen": 93575220, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.515625, "step": 4334, "time_per_iteration": 2.3746626377105713 }, { "auxiliary_loss_clip": 0.01074322, "auxiliary_loss_mlp": 0.01051257, "balance_loss_clip": 1.01647139, "balance_loss_mlp": 1.02155924, "epoch": 0.26063430031564705, "flos": 26030472595200.0, "grad_norm": 1.7592378062730563, "language_loss": 0.80796647, "learning_rate": 3.46747795800024e-06, "loss": 0.82922232, "num_input_tokens_seen": 93597015, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.53125, "step": 4335, "time_per_iteration": 2.455803871154785 }, { "auxiliary_loss_clip": 0.01023571, "auxiliary_loss_mlp": 0.01011124, "balance_loss_clip": 1.00640333, "balance_loss_mlp": 1.00821912, "epoch": 0.26069442356831507, "flos": 62440587619200.0, "grad_norm": 0.8304028826942695, "language_loss": 0.60752141, "learning_rate": 3.467213317659068e-06, "loss": 0.62786841, "num_input_tokens_seen": 93657775, "router_z_loss_clip": 0.04711914, "router_z_loss_mlp": 0.15332031, "step": 4336, "time_per_iteration": 3.0014219284057617 }, { "auxiliary_loss_clip": 0.01074725, "auxiliary_loss_mlp": 0.01054559, "balance_loss_clip": 1.02172852, "balance_loss_mlp": 1.02164316, "epoch": 0.26075454682098304, "flos": 13625517663360.0, "grad_norm": 1.7855990358468834, "language_loss": 0.79169303, "learning_rate": 3.46694862168102e-06, "loss": 0.8129859, "num_input_tokens_seen": 93676145, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.53125, "step": 4337, "time_per_iteration": 2.3511011600494385 }, { "auxiliary_loss_clip": 0.01076179, "auxiliary_loss_mlp": 0.01053702, "balance_loss_clip": 1.01846409, "balance_loss_mlp": 1.02160656, "epoch": 0.260814670073651, "flos": 12125823678720.0, "grad_norm": 2.347252809973351, "language_loss": 0.76598531, "learning_rate": 3.4666838700761334e-06, "loss": 0.78728414, "num_input_tokens_seen": 93692480, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.546875, "step": 4338, "time_per_iteration": 2.3604750633239746 }, { "auxiliary_loss_clip": 0.01076979, "auxiliary_loss_mlp": 0.01056801, "balance_loss_clip": 1.02058506, "balance_loss_mlp": 1.02260697, "epoch": 0.26087479332631897, "flos": 15121615777920.0, "grad_norm": 2.1654072602600523, "language_loss": 0.81942666, "learning_rate": 3.466419062854447e-06, "loss": 0.8407644, "num_input_tokens_seen": 93710165, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.54296875, "step": 4339, "time_per_iteration": 2.359541654586792 }, { "auxiliary_loss_clip": 0.01072547, "auxiliary_loss_mlp": 0.01046041, "balance_loss_clip": 1.01473713, "balance_loss_mlp": 1.02153528, "epoch": 0.26093491657898693, "flos": 24679787760000.0, "grad_norm": 1.7265116503646307, "language_loss": 0.77139068, "learning_rate": 3.4661542000260033e-06, "loss": 0.79257655, "num_input_tokens_seen": 93730185, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.5078125, "step": 4340, "time_per_iteration": 2.4139230251312256 }, { "auxiliary_loss_clip": 0.01076319, "auxiliary_loss_mlp": 0.01052298, "balance_loss_clip": 1.0182755, "balance_loss_mlp": 1.02203894, "epoch": 0.2609950398316549, "flos": 25115050080000.0, "grad_norm": 1.677140175040068, "language_loss": 0.83773458, "learning_rate": 3.465889281600845e-06, "loss": 0.85902083, "num_input_tokens_seen": 93747690, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.54296875, "step": 4341, "time_per_iteration": 2.4035019874572754 }, { "auxiliary_loss_clip": 0.01073759, "auxiliary_loss_mlp": 0.0105677, "balance_loss_clip": 1.02081656, "balance_loss_mlp": 1.02171755, "epoch": 0.26105516308432286, "flos": 28547326396800.0, "grad_norm": 1.8356666906858434, "language_loss": 0.77731442, "learning_rate": 3.4656243075890183e-06, "loss": 0.79861969, "num_input_tokens_seen": 93767405, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.51953125, "step": 4342, "time_per_iteration": 2.445312023162842 }, { "auxiliary_loss_clip": 0.0107335, "auxiliary_loss_mlp": 0.01053596, "balance_loss_clip": 1.02012241, "balance_loss_mlp": 1.02118945, "epoch": 0.2611152863369908, "flos": 39529046954880.0, "grad_norm": 1.8529670231957953, "language_loss": 0.67540729, "learning_rate": 3.4653592780005707e-06, "loss": 0.69667673, "num_input_tokens_seen": 93789950, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.5234375, "step": 4343, "time_per_iteration": 2.5378615856170654 }, { "auxiliary_loss_clip": 0.0107292, "auxiliary_loss_mlp": 0.01054901, "balance_loss_clip": 1.02178502, "balance_loss_mlp": 1.01954246, "epoch": 0.2611754095896588, "flos": 13734481616640.0, "grad_norm": 2.3268496488028405, "language_loss": 0.75760335, "learning_rate": 3.465094192845553e-06, "loss": 0.77888155, "num_input_tokens_seen": 93807835, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.53515625, "step": 4344, "time_per_iteration": 2.369339942932129 }, { "auxiliary_loss_clip": 0.01074775, "auxiliary_loss_mlp": 0.01056119, "balance_loss_clip": 1.01918781, "balance_loss_mlp": 1.0217073, "epoch": 0.26123553284232676, "flos": 21505589280000.0, "grad_norm": 2.131038828039069, "language_loss": 0.88215935, "learning_rate": 3.4648290521340165e-06, "loss": 0.90346837, "num_input_tokens_seen": 93825670, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.53125, "step": 4345, "time_per_iteration": 2.381333351135254 }, { "auxiliary_loss_clip": 0.01071246, "auxiliary_loss_mlp": 0.01047593, "balance_loss_clip": 1.0174334, "balance_loss_mlp": 1.02142692, "epoch": 0.2612956560949947, "flos": 21138791869440.0, "grad_norm": 1.9422444996717085, "language_loss": 0.77984381, "learning_rate": 3.464563855876015e-06, "loss": 0.80103219, "num_input_tokens_seen": 93844045, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.49804688, "step": 4346, "time_per_iteration": 2.441781997680664 }, { "auxiliary_loss_clip": 0.01072756, "auxiliary_loss_mlp": 0.01054135, "balance_loss_clip": 1.02054191, "balance_loss_mlp": 1.02086449, "epoch": 0.2613557793476627, "flos": 25117842988800.0, "grad_norm": 2.158721893466456, "language_loss": 0.77314299, "learning_rate": 3.464298604081606e-06, "loss": 0.79441184, "num_input_tokens_seen": 93864380, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.51953125, "step": 4347, "time_per_iteration": 2.4101173877716064 }, { "auxiliary_loss_clip": 0.01071502, "auxiliary_loss_mlp": 0.01046581, "balance_loss_clip": 1.01549101, "balance_loss_mlp": 1.02074325, "epoch": 0.26141590260033065, "flos": 26066502984960.0, "grad_norm": 1.4060562024447225, "language_loss": 0.74727488, "learning_rate": 3.4640332967608476e-06, "loss": 0.76845562, "num_input_tokens_seen": 93885475, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.5078125, "step": 4348, "time_per_iteration": 2.471334457397461 }, { "auxiliary_loss_clip": 0.01073311, "auxiliary_loss_mlp": 0.01051702, "balance_loss_clip": 1.01818085, "balance_loss_mlp": 1.01989734, "epoch": 0.2614760258529987, "flos": 25700368890240.0, "grad_norm": 2.01920030646507, "language_loss": 0.93448657, "learning_rate": 3.463767933923799e-06, "loss": 0.95573664, "num_input_tokens_seen": 93905545, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.53125, "step": 4349, "time_per_iteration": 2.4081010818481445 }, { "auxiliary_loss_clip": 0.01071766, "auxiliary_loss_mlp": 0.01044168, "balance_loss_clip": 1.01243472, "balance_loss_mlp": 1.02092195, "epoch": 0.26153614910566664, "flos": 17456188037760.0, "grad_norm": 2.9429252964090558, "language_loss": 0.81996727, "learning_rate": 3.463502515580524e-06, "loss": 0.84112668, "num_input_tokens_seen": 93924185, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.5078125, "step": 4350, "time_per_iteration": 2.3827712535858154 }, { "auxiliary_loss_clip": 0.01070446, "auxiliary_loss_mlp": 0.01050832, "balance_loss_clip": 1.02019525, "balance_loss_mlp": 1.02010953, "epoch": 0.2615962723583346, "flos": 17711856990720.0, "grad_norm": 5.252955689955155, "language_loss": 0.65117335, "learning_rate": 3.4632370417410866e-06, "loss": 0.67238617, "num_input_tokens_seen": 93942825, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.50390625, "step": 4351, "time_per_iteration": 2.3749914169311523 }, { "auxiliary_loss_clip": 0.01073416, "auxiliary_loss_mlp": 0.01055108, "balance_loss_clip": 1.01822484, "balance_loss_mlp": 1.02055144, "epoch": 0.26165639561100257, "flos": 23256623208960.0, "grad_norm": 1.9978575640111786, "language_loss": 0.85868651, "learning_rate": 3.462971512415555e-06, "loss": 0.87997174, "num_input_tokens_seen": 93962045, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.53125, "step": 4352, "time_per_iteration": 2.4563987255096436 }, { "auxiliary_loss_clip": 0.01023088, "auxiliary_loss_mlp": 0.01008059, "balance_loss_clip": 1.00250423, "balance_loss_mlp": 1.00789118, "epoch": 0.26171651886367053, "flos": 66734940026880.0, "grad_norm": 0.7960481905605304, "language_loss": 0.7066282, "learning_rate": 3.462705927613996e-06, "loss": 0.72693968, "num_input_tokens_seen": 94021175, "router_z_loss_clip": 0.05566406, "router_z_loss_mlp": 0.15234375, "step": 4353, "time_per_iteration": 2.9228315353393555 }, { "auxiliary_loss_clip": 0.01071141, "auxiliary_loss_mlp": 0.01058067, "balance_loss_clip": 1.02406907, "balance_loss_mlp": 1.01957369, "epoch": 0.2617766421163385, "flos": 22348392433920.0, "grad_norm": 1.9106782608450519, "language_loss": 0.79149318, "learning_rate": 3.4624402873464816e-06, "loss": 0.81278527, "num_input_tokens_seen": 94043370, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.515625, "step": 4354, "time_per_iteration": 2.4554953575134277 }, { "auxiliary_loss_clip": 0.01075195, "auxiliary_loss_mlp": 0.01055758, "balance_loss_clip": 1.01892233, "balance_loss_mlp": 1.02012491, "epoch": 0.26183676536900646, "flos": 26065944403200.0, "grad_norm": 2.37505933932333, "language_loss": 0.70129329, "learning_rate": 3.462174591623085e-06, "loss": 0.72260278, "num_input_tokens_seen": 94063510, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.55078125, "step": 4355, "time_per_iteration": 2.4110467433929443 }, { "auxiliary_loss_clip": 0.0107322, "auxiliary_loss_mlp": 0.01045496, "balance_loss_clip": 1.01147342, "balance_loss_mlp": 1.02161407, "epoch": 0.26189688862167443, "flos": 20995403448960.0, "grad_norm": 2.0618172212860983, "language_loss": 0.69353861, "learning_rate": 3.4619088404538815e-06, "loss": 0.71472579, "num_input_tokens_seen": 94083865, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.515625, "step": 4356, "time_per_iteration": 2.435774564743042 }, { "auxiliary_loss_clip": 0.01020801, "auxiliary_loss_mlp": 0.01005474, "balance_loss_clip": 0.99991935, "balance_loss_mlp": 1.00536466, "epoch": 0.2619570118743424, "flos": 65795007870720.0, "grad_norm": 0.6817224665486104, "language_loss": 0.53214431, "learning_rate": 3.4616430338489487e-06, "loss": 0.55240709, "num_input_tokens_seen": 94144095, "router_z_loss_clip": 0.05566406, "router_z_loss_mlp": 0.15429688, "step": 4357, "time_per_iteration": 2.9298925399780273 }, { "auxiliary_loss_clip": 0.01073918, "auxiliary_loss_mlp": 0.01056503, "balance_loss_clip": 1.02298093, "balance_loss_mlp": 1.02068233, "epoch": 0.26201713512701036, "flos": 28765568505600.0, "grad_norm": 1.9020245968283926, "language_loss": 0.8606267, "learning_rate": 3.4613771718183654e-06, "loss": 0.88193095, "num_input_tokens_seen": 94163035, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.53125, "step": 4358, "time_per_iteration": 2.434086561203003 }, { "auxiliary_loss_clip": 0.01076251, "auxiliary_loss_mlp": 0.01057127, "balance_loss_clip": 1.02181685, "balance_loss_mlp": 1.02151835, "epoch": 0.2620772583796783, "flos": 26431310448000.0, "grad_norm": 1.9928014143877297, "language_loss": 0.68779707, "learning_rate": 3.4611112543722127e-06, "loss": 0.70913082, "num_input_tokens_seen": 94182520, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.546875, "step": 4359, "time_per_iteration": 2.4158825874328613 }, { "auxiliary_loss_clip": 0.01072657, "auxiliary_loss_mlp": 0.01049641, "balance_loss_clip": 1.01809788, "balance_loss_mlp": 1.02093148, "epoch": 0.2621373816323463, "flos": 20155532849280.0, "grad_norm": 1.8390268760174922, "language_loss": 0.79267991, "learning_rate": 3.4608452815205757e-06, "loss": 0.81390297, "num_input_tokens_seen": 94201795, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.51953125, "step": 4360, "time_per_iteration": 2.4023942947387695 }, { "auxiliary_loss_clip": 0.01071282, "auxiliary_loss_mlp": 0.01046654, "balance_loss_clip": 1.0165658, "balance_loss_mlp": 1.02061868, "epoch": 0.26219750488501425, "flos": 28619980669440.0, "grad_norm": 1.7844327970749396, "language_loss": 0.69250047, "learning_rate": 3.4605792532735387e-06, "loss": 0.71367979, "num_input_tokens_seen": 94222390, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.5078125, "step": 4361, "time_per_iteration": 2.429924249649048 }, { "auxiliary_loss_clip": 0.01073374, "auxiliary_loss_mlp": 0.01056513, "balance_loss_clip": 1.02153707, "balance_loss_mlp": 1.02107918, "epoch": 0.2622576281376823, "flos": 15041839587840.0, "grad_norm": 1.9532582427800615, "language_loss": 0.85483241, "learning_rate": 3.46031316964119e-06, "loss": 0.8761313, "num_input_tokens_seen": 94239980, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.5234375, "step": 4362, "time_per_iteration": 2.394083261489868 }, { "auxiliary_loss_clip": 0.0107293, "auxiliary_loss_mlp": 0.01054282, "balance_loss_clip": 1.01999736, "balance_loss_mlp": 1.02152908, "epoch": 0.26231775139035024, "flos": 26394965856000.0, "grad_norm": 2.8871697402762777, "language_loss": 0.66678905, "learning_rate": 3.4600470306336197e-06, "loss": 0.68806118, "num_input_tokens_seen": 94260715, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.51171875, "step": 4363, "time_per_iteration": 2.4192521572113037 }, { "auxiliary_loss_clip": 0.01018608, "auxiliary_loss_mlp": 0.01012827, "balance_loss_clip": 1.00696206, "balance_loss_mlp": 1.00333714, "epoch": 0.2623778746430182, "flos": 65405341653120.0, "grad_norm": 0.8948577204897447, "language_loss": 0.61236382, "learning_rate": 3.4597808362609194e-06, "loss": 0.63267815, "num_input_tokens_seen": 94321285, "router_z_loss_clip": 0.05859375, "router_z_loss_mlp": 0.15234375, "step": 4364, "time_per_iteration": 4.489120721817017 }, { "auxiliary_loss_clip": 0.01075468, "auxiliary_loss_mlp": 0.01053017, "balance_loss_clip": 1.0181365, "balance_loss_mlp": 1.02231574, "epoch": 0.26243799789568617, "flos": 12603400433280.0, "grad_norm": 2.6068395661194175, "language_loss": 0.74981117, "learning_rate": 3.459514586533184e-06, "loss": 0.77109599, "num_input_tokens_seen": 94335420, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.53125, "step": 4365, "time_per_iteration": 2.3775956630706787 }, { "auxiliary_loss_clip": 0.01073562, "auxiliary_loss_mlp": 0.01051862, "balance_loss_clip": 1.01991415, "balance_loss_mlp": 1.02181947, "epoch": 0.26249812114835414, "flos": 28622494287360.0, "grad_norm": 1.678796955104702, "language_loss": 0.7851603, "learning_rate": 3.459248281460509e-06, "loss": 0.80641454, "num_input_tokens_seen": 94357440, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.515625, "step": 4366, "time_per_iteration": 2.4507999420166016 }, { "auxiliary_loss_clip": 0.01074393, "auxiliary_loss_mlp": 0.01049724, "balance_loss_clip": 1.0170604, "balance_loss_mlp": 1.02195668, "epoch": 0.2625582444010221, "flos": 14464515479040.0, "grad_norm": 1.8096582470263507, "language_loss": 0.77484095, "learning_rate": 3.4589819210529927e-06, "loss": 0.79608214, "num_input_tokens_seen": 94375690, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.5234375, "step": 4367, "time_per_iteration": 2.370509386062622 }, { "auxiliary_loss_clip": 0.01071435, "auxiliary_loss_mlp": 0.01042482, "balance_loss_clip": 1.01231027, "balance_loss_mlp": 1.02135015, "epoch": 0.26261836765369007, "flos": 16612372454400.0, "grad_norm": 2.551111189414717, "language_loss": 0.71039283, "learning_rate": 3.458715505320736e-06, "loss": 0.73153204, "num_input_tokens_seen": 94393190, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.5, "step": 4368, "time_per_iteration": 5.412498235702515 }, { "auxiliary_loss_clip": 0.01071168, "auxiliary_loss_mlp": 0.01051672, "balance_loss_clip": 1.01683927, "balance_loss_mlp": 1.02039039, "epoch": 0.26267849090635803, "flos": 20518943857920.0, "grad_norm": 4.33302994961259, "language_loss": 0.80292642, "learning_rate": 3.458449034273841e-06, "loss": 0.82415485, "num_input_tokens_seen": 94410975, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.5078125, "step": 4369, "time_per_iteration": 2.4068639278411865 }, { "auxiliary_loss_clip": 0.01070505, "auxiliary_loss_mlp": 0.01049225, "balance_loss_clip": 1.01602495, "balance_loss_mlp": 1.02002275, "epoch": 0.262738614159026, "flos": 21322888801920.0, "grad_norm": 1.884842959364299, "language_loss": 0.84700143, "learning_rate": 3.4581825079224133e-06, "loss": 0.86819875, "num_input_tokens_seen": 94429985, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.50390625, "step": 4370, "time_per_iteration": 3.8132314682006836 }, { "auxiliary_loss_clip": 0.01076014, "auxiliary_loss_mlp": 0.01054178, "balance_loss_clip": 1.01924992, "balance_loss_mlp": 1.02244735, "epoch": 0.26279873741169396, "flos": 17602613746560.0, "grad_norm": 1.6993829999798655, "language_loss": 0.72578192, "learning_rate": 3.4579159262765575e-06, "loss": 0.74708378, "num_input_tokens_seen": 94448660, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.5390625, "step": 4371, "time_per_iteration": 2.377772808074951 }, { "auxiliary_loss_clip": 0.01018091, "auxiliary_loss_mlp": 0.01015393, "balance_loss_clip": 1.01000428, "balance_loss_mlp": 1.00286913, "epoch": 0.2628588606643619, "flos": 60946514363520.0, "grad_norm": 0.7044636279822712, "language_loss": 0.56634283, "learning_rate": 3.457649289346384e-06, "loss": 0.58667767, "num_input_tokens_seen": 94515630, "router_z_loss_clip": 0.05395508, "router_z_loss_mlp": 0.15234375, "step": 4372, "time_per_iteration": 3.132427453994751 }, { "auxiliary_loss_clip": 0.01069466, "auxiliary_loss_mlp": 0.01048252, "balance_loss_clip": 1.01691175, "balance_loss_mlp": 1.02094471, "epoch": 0.2629189839170299, "flos": 27015093158400.0, "grad_norm": 1.556597330961399, "language_loss": 0.78949457, "learning_rate": 3.4573825971420042e-06, "loss": 0.81067181, "num_input_tokens_seen": 94535385, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.484375, "step": 4373, "time_per_iteration": 2.425274610519409 }, { "auxiliary_loss_clip": 0.01070972, "auxiliary_loss_mlp": 0.01048908, "balance_loss_clip": 1.01736569, "balance_loss_mlp": 1.02056718, "epoch": 0.26297910716969786, "flos": 17018900858880.0, "grad_norm": 2.545545797527707, "language_loss": 0.72815669, "learning_rate": 3.4571158496735294e-06, "loss": 0.74935544, "num_input_tokens_seen": 94552650, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.50390625, "step": 4374, "time_per_iteration": 2.4075911045074463 }, { "auxiliary_loss_clip": 0.01071875, "auxiliary_loss_mlp": 0.01054828, "balance_loss_clip": 1.02056718, "balance_loss_mlp": 1.02003694, "epoch": 0.2630392304223659, "flos": 24896284300800.0, "grad_norm": 1.7072455381322775, "language_loss": 0.82015687, "learning_rate": 3.4568490469510756e-06, "loss": 0.84142387, "num_input_tokens_seen": 94574075, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.515625, "step": 4375, "time_per_iteration": 2.440300703048706 }, { "auxiliary_loss_clip": 0.01069186, "auxiliary_loss_mlp": 0.01050676, "balance_loss_clip": 1.01801252, "balance_loss_mlp": 1.02030897, "epoch": 0.26309935367503384, "flos": 32852640971520.0, "grad_norm": 2.1664880913367304, "language_loss": 0.68038678, "learning_rate": 3.4565821889847603e-06, "loss": 0.70158529, "num_input_tokens_seen": 94594255, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.48828125, "step": 4376, "time_per_iteration": 2.508558511734009 }, { "auxiliary_loss_clip": 0.01071573, "auxiliary_loss_mlp": 0.01052755, "balance_loss_clip": 1.02166569, "balance_loss_mlp": 1.02057719, "epoch": 0.2631594769277018, "flos": 15887051625600.0, "grad_norm": 1.857856961149813, "language_loss": 0.70849031, "learning_rate": 3.4563152757847026e-06, "loss": 0.72973359, "num_input_tokens_seen": 94611410, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.5078125, "step": 4377, "time_per_iteration": 2.3727734088897705 }, { "auxiliary_loss_clip": 0.01071186, "auxiliary_loss_mlp": 0.01050725, "balance_loss_clip": 1.01570153, "balance_loss_mlp": 1.02024055, "epoch": 0.2632196001803698, "flos": 50803060348800.0, "grad_norm": 2.188057773850793, "language_loss": 0.81357157, "learning_rate": 3.4560483073610233e-06, "loss": 0.83479059, "num_input_tokens_seen": 94636575, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.5078125, "step": 4378, "time_per_iteration": 2.6476705074310303 }, { "auxiliary_loss_clip": 0.01071592, "auxiliary_loss_mlp": 0.0105243, "balance_loss_clip": 1.02078009, "balance_loss_mlp": 1.02148581, "epoch": 0.26327972343303774, "flos": 13732247289600.0, "grad_norm": 2.115085212185895, "language_loss": 0.78831565, "learning_rate": 3.455781283723846e-06, "loss": 0.80955589, "num_input_tokens_seen": 94654345, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.5, "step": 4379, "time_per_iteration": 2.3553049564361572 }, { "auxiliary_loss_clip": 0.01074975, "auxiliary_loss_mlp": 0.01062382, "balance_loss_clip": 1.0238297, "balance_loss_mlp": 1.02036619, "epoch": 0.2633398466857057, "flos": 23767926203520.0, "grad_norm": 2.2618575531424305, "language_loss": 0.80257642, "learning_rate": 3.4555142048832975e-06, "loss": 0.82395005, "num_input_tokens_seen": 94673985, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.546875, "step": 4380, "time_per_iteration": 2.411187171936035 }, { "auxiliary_loss_clip": 0.01072493, "auxiliary_loss_mlp": 0.01048887, "balance_loss_clip": 1.01596117, "balance_loss_mlp": 1.01996279, "epoch": 0.26339996993837367, "flos": 27598980602880.0, "grad_norm": 1.987610887800834, "language_loss": 0.65933168, "learning_rate": 3.4552470708495036e-06, "loss": 0.68054545, "num_input_tokens_seen": 94693145, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.52734375, "step": 4381, "time_per_iteration": 2.4205918312072754 }, { "auxiliary_loss_clip": 0.0107033, "auxiliary_loss_mlp": 0.01049403, "balance_loss_clip": 1.01974356, "balance_loss_mlp": 1.02046156, "epoch": 0.26346009319104163, "flos": 16945373802240.0, "grad_norm": 2.336024924939081, "language_loss": 0.83574873, "learning_rate": 3.454979881632595e-06, "loss": 0.85694611, "num_input_tokens_seen": 94710185, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.49804688, "step": 4382, "time_per_iteration": 2.3898627758026123 }, { "auxiliary_loss_clip": 0.01074245, "auxiliary_loss_mlp": 0.01053144, "balance_loss_clip": 1.01792955, "balance_loss_mlp": 1.01964223, "epoch": 0.2635202164437096, "flos": 37230714552960.0, "grad_norm": 1.9475356981253675, "language_loss": 0.71328592, "learning_rate": 3.4547126372427035e-06, "loss": 0.73455977, "num_input_tokens_seen": 94730280, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.546875, "step": 4383, "time_per_iteration": 2.5041112899780273 }, { "auxiliary_loss_clip": 0.01073093, "auxiliary_loss_mlp": 0.01048298, "balance_loss_clip": 1.01697028, "balance_loss_mlp": 1.02107728, "epoch": 0.26358033969637756, "flos": 20995298714880.0, "grad_norm": 2.4514888713762324, "language_loss": 0.71068275, "learning_rate": 3.4544453376899638e-06, "loss": 0.73189664, "num_input_tokens_seen": 94748560, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.51953125, "step": 4384, "time_per_iteration": 2.396869659423828 }, { "auxiliary_loss_clip": 0.01069869, "auxiliary_loss_mlp": 0.01053486, "balance_loss_clip": 1.02231228, "balance_loss_mlp": 1.02000129, "epoch": 0.26364046294904553, "flos": 27744847729920.0, "grad_norm": 2.4552271821809684, "language_loss": 0.71779841, "learning_rate": 3.45417798298451e-06, "loss": 0.73903197, "num_input_tokens_seen": 94767570, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.49804688, "step": 4385, "time_per_iteration": 2.4241960048675537 }, { "auxiliary_loss_clip": 0.01073885, "auxiliary_loss_mlp": 0.01047898, "balance_loss_clip": 1.01563978, "balance_loss_mlp": 1.02189493, "epoch": 0.2637005862017135, "flos": 22891990302720.0, "grad_norm": 1.8630245294285674, "language_loss": 0.85936046, "learning_rate": 3.453910573136482e-06, "loss": 0.88057828, "num_input_tokens_seen": 94784985, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.51953125, "step": 4386, "time_per_iteration": 2.402496337890625 }, { "auxiliary_loss_clip": 0.01074296, "auxiliary_loss_mlp": 0.01052495, "balance_loss_clip": 1.01720881, "balance_loss_mlp": 1.02113962, "epoch": 0.26376070945438146, "flos": 15047949075840.0, "grad_norm": 2.317443098218713, "language_loss": 0.78721803, "learning_rate": 3.4536431081560196e-06, "loss": 0.80848587, "num_input_tokens_seen": 94802545, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.53125, "step": 4387, "time_per_iteration": 2.3532235622406006 }, { "auxiliary_loss_clip": 0.0107399, "auxiliary_loss_mlp": 0.0105448, "balance_loss_clip": 1.02048147, "balance_loss_mlp": 1.02366519, "epoch": 0.2638208327070494, "flos": 21140781816960.0, "grad_norm": 1.913303795235619, "language_loss": 0.77323854, "learning_rate": 3.453375588053264e-06, "loss": 0.79452324, "num_input_tokens_seen": 94820730, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.50390625, "step": 4388, "time_per_iteration": 2.411665916442871 }, { "auxiliary_loss_clip": 0.01072425, "auxiliary_loss_mlp": 0.01049918, "balance_loss_clip": 1.01644444, "balance_loss_mlp": 1.02069473, "epoch": 0.26388095595971744, "flos": 21724529616000.0, "grad_norm": 2.0344393221731614, "language_loss": 0.87365133, "learning_rate": 3.4531080128383617e-06, "loss": 0.89487481, "num_input_tokens_seen": 94839175, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.515625, "step": 4389, "time_per_iteration": 2.3712828159332275 }, { "auxiliary_loss_clip": 0.010176, "auxiliary_loss_mlp": 0.01007884, "balance_loss_clip": 1.00325847, "balance_loss_mlp": 1.00271702, "epoch": 0.2639410792123854, "flos": 65512036368000.0, "grad_norm": 0.8087847824361902, "language_loss": 0.60417962, "learning_rate": 3.452840382521457e-06, "loss": 0.62443447, "num_input_tokens_seen": 94898865, "router_z_loss_clip": 0.04614258, "router_z_loss_mlp": 0.1484375, "step": 4390, "time_per_iteration": 3.0851640701293945 }, { "auxiliary_loss_clip": 0.01075767, "auxiliary_loss_mlp": 0.01052959, "balance_loss_clip": 1.01882911, "balance_loss_mlp": 1.02262926, "epoch": 0.2640012024650534, "flos": 23947519570560.0, "grad_norm": 1.6112758985339612, "language_loss": 0.78939849, "learning_rate": 3.4525726971127e-06, "loss": 0.81068575, "num_input_tokens_seen": 94917490, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.53125, "step": 4391, "time_per_iteration": 2.4139206409454346 }, { "auxiliary_loss_clip": 0.01017085, "auxiliary_loss_mlp": 0.01004083, "balance_loss_clip": 0.99900419, "balance_loss_mlp": 1.00304842, "epoch": 0.26406132571772134, "flos": 56437620451200.0, "grad_norm": 0.8816296268494779, "language_loss": 0.58848631, "learning_rate": 3.45230495662224e-06, "loss": 0.60869801, "num_input_tokens_seen": 94969065, "router_z_loss_clip": 0.05078125, "router_z_loss_mlp": 0.140625, "step": 4392, "time_per_iteration": 3.013625383377075 }, { "auxiliary_loss_clip": 0.01075899, "auxiliary_loss_mlp": 0.01056363, "balance_loss_clip": 1.0224483, "balance_loss_mlp": 1.02265227, "epoch": 0.2641214489703893, "flos": 22089476724480.0, "grad_norm": 1.768621624007672, "language_loss": 0.70007789, "learning_rate": 3.4520371610602306e-06, "loss": 0.7214005, "num_input_tokens_seen": 94988540, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.53125, "step": 4393, "time_per_iteration": 2.4076552391052246 }, { "auxiliary_loss_clip": 0.01077326, "auxiliary_loss_mlp": 0.01063317, "balance_loss_clip": 1.02533698, "balance_loss_mlp": 1.02198005, "epoch": 0.26418157222305727, "flos": 16543837722240.0, "grad_norm": 4.900983563578649, "language_loss": 0.85495836, "learning_rate": 3.4517693104368267e-06, "loss": 0.87636483, "num_input_tokens_seen": 95004810, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5546875, "step": 4394, "time_per_iteration": 2.4367597103118896 }, { "auxiliary_loss_clip": 0.01080281, "auxiliary_loss_mlp": 0.01062346, "balance_loss_clip": 1.02374578, "balance_loss_mlp": 1.02344549, "epoch": 0.26424169547572524, "flos": 18001566385920.0, "grad_norm": 3.0673260240288918, "language_loss": 0.71700317, "learning_rate": 3.4515014047621856e-06, "loss": 0.73842943, "num_input_tokens_seen": 95024085, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.5703125, "step": 4395, "time_per_iteration": 2.367868185043335 }, { "auxiliary_loss_clip": 0.01074548, "auxiliary_loss_mlp": 0.01059416, "balance_loss_clip": 1.0248816, "balance_loss_mlp": 1.02255666, "epoch": 0.2643018187283932, "flos": 16982207153280.0, "grad_norm": 1.779207680020714, "language_loss": 0.87638766, "learning_rate": 3.4512334440464655e-06, "loss": 0.89772725, "num_input_tokens_seen": 95042515, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.51953125, "step": 4396, "time_per_iteration": 2.4258487224578857 }, { "auxiliary_loss_clip": 0.01017505, "auxiliary_loss_mlp": 0.01008365, "balance_loss_clip": 1.00261879, "balance_loss_mlp": 1.0031302, "epoch": 0.26436194198106117, "flos": 59661396794880.0, "grad_norm": 0.7840055535939273, "language_loss": 0.55092192, "learning_rate": 3.4509654282998277e-06, "loss": 0.57118058, "num_input_tokens_seen": 95094835, "router_z_loss_clip": 0.05737305, "router_z_loss_mlp": 0.14355469, "step": 4397, "time_per_iteration": 2.796813488006592 }, { "auxiliary_loss_clip": 0.01073692, "auxiliary_loss_mlp": 0.0106266, "balance_loss_clip": 1.02794623, "balance_loss_mlp": 1.02209532, "epoch": 0.26442206523372913, "flos": 32920093451520.0, "grad_norm": 2.462275416721509, "language_loss": 0.79551864, "learning_rate": 3.450697357532435e-06, "loss": 0.81688207, "num_input_tokens_seen": 95113480, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.515625, "step": 4398, "time_per_iteration": 2.5414960384368896 }, { "auxiliary_loss_clip": 0.01077684, "auxiliary_loss_mlp": 0.01059664, "balance_loss_clip": 1.0235672, "balance_loss_mlp": 1.02437735, "epoch": 0.2644821884863971, "flos": 21030281763840.0, "grad_norm": 1.8795839434736703, "language_loss": 0.68680739, "learning_rate": 3.4504292317544534e-06, "loss": 0.7081809, "num_input_tokens_seen": 95132580, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.53125, "step": 4399, "time_per_iteration": 2.3847877979278564 }, { "auxiliary_loss_clip": 0.01067514, "auxiliary_loss_mlp": 0.01048005, "balance_loss_clip": 1.01803637, "balance_loss_mlp": 1.01967084, "epoch": 0.26454231173906506, "flos": 20775764885760.0, "grad_norm": 1.7245315170562143, "language_loss": 0.8779701, "learning_rate": 3.4501610509760504e-06, "loss": 0.89912528, "num_input_tokens_seen": 95152375, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.4765625, "step": 4400, "time_per_iteration": 2.4151861667633057 }, { "auxiliary_loss_clip": 0.01074036, "auxiliary_loss_mlp": 0.01063056, "balance_loss_clip": 1.02276313, "balance_loss_mlp": 1.02075052, "epoch": 0.264602434991733, "flos": 16617713892480.0, "grad_norm": 2.186771841995937, "language_loss": 0.78020489, "learning_rate": 3.4498928152073944e-06, "loss": 0.80157584, "num_input_tokens_seen": 95170265, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.53515625, "step": 4401, "time_per_iteration": 2.3639345169067383 }, { "auxiliary_loss_clip": 0.01075111, "auxiliary_loss_mlp": 0.0105754, "balance_loss_clip": 1.01893985, "balance_loss_mlp": 1.02072692, "epoch": 0.26466255824440105, "flos": 19061669041920.0, "grad_norm": 1.7835480232152183, "language_loss": 0.8918519, "learning_rate": 3.4496245244586577e-06, "loss": 0.91317844, "num_input_tokens_seen": 95188655, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.54296875, "step": 4402, "time_per_iteration": 2.4081978797912598 }, { "auxiliary_loss_clip": 0.01074722, "auxiliary_loss_mlp": 0.01048177, "balance_loss_clip": 1.01229477, "balance_loss_mlp": 1.02064991, "epoch": 0.264722681497069, "flos": 22637438513280.0, "grad_norm": 1.5945054664744287, "language_loss": 0.79567623, "learning_rate": 3.4493561787400137e-06, "loss": 0.8169052, "num_input_tokens_seen": 95209615, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.5390625, "step": 4403, "time_per_iteration": 2.3940067291259766 }, { "auxiliary_loss_clip": 0.01074963, "auxiliary_loss_mlp": 0.01049149, "balance_loss_clip": 1.01433945, "balance_loss_mlp": 1.02096355, "epoch": 0.264782804749737, "flos": 22491152449920.0, "grad_norm": 2.251026592938621, "language_loss": 0.89896894, "learning_rate": 3.4490877780616387e-06, "loss": 0.92021012, "num_input_tokens_seen": 95227810, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.5390625, "step": 4404, "time_per_iteration": 3.8373467922210693 }, { "auxiliary_loss_clip": 0.01073727, "auxiliary_loss_mlp": 0.01053911, "balance_loss_clip": 1.02284527, "balance_loss_mlp": 1.02107954, "epoch": 0.26484292800240494, "flos": 16799332118400.0, "grad_norm": 1.7512179372268681, "language_loss": 0.76998001, "learning_rate": 3.448819322433709e-06, "loss": 0.79125643, "num_input_tokens_seen": 95245890, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.52734375, "step": 4405, "time_per_iteration": 2.367297887802124 }, { "auxiliary_loss_clip": 0.01073361, "auxiliary_loss_mlp": 0.0104682, "balance_loss_clip": 1.01263118, "balance_loss_mlp": 1.02177894, "epoch": 0.2649030512550729, "flos": 20448523912320.0, "grad_norm": 1.8566187587673808, "language_loss": 0.71482456, "learning_rate": 3.4485508118664066e-06, "loss": 0.73602641, "num_input_tokens_seen": 95264955, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.515625, "step": 4406, "time_per_iteration": 2.3857016563415527 }, { "auxiliary_loss_clip": 0.01072359, "auxiliary_loss_mlp": 0.01048427, "balance_loss_clip": 1.01306939, "balance_loss_mlp": 1.0210526, "epoch": 0.2649631745077409, "flos": 22415111775360.0, "grad_norm": 1.9747583401611728, "language_loss": 0.8521868, "learning_rate": 3.448282246369912e-06, "loss": 0.87339461, "num_input_tokens_seen": 95284245, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.51171875, "step": 4407, "time_per_iteration": 2.4085209369659424 }, { "auxiliary_loss_clip": 0.0107254, "auxiliary_loss_mlp": 0.01044241, "balance_loss_clip": 1.01132739, "balance_loss_mlp": 1.0209434, "epoch": 0.26502329776040884, "flos": 35114663692800.0, "grad_norm": 1.6418963007304697, "language_loss": 0.77838743, "learning_rate": 3.4480136259544084e-06, "loss": 0.79955524, "num_input_tokens_seen": 95307125, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.515625, "step": 4408, "time_per_iteration": 4.6377458572387695 }, { "auxiliary_loss_clip": 0.01071699, "auxiliary_loss_mlp": 0.01047874, "balance_loss_clip": 1.01459098, "balance_loss_mlp": 1.02136874, "epoch": 0.2650834210130768, "flos": 38686069244160.0, "grad_norm": 2.004287653102171, "language_loss": 0.72387266, "learning_rate": 3.447744950630084e-06, "loss": 0.74506843, "num_input_tokens_seen": 95329150, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.5, "step": 4409, "time_per_iteration": 2.5407700538635254 }, { "auxiliary_loss_clip": 0.01076913, "auxiliary_loss_mlp": 0.01055827, "balance_loss_clip": 1.02182817, "balance_loss_mlp": 1.02410507, "epoch": 0.26514354426574477, "flos": 24715713415680.0, "grad_norm": 1.9199239050013146, "language_loss": 0.74880129, "learning_rate": 3.4474762204071253e-06, "loss": 0.77012873, "num_input_tokens_seen": 95349880, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.52734375, "step": 4410, "time_per_iteration": 3.8978984355926514 }, { "auxiliary_loss_clip": 0.01077251, "auxiliary_loss_mlp": 0.01048727, "balance_loss_clip": 1.01429951, "balance_loss_mlp": 1.02376747, "epoch": 0.26520366751841273, "flos": 20339001377280.0, "grad_norm": 1.8900529293633495, "language_loss": 0.74886668, "learning_rate": 3.4472074352957244e-06, "loss": 0.77012646, "num_input_tokens_seen": 95368570, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.53515625, "step": 4411, "time_per_iteration": 2.3954174518585205 }, { "auxiliary_loss_clip": 0.01074289, "auxiliary_loss_mlp": 0.01054588, "balance_loss_clip": 1.02023232, "balance_loss_mlp": 1.02279973, "epoch": 0.2652637907710807, "flos": 22342841527680.0, "grad_norm": 2.218615436977912, "language_loss": 0.83401418, "learning_rate": 3.446938595306071e-06, "loss": 0.85530299, "num_input_tokens_seen": 95387065, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.515625, "step": 4412, "time_per_iteration": 2.4159350395202637 }, { "auxiliary_loss_clip": 0.01074394, "auxiliary_loss_mlp": 0.01063184, "balance_loss_clip": 1.03125942, "balance_loss_mlp": 1.02262282, "epoch": 0.26532391402374866, "flos": 19353228739200.0, "grad_norm": 2.0289900916496015, "language_loss": 0.75693786, "learning_rate": 3.4466697004483622e-06, "loss": 0.77831364, "num_input_tokens_seen": 95406345, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.515625, "step": 4413, "time_per_iteration": 2.385753870010376 }, { "auxiliary_loss_clip": 0.01024476, "auxiliary_loss_mlp": 0.01007897, "balance_loss_clip": 1.00339067, "balance_loss_mlp": 1.01096463, "epoch": 0.26538403727641663, "flos": 44784800138880.0, "grad_norm": 0.8872707873771541, "language_loss": 0.56966448, "learning_rate": 3.446400750732793e-06, "loss": 0.58998823, "num_input_tokens_seen": 95463595, "router_z_loss_clip": 0.04516602, "router_z_loss_mlp": 0.13476562, "step": 4414, "time_per_iteration": 2.9851279258728027 }, { "auxiliary_loss_clip": 0.01070528, "auxiliary_loss_mlp": 0.01054276, "balance_loss_clip": 1.02347219, "balance_loss_mlp": 1.02178693, "epoch": 0.26544416052908465, "flos": 28180913011200.0, "grad_norm": 1.9745228663093242, "language_loss": 0.75394487, "learning_rate": 3.4461317461695625e-06, "loss": 0.77519286, "num_input_tokens_seen": 95484115, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.48632812, "step": 4415, "time_per_iteration": 2.4561593532562256 }, { "auxiliary_loss_clip": 0.01074639, "auxiliary_loss_mlp": 0.01062094, "balance_loss_clip": 1.02330303, "balance_loss_mlp": 1.02116525, "epoch": 0.2655042837817526, "flos": 17564349029760.0, "grad_norm": 2.759697729214561, "language_loss": 0.88711846, "learning_rate": 3.4458626867688707e-06, "loss": 0.90848577, "num_input_tokens_seen": 95501435, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.53515625, "step": 4416, "time_per_iteration": 2.404308319091797 }, { "auxiliary_loss_clip": 0.01074088, "auxiliary_loss_mlp": 0.01056198, "balance_loss_clip": 1.02160299, "balance_loss_mlp": 1.02150428, "epoch": 0.2655644070344206, "flos": 23403502765440.0, "grad_norm": 1.8502565706068381, "language_loss": 0.78403437, "learning_rate": 3.4455935725409217e-06, "loss": 0.80533731, "num_input_tokens_seen": 95520135, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.5234375, "step": 4417, "time_per_iteration": 2.420186758041382 }, { "auxiliary_loss_clip": 0.01071369, "auxiliary_loss_mlp": 0.01059749, "balance_loss_clip": 1.02462959, "balance_loss_mlp": 1.02081275, "epoch": 0.26562453028708854, "flos": 26467271015040.0, "grad_norm": 1.6302959759438682, "language_loss": 0.81383401, "learning_rate": 3.4453244034959196e-06, "loss": 0.83514524, "num_input_tokens_seen": 95541705, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.50390625, "step": 4418, "time_per_iteration": 2.495574474334717 }, { "auxiliary_loss_clip": 0.01074878, "auxiliary_loss_mlp": 0.01069858, "balance_loss_clip": 1.03206873, "balance_loss_mlp": 1.02226973, "epoch": 0.2656846535397565, "flos": 19206593562240.0, "grad_norm": 2.346353920187919, "language_loss": 0.68998063, "learning_rate": 3.445055179644071e-06, "loss": 0.71142799, "num_input_tokens_seen": 95560300, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5234375, "step": 4419, "time_per_iteration": 2.402921438217163 }, { "auxiliary_loss_clip": 0.01074616, "auxiliary_loss_mlp": 0.01062022, "balance_loss_clip": 1.02600896, "balance_loss_mlp": 1.02175593, "epoch": 0.2657447767924245, "flos": 30550119206400.0, "grad_norm": 1.7564905694905417, "language_loss": 0.80253053, "learning_rate": 3.444785900995585e-06, "loss": 0.82389688, "num_input_tokens_seen": 95580150, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.53125, "step": 4420, "time_per_iteration": 2.478043794631958 }, { "auxiliary_loss_clip": 0.01075533, "auxiliary_loss_mlp": 0.01059594, "balance_loss_clip": 1.02175736, "balance_loss_mlp": 1.0220356, "epoch": 0.26580490004509244, "flos": 20921701835520.0, "grad_norm": 1.9267863479778327, "language_loss": 0.82908463, "learning_rate": 3.444516567560673e-06, "loss": 0.85043597, "num_input_tokens_seen": 95597570, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.53515625, "step": 4421, "time_per_iteration": 2.3880460262298584 }, { "auxiliary_loss_clip": 0.01071816, "auxiliary_loss_mlp": 0.01055726, "balance_loss_clip": 1.0253756, "balance_loss_mlp": 1.02150404, "epoch": 0.2658650232977604, "flos": 43943988798720.0, "grad_norm": 1.552903800572558, "language_loss": 0.67600667, "learning_rate": 3.444247179349548e-06, "loss": 0.69728208, "num_input_tokens_seen": 95619415, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.50390625, "step": 4422, "time_per_iteration": 2.614163637161255 }, { "auxiliary_loss_clip": 0.01075837, "auxiliary_loss_mlp": 0.01054379, "balance_loss_clip": 1.01887822, "balance_loss_mlp": 1.0220325, "epoch": 0.26592514655042837, "flos": 29715136197120.0, "grad_norm": 2.654681800080909, "language_loss": 0.7659986, "learning_rate": 3.4439777363724252e-06, "loss": 0.78730071, "num_input_tokens_seen": 95639155, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.5390625, "step": 4423, "time_per_iteration": 2.4544618129730225 }, { "auxiliary_loss_clip": 0.01073284, "auxiliary_loss_mlp": 0.01053484, "balance_loss_clip": 1.01755476, "balance_loss_mlp": 1.02083123, "epoch": 0.26598526980309634, "flos": 46676082332160.0, "grad_norm": 1.7945235670677797, "language_loss": 0.79370296, "learning_rate": 3.443708238639522e-06, "loss": 0.81497061, "num_input_tokens_seen": 95663320, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.5234375, "step": 4424, "time_per_iteration": 2.609229803085327 }, { "auxiliary_loss_clip": 0.01075593, "auxiliary_loss_mlp": 0.01051831, "balance_loss_clip": 1.01823831, "balance_loss_mlp": 1.02316391, "epoch": 0.2660453930557643, "flos": 11508663841920.0, "grad_norm": 2.994664917255243, "language_loss": 0.80922914, "learning_rate": 3.4434386861610573e-06, "loss": 0.83050334, "num_input_tokens_seen": 95680260, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.5234375, "step": 4425, "time_per_iteration": 2.352877378463745 }, { "auxiliary_loss_clip": 0.01074698, "auxiliary_loss_mlp": 0.01049209, "balance_loss_clip": 1.0179044, "balance_loss_mlp": 1.02443743, "epoch": 0.26610551630843227, "flos": 24790392547200.0, "grad_norm": 1.6550601586860452, "language_loss": 0.82597041, "learning_rate": 3.4431690789472532e-06, "loss": 0.84720945, "num_input_tokens_seen": 95701140, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.50390625, "step": 4426, "time_per_iteration": 2.455880880355835 }, { "auxiliary_loss_clip": 0.01079919, "auxiliary_loss_mlp": 0.01056946, "balance_loss_clip": 1.02104044, "balance_loss_mlp": 1.0272522, "epoch": 0.26616563956110023, "flos": 27635150638080.0, "grad_norm": 1.5485942954486374, "language_loss": 0.78378457, "learning_rate": 3.442899417008333e-06, "loss": 0.80515325, "num_input_tokens_seen": 95722060, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.5234375, "step": 4427, "time_per_iteration": 2.4552979469299316 }, { "auxiliary_loss_clip": 0.01073168, "auxiliary_loss_mlp": 0.0104355, "balance_loss_clip": 1.01138711, "balance_loss_mlp": 1.02380967, "epoch": 0.26622576281376825, "flos": 28361728275840.0, "grad_norm": 2.0327135138317836, "language_loss": 0.7775349, "learning_rate": 3.4426297003545227e-06, "loss": 0.79870206, "num_input_tokens_seen": 95742495, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.49414062, "step": 4428, "time_per_iteration": 2.4775850772857666 }, { "auxiliary_loss_clip": 0.0107767, "auxiliary_loss_mlp": 0.01050879, "balance_loss_clip": 1.01807213, "balance_loss_mlp": 1.02430558, "epoch": 0.2662858860664362, "flos": 18040354773120.0, "grad_norm": 2.0649971471335435, "language_loss": 0.85101199, "learning_rate": 3.4423599289960495e-06, "loss": 0.87229753, "num_input_tokens_seen": 95761510, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.53125, "step": 4429, "time_per_iteration": 2.3745672702789307 }, { "auxiliary_loss_clip": 0.01073933, "auxiliary_loss_mlp": 0.01050148, "balance_loss_clip": 1.01672196, "balance_loss_mlp": 1.02361727, "epoch": 0.2663460093191042, "flos": 22744761632640.0, "grad_norm": 1.683707102786659, "language_loss": 0.74085599, "learning_rate": 3.442090102943143e-06, "loss": 0.76209676, "num_input_tokens_seen": 95782385, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.50390625, "step": 4430, "time_per_iteration": 2.4466476440429688 }, { "auxiliary_loss_clip": 0.01078158, "auxiliary_loss_mlp": 0.01061735, "balance_loss_clip": 1.02623463, "balance_loss_mlp": 1.02445412, "epoch": 0.26640613257177215, "flos": 16507842243840.0, "grad_norm": 2.4170130626429827, "language_loss": 0.83435285, "learning_rate": 3.441820222206035e-06, "loss": 0.85575181, "num_input_tokens_seen": 95800595, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.53515625, "step": 4431, "time_per_iteration": 2.372795581817627 }, { "auxiliary_loss_clip": 0.01080335, "auxiliary_loss_mlp": 0.01057496, "balance_loss_clip": 1.02226925, "balance_loss_mlp": 1.02573395, "epoch": 0.2664662558244401, "flos": 23074830426240.0, "grad_norm": 2.737064242022605, "language_loss": 0.78595757, "learning_rate": 3.44155028679496e-06, "loss": 0.80733585, "num_input_tokens_seen": 95818480, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.546875, "step": 4432, "time_per_iteration": 2.440993309020996 }, { "auxiliary_loss_clip": 0.01075815, "auxiliary_loss_mlp": 0.01047164, "balance_loss_clip": 1.01280808, "balance_loss_mlp": 1.02452064, "epoch": 0.2665263790771081, "flos": 23768135671680.0, "grad_norm": 3.250746465679421, "language_loss": 0.84568697, "learning_rate": 3.441280296720154e-06, "loss": 0.86691678, "num_input_tokens_seen": 95837205, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.51171875, "step": 4433, "time_per_iteration": 2.411165475845337 }, { "auxiliary_loss_clip": 0.01074335, "auxiliary_loss_mlp": 0.01059823, "balance_loss_clip": 1.02648067, "balance_loss_mlp": 1.0246104, "epoch": 0.26658650232977604, "flos": 28000027923840.0, "grad_norm": 1.9573983007248246, "language_loss": 0.7815389, "learning_rate": 3.441010251991854e-06, "loss": 0.80288053, "num_input_tokens_seen": 95858395, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.49609375, "step": 4434, "time_per_iteration": 2.463564872741699 }, { "auxiliary_loss_clip": 0.01074167, "auxiliary_loss_mlp": 0.01059514, "balance_loss_clip": 1.02556348, "balance_loss_mlp": 1.02247226, "epoch": 0.266646625582444, "flos": 22162549933440.0, "grad_norm": 1.8337714829246643, "language_loss": 0.84080172, "learning_rate": 3.440740152620301e-06, "loss": 0.86213857, "num_input_tokens_seen": 95877875, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.515625, "step": 4435, "time_per_iteration": 2.3972725868225098 }, { "auxiliary_loss_clip": 0.01077376, "auxiliary_loss_mlp": 0.01066691, "balance_loss_clip": 1.02847242, "balance_loss_mlp": 1.02363467, "epoch": 0.266706748835112, "flos": 27852345406080.0, "grad_norm": 2.187311084003557, "language_loss": 0.90432, "learning_rate": 3.4404699986157376e-06, "loss": 0.92576063, "num_input_tokens_seen": 95895820, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.5390625, "step": 4436, "time_per_iteration": 2.4430606365203857 }, { "auxiliary_loss_clip": 0.01075429, "auxiliary_loss_mlp": 0.01059241, "balance_loss_clip": 1.02481389, "balance_loss_mlp": 1.02281094, "epoch": 0.26676687208777994, "flos": 25810938766080.0, "grad_norm": 1.3288860175014947, "language_loss": 0.79482567, "learning_rate": 3.440199789988407e-06, "loss": 0.81617242, "num_input_tokens_seen": 95918025, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.52734375, "step": 4437, "time_per_iteration": 2.4560909271240234 }, { "auxiliary_loss_clip": 0.01074356, "auxiliary_loss_mlp": 0.01057928, "balance_loss_clip": 1.02493072, "balance_loss_mlp": 1.02272642, "epoch": 0.2668269953404479, "flos": 36063114220800.0, "grad_norm": 2.2494981972854404, "language_loss": 0.66014171, "learning_rate": 3.439929526748556e-06, "loss": 0.68146455, "num_input_tokens_seen": 95937725, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.515625, "step": 4438, "time_per_iteration": 2.521247625350952 }, { "auxiliary_loss_clip": 0.01073864, "auxiliary_loss_mlp": 0.01061089, "balance_loss_clip": 1.0264945, "balance_loss_mlp": 1.02229762, "epoch": 0.26688711859311587, "flos": 26569985834880.0, "grad_norm": 1.7495985651751287, "language_loss": 0.77063626, "learning_rate": 3.4396592089064334e-06, "loss": 0.79198581, "num_input_tokens_seen": 95956335, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.515625, "step": 4439, "time_per_iteration": 2.4206268787384033 }, { "auxiliary_loss_clip": 0.01073698, "auxiliary_loss_mlp": 0.01054294, "balance_loss_clip": 1.0179354, "balance_loss_mlp": 1.02175355, "epoch": 0.26694724184578383, "flos": 26760331900800.0, "grad_norm": 1.6381865952208705, "language_loss": 0.7312457, "learning_rate": 3.4393888364722897e-06, "loss": 0.75252557, "num_input_tokens_seen": 95977135, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.51953125, "step": 4440, "time_per_iteration": 2.4655959606170654 }, { "auxiliary_loss_clip": 0.01076438, "auxiliary_loss_mlp": 0.01063172, "balance_loss_clip": 1.02855372, "balance_loss_mlp": 1.02334976, "epoch": 0.2670073650984518, "flos": 20958535186560.0, "grad_norm": 1.9375921722768221, "language_loss": 0.68463135, "learning_rate": 3.439118409456376e-06, "loss": 0.70602745, "num_input_tokens_seen": 95995435, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.53125, "step": 4441, "time_per_iteration": 2.39318585395813 }, { "auxiliary_loss_clip": 0.01073786, "auxiliary_loss_mlp": 0.01058619, "balance_loss_clip": 1.02145004, "balance_loss_mlp": 1.02222848, "epoch": 0.2670674883511198, "flos": 28364800475520.0, "grad_norm": 3.3109606620341627, "language_loss": 0.77490914, "learning_rate": 3.4388479278689486e-06, "loss": 0.79623324, "num_input_tokens_seen": 96016340, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.515625, "step": 4442, "time_per_iteration": 2.461061716079712 }, { "auxiliary_loss_clip": 0.01026944, "auxiliary_loss_mlp": 0.01017329, "balance_loss_clip": 1.01284671, "balance_loss_mlp": 1.01342559, "epoch": 0.2671276116037878, "flos": 58968370840320.0, "grad_norm": 0.9336348364823988, "language_loss": 0.61309135, "learning_rate": 3.4385773917202637e-06, "loss": 0.63353407, "num_input_tokens_seen": 96071205, "router_z_loss_clip": 0.04492188, "router_z_loss_mlp": 0.13476562, "step": 4443, "time_per_iteration": 2.966085433959961 }, { "auxiliary_loss_clip": 0.01073006, "auxiliary_loss_mlp": 0.01053569, "balance_loss_clip": 1.01907039, "balance_loss_mlp": 1.02146423, "epoch": 0.26718773485645575, "flos": 43943395305600.0, "grad_norm": 1.5314309560568364, "language_loss": 0.77793545, "learning_rate": 3.4383068010205793e-06, "loss": 0.79920125, "num_input_tokens_seen": 96094240, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.515625, "step": 4444, "time_per_iteration": 4.022387266159058 }, { "auxiliary_loss_clip": 0.01076122, "auxiliary_loss_mlp": 0.01050261, "balance_loss_clip": 1.01633406, "balance_loss_mlp": 1.02389503, "epoch": 0.2672478581091237, "flos": 25227156055680.0, "grad_norm": 1.727499854727591, "language_loss": 0.81925899, "learning_rate": 3.438036155780158e-06, "loss": 0.84052277, "num_input_tokens_seen": 96114105, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.51953125, "step": 4445, "time_per_iteration": 2.430450677871704 }, { "auxiliary_loss_clip": 0.01076681, "auxiliary_loss_mlp": 0.01051722, "balance_loss_clip": 1.01581645, "balance_loss_mlp": 1.0235095, "epoch": 0.2673079813617917, "flos": 15267273436800.0, "grad_norm": 2.0308988675332786, "language_loss": 0.91210425, "learning_rate": 3.43776545600926e-06, "loss": 0.93338835, "num_input_tokens_seen": 96132140, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.53125, "step": 4446, "time_per_iteration": 2.380800724029541 }, { "auxiliary_loss_clip": 0.01077024, "auxiliary_loss_mlp": 0.01061186, "balance_loss_clip": 1.02847505, "balance_loss_mlp": 1.02514422, "epoch": 0.26736810461445965, "flos": 25811532259200.0, "grad_norm": 2.2042589045568275, "language_loss": 0.69300926, "learning_rate": 3.437494701718153e-06, "loss": 0.71439135, "num_input_tokens_seen": 96152090, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.51953125, "step": 4447, "time_per_iteration": 3.8453664779663086 }, { "auxiliary_loss_clip": 0.01079396, "auxiliary_loss_mlp": 0.01060789, "balance_loss_clip": 1.02586031, "balance_loss_mlp": 1.02532041, "epoch": 0.2674282278671276, "flos": 24311663717760.0, "grad_norm": 2.5769621217621386, "language_loss": 0.84273052, "learning_rate": 3.4372238929171026e-06, "loss": 0.8641324, "num_input_tokens_seen": 96170015, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.5390625, "step": 4448, "time_per_iteration": 3.9560627937316895 }, { "auxiliary_loss_clip": 0.01076408, "auxiliary_loss_mlp": 0.0107401, "balance_loss_clip": 1.04051208, "balance_loss_mlp": 1.02504373, "epoch": 0.2674883511197956, "flos": 22814553173760.0, "grad_norm": 1.695952249700283, "language_loss": 0.85830009, "learning_rate": 3.436953029616378e-06, "loss": 0.87980425, "num_input_tokens_seen": 96188065, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.515625, "step": 4449, "time_per_iteration": 3.826409339904785 }, { "auxiliary_loss_clip": 0.01080747, "auxiliary_loss_mlp": 0.01066122, "balance_loss_clip": 1.02492332, "balance_loss_mlp": 1.02448535, "epoch": 0.26754847437246354, "flos": 25369113110400.0, "grad_norm": 1.6772338586879099, "language_loss": 0.85174739, "learning_rate": 3.4366821118262506e-06, "loss": 0.87321609, "num_input_tokens_seen": 96205780, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 0.5625, "step": 4450, "time_per_iteration": 2.4365384578704834 }, { "auxiliary_loss_clip": 0.01074248, "auxiliary_loss_mlp": 0.01057021, "balance_loss_clip": 1.02359438, "balance_loss_mlp": 1.02365041, "epoch": 0.2676085976251315, "flos": 20229374108160.0, "grad_norm": 1.7679951003885779, "language_loss": 0.81655002, "learning_rate": 3.4364111395569937e-06, "loss": 0.83786273, "num_input_tokens_seen": 96224990, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.5078125, "step": 4451, "time_per_iteration": 2.398487091064453 }, { "auxiliary_loss_clip": 0.01076089, "auxiliary_loss_mlp": 0.01061989, "balance_loss_clip": 1.02913499, "balance_loss_mlp": 1.02547097, "epoch": 0.26766872087779947, "flos": 28036966008960.0, "grad_norm": 1.6172904477095364, "language_loss": 0.87669969, "learning_rate": 3.436140112818882e-06, "loss": 0.89808053, "num_input_tokens_seen": 96245345, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.50390625, "step": 4452, "time_per_iteration": 2.4805595874786377 }, { "auxiliary_loss_clip": 0.01078851, "auxiliary_loss_mlp": 0.01065783, "balance_loss_clip": 1.02489471, "balance_loss_mlp": 1.02411878, "epoch": 0.26772884413046744, "flos": 18324408528000.0, "grad_norm": 2.3303143714187105, "language_loss": 0.85209334, "learning_rate": 3.435869031622194e-06, "loss": 0.87353969, "num_input_tokens_seen": 96259000, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.546875, "step": 4453, "time_per_iteration": 2.385587692260742 }, { "auxiliary_loss_clip": 0.0107913, "auxiliary_loss_mlp": 0.01072828, "balance_loss_clip": 1.03580189, "balance_loss_mlp": 1.02596557, "epoch": 0.2677889673831354, "flos": 22126414809600.0, "grad_norm": 1.5232494554021099, "language_loss": 0.80689251, "learning_rate": 3.435597895977208e-06, "loss": 0.82841206, "num_input_tokens_seen": 96277000, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.53125, "step": 4454, "time_per_iteration": 2.424654245376587 }, { "auxiliary_loss_clip": 0.01077221, "auxiliary_loss_mlp": 0.01059077, "balance_loss_clip": 1.02503037, "balance_loss_mlp": 1.02315319, "epoch": 0.2678490906358034, "flos": 23728649057280.0, "grad_norm": 1.5613152582371714, "language_loss": 0.73622394, "learning_rate": 3.435326705894206e-06, "loss": 0.75758696, "num_input_tokens_seen": 96297010, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.5390625, "step": 4455, "time_per_iteration": 2.4213528633117676 }, { "auxiliary_loss_clip": 0.01072984, "auxiliary_loss_mlp": 0.01053904, "balance_loss_clip": 1.02081132, "balance_loss_mlp": 1.02242017, "epoch": 0.2679092138884714, "flos": 21761781903360.0, "grad_norm": 1.6779732380198227, "language_loss": 0.74429232, "learning_rate": 3.435055461383471e-06, "loss": 0.76556122, "num_input_tokens_seen": 96315780, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.50390625, "step": 4456, "time_per_iteration": 2.4432196617126465 }, { "auxiliary_loss_clip": 0.01077355, "auxiliary_loss_mlp": 0.01058231, "balance_loss_clip": 1.02017975, "balance_loss_mlp": 1.02231002, "epoch": 0.26796933714113935, "flos": 19860272547840.0, "grad_norm": 3.384174829339199, "language_loss": 0.72601771, "learning_rate": 3.4347841624552896e-06, "loss": 0.74737358, "num_input_tokens_seen": 96333465, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.55078125, "step": 4457, "time_per_iteration": 2.392247438430786 }, { "auxiliary_loss_clip": 0.01076928, "auxiliary_loss_mlp": 0.01055336, "balance_loss_clip": 1.01776135, "balance_loss_mlp": 1.02344418, "epoch": 0.2680294603938073, "flos": 20046848186880.0, "grad_norm": 2.331580297025153, "language_loss": 0.8047809, "learning_rate": 3.4345128091199493e-06, "loss": 0.82610351, "num_input_tokens_seen": 96352005, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.53515625, "step": 4458, "time_per_iteration": 2.4061574935913086 }, { "auxiliary_loss_clip": 0.01018423, "auxiliary_loss_mlp": 0.01022965, "balance_loss_clip": 1.01836383, "balance_loss_mlp": 1.00553381, "epoch": 0.2680895836464753, "flos": 72110237172480.0, "grad_norm": 0.8859083138563465, "language_loss": 0.58771718, "learning_rate": 3.434241401387739e-06, "loss": 0.60813105, "num_input_tokens_seen": 96406265, "router_z_loss_clip": 0.04589844, "router_z_loss_mlp": 0.12890625, "step": 4459, "time_per_iteration": 3.0255489349365234 }, { "auxiliary_loss_clip": 0.01074195, "auxiliary_loss_mlp": 0.01054024, "balance_loss_clip": 1.01899993, "balance_loss_mlp": 1.02200794, "epoch": 0.26814970689914325, "flos": 20448000241920.0, "grad_norm": 2.3141396385503206, "language_loss": 0.8615886, "learning_rate": 3.4339699392689507e-06, "loss": 0.88287079, "num_input_tokens_seen": 96425225, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.5234375, "step": 4460, "time_per_iteration": 2.4130585193634033 }, { "auxiliary_loss_clip": 0.01076231, "auxiliary_loss_mlp": 0.01054698, "balance_loss_clip": 1.01767194, "balance_loss_mlp": 1.02306306, "epoch": 0.2682098301518112, "flos": 17565710572800.0, "grad_norm": 1.914329414146881, "language_loss": 0.69021732, "learning_rate": 3.4336984227738796e-06, "loss": 0.71152663, "num_input_tokens_seen": 96443780, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.53125, "step": 4461, "time_per_iteration": 2.3903214931488037 }, { "auxiliary_loss_clip": 0.01076994, "auxiliary_loss_mlp": 0.01062374, "balance_loss_clip": 1.02515686, "balance_loss_mlp": 1.02425134, "epoch": 0.2682699534044792, "flos": 18332263584000.0, "grad_norm": 1.5639224771243898, "language_loss": 0.69220281, "learning_rate": 3.43342685191282e-06, "loss": 0.71359646, "num_input_tokens_seen": 96464530, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.52734375, "step": 4462, "time_per_iteration": 2.4480509757995605 }, { "auxiliary_loss_clip": 0.01080018, "auxiliary_loss_mlp": 0.0105767, "balance_loss_clip": 1.02076316, "balance_loss_mlp": 1.02702665, "epoch": 0.26833007665714714, "flos": 25300124530560.0, "grad_norm": 1.8127175192101044, "language_loss": 0.70817077, "learning_rate": 3.4331552266960705e-06, "loss": 0.72954762, "num_input_tokens_seen": 96483345, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.53125, "step": 4463, "time_per_iteration": 2.438394069671631 }, { "auxiliary_loss_clip": 0.01085028, "auxiliary_loss_mlp": 0.0105985, "balance_loss_clip": 1.02294326, "balance_loss_mlp": 1.02934098, "epoch": 0.2683901999098151, "flos": 16099044600960.0, "grad_norm": 2.428844848218226, "language_loss": 0.79834807, "learning_rate": 3.432883547133931e-06, "loss": 0.81979692, "num_input_tokens_seen": 96498305, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.5546875, "step": 4464, "time_per_iteration": 2.372387409210205 }, { "auxiliary_loss_clip": 0.01080816, "auxiliary_loss_mlp": 0.01053587, "balance_loss_clip": 1.01892102, "balance_loss_mlp": 1.02788556, "epoch": 0.2684503231624831, "flos": 27306827412480.0, "grad_norm": 1.7586771045861913, "language_loss": 0.72028899, "learning_rate": 3.432611813236704e-06, "loss": 0.74163294, "num_input_tokens_seen": 96519740, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.53125, "step": 4465, "time_per_iteration": 2.471996545791626 }, { "auxiliary_loss_clip": 0.01028311, "auxiliary_loss_mlp": 0.01005431, "balance_loss_clip": 1.001616, "balance_loss_mlp": 1.01575828, "epoch": 0.26851044641515104, "flos": 71854498396800.0, "grad_norm": 0.6997889838820115, "language_loss": 0.53265154, "learning_rate": 3.4323400250146943e-06, "loss": 0.55298895, "num_input_tokens_seen": 96588870, "router_z_loss_clip": 0.03808594, "router_z_loss_mlp": 0.125, "step": 4466, "time_per_iteration": 3.1587204933166504 }, { "auxiliary_loss_clip": 0.01081618, "auxiliary_loss_mlp": 0.01067708, "balance_loss_clip": 1.03211176, "balance_loss_mlp": 1.02966428, "epoch": 0.268570569667819, "flos": 18732787234560.0, "grad_norm": 2.0038914978239992, "language_loss": 0.75368607, "learning_rate": 3.4320681824782057e-06, "loss": 0.77517933, "num_input_tokens_seen": 96605100, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.51953125, "step": 4467, "time_per_iteration": 2.387817859649658 }, { "auxiliary_loss_clip": 0.01080591, "auxiliary_loss_mlp": 0.01067379, "balance_loss_clip": 1.02932787, "balance_loss_mlp": 1.02597451, "epoch": 0.268630692920487, "flos": 18177633705600.0, "grad_norm": 2.2426441861507493, "language_loss": 0.82623041, "learning_rate": 3.4317962856375493e-06, "loss": 0.84771013, "num_input_tokens_seen": 96621410, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.546875, "step": 4468, "time_per_iteration": 2.3790152072906494 }, { "auxiliary_loss_clip": 0.01022344, "auxiliary_loss_mlp": 0.01006427, "balance_loss_clip": 1.00189745, "balance_loss_mlp": 1.00919986, "epoch": 0.268690816173155, "flos": 68728025612160.0, "grad_norm": 0.8554368110100365, "language_loss": 0.59697372, "learning_rate": 3.4315243345030334e-06, "loss": 0.61726141, "num_input_tokens_seen": 96684810, "router_z_loss_clip": 0.04541016, "router_z_loss_mlp": 0.13085938, "step": 4469, "time_per_iteration": 3.117133378982544 }, { "auxiliary_loss_clip": 0.01079794, "auxiliary_loss_mlp": 0.01071111, "balance_loss_clip": 1.03286827, "balance_loss_mlp": 1.02656472, "epoch": 0.26875093942582295, "flos": 23292548864640.0, "grad_norm": 2.5505715315572544, "language_loss": 0.83655512, "learning_rate": 3.431252329084972e-06, "loss": 0.85806417, "num_input_tokens_seen": 96701920, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.53125, "step": 4470, "time_per_iteration": 2.4219741821289062 }, { "auxiliary_loss_clip": 0.01071548, "auxiliary_loss_mlp": 0.01060694, "balance_loss_clip": 1.02548003, "balance_loss_mlp": 1.0214268, "epoch": 0.2688110626784909, "flos": 21542387719680.0, "grad_norm": 2.343131633392283, "language_loss": 0.84107673, "learning_rate": 3.4309802693936786e-06, "loss": 0.8623991, "num_input_tokens_seen": 96721260, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.5, "step": 4471, "time_per_iteration": 2.408945322036743 }, { "auxiliary_loss_clip": 0.01072913, "auxiliary_loss_mlp": 0.01060384, "balance_loss_clip": 1.02791131, "balance_loss_mlp": 1.02242017, "epoch": 0.2688711859311589, "flos": 28399399499520.0, "grad_norm": 2.020575159230114, "language_loss": 0.71991956, "learning_rate": 3.43070815543947e-06, "loss": 0.74125254, "num_input_tokens_seen": 96740385, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.50390625, "step": 4472, "time_per_iteration": 2.497464418411255 }, { "auxiliary_loss_clip": 0.01073874, "auxiliary_loss_mlp": 0.01067186, "balance_loss_clip": 1.03433156, "balance_loss_mlp": 1.02280831, "epoch": 0.26893130918382685, "flos": 25993743978240.0, "grad_norm": 2.415709645356201, "language_loss": 0.69308305, "learning_rate": 3.4304359872326656e-06, "loss": 0.71449363, "num_input_tokens_seen": 96761860, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.51171875, "step": 4473, "time_per_iteration": 2.4493463039398193 }, { "auxiliary_loss_clip": 0.01073068, "auxiliary_loss_mlp": 0.01063133, "balance_loss_clip": 1.02791834, "balance_loss_mlp": 1.02249312, "epoch": 0.2689914324364948, "flos": 20338582440960.0, "grad_norm": 1.8119295038376493, "language_loss": 0.85421842, "learning_rate": 3.4301637647835843e-06, "loss": 0.87558043, "num_input_tokens_seen": 96781890, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.5078125, "step": 4474, "time_per_iteration": 2.4423153400421143 }, { "auxiliary_loss_clip": 0.01072493, "auxiliary_loss_mlp": 0.01058728, "balance_loss_clip": 1.0254209, "balance_loss_mlp": 1.02226901, "epoch": 0.2690515556891628, "flos": 19463519324160.0, "grad_norm": 2.76222023987949, "language_loss": 0.72076267, "learning_rate": 3.4298914881025494e-06, "loss": 0.74207485, "num_input_tokens_seen": 96800390, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.50390625, "step": 4475, "time_per_iteration": 2.3792662620544434 }, { "auxiliary_loss_clip": 0.01075785, "auxiliary_loss_mlp": 0.01063422, "balance_loss_clip": 1.02880383, "balance_loss_mlp": 1.02338982, "epoch": 0.26911167894183075, "flos": 18145757767680.0, "grad_norm": 1.9595815190776056, "language_loss": 0.74868435, "learning_rate": 3.4296191571998863e-06, "loss": 0.77007639, "num_input_tokens_seen": 96816685, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.5234375, "step": 4476, "time_per_iteration": 2.3939402103424072 }, { "auxiliary_loss_clip": 0.01074089, "auxiliary_loss_mlp": 0.01052672, "balance_loss_clip": 1.01993704, "balance_loss_mlp": 1.02321672, "epoch": 0.2691718021944987, "flos": 19974089180160.0, "grad_norm": 1.5765396288246891, "language_loss": 0.81733245, "learning_rate": 3.429346772085922e-06, "loss": 0.83860004, "num_input_tokens_seen": 96836285, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.5078125, "step": 4477, "time_per_iteration": 2.4235072135925293 }, { "auxiliary_loss_clip": 0.01078389, "auxiliary_loss_mlp": 0.01054993, "balance_loss_clip": 1.0207088, "balance_loss_mlp": 1.0244801, "epoch": 0.2692319254471667, "flos": 37445814639360.0, "grad_norm": 1.704638653942128, "language_loss": 0.67204201, "learning_rate": 3.429074332770984e-06, "loss": 0.69337583, "num_input_tokens_seen": 96857745, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.5390625, "step": 4478, "time_per_iteration": 2.5652217864990234 }, { "auxiliary_loss_clip": 0.01080727, "auxiliary_loss_mlp": 0.01048847, "balance_loss_clip": 1.01537347, "balance_loss_mlp": 1.02854323, "epoch": 0.26929204869983464, "flos": 22126694100480.0, "grad_norm": 3.2210995123961093, "language_loss": 0.82587636, "learning_rate": 3.4288018392654047e-06, "loss": 0.84717202, "num_input_tokens_seen": 96877295, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.5234375, "step": 4479, "time_per_iteration": 2.423214912414551 }, { "auxiliary_loss_clip": 0.01081965, "auxiliary_loss_mlp": 0.0105682, "balance_loss_clip": 1.02260745, "balance_loss_mlp": 1.02865601, "epoch": 0.2693521719525026, "flos": 19791772727040.0, "grad_norm": 2.9552735491991955, "language_loss": 0.82037961, "learning_rate": 3.4285292915795166e-06, "loss": 0.84176755, "num_input_tokens_seen": 96896160, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.53125, "step": 4480, "time_per_iteration": 2.4368138313293457 }, { "auxiliary_loss_clip": 0.01077264, "auxiliary_loss_mlp": 0.01042853, "balance_loss_clip": 1.01376545, "balance_loss_mlp": 1.02781487, "epoch": 0.2694122952051706, "flos": 20993378590080.0, "grad_norm": 1.7845991042314484, "language_loss": 0.78819978, "learning_rate": 3.4282566897236543e-06, "loss": 0.80940092, "num_input_tokens_seen": 96915410, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.49414062, "step": 4481, "time_per_iteration": 2.4556541442871094 }, { "auxiliary_loss_clip": 0.01083016, "auxiliary_loss_mlp": 0.01055658, "balance_loss_clip": 1.02075386, "balance_loss_mlp": 1.03005326, "epoch": 0.2694724184578386, "flos": 25848086319360.0, "grad_norm": 1.7632311172867987, "language_loss": 0.7543394, "learning_rate": 3.4279840337081547e-06, "loss": 0.77572608, "num_input_tokens_seen": 96937865, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.53125, "step": 4482, "time_per_iteration": 2.5267765522003174 }, { "auxiliary_loss_clip": 0.01086978, "auxiliary_loss_mlp": 0.01049001, "balance_loss_clip": 1.01348829, "balance_loss_mlp": 1.03258836, "epoch": 0.26953254171050656, "flos": 21725856247680.0, "grad_norm": 4.110515363310741, "language_loss": 0.73834264, "learning_rate": 3.4277113235433584e-06, "loss": 0.75970244, "num_input_tokens_seen": 96957710, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.546875, "step": 4483, "time_per_iteration": 2.4514079093933105 }, { "auxiliary_loss_clip": 0.0108564, "auxiliary_loss_mlp": 0.01060549, "balance_loss_clip": 1.02156734, "balance_loss_mlp": 1.03053987, "epoch": 0.2695926649631745, "flos": 19681901078400.0, "grad_norm": 2.66010145519583, "language_loss": 0.88473737, "learning_rate": 3.427438559239605e-06, "loss": 0.90619928, "num_input_tokens_seen": 96975890, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.5546875, "step": 4484, "time_per_iteration": 3.8702521324157715 }, { "auxiliary_loss_clip": 0.01084432, "auxiliary_loss_mlp": 0.01052531, "balance_loss_clip": 1.01755512, "balance_loss_mlp": 1.03027821, "epoch": 0.2696527882158425, "flos": 32885319870720.0, "grad_norm": 1.4716760769728843, "language_loss": 0.68070889, "learning_rate": 3.427165740807239e-06, "loss": 0.70207852, "num_input_tokens_seen": 96998595, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.54296875, "step": 4485, "time_per_iteration": 2.5176010131835938 }, { "auxiliary_loss_clip": 0.01083507, "auxiliary_loss_mlp": 0.01062697, "balance_loss_clip": 1.02600431, "balance_loss_mlp": 1.02957714, "epoch": 0.26971291146851045, "flos": 12124182844800.0, "grad_norm": 9.037358991890008, "language_loss": 0.7522223, "learning_rate": 3.426892868256604e-06, "loss": 0.77368432, "num_input_tokens_seen": 97013715, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.5390625, "step": 4486, "time_per_iteration": 2.42329478263855 }, { "auxiliary_loss_clip": 0.01087311, "auxiliary_loss_mlp": 0.01057575, "balance_loss_clip": 1.02066755, "balance_loss_mlp": 1.0308125, "epoch": 0.2697730347211784, "flos": 22633458618240.0, "grad_norm": 1.910318987145019, "language_loss": 0.85289031, "learning_rate": 3.4266199415980495e-06, "loss": 0.87433916, "num_input_tokens_seen": 97031570, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.56640625, "step": 4487, "time_per_iteration": 5.267353296279907 }, { "auxiliary_loss_clip": 0.01081948, "auxiliary_loss_mlp": 0.01059956, "balance_loss_clip": 1.02266765, "balance_loss_mlp": 1.02810609, "epoch": 0.2698331579738464, "flos": 23511943048320.0, "grad_norm": 2.205742482182975, "language_loss": 0.74087733, "learning_rate": 3.4263469608419234e-06, "loss": 0.76229644, "num_input_tokens_seen": 97049815, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5390625, "step": 4488, "time_per_iteration": 2.444610118865967 }, { "auxiliary_loss_clip": 0.01081115, "auxiliary_loss_mlp": 0.0105983, "balance_loss_clip": 1.0229466, "balance_loss_mlp": 1.02765942, "epoch": 0.26989328122651435, "flos": 24639986943360.0, "grad_norm": 1.7270330148667488, "language_loss": 0.84642744, "learning_rate": 3.426073925998578e-06, "loss": 0.86783695, "num_input_tokens_seen": 97067570, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.53515625, "step": 4489, "time_per_iteration": 3.884119749069214 }, { "auxiliary_loss_clip": 0.01079627, "auxiliary_loss_mlp": 0.01059643, "balance_loss_clip": 1.02316475, "balance_loss_mlp": 1.02570987, "epoch": 0.2699534044791823, "flos": 10771996821120.0, "grad_norm": 2.80775985968407, "language_loss": 0.91354632, "learning_rate": 3.4258008370783656e-06, "loss": 0.93493903, "num_input_tokens_seen": 97082180, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.5390625, "step": 4490, "time_per_iteration": 2.378265619277954 }, { "auxiliary_loss_clip": 0.01074776, "auxiliary_loss_mlp": 0.01052924, "balance_loss_clip": 1.02083325, "balance_loss_mlp": 1.02428544, "epoch": 0.2700135277318503, "flos": 36170192960640.0, "grad_norm": 1.766931230042687, "language_loss": 0.74646431, "learning_rate": 3.4255276940916434e-06, "loss": 0.76774138, "num_input_tokens_seen": 97103470, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.50390625, "step": 4491, "time_per_iteration": 2.5096938610076904 }, { "auxiliary_loss_clip": 0.01078741, "auxiliary_loss_mlp": 0.01056906, "balance_loss_clip": 1.02125072, "balance_loss_mlp": 1.02590334, "epoch": 0.27007365098451824, "flos": 17417713852800.0, "grad_norm": 2.1684216070802793, "language_loss": 0.75891864, "learning_rate": 3.4252544970487676e-06, "loss": 0.78027511, "num_input_tokens_seen": 97118100, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.52734375, "step": 4492, "time_per_iteration": 2.414543867111206 }, { "auxiliary_loss_clip": 0.01074656, "auxiliary_loss_mlp": 0.01055445, "balance_loss_clip": 1.02108872, "balance_loss_mlp": 1.02340496, "epoch": 0.2701337742371862, "flos": 23184562429440.0, "grad_norm": 1.895995877568906, "language_loss": 0.90445864, "learning_rate": 3.4249812459600986e-06, "loss": 0.92575967, "num_input_tokens_seen": 97136765, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.51171875, "step": 4493, "time_per_iteration": 2.46675705909729 }, { "auxiliary_loss_clip": 0.01074662, "auxiliary_loss_mlp": 0.01049996, "balance_loss_clip": 1.01799965, "balance_loss_mlp": 1.0241704, "epoch": 0.2701938974898542, "flos": 24388297885440.0, "grad_norm": 1.6250921841986714, "language_loss": 0.72315037, "learning_rate": 3.424707940835998e-06, "loss": 0.74439692, "num_input_tokens_seen": 97157470, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.50390625, "step": 4494, "time_per_iteration": 2.4872848987579346 }, { "auxiliary_loss_clip": 0.01071786, "auxiliary_loss_mlp": 0.01047835, "balance_loss_clip": 1.01476693, "balance_loss_mlp": 1.02084494, "epoch": 0.2702540207425222, "flos": 26213103250560.0, "grad_norm": 1.9829476952183305, "language_loss": 0.879803, "learning_rate": 3.42443458168683e-06, "loss": 0.90099919, "num_input_tokens_seen": 97176905, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.5078125, "step": 4495, "time_per_iteration": 2.446254014968872 }, { "auxiliary_loss_clip": 0.01073262, "auxiliary_loss_mlp": 0.01056747, "balance_loss_clip": 1.02234316, "balance_loss_mlp": 1.02271485, "epoch": 0.27031414399519016, "flos": 22925367429120.0, "grad_norm": 2.982129866319449, "language_loss": 0.77635396, "learning_rate": 3.424161168522959e-06, "loss": 0.79765403, "num_input_tokens_seen": 97196380, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.5078125, "step": 4496, "time_per_iteration": 2.419271469116211 }, { "auxiliary_loss_clip": 0.01020045, "auxiliary_loss_mlp": 0.01003056, "balance_loss_clip": 0.99943256, "balance_loss_mlp": 1.00717783, "epoch": 0.2703742672478581, "flos": 63016759653120.0, "grad_norm": 0.6918861217157642, "language_loss": 0.50203735, "learning_rate": 3.423887701354754e-06, "loss": 0.52226835, "num_input_tokens_seen": 97260100, "router_z_loss_clip": 0.03613281, "router_z_loss_mlp": 0.12890625, "step": 4497, "time_per_iteration": 3.0774805545806885 }, { "auxiliary_loss_clip": 0.01077812, "auxiliary_loss_mlp": 0.01053876, "balance_loss_clip": 1.02135563, "balance_loss_mlp": 1.02604985, "epoch": 0.2704343905005261, "flos": 18839900885760.0, "grad_norm": 1.682706776823039, "language_loss": 0.74006504, "learning_rate": 3.4236141801925847e-06, "loss": 0.76138198, "num_input_tokens_seen": 97277935, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.515625, "step": 4498, "time_per_iteration": 2.4216818809509277 }, { "auxiliary_loss_clip": 0.01016139, "auxiliary_loss_mlp": 0.01003936, "balance_loss_clip": 1.00057387, "balance_loss_mlp": 1.00361872, "epoch": 0.27049451375319405, "flos": 71230042085760.0, "grad_norm": 0.8910268459121352, "language_loss": 0.59243906, "learning_rate": 3.4233406050468237e-06, "loss": 0.61263978, "num_input_tokens_seen": 97338845, "router_z_loss_clip": 0.03369141, "router_z_loss_mlp": 0.125, "step": 4499, "time_per_iteration": 3.0479917526245117 }, { "auxiliary_loss_clip": 0.01078042, "auxiliary_loss_mlp": 0.01049707, "balance_loss_clip": 1.01516044, "balance_loss_mlp": 1.02662921, "epoch": 0.270554637005862, "flos": 24277483630080.0, "grad_norm": 1.931993567951779, "language_loss": 0.74916184, "learning_rate": 3.4230669759278438e-06, "loss": 0.77043933, "num_input_tokens_seen": 97356640, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.515625, "step": 4500, "time_per_iteration": 2.4671826362609863 }, { "auxiliary_loss_clip": 0.01074717, "auxiliary_loss_mlp": 0.01051005, "balance_loss_clip": 1.01905644, "balance_loss_mlp": 1.02319431, "epoch": 0.27061476025853, "flos": 17631557130240.0, "grad_norm": 2.5575588375060825, "language_loss": 0.83810151, "learning_rate": 3.4227932928460215e-06, "loss": 0.85935879, "num_input_tokens_seen": 97372585, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.515625, "step": 4501, "time_per_iteration": 2.403182029724121 }, { "auxiliary_loss_clip": 0.01077232, "auxiliary_loss_mlp": 0.01051335, "balance_loss_clip": 1.01767039, "balance_loss_mlp": 1.02515996, "epoch": 0.27067488351119795, "flos": 22709045445120.0, "grad_norm": 1.710333638194561, "language_loss": 0.7375862, "learning_rate": 3.422519555811735e-06, "loss": 0.75887185, "num_input_tokens_seen": 97393315, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.51953125, "step": 4502, "time_per_iteration": 2.41780686378479 }, { "auxiliary_loss_clip": 0.01079017, "auxiliary_loss_mlp": 0.0105623, "balance_loss_clip": 1.02008581, "balance_loss_mlp": 1.02509737, "epoch": 0.2707350067638659, "flos": 41717996467200.0, "grad_norm": 1.8494952723065698, "language_loss": 0.69741744, "learning_rate": 3.4222457648353642e-06, "loss": 0.71876997, "num_input_tokens_seen": 97417860, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.5390625, "step": 4503, "time_per_iteration": 2.5837082862854004 }, { "auxiliary_loss_clip": 0.01082196, "auxiliary_loss_mlp": 0.01050135, "balance_loss_clip": 1.01738811, "balance_loss_mlp": 1.02953386, "epoch": 0.2707951300165339, "flos": 20192017086720.0, "grad_norm": 1.9089302828992918, "language_loss": 0.69690812, "learning_rate": 3.4219719199272918e-06, "loss": 0.71823138, "num_input_tokens_seen": 97436780, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.52734375, "step": 4504, "time_per_iteration": 2.4206793308258057 }, { "auxiliary_loss_clip": 0.01079783, "auxiliary_loss_mlp": 0.01063174, "balance_loss_clip": 1.0268867, "balance_loss_mlp": 1.02898526, "epoch": 0.27085525326920185, "flos": 21432900096000.0, "grad_norm": 1.8145306230033325, "language_loss": 0.7702781, "learning_rate": 3.421698021097902e-06, "loss": 0.79170763, "num_input_tokens_seen": 97456190, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.5078125, "step": 4505, "time_per_iteration": 2.4123997688293457 }, { "auxiliary_loss_clip": 0.01083769, "auxiliary_loss_mlp": 0.01059484, "balance_loss_clip": 1.02364945, "balance_loss_mlp": 1.02801538, "epoch": 0.2709153765218698, "flos": 17674290501120.0, "grad_norm": 2.5958396934877848, "language_loss": 0.75593239, "learning_rate": 3.42142406835758e-06, "loss": 0.77736497, "num_input_tokens_seen": 97474545, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.55859375, "step": 4506, "time_per_iteration": 2.403510332107544 }, { "auxiliary_loss_clip": 0.01080021, "auxiliary_loss_mlp": 0.01052819, "balance_loss_clip": 1.01684129, "balance_loss_mlp": 1.02641344, "epoch": 0.2709754997745378, "flos": 24455261606400.0, "grad_norm": 1.9094190771261172, "language_loss": 0.81652892, "learning_rate": 3.421150061716715e-06, "loss": 0.83785737, "num_input_tokens_seen": 97494520, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.53515625, "step": 4507, "time_per_iteration": 2.4387030601501465 }, { "auxiliary_loss_clip": 0.01025602, "auxiliary_loss_mlp": 0.01016443, "balance_loss_clip": 1.01191342, "balance_loss_mlp": 1.01312232, "epoch": 0.2710356230272058, "flos": 65207664201600.0, "grad_norm": 0.7363581958317527, "language_loss": 0.50947499, "learning_rate": 3.420876001185698e-06, "loss": 0.52989548, "num_input_tokens_seen": 97552455, "router_z_loss_clip": 0.04541016, "router_z_loss_mlp": 0.125, "step": 4508, "time_per_iteration": 2.931016683578491 }, { "auxiliary_loss_clip": 0.01079671, "auxiliary_loss_mlp": 0.01050071, "balance_loss_clip": 1.0195775, "balance_loss_mlp": 1.02962184, "epoch": 0.27109574627987376, "flos": 25483243944960.0, "grad_norm": 1.822730300280252, "language_loss": 0.76877695, "learning_rate": 3.4206018867749197e-06, "loss": 0.79007435, "num_input_tokens_seen": 97572650, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.5, "step": 4509, "time_per_iteration": 2.4285154342651367 }, { "auxiliary_loss_clip": 0.01074396, "auxiliary_loss_mlp": 0.01049305, "balance_loss_clip": 1.02076674, "balance_loss_mlp": 1.02619195, "epoch": 0.2711558695325417, "flos": 19681761432960.0, "grad_norm": 1.8908230522729756, "language_loss": 0.7290684, "learning_rate": 3.4203277184947757e-06, "loss": 0.75030541, "num_input_tokens_seen": 97591150, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.48242188, "step": 4510, "time_per_iteration": 2.4095330238342285 }, { "auxiliary_loss_clip": 0.01077008, "auxiliary_loss_mlp": 0.0105392, "balance_loss_clip": 1.02287769, "balance_loss_mlp": 1.02815747, "epoch": 0.2712159927852097, "flos": 18586780462080.0, "grad_norm": 3.072833760295191, "language_loss": 0.72004628, "learning_rate": 3.4200534963556627e-06, "loss": 0.74135554, "num_input_tokens_seen": 97607410, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.48828125, "step": 4511, "time_per_iteration": 2.386728048324585 }, { "auxiliary_loss_clip": 0.01079318, "auxiliary_loss_mlp": 0.01054646, "balance_loss_clip": 1.02067173, "balance_loss_mlp": 1.0276897, "epoch": 0.27127611603787766, "flos": 25629041249280.0, "grad_norm": 2.0577878869843813, "language_loss": 0.82825398, "learning_rate": 3.419779220367979e-06, "loss": 0.84959364, "num_input_tokens_seen": 97626870, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.515625, "step": 4512, "time_per_iteration": 2.5171661376953125 }, { "auxiliary_loss_clip": 0.01075303, "auxiliary_loss_mlp": 0.01054633, "balance_loss_clip": 1.02461588, "balance_loss_mlp": 1.0268147, "epoch": 0.2713362392905456, "flos": 23147833812480.0, "grad_norm": 1.7933555223172841, "language_loss": 0.8205657, "learning_rate": 3.419504890542124e-06, "loss": 0.84186506, "num_input_tokens_seen": 97646595, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.484375, "step": 4513, "time_per_iteration": 2.401294231414795 }, { "auxiliary_loss_clip": 0.01078061, "auxiliary_loss_mlp": 0.01067517, "balance_loss_clip": 1.03738046, "balance_loss_mlp": 1.02585375, "epoch": 0.2713963625432136, "flos": 18365151951360.0, "grad_norm": 3.743266820654981, "language_loss": 0.90130258, "learning_rate": 3.4192305068885026e-06, "loss": 0.92275834, "num_input_tokens_seen": 97665485, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.5234375, "step": 4514, "time_per_iteration": 2.4261319637298584 }, { "auxiliary_loss_clip": 0.01074818, "auxiliary_loss_mlp": 0.01068169, "balance_loss_clip": 1.03768682, "balance_loss_mlp": 1.02528644, "epoch": 0.27145648579588155, "flos": 22490663690880.0, "grad_norm": 1.672920709130401, "language_loss": 0.93105006, "learning_rate": 3.418956069417517e-06, "loss": 0.95247996, "num_input_tokens_seen": 97683800, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.49609375, "step": 4515, "time_per_iteration": 2.401991128921509 }, { "auxiliary_loss_clip": 0.01077396, "auxiliary_loss_mlp": 0.01086759, "balance_loss_clip": 1.04863632, "balance_loss_mlp": 1.02416754, "epoch": 0.2715166090485495, "flos": 19238329854720.0, "grad_norm": 2.5977939703651862, "language_loss": 0.76070333, "learning_rate": 3.4186815781395756e-06, "loss": 0.78234494, "num_input_tokens_seen": 97700505, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.53125, "step": 4516, "time_per_iteration": 2.39780592918396 }, { "auxiliary_loss_clip": 0.01073903, "auxiliary_loss_mlp": 0.01074457, "balance_loss_clip": 1.04072094, "balance_loss_mlp": 1.023211, "epoch": 0.2715767323012175, "flos": 17708714968320.0, "grad_norm": 2.0632615351501036, "language_loss": 0.77473772, "learning_rate": 3.4184070330650866e-06, "loss": 0.79622126, "num_input_tokens_seen": 97717410, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.5078125, "step": 4517, "time_per_iteration": 2.3561458587646484 }, { "auxiliary_loss_clip": 0.01072683, "auxiliary_loss_mlp": 0.01067538, "balance_loss_clip": 1.03377807, "balance_loss_mlp": 1.02249765, "epoch": 0.27163685555388545, "flos": 22381734648960.0, "grad_norm": 2.496365147501928, "language_loss": 0.80259621, "learning_rate": 3.4181324342044607e-06, "loss": 0.82399845, "num_input_tokens_seen": 97734545, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.5, "step": 4518, "time_per_iteration": 2.422555446624756 }, { "auxiliary_loss_clip": 0.01073405, "auxiliary_loss_mlp": 0.01065717, "balance_loss_clip": 1.03174233, "balance_loss_mlp": 1.0236094, "epoch": 0.2716969788065534, "flos": 22345599525120.0, "grad_norm": 1.704977610776915, "language_loss": 0.70343524, "learning_rate": 3.41785778156811e-06, "loss": 0.72482646, "num_input_tokens_seen": 97754000, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.49804688, "step": 4519, "time_per_iteration": 2.4186031818389893 }, { "auxiliary_loss_clip": 0.01073087, "auxiliary_loss_mlp": 0.01055701, "balance_loss_clip": 1.02463543, "balance_loss_mlp": 1.02304232, "epoch": 0.2717571020592214, "flos": 25227295701120.0, "grad_norm": 1.867034231520625, "language_loss": 0.76696157, "learning_rate": 3.417583075166451e-06, "loss": 0.78824937, "num_input_tokens_seen": 97772080, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.5, "step": 4520, "time_per_iteration": 2.5162932872772217 }, { "auxiliary_loss_clip": 0.01077464, "auxiliary_loss_mlp": 0.01065299, "balance_loss_clip": 1.02853489, "balance_loss_mlp": 1.02491355, "epoch": 0.2718172253118894, "flos": 20188840152960.0, "grad_norm": 2.0569901582916135, "language_loss": 0.78802919, "learning_rate": 3.4173083150099e-06, "loss": 0.80945683, "num_input_tokens_seen": 97789370, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.52734375, "step": 4521, "time_per_iteration": 2.3844425678253174 }, { "auxiliary_loss_clip": 0.01077254, "auxiliary_loss_mlp": 0.01057545, "balance_loss_clip": 1.02132916, "balance_loss_mlp": 1.02410066, "epoch": 0.27187734856455736, "flos": 14318264327040.0, "grad_norm": 2.3880216575140705, "language_loss": 0.76926148, "learning_rate": 3.417033501108875e-06, "loss": 0.79060954, "num_input_tokens_seen": 97807385, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.53125, "step": 4522, "time_per_iteration": 2.3921091556549072 }, { "auxiliary_loss_clip": 0.01079402, "auxiliary_loss_mlp": 0.01061662, "balance_loss_clip": 1.02566135, "balance_loss_mlp": 1.0262413, "epoch": 0.27193747181722533, "flos": 21106566817920.0, "grad_norm": 2.2704248152842665, "language_loss": 0.73483682, "learning_rate": 3.416758633473798e-06, "loss": 0.75624752, "num_input_tokens_seen": 97827930, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.53125, "step": 4523, "time_per_iteration": 2.414151191711426 }, { "auxiliary_loss_clip": 0.01075233, "auxiliary_loss_mlp": 0.01053098, "balance_loss_clip": 1.02000523, "balance_loss_mlp": 1.02573907, "epoch": 0.2719975950698933, "flos": 19681761432960.0, "grad_norm": 1.5506806396223207, "language_loss": 0.76025397, "learning_rate": 3.4164837121150915e-06, "loss": 0.78153729, "num_input_tokens_seen": 97847440, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.49414062, "step": 4524, "time_per_iteration": 3.8186089992523193 }, { "auxiliary_loss_clip": 0.01078752, "auxiliary_loss_mlp": 0.01056242, "balance_loss_clip": 1.02090847, "balance_loss_mlp": 1.02692854, "epoch": 0.27205771832256126, "flos": 24753314816640.0, "grad_norm": 1.7764797010703288, "language_loss": 0.77810705, "learning_rate": 3.4162087370431803e-06, "loss": 0.79945695, "num_input_tokens_seen": 97867620, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.515625, "step": 4525, "time_per_iteration": 2.4182326793670654 }, { "auxiliary_loss_clip": 0.01082182, "auxiliary_loss_mlp": 0.01053792, "balance_loss_clip": 1.0204134, "balance_loss_mlp": 1.02999878, "epoch": 0.2721178415752292, "flos": 21754694897280.0, "grad_norm": 1.990814982584223, "language_loss": 0.8294397, "learning_rate": 3.4159337082684926e-06, "loss": 0.85079944, "num_input_tokens_seen": 97884345, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.51953125, "step": 4526, "time_per_iteration": 3.9255146980285645 }, { "auxiliary_loss_clip": 0.01085959, "auxiliary_loss_mlp": 0.01053482, "balance_loss_clip": 1.01411951, "balance_loss_mlp": 1.0298636, "epoch": 0.2721779648278972, "flos": 12676019794560.0, "grad_norm": 6.776218229718864, "language_loss": 0.79732746, "learning_rate": 3.4156586258014566e-06, "loss": 0.81872189, "num_input_tokens_seen": 97901500, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.5625, "step": 4527, "time_per_iteration": 3.936542510986328 }, { "auxiliary_loss_clip": 0.01082677, "auxiliary_loss_mlp": 0.01062736, "balance_loss_clip": 1.02461314, "balance_loss_mlp": 1.02907193, "epoch": 0.27223808808056515, "flos": 16252278024960.0, "grad_norm": 2.0163594122793858, "language_loss": 0.83486831, "learning_rate": 3.415383489652503e-06, "loss": 0.85632241, "num_input_tokens_seen": 97917800, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.5390625, "step": 4528, "time_per_iteration": 2.3741657733917236 }, { "auxiliary_loss_clip": 0.01084581, "auxiliary_loss_mlp": 0.01061499, "balance_loss_clip": 1.02676105, "balance_loss_mlp": 1.03191161, "epoch": 0.2722982113332331, "flos": 27744568439040.0, "grad_norm": 1.73664397854751, "language_loss": 0.78255868, "learning_rate": 3.4151082998320666e-06, "loss": 0.80401945, "num_input_tokens_seen": 97937225, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.5234375, "step": 4529, "time_per_iteration": 3.897963762283325 }, { "auxiliary_loss_clip": 0.01082755, "auxiliary_loss_mlp": 0.01066294, "balance_loss_clip": 1.03141296, "balance_loss_mlp": 1.02816391, "epoch": 0.2723583345859011, "flos": 21725158020480.0, "grad_norm": 1.894639729751049, "language_loss": 0.84480059, "learning_rate": 3.4148330563505805e-06, "loss": 0.86629105, "num_input_tokens_seen": 97956845, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.546875, "step": 4530, "time_per_iteration": 2.4287378787994385 }, { "auxiliary_loss_clip": 0.01081808, "auxiliary_loss_mlp": 0.01065314, "balance_loss_clip": 1.02883637, "balance_loss_mlp": 1.02884376, "epoch": 0.27241845783856905, "flos": 17346316389120.0, "grad_norm": 2.355760461405491, "language_loss": 0.92989618, "learning_rate": 3.4145577592184838e-06, "loss": 0.95136738, "num_input_tokens_seen": 97972465, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.53125, "step": 4531, "time_per_iteration": 2.3752546310424805 }, { "auxiliary_loss_clip": 0.0108292, "auxiliary_loss_mlp": 0.01066953, "balance_loss_clip": 1.03126204, "balance_loss_mlp": 1.02827084, "epoch": 0.272478581091237, "flos": 24753140259840.0, "grad_norm": 1.8883606464291378, "language_loss": 0.77884221, "learning_rate": 3.4142824084462155e-06, "loss": 0.80034095, "num_input_tokens_seen": 97990770, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.546875, "step": 4532, "time_per_iteration": 2.4509592056274414 }, { "auxiliary_loss_clip": 0.01077989, "auxiliary_loss_mlp": 0.01057181, "balance_loss_clip": 1.02299142, "balance_loss_mlp": 1.02768898, "epoch": 0.272538704343905, "flos": 17889774612480.0, "grad_norm": 2.948359497429654, "language_loss": 0.90018177, "learning_rate": 3.4140070040442162e-06, "loss": 0.92153347, "num_input_tokens_seen": 98005775, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.50390625, "step": 4533, "time_per_iteration": 2.372528553009033 }, { "auxiliary_loss_clip": 0.0107568, "auxiliary_loss_mlp": 0.01054149, "balance_loss_clip": 1.02029419, "balance_loss_mlp": 1.02618897, "epoch": 0.272598827596573, "flos": 22930848512640.0, "grad_norm": 1.7758313058288122, "language_loss": 0.72729278, "learning_rate": 3.413731546022929e-06, "loss": 0.74859101, "num_input_tokens_seen": 98025750, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.49609375, "step": 4534, "time_per_iteration": 2.449655532836914 }, { "auxiliary_loss_clip": 0.01077707, "auxiliary_loss_mlp": 0.01057441, "balance_loss_clip": 1.02012849, "balance_loss_mlp": 1.02443922, "epoch": 0.27265895084924097, "flos": 24237403522560.0, "grad_norm": 1.6377728823454702, "language_loss": 0.92334455, "learning_rate": 3.4134560343928005e-06, "loss": 0.94469601, "num_input_tokens_seen": 98044955, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.53125, "step": 4535, "time_per_iteration": 2.42824125289917 }, { "auxiliary_loss_clip": 0.01077804, "auxiliary_loss_mlp": 0.01064223, "balance_loss_clip": 1.02538431, "balance_loss_mlp": 1.02382863, "epoch": 0.27271907410190893, "flos": 27012020958720.0, "grad_norm": 4.270012466075696, "language_loss": 0.74073404, "learning_rate": 3.4131804691642778e-06, "loss": 0.76215428, "num_input_tokens_seen": 98065860, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.5390625, "step": 4536, "time_per_iteration": 2.4511008262634277 }, { "auxiliary_loss_clip": 0.01073062, "auxiliary_loss_mlp": 0.01049422, "balance_loss_clip": 1.01644921, "balance_loss_mlp": 1.02218676, "epoch": 0.2727791973545769, "flos": 34451349171840.0, "grad_norm": 2.0255679489029212, "language_loss": 0.73427844, "learning_rate": 3.41290485034781e-06, "loss": 0.7555033, "num_input_tokens_seen": 98085450, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.5078125, "step": 4537, "time_per_iteration": 2.487445831298828 }, { "auxiliary_loss_clip": 0.01073827, "auxiliary_loss_mlp": 0.01049396, "balance_loss_clip": 1.01506364, "balance_loss_mlp": 1.02224541, "epoch": 0.27283932060724486, "flos": 15041036626560.0, "grad_norm": 2.1225739545966227, "language_loss": 0.79116029, "learning_rate": 3.4126291779538485e-06, "loss": 0.81239247, "num_input_tokens_seen": 98099115, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.515625, "step": 4538, "time_per_iteration": 2.3808610439300537 }, { "auxiliary_loss_clip": 0.01070491, "auxiliary_loss_mlp": 0.01054308, "balance_loss_clip": 1.02038074, "balance_loss_mlp": 1.02007771, "epoch": 0.2728994438599128, "flos": 21651351672960.0, "grad_norm": 1.4811048464126957, "language_loss": 0.91031158, "learning_rate": 3.412353451992847e-06, "loss": 0.9315595, "num_input_tokens_seen": 98118415, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.50390625, "step": 4539, "time_per_iteration": 2.39631724357605 }, { "auxiliary_loss_clip": 0.01072708, "auxiliary_loss_mlp": 0.01050276, "balance_loss_clip": 1.01654005, "balance_loss_mlp": 1.02248812, "epoch": 0.2729595671125808, "flos": 17487610128000.0, "grad_norm": 2.04523959865395, "language_loss": 0.89499307, "learning_rate": 3.4120776724752607e-06, "loss": 0.91622293, "num_input_tokens_seen": 98136300, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.5, "step": 4540, "time_per_iteration": 2.3650896549224854 }, { "auxiliary_loss_clip": 0.01073516, "auxiliary_loss_mlp": 0.01053399, "balance_loss_clip": 1.02066386, "balance_loss_mlp": 1.02122808, "epoch": 0.27301969036524876, "flos": 19317128526720.0, "grad_norm": 1.9326391379780228, "language_loss": 0.83551776, "learning_rate": 3.4118018394115476e-06, "loss": 0.85678691, "num_input_tokens_seen": 98154580, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.5234375, "step": 4541, "time_per_iteration": 2.3827805519104004 }, { "auxiliary_loss_clip": 0.01071894, "auxiliary_loss_mlp": 0.01052808, "balance_loss_clip": 1.01892853, "balance_loss_mlp": 1.02109551, "epoch": 0.2730798136179167, "flos": 21064706231040.0, "grad_norm": 2.0817007098897937, "language_loss": 0.80788666, "learning_rate": 3.4115259528121678e-06, "loss": 0.82913363, "num_input_tokens_seen": 98173115, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.5078125, "step": 4542, "time_per_iteration": 2.391402006149292 }, { "auxiliary_loss_clip": 0.0107347, "auxiliary_loss_mlp": 0.01048504, "balance_loss_clip": 1.01607871, "balance_loss_mlp": 1.02258825, "epoch": 0.2731399368705847, "flos": 19170737729280.0, "grad_norm": 2.138247718852104, "language_loss": 0.91592449, "learning_rate": 3.411250012687582e-06, "loss": 0.93714428, "num_input_tokens_seen": 98190260, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.51171875, "step": 4543, "time_per_iteration": 2.431884288787842 }, { "auxiliary_loss_clip": 0.01075727, "auxiliary_loss_mlp": 0.01056541, "balance_loss_clip": 1.02020574, "balance_loss_mlp": 1.02220595, "epoch": 0.27320006012325265, "flos": 18289320744960.0, "grad_norm": 2.0814661169806428, "language_loss": 0.65256983, "learning_rate": 3.410974019048255e-06, "loss": 0.67389256, "num_input_tokens_seen": 98207115, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.53515625, "step": 4544, "time_per_iteration": 2.357912302017212 }, { "auxiliary_loss_clip": 0.01075215, "auxiliary_loss_mlp": 0.01056019, "balance_loss_clip": 1.01930308, "balance_loss_mlp": 1.02306557, "epoch": 0.2732601833759206, "flos": 34859483498880.0, "grad_norm": 3.1949960012761203, "language_loss": 0.71193635, "learning_rate": 3.410697971904651e-06, "loss": 0.73324865, "num_input_tokens_seen": 98230610, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.51953125, "step": 4545, "time_per_iteration": 2.521660089492798 }, { "auxiliary_loss_clip": 0.01019208, "auxiliary_loss_mlp": 0.0101022, "balance_loss_clip": 1.0060004, "balance_loss_mlp": 1.0059526, "epoch": 0.2733203066285886, "flos": 53907709800960.0, "grad_norm": 0.7357713910684953, "language_loss": 0.61624885, "learning_rate": 3.4104218712672383e-06, "loss": 0.63654315, "num_input_tokens_seen": 98293585, "router_z_loss_clip": 0.04223633, "router_z_loss_mlp": 0.1328125, "step": 4546, "time_per_iteration": 3.0820863246917725 }, { "auxiliary_loss_clip": 0.01078071, "auxiliary_loss_mlp": 0.01052716, "balance_loss_clip": 1.02107739, "balance_loss_mlp": 1.02726269, "epoch": 0.2733804298812566, "flos": 20659539369600.0, "grad_norm": 1.772739420437474, "language_loss": 0.66485536, "learning_rate": 3.410145717146488e-06, "loss": 0.68616325, "num_input_tokens_seen": 98311680, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.5078125, "step": 4547, "time_per_iteration": 2.412407398223877 }, { "auxiliary_loss_clip": 0.01073108, "auxiliary_loss_mlp": 0.01050315, "balance_loss_clip": 1.0194633, "balance_loss_mlp": 1.02437842, "epoch": 0.27344055313392457, "flos": 25883174102400.0, "grad_norm": 2.2206728760946026, "language_loss": 0.798527, "learning_rate": 3.4098695095528694e-06, "loss": 0.81976116, "num_input_tokens_seen": 98330770, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.48828125, "step": 4548, "time_per_iteration": 2.4449100494384766 }, { "auxiliary_loss_clip": 0.01075818, "auxiliary_loss_mlp": 0.01052018, "balance_loss_clip": 1.02305007, "balance_loss_mlp": 1.02653646, "epoch": 0.27350067638659253, "flos": 22928649096960.0, "grad_norm": 2.7845662838176697, "language_loss": 0.84027815, "learning_rate": 3.4095932484968585e-06, "loss": 0.86155653, "num_input_tokens_seen": 98349860, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4921875, "step": 4549, "time_per_iteration": 2.414067029953003 }, { "auxiliary_loss_clip": 0.0107839, "auxiliary_loss_mlp": 0.01059857, "balance_loss_clip": 1.02416623, "balance_loss_mlp": 1.02550352, "epoch": 0.2735607996392605, "flos": 16574072826240.0, "grad_norm": 2.03933313003626, "language_loss": 0.72387505, "learning_rate": 3.4093169339889305e-06, "loss": 0.7452575, "num_input_tokens_seen": 98367040, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.52734375, "step": 4550, "time_per_iteration": 2.3573460578918457 }, { "auxiliary_loss_clip": 0.01074531, "auxiliary_loss_mlp": 0.0104504, "balance_loss_clip": 1.01670384, "balance_loss_mlp": 1.02525342, "epoch": 0.27362092289192846, "flos": 19644299677440.0, "grad_norm": 2.5161746762160524, "language_loss": 0.80338997, "learning_rate": 3.409040566039563e-06, "loss": 0.82458568, "num_input_tokens_seen": 98384010, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.4921875, "step": 4551, "time_per_iteration": 2.4047656059265137 }, { "auxiliary_loss_clip": 0.01076231, "auxiliary_loss_mlp": 0.01055113, "balance_loss_clip": 1.02378464, "balance_loss_mlp": 1.02527523, "epoch": 0.27368104614459643, "flos": 17638190288640.0, "grad_norm": 2.5344629755334007, "language_loss": 0.72957373, "learning_rate": 3.4087641446592362e-06, "loss": 0.7508871, "num_input_tokens_seen": 98399625, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.5078125, "step": 4552, "time_per_iteration": 2.3753767013549805 }, { "auxiliary_loss_clip": 0.01077413, "auxiliary_loss_mlp": 0.01054606, "balance_loss_clip": 1.02120364, "balance_loss_mlp": 1.0259378, "epoch": 0.2737411693972644, "flos": 21578941779840.0, "grad_norm": 1.8920686071096664, "language_loss": 0.7284193, "learning_rate": 3.408487669858431e-06, "loss": 0.74973953, "num_input_tokens_seen": 98417310, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.515625, "step": 4553, "time_per_iteration": 2.4301674365997314 }, { "auxiliary_loss_clip": 0.010736, "auxiliary_loss_mlp": 0.01056791, "balance_loss_clip": 1.02410388, "balance_loss_mlp": 1.02332997, "epoch": 0.27380129264993236, "flos": 25482859920000.0, "grad_norm": 1.8273507903724784, "language_loss": 0.61243927, "learning_rate": 3.4082111416476337e-06, "loss": 0.63374317, "num_input_tokens_seen": 98438670, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.50390625, "step": 4554, "time_per_iteration": 2.4275763034820557 }, { "auxiliary_loss_clip": 0.01076475, "auxiliary_loss_mlp": 0.01059569, "balance_loss_clip": 1.02380586, "balance_loss_mlp": 1.02387857, "epoch": 0.2738614159026003, "flos": 18660202784640.0, "grad_norm": 1.7374347653295796, "language_loss": 0.75637293, "learning_rate": 3.4079345600373275e-06, "loss": 0.77773345, "num_input_tokens_seen": 98456060, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.52734375, "step": 4555, "time_per_iteration": 2.4255292415618896 }, { "auxiliary_loss_clip": 0.01074684, "auxiliary_loss_mlp": 0.01057038, "balance_loss_clip": 1.02075124, "balance_loss_mlp": 1.02366829, "epoch": 0.2739215391552683, "flos": 23476017392640.0, "grad_norm": 1.9683367497327542, "language_loss": 0.78591603, "learning_rate": 3.407657925038002e-06, "loss": 0.80723321, "num_input_tokens_seen": 98473765, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.51171875, "step": 4556, "time_per_iteration": 2.4037158489227295 }, { "auxiliary_loss_clip": 0.0108108, "auxiliary_loss_mlp": 0.01063843, "balance_loss_clip": 1.02407444, "balance_loss_mlp": 1.02456486, "epoch": 0.27398166240793626, "flos": 17127690255360.0, "grad_norm": 2.1641177048514293, "language_loss": 0.83670223, "learning_rate": 3.4073812366601473e-06, "loss": 0.85815144, "num_input_tokens_seen": 98490590, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.56640625, "step": 4557, "time_per_iteration": 2.3856043815612793 }, { "auxiliary_loss_clip": 0.01071119, "auxiliary_loss_mlp": 0.01052964, "balance_loss_clip": 1.02146864, "balance_loss_mlp": 1.02172136, "epoch": 0.2740417856606042, "flos": 23403607499520.0, "grad_norm": 1.821617851297972, "language_loss": 0.75141239, "learning_rate": 3.4071044949142547e-06, "loss": 0.77265322, "num_input_tokens_seen": 98510590, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.4921875, "step": 4558, "time_per_iteration": 2.3971292972564697 }, { "auxiliary_loss_clip": 0.01072843, "auxiliary_loss_mlp": 0.0105167, "balance_loss_clip": 1.01930428, "balance_loss_mlp": 1.02173519, "epoch": 0.2741019089132722, "flos": 12779781955200.0, "grad_norm": 2.220757140090357, "language_loss": 0.70826721, "learning_rate": 3.406827699810819e-06, "loss": 0.72951239, "num_input_tokens_seen": 98527875, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.51171875, "step": 4559, "time_per_iteration": 2.3884692192077637 }, { "auxiliary_loss_clip": 0.01071593, "auxiliary_loss_mlp": 0.01056798, "balance_loss_clip": 1.0237534, "balance_loss_mlp": 1.02108932, "epoch": 0.27416203216594015, "flos": 20630491251840.0, "grad_norm": 2.594010823423996, "language_loss": 0.73177218, "learning_rate": 3.4065508513603353e-06, "loss": 0.75305617, "num_input_tokens_seen": 98547575, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.50390625, "step": 4560, "time_per_iteration": 2.392277717590332 }, { "auxiliary_loss_clip": 0.01071294, "auxiliary_loss_mlp": 0.01048092, "balance_loss_clip": 1.01364028, "balance_loss_mlp": 1.01989293, "epoch": 0.27422215541860817, "flos": 26540379135360.0, "grad_norm": 1.7226201761427198, "language_loss": 0.83097512, "learning_rate": 3.406273949573303e-06, "loss": 0.85216904, "num_input_tokens_seen": 98566290, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.515625, "step": 4561, "time_per_iteration": 2.476163148880005 }, { "auxiliary_loss_clip": 0.01072274, "auxiliary_loss_mlp": 0.01050991, "balance_loss_clip": 1.01859033, "balance_loss_mlp": 1.02081072, "epoch": 0.27428227867127614, "flos": 23330045531520.0, "grad_norm": 1.7397448701904699, "language_loss": 0.76757425, "learning_rate": 3.4059969944602214e-06, "loss": 0.78880692, "num_input_tokens_seen": 98586255, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.515625, "step": 4562, "time_per_iteration": 2.4096767902374268 }, { "auxiliary_loss_clip": 0.01072227, "auxiliary_loss_mlp": 0.01050071, "balance_loss_clip": 1.01778924, "balance_loss_mlp": 1.02148747, "epoch": 0.2743424019239441, "flos": 23034121914240.0, "grad_norm": 1.5092487370907206, "language_loss": 0.76199389, "learning_rate": 3.4057199860315928e-06, "loss": 0.78321689, "num_input_tokens_seen": 98606030, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.5078125, "step": 4563, "time_per_iteration": 2.419037103652954 }, { "auxiliary_loss_clip": 0.01075752, "auxiliary_loss_mlp": 0.0106109, "balance_loss_clip": 1.02268124, "balance_loss_mlp": 1.02166915, "epoch": 0.27440252517661207, "flos": 21980024012160.0, "grad_norm": 1.9367770804965188, "language_loss": 0.65068376, "learning_rate": 3.4054429242979213e-06, "loss": 0.67205215, "num_input_tokens_seen": 98625225, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.54296875, "step": 4564, "time_per_iteration": 3.789053440093994 }, { "auxiliary_loss_clip": 0.01073596, "auxiliary_loss_mlp": 0.01045926, "balance_loss_clip": 1.01302445, "balance_loss_mlp": 1.02195311, "epoch": 0.27446264842928003, "flos": 40185867962880.0, "grad_norm": 1.8240163210558014, "language_loss": 0.80439126, "learning_rate": 3.4051658092697135e-06, "loss": 0.82558644, "num_input_tokens_seen": 98649470, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.515625, "step": 4565, "time_per_iteration": 2.5750885009765625 }, { "auxiliary_loss_clip": 0.01074237, "auxiliary_loss_mlp": 0.01057269, "balance_loss_clip": 1.02434325, "balance_loss_mlp": 1.02349687, "epoch": 0.274522771681948, "flos": 13478847575040.0, "grad_norm": 1.9273681853567755, "language_loss": 0.70528555, "learning_rate": 3.404888640957477e-06, "loss": 0.72660065, "num_input_tokens_seen": 98666915, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.5078125, "step": 4566, "time_per_iteration": 3.7810823917388916 }, { "auxiliary_loss_clip": 0.0107195, "auxiliary_loss_mlp": 0.01052792, "balance_loss_clip": 1.02127266, "balance_loss_mlp": 1.02220273, "epoch": 0.27458289493461596, "flos": 28620853453440.0, "grad_norm": 1.7952770104617188, "language_loss": 0.62620711, "learning_rate": 3.404611419371723e-06, "loss": 0.6474545, "num_input_tokens_seen": 98688240, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.49609375, "step": 4567, "time_per_iteration": 3.894662380218506 }, { "auxiliary_loss_clip": 0.01074782, "auxiliary_loss_mlp": 0.01052599, "balance_loss_clip": 1.0178616, "balance_loss_mlp": 1.02377319, "epoch": 0.2746430181872839, "flos": 20118804232320.0, "grad_norm": 1.7794119169808733, "language_loss": 0.83394372, "learning_rate": 3.4043341445229627e-06, "loss": 0.85521758, "num_input_tokens_seen": 98708245, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.51171875, "step": 4568, "time_per_iteration": 2.4015963077545166 }, { "auxiliary_loss_clip": 0.01073725, "auxiliary_loss_mlp": 0.01052685, "balance_loss_clip": 1.01971197, "balance_loss_mlp": 1.02215016, "epoch": 0.2747031414399519, "flos": 20192436023040.0, "grad_norm": 2.3073131701439835, "language_loss": 0.69258225, "learning_rate": 3.4040568164217117e-06, "loss": 0.71384633, "num_input_tokens_seen": 98724575, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.515625, "step": 4569, "time_per_iteration": 3.8007290363311768 }, { "auxiliary_loss_clip": 0.01072688, "auxiliary_loss_mlp": 0.01060094, "balance_loss_clip": 1.02565432, "balance_loss_mlp": 1.02142489, "epoch": 0.27476326469261986, "flos": 13515506369280.0, "grad_norm": 2.2911836826202148, "language_loss": 0.72457778, "learning_rate": 3.4037794350784848e-06, "loss": 0.74590564, "num_input_tokens_seen": 98740700, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.51171875, "step": 4570, "time_per_iteration": 2.3502182960510254 }, { "auxiliary_loss_clip": 0.01023824, "auxiliary_loss_mlp": 0.01004838, "balance_loss_clip": 0.99990231, "balance_loss_mlp": 1.01076174, "epoch": 0.2748233879452878, "flos": 65934067282560.0, "grad_norm": 0.7303904829886919, "language_loss": 0.55812889, "learning_rate": 3.4035020005038014e-06, "loss": 0.57841551, "num_input_tokens_seen": 98803030, "router_z_loss_clip": 0.04931641, "router_z_loss_mlp": 0.13085938, "step": 4571, "time_per_iteration": 3.187305450439453 }, { "auxiliary_loss_clip": 0.01076556, "auxiliary_loss_mlp": 0.01064788, "balance_loss_clip": 1.02866769, "balance_loss_mlp": 1.02318275, "epoch": 0.2748835111979558, "flos": 17383254474240.0, "grad_norm": 2.4809910447790777, "language_loss": 0.80067712, "learning_rate": 3.4032245127081812e-06, "loss": 0.82209063, "num_input_tokens_seen": 98820505, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.53515625, "step": 4572, "time_per_iteration": 2.406135082244873 }, { "auxiliary_loss_clip": 0.01066863, "auxiliary_loss_mlp": 0.0104705, "balance_loss_clip": 1.01855862, "balance_loss_mlp": 1.02014899, "epoch": 0.27494363445062375, "flos": 23586412711680.0, "grad_norm": 1.4561912485367179, "language_loss": 0.82280201, "learning_rate": 3.402946971702147e-06, "loss": 0.84394115, "num_input_tokens_seen": 98842150, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.46875, "step": 4573, "time_per_iteration": 2.443777561187744 }, { "auxiliary_loss_clip": 0.01070854, "auxiliary_loss_mlp": 0.01050024, "balance_loss_clip": 1.01788485, "balance_loss_mlp": 1.02147031, "epoch": 0.2750037577032918, "flos": 17163650822400.0, "grad_norm": 2.154897857044085, "language_loss": 0.80187631, "learning_rate": 3.402669377496223e-06, "loss": 0.82308507, "num_input_tokens_seen": 98861050, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.49414062, "step": 4574, "time_per_iteration": 2.3696236610412598 }, { "auxiliary_loss_clip": 0.01071491, "auxiliary_loss_mlp": 0.01054682, "balance_loss_clip": 1.02340126, "balance_loss_mlp": 1.02075946, "epoch": 0.27506388095595974, "flos": 24490942882560.0, "grad_norm": 2.0563741677412204, "language_loss": 0.75959039, "learning_rate": 3.402391730100936e-06, "loss": 0.78085214, "num_input_tokens_seen": 98879695, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.5078125, "step": 4575, "time_per_iteration": 2.414783477783203 }, { "auxiliary_loss_clip": 0.01069297, "auxiliary_loss_mlp": 0.01058415, "balance_loss_clip": 1.02687204, "balance_loss_mlp": 1.02054787, "epoch": 0.2751240042086277, "flos": 38763157259520.0, "grad_norm": 1.6074792643352929, "language_loss": 0.73112828, "learning_rate": 3.402114029526814e-06, "loss": 0.75240541, "num_input_tokens_seen": 98902035, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.48828125, "step": 4576, "time_per_iteration": 2.5387487411499023 }, { "auxiliary_loss_clip": 0.01070832, "auxiliary_loss_mlp": 0.01050538, "balance_loss_clip": 1.0191381, "balance_loss_mlp": 1.02160466, "epoch": 0.27518412746129567, "flos": 26905815002880.0, "grad_norm": 1.7616521614086194, "language_loss": 0.74599075, "learning_rate": 3.4018362757843866e-06, "loss": 0.76720446, "num_input_tokens_seen": 98921835, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.4921875, "step": 4577, "time_per_iteration": 2.458364486694336 }, { "auxiliary_loss_clip": 0.0107233, "auxiliary_loss_mlp": 0.01054285, "balance_loss_clip": 1.02090645, "balance_loss_mlp": 1.02167392, "epoch": 0.27524425071396363, "flos": 24899356500480.0, "grad_norm": 1.8921720859889564, "language_loss": 0.78271383, "learning_rate": 3.401558468884188e-06, "loss": 0.80397999, "num_input_tokens_seen": 98939610, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.5078125, "step": 4578, "time_per_iteration": 2.42012882232666 }, { "auxiliary_loss_clip": 0.01074312, "auxiliary_loss_mlp": 0.01054084, "balance_loss_clip": 1.01548421, "balance_loss_mlp": 1.02108419, "epoch": 0.2753043739666316, "flos": 26286804864000.0, "grad_norm": 1.5836003275866866, "language_loss": 0.67731154, "learning_rate": 3.4012806088367516e-06, "loss": 0.69859552, "num_input_tokens_seen": 98962250, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.53125, "step": 4579, "time_per_iteration": 2.459341287612915 }, { "auxiliary_loss_clip": 0.01074212, "auxiliary_loss_mlp": 0.01058425, "balance_loss_clip": 1.02325797, "balance_loss_mlp": 1.02245367, "epoch": 0.27536449721929956, "flos": 24205632318720.0, "grad_norm": 1.7215787305466974, "language_loss": 0.8128264, "learning_rate": 3.4010026956526137e-06, "loss": 0.83415276, "num_input_tokens_seen": 98981845, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.51953125, "step": 4580, "time_per_iteration": 2.4095144271850586 }, { "auxiliary_loss_clip": 0.01071295, "auxiliary_loss_mlp": 0.01053659, "balance_loss_clip": 1.02001834, "balance_loss_mlp": 1.02133918, "epoch": 0.27542462047196753, "flos": 19536243419520.0, "grad_norm": 1.6043714855069107, "language_loss": 0.68724155, "learning_rate": 3.4007247293423137e-06, "loss": 0.70849109, "num_input_tokens_seen": 99001855, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.5, "step": 4581, "time_per_iteration": 2.417628526687622 }, { "auxiliary_loss_clip": 0.01075321, "auxiliary_loss_mlp": 0.0105105, "balance_loss_clip": 1.02034211, "balance_loss_mlp": 1.02372169, "epoch": 0.2754847437246355, "flos": 14318299238400.0, "grad_norm": 2.9109276055221978, "language_loss": 0.79614592, "learning_rate": 3.400446709916392e-06, "loss": 0.81740957, "num_input_tokens_seen": 99019880, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.515625, "step": 4582, "time_per_iteration": 2.3754961490631104 }, { "auxiliary_loss_clip": 0.01072624, "auxiliary_loss_mlp": 0.01054438, "balance_loss_clip": 1.02048683, "balance_loss_mlp": 1.02206349, "epoch": 0.27554486697730346, "flos": 18837910938240.0, "grad_norm": 1.6692080629293908, "language_loss": 0.8586036, "learning_rate": 3.4001686373853895e-06, "loss": 0.87987423, "num_input_tokens_seen": 99037570, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.5078125, "step": 4583, "time_per_iteration": 2.433675765991211 }, { "auxiliary_loss_clip": 0.0107544, "auxiliary_loss_mlp": 0.01051011, "balance_loss_clip": 1.01856256, "balance_loss_mlp": 1.02342057, "epoch": 0.2756049902299714, "flos": 22381210978560.0, "grad_norm": 1.818314884236316, "language_loss": 0.68698597, "learning_rate": 3.3998905117598528e-06, "loss": 0.70825052, "num_input_tokens_seen": 99056875, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.51953125, "step": 4584, "time_per_iteration": 2.3907196521759033 }, { "auxiliary_loss_clip": 0.01070223, "auxiliary_loss_mlp": 0.01053755, "balance_loss_clip": 1.02265334, "balance_loss_mlp": 1.0215981, "epoch": 0.2756651134826394, "flos": 19572867302400.0, "grad_norm": 2.284960384035675, "language_loss": 0.78693902, "learning_rate": 3.399612333050327e-06, "loss": 0.80817884, "num_input_tokens_seen": 99074685, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.48632812, "step": 4585, "time_per_iteration": 2.421934127807617 }, { "auxiliary_loss_clip": 0.01076635, "auxiliary_loss_mlp": 0.0105595, "balance_loss_clip": 1.02099776, "balance_loss_mlp": 1.02403045, "epoch": 0.27572523673530736, "flos": 23585435193600.0, "grad_norm": 1.780813417738699, "language_loss": 0.73844254, "learning_rate": 3.399334101267362e-06, "loss": 0.75976837, "num_input_tokens_seen": 99095300, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.52734375, "step": 4586, "time_per_iteration": 2.415919780731201 }, { "auxiliary_loss_clip": 0.01072378, "auxiliary_loss_mlp": 0.01051792, "balance_loss_clip": 1.019701, "balance_loss_mlp": 1.02228904, "epoch": 0.2757853599879754, "flos": 22819021827840.0, "grad_norm": 1.4763840780036939, "language_loss": 0.82013714, "learning_rate": 3.3990558164215073e-06, "loss": 0.84137887, "num_input_tokens_seen": 99115965, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.5, "step": 4587, "time_per_iteration": 2.4464282989501953 }, { "auxiliary_loss_clip": 0.01070832, "auxiliary_loss_mlp": 0.01060473, "balance_loss_clip": 1.026999, "balance_loss_mlp": 1.02146506, "epoch": 0.27584548324064334, "flos": 18550715160960.0, "grad_norm": 1.8397972968792529, "language_loss": 0.84037447, "learning_rate": 3.398777478523316e-06, "loss": 0.86168754, "num_input_tokens_seen": 99134265, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.4921875, "step": 4588, "time_per_iteration": 2.3718390464782715 }, { "auxiliary_loss_clip": 0.01068748, "auxiliary_loss_mlp": 0.01047402, "balance_loss_clip": 1.01678872, "balance_loss_mlp": 1.02019536, "epoch": 0.2759056064933113, "flos": 23768729164800.0, "grad_norm": 1.4066858647829044, "language_loss": 0.76754636, "learning_rate": 3.398499087583342e-06, "loss": 0.78870785, "num_input_tokens_seen": 99156185, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.48632812, "step": 4589, "time_per_iteration": 2.4657483100891113 }, { "auxiliary_loss_clip": 0.01069449, "auxiliary_loss_mlp": 0.0105432, "balance_loss_clip": 1.022753, "balance_loss_mlp": 1.01999938, "epoch": 0.27596572974597927, "flos": 24280695475200.0, "grad_norm": 1.6318565636427187, "language_loss": 0.90064037, "learning_rate": 3.398220643612143e-06, "loss": 0.92187804, "num_input_tokens_seen": 99176735, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.49414062, "step": 4590, "time_per_iteration": 2.4347591400146484 }, { "auxiliary_loss_clip": 0.01069565, "auxiliary_loss_mlp": 0.01051522, "balance_loss_clip": 1.01795244, "balance_loss_mlp": 1.01897311, "epoch": 0.27602585299864724, "flos": 35039600536320.0, "grad_norm": 1.6303374409188511, "language_loss": 0.72947192, "learning_rate": 3.397942146620277e-06, "loss": 0.75068277, "num_input_tokens_seen": 99199765, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.50390625, "step": 4591, "time_per_iteration": 2.516845226287842 }, { "auxiliary_loss_clip": 0.01072856, "auxiliary_loss_mlp": 0.01055848, "balance_loss_clip": 1.02099085, "balance_loss_mlp": 1.02175951, "epoch": 0.2760859762513152, "flos": 24308451872640.0, "grad_norm": 1.900805149490003, "language_loss": 0.81643236, "learning_rate": 3.3976635966183046e-06, "loss": 0.83771944, "num_input_tokens_seen": 99218435, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.51171875, "step": 4592, "time_per_iteration": 2.414559841156006 }, { "auxiliary_loss_clip": 0.0101835, "auxiliary_loss_mlp": 0.01028842, "balance_loss_clip": 1.02435982, "balance_loss_mlp": 1.00640607, "epoch": 0.27614609950398317, "flos": 71257623926400.0, "grad_norm": 0.7312760779356849, "language_loss": 0.61681879, "learning_rate": 3.3973849936167886e-06, "loss": 0.63729072, "num_input_tokens_seen": 99276200, "router_z_loss_clip": 0.04492188, "router_z_loss_mlp": 0.11914062, "step": 4593, "time_per_iteration": 2.9908697605133057 }, { "auxiliary_loss_clip": 0.01070144, "auxiliary_loss_mlp": 0.01051478, "balance_loss_clip": 1.01552474, "balance_loss_mlp": 1.01931095, "epoch": 0.27620622275665113, "flos": 29673694546560.0, "grad_norm": 2.0341018840279057, "language_loss": 0.78189397, "learning_rate": 3.3971063376262937e-06, "loss": 0.80311018, "num_input_tokens_seen": 99297625, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.5078125, "step": 4594, "time_per_iteration": 2.450208902359009 }, { "auxiliary_loss_clip": 0.01072062, "auxiliary_loss_mlp": 0.01046429, "balance_loss_clip": 1.01574445, "balance_loss_mlp": 1.02274632, "epoch": 0.2762663460093191, "flos": 15377145085440.0, "grad_norm": 1.5062076677730314, "language_loss": 0.92738408, "learning_rate": 3.3968276286573866e-06, "loss": 0.94856894, "num_input_tokens_seen": 99315790, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.4921875, "step": 4595, "time_per_iteration": 2.433159589767456 }, { "auxiliary_loss_clip": 0.01073408, "auxiliary_loss_mlp": 0.01052912, "balance_loss_clip": 1.01955748, "balance_loss_mlp": 1.02249408, "epoch": 0.27632646926198706, "flos": 20703040790400.0, "grad_norm": 1.9070236656025337, "language_loss": 0.70678794, "learning_rate": 3.3965488667206353e-06, "loss": 0.72805119, "num_input_tokens_seen": 99334615, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.5078125, "step": 4596, "time_per_iteration": 2.389512300491333 }, { "auxiliary_loss_clip": 0.0107729, "auxiliary_loss_mlp": 0.01054762, "balance_loss_clip": 1.01945233, "balance_loss_mlp": 1.02323878, "epoch": 0.276386592514655, "flos": 32812107016320.0, "grad_norm": 1.6690693131531837, "language_loss": 0.65438986, "learning_rate": 3.3962700518266113e-06, "loss": 0.67571038, "num_input_tokens_seen": 99356685, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.5390625, "step": 4597, "time_per_iteration": 2.5295517444610596 }, { "auxiliary_loss_clip": 0.01071334, "auxiliary_loss_mlp": 0.01051628, "balance_loss_clip": 1.02127755, "balance_loss_mlp": 1.02348852, "epoch": 0.276446715767323, "flos": 18550715160960.0, "grad_norm": 1.948476302116572, "language_loss": 0.87418848, "learning_rate": 3.395991183985887e-06, "loss": 0.89541817, "num_input_tokens_seen": 99374810, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.47851562, "step": 4598, "time_per_iteration": 2.3935153484344482 }, { "auxiliary_loss_clip": 0.01073474, "auxiliary_loss_mlp": 0.01055072, "balance_loss_clip": 1.02250409, "balance_loss_mlp": 1.02295184, "epoch": 0.27650683901999096, "flos": 22818533068800.0, "grad_norm": 2.518017860763197, "language_loss": 0.81706989, "learning_rate": 3.395712263209037e-06, "loss": 0.83835536, "num_input_tokens_seen": 99391290, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.50390625, "step": 4599, "time_per_iteration": 2.427734375 }, { "auxiliary_loss_clip": 0.01075369, "auxiliary_loss_mlp": 0.01060059, "balance_loss_clip": 1.02615571, "balance_loss_mlp": 1.0231806, "epoch": 0.276566962272659, "flos": 21360455291520.0, "grad_norm": 1.8989281736837642, "language_loss": 0.80745113, "learning_rate": 3.395433289506639e-06, "loss": 0.82880545, "num_input_tokens_seen": 99409120, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.51953125, "step": 4600, "time_per_iteration": 2.412236213684082 }, { "auxiliary_loss_clip": 0.01079002, "auxiliary_loss_mlp": 0.01074545, "balance_loss_clip": 1.03739929, "balance_loss_mlp": 1.02451432, "epoch": 0.27662708552532694, "flos": 17709692486400.0, "grad_norm": 2.0205282498912194, "language_loss": 0.74233866, "learning_rate": 3.3951542628892694e-06, "loss": 0.76387417, "num_input_tokens_seen": 99426180, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.54296875, "step": 4601, "time_per_iteration": 2.404066801071167 }, { "auxiliary_loss_clip": 0.01071861, "auxiliary_loss_mlp": 0.01059218, "balance_loss_clip": 1.02719843, "balance_loss_mlp": 1.02170742, "epoch": 0.2766872087779949, "flos": 21251630983680.0, "grad_norm": 1.7836627767919369, "language_loss": 0.81638932, "learning_rate": 3.3948751833675113e-06, "loss": 0.83770013, "num_input_tokens_seen": 99447720, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.50390625, "step": 4602, "time_per_iteration": 2.428396224975586 }, { "auxiliary_loss_clip": 0.01076368, "auxiliary_loss_mlp": 0.01059441, "balance_loss_clip": 1.02186632, "balance_loss_mlp": 1.02212334, "epoch": 0.2767473320306629, "flos": 12931095254400.0, "grad_norm": 2.367602394352554, "language_loss": 0.78961265, "learning_rate": 3.3945960509519455e-06, "loss": 0.81097078, "num_input_tokens_seen": 99464720, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.54296875, "step": 4603, "time_per_iteration": 3.76438570022583 }, { "auxiliary_loss_clip": 0.01070813, "auxiliary_loss_mlp": 0.01059837, "balance_loss_clip": 1.02825844, "balance_loss_mlp": 1.02229571, "epoch": 0.27680745528333084, "flos": 15011953597440.0, "grad_norm": 1.5905051716807581, "language_loss": 0.82918113, "learning_rate": 3.3943168656531585e-06, "loss": 0.85048759, "num_input_tokens_seen": 99482310, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.484375, "step": 4604, "time_per_iteration": 2.355046510696411 }, { "auxiliary_loss_clip": 0.01071546, "auxiliary_loss_mlp": 0.01054178, "balance_loss_clip": 1.01963115, "balance_loss_mlp": 1.02070415, "epoch": 0.2768675785359988, "flos": 22636740286080.0, "grad_norm": 1.8548228046838384, "language_loss": 0.71574402, "learning_rate": 3.3940376274817363e-06, "loss": 0.7370013, "num_input_tokens_seen": 99501255, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.5078125, "step": 4605, "time_per_iteration": 2.427611827850342 }, { "auxiliary_loss_clip": 0.01021188, "auxiliary_loss_mlp": 0.0101846, "balance_loss_clip": 1.01454973, "balance_loss_mlp": 1.00866485, "epoch": 0.27692770178866677, "flos": 66127171345920.0, "grad_norm": 0.7397887751549002, "language_loss": 0.57251322, "learning_rate": 3.3937583364482673e-06, "loss": 0.59290969, "num_input_tokens_seen": 99568925, "router_z_loss_clip": 0.0390625, "router_z_loss_mlp": 0.125, "step": 4606, "time_per_iteration": 4.516442537307739 }, { "auxiliary_loss_clip": 0.01074575, "auxiliary_loss_mlp": 0.01054342, "balance_loss_clip": 1.02229893, "balance_loss_mlp": 1.02232087, "epoch": 0.27698782504133473, "flos": 26463884613120.0, "grad_norm": 1.9874370868949134, "language_loss": 0.71575445, "learning_rate": 3.3934789925633424e-06, "loss": 0.73704362, "num_input_tokens_seen": 99588455, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.5234375, "step": 4607, "time_per_iteration": 3.9149959087371826 }, { "auxiliary_loss_clip": 0.0106806, "auxiliary_loss_mlp": 0.01050938, "balance_loss_clip": 1.0220772, "balance_loss_mlp": 1.02139056, "epoch": 0.2770479482940027, "flos": 25883627950080.0, "grad_norm": 1.6197845495514234, "language_loss": 0.71069419, "learning_rate": 3.393199595837555e-06, "loss": 0.73188412, "num_input_tokens_seen": 99609355, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.46679688, "step": 4608, "time_per_iteration": 4.009166955947876 }, { "auxiliary_loss_clip": 0.01073815, "auxiliary_loss_mlp": 0.01043421, "balance_loss_clip": 1.0119257, "balance_loss_mlp": 1.02270293, "epoch": 0.27710807154667066, "flos": 22856134469760.0, "grad_norm": 1.781093868746056, "language_loss": 0.74713254, "learning_rate": 3.392920146281499e-06, "loss": 0.76830494, "num_input_tokens_seen": 99628780, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.51171875, "step": 4609, "time_per_iteration": 2.4308862686157227 }, { "auxiliary_loss_clip": 0.01073996, "auxiliary_loss_mlp": 0.0105275, "balance_loss_clip": 1.01856053, "balance_loss_mlp": 1.02353454, "epoch": 0.27716819479933863, "flos": 17710146334080.0, "grad_norm": 2.1958786035047906, "language_loss": 0.85422409, "learning_rate": 3.3926406439057714e-06, "loss": 0.8754915, "num_input_tokens_seen": 99644545, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.5078125, "step": 4610, "time_per_iteration": 2.355536460876465 }, { "auxiliary_loss_clip": 0.01082873, "auxiliary_loss_mlp": 0.0106416, "balance_loss_clip": 1.02730095, "balance_loss_mlp": 1.02922428, "epoch": 0.2772283180520066, "flos": 19645032816000.0, "grad_norm": 2.0030321497172774, "language_loss": 0.71375048, "learning_rate": 3.3923610887209705e-06, "loss": 0.73522079, "num_input_tokens_seen": 99663125, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.53515625, "step": 4611, "time_per_iteration": 2.416422128677368 }, { "auxiliary_loss_clip": 0.01075013, "auxiliary_loss_mlp": 0.01042949, "balance_loss_clip": 1.01257443, "balance_loss_mlp": 1.02795208, "epoch": 0.27728844130467456, "flos": 21031573484160.0, "grad_norm": 1.7150717490139142, "language_loss": 0.76009667, "learning_rate": 3.392081480737698e-06, "loss": 0.78127629, "num_input_tokens_seen": 99682645, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.47070312, "step": 4612, "time_per_iteration": 2.399803876876831 }, { "auxiliary_loss_clip": 0.01077805, "auxiliary_loss_mlp": 0.01051842, "balance_loss_clip": 1.01953626, "balance_loss_mlp": 1.02724504, "epoch": 0.2773485645573425, "flos": 18988211808000.0, "grad_norm": 2.3710040314568026, "language_loss": 0.67849261, "learning_rate": 3.3918018199665563e-06, "loss": 0.69978905, "num_input_tokens_seen": 99700520, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.50390625, "step": 4613, "time_per_iteration": 2.4346001148223877 }, { "auxiliary_loss_clip": 0.01076504, "auxiliary_loss_mlp": 0.01052753, "balance_loss_clip": 1.02337933, "balance_loss_mlp": 1.02719545, "epoch": 0.27740868781001055, "flos": 21467429297280.0, "grad_norm": 1.604830070836584, "language_loss": 0.81069005, "learning_rate": 3.39152210641815e-06, "loss": 0.83198255, "num_input_tokens_seen": 99720355, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4921875, "step": 4614, "time_per_iteration": 2.461322069168091 }, { "auxiliary_loss_clip": 0.01078711, "auxiliary_loss_mlp": 0.01052419, "balance_loss_clip": 1.02022028, "balance_loss_mlp": 1.02760363, "epoch": 0.2774688110626785, "flos": 19826825598720.0, "grad_norm": 2.7187713797783273, "language_loss": 0.83482063, "learning_rate": 3.3912423401030865e-06, "loss": 0.85613191, "num_input_tokens_seen": 99736090, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.51171875, "step": 4615, "time_per_iteration": 2.393500566482544 }, { "auxiliary_loss_clip": 0.01081586, "auxiliary_loss_mlp": 0.01056996, "balance_loss_clip": 1.02385581, "balance_loss_mlp": 1.02923822, "epoch": 0.2775289343153465, "flos": 18215444574720.0, "grad_norm": 2.4516283905090135, "language_loss": 0.65835458, "learning_rate": 3.3909625210319735e-06, "loss": 0.67974043, "num_input_tokens_seen": 99751805, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.5234375, "step": 4616, "time_per_iteration": 2.359668493270874 }, { "auxiliary_loss_clip": 0.01076496, "auxiliary_loss_mlp": 0.01054194, "balance_loss_clip": 1.02324748, "balance_loss_mlp": 1.02733541, "epoch": 0.27758905756801444, "flos": 16471532563200.0, "grad_norm": 2.5912449066217853, "language_loss": 0.84606594, "learning_rate": 3.3906826492154226e-06, "loss": 0.86737287, "num_input_tokens_seen": 99770610, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.4921875, "step": 4617, "time_per_iteration": 2.4042820930480957 }, { "auxiliary_loss_clip": 0.01076571, "auxiliary_loss_mlp": 0.01054896, "balance_loss_clip": 1.02338934, "balance_loss_mlp": 1.02528608, "epoch": 0.2776491808206824, "flos": 18727410885120.0, "grad_norm": 1.9638553825747433, "language_loss": 0.78000677, "learning_rate": 3.3904027246640458e-06, "loss": 0.80132139, "num_input_tokens_seen": 99787305, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.51171875, "step": 4618, "time_per_iteration": 2.3831894397735596 }, { "auxiliary_loss_clip": 0.010779, "auxiliary_loss_mlp": 0.01053519, "balance_loss_clip": 1.02121305, "balance_loss_mlp": 1.0266726, "epoch": 0.27770930407335037, "flos": 28036931097600.0, "grad_norm": 1.9707115643701252, "language_loss": 0.85655093, "learning_rate": 3.390122747388459e-06, "loss": 0.87786508, "num_input_tokens_seen": 99808940, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.51171875, "step": 4619, "time_per_iteration": 2.5510616302490234 }, { "auxiliary_loss_clip": 0.01074125, "auxiliary_loss_mlp": 0.01058424, "balance_loss_clip": 1.029814, "balance_loss_mlp": 1.02464676, "epoch": 0.27776942732601834, "flos": 23548706576640.0, "grad_norm": 1.4126479189903516, "language_loss": 0.78116155, "learning_rate": 3.3898427173992778e-06, "loss": 0.80248702, "num_input_tokens_seen": 99829575, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.49609375, "step": 4620, "time_per_iteration": 2.4771981239318848 }, { "auxiliary_loss_clip": 0.01070684, "auxiliary_loss_mlp": 0.01059985, "balance_loss_clip": 1.02637959, "balance_loss_mlp": 1.02196527, "epoch": 0.2778295505786863, "flos": 23907753665280.0, "grad_norm": 1.7919381526307359, "language_loss": 0.79709995, "learning_rate": 3.389562634707122e-06, "loss": 0.81840664, "num_input_tokens_seen": 99847575, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.48828125, "step": 4621, "time_per_iteration": 2.4520630836486816 }, { "auxiliary_loss_clip": 0.0107285, "auxiliary_loss_mlp": 0.01062429, "balance_loss_clip": 1.02918172, "balance_loss_mlp": 1.02177715, "epoch": 0.27788967383135427, "flos": 25553454422400.0, "grad_norm": 5.605483084362551, "language_loss": 0.89297593, "learning_rate": 3.389282499322611e-06, "loss": 0.91432875, "num_input_tokens_seen": 99864995, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.5078125, "step": 4622, "time_per_iteration": 2.407270669937134 }, { "auxiliary_loss_clip": 0.01071907, "auxiliary_loss_mlp": 0.0105954, "balance_loss_clip": 1.02799726, "balance_loss_mlp": 1.02156377, "epoch": 0.27794979708402223, "flos": 16251719443200.0, "grad_norm": 1.9778230753638553, "language_loss": 0.83154505, "learning_rate": 3.389002311256369e-06, "loss": 0.85285962, "num_input_tokens_seen": 99881540, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.50390625, "step": 4623, "time_per_iteration": 2.3918397426605225 }, { "auxiliary_loss_clip": 0.01073699, "auxiliary_loss_mlp": 0.01059356, "balance_loss_clip": 1.02883828, "balance_loss_mlp": 1.02356315, "epoch": 0.2780099203366902, "flos": 20666591464320.0, "grad_norm": 1.9955116853345316, "language_loss": 0.83334768, "learning_rate": 3.3887220705190204e-06, "loss": 0.85467815, "num_input_tokens_seen": 99899595, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.5, "step": 4624, "time_per_iteration": 2.4015495777130127 }, { "auxiliary_loss_clip": 0.0106973, "auxiliary_loss_mlp": 0.01059722, "balance_loss_clip": 1.02684367, "balance_loss_mlp": 1.02049232, "epoch": 0.27807004358935816, "flos": 17738880249600.0, "grad_norm": 2.2491409988623645, "language_loss": 0.78172791, "learning_rate": 3.388441777121191e-06, "loss": 0.80302244, "num_input_tokens_seen": 99913020, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.4921875, "step": 4625, "time_per_iteration": 2.376760244369507 }, { "auxiliary_loss_clip": 0.01073185, "auxiliary_loss_mlp": 0.01060735, "balance_loss_clip": 1.02808309, "balance_loss_mlp": 1.0234971, "epoch": 0.2781301668420261, "flos": 16726189086720.0, "grad_norm": 2.115294245861918, "language_loss": 0.71735889, "learning_rate": 3.388161431073511e-06, "loss": 0.73869812, "num_input_tokens_seen": 99931405, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.49804688, "step": 4626, "time_per_iteration": 2.409721612930298 }, { "auxiliary_loss_clip": 0.01076668, "auxiliary_loss_mlp": 0.01056568, "balance_loss_clip": 1.02171159, "balance_loss_mlp": 1.02312386, "epoch": 0.27819029009469415, "flos": 13843899417600.0, "grad_norm": 2.171775768468512, "language_loss": 0.94348454, "learning_rate": 3.38788103238661e-06, "loss": 0.96481687, "num_input_tokens_seen": 99948100, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.53515625, "step": 4627, "time_per_iteration": 2.37099289894104 }, { "auxiliary_loss_clip": 0.01075817, "auxiliary_loss_mlp": 0.01049237, "balance_loss_clip": 1.01719379, "balance_loss_mlp": 1.02544546, "epoch": 0.2782504133473621, "flos": 27088061633280.0, "grad_norm": 1.8231731043383834, "language_loss": 0.86904883, "learning_rate": 3.387600581071121e-06, "loss": 0.89029944, "num_input_tokens_seen": 99966470, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.50390625, "step": 4628, "time_per_iteration": 2.495032548904419 }, { "auxiliary_loss_clip": 0.01072104, "auxiliary_loss_mlp": 0.01049655, "balance_loss_clip": 1.01863682, "balance_loss_mlp": 1.02307642, "epoch": 0.2783105366000301, "flos": 21067778430720.0, "grad_norm": 1.9001713172620474, "language_loss": 0.80904675, "learning_rate": 3.387320077137679e-06, "loss": 0.83026433, "num_input_tokens_seen": 99985930, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.4921875, "step": 4629, "time_per_iteration": 2.3927762508392334 }, { "auxiliary_loss_clip": 0.0107271, "auxiliary_loss_mlp": 0.01044101, "balance_loss_clip": 1.01482368, "balance_loss_mlp": 1.02568579, "epoch": 0.27837065985269804, "flos": 26500717964160.0, "grad_norm": 1.4965784396963704, "language_loss": 0.85588849, "learning_rate": 3.3870395205969208e-06, "loss": 0.8770566, "num_input_tokens_seen": 100006235, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.47070312, "step": 4630, "time_per_iteration": 2.4466190338134766 }, { "auxiliary_loss_clip": 0.01076984, "auxiliary_loss_mlp": 0.01048108, "balance_loss_clip": 1.01380014, "balance_loss_mlp": 1.02629125, "epoch": 0.278430783105366, "flos": 20222356924800.0, "grad_norm": 2.1523739721966813, "language_loss": 0.83486271, "learning_rate": 3.386758911459485e-06, "loss": 0.85611361, "num_input_tokens_seen": 100023655, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.5078125, "step": 4631, "time_per_iteration": 2.3849973678588867 }, { "auxiliary_loss_clip": 0.01082283, "auxiliary_loss_mlp": 0.01049079, "balance_loss_clip": 1.01310134, "balance_loss_mlp": 1.02948833, "epoch": 0.278490906358034, "flos": 25591719139200.0, "grad_norm": 1.9252694112719715, "language_loss": 0.72817087, "learning_rate": 3.3864782497360126e-06, "loss": 0.74948454, "num_input_tokens_seen": 100043280, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.53125, "step": 4632, "time_per_iteration": 2.4917008876800537 }, { "auxiliary_loss_clip": 0.01074353, "auxiliary_loss_mlp": 0.01050191, "balance_loss_clip": 1.01790905, "balance_loss_mlp": 1.02711952, "epoch": 0.27855102961070194, "flos": 16170861000960.0, "grad_norm": 1.7463738748973807, "language_loss": 0.83630258, "learning_rate": 3.386197535437145e-06, "loss": 0.857548, "num_input_tokens_seen": 100057690, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.47265625, "step": 4633, "time_per_iteration": 2.3542466163635254 }, { "auxiliary_loss_clip": 0.01078003, "auxiliary_loss_mlp": 0.0104846, "balance_loss_clip": 1.01438963, "balance_loss_mlp": 1.02733207, "epoch": 0.2786111528633699, "flos": 22926554415360.0, "grad_norm": 1.6838674909701292, "language_loss": 0.88985443, "learning_rate": 3.385916768573529e-06, "loss": 0.9111191, "num_input_tokens_seen": 100075875, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.5078125, "step": 4634, "time_per_iteration": 2.399946928024292 }, { "auxiliary_loss_clip": 0.01079702, "auxiliary_loss_mlp": 0.01051518, "balance_loss_clip": 1.01616085, "balance_loss_mlp": 1.02798867, "epoch": 0.27867127611603787, "flos": 23403083829120.0, "grad_norm": 1.5224078005921926, "language_loss": 0.77592874, "learning_rate": 3.38563594915581e-06, "loss": 0.79724091, "num_input_tokens_seen": 100092930, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.515625, "step": 4635, "time_per_iteration": 2.4442081451416016 }, { "auxiliary_loss_clip": 0.01076313, "auxiliary_loss_mlp": 0.01055833, "balance_loss_clip": 1.0211674, "balance_loss_mlp": 1.02563119, "epoch": 0.27873139936870583, "flos": 19827977673600.0, "grad_norm": 1.6835297636683026, "language_loss": 0.66851783, "learning_rate": 3.385355077194637e-06, "loss": 0.68983924, "num_input_tokens_seen": 100110790, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.5078125, "step": 4636, "time_per_iteration": 2.413301944732666 }, { "auxiliary_loss_clip": 0.01079341, "auxiliary_loss_mlp": 0.01054852, "balance_loss_clip": 1.01997137, "balance_loss_mlp": 1.02609921, "epoch": 0.2787915226213738, "flos": 17706829754880.0, "grad_norm": 2.649595170574568, "language_loss": 0.86198997, "learning_rate": 3.3850741527006604e-06, "loss": 0.88333189, "num_input_tokens_seen": 100126970, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.53125, "step": 4637, "time_per_iteration": 2.3539958000183105 }, { "auxiliary_loss_clip": 0.01071889, "auxiliary_loss_mlp": 0.01054259, "balance_loss_clip": 1.0236938, "balance_loss_mlp": 1.02397346, "epoch": 0.27885164587404176, "flos": 22089476724480.0, "grad_norm": 1.4495973520316157, "language_loss": 0.77773446, "learning_rate": 3.384793175684533e-06, "loss": 0.79899585, "num_input_tokens_seen": 100146720, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.47851562, "step": 4638, "time_per_iteration": 2.446038246154785 }, { "auxiliary_loss_clip": 0.0107398, "auxiliary_loss_mlp": 0.0105901, "balance_loss_clip": 1.02505875, "balance_loss_mlp": 1.02369976, "epoch": 0.27891176912670973, "flos": 19206698296320.0, "grad_norm": 1.4502198639346005, "language_loss": 0.72858274, "learning_rate": 3.38451214615691e-06, "loss": 0.74991262, "num_input_tokens_seen": 100165920, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.50390625, "step": 4639, "time_per_iteration": 2.3728268146514893 }, { "auxiliary_loss_clip": 0.0107156, "auxiliary_loss_mlp": 0.01051987, "balance_loss_clip": 1.01879942, "balance_loss_mlp": 1.02174222, "epoch": 0.27897189237937775, "flos": 27598771134720.0, "grad_norm": 2.8913667347213607, "language_loss": 0.66902852, "learning_rate": 3.384231064128447e-06, "loss": 0.69026399, "num_input_tokens_seen": 100185525, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.49804688, "step": 4640, "time_per_iteration": 2.460193634033203 }, { "auxiliary_loss_clip": 0.01072704, "auxiliary_loss_mlp": 0.01050607, "balance_loss_clip": 1.01765728, "balance_loss_mlp": 1.02228975, "epoch": 0.2790320156320457, "flos": 21177161320320.0, "grad_norm": 2.5881800263575405, "language_loss": 0.73256922, "learning_rate": 3.383949929609804e-06, "loss": 0.75380236, "num_input_tokens_seen": 100204850, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.50390625, "step": 4641, "time_per_iteration": 2.3921053409576416 }, { "auxiliary_loss_clip": 0.01073793, "auxiliary_loss_mlp": 0.01057322, "balance_loss_clip": 1.02132046, "balance_loss_mlp": 1.02215958, "epoch": 0.2790921388847137, "flos": 22782816881280.0, "grad_norm": 1.5850970656942875, "language_loss": 0.77033806, "learning_rate": 3.383668742611641e-06, "loss": 0.7916491, "num_input_tokens_seen": 100224520, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.515625, "step": 4642, "time_per_iteration": 3.8508524894714355 }, { "auxiliary_loss_clip": 0.01073842, "auxiliary_loss_mlp": 0.01053597, "balance_loss_clip": 1.02048075, "balance_loss_mlp": 1.02306485, "epoch": 0.27915226213738165, "flos": 23399627604480.0, "grad_norm": 2.317147592076058, "language_loss": 0.87279326, "learning_rate": 3.3833875031446205e-06, "loss": 0.89406765, "num_input_tokens_seen": 100243935, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.5078125, "step": 4643, "time_per_iteration": 2.3984670639038086 }, { "auxiliary_loss_clip": 0.01072733, "auxiliary_loss_mlp": 0.01057108, "balance_loss_clip": 1.02246547, "balance_loss_mlp": 1.02242696, "epoch": 0.2792123853900496, "flos": 22746681757440.0, "grad_norm": 1.805655091986533, "language_loss": 0.84260941, "learning_rate": 3.383106211219407e-06, "loss": 0.86390775, "num_input_tokens_seen": 100262290, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.50390625, "step": 4644, "time_per_iteration": 2.425086736679077 }, { "auxiliary_loss_clip": 0.01073375, "auxiliary_loss_mlp": 0.0105069, "balance_loss_clip": 1.0181222, "balance_loss_mlp": 1.02208591, "epoch": 0.2792725086427176, "flos": 15048472746240.0, "grad_norm": 1.8788181501255627, "language_loss": 0.8046416, "learning_rate": 3.3828248668466673e-06, "loss": 0.8258822, "num_input_tokens_seen": 100280015, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.515625, "step": 4645, "time_per_iteration": 2.370615005493164 }, { "auxiliary_loss_clip": 0.01018563, "auxiliary_loss_mlp": 0.01011556, "balance_loss_clip": 1.00759828, "balance_loss_mlp": 1.00647724, "epoch": 0.27933263189538554, "flos": 62541871073280.0, "grad_norm": 0.7825148058184128, "language_loss": 0.62396514, "learning_rate": 3.3825434700370705e-06, "loss": 0.64426637, "num_input_tokens_seen": 100338935, "router_z_loss_clip": 0.03955078, "router_z_loss_mlp": 0.12109375, "step": 4646, "time_per_iteration": 5.810253858566284 }, { "auxiliary_loss_clip": 0.01071375, "auxiliary_loss_mlp": 0.01050964, "balance_loss_clip": 1.02049387, "balance_loss_mlp": 1.02366567, "epoch": 0.2793927551480535, "flos": 25117214584320.0, "grad_norm": 1.5719970074648184, "language_loss": 0.90405607, "learning_rate": 3.3822620208012865e-06, "loss": 0.92527944, "num_input_tokens_seen": 100359905, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.4765625, "step": 4647, "time_per_iteration": 2.4505696296691895 }, { "auxiliary_loss_clip": 0.0107752, "auxiliary_loss_mlp": 0.01053217, "balance_loss_clip": 1.01890898, "balance_loss_mlp": 1.02595472, "epoch": 0.27945287840072147, "flos": 21323517206400.0, "grad_norm": 1.7811598880785062, "language_loss": 0.87785876, "learning_rate": 3.381980519149988e-06, "loss": 0.89916611, "num_input_tokens_seen": 100376955, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.515625, "step": 4648, "time_per_iteration": 3.850947856903076 }, { "auxiliary_loss_clip": 0.01081056, "auxiliary_loss_mlp": 0.01045949, "balance_loss_clip": 1.01259387, "balance_loss_mlp": 1.02755475, "epoch": 0.27951300165338944, "flos": 27449412871680.0, "grad_norm": 2.1169970092315817, "language_loss": 0.74778318, "learning_rate": 3.38169896509385e-06, "loss": 0.76905322, "num_input_tokens_seen": 100397545, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.53515625, "step": 4649, "time_per_iteration": 2.4620604515075684 }, { "auxiliary_loss_clip": 0.01076113, "auxiliary_loss_mlp": 0.01053917, "balance_loss_clip": 1.01696253, "balance_loss_mlp": 1.02569866, "epoch": 0.2795731249060574, "flos": 15158100015360.0, "grad_norm": 2.2102141439751137, "language_loss": 0.82275057, "learning_rate": 3.381417358643549e-06, "loss": 0.84405077, "num_input_tokens_seen": 100415080, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.50390625, "step": 4650, "time_per_iteration": 2.4198408126831055 }, { "auxiliary_loss_clip": 0.01022799, "auxiliary_loss_mlp": 0.01004804, "balance_loss_clip": 1.00075078, "balance_loss_mlp": 1.01027346, "epoch": 0.27963324815872537, "flos": 60116628412800.0, "grad_norm": 0.8287267065210739, "language_loss": 0.58853072, "learning_rate": 3.3811356998097624e-06, "loss": 0.60880673, "num_input_tokens_seen": 100471105, "router_z_loss_clip": 0.04052734, "router_z_loss_mlp": 0.125, "step": 4651, "time_per_iteration": 3.057788848876953 }, { "auxiliary_loss_clip": 0.01079829, "auxiliary_loss_mlp": 0.01055455, "balance_loss_clip": 1.01666403, "balance_loss_mlp": 1.02601457, "epoch": 0.27969337141139333, "flos": 21764784280320.0, "grad_norm": 1.7186968341322653, "language_loss": 0.75266051, "learning_rate": 3.3808539886031726e-06, "loss": 0.77401334, "num_input_tokens_seen": 100492520, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.5390625, "step": 4652, "time_per_iteration": 2.447598457336426 }, { "auxiliary_loss_clip": 0.010801, "auxiliary_loss_mlp": 0.01054507, "balance_loss_clip": 1.01988852, "balance_loss_mlp": 1.02685976, "epoch": 0.27975349466406135, "flos": 39850038794880.0, "grad_norm": 2.129726512659265, "language_loss": 0.80194092, "learning_rate": 3.380572225034461e-06, "loss": 0.82328695, "num_input_tokens_seen": 100512870, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.53125, "step": 4653, "time_per_iteration": 2.5436012744903564 }, { "auxiliary_loss_clip": 0.01078584, "auxiliary_loss_mlp": 0.01055928, "balance_loss_clip": 1.02076077, "balance_loss_mlp": 1.02723503, "epoch": 0.2798136179167293, "flos": 21578732311680.0, "grad_norm": 1.97922260636658, "language_loss": 0.80019343, "learning_rate": 3.380290409114312e-06, "loss": 0.82153857, "num_input_tokens_seen": 100531655, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.51171875, "step": 4654, "time_per_iteration": 2.451482057571411 }, { "auxiliary_loss_clip": 0.01080403, "auxiliary_loss_mlp": 0.01059467, "balance_loss_clip": 1.02229714, "balance_loss_mlp": 1.02651644, "epoch": 0.2798737411693973, "flos": 21536766990720.0, "grad_norm": 1.9419464234738197, "language_loss": 0.82176769, "learning_rate": 3.3800085408534127e-06, "loss": 0.84316641, "num_input_tokens_seen": 100548005, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.5390625, "step": 4655, "time_per_iteration": 2.3884999752044678 }, { "auxiliary_loss_clip": 0.01078656, "auxiliary_loss_mlp": 0.0105812, "balance_loss_clip": 1.02280974, "balance_loss_mlp": 1.02560842, "epoch": 0.27993386442206525, "flos": 26979795907200.0, "grad_norm": 1.873400977843817, "language_loss": 0.83283681, "learning_rate": 3.3797266202624506e-06, "loss": 0.85420454, "num_input_tokens_seen": 100567980, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.53125, "step": 4656, "time_per_iteration": 2.464714288711548 }, { "auxiliary_loss_clip": 0.01077136, "auxiliary_loss_mlp": 0.01063631, "balance_loss_clip": 1.02536488, "balance_loss_mlp": 1.02458024, "epoch": 0.2799939876747332, "flos": 24348811271040.0, "grad_norm": 1.6471747766486, "language_loss": 0.83953416, "learning_rate": 3.3794446473521176e-06, "loss": 0.86094189, "num_input_tokens_seen": 100588630, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.52734375, "step": 4657, "time_per_iteration": 2.414790153503418 }, { "auxiliary_loss_clip": 0.01075602, "auxiliary_loss_mlp": 0.01056796, "balance_loss_clip": 1.02365613, "balance_loss_mlp": 1.02389479, "epoch": 0.2800541109274012, "flos": 33655573486080.0, "grad_norm": 1.9235403287445256, "language_loss": 0.65368879, "learning_rate": 3.379162622133105e-06, "loss": 0.67501271, "num_input_tokens_seen": 100608775, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.515625, "step": 4658, "time_per_iteration": 2.5225112438201904 }, { "auxiliary_loss_clip": 0.01075384, "auxiliary_loss_mlp": 0.01063709, "balance_loss_clip": 1.02675366, "balance_loss_mlp": 1.02358103, "epoch": 0.28011423418006914, "flos": 21613401158400.0, "grad_norm": 1.7802344930872605, "language_loss": 0.80216265, "learning_rate": 3.3788805446161073e-06, "loss": 0.82355356, "num_input_tokens_seen": 100627975, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.51953125, "step": 4659, "time_per_iteration": 2.4269728660583496 }, { "auxiliary_loss_clip": 0.01076984, "auxiliary_loss_mlp": 0.0106831, "balance_loss_clip": 1.03073478, "balance_loss_mlp": 1.02368283, "epoch": 0.2801743574327371, "flos": 23111314663680.0, "grad_norm": 1.8942537733874574, "language_loss": 0.81089187, "learning_rate": 3.3785984148118215e-06, "loss": 0.83234477, "num_input_tokens_seen": 100645430, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.53125, "step": 4660, "time_per_iteration": 2.402141571044922 }, { "auxiliary_loss_clip": 0.01071538, "auxiliary_loss_mlp": 0.01055238, "balance_loss_clip": 1.02202594, "balance_loss_mlp": 1.02227724, "epoch": 0.2802344806854051, "flos": 12640582897920.0, "grad_norm": 2.1666504065612084, "language_loss": 0.82090926, "learning_rate": 3.3783162327309453e-06, "loss": 0.84217697, "num_input_tokens_seen": 100663775, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.4921875, "step": 4661, "time_per_iteration": 2.362389326095581 }, { "auxiliary_loss_clip": 0.01075862, "auxiliary_loss_mlp": 0.01061799, "balance_loss_clip": 1.02703714, "balance_loss_mlp": 1.0249455, "epoch": 0.28029460393807304, "flos": 37266395829120.0, "grad_norm": 1.804585638079182, "language_loss": 0.7989859, "learning_rate": 3.3780339983841794e-06, "loss": 0.82036245, "num_input_tokens_seen": 100686085, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.5078125, "step": 4662, "time_per_iteration": 2.528850793838501 }, { "auxiliary_loss_clip": 0.01078547, "auxiliary_loss_mlp": 0.01061908, "balance_loss_clip": 1.02099514, "balance_loss_mlp": 1.02345061, "epoch": 0.280354727190741, "flos": 20740048698240.0, "grad_norm": 1.6966514247449367, "language_loss": 0.71169984, "learning_rate": 3.377751711782227e-06, "loss": 0.73310441, "num_input_tokens_seen": 100705135, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.55078125, "step": 4663, "time_per_iteration": 2.379404067993164 }, { "auxiliary_loss_clip": 0.01077526, "auxiliary_loss_mlp": 0.01062694, "balance_loss_clip": 1.02142382, "balance_loss_mlp": 1.02412999, "epoch": 0.28041485044340897, "flos": 21469942915200.0, "grad_norm": 1.7421386372719467, "language_loss": 0.80264735, "learning_rate": 3.377469372935791e-06, "loss": 0.82404959, "num_input_tokens_seen": 100724960, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 0.53515625, "step": 4664, "time_per_iteration": 2.440117120742798 }, { "auxiliary_loss_clip": 0.01072514, "auxiliary_loss_mlp": 0.01060878, "balance_loss_clip": 1.02432847, "balance_loss_mlp": 1.02172208, "epoch": 0.28047497369607693, "flos": 14793362375040.0, "grad_norm": 1.7605160689979775, "language_loss": 0.81202066, "learning_rate": 3.377186981855578e-06, "loss": 0.83335459, "num_input_tokens_seen": 100741995, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.5078125, "step": 4665, "time_per_iteration": 2.3460710048675537 }, { "auxiliary_loss_clip": 0.01077168, "auxiliary_loss_mlp": 0.01058519, "balance_loss_clip": 1.02256548, "balance_loss_mlp": 1.02541518, "epoch": 0.2805350969487449, "flos": 23069768279040.0, "grad_norm": 1.7531591754793212, "language_loss": 0.81929469, "learning_rate": 3.3769045385522968e-06, "loss": 0.84065151, "num_input_tokens_seen": 100758985, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.51953125, "step": 4666, "time_per_iteration": 2.4438018798828125 }, { "auxiliary_loss_clip": 0.0108217, "auxiliary_loss_mlp": 0.0105612, "balance_loss_clip": 1.01916528, "balance_loss_mlp": 1.02743387, "epoch": 0.2805952202014129, "flos": 20478968484480.0, "grad_norm": 2.3256609352915207, "language_loss": 0.8728075, "learning_rate": 3.376622043036658e-06, "loss": 0.89419043, "num_input_tokens_seen": 100777820, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.546875, "step": 4667, "time_per_iteration": 2.460062265396118 }, { "auxiliary_loss_clip": 0.01080352, "auxiliary_loss_mlp": 0.01064537, "balance_loss_clip": 1.0255084, "balance_loss_mlp": 1.02610743, "epoch": 0.2806553434540809, "flos": 27416105568000.0, "grad_norm": 2.1448869192435636, "language_loss": 0.81216931, "learning_rate": 3.376339495319373e-06, "loss": 0.83361816, "num_input_tokens_seen": 100798205, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.54296875, "step": 4668, "time_per_iteration": 2.4775025844573975 }, { "auxiliary_loss_clip": 0.01077652, "auxiliary_loss_mlp": 0.01057375, "balance_loss_clip": 1.02092075, "balance_loss_mlp": 1.0236702, "epoch": 0.28071546670674885, "flos": 26503825075200.0, "grad_norm": 1.3900241586094073, "language_loss": 0.77166843, "learning_rate": 3.3760568954111563e-06, "loss": 0.7930187, "num_input_tokens_seen": 100819800, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.5390625, "step": 4669, "time_per_iteration": 2.4621458053588867 }, { "auxiliary_loss_clip": 0.01080603, "auxiliary_loss_mlp": 0.01064544, "balance_loss_clip": 1.02837622, "balance_loss_mlp": 1.02643049, "epoch": 0.2807755899594168, "flos": 20557627511040.0, "grad_norm": 2.3032031758222735, "language_loss": 0.81161356, "learning_rate": 3.375774243322725e-06, "loss": 0.83306503, "num_input_tokens_seen": 100837880, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.54296875, "step": 4670, "time_per_iteration": 2.422586441040039 }, { "auxiliary_loss_clip": 0.01083178, "auxiliary_loss_mlp": 0.01065899, "balance_loss_clip": 1.0280149, "balance_loss_mlp": 1.02788544, "epoch": 0.2808357132120848, "flos": 24312257210880.0, "grad_norm": 2.0140508194992917, "language_loss": 0.81115055, "learning_rate": 3.3754915390647955e-06, "loss": 0.83264136, "num_input_tokens_seen": 100856350, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5546875, "step": 4671, "time_per_iteration": 2.4333016872406006 }, { "auxiliary_loss_clip": 0.01076685, "auxiliary_loss_mlp": 0.01049992, "balance_loss_clip": 1.01668501, "balance_loss_mlp": 1.02664101, "epoch": 0.28089583646475275, "flos": 26431205713920.0, "grad_norm": 1.8505935922550074, "language_loss": 0.76447517, "learning_rate": 3.37520878264809e-06, "loss": 0.78574193, "num_input_tokens_seen": 100876135, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.5, "step": 4672, "time_per_iteration": 2.4659483432769775 }, { "auxiliary_loss_clip": 0.01084693, "auxiliary_loss_mlp": 0.01068439, "balance_loss_clip": 1.02709711, "balance_loss_mlp": 1.0293107, "epoch": 0.2809559597174207, "flos": 23110721170560.0, "grad_norm": 2.630489905644986, "language_loss": 0.77324677, "learning_rate": 3.3749259740833286e-06, "loss": 0.79477805, "num_input_tokens_seen": 100894790, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.5546875, "step": 4673, "time_per_iteration": 2.3980906009674072 }, { "auxiliary_loss_clip": 0.01077794, "auxiliary_loss_mlp": 0.01056871, "balance_loss_clip": 1.01922464, "balance_loss_mlp": 1.02434015, "epoch": 0.2810160829700887, "flos": 20922434974080.0, "grad_norm": 2.248175506604307, "language_loss": 0.73431802, "learning_rate": 3.374643113381237e-06, "loss": 0.75566465, "num_input_tokens_seen": 100915100, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.53515625, "step": 4674, "time_per_iteration": 2.4272267818450928 }, { "auxiliary_loss_clip": 0.01084078, "auxiliary_loss_mlp": 0.01062474, "balance_loss_clip": 1.02358818, "balance_loss_mlp": 1.02846932, "epoch": 0.28107620622275664, "flos": 14355027855360.0, "grad_norm": 2.460728561415191, "language_loss": 0.79073429, "learning_rate": 3.374360200552541e-06, "loss": 0.81219977, "num_input_tokens_seen": 100932795, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.5546875, "step": 4675, "time_per_iteration": 2.364560127258301 }, { "auxiliary_loss_clip": 0.01078683, "auxiliary_loss_mlp": 0.01059344, "balance_loss_clip": 1.02196026, "balance_loss_mlp": 1.0246352, "epoch": 0.2811363294754246, "flos": 20918140876800.0, "grad_norm": 2.1454887605451876, "language_loss": 0.71479356, "learning_rate": 3.374077235607968e-06, "loss": 0.73617381, "num_input_tokens_seen": 100950505, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.54296875, "step": 4676, "time_per_iteration": 2.4208877086639404 }, { "auxiliary_loss_clip": 0.01073907, "auxiliary_loss_mlp": 0.01055509, "balance_loss_clip": 1.01919782, "balance_loss_mlp": 1.02456164, "epoch": 0.28119645272809257, "flos": 20593797546240.0, "grad_norm": 1.4984206123770427, "language_loss": 0.72127086, "learning_rate": 3.3737942185582487e-06, "loss": 0.74256504, "num_input_tokens_seen": 100968790, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.4921875, "step": 4677, "time_per_iteration": 2.3764734268188477 }, { "auxiliary_loss_clip": 0.01076891, "auxiliary_loss_mlp": 0.01055143, "balance_loss_clip": 1.01910651, "balance_loss_mlp": 1.02358627, "epoch": 0.28125657598076054, "flos": 25336259654400.0, "grad_norm": 3.0408568614982183, "language_loss": 0.64009249, "learning_rate": 3.3735111494141153e-06, "loss": 0.66141278, "num_input_tokens_seen": 100990205, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.53125, "step": 4678, "time_per_iteration": 2.433130979537964 }, { "auxiliary_loss_clip": 0.01075383, "auxiliary_loss_mlp": 0.01060978, "balance_loss_clip": 1.02256835, "balance_loss_mlp": 1.0226202, "epoch": 0.2813166992334285, "flos": 24825934177920.0, "grad_norm": 1.5747668625149234, "language_loss": 0.71840012, "learning_rate": 3.3732280281863013e-06, "loss": 0.73976374, "num_input_tokens_seen": 101009815, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.52734375, "step": 4679, "time_per_iteration": 2.4161243438720703 }, { "auxiliary_loss_clip": 0.0107562, "auxiliary_loss_mlp": 0.01059454, "balance_loss_clip": 1.02180743, "balance_loss_mlp": 1.02272987, "epoch": 0.2813768224860965, "flos": 21759722133120.0, "grad_norm": 1.9146496257083523, "language_loss": 0.75607181, "learning_rate": 3.3729448548855422e-06, "loss": 0.77742255, "num_input_tokens_seen": 101026780, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.52734375, "step": 4680, "time_per_iteration": 2.4204468727111816 }, { "auxiliary_loss_clip": 0.01077649, "auxiliary_loss_mlp": 0.01055174, "balance_loss_clip": 1.01895833, "balance_loss_mlp": 1.02411509, "epoch": 0.2814369457387645, "flos": 24315643612800.0, "grad_norm": 1.5876086403023753, "language_loss": 0.78479981, "learning_rate": 3.3726616295225774e-06, "loss": 0.80612808, "num_input_tokens_seen": 101046215, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.53515625, "step": 4681, "time_per_iteration": 2.4202840328216553 }, { "auxiliary_loss_clip": 0.0107626, "auxiliary_loss_mlp": 0.01058012, "balance_loss_clip": 1.01924539, "balance_loss_mlp": 1.02262545, "epoch": 0.28149706899143245, "flos": 18514335657600.0, "grad_norm": 2.1904869768750346, "language_loss": 0.76306874, "learning_rate": 3.372378352108146e-06, "loss": 0.78441149, "num_input_tokens_seen": 101063365, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.53515625, "step": 4682, "time_per_iteration": 3.7881886959075928 }, { "auxiliary_loss_clip": 0.01074142, "auxiliary_loss_mlp": 0.01055862, "balance_loss_clip": 1.01866913, "balance_loss_mlp": 1.02219307, "epoch": 0.2815571922441004, "flos": 24862104213120.0, "grad_norm": 2.070927810340737, "language_loss": 0.8209852, "learning_rate": 3.3720950226529894e-06, "loss": 0.84228522, "num_input_tokens_seen": 101083835, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.51953125, "step": 4683, "time_per_iteration": 2.4148483276367188 }, { "auxiliary_loss_clip": 0.01076493, "auxiliary_loss_mlp": 0.01056769, "balance_loss_clip": 1.02174532, "balance_loss_mlp": 1.02350736, "epoch": 0.2816173154967684, "flos": 19900597034880.0, "grad_norm": 2.4858029449364287, "language_loss": 0.77324909, "learning_rate": 3.371811641167852e-06, "loss": 0.79458171, "num_input_tokens_seen": 101101740, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.53125, "step": 4684, "time_per_iteration": 2.392758846282959 }, { "auxiliary_loss_clip": 0.01073065, "auxiliary_loss_mlp": 0.01049314, "balance_loss_clip": 1.01591182, "balance_loss_mlp": 1.02213931, "epoch": 0.28167743874943635, "flos": 17490437948160.0, "grad_norm": 1.6853096583050504, "language_loss": 0.7749542, "learning_rate": 3.3715282076634807e-06, "loss": 0.79617798, "num_input_tokens_seen": 101120480, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.5078125, "step": 4685, "time_per_iteration": 3.8161308765411377 }, { "auxiliary_loss_clip": 0.01070713, "auxiliary_loss_mlp": 0.01050004, "balance_loss_clip": 1.01610076, "balance_loss_mlp": 1.02125263, "epoch": 0.2817375620021043, "flos": 25300927491840.0, "grad_norm": 1.507850484658547, "language_loss": 0.77059197, "learning_rate": 3.3712447221506218e-06, "loss": 0.79179919, "num_input_tokens_seen": 101142910, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.49414062, "step": 4686, "time_per_iteration": 3.965028762817383 }, { "auxiliary_loss_clip": 0.01076811, "auxiliary_loss_mlp": 0.01059403, "balance_loss_clip": 1.02063632, "balance_loss_mlp": 1.02340198, "epoch": 0.2817976852547723, "flos": 18692358013440.0, "grad_norm": 3.042565515878938, "language_loss": 0.65233904, "learning_rate": 3.370961184640025e-06, "loss": 0.67370117, "num_input_tokens_seen": 101160030, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.53515625, "step": 4687, "time_per_iteration": 2.4101359844207764 }, { "auxiliary_loss_clip": 0.01077388, "auxiliary_loss_mlp": 0.01060127, "balance_loss_clip": 1.0236485, "balance_loss_mlp": 1.02533913, "epoch": 0.28185780850744024, "flos": 22741305408000.0, "grad_norm": 2.5664833465295387, "language_loss": 0.78177023, "learning_rate": 3.3706775951424433e-06, "loss": 0.80314541, "num_input_tokens_seen": 101177675, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.51953125, "step": 4688, "time_per_iteration": 3.8244881629943848 }, { "auxiliary_loss_clip": 0.01074161, "auxiliary_loss_mlp": 0.01053045, "balance_loss_clip": 1.01763988, "balance_loss_mlp": 1.02339292, "epoch": 0.2819179317601082, "flos": 14933189836800.0, "grad_norm": 1.9077793850065246, "language_loss": 0.80201823, "learning_rate": 3.37039395366863e-06, "loss": 0.82329029, "num_input_tokens_seen": 101192225, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.5078125, "step": 4689, "time_per_iteration": 2.3489952087402344 }, { "auxiliary_loss_clip": 0.01075698, "auxiliary_loss_mlp": 0.0104856, "balance_loss_clip": 1.01372695, "balance_loss_mlp": 1.02441537, "epoch": 0.2819780550127762, "flos": 23144307765120.0, "grad_norm": 1.6093650302615685, "language_loss": 0.78959715, "learning_rate": 3.37011026022934e-06, "loss": 0.81083977, "num_input_tokens_seen": 101210870, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.51171875, "step": 4690, "time_per_iteration": 2.4115638732910156 }, { "auxiliary_loss_clip": 0.01075653, "auxiliary_loss_mlp": 0.01050436, "balance_loss_clip": 1.01641369, "balance_loss_mlp": 1.02306056, "epoch": 0.28203817826544414, "flos": 21615286371840.0, "grad_norm": 3.3055515073188517, "language_loss": 0.88851738, "learning_rate": 3.369826514835332e-06, "loss": 0.90977836, "num_input_tokens_seen": 101229965, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.5234375, "step": 4691, "time_per_iteration": 2.384486198425293 }, { "auxiliary_loss_clip": 0.01079977, "auxiliary_loss_mlp": 0.01060821, "balance_loss_clip": 1.02706087, "balance_loss_mlp": 1.02653646, "epoch": 0.2820983015181121, "flos": 24025585104000.0, "grad_norm": 1.7103776242309447, "language_loss": 0.83197814, "learning_rate": 3.3695427174973654e-06, "loss": 0.85338604, "num_input_tokens_seen": 101250980, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.53125, "step": 4692, "time_per_iteration": 2.4489409923553467 }, { "auxiliary_loss_clip": 0.01076642, "auxiliary_loss_mlp": 0.01060059, "balance_loss_clip": 1.02350938, "balance_loss_mlp": 1.02468419, "epoch": 0.2821584247707801, "flos": 30006626071680.0, "grad_norm": 1.5725828492621323, "language_loss": 0.75767666, "learning_rate": 3.3692588682262022e-06, "loss": 0.77904367, "num_input_tokens_seen": 101273335, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.51953125, "step": 4693, "time_per_iteration": 2.4895682334899902 }, { "auxiliary_loss_clip": 0.01075775, "auxiliary_loss_mlp": 0.01049423, "balance_loss_clip": 1.01456583, "balance_loss_mlp": 1.0232619, "epoch": 0.2822185480234481, "flos": 21395717631360.0, "grad_norm": 1.7299452960929549, "language_loss": 0.79036152, "learning_rate": 3.3689749670326046e-06, "loss": 0.8116135, "num_input_tokens_seen": 101292110, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.5234375, "step": 4694, "time_per_iteration": 2.4006242752075195 }, { "auxiliary_loss_clip": 0.010745, "auxiliary_loss_mlp": 0.01049439, "balance_loss_clip": 1.0169183, "balance_loss_mlp": 1.02518725, "epoch": 0.28227867127611606, "flos": 27451612287360.0, "grad_norm": 1.7353932559278333, "language_loss": 0.6827755, "learning_rate": 3.3686910139273392e-06, "loss": 0.7040149, "num_input_tokens_seen": 101312815, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.49414062, "step": 4695, "time_per_iteration": 2.4656476974487305 }, { "auxiliary_loss_clip": 0.01077869, "auxiliary_loss_mlp": 0.01058895, "balance_loss_clip": 1.02277446, "balance_loss_mlp": 1.0245676, "epoch": 0.282338794528784, "flos": 22592854840320.0, "grad_norm": 2.08248830040284, "language_loss": 0.76893044, "learning_rate": 3.3684070089211736e-06, "loss": 0.79029804, "num_input_tokens_seen": 101329045, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.53125, "step": 4696, "time_per_iteration": 2.378814220428467 }, { "auxiliary_loss_clip": 0.01078481, "auxiliary_loss_mlp": 0.01055785, "balance_loss_clip": 1.02076149, "balance_loss_mlp": 1.02664435, "epoch": 0.282398917781452, "flos": 42009311784960.0, "grad_norm": 1.8841364001543182, "language_loss": 0.6343829, "learning_rate": 3.368122952024877e-06, "loss": 0.6557256, "num_input_tokens_seen": 101352715, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.515625, "step": 4697, "time_per_iteration": 2.596346855163574 }, { "auxiliary_loss_clip": 0.01070532, "auxiliary_loss_mlp": 0.01047753, "balance_loss_clip": 1.01797473, "balance_loss_mlp": 1.02230275, "epoch": 0.28245904103411995, "flos": 23223525373440.0, "grad_norm": 1.606275432149893, "language_loss": 0.74177808, "learning_rate": 3.3678388432492214e-06, "loss": 0.76296085, "num_input_tokens_seen": 101374640, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.48242188, "step": 4698, "time_per_iteration": 2.4619977474212646 }, { "auxiliary_loss_clip": 0.0107196, "auxiliary_loss_mlp": 0.01051307, "balance_loss_clip": 1.02086139, "balance_loss_mlp": 1.02303231, "epoch": 0.2825191642867879, "flos": 25373442119040.0, "grad_norm": 3.3876386134487246, "language_loss": 0.77114475, "learning_rate": 3.3675546826049788e-06, "loss": 0.79237747, "num_input_tokens_seen": 101393595, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.48828125, "step": 4699, "time_per_iteration": 2.459455728530884 }, { "auxiliary_loss_clip": 0.01076197, "auxiliary_loss_mlp": 0.01055288, "balance_loss_clip": 1.01797545, "balance_loss_mlp": 1.02395022, "epoch": 0.2825792875394559, "flos": 17235886158720.0, "grad_norm": 6.897366929514765, "language_loss": 0.83459896, "learning_rate": 3.3672704701029265e-06, "loss": 0.85591388, "num_input_tokens_seen": 101409265, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.5234375, "step": 4700, "time_per_iteration": 2.3565571308135986 }, { "auxiliary_loss_clip": 0.01068034, "auxiliary_loss_mlp": 0.01045582, "balance_loss_clip": 1.01687622, "balance_loss_mlp": 1.02163625, "epoch": 0.28263941079212385, "flos": 26722765411200.0, "grad_norm": 1.726302704720677, "language_loss": 0.82934475, "learning_rate": 3.3669862057538402e-06, "loss": 0.85048085, "num_input_tokens_seen": 101428365, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.46484375, "step": 4701, "time_per_iteration": 2.4480061531066895 }, { "auxiliary_loss_clip": 0.01071317, "auxiliary_loss_mlp": 0.01050153, "balance_loss_clip": 1.01810932, "balance_loss_mlp": 1.02115655, "epoch": 0.2826995340447918, "flos": 25920147098880.0, "grad_norm": 2.3050024689287363, "language_loss": 0.75630534, "learning_rate": 3.3667018895685004e-06, "loss": 0.77752, "num_input_tokens_seen": 101447280, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.5, "step": 4702, "time_per_iteration": 2.4229376316070557 }, { "auxiliary_loss_clip": 0.01070087, "auxiliary_loss_mlp": 0.01047948, "balance_loss_clip": 1.0168817, "balance_loss_mlp": 1.02223134, "epoch": 0.2827596572974598, "flos": 22378697360640.0, "grad_norm": 1.8226248368435236, "language_loss": 0.79917049, "learning_rate": 3.3664175215576886e-06, "loss": 0.82035077, "num_input_tokens_seen": 101465435, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.47851562, "step": 4703, "time_per_iteration": 2.439295530319214 }, { "auxiliary_loss_clip": 0.01070917, "auxiliary_loss_mlp": 0.0105216, "balance_loss_clip": 1.01980639, "balance_loss_mlp": 1.02139246, "epoch": 0.28281978055012774, "flos": 33545736748800.0, "grad_norm": 1.8523219666451411, "language_loss": 0.70716125, "learning_rate": 3.3661331017321867e-06, "loss": 0.728392, "num_input_tokens_seen": 101486355, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.49609375, "step": 4704, "time_per_iteration": 2.487262725830078 }, { "auxiliary_loss_clip": 0.01071736, "auxiliary_loss_mlp": 0.01056591, "balance_loss_clip": 1.02359414, "balance_loss_mlp": 1.02353096, "epoch": 0.2828799038027957, "flos": 23439742623360.0, "grad_norm": 2.825825832611166, "language_loss": 0.71874326, "learning_rate": 3.3658486301027807e-06, "loss": 0.74002647, "num_input_tokens_seen": 101505875, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.48242188, "step": 4705, "time_per_iteration": 2.449760675430298 }, { "auxiliary_loss_clip": 0.01018686, "auxiliary_loss_mlp": 0.01008896, "balance_loss_clip": 1.00462878, "balance_loss_mlp": 1.00686038, "epoch": 0.2829400270554637, "flos": 69870629410560.0, "grad_norm": 0.7216904597050283, "language_loss": 0.59359223, "learning_rate": 3.3655641066802577e-06, "loss": 0.61386806, "num_input_tokens_seen": 101565045, "router_z_loss_clip": 0.04272461, "router_z_loss_mlp": 0.11816406, "step": 4706, "time_per_iteration": 3.1052396297454834 }, { "auxiliary_loss_clip": 0.0106837, "auxiliary_loss_mlp": 0.01044338, "balance_loss_clip": 1.0167532, "balance_loss_mlp": 1.02169228, "epoch": 0.2830001503081317, "flos": 24787913840640.0, "grad_norm": 1.5544511839354211, "language_loss": 0.82715386, "learning_rate": 3.365279531475407e-06, "loss": 0.84828091, "num_input_tokens_seen": 101585825, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.46875, "step": 4707, "time_per_iteration": 2.4354686737060547 }, { "auxiliary_loss_clip": 0.0107426, "auxiliary_loss_mlp": 0.01049852, "balance_loss_clip": 1.01578188, "balance_loss_mlp": 1.02262449, "epoch": 0.28306027356079966, "flos": 27668248473600.0, "grad_norm": 1.8709330592639417, "language_loss": 0.81738544, "learning_rate": 3.36499490449902e-06, "loss": 0.83862662, "num_input_tokens_seen": 101606105, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.515625, "step": 4708, "time_per_iteration": 2.444035053253174 }, { "auxiliary_loss_clip": 0.01020949, "auxiliary_loss_mlp": 0.01025797, "balance_loss_clip": 1.02145791, "balance_loss_mlp": 1.00918674, "epoch": 0.2831203968134676, "flos": 60525288276480.0, "grad_norm": 0.9020369355131317, "language_loss": 0.62869424, "learning_rate": 3.3647102257618895e-06, "loss": 0.6491617, "num_input_tokens_seen": 101656875, "router_z_loss_clip": 0.04345703, "router_z_loss_mlp": 0.1171875, "step": 4709, "time_per_iteration": 2.939412832260132 }, { "auxiliary_loss_clip": 0.01074097, "auxiliary_loss_mlp": 0.01046889, "balance_loss_clip": 1.01565623, "balance_loss_mlp": 1.02544761, "epoch": 0.2831805200661356, "flos": 22053690714240.0, "grad_norm": 1.479323472615513, "language_loss": 0.75264448, "learning_rate": 3.3644254952748103e-06, "loss": 0.77385437, "num_input_tokens_seen": 101676225, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.48632812, "step": 4710, "time_per_iteration": 2.4848527908325195 }, { "auxiliary_loss_clip": 0.01076303, "auxiliary_loss_mlp": 0.01060904, "balance_loss_clip": 1.02585673, "balance_loss_mlp": 1.02395868, "epoch": 0.28324064331880355, "flos": 22599592732800.0, "grad_norm": 1.8142361819399593, "language_loss": 0.81385469, "learning_rate": 3.364140713048579e-06, "loss": 0.83522677, "num_input_tokens_seen": 101693710, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.5234375, "step": 4711, "time_per_iteration": 2.405076265335083 }, { "auxiliary_loss_clip": 0.01076509, "auxiliary_loss_mlp": 0.01052689, "balance_loss_clip": 1.02117026, "balance_loss_mlp": 1.02468729, "epoch": 0.2833007665714715, "flos": 30402960359040.0, "grad_norm": 2.1213666931100867, "language_loss": 0.72170973, "learning_rate": 3.363855879093996e-06, "loss": 0.7430017, "num_input_tokens_seen": 101714010, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.515625, "step": 4712, "time_per_iteration": 2.4853057861328125 }, { "auxiliary_loss_clip": 0.01075558, "auxiliary_loss_mlp": 0.01062017, "balance_loss_clip": 1.02851939, "balance_loss_mlp": 1.0246048, "epoch": 0.2833608898241395, "flos": 23548392374400.0, "grad_norm": 1.7952975578431338, "language_loss": 0.83828217, "learning_rate": 3.3635709934218605e-06, "loss": 0.85965788, "num_input_tokens_seen": 101732995, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.5078125, "step": 4713, "time_per_iteration": 2.4018986225128174 }, { "auxiliary_loss_clip": 0.01075767, "auxiliary_loss_mlp": 0.01056708, "balance_loss_clip": 1.02492702, "balance_loss_mlp": 1.0261538, "epoch": 0.28342101307680745, "flos": 20265683788800.0, "grad_norm": 1.758811263825591, "language_loss": 0.77426279, "learning_rate": 3.3632860560429766e-06, "loss": 0.79558754, "num_input_tokens_seen": 101751385, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.49609375, "step": 4714, "time_per_iteration": 2.4213197231292725 }, { "auxiliary_loss_clip": 0.0107499, "auxiliary_loss_mlp": 0.01061152, "balance_loss_clip": 1.03056252, "balance_loss_mlp": 1.02551889, "epoch": 0.2834811363294754, "flos": 30845728621440.0, "grad_norm": 1.4416170518889595, "language_loss": 0.79435378, "learning_rate": 3.3630010669681494e-06, "loss": 0.81571519, "num_input_tokens_seen": 101773825, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.49609375, "step": 4715, "time_per_iteration": 2.513606309890747 }, { "auxiliary_loss_clip": 0.01074895, "auxiliary_loss_mlp": 0.01059316, "balance_loss_clip": 1.02572334, "balance_loss_mlp": 1.02502453, "epoch": 0.2835412595821434, "flos": 22709918229120.0, "grad_norm": 1.7031063393484447, "language_loss": 0.74794328, "learning_rate": 3.3627160262081845e-06, "loss": 0.76928544, "num_input_tokens_seen": 101791920, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.5, "step": 4716, "time_per_iteration": 2.461986780166626 }, { "auxiliary_loss_clip": 0.01076971, "auxiliary_loss_mlp": 0.01069454, "balance_loss_clip": 1.03206968, "balance_loss_mlp": 1.02255106, "epoch": 0.28360138283481134, "flos": 18076734276480.0, "grad_norm": 2.2729664851207403, "language_loss": 0.76687682, "learning_rate": 3.3624309337738917e-06, "loss": 0.78834105, "num_input_tokens_seen": 101809515, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.546875, "step": 4717, "time_per_iteration": 2.3891706466674805 }, { "auxiliary_loss_clip": 0.01075213, "auxiliary_loss_mlp": 0.01064477, "balance_loss_clip": 1.02954876, "balance_loss_mlp": 1.02345192, "epoch": 0.2836615060874793, "flos": 17853918779520.0, "grad_norm": 1.5566477876659615, "language_loss": 0.68559635, "learning_rate": 3.3621457896760813e-06, "loss": 0.70699322, "num_input_tokens_seen": 101827735, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.51953125, "step": 4718, "time_per_iteration": 2.397733688354492 }, { "auxiliary_loss_clip": 0.01076139, "auxiliary_loss_mlp": 0.01061572, "balance_loss_clip": 1.02700162, "balance_loss_mlp": 1.02358508, "epoch": 0.2837216293401473, "flos": 25739087454720.0, "grad_norm": 1.6433678478870297, "language_loss": 0.74389148, "learning_rate": 3.361860593925566e-06, "loss": 0.76526862, "num_input_tokens_seen": 101845970, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.5234375, "step": 4719, "time_per_iteration": 2.443596124649048 }, { "auxiliary_loss_clip": 0.01072691, "auxiliary_loss_mlp": 0.01057427, "balance_loss_clip": 1.02271295, "balance_loss_mlp": 1.02343249, "epoch": 0.2837817525928153, "flos": 20922469885440.0, "grad_norm": 1.815107214020504, "language_loss": 0.81781822, "learning_rate": 3.3615753465331605e-06, "loss": 0.83911943, "num_input_tokens_seen": 101865040, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.4921875, "step": 4720, "time_per_iteration": 2.3948445320129395 }, { "auxiliary_loss_clip": 0.01075249, "auxiliary_loss_mlp": 0.01057191, "balance_loss_clip": 1.01851988, "balance_loss_mlp": 1.02322078, "epoch": 0.28384187584548326, "flos": 18915697180800.0, "grad_norm": 1.822250441707057, "language_loss": 0.8079623, "learning_rate": 3.3612900475096817e-06, "loss": 0.82928669, "num_input_tokens_seen": 101883735, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.51953125, "step": 4721, "time_per_iteration": 2.4417712688446045 }, { "auxiliary_loss_clip": 0.01071824, "auxiliary_loss_mlp": 0.01051227, "balance_loss_clip": 1.01863551, "balance_loss_mlp": 1.0215528, "epoch": 0.2839019990981512, "flos": 27342753068160.0, "grad_norm": 1.9615977759241718, "language_loss": 0.84351826, "learning_rate": 3.3610046968659474e-06, "loss": 0.86474878, "num_input_tokens_seen": 101903025, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.50390625, "step": 4722, "time_per_iteration": 3.964268684387207 }, { "auxiliary_loss_clip": 0.01073584, "auxiliary_loss_mlp": 0.0104903, "balance_loss_clip": 1.01631904, "balance_loss_mlp": 1.02306306, "epoch": 0.2839621223508192, "flos": 18113323248000.0, "grad_norm": 1.8690806976543404, "language_loss": 0.71081311, "learning_rate": 3.3607192946127785e-06, "loss": 0.73203921, "num_input_tokens_seen": 101922255, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.50390625, "step": 4723, "time_per_iteration": 2.400887966156006 }, { "auxiliary_loss_clip": 0.01075444, "auxiliary_loss_mlp": 0.01052084, "balance_loss_clip": 1.01558232, "balance_loss_mlp": 1.02365446, "epoch": 0.28402224560348716, "flos": 26357189898240.0, "grad_norm": 1.5831876150218114, "language_loss": 0.79948676, "learning_rate": 3.360433840760998e-06, "loss": 0.82076204, "num_input_tokens_seen": 101943100, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.51953125, "step": 4724, "time_per_iteration": 3.8484649658203125 }, { "auxiliary_loss_clip": 0.01076815, "auxiliary_loss_mlp": 0.01056098, "balance_loss_clip": 1.02014482, "balance_loss_mlp": 1.02459276, "epoch": 0.2840823688561551, "flos": 24059660457600.0, "grad_norm": 1.8255867507188768, "language_loss": 0.93966043, "learning_rate": 3.36014833532143e-06, "loss": 0.96098959, "num_input_tokens_seen": 101963160, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.5234375, "step": 4725, "time_per_iteration": 2.411259651184082 }, { "auxiliary_loss_clip": 0.01077186, "auxiliary_loss_mlp": 0.01058698, "balance_loss_clip": 1.02140963, "balance_loss_mlp": 1.02478433, "epoch": 0.2841424921088231, "flos": 29458559548800.0, "grad_norm": 2.3997252007971213, "language_loss": 0.90161264, "learning_rate": 3.3598627783049e-06, "loss": 0.92297149, "num_input_tokens_seen": 101984300, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.5234375, "step": 4726, "time_per_iteration": 3.947939872741699 }, { "auxiliary_loss_clip": 0.01077161, "auxiliary_loss_mlp": 0.01059857, "balance_loss_clip": 1.02523911, "balance_loss_mlp": 1.02562833, "epoch": 0.28420261536149105, "flos": 48098688301440.0, "grad_norm": 2.0506136916434885, "language_loss": 0.80552876, "learning_rate": 3.359577169722238e-06, "loss": 0.82689893, "num_input_tokens_seen": 102005765, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.515625, "step": 4727, "time_per_iteration": 4.058341026306152 }, { "auxiliary_loss_clip": 0.01073043, "auxiliary_loss_mlp": 0.01049946, "balance_loss_clip": 1.01835573, "balance_loss_mlp": 1.02348638, "epoch": 0.284262738614159, "flos": 25664966904960.0, "grad_norm": 3.6178745915667476, "language_loss": 0.68747115, "learning_rate": 3.3592915095842733e-06, "loss": 0.70870101, "num_input_tokens_seen": 102022755, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.49609375, "step": 4728, "time_per_iteration": 2.4283571243286133 }, { "auxiliary_loss_clip": 0.01074001, "auxiliary_loss_mlp": 0.01059988, "balance_loss_clip": 1.02496481, "balance_loss_mlp": 1.02337551, "epoch": 0.284322861866827, "flos": 19717966379520.0, "grad_norm": 3.4989805553700246, "language_loss": 0.78457457, "learning_rate": 3.3590057979018386e-06, "loss": 0.8059144, "num_input_tokens_seen": 102041850, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.50390625, "step": 4729, "time_per_iteration": 2.406920909881592 }, { "auxiliary_loss_clip": 0.01078605, "auxiliary_loss_mlp": 0.0105751, "balance_loss_clip": 1.01995945, "balance_loss_mlp": 1.02596056, "epoch": 0.28438298511949495, "flos": 23914107532800.0, "grad_norm": 1.793091625493373, "language_loss": 0.67934281, "learning_rate": 3.3587200346857674e-06, "loss": 0.70070398, "num_input_tokens_seen": 102059500, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.52734375, "step": 4730, "time_per_iteration": 2.452794313430786 }, { "auxiliary_loss_clip": 0.01077291, "auxiliary_loss_mlp": 0.01052987, "balance_loss_clip": 1.01684284, "balance_loss_mlp": 1.02569306, "epoch": 0.2844431083721629, "flos": 26066153871360.0, "grad_norm": 1.6758702146814628, "language_loss": 0.76265371, "learning_rate": 3.3584342199468965e-06, "loss": 0.78395653, "num_input_tokens_seen": 102080460, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.515625, "step": 4731, "time_per_iteration": 2.4511759281158447 }, { "auxiliary_loss_clip": 0.01074882, "auxiliary_loss_mlp": 0.01056428, "balance_loss_clip": 1.02290678, "balance_loss_mlp": 1.02397084, "epoch": 0.2845032316248309, "flos": 25809297932160.0, "grad_norm": 1.491089932463519, "language_loss": 0.84510922, "learning_rate": 3.3581483536960638e-06, "loss": 0.8664223, "num_input_tokens_seen": 102100950, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.5078125, "step": 4732, "time_per_iteration": 2.466252326965332 }, { "auxiliary_loss_clip": 0.01077078, "auxiliary_loss_mlp": 0.01064418, "balance_loss_clip": 1.0260092, "balance_loss_mlp": 1.02444088, "epoch": 0.2845633548774989, "flos": 19822322033280.0, "grad_norm": 1.6431155688000156, "language_loss": 0.80677259, "learning_rate": 3.357862435944109e-06, "loss": 0.82818758, "num_input_tokens_seen": 102119345, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.52734375, "step": 4733, "time_per_iteration": 2.3997511863708496 }, { "auxiliary_loss_clip": 0.01079883, "auxiliary_loss_mlp": 0.01065864, "balance_loss_clip": 1.0244031, "balance_loss_mlp": 1.02512479, "epoch": 0.28462347813016686, "flos": 23181769520640.0, "grad_norm": 2.427114045750969, "language_loss": 0.73544776, "learning_rate": 3.357576466701875e-06, "loss": 0.7569052, "num_input_tokens_seen": 102139050, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 0.546875, "step": 4734, "time_per_iteration": 2.4254510402679443 }, { "auxiliary_loss_clip": 0.01072561, "auxiliary_loss_mlp": 0.01053659, "balance_loss_clip": 1.01837289, "balance_loss_mlp": 1.02162051, "epoch": 0.2846836013828348, "flos": 18659504557440.0, "grad_norm": 3.440772847853321, "language_loss": 0.75363761, "learning_rate": 3.3572904459802056e-06, "loss": 0.77489984, "num_input_tokens_seen": 102157935, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.5078125, "step": 4735, "time_per_iteration": 2.3891022205352783 }, { "auxiliary_loss_clip": 0.01073673, "auxiliary_loss_mlp": 0.01061956, "balance_loss_clip": 1.02881598, "balance_loss_mlp": 1.02296472, "epoch": 0.2847437246355028, "flos": 14172641579520.0, "grad_norm": 2.3785281303374175, "language_loss": 0.81087649, "learning_rate": 3.357004373789946e-06, "loss": 0.83223271, "num_input_tokens_seen": 102175325, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.5078125, "step": 4736, "time_per_iteration": 2.383725881576538 }, { "auxiliary_loss_clip": 0.01076747, "auxiliary_loss_mlp": 0.01058753, "balance_loss_clip": 1.02034414, "balance_loss_mlp": 1.02455556, "epoch": 0.28480384788817076, "flos": 29277080968320.0, "grad_norm": 1.9269392644904917, "language_loss": 0.61018932, "learning_rate": 3.3567182501419453e-06, "loss": 0.63154429, "num_input_tokens_seen": 102196625, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.5234375, "step": 4737, "time_per_iteration": 2.4753646850585938 }, { "auxiliary_loss_clip": 0.01071016, "auxiliary_loss_mlp": 0.01062245, "balance_loss_clip": 1.02579129, "balance_loss_mlp": 1.02085793, "epoch": 0.2848639711408387, "flos": 22600221137280.0, "grad_norm": 2.0520696799436013, "language_loss": 0.87928224, "learning_rate": 3.356432075047052e-06, "loss": 0.90061492, "num_input_tokens_seen": 102214975, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.5, "step": 4738, "time_per_iteration": 2.4211652278900146 }, { "auxiliary_loss_clip": 0.01078696, "auxiliary_loss_mlp": 0.01069069, "balance_loss_clip": 1.02820432, "balance_loss_mlp": 1.02354622, "epoch": 0.2849240943935067, "flos": 17598598940160.0, "grad_norm": 2.0235511349570143, "language_loss": 0.91971934, "learning_rate": 3.356145848516118e-06, "loss": 0.94119704, "num_input_tokens_seen": 102231885, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.55078125, "step": 4739, "time_per_iteration": 2.378560781478882 }, { "auxiliary_loss_clip": 0.01075701, "auxiliary_loss_mlp": 0.0105085, "balance_loss_clip": 1.01756728, "balance_loss_mlp": 1.0239377, "epoch": 0.28498421764617465, "flos": 24861440897280.0, "grad_norm": 1.6689619181131328, "language_loss": 0.72650218, "learning_rate": 3.355859570559998e-06, "loss": 0.74776763, "num_input_tokens_seen": 102252725, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.51953125, "step": 4740, "time_per_iteration": 2.4405720233917236 }, { "auxiliary_loss_clip": 0.01072659, "auxiliary_loss_mlp": 0.01046974, "balance_loss_clip": 1.01111579, "balance_loss_mlp": 1.02291703, "epoch": 0.2850443408988426, "flos": 22781490249600.0, "grad_norm": 1.509469855394795, "language_loss": 0.79439133, "learning_rate": 3.3555732411895477e-06, "loss": 0.81558764, "num_input_tokens_seen": 102271730, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.49609375, "step": 4741, "time_per_iteration": 2.420545816421509 }, { "auxiliary_loss_clip": 0.01075667, "auxiliary_loss_mlp": 0.01057342, "balance_loss_clip": 1.01816988, "balance_loss_mlp": 1.02132845, "epoch": 0.2851044641515106, "flos": 18843042908160.0, "grad_norm": 2.049237337244661, "language_loss": 0.77733159, "learning_rate": 3.3552868604156235e-06, "loss": 0.79866171, "num_input_tokens_seen": 102291325, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.54296875, "step": 4742, "time_per_iteration": 2.395918130874634 }, { "auxiliary_loss_clip": 0.01079579, "auxiliary_loss_mlp": 0.01065705, "balance_loss_clip": 1.02529359, "balance_loss_mlp": 1.0236094, "epoch": 0.28516458740417855, "flos": 18879492234240.0, "grad_norm": 2.182973752867177, "language_loss": 0.5945363, "learning_rate": 3.355000428249086e-06, "loss": 0.61598915, "num_input_tokens_seen": 102309000, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.55859375, "step": 4743, "time_per_iteration": 2.3966941833496094 }, { "auxiliary_loss_clip": 0.01079592, "auxiliary_loss_mlp": 0.01058454, "balance_loss_clip": 1.01983023, "balance_loss_mlp": 1.02499247, "epoch": 0.2852247106568465, "flos": 25298693164800.0, "grad_norm": 1.816825386539203, "language_loss": 0.75601912, "learning_rate": 3.354713944700797e-06, "loss": 0.77739966, "num_input_tokens_seen": 102329240, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.546875, "step": 4744, "time_per_iteration": 2.428297996520996 }, { "auxiliary_loss_clip": 0.01074194, "auxiliary_loss_mlp": 0.01047269, "balance_loss_clip": 1.01322305, "balance_loss_mlp": 1.02283525, "epoch": 0.2852848339095145, "flos": 11654600791680.0, "grad_norm": 2.3758193824985763, "language_loss": 0.78478295, "learning_rate": 3.3544274097816185e-06, "loss": 0.80599761, "num_input_tokens_seen": 102344440, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.51171875, "step": 4745, "time_per_iteration": 2.378159523010254 }, { "auxiliary_loss_clip": 0.01072266, "auxiliary_loss_mlp": 0.01056973, "balance_loss_clip": 1.02221107, "balance_loss_mlp": 1.02376986, "epoch": 0.2853449571621825, "flos": 12932386974720.0, "grad_norm": 1.8563503100045373, "language_loss": 0.83576071, "learning_rate": 3.3541408235024173e-06, "loss": 0.8570531, "num_input_tokens_seen": 102360985, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.48632812, "step": 4746, "time_per_iteration": 2.3764760494232178 }, { "auxiliary_loss_clip": 0.01077584, "auxiliary_loss_mlp": 0.01060662, "balance_loss_clip": 1.01982141, "balance_loss_mlp": 1.02247214, "epoch": 0.28540508041485046, "flos": 20009560988160.0, "grad_norm": 1.6407601435637755, "language_loss": 0.81606013, "learning_rate": 3.3538541858740604e-06, "loss": 0.83744264, "num_input_tokens_seen": 102380320, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.55078125, "step": 4747, "time_per_iteration": 2.419158697128296 }, { "auxiliary_loss_clip": 0.01026891, "auxiliary_loss_mlp": 0.01027562, "balance_loss_clip": 1.02274549, "balance_loss_mlp": 1.01432717, "epoch": 0.28546520366751843, "flos": 68135864175360.0, "grad_norm": 0.7951255599807986, "language_loss": 0.60512269, "learning_rate": 3.3535674969074173e-06, "loss": 0.62566721, "num_input_tokens_seen": 102439140, "router_z_loss_clip": 0.0480957, "router_z_loss_mlp": 0.125, "step": 4748, "time_per_iteration": 3.037266731262207 }, { "auxiliary_loss_clip": 0.01073853, "auxiliary_loss_mlp": 0.01057747, "balance_loss_clip": 1.01855099, "balance_loss_mlp": 1.02143073, "epoch": 0.2855253269201864, "flos": 13250969930880.0, "grad_norm": 2.948606844728039, "language_loss": 0.82145447, "learning_rate": 3.3532807566133592e-06, "loss": 0.84277046, "num_input_tokens_seen": 102450990, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.5234375, "step": 4749, "time_per_iteration": 2.367530107498169 }, { "auxiliary_loss_clip": 0.010736, "auxiliary_loss_mlp": 0.01056597, "balance_loss_clip": 1.02169299, "balance_loss_mlp": 1.02161503, "epoch": 0.28558545017285436, "flos": 28619631555840.0, "grad_norm": 1.9824025116408432, "language_loss": 0.7258451, "learning_rate": 3.3529939650027587e-06, "loss": 0.74714708, "num_input_tokens_seen": 102471820, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.51953125, "step": 4750, "time_per_iteration": 2.465240716934204 }, { "auxiliary_loss_clip": 0.01072928, "auxiliary_loss_mlp": 0.01065172, "balance_loss_clip": 1.02657199, "balance_loss_mlp": 1.02192104, "epoch": 0.2856455734255223, "flos": 34129065611520.0, "grad_norm": 1.675595733420184, "language_loss": 0.83028674, "learning_rate": 3.3527071220864917e-06, "loss": 0.85166776, "num_input_tokens_seen": 102492625, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.51171875, "step": 4751, "time_per_iteration": 2.5410549640655518 }, { "auxiliary_loss_clip": 0.01074466, "auxiliary_loss_mlp": 0.01057854, "balance_loss_clip": 1.02104175, "balance_loss_mlp": 1.02259719, "epoch": 0.2857056966781903, "flos": 39784576262400.0, "grad_norm": 2.022613939167455, "language_loss": 0.8128733, "learning_rate": 3.3524202278754353e-06, "loss": 0.83419651, "num_input_tokens_seen": 102514145, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.51953125, "step": 4752, "time_per_iteration": 2.553499937057495 }, { "auxiliary_loss_clip": 0.01073005, "auxiliary_loss_mlp": 0.01055177, "balance_loss_clip": 1.0183171, "balance_loss_mlp": 1.02098489, "epoch": 0.28576581993085826, "flos": 21871199704320.0, "grad_norm": 1.751244306578235, "language_loss": 0.80196536, "learning_rate": 3.3521332823804676e-06, "loss": 0.82324719, "num_input_tokens_seen": 102532365, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.5234375, "step": 4753, "time_per_iteration": 2.41487193107605 }, { "auxiliary_loss_clip": 0.01077244, "auxiliary_loss_mlp": 0.0106178, "balance_loss_clip": 1.0207243, "balance_loss_mlp": 1.02284491, "epoch": 0.2858259431835262, "flos": 19090856805120.0, "grad_norm": 2.576943761525006, "language_loss": 0.9171409, "learning_rate": 3.3518462856124704e-06, "loss": 0.93853116, "num_input_tokens_seen": 102548425, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 0.546875, "step": 4754, "time_per_iteration": 2.3717336654663086 }, { "auxiliary_loss_clip": 0.01073195, "auxiliary_loss_mlp": 0.01054287, "balance_loss_clip": 1.01845288, "balance_loss_mlp": 1.02294278, "epoch": 0.2858860664361942, "flos": 20333415559680.0, "grad_norm": 1.663027878056912, "language_loss": 0.82831073, "learning_rate": 3.3515592375823267e-06, "loss": 0.84958547, "num_input_tokens_seen": 102566370, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.5, "step": 4755, "time_per_iteration": 2.4186153411865234 }, { "auxiliary_loss_clip": 0.01074341, "auxiliary_loss_mlp": 0.01054076, "balance_loss_clip": 1.01733589, "balance_loss_mlp": 1.0219245, "epoch": 0.28594618968886215, "flos": 24460603044480.0, "grad_norm": 1.5397807391454041, "language_loss": 0.85035586, "learning_rate": 3.351272138300922e-06, "loss": 0.87164009, "num_input_tokens_seen": 102588715, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.5234375, "step": 4756, "time_per_iteration": 2.4371719360351562 }, { "auxiliary_loss_clip": 0.01017336, "auxiliary_loss_mlp": 0.01018424, "balance_loss_clip": 1.01468062, "balance_loss_mlp": 1.00537157, "epoch": 0.2860063129415301, "flos": 71648964023040.0, "grad_norm": 0.8796171416364948, "language_loss": 0.61102045, "learning_rate": 3.350984987779142e-06, "loss": 0.63137805, "num_input_tokens_seen": 102656715, "router_z_loss_clip": 0.03735352, "router_z_loss_mlp": 0.11914062, "step": 4757, "time_per_iteration": 3.175727367401123 }, { "auxiliary_loss_clip": 0.01074646, "auxiliary_loss_mlp": 0.01055235, "balance_loss_clip": 1.02290595, "balance_loss_mlp": 1.02427244, "epoch": 0.2860664361941981, "flos": 20557627511040.0, "grad_norm": 1.8601619301397203, "language_loss": 0.66947848, "learning_rate": 3.3506977860278756e-06, "loss": 0.6907773, "num_input_tokens_seen": 102676545, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.50390625, "step": 4758, "time_per_iteration": 2.392843246459961 }, { "auxiliary_loss_clip": 0.01075873, "auxiliary_loss_mlp": 0.01065197, "balance_loss_clip": 1.02678728, "balance_loss_mlp": 1.0234673, "epoch": 0.2861265594468661, "flos": 35994788956800.0, "grad_norm": 1.6247413411596905, "language_loss": 0.64364958, "learning_rate": 3.3504105330580143e-06, "loss": 0.66506028, "num_input_tokens_seen": 102702875, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.5234375, "step": 4759, "time_per_iteration": 2.542724132537842 }, { "auxiliary_loss_clip": 0.010754, "auxiliary_loss_mlp": 0.01057911, "balance_loss_clip": 1.02033639, "balance_loss_mlp": 1.02364874, "epoch": 0.28618668269953407, "flos": 20046394339200.0, "grad_norm": 1.732512556786511, "language_loss": 0.75493592, "learning_rate": 3.3501232288804496e-06, "loss": 0.77626902, "num_input_tokens_seen": 102723160, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.515625, "step": 4760, "time_per_iteration": 2.414698600769043 }, { "auxiliary_loss_clip": 0.01073642, "auxiliary_loss_mlp": 0.01055705, "balance_loss_clip": 1.02580714, "balance_loss_mlp": 1.025208, "epoch": 0.28624680595220203, "flos": 24970719052800.0, "grad_norm": 2.508640551028646, "language_loss": 0.74067581, "learning_rate": 3.3498358735060773e-06, "loss": 0.76196933, "num_input_tokens_seen": 102743855, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.484375, "step": 4761, "time_per_iteration": 3.996635913848877 }, { "auxiliary_loss_clip": 0.01075039, "auxiliary_loss_mlp": 0.0105621, "balance_loss_clip": 1.02259266, "balance_loss_mlp": 1.02321172, "epoch": 0.28630692920487, "flos": 22491152449920.0, "grad_norm": 2.191047007432471, "language_loss": 0.76072901, "learning_rate": 3.349548466945793e-06, "loss": 0.78204149, "num_input_tokens_seen": 102761370, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.51953125, "step": 4762, "time_per_iteration": 2.4520647525787354 }, { "auxiliary_loss_clip": 0.01073848, "auxiliary_loss_mlp": 0.01059448, "balance_loss_clip": 1.02556872, "balance_loss_mlp": 1.02453709, "epoch": 0.28636705245753796, "flos": 21248872986240.0, "grad_norm": 1.4318592317331356, "language_loss": 0.76385844, "learning_rate": 3.349261009210496e-06, "loss": 0.78519142, "num_input_tokens_seen": 102780885, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.4921875, "step": 4763, "time_per_iteration": 3.7698569297790527 }, { "auxiliary_loss_clip": 0.01073399, "auxiliary_loss_mlp": 0.01064028, "balance_loss_clip": 1.02714455, "balance_loss_mlp": 1.02215338, "epoch": 0.28642717571020593, "flos": 24094678417920.0, "grad_norm": 1.621004060134999, "language_loss": 0.78813422, "learning_rate": 3.348973500311086e-06, "loss": 0.8095085, "num_input_tokens_seen": 102801000, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.51171875, "step": 4764, "time_per_iteration": 2.465522527694702 }, { "auxiliary_loss_clip": 0.01075302, "auxiliary_loss_mlp": 0.01058303, "balance_loss_clip": 1.01812911, "balance_loss_mlp": 1.02322674, "epoch": 0.2864872989628739, "flos": 22600290960000.0, "grad_norm": 1.8653646912986528, "language_loss": 0.73012084, "learning_rate": 3.348685940258466e-06, "loss": 0.75145686, "num_input_tokens_seen": 102820230, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.5234375, "step": 4765, "time_per_iteration": 3.9146671295166016 }, { "auxiliary_loss_clip": 0.01070442, "auxiliary_loss_mlp": 0.01051525, "balance_loss_clip": 1.01964903, "balance_loss_mlp": 1.0215838, "epoch": 0.28654742221554186, "flos": 32743677018240.0, "grad_norm": 2.0041492775482013, "language_loss": 0.76617008, "learning_rate": 3.3483983290635395e-06, "loss": 0.78738976, "num_input_tokens_seen": 102842670, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.48828125, "step": 4766, "time_per_iteration": 4.07109522819519 }, { "auxiliary_loss_clip": 0.01071233, "auxiliary_loss_mlp": 0.01052172, "balance_loss_clip": 1.01776814, "balance_loss_mlp": 1.02217388, "epoch": 0.2866075454682098, "flos": 26980354488960.0, "grad_norm": 1.5847343336297581, "language_loss": 0.79499912, "learning_rate": 3.348110666737214e-06, "loss": 0.81623316, "num_input_tokens_seen": 102864480, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.49023438, "step": 4767, "time_per_iteration": 2.4637036323547363 }, { "auxiliary_loss_clip": 0.0107299, "auxiliary_loss_mlp": 0.01053293, "balance_loss_clip": 1.02010477, "balance_loss_mlp": 1.0223068, "epoch": 0.2866676687208778, "flos": 23252852782080.0, "grad_norm": 1.961315836485499, "language_loss": 0.66522348, "learning_rate": 3.3478229532903956e-06, "loss": 0.68648636, "num_input_tokens_seen": 102883740, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.5078125, "step": 4768, "time_per_iteration": 2.4347009658813477 }, { "auxiliary_loss_clip": 0.01074385, "auxiliary_loss_mlp": 0.01059645, "balance_loss_clip": 1.02383423, "balance_loss_mlp": 1.02187777, "epoch": 0.28672779197354575, "flos": 21578662488960.0, "grad_norm": 1.6471189362521488, "language_loss": 0.7170741, "learning_rate": 3.3475351887339967e-06, "loss": 0.73841441, "num_input_tokens_seen": 102902945, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.5234375, "step": 4769, "time_per_iteration": 2.40556263923645 }, { "auxiliary_loss_clip": 0.0107208, "auxiliary_loss_mlp": 0.01049696, "balance_loss_clip": 1.01450586, "balance_loss_mlp": 1.02149248, "epoch": 0.2867879152262137, "flos": 19864531733760.0, "grad_norm": 1.732430666231737, "language_loss": 0.76071501, "learning_rate": 3.3472473730789288e-06, "loss": 0.78193277, "num_input_tokens_seen": 102922405, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.5078125, "step": 4770, "time_per_iteration": 2.443547487258911 }, { "auxiliary_loss_clip": 0.01074499, "auxiliary_loss_mlp": 0.01049198, "balance_loss_clip": 1.01484156, "balance_loss_mlp": 1.02241778, "epoch": 0.2868480384788817, "flos": 28212265278720.0, "grad_norm": 2.214327611548951, "language_loss": 0.6876415, "learning_rate": 3.3469595063361045e-06, "loss": 0.70887852, "num_input_tokens_seen": 102938980, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.51953125, "step": 4771, "time_per_iteration": 2.466397285461426 }, { "auxiliary_loss_clip": 0.01015794, "auxiliary_loss_mlp": 0.01002973, "balance_loss_clip": 0.99953985, "balance_loss_mlp": 1.00422299, "epoch": 0.2869081617315497, "flos": 65421298010880.0, "grad_norm": 0.7773311393434161, "language_loss": 0.57007879, "learning_rate": 3.3466715885164414e-06, "loss": 0.59026647, "num_input_tokens_seen": 103000405, "router_z_loss_clip": 0.03442383, "router_z_loss_mlp": 0.11572266, "step": 4772, "time_per_iteration": 3.000842332839966 }, { "auxiliary_loss_clip": 0.01073066, "auxiliary_loss_mlp": 0.01061988, "balance_loss_clip": 1.02481842, "balance_loss_mlp": 1.02130222, "epoch": 0.28696828498421767, "flos": 18659748936960.0, "grad_norm": 2.3686432129747157, "language_loss": 0.84776151, "learning_rate": 3.346383619630856e-06, "loss": 0.86911201, "num_input_tokens_seen": 103017970, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.51953125, "step": 4773, "time_per_iteration": 2.409834146499634 }, { "auxiliary_loss_clip": 0.01073431, "auxiliary_loss_mlp": 0.01051297, "balance_loss_clip": 1.0155108, "balance_loss_mlp": 1.02207756, "epoch": 0.28702840823688563, "flos": 23658613136640.0, "grad_norm": 2.380348133002109, "language_loss": 0.78890514, "learning_rate": 3.34609559969027e-06, "loss": 0.81015247, "num_input_tokens_seen": 103036385, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.515625, "step": 4774, "time_per_iteration": 2.4460296630859375 }, { "auxiliary_loss_clip": 0.01071819, "auxiliary_loss_mlp": 0.01055334, "balance_loss_clip": 1.02040541, "balance_loss_mlp": 1.02175379, "epoch": 0.2870885314895536, "flos": 13803993866880.0, "grad_norm": 1.6776004519968446, "language_loss": 0.74832326, "learning_rate": 3.3458075287056034e-06, "loss": 0.76959479, "num_input_tokens_seen": 103052170, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.5, "step": 4775, "time_per_iteration": 2.3969686031341553 }, { "auxiliary_loss_clip": 0.01077407, "auxiliary_loss_mlp": 0.01053388, "balance_loss_clip": 1.01981866, "balance_loss_mlp": 1.02501225, "epoch": 0.28714865474222157, "flos": 17785768072320.0, "grad_norm": 1.7000665405246873, "language_loss": 0.89065868, "learning_rate": 3.34551940668778e-06, "loss": 0.91196662, "num_input_tokens_seen": 103070510, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.5234375, "step": 4776, "time_per_iteration": 2.3873684406280518 }, { "auxiliary_loss_clip": 0.01073301, "auxiliary_loss_mlp": 0.01053193, "balance_loss_clip": 1.02045846, "balance_loss_mlp": 1.02340293, "epoch": 0.28720877799488953, "flos": 15996574160640.0, "grad_norm": 1.8469120895615572, "language_loss": 0.7589196, "learning_rate": 3.345231233647726e-06, "loss": 0.78018457, "num_input_tokens_seen": 103089590, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.5, "step": 4777, "time_per_iteration": 2.4053878784179688 }, { "auxiliary_loss_clip": 0.01078807, "auxiliary_loss_mlp": 0.01064628, "balance_loss_clip": 1.02230835, "balance_loss_mlp": 1.02438879, "epoch": 0.2872689012475575, "flos": 20922085860480.0, "grad_norm": 1.8780643257407983, "language_loss": 0.81915498, "learning_rate": 3.3449430095963696e-06, "loss": 0.84058934, "num_input_tokens_seen": 103109080, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 0.546875, "step": 4778, "time_per_iteration": 2.4110677242279053 }, { "auxiliary_loss_clip": 0.01071515, "auxiliary_loss_mlp": 0.01056809, "balance_loss_clip": 1.02121258, "balance_loss_mlp": 1.02313912, "epoch": 0.28732902450022546, "flos": 21324040876800.0, "grad_norm": 1.6362716944880653, "language_loss": 0.76014507, "learning_rate": 3.3446547345446386e-06, "loss": 0.78142834, "num_input_tokens_seen": 103127755, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.484375, "step": 4779, "time_per_iteration": 2.4224095344543457 }, { "auxiliary_loss_clip": 0.01073371, "auxiliary_loss_mlp": 0.01058628, "balance_loss_clip": 1.02360415, "balance_loss_mlp": 1.02300239, "epoch": 0.2873891477528934, "flos": 20849326853760.0, "grad_norm": 1.5840545597449143, "language_loss": 0.77753973, "learning_rate": 3.3443664085034656e-06, "loss": 0.79885978, "num_input_tokens_seen": 103147035, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.50390625, "step": 4780, "time_per_iteration": 2.3932180404663086 }, { "auxiliary_loss_clip": 0.01073391, "auxiliary_loss_mlp": 0.01054954, "balance_loss_clip": 1.02212358, "balance_loss_mlp": 1.02332151, "epoch": 0.2874492710055614, "flos": 17419110307200.0, "grad_norm": 1.8467946375664082, "language_loss": 0.82417935, "learning_rate": 3.344078031483784e-06, "loss": 0.8454628, "num_input_tokens_seen": 103165410, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.5, "step": 4781, "time_per_iteration": 2.424726963043213 }, { "auxiliary_loss_clip": 0.01074647, "auxiliary_loss_mlp": 0.01061658, "balance_loss_clip": 1.02346361, "balance_loss_mlp": 1.02245021, "epoch": 0.28750939425822936, "flos": 13405983834240.0, "grad_norm": 1.9833192606475072, "language_loss": 0.88325703, "learning_rate": 3.3437896034965283e-06, "loss": 0.90462005, "num_input_tokens_seen": 103183710, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.5234375, "step": 4782, "time_per_iteration": 2.381814479827881 }, { "auxiliary_loss_clip": 0.01073162, "auxiliary_loss_mlp": 0.01052776, "balance_loss_clip": 1.01776385, "balance_loss_mlp": 1.02246714, "epoch": 0.2875695175108973, "flos": 21869000288640.0, "grad_norm": 1.6347355370607755, "language_loss": 0.72600436, "learning_rate": 3.3435011245526357e-06, "loss": 0.74726373, "num_input_tokens_seen": 103203790, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.5078125, "step": 4783, "time_per_iteration": 2.415754795074463 }, { "auxiliary_loss_clip": 0.01072468, "auxiliary_loss_mlp": 0.01058328, "balance_loss_clip": 1.02239788, "balance_loss_mlp": 1.02192974, "epoch": 0.2876296407635653, "flos": 26244385695360.0, "grad_norm": 1.7004100020637987, "language_loss": 0.78401566, "learning_rate": 3.343212594663047e-06, "loss": 0.80532366, "num_input_tokens_seen": 103223925, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.50390625, "step": 4784, "time_per_iteration": 2.40915584564209 }, { "auxiliary_loss_clip": 0.01072112, "auxiliary_loss_mlp": 0.01068081, "balance_loss_clip": 1.03279555, "balance_loss_mlp": 1.02243412, "epoch": 0.28768976401623325, "flos": 25372499512320.0, "grad_norm": 1.4997893525646806, "language_loss": 0.7716583, "learning_rate": 3.3429240138387015e-06, "loss": 0.79306018, "num_input_tokens_seen": 103244760, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.49609375, "step": 4785, "time_per_iteration": 2.4666945934295654 }, { "auxiliary_loss_clip": 0.01073986, "auxiliary_loss_mlp": 0.01057448, "balance_loss_clip": 1.02311599, "balance_loss_mlp": 1.02390802, "epoch": 0.28774988726890127, "flos": 30663063054720.0, "grad_norm": 1.872304715780382, "language_loss": 0.83494884, "learning_rate": 3.3426353820905425e-06, "loss": 0.85626316, "num_input_tokens_seen": 103261995, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.5, "step": 4786, "time_per_iteration": 2.4626622200012207 }, { "auxiliary_loss_clip": 0.01072487, "auxiliary_loss_mlp": 0.01055809, "balance_loss_clip": 1.02178693, "balance_loss_mlp": 1.02255142, "epoch": 0.28781001052156924, "flos": 20594391039360.0, "grad_norm": 1.7376844880686022, "language_loss": 0.81106526, "learning_rate": 3.342346699429516e-06, "loss": 0.83234823, "num_input_tokens_seen": 103279780, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.5, "step": 4787, "time_per_iteration": 2.4181816577911377 }, { "auxiliary_loss_clip": 0.01072684, "auxiliary_loss_mlp": 0.01053539, "balance_loss_clip": 1.01710844, "balance_loss_mlp": 1.02122593, "epoch": 0.2878701337742372, "flos": 26541112273920.0, "grad_norm": 1.9013956838924242, "language_loss": 0.85134947, "learning_rate": 3.3420579658665677e-06, "loss": 0.87261164, "num_input_tokens_seen": 103300580, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.515625, "step": 4788, "time_per_iteration": 2.4286539554595947 }, { "auxiliary_loss_clip": 0.01076333, "auxiliary_loss_mlp": 0.01056138, "balance_loss_clip": 1.02302194, "balance_loss_mlp": 1.02448487, "epoch": 0.28793025702690517, "flos": 28145615760000.0, "grad_norm": 1.762044127173841, "language_loss": 0.75818121, "learning_rate": 3.3417691814126468e-06, "loss": 0.77950585, "num_input_tokens_seen": 103320430, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.515625, "step": 4789, "time_per_iteration": 2.5001394748687744 }, { "auxiliary_loss_clip": 0.01068981, "auxiliary_loss_mlp": 0.01051644, "balance_loss_clip": 1.01952863, "balance_loss_mlp": 1.02093577, "epoch": 0.28799038027957313, "flos": 23804340618240.0, "grad_norm": 1.9151625208477214, "language_loss": 0.85745358, "learning_rate": 3.341480346078704e-06, "loss": 0.87865984, "num_input_tokens_seen": 103337695, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.48046875, "step": 4790, "time_per_iteration": 2.392709970474243 }, { "auxiliary_loss_clip": 0.01074319, "auxiliary_loss_mlp": 0.01052089, "balance_loss_clip": 1.01916385, "balance_loss_mlp": 1.02390289, "epoch": 0.2880505035322411, "flos": 22343085907200.0, "grad_norm": 2.892576960418631, "language_loss": 0.79518348, "learning_rate": 3.3411914598756922e-06, "loss": 0.81644756, "num_input_tokens_seen": 103357010, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.50390625, "step": 4791, "time_per_iteration": 2.462554693222046 }, { "auxiliary_loss_clip": 0.01075608, "auxiliary_loss_mlp": 0.01052303, "balance_loss_clip": 1.01777959, "balance_loss_mlp": 1.02269506, "epoch": 0.28811062678490906, "flos": 18003277042560.0, "grad_norm": 1.8854170450488141, "language_loss": 0.73091519, "learning_rate": 3.3409025228145654e-06, "loss": 0.75219429, "num_input_tokens_seen": 103375600, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.53125, "step": 4792, "time_per_iteration": 2.36004638671875 }, { "auxiliary_loss_clip": 0.01075175, "auxiliary_loss_mlp": 0.01052617, "balance_loss_clip": 1.02019227, "balance_loss_mlp": 1.02373874, "epoch": 0.28817075003757703, "flos": 22089790926720.0, "grad_norm": 1.7252346707429362, "language_loss": 0.81450498, "learning_rate": 3.3406135349062812e-06, "loss": 0.83578295, "num_input_tokens_seen": 103395225, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.515625, "step": 4793, "time_per_iteration": 2.428710460662842 }, { "auxiliary_loss_clip": 0.01071423, "auxiliary_loss_mlp": 0.01049683, "balance_loss_clip": 1.01630473, "balance_loss_mlp": 1.02382827, "epoch": 0.288230873290245, "flos": 41681512229760.0, "grad_norm": 1.716099649177568, "language_loss": 0.79554808, "learning_rate": 3.340324496161797e-06, "loss": 0.81675911, "num_input_tokens_seen": 103417245, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.4765625, "step": 4794, "time_per_iteration": 2.552276372909546 }, { "auxiliary_loss_clip": 0.01077174, "auxiliary_loss_mlp": 0.01061085, "balance_loss_clip": 1.02558422, "balance_loss_mlp": 1.0256083, "epoch": 0.28829099654291296, "flos": 18623439256320.0, "grad_norm": 2.194883403283756, "language_loss": 0.84690183, "learning_rate": 3.340035406592074e-06, "loss": 0.86828434, "num_input_tokens_seen": 103435500, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.515625, "step": 4795, "time_per_iteration": 2.404552459716797 }, { "auxiliary_loss_clip": 0.01070762, "auxiliary_loss_mlp": 0.01050507, "balance_loss_clip": 1.01965559, "balance_loss_mlp": 1.02263403, "epoch": 0.2883511197955809, "flos": 24673852828800.0, "grad_norm": 1.7822672764001202, "language_loss": 0.75784624, "learning_rate": 3.339746266208074e-06, "loss": 0.77905887, "num_input_tokens_seen": 103451040, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.48046875, "step": 4796, "time_per_iteration": 2.3984270095825195 }, { "auxiliary_loss_clip": 0.01077316, "auxiliary_loss_mlp": 0.01066587, "balance_loss_clip": 1.02684236, "balance_loss_mlp": 1.0236392, "epoch": 0.2884112430482489, "flos": 23111035372800.0, "grad_norm": 1.8849671974551858, "language_loss": 0.74277216, "learning_rate": 3.3394570750207614e-06, "loss": 0.76421118, "num_input_tokens_seen": 103471330, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.5390625, "step": 4797, "time_per_iteration": 2.4430696964263916 }, { "auxiliary_loss_clip": 0.01074542, "auxiliary_loss_mlp": 0.01050094, "balance_loss_clip": 1.01466501, "balance_loss_mlp": 1.02373171, "epoch": 0.28847136630091685, "flos": 16872405327360.0, "grad_norm": 2.2701752252437557, "language_loss": 0.75895566, "learning_rate": 3.3391678330411017e-06, "loss": 0.78020203, "num_input_tokens_seen": 103488060, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.5078125, "step": 4798, "time_per_iteration": 2.376037120819092 }, { "auxiliary_loss_clip": 0.01075096, "auxiliary_loss_mlp": 0.01056316, "balance_loss_clip": 1.01816869, "balance_loss_mlp": 1.02265584, "epoch": 0.2885314895535849, "flos": 25656588178560.0, "grad_norm": 2.5632206552079677, "language_loss": 0.68572044, "learning_rate": 3.3388785402800642e-06, "loss": 0.70703459, "num_input_tokens_seen": 103503600, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.5234375, "step": 4799, "time_per_iteration": 2.538046360015869 }, { "auxiliary_loss_clip": 0.01076044, "auxiliary_loss_mlp": 0.01061831, "balance_loss_clip": 1.02373183, "balance_loss_mlp": 1.02361369, "epoch": 0.28859161280625284, "flos": 21106147881600.0, "grad_norm": 1.6742904360003887, "language_loss": 0.83106464, "learning_rate": 3.3385891967486178e-06, "loss": 0.85244346, "num_input_tokens_seen": 103524195, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.5234375, "step": 4800, "time_per_iteration": 3.9291372299194336 }, { "auxiliary_loss_clip": 0.01071793, "auxiliary_loss_mlp": 0.0105297, "balance_loss_clip": 1.0201869, "balance_loss_mlp": 1.02247405, "epoch": 0.2886517360589208, "flos": 26468318355840.0, "grad_norm": 1.5994166352621397, "language_loss": 0.91404307, "learning_rate": 3.3382998024577347e-06, "loss": 0.93529075, "num_input_tokens_seen": 103545235, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.4921875, "step": 4801, "time_per_iteration": 2.4813859462738037 }, { "auxiliary_loss_clip": 0.01074449, "auxiliary_loss_mlp": 0.01052867, "balance_loss_clip": 1.01669931, "balance_loss_mlp": 1.02330148, "epoch": 0.28871185931158877, "flos": 25264094140800.0, "grad_norm": 2.517070367280696, "language_loss": 0.74397177, "learning_rate": 3.33801035741839e-06, "loss": 0.76524496, "num_input_tokens_seen": 103563305, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.51171875, "step": 4802, "time_per_iteration": 2.4148359298706055 }, { "auxiliary_loss_clip": 0.01018943, "auxiliary_loss_mlp": 0.01015723, "balance_loss_clip": 1.01121664, "balance_loss_mlp": 1.00655103, "epoch": 0.28877198256425674, "flos": 66662390488320.0, "grad_norm": 0.779790091754816, "language_loss": 0.62979871, "learning_rate": 3.337720861641558e-06, "loss": 0.65014535, "num_input_tokens_seen": 103625025, "router_z_loss_clip": 0.04516602, "router_z_loss_mlp": 0.12402344, "step": 4803, "time_per_iteration": 4.394913673400879 }, { "auxiliary_loss_clip": 0.01072752, "auxiliary_loss_mlp": 0.01061553, "balance_loss_clip": 1.02538502, "balance_loss_mlp": 1.02152371, "epoch": 0.2888321058169247, "flos": 20301993469440.0, "grad_norm": 1.8610799099675213, "language_loss": 0.72760189, "learning_rate": 3.3374313151382165e-06, "loss": 0.74894494, "num_input_tokens_seen": 103644235, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.51171875, "step": 4804, "time_per_iteration": 2.4458436965942383 }, { "auxiliary_loss_clip": 0.0107581, "auxiliary_loss_mlp": 0.01059876, "balance_loss_clip": 1.02122855, "balance_loss_mlp": 1.0225029, "epoch": 0.28889222906959267, "flos": 25515643553280.0, "grad_norm": 1.7385467716163125, "language_loss": 0.69403988, "learning_rate": 3.337141717919346e-06, "loss": 0.71539676, "num_input_tokens_seen": 103664700, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.53515625, "step": 4805, "time_per_iteration": 4.054839611053467 }, { "auxiliary_loss_clip": 0.01074066, "auxiliary_loss_mlp": 0.01062355, "balance_loss_clip": 1.02437496, "balance_loss_mlp": 1.02169561, "epoch": 0.28895235232226063, "flos": 32669940493440.0, "grad_norm": 1.396963792111239, "language_loss": 0.70882869, "learning_rate": 3.3368520699959272e-06, "loss": 0.7301929, "num_input_tokens_seen": 103686595, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5234375, "step": 4806, "time_per_iteration": 3.872166633605957 }, { "auxiliary_loss_clip": 0.01070611, "auxiliary_loss_mlp": 0.01060892, "balance_loss_clip": 1.0258441, "balance_loss_mlp": 1.02124405, "epoch": 0.2890124755749286, "flos": 29713425540480.0, "grad_norm": 1.4284383836015453, "language_loss": 0.72325987, "learning_rate": 3.3365623713789443e-06, "loss": 0.7445749, "num_input_tokens_seen": 103707525, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.49414062, "step": 4807, "time_per_iteration": 2.4775710105895996 }, { "auxiliary_loss_clip": 0.01071866, "auxiliary_loss_mlp": 0.01059397, "balance_loss_clip": 1.02351499, "balance_loss_mlp": 1.02166462, "epoch": 0.28907259882759656, "flos": 22673364168960.0, "grad_norm": 1.5159217271741365, "language_loss": 0.82605267, "learning_rate": 3.336272622079382e-06, "loss": 0.84736538, "num_input_tokens_seen": 103727905, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.5, "step": 4808, "time_per_iteration": 2.3981733322143555 }, { "auxiliary_loss_clip": 0.01069743, "auxiliary_loss_mlp": 0.01064287, "balance_loss_clip": 1.02912056, "balance_loss_mlp": 1.02181685, "epoch": 0.2891327220802645, "flos": 22564923886080.0, "grad_norm": 1.4552747696854553, "language_loss": 0.79725134, "learning_rate": 3.3359828221082276e-06, "loss": 0.81859165, "num_input_tokens_seen": 103748335, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.47851562, "step": 4809, "time_per_iteration": 2.508521556854248 }, { "auxiliary_loss_clip": 0.01077044, "auxiliary_loss_mlp": 0.0105997, "balance_loss_clip": 1.02215648, "balance_loss_mlp": 1.02223158, "epoch": 0.2891928453329325, "flos": 21651735697920.0, "grad_norm": 1.8779261297386927, "language_loss": 0.80362689, "learning_rate": 3.3356929714764714e-06, "loss": 0.82499701, "num_input_tokens_seen": 103767020, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.546875, "step": 4810, "time_per_iteration": 2.420687437057495 }, { "auxiliary_loss_clip": 0.01071535, "auxiliary_loss_mlp": 0.01058747, "balance_loss_clip": 1.02358055, "balance_loss_mlp": 1.02232552, "epoch": 0.28925296858560046, "flos": 23220976844160.0, "grad_norm": 1.6192189001275268, "language_loss": 0.7803086, "learning_rate": 3.3354030701951032e-06, "loss": 0.80161142, "num_input_tokens_seen": 103786355, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.4921875, "step": 4811, "time_per_iteration": 2.454054355621338 }, { "auxiliary_loss_clip": 0.01069983, "auxiliary_loss_mlp": 0.01053326, "balance_loss_clip": 1.01896977, "balance_loss_mlp": 1.02097917, "epoch": 0.2893130918382685, "flos": 28620399605760.0, "grad_norm": 1.3775956769239155, "language_loss": 0.79047394, "learning_rate": 3.335113118275117e-06, "loss": 0.81170702, "num_input_tokens_seen": 103809345, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.49023438, "step": 4812, "time_per_iteration": 2.4516518115997314 }, { "auxiliary_loss_clip": 0.0101686, "auxiliary_loss_mlp": 0.01013333, "balance_loss_clip": 1.00923252, "balance_loss_mlp": 1.00387549, "epoch": 0.28937321509093644, "flos": 72297615772800.0, "grad_norm": 0.8734953003775844, "language_loss": 0.60456955, "learning_rate": 3.3348231157275085e-06, "loss": 0.62487149, "num_input_tokens_seen": 103871180, "router_z_loss_clip": 0.04101562, "router_z_loss_mlp": 0.12988281, "step": 4813, "time_per_iteration": 3.192640781402588 }, { "auxiliary_loss_clip": 0.01073581, "auxiliary_loss_mlp": 0.01047269, "balance_loss_clip": 1.014606, "balance_loss_mlp": 1.0240196, "epoch": 0.2894333383436044, "flos": 16215479585280.0, "grad_norm": 3.116159072365966, "language_loss": 0.83498842, "learning_rate": 3.3345330625632725e-06, "loss": 0.85619688, "num_input_tokens_seen": 103889040, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.49609375, "step": 4814, "time_per_iteration": 2.368969440460205 }, { "auxiliary_loss_clip": 0.01077734, "auxiliary_loss_mlp": 0.01064244, "balance_loss_clip": 1.02898169, "balance_loss_mlp": 1.02558315, "epoch": 0.2894934615962724, "flos": 24827086252800.0, "grad_norm": 1.6606257000707405, "language_loss": 0.74661142, "learning_rate": 3.3342429587934094e-06, "loss": 0.76803112, "num_input_tokens_seen": 103910380, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.5234375, "step": 4815, "time_per_iteration": 2.4814956188201904 }, { "auxiliary_loss_clip": 0.01070772, "auxiliary_loss_mlp": 0.01056411, "balance_loss_clip": 1.02541661, "balance_loss_mlp": 1.02396107, "epoch": 0.28955358484894034, "flos": 20448907937280.0, "grad_norm": 1.5421379597520848, "language_loss": 0.72432935, "learning_rate": 3.3339528044289198e-06, "loss": 0.74560124, "num_input_tokens_seen": 103929955, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.46875, "step": 4816, "time_per_iteration": 2.397700548171997 }, { "auxiliary_loss_clip": 0.01077515, "auxiliary_loss_mlp": 0.01062567, "balance_loss_clip": 1.02537346, "balance_loss_mlp": 1.02342248, "epoch": 0.2896137081016083, "flos": 22564086013440.0, "grad_norm": 2.1357712458324873, "language_loss": 0.77042443, "learning_rate": 3.3336625994808055e-06, "loss": 0.79182523, "num_input_tokens_seen": 103948020, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.54296875, "step": 4817, "time_per_iteration": 2.4409806728363037 }, { "auxiliary_loss_clip": 0.0107998, "auxiliary_loss_mlp": 0.01069305, "balance_loss_clip": 1.03196907, "balance_loss_mlp": 1.02685452, "epoch": 0.28967383135427627, "flos": 26686735021440.0, "grad_norm": 1.8567295150786596, "language_loss": 0.784168, "learning_rate": 3.3333723439600723e-06, "loss": 0.80566084, "num_input_tokens_seen": 103968740, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.53125, "step": 4818, "time_per_iteration": 2.440365791320801 }, { "auxiliary_loss_clip": 0.01075655, "auxiliary_loss_mlp": 0.01058817, "balance_loss_clip": 1.02536726, "balance_loss_mlp": 1.02466488, "epoch": 0.28973395460694423, "flos": 15557401768320.0, "grad_norm": 2.588603090438053, "language_loss": 0.81700897, "learning_rate": 3.3330820378777263e-06, "loss": 0.83835369, "num_input_tokens_seen": 103986005, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.51171875, "step": 4819, "time_per_iteration": 2.417724132537842 }, { "auxiliary_loss_clip": 0.01076651, "auxiliary_loss_mlp": 0.01068047, "balance_loss_clip": 1.03111553, "balance_loss_mlp": 1.02393651, "epoch": 0.2897940778596122, "flos": 18696477553920.0, "grad_norm": 1.7703697372160547, "language_loss": 0.80564809, "learning_rate": 3.332791681244776e-06, "loss": 0.82709509, "num_input_tokens_seen": 104005070, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.52734375, "step": 4820, "time_per_iteration": 2.3786885738372803 }, { "auxiliary_loss_clip": 0.01076354, "auxiliary_loss_mlp": 0.0105713, "balance_loss_clip": 1.02322674, "balance_loss_mlp": 1.02430773, "epoch": 0.28985420111228016, "flos": 18769306383360.0, "grad_norm": 2.0657199995573867, "language_loss": 0.75037491, "learning_rate": 3.332501274072231e-06, "loss": 0.77170968, "num_input_tokens_seen": 104022945, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.51953125, "step": 4821, "time_per_iteration": 2.5142199993133545 }, { "auxiliary_loss_clip": 0.01075543, "auxiliary_loss_mlp": 0.01060032, "balance_loss_clip": 1.0254612, "balance_loss_mlp": 1.02461958, "epoch": 0.28991432436494813, "flos": 23068895495040.0, "grad_norm": 1.8891130077683513, "language_loss": 0.73827028, "learning_rate": 3.332210816371104e-06, "loss": 0.75962603, "num_input_tokens_seen": 104042080, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.5078125, "step": 4822, "time_per_iteration": 2.4035074710845947 }, { "auxiliary_loss_clip": 0.01073723, "auxiliary_loss_mlp": 0.01067737, "balance_loss_clip": 1.03421545, "balance_loss_mlp": 1.02355075, "epoch": 0.2899744476176161, "flos": 17602229721600.0, "grad_norm": 1.7607369357515252, "language_loss": 0.67336535, "learning_rate": 3.3319203081524102e-06, "loss": 0.69477999, "num_input_tokens_seen": 104060975, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.5, "step": 4823, "time_per_iteration": 2.3950703144073486 }, { "auxiliary_loss_clip": 0.01071275, "auxiliary_loss_mlp": 0.01057437, "balance_loss_clip": 1.02444017, "balance_loss_mlp": 1.0224185, "epoch": 0.29003457087028406, "flos": 22308277415040.0, "grad_norm": 1.9338915450665042, "language_loss": 0.83148879, "learning_rate": 3.331629749427164e-06, "loss": 0.85277593, "num_input_tokens_seen": 104081395, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.48828125, "step": 4824, "time_per_iteration": 2.3936192989349365 }, { "auxiliary_loss_clip": 0.01072199, "auxiliary_loss_mlp": 0.01058554, "balance_loss_clip": 1.02274394, "balance_loss_mlp": 1.02192903, "epoch": 0.2900946941229521, "flos": 21943888888320.0, "grad_norm": 2.257098834980242, "language_loss": 0.74338484, "learning_rate": 3.331339140206385e-06, "loss": 0.76469243, "num_input_tokens_seen": 104099995, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.50390625, "step": 4825, "time_per_iteration": 2.4501688480377197 }, { "auxiliary_loss_clip": 0.01074698, "auxiliary_loss_mlp": 0.01052137, "balance_loss_clip": 1.01773286, "balance_loss_mlp": 1.02319407, "epoch": 0.29015481737562004, "flos": 17931181351680.0, "grad_norm": 2.2635394337086567, "language_loss": 0.75976992, "learning_rate": 3.331048480501092e-06, "loss": 0.78103817, "num_input_tokens_seen": 104118930, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.515625, "step": 4826, "time_per_iteration": 2.366119861602783 }, { "auxiliary_loss_clip": 0.01070986, "auxiliary_loss_mlp": 0.01054655, "balance_loss_clip": 1.02137196, "balance_loss_mlp": 1.02134156, "epoch": 0.290214940628288, "flos": 22782432856320.0, "grad_norm": 1.694768873937425, "language_loss": 0.70475221, "learning_rate": 3.3307577703223073e-06, "loss": 0.72600865, "num_input_tokens_seen": 104136940, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.49609375, "step": 4827, "time_per_iteration": 2.5647995471954346 }, { "auxiliary_loss_clip": 0.01075844, "auxiliary_loss_mlp": 0.01053036, "balance_loss_clip": 1.01903749, "balance_loss_mlp": 1.02522469, "epoch": 0.290275063880956, "flos": 20005581093120.0, "grad_norm": 1.7994022040925317, "language_loss": 0.81390083, "learning_rate": 3.3304670096810545e-06, "loss": 0.83518958, "num_input_tokens_seen": 104154280, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.5078125, "step": 4828, "time_per_iteration": 2.4223439693450928 }, { "auxiliary_loss_clip": 0.01073436, "auxiliary_loss_mlp": 0.01058096, "balance_loss_clip": 1.02471757, "balance_loss_mlp": 1.02404261, "epoch": 0.29033518713362394, "flos": 22052538639360.0, "grad_norm": 1.8723298988506072, "language_loss": 0.81776071, "learning_rate": 3.33017619858836e-06, "loss": 0.83907592, "num_input_tokens_seen": 104172605, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.49414062, "step": 4829, "time_per_iteration": 2.4619228839874268 }, { "auxiliary_loss_clip": 0.01072167, "auxiliary_loss_mlp": 0.01047136, "balance_loss_clip": 1.01642776, "balance_loss_mlp": 1.02366257, "epoch": 0.2903953103862919, "flos": 25628866692480.0, "grad_norm": 1.4492073900328657, "language_loss": 0.83741641, "learning_rate": 3.329885337055249e-06, "loss": 0.85860944, "num_input_tokens_seen": 104194120, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.48632812, "step": 4830, "time_per_iteration": 2.4402098655700684 }, { "auxiliary_loss_clip": 0.01074304, "auxiliary_loss_mlp": 0.01051683, "balance_loss_clip": 1.01916313, "balance_loss_mlp": 1.02435994, "epoch": 0.29045543363895987, "flos": 16944919954560.0, "grad_norm": 2.3300599919391987, "language_loss": 0.81380785, "learning_rate": 3.3295944250927546e-06, "loss": 0.83506775, "num_input_tokens_seen": 104210875, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.5, "step": 4831, "time_per_iteration": 2.4789958000183105 }, { "auxiliary_loss_clip": 0.0107142, "auxiliary_loss_mlp": 0.01049467, "balance_loss_clip": 1.02021289, "balance_loss_mlp": 1.02436423, "epoch": 0.29051555689162784, "flos": 26394302540160.0, "grad_norm": 1.9296052727155317, "language_loss": 0.75834548, "learning_rate": 3.3293034627119055e-06, "loss": 0.77955431, "num_input_tokens_seen": 104229875, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.46875, "step": 4832, "time_per_iteration": 2.428722620010376 }, { "auxiliary_loss_clip": 0.01071331, "auxiliary_loss_mlp": 0.01043554, "balance_loss_clip": 1.01322722, "balance_loss_mlp": 1.02352381, "epoch": 0.2905756801442958, "flos": 21102866213760.0, "grad_norm": 1.5656511306715408, "language_loss": 0.77419889, "learning_rate": 3.329012449923736e-06, "loss": 0.79534775, "num_input_tokens_seen": 104250405, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.4765625, "step": 4833, "time_per_iteration": 2.471726179122925 }, { "auxiliary_loss_clip": 0.01073317, "auxiliary_loss_mlp": 0.01046768, "balance_loss_clip": 1.01451039, "balance_loss_mlp": 1.0253334, "epoch": 0.29063580339696377, "flos": 15705154108800.0, "grad_norm": 1.5160196133509451, "language_loss": 0.6642921, "learning_rate": 3.3287213867392813e-06, "loss": 0.68549293, "num_input_tokens_seen": 104269185, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.48046875, "step": 4834, "time_per_iteration": 2.401853084564209 }, { "auxiliary_loss_clip": 0.01069633, "auxiliary_loss_mlp": 0.01044276, "balance_loss_clip": 1.01442575, "balance_loss_mlp": 1.0232029, "epoch": 0.29069592664963173, "flos": 24643827192960.0, "grad_norm": 1.5260463333510896, "language_loss": 0.73305595, "learning_rate": 3.3284302731695783e-06, "loss": 0.75419509, "num_input_tokens_seen": 104289400, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.46484375, "step": 4835, "time_per_iteration": 2.4605278968811035 }, { "auxiliary_loss_clip": 0.01070639, "auxiliary_loss_mlp": 0.01050404, "balance_loss_clip": 1.02012444, "balance_loss_mlp": 1.02301168, "epoch": 0.2907560499022997, "flos": 24972569354880.0, "grad_norm": 1.5760233865184936, "language_loss": 0.81034529, "learning_rate": 3.3281391092256668e-06, "loss": 0.83155566, "num_input_tokens_seen": 104310485, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.4765625, "step": 4836, "time_per_iteration": 2.462496757507324 }, { "auxiliary_loss_clip": 0.01070378, "auxiliary_loss_mlp": 0.01046846, "balance_loss_clip": 1.01747298, "balance_loss_mlp": 1.02356136, "epoch": 0.29081617315496766, "flos": 18656606914560.0, "grad_norm": 1.6347402066325598, "language_loss": 0.82773226, "learning_rate": 3.3278478949185865e-06, "loss": 0.84890455, "num_input_tokens_seen": 104327330, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.46679688, "step": 4837, "time_per_iteration": 2.4614601135253906 }, { "auxiliary_loss_clip": 0.01069727, "auxiliary_loss_mlp": 0.01047956, "balance_loss_clip": 1.01512551, "balance_loss_mlp": 1.02030027, "epoch": 0.2908762964076356, "flos": 35329693956480.0, "grad_norm": 2.4281840639783927, "language_loss": 0.68271661, "learning_rate": 3.327556630259381e-06, "loss": 0.70389342, "num_input_tokens_seen": 104350350, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.49414062, "step": 4838, "time_per_iteration": 2.5122272968292236 }, { "auxiliary_loss_clip": 0.01075079, "auxiliary_loss_mlp": 0.010554, "balance_loss_clip": 1.01997054, "balance_loss_mlp": 1.02445769, "epoch": 0.29093641966030365, "flos": 23075179539840.0, "grad_norm": 1.640599650086532, "language_loss": 0.7251932, "learning_rate": 3.327265315259095e-06, "loss": 0.74649799, "num_input_tokens_seen": 104369995, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.5078125, "step": 4839, "time_per_iteration": 2.472647190093994 }, { "auxiliary_loss_clip": 0.01071145, "auxiliary_loss_mlp": 0.01051697, "balance_loss_clip": 1.020154, "balance_loss_mlp": 1.02251196, "epoch": 0.2909965429129716, "flos": 35953940799360.0, "grad_norm": 1.9854029705570404, "language_loss": 0.77677441, "learning_rate": 3.326973949928776e-06, "loss": 0.79800284, "num_input_tokens_seen": 104392285, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.48632812, "step": 4840, "time_per_iteration": 3.988675355911255 }, { "auxiliary_loss_clip": 0.01072052, "auxiliary_loss_mlp": 0.01052887, "balance_loss_clip": 1.01998496, "balance_loss_mlp": 1.02300477, "epoch": 0.2910566661656396, "flos": 30879001013760.0, "grad_norm": 1.796450017146185, "language_loss": 0.61609578, "learning_rate": 3.326682534279471e-06, "loss": 0.63734514, "num_input_tokens_seen": 104412640, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.49023438, "step": 4841, "time_per_iteration": 2.5071892738342285 }, { "auxiliary_loss_clip": 0.01071493, "auxiliary_loss_mlp": 0.01050884, "balance_loss_clip": 1.0179584, "balance_loss_mlp": 1.02227485, "epoch": 0.29111678941830754, "flos": 30008825487360.0, "grad_norm": 1.9291208358891798, "language_loss": 0.72571898, "learning_rate": 3.326391068322232e-06, "loss": 0.74694276, "num_input_tokens_seen": 104435245, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.4921875, "step": 4842, "time_per_iteration": 3.8700199127197266 }, { "auxiliary_loss_clip": 0.01069128, "auxiliary_loss_mlp": 0.01047507, "balance_loss_clip": 1.01614356, "balance_loss_mlp": 1.02155626, "epoch": 0.2911769126709755, "flos": 22856274115200.0, "grad_norm": 1.4386335905371743, "language_loss": 0.75185323, "learning_rate": 3.3260995520681098e-06, "loss": 0.77301955, "num_input_tokens_seen": 104455395, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.4765625, "step": 4843, "time_per_iteration": 2.4057388305664062 }, { "auxiliary_loss_clip": 0.01069561, "auxiliary_loss_mlp": 0.01047418, "balance_loss_clip": 1.01487374, "balance_loss_mlp": 1.02084136, "epoch": 0.2912370359236435, "flos": 21649501370880.0, "grad_norm": 1.9498740757938409, "language_loss": 0.59697199, "learning_rate": 3.3258079855281602e-06, "loss": 0.61814177, "num_input_tokens_seen": 104473350, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.48632812, "step": 4844, "time_per_iteration": 3.926154851913452 }, { "auxiliary_loss_clip": 0.01073613, "auxiliary_loss_mlp": 0.0105114, "balance_loss_clip": 1.01537764, "balance_loss_mlp": 1.02288723, "epoch": 0.29129715917631144, "flos": 22892234682240.0, "grad_norm": 1.8795667400760883, "language_loss": 0.88026434, "learning_rate": 3.3255163687134396e-06, "loss": 0.90151185, "num_input_tokens_seen": 104492265, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.5078125, "step": 4845, "time_per_iteration": 2.4223411083221436 }, { "auxiliary_loss_clip": 0.01072573, "auxiliary_loss_mlp": 0.01057143, "balance_loss_clip": 1.01963937, "balance_loss_mlp": 1.02259171, "epoch": 0.2913572824289794, "flos": 22673364168960.0, "grad_norm": 1.6314729044197387, "language_loss": 0.6915915, "learning_rate": 3.3252247016350046e-06, "loss": 0.71288872, "num_input_tokens_seen": 104510755, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5, "step": 4846, "time_per_iteration": 3.9333624839782715 }, { "auxiliary_loss_clip": 0.01071084, "auxiliary_loss_mlp": 0.01050121, "balance_loss_clip": 1.01831567, "balance_loss_mlp": 1.02316034, "epoch": 0.29141740568164737, "flos": 23106427073280.0, "grad_norm": 1.6654248248235428, "language_loss": 0.71504289, "learning_rate": 3.3249329843039166e-06, "loss": 0.73625493, "num_input_tokens_seen": 104530830, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.47851562, "step": 4847, "time_per_iteration": 2.5234131813049316 }, { "auxiliary_loss_clip": 0.01072629, "auxiliary_loss_mlp": 0.01045762, "balance_loss_clip": 1.01221609, "balance_loss_mlp": 1.02349424, "epoch": 0.29147752893431533, "flos": 23585889041280.0, "grad_norm": 1.4993159706789372, "language_loss": 0.75313497, "learning_rate": 3.324641216731237e-06, "loss": 0.77431887, "num_input_tokens_seen": 104550115, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.4921875, "step": 4848, "time_per_iteration": 2.4333927631378174 }, { "auxiliary_loss_clip": 0.01071027, "auxiliary_loss_mlp": 0.01053241, "balance_loss_clip": 1.01940989, "balance_loss_mlp": 1.02208877, "epoch": 0.2915376521869833, "flos": 20591004637440.0, "grad_norm": 2.7211882075607368, "language_loss": 0.79357147, "learning_rate": 3.3243493989280295e-06, "loss": 0.81481409, "num_input_tokens_seen": 104566255, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.48828125, "step": 4849, "time_per_iteration": 2.420400619506836 }, { "auxiliary_loss_clip": 0.01074034, "auxiliary_loss_mlp": 0.01049937, "balance_loss_clip": 1.01443648, "balance_loss_mlp": 1.02355087, "epoch": 0.29159777543965126, "flos": 20810503555200.0, "grad_norm": 1.5777766122832055, "language_loss": 0.80270684, "learning_rate": 3.3240575309053596e-06, "loss": 0.82394648, "num_input_tokens_seen": 104585235, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.50390625, "step": 4850, "time_per_iteration": 2.388314962387085 }, { "auxiliary_loss_clip": 0.01070676, "auxiliary_loss_mlp": 0.01047199, "balance_loss_clip": 1.01331997, "balance_loss_mlp": 1.02213418, "epoch": 0.29165789869231923, "flos": 24242989340160.0, "grad_norm": 1.6652021646208748, "language_loss": 0.77076858, "learning_rate": 3.323765612674296e-06, "loss": 0.79194725, "num_input_tokens_seen": 104605315, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.484375, "step": 4851, "time_per_iteration": 2.4699764251708984 }, { "auxiliary_loss_clip": 0.01071387, "auxiliary_loss_mlp": 0.01046969, "balance_loss_clip": 1.01719046, "balance_loss_mlp": 1.0229665, "epoch": 0.29171802194498725, "flos": 28948653008640.0, "grad_norm": 1.3966409713721095, "language_loss": 0.78496802, "learning_rate": 3.3234736442459078e-06, "loss": 0.80615163, "num_input_tokens_seen": 104626055, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.484375, "step": 4852, "time_per_iteration": 2.4490742683410645 }, { "auxiliary_loss_clip": 0.01070788, "auxiliary_loss_mlp": 0.0105358, "balance_loss_clip": 1.01991558, "balance_loss_mlp": 1.02261329, "epoch": 0.2917781451976552, "flos": 22597218760320.0, "grad_norm": 1.8614846901315991, "language_loss": 0.78789645, "learning_rate": 3.3231816256312665e-06, "loss": 0.80914009, "num_input_tokens_seen": 104646005, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.48046875, "step": 4853, "time_per_iteration": 2.485038995742798 }, { "auxiliary_loss_clip": 0.01072619, "auxiliary_loss_mlp": 0.01048618, "balance_loss_clip": 1.01523995, "balance_loss_mlp": 1.02197242, "epoch": 0.2918382684503232, "flos": 21573530519040.0, "grad_norm": 2.3236688225356836, "language_loss": 0.89055943, "learning_rate": 3.322889556841445e-06, "loss": 0.91177183, "num_input_tokens_seen": 104661620, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.5078125, "step": 4854, "time_per_iteration": 2.378618001937866 }, { "auxiliary_loss_clip": 0.01070569, "auxiliary_loss_mlp": 0.01057428, "balance_loss_clip": 1.0233345, "balance_loss_mlp": 1.02120233, "epoch": 0.29189839170299114, "flos": 24352337318400.0, "grad_norm": 1.8639364092714703, "language_loss": 0.8733449, "learning_rate": 3.322597437887519e-06, "loss": 0.89462483, "num_input_tokens_seen": 104681445, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.49414062, "step": 4855, "time_per_iteration": 2.4928650856018066 }, { "auxiliary_loss_clip": 0.01017139, "auxiliary_loss_mlp": 0.01010783, "balance_loss_clip": 1.00639629, "balance_loss_mlp": 1.00586987, "epoch": 0.2919585149556591, "flos": 71313065032320.0, "grad_norm": 0.8226205422264361, "language_loss": 0.60308337, "learning_rate": 3.322305268780566e-06, "loss": 0.62336266, "num_input_tokens_seen": 104747945, "router_z_loss_clip": 0.04394531, "router_z_loss_mlp": 0.11279297, "step": 4856, "time_per_iteration": 3.1482086181640625 }, { "auxiliary_loss_clip": 0.01070308, "auxiliary_loss_mlp": 0.01049347, "balance_loss_clip": 1.01854348, "balance_loss_mlp": 1.02170873, "epoch": 0.2920186382083271, "flos": 15632290368000.0, "grad_norm": 1.6784295241136882, "language_loss": 0.69830358, "learning_rate": 3.322013049531664e-06, "loss": 0.71950018, "num_input_tokens_seen": 104766225, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.48632812, "step": 4857, "time_per_iteration": 2.467507839202881 }, { "auxiliary_loss_clip": 0.01071382, "auxiliary_loss_mlp": 0.01045638, "balance_loss_clip": 1.01489353, "balance_loss_mlp": 1.02295709, "epoch": 0.29207876146099504, "flos": 28364765564160.0, "grad_norm": 2.011303578434968, "language_loss": 0.85622841, "learning_rate": 3.321720780151895e-06, "loss": 0.87739861, "num_input_tokens_seen": 104785345, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.484375, "step": 4858, "time_per_iteration": 2.440809488296509 }, { "auxiliary_loss_clip": 0.01071842, "auxiliary_loss_mlp": 0.0105463, "balance_loss_clip": 1.02237201, "balance_loss_mlp": 1.02308083, "epoch": 0.292138884713663, "flos": 21869907984000.0, "grad_norm": 1.623429923013787, "language_loss": 0.79075944, "learning_rate": 3.321428460652342e-06, "loss": 0.81202418, "num_input_tokens_seen": 104804560, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.48828125, "step": 4859, "time_per_iteration": 2.414384603500366 }, { "auxiliary_loss_clip": 0.01073159, "auxiliary_loss_mlp": 0.01052122, "balance_loss_clip": 1.01900613, "balance_loss_mlp": 1.02229774, "epoch": 0.29219900796633097, "flos": 20991598110720.0, "grad_norm": 2.6628204581679316, "language_loss": 0.7077781, "learning_rate": 3.3211360910440885e-06, "loss": 0.72903091, "num_input_tokens_seen": 104821105, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.5078125, "step": 4860, "time_per_iteration": 2.41534686088562 }, { "auxiliary_loss_clip": 0.01070193, "auxiliary_loss_mlp": 0.01052389, "balance_loss_clip": 1.02083397, "balance_loss_mlp": 1.02235961, "epoch": 0.29225913121899894, "flos": 35003221032960.0, "grad_norm": 2.139665364680057, "language_loss": 0.76568305, "learning_rate": 3.320843671338222e-06, "loss": 0.78690886, "num_input_tokens_seen": 104841440, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.47851562, "step": 4861, "time_per_iteration": 2.544222831726074 }, { "auxiliary_loss_clip": 0.01067884, "auxiliary_loss_mlp": 0.0104972, "balance_loss_clip": 1.01989377, "balance_loss_mlp": 1.02146459, "epoch": 0.2923192544716669, "flos": 13514843053440.0, "grad_norm": 2.496752382358247, "language_loss": 0.93723172, "learning_rate": 3.320551201545832e-06, "loss": 0.95840776, "num_input_tokens_seen": 104858210, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.46484375, "step": 4862, "time_per_iteration": 2.362663745880127 }, { "auxiliary_loss_clip": 0.01069886, "auxiliary_loss_mlp": 0.0104875, "balance_loss_clip": 1.01885247, "balance_loss_mlp": 1.02121615, "epoch": 0.29237937772433487, "flos": 19462506894720.0, "grad_norm": 2.2662374111127117, "language_loss": 0.754219, "learning_rate": 3.320258681678008e-06, "loss": 0.77540541, "num_input_tokens_seen": 104875620, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.48632812, "step": 4863, "time_per_iteration": 2.444514274597168 }, { "auxiliary_loss_clip": 0.01068398, "auxiliary_loss_mlp": 0.01048657, "balance_loss_clip": 1.01897383, "balance_loss_mlp": 1.0232991, "epoch": 0.29243950097700283, "flos": 20849536321920.0, "grad_norm": 1.7105223351914913, "language_loss": 0.79495978, "learning_rate": 3.319966111745842e-06, "loss": 0.81613028, "num_input_tokens_seen": 104894600, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.453125, "step": 4864, "time_per_iteration": 2.4131219387054443 }, { "auxiliary_loss_clip": 0.01072191, "auxiliary_loss_mlp": 0.0105468, "balance_loss_clip": 1.01858377, "balance_loss_mlp": 1.02246499, "epoch": 0.29249962422967085, "flos": 23583165955200.0, "grad_norm": 1.6090769220502443, "language_loss": 0.82753515, "learning_rate": 3.319673491760429e-06, "loss": 0.84880388, "num_input_tokens_seen": 104914530, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.49804688, "step": 4865, "time_per_iteration": 2.4482688903808594 }, { "auxiliary_loss_clip": 0.01071196, "auxiliary_loss_mlp": 0.01049899, "balance_loss_clip": 1.01540005, "balance_loss_mlp": 1.02163339, "epoch": 0.2925597474823388, "flos": 22272247025280.0, "grad_norm": 1.8840959476230292, "language_loss": 0.87059575, "learning_rate": 3.3193808217328645e-06, "loss": 0.89180672, "num_input_tokens_seen": 104933460, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.49609375, "step": 4866, "time_per_iteration": 2.420487403869629 }, { "auxiliary_loss_clip": 0.01070118, "auxiliary_loss_mlp": 0.01048095, "balance_loss_clip": 1.01583707, "balance_loss_mlp": 1.02221847, "epoch": 0.2926198707350068, "flos": 34454770485120.0, "grad_norm": 1.596215330631747, "language_loss": 0.76350784, "learning_rate": 3.3190881016742476e-06, "loss": 0.78469002, "num_input_tokens_seen": 104954495, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.47851562, "step": 4867, "time_per_iteration": 2.5623457431793213 }, { "auxiliary_loss_clip": 0.01073657, "auxiliary_loss_mlp": 0.01058701, "balance_loss_clip": 1.02491713, "balance_loss_mlp": 1.02275741, "epoch": 0.29267999398767475, "flos": 20703110613120.0, "grad_norm": 1.913639311522078, "language_loss": 0.74725288, "learning_rate": 3.3187953315956776e-06, "loss": 0.76857644, "num_input_tokens_seen": 104971915, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.5078125, "step": 4868, "time_per_iteration": 2.382826328277588 }, { "auxiliary_loss_clip": 0.01069967, "auxiliary_loss_mlp": 0.010403, "balance_loss_clip": 1.00959158, "balance_loss_mlp": 1.02296472, "epoch": 0.2927401172403427, "flos": 18367700480640.0, "grad_norm": 1.4192228196538488, "language_loss": 0.75830472, "learning_rate": 3.3185025115082566e-06, "loss": 0.77940738, "num_input_tokens_seen": 104991335, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.46875, "step": 4869, "time_per_iteration": 2.4319021701812744 }, { "auxiliary_loss_clip": 0.01072517, "auxiliary_loss_mlp": 0.01053297, "balance_loss_clip": 1.01867843, "balance_loss_mlp": 1.02307403, "epoch": 0.2928002404930107, "flos": 26102847576960.0, "grad_norm": 1.470710897877159, "language_loss": 0.78101373, "learning_rate": 3.318209641423088e-06, "loss": 0.80227196, "num_input_tokens_seen": 105012015, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.49414062, "step": 4870, "time_per_iteration": 2.4650747776031494 }, { "auxiliary_loss_clip": 0.01073812, "auxiliary_loss_mlp": 0.01059873, "balance_loss_clip": 1.02244115, "balance_loss_mlp": 1.02202988, "epoch": 0.29286036374567864, "flos": 21323656851840.0, "grad_norm": 2.291424017738178, "language_loss": 0.69248939, "learning_rate": 3.3179167213512777e-06, "loss": 0.7138263, "num_input_tokens_seen": 105031460, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.51953125, "step": 4871, "time_per_iteration": 2.4673917293548584 }, { "auxiliary_loss_clip": 0.01070124, "auxiliary_loss_mlp": 0.01057419, "balance_loss_clip": 1.02506566, "balance_loss_mlp": 1.02232099, "epoch": 0.2929204869983466, "flos": 29568221729280.0, "grad_norm": 1.7290370470554173, "language_loss": 0.78987998, "learning_rate": 3.317623751303933e-06, "loss": 0.81115538, "num_input_tokens_seen": 105052965, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.47851562, "step": 4872, "time_per_iteration": 2.5025992393493652 }, { "auxiliary_loss_clip": 0.01073132, "auxiliary_loss_mlp": 0.01056448, "balance_loss_clip": 1.01808643, "balance_loss_mlp": 1.02347612, "epoch": 0.2929806102510146, "flos": 19057374944640.0, "grad_norm": 3.6130454317329086, "language_loss": 0.74260318, "learning_rate": 3.317330731292164e-06, "loss": 0.76389897, "num_input_tokens_seen": 105071840, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.49609375, "step": 4873, "time_per_iteration": 2.461566209793091 }, { "auxiliary_loss_clip": 0.01072895, "auxiliary_loss_mlp": 0.010536, "balance_loss_clip": 1.01843297, "balance_loss_mlp": 1.02171183, "epoch": 0.29304073350368254, "flos": 21943155749760.0, "grad_norm": 1.9713303159774065, "language_loss": 0.79986107, "learning_rate": 3.3170376613270812e-06, "loss": 0.82112604, "num_input_tokens_seen": 105089445, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.51171875, "step": 4874, "time_per_iteration": 2.3997695446014404 }, { "auxiliary_loss_clip": 0.01077039, "auxiliary_loss_mlp": 0.01053078, "balance_loss_clip": 1.01581359, "balance_loss_mlp": 1.02475643, "epoch": 0.2931008567563505, "flos": 15449904092160.0, "grad_norm": 2.1095128928403106, "language_loss": 0.7898097, "learning_rate": 3.3167445414197985e-06, "loss": 0.81111085, "num_input_tokens_seen": 105106210, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.5234375, "step": 4875, "time_per_iteration": 2.406203269958496 }, { "auxiliary_loss_clip": 0.01075784, "auxiliary_loss_mlp": 0.01053323, "balance_loss_clip": 1.01763201, "balance_loss_mlp": 1.02586794, "epoch": 0.29316098000901847, "flos": 16982207153280.0, "grad_norm": 2.9449864798644336, "language_loss": 0.70681322, "learning_rate": 3.316451371581431e-06, "loss": 0.72810429, "num_input_tokens_seen": 105124200, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.5, "step": 4876, "time_per_iteration": 2.3931055068969727 }, { "auxiliary_loss_clip": 0.01071525, "auxiliary_loss_mlp": 0.0105532, "balance_loss_clip": 1.0227282, "balance_loss_mlp": 1.0228765, "epoch": 0.29322110326168643, "flos": 16356912969600.0, "grad_norm": 1.9634521644839524, "language_loss": 0.84057999, "learning_rate": 3.316158151823096e-06, "loss": 0.86184835, "num_input_tokens_seen": 105140400, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.48632812, "step": 4877, "time_per_iteration": 2.422816514968872 }, { "auxiliary_loss_clip": 0.01078558, "auxiliary_loss_mlp": 0.01059278, "balance_loss_clip": 1.02268076, "balance_loss_mlp": 1.02663875, "epoch": 0.29328122651435445, "flos": 13990010924160.0, "grad_norm": 3.8218805808170018, "language_loss": 0.71173424, "learning_rate": 3.315864882155911e-06, "loss": 0.73311263, "num_input_tokens_seen": 105157535, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.51953125, "step": 4878, "time_per_iteration": 2.3955490589141846 }, { "auxiliary_loss_clip": 0.01074607, "auxiliary_loss_mlp": 0.0106074, "balance_loss_clip": 1.02385676, "balance_loss_mlp": 1.02409339, "epoch": 0.2933413497670224, "flos": 25263430824960.0, "grad_norm": 1.8630777652454271, "language_loss": 0.74581999, "learning_rate": 3.3155715625909982e-06, "loss": 0.76717347, "num_input_tokens_seen": 105175185, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.5078125, "step": 4879, "time_per_iteration": 2.4634108543395996 }, { "auxiliary_loss_clip": 0.01075968, "auxiliary_loss_mlp": 0.01068686, "balance_loss_clip": 1.03318548, "balance_loss_mlp": 1.02404356, "epoch": 0.2934014730196904, "flos": 32122397640960.0, "grad_norm": 2.1000944207387486, "language_loss": 0.682441, "learning_rate": 3.3152781931394803e-06, "loss": 0.70388758, "num_input_tokens_seen": 105194540, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.51953125, "step": 4880, "time_per_iteration": 3.931900978088379 }, { "auxiliary_loss_clip": 0.01072769, "auxiliary_loss_mlp": 0.01057478, "balance_loss_clip": 1.02185822, "balance_loss_mlp": 1.02212763, "epoch": 0.29346159627235835, "flos": 24351359800320.0, "grad_norm": 20.708060075014668, "language_loss": 0.73499632, "learning_rate": 3.314984773812481e-06, "loss": 0.75629878, "num_input_tokens_seen": 105213215, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.5078125, "step": 4881, "time_per_iteration": 2.44140887260437 }, { "auxiliary_loss_clip": 0.01072737, "auxiliary_loss_mlp": 0.01061236, "balance_loss_clip": 1.0237323, "balance_loss_mlp": 1.02187037, "epoch": 0.2935217195250263, "flos": 22745669328000.0, "grad_norm": 1.63244243773188, "language_loss": 0.8459152, "learning_rate": 3.314691304621127e-06, "loss": 0.86725497, "num_input_tokens_seen": 105231585, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5078125, "step": 4882, "time_per_iteration": 3.8535501956939697 }, { "auxiliary_loss_clip": 0.01074145, "auxiliary_loss_mlp": 0.01056139, "balance_loss_clip": 1.01739645, "balance_loss_mlp": 1.02330899, "epoch": 0.2935818427776943, "flos": 21724494704640.0, "grad_norm": 2.1956220755468503, "language_loss": 0.73960423, "learning_rate": 3.314397785576548e-06, "loss": 0.76090711, "num_input_tokens_seen": 105250120, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.5078125, "step": 4883, "time_per_iteration": 2.3880820274353027 }, { "auxiliary_loss_clip": 0.01072391, "auxiliary_loss_mlp": 0.01054484, "balance_loss_clip": 1.01862621, "balance_loss_mlp": 1.02203321, "epoch": 0.29364196603036224, "flos": 23803851859200.0, "grad_norm": 2.2670835496753154, "language_loss": 0.93938446, "learning_rate": 3.3141042166898726e-06, "loss": 0.96065319, "num_input_tokens_seen": 105266065, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.50390625, "step": 4884, "time_per_iteration": 3.9766452312469482 }, { "auxiliary_loss_clip": 0.01074587, "auxiliary_loss_mlp": 0.01057759, "balance_loss_clip": 1.02230597, "balance_loss_mlp": 1.02402091, "epoch": 0.2937020892830302, "flos": 23469139854720.0, "grad_norm": 2.316736242430448, "language_loss": 0.75325751, "learning_rate": 3.313810597972234e-06, "loss": 0.77458096, "num_input_tokens_seen": 105282155, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.50390625, "step": 4885, "time_per_iteration": 2.411137580871582 }, { "auxiliary_loss_clip": 0.01069872, "auxiliary_loss_mlp": 0.01052157, "balance_loss_clip": 1.01873064, "balance_loss_mlp": 1.02077472, "epoch": 0.2937622125356982, "flos": 24271793078400.0, "grad_norm": 2.02589585953045, "language_loss": 0.86321092, "learning_rate": 3.3135169294347655e-06, "loss": 0.88443124, "num_input_tokens_seen": 105299225, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.4921875, "step": 4886, "time_per_iteration": 3.812697649002075 }, { "auxiliary_loss_clip": 0.01072576, "auxiliary_loss_mlp": 0.01055209, "balance_loss_clip": 1.02075756, "balance_loss_mlp": 1.02184367, "epoch": 0.29382233578836614, "flos": 20661564228480.0, "grad_norm": 2.480560310220524, "language_loss": 0.78955257, "learning_rate": 3.313223211088603e-06, "loss": 0.81083047, "num_input_tokens_seen": 105315710, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.5078125, "step": 4887, "time_per_iteration": 2.427464246749878 }, { "auxiliary_loss_clip": 0.01072477, "auxiliary_loss_mlp": 0.01057146, "balance_loss_clip": 1.02293324, "balance_loss_mlp": 1.02183831, "epoch": 0.2938824590410341, "flos": 16544117013120.0, "grad_norm": 2.3209977973659384, "language_loss": 0.80854034, "learning_rate": 3.3129294429448855e-06, "loss": 0.82983661, "num_input_tokens_seen": 105333505, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.5078125, "step": 4888, "time_per_iteration": 2.3706562519073486 }, { "auxiliary_loss_clip": 0.01070153, "auxiliary_loss_mlp": 0.01048503, "balance_loss_clip": 1.0156486, "balance_loss_mlp": 1.02175987, "epoch": 0.29394258229370207, "flos": 37923949975680.0, "grad_norm": 4.030520061916835, "language_loss": 0.56728446, "learning_rate": 3.3126356250147517e-06, "loss": 0.588471, "num_input_tokens_seen": 105355605, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.484375, "step": 4889, "time_per_iteration": 2.581333637237549 }, { "auxiliary_loss_clip": 0.01073057, "auxiliary_loss_mlp": 0.01055146, "balance_loss_clip": 1.01730871, "balance_loss_mlp": 1.02242005, "epoch": 0.29400270554637004, "flos": 20043741075840.0, "grad_norm": 1.8805411142663446, "language_loss": 0.86202985, "learning_rate": 3.3123417573093434e-06, "loss": 0.88331187, "num_input_tokens_seen": 105374225, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5078125, "step": 4890, "time_per_iteration": 2.4000842571258545 }, { "auxiliary_loss_clip": 0.01074029, "auxiliary_loss_mlp": 0.01057175, "balance_loss_clip": 1.02265227, "balance_loss_mlp": 1.02325094, "epoch": 0.294062828799038, "flos": 15265527868800.0, "grad_norm": 2.0009550088355703, "language_loss": 0.74154902, "learning_rate": 3.3120478398398046e-06, "loss": 0.76286113, "num_input_tokens_seen": 105391565, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.5078125, "step": 4891, "time_per_iteration": 2.405726432800293 }, { "auxiliary_loss_clip": 0.01073932, "auxiliary_loss_mlp": 0.01054097, "balance_loss_clip": 1.01924038, "balance_loss_mlp": 1.02365768, "epoch": 0.294122952051706, "flos": 22746053352960.0, "grad_norm": 2.7955587730523157, "language_loss": 0.78124285, "learning_rate": 3.3117538726172797e-06, "loss": 0.80252314, "num_input_tokens_seen": 105409840, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.50390625, "step": 4892, "time_per_iteration": 2.397629737854004 }, { "auxiliary_loss_clip": 0.01069665, "auxiliary_loss_mlp": 0.0104934, "balance_loss_clip": 1.01612842, "balance_loss_mlp": 1.02111638, "epoch": 0.294183075304374, "flos": 24971731482240.0, "grad_norm": 2.327967692509786, "language_loss": 0.78952008, "learning_rate": 3.3114598556529164e-06, "loss": 0.81071019, "num_input_tokens_seen": 105428645, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.48632812, "step": 4893, "time_per_iteration": 2.4765195846557617 }, { "auxiliary_loss_clip": 0.01070998, "auxiliary_loss_mlp": 0.01055734, "balance_loss_clip": 1.02106786, "balance_loss_mlp": 1.02199149, "epoch": 0.29424319855704195, "flos": 30951760020480.0, "grad_norm": 1.6083825263218963, "language_loss": 0.85691649, "learning_rate": 3.311165788957864e-06, "loss": 0.87818378, "num_input_tokens_seen": 105447480, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.48828125, "step": 4894, "time_per_iteration": 2.4833176136016846 }, { "auxiliary_loss_clip": 0.01072848, "auxiliary_loss_mlp": 0.01049971, "balance_loss_clip": 1.01778483, "balance_loss_mlp": 1.02286887, "epoch": 0.2943033218097099, "flos": 15230684465280.0, "grad_norm": 4.0762549356447115, "language_loss": 0.9149397, "learning_rate": 3.310871672543274e-06, "loss": 0.93616784, "num_input_tokens_seen": 105464600, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.5, "step": 4895, "time_per_iteration": 2.401970148086548 }, { "auxiliary_loss_clip": 0.01073209, "auxiliary_loss_mlp": 0.01053612, "balance_loss_clip": 1.01744437, "balance_loss_mlp": 1.02204823, "epoch": 0.2943634450623779, "flos": 21724808906880.0, "grad_norm": 1.7204799501478967, "language_loss": 0.88888407, "learning_rate": 3.3105775064202982e-06, "loss": 0.91015232, "num_input_tokens_seen": 105481510, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.51171875, "step": 4896, "time_per_iteration": 2.4428389072418213 }, { "auxiliary_loss_clip": 0.01073168, "auxiliary_loss_mlp": 0.01060741, "balance_loss_clip": 1.02628946, "balance_loss_mlp": 1.02280641, "epoch": 0.29442356831504585, "flos": 22600989187200.0, "grad_norm": 1.861655127612311, "language_loss": 0.74606395, "learning_rate": 3.3102832906000924e-06, "loss": 0.76740313, "num_input_tokens_seen": 105501390, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.50390625, "step": 4897, "time_per_iteration": 2.4077720642089844 }, { "auxiliary_loss_clip": 0.01076597, "auxiliary_loss_mlp": 0.01057251, "balance_loss_clip": 1.01841283, "balance_loss_mlp": 1.02267504, "epoch": 0.2944836915677138, "flos": 20010363949440.0, "grad_norm": 1.9473793465555769, "language_loss": 0.7636286, "learning_rate": 3.309989025093813e-06, "loss": 0.78496706, "num_input_tokens_seen": 105519600, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.5390625, "step": 4898, "time_per_iteration": 2.4418370723724365 }, { "auxiliary_loss_clip": 0.01076061, "auxiliary_loss_mlp": 0.01066152, "balance_loss_clip": 1.02736139, "balance_loss_mlp": 1.02287102, "epoch": 0.2945438148203818, "flos": 20044893150720.0, "grad_norm": 2.882657523578406, "language_loss": 0.72890675, "learning_rate": 3.309694709912618e-06, "loss": 0.75032896, "num_input_tokens_seen": 105535970, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.53125, "step": 4899, "time_per_iteration": 2.3737075328826904 }, { "auxiliary_loss_clip": 0.01072335, "auxiliary_loss_mlp": 0.01056938, "balance_loss_clip": 1.02129459, "balance_loss_mlp": 1.02196217, "epoch": 0.29460393807304974, "flos": 23732384572800.0, "grad_norm": 2.0422618610528254, "language_loss": 0.80682731, "learning_rate": 3.3094003450676685e-06, "loss": 0.82812011, "num_input_tokens_seen": 105556735, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.50390625, "step": 4900, "time_per_iteration": 2.464665412902832 }, { "auxiliary_loss_clip": 0.01070789, "auxiliary_loss_mlp": 0.01061173, "balance_loss_clip": 1.02481437, "balance_loss_mlp": 1.02050233, "epoch": 0.2946640613257177, "flos": 14975190069120.0, "grad_norm": 1.7036865815961801, "language_loss": 0.81882906, "learning_rate": 3.3091059305701268e-06, "loss": 0.84014869, "num_input_tokens_seen": 105574875, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.50390625, "step": 4901, "time_per_iteration": 2.3845183849334717 }, { "auxiliary_loss_clip": 0.01066002, "auxiliary_loss_mlp": 0.01045335, "balance_loss_clip": 1.01500797, "balance_loss_mlp": 1.01999021, "epoch": 0.2947241845783857, "flos": 24242744960640.0, "grad_norm": 1.9926728591297775, "language_loss": 0.59630316, "learning_rate": 3.308811466431157e-06, "loss": 0.6174165, "num_input_tokens_seen": 105594225, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.4609375, "step": 4902, "time_per_iteration": 2.4277822971343994 }, { "auxiliary_loss_clip": 0.01072511, "auxiliary_loss_mlp": 0.01051809, "balance_loss_clip": 1.01833546, "balance_loss_mlp": 1.02206349, "epoch": 0.29478430783105364, "flos": 19937360563200.0, "grad_norm": 1.7393942931839688, "language_loss": 0.76862359, "learning_rate": 3.308516952661925e-06, "loss": 0.78986681, "num_input_tokens_seen": 105614000, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.50390625, "step": 4903, "time_per_iteration": 2.4084975719451904 }, { "auxiliary_loss_clip": 0.01073023, "auxiliary_loss_mlp": 0.01053052, "balance_loss_clip": 1.01662183, "balance_loss_mlp": 1.0226469, "epoch": 0.2948444310837216, "flos": 27380110089600.0, "grad_norm": 2.3322529459646644, "language_loss": 0.63927937, "learning_rate": 3.3082223892736e-06, "loss": 0.6605401, "num_input_tokens_seen": 105634575, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.50390625, "step": 4904, "time_per_iteration": 2.4827022552490234 }, { "auxiliary_loss_clip": 0.01072478, "auxiliary_loss_mlp": 0.01055474, "balance_loss_clip": 1.01994967, "balance_loss_mlp": 1.0208993, "epoch": 0.2949045543363896, "flos": 23404305726720.0, "grad_norm": 1.9987314484866203, "language_loss": 0.74582869, "learning_rate": 3.3079277762773496e-06, "loss": 0.7671082, "num_input_tokens_seen": 105654385, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.515625, "step": 4905, "time_per_iteration": 2.3973491191864014 }, { "auxiliary_loss_clip": 0.01071154, "auxiliary_loss_mlp": 0.01055249, "balance_loss_clip": 1.01927161, "balance_loss_mlp": 1.0215739, "epoch": 0.2949646775890576, "flos": 23950347390720.0, "grad_norm": 1.8243963168510884, "language_loss": 0.82407546, "learning_rate": 3.3076331136843476e-06, "loss": 0.84533948, "num_input_tokens_seen": 105673570, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.49609375, "step": 4906, "time_per_iteration": 2.463884115219116 }, { "auxiliary_loss_clip": 0.0106723, "auxiliary_loss_mlp": 0.0105081, "balance_loss_clip": 1.01855183, "balance_loss_mlp": 1.01988578, "epoch": 0.29502480084172555, "flos": 22783200906240.0, "grad_norm": 1.9292996709726093, "language_loss": 0.88355416, "learning_rate": 3.3073384015057667e-06, "loss": 0.90473461, "num_input_tokens_seen": 105691940, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.47265625, "step": 4907, "time_per_iteration": 2.4347665309906006 }, { "auxiliary_loss_clip": 0.01075031, "auxiliary_loss_mlp": 0.01056443, "balance_loss_clip": 1.018224, "balance_loss_mlp": 1.0227021, "epoch": 0.2950849240943935, "flos": 19645626309120.0, "grad_norm": 2.0037143148795864, "language_loss": 0.84027827, "learning_rate": 3.307043639752782e-06, "loss": 0.86159301, "num_input_tokens_seen": 105709825, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.5234375, "step": 4908, "time_per_iteration": 2.4218852519989014 }, { "auxiliary_loss_clip": 0.01016229, "auxiliary_loss_mlp": 0.01007377, "balance_loss_clip": 1.0026089, "balance_loss_mlp": 1.00526738, "epoch": 0.2951450473470615, "flos": 70999790469120.0, "grad_norm": 0.7758432712805711, "language_loss": 0.57343763, "learning_rate": 3.3067488284365728e-06, "loss": 0.59367371, "num_input_tokens_seen": 105766880, "router_z_loss_clip": 0.04760742, "router_z_loss_mlp": 0.109375, "step": 4909, "time_per_iteration": 2.930645704269409 }, { "auxiliary_loss_clip": 0.01070016, "auxiliary_loss_mlp": 0.01058493, "balance_loss_clip": 1.02525711, "balance_loss_mlp": 1.02135289, "epoch": 0.29520517059972945, "flos": 22965203157120.0, "grad_norm": 1.6828892703361031, "language_loss": 0.87812179, "learning_rate": 3.3064539675683163e-06, "loss": 0.89940691, "num_input_tokens_seen": 105786875, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.48632812, "step": 4910, "time_per_iteration": 2.4146642684936523 }, { "auxiliary_loss_clip": 0.01068003, "auxiliary_loss_mlp": 0.01052024, "balance_loss_clip": 1.01979017, "balance_loss_mlp": 1.02125287, "epoch": 0.2952652938523974, "flos": 20484624124800.0, "grad_norm": 1.9260426355727707, "language_loss": 0.7508145, "learning_rate": 3.3061590571591946e-06, "loss": 0.77201474, "num_input_tokens_seen": 105805315, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.46679688, "step": 4911, "time_per_iteration": 2.3895938396453857 }, { "auxiliary_loss_clip": 0.01068787, "auxiliary_loss_mlp": 0.01047243, "balance_loss_clip": 1.01498497, "balance_loss_mlp": 1.02101326, "epoch": 0.2953254171050654, "flos": 19645556486400.0, "grad_norm": 1.6150248287891977, "language_loss": 0.9094699, "learning_rate": 3.3058640972203904e-06, "loss": 0.93063021, "num_input_tokens_seen": 105825125, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.4765625, "step": 4912, "time_per_iteration": 2.4016170501708984 }, { "auxiliary_loss_clip": 0.01071249, "auxiliary_loss_mlp": 0.01058366, "balance_loss_clip": 1.0247488, "balance_loss_mlp": 1.02207303, "epoch": 0.29538554035773334, "flos": 22746856314240.0, "grad_norm": 1.5665799681793202, "language_loss": 0.84850657, "learning_rate": 3.3055690877630894e-06, "loss": 0.86980271, "num_input_tokens_seen": 105846085, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.4921875, "step": 4913, "time_per_iteration": 2.3933815956115723 }, { "auxiliary_loss_clip": 0.01071415, "auxiliary_loss_mlp": 0.01051326, "balance_loss_clip": 1.01651704, "balance_loss_mlp": 1.0206145, "epoch": 0.2954456636104013, "flos": 21870780768000.0, "grad_norm": 1.8005118118482717, "language_loss": 0.78663898, "learning_rate": 3.3052740287984765e-06, "loss": 0.80786639, "num_input_tokens_seen": 105865400, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.5078125, "step": 4914, "time_per_iteration": 2.4108452796936035 }, { "auxiliary_loss_clip": 0.01069694, "auxiliary_loss_mlp": 0.0105911, "balance_loss_clip": 1.02244163, "balance_loss_mlp": 1.02077579, "epoch": 0.2955057868630693, "flos": 40440978334080.0, "grad_norm": 1.748238031148167, "language_loss": 0.82918084, "learning_rate": 3.3049789203377424e-06, "loss": 0.85046887, "num_input_tokens_seen": 105887920, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.48828125, "step": 4915, "time_per_iteration": 2.5443222522735596 }, { "auxiliary_loss_clip": 0.01074516, "auxiliary_loss_mlp": 0.01063129, "balance_loss_clip": 1.02560246, "balance_loss_mlp": 1.02329421, "epoch": 0.29556591011573724, "flos": 22563422697600.0, "grad_norm": 2.4901175114973033, "language_loss": 0.85699809, "learning_rate": 3.3046837623920772e-06, "loss": 0.87837458, "num_input_tokens_seen": 105904035, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.51171875, "step": 4916, "time_per_iteration": 2.417827844619751 }, { "auxiliary_loss_clip": 0.01069864, "auxiliary_loss_mlp": 0.01049955, "balance_loss_clip": 1.01724362, "balance_loss_mlp": 1.02229571, "epoch": 0.2956260333684052, "flos": 22088254826880.0, "grad_norm": 2.0541672440034686, "language_loss": 0.72090679, "learning_rate": 3.3043885549726723e-06, "loss": 0.74210501, "num_input_tokens_seen": 105922685, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.4765625, "step": 4917, "time_per_iteration": 2.405491352081299 }, { "auxiliary_loss_clip": 0.01073285, "auxiliary_loss_mlp": 0.01051252, "balance_loss_clip": 1.01606154, "balance_loss_mlp": 1.02350497, "epoch": 0.2956861566210732, "flos": 16434559566720.0, "grad_norm": 2.02610689917768, "language_loss": 0.92322052, "learning_rate": 3.3040932980907226e-06, "loss": 0.94446588, "num_input_tokens_seen": 105940425, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.49804688, "step": 4918, "time_per_iteration": 2.4028449058532715 }, { "auxiliary_loss_clip": 0.0107474, "auxiliary_loss_mlp": 0.01055703, "balance_loss_clip": 1.02098966, "balance_loss_mlp": 1.02430367, "epoch": 0.2957462798737412, "flos": 25810903854720.0, "grad_norm": 2.0603121661759816, "language_loss": 0.73989892, "learning_rate": 3.303797991757425e-06, "loss": 0.76120335, "num_input_tokens_seen": 105960550, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.50390625, "step": 4919, "time_per_iteration": 3.8773398399353027 }, { "auxiliary_loss_clip": 0.01069251, "auxiliary_loss_mlp": 0.01053329, "balance_loss_clip": 1.01954544, "balance_loss_mlp": 1.02120352, "epoch": 0.29580640312640916, "flos": 16689914317440.0, "grad_norm": 2.010343786307566, "language_loss": 0.77442658, "learning_rate": 3.3035026359839763e-06, "loss": 0.79565239, "num_input_tokens_seen": 105978820, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.48046875, "step": 4920, "time_per_iteration": 2.374286413192749 }, { "auxiliary_loss_clip": 0.01073527, "auxiliary_loss_mlp": 0.0106088, "balance_loss_clip": 1.02480698, "balance_loss_mlp": 1.02324927, "epoch": 0.2958665263790771, "flos": 23944621927680.0, "grad_norm": 2.641901891840026, "language_loss": 0.69957709, "learning_rate": 3.3032072307815774e-06, "loss": 0.72092116, "num_input_tokens_seen": 105997545, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.50390625, "step": 4921, "time_per_iteration": 2.4469776153564453 }, { "auxiliary_loss_clip": 0.0107464, "auxiliary_loss_mlp": 0.01057524, "balance_loss_clip": 1.02188039, "balance_loss_mlp": 1.02316546, "epoch": 0.2959266496317451, "flos": 18477432483840.0, "grad_norm": 1.9260588407204906, "language_loss": 0.75770396, "learning_rate": 3.3029117761614298e-06, "loss": 0.77902567, "num_input_tokens_seen": 106015320, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.515625, "step": 4922, "time_per_iteration": 3.7966063022613525 }, { "auxiliary_loss_clip": 0.01075219, "auxiliary_loss_mlp": 0.01050785, "balance_loss_clip": 1.01493919, "balance_loss_mlp": 1.02236199, "epoch": 0.29598677288441305, "flos": 25956317134080.0, "grad_norm": 1.9803107240539604, "language_loss": 0.77720785, "learning_rate": 3.302616272134737e-06, "loss": 0.79846793, "num_input_tokens_seen": 106034555, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.52734375, "step": 4923, "time_per_iteration": 2.4408934116363525 }, { "auxiliary_loss_clip": 0.01071541, "auxiliary_loss_mlp": 0.01052874, "balance_loss_clip": 1.01870894, "balance_loss_mlp": 1.02268553, "epoch": 0.296046896137081, "flos": 25154815985280.0, "grad_norm": 1.690597209565533, "language_loss": 0.87408042, "learning_rate": 3.3023207187127042e-06, "loss": 0.89532459, "num_input_tokens_seen": 106054200, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.48828125, "step": 4924, "time_per_iteration": 3.8029675483703613 }, { "auxiliary_loss_clip": 0.01071253, "auxiliary_loss_mlp": 0.01051231, "balance_loss_clip": 1.0169704, "balance_loss_mlp": 1.02319121, "epoch": 0.296107019389749, "flos": 21760106158080.0, "grad_norm": 1.3867499715422766, "language_loss": 0.82853138, "learning_rate": 3.3020251159065396e-06, "loss": 0.84975624, "num_input_tokens_seen": 106074700, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.48046875, "step": 4925, "time_per_iteration": 2.3817451000213623 }, { "auxiliary_loss_clip": 0.01073216, "auxiliary_loss_mlp": 0.01050021, "balance_loss_clip": 1.01692891, "balance_loss_mlp": 1.02528656, "epoch": 0.29616714264241695, "flos": 17959286862720.0, "grad_norm": 3.0850632983080497, "language_loss": 0.88759178, "learning_rate": 3.301729463727452e-06, "loss": 0.90882415, "num_input_tokens_seen": 106091415, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.48046875, "step": 4926, "time_per_iteration": 3.849029541015625 }, { "auxiliary_loss_clip": 0.01072104, "auxiliary_loss_mlp": 0.01052173, "balance_loss_clip": 1.01965237, "balance_loss_mlp": 1.02222085, "epoch": 0.2962272658950849, "flos": 15011883774720.0, "grad_norm": 1.9768862123988153, "language_loss": 0.8768667, "learning_rate": 3.3014337621866527e-06, "loss": 0.89810944, "num_input_tokens_seen": 106109135, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.5, "step": 4927, "time_per_iteration": 2.34637713432312 }, { "auxiliary_loss_clip": 0.01070448, "auxiliary_loss_mlp": 0.0104864, "balance_loss_clip": 1.0184567, "balance_loss_mlp": 1.023561, "epoch": 0.2962873891477529, "flos": 14719974963840.0, "grad_norm": 1.6650179327074452, "language_loss": 0.81734651, "learning_rate": 3.3011380112953553e-06, "loss": 0.83853734, "num_input_tokens_seen": 106125750, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.46875, "step": 4928, "time_per_iteration": 2.4015843868255615 }, { "auxiliary_loss_clip": 0.01074564, "auxiliary_loss_mlp": 0.01061494, "balance_loss_clip": 1.02172613, "balance_loss_mlp": 1.02197695, "epoch": 0.29634751240042084, "flos": 26722590854400.0, "grad_norm": 2.6008225094475828, "language_loss": 0.75659472, "learning_rate": 3.300842211064773e-06, "loss": 0.77795529, "num_input_tokens_seen": 106142835, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.52734375, "step": 4929, "time_per_iteration": 2.420578718185425 }, { "auxiliary_loss_clip": 0.01075235, "auxiliary_loss_mlp": 0.01058544, "balance_loss_clip": 1.02085018, "balance_loss_mlp": 1.02380538, "epoch": 0.2964076356530888, "flos": 14570511966720.0, "grad_norm": 2.3251143620708588, "language_loss": 0.7397933, "learning_rate": 3.3005463615061246e-06, "loss": 0.76113117, "num_input_tokens_seen": 106160680, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.515625, "step": 4930, "time_per_iteration": 2.3859140872955322 }, { "auxiliary_loss_clip": 0.01017994, "auxiliary_loss_mlp": 0.01010924, "balance_loss_clip": 1.00596523, "balance_loss_mlp": 1.0059793, "epoch": 0.29646775890575683, "flos": 63100969585920.0, "grad_norm": 0.8118460453780725, "language_loss": 0.60787827, "learning_rate": 3.3002504626306275e-06, "loss": 0.62816745, "num_input_tokens_seen": 106224415, "router_z_loss_clip": 0.04956055, "router_z_loss_mlp": 0.12011719, "step": 4931, "time_per_iteration": 2.968148708343506 }, { "auxiliary_loss_clip": 0.01015881, "auxiliary_loss_mlp": 0.01010139, "balance_loss_clip": 1.00568056, "balance_loss_mlp": 1.00415421, "epoch": 0.2965278821584248, "flos": 63064345703040.0, "grad_norm": 0.7475617868296669, "language_loss": 0.52471447, "learning_rate": 3.2999545144495023e-06, "loss": 0.54497457, "num_input_tokens_seen": 106279140, "router_z_loss_clip": 0.04467773, "router_z_loss_mlp": 0.1171875, "step": 4932, "time_per_iteration": 2.9332351684570312 }, { "auxiliary_loss_clip": 0.01068926, "auxiliary_loss_mlp": 0.01055929, "balance_loss_clip": 1.02438581, "balance_loss_mlp": 1.02187634, "epoch": 0.29658800541109276, "flos": 23767612001280.0, "grad_norm": 1.822576153423367, "language_loss": 0.83077341, "learning_rate": 3.299658516973972e-06, "loss": 0.85202199, "num_input_tokens_seen": 106298190, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.47070312, "step": 4933, "time_per_iteration": 2.4172708988189697 }, { "auxiliary_loss_clip": 0.01067279, "auxiliary_loss_mlp": 0.01051815, "balance_loss_clip": 1.01977205, "balance_loss_mlp": 1.02147555, "epoch": 0.2966481286637607, "flos": 23987390209920.0, "grad_norm": 2.1605541960150134, "language_loss": 0.76602173, "learning_rate": 3.299362470215261e-06, "loss": 0.78721261, "num_input_tokens_seen": 106319065, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.45703125, "step": 4934, "time_per_iteration": 2.4323980808258057 }, { "auxiliary_loss_clip": 0.01069594, "auxiliary_loss_mlp": 0.01067141, "balance_loss_clip": 1.03270173, "balance_loss_mlp": 1.02077115, "epoch": 0.2967082519164287, "flos": 17164209404160.0, "grad_norm": 1.7987647537193867, "language_loss": 0.64107931, "learning_rate": 3.299066374184594e-06, "loss": 0.66244662, "num_input_tokens_seen": 106338040, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.48828125, "step": 4935, "time_per_iteration": 2.3592231273651123 }, { "auxiliary_loss_clip": 0.01068163, "auxiliary_loss_mlp": 0.01068427, "balance_loss_clip": 1.0324738, "balance_loss_mlp": 1.02128315, "epoch": 0.29676837516909665, "flos": 29386428946560.0, "grad_norm": 1.514062830653207, "language_loss": 0.80694783, "learning_rate": 3.2987702288932e-06, "loss": 0.82831371, "num_input_tokens_seen": 106358900, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.46875, "step": 4936, "time_per_iteration": 2.4798471927642822 }, { "auxiliary_loss_clip": 0.01071307, "auxiliary_loss_mlp": 0.0106556, "balance_loss_clip": 1.03058386, "balance_loss_mlp": 1.02231121, "epoch": 0.2968284984217646, "flos": 34749786407040.0, "grad_norm": 1.4403440175418074, "language_loss": 0.75601327, "learning_rate": 3.298474034352309e-06, "loss": 0.77738202, "num_input_tokens_seen": 106381805, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.49023438, "step": 4937, "time_per_iteration": 2.511199951171875 }, { "auxiliary_loss_clip": 0.01068746, "auxiliary_loss_mlp": 0.01052344, "balance_loss_clip": 1.01860785, "balance_loss_mlp": 1.02212822, "epoch": 0.2968886216744326, "flos": 21543016124160.0, "grad_norm": 1.742266291902911, "language_loss": 0.79571503, "learning_rate": 3.2981777905731526e-06, "loss": 0.81692594, "num_input_tokens_seen": 106402365, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.46484375, "step": 4938, "time_per_iteration": 2.437199831008911 }, { "auxiliary_loss_clip": 0.01075073, "auxiliary_loss_mlp": 0.0106454, "balance_loss_clip": 1.02920663, "balance_loss_mlp": 1.02450371, "epoch": 0.29694874492710055, "flos": 12786484936320.0, "grad_norm": 2.406652152552255, "language_loss": 0.7837491, "learning_rate": 3.297881497566964e-06, "loss": 0.80514526, "num_input_tokens_seen": 106419800, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.5078125, "step": 4939, "time_per_iteration": 2.376600503921509 }, { "auxiliary_loss_clip": 0.01076047, "auxiliary_loss_mlp": 0.01059316, "balance_loss_clip": 1.02498341, "balance_loss_mlp": 1.02439332, "epoch": 0.2970088681797685, "flos": 24568868770560.0, "grad_norm": 1.781395186859976, "language_loss": 0.80015892, "learning_rate": 3.297585155344979e-06, "loss": 0.82151258, "num_input_tokens_seen": 106440300, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.515625, "step": 4940, "time_per_iteration": 2.470235586166382 }, { "auxiliary_loss_clip": 0.01075344, "auxiliary_loss_mlp": 0.0105236, "balance_loss_clip": 1.01459444, "balance_loss_mlp": 1.02553821, "epoch": 0.2970689914324365, "flos": 23658054554880.0, "grad_norm": 1.5239158411174356, "language_loss": 0.76627851, "learning_rate": 3.297288763918435e-06, "loss": 0.78755552, "num_input_tokens_seen": 106460035, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.49804688, "step": 4941, "time_per_iteration": 2.4202511310577393 }, { "auxiliary_loss_clip": 0.01076294, "auxiliary_loss_mlp": 0.01054551, "balance_loss_clip": 1.01900256, "balance_loss_mlp": 1.02520919, "epoch": 0.29712911468510445, "flos": 39668909328000.0, "grad_norm": 2.467371886788019, "language_loss": 0.76374435, "learning_rate": 3.2969923232985712e-06, "loss": 0.78505284, "num_input_tokens_seen": 106481095, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.51171875, "step": 4942, "time_per_iteration": 2.568516492843628 }, { "auxiliary_loss_clip": 0.01077161, "auxiliary_loss_mlp": 0.01058445, "balance_loss_clip": 1.02258706, "balance_loss_mlp": 1.0264678, "epoch": 0.2971892379377724, "flos": 26394127983360.0, "grad_norm": 1.8201815378262778, "language_loss": 0.72306985, "learning_rate": 3.2966958334966287e-06, "loss": 0.74442589, "num_input_tokens_seen": 106501590, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.5078125, "step": 4943, "time_per_iteration": 2.4508590698242188 }, { "auxiliary_loss_clip": 0.01079083, "auxiliary_loss_mlp": 0.0104923, "balance_loss_clip": 1.01268005, "balance_loss_mlp": 1.02719212, "epoch": 0.2972493611904404, "flos": 17602229721600.0, "grad_norm": 1.932238403533561, "language_loss": 0.81977844, "learning_rate": 3.2963992945238497e-06, "loss": 0.84106153, "num_input_tokens_seen": 106519430, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.51953125, "step": 4944, "time_per_iteration": 2.413893222808838 }, { "auxiliary_loss_clip": 0.01071874, "auxiliary_loss_mlp": 0.01055656, "balance_loss_clip": 1.02413738, "balance_loss_mlp": 1.02408659, "epoch": 0.2973094844431084, "flos": 20411725472640.0, "grad_norm": 2.066699572052358, "language_loss": 0.84824955, "learning_rate": 3.2961027063914795e-06, "loss": 0.86952484, "num_input_tokens_seen": 106535870, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.47851562, "step": 4945, "time_per_iteration": 2.4090192317962646 }, { "auxiliary_loss_clip": 0.01074413, "auxiliary_loss_mlp": 0.01054625, "balance_loss_clip": 1.02298677, "balance_loss_mlp": 1.02712643, "epoch": 0.29736960769577636, "flos": 17492532629760.0, "grad_norm": 2.284242911992892, "language_loss": 0.68799412, "learning_rate": 3.2958060691107654e-06, "loss": 0.70928448, "num_input_tokens_seen": 106553560, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.47265625, "step": 4946, "time_per_iteration": 2.438011407852173 }, { "auxiliary_loss_clip": 0.01075999, "auxiliary_loss_mlp": 0.01049656, "balance_loss_clip": 1.01692104, "balance_loss_mlp": 1.02659726, "epoch": 0.2974297309484443, "flos": 26102777754240.0, "grad_norm": 1.7237954190274103, "language_loss": 0.75457019, "learning_rate": 3.2955093826929547e-06, "loss": 0.77582675, "num_input_tokens_seen": 106574115, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.49414062, "step": 4947, "time_per_iteration": 2.4563536643981934 }, { "auxiliary_loss_clip": 0.01076242, "auxiliary_loss_mlp": 0.01058754, "balance_loss_clip": 1.02523255, "balance_loss_mlp": 1.02560616, "epoch": 0.2974898542011123, "flos": 25665246195840.0, "grad_norm": 2.8018723218448933, "language_loss": 0.74262404, "learning_rate": 3.2952126471492985e-06, "loss": 0.76397395, "num_input_tokens_seen": 106593070, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.5078125, "step": 4948, "time_per_iteration": 2.487300157546997 }, { "auxiliary_loss_clip": 0.01070975, "auxiliary_loss_mlp": 0.0104876, "balance_loss_clip": 1.0172168, "balance_loss_mlp": 1.02363586, "epoch": 0.29754997745378026, "flos": 18660342430080.0, "grad_norm": 2.6897353909676216, "language_loss": 0.84877837, "learning_rate": 3.2949158624910497e-06, "loss": 0.86997569, "num_input_tokens_seen": 106610695, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.47265625, "step": 4949, "time_per_iteration": 2.41959285736084 }, { "auxiliary_loss_clip": 0.01071619, "auxiliary_loss_mlp": 0.0105519, "balance_loss_clip": 1.02222812, "balance_loss_mlp": 1.02431333, "epoch": 0.2976101007064482, "flos": 22273468922880.0, "grad_norm": 2.5214183998609663, "language_loss": 0.7251277, "learning_rate": 3.2946190287294603e-06, "loss": 0.74639583, "num_input_tokens_seen": 106631300, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.47265625, "step": 4950, "time_per_iteration": 2.4777534008026123 }, { "auxiliary_loss_clip": 0.01070759, "auxiliary_loss_mlp": 0.01058955, "balance_loss_clip": 1.03028536, "balance_loss_mlp": 1.02473581, "epoch": 0.2976702239591162, "flos": 21944552204160.0, "grad_norm": 1.9246962046195801, "language_loss": 0.83728421, "learning_rate": 3.294322145875789e-06, "loss": 0.85858142, "num_input_tokens_seen": 106650065, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.4609375, "step": 4951, "time_per_iteration": 2.4023971557617188 }, { "auxiliary_loss_clip": 0.01070931, "auxiliary_loss_mlp": 0.01053651, "balance_loss_clip": 1.02234674, "balance_loss_mlp": 1.02250302, "epoch": 0.29773034721178415, "flos": 24636251427840.0, "grad_norm": 2.8126589290533865, "language_loss": 0.76597071, "learning_rate": 3.2940252139412912e-06, "loss": 0.78721642, "num_input_tokens_seen": 106668230, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.484375, "step": 4952, "time_per_iteration": 2.4573354721069336 }, { "auxiliary_loss_clip": 0.01071916, "auxiliary_loss_mlp": 0.01055633, "balance_loss_clip": 1.02249324, "balance_loss_mlp": 1.02409458, "epoch": 0.2977904704644521, "flos": 20556545258880.0, "grad_norm": 2.124694617560849, "language_loss": 0.85115862, "learning_rate": 3.293728232937228e-06, "loss": 0.87243414, "num_input_tokens_seen": 106687785, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.47851562, "step": 4953, "time_per_iteration": 2.4074764251708984 }, { "auxiliary_loss_clip": 0.01071434, "auxiliary_loss_mlp": 0.01056046, "balance_loss_clip": 1.0238595, "balance_loss_mlp": 1.02293324, "epoch": 0.2978505937171201, "flos": 18915452801280.0, "grad_norm": 2.1397316537029987, "language_loss": 0.75664568, "learning_rate": 3.2934312028748597e-06, "loss": 0.77792048, "num_input_tokens_seen": 106706875, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.484375, "step": 4954, "time_per_iteration": 2.4374098777770996 }, { "auxiliary_loss_clip": 0.0106787, "auxiliary_loss_mlp": 0.01051478, "balance_loss_clip": 1.02186668, "balance_loss_mlp": 1.02131677, "epoch": 0.29791071696978805, "flos": 19316744501760.0, "grad_norm": 1.7503918987079452, "language_loss": 0.77055359, "learning_rate": 3.293134123765452e-06, "loss": 0.79174709, "num_input_tokens_seen": 106725105, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.46484375, "step": 4955, "time_per_iteration": 2.3637735843658447 }, { "auxiliary_loss_clip": 0.01070725, "auxiliary_loss_mlp": 0.01050584, "balance_loss_clip": 1.01782525, "balance_loss_mlp": 1.02195513, "epoch": 0.297970840222456, "flos": 18805825532160.0, "grad_norm": 1.8067624729262326, "language_loss": 0.73442382, "learning_rate": 3.2928369956202684e-06, "loss": 0.75563693, "num_input_tokens_seen": 106744780, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.48828125, "step": 4956, "time_per_iteration": 2.428098678588867 }, { "auxiliary_loss_clip": 0.01072456, "auxiliary_loss_mlp": 0.01056615, "balance_loss_clip": 1.02250969, "balance_loss_mlp": 1.021088, "epoch": 0.298030963475124, "flos": 22851770549760.0, "grad_norm": 1.9053165860700283, "language_loss": 0.8054024, "learning_rate": 3.2925398184505754e-06, "loss": 0.82669312, "num_input_tokens_seen": 106764670, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.51171875, "step": 4957, "time_per_iteration": 2.4103972911834717 }, { "auxiliary_loss_clip": 0.0107012, "auxiliary_loss_mlp": 0.01053212, "balance_loss_clip": 1.02031076, "balance_loss_mlp": 1.02158082, "epoch": 0.298091086727792, "flos": 21867499100160.0, "grad_norm": 1.5783504801481614, "language_loss": 0.71423602, "learning_rate": 3.2922425922676437e-06, "loss": 0.73546934, "num_input_tokens_seen": 106783695, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.48632812, "step": 4958, "time_per_iteration": 3.8554458618164062 }, { "auxiliary_loss_clip": 0.01069486, "auxiliary_loss_mlp": 0.01055142, "balance_loss_clip": 1.02398074, "balance_loss_mlp": 1.02231836, "epoch": 0.29815120998045996, "flos": 21174054209280.0, "grad_norm": 1.5973760184141246, "language_loss": 0.79588437, "learning_rate": 3.291945317082743e-06, "loss": 0.81713068, "num_input_tokens_seen": 106803150, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.47070312, "step": 4959, "time_per_iteration": 2.397167444229126 }, { "auxiliary_loss_clip": 0.01067438, "auxiliary_loss_mlp": 0.01050118, "balance_loss_clip": 1.01752615, "balance_loss_mlp": 1.02023959, "epoch": 0.29821133323312793, "flos": 19895395242240.0, "grad_norm": 1.702436726490042, "language_loss": 0.80673039, "learning_rate": 3.291647992907147e-06, "loss": 0.82790595, "num_input_tokens_seen": 106820705, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.47265625, "step": 4960, "time_per_iteration": 2.4434754848480225 }, { "auxiliary_loss_clip": 0.01075742, "auxiliary_loss_mlp": 0.01053229, "balance_loss_clip": 1.01913464, "balance_loss_mlp": 1.02518713, "epoch": 0.2982714564857959, "flos": 12749930876160.0, "grad_norm": 2.2320374554604627, "language_loss": 0.75417262, "learning_rate": 3.291350619752129e-06, "loss": 0.77546239, "num_input_tokens_seen": 106837335, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.50390625, "step": 4961, "time_per_iteration": 2.4707937240600586 }, { "auxiliary_loss_clip": 0.01070778, "auxiliary_loss_mlp": 0.01045936, "balance_loss_clip": 1.01656294, "balance_loss_mlp": 1.02334881, "epoch": 0.29833157973846386, "flos": 22270850570880.0, "grad_norm": 1.847720614251277, "language_loss": 0.63334548, "learning_rate": 3.291053197628967e-06, "loss": 0.65451264, "num_input_tokens_seen": 106856250, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.47265625, "step": 4962, "time_per_iteration": 3.836911916732788 }, { "auxiliary_loss_clip": 0.01072426, "auxiliary_loss_mlp": 0.01058612, "balance_loss_clip": 1.0245893, "balance_loss_mlp": 1.02454448, "epoch": 0.2983917029911318, "flos": 15372222583680.0, "grad_norm": 1.6545135719105266, "language_loss": 0.84297562, "learning_rate": 3.2907557265489375e-06, "loss": 0.86428595, "num_input_tokens_seen": 106873370, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.47851562, "step": 4963, "time_per_iteration": 3.881481409072876 }, { "auxiliary_loss_clip": 0.0107235, "auxiliary_loss_mlp": 0.01053413, "balance_loss_clip": 1.02058232, "balance_loss_mlp": 1.02494872, "epoch": 0.2984518262437998, "flos": 15376726149120.0, "grad_norm": 2.314063471996047, "language_loss": 0.67699575, "learning_rate": 3.290458206523322e-06, "loss": 0.69825339, "num_input_tokens_seen": 106890330, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.47265625, "step": 4964, "time_per_iteration": 2.399296760559082 }, { "auxiliary_loss_clip": 0.01070137, "auxiliary_loss_mlp": 0.01045895, "balance_loss_clip": 1.01547289, "balance_loss_mlp": 1.02375841, "epoch": 0.29851194949646775, "flos": 18107632696320.0, "grad_norm": 1.8861872387344412, "language_loss": 0.73020732, "learning_rate": 3.2901606375634015e-06, "loss": 0.75136769, "num_input_tokens_seen": 106909190, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.46484375, "step": 4965, "time_per_iteration": 3.8145718574523926 }, { "auxiliary_loss_clip": 0.01074708, "auxiliary_loss_mlp": 0.01053322, "balance_loss_clip": 1.01996779, "balance_loss_mlp": 1.02543378, "epoch": 0.2985720727491357, "flos": 22017136654080.0, "grad_norm": 1.8354160542603977, "language_loss": 0.68304247, "learning_rate": 3.289863019680461e-06, "loss": 0.70432276, "num_input_tokens_seen": 106927825, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.49414062, "step": 4966, "time_per_iteration": 2.4712448120117188 }, { "auxiliary_loss_clip": 0.01073345, "auxiliary_loss_mlp": 0.01049772, "balance_loss_clip": 1.0182054, "balance_loss_mlp": 1.02518749, "epoch": 0.2986321960018037, "flos": 13040547966720.0, "grad_norm": 2.5614621596665446, "language_loss": 0.76271701, "learning_rate": 3.289565352885785e-06, "loss": 0.78394818, "num_input_tokens_seen": 106943155, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.48046875, "step": 4967, "time_per_iteration": 2.384584426879883 }, { "auxiliary_loss_clip": 0.01070709, "auxiliary_loss_mlp": 0.0105061, "balance_loss_clip": 1.02009249, "balance_loss_mlp": 1.02178764, "epoch": 0.29869231925447165, "flos": 14464166365440.0, "grad_norm": 2.676097966765139, "language_loss": 0.73000598, "learning_rate": 3.2892676371906614e-06, "loss": 0.75121915, "num_input_tokens_seen": 106960295, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.48828125, "step": 4968, "time_per_iteration": 2.407240867614746 }, { "auxiliary_loss_clip": 0.01072407, "auxiliary_loss_mlp": 0.01050544, "balance_loss_clip": 1.01723707, "balance_loss_mlp": 1.02302289, "epoch": 0.2987524425071396, "flos": 31648870604160.0, "grad_norm": 1.5827339973804933, "language_loss": 0.77694046, "learning_rate": 3.2889698726063805e-06, "loss": 0.79816997, "num_input_tokens_seen": 106982870, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.49414062, "step": 4969, "time_per_iteration": 2.4957313537597656 }, { "auxiliary_loss_clip": 0.01070591, "auxiliary_loss_mlp": 0.01048574, "balance_loss_clip": 1.01772285, "balance_loss_mlp": 1.02348757, "epoch": 0.2988125657598076, "flos": 21432376425600.0, "grad_norm": 1.9583038821929744, "language_loss": 0.71621495, "learning_rate": 3.2886720591442327e-06, "loss": 0.73740661, "num_input_tokens_seen": 107002405, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.47070312, "step": 4970, "time_per_iteration": 2.4234254360198975 }, { "auxiliary_loss_clip": 0.01072487, "auxiliary_loss_mlp": 0.01058459, "balance_loss_clip": 1.02152777, "balance_loss_mlp": 1.02125371, "epoch": 0.2988726890124756, "flos": 18076001137920.0, "grad_norm": 2.2627687261326423, "language_loss": 0.86775655, "learning_rate": 3.2883741968155103e-06, "loss": 0.88906598, "num_input_tokens_seen": 107017310, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.51171875, "step": 4971, "time_per_iteration": 2.3889262676239014 }, { "auxiliary_loss_clip": 0.01069964, "auxiliary_loss_mlp": 0.01053548, "balance_loss_clip": 1.02105165, "balance_loss_mlp": 1.02301002, "epoch": 0.29893281226514357, "flos": 21754764720000.0, "grad_norm": 1.8828492890845896, "language_loss": 0.80582368, "learning_rate": 3.2880762856315107e-06, "loss": 0.82705879, "num_input_tokens_seen": 107034645, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.46875, "step": 4972, "time_per_iteration": 2.4405760765075684 }, { "auxiliary_loss_clip": 0.01069854, "auxiliary_loss_mlp": 0.01061361, "balance_loss_clip": 1.02683806, "balance_loss_mlp": 1.02099657, "epoch": 0.29899293551781153, "flos": 16835781444480.0, "grad_norm": 1.926791096935694, "language_loss": 0.87145305, "learning_rate": 3.2877783256035285e-06, "loss": 0.89276516, "num_input_tokens_seen": 107051125, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.48828125, "step": 4973, "time_per_iteration": 2.3882272243499756 }, { "auxiliary_loss_clip": 0.01065729, "auxiliary_loss_mlp": 0.01052008, "balance_loss_clip": 1.02115655, "balance_loss_mlp": 1.02044177, "epoch": 0.2990530587704795, "flos": 11728407139200.0, "grad_norm": 1.727083796313815, "language_loss": 0.78885752, "learning_rate": 3.287480316742863e-06, "loss": 0.81003493, "num_input_tokens_seen": 107068815, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.453125, "step": 4974, "time_per_iteration": 2.359952211380005 }, { "auxiliary_loss_clip": 0.01070999, "auxiliary_loss_mlp": 0.01056481, "balance_loss_clip": 1.02200532, "balance_loss_mlp": 1.02192521, "epoch": 0.29911318202314746, "flos": 28038571931520.0, "grad_norm": 1.9250078280834568, "language_loss": 0.73424935, "learning_rate": 3.287182259060815e-06, "loss": 0.75552416, "num_input_tokens_seen": 107090420, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.4921875, "step": 4975, "time_per_iteration": 2.5221619606018066 }, { "auxiliary_loss_clip": 0.01070492, "auxiliary_loss_mlp": 0.0105347, "balance_loss_clip": 1.01787388, "balance_loss_mlp": 1.02192378, "epoch": 0.2991733052758154, "flos": 18732577766400.0, "grad_norm": 4.896063580601903, "language_loss": 0.77738786, "learning_rate": 3.286884152568687e-06, "loss": 0.7986275, "num_input_tokens_seen": 107107255, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.484375, "step": 4976, "time_per_iteration": 2.3710060119628906 }, { "auxiliary_loss_clip": 0.01070177, "auxiliary_loss_mlp": 0.01057715, "balance_loss_clip": 1.0227387, "balance_loss_mlp": 1.02189255, "epoch": 0.2992334285284834, "flos": 15558274552320.0, "grad_norm": 2.1438299890149204, "language_loss": 0.88398516, "learning_rate": 3.2865859972777827e-06, "loss": 0.90526414, "num_input_tokens_seen": 107123840, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.48242188, "step": 4977, "time_per_iteration": 2.3986966609954834 }, { "auxiliary_loss_clip": 0.01072196, "auxiliary_loss_mlp": 0.01054887, "balance_loss_clip": 1.01960087, "balance_loss_mlp": 1.02297258, "epoch": 0.29929355178115136, "flos": 21796520572800.0, "grad_norm": 1.7746852726364324, "language_loss": 0.70047677, "learning_rate": 3.2862877931994088e-06, "loss": 0.72174764, "num_input_tokens_seen": 107143475, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.4921875, "step": 4978, "time_per_iteration": 2.4016833305358887 }, { "auxiliary_loss_clip": 0.01070915, "auxiliary_loss_mlp": 0.01053662, "balance_loss_clip": 1.01916242, "balance_loss_mlp": 1.02274847, "epoch": 0.2993536750338193, "flos": 21177475522560.0, "grad_norm": 2.797051485670118, "language_loss": 0.7871955, "learning_rate": 3.2859895403448726e-06, "loss": 0.80844128, "num_input_tokens_seen": 107161725, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.48242188, "step": 4979, "time_per_iteration": 2.45940899848938 }, { "auxiliary_loss_clip": 0.01070951, "auxiliary_loss_mlp": 0.01059142, "balance_loss_clip": 1.02161455, "balance_loss_mlp": 1.02116251, "epoch": 0.2994137982864873, "flos": 32120826629760.0, "grad_norm": 1.5988488402270855, "language_loss": 0.69468606, "learning_rate": 3.285691238725484e-06, "loss": 0.71598697, "num_input_tokens_seen": 107183935, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.49609375, "step": 4980, "time_per_iteration": 2.483534336090088 }, { "auxiliary_loss_clip": 0.01070375, "auxiliary_loss_mlp": 0.01053733, "balance_loss_clip": 1.02240443, "balance_loss_mlp": 1.02307498, "epoch": 0.29947392153915525, "flos": 21104367402240.0, "grad_norm": 1.8704588607237216, "language_loss": 0.75134361, "learning_rate": 3.285392888352555e-06, "loss": 0.77258468, "num_input_tokens_seen": 107204285, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.47265625, "step": 4981, "time_per_iteration": 2.4821488857269287 }, { "auxiliary_loss_clip": 0.01072147, "auxiliary_loss_mlp": 0.01056131, "balance_loss_clip": 1.02010632, "balance_loss_mlp": 1.02120996, "epoch": 0.2995340447918232, "flos": 21541584758400.0, "grad_norm": 1.9686395990041081, "language_loss": 0.87548542, "learning_rate": 3.2850944892373987e-06, "loss": 0.89676821, "num_input_tokens_seen": 107225265, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.51171875, "step": 4982, "time_per_iteration": 2.4308412075042725 }, { "auxiliary_loss_clip": 0.01074508, "auxiliary_loss_mlp": 0.01055561, "balance_loss_clip": 1.01796293, "balance_loss_mlp": 1.02260244, "epoch": 0.2995941680444912, "flos": 16724268961920.0, "grad_norm": 2.3877687241719197, "language_loss": 0.8806144, "learning_rate": 3.2847960413913307e-06, "loss": 0.90191519, "num_input_tokens_seen": 107241335, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.51953125, "step": 4983, "time_per_iteration": 2.398627519607544 }, { "auxiliary_loss_clip": 0.01073647, "auxiliary_loss_mlp": 0.01057009, "balance_loss_clip": 1.02117491, "balance_loss_mlp": 1.02402949, "epoch": 0.2996542912971592, "flos": 20922434974080.0, "grad_norm": 1.9631790288429682, "language_loss": 0.80006826, "learning_rate": 3.284497544825668e-06, "loss": 0.82137477, "num_input_tokens_seen": 107259375, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.49609375, "step": 4984, "time_per_iteration": 2.4025232791900635 }, { "auxiliary_loss_clip": 0.01073086, "auxiliary_loss_mlp": 0.01056819, "balance_loss_clip": 1.02115202, "balance_loss_mlp": 1.02326334, "epoch": 0.29971441454982717, "flos": 25078775310720.0, "grad_norm": 1.6385017071426284, "language_loss": 0.79836261, "learning_rate": 3.2841989995517303e-06, "loss": 0.81966168, "num_input_tokens_seen": 107279890, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.49804688, "step": 4985, "time_per_iteration": 2.473898410797119 }, { "auxiliary_loss_clip": 0.01075786, "auxiliary_loss_mlp": 0.01059359, "balance_loss_clip": 1.01930475, "balance_loss_mlp": 1.0233469, "epoch": 0.29977453780249513, "flos": 52553989543680.0, "grad_norm": 2.358454945663201, "language_loss": 0.72462744, "learning_rate": 3.283900405580837e-06, "loss": 0.74597889, "num_input_tokens_seen": 107303430, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.5234375, "step": 4986, "time_per_iteration": 2.685598134994507 }, { "auxiliary_loss_clip": 0.01075076, "auxiliary_loss_mlp": 0.01061746, "balance_loss_clip": 1.02085686, "balance_loss_mlp": 1.02254725, "epoch": 0.2998346610551631, "flos": 22236042078720.0, "grad_norm": 2.8651023241825784, "language_loss": 0.75672078, "learning_rate": 3.283601762924312e-06, "loss": 0.77808905, "num_input_tokens_seen": 107323700, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 0.5234375, "step": 4987, "time_per_iteration": 2.4541521072387695 }, { "auxiliary_loss_clip": 0.0106996, "auxiliary_loss_mlp": 0.0105109, "balance_loss_clip": 1.01768756, "balance_loss_mlp": 1.02114773, "epoch": 0.29989478430783106, "flos": 16872265681920.0, "grad_norm": 1.7554611970159923, "language_loss": 0.81561637, "learning_rate": 3.2833030715934793e-06, "loss": 0.83682692, "num_input_tokens_seen": 107341965, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.48828125, "step": 4988, "time_per_iteration": 2.3813319206237793 }, { "auxiliary_loss_clip": 0.01071441, "auxiliary_loss_mlp": 0.01057068, "balance_loss_clip": 1.01934981, "balance_loss_mlp": 1.0215286, "epoch": 0.29995490756049903, "flos": 23767751646720.0, "grad_norm": 1.5836308066415319, "language_loss": 0.71791768, "learning_rate": 3.2830043315996658e-06, "loss": 0.73920274, "num_input_tokens_seen": 107362615, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.5, "step": 4989, "time_per_iteration": 2.4964566230773926 }, { "auxiliary_loss_clip": 0.01076071, "auxiliary_loss_mlp": 0.01062099, "balance_loss_clip": 1.022331, "balance_loss_mlp": 1.02337372, "epoch": 0.300015030813167, "flos": 14464445656320.0, "grad_norm": 1.7400781640343912, "language_loss": 0.86931324, "learning_rate": 3.282705542954199e-06, "loss": 0.89069492, "num_input_tokens_seen": 107378980, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.52734375, "step": 4990, "time_per_iteration": 2.3797109127044678 }, { "auxiliary_loss_clip": 0.01074468, "auxiliary_loss_mlp": 0.01060809, "balance_loss_clip": 1.02123189, "balance_loss_mlp": 1.02227974, "epoch": 0.30007515406583496, "flos": 25190811463680.0, "grad_norm": 1.621467583866376, "language_loss": 0.67898327, "learning_rate": 3.28240670566841e-06, "loss": 0.70033598, "num_input_tokens_seen": 107397640, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.5234375, "step": 4991, "time_per_iteration": 2.4474854469299316 }, { "auxiliary_loss_clip": 0.01073735, "auxiliary_loss_mlp": 0.01061384, "balance_loss_clip": 1.01939893, "balance_loss_mlp": 1.02141237, "epoch": 0.3001352773185029, "flos": 19390166824320.0, "grad_norm": 3.0283873455692163, "language_loss": 0.80183244, "learning_rate": 3.28210781975363e-06, "loss": 0.82318366, "num_input_tokens_seen": 107416020, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 0.5234375, "step": 4992, "time_per_iteration": 2.3969945907592773 }, { "auxiliary_loss_clip": 0.01068551, "auxiliary_loss_mlp": 0.01056611, "balance_loss_clip": 1.02037144, "balance_loss_mlp": 1.01982594, "epoch": 0.3001954005711709, "flos": 21542771744640.0, "grad_norm": 4.024402022120727, "language_loss": 0.83832443, "learning_rate": 3.281808885221193e-06, "loss": 0.85957605, "num_input_tokens_seen": 107436340, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.48828125, "step": 4993, "time_per_iteration": 2.4585187435150146 }, { "auxiliary_loss_clip": 0.0107408, "auxiliary_loss_mlp": 0.01058799, "balance_loss_clip": 1.01822042, "balance_loss_mlp": 1.02091491, "epoch": 0.30025552382383885, "flos": 17383359208320.0, "grad_norm": 2.155100777074444, "language_loss": 0.88815671, "learning_rate": 3.2815099020824345e-06, "loss": 0.90948552, "num_input_tokens_seen": 107454585, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.53125, "step": 4994, "time_per_iteration": 2.3689606189727783 }, { "auxiliary_loss_clip": 0.01072694, "auxiliary_loss_mlp": 0.01057084, "balance_loss_clip": 1.01960468, "balance_loss_mlp": 1.02266192, "epoch": 0.3003156470765068, "flos": 29532051694080.0, "grad_norm": 1.6351317122098659, "language_loss": 0.8264932, "learning_rate": 3.2812108703486924e-06, "loss": 0.84779096, "num_input_tokens_seen": 107477180, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5, "step": 4995, "time_per_iteration": 2.521209478378296 }, { "auxiliary_loss_clip": 0.01069759, "auxiliary_loss_mlp": 0.0105258, "balance_loss_clip": 1.01858115, "balance_loss_mlp": 1.02160144, "epoch": 0.3003757703291748, "flos": 43644923159040.0, "grad_norm": 2.131256717702012, "language_loss": 0.68424606, "learning_rate": 3.2809117900313055e-06, "loss": 0.70546949, "num_input_tokens_seen": 107500250, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.48242188, "step": 4996, "time_per_iteration": 2.5656886100769043 }, { "auxiliary_loss_clip": 0.01071241, "auxiliary_loss_mlp": 0.01058361, "balance_loss_clip": 1.02324247, "balance_loss_mlp": 1.02123153, "epoch": 0.30043589358184275, "flos": 22527287573760.0, "grad_norm": 1.9247347990659436, "language_loss": 0.77205473, "learning_rate": 3.280612661141615e-06, "loss": 0.79335082, "num_input_tokens_seen": 107520070, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.5, "step": 4997, "time_per_iteration": 2.465752124786377 }, { "auxiliary_loss_clip": 0.01067637, "auxiliary_loss_mlp": 0.01055995, "balance_loss_clip": 1.01942182, "balance_loss_mlp": 1.01969564, "epoch": 0.30049601683451077, "flos": 20994844867200.0, "grad_norm": 1.9032760289841577, "language_loss": 0.79370332, "learning_rate": 3.2803134836909646e-06, "loss": 0.81493962, "num_input_tokens_seen": 107539285, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.48046875, "step": 4998, "time_per_iteration": 3.8388283252716064 }, { "auxiliary_loss_clip": 0.01068422, "auxiliary_loss_mlp": 0.01049412, "balance_loss_clip": 1.01400685, "balance_loss_mlp": 1.02086139, "epoch": 0.30055614008717874, "flos": 23914840671360.0, "grad_norm": 1.9403560870740555, "language_loss": 0.74785626, "learning_rate": 3.2800142576906985e-06, "loss": 0.76903462, "num_input_tokens_seen": 107560260, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.4765625, "step": 4999, "time_per_iteration": 2.4379467964172363 }, { "auxiliary_loss_clip": 0.0107218, "auxiliary_loss_mlp": 0.01062678, "balance_loss_clip": 1.02639103, "balance_loss_mlp": 1.02259958, "epoch": 0.3006162633398467, "flos": 19168852515840.0, "grad_norm": 1.9315622015446479, "language_loss": 0.76540279, "learning_rate": 3.2797149831521626e-06, "loss": 0.78675139, "num_input_tokens_seen": 107579260, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.49609375, "step": 5000, "time_per_iteration": 2.4575068950653076 }, { "auxiliary_loss_clip": 0.01069791, "auxiliary_loss_mlp": 0.01061497, "balance_loss_clip": 1.02410078, "balance_loss_mlp": 1.02254653, "epoch": 0.30067638659251467, "flos": 14678498401920.0, "grad_norm": 1.8987753717566525, "language_loss": 0.83056575, "learning_rate": 3.2794156600867073e-06, "loss": 0.85187864, "num_input_tokens_seen": 107595245, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.47265625, "step": 5001, "time_per_iteration": 3.830817461013794 }, { "auxiliary_loss_clip": 0.01071699, "auxiliary_loss_mlp": 0.01058326, "balance_loss_clip": 1.02318335, "balance_loss_mlp": 1.02284443, "epoch": 0.30073650984518263, "flos": 23366878882560.0, "grad_norm": 1.7198324248108177, "language_loss": 0.82245636, "learning_rate": 3.2791162885056815e-06, "loss": 0.84375656, "num_input_tokens_seen": 107613985, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.48828125, "step": 5002, "time_per_iteration": 2.4497947692871094 }, { "auxiliary_loss_clip": 0.01074089, "auxiliary_loss_mlp": 0.0105154, "balance_loss_clip": 1.01646924, "balance_loss_mlp": 1.02275467, "epoch": 0.3007966330978506, "flos": 22965517359360.0, "grad_norm": 1.8434757817325793, "language_loss": 0.72634661, "learning_rate": 3.2788168684204376e-06, "loss": 0.74760294, "num_input_tokens_seen": 107631435, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.51171875, "step": 5003, "time_per_iteration": 3.924666166305542 }, { "auxiliary_loss_clip": 0.01076269, "auxiliary_loss_mlp": 0.01059511, "balance_loss_clip": 1.02277112, "balance_loss_mlp": 1.02420783, "epoch": 0.30085675635051856, "flos": 27817222711680.0, "grad_norm": 1.7948488417906276, "language_loss": 0.71229869, "learning_rate": 3.27851739984233e-06, "loss": 0.73365653, "num_input_tokens_seen": 107650530, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.51953125, "step": 5004, "time_per_iteration": 2.4612302780151367 }, { "auxiliary_loss_clip": 0.01075571, "auxiliary_loss_mlp": 0.01052794, "balance_loss_clip": 1.01540971, "balance_loss_mlp": 1.02344823, "epoch": 0.3009168796031865, "flos": 10882147760640.0, "grad_norm": 3.2391286332799862, "language_loss": 0.84085768, "learning_rate": 3.278217882782715e-06, "loss": 0.86214137, "num_input_tokens_seen": 107662240, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.51953125, "step": 5005, "time_per_iteration": 3.7985782623291016 }, { "auxiliary_loss_clip": 0.01073985, "auxiliary_loss_mlp": 0.01049711, "balance_loss_clip": 1.01502085, "balance_loss_mlp": 1.02460074, "epoch": 0.3009770028558545, "flos": 23804270795520.0, "grad_norm": 2.4471818945790846, "language_loss": 0.76760125, "learning_rate": 3.2779183172529497e-06, "loss": 0.78883815, "num_input_tokens_seen": 107680330, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.49414062, "step": 5006, "time_per_iteration": 2.424163341522217 }, { "auxiliary_loss_clip": 0.01070841, "auxiliary_loss_mlp": 0.01052054, "balance_loss_clip": 1.01848495, "balance_loss_mlp": 1.02253604, "epoch": 0.30103712610852246, "flos": 26467026635520.0, "grad_norm": 1.891280718473957, "language_loss": 0.72389209, "learning_rate": 3.2776187032643932e-06, "loss": 0.745121, "num_input_tokens_seen": 107700020, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.48242188, "step": 5007, "time_per_iteration": 2.4873249530792236 }, { "auxiliary_loss_clip": 0.01072324, "auxiliary_loss_mlp": 0.01058209, "balance_loss_clip": 1.01872706, "balance_loss_mlp": 1.02177703, "epoch": 0.3010972493611904, "flos": 22855366419840.0, "grad_norm": 2.2192495146268216, "language_loss": 0.77896988, "learning_rate": 3.2773190408284075e-06, "loss": 0.80027521, "num_input_tokens_seen": 107718575, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.50390625, "step": 5008, "time_per_iteration": 2.398043155670166 }, { "auxiliary_loss_clip": 0.01070718, "auxiliary_loss_mlp": 0.01050957, "balance_loss_clip": 1.01531386, "balance_loss_mlp": 1.02096844, "epoch": 0.3011573726138584, "flos": 24052748008320.0, "grad_norm": 1.8551053554044001, "language_loss": 0.85963774, "learning_rate": 3.2770193299563564e-06, "loss": 0.88085449, "num_input_tokens_seen": 107738635, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.49609375, "step": 5009, "time_per_iteration": 2.4412825107574463 }, { "auxiliary_loss_clip": 0.01074611, "auxiliary_loss_mlp": 0.01059544, "balance_loss_clip": 1.01796412, "balance_loss_mlp": 1.02205706, "epoch": 0.30121749586652635, "flos": 20258841162240.0, "grad_norm": 1.7982037868409169, "language_loss": 0.84772271, "learning_rate": 3.276719570659604e-06, "loss": 0.86906421, "num_input_tokens_seen": 107753415, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.52734375, "step": 5010, "time_per_iteration": 2.3783535957336426 }, { "auxiliary_loss_clip": 0.01070797, "auxiliary_loss_mlp": 0.01046258, "balance_loss_clip": 1.01292694, "balance_loss_mlp": 1.0211823, "epoch": 0.3012776191191944, "flos": 26941845392640.0, "grad_norm": 2.3060467536658984, "language_loss": 0.86656737, "learning_rate": 3.2764197629495176e-06, "loss": 0.88773787, "num_input_tokens_seen": 107773840, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.49609375, "step": 5011, "time_per_iteration": 2.4486145973205566 }, { "auxiliary_loss_clip": 0.0107105, "auxiliary_loss_mlp": 0.01059318, "balance_loss_clip": 1.02107537, "balance_loss_mlp": 1.02003503, "epoch": 0.30133774237186234, "flos": 20411271624960.0, "grad_norm": 13.498548741618645, "language_loss": 0.74503267, "learning_rate": 3.2761199068374656e-06, "loss": 0.76633632, "num_input_tokens_seen": 107792020, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.5078125, "step": 5012, "time_per_iteration": 2.388587236404419 }, { "auxiliary_loss_clip": 0.01070918, "auxiliary_loss_mlp": 0.01052187, "balance_loss_clip": 1.01687765, "balance_loss_mlp": 1.02098131, "epoch": 0.3013978656245303, "flos": 19791423613440.0, "grad_norm": 2.2749833932253796, "language_loss": 0.88796914, "learning_rate": 3.275820002334819e-06, "loss": 0.90920019, "num_input_tokens_seen": 107809595, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.5, "step": 5013, "time_per_iteration": 2.409979820251465 }, { "auxiliary_loss_clip": 0.01072762, "auxiliary_loss_mlp": 0.01055079, "balance_loss_clip": 1.01533484, "balance_loss_mlp": 1.02091074, "epoch": 0.30145798887719827, "flos": 16248821800320.0, "grad_norm": 2.2644869403148187, "language_loss": 0.84417903, "learning_rate": 3.2755200494529496e-06, "loss": 0.86545742, "num_input_tokens_seen": 107827230, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.51953125, "step": 5014, "time_per_iteration": 2.3569135665893555 }, { "auxiliary_loss_clip": 0.01067606, "auxiliary_loss_mlp": 0.01048252, "balance_loss_clip": 1.01513577, "balance_loss_mlp": 1.02007318, "epoch": 0.30151811212986623, "flos": 24570579427200.0, "grad_norm": 2.080932272673734, "language_loss": 0.69374627, "learning_rate": 3.2752200482032323e-06, "loss": 0.7149049, "num_input_tokens_seen": 107847195, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.4765625, "step": 5015, "time_per_iteration": 2.4217259883880615 }, { "auxiliary_loss_clip": 0.01071265, "auxiliary_loss_mlp": 0.0105528, "balance_loss_clip": 1.01916003, "balance_loss_mlp": 1.02148819, "epoch": 0.3015782353825342, "flos": 21870990236160.0, "grad_norm": 3.2721365748174454, "language_loss": 0.76518071, "learning_rate": 3.2749199985970436e-06, "loss": 0.78644609, "num_input_tokens_seen": 107866420, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.49804688, "step": 5016, "time_per_iteration": 2.3795278072357178 }, { "auxiliary_loss_clip": 0.01074172, "auxiliary_loss_mlp": 0.01053435, "balance_loss_clip": 1.01614654, "balance_loss_mlp": 1.021909, "epoch": 0.30163835863520216, "flos": 28768012300800.0, "grad_norm": 1.4824433005868674, "language_loss": 0.66731477, "learning_rate": 3.2746199006457603e-06, "loss": 0.68859088, "num_input_tokens_seen": 107889090, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.5234375, "step": 5017, "time_per_iteration": 2.4685685634613037 }, { "auxiliary_loss_clip": 0.01072748, "auxiliary_loss_mlp": 0.01054745, "balance_loss_clip": 1.01851702, "balance_loss_mlp": 1.02261543, "epoch": 0.30169848188787013, "flos": 22965098423040.0, "grad_norm": 1.9672749448913653, "language_loss": 0.69854927, "learning_rate": 3.2743197543607628e-06, "loss": 0.71982419, "num_input_tokens_seen": 107907520, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.5, "step": 5018, "time_per_iteration": 2.3909614086151123 }, { "auxiliary_loss_clip": 0.01065947, "auxiliary_loss_mlp": 0.01049574, "balance_loss_clip": 1.01629031, "balance_loss_mlp": 1.0193696, "epoch": 0.3017586051405381, "flos": 21834191796480.0, "grad_norm": 2.3782958519125157, "language_loss": 0.80409706, "learning_rate": 3.2740195597534327e-06, "loss": 0.82525229, "num_input_tokens_seen": 107925650, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.46484375, "step": 5019, "time_per_iteration": 2.4048335552215576 }, { "auxiliary_loss_clip": 0.0107328, "auxiliary_loss_mlp": 0.01057728, "balance_loss_clip": 1.02158403, "balance_loss_mlp": 1.02238071, "epoch": 0.30181872839320606, "flos": 22159407911040.0, "grad_norm": 2.3871032807140797, "language_loss": 0.71315897, "learning_rate": 3.2737193168351527e-06, "loss": 0.73446906, "num_input_tokens_seen": 107943975, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.5078125, "step": 5020, "time_per_iteration": 2.4019789695739746 }, { "auxiliary_loss_clip": 0.01072526, "auxiliary_loss_mlp": 0.01060014, "balance_loss_clip": 1.02224827, "balance_loss_mlp": 1.02111149, "epoch": 0.301878851645874, "flos": 18113183602560.0, "grad_norm": 2.974096005308293, "language_loss": 0.80062562, "learning_rate": 3.2734190256173085e-06, "loss": 0.82195103, "num_input_tokens_seen": 107962950, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.515625, "step": 5021, "time_per_iteration": 2.4390056133270264 }, { "auxiliary_loss_clip": 0.01069315, "auxiliary_loss_mlp": 0.01047679, "balance_loss_clip": 1.01503992, "balance_loss_mlp": 1.01974678, "epoch": 0.301938974898542, "flos": 17601287114880.0, "grad_norm": 2.412480616839102, "language_loss": 0.78158182, "learning_rate": 3.2731186861112877e-06, "loss": 0.80275178, "num_input_tokens_seen": 107979700, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.49609375, "step": 5022, "time_per_iteration": 2.384329080581665 }, { "auxiliary_loss_clip": 0.01072447, "auxiliary_loss_mlp": 0.0105802, "balance_loss_clip": 1.02354503, "balance_loss_mlp": 1.02162516, "epoch": 0.30199909815120995, "flos": 11180445350400.0, "grad_norm": 2.0200771007926166, "language_loss": 0.70740914, "learning_rate": 3.2728182983284793e-06, "loss": 0.72871375, "num_input_tokens_seen": 107996645, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.5078125, "step": 5023, "time_per_iteration": 2.4352095127105713 }, { "auxiliary_loss_clip": 0.01073903, "auxiliary_loss_mlp": 0.01048124, "balance_loss_clip": 1.01343393, "balance_loss_mlp": 1.02094996, "epoch": 0.302059221403878, "flos": 21906776246400.0, "grad_norm": 1.9499243375798454, "language_loss": 0.72858381, "learning_rate": 3.2725178622802724e-06, "loss": 0.74980414, "num_input_tokens_seen": 108015020, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.53125, "step": 5024, "time_per_iteration": 2.4216361045837402 }, { "auxiliary_loss_clip": 0.01070588, "auxiliary_loss_mlp": 0.01054208, "balance_loss_clip": 1.0203048, "balance_loss_mlp": 1.02161658, "epoch": 0.30211934465654594, "flos": 26395175324160.0, "grad_norm": 2.430010377597016, "language_loss": 0.75743711, "learning_rate": 3.272217377978061e-06, "loss": 0.77868509, "num_input_tokens_seen": 108036430, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.49023438, "step": 5025, "time_per_iteration": 2.428990602493286 }, { "auxiliary_loss_clip": 0.01068341, "auxiliary_loss_mlp": 0.01051153, "balance_loss_clip": 1.02025366, "balance_loss_mlp": 1.02086377, "epoch": 0.3021794679092139, "flos": 23399453047680.0, "grad_norm": 1.777364022205478, "language_loss": 0.69215989, "learning_rate": 3.2719168454332387e-06, "loss": 0.71335483, "num_input_tokens_seen": 108054250, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.47460938, "step": 5026, "time_per_iteration": 2.40132737159729 }, { "auxiliary_loss_clip": 0.01072544, "auxiliary_loss_mlp": 0.01056645, "balance_loss_clip": 1.02197945, "balance_loss_mlp": 1.02269721, "epoch": 0.30223959116188187, "flos": 20260097971200.0, "grad_norm": 1.7160036824085434, "language_loss": 0.86365914, "learning_rate": 3.2716162646572034e-06, "loss": 0.88495106, "num_input_tokens_seen": 108071495, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.5, "step": 5027, "time_per_iteration": 2.38614821434021 }, { "auxiliary_loss_clip": 0.01069316, "auxiliary_loss_mlp": 0.01054328, "balance_loss_clip": 1.0226903, "balance_loss_mlp": 1.02046561, "epoch": 0.30229971441454984, "flos": 26686630287360.0, "grad_norm": 1.5444462044727607, "language_loss": 0.79634988, "learning_rate": 3.271315635661351e-06, "loss": 0.8175863, "num_input_tokens_seen": 108092135, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.48828125, "step": 5028, "time_per_iteration": 2.458916425704956 }, { "auxiliary_loss_clip": 0.01073438, "auxiliary_loss_mlp": 0.01053293, "balance_loss_clip": 1.01905608, "balance_loss_mlp": 1.02336597, "epoch": 0.3023598376672178, "flos": 34344026052480.0, "grad_norm": 1.9291470387018181, "language_loss": 0.78513908, "learning_rate": 3.2710149584570826e-06, "loss": 0.80640638, "num_input_tokens_seen": 108112945, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.5, "step": 5029, "time_per_iteration": 2.485807418823242 }, { "auxiliary_loss_clip": 0.01072876, "auxiliary_loss_mlp": 0.01051433, "balance_loss_clip": 1.01514554, "balance_loss_mlp": 1.02154922, "epoch": 0.30241996091988577, "flos": 23111035372800.0, "grad_norm": 2.9419182933677774, "language_loss": 0.83583832, "learning_rate": 3.2707142330557993e-06, "loss": 0.85708141, "num_input_tokens_seen": 108130325, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.51171875, "step": 5030, "time_per_iteration": 2.4216108322143555 }, { "auxiliary_loss_clip": 0.01073543, "auxiliary_loss_mlp": 0.01062478, "balance_loss_clip": 1.02471292, "balance_loss_mlp": 1.0218451, "epoch": 0.30248008417255373, "flos": 19389014749440.0, "grad_norm": 6.21071449082839, "language_loss": 0.72012389, "learning_rate": 3.270413459468905e-06, "loss": 0.74148405, "num_input_tokens_seen": 108150300, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.515625, "step": 5031, "time_per_iteration": 2.3858802318573 }, { "auxiliary_loss_clip": 0.0107122, "auxiliary_loss_mlp": 0.01052091, "balance_loss_clip": 1.01771128, "balance_loss_mlp": 1.02133274, "epoch": 0.3025402074252217, "flos": 23768554608000.0, "grad_norm": 1.870359270374137, "language_loss": 0.8336134, "learning_rate": 3.2701126377078047e-06, "loss": 0.85484648, "num_input_tokens_seen": 108170330, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.49804688, "step": 5032, "time_per_iteration": 2.4135398864746094 }, { "auxiliary_loss_clip": 0.01077199, "auxiliary_loss_mlp": 0.01060034, "balance_loss_clip": 1.02055156, "balance_loss_mlp": 1.0236572, "epoch": 0.30260033067788966, "flos": 25992941016960.0, "grad_norm": 2.4473277198238597, "language_loss": 0.75224626, "learning_rate": 3.269811767783906e-06, "loss": 0.77361858, "num_input_tokens_seen": 108191265, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.53515625, "step": 5033, "time_per_iteration": 2.410855770111084 }, { "auxiliary_loss_clip": 0.01069713, "auxiliary_loss_mlp": 0.01053028, "balance_loss_clip": 1.01790953, "balance_loss_mlp": 1.02055264, "epoch": 0.3026604539305576, "flos": 25373372296320.0, "grad_norm": 1.7506073963392839, "language_loss": 0.75641882, "learning_rate": 3.2695108497086185e-06, "loss": 0.77764618, "num_input_tokens_seen": 108211615, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.4921875, "step": 5034, "time_per_iteration": 2.4397478103637695 }, { "auxiliary_loss_clip": 0.01069312, "auxiliary_loss_mlp": 0.01050791, "balance_loss_clip": 1.01490927, "balance_loss_mlp": 1.02006042, "epoch": 0.3027205771832256, "flos": 25811532259200.0, "grad_norm": 14.16051426589058, "language_loss": 0.7401821, "learning_rate": 3.269209883493352e-06, "loss": 0.76138318, "num_input_tokens_seen": 108231080, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.4921875, "step": 5035, "time_per_iteration": 2.4162325859069824 }, { "auxiliary_loss_clip": 0.01069397, "auxiliary_loss_mlp": 0.01047796, "balance_loss_clip": 1.01379704, "balance_loss_mlp": 1.02069831, "epoch": 0.30278070043589356, "flos": 27343311649920.0, "grad_norm": 1.927460170060167, "language_loss": 0.879713, "learning_rate": 3.2689088691495196e-06, "loss": 0.90088487, "num_input_tokens_seen": 108251125, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.48632812, "step": 5036, "time_per_iteration": 2.4485418796539307 }, { "auxiliary_loss_clip": 0.01069933, "auxiliary_loss_mlp": 0.0105247, "balance_loss_clip": 1.01701689, "balance_loss_mlp": 1.02089405, "epoch": 0.3028408236885616, "flos": 24785190754560.0, "grad_norm": 1.873872701698567, "language_loss": 0.78310674, "learning_rate": 3.268607806688536e-06, "loss": 0.80433083, "num_input_tokens_seen": 108272545, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.49023438, "step": 5037, "time_per_iteration": 2.4137418270111084 }, { "auxiliary_loss_clip": 0.01073302, "auxiliary_loss_mlp": 0.01064434, "balance_loss_clip": 1.02683568, "balance_loss_mlp": 1.02241147, "epoch": 0.30290094694122954, "flos": 12931653836160.0, "grad_norm": 7.323304345455554, "language_loss": 0.79966164, "learning_rate": 3.268306696121816e-06, "loss": 0.82103896, "num_input_tokens_seen": 108289725, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5078125, "step": 5038, "time_per_iteration": 3.8015122413635254 }, { "auxiliary_loss_clip": 0.01071337, "auxiliary_loss_mlp": 0.01054664, "balance_loss_clip": 1.02023649, "balance_loss_mlp": 1.02237332, "epoch": 0.3029610701938975, "flos": 25915399153920.0, "grad_norm": 1.9404411583229821, "language_loss": 0.75652719, "learning_rate": 3.2680055374607804e-06, "loss": 0.77778721, "num_input_tokens_seen": 108310690, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.48828125, "step": 5039, "time_per_iteration": 2.463085174560547 }, { "auxiliary_loss_clip": 0.01067544, "auxiliary_loss_mlp": 0.01049952, "balance_loss_clip": 1.01752722, "balance_loss_mlp": 1.02026272, "epoch": 0.3030211934465655, "flos": 21979919278080.0, "grad_norm": 1.866779395861628, "language_loss": 0.80775464, "learning_rate": 3.267704330716847e-06, "loss": 0.82892954, "num_input_tokens_seen": 108328905, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.47265625, "step": 5040, "time_per_iteration": 2.3880395889282227 }, { "auxiliary_loss_clip": 0.01070453, "auxiliary_loss_mlp": 0.01050589, "balance_loss_clip": 1.01685286, "balance_loss_mlp": 1.02263594, "epoch": 0.30308131669923344, "flos": 20991039528960.0, "grad_norm": 1.6698683009402098, "language_loss": 0.83183724, "learning_rate": 3.267403075901438e-06, "loss": 0.85304767, "num_input_tokens_seen": 108346680, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.47851562, "step": 5041, "time_per_iteration": 3.798086643218994 }, { "auxiliary_loss_clip": 0.01019193, "auxiliary_loss_mlp": 0.01012316, "balance_loss_clip": 1.00814414, "balance_loss_mlp": 1.00836277, "epoch": 0.3031414399519014, "flos": 60545641599360.0, "grad_norm": 0.7757072673499196, "language_loss": 0.59618336, "learning_rate": 3.267101773025978e-06, "loss": 0.61649847, "num_input_tokens_seen": 108413885, "router_z_loss_clip": 0.04174805, "router_z_loss_mlp": 0.10839844, "step": 5042, "time_per_iteration": 3.138378381729126 }, { "auxiliary_loss_clip": 0.01073603, "auxiliary_loss_mlp": 0.01056485, "balance_loss_clip": 1.02079344, "balance_loss_mlp": 1.02334857, "epoch": 0.30320156320456937, "flos": 21906601689600.0, "grad_norm": 1.6505620178998337, "language_loss": 0.72876102, "learning_rate": 3.266800422101892e-06, "loss": 0.75006193, "num_input_tokens_seen": 108433640, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.50390625, "step": 5043, "time_per_iteration": 3.905677556991577 }, { "auxiliary_loss_clip": 0.0107003, "auxiliary_loss_mlp": 0.01058202, "balance_loss_clip": 1.02179515, "balance_loss_mlp": 1.02061582, "epoch": 0.30326168645723733, "flos": 21651700786560.0, "grad_norm": 1.7837753615987932, "language_loss": 0.70992553, "learning_rate": 3.266499023140606e-06, "loss": 0.73120779, "num_input_tokens_seen": 108452640, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.49414062, "step": 5044, "time_per_iteration": 3.7758328914642334 }, { "auxiliary_loss_clip": 0.01067901, "auxiliary_loss_mlp": 0.01055001, "balance_loss_clip": 1.02078843, "balance_loss_mlp": 1.01981473, "epoch": 0.3033218097099053, "flos": 21870222186240.0, "grad_norm": 1.5072027017112, "language_loss": 0.78114605, "learning_rate": 3.2661975761535513e-06, "loss": 0.80237514, "num_input_tokens_seen": 108472470, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.48046875, "step": 5045, "time_per_iteration": 2.3931403160095215 }, { "auxiliary_loss_clip": 0.01073477, "auxiliary_loss_mlp": 0.01060991, "balance_loss_clip": 1.02081764, "balance_loss_mlp": 1.02247357, "epoch": 0.30338193296257326, "flos": 27088480569600.0, "grad_norm": 2.9502297478864588, "language_loss": 0.73201883, "learning_rate": 3.2658960811521564e-06, "loss": 0.75336349, "num_input_tokens_seen": 108493025, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.51171875, "step": 5046, "time_per_iteration": 2.4481136798858643 }, { "auxiliary_loss_clip": 0.01074912, "auxiliary_loss_mlp": 0.01060741, "balance_loss_clip": 1.02061546, "balance_loss_mlp": 1.02336752, "epoch": 0.30344205621524123, "flos": 19533415599360.0, "grad_norm": 1.8196103573055495, "language_loss": 0.82506216, "learning_rate": 3.2655945381478564e-06, "loss": 0.84641868, "num_input_tokens_seen": 108513480, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.515625, "step": 5047, "time_per_iteration": 2.4067912101745605 }, { "auxiliary_loss_clip": 0.01073042, "auxiliary_loss_mlp": 0.01051959, "balance_loss_clip": 1.01555276, "balance_loss_mlp": 1.02270246, "epoch": 0.3035021794679092, "flos": 23909953080960.0, "grad_norm": 2.087499742101998, "language_loss": 0.72591633, "learning_rate": 3.265292947152084e-06, "loss": 0.7471664, "num_input_tokens_seen": 108533155, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.50390625, "step": 5048, "time_per_iteration": 2.417738914489746 }, { "auxiliary_loss_clip": 0.01073492, "auxiliary_loss_mlp": 0.01051741, "balance_loss_clip": 1.01705086, "balance_loss_mlp": 1.02315593, "epoch": 0.30356230272057716, "flos": 16142685667200.0, "grad_norm": 2.083109842007804, "language_loss": 0.77523988, "learning_rate": 3.2649913081762763e-06, "loss": 0.79649222, "num_input_tokens_seen": 108551900, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.50390625, "step": 5049, "time_per_iteration": 2.3631772994995117 }, { "auxiliary_loss_clip": 0.01075656, "auxiliary_loss_mlp": 0.01059051, "balance_loss_clip": 1.02297854, "balance_loss_mlp": 1.02329624, "epoch": 0.3036224259732452, "flos": 28913390668800.0, "grad_norm": 2.3461280589509266, "language_loss": 0.82964134, "learning_rate": 3.2646896212318717e-06, "loss": 0.85098839, "num_input_tokens_seen": 108574005, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.5234375, "step": 5050, "time_per_iteration": 2.5016918182373047 }, { "auxiliary_loss_clip": 0.01074447, "auxiliary_loss_mlp": 0.01053949, "balance_loss_clip": 1.01778066, "balance_loss_mlp": 1.02308464, "epoch": 0.30368254922591315, "flos": 21104541959040.0, "grad_norm": 2.1943997928107986, "language_loss": 0.75533092, "learning_rate": 3.2643878863303106e-06, "loss": 0.7766149, "num_input_tokens_seen": 108592715, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.515625, "step": 5051, "time_per_iteration": 2.389305353164673 }, { "auxiliary_loss_clip": 0.01074581, "auxiliary_loss_mlp": 0.01057893, "balance_loss_clip": 1.02120078, "balance_loss_mlp": 1.02318501, "epoch": 0.3037426724785811, "flos": 23001198635520.0, "grad_norm": 1.6235288969938566, "language_loss": 0.78011203, "learning_rate": 3.264086103483033e-06, "loss": 0.80143678, "num_input_tokens_seen": 108611770, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.515625, "step": 5052, "time_per_iteration": 2.420043706893921 }, { "auxiliary_loss_clip": 0.01077124, "auxiliary_loss_mlp": 0.01054879, "balance_loss_clip": 1.01635087, "balance_loss_mlp": 1.02484727, "epoch": 0.3038027957312491, "flos": 15631801608960.0, "grad_norm": 2.1500495499973726, "language_loss": 0.84408176, "learning_rate": 3.2637842727014836e-06, "loss": 0.86540174, "num_input_tokens_seen": 108629070, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.5234375, "step": 5053, "time_per_iteration": 2.35660719871521 }, { "auxiliary_loss_clip": 0.01074308, "auxiliary_loss_mlp": 0.01060324, "balance_loss_clip": 1.02124751, "balance_loss_mlp": 1.02298796, "epoch": 0.30386291898391704, "flos": 12713167347840.0, "grad_norm": 1.751470347299621, "language_loss": 0.72530311, "learning_rate": 3.2634823939971083e-06, "loss": 0.74664938, "num_input_tokens_seen": 108646315, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.51171875, "step": 5054, "time_per_iteration": 2.5394792556762695 }, { "auxiliary_loss_clip": 0.0107453, "auxiliary_loss_mlp": 0.01057052, "balance_loss_clip": 1.01997745, "balance_loss_mlp": 1.02432489, "epoch": 0.303923042236585, "flos": 26358237239040.0, "grad_norm": 6.544393925936348, "language_loss": 0.7105245, "learning_rate": 3.2631804673813545e-06, "loss": 0.73184031, "num_input_tokens_seen": 108665920, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.5, "step": 5055, "time_per_iteration": 2.4267289638519287 }, { "auxiliary_loss_clip": 0.01075574, "auxiliary_loss_mlp": 0.01057154, "balance_loss_clip": 1.01912642, "balance_loss_mlp": 1.02384543, "epoch": 0.30398316548925297, "flos": 19718210759040.0, "grad_norm": 5.404389060484848, "language_loss": 0.69350421, "learning_rate": 3.2628784928656707e-06, "loss": 0.71483147, "num_input_tokens_seen": 108683485, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.515625, "step": 5056, "time_per_iteration": 2.434068441390991 }, { "auxiliary_loss_clip": 0.010728, "auxiliary_loss_mlp": 0.01059564, "balance_loss_clip": 1.02110708, "balance_loss_mlp": 1.02235556, "epoch": 0.30404328874192094, "flos": 24238799976960.0, "grad_norm": 1.7008474934378772, "language_loss": 0.83647764, "learning_rate": 3.262576470461507e-06, "loss": 0.85780126, "num_input_tokens_seen": 108702700, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.50390625, "step": 5057, "time_per_iteration": 2.4107775688171387 }, { "auxiliary_loss_clip": 0.01071107, "auxiliary_loss_mlp": 0.0105403, "balance_loss_clip": 1.01676524, "balance_loss_mlp": 1.02073741, "epoch": 0.3041034119945889, "flos": 24497785509120.0, "grad_norm": 1.940119460312281, "language_loss": 0.90798199, "learning_rate": 3.2622744001803176e-06, "loss": 0.92923343, "num_input_tokens_seen": 108721860, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 0.50390625, "step": 5058, "time_per_iteration": 2.5077600479125977 }, { "auxiliary_loss_clip": 0.01076342, "auxiliary_loss_mlp": 0.01067586, "balance_loss_clip": 1.02738881, "balance_loss_mlp": 1.0243932, "epoch": 0.30416353524725687, "flos": 28287747371520.0, "grad_norm": 2.016789585624093, "language_loss": 0.73350763, "learning_rate": 3.2619722820335564e-06, "loss": 0.75494695, "num_input_tokens_seen": 108743215, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.51953125, "step": 5059, "time_per_iteration": 2.444354772567749 }, { "auxiliary_loss_clip": 0.01072779, "auxiliary_loss_mlp": 0.01059369, "balance_loss_clip": 1.02136517, "balance_loss_mlp": 1.02203226, "epoch": 0.30422365849992483, "flos": 23659241541120.0, "grad_norm": 2.3185117840740204, "language_loss": 0.7407105, "learning_rate": 3.26167011603268e-06, "loss": 0.76203197, "num_input_tokens_seen": 108765505, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.5078125, "step": 5060, "time_per_iteration": 2.458911180496216 }, { "auxiliary_loss_clip": 0.01074718, "auxiliary_loss_mlp": 0.01056951, "balance_loss_clip": 1.01951921, "balance_loss_mlp": 1.02226746, "epoch": 0.3042837817525928, "flos": 22997777322240.0, "grad_norm": 2.1861668184477954, "language_loss": 0.78622031, "learning_rate": 3.2613679021891463e-06, "loss": 0.80753696, "num_input_tokens_seen": 108783370, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5234375, "step": 5061, "time_per_iteration": 2.3843657970428467 }, { "auxiliary_loss_clip": 0.01072892, "auxiliary_loss_mlp": 0.01057194, "balance_loss_clip": 1.01811731, "balance_loss_mlp": 1.02214551, "epoch": 0.30434390500526076, "flos": 22081482023040.0, "grad_norm": 6.2913879853090675, "language_loss": 0.83649349, "learning_rate": 3.261065640514415e-06, "loss": 0.8577944, "num_input_tokens_seen": 108797430, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.5078125, "step": 5062, "time_per_iteration": 2.4116013050079346 }, { "auxiliary_loss_clip": 0.01069473, "auxiliary_loss_mlp": 0.01054396, "balance_loss_clip": 1.01813293, "balance_loss_mlp": 1.01993513, "epoch": 0.3044040282579287, "flos": 25482336249600.0, "grad_norm": 1.8085129263372726, "language_loss": 0.75770926, "learning_rate": 3.2607633310199483e-06, "loss": 0.77894795, "num_input_tokens_seen": 108816945, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.49609375, "step": 5063, "time_per_iteration": 2.41461443901062 }, { "auxiliary_loss_clip": 0.01069418, "auxiliary_loss_mlp": 0.0105453, "balance_loss_clip": 1.01464224, "balance_loss_mlp": 1.0207144, "epoch": 0.30446415151059675, "flos": 21944377647360.0, "grad_norm": 1.643909431649782, "language_loss": 0.85350156, "learning_rate": 3.26046097371721e-06, "loss": 0.87474102, "num_input_tokens_seen": 108836615, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.48632812, "step": 5064, "time_per_iteration": 2.424729824066162 }, { "auxiliary_loss_clip": 0.01071993, "auxiliary_loss_mlp": 0.01058809, "balance_loss_clip": 1.01913667, "balance_loss_mlp": 1.02076554, "epoch": 0.3045242747632647, "flos": 16434489744000.0, "grad_norm": 1.9620466328308048, "language_loss": 0.76939148, "learning_rate": 3.2601585686176655e-06, "loss": 0.79069948, "num_input_tokens_seen": 108855165, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.51171875, "step": 5065, "time_per_iteration": 2.3697073459625244 }, { "auxiliary_loss_clip": 0.01074686, "auxiliary_loss_mlp": 0.01057077, "balance_loss_clip": 1.01907325, "balance_loss_mlp": 1.02179527, "epoch": 0.3045843980159327, "flos": 31538998955520.0, "grad_norm": 1.9314077890442634, "language_loss": 0.63796985, "learning_rate": 3.2598561157327814e-06, "loss": 0.65928751, "num_input_tokens_seen": 108874690, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.53125, "step": 5066, "time_per_iteration": 2.5084526538848877 }, { "auxiliary_loss_clip": 0.01077795, "auxiliary_loss_mlp": 0.01058359, "balance_loss_clip": 1.01806664, "balance_loss_mlp": 1.02377832, "epoch": 0.30464452126860064, "flos": 17852801616000.0, "grad_norm": 2.43416767792794, "language_loss": 0.84196019, "learning_rate": 3.2595536150740265e-06, "loss": 0.86332178, "num_input_tokens_seen": 108893140, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.5390625, "step": 5067, "time_per_iteration": 2.3800179958343506 }, { "auxiliary_loss_clip": 0.01069603, "auxiliary_loss_mlp": 0.01055508, "balance_loss_clip": 1.01960266, "balance_loss_mlp": 1.0209111, "epoch": 0.3047046445212686, "flos": 20630351606400.0, "grad_norm": 2.1147086634504246, "language_loss": 0.64794302, "learning_rate": 3.259251066652873e-06, "loss": 0.66919416, "num_input_tokens_seen": 108911880, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.48828125, "step": 5068, "time_per_iteration": 2.4038405418395996 }, { "auxiliary_loss_clip": 0.01070728, "auxiliary_loss_mlp": 0.01046991, "balance_loss_clip": 1.0137794, "balance_loss_mlp": 1.02118981, "epoch": 0.3047647677739366, "flos": 21286544209920.0, "grad_norm": 1.8674556472093402, "language_loss": 0.76492393, "learning_rate": 3.258948470480793e-06, "loss": 0.7861011, "num_input_tokens_seen": 108930440, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.49609375, "step": 5069, "time_per_iteration": 2.3890998363494873 }, { "auxiliary_loss_clip": 0.0106794, "auxiliary_loss_mlp": 0.01054119, "balance_loss_clip": 1.02109838, "balance_loss_mlp": 1.02136803, "epoch": 0.30482489102660454, "flos": 20994879778560.0, "grad_norm": 2.3109462775161265, "language_loss": 0.77229071, "learning_rate": 3.258645826569261e-06, "loss": 0.79351127, "num_input_tokens_seen": 108949125, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.46679688, "step": 5070, "time_per_iteration": 2.404808759689331 }, { "auxiliary_loss_clip": 0.01075204, "auxiliary_loss_mlp": 0.01055801, "balance_loss_clip": 1.017416, "balance_loss_mlp": 1.02359009, "epoch": 0.3048850142792725, "flos": 26289493038720.0, "grad_norm": 2.0943075367219146, "language_loss": 0.8311432, "learning_rate": 3.2583431349297527e-06, "loss": 0.85245323, "num_input_tokens_seen": 108972190, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.51953125, "step": 5071, "time_per_iteration": 2.4573731422424316 }, { "auxiliary_loss_clip": 0.01074061, "auxiliary_loss_mlp": 0.01057234, "balance_loss_clip": 1.01896739, "balance_loss_mlp": 1.02189803, "epoch": 0.30494513753194047, "flos": 22345145677440.0, "grad_norm": 1.8257980131169882, "language_loss": 0.76912189, "learning_rate": 3.2580403955737467e-06, "loss": 0.79043484, "num_input_tokens_seen": 108990325, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.5234375, "step": 5072, "time_per_iteration": 2.4377450942993164 }, { "auxiliary_loss_clip": 0.01070713, "auxiliary_loss_mlp": 0.01061527, "balance_loss_clip": 1.02574086, "balance_loss_mlp": 1.02189088, "epoch": 0.30500526078460843, "flos": 19536627444480.0, "grad_norm": 2.2120361172486005, "language_loss": 0.73716968, "learning_rate": 3.257737608512723e-06, "loss": 0.75849211, "num_input_tokens_seen": 109009505, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.48828125, "step": 5073, "time_per_iteration": 2.382911443710327 }, { "auxiliary_loss_clip": 0.01077694, "auxiliary_loss_mlp": 0.0106406, "balance_loss_clip": 1.02493572, "balance_loss_mlp": 1.02424109, "epoch": 0.3050653840372764, "flos": 14464445656320.0, "grad_norm": 2.2791191539617612, "language_loss": 0.78526831, "learning_rate": 3.257434773758163e-06, "loss": 0.80668586, "num_input_tokens_seen": 109026350, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.53515625, "step": 5074, "time_per_iteration": 2.3593873977661133 }, { "auxiliary_loss_clip": 0.01071255, "auxiliary_loss_mlp": 0.01053342, "balance_loss_clip": 1.01881862, "balance_loss_mlp": 1.02273953, "epoch": 0.30512550728994436, "flos": 24242640226560.0, "grad_norm": 2.0732910532876283, "language_loss": 0.75352424, "learning_rate": 3.25713189132155e-06, "loss": 0.77477026, "num_input_tokens_seen": 109044165, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.484375, "step": 5075, "time_per_iteration": 2.5213305950164795 }, { "auxiliary_loss_clip": 0.01074799, "auxiliary_loss_mlp": 0.01060936, "balance_loss_clip": 1.02188325, "balance_loss_mlp": 1.02275085, "epoch": 0.30518563054261233, "flos": 16359670967040.0, "grad_norm": 1.8785706085053306, "language_loss": 0.7660538, "learning_rate": 3.2568289612143703e-06, "loss": 0.78741109, "num_input_tokens_seen": 109060665, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.51953125, "step": 5076, "time_per_iteration": 2.350162982940674 }, { "auxiliary_loss_clip": 0.01072016, "auxiliary_loss_mlp": 0.01056173, "balance_loss_clip": 1.02100658, "balance_loss_mlp": 1.02230453, "epoch": 0.30524575379528035, "flos": 21578522843520.0, "grad_norm": 2.474348726590459, "language_loss": 0.80544156, "learning_rate": 3.25652598344811e-06, "loss": 0.82672346, "num_input_tokens_seen": 109080035, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.49609375, "step": 5077, "time_per_iteration": 3.8848776817321777 }, { "auxiliary_loss_clip": 0.01068255, "auxiliary_loss_mlp": 0.01049145, "balance_loss_clip": 1.01632726, "balance_loss_mlp": 1.02172041, "epoch": 0.3053058770479483, "flos": 16544291569920.0, "grad_norm": 1.709074816197999, "language_loss": 0.7642765, "learning_rate": 3.256222958034259e-06, "loss": 0.78545046, "num_input_tokens_seen": 109097385, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.46484375, "step": 5078, "time_per_iteration": 2.394915819168091 }, { "auxiliary_loss_clip": 0.01070279, "auxiliary_loss_mlp": 0.01065772, "balance_loss_clip": 1.03046238, "balance_loss_mlp": 1.02170062, "epoch": 0.3053660003006163, "flos": 12312085115520.0, "grad_norm": 2.1051780160616684, "language_loss": 0.68505454, "learning_rate": 3.255919884984307e-06, "loss": 0.70641506, "num_input_tokens_seen": 109115495, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.484375, "step": 5079, "time_per_iteration": 2.3724892139434814 }, { "auxiliary_loss_clip": 0.01069386, "auxiliary_loss_mlp": 0.0106075, "balance_loss_clip": 1.02575016, "balance_loss_mlp": 1.02117777, "epoch": 0.30542612355328425, "flos": 23111175018240.0, "grad_norm": 2.237505647100242, "language_loss": 0.81399369, "learning_rate": 3.2556167643097477e-06, "loss": 0.83529508, "num_input_tokens_seen": 109134235, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.48242188, "step": 5080, "time_per_iteration": 3.84330415725708 }, { "auxiliary_loss_clip": 0.01070526, "auxiliary_loss_mlp": 0.01054244, "balance_loss_clip": 1.02408433, "balance_loss_mlp": 1.02165651, "epoch": 0.3054862468059522, "flos": 24388297885440.0, "grad_norm": 2.8982308064933404, "language_loss": 0.825095, "learning_rate": 3.255313596022074e-06, "loss": 0.84634268, "num_input_tokens_seen": 109152760, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.48828125, "step": 5081, "time_per_iteration": 2.4548110961914062 }, { "auxiliary_loss_clip": 0.010701, "auxiliary_loss_mlp": 0.01052634, "balance_loss_clip": 1.02035189, "balance_loss_mlp": 1.02161944, "epoch": 0.3055463700586202, "flos": 29384857935360.0, "grad_norm": 1.6597751107066752, "language_loss": 0.72529352, "learning_rate": 3.255010380132783e-06, "loss": 0.74652088, "num_input_tokens_seen": 109173925, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.484375, "step": 5082, "time_per_iteration": 2.4842379093170166 }, { "auxiliary_loss_clip": 0.01072779, "auxiliary_loss_mlp": 0.01056699, "balance_loss_clip": 1.01836181, "balance_loss_mlp": 1.02170897, "epoch": 0.30560649331128814, "flos": 25590636887040.0, "grad_norm": 1.8493963364440114, "language_loss": 0.74149477, "learning_rate": 3.2547071166533736e-06, "loss": 0.76278949, "num_input_tokens_seen": 109192510, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.51171875, "step": 5083, "time_per_iteration": 5.257324457168579 }, { "auxiliary_loss_clip": 0.01071331, "auxiliary_loss_mlp": 0.01049217, "balance_loss_clip": 1.01464653, "balance_loss_mlp": 1.0221734, "epoch": 0.3056666165639561, "flos": 19127515599360.0, "grad_norm": 1.7645036763832327, "language_loss": 0.71960485, "learning_rate": 3.254403805595344e-06, "loss": 0.74081033, "num_input_tokens_seen": 109210885, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.4921875, "step": 5084, "time_per_iteration": 2.380970001220703 }, { "auxiliary_loss_clip": 0.01073299, "auxiliary_loss_mlp": 0.01049277, "balance_loss_clip": 1.01577914, "balance_loss_mlp": 1.02257752, "epoch": 0.30572673981662407, "flos": 15522942389760.0, "grad_norm": 2.072618882303092, "language_loss": 0.80779374, "learning_rate": 3.2541004469701962e-06, "loss": 0.82901949, "num_input_tokens_seen": 109229180, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.5078125, "step": 5085, "time_per_iteration": 2.4142045974731445 }, { "auxiliary_loss_clip": 0.01065951, "auxiliary_loss_mlp": 0.01044643, "balance_loss_clip": 1.01285017, "balance_loss_mlp": 1.01950753, "epoch": 0.30578686306929204, "flos": 21505484545920.0, "grad_norm": 1.7264796856460227, "language_loss": 0.79084915, "learning_rate": 3.2537970407894342e-06, "loss": 0.81195509, "num_input_tokens_seen": 109249510, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.46484375, "step": 5086, "time_per_iteration": 2.396934986114502 }, { "auxiliary_loss_clip": 0.01069689, "auxiliary_loss_mlp": 0.01052273, "balance_loss_clip": 1.01867986, "balance_loss_mlp": 1.02096426, "epoch": 0.30584698632196, "flos": 20953368305280.0, "grad_norm": 1.843983980112431, "language_loss": 0.77353632, "learning_rate": 3.253493587064563e-06, "loss": 0.79475594, "num_input_tokens_seen": 109268200, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.48828125, "step": 5087, "time_per_iteration": 2.403778076171875 }, { "auxiliary_loss_clip": 0.01073403, "auxiliary_loss_mlp": 0.01050708, "balance_loss_clip": 1.01690078, "balance_loss_mlp": 1.02235365, "epoch": 0.30590710957462797, "flos": 24679962316800.0, "grad_norm": 2.1140051699510236, "language_loss": 0.74698031, "learning_rate": 3.2531900858070885e-06, "loss": 0.76822138, "num_input_tokens_seen": 109288370, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.51171875, "step": 5088, "time_per_iteration": 2.4165072441101074 }, { "auxiliary_loss_clip": 0.01075339, "auxiliary_loss_mlp": 0.01050549, "balance_loss_clip": 1.01566827, "balance_loss_mlp": 1.02293801, "epoch": 0.30596723282729593, "flos": 17086108959360.0, "grad_norm": 5.6045978409837165, "language_loss": 0.81893152, "learning_rate": 3.252886537028521e-06, "loss": 0.84019041, "num_input_tokens_seen": 109306730, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.5234375, "step": 5089, "time_per_iteration": 2.404055118560791 }, { "auxiliary_loss_clip": 0.01071166, "auxiliary_loss_mlp": 0.01050537, "balance_loss_clip": 1.01808858, "balance_loss_mlp": 1.02147269, "epoch": 0.30602735607996395, "flos": 22855994824320.0, "grad_norm": 1.9275203758650057, "language_loss": 0.78536457, "learning_rate": 3.2525829407403703e-06, "loss": 0.80658162, "num_input_tokens_seen": 109327360, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.49609375, "step": 5090, "time_per_iteration": 2.405430316925049 }, { "auxiliary_loss_clip": 0.01077427, "auxiliary_loss_mlp": 0.01060733, "balance_loss_clip": 1.02456522, "balance_loss_mlp": 1.02496302, "epoch": 0.3060874793326319, "flos": 29860200362880.0, "grad_norm": 1.9930822413564173, "language_loss": 0.78151226, "learning_rate": 3.2522792969541488e-06, "loss": 0.80289388, "num_input_tokens_seen": 109348135, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.5234375, "step": 5091, "time_per_iteration": 2.459881544113159 }, { "auxiliary_loss_clip": 0.01073567, "auxiliary_loss_mlp": 0.01060388, "balance_loss_clip": 1.02367127, "balance_loss_mlp": 1.02266383, "epoch": 0.3061476025852999, "flos": 20447546394240.0, "grad_norm": 1.7311628444979967, "language_loss": 0.73208487, "learning_rate": 3.2519756056813705e-06, "loss": 0.75342441, "num_input_tokens_seen": 109366220, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.5078125, "step": 5092, "time_per_iteration": 2.3901138305664062 }, { "auxiliary_loss_clip": 0.01070795, "auxiliary_loss_mlp": 0.0104944, "balance_loss_clip": 1.01618099, "balance_loss_mlp": 1.02308965, "epoch": 0.30620772583796785, "flos": 19390446115200.0, "grad_norm": 3.044908019468082, "language_loss": 0.84100223, "learning_rate": 3.2516718669335522e-06, "loss": 0.86220455, "num_input_tokens_seen": 109385260, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.4765625, "step": 5093, "time_per_iteration": 2.421408176422119 }, { "auxiliary_loss_clip": 0.0107106, "auxiliary_loss_mlp": 0.0105006, "balance_loss_clip": 1.01815999, "balance_loss_mlp": 1.02282286, "epoch": 0.3062678490906358, "flos": 24023420599680.0, "grad_norm": 1.8841030791124576, "language_loss": 0.76325655, "learning_rate": 3.2513680807222114e-06, "loss": 0.7844677, "num_input_tokens_seen": 109405025, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.48242188, "step": 5094, "time_per_iteration": 2.409820079803467 }, { "auxiliary_loss_clip": 0.01071402, "auxiliary_loss_mlp": 0.01051104, "balance_loss_clip": 1.01841688, "balance_loss_mlp": 1.02274799, "epoch": 0.3063279723433038, "flos": 19753647655680.0, "grad_norm": 2.799021347419914, "language_loss": 0.7707051, "learning_rate": 3.251064247058868e-06, "loss": 0.7919302, "num_input_tokens_seen": 109422465, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.48828125, "step": 5095, "time_per_iteration": 2.435837745666504 }, { "auxiliary_loss_clip": 0.01069714, "auxiliary_loss_mlp": 0.01053382, "balance_loss_clip": 1.01947904, "balance_loss_mlp": 1.02248693, "epoch": 0.30638809559597174, "flos": 22449082394880.0, "grad_norm": 1.7696949478451667, "language_loss": 0.81516457, "learning_rate": 3.250760365955042e-06, "loss": 0.8363955, "num_input_tokens_seen": 109440575, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.47265625, "step": 5096, "time_per_iteration": 2.388474702835083 }, { "auxiliary_loss_clip": 0.0107025, "auxiliary_loss_mlp": 0.01051935, "balance_loss_clip": 1.01655352, "balance_loss_mlp": 1.02128613, "epoch": 0.3064482188486397, "flos": 17164209404160.0, "grad_norm": 12.773784040828941, "language_loss": 0.83446789, "learning_rate": 3.250456437422258e-06, "loss": 0.85568976, "num_input_tokens_seen": 109459050, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.48828125, "step": 5097, "time_per_iteration": 2.3898282051086426 }, { "auxiliary_loss_clip": 0.01071244, "auxiliary_loss_mlp": 0.01057033, "balance_loss_clip": 1.0220567, "balance_loss_mlp": 1.02256751, "epoch": 0.3065083421013077, "flos": 23767367621760.0, "grad_norm": 2.0541065209400626, "language_loss": 0.78719682, "learning_rate": 3.250152461472041e-06, "loss": 0.80847961, "num_input_tokens_seen": 109475860, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.48632812, "step": 5098, "time_per_iteration": 2.4099769592285156 }, { "auxiliary_loss_clip": 0.01069837, "auxiliary_loss_mlp": 0.01056112, "balance_loss_clip": 1.02390122, "balance_loss_mlp": 1.02239275, "epoch": 0.30656846535397564, "flos": 26430647132160.0, "grad_norm": 1.9589480011886165, "language_loss": 0.85348004, "learning_rate": 3.249848438115917e-06, "loss": 0.87473953, "num_input_tokens_seen": 109494760, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.47460938, "step": 5099, "time_per_iteration": 2.461533546447754 }, { "auxiliary_loss_clip": 0.01070654, "auxiliary_loss_mlp": 0.01064653, "balance_loss_clip": 1.02908087, "balance_loss_mlp": 1.02108455, "epoch": 0.3066285886066436, "flos": 26650564986240.0, "grad_norm": 3.0460231258942607, "language_loss": 0.86098075, "learning_rate": 3.2495443673654148e-06, "loss": 0.88233376, "num_input_tokens_seen": 109516480, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.49609375, "step": 5100, "time_per_iteration": 2.4253268241882324 }, { "auxiliary_loss_clip": 0.01069276, "auxiliary_loss_mlp": 0.01053759, "balance_loss_clip": 1.01663697, "balance_loss_mlp": 1.01954019, "epoch": 0.30668871185931157, "flos": 15049031328000.0, "grad_norm": 1.8777134575389212, "language_loss": 0.80407083, "learning_rate": 3.249240249232065e-06, "loss": 0.82530117, "num_input_tokens_seen": 109534615, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.49804688, "step": 5101, "time_per_iteration": 2.4237868785858154 }, { "auxiliary_loss_clip": 0.01072588, "auxiliary_loss_mlp": 0.01060698, "balance_loss_clip": 1.02169323, "balance_loss_mlp": 1.02206755, "epoch": 0.30674883511197953, "flos": 20081133008640.0, "grad_norm": 1.6912147011330148, "language_loss": 0.8122707, "learning_rate": 3.2489360837273998e-06, "loss": 0.83360356, "num_input_tokens_seen": 109554040, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.50390625, "step": 5102, "time_per_iteration": 2.392975330352783 }, { "auxiliary_loss_clip": 0.01073404, "auxiliary_loss_mlp": 0.01055228, "balance_loss_clip": 1.01865447, "balance_loss_mlp": 1.02296972, "epoch": 0.30680895836464755, "flos": 22892688529920.0, "grad_norm": 3.4190337351277953, "language_loss": 0.90513766, "learning_rate": 3.2486318708629532e-06, "loss": 0.92642397, "num_input_tokens_seen": 109574345, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.50390625, "step": 5103, "time_per_iteration": 2.4399325847625732 }, { "auxiliary_loss_clip": 0.01071053, "auxiliary_loss_mlp": 0.01057152, "balance_loss_clip": 1.02186584, "balance_loss_mlp": 1.02132845, "epoch": 0.3068690816173155, "flos": 23695027551360.0, "grad_norm": 1.7984118032353333, "language_loss": 0.75549507, "learning_rate": 3.2483276106502607e-06, "loss": 0.77677715, "num_input_tokens_seen": 109593670, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.49609375, "step": 5104, "time_per_iteration": 2.404832601547241 }, { "auxiliary_loss_clip": 0.01074074, "auxiliary_loss_mlp": 0.01058332, "balance_loss_clip": 1.02383316, "balance_loss_mlp": 1.02271652, "epoch": 0.3069292048699835, "flos": 23549893562880.0, "grad_norm": 2.0163609215055245, "language_loss": 0.74245441, "learning_rate": 3.2480233031008605e-06, "loss": 0.76377851, "num_input_tokens_seen": 109613385, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.51171875, "step": 5105, "time_per_iteration": 2.4386074542999268 }, { "auxiliary_loss_clip": 0.0107264, "auxiliary_loss_mlp": 0.01053785, "balance_loss_clip": 1.01685381, "balance_loss_mlp": 1.02163982, "epoch": 0.30698932812265145, "flos": 24530604053760.0, "grad_norm": 1.917561820958087, "language_loss": 0.88094896, "learning_rate": 3.2477189482262916e-06, "loss": 0.90221322, "num_input_tokens_seen": 109632395, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.5078125, "step": 5106, "time_per_iteration": 2.424337863922119 }, { "auxiliary_loss_clip": 0.0107638, "auxiliary_loss_mlp": 0.01069652, "balance_loss_clip": 1.02957392, "balance_loss_mlp": 1.02295649, "epoch": 0.3070494513753194, "flos": 20995368537600.0, "grad_norm": 2.4992034250415216, "language_loss": 0.73744541, "learning_rate": 3.2474145460380945e-06, "loss": 0.75890571, "num_input_tokens_seen": 109651380, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 0.53125, "step": 5107, "time_per_iteration": 2.3932113647460938 }, { "auxiliary_loss_clip": 0.01071908, "auxiliary_loss_mlp": 0.01059018, "balance_loss_clip": 1.02489972, "balance_loss_mlp": 1.02182615, "epoch": 0.3071095746279874, "flos": 19024940424960.0, "grad_norm": 3.1394699043648195, "language_loss": 0.73471355, "learning_rate": 3.247110096547814e-06, "loss": 0.75602281, "num_input_tokens_seen": 109670240, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.5, "step": 5108, "time_per_iteration": 2.387723207473755 }, { "auxiliary_loss_clip": 0.01071328, "auxiliary_loss_mlp": 0.01054329, "balance_loss_clip": 1.0177561, "balance_loss_mlp": 1.02197421, "epoch": 0.30716969788065535, "flos": 21214448519040.0, "grad_norm": 1.6564544204465266, "language_loss": 0.87507427, "learning_rate": 3.2468055997669926e-06, "loss": 0.89633083, "num_input_tokens_seen": 109690810, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.4921875, "step": 5109, "time_per_iteration": 2.455106019973755 }, { "auxiliary_loss_clip": 0.01071505, "auxiliary_loss_mlp": 0.01052261, "balance_loss_clip": 1.01718938, "balance_loss_mlp": 1.02105212, "epoch": 0.3072298211333233, "flos": 25771661619840.0, "grad_norm": 1.6402262828633027, "language_loss": 0.68449187, "learning_rate": 3.2465010557071788e-06, "loss": 0.70572954, "num_input_tokens_seen": 109711145, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.50390625, "step": 5110, "time_per_iteration": 2.415372133255005 }, { "auxiliary_loss_clip": 0.01070186, "auxiliary_loss_mlp": 0.01049501, "balance_loss_clip": 1.01571751, "balance_loss_mlp": 1.02219009, "epoch": 0.3072899443859913, "flos": 25847737205760.0, "grad_norm": 1.8549822490896526, "language_loss": 0.77528942, "learning_rate": 3.246196464379919e-06, "loss": 0.79648638, "num_input_tokens_seen": 109731425, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.48046875, "step": 5111, "time_per_iteration": 2.437851905822754 }, { "auxiliary_loss_clip": 0.01072254, "auxiliary_loss_mlp": 0.01058473, "balance_loss_clip": 1.02216148, "balance_loss_mlp": 1.02114236, "epoch": 0.30735006763865924, "flos": 25921578464640.0, "grad_norm": 1.7280769296340903, "language_loss": 0.69136262, "learning_rate": 3.245891825796765e-06, "loss": 0.71266985, "num_input_tokens_seen": 109752720, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.5078125, "step": 5112, "time_per_iteration": 2.4202606678009033 }, { "auxiliary_loss_clip": 0.01075963, "auxiliary_loss_mlp": 0.01070501, "balance_loss_clip": 1.03073263, "balance_loss_mlp": 1.02263546, "epoch": 0.3074101908913272, "flos": 30915764542080.0, "grad_norm": 2.0524838202256266, "language_loss": 0.80877024, "learning_rate": 3.2455871399692678e-06, "loss": 0.83023489, "num_input_tokens_seen": 109772840, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 0.53125, "step": 5113, "time_per_iteration": 2.4735631942749023 }, { "auxiliary_loss_clip": 0.01074797, "auxiliary_loss_mlp": 0.01063042, "balance_loss_clip": 1.02520525, "balance_loss_mlp": 1.0229547, "epoch": 0.30747031414399517, "flos": 18400204823040.0, "grad_norm": 1.975581193867569, "language_loss": 0.78863084, "learning_rate": 3.2452824069089815e-06, "loss": 0.81000924, "num_input_tokens_seen": 109790150, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.51953125, "step": 5114, "time_per_iteration": 2.3644065856933594 }, { "auxiliary_loss_clip": 0.01074065, "auxiliary_loss_mlp": 0.01050881, "balance_loss_clip": 1.01476049, "balance_loss_mlp": 1.02334118, "epoch": 0.30753043739666314, "flos": 22632201809280.0, "grad_norm": 1.8341787668972858, "language_loss": 0.63276172, "learning_rate": 3.2449776266274623e-06, "loss": 0.65401119, "num_input_tokens_seen": 109807985, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.5078125, "step": 5115, "time_per_iteration": 2.408470869064331 }, { "auxiliary_loss_clip": 0.01073069, "auxiliary_loss_mlp": 0.01053033, "balance_loss_clip": 1.018677, "balance_loss_mlp": 1.02237582, "epoch": 0.3075905606493311, "flos": 27342857802240.0, "grad_norm": 1.7557693363000624, "language_loss": 0.83292222, "learning_rate": 3.2446727991362657e-06, "loss": 0.8541832, "num_input_tokens_seen": 109825920, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.5078125, "step": 5116, "time_per_iteration": 3.9663705825805664 }, { "auxiliary_loss_clip": 0.01070894, "auxiliary_loss_mlp": 0.01056731, "balance_loss_clip": 1.02092028, "balance_loss_mlp": 1.02181041, "epoch": 0.3076506839019991, "flos": 22089721104000.0, "grad_norm": 1.9637438441135333, "language_loss": 0.77014351, "learning_rate": 3.244367924446952e-06, "loss": 0.79141974, "num_input_tokens_seen": 109846220, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.49023438, "step": 5117, "time_per_iteration": 2.4196743965148926 }, { "auxiliary_loss_clip": 0.01073729, "auxiliary_loss_mlp": 0.0105384, "balance_loss_clip": 1.01855397, "balance_loss_mlp": 1.02193701, "epoch": 0.3077108071546671, "flos": 21288429423360.0, "grad_norm": 2.1355947994428566, "language_loss": 0.73208177, "learning_rate": 3.2440630025710826e-06, "loss": 0.75335747, "num_input_tokens_seen": 109863870, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.515625, "step": 5118, "time_per_iteration": 2.3682048320770264 }, { "auxiliary_loss_clip": 0.01070619, "auxiliary_loss_mlp": 0.01049174, "balance_loss_clip": 1.01481807, "balance_loss_mlp": 1.02193594, "epoch": 0.30777093040733505, "flos": 21430002453120.0, "grad_norm": 1.692579113266664, "language_loss": 0.75368643, "learning_rate": 3.243758033520219e-06, "loss": 0.77488434, "num_input_tokens_seen": 109883500, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.48828125, "step": 5119, "time_per_iteration": 2.416313886642456 }, { "auxiliary_loss_clip": 0.01073292, "auxiliary_loss_mlp": 0.01058437, "balance_loss_clip": 1.02107692, "balance_loss_mlp": 1.02215159, "epoch": 0.307831053660003, "flos": 23148148014720.0, "grad_norm": 1.7687099637013077, "language_loss": 0.81672311, "learning_rate": 3.243453017305926e-06, "loss": 0.83804047, "num_input_tokens_seen": 109904620, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.51171875, "step": 5120, "time_per_iteration": 3.7767655849456787 }, { "auxiliary_loss_clip": 0.01068265, "auxiliary_loss_mlp": 0.01057199, "balance_loss_clip": 1.02412999, "balance_loss_mlp": 1.01970601, "epoch": 0.307891176912671, "flos": 17018796124800.0, "grad_norm": 3.4130728231104692, "language_loss": 0.81701326, "learning_rate": 3.24314795393977e-06, "loss": 0.83826798, "num_input_tokens_seen": 109922275, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.48632812, "step": 5121, "time_per_iteration": 2.389071464538574 }, { "auxiliary_loss_clip": 0.01070284, "auxiliary_loss_mlp": 0.01046148, "balance_loss_clip": 1.0129844, "balance_loss_mlp": 1.02178788, "epoch": 0.30795130016533895, "flos": 27703929749760.0, "grad_norm": 1.5933910230576407, "language_loss": 0.83659053, "learning_rate": 3.242842843433319e-06, "loss": 0.85775483, "num_input_tokens_seen": 109944265, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.484375, "step": 5122, "time_per_iteration": 3.875358819961548 }, { "auxiliary_loss_clip": 0.01020475, "auxiliary_loss_mlp": 0.01005536, "balance_loss_clip": 1.00219834, "balance_loss_mlp": 1.00863051, "epoch": 0.3080114234180069, "flos": 69054987294720.0, "grad_norm": 0.7449626435355371, "language_loss": 0.58689028, "learning_rate": 3.242537685798143e-06, "loss": 0.60715038, "num_input_tokens_seen": 110014160, "router_z_loss_clip": 0.03344727, "router_z_loss_mlp": 0.11816406, "step": 5123, "time_per_iteration": 4.671058893203735 }, { "auxiliary_loss_clip": 0.01072968, "auxiliary_loss_mlp": 0.01053676, "balance_loss_clip": 1.0162915, "balance_loss_mlp": 1.02206922, "epoch": 0.3080715466706749, "flos": 24059101875840.0, "grad_norm": 1.6571578251871193, "language_loss": 0.84449691, "learning_rate": 3.242232481045813e-06, "loss": 0.86576331, "num_input_tokens_seen": 110034865, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5078125, "step": 5124, "time_per_iteration": 2.4056413173675537 }, { "auxiliary_loss_clip": 0.01073926, "auxiliary_loss_mlp": 0.01051614, "balance_loss_clip": 1.01794899, "balance_loss_mlp": 1.02237928, "epoch": 0.30813166992334284, "flos": 25847492826240.0, "grad_norm": 2.3927459047350017, "language_loss": 0.81188536, "learning_rate": 3.2419272291879035e-06, "loss": 0.83314073, "num_input_tokens_seen": 110052930, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.515625, "step": 5125, "time_per_iteration": 2.4360854625701904 }, { "auxiliary_loss_clip": 0.01075497, "auxiliary_loss_mlp": 0.01052795, "balance_loss_clip": 1.01519585, "balance_loss_mlp": 1.02297163, "epoch": 0.3081917931760108, "flos": 20448558823680.0, "grad_norm": 2.049789417834457, "language_loss": 0.66047478, "learning_rate": 3.241621930235989e-06, "loss": 0.68175769, "num_input_tokens_seen": 110071765, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.52734375, "step": 5126, "time_per_iteration": 2.3739819526672363 }, { "auxiliary_loss_clip": 0.01068805, "auxiliary_loss_mlp": 0.01052961, "balance_loss_clip": 1.02199078, "balance_loss_mlp": 1.02159858, "epoch": 0.3082519164286788, "flos": 22165098462720.0, "grad_norm": 1.6946554675286505, "language_loss": 0.87967771, "learning_rate": 3.241316584201646e-06, "loss": 0.90089536, "num_input_tokens_seen": 110092660, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.47265625, "step": 5127, "time_per_iteration": 2.4258644580841064 }, { "auxiliary_loss_clip": 0.01070685, "auxiliary_loss_mlp": 0.01062236, "balance_loss_clip": 1.02473295, "balance_loss_mlp": 1.0217967, "epoch": 0.30831203968134674, "flos": 28912133859840.0, "grad_norm": 2.2447140051668057, "language_loss": 0.69969273, "learning_rate": 3.2410111910964538e-06, "loss": 0.72102189, "num_input_tokens_seen": 110114960, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.48828125, "step": 5128, "time_per_iteration": 2.4938104152679443 }, { "auxiliary_loss_clip": 0.01071814, "auxiliary_loss_mlp": 0.01058315, "balance_loss_clip": 1.02510285, "balance_loss_mlp": 1.02182305, "epoch": 0.3083721629340147, "flos": 25666503004800.0, "grad_norm": 1.9658916150293373, "language_loss": 0.73451251, "learning_rate": 3.240705750931993e-06, "loss": 0.75581384, "num_input_tokens_seen": 110135750, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.5, "step": 5129, "time_per_iteration": 2.4641618728637695 }, { "auxiliary_loss_clip": 0.01018701, "auxiliary_loss_mlp": 0.01007454, "balance_loss_clip": 1.00380599, "balance_loss_mlp": 1.00696802, "epoch": 0.3084322861866827, "flos": 68209181763840.0, "grad_norm": 0.8516039853672059, "language_loss": 0.5937503, "learning_rate": 3.240400263719846e-06, "loss": 0.61401188, "num_input_tokens_seen": 110189480, "router_z_loss_clip": 0.03637695, "router_z_loss_mlp": 0.1171875, "step": 5130, "time_per_iteration": 2.9810140132904053 }, { "auxiliary_loss_clip": 0.01073298, "auxiliary_loss_mlp": 0.01059563, "balance_loss_clip": 1.02391982, "balance_loss_mlp": 1.02233434, "epoch": 0.3084924094393507, "flos": 20295639601920.0, "grad_norm": 2.4772547117305095, "language_loss": 0.75050163, "learning_rate": 3.2400947294715957e-06, "loss": 0.77183026, "num_input_tokens_seen": 110206445, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.51171875, "step": 5131, "time_per_iteration": 2.401658773422241 }, { "auxiliary_loss_clip": 0.01070024, "auxiliary_loss_mlp": 0.01052481, "balance_loss_clip": 1.01996088, "balance_loss_mlp": 1.02167547, "epoch": 0.30855253269201866, "flos": 23948741468160.0, "grad_norm": 1.5160392369184326, "language_loss": 0.72419083, "learning_rate": 3.2397891481988303e-06, "loss": 0.74541593, "num_input_tokens_seen": 110226845, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.484375, "step": 5132, "time_per_iteration": 2.39300537109375 }, { "auxiliary_loss_clip": 0.01066037, "auxiliary_loss_mlp": 0.010552, "balance_loss_clip": 1.02513528, "balance_loss_mlp": 1.02089572, "epoch": 0.3086126559446866, "flos": 19280853757440.0, "grad_norm": 1.7102182831522172, "language_loss": 0.91429913, "learning_rate": 3.239483519913136e-06, "loss": 0.93551153, "num_input_tokens_seen": 110244095, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.45117188, "step": 5133, "time_per_iteration": 2.396352529525757 }, { "auxiliary_loss_clip": 0.01072075, "auxiliary_loss_mlp": 0.01055621, "balance_loss_clip": 1.02081203, "balance_loss_mlp": 1.02227795, "epoch": 0.3086727791973546, "flos": 33759510203520.0, "grad_norm": 2.3087133604267396, "language_loss": 0.6917603, "learning_rate": 3.239177844626102e-06, "loss": 0.71303725, "num_input_tokens_seen": 110264240, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.49804688, "step": 5134, "time_per_iteration": 2.481449604034424 }, { "auxiliary_loss_clip": 0.01072189, "auxiliary_loss_mlp": 0.01051454, "balance_loss_clip": 1.01757526, "balance_loss_mlp": 1.02207804, "epoch": 0.30873290245002255, "flos": 16033232954880.0, "grad_norm": 1.9314705828625423, "language_loss": 0.84629381, "learning_rate": 3.2388721223493197e-06, "loss": 0.86753023, "num_input_tokens_seen": 110282450, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.5, "step": 5135, "time_per_iteration": 2.3870115280151367 }, { "auxiliary_loss_clip": 0.01015505, "auxiliary_loss_mlp": 0.01023855, "balance_loss_clip": 1.01965928, "balance_loss_mlp": 1.00410891, "epoch": 0.3087930257026905, "flos": 65044618819200.0, "grad_norm": 0.7118924682739293, "language_loss": 0.55389786, "learning_rate": 3.2385663530943824e-06, "loss": 0.57429141, "num_input_tokens_seen": 110343715, "router_z_loss_clip": 0.04199219, "router_z_loss_mlp": 0.11376953, "step": 5136, "time_per_iteration": 3.0375308990478516 }, { "auxiliary_loss_clip": 0.01070233, "auxiliary_loss_mlp": 0.01053123, "balance_loss_clip": 1.02124608, "balance_loss_mlp": 1.02128077, "epoch": 0.3088531489553585, "flos": 74736301173120.0, "grad_norm": 1.9085728896238858, "language_loss": 0.77449036, "learning_rate": 3.2382605368728852e-06, "loss": 0.79572392, "num_input_tokens_seen": 110368430, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.49023438, "step": 5137, "time_per_iteration": 2.880054235458374 }, { "auxiliary_loss_clip": 0.01068389, "auxiliary_loss_mlp": 0.01048948, "balance_loss_clip": 1.02056444, "balance_loss_mlp": 1.02134895, "epoch": 0.30891327220802645, "flos": 21141235664640.0, "grad_norm": 2.2764489233321425, "language_loss": 0.80530363, "learning_rate": 3.237954673696424e-06, "loss": 0.82647699, "num_input_tokens_seen": 110386735, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.46875, "step": 5138, "time_per_iteration": 2.3701059818267822 }, { "auxiliary_loss_clip": 0.01070075, "auxiliary_loss_mlp": 0.01047482, "balance_loss_clip": 1.01545072, "balance_loss_mlp": 1.02146792, "epoch": 0.3089733954606944, "flos": 25663360982400.0, "grad_norm": 1.7518389281510307, "language_loss": 0.82679707, "learning_rate": 3.2376487635765983e-06, "loss": 0.84797263, "num_input_tokens_seen": 110406820, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.484375, "step": 5139, "time_per_iteration": 2.458500385284424 }, { "auxiliary_loss_clip": 0.01075578, "auxiliary_loss_mlp": 0.01055916, "balance_loss_clip": 1.01798403, "balance_loss_mlp": 1.02370858, "epoch": 0.3090335187133624, "flos": 19426336859520.0, "grad_norm": 1.8879372384968347, "language_loss": 0.79357803, "learning_rate": 3.2373428065250067e-06, "loss": 0.81489289, "num_input_tokens_seen": 110424225, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.515625, "step": 5140, "time_per_iteration": 2.364794969558716 }, { "auxiliary_loss_clip": 0.01066305, "auxiliary_loss_mlp": 0.0104724, "balance_loss_clip": 1.01977372, "balance_loss_mlp": 1.0214467, "epoch": 0.30909364196603034, "flos": 20010294126720.0, "grad_norm": 1.780706563398438, "language_loss": 0.8050015, "learning_rate": 3.237036802553252e-06, "loss": 0.82613695, "num_input_tokens_seen": 110443310, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.44921875, "step": 5141, "time_per_iteration": 2.442444086074829 }, { "auxiliary_loss_clip": 0.01073107, "auxiliary_loss_mlp": 0.0105638, "balance_loss_clip": 1.02300119, "balance_loss_mlp": 1.02266121, "epoch": 0.3091537652186983, "flos": 19676699285760.0, "grad_norm": 2.2631928294318064, "language_loss": 0.89642549, "learning_rate": 3.2367307516729377e-06, "loss": 0.91772032, "num_input_tokens_seen": 110460215, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.50390625, "step": 5142, "time_per_iteration": 2.375620126724243 }, { "auxiliary_loss_clip": 0.01070304, "auxiliary_loss_mlp": 0.01051464, "balance_loss_clip": 1.01875257, "balance_loss_mlp": 1.02100313, "epoch": 0.3092138884713663, "flos": 17019075415680.0, "grad_norm": 1.9233390305670097, "language_loss": 0.81350285, "learning_rate": 3.23642465389567e-06, "loss": 0.83472055, "num_input_tokens_seen": 110479385, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.4921875, "step": 5143, "time_per_iteration": 2.4567012786865234 }, { "auxiliary_loss_clip": 0.01068455, "auxiliary_loss_mlp": 0.01052312, "balance_loss_clip": 1.01888585, "balance_loss_mlp": 1.02046013, "epoch": 0.3092740117240343, "flos": 25008809212800.0, "grad_norm": 1.7074340801698307, "language_loss": 0.73157942, "learning_rate": 3.236118509233055e-06, "loss": 0.75278699, "num_input_tokens_seen": 110499885, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.48046875, "step": 5144, "time_per_iteration": 2.433255672454834 }, { "auxiliary_loss_clip": 0.01071722, "auxiliary_loss_mlp": 0.01054278, "balance_loss_clip": 1.02097106, "balance_loss_mlp": 1.02165556, "epoch": 0.30933413497670226, "flos": 25589310255360.0, "grad_norm": 5.9764391078815295, "language_loss": 0.75549734, "learning_rate": 3.235812317696702e-06, "loss": 0.77675736, "num_input_tokens_seen": 110519690, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.5, "step": 5145, "time_per_iteration": 2.436875104904175 }, { "auxiliary_loss_clip": 0.01070195, "auxiliary_loss_mlp": 0.01061127, "balance_loss_clip": 1.027951, "balance_loss_mlp": 1.02236211, "epoch": 0.3093942582293702, "flos": 24388507353600.0, "grad_norm": 1.5895592941385783, "language_loss": 0.77002192, "learning_rate": 3.2355060792982224e-06, "loss": 0.79133511, "num_input_tokens_seen": 110540520, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.4765625, "step": 5146, "time_per_iteration": 2.4170501232147217 }, { "auxiliary_loss_clip": 0.01070771, "auxiliary_loss_mlp": 0.01048608, "balance_loss_clip": 1.01764953, "balance_loss_mlp": 1.02209139, "epoch": 0.3094543814820382, "flos": 19645416840960.0, "grad_norm": 2.3295970472663674, "language_loss": 0.67968035, "learning_rate": 3.2351997940492286e-06, "loss": 0.70087421, "num_input_tokens_seen": 110557950, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.48632812, "step": 5147, "time_per_iteration": 2.4015259742736816 }, { "auxiliary_loss_clip": 0.010719, "auxiliary_loss_mlp": 0.01055918, "balance_loss_clip": 1.02430415, "balance_loss_mlp": 1.02245235, "epoch": 0.30951450473470615, "flos": 25662697666560.0, "grad_norm": 1.921095999329307, "language_loss": 0.76709485, "learning_rate": 3.2348934619613346e-06, "loss": 0.78837305, "num_input_tokens_seen": 110578215, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.49609375, "step": 5148, "time_per_iteration": 2.4641566276550293 }, { "auxiliary_loss_clip": 0.01076082, "auxiliary_loss_mlp": 0.01060275, "balance_loss_clip": 1.02439356, "balance_loss_mlp": 1.02373981, "epoch": 0.3095746279873741, "flos": 12019617722880.0, "grad_norm": 2.485356254775871, "language_loss": 0.74787021, "learning_rate": 3.2345870830461567e-06, "loss": 0.7692337, "num_input_tokens_seen": 110592990, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.5234375, "step": 5149, "time_per_iteration": 2.3894944190979004 }, { "auxiliary_loss_clip": 0.01073572, "auxiliary_loss_mlp": 0.01063899, "balance_loss_clip": 1.02706313, "balance_loss_mlp": 1.02168274, "epoch": 0.3096347512400421, "flos": 23621919431040.0, "grad_norm": 1.9249934748612914, "language_loss": 0.87117267, "learning_rate": 3.2342806573153132e-06, "loss": 0.89254737, "num_input_tokens_seen": 110612130, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.51953125, "step": 5150, "time_per_iteration": 2.401935338973999 }, { "auxiliary_loss_clip": 0.01071148, "auxiliary_loss_mlp": 0.01058125, "balance_loss_clip": 1.02181411, "balance_loss_mlp": 1.02167988, "epoch": 0.30969487449271005, "flos": 22528195269120.0, "grad_norm": 1.766138025565247, "language_loss": 0.79789162, "learning_rate": 3.233974184780424e-06, "loss": 0.8191843, "num_input_tokens_seen": 110632045, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.49414062, "step": 5151, "time_per_iteration": 2.4270591735839844 }, { "auxiliary_loss_clip": 0.01072217, "auxiliary_loss_mlp": 0.01048712, "balance_loss_clip": 1.01497579, "balance_loss_mlp": 1.02199495, "epoch": 0.309754997745378, "flos": 15267029057280.0, "grad_norm": 2.154399046636942, "language_loss": 0.68448311, "learning_rate": 3.2336676654531084e-06, "loss": 0.70569241, "num_input_tokens_seen": 110649340, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.5, "step": 5152, "time_per_iteration": 2.4004569053649902 }, { "auxiliary_loss_clip": 0.0107141, "auxiliary_loss_mlp": 0.01051924, "balance_loss_clip": 1.01778293, "balance_loss_mlp": 1.02199745, "epoch": 0.309815120998046, "flos": 26978085250560.0, "grad_norm": 2.1061627107101106, "language_loss": 0.84798014, "learning_rate": 3.2333610993449926e-06, "loss": 0.86921346, "num_input_tokens_seen": 110668450, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.49609375, "step": 5153, "time_per_iteration": 2.4476535320281982 }, { "auxiliary_loss_clip": 0.01071324, "auxiliary_loss_mlp": 0.01054671, "balance_loss_clip": 1.02019572, "balance_loss_mlp": 1.02237797, "epoch": 0.30987524425071394, "flos": 21142073537280.0, "grad_norm": 1.8269964929786695, "language_loss": 0.74849284, "learning_rate": 3.2330544864676997e-06, "loss": 0.76975274, "num_input_tokens_seen": 110689410, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.49023438, "step": 5154, "time_per_iteration": 2.4161250591278076 }, { "auxiliary_loss_clip": 0.010685, "auxiliary_loss_mlp": 0.01052509, "balance_loss_clip": 1.01979852, "balance_loss_mlp": 1.02043998, "epoch": 0.3099353675033819, "flos": 15267378170880.0, "grad_norm": 1.7969514304878627, "language_loss": 0.77030909, "learning_rate": 3.232747826832858e-06, "loss": 0.79151917, "num_input_tokens_seen": 110707350, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.48046875, "step": 5155, "time_per_iteration": 2.368520975112915 }, { "auxiliary_loss_clip": 0.01073791, "auxiliary_loss_mlp": 0.010535, "balance_loss_clip": 1.0178566, "balance_loss_mlp": 1.02264357, "epoch": 0.30999549075604993, "flos": 15412896184320.0, "grad_norm": 1.8895514638631774, "language_loss": 0.80037403, "learning_rate": 3.232441120452094e-06, "loss": 0.82164693, "num_input_tokens_seen": 110724910, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.51171875, "step": 5156, "time_per_iteration": 3.9244837760925293 }, { "auxiliary_loss_clip": 0.01072847, "auxiliary_loss_mlp": 0.01057355, "balance_loss_clip": 1.02202153, "balance_loss_mlp": 1.02093959, "epoch": 0.3100556140087179, "flos": 23183445265920.0, "grad_norm": 2.3248984201419063, "language_loss": 0.7638759, "learning_rate": 3.23213436733704e-06, "loss": 0.78517795, "num_input_tokens_seen": 110744010, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.51953125, "step": 5157, "time_per_iteration": 2.4296038150787354 }, { "auxiliary_loss_clip": 0.01068419, "auxiliary_loss_mlp": 0.01049068, "balance_loss_clip": 1.01757276, "balance_loss_mlp": 1.02121985, "epoch": 0.31011573726138586, "flos": 25740902845440.0, "grad_norm": 1.6625011598891408, "language_loss": 0.7118687, "learning_rate": 3.231827567499327e-06, "loss": 0.73304355, "num_input_tokens_seen": 110765835, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.47265625, "step": 5158, "time_per_iteration": 2.432555913925171 }, { "auxiliary_loss_clip": 0.01067793, "auxiliary_loss_mlp": 0.01050715, "balance_loss_clip": 1.0179565, "balance_loss_mlp": 1.01933956, "epoch": 0.3101758605140538, "flos": 20010294126720.0, "grad_norm": 3.4978919448837513, "language_loss": 0.85947597, "learning_rate": 3.2315207209505896e-06, "loss": 0.88066113, "num_input_tokens_seen": 110784655, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.484375, "step": 5159, "time_per_iteration": 2.398587703704834 }, { "auxiliary_loss_clip": 0.01069594, "auxiliary_loss_mlp": 0.0105581, "balance_loss_clip": 1.02178764, "balance_loss_mlp": 1.02063966, "epoch": 0.3102359837667218, "flos": 19134672428160.0, "grad_norm": 2.0647536007220593, "language_loss": 0.86994302, "learning_rate": 3.231213827702462e-06, "loss": 0.89119703, "num_input_tokens_seen": 110802545, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.49023438, "step": 5160, "time_per_iteration": 3.788621425628662 }, { "auxiliary_loss_clip": 0.01068077, "auxiliary_loss_mlp": 0.01049375, "balance_loss_clip": 1.01863146, "balance_loss_mlp": 1.02092481, "epoch": 0.31029610701938976, "flos": 22264531614720.0, "grad_norm": 2.1531131361922036, "language_loss": 0.77240151, "learning_rate": 3.230906887766584e-06, "loss": 0.79357606, "num_input_tokens_seen": 110820265, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.47265625, "step": 5161, "time_per_iteration": 3.8507816791534424 }, { "auxiliary_loss_clip": 0.01069035, "auxiliary_loss_mlp": 0.01061368, "balance_loss_clip": 1.02655911, "balance_loss_mlp": 1.02022946, "epoch": 0.3103562302720577, "flos": 20804533712640.0, "grad_norm": 2.052059089141047, "language_loss": 0.83167791, "learning_rate": 3.2305999011545924e-06, "loss": 0.85298193, "num_input_tokens_seen": 110836195, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.48828125, "step": 5162, "time_per_iteration": 2.3795039653778076 }, { "auxiliary_loss_clip": 0.01065536, "auxiliary_loss_mlp": 0.01047138, "balance_loss_clip": 1.01851606, "balance_loss_mlp": 1.01965165, "epoch": 0.3104163535247257, "flos": 22343120818560.0, "grad_norm": 1.6027752743147308, "language_loss": 0.83714461, "learning_rate": 3.2302928678781295e-06, "loss": 0.85827136, "num_input_tokens_seen": 110856420, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.45898438, "step": 5163, "time_per_iteration": 3.883543014526367 }, { "auxiliary_loss_clip": 0.01073893, "auxiliary_loss_mlp": 0.01054117, "balance_loss_clip": 1.02059495, "balance_loss_mlp": 1.02289534, "epoch": 0.31047647677739365, "flos": 21688289758080.0, "grad_norm": 1.8925122686477232, "language_loss": 0.77696192, "learning_rate": 3.2299857879488376e-06, "loss": 0.79824197, "num_input_tokens_seen": 110876650, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.51171875, "step": 5164, "time_per_iteration": 2.4127323627471924 }, { "auxiliary_loss_clip": 0.01074417, "auxiliary_loss_mlp": 0.01058208, "balance_loss_clip": 1.02308917, "balance_loss_mlp": 1.02408981, "epoch": 0.3105366000300616, "flos": 18916255762560.0, "grad_norm": 2.7791989305015012, "language_loss": 0.76344168, "learning_rate": 3.2296786613783626e-06, "loss": 0.78476799, "num_input_tokens_seen": 110894445, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.50390625, "step": 5165, "time_per_iteration": 2.3649725914001465 }, { "auxiliary_loss_clip": 0.01069011, "auxiliary_loss_mlp": 0.01053492, "balance_loss_clip": 1.02123356, "balance_loss_mlp": 1.02131319, "epoch": 0.3105967232827296, "flos": 18259399843200.0, "grad_norm": 1.526490629755684, "language_loss": 0.77188253, "learning_rate": 3.229371488178348e-06, "loss": 0.79310757, "num_input_tokens_seen": 110912855, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.4765625, "step": 5166, "time_per_iteration": 2.4062132835388184 }, { "auxiliary_loss_clip": 0.01070073, "auxiliary_loss_mlp": 0.0106519, "balance_loss_clip": 1.03128648, "balance_loss_mlp": 1.02234244, "epoch": 0.31065684653539755, "flos": 17671288124160.0, "grad_norm": 2.4469016985662715, "language_loss": 0.75432926, "learning_rate": 3.229064268360444e-06, "loss": 0.77568185, "num_input_tokens_seen": 110928025, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.4765625, "step": 5167, "time_per_iteration": 2.3398211002349854 }, { "auxiliary_loss_clip": 0.01017564, "auxiliary_loss_mlp": 0.01011205, "balance_loss_clip": 1.00739062, "balance_loss_mlp": 1.00555813, "epoch": 0.3107169697880655, "flos": 68528742238080.0, "grad_norm": 0.7313053793068099, "language_loss": 0.52985996, "learning_rate": 3.2287570019362997e-06, "loss": 0.55014765, "num_input_tokens_seen": 110992215, "router_z_loss_clip": 0.03808594, "router_z_loss_mlp": 0.12011719, "step": 5168, "time_per_iteration": 3.085465431213379 }, { "auxiliary_loss_clip": 0.01071998, "auxiliary_loss_mlp": 0.01050317, "balance_loss_clip": 1.016891, "balance_loss_mlp": 1.02235556, "epoch": 0.3107770930407335, "flos": 13187881370880.0, "grad_norm": 1.8453792944328133, "language_loss": 0.80373275, "learning_rate": 3.2284496889175668e-06, "loss": 0.82495588, "num_input_tokens_seen": 111010400, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.49609375, "step": 5169, "time_per_iteration": 2.377397298812866 }, { "auxiliary_loss_clip": 0.01071969, "auxiliary_loss_mlp": 0.01053432, "balance_loss_clip": 1.02036309, "balance_loss_mlp": 1.0224613, "epoch": 0.3108372162934015, "flos": 31579393265280.0, "grad_norm": 1.674792384646792, "language_loss": 0.64768779, "learning_rate": 3.2281423293158986e-06, "loss": 0.66894174, "num_input_tokens_seen": 111033960, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.49609375, "step": 5170, "time_per_iteration": 2.5138497352600098 }, { "auxiliary_loss_clip": 0.01070783, "auxiliary_loss_mlp": 0.01053655, "balance_loss_clip": 1.01782036, "balance_loss_mlp": 1.02193773, "epoch": 0.31089733954606946, "flos": 28728595509120.0, "grad_norm": 2.3043042382350123, "language_loss": 0.78863955, "learning_rate": 3.22783492314295e-06, "loss": 0.80988389, "num_input_tokens_seen": 111053265, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.48828125, "step": 5171, "time_per_iteration": 2.479414701461792 }, { "auxiliary_loss_clip": 0.01071313, "auxiliary_loss_mlp": 0.0105794, "balance_loss_clip": 1.02351248, "balance_loss_mlp": 1.02224159, "epoch": 0.3109574627987374, "flos": 19682215280640.0, "grad_norm": 1.8404824304962253, "language_loss": 0.84676588, "learning_rate": 3.2275274704103785e-06, "loss": 0.86805832, "num_input_tokens_seen": 111071130, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.49023438, "step": 5172, "time_per_iteration": 2.3911263942718506 }, { "auxiliary_loss_clip": 0.01070572, "auxiliary_loss_mlp": 0.010587, "balance_loss_clip": 1.02420068, "balance_loss_mlp": 1.02136397, "epoch": 0.3110175860514054, "flos": 14683106701440.0, "grad_norm": 2.477828263303185, "language_loss": 0.86355531, "learning_rate": 3.227219971129842e-06, "loss": 0.884848, "num_input_tokens_seen": 111089560, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.4921875, "step": 5173, "time_per_iteration": 2.3650271892547607 }, { "auxiliary_loss_clip": 0.01068531, "auxiliary_loss_mlp": 0.01050699, "balance_loss_clip": 1.01999092, "balance_loss_mlp": 1.02173376, "epoch": 0.31107770930407336, "flos": 25738459050240.0, "grad_norm": 1.9878834802374086, "language_loss": 0.84640324, "learning_rate": 3.226912425313001e-06, "loss": 0.86759549, "num_input_tokens_seen": 111109960, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.46875, "step": 5174, "time_per_iteration": 2.4469003677368164 }, { "auxiliary_loss_clip": 0.01072255, "auxiliary_loss_mlp": 0.01057735, "balance_loss_clip": 1.02278268, "balance_loss_mlp": 1.02227283, "epoch": 0.3111378325567413, "flos": 19207256878080.0, "grad_norm": 2.0306135138874195, "language_loss": 0.86342078, "learning_rate": 3.2266048329715183e-06, "loss": 0.88472062, "num_input_tokens_seen": 111127960, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.49804688, "step": 5175, "time_per_iteration": 2.367511510848999 }, { "auxiliary_loss_clip": 0.01072313, "auxiliary_loss_mlp": 0.01051956, "balance_loss_clip": 1.02117658, "balance_loss_mlp": 1.02314067, "epoch": 0.3111979558094093, "flos": 23695237019520.0, "grad_norm": 1.8494906022999613, "language_loss": 0.85519576, "learning_rate": 3.2262971941170575e-06, "loss": 0.87643838, "num_input_tokens_seen": 111146730, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4921875, "step": 5176, "time_per_iteration": 2.429464817047119 }, { "auxiliary_loss_clip": 0.01068422, "auxiliary_loss_mlp": 0.01053784, "balance_loss_clip": 1.01897466, "balance_loss_mlp": 1.02022004, "epoch": 0.31125807906207725, "flos": 21031957509120.0, "grad_norm": 1.8558402953411044, "language_loss": 0.82265091, "learning_rate": 3.2259895087612837e-06, "loss": 0.84387296, "num_input_tokens_seen": 111166295, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.48046875, "step": 5177, "time_per_iteration": 2.3679139614105225 }, { "auxiliary_loss_clip": 0.01069708, "auxiliary_loss_mlp": 0.01055218, "balance_loss_clip": 1.02083802, "balance_loss_mlp": 1.02116954, "epoch": 0.3113182023147452, "flos": 23075493742080.0, "grad_norm": 1.5944093429577266, "language_loss": 0.82537478, "learning_rate": 3.2256817769158657e-06, "loss": 0.84662396, "num_input_tokens_seen": 111185665, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.48632812, "step": 5178, "time_per_iteration": 2.4036431312561035 }, { "auxiliary_loss_clip": 0.01073101, "auxiliary_loss_mlp": 0.01058026, "balance_loss_clip": 1.02347875, "balance_loss_mlp": 1.02107978, "epoch": 0.3113783255674132, "flos": 11838174053760.0, "grad_norm": 2.0467490666592303, "language_loss": 0.83373213, "learning_rate": 3.225373998592471e-06, "loss": 0.85504341, "num_input_tokens_seen": 111201615, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.51953125, "step": 5179, "time_per_iteration": 2.3373751640319824 }, { "auxiliary_loss_clip": 0.01073295, "auxiliary_loss_mlp": 0.01057846, "balance_loss_clip": 1.02441931, "balance_loss_mlp": 1.02401721, "epoch": 0.31143844882008115, "flos": 16288622616960.0, "grad_norm": 1.6511112735748263, "language_loss": 0.79678512, "learning_rate": 3.2250661738027715e-06, "loss": 0.81809658, "num_input_tokens_seen": 111220515, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.4921875, "step": 5180, "time_per_iteration": 2.377622365951538 }, { "auxiliary_loss_clip": 0.0107242, "auxiliary_loss_mlp": 0.01051837, "balance_loss_clip": 1.01744485, "balance_loss_mlp": 1.02165592, "epoch": 0.3114985720727491, "flos": 23216787480960.0, "grad_norm": 1.6620241999140897, "language_loss": 0.85305786, "learning_rate": 3.22475830255844e-06, "loss": 0.87430048, "num_input_tokens_seen": 111240395, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.5078125, "step": 5181, "time_per_iteration": 2.382403612136841 }, { "auxiliary_loss_clip": 0.01069755, "auxiliary_loss_mlp": 0.010592, "balance_loss_clip": 1.02787197, "balance_loss_mlp": 1.02114058, "epoch": 0.3115586953254171, "flos": 30043319777280.0, "grad_norm": 1.720482424728682, "language_loss": 0.75751722, "learning_rate": 3.2244503848711516e-06, "loss": 0.77880681, "num_input_tokens_seen": 111261100, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.484375, "step": 5182, "time_per_iteration": 2.4983091354370117 }, { "auxiliary_loss_clip": 0.01073111, "auxiliary_loss_mlp": 0.01066273, "balance_loss_clip": 1.02886474, "balance_loss_mlp": 1.02227163, "epoch": 0.3116188185780851, "flos": 25665141461760.0, "grad_norm": 2.5718207409272926, "language_loss": 0.71819961, "learning_rate": 3.2241424207525815e-06, "loss": 0.73959351, "num_input_tokens_seen": 111281320, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.5078125, "step": 5183, "time_per_iteration": 2.4236068725585938 }, { "auxiliary_loss_clip": 0.01017202, "auxiliary_loss_mlp": 0.01003695, "balance_loss_clip": 0.9999277, "balance_loss_mlp": 1.00604808, "epoch": 0.31167894183075306, "flos": 69506974022400.0, "grad_norm": 0.9546983867770389, "language_loss": 0.59849524, "learning_rate": 3.223834410214408e-06, "loss": 0.6187042, "num_input_tokens_seen": 111341405, "router_z_loss_clip": 0.03759766, "router_z_loss_mlp": 0.11132812, "step": 5184, "time_per_iteration": 3.056886911392212 }, { "auxiliary_loss_clip": 0.01070924, "auxiliary_loss_mlp": 0.01053476, "balance_loss_clip": 1.02038383, "balance_loss_mlp": 1.02206254, "epoch": 0.31173906508342103, "flos": 14938950211200.0, "grad_norm": 2.093033739684117, "language_loss": 0.71596563, "learning_rate": 3.223526353268311e-06, "loss": 0.73720956, "num_input_tokens_seen": 111358975, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.48828125, "step": 5185, "time_per_iteration": 2.3713455200195312 }, { "auxiliary_loss_clip": 0.01073267, "auxiliary_loss_mlp": 0.01056801, "balance_loss_clip": 1.02196777, "balance_loss_mlp": 1.02223134, "epoch": 0.311799188336089, "flos": 16175224920960.0, "grad_norm": 2.2982094215745903, "language_loss": 0.65841675, "learning_rate": 3.2232182499259725e-06, "loss": 0.67971742, "num_input_tokens_seen": 111375845, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.51171875, "step": 5186, "time_per_iteration": 2.3850576877593994 }, { "auxiliary_loss_clip": 0.01075341, "auxiliary_loss_mlp": 0.01058451, "balance_loss_clip": 1.02094746, "balance_loss_mlp": 1.02284062, "epoch": 0.31185931158875696, "flos": 25008460099200.0, "grad_norm": 2.256261223420343, "language_loss": 0.88292789, "learning_rate": 3.2229101001990747e-06, "loss": 0.90426588, "num_input_tokens_seen": 111394150, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.52734375, "step": 5187, "time_per_iteration": 2.4243381023406982 }, { "auxiliary_loss_clip": 0.01069099, "auxiliary_loss_mlp": 0.01059357, "balance_loss_clip": 1.02755237, "balance_loss_mlp": 1.02058697, "epoch": 0.3119194348414249, "flos": 37231377868800.0, "grad_norm": 1.4431052697488067, "language_loss": 0.63955593, "learning_rate": 3.2226019040993036e-06, "loss": 0.66084051, "num_input_tokens_seen": 111418355, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.484375, "step": 5188, "time_per_iteration": 2.552957773208618 }, { "auxiliary_loss_clip": 0.0107321, "auxiliary_loss_mlp": 0.01057579, "balance_loss_clip": 1.0226984, "balance_loss_mlp": 1.02394593, "epoch": 0.3119795580940929, "flos": 15011883774720.0, "grad_norm": 2.4179121015235623, "language_loss": 0.84975147, "learning_rate": 3.222293661638346e-06, "loss": 0.8710593, "num_input_tokens_seen": 111435445, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.4921875, "step": 5189, "time_per_iteration": 2.3514082431793213 }, { "auxiliary_loss_clip": 0.01070178, "auxiliary_loss_mlp": 0.01051782, "balance_loss_clip": 1.01687777, "balance_loss_mlp": 1.02098298, "epoch": 0.31203968134676086, "flos": 15997237476480.0, "grad_norm": 1.8281733639808793, "language_loss": 0.80836272, "learning_rate": 3.22198537282789e-06, "loss": 0.82958233, "num_input_tokens_seen": 111453430, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.4921875, "step": 5190, "time_per_iteration": 2.413468599319458 }, { "auxiliary_loss_clip": 0.01071203, "auxiliary_loss_mlp": 0.0104809, "balance_loss_clip": 1.01674974, "balance_loss_mlp": 1.0223546, "epoch": 0.3120998045994288, "flos": 23836356201600.0, "grad_norm": 1.4948799295925403, "language_loss": 0.76419735, "learning_rate": 3.2216770376796262e-06, "loss": 0.7853902, "num_input_tokens_seen": 111475325, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.48828125, "step": 5191, "time_per_iteration": 2.4620039463043213 }, { "auxiliary_loss_clip": 0.01014131, "auxiliary_loss_mlp": 0.01008473, "balance_loss_clip": 1.004897, "balance_loss_mlp": 1.00306869, "epoch": 0.3121599278520968, "flos": 69181059680640.0, "grad_norm": 0.8543118982853731, "language_loss": 0.63968867, "learning_rate": 3.221368656205247e-06, "loss": 0.65991479, "num_input_tokens_seen": 111533960, "router_z_loss_clip": 0.03564453, "router_z_loss_mlp": 0.11035156, "step": 5192, "time_per_iteration": 3.1050448417663574 }, { "auxiliary_loss_clip": 0.01071262, "auxiliary_loss_mlp": 0.01051128, "balance_loss_clip": 1.01646233, "balance_loss_mlp": 1.02089429, "epoch": 0.31222005110476475, "flos": 23805213402240.0, "grad_norm": 1.8317926943818843, "language_loss": 0.81799304, "learning_rate": 3.221060228416446e-06, "loss": 0.83921695, "num_input_tokens_seen": 111554055, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.50390625, "step": 5193, "time_per_iteration": 2.3983662128448486 }, { "auxiliary_loss_clip": 0.01070381, "auxiliary_loss_mlp": 0.01048663, "balance_loss_clip": 1.01480722, "balance_loss_mlp": 1.02068388, "epoch": 0.3122801743574327, "flos": 25225026462720.0, "grad_norm": 2.0039074490327637, "language_loss": 0.74035192, "learning_rate": 3.2207517543249183e-06, "loss": 0.76154232, "num_input_tokens_seen": 111574305, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.49609375, "step": 5194, "time_per_iteration": 2.4376049041748047 }, { "auxiliary_loss_clip": 0.01070712, "auxiliary_loss_mlp": 0.01049675, "balance_loss_clip": 1.01829958, "balance_loss_mlp": 1.02250111, "epoch": 0.3123402976101007, "flos": 22965377713920.0, "grad_norm": 1.45831254579897, "language_loss": 0.77248347, "learning_rate": 3.2204432339423616e-06, "loss": 0.79368734, "num_input_tokens_seen": 111595680, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.484375, "step": 5195, "time_per_iteration": 2.4168224334716797 }, { "auxiliary_loss_clip": 0.01071347, "auxiliary_loss_mlp": 0.01057431, "balance_loss_clip": 1.02398038, "balance_loss_mlp": 1.02129197, "epoch": 0.3124004208627687, "flos": 25190916197760.0, "grad_norm": 1.3767205403481748, "language_loss": 0.79075599, "learning_rate": 3.220134667280476e-06, "loss": 0.81204379, "num_input_tokens_seen": 111618135, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.5, "step": 5196, "time_per_iteration": 3.878204107284546 }, { "auxiliary_loss_clip": 0.01015562, "auxiliary_loss_mlp": 0.01010079, "balance_loss_clip": 1.00590658, "balance_loss_mlp": 1.00425708, "epoch": 0.31246054411543667, "flos": 67481626608000.0, "grad_norm": 0.7756839994348759, "language_loss": 0.54920948, "learning_rate": 3.2198260543509613e-06, "loss": 0.56946588, "num_input_tokens_seen": 111682220, "router_z_loss_clip": 0.04174805, "router_z_loss_mlp": 0.11328125, "step": 5197, "time_per_iteration": 3.0562310218811035 }, { "auxiliary_loss_clip": 0.01068732, "auxiliary_loss_mlp": 0.01048203, "balance_loss_clip": 1.01735198, "balance_loss_mlp": 1.02224123, "epoch": 0.31252066736810463, "flos": 17857549560960.0, "grad_norm": 1.6532230801798935, "language_loss": 0.68239284, "learning_rate": 3.21951739516552e-06, "loss": 0.70356214, "num_input_tokens_seen": 111700815, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.46484375, "step": 5198, "time_per_iteration": 2.372730016708374 }, { "auxiliary_loss_clip": 0.01072384, "auxiliary_loss_mlp": 0.0104696, "balance_loss_clip": 1.01386738, "balance_loss_mlp": 1.02244329, "epoch": 0.3125807906207726, "flos": 18474150816000.0, "grad_norm": 2.2424357622413615, "language_loss": 0.71584618, "learning_rate": 3.219208689735857e-06, "loss": 0.73703963, "num_input_tokens_seen": 111718195, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.5, "step": 5199, "time_per_iteration": 2.415602445602417 }, { "auxiliary_loss_clip": 0.01070962, "auxiliary_loss_mlp": 0.01051359, "balance_loss_clip": 1.01953006, "balance_loss_mlp": 1.02281141, "epoch": 0.31264091387344056, "flos": 18945722816640.0, "grad_norm": 1.7232472157588758, "language_loss": 0.80145156, "learning_rate": 3.2188999380736785e-06, "loss": 0.82267475, "num_input_tokens_seen": 111734440, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.48046875, "step": 5200, "time_per_iteration": 3.7842605113983154 }, { "auxiliary_loss_clip": 0.01067784, "auxiliary_loss_mlp": 0.01048611, "balance_loss_clip": 1.01635313, "balance_loss_mlp": 1.02186179, "epoch": 0.3127010371261085, "flos": 21467499120000.0, "grad_norm": 1.927265265356138, "language_loss": 0.83817327, "learning_rate": 3.2185911401906917e-06, "loss": 0.85933727, "num_input_tokens_seen": 111751960, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.45898438, "step": 5201, "time_per_iteration": 3.8097481727600098 }, { "auxiliary_loss_clip": 0.01073978, "auxiliary_loss_mlp": 0.01057974, "balance_loss_clip": 1.02466679, "balance_loss_mlp": 1.02459717, "epoch": 0.3127611603787765, "flos": 15335284498560.0, "grad_norm": 2.205348501182241, "language_loss": 0.71398854, "learning_rate": 3.2182822960986072e-06, "loss": 0.73530805, "num_input_tokens_seen": 111769585, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.4921875, "step": 5202, "time_per_iteration": 3.7968175411224365 }, { "auxiliary_loss_clip": 0.01071042, "auxiliary_loss_mlp": 0.01051898, "balance_loss_clip": 1.02247739, "balance_loss_mlp": 1.02249014, "epoch": 0.31282128363144446, "flos": 17602020253440.0, "grad_norm": 1.9016282309650459, "language_loss": 0.86019588, "learning_rate": 3.2179734058091358e-06, "loss": 0.88142526, "num_input_tokens_seen": 111787880, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.48632812, "step": 5203, "time_per_iteration": 2.374354362487793 }, { "auxiliary_loss_clip": 0.01071509, "auxiliary_loss_mlp": 0.01052762, "balance_loss_clip": 1.02195823, "balance_loss_mlp": 1.02397895, "epoch": 0.3128814068841124, "flos": 26755653778560.0, "grad_norm": 2.2299667211218672, "language_loss": 0.62939286, "learning_rate": 3.2176644693339913e-06, "loss": 0.6506356, "num_input_tokens_seen": 111805950, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.47460938, "step": 5204, "time_per_iteration": 2.4553256034851074 }, { "auxiliary_loss_clip": 0.01069373, "auxiliary_loss_mlp": 0.0104928, "balance_loss_clip": 1.01871514, "balance_loss_mlp": 1.02230287, "epoch": 0.3129415301367804, "flos": 22271304418560.0, "grad_norm": 1.7491852071254264, "language_loss": 0.67582119, "learning_rate": 3.217355486684887e-06, "loss": 0.69700778, "num_input_tokens_seen": 111826135, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.47070312, "step": 5205, "time_per_iteration": 2.4038052558898926 }, { "auxiliary_loss_clip": 0.0107176, "auxiliary_loss_mlp": 0.01052098, "balance_loss_clip": 1.02103186, "balance_loss_mlp": 1.02178526, "epoch": 0.31300165338944835, "flos": 26463814790400.0, "grad_norm": 1.804577263627564, "language_loss": 0.77489406, "learning_rate": 3.2170464578735414e-06, "loss": 0.79613256, "num_input_tokens_seen": 111844700, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.5, "step": 5206, "time_per_iteration": 2.4570634365081787 }, { "auxiliary_loss_clip": 0.01068624, "auxiliary_loss_mlp": 0.01055354, "balance_loss_clip": 1.02443099, "balance_loss_mlp": 1.0207665, "epoch": 0.3130617766421163, "flos": 21943574686080.0, "grad_norm": 1.9279142064930386, "language_loss": 0.84919417, "learning_rate": 3.216737382911672e-06, "loss": 0.87043393, "num_input_tokens_seen": 111861585, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.47851562, "step": 5207, "time_per_iteration": 2.382864475250244 }, { "auxiliary_loss_clip": 0.01069752, "auxiliary_loss_mlp": 0.01054856, "balance_loss_clip": 1.02529263, "balance_loss_mlp": 1.02339363, "epoch": 0.3131218998947843, "flos": 23291710992000.0, "grad_norm": 1.8164120712581893, "language_loss": 0.7296446, "learning_rate": 3.216428261810999e-06, "loss": 0.75089073, "num_input_tokens_seen": 111882950, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.46484375, "step": 5208, "time_per_iteration": 2.439727783203125 }, { "auxiliary_loss_clip": 0.01070382, "auxiliary_loss_mlp": 0.01062593, "balance_loss_clip": 1.02933407, "balance_loss_mlp": 1.0218811, "epoch": 0.3131820231474523, "flos": 21138652224000.0, "grad_norm": 2.0840254482686995, "language_loss": 0.75391412, "learning_rate": 3.2161190945832445e-06, "loss": 0.77524388, "num_input_tokens_seen": 111901640, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.48632812, "step": 5209, "time_per_iteration": 2.393115997314453 }, { "auxiliary_loss_clip": 0.01070354, "auxiliary_loss_mlp": 0.01058597, "balance_loss_clip": 1.0259819, "balance_loss_mlp": 1.0203135, "epoch": 0.31324214640012027, "flos": 23908870828800.0, "grad_norm": 2.445655947184509, "language_loss": 0.78262985, "learning_rate": 3.2158098812401325e-06, "loss": 0.80391937, "num_input_tokens_seen": 111919615, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.5, "step": 5210, "time_per_iteration": 2.4371728897094727 }, { "auxiliary_loss_clip": 0.01065652, "auxiliary_loss_mlp": 0.01053774, "balance_loss_clip": 1.02250564, "balance_loss_mlp": 1.01953793, "epoch": 0.31330226965278823, "flos": 22235832610560.0, "grad_norm": 1.7577599569913829, "language_loss": 0.80717778, "learning_rate": 3.2155006217933874e-06, "loss": 0.82837206, "num_input_tokens_seen": 111938485, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.4609375, "step": 5211, "time_per_iteration": 2.39263653755188 }, { "auxiliary_loss_clip": 0.01068505, "auxiliary_loss_mlp": 0.0105151, "balance_loss_clip": 1.02070594, "balance_loss_mlp": 1.02130592, "epoch": 0.3133623929054562, "flos": 19753019251200.0, "grad_norm": 2.0914961147355937, "language_loss": 0.8025614, "learning_rate": 3.2151913162547367e-06, "loss": 0.82376158, "num_input_tokens_seen": 111956425, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.47265625, "step": 5212, "time_per_iteration": 2.4355316162109375 }, { "auxiliary_loss_clip": 0.01071788, "auxiliary_loss_mlp": 0.01057313, "balance_loss_clip": 1.02261138, "balance_loss_mlp": 1.02130675, "epoch": 0.31342251615812416, "flos": 27161030108160.0, "grad_norm": 1.8487610218477404, "language_loss": 0.73007202, "learning_rate": 3.2148819646359097e-06, "loss": 0.75136304, "num_input_tokens_seen": 111975915, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.50390625, "step": 5213, "time_per_iteration": 2.444756031036377 }, { "auxiliary_loss_clip": 0.0107359, "auxiliary_loss_mlp": 0.01050688, "balance_loss_clip": 1.01645136, "balance_loss_mlp": 1.02409983, "epoch": 0.31348263941079213, "flos": 20228780615040.0, "grad_norm": 2.199763849710305, "language_loss": 0.78746653, "learning_rate": 3.2145725669486374e-06, "loss": 0.80870932, "num_input_tokens_seen": 111995055, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.49414062, "step": 5214, "time_per_iteration": 2.454724073410034 }, { "auxiliary_loss_clip": 0.01066942, "auxiliary_loss_mlp": 0.0104743, "balance_loss_clip": 1.01833129, "balance_loss_mlp": 1.02131057, "epoch": 0.3135427626634601, "flos": 24606505082880.0, "grad_norm": 1.760291973419956, "language_loss": 0.83633059, "learning_rate": 3.2142631232046517e-06, "loss": 0.85747427, "num_input_tokens_seen": 112015830, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.45703125, "step": 5215, "time_per_iteration": 2.4192135334014893 }, { "auxiliary_loss_clip": 0.01074184, "auxiliary_loss_mlp": 0.01047671, "balance_loss_clip": 1.01355314, "balance_loss_mlp": 1.02384043, "epoch": 0.31360288591612806, "flos": 20958814477440.0, "grad_norm": 1.8626322683080796, "language_loss": 0.80165428, "learning_rate": 3.213953633415686e-06, "loss": 0.82287282, "num_input_tokens_seen": 112035065, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.50390625, "step": 5216, "time_per_iteration": 2.4442548751831055 }, { "auxiliary_loss_clip": 0.01072399, "auxiliary_loss_mlp": 0.01053958, "balance_loss_clip": 1.02036464, "balance_loss_mlp": 1.02276802, "epoch": 0.313663009168796, "flos": 26979272236800.0, "grad_norm": 1.6352702738941254, "language_loss": 0.69741678, "learning_rate": 3.213644097593477e-06, "loss": 0.71868038, "num_input_tokens_seen": 112058405, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.49609375, "step": 5217, "time_per_iteration": 2.457782745361328 }, { "auxiliary_loss_clip": 0.01072437, "auxiliary_loss_mlp": 0.01051547, "balance_loss_clip": 1.018538, "balance_loss_mlp": 1.02316451, "epoch": 0.313723132421464, "flos": 18039935836800.0, "grad_norm": 1.6935756772149768, "language_loss": 0.81814843, "learning_rate": 3.2133345157497624e-06, "loss": 0.83938825, "num_input_tokens_seen": 112076420, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.4921875, "step": 5218, "time_per_iteration": 2.4063880443573 }, { "auxiliary_loss_clip": 0.01072899, "auxiliary_loss_mlp": 0.01055729, "balance_loss_clip": 1.02041924, "balance_loss_mlp": 1.0238266, "epoch": 0.31378325567413196, "flos": 22487905693440.0, "grad_norm": 2.318848433835022, "language_loss": 0.70850694, "learning_rate": 3.2130248878962813e-06, "loss": 0.72979319, "num_input_tokens_seen": 112090775, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.4921875, "step": 5219, "time_per_iteration": 2.38535475730896 }, { "auxiliary_loss_clip": 0.01071439, "auxiliary_loss_mlp": 0.01051888, "balance_loss_clip": 1.02112031, "balance_loss_mlp": 1.02265692, "epoch": 0.3138433789267999, "flos": 22418149063680.0, "grad_norm": 1.8821233299526994, "language_loss": 0.81344795, "learning_rate": 3.2127152140447747e-06, "loss": 0.83468121, "num_input_tokens_seen": 112110980, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.48828125, "step": 5220, "time_per_iteration": 2.4430110454559326 }, { "auxiliary_loss_clip": 0.01071242, "auxiliary_loss_mlp": 0.01053281, "balance_loss_clip": 1.02180982, "balance_loss_mlp": 1.02231085, "epoch": 0.3139035021794679, "flos": 13005076158720.0, "grad_norm": 1.8941626302371748, "language_loss": 0.75200433, "learning_rate": 3.212405494206986e-06, "loss": 0.77324957, "num_input_tokens_seen": 112129020, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.48828125, "step": 5221, "time_per_iteration": 2.3628551959991455 }, { "auxiliary_loss_clip": 0.0106775, "auxiliary_loss_mlp": 0.01047765, "balance_loss_clip": 1.01683009, "balance_loss_mlp": 1.02090764, "epoch": 0.31396362543213585, "flos": 16945059600000.0, "grad_norm": 1.670124918867196, "language_loss": 0.83362567, "learning_rate": 3.2120957283946588e-06, "loss": 0.85478079, "num_input_tokens_seen": 112147865, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.46875, "step": 5222, "time_per_iteration": 2.399254083633423 }, { "auxiliary_loss_clip": 0.01070247, "auxiliary_loss_mlp": 0.01063356, "balance_loss_clip": 1.02549505, "balance_loss_mlp": 1.02042913, "epoch": 0.31402374868480387, "flos": 20155707406080.0, "grad_norm": 2.083163638828032, "language_loss": 0.7261641, "learning_rate": 3.2117859166195407e-06, "loss": 0.74750012, "num_input_tokens_seen": 112166745, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5, "step": 5223, "time_per_iteration": 2.39150333404541 }, { "auxiliary_loss_clip": 0.0106842, "auxiliary_loss_mlp": 0.01050806, "balance_loss_clip": 1.02026498, "balance_loss_mlp": 1.02054203, "epoch": 0.31408387193747184, "flos": 21250025061120.0, "grad_norm": 1.9053527719262828, "language_loss": 0.81848609, "learning_rate": 3.211476058893379e-06, "loss": 0.83967841, "num_input_tokens_seen": 112185895, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.47851562, "step": 5224, "time_per_iteration": 2.4357762336730957 }, { "auxiliary_loss_clip": 0.01072871, "auxiliary_loss_mlp": 0.01064923, "balance_loss_clip": 1.02887392, "balance_loss_mlp": 1.02176905, "epoch": 0.3141439951901398, "flos": 27483208934400.0, "grad_norm": 1.95242971105236, "language_loss": 0.59111428, "learning_rate": 3.2111661552279243e-06, "loss": 0.6124922, "num_input_tokens_seen": 112204465, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.51171875, "step": 5225, "time_per_iteration": 2.4326047897338867 }, { "auxiliary_loss_clip": 0.01066427, "auxiliary_loss_mlp": 0.01044032, "balance_loss_clip": 1.01560092, "balance_loss_mlp": 1.02176571, "epoch": 0.31420411844280777, "flos": 17851440072960.0, "grad_norm": 1.8809420896722737, "language_loss": 0.83780313, "learning_rate": 3.2108562056349273e-06, "loss": 0.8589077, "num_input_tokens_seen": 112221635, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.44726562, "step": 5226, "time_per_iteration": 2.3958053588867188 }, { "auxiliary_loss_clip": 0.01069532, "auxiliary_loss_mlp": 0.01053108, "balance_loss_clip": 1.02106428, "balance_loss_mlp": 1.02131128, "epoch": 0.31426424169547573, "flos": 21615879864960.0, "grad_norm": 1.767596340028278, "language_loss": 0.76043135, "learning_rate": 3.210546210126141e-06, "loss": 0.78165776, "num_input_tokens_seen": 112241240, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.48046875, "step": 5227, "time_per_iteration": 2.407646417617798 }, { "auxiliary_loss_clip": 0.01070162, "auxiliary_loss_mlp": 0.01051891, "balance_loss_clip": 1.01758218, "balance_loss_mlp": 1.02111697, "epoch": 0.3143243649481437, "flos": 30919290589440.0, "grad_norm": 1.8346805571407, "language_loss": 0.69877088, "learning_rate": 3.2102361687133213e-06, "loss": 0.71999133, "num_input_tokens_seen": 112262350, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.49023438, "step": 5228, "time_per_iteration": 2.5092973709106445 }, { "auxiliary_loss_clip": 0.01068533, "auxiliary_loss_mlp": 0.01049786, "balance_loss_clip": 1.0193882, "balance_loss_mlp": 1.02061296, "epoch": 0.31438448820081166, "flos": 22820278636800.0, "grad_norm": 1.7395101257556493, "language_loss": 0.8110733, "learning_rate": 3.2099260814082254e-06, "loss": 0.83225656, "num_input_tokens_seen": 112283710, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.47851562, "step": 5229, "time_per_iteration": 2.4077649116516113 }, { "auxiliary_loss_clip": 0.0106876, "auxiliary_loss_mlp": 0.0104631, "balance_loss_clip": 1.01488662, "balance_loss_mlp": 1.02184606, "epoch": 0.3144446114534796, "flos": 23291082587520.0, "grad_norm": 2.246271709480059, "language_loss": 0.71340632, "learning_rate": 3.209615948222611e-06, "loss": 0.73455697, "num_input_tokens_seen": 112304285, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.46875, "step": 5230, "time_per_iteration": 2.477158546447754 }, { "auxiliary_loss_clip": 0.01069277, "auxiliary_loss_mlp": 0.01045219, "balance_loss_clip": 1.01291358, "balance_loss_mlp": 1.0205431, "epoch": 0.3145047347061476, "flos": 31354692554880.0, "grad_norm": 1.6306315721820377, "language_loss": 0.81126571, "learning_rate": 3.209305769168239e-06, "loss": 0.83241063, "num_input_tokens_seen": 112325110, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.48828125, "step": 5231, "time_per_iteration": 2.4799187183380127 }, { "auxiliary_loss_clip": 0.01068971, "auxiliary_loss_mlp": 0.01051076, "balance_loss_clip": 1.01867533, "balance_loss_mlp": 1.02221942, "epoch": 0.31456485795881556, "flos": 10888780919040.0, "grad_norm": 2.0811146321771026, "language_loss": 0.87015748, "learning_rate": 3.2089955442568704e-06, "loss": 0.8913579, "num_input_tokens_seen": 112339855, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.46875, "step": 5232, "time_per_iteration": 2.398193836212158 }, { "auxiliary_loss_clip": 0.01069661, "auxiliary_loss_mlp": 0.01050436, "balance_loss_clip": 1.02009737, "balance_loss_mlp": 1.02307439, "epoch": 0.3146249812114835, "flos": 17091485308800.0, "grad_norm": 1.6137559948303661, "language_loss": 0.81712723, "learning_rate": 3.2086852735002692e-06, "loss": 0.83832824, "num_input_tokens_seen": 112358480, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.46679688, "step": 5233, "time_per_iteration": 2.387022018432617 }, { "auxiliary_loss_clip": 0.01073691, "auxiliary_loss_mlp": 0.01051242, "balance_loss_clip": 1.01965153, "balance_loss_mlp": 1.02474451, "epoch": 0.3146851044641515, "flos": 55289469479040.0, "grad_norm": 3.312536206334712, "language_loss": 0.72098821, "learning_rate": 3.2083749569102024e-06, "loss": 0.74223757, "num_input_tokens_seen": 112382350, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.49023438, "step": 5234, "time_per_iteration": 2.7314324378967285 }, { "auxiliary_loss_clip": 0.01071037, "auxiliary_loss_mlp": 0.01046917, "balance_loss_clip": 1.01420569, "balance_loss_mlp": 1.02304292, "epoch": 0.31474522771681945, "flos": 27014674222080.0, "grad_norm": 1.8132133769517973, "language_loss": 0.74403185, "learning_rate": 3.2080645944984356e-06, "loss": 0.76521134, "num_input_tokens_seen": 112400260, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.48046875, "step": 5235, "time_per_iteration": 2.4337353706359863 }, { "auxiliary_loss_clip": 0.01070018, "auxiliary_loss_mlp": 0.01051635, "balance_loss_clip": 1.02066469, "balance_loss_mlp": 1.02218592, "epoch": 0.3148053509694875, "flos": 21250862933760.0, "grad_norm": 2.1440482320343195, "language_loss": 0.80217016, "learning_rate": 3.2077541862767384e-06, "loss": 0.82338667, "num_input_tokens_seen": 112419400, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.47851562, "step": 5236, "time_per_iteration": 3.822981595993042 }, { "auxiliary_loss_clip": 0.01073298, "auxiliary_loss_mlp": 0.01051203, "balance_loss_clip": 1.01888537, "balance_loss_mlp": 1.02301085, "epoch": 0.31486547422215544, "flos": 31247334524160.0, "grad_norm": 1.6036461674562097, "language_loss": 0.77764118, "learning_rate": 3.207443732256881e-06, "loss": 0.79888618, "num_input_tokens_seen": 112440825, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.50390625, "step": 5237, "time_per_iteration": 2.4718377590179443 }, { "auxiliary_loss_clip": 0.01067823, "auxiliary_loss_mlp": 0.01049626, "balance_loss_clip": 1.0214088, "balance_loss_mlp": 1.02219653, "epoch": 0.3149255974748234, "flos": 19827593648640.0, "grad_norm": 1.6924257704912489, "language_loss": 0.80578446, "learning_rate": 3.2071332324506372e-06, "loss": 0.82695895, "num_input_tokens_seen": 112459180, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.45703125, "step": 5238, "time_per_iteration": 2.425583600997925 }, { "auxiliary_loss_clip": 0.01024325, "auxiliary_loss_mlp": 0.01012078, "balance_loss_clip": 1.00797677, "balance_loss_mlp": 1.01283383, "epoch": 0.31498572072749137, "flos": 67680981671040.0, "grad_norm": 0.8394430830183599, "language_loss": 0.67995822, "learning_rate": 3.2068226868697795e-06, "loss": 0.70032221, "num_input_tokens_seen": 112516680, "router_z_loss_clip": 0.04101562, "router_z_loss_mlp": 0.11523438, "step": 5239, "time_per_iteration": 4.453044652938843 }, { "auxiliary_loss_clip": 0.01072161, "auxiliary_loss_mlp": 0.01055722, "balance_loss_clip": 1.02234328, "balance_loss_mlp": 1.02191567, "epoch": 0.31504584398015933, "flos": 19792086929280.0, "grad_norm": 2.2192597522232185, "language_loss": 0.84296697, "learning_rate": 3.2065120955260846e-06, "loss": 0.86424577, "num_input_tokens_seen": 112535895, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.50390625, "step": 5240, "time_per_iteration": 2.3748996257781982 }, { "auxiliary_loss_clip": 0.01071374, "auxiliary_loss_mlp": 0.01049716, "balance_loss_clip": 1.01930583, "balance_loss_mlp": 1.02398157, "epoch": 0.3151059672328273, "flos": 26614185482880.0, "grad_norm": 1.7960838303538833, "language_loss": 0.82376146, "learning_rate": 3.2062014584313302e-06, "loss": 0.84497237, "num_input_tokens_seen": 112557490, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.47265625, "step": 5241, "time_per_iteration": 3.897196054458618 }, { "auxiliary_loss_clip": 0.01066698, "auxiliary_loss_mlp": 0.01047129, "balance_loss_clip": 1.01757705, "balance_loss_mlp": 1.0215764, "epoch": 0.31516609048549526, "flos": 24203363080320.0, "grad_norm": 1.7329527624694296, "language_loss": 0.757236, "learning_rate": 3.2058907755972956e-06, "loss": 0.77837425, "num_input_tokens_seen": 112577075, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.45117188, "step": 5242, "time_per_iteration": 3.86982798576355 }, { "auxiliary_loss_clip": 0.01068942, "auxiliary_loss_mlp": 0.01052236, "balance_loss_clip": 1.0203352, "balance_loss_mlp": 1.02181268, "epoch": 0.31522621373816323, "flos": 25957504120320.0, "grad_norm": 2.9761900714500036, "language_loss": 0.75114924, "learning_rate": 3.2055800470357626e-06, "loss": 0.77236104, "num_input_tokens_seen": 112597620, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.47265625, "step": 5243, "time_per_iteration": 2.4343652725219727 }, { "auxiliary_loss_clip": 0.01068552, "auxiliary_loss_mlp": 0.01047885, "balance_loss_clip": 1.0170579, "balance_loss_mlp": 1.02121615, "epoch": 0.3152863369908312, "flos": 21907718853120.0, "grad_norm": 1.8462600918309915, "language_loss": 0.66357994, "learning_rate": 3.205269272758513e-06, "loss": 0.68474436, "num_input_tokens_seen": 112617150, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.47265625, "step": 5244, "time_per_iteration": 2.4286348819732666 }, { "auxiliary_loss_clip": 0.01070185, "auxiliary_loss_mlp": 0.01051101, "balance_loss_clip": 1.01960588, "balance_loss_mlp": 1.02142859, "epoch": 0.31534646024349916, "flos": 16280383536000.0, "grad_norm": 2.2098782574068228, "language_loss": 0.92865026, "learning_rate": 3.2049584527773313e-06, "loss": 0.9498632, "num_input_tokens_seen": 112631090, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.48632812, "step": 5245, "time_per_iteration": 2.3441379070281982 }, { "auxiliary_loss_clip": 0.01069258, "auxiliary_loss_mlp": 0.01050242, "balance_loss_clip": 1.01939106, "balance_loss_mlp": 1.02136993, "epoch": 0.3154065834961671, "flos": 24716097440640.0, "grad_norm": 1.72139007021704, "language_loss": 0.76803029, "learning_rate": 3.2046475871040048e-06, "loss": 0.78922534, "num_input_tokens_seen": 112651220, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.48046875, "step": 5246, "time_per_iteration": 2.4571502208709717 }, { "auxiliary_loss_clip": 0.01068677, "auxiliary_loss_mlp": 0.0105076, "balance_loss_clip": 1.01855016, "balance_loss_mlp": 1.02088571, "epoch": 0.3154667067488351, "flos": 35369704241280.0, "grad_norm": 1.5190253110627694, "language_loss": 0.6208325, "learning_rate": 3.204336675750321e-06, "loss": 0.64202684, "num_input_tokens_seen": 112671560, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.47851562, "step": 5247, "time_per_iteration": 2.512319564819336 }, { "auxiliary_loss_clip": 0.01070628, "auxiliary_loss_mlp": 0.01051479, "balance_loss_clip": 1.01817238, "balance_loss_mlp": 1.02130365, "epoch": 0.31552683000150306, "flos": 17455524721920.0, "grad_norm": 2.3255917622268445, "language_loss": 0.84753036, "learning_rate": 3.2040257187280693e-06, "loss": 0.86875147, "num_input_tokens_seen": 112689790, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.49414062, "step": 5248, "time_per_iteration": 2.469712734222412 }, { "auxiliary_loss_clip": 0.01068358, "auxiliary_loss_mlp": 0.01052204, "balance_loss_clip": 1.01956415, "balance_loss_mlp": 1.02134681, "epoch": 0.3155869532541711, "flos": 18404778211200.0, "grad_norm": 1.9934699823643216, "language_loss": 0.8708899, "learning_rate": 3.2037147160490423e-06, "loss": 0.89209557, "num_input_tokens_seen": 112708265, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.47070312, "step": 5249, "time_per_iteration": 2.410456895828247 }, { "auxiliary_loss_clip": 0.01070674, "auxiliary_loss_mlp": 0.01055399, "balance_loss_clip": 1.02187753, "balance_loss_mlp": 1.02207613, "epoch": 0.31564707650683904, "flos": 21578697400320.0, "grad_norm": 1.9296998845635263, "language_loss": 0.87784648, "learning_rate": 3.2034036677250322e-06, "loss": 0.89910728, "num_input_tokens_seen": 112727820, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.484375, "step": 5250, "time_per_iteration": 2.446606397628784 }, { "auxiliary_loss_clip": 0.01069997, "auxiliary_loss_mlp": 0.01050545, "balance_loss_clip": 1.0177505, "balance_loss_mlp": 1.02213836, "epoch": 0.315707199759507, "flos": 21029967561600.0, "grad_norm": 2.710296723764754, "language_loss": 0.7099719, "learning_rate": 3.203092573767835e-06, "loss": 0.73117733, "num_input_tokens_seen": 112743140, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.48046875, "step": 5251, "time_per_iteration": 2.3985767364501953 }, { "auxiliary_loss_clip": 0.01070667, "auxiliary_loss_mlp": 0.01055449, "balance_loss_clip": 1.02128327, "balance_loss_mlp": 1.02278113, "epoch": 0.31576732301217497, "flos": 26827784380800.0, "grad_norm": 1.7197688231866093, "language_loss": 0.79083216, "learning_rate": 3.202781434189246e-06, "loss": 0.81209338, "num_input_tokens_seen": 112764705, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.47851562, "step": 5252, "time_per_iteration": 2.493068218231201 }, { "auxiliary_loss_clip": 0.01071519, "auxiliary_loss_mlp": 0.0105044, "balance_loss_clip": 1.01966047, "balance_loss_mlp": 1.02465093, "epoch": 0.31582744626484294, "flos": 22710057874560.0, "grad_norm": 1.970590278792238, "language_loss": 0.75033164, "learning_rate": 3.202470249001066e-06, "loss": 0.77155125, "num_input_tokens_seen": 112785310, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.46875, "step": 5253, "time_per_iteration": 2.4654693603515625 }, { "auxiliary_loss_clip": 0.01072753, "auxiliary_loss_mlp": 0.01057657, "balance_loss_clip": 1.02406418, "balance_loss_mlp": 1.02361798, "epoch": 0.3158875695175109, "flos": 23950766327040.0, "grad_norm": 2.1234512084127037, "language_loss": 0.75584072, "learning_rate": 3.2021590182150924e-06, "loss": 0.77714479, "num_input_tokens_seen": 112802905, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.4921875, "step": 5254, "time_per_iteration": 2.454055070877075 }, { "auxiliary_loss_clip": 0.0107498, "auxiliary_loss_mlp": 0.01050053, "balance_loss_clip": 1.01831937, "balance_loss_mlp": 1.02448297, "epoch": 0.31594769277017887, "flos": 13261024402560.0, "grad_norm": 2.1552508075454977, "language_loss": 0.7905091, "learning_rate": 3.201847741843128e-06, "loss": 0.81175947, "num_input_tokens_seen": 112820305, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.50390625, "step": 5255, "time_per_iteration": 2.4090025424957275 }, { "auxiliary_loss_clip": 0.01075315, "auxiliary_loss_mlp": 0.01049684, "balance_loss_clip": 1.01691365, "balance_loss_mlp": 1.02617776, "epoch": 0.31600781602284683, "flos": 23367123262080.0, "grad_norm": 1.7390802481407164, "language_loss": 0.79582548, "learning_rate": 3.2015364198969772e-06, "loss": 0.81707543, "num_input_tokens_seen": 112841185, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.4921875, "step": 5256, "time_per_iteration": 2.4355077743530273 }, { "auxiliary_loss_clip": 0.01068754, "auxiliary_loss_mlp": 0.01050353, "balance_loss_clip": 1.01928675, "balance_loss_mlp": 1.02273667, "epoch": 0.3160679392755148, "flos": 19827558737280.0, "grad_norm": 1.5768911733651956, "language_loss": 0.72140604, "learning_rate": 3.2012250523884453e-06, "loss": 0.7425971, "num_input_tokens_seen": 112860570, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.4609375, "step": 5257, "time_per_iteration": 2.387295961380005 }, { "auxiliary_loss_clip": 0.01073865, "auxiliary_loss_mlp": 0.01054736, "balance_loss_clip": 1.0198555, "balance_loss_mlp": 1.02383232, "epoch": 0.31612806252818276, "flos": 20192191643520.0, "grad_norm": 6.306783600694465, "language_loss": 0.7848593, "learning_rate": 3.2009136393293393e-06, "loss": 0.80614531, "num_input_tokens_seen": 112877975, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.5, "step": 5258, "time_per_iteration": 2.408771514892578 }, { "auxiliary_loss_clip": 0.01074496, "auxiliary_loss_mlp": 0.01058635, "balance_loss_clip": 1.02432621, "balance_loss_mlp": 1.02512383, "epoch": 0.31618818578085073, "flos": 24235029550080.0, "grad_norm": 2.5753877980233004, "language_loss": 0.74357021, "learning_rate": 3.200602180731467e-06, "loss": 0.76490152, "num_input_tokens_seen": 112896170, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.49414062, "step": 5259, "time_per_iteration": 2.437163829803467 }, { "auxiliary_loss_clip": 0.01075676, "auxiliary_loss_mlp": 0.0105796, "balance_loss_clip": 1.02553475, "balance_loss_mlp": 1.02428055, "epoch": 0.3162483090335187, "flos": 25080695435520.0, "grad_norm": 2.044464727386963, "language_loss": 0.67675292, "learning_rate": 3.20029067660664e-06, "loss": 0.69808924, "num_input_tokens_seen": 112916180, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.515625, "step": 5260, "time_per_iteration": 2.4226109981536865 }, { "auxiliary_loss_clip": 0.01072007, "auxiliary_loss_mlp": 0.01047656, "balance_loss_clip": 1.01389575, "balance_loss_mlp": 1.02192974, "epoch": 0.31630843228618666, "flos": 26322171937920.0, "grad_norm": 1.638706927547112, "language_loss": 0.73197103, "learning_rate": 3.1999791269666706e-06, "loss": 0.75316763, "num_input_tokens_seen": 112936745, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.5, "step": 5261, "time_per_iteration": 2.4610073566436768 }, { "auxiliary_loss_clip": 0.01022059, "auxiliary_loss_mlp": 0.01009432, "balance_loss_clip": 1.00559306, "balance_loss_mlp": 1.01103616, "epoch": 0.3163685555388547, "flos": 66754839502080.0, "grad_norm": 0.7625295196193397, "language_loss": 0.50722063, "learning_rate": 3.1996675318233716e-06, "loss": 0.52753556, "num_input_tokens_seen": 112994845, "router_z_loss_clip": 0.03833008, "router_z_loss_mlp": 0.11035156, "step": 5262, "time_per_iteration": 3.0610902309417725 }, { "auxiliary_loss_clip": 0.01074981, "auxiliary_loss_mlp": 0.0106131, "balance_loss_clip": 1.02852726, "balance_loss_mlp": 1.02388692, "epoch": 0.31642867879152264, "flos": 25994442205440.0, "grad_norm": 1.4728549589476287, "language_loss": 0.86351657, "learning_rate": 3.19935589118856e-06, "loss": 0.88487947, "num_input_tokens_seen": 113015125, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.51171875, "step": 5263, "time_per_iteration": 2.4337849617004395 }, { "auxiliary_loss_clip": 0.01068987, "auxiliary_loss_mlp": 0.01057335, "balance_loss_clip": 1.0253396, "balance_loss_mlp": 1.02184272, "epoch": 0.3164888020441906, "flos": 25773791212800.0, "grad_norm": 1.5763325679328357, "language_loss": 0.82868755, "learning_rate": 3.1990442050740535e-06, "loss": 0.84995079, "num_input_tokens_seen": 113035535, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.47265625, "step": 5264, "time_per_iteration": 2.4524686336517334 }, { "auxiliary_loss_clip": 0.01073586, "auxiliary_loss_mlp": 0.01056095, "balance_loss_clip": 1.02169156, "balance_loss_mlp": 1.02223945, "epoch": 0.3165489252968586, "flos": 19755183755520.0, "grad_norm": 1.8930375384932434, "language_loss": 0.81388956, "learning_rate": 3.19873247349167e-06, "loss": 0.83518636, "num_input_tokens_seen": 113052720, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.51171875, "step": 5265, "time_per_iteration": 2.4012763500213623 }, { "auxiliary_loss_clip": 0.01071697, "auxiliary_loss_mlp": 0.01058955, "balance_loss_clip": 1.02078414, "balance_loss_mlp": 1.02177775, "epoch": 0.31660904854952654, "flos": 23182851772800.0, "grad_norm": 1.9229585886148168, "language_loss": 0.76246703, "learning_rate": 3.1984206964532307e-06, "loss": 0.78377354, "num_input_tokens_seen": 113071435, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.5, "step": 5266, "time_per_iteration": 2.436870813369751 }, { "auxiliary_loss_clip": 0.01071027, "auxiliary_loss_mlp": 0.01052594, "balance_loss_clip": 1.01861906, "balance_loss_mlp": 1.02046442, "epoch": 0.3166691718021945, "flos": 20407571020800.0, "grad_norm": 2.280663168675178, "language_loss": 0.79843605, "learning_rate": 3.1981088739705585e-06, "loss": 0.81967223, "num_input_tokens_seen": 113088645, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.5078125, "step": 5267, "time_per_iteration": 2.412386417388916 }, { "auxiliary_loss_clip": 0.01015678, "auxiliary_loss_mlp": 0.01009982, "balance_loss_clip": 1.00635767, "balance_loss_mlp": 1.00515652, "epoch": 0.31672929505486247, "flos": 70141275336960.0, "grad_norm": 0.7408640447911711, "language_loss": 0.57927114, "learning_rate": 3.197797006055478e-06, "loss": 0.59952772, "num_input_tokens_seen": 113152775, "router_z_loss_clip": 0.03613281, "router_z_loss_mlp": 0.10546875, "step": 5268, "time_per_iteration": 3.026930809020996 }, { "auxiliary_loss_clip": 0.01070165, "auxiliary_loss_mlp": 0.01047971, "balance_loss_clip": 1.01442599, "balance_loss_mlp": 1.02050269, "epoch": 0.31678941830753043, "flos": 14354888209920.0, "grad_norm": 2.3712846312568474, "language_loss": 0.75493264, "learning_rate": 3.197485092719815e-06, "loss": 0.77611399, "num_input_tokens_seen": 113171410, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.49609375, "step": 5269, "time_per_iteration": 2.360624074935913 }, { "auxiliary_loss_clip": 0.01070197, "auxiliary_loss_mlp": 0.01069823, "balance_loss_clip": 1.03413177, "balance_loss_mlp": 1.02131486, "epoch": 0.3168495415601984, "flos": 22746611934720.0, "grad_norm": 1.7384246295876193, "language_loss": 0.80942857, "learning_rate": 3.1971731339753973e-06, "loss": 0.83082879, "num_input_tokens_seen": 113189965, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.48828125, "step": 5270, "time_per_iteration": 2.4235222339630127 }, { "auxiliary_loss_clip": 0.01073866, "auxiliary_loss_mlp": 0.01069361, "balance_loss_clip": 1.03247738, "balance_loss_mlp": 1.02196133, "epoch": 0.31690966481286637, "flos": 20114370489600.0, "grad_norm": 2.0868920888820037, "language_loss": 0.80817556, "learning_rate": 3.1968611298340545e-06, "loss": 0.82960784, "num_input_tokens_seen": 113206355, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.51953125, "step": 5271, "time_per_iteration": 2.379120111465454 }, { "auxiliary_loss_clip": 0.01072509, "auxiliary_loss_mlp": 0.01063021, "balance_loss_clip": 1.026829, "balance_loss_mlp": 1.02233291, "epoch": 0.31696978806553433, "flos": 21177859547520.0, "grad_norm": 1.8181744179150368, "language_loss": 0.74974525, "learning_rate": 3.1965490803076173e-06, "loss": 0.77110058, "num_input_tokens_seen": 113225440, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.5, "step": 5272, "time_per_iteration": 2.4362761974334717 }, { "auxiliary_loss_clip": 0.01073373, "auxiliary_loss_mlp": 0.0105485, "balance_loss_clip": 1.01861012, "balance_loss_mlp": 1.02208471, "epoch": 0.3170299113182023, "flos": 42995363713920.0, "grad_norm": 2.011412551998453, "language_loss": 0.70186627, "learning_rate": 3.1962369854079194e-06, "loss": 0.72314847, "num_input_tokens_seen": 113248840, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.51171875, "step": 5273, "time_per_iteration": 2.628800630569458 }, { "auxiliary_loss_clip": 0.01072598, "auxiliary_loss_mlp": 0.01060162, "balance_loss_clip": 1.02642608, "balance_loss_mlp": 1.0233686, "epoch": 0.31709003457087026, "flos": 24459066944640.0, "grad_norm": 1.7913170699868086, "language_loss": 0.69304222, "learning_rate": 3.195924845146795e-06, "loss": 0.71436983, "num_input_tokens_seen": 113269630, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.4921875, "step": 5274, "time_per_iteration": 2.4788818359375 }, { "auxiliary_loss_clip": 0.01073509, "auxiliary_loss_mlp": 0.01060306, "balance_loss_clip": 1.02816677, "balance_loss_mlp": 1.02680719, "epoch": 0.3171501578235382, "flos": 24134130120960.0, "grad_norm": 1.6739342046005805, "language_loss": 0.81407428, "learning_rate": 3.195612659536081e-06, "loss": 0.8354125, "num_input_tokens_seen": 113291200, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.46679688, "step": 5275, "time_per_iteration": 3.846859931945801 }, { "auxiliary_loss_clip": 0.01075251, "auxiliary_loss_mlp": 0.01061134, "balance_loss_clip": 1.02658713, "balance_loss_mlp": 1.02552605, "epoch": 0.31721028107620625, "flos": 18878724184320.0, "grad_norm": 2.650343740689562, "language_loss": 0.74914122, "learning_rate": 3.1953004285876147e-06, "loss": 0.77050507, "num_input_tokens_seen": 113310170, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.49609375, "step": 5276, "time_per_iteration": 2.417881965637207 }, { "auxiliary_loss_clip": 0.01076278, "auxiliary_loss_mlp": 0.0104749, "balance_loss_clip": 1.01634061, "balance_loss_mlp": 1.02888978, "epoch": 0.3172704043288742, "flos": 23146786471680.0, "grad_norm": 5.47447096409296, "language_loss": 0.79065847, "learning_rate": 3.194988152313236e-06, "loss": 0.81189609, "num_input_tokens_seen": 113331140, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.47265625, "step": 5277, "time_per_iteration": 2.4378886222839355 }, { "auxiliary_loss_clip": 0.01080307, "auxiliary_loss_mlp": 0.01054868, "balance_loss_clip": 1.02055907, "balance_loss_mlp": 1.02998543, "epoch": 0.3173305275815422, "flos": 17857549560960.0, "grad_norm": 1.6813324270739345, "language_loss": 0.80767864, "learning_rate": 3.1946758307247878e-06, "loss": 0.82903039, "num_input_tokens_seen": 113350030, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.50390625, "step": 5278, "time_per_iteration": 3.8530702590942383 }, { "auxiliary_loss_clip": 0.01040007, "auxiliary_loss_mlp": 0.01035389, "balance_loss_clip": 1.03133547, "balance_loss_mlp": 1.02890253, "epoch": 0.31739065083421014, "flos": 59971042442880.0, "grad_norm": 0.8956242460750529, "language_loss": 0.62866634, "learning_rate": 3.1943634638341114e-06, "loss": 0.64942026, "num_input_tokens_seen": 113395820, "router_z_loss_clip": 0.04052734, "router_z_loss_mlp": 0.11132812, "step": 5279, "time_per_iteration": 2.8170394897460938 }, { "auxiliary_loss_clip": 0.01082289, "auxiliary_loss_mlp": 0.01059741, "balance_loss_clip": 1.02445519, "balance_loss_mlp": 1.03040886, "epoch": 0.3174507740868781, "flos": 23799976698240.0, "grad_norm": 1.8541126050614032, "language_loss": 0.82302749, "learning_rate": 3.194051051653053e-06, "loss": 0.84444785, "num_input_tokens_seen": 113416835, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.51953125, "step": 5280, "time_per_iteration": 3.9273319244384766 }, { "auxiliary_loss_clip": 0.01080356, "auxiliary_loss_mlp": 0.01069687, "balance_loss_clip": 1.03404343, "balance_loss_mlp": 1.03177142, "epoch": 0.31751089733954607, "flos": 27637594433280.0, "grad_norm": 1.5320587096582174, "language_loss": 0.79704827, "learning_rate": 3.19373859419346e-06, "loss": 0.81854868, "num_input_tokens_seen": 113440850, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.48632812, "step": 5281, "time_per_iteration": 3.9269840717315674 }, { "auxiliary_loss_clip": 0.01079069, "auxiliary_loss_mlp": 0.01054765, "balance_loss_clip": 1.02057624, "balance_loss_mlp": 1.03009272, "epoch": 0.31757102059221404, "flos": 23768135671680.0, "grad_norm": 1.6108709873997022, "language_loss": 0.7994746, "learning_rate": 3.193426091467179e-06, "loss": 0.82081294, "num_input_tokens_seen": 113461000, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.49023438, "step": 5282, "time_per_iteration": 2.467024087905884 }, { "auxiliary_loss_clip": 0.01082038, "auxiliary_loss_mlp": 0.01079624, "balance_loss_clip": 1.04319346, "balance_loss_mlp": 1.03053927, "epoch": 0.317631143844882, "flos": 25263361002240.0, "grad_norm": 2.120713385321906, "language_loss": 0.69387424, "learning_rate": 3.193113543486061e-06, "loss": 0.71549082, "num_input_tokens_seen": 113480820, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.515625, "step": 5283, "time_per_iteration": 2.4508302211761475 }, { "auxiliary_loss_clip": 0.01034238, "auxiliary_loss_mlp": 0.01010851, "balance_loss_clip": 1.00751317, "balance_loss_mlp": 1.02221727, "epoch": 0.31769126709754997, "flos": 55823290300800.0, "grad_norm": 0.7342500911289369, "language_loss": 0.52929211, "learning_rate": 3.192800950261958e-06, "loss": 0.54974294, "num_input_tokens_seen": 113536910, "router_z_loss_clip": 0.03344727, "router_z_loss_mlp": 0.12011719, "step": 5284, "time_per_iteration": 3.0201048851013184 }, { "auxiliary_loss_clip": 0.01078478, "auxiliary_loss_mlp": 0.0109157, "balance_loss_clip": 1.05661774, "balance_loss_mlp": 1.02829397, "epoch": 0.31775139035021793, "flos": 16689635026560.0, "grad_norm": 1.7868761006537501, "language_loss": 0.7209605, "learning_rate": 3.1924883118067235e-06, "loss": 0.74266094, "num_input_tokens_seen": 113555480, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.5, "step": 5285, "time_per_iteration": 2.3896243572235107 }, { "auxiliary_loss_clip": 0.01024316, "auxiliary_loss_mlp": 0.01040542, "balance_loss_clip": 1.03646517, "balance_loss_mlp": 1.01171994, "epoch": 0.3178115136028859, "flos": 64224090979200.0, "grad_norm": 0.8497457265122933, "language_loss": 0.60613078, "learning_rate": 3.1921756281322123e-06, "loss": 0.62677938, "num_input_tokens_seen": 113616790, "router_z_loss_clip": 0.04077148, "router_z_loss_mlp": 0.12597656, "step": 5286, "time_per_iteration": 3.098567008972168 }, { "auxiliary_loss_clip": 0.0107285, "auxiliary_loss_mlp": 0.0109999, "balance_loss_clip": 1.06680202, "balance_loss_mlp": 1.02351356, "epoch": 0.31787163685555386, "flos": 18696477553920.0, "grad_norm": 2.0064110287313324, "language_loss": 0.73250329, "learning_rate": 3.1918628992502826e-06, "loss": 0.75423169, "num_input_tokens_seen": 113635320, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.4921875, "step": 5287, "time_per_iteration": 2.394767999649048 }, { "auxiliary_loss_clip": 0.01074051, "auxiliary_loss_mlp": 0.01101593, "balance_loss_clip": 1.06614041, "balance_loss_mlp": 1.02404046, "epoch": 0.31793176010822183, "flos": 21323691763200.0, "grad_norm": 1.7471313968381195, "language_loss": 0.77178717, "learning_rate": 3.191550125172792e-06, "loss": 0.79354358, "num_input_tokens_seen": 113654000, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.5, "step": 5288, "time_per_iteration": 2.4358932971954346 }, { "auxiliary_loss_clip": 0.01069275, "auxiliary_loss_mlp": 0.01084294, "balance_loss_clip": 1.05544531, "balance_loss_mlp": 1.02318287, "epoch": 0.31799188336088985, "flos": 20957662402560.0, "grad_norm": 1.6415278801456097, "language_loss": 0.89464515, "learning_rate": 3.1912373059116007e-06, "loss": 0.91618085, "num_input_tokens_seen": 113672375, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4609375, "step": 5289, "time_per_iteration": 2.4164464473724365 }, { "auxiliary_loss_clip": 0.01074263, "auxiliary_loss_mlp": 0.01094781, "balance_loss_clip": 1.06273723, "balance_loss_mlp": 1.02794003, "epoch": 0.3180520066135578, "flos": 22490838247680.0, "grad_norm": 1.5507489212056371, "language_loss": 0.69412875, "learning_rate": 3.190924441478572e-06, "loss": 0.71581924, "num_input_tokens_seen": 113692385, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.46289062, "step": 5290, "time_per_iteration": 2.4639391899108887 }, { "auxiliary_loss_clip": 0.01084965, "auxiliary_loss_mlp": 0.01076148, "balance_loss_clip": 1.03907359, "balance_loss_mlp": 1.0330205, "epoch": 0.3181121298662258, "flos": 27234103317120.0, "grad_norm": 1.7114450717499257, "language_loss": 0.80361569, "learning_rate": 3.1906115318855687e-06, "loss": 0.82522684, "num_input_tokens_seen": 113712145, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.51953125, "step": 5291, "time_per_iteration": 2.4788317680358887 }, { "auxiliary_loss_clip": 0.01085228, "auxiliary_loss_mlp": 0.0106393, "balance_loss_clip": 1.0275712, "balance_loss_mlp": 1.03501868, "epoch": 0.31817225311889374, "flos": 23179186080000.0, "grad_norm": 1.7881264918587272, "language_loss": 0.81282347, "learning_rate": 3.1902985771444577e-06, "loss": 0.83431506, "num_input_tokens_seen": 113731435, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.5, "step": 5292, "time_per_iteration": 2.487516164779663 }, { "auxiliary_loss_clip": 0.01085817, "auxiliary_loss_mlp": 0.01045557, "balance_loss_clip": 1.01381111, "balance_loss_mlp": 1.03916824, "epoch": 0.3182323763715617, "flos": 23257670549760.0, "grad_norm": 1.533348268992777, "language_loss": 0.76398575, "learning_rate": 3.1899855772671043e-06, "loss": 0.78529948, "num_input_tokens_seen": 113750825, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.46679688, "step": 5293, "time_per_iteration": 2.443641424179077 }, { "auxiliary_loss_clip": 0.01094514, "auxiliary_loss_mlp": 0.01047447, "balance_loss_clip": 1.01597607, "balance_loss_mlp": 1.045084, "epoch": 0.3182924996242297, "flos": 29015581818240.0, "grad_norm": 1.9636175822456505, "language_loss": 0.76262188, "learning_rate": 3.189672532265379e-06, "loss": 0.78404152, "num_input_tokens_seen": 113770010, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.49414062, "step": 5294, "time_per_iteration": 2.505371570587158 }, { "auxiliary_loss_clip": 0.01098672, "auxiliary_loss_mlp": 0.01058051, "balance_loss_clip": 1.01926041, "balance_loss_mlp": 1.04595006, "epoch": 0.31835262287689764, "flos": 20448139887360.0, "grad_norm": 1.9088344499108978, "language_loss": 0.77599728, "learning_rate": 3.189359442151152e-06, "loss": 0.79756457, "num_input_tokens_seen": 113788640, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.52734375, "step": 5295, "time_per_iteration": 2.415400266647339 }, { "auxiliary_loss_clip": 0.01100138, "auxiliary_loss_mlp": 0.01065009, "balance_loss_clip": 1.03031945, "balance_loss_mlp": 1.04638052, "epoch": 0.3184127461295656, "flos": 25118296836480.0, "grad_norm": 1.6424314819835317, "language_loss": 0.70547593, "learning_rate": 3.189046306936296e-06, "loss": 0.72712743, "num_input_tokens_seen": 113809515, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.5390625, "step": 5296, "time_per_iteration": 2.5075509548187256 }, { "auxiliary_loss_clip": 0.01100821, "auxiliary_loss_mlp": 0.01070356, "balance_loss_clip": 1.03745425, "balance_loss_mlp": 1.04839766, "epoch": 0.31847286938223357, "flos": 25550207665920.0, "grad_norm": 1.8891895755792623, "language_loss": 0.7905463, "learning_rate": 3.1887331266326846e-06, "loss": 0.81225812, "num_input_tokens_seen": 113829770, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.5234375, "step": 5297, "time_per_iteration": 2.477365016937256 }, { "auxiliary_loss_clip": 0.01099653, "auxiliary_loss_mlp": 0.01072477, "balance_loss_clip": 1.03609395, "balance_loss_mlp": 1.04707313, "epoch": 0.31853299263490154, "flos": 27781227233280.0, "grad_norm": 1.9791655771332406, "language_loss": 0.79975033, "learning_rate": 3.1884199012521942e-06, "loss": 0.82147157, "num_input_tokens_seen": 113849320, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.52734375, "step": 5298, "time_per_iteration": 2.559987783432007 }, { "auxiliary_loss_clip": 0.01101713, "auxiliary_loss_mlp": 0.01076001, "balance_loss_clip": 1.04111993, "balance_loss_mlp": 1.04622579, "epoch": 0.3185931158875695, "flos": 22705763777280.0, "grad_norm": 2.4145989260791136, "language_loss": 0.75689077, "learning_rate": 3.1881066308067016e-06, "loss": 0.77866781, "num_input_tokens_seen": 113867860, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.5546875, "step": 5299, "time_per_iteration": 2.4582860469818115 }, { "auxiliary_loss_clip": 0.01101526, "auxiliary_loss_mlp": 0.0108208, "balance_loss_clip": 1.04748595, "balance_loss_mlp": 1.04648376, "epoch": 0.31865323914023747, "flos": 24570369959040.0, "grad_norm": 1.9904173455086722, "language_loss": 0.79698408, "learning_rate": 3.1877933153080873e-06, "loss": 0.81882012, "num_input_tokens_seen": 113886375, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.55078125, "step": 5300, "time_per_iteration": 2.4962432384490967 }, { "auxiliary_loss_clip": 0.01095643, "auxiliary_loss_mlp": 0.01083145, "balance_loss_clip": 1.04402018, "balance_loss_mlp": 1.04237199, "epoch": 0.31871336239290543, "flos": 18185593495680.0, "grad_norm": 2.1407910914103168, "language_loss": 0.8577677, "learning_rate": 3.1874799547682304e-06, "loss": 0.87955558, "num_input_tokens_seen": 113904065, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.53125, "step": 5301, "time_per_iteration": 2.393170118331909 }, { "auxiliary_loss_clip": 0.01093402, "auxiliary_loss_mlp": 0.01087174, "balance_loss_clip": 1.04688096, "balance_loss_mlp": 1.04271543, "epoch": 0.31877348564557345, "flos": 21825917804160.0, "grad_norm": 2.783000342552586, "language_loss": 0.78815091, "learning_rate": 3.187166549199015e-06, "loss": 0.80995661, "num_input_tokens_seen": 113918415, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.50390625, "step": 5302, "time_per_iteration": 2.470381498336792 }, { "auxiliary_loss_clip": 0.01088578, "auxiliary_loss_mlp": 0.01073046, "balance_loss_clip": 1.03668737, "balance_loss_mlp": 1.03935909, "epoch": 0.3188336088982414, "flos": 22014239011200.0, "grad_norm": 1.880056496055087, "language_loss": 0.81230628, "learning_rate": 3.1868530986123255e-06, "loss": 0.83392251, "num_input_tokens_seen": 113938135, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.4921875, "step": 5303, "time_per_iteration": 2.4180784225463867 }, { "auxiliary_loss_clip": 0.01094333, "auxiliary_loss_mlp": 0.01082376, "balance_loss_clip": 1.04384804, "balance_loss_mlp": 1.03911197, "epoch": 0.3188937321509094, "flos": 20046848186880.0, "grad_norm": 3.802462332914547, "language_loss": 0.75231647, "learning_rate": 3.186539603020047e-06, "loss": 0.77408355, "num_input_tokens_seen": 113957125, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.55078125, "step": 5304, "time_per_iteration": 2.4787485599517822 }, { "auxiliary_loss_clip": 0.01085053, "auxiliary_loss_mlp": 0.01060688, "balance_loss_clip": 1.02884746, "balance_loss_mlp": 1.03637123, "epoch": 0.31895385540357735, "flos": 25846934244480.0, "grad_norm": 1.7887476071716195, "language_loss": 0.73590744, "learning_rate": 3.186226062434068e-06, "loss": 0.75736481, "num_input_tokens_seen": 113974875, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.48632812, "step": 5305, "time_per_iteration": 2.448594331741333 }, { "auxiliary_loss_clip": 0.0108642, "auxiliary_loss_mlp": 0.01071605, "balance_loss_clip": 1.03722465, "balance_loss_mlp": 1.03543019, "epoch": 0.3190139786562453, "flos": 23476575974400.0, "grad_norm": 2.155738980986795, "language_loss": 0.65139675, "learning_rate": 3.1859124768662778e-06, "loss": 0.67297703, "num_input_tokens_seen": 113994450, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.5078125, "step": 5306, "time_per_iteration": 2.499844551086426 }, { "auxiliary_loss_clip": 0.01082657, "auxiliary_loss_mlp": 0.01056749, "balance_loss_clip": 1.02060509, "balance_loss_mlp": 1.03228605, "epoch": 0.3190741019089133, "flos": 29094275756160.0, "grad_norm": 2.144802732078294, "language_loss": 0.81708348, "learning_rate": 3.1855988463285678e-06, "loss": 0.83847749, "num_input_tokens_seen": 114013945, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.50390625, "step": 5307, "time_per_iteration": 2.4775967597961426 }, { "auxiliary_loss_clip": 0.0107849, "auxiliary_loss_mlp": 0.0106261, "balance_loss_clip": 1.02851605, "balance_loss_mlp": 1.03045678, "epoch": 0.31913422516158124, "flos": 17128563039360.0, "grad_norm": 1.7086527069488036, "language_loss": 0.79128689, "learning_rate": 3.1852851708328308e-06, "loss": 0.81269783, "num_input_tokens_seen": 114031375, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.48046875, "step": 5308, "time_per_iteration": 2.42376971244812 }, { "auxiliary_loss_clip": 0.01087035, "auxiliary_loss_mlp": 0.01069122, "balance_loss_clip": 1.02634966, "balance_loss_mlp": 1.030756, "epoch": 0.3191943484142492, "flos": 16068949142400.0, "grad_norm": 2.360625776253197, "language_loss": 0.76489568, "learning_rate": 3.184971450390961e-06, "loss": 0.7864573, "num_input_tokens_seen": 114048465, "router_z_loss_clip": 0.42773438, "router_z_loss_mlp": 0.5625, "step": 5309, "time_per_iteration": 2.387672185897827 }, { "auxiliary_loss_clip": 0.01079498, "auxiliary_loss_mlp": 0.01052683, "balance_loss_clip": 1.01787376, "balance_loss_mlp": 1.02912009, "epoch": 0.3192544716669172, "flos": 22965063511680.0, "grad_norm": 3.0057776187950913, "language_loss": 0.84225726, "learning_rate": 3.184657685014856e-06, "loss": 0.86357903, "num_input_tokens_seen": 114068415, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.50390625, "step": 5310, "time_per_iteration": 2.4562137126922607 }, { "auxiliary_loss_clip": 0.01076832, "auxiliary_loss_mlp": 0.01050083, "balance_loss_clip": 1.01687098, "balance_loss_mlp": 1.02764034, "epoch": 0.31931459491958514, "flos": 26869121297280.0, "grad_norm": 1.513019590978647, "language_loss": 0.79699457, "learning_rate": 3.184343874716412e-06, "loss": 0.81826377, "num_input_tokens_seen": 114088565, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.4921875, "step": 5311, "time_per_iteration": 2.4525325298309326 }, { "auxiliary_loss_clip": 0.01078126, "auxiliary_loss_mlp": 0.01057684, "balance_loss_clip": 1.02323258, "balance_loss_mlp": 1.02821243, "epoch": 0.3193747181722531, "flos": 21835413694080.0, "grad_norm": 1.869547838031828, "language_loss": 0.8585124, "learning_rate": 3.1840300195075295e-06, "loss": 0.87987041, "num_input_tokens_seen": 114107160, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.5, "step": 5312, "time_per_iteration": 2.458223581314087 }, { "auxiliary_loss_clip": 0.01083656, "auxiliary_loss_mlp": 0.01063879, "balance_loss_clip": 1.02501643, "balance_loss_mlp": 1.0298574, "epoch": 0.31943484142492107, "flos": 18324233971200.0, "grad_norm": 4.535159417530678, "language_loss": 0.814973, "learning_rate": 3.1837161194001102e-06, "loss": 0.83644831, "num_input_tokens_seen": 114123420, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.5390625, "step": 5313, "time_per_iteration": 2.3895158767700195 }, { "auxiliary_loss_clip": 0.01078826, "auxiliary_loss_mlp": 0.01052771, "balance_loss_clip": 1.01777077, "balance_loss_mlp": 1.02925158, "epoch": 0.31949496467758903, "flos": 21614762701440.0, "grad_norm": 2.0690697916635923, "language_loss": 0.88905668, "learning_rate": 3.183402174406057e-06, "loss": 0.91037261, "num_input_tokens_seen": 114139230, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.49609375, "step": 5314, "time_per_iteration": 3.9060401916503906 }, { "auxiliary_loss_clip": 0.0107897, "auxiliary_loss_mlp": 0.01056263, "balance_loss_clip": 1.01887941, "balance_loss_mlp": 1.02858031, "epoch": 0.31955508793025705, "flos": 21759198462720.0, "grad_norm": 1.6991661193942498, "language_loss": 0.80710447, "learning_rate": 3.1830881845372747e-06, "loss": 0.82845676, "num_input_tokens_seen": 114159290, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.50390625, "step": 5315, "time_per_iteration": 2.4090631008148193 }, { "auxiliary_loss_clip": 0.01080732, "auxiliary_loss_mlp": 0.01062122, "balance_loss_clip": 1.02566755, "balance_loss_mlp": 1.02867055, "epoch": 0.319615211182925, "flos": 17163406442880.0, "grad_norm": 1.8470864472336228, "language_loss": 0.69077098, "learning_rate": 3.18277414980567e-06, "loss": 0.71219951, "num_input_tokens_seen": 114177655, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.51953125, "step": 5316, "time_per_iteration": 2.412233829498291 }, { "auxiliary_loss_clip": 0.01082549, "auxiliary_loss_mlp": 0.01054574, "balance_loss_clip": 1.0205754, "balance_loss_mlp": 1.03147411, "epoch": 0.319675334435593, "flos": 28111505495040.0, "grad_norm": 1.4085707627975654, "language_loss": 0.69867092, "learning_rate": 3.1824600702231515e-06, "loss": 0.72004217, "num_input_tokens_seen": 114200880, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.51171875, "step": 5317, "time_per_iteration": 2.493027925491333 }, { "auxiliary_loss_clip": 0.010249, "auxiliary_loss_mlp": 0.01011737, "balance_loss_clip": 1.00756443, "balance_loss_mlp": 1.01457906, "epoch": 0.31973545768826095, "flos": 69497266798080.0, "grad_norm": 0.7565392190354854, "language_loss": 0.53227985, "learning_rate": 3.182145945801628e-06, "loss": 0.55264616, "num_input_tokens_seen": 114267145, "router_z_loss_clip": 0.04174805, "router_z_loss_mlp": 0.10302734, "step": 5318, "time_per_iteration": 4.599034309387207 }, { "auxiliary_loss_clip": 0.01076058, "auxiliary_loss_mlp": 0.0105753, "balance_loss_clip": 1.02424705, "balance_loss_mlp": 1.02731836, "epoch": 0.3197955809409289, "flos": 13698346492800.0, "grad_norm": 1.8533392514338791, "language_loss": 0.855883, "learning_rate": 3.181831776553012e-06, "loss": 0.87721884, "num_input_tokens_seen": 114284630, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.48828125, "step": 5319, "time_per_iteration": 3.892874240875244 }, { "auxiliary_loss_clip": 0.0107319, "auxiliary_loss_mlp": 0.0105393, "balance_loss_clip": 1.01814365, "balance_loss_mlp": 1.02460241, "epoch": 0.3198557041935969, "flos": 33216750207360.0, "grad_norm": 1.7171385411795295, "language_loss": 0.65361011, "learning_rate": 3.1815175624892165e-06, "loss": 0.67488134, "num_input_tokens_seen": 114305830, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.484375, "step": 5320, "time_per_iteration": 2.5267019271850586 }, { "auxiliary_loss_clip": 0.01081953, "auxiliary_loss_mlp": 0.01055164, "balance_loss_clip": 1.01954436, "balance_loss_mlp": 1.02816904, "epoch": 0.31991582744626484, "flos": 23730918295680.0, "grad_norm": 1.779136174045927, "language_loss": 0.7171756, "learning_rate": 3.1812033036221567e-06, "loss": 0.73854673, "num_input_tokens_seen": 114325165, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.5390625, "step": 5321, "time_per_iteration": 3.8277320861816406 }, { "auxiliary_loss_clip": 0.01082474, "auxiliary_loss_mlp": 0.01066653, "balance_loss_clip": 1.02664661, "balance_loss_mlp": 1.02797019, "epoch": 0.3199759506989328, "flos": 18549877288320.0, "grad_norm": 2.7141396308160814, "language_loss": 0.87779176, "learning_rate": 3.180888999963749e-06, "loss": 0.89928299, "num_input_tokens_seen": 114341310, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 0.546875, "step": 5322, "time_per_iteration": 2.396885633468628 }, { "auxiliary_loss_clip": 0.01072919, "auxiliary_loss_mlp": 0.01055887, "balance_loss_clip": 1.02198362, "balance_loss_mlp": 1.02375412, "epoch": 0.3200360739516008, "flos": 22417799950080.0, "grad_norm": 1.6916859156196709, "language_loss": 0.84275699, "learning_rate": 3.1805746515259123e-06, "loss": 0.86404508, "num_input_tokens_seen": 114360355, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.4921875, "step": 5323, "time_per_iteration": 2.4066548347473145 }, { "auxiliary_loss_clip": 0.01073192, "auxiliary_loss_mlp": 0.01058507, "balance_loss_clip": 1.02150416, "balance_loss_mlp": 1.02442443, "epoch": 0.32009619720426874, "flos": 20594181571200.0, "grad_norm": 1.8289421235018428, "language_loss": 0.80105436, "learning_rate": 3.1802602583205663e-06, "loss": 0.82237136, "num_input_tokens_seen": 114379220, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.48632812, "step": 5324, "time_per_iteration": 2.4336817264556885 }, { "auxiliary_loss_clip": 0.01073205, "auxiliary_loss_mlp": 0.01052644, "balance_loss_clip": 1.01862192, "balance_loss_mlp": 1.02450752, "epoch": 0.3201563204569367, "flos": 18146735285760.0, "grad_norm": 1.782018379905865, "language_loss": 0.81830835, "learning_rate": 3.1799458203596333e-06, "loss": 0.83956683, "num_input_tokens_seen": 114396365, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.48828125, "step": 5325, "time_per_iteration": 2.377291440963745 }, { "auxiliary_loss_clip": 0.01074552, "auxiliary_loss_mlp": 0.01049948, "balance_loss_clip": 1.01521063, "balance_loss_mlp": 1.02382767, "epoch": 0.32021644370960467, "flos": 31682945957760.0, "grad_norm": 3.1994053538647735, "language_loss": 0.76575285, "learning_rate": 3.179631337655037e-06, "loss": 0.78699791, "num_input_tokens_seen": 114416780, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.5078125, "step": 5326, "time_per_iteration": 2.522315263748169 }, { "auxiliary_loss_clip": 0.01072262, "auxiliary_loss_mlp": 0.01054188, "balance_loss_clip": 1.02009392, "balance_loss_mlp": 1.02498507, "epoch": 0.32027656696227264, "flos": 26864827200000.0, "grad_norm": 1.5652294964671287, "language_loss": 0.82204431, "learning_rate": 3.179316810218701e-06, "loss": 0.84330887, "num_input_tokens_seen": 114437405, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.47265625, "step": 5327, "time_per_iteration": 2.429962396621704 }, { "auxiliary_loss_clip": 0.01077018, "auxiliary_loss_mlp": 0.01053216, "balance_loss_clip": 1.01669025, "balance_loss_mlp": 1.02465582, "epoch": 0.32033669021494066, "flos": 24168798967680.0, "grad_norm": 1.4228225780300756, "language_loss": 0.7867617, "learning_rate": 3.179002238062554e-06, "loss": 0.80806398, "num_input_tokens_seen": 114458505, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.5234375, "step": 5328, "time_per_iteration": 2.440349817276001 }, { "auxiliary_loss_clip": 0.01076572, "auxiliary_loss_mlp": 0.01058128, "balance_loss_clip": 1.02033877, "balance_loss_mlp": 1.0261457, "epoch": 0.3203968134676086, "flos": 24459660437760.0, "grad_norm": 1.5720753985115923, "language_loss": 0.75034404, "learning_rate": 3.178687621198524e-06, "loss": 0.77169102, "num_input_tokens_seen": 114479050, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.50390625, "step": 5329, "time_per_iteration": 2.441746711730957 }, { "auxiliary_loss_clip": 0.01070568, "auxiliary_loss_mlp": 0.01045197, "balance_loss_clip": 1.01401162, "balance_loss_mlp": 1.02313757, "epoch": 0.3204569367202766, "flos": 18003730890240.0, "grad_norm": 1.5193149345362091, "language_loss": 0.72675335, "learning_rate": 3.1783729596385415e-06, "loss": 0.74791098, "num_input_tokens_seen": 114497415, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.47460938, "step": 5330, "time_per_iteration": 2.4335949420928955 }, { "auxiliary_loss_clip": 0.01078654, "auxiliary_loss_mlp": 0.0106059, "balance_loss_clip": 1.02144158, "balance_loss_mlp": 1.02545941, "epoch": 0.32051705997294455, "flos": 30588418834560.0, "grad_norm": 1.900630617744238, "language_loss": 0.80979061, "learning_rate": 3.1780582533945376e-06, "loss": 0.83118308, "num_input_tokens_seen": 114518785, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.53125, "step": 5331, "time_per_iteration": 2.4651646614074707 }, { "auxiliary_loss_clip": 0.01019584, "auxiliary_loss_mlp": 0.01012094, "balance_loss_clip": 1.00801671, "balance_loss_mlp": 1.00893307, "epoch": 0.3205771832256125, "flos": 68414855783040.0, "grad_norm": 0.843834935414847, "language_loss": 0.57832181, "learning_rate": 3.177743502478447e-06, "loss": 0.59863853, "num_input_tokens_seen": 114577710, "router_z_loss_clip": 0.04077148, "router_z_loss_mlp": 0.10644531, "step": 5332, "time_per_iteration": 2.996425151824951 }, { "auxiliary_loss_clip": 0.01078934, "auxiliary_loss_mlp": 0.01051075, "balance_loss_clip": 1.01512146, "balance_loss_mlp": 1.02671742, "epoch": 0.3206373064782805, "flos": 30442691352960.0, "grad_norm": 1.715237850432866, "language_loss": 0.74779356, "learning_rate": 3.177428706902205e-06, "loss": 0.76909363, "num_input_tokens_seen": 114598640, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.5234375, "step": 5333, "time_per_iteration": 2.494990587234497 }, { "auxiliary_loss_clip": 0.01075803, "auxiliary_loss_mlp": 0.01068103, "balance_loss_clip": 1.02962279, "balance_loss_mlp": 1.02496874, "epoch": 0.32069742973094845, "flos": 22053411423360.0, "grad_norm": 1.688153371058265, "language_loss": 0.72589695, "learning_rate": 3.1771138666777485e-06, "loss": 0.74733603, "num_input_tokens_seen": 114618780, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.5078125, "step": 5334, "time_per_iteration": 2.4250235557556152 }, { "auxiliary_loss_clip": 0.0107688, "auxiliary_loss_mlp": 0.01056193, "balance_loss_clip": 1.0218842, "balance_loss_mlp": 1.02628374, "epoch": 0.3207575529836164, "flos": 22052922664320.0, "grad_norm": 1.8455297319702417, "language_loss": 0.78437495, "learning_rate": 3.1767989818170156e-06, "loss": 0.80570567, "num_input_tokens_seen": 114637525, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.5078125, "step": 5335, "time_per_iteration": 2.3910887241363525 }, { "auxiliary_loss_clip": 0.01077406, "auxiliary_loss_mlp": 0.010546, "balance_loss_clip": 1.0199101, "balance_loss_mlp": 1.02692723, "epoch": 0.3208176762362844, "flos": 34056132048000.0, "grad_norm": 1.5272764861102277, "language_loss": 0.69813651, "learning_rate": 3.1764840523319477e-06, "loss": 0.71945655, "num_input_tokens_seen": 114659705, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.50390625, "step": 5336, "time_per_iteration": 2.5134389400482178 }, { "auxiliary_loss_clip": 0.01074094, "auxiliary_loss_mlp": 0.01055696, "balance_loss_clip": 1.0216496, "balance_loss_mlp": 1.02470577, "epoch": 0.32087779948895234, "flos": 21797637736320.0, "grad_norm": 2.239572123714747, "language_loss": 0.805601, "learning_rate": 3.176169078234487e-06, "loss": 0.82689893, "num_input_tokens_seen": 114678340, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.49414062, "step": 5337, "time_per_iteration": 2.391011953353882 }, { "auxiliary_loss_clip": 0.01072211, "auxiliary_loss_mlp": 0.01051786, "balance_loss_clip": 1.02019596, "balance_loss_mlp": 1.02443957, "epoch": 0.3209379227416203, "flos": 21433039741440.0, "grad_norm": 1.8597491806404594, "language_loss": 0.75949812, "learning_rate": 3.1758540595365766e-06, "loss": 0.78073806, "num_input_tokens_seen": 114696980, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.47851562, "step": 5338, "time_per_iteration": 2.44710636138916 }, { "auxiliary_loss_clip": 0.01077209, "auxiliary_loss_mlp": 0.01054789, "balance_loss_clip": 1.016904, "balance_loss_mlp": 1.02492702, "epoch": 0.3209980459942883, "flos": 25847876851200.0, "grad_norm": 2.566792869620281, "language_loss": 0.63884574, "learning_rate": 3.1755389962501626e-06, "loss": 0.66016567, "num_input_tokens_seen": 114717330, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5234375, "step": 5339, "time_per_iteration": 2.4235384464263916 }, { "auxiliary_loss_clip": 0.01075391, "auxiliary_loss_mlp": 0.01052442, "balance_loss_clip": 1.0163213, "balance_loss_mlp": 1.02535319, "epoch": 0.32105816924695624, "flos": 19098153279360.0, "grad_norm": 2.559830153304163, "language_loss": 0.83516055, "learning_rate": 3.175223888387192e-06, "loss": 0.85643888, "num_input_tokens_seen": 114736320, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.5, "step": 5340, "time_per_iteration": 2.4333345890045166 }, { "auxiliary_loss_clip": 0.01074834, "auxiliary_loss_mlp": 0.01054456, "balance_loss_clip": 1.01919365, "balance_loss_mlp": 1.02579057, "epoch": 0.3211182924996242, "flos": 16580915452800.0, "grad_norm": 1.994193063387936, "language_loss": 0.77932751, "learning_rate": 3.1749087359596137e-06, "loss": 0.80062044, "num_input_tokens_seen": 114754575, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.48828125, "step": 5341, "time_per_iteration": 2.3701252937316895 }, { "auxiliary_loss_clip": 0.010741, "auxiliary_loss_mlp": 0.01061562, "balance_loss_clip": 1.0271821, "balance_loss_mlp": 1.02481294, "epoch": 0.3211784157522922, "flos": 22671164753280.0, "grad_norm": 1.6289575995991281, "language_loss": 0.80186397, "learning_rate": 3.1745935389793786e-06, "loss": 0.82322061, "num_input_tokens_seen": 114773590, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.4921875, "step": 5342, "time_per_iteration": 2.454230546951294 }, { "auxiliary_loss_clip": 0.01076247, "auxiliary_loss_mlp": 0.0106076, "balance_loss_clip": 1.02390075, "balance_loss_mlp": 1.02565682, "epoch": 0.3212385390049602, "flos": 20557732245120.0, "grad_norm": 3.717515285155883, "language_loss": 0.76654184, "learning_rate": 3.174278297458438e-06, "loss": 0.78791189, "num_input_tokens_seen": 114790775, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.50390625, "step": 5343, "time_per_iteration": 2.388232946395874 }, { "auxiliary_loss_clip": 0.01071533, "auxiliary_loss_mlp": 0.01058018, "balance_loss_clip": 1.02330446, "balance_loss_mlp": 1.02356303, "epoch": 0.32129866225762815, "flos": 24789973610880.0, "grad_norm": 1.5918031018582057, "language_loss": 0.83301401, "learning_rate": 3.173963011408748e-06, "loss": 0.85430956, "num_input_tokens_seen": 114809835, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.48046875, "step": 5344, "time_per_iteration": 2.503647804260254 }, { "auxiliary_loss_clip": 0.01074161, "auxiliary_loss_mlp": 0.01058665, "balance_loss_clip": 1.02347469, "balance_loss_mlp": 1.02274394, "epoch": 0.3213587855102961, "flos": 18365954912640.0, "grad_norm": 2.0070524252350515, "language_loss": 0.80985272, "learning_rate": 3.173647680842262e-06, "loss": 0.83118105, "num_input_tokens_seen": 114826505, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.515625, "step": 5345, "time_per_iteration": 2.3665239810943604 }, { "auxiliary_loss_clip": 0.01072934, "auxiliary_loss_mlp": 0.010601, "balance_loss_clip": 1.02481461, "balance_loss_mlp": 1.02339399, "epoch": 0.3214189087629641, "flos": 27014778956160.0, "grad_norm": 2.3492583308115353, "language_loss": 0.85454512, "learning_rate": 3.1733323057709384e-06, "loss": 0.87587547, "num_input_tokens_seen": 114846140, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.49609375, "step": 5346, "time_per_iteration": 2.495651960372925 }, { "auxiliary_loss_clip": 0.01072741, "auxiliary_loss_mlp": 0.01065584, "balance_loss_clip": 1.02872396, "balance_loss_mlp": 1.02257276, "epoch": 0.32147903201563205, "flos": 23147170496640.0, "grad_norm": 1.3979298526122703, "language_loss": 0.82226199, "learning_rate": 3.1730168862067366e-06, "loss": 0.84364522, "num_input_tokens_seen": 114866660, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.5, "step": 5347, "time_per_iteration": 2.4102063179016113 }, { "auxiliary_loss_clip": 0.0107293, "auxiliary_loss_mlp": 0.01057598, "balance_loss_clip": 1.02092886, "balance_loss_mlp": 1.02309024, "epoch": 0.3215391552683, "flos": 16579833200640.0, "grad_norm": 1.8850878616764366, "language_loss": 0.81717306, "learning_rate": 3.1727014221616164e-06, "loss": 0.83847833, "num_input_tokens_seen": 114882820, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.49804688, "step": 5348, "time_per_iteration": 2.401184320449829 }, { "auxiliary_loss_clip": 0.01072115, "auxiliary_loss_mlp": 0.01065006, "balance_loss_clip": 1.02960038, "balance_loss_mlp": 1.02290881, "epoch": 0.321599278520968, "flos": 17820855855360.0, "grad_norm": 2.229003139877603, "language_loss": 0.87149775, "learning_rate": 3.172385913647542e-06, "loss": 0.892869, "num_input_tokens_seen": 114900745, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.4921875, "step": 5349, "time_per_iteration": 2.3969485759735107 }, { "auxiliary_loss_clip": 0.01071559, "auxiliary_loss_mlp": 0.01057313, "balance_loss_clip": 1.02388728, "balance_loss_mlp": 1.02273154, "epoch": 0.32165940177363594, "flos": 16250881570560.0, "grad_norm": 4.677403323415619, "language_loss": 0.82135165, "learning_rate": 3.172070360676475e-06, "loss": 0.8426404, "num_input_tokens_seen": 114917940, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.48828125, "step": 5350, "time_per_iteration": 2.393501043319702 }, { "auxiliary_loss_clip": 0.01071605, "auxiliary_loss_mlp": 0.01055711, "balance_loss_clip": 1.02280879, "balance_loss_mlp": 1.0227195, "epoch": 0.3217195250263039, "flos": 27598666400640.0, "grad_norm": 1.6716829715246035, "language_loss": 0.80686098, "learning_rate": 3.1717547632603828e-06, "loss": 0.82813418, "num_input_tokens_seen": 114937735, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.48828125, "step": 5351, "time_per_iteration": 2.451160192489624 }, { "auxiliary_loss_clip": 0.01073855, "auxiliary_loss_mlp": 0.01055752, "balance_loss_clip": 1.02168179, "balance_loss_mlp": 1.02471137, "epoch": 0.3217796482789719, "flos": 21469523978880.0, "grad_norm": 2.1282186637394886, "language_loss": 0.77355957, "learning_rate": 3.1714391214112326e-06, "loss": 0.79485571, "num_input_tokens_seen": 114956630, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.49023438, "step": 5352, "time_per_iteration": 2.4305784702301025 }, { "auxiliary_loss_clip": 0.01072119, "auxiliary_loss_mlp": 0.01054641, "balance_loss_clip": 1.01875889, "balance_loss_mlp": 1.02321172, "epoch": 0.32183977153163984, "flos": 21214518341760.0, "grad_norm": 3.9397898159138607, "language_loss": 0.83299512, "learning_rate": 3.1711234351409933e-06, "loss": 0.85426277, "num_input_tokens_seen": 114976470, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.48828125, "step": 5353, "time_per_iteration": 2.4186646938323975 }, { "auxiliary_loss_clip": 0.01071784, "auxiliary_loss_mlp": 0.01053087, "balance_loss_clip": 1.01863575, "balance_loss_mlp": 1.02418971, "epoch": 0.3218998947843078, "flos": 24607028753280.0, "grad_norm": 1.927550456289071, "language_loss": 0.73878002, "learning_rate": 3.1708077044616365e-06, "loss": 0.76002866, "num_input_tokens_seen": 114996710, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.47460938, "step": 5354, "time_per_iteration": 3.8893332481384277 }, { "auxiliary_loss_clip": 0.01072125, "auxiliary_loss_mlp": 0.01050282, "balance_loss_clip": 1.01530588, "balance_loss_mlp": 1.02234936, "epoch": 0.3219600180369758, "flos": 22269558850560.0, "grad_norm": 1.5962943843952742, "language_loss": 0.84732413, "learning_rate": 3.1704919293851334e-06, "loss": 0.86854827, "num_input_tokens_seen": 115015775, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.49804688, "step": 5355, "time_per_iteration": 2.400207042694092 }, { "auxiliary_loss_clip": 0.01077387, "auxiliary_loss_mlp": 0.01051133, "balance_loss_clip": 1.01811171, "balance_loss_mlp": 1.02704418, "epoch": 0.3220201412896438, "flos": 14938251984000.0, "grad_norm": 2.1566029435096934, "language_loss": 0.72698617, "learning_rate": 3.1701761099234597e-06, "loss": 0.74827135, "num_input_tokens_seen": 115034265, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.50390625, "step": 5356, "time_per_iteration": 2.4160313606262207 }, { "auxiliary_loss_clip": 0.01080242, "auxiliary_loss_mlp": 0.01063441, "balance_loss_clip": 1.02417386, "balance_loss_mlp": 1.02656484, "epoch": 0.32208026454231176, "flos": 22666486631040.0, "grad_norm": 4.915512416054033, "language_loss": 0.70382702, "learning_rate": 3.1698602460885903e-06, "loss": 0.72526383, "num_input_tokens_seen": 115051945, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.5390625, "step": 5357, "time_per_iteration": 2.4091241359710693 }, { "auxiliary_loss_clip": 0.01036248, "auxiliary_loss_mlp": 0.01004391, "balance_loss_clip": 1.00029004, "balance_loss_mlp": 1.02505684, "epoch": 0.3221403877949797, "flos": 64601606177280.0, "grad_norm": 0.7045714542121865, "language_loss": 0.58356035, "learning_rate": 3.1695443378925035e-06, "loss": 0.60396677, "num_input_tokens_seen": 115119090, "router_z_loss_clip": 0.04101562, "router_z_loss_mlp": 0.11181641, "step": 5358, "time_per_iteration": 4.601735591888428 }, { "auxiliary_loss_clip": 0.01072745, "auxiliary_loss_mlp": 0.01055479, "balance_loss_clip": 1.0212425, "balance_loss_mlp": 1.02334261, "epoch": 0.3222005110476477, "flos": 20155986696960.0, "grad_norm": 1.6028304190900455, "language_loss": 0.84817374, "learning_rate": 3.1692283853471777e-06, "loss": 0.86945599, "num_input_tokens_seen": 115137755, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.49414062, "step": 5359, "time_per_iteration": 3.886730670928955 }, { "auxiliary_loss_clip": 0.01073645, "auxiliary_loss_mlp": 0.01052166, "balance_loss_clip": 1.0180006, "balance_loss_mlp": 1.0244112, "epoch": 0.32226063430031565, "flos": 22673084878080.0, "grad_norm": 1.6316932453079498, "language_loss": 0.81251717, "learning_rate": 3.168912388464595e-06, "loss": 0.83377528, "num_input_tokens_seen": 115158150, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.4921875, "step": 5360, "time_per_iteration": 2.39911150932312 }, { "auxiliary_loss_clip": 0.0103236, "auxiliary_loss_mlp": 0.01005384, "balance_loss_clip": 1.00114059, "balance_loss_mlp": 1.02097952, "epoch": 0.3223207575529836, "flos": 63825312896640.0, "grad_norm": 0.6620936757752361, "language_loss": 0.57169688, "learning_rate": 3.168596347256737e-06, "loss": 0.59207439, "num_input_tokens_seen": 115212755, "router_z_loss_clip": 0.04248047, "router_z_loss_mlp": 0.11376953, "step": 5361, "time_per_iteration": 4.293464660644531 }, { "auxiliary_loss_clip": 0.01072167, "auxiliary_loss_mlp": 0.010551, "balance_loss_clip": 1.02153063, "balance_loss_mlp": 1.02327645, "epoch": 0.3223808808056516, "flos": 26868911829120.0, "grad_norm": 2.084438072676227, "language_loss": 0.73501831, "learning_rate": 3.168280261735588e-06, "loss": 0.75629097, "num_input_tokens_seen": 115233090, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.49023438, "step": 5362, "time_per_iteration": 2.4395878314971924 }, { "auxiliary_loss_clip": 0.01074714, "auxiliary_loss_mlp": 0.01059426, "balance_loss_clip": 1.02585626, "balance_loss_mlp": 1.02528656, "epoch": 0.32244100405831955, "flos": 26760122432640.0, "grad_norm": 1.879965940489933, "language_loss": 0.74037546, "learning_rate": 3.167964131913135e-06, "loss": 0.76171684, "num_input_tokens_seen": 115252645, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.49609375, "step": 5363, "time_per_iteration": 2.4255177974700928 }, { "auxiliary_loss_clip": 0.01075305, "auxiliary_loss_mlp": 0.0106571, "balance_loss_clip": 1.02777767, "balance_loss_mlp": 1.02240944, "epoch": 0.3225011273109875, "flos": 23801966645760.0, "grad_norm": 2.3652508863327313, "language_loss": 0.77210021, "learning_rate": 3.167647957801365e-06, "loss": 0.79351032, "num_input_tokens_seen": 115269085, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.52734375, "step": 5364, "time_per_iteration": 2.438962697982788 }, { "auxiliary_loss_clip": 0.01071199, "auxiliary_loss_mlp": 0.01058465, "balance_loss_clip": 1.02217793, "balance_loss_mlp": 1.02212012, "epoch": 0.3225612505636555, "flos": 17273557382400.0, "grad_norm": 2.201892041572041, "language_loss": 0.78386867, "learning_rate": 3.1673317394122672e-06, "loss": 0.80516529, "num_input_tokens_seen": 115286470, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.4921875, "step": 5365, "time_per_iteration": 2.359048843383789 }, { "auxiliary_loss_clip": 0.01078948, "auxiliary_loss_mlp": 0.01061201, "balance_loss_clip": 1.02465129, "balance_loss_mlp": 1.02752852, "epoch": 0.32262137381632344, "flos": 23365168225920.0, "grad_norm": 1.6457702571563129, "language_loss": 0.77858675, "learning_rate": 3.1670154767578333e-06, "loss": 0.79998815, "num_input_tokens_seen": 115307000, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.515625, "step": 5366, "time_per_iteration": 2.454446315765381 }, { "auxiliary_loss_clip": 0.01073496, "auxiliary_loss_mlp": 0.01063451, "balance_loss_clip": 1.02711606, "balance_loss_mlp": 1.02327287, "epoch": 0.3226814970689914, "flos": 23257670549760.0, "grad_norm": 1.7877354926387343, "language_loss": 0.73417664, "learning_rate": 3.166699169850055e-06, "loss": 0.75554609, "num_input_tokens_seen": 115325925, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.50390625, "step": 5367, "time_per_iteration": 2.4168241024017334 }, { "auxiliary_loss_clip": 0.01073335, "auxiliary_loss_mlp": 0.01052624, "balance_loss_clip": 1.01975846, "balance_loss_mlp": 1.02487493, "epoch": 0.32274162032165943, "flos": 16394374725120.0, "grad_norm": 3.372138627812276, "language_loss": 0.75685072, "learning_rate": 3.1663828187009274e-06, "loss": 0.77811027, "num_input_tokens_seen": 115343705, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.484375, "step": 5368, "time_per_iteration": 2.4387261867523193 }, { "auxiliary_loss_clip": 0.01072251, "auxiliary_loss_mlp": 0.01048916, "balance_loss_clip": 1.01594222, "balance_loss_mlp": 1.02392721, "epoch": 0.3228017435743274, "flos": 27853846594560.0, "grad_norm": 1.6716565853287766, "language_loss": 0.79722649, "learning_rate": 3.1660664233224467e-06, "loss": 0.81843817, "num_input_tokens_seen": 115364170, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.484375, "step": 5369, "time_per_iteration": 2.4463508129119873 }, { "auxiliary_loss_clip": 0.01073142, "auxiliary_loss_mlp": 0.01048363, "balance_loss_clip": 1.01691556, "balance_loss_mlp": 1.02617073, "epoch": 0.32286186682699536, "flos": 19607780528640.0, "grad_norm": 4.083048216431409, "language_loss": 0.84502208, "learning_rate": 3.16574998372661e-06, "loss": 0.86623704, "num_input_tokens_seen": 115382495, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.47070312, "step": 5370, "time_per_iteration": 2.4403798580169678 }, { "auxiliary_loss_clip": 0.01075417, "auxiliary_loss_mlp": 0.01052671, "balance_loss_clip": 1.01903081, "balance_loss_mlp": 1.02634525, "epoch": 0.3229219900796633, "flos": 24132873312000.0, "grad_norm": 1.8899369378546755, "language_loss": 0.8403371, "learning_rate": 3.1654334999254177e-06, "loss": 0.86161804, "num_input_tokens_seen": 115399450, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.4921875, "step": 5371, "time_per_iteration": 2.412097692489624 }, { "auxiliary_loss_clip": 0.01079241, "auxiliary_loss_mlp": 0.01060882, "balance_loss_clip": 1.02027893, "balance_loss_mlp": 1.0259856, "epoch": 0.3229821133323313, "flos": 17747747735040.0, "grad_norm": 2.1036766974130265, "language_loss": 0.89894164, "learning_rate": 3.1651169719308695e-06, "loss": 0.92034292, "num_input_tokens_seen": 115417700, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 0.53125, "step": 5372, "time_per_iteration": 2.431300401687622 }, { "auxiliary_loss_clip": 0.01078004, "auxiliary_loss_mlp": 0.01057795, "balance_loss_clip": 1.0222466, "balance_loss_mlp": 1.0275476, "epoch": 0.32304223658499925, "flos": 22344936209280.0, "grad_norm": 1.9634578772324833, "language_loss": 0.74283206, "learning_rate": 3.1648003997549694e-06, "loss": 0.76419002, "num_input_tokens_seen": 115435840, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.50390625, "step": 5373, "time_per_iteration": 2.4360761642456055 }, { "auxiliary_loss_clip": 0.01077397, "auxiliary_loss_mlp": 0.0105274, "balance_loss_clip": 1.01869369, "balance_loss_mlp": 1.02881074, "epoch": 0.3231023598376672, "flos": 18477327749760.0, "grad_norm": 3.045696752843061, "language_loss": 0.817644, "learning_rate": 3.1644837834097214e-06, "loss": 0.83894533, "num_input_tokens_seen": 115454210, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.484375, "step": 5374, "time_per_iteration": 2.4189534187316895 }, { "auxiliary_loss_clip": 0.01073474, "auxiliary_loss_mlp": 0.01050202, "balance_loss_clip": 1.01553595, "balance_loss_mlp": 1.02510417, "epoch": 0.3231624830903352, "flos": 27635080815360.0, "grad_norm": 2.8782843399253637, "language_loss": 0.89801115, "learning_rate": 3.1641671229071317e-06, "loss": 0.91924787, "num_input_tokens_seen": 115471785, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.484375, "step": 5375, "time_per_iteration": 2.479678153991699 }, { "auxiliary_loss_clip": 0.01079471, "auxiliary_loss_mlp": 0.01060245, "balance_loss_clip": 1.02154946, "balance_loss_mlp": 1.02694654, "epoch": 0.32322260634300315, "flos": 21725332577280.0, "grad_norm": 1.8891747951166262, "language_loss": 0.76981825, "learning_rate": 3.1638504182592076e-06, "loss": 0.79121542, "num_input_tokens_seen": 115491405, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 0.52734375, "step": 5376, "time_per_iteration": 2.459602117538452 }, { "auxiliary_loss_clip": 0.01077125, "auxiliary_loss_mlp": 0.01046612, "balance_loss_clip": 1.01373422, "balance_loss_mlp": 1.02900934, "epoch": 0.3232827295956711, "flos": 22636565729280.0, "grad_norm": 4.506894607255021, "language_loss": 0.68424475, "learning_rate": 3.1635336694779594e-06, "loss": 0.70548213, "num_input_tokens_seen": 115511555, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.48242188, "step": 5377, "time_per_iteration": 2.4227726459503174 }, { "auxiliary_loss_clip": 0.01077219, "auxiliary_loss_mlp": 0.01059996, "balance_loss_clip": 1.02087188, "balance_loss_mlp": 1.02735305, "epoch": 0.3233428528483391, "flos": 26321403888000.0, "grad_norm": 1.373386005949879, "language_loss": 0.73503339, "learning_rate": 3.1632168765753982e-06, "loss": 0.75640547, "num_input_tokens_seen": 115532860, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.49804688, "step": 5378, "time_per_iteration": 2.4730067253112793 }, { "auxiliary_loss_clip": 0.01074396, "auxiliary_loss_mlp": 0.01057093, "balance_loss_clip": 1.02237976, "balance_loss_mlp": 1.02504814, "epoch": 0.32340297610100704, "flos": 28583950279680.0, "grad_norm": 2.0684930625456146, "language_loss": 0.84002566, "learning_rate": 3.1629000395635357e-06, "loss": 0.86134058, "num_input_tokens_seen": 115553850, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.49414062, "step": 5379, "time_per_iteration": 2.4681522846221924 }, { "auxiliary_loss_clip": 0.01079831, "auxiliary_loss_mlp": 0.01052698, "balance_loss_clip": 1.01691151, "balance_loss_mlp": 1.02760744, "epoch": 0.323463099353675, "flos": 30772480855680.0, "grad_norm": 1.790506556291751, "language_loss": 0.79808342, "learning_rate": 3.162583158454388e-06, "loss": 0.81940871, "num_input_tokens_seen": 115575530, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.5234375, "step": 5380, "time_per_iteration": 2.543022871017456 }, { "auxiliary_loss_clip": 0.01080687, "auxiliary_loss_mlp": 0.01054087, "balance_loss_clip": 1.0196594, "balance_loss_mlp": 1.02890849, "epoch": 0.32352322260634303, "flos": 25227435346560.0, "grad_norm": 1.7152187089911302, "language_loss": 0.78077674, "learning_rate": 3.1622662332599697e-06, "loss": 0.8021245, "num_input_tokens_seen": 115594885, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.51953125, "step": 5381, "time_per_iteration": 2.453763008117676 }, { "auxiliary_loss_clip": 0.01072321, "auxiliary_loss_mlp": 0.01048754, "balance_loss_clip": 1.01736593, "balance_loss_mlp": 1.02430904, "epoch": 0.323583345859011, "flos": 23329382215680.0, "grad_norm": 1.7683284405171216, "language_loss": 0.72971821, "learning_rate": 3.1619492639922998e-06, "loss": 0.750929, "num_input_tokens_seen": 115614080, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.48046875, "step": 5382, "time_per_iteration": 2.4316177368164062 }, { "auxiliary_loss_clip": 0.01076468, "auxiliary_loss_mlp": 0.01062689, "balance_loss_clip": 1.02690232, "balance_loss_mlp": 1.02522779, "epoch": 0.32364346911167896, "flos": 26206470092160.0, "grad_norm": 2.1683943437674134, "language_loss": 0.72957492, "learning_rate": 3.1616322506633964e-06, "loss": 0.75096643, "num_input_tokens_seen": 115632820, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.51171875, "step": 5383, "time_per_iteration": 2.479782819747925 }, { "auxiliary_loss_clip": 0.01068754, "auxiliary_loss_mlp": 0.01047115, "balance_loss_clip": 1.01634693, "balance_loss_mlp": 1.02261925, "epoch": 0.3237035923643469, "flos": 23694643526400.0, "grad_norm": 2.106078595233845, "language_loss": 0.79978096, "learning_rate": 3.161315193285283e-06, "loss": 0.82093966, "num_input_tokens_seen": 115652860, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.4609375, "step": 5384, "time_per_iteration": 2.449404001235962 }, { "auxiliary_loss_clip": 0.01074341, "auxiliary_loss_mlp": 0.01051799, "balance_loss_clip": 1.01732337, "balance_loss_mlp": 1.02310228, "epoch": 0.3237637156170149, "flos": 14427856684800.0, "grad_norm": 2.129877652288771, "language_loss": 0.76424098, "learning_rate": 3.16099809186998e-06, "loss": 0.78550237, "num_input_tokens_seen": 115670940, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.51171875, "step": 5385, "time_per_iteration": 2.3916494846343994 }, { "auxiliary_loss_clip": 0.01070995, "auxiliary_loss_mlp": 0.01055094, "balance_loss_clip": 1.02057099, "balance_loss_mlp": 1.02263343, "epoch": 0.32382383886968286, "flos": 31061736403200.0, "grad_norm": 1.9378952731384336, "language_loss": 0.72863191, "learning_rate": 3.1606809464295145e-06, "loss": 0.74989283, "num_input_tokens_seen": 115691155, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.484375, "step": 5386, "time_per_iteration": 2.5048959255218506 }, { "auxiliary_loss_clip": 0.01073094, "auxiliary_loss_mlp": 0.01055162, "balance_loss_clip": 1.0172776, "balance_loss_mlp": 1.02225494, "epoch": 0.3238839621223508, "flos": 23255855159040.0, "grad_norm": 1.8226092625650352, "language_loss": 0.94995379, "learning_rate": 3.1603637569759095e-06, "loss": 0.97123635, "num_input_tokens_seen": 115710340, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5078125, "step": 5387, "time_per_iteration": 2.4220590591430664 }, { "auxiliary_loss_clip": 0.01073552, "auxiliary_loss_mlp": 0.01061621, "balance_loss_clip": 1.02559638, "balance_loss_mlp": 1.02306688, "epoch": 0.3239440853750188, "flos": 22963597234560.0, "grad_norm": 3.962550969210986, "language_loss": 0.79342973, "learning_rate": 3.1600465235211956e-06, "loss": 0.81478155, "num_input_tokens_seen": 115726745, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.50390625, "step": 5388, "time_per_iteration": 2.419628620147705 }, { "auxiliary_loss_clip": 0.01071318, "auxiliary_loss_mlp": 0.0105979, "balance_loss_clip": 1.02359843, "balance_loss_mlp": 1.02134371, "epoch": 0.32400420862768675, "flos": 36245151383040.0, "grad_norm": 72.60882506206835, "language_loss": 0.73355424, "learning_rate": 3.1597292460774006e-06, "loss": 0.75486535, "num_input_tokens_seen": 115749385, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.5, "step": 5389, "time_per_iteration": 2.5340301990509033 }, { "auxiliary_loss_clip": 0.01070569, "auxiliary_loss_mlp": 0.01055029, "balance_loss_clip": 1.02267563, "balance_loss_mlp": 1.0223875, "epoch": 0.3240643318803547, "flos": 21615426017280.0, "grad_norm": 1.8492626619526433, "language_loss": 0.82169491, "learning_rate": 3.159411924656557e-06, "loss": 0.84295082, "num_input_tokens_seen": 115768105, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.48242188, "step": 5390, "time_per_iteration": 2.430562734603882 }, { "auxiliary_loss_clip": 0.01071528, "auxiliary_loss_mlp": 0.01062718, "balance_loss_clip": 1.02726507, "balance_loss_mlp": 1.02308106, "epoch": 0.3241244551330227, "flos": 23294468989440.0, "grad_norm": 1.8671097769638614, "language_loss": 0.74012989, "learning_rate": 3.1590945592706967e-06, "loss": 0.76147234, "num_input_tokens_seen": 115787340, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.484375, "step": 5391, "time_per_iteration": 2.4453272819519043 }, { "auxiliary_loss_clip": 0.01067724, "auxiliary_loss_mlp": 0.01054354, "balance_loss_clip": 1.02028441, "balance_loss_mlp": 1.02156544, "epoch": 0.32418457838569065, "flos": 14096461259520.0, "grad_norm": 1.5912701377115213, "language_loss": 0.78533107, "learning_rate": 3.158777149931855e-06, "loss": 0.80655181, "num_input_tokens_seen": 115805565, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.4609375, "step": 5392, "time_per_iteration": 2.412327766418457 }, { "auxiliary_loss_clip": 0.01073912, "auxiliary_loss_mlp": 0.01059773, "balance_loss_clip": 1.02410555, "balance_loss_mlp": 1.02343357, "epoch": 0.3242447016383586, "flos": 29751376055040.0, "grad_norm": 2.1384618425371698, "language_loss": 0.6484791, "learning_rate": 3.158459696652067e-06, "loss": 0.6698159, "num_input_tokens_seen": 115826725, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.50390625, "step": 5393, "time_per_iteration": 2.530757427215576 }, { "auxiliary_loss_clip": 0.01071863, "auxiliary_loss_mlp": 0.01049909, "balance_loss_clip": 1.01669729, "balance_loss_mlp": 1.02263808, "epoch": 0.3243048248910266, "flos": 24350102991360.0, "grad_norm": 1.6529343115322592, "language_loss": 0.84259325, "learning_rate": 3.158142199443371e-06, "loss": 0.86381096, "num_input_tokens_seen": 115846955, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.4921875, "step": 5394, "time_per_iteration": 3.8516106605529785 }, { "auxiliary_loss_clip": 0.01067986, "auxiliary_loss_mlp": 0.01056083, "balance_loss_clip": 1.0280931, "balance_loss_mlp": 1.02290463, "epoch": 0.3243649481436946, "flos": 24351883470720.0, "grad_norm": 2.295931697609266, "language_loss": 0.83406454, "learning_rate": 3.1578246583178076e-06, "loss": 0.85530519, "num_input_tokens_seen": 115865975, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.45117188, "step": 5395, "time_per_iteration": 2.4713172912597656 }, { "auxiliary_loss_clip": 0.01069434, "auxiliary_loss_mlp": 0.01048728, "balance_loss_clip": 1.01813889, "balance_loss_mlp": 1.02368665, "epoch": 0.32442507139636256, "flos": 22924250265600.0, "grad_norm": 1.7416627620095333, "language_loss": 0.84676731, "learning_rate": 3.157507073287417e-06, "loss": 0.86794889, "num_input_tokens_seen": 115884950, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.45703125, "step": 5396, "time_per_iteration": 2.478872776031494 }, { "auxiliary_loss_clip": 0.0107559, "auxiliary_loss_mlp": 0.01059133, "balance_loss_clip": 1.02227378, "balance_loss_mlp": 1.02404928, "epoch": 0.32448519464903053, "flos": 22199103993600.0, "grad_norm": 2.2675136963688263, "language_loss": 0.78957427, "learning_rate": 3.1571894443642414e-06, "loss": 0.81092155, "num_input_tokens_seen": 115904170, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.515625, "step": 5397, "time_per_iteration": 3.873203754425049 }, { "auxiliary_loss_clip": 0.01070743, "auxiliary_loss_mlp": 0.01047483, "balance_loss_clip": 1.01585662, "balance_loss_mlp": 1.02331638, "epoch": 0.3245453179016985, "flos": 18837596736000.0, "grad_norm": 2.3500274704199926, "language_loss": 0.68839014, "learning_rate": 3.1568717715603263e-06, "loss": 0.70957243, "num_input_tokens_seen": 115919255, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.47460938, "step": 5398, "time_per_iteration": 2.4005684852600098 }, { "auxiliary_loss_clip": 0.01073245, "auxiliary_loss_mlp": 0.01047448, "balance_loss_clip": 1.01375985, "balance_loss_mlp": 1.02432895, "epoch": 0.32460544115436646, "flos": 21177335877120.0, "grad_norm": 1.6917340587054654, "language_loss": 0.74200213, "learning_rate": 3.156554054887718e-06, "loss": 0.76320904, "num_input_tokens_seen": 115938535, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.49023438, "step": 5399, "time_per_iteration": 3.9258251190185547 }, { "auxiliary_loss_clip": 0.01071067, "auxiliary_loss_mlp": 0.01054956, "balance_loss_clip": 1.02048039, "balance_loss_mlp": 1.02225947, "epoch": 0.3246655644070344, "flos": 21980058923520.0, "grad_norm": 2.270077222946056, "language_loss": 0.72705936, "learning_rate": 3.1562362943584645e-06, "loss": 0.74831963, "num_input_tokens_seen": 115955005, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.48632812, "step": 5400, "time_per_iteration": 2.416074752807617 }, { "auxiliary_loss_clip": 0.010748, "auxiliary_loss_mlp": 0.01051292, "balance_loss_clip": 1.01798475, "balance_loss_mlp": 1.024369, "epoch": 0.3247256876597024, "flos": 32158393119360.0, "grad_norm": 1.962007132366792, "language_loss": 0.81225431, "learning_rate": 3.155918489984614e-06, "loss": 0.83351517, "num_input_tokens_seen": 115975305, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.50390625, "step": 5401, "time_per_iteration": 3.919761896133423 }, { "auxiliary_loss_clip": 0.0107304, "auxiliary_loss_mlp": 0.01052009, "balance_loss_clip": 1.01693738, "balance_loss_mlp": 1.0236727, "epoch": 0.32478581091237035, "flos": 20996450789760.0, "grad_norm": 1.406501091278154, "language_loss": 0.88214248, "learning_rate": 3.1556006417782196e-06, "loss": 0.90339297, "num_input_tokens_seen": 115994810, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.49414062, "step": 5402, "time_per_iteration": 2.4712460041046143 }, { "auxiliary_loss_clip": 0.01069806, "auxiliary_loss_mlp": 0.01051812, "balance_loss_clip": 1.01874375, "balance_loss_mlp": 1.02264214, "epoch": 0.3248459341650383, "flos": 17924199079680.0, "grad_norm": 2.192572470956526, "language_loss": 0.86398447, "learning_rate": 3.155282749751332e-06, "loss": 0.88520062, "num_input_tokens_seen": 116011095, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.47265625, "step": 5403, "time_per_iteration": 2.4334774017333984 }, { "auxiliary_loss_clip": 0.01070808, "auxiliary_loss_mlp": 0.01050221, "balance_loss_clip": 1.02051449, "balance_loss_mlp": 1.0245024, "epoch": 0.3249060574177063, "flos": 24534444303360.0, "grad_norm": 2.101241750218129, "language_loss": 0.88642085, "learning_rate": 3.154964813916007e-06, "loss": 0.90763116, "num_input_tokens_seen": 116028805, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.46289062, "step": 5404, "time_per_iteration": 2.501919984817505 }, { "auxiliary_loss_clip": 0.01073035, "auxiliary_loss_mlp": 0.01044185, "balance_loss_clip": 1.01326203, "balance_loss_mlp": 1.02480054, "epoch": 0.32496618067037425, "flos": 25993569421440.0, "grad_norm": 2.0640148729944383, "language_loss": 0.7448808, "learning_rate": 3.1546468342843008e-06, "loss": 0.76605296, "num_input_tokens_seen": 116047765, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.48242188, "step": 5405, "time_per_iteration": 2.473304033279419 }, { "auxiliary_loss_clip": 0.0106976, "auxiliary_loss_mlp": 0.01051096, "balance_loss_clip": 1.02079272, "balance_loss_mlp": 1.02291512, "epoch": 0.3250263039230422, "flos": 19572727656960.0, "grad_norm": 1.6726731866535738, "language_loss": 0.84487426, "learning_rate": 3.1543288108682707e-06, "loss": 0.86608285, "num_input_tokens_seen": 116068385, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.46875, "step": 5406, "time_per_iteration": 2.4587175846099854 }, { "auxiliary_loss_clip": 0.01070448, "auxiliary_loss_mlp": 0.01045853, "balance_loss_clip": 1.0148232, "balance_loss_mlp": 1.02317178, "epoch": 0.3250864271757102, "flos": 16762708235520.0, "grad_norm": 2.2467438769757644, "language_loss": 0.8891651, "learning_rate": 3.1540107436799764e-06, "loss": 0.91032809, "num_input_tokens_seen": 116085350, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.47265625, "step": 5407, "time_per_iteration": 2.420661449432373 }, { "auxiliary_loss_clip": 0.0107079, "auxiliary_loss_mlp": 0.01051912, "balance_loss_clip": 1.01862836, "balance_loss_mlp": 1.02192295, "epoch": 0.3251465504283782, "flos": 27818200229760.0, "grad_norm": 2.4797960334983826, "language_loss": 0.70449662, "learning_rate": 3.153692632731479e-06, "loss": 0.72572362, "num_input_tokens_seen": 116107560, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.48828125, "step": 5408, "time_per_iteration": 2.4985427856445312 }, { "auxiliary_loss_clip": 0.01077546, "auxiliary_loss_mlp": 0.01053325, "balance_loss_clip": 1.01782477, "balance_loss_mlp": 1.02375257, "epoch": 0.32520667368104617, "flos": 19062122889600.0, "grad_norm": 1.894853243482386, "language_loss": 0.79191172, "learning_rate": 3.153374478034841e-06, "loss": 0.81322038, "num_input_tokens_seen": 116125980, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.5390625, "step": 5409, "time_per_iteration": 2.427690267562866 }, { "auxiliary_loss_clip": 0.01073307, "auxiliary_loss_mlp": 0.010628, "balance_loss_clip": 1.02703726, "balance_loss_mlp": 1.02191305, "epoch": 0.32526679693371413, "flos": 29381017685760.0, "grad_norm": 1.9855431623151774, "language_loss": 0.84235168, "learning_rate": 3.1530562796021285e-06, "loss": 0.86371279, "num_input_tokens_seen": 116146530, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.515625, "step": 5410, "time_per_iteration": 2.4572863578796387 }, { "auxiliary_loss_clip": 0.01068278, "auxiliary_loss_mlp": 0.01048409, "balance_loss_clip": 1.01801062, "balance_loss_mlp": 1.02198017, "epoch": 0.3253269201863821, "flos": 20703459726720.0, "grad_norm": 2.1213510556811475, "language_loss": 0.72358406, "learning_rate": 3.152738037445405e-06, "loss": 0.74475092, "num_input_tokens_seen": 116165695, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.46289062, "step": 5411, "time_per_iteration": 2.4290783405303955 }, { "auxiliary_loss_clip": 0.01072015, "auxiliary_loss_mlp": 0.01054861, "balance_loss_clip": 1.02124381, "balance_loss_mlp": 1.02283847, "epoch": 0.32538704343905006, "flos": 29092914213120.0, "grad_norm": 1.4700535619837132, "language_loss": 0.83272803, "learning_rate": 3.1524197515767403e-06, "loss": 0.85399675, "num_input_tokens_seen": 116185375, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.4921875, "step": 5412, "time_per_iteration": 2.4644923210144043 }, { "auxiliary_loss_clip": 0.01072034, "auxiliary_loss_mlp": 0.01053339, "balance_loss_clip": 1.0165031, "balance_loss_mlp": 1.02170682, "epoch": 0.325447166691718, "flos": 24675109637760.0, "grad_norm": 2.813748543754715, "language_loss": 0.821527, "learning_rate": 3.152101422008203e-06, "loss": 0.84278071, "num_input_tokens_seen": 116204335, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.50390625, "step": 5413, "time_per_iteration": 2.43977952003479 }, { "auxiliary_loss_clip": 0.01070623, "auxiliary_loss_mlp": 0.01057489, "balance_loss_clip": 1.02382469, "balance_loss_mlp": 1.02209306, "epoch": 0.325507289944386, "flos": 21542073517440.0, "grad_norm": 9.813382738388531, "language_loss": 0.776443, "learning_rate": 3.151783048751864e-06, "loss": 0.79772413, "num_input_tokens_seen": 116222840, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.484375, "step": 5414, "time_per_iteration": 2.4251210689544678 }, { "auxiliary_loss_clip": 0.01021482, "auxiliary_loss_mlp": 0.01016883, "balance_loss_clip": 1.0133549, "balance_loss_mlp": 1.00996482, "epoch": 0.32556741319705396, "flos": 71515527206400.0, "grad_norm": 0.916110389372844, "language_loss": 0.64105648, "learning_rate": 3.1514646318197965e-06, "loss": 0.66144013, "num_input_tokens_seen": 116274940, "router_z_loss_clip": 0.03540039, "router_z_loss_mlp": 0.11523438, "step": 5415, "time_per_iteration": 2.980988025665283 }, { "auxiliary_loss_clip": 0.01071123, "auxiliary_loss_mlp": 0.01051984, "balance_loss_clip": 1.01672196, "balance_loss_mlp": 1.02178097, "epoch": 0.3256275364497219, "flos": 23731302320640.0, "grad_norm": 1.4956860872341247, "language_loss": 0.75759935, "learning_rate": 3.151146171224075e-06, "loss": 0.77883041, "num_input_tokens_seen": 116297300, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.49414062, "step": 5416, "time_per_iteration": 2.5002965927124023 }, { "auxiliary_loss_clip": 0.0101889, "auxiliary_loss_mlp": 0.01005746, "balance_loss_clip": 1.00216997, "balance_loss_mlp": 1.00739551, "epoch": 0.3256876597023899, "flos": 67286043838080.0, "grad_norm": 0.7839268279134989, "language_loss": 0.58087015, "learning_rate": 3.1508276669767757e-06, "loss": 0.60111654, "num_input_tokens_seen": 116362370, "router_z_loss_clip": 0.03564453, "router_z_loss_mlp": 0.11523438, "step": 5417, "time_per_iteration": 3.1322898864746094 }, { "auxiliary_loss_clip": 0.01017011, "auxiliary_loss_mlp": 0.01002974, "balance_loss_clip": 0.99939764, "balance_loss_mlp": 1.00593174, "epoch": 0.32574778295505785, "flos": 71278605653760.0, "grad_norm": 0.8264227216248546, "language_loss": 0.63551438, "learning_rate": 3.150509119089975e-06, "loss": 0.65571415, "num_input_tokens_seen": 116430365, "router_z_loss_clip": 0.03564453, "router_z_loss_mlp": 0.11083984, "step": 5418, "time_per_iteration": 3.182905912399292 }, { "auxiliary_loss_clip": 0.01069817, "auxiliary_loss_mlp": 0.0105398, "balance_loss_clip": 1.01969528, "balance_loss_mlp": 1.02194524, "epoch": 0.3258079062077258, "flos": 20775345949440.0, "grad_norm": 1.907013141048205, "language_loss": 0.70861197, "learning_rate": 3.1501905275757537e-06, "loss": 0.72984993, "num_input_tokens_seen": 116447525, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.47851562, "step": 5419, "time_per_iteration": 2.3993401527404785 }, { "auxiliary_loss_clip": 0.010733, "auxiliary_loss_mlp": 0.01061461, "balance_loss_clip": 1.02469683, "balance_loss_mlp": 1.02365243, "epoch": 0.3258680294603938, "flos": 22234401244800.0, "grad_norm": 1.841016469714559, "language_loss": 0.78120553, "learning_rate": 3.1498718924461926e-06, "loss": 0.80255312, "num_input_tokens_seen": 116466310, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.49804688, "step": 5420, "time_per_iteration": 2.4388620853424072 }, { "auxiliary_loss_clip": 0.01074154, "auxiliary_loss_mlp": 0.01063399, "balance_loss_clip": 1.02887583, "balance_loss_mlp": 1.02342415, "epoch": 0.3259281527130618, "flos": 26978748566400.0, "grad_norm": 1.6217717993817948, "language_loss": 0.81671405, "learning_rate": 3.1495532137133736e-06, "loss": 0.83808964, "num_input_tokens_seen": 116487825, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.5078125, "step": 5421, "time_per_iteration": 2.442211151123047 }, { "auxiliary_loss_clip": 0.01070446, "auxiliary_loss_mlp": 0.01062748, "balance_loss_clip": 1.03153872, "balance_loss_mlp": 1.02371478, "epoch": 0.32598827596572977, "flos": 26213033427840.0, "grad_norm": 1.6773035130411653, "language_loss": 0.76736867, "learning_rate": 3.149234491389381e-06, "loss": 0.78870058, "num_input_tokens_seen": 116509950, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.46875, "step": 5422, "time_per_iteration": 2.4773449897766113 }, { "auxiliary_loss_clip": 0.01076307, "auxiliary_loss_mlp": 0.0105687, "balance_loss_clip": 1.02141666, "balance_loss_mlp": 1.02581787, "epoch": 0.32604839921839773, "flos": 17638783781760.0, "grad_norm": 1.9322188569213734, "language_loss": 0.65143359, "learning_rate": 3.1489157254863026e-06, "loss": 0.67276537, "num_input_tokens_seen": 116527695, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.50390625, "step": 5423, "time_per_iteration": 2.373544692993164 }, { "auxiliary_loss_clip": 0.01069683, "auxiliary_loss_mlp": 0.01049099, "balance_loss_clip": 1.01796138, "balance_loss_mlp": 1.02442181, "epoch": 0.3261085224710657, "flos": 23621605228800.0, "grad_norm": 2.986424433040947, "language_loss": 0.75760174, "learning_rate": 3.148596916016224e-06, "loss": 0.77878964, "num_input_tokens_seen": 116547800, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.453125, "step": 5424, "time_per_iteration": 2.4547507762908936 }, { "auxiliary_loss_clip": 0.0107287, "auxiliary_loss_mlp": 0.01052865, "balance_loss_clip": 1.02077365, "balance_loss_mlp": 1.02569342, "epoch": 0.32616864572373366, "flos": 23259276472320.0, "grad_norm": 1.7356835849425494, "language_loss": 0.77858114, "learning_rate": 3.1482780629912355e-06, "loss": 0.79983842, "num_input_tokens_seen": 116568460, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.47265625, "step": 5425, "time_per_iteration": 2.4368221759796143 }, { "auxiliary_loss_clip": 0.01078023, "auxiliary_loss_mlp": 0.01055006, "balance_loss_clip": 1.01833773, "balance_loss_mlp": 1.0260911, "epoch": 0.32622876897640163, "flos": 25592242809600.0, "grad_norm": 2.372130252363461, "language_loss": 0.8035115, "learning_rate": 3.147959166423428e-06, "loss": 0.8248418, "num_input_tokens_seen": 116588705, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.51953125, "step": 5426, "time_per_iteration": 2.475670099258423 }, { "auxiliary_loss_clip": 0.01075611, "auxiliary_loss_mlp": 0.0105152, "balance_loss_clip": 1.01699674, "balance_loss_mlp": 1.02714205, "epoch": 0.3262888922290696, "flos": 22417904684160.0, "grad_norm": 1.7686895509048688, "language_loss": 0.75634062, "learning_rate": 3.147640226324893e-06, "loss": 0.77761197, "num_input_tokens_seen": 116608845, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.484375, "step": 5427, "time_per_iteration": 2.418316602706909 }, { "auxiliary_loss_clip": 0.01076223, "auxiliary_loss_mlp": 0.0105951, "balance_loss_clip": 1.02379465, "balance_loss_mlp": 1.02585506, "epoch": 0.32634901548173756, "flos": 19717896556800.0, "grad_norm": 1.6525174805102785, "language_loss": 0.8073107, "learning_rate": 3.1473212427077266e-06, "loss": 0.82866812, "num_input_tokens_seen": 116628145, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.50390625, "step": 5428, "time_per_iteration": 2.4138104915618896 }, { "auxiliary_loss_clip": 0.01076023, "auxiliary_loss_mlp": 0.01050097, "balance_loss_clip": 1.01783955, "balance_loss_mlp": 1.02744412, "epoch": 0.3264091387344055, "flos": 16142022351360.0, "grad_norm": 1.7184462075443965, "language_loss": 0.72449541, "learning_rate": 3.147002215584023e-06, "loss": 0.74575663, "num_input_tokens_seen": 116646920, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.48632812, "step": 5429, "time_per_iteration": 2.375328540802002 }, { "auxiliary_loss_clip": 0.01073759, "auxiliary_loss_mlp": 0.01051213, "balance_loss_clip": 1.01928902, "balance_loss_mlp": 1.02685153, "epoch": 0.3264692619870735, "flos": 16398145152000.0, "grad_norm": 1.623737806390898, "language_loss": 0.79884219, "learning_rate": 3.146683144965881e-06, "loss": 0.82009184, "num_input_tokens_seen": 116665100, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.46875, "step": 5430, "time_per_iteration": 2.4042210578918457 }, { "auxiliary_loss_clip": 0.01078615, "auxiliary_loss_mlp": 0.01058397, "balance_loss_clip": 1.02001154, "balance_loss_mlp": 1.02876234, "epoch": 0.32652938523974145, "flos": 22381245889920.0, "grad_norm": 2.0523064420053783, "language_loss": 0.85554427, "learning_rate": 3.146364030865399e-06, "loss": 0.87691438, "num_input_tokens_seen": 116682205, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.49804688, "step": 5431, "time_per_iteration": 2.425642967224121 }, { "auxiliary_loss_clip": 0.01072381, "auxiliary_loss_mlp": 0.01053857, "balance_loss_clip": 1.02252889, "balance_loss_mlp": 1.0256691, "epoch": 0.3265895084924094, "flos": 21906985714560.0, "grad_norm": 2.244245676502082, "language_loss": 0.7185365, "learning_rate": 3.146044873294678e-06, "loss": 0.7397989, "num_input_tokens_seen": 116702575, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.46679688, "step": 5432, "time_per_iteration": 2.4290618896484375 }, { "auxiliary_loss_clip": 0.01071912, "auxiliary_loss_mlp": 0.01050072, "balance_loss_clip": 1.0183624, "balance_loss_mlp": 1.02413869, "epoch": 0.3266496317450774, "flos": 16066330790400.0, "grad_norm": 1.5161923125829169, "language_loss": 0.85051274, "learning_rate": 3.1457256722658203e-06, "loss": 0.87173259, "num_input_tokens_seen": 116720885, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.4765625, "step": 5433, "time_per_iteration": 2.3777236938476562 }, { "auxiliary_loss_clip": 0.01071675, "auxiliary_loss_mlp": 0.01051797, "balance_loss_clip": 1.01959825, "balance_loss_mlp": 1.02553105, "epoch": 0.3267097549977454, "flos": 22527147928320.0, "grad_norm": 1.4175278873650927, "language_loss": 0.86593187, "learning_rate": 3.145406427790931e-06, "loss": 0.88716656, "num_input_tokens_seen": 116740395, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.4609375, "step": 5434, "time_per_iteration": 3.8467822074890137 }, { "auxiliary_loss_clip": 0.01073816, "auxiliary_loss_mlp": 0.01057656, "balance_loss_clip": 1.02320504, "balance_loss_mlp": 1.02446198, "epoch": 0.32676987825041337, "flos": 27269226011520.0, "grad_norm": 1.9315651965383005, "language_loss": 0.88976389, "learning_rate": 3.1450871398821147e-06, "loss": 0.91107857, "num_input_tokens_seen": 116758870, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.49414062, "step": 5435, "time_per_iteration": 2.4756805896759033 }, { "auxiliary_loss_clip": 0.01070153, "auxiliary_loss_mlp": 0.01060163, "balance_loss_clip": 1.0254972, "balance_loss_mlp": 1.02280569, "epoch": 0.32683000150308134, "flos": 11507511767040.0, "grad_norm": 2.8922459361520643, "language_loss": 0.77456701, "learning_rate": 3.144767808551479e-06, "loss": 0.79587007, "num_input_tokens_seen": 116773440, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.47265625, "step": 5436, "time_per_iteration": 3.7797019481658936 }, { "auxiliary_loss_clip": 0.01068792, "auxiliary_loss_mlp": 0.0106173, "balance_loss_clip": 1.0299015, "balance_loss_mlp": 1.02175474, "epoch": 0.3268901247557493, "flos": 25629006337920.0, "grad_norm": 1.5883987611686912, "language_loss": 0.73507679, "learning_rate": 3.144448433811134e-06, "loss": 0.75638199, "num_input_tokens_seen": 116794375, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.47070312, "step": 5437, "time_per_iteration": 2.489107370376587 }, { "auxiliary_loss_clip": 0.0107268, "auxiliary_loss_mlp": 0.01071883, "balance_loss_clip": 1.03519022, "balance_loss_mlp": 1.02306485, "epoch": 0.32695024800841727, "flos": 24859765152000.0, "grad_norm": 1.6515809280844398, "language_loss": 0.65437382, "learning_rate": 3.144129015673189e-06, "loss": 0.6758194, "num_input_tokens_seen": 116815095, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.49609375, "step": 5438, "time_per_iteration": 2.4375667572021484 }, { "auxiliary_loss_clip": 0.01069178, "auxiliary_loss_mlp": 0.01059226, "balance_loss_clip": 1.02491784, "balance_loss_mlp": 1.02211332, "epoch": 0.32701037126108523, "flos": 28838013310080.0, "grad_norm": 1.5956351011431187, "language_loss": 0.75156707, "learning_rate": 3.1438095541497576e-06, "loss": 0.77285105, "num_input_tokens_seen": 116836630, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.47070312, "step": 5439, "time_per_iteration": 3.8575401306152344 }, { "auxiliary_loss_clip": 0.01071806, "auxiliary_loss_mlp": 0.01061373, "balance_loss_clip": 1.0258013, "balance_loss_mlp": 1.02383471, "epoch": 0.3270704945137532, "flos": 27963822977280.0, "grad_norm": 1.9100087715132943, "language_loss": 0.75615025, "learning_rate": 3.1434900492529527e-06, "loss": 0.77748209, "num_input_tokens_seen": 116856880, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.48046875, "step": 5440, "time_per_iteration": 3.9729113578796387 }, { "auxiliary_loss_clip": 0.0106876, "auxiliary_loss_mlp": 0.01068905, "balance_loss_clip": 1.03638506, "balance_loss_mlp": 1.02208066, "epoch": 0.32713061776642116, "flos": 23689721024640.0, "grad_norm": 2.5122424468144366, "language_loss": 0.8578831, "learning_rate": 3.1431705009948914e-06, "loss": 0.87925971, "num_input_tokens_seen": 116873770, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.46679688, "step": 5441, "time_per_iteration": 2.4053778648376465 }, { "auxiliary_loss_clip": 0.01071364, "auxiliary_loss_mlp": 0.01058184, "balance_loss_clip": 1.0233748, "balance_loss_mlp": 1.0225265, "epoch": 0.3271907410190891, "flos": 22454528567040.0, "grad_norm": 3.656126749402145, "language_loss": 0.87226444, "learning_rate": 3.1428509093876897e-06, "loss": 0.89355993, "num_input_tokens_seen": 116891225, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.48828125, "step": 5442, "time_per_iteration": 2.4195220470428467 }, { "auxiliary_loss_clip": 0.01073343, "auxiliary_loss_mlp": 0.01058192, "balance_loss_clip": 1.02159512, "balance_loss_mlp": 1.02376413, "epoch": 0.3272508642717571, "flos": 22819021827840.0, "grad_norm": 3.5725335227434942, "language_loss": 0.78860748, "learning_rate": 3.1425312744434668e-06, "loss": 0.80992281, "num_input_tokens_seen": 116912300, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.49609375, "step": 5443, "time_per_iteration": 2.4414238929748535 }, { "auxiliary_loss_clip": 0.01072325, "auxiliary_loss_mlp": 0.01063636, "balance_loss_clip": 1.0291127, "balance_loss_mlp": 1.02279401, "epoch": 0.32731098752442506, "flos": 11800572652800.0, "grad_norm": 2.2289621349534, "language_loss": 0.82967389, "learning_rate": 3.142211596174343e-06, "loss": 0.85103345, "num_input_tokens_seen": 116929425, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.49609375, "step": 5444, "time_per_iteration": 2.428816318511963 }, { "auxiliary_loss_clip": 0.01072905, "auxiliary_loss_mlp": 0.01051386, "balance_loss_clip": 1.01857936, "balance_loss_mlp": 1.02367544, "epoch": 0.327371110777093, "flos": 21026860450560.0, "grad_norm": 2.2900670856512995, "language_loss": 0.60307884, "learning_rate": 3.1418918745924423e-06, "loss": 0.6243217, "num_input_tokens_seen": 116948255, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.4921875, "step": 5445, "time_per_iteration": 2.39406418800354 }, { "auxiliary_loss_clip": 0.01073795, "auxiliary_loss_mlp": 0.01050318, "balance_loss_clip": 1.01617622, "balance_loss_mlp": 1.02543032, "epoch": 0.327431234029761, "flos": 19061110460160.0, "grad_norm": 3.5649947630195182, "language_loss": 0.89480209, "learning_rate": 3.1415721097098865e-06, "loss": 0.91604316, "num_input_tokens_seen": 116964905, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.484375, "step": 5446, "time_per_iteration": 2.4272255897521973 }, { "auxiliary_loss_clip": 0.01080223, "auxiliary_loss_mlp": 0.01056565, "balance_loss_clip": 1.01815546, "balance_loss_mlp": 1.02795064, "epoch": 0.32749135728242895, "flos": 25848016496640.0, "grad_norm": 1.7806944635142687, "language_loss": 0.81356847, "learning_rate": 3.141252301538802e-06, "loss": 0.83493638, "num_input_tokens_seen": 116983650, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.5234375, "step": 5447, "time_per_iteration": 2.4461405277252197 }, { "auxiliary_loss_clip": 0.01075336, "auxiliary_loss_mlp": 0.01042455, "balance_loss_clip": 1.01200867, "balance_loss_mlp": 1.02683651, "epoch": 0.327551480535097, "flos": 20119502459520.0, "grad_norm": 3.0478611482951345, "language_loss": 0.74475169, "learning_rate": 3.1409324500913157e-06, "loss": 0.76592964, "num_input_tokens_seen": 117003265, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.484375, "step": 5448, "time_per_iteration": 2.473785161972046 }, { "auxiliary_loss_clip": 0.01075064, "auxiliary_loss_mlp": 0.01052506, "balance_loss_clip": 1.02005744, "balance_loss_mlp": 1.02754307, "epoch": 0.32761160378776494, "flos": 28802297122560.0, "grad_norm": 1.6342473213826885, "language_loss": 0.68416876, "learning_rate": 3.1406125553795567e-06, "loss": 0.70544446, "num_input_tokens_seen": 117025370, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.47460938, "step": 5449, "time_per_iteration": 2.471534013748169 }, { "auxiliary_loss_clip": 0.01073158, "auxiliary_loss_mlp": 0.01053384, "balance_loss_clip": 1.02153158, "balance_loss_mlp": 1.02693212, "epoch": 0.3276717270404329, "flos": 26936713422720.0, "grad_norm": 2.064709985085134, "language_loss": 0.66588527, "learning_rate": 3.1402926174156556e-06, "loss": 0.68715072, "num_input_tokens_seen": 117044350, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.46289062, "step": 5450, "time_per_iteration": 2.5470423698425293 }, { "auxiliary_loss_clip": 0.01077097, "auxiliary_loss_mlp": 0.0105631, "balance_loss_clip": 1.02104807, "balance_loss_mlp": 1.02733135, "epoch": 0.32773185029310087, "flos": 25337237172480.0, "grad_norm": 7.533722740191079, "language_loss": 0.79008245, "learning_rate": 3.1399726362117437e-06, "loss": 0.81141651, "num_input_tokens_seen": 117064450, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.49609375, "step": 5451, "time_per_iteration": 2.465240716934204 }, { "auxiliary_loss_clip": 0.0107707, "auxiliary_loss_mlp": 0.01051516, "balance_loss_clip": 1.01873326, "balance_loss_mlp": 1.02829516, "epoch": 0.32779197354576883, "flos": 26390636847360.0, "grad_norm": 2.2762253566263864, "language_loss": 0.71898574, "learning_rate": 3.1396526117799555e-06, "loss": 0.74027169, "num_input_tokens_seen": 117083060, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.48828125, "step": 5452, "time_per_iteration": 2.500575065612793 }, { "auxiliary_loss_clip": 0.01073571, "auxiliary_loss_mlp": 0.01054084, "balance_loss_clip": 1.02080107, "balance_loss_mlp": 1.02752995, "epoch": 0.3278520967984368, "flos": 24898239336960.0, "grad_norm": 2.021175622655834, "language_loss": 0.80025762, "learning_rate": 3.1393325441324256e-06, "loss": 0.82153416, "num_input_tokens_seen": 117101860, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.4609375, "step": 5453, "time_per_iteration": 2.443418264389038 }, { "auxiliary_loss_clip": 0.01076042, "auxiliary_loss_mlp": 0.01050225, "balance_loss_clip": 1.01937389, "balance_loss_mlp": 1.02849853, "epoch": 0.32791222005110476, "flos": 29751690257280.0, "grad_norm": 2.2253679777236686, "language_loss": 0.77778327, "learning_rate": 3.1390124332812916e-06, "loss": 0.79904604, "num_input_tokens_seen": 117123100, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4765625, "step": 5454, "time_per_iteration": 2.506333827972412 }, { "auxiliary_loss_clip": 0.01072915, "auxiliary_loss_mlp": 0.01047548, "balance_loss_clip": 1.01915264, "balance_loss_mlp": 1.0275079, "epoch": 0.32797234330377273, "flos": 16507144016640.0, "grad_norm": 1.9272359003889732, "language_loss": 0.78227305, "learning_rate": 3.1386922792386924e-06, "loss": 0.80347764, "num_input_tokens_seen": 117140515, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.453125, "step": 5455, "time_per_iteration": 2.385925054550171 }, { "auxiliary_loss_clip": 0.01074256, "auxiliary_loss_mlp": 0.01058226, "balance_loss_clip": 1.02328563, "balance_loss_mlp": 1.02472138, "epoch": 0.3280324665564407, "flos": 26576723727360.0, "grad_norm": 1.7615411674805983, "language_loss": 0.75151205, "learning_rate": 3.138372082016768e-06, "loss": 0.77283686, "num_input_tokens_seen": 117161485, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.49609375, "step": 5456, "time_per_iteration": 2.4691927433013916 }, { "auxiliary_loss_clip": 0.01071246, "auxiliary_loss_mlp": 0.01056299, "balance_loss_clip": 1.02327764, "balance_loss_mlp": 1.02376473, "epoch": 0.32809258980910866, "flos": 22928858565120.0, "grad_norm": 4.515423897120848, "language_loss": 0.79429221, "learning_rate": 3.1380518416276596e-06, "loss": 0.81556761, "num_input_tokens_seen": 117181870, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.47460938, "step": 5457, "time_per_iteration": 2.4406580924987793 }, { "auxiliary_loss_clip": 0.01074035, "auxiliary_loss_mlp": 0.01053582, "balance_loss_clip": 1.02120471, "balance_loss_mlp": 1.02468073, "epoch": 0.3281527130617766, "flos": 22782747058560.0, "grad_norm": 2.8050815148248485, "language_loss": 0.82064992, "learning_rate": 3.1377315580835115e-06, "loss": 0.8419261, "num_input_tokens_seen": 117201380, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.49414062, "step": 5458, "time_per_iteration": 2.4208099842071533 }, { "auxiliary_loss_clip": 0.01070146, "auxiliary_loss_mlp": 0.01049152, "balance_loss_clip": 1.01853871, "balance_loss_mlp": 1.02330208, "epoch": 0.3282128363144446, "flos": 21249641036160.0, "grad_norm": 1.5804921887537156, "language_loss": 0.74598271, "learning_rate": 3.1374112313964686e-06, "loss": 0.76717567, "num_input_tokens_seen": 117221040, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.46875, "step": 5459, "time_per_iteration": 2.453692674636841 }, { "auxiliary_loss_clip": 0.01072789, "auxiliary_loss_mlp": 0.01050548, "balance_loss_clip": 1.01734769, "balance_loss_mlp": 1.02361631, "epoch": 0.32827295956711255, "flos": 30841853460480.0, "grad_norm": 1.7971312118093028, "language_loss": 0.84910744, "learning_rate": 3.1370908615786783e-06, "loss": 0.87034082, "num_input_tokens_seen": 117241395, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.4921875, "step": 5460, "time_per_iteration": 2.4760990142822266 }, { "auxiliary_loss_clip": 0.01068278, "auxiliary_loss_mlp": 0.01051182, "balance_loss_clip": 1.01892376, "balance_loss_mlp": 1.02145934, "epoch": 0.3283330828197806, "flos": 25914002699520.0, "grad_norm": 2.2742410700853584, "language_loss": 0.78478205, "learning_rate": 3.136770448642288e-06, "loss": 0.80597669, "num_input_tokens_seen": 117259340, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.46875, "step": 5461, "time_per_iteration": 2.4587838649749756 }, { "auxiliary_loss_clip": 0.0106753, "auxiliary_loss_mlp": 0.01054307, "balance_loss_clip": 1.02095222, "balance_loss_mlp": 1.02127337, "epoch": 0.32839320607244854, "flos": 38580526604160.0, "grad_norm": 2.0386841497406, "language_loss": 0.63943118, "learning_rate": 3.1364499925994484e-06, "loss": 0.66064954, "num_input_tokens_seen": 117282375, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.46289062, "step": 5462, "time_per_iteration": 2.5685999393463135 }, { "auxiliary_loss_clip": 0.01068174, "auxiliary_loss_mlp": 0.01048701, "balance_loss_clip": 1.0180881, "balance_loss_mlp": 1.02236867, "epoch": 0.3284533293251165, "flos": 26649692202240.0, "grad_norm": 1.6519164436443026, "language_loss": 0.7902385, "learning_rate": 3.1361294934623115e-06, "loss": 0.81140721, "num_input_tokens_seen": 117303830, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.45703125, "step": 5463, "time_per_iteration": 2.5087454319000244 }, { "auxiliary_loss_clip": 0.01070634, "auxiliary_loss_mlp": 0.01058797, "balance_loss_clip": 1.02389216, "balance_loss_mlp": 1.02248919, "epoch": 0.32851345257778447, "flos": 15303268915200.0, "grad_norm": 2.422501592683041, "language_loss": 0.71335477, "learning_rate": 3.1358089512430303e-06, "loss": 0.73464906, "num_input_tokens_seen": 117320665, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.48242188, "step": 5464, "time_per_iteration": 2.3807013034820557 }, { "auxiliary_loss_clip": 0.01066969, "auxiliary_loss_mlp": 0.01048459, "balance_loss_clip": 1.01636767, "balance_loss_mlp": 1.02246165, "epoch": 0.32857357583045244, "flos": 23512606364160.0, "grad_norm": 1.815561213613867, "language_loss": 0.72772062, "learning_rate": 3.1354883659537594e-06, "loss": 0.7488749, "num_input_tokens_seen": 117339795, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.4453125, "step": 5465, "time_per_iteration": 2.488621711730957 }, { "auxiliary_loss_clip": 0.01069689, "auxiliary_loss_mlp": 0.01049612, "balance_loss_clip": 1.01851022, "balance_loss_mlp": 1.02268434, "epoch": 0.3286336990831204, "flos": 20994181551360.0, "grad_norm": 1.647091917263592, "language_loss": 0.83805096, "learning_rate": 3.1351677376066567e-06, "loss": 0.85924393, "num_input_tokens_seen": 117359525, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.47070312, "step": 5466, "time_per_iteration": 2.425475597381592 }, { "auxiliary_loss_clip": 0.01069323, "auxiliary_loss_mlp": 0.0104824, "balance_loss_clip": 1.01581478, "balance_loss_mlp": 1.02225649, "epoch": 0.32869382233578837, "flos": 23657705441280.0, "grad_norm": 1.9541633133664364, "language_loss": 0.80722165, "learning_rate": 3.134847066213879e-06, "loss": 0.82839721, "num_input_tokens_seen": 117380320, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.47070312, "step": 5467, "time_per_iteration": 2.4333090782165527 }, { "auxiliary_loss_clip": 0.01070179, "auxiliary_loss_mlp": 0.0104958, "balance_loss_clip": 1.01608181, "balance_loss_mlp": 1.02253199, "epoch": 0.32875394558845633, "flos": 25335386870400.0, "grad_norm": 1.6498360893539201, "language_loss": 0.75585294, "learning_rate": 3.134526351787587e-06, "loss": 0.7770505, "num_input_tokens_seen": 117400695, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.4765625, "step": 5468, "time_per_iteration": 2.44427752494812 }, { "auxiliary_loss_clip": 0.01076054, "auxiliary_loss_mlp": 0.01052415, "balance_loss_clip": 1.01743913, "balance_loss_mlp": 1.02614713, "epoch": 0.3288140688411243, "flos": 14902221594240.0, "grad_norm": 1.700049437406739, "language_loss": 0.80479324, "learning_rate": 3.134205594339942e-06, "loss": 0.82607794, "num_input_tokens_seen": 117418800, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.5, "step": 5469, "time_per_iteration": 2.425823926925659 }, { "auxiliary_loss_clip": 0.01069638, "auxiliary_loss_mlp": 0.01048316, "balance_loss_clip": 1.01553297, "balance_loss_mlp": 1.0220536, "epoch": 0.32887419209379226, "flos": 18550366047360.0, "grad_norm": 1.7361436100796737, "language_loss": 0.82999158, "learning_rate": 3.133884793883107e-06, "loss": 0.85117114, "num_input_tokens_seen": 117438220, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.4765625, "step": 5470, "time_per_iteration": 2.3766744136810303 }, { "auxiliary_loss_clip": 0.01070519, "auxiliary_loss_mlp": 0.01048725, "balance_loss_clip": 1.01469111, "balance_loss_mlp": 1.02240157, "epoch": 0.3289343153464602, "flos": 48103785360000.0, "grad_norm": 1.979029946281867, "language_loss": 0.69589424, "learning_rate": 3.1335639504292478e-06, "loss": 0.71708667, "num_input_tokens_seen": 117462560, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.48046875, "step": 5471, "time_per_iteration": 2.6472537517547607 }, { "auxiliary_loss_clip": 0.01073236, "auxiliary_loss_mlp": 0.01060639, "balance_loss_clip": 1.02592516, "balance_loss_mlp": 1.02372217, "epoch": 0.3289944385991282, "flos": 27599050425600.0, "grad_norm": 2.090357218063405, "language_loss": 0.66654038, "learning_rate": 3.1332430639905288e-06, "loss": 0.6878792, "num_input_tokens_seen": 117483665, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.49609375, "step": 5472, "time_per_iteration": 2.4556658267974854 }, { "auxiliary_loss_clip": 0.01070522, "auxiliary_loss_mlp": 0.0105651, "balance_loss_clip": 1.02108133, "balance_loss_mlp": 1.02276826, "epoch": 0.32905456185179616, "flos": 20119292991360.0, "grad_norm": 1.657580946351629, "language_loss": 0.89811158, "learning_rate": 3.13292213457912e-06, "loss": 0.91938186, "num_input_tokens_seen": 117503565, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.4765625, "step": 5473, "time_per_iteration": 3.8442516326904297 }, { "auxiliary_loss_clip": 0.01070671, "auxiliary_loss_mlp": 0.0105083, "balance_loss_clip": 1.01482916, "balance_loss_mlp": 1.02272439, "epoch": 0.3291146851044642, "flos": 23179255902720.0, "grad_norm": 1.7489138242311906, "language_loss": 0.8018288, "learning_rate": 3.1326011622071903e-06, "loss": 0.82304382, "num_input_tokens_seen": 117521460, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.48046875, "step": 5474, "time_per_iteration": 2.4720981121063232 }, { "auxiliary_loss_clip": 0.01018412, "auxiliary_loss_mlp": 0.01011898, "balance_loss_clip": 1.00794005, "balance_loss_mlp": 1.00735259, "epoch": 0.32917480835713214, "flos": 67618626249600.0, "grad_norm": 0.8151204912893346, "language_loss": 0.60261691, "learning_rate": 3.132280146886911e-06, "loss": 0.62292004, "num_input_tokens_seen": 117580550, "router_z_loss_clip": 0.03955078, "router_z_loss_mlp": 0.11035156, "step": 5475, "time_per_iteration": 3.0370781421661377 }, { "auxiliary_loss_clip": 0.01070796, "auxiliary_loss_mlp": 0.01053714, "balance_loss_clip": 1.01833272, "balance_loss_mlp": 1.02154732, "epoch": 0.3292349316098001, "flos": 27963299306880.0, "grad_norm": 2.6025908525083623, "language_loss": 0.78246182, "learning_rate": 3.131959088630455e-06, "loss": 0.80370694, "num_input_tokens_seen": 117600645, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.4921875, "step": 5476, "time_per_iteration": 3.8866868019104004 }, { "auxiliary_loss_clip": 0.01068185, "auxiliary_loss_mlp": 0.01056084, "balance_loss_clip": 1.02426696, "balance_loss_mlp": 1.0216763, "epoch": 0.3292950548624681, "flos": 20262716323200.0, "grad_norm": 1.9755285835353642, "language_loss": 0.76634508, "learning_rate": 3.131637987449997e-06, "loss": 0.78758776, "num_input_tokens_seen": 117618880, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.46484375, "step": 5477, "time_per_iteration": 2.458207130432129 }, { "auxiliary_loss_clip": 0.01066479, "auxiliary_loss_mlp": 0.01048941, "balance_loss_clip": 1.01915038, "balance_loss_mlp": 1.02221942, "epoch": 0.32935517811513604, "flos": 20811969832320.0, "grad_norm": 2.332572780205217, "language_loss": 0.77103531, "learning_rate": 3.131316843357713e-06, "loss": 0.79218954, "num_input_tokens_seen": 117636445, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.44335938, "step": 5478, "time_per_iteration": 3.719945192337036 }, { "auxiliary_loss_clip": 0.0106732, "auxiliary_loss_mlp": 0.01049459, "balance_loss_clip": 1.02084851, "balance_loss_mlp": 1.02237797, "epoch": 0.329415301367804, "flos": 18440878423680.0, "grad_norm": 1.979483805708507, "language_loss": 0.81874299, "learning_rate": 3.1309956563657807e-06, "loss": 0.83991075, "num_input_tokens_seen": 117653105, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.44921875, "step": 5479, "time_per_iteration": 2.415287494659424 }, { "auxiliary_loss_clip": 0.01014871, "auxiliary_loss_mlp": 0.01006304, "balance_loss_clip": 1.00227451, "balance_loss_mlp": 1.00381243, "epoch": 0.32947542462047197, "flos": 66319367713920.0, "grad_norm": 0.7574141346833446, "language_loss": 0.56652832, "learning_rate": 3.1306744264863804e-06, "loss": 0.58674002, "num_input_tokens_seen": 117719225, "router_z_loss_clip": 0.0402832, "router_z_loss_mlp": 0.11035156, "step": 5480, "time_per_iteration": 4.502478837966919 }, { "auxiliary_loss_clip": 0.01067432, "auxiliary_loss_mlp": 0.01051628, "balance_loss_clip": 1.02115798, "balance_loss_mlp": 1.0213089, "epoch": 0.32953554787313993, "flos": 23220488085120.0, "grad_norm": 1.7147040560600784, "language_loss": 0.79020059, "learning_rate": 3.1303531537316915e-06, "loss": 0.81139117, "num_input_tokens_seen": 117738725, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.4609375, "step": 5481, "time_per_iteration": 2.4088613986968994 }, { "auxiliary_loss_clip": 0.01069121, "auxiliary_loss_mlp": 0.01050198, "balance_loss_clip": 1.01970458, "balance_loss_mlp": 1.02226782, "epoch": 0.3295956711258079, "flos": 27008460000000.0, "grad_norm": 1.6021857296811188, "language_loss": 0.7944243, "learning_rate": 3.130031838113899e-06, "loss": 0.8156175, "num_input_tokens_seen": 117757765, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.46875, "step": 5482, "time_per_iteration": 2.5287725925445557 }, { "auxiliary_loss_clip": 0.01067923, "auxiliary_loss_mlp": 0.01053743, "balance_loss_clip": 1.02112746, "balance_loss_mlp": 1.02166462, "epoch": 0.32965579437847586, "flos": 19170702817920.0, "grad_norm": 1.7349894740063465, "language_loss": 0.75354445, "learning_rate": 3.129710479645185e-06, "loss": 0.77476108, "num_input_tokens_seen": 117776810, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.46289062, "step": 5483, "time_per_iteration": 2.3935012817382812 }, { "auxiliary_loss_clip": 0.01068569, "auxiliary_loss_mlp": 0.0104648, "balance_loss_clip": 1.01467514, "balance_loss_mlp": 1.02223825, "epoch": 0.32971591763114383, "flos": 30481200449280.0, "grad_norm": 1.510621522648374, "language_loss": 0.76468033, "learning_rate": 3.1293890783377366e-06, "loss": 0.78583086, "num_input_tokens_seen": 117797730, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.46289062, "step": 5484, "time_per_iteration": 2.4957189559936523 }, { "auxiliary_loss_clip": 0.01066773, "auxiliary_loss_mlp": 0.01053798, "balance_loss_clip": 1.02403176, "balance_loss_mlp": 1.0221231, "epoch": 0.3297760408838118, "flos": 16288657528320.0, "grad_norm": 1.8867738920876655, "language_loss": 0.73081923, "learning_rate": 3.129067634203742e-06, "loss": 0.75202501, "num_input_tokens_seen": 117815365, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.44726562, "step": 5485, "time_per_iteration": 2.381308078765869 }, { "auxiliary_loss_clip": 0.01068353, "auxiliary_loss_mlp": 0.0104807, "balance_loss_clip": 1.01742101, "balance_loss_mlp": 1.02324891, "epoch": 0.32983616413647976, "flos": 29529712632960.0, "grad_norm": 1.6071068054535178, "language_loss": 0.81287748, "learning_rate": 3.128746147255388e-06, "loss": 0.83404171, "num_input_tokens_seen": 117836095, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.45117188, "step": 5486, "time_per_iteration": 2.492466449737549 }, { "auxiliary_loss_clip": 0.01065595, "auxiliary_loss_mlp": 0.01046355, "balance_loss_clip": 1.01674342, "balance_loss_mlp": 1.02061772, "epoch": 0.3298962873891478, "flos": 20630351606400.0, "grad_norm": 2.0795644484903235, "language_loss": 0.85784519, "learning_rate": 3.1284246175048683e-06, "loss": 0.87896466, "num_input_tokens_seen": 117854655, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.44921875, "step": 5487, "time_per_iteration": 2.4108831882476807 }, { "auxiliary_loss_clip": 0.0107001, "auxiliary_loss_mlp": 0.01051858, "balance_loss_clip": 1.01743019, "balance_loss_mlp": 1.02168489, "epoch": 0.32995641064181574, "flos": 14975120246400.0, "grad_norm": 2.2842888403109223, "language_loss": 0.76377618, "learning_rate": 3.1281030449643735e-06, "loss": 0.78499484, "num_input_tokens_seen": 117873300, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.48242188, "step": 5488, "time_per_iteration": 2.3967974185943604 }, { "auxiliary_loss_clip": 0.01068799, "auxiliary_loss_mlp": 0.01048399, "balance_loss_clip": 1.01502037, "balance_loss_mlp": 1.02126646, "epoch": 0.3300165338944837, "flos": 18660447164160.0, "grad_norm": 2.265223701919076, "language_loss": 0.74390244, "learning_rate": 3.127781429646098e-06, "loss": 0.76507437, "num_input_tokens_seen": 117891540, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.4765625, "step": 5489, "time_per_iteration": 2.367581605911255 }, { "auxiliary_loss_clip": 0.01065031, "auxiliary_loss_mlp": 0.01047996, "balance_loss_clip": 1.01904035, "balance_loss_mlp": 1.02081585, "epoch": 0.3300766571471517, "flos": 25582816742400.0, "grad_norm": 3.6457118803645074, "language_loss": 0.90861881, "learning_rate": 3.127459771562238e-06, "loss": 0.92974907, "num_input_tokens_seen": 117907690, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.44140625, "step": 5490, "time_per_iteration": 2.4556074142456055 }, { "auxiliary_loss_clip": 0.01065667, "auxiliary_loss_mlp": 0.0104686, "balance_loss_clip": 1.01813054, "balance_loss_mlp": 1.02036381, "epoch": 0.33013678039981964, "flos": 11362726892160.0, "grad_norm": 2.120566837696673, "language_loss": 0.84614813, "learning_rate": 3.1271380707249907e-06, "loss": 0.86727339, "num_input_tokens_seen": 117925640, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.453125, "step": 5491, "time_per_iteration": 2.400254249572754 }, { "auxiliary_loss_clip": 0.01067154, "auxiliary_loss_mlp": 0.01045384, "balance_loss_clip": 1.01475954, "balance_loss_mlp": 1.02242565, "epoch": 0.3301969036524876, "flos": 24820208714880.0, "grad_norm": 1.7802664977785094, "language_loss": 0.7887345, "learning_rate": 3.126816327146554e-06, "loss": 0.80985993, "num_input_tokens_seen": 117944525, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.44921875, "step": 5492, "time_per_iteration": 2.4701054096221924 }, { "auxiliary_loss_clip": 0.0107213, "auxiliary_loss_mlp": 0.01052329, "balance_loss_clip": 1.01954651, "balance_loss_mlp": 1.02508175, "epoch": 0.33025702690515557, "flos": 15960229568640.0, "grad_norm": 2.3051591869456094, "language_loss": 0.79062033, "learning_rate": 3.12649454083913e-06, "loss": 0.81186491, "num_input_tokens_seen": 117962515, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.47070312, "step": 5493, "time_per_iteration": 2.380720376968384 }, { "auxiliary_loss_clip": 0.01018753, "auxiliary_loss_mlp": 0.01013495, "balance_loss_clip": 1.00963306, "balance_loss_mlp": 1.00810361, "epoch": 0.33031715015782354, "flos": 59413582897920.0, "grad_norm": 0.7924774017818957, "language_loss": 0.54032993, "learning_rate": 3.12617271181492e-06, "loss": 0.56065243, "num_input_tokens_seen": 118018780, "router_z_loss_clip": 0.03857422, "router_z_loss_mlp": 0.10644531, "step": 5494, "time_per_iteration": 3.0080111026763916 }, { "auxiliary_loss_clip": 0.01066421, "auxiliary_loss_mlp": 0.01039171, "balance_loss_clip": 1.0096432, "balance_loss_mlp": 1.02180552, "epoch": 0.3303772734104915, "flos": 23183270709120.0, "grad_norm": 1.6078702539089902, "language_loss": 0.88068652, "learning_rate": 3.1258508400861276e-06, "loss": 0.90174246, "num_input_tokens_seen": 118038610, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.44726562, "step": 5495, "time_per_iteration": 2.42714524269104 }, { "auxiliary_loss_clip": 0.01068712, "auxiliary_loss_mlp": 0.0105223, "balance_loss_clip": 1.01854157, "balance_loss_mlp": 1.02165258, "epoch": 0.33043739666315947, "flos": 33070533966720.0, "grad_norm": 2.1179678722908046, "language_loss": 0.7483533, "learning_rate": 3.1255289256649587e-06, "loss": 0.76956272, "num_input_tokens_seen": 118055905, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.47070312, "step": 5496, "time_per_iteration": 2.5375235080718994 }, { "auxiliary_loss_clip": 0.01064277, "auxiliary_loss_mlp": 0.01039793, "balance_loss_clip": 1.01180291, "balance_loss_mlp": 1.0195998, "epoch": 0.33049751991582743, "flos": 24894399087360.0, "grad_norm": 1.88307215298916, "language_loss": 0.73423445, "learning_rate": 3.1252069685636196e-06, "loss": 0.75527513, "num_input_tokens_seen": 118073695, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.44726562, "step": 5497, "time_per_iteration": 2.4137399196624756 }, { "auxiliary_loss_clip": 0.01066817, "auxiliary_loss_mlp": 0.01049735, "balance_loss_clip": 1.0203855, "balance_loss_mlp": 1.0219686, "epoch": 0.3305576431684954, "flos": 29459292687360.0, "grad_norm": 10.21787146031512, "language_loss": 0.82659531, "learning_rate": 3.124884968794321e-06, "loss": 0.84776092, "num_input_tokens_seen": 118094030, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.44921875, "step": 5498, "time_per_iteration": 2.5626227855682373 }, { "auxiliary_loss_clip": 0.01064704, "auxiliary_loss_mlp": 0.01049177, "balance_loss_clip": 1.01708603, "balance_loss_mlp": 1.01883054, "epoch": 0.33061776642116336, "flos": 22631363936640.0, "grad_norm": 1.8232880853808997, "language_loss": 0.77206975, "learning_rate": 3.12456292636927e-06, "loss": 0.7932086, "num_input_tokens_seen": 118111665, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.45898438, "step": 5499, "time_per_iteration": 2.452155113220215 }, { "auxiliary_loss_clip": 0.01065824, "auxiliary_loss_mlp": 0.01043594, "balance_loss_clip": 1.01584184, "balance_loss_mlp": 1.0213263, "epoch": 0.3306778896738313, "flos": 25775117844480.0, "grad_norm": 1.5327099319429727, "language_loss": 0.80021322, "learning_rate": 3.124240841300681e-06, "loss": 0.82130742, "num_input_tokens_seen": 118132435, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.4453125, "step": 5500, "time_per_iteration": 2.467546224594116 }, { "auxiliary_loss_clip": 0.01068533, "auxiliary_loss_mlp": 0.01052879, "balance_loss_clip": 1.02101469, "balance_loss_mlp": 1.02160621, "epoch": 0.33073801292649935, "flos": 36939050121600.0, "grad_norm": 3.9929329613767677, "language_loss": 0.68335807, "learning_rate": 3.1239187136007665e-06, "loss": 0.7045722, "num_input_tokens_seen": 118155255, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.46875, "step": 5501, "time_per_iteration": 2.5195114612579346 }, { "auxiliary_loss_clip": 0.0106715, "auxiliary_loss_mlp": 0.0105617, "balance_loss_clip": 1.02329218, "balance_loss_mlp": 1.02014756, "epoch": 0.3307981361791673, "flos": 12966951087360.0, "grad_norm": 2.0132504324614344, "language_loss": 0.7947762, "learning_rate": 3.1235965432817417e-06, "loss": 0.8160094, "num_input_tokens_seen": 118169865, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.46875, "step": 5502, "time_per_iteration": 2.4046525955200195 }, { "auxiliary_loss_clip": 0.01068722, "auxiliary_loss_mlp": 0.01052602, "balance_loss_clip": 1.02225184, "balance_loss_mlp": 1.02211761, "epoch": 0.3308582594318353, "flos": 25373197739520.0, "grad_norm": 3.6641027725551827, "language_loss": 0.7379241, "learning_rate": 3.123274330355824e-06, "loss": 0.75913733, "num_input_tokens_seen": 118190760, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.46484375, "step": 5503, "time_per_iteration": 2.4225854873657227 }, { "auxiliary_loss_clip": 0.01066458, "auxiliary_loss_mlp": 0.01047582, "balance_loss_clip": 1.0182445, "balance_loss_mlp": 1.02013576, "epoch": 0.33091838268450324, "flos": 26467375749120.0, "grad_norm": 1.5122997215626903, "language_loss": 0.76359332, "learning_rate": 3.12295207483523e-06, "loss": 0.78473377, "num_input_tokens_seen": 118213620, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.46289062, "step": 5504, "time_per_iteration": 2.4794552326202393 }, { "auxiliary_loss_clip": 0.01067863, "auxiliary_loss_mlp": 0.01054725, "balance_loss_clip": 1.02468407, "balance_loss_mlp": 1.02207375, "epoch": 0.3309785059371712, "flos": 24970055736960.0, "grad_norm": 1.7107289746303507, "language_loss": 0.71400166, "learning_rate": 3.1226297767321816e-06, "loss": 0.73522747, "num_input_tokens_seen": 118235010, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.45703125, "step": 5505, "time_per_iteration": 2.4640347957611084 }, { "auxiliary_loss_clip": 0.0106794, "auxiliary_loss_mlp": 0.01058361, "balance_loss_clip": 1.02841604, "balance_loss_mlp": 1.02239799, "epoch": 0.3310386291898392, "flos": 20445731003520.0, "grad_norm": 1.7265188124845996, "language_loss": 0.83167517, "learning_rate": 3.122307436058899e-06, "loss": 0.85293818, "num_input_tokens_seen": 118255820, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.45507812, "step": 5506, "time_per_iteration": 2.4464664459228516 }, { "auxiliary_loss_clip": 0.01068137, "auxiliary_loss_mlp": 0.01050148, "balance_loss_clip": 1.01862884, "balance_loss_mlp": 1.02224028, "epoch": 0.33109875244250714, "flos": 23181629875200.0, "grad_norm": 1.822852472188488, "language_loss": 0.80865204, "learning_rate": 3.121985052827606e-06, "loss": 0.82983482, "num_input_tokens_seen": 118274160, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.45898438, "step": 5507, "time_per_iteration": 2.405184507369995 }, { "auxiliary_loss_clip": 0.01067925, "auxiliary_loss_mlp": 0.01052236, "balance_loss_clip": 1.02140844, "balance_loss_mlp": 1.02103972, "epoch": 0.3311588756951751, "flos": 24167297779200.0, "grad_norm": 1.5609399838841558, "language_loss": 0.73063791, "learning_rate": 3.1216626270505274e-06, "loss": 0.75183958, "num_input_tokens_seen": 118294385, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.46875, "step": 5508, "time_per_iteration": 2.4126219749450684 }, { "auxiliary_loss_clip": 0.01066651, "auxiliary_loss_mlp": 0.01045151, "balance_loss_clip": 1.01799536, "balance_loss_mlp": 1.02215159, "epoch": 0.33121899894784307, "flos": 28144533507840.0, "grad_norm": 1.9528328791651648, "language_loss": 0.72910577, "learning_rate": 3.12134015873989e-06, "loss": 0.75022376, "num_input_tokens_seen": 118313105, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.4453125, "step": 5509, "time_per_iteration": 2.4431326389312744 }, { "auxiliary_loss_clip": 0.01067881, "auxiliary_loss_mlp": 0.01055806, "balance_loss_clip": 1.02323818, "balance_loss_mlp": 1.02149141, "epoch": 0.33127912220051103, "flos": 29566441249920.0, "grad_norm": 1.5838832746367593, "language_loss": 0.75300443, "learning_rate": 3.121017647907921e-06, "loss": 0.77424133, "num_input_tokens_seen": 118335250, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.46289062, "step": 5510, "time_per_iteration": 2.460125684738159 }, { "auxiliary_loss_clip": 0.01066979, "auxiliary_loss_mlp": 0.01044518, "balance_loss_clip": 1.01597941, "balance_loss_mlp": 1.0215615, "epoch": 0.331339245453179, "flos": 14427961418880.0, "grad_norm": 2.151046158574119, "language_loss": 0.88810003, "learning_rate": 3.1206950945668508e-06, "loss": 0.90921497, "num_input_tokens_seen": 118351470, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.45507812, "step": 5511, "time_per_iteration": 2.4014720916748047 }, { "auxiliary_loss_clip": 0.01061953, "auxiliary_loss_mlp": 0.01045342, "balance_loss_clip": 1.01737547, "balance_loss_mlp": 1.02015519, "epoch": 0.33139936870584696, "flos": 20886055470720.0, "grad_norm": 1.6469094366882815, "language_loss": 0.75107086, "learning_rate": 3.12037249872891e-06, "loss": 0.77214372, "num_input_tokens_seen": 118370970, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41796875, "step": 5512, "time_per_iteration": 2.413630485534668 }, { "auxiliary_loss_clip": 0.01065175, "auxiliary_loss_mlp": 0.01047974, "balance_loss_clip": 1.01845741, "balance_loss_mlp": 1.02096868, "epoch": 0.33145949195851493, "flos": 36282857518080.0, "grad_norm": 1.7402102366933752, "language_loss": 0.7423619, "learning_rate": 3.1200498604063317e-06, "loss": 0.76349342, "num_input_tokens_seen": 118393125, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.44140625, "step": 5513, "time_per_iteration": 3.9858715534210205 }, { "auxiliary_loss_clip": 0.01070656, "auxiliary_loss_mlp": 0.01049198, "balance_loss_clip": 1.014961, "balance_loss_mlp": 1.02202606, "epoch": 0.33151961521118295, "flos": 14278952269440.0, "grad_norm": 1.9445672528085716, "language_loss": 0.70136344, "learning_rate": 3.1197271796113507e-06, "loss": 0.72256196, "num_input_tokens_seen": 118410860, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.48632812, "step": 5514, "time_per_iteration": 2.4123313426971436 }, { "auxiliary_loss_clip": 0.01068751, "auxiliary_loss_mlp": 0.01055762, "balance_loss_clip": 1.01921237, "balance_loss_mlp": 1.02175939, "epoch": 0.3315797384638509, "flos": 20773356001920.0, "grad_norm": 1.9624782120564408, "language_loss": 0.67865241, "learning_rate": 3.1194044563562026e-06, "loss": 0.69989753, "num_input_tokens_seen": 118429570, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.46875, "step": 5515, "time_per_iteration": 2.4035611152648926 }, { "auxiliary_loss_clip": 0.01068248, "auxiliary_loss_mlp": 0.01050317, "balance_loss_clip": 1.01815462, "balance_loss_mlp": 1.02127504, "epoch": 0.3316398617165189, "flos": 24678356394240.0, "grad_norm": 1.503308295948925, "language_loss": 0.7042461, "learning_rate": 3.1190816906531257e-06, "loss": 0.72543174, "num_input_tokens_seen": 118450285, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.46875, "step": 5516, "time_per_iteration": 3.963435411453247 }, { "auxiliary_loss_clip": 0.01068019, "auxiliary_loss_mlp": 0.0104742, "balance_loss_clip": 1.01530528, "balance_loss_mlp": 1.02098012, "epoch": 0.33169998496918685, "flos": 18586989930240.0, "grad_norm": 2.3380752536182934, "language_loss": 0.81801927, "learning_rate": 3.118758882514359e-06, "loss": 0.83917367, "num_input_tokens_seen": 118468270, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.46875, "step": 5517, "time_per_iteration": 2.3873612880706787 }, { "auxiliary_loss_clip": 0.01063658, "auxiliary_loss_mlp": 0.01051938, "balance_loss_clip": 1.02257693, "balance_loss_mlp": 1.0199821, "epoch": 0.3317601082218548, "flos": 20192610579840.0, "grad_norm": 1.7445561686908913, "language_loss": 0.75411689, "learning_rate": 3.118436031952143e-06, "loss": 0.77527279, "num_input_tokens_seen": 118486615, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.43554688, "step": 5518, "time_per_iteration": 3.9123880863189697 }, { "auxiliary_loss_clip": 0.01013896, "auxiliary_loss_mlp": 0.01003542, "balance_loss_clip": 1.00018036, "balance_loss_mlp": 1.00407183, "epoch": 0.3318202314745228, "flos": 68971301032320.0, "grad_norm": 0.6134744398193794, "language_loss": 0.5438391, "learning_rate": 3.1181131389787206e-06, "loss": 0.56401348, "num_input_tokens_seen": 118553580, "router_z_loss_clip": 0.03369141, "router_z_loss_mlp": 0.09863281, "step": 5519, "time_per_iteration": 3.159447431564331 }, { "auxiliary_loss_clip": 0.01068602, "auxiliary_loss_mlp": 0.0104616, "balance_loss_clip": 1.01187575, "balance_loss_mlp": 1.02180362, "epoch": 0.33188035472719074, "flos": 21499235412480.0, "grad_norm": 2.3831432986703933, "language_loss": 0.8002184, "learning_rate": 3.117790203606336e-06, "loss": 0.82136601, "num_input_tokens_seen": 118570280, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.46679688, "step": 5520, "time_per_iteration": 3.846764326095581 }, { "auxiliary_loss_clip": 0.01065797, "auxiliary_loss_mlp": 0.01044476, "balance_loss_clip": 1.01505494, "balance_loss_mlp": 1.0214653, "epoch": 0.3319404779798587, "flos": 28869400488960.0, "grad_norm": 1.755299528246814, "language_loss": 0.77069938, "learning_rate": 3.1174672258472344e-06, "loss": 0.79180211, "num_input_tokens_seen": 118590455, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.44335938, "step": 5521, "time_per_iteration": 2.436143159866333 }, { "auxiliary_loss_clip": 0.01067193, "auxiliary_loss_mlp": 0.01052319, "balance_loss_clip": 1.01921439, "balance_loss_mlp": 1.02030063, "epoch": 0.33200060123252667, "flos": 23075773032960.0, "grad_norm": 2.278920552109586, "language_loss": 0.7170167, "learning_rate": 3.117144205713664e-06, "loss": 0.73821181, "num_input_tokens_seen": 118609495, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.46875, "step": 5522, "time_per_iteration": 2.4178173542022705 }, { "auxiliary_loss_clip": 0.01066347, "auxiliary_loss_mlp": 0.01045501, "balance_loss_clip": 1.01521027, "balance_loss_mlp": 1.02113008, "epoch": 0.33206072448519464, "flos": 21141410221440.0, "grad_norm": 1.6408733300866396, "language_loss": 0.74969375, "learning_rate": 3.1168211432178735e-06, "loss": 0.77081221, "num_input_tokens_seen": 118628720, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.453125, "step": 5523, "time_per_iteration": 2.3969779014587402 }, { "auxiliary_loss_clip": 0.01063143, "auxiliary_loss_mlp": 0.01052029, "balance_loss_clip": 1.02067685, "balance_loss_mlp": 1.01901472, "epoch": 0.3321208477378626, "flos": 13078254101760.0, "grad_norm": 1.6908955603754228, "language_loss": 0.82884973, "learning_rate": 3.116498038372114e-06, "loss": 0.85000145, "num_input_tokens_seen": 118645955, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.44140625, "step": 5524, "time_per_iteration": 2.405308246612549 }, { "auxiliary_loss_clip": 0.01063999, "auxiliary_loss_mlp": 0.01046272, "balance_loss_clip": 1.01638639, "balance_loss_mlp": 1.01989019, "epoch": 0.33218097099053057, "flos": 21214343784960.0, "grad_norm": 1.6301800347981898, "language_loss": 0.84321415, "learning_rate": 3.116174891188636e-06, "loss": 0.86431688, "num_input_tokens_seen": 118665605, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.44140625, "step": 5525, "time_per_iteration": 2.382605791091919 }, { "auxiliary_loss_clip": 0.0101468, "auxiliary_loss_mlp": 0.01006241, "balance_loss_clip": 1.00324929, "balance_loss_mlp": 1.00471103, "epoch": 0.33224109424319853, "flos": 64345483376640.0, "grad_norm": 0.7671220726632565, "language_loss": 0.5283705, "learning_rate": 3.1158517016796945e-06, "loss": 0.54857969, "num_input_tokens_seen": 118728155, "router_z_loss_clip": 0.02990723, "router_z_loss_mlp": 0.09960938, "step": 5526, "time_per_iteration": 3.0100691318511963 }, { "auxiliary_loss_clip": 0.01068227, "auxiliary_loss_mlp": 0.0104961, "balance_loss_clip": 1.01711345, "balance_loss_mlp": 1.02146459, "epoch": 0.33230121749586655, "flos": 17345094491520.0, "grad_norm": 2.028689111528651, "language_loss": 0.79429865, "learning_rate": 3.1155284698575445e-06, "loss": 0.81547707, "num_input_tokens_seen": 118743955, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.46875, "step": 5527, "time_per_iteration": 2.339938163757324 }, { "auxiliary_loss_clip": 0.01067287, "auxiliary_loss_mlp": 0.01050539, "balance_loss_clip": 1.02108264, "balance_loss_mlp": 1.02183199, "epoch": 0.3323613407485345, "flos": 20995962030720.0, "grad_norm": 2.307783155249437, "language_loss": 0.74100137, "learning_rate": 3.1152051957344434e-06, "loss": 0.76217967, "num_input_tokens_seen": 118763275, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.45507812, "step": 5528, "time_per_iteration": 2.4466662406921387 }, { "auxiliary_loss_clip": 0.01065829, "auxiliary_loss_mlp": 0.01042018, "balance_loss_clip": 1.01130939, "balance_loss_mlp": 1.02075148, "epoch": 0.3324214640012025, "flos": 13151676424320.0, "grad_norm": 3.1808135996316844, "language_loss": 0.85380387, "learning_rate": 3.1148818793226497e-06, "loss": 0.8748824, "num_input_tokens_seen": 118781110, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.44921875, "step": 5529, "time_per_iteration": 2.359931707382202 }, { "auxiliary_loss_clip": 0.01071208, "auxiliary_loss_mlp": 0.01049837, "balance_loss_clip": 1.01701844, "balance_loss_mlp": 1.02240777, "epoch": 0.33248158725387045, "flos": 22272421582080.0, "grad_norm": 1.8985316959994523, "language_loss": 0.71406293, "learning_rate": 3.114558520634423e-06, "loss": 0.73527336, "num_input_tokens_seen": 118800620, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.48828125, "step": 5530, "time_per_iteration": 2.4435713291168213 }, { "auxiliary_loss_clip": 0.01069353, "auxiliary_loss_mlp": 0.01060694, "balance_loss_clip": 1.02609909, "balance_loss_mlp": 1.02106833, "epoch": 0.3325417105065384, "flos": 20739943964160.0, "grad_norm": 2.6105215786288465, "language_loss": 0.78959298, "learning_rate": 3.1142351196820256e-06, "loss": 0.81089348, "num_input_tokens_seen": 118818725, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.48242188, "step": 5531, "time_per_iteration": 2.3819162845611572 }, { "auxiliary_loss_clip": 0.01068696, "auxiliary_loss_mlp": 0.01056592, "balance_loss_clip": 1.02225995, "balance_loss_mlp": 1.02109945, "epoch": 0.3326018337592064, "flos": 24789380117760.0, "grad_norm": 1.874893765084891, "language_loss": 0.7516399, "learning_rate": 3.1139116764777206e-06, "loss": 0.77289283, "num_input_tokens_seen": 118839390, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.47460938, "step": 5532, "time_per_iteration": 2.4730160236358643 }, { "auxiliary_loss_clip": 0.01067484, "auxiliary_loss_mlp": 0.0104466, "balance_loss_clip": 1.01348662, "balance_loss_mlp": 1.02159595, "epoch": 0.33266195701187434, "flos": 14500825159680.0, "grad_norm": 1.9488794901658864, "language_loss": 0.67895961, "learning_rate": 3.1135881910337735e-06, "loss": 0.70008105, "num_input_tokens_seen": 118856275, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.45703125, "step": 5533, "time_per_iteration": 2.392252206802368 }, { "auxiliary_loss_clip": 0.01066154, "auxiliary_loss_mlp": 0.01047667, "balance_loss_clip": 1.01526546, "balance_loss_mlp": 1.01999915, "epoch": 0.3327220802645423, "flos": 15303513294720.0, "grad_norm": 1.7105923164397687, "language_loss": 0.72640055, "learning_rate": 3.113264663362451e-06, "loss": 0.74753875, "num_input_tokens_seen": 118873830, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.4609375, "step": 5534, "time_per_iteration": 2.3850626945495605 }, { "auxiliary_loss_clip": 0.0106673, "auxiliary_loss_mlp": 0.01049427, "balance_loss_clip": 1.01881421, "balance_loss_mlp": 1.02165818, "epoch": 0.3327822035172103, "flos": 23476401417600.0, "grad_norm": 1.5040667491867912, "language_loss": 0.68271017, "learning_rate": 3.1129410934760204e-06, "loss": 0.70387179, "num_input_tokens_seen": 118891560, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.45117188, "step": 5535, "time_per_iteration": 2.42374849319458 }, { "auxiliary_loss_clip": 0.01067112, "auxiliary_loss_mlp": 0.01054911, "balance_loss_clip": 1.0211513, "balance_loss_mlp": 1.02041352, "epoch": 0.33284232676987824, "flos": 25373337384960.0, "grad_norm": 2.414191757133029, "language_loss": 0.74202591, "learning_rate": 3.1126174813867517e-06, "loss": 0.76324612, "num_input_tokens_seen": 118910260, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.46679688, "step": 5536, "time_per_iteration": 2.4249255657196045 }, { "auxiliary_loss_clip": 0.01067003, "auxiliary_loss_mlp": 0.01053325, "balance_loss_clip": 1.01997042, "balance_loss_mlp": 1.01956081, "epoch": 0.3329024500225462, "flos": 23693281983360.0, "grad_norm": 1.5487821894580853, "language_loss": 0.8231827, "learning_rate": 3.112293827106917e-06, "loss": 0.84438598, "num_input_tokens_seen": 118929985, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.47265625, "step": 5537, "time_per_iteration": 2.4379494190216064 }, { "auxiliary_loss_clip": 0.01071755, "auxiliary_loss_mlp": 0.0106032, "balance_loss_clip": 1.02424753, "balance_loss_mlp": 1.02279043, "epoch": 0.33296257327521417, "flos": 31721804167680.0, "grad_norm": 1.7360288970815656, "language_loss": 0.72808194, "learning_rate": 3.111970130648789e-06, "loss": 0.7494027, "num_input_tokens_seen": 118951355, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.48828125, "step": 5538, "time_per_iteration": 2.4575870037078857 }, { "auxiliary_loss_clip": 0.01066021, "auxiliary_loss_mlp": 0.01047458, "balance_loss_clip": 1.01536655, "balance_loss_mlp": 1.02033854, "epoch": 0.33302269652788213, "flos": 22743679380480.0, "grad_norm": 1.7502021943892294, "language_loss": 0.76000631, "learning_rate": 3.1116463920246424e-06, "loss": 0.78114104, "num_input_tokens_seen": 118970910, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.45703125, "step": 5539, "time_per_iteration": 2.4459168910980225 }, { "auxiliary_loss_clip": 0.01069861, "auxiliary_loss_mlp": 0.0106094, "balance_loss_clip": 1.02417541, "balance_loss_mlp": 1.02080703, "epoch": 0.33308281978055015, "flos": 11472947654400.0, "grad_norm": 2.1327551749953875, "language_loss": 0.7253201, "learning_rate": 3.1113226112467527e-06, "loss": 0.74662811, "num_input_tokens_seen": 118989200, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.4921875, "step": 5540, "time_per_iteration": 2.362872838973999 }, { "auxiliary_loss_clip": 0.01065538, "auxiliary_loss_mlp": 0.01045596, "balance_loss_clip": 1.01383877, "balance_loss_mlp": 1.01914048, "epoch": 0.3331429430332181, "flos": 38212262916480.0, "grad_norm": 1.95670488799209, "language_loss": 0.61848658, "learning_rate": 3.1109987883273983e-06, "loss": 0.63959789, "num_input_tokens_seen": 119011030, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.46484375, "step": 5541, "time_per_iteration": 2.5486080646514893 }, { "auxiliary_loss_clip": 0.01067634, "auxiliary_loss_mlp": 0.01047786, "balance_loss_clip": 1.01497912, "balance_loss_mlp": 1.01954246, "epoch": 0.3332030662858861, "flos": 22527566864640.0, "grad_norm": 1.6650601468472461, "language_loss": 0.70365632, "learning_rate": 3.1106749232788584e-06, "loss": 0.72481048, "num_input_tokens_seen": 119030620, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.48046875, "step": 5542, "time_per_iteration": 2.3815624713897705 }, { "auxiliary_loss_clip": 0.01068059, "auxiliary_loss_mlp": 0.01052371, "balance_loss_clip": 1.01989865, "balance_loss_mlp": 1.02084637, "epoch": 0.33326318953855405, "flos": 15996853451520.0, "grad_norm": 1.6876551651327367, "language_loss": 0.76841801, "learning_rate": 3.110351016113414e-06, "loss": 0.78962231, "num_input_tokens_seen": 119048015, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.47265625, "step": 5543, "time_per_iteration": 2.383723735809326 }, { "auxiliary_loss_clip": 0.01070056, "auxiliary_loss_mlp": 0.01045169, "balance_loss_clip": 1.01231503, "balance_loss_mlp": 1.02211916, "epoch": 0.333323312791222, "flos": 25592347543680.0, "grad_norm": 1.6894072031269822, "language_loss": 0.76614809, "learning_rate": 3.110027066843348e-06, "loss": 0.78730023, "num_input_tokens_seen": 119066280, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.48046875, "step": 5544, "time_per_iteration": 2.419053792953491 }, { "auxiliary_loss_clip": 0.01064103, "auxiliary_loss_mlp": 0.01047354, "balance_loss_clip": 1.01490521, "balance_loss_mlp": 1.01936388, "epoch": 0.33338343604389, "flos": 25118366659200.0, "grad_norm": 1.7375543028142386, "language_loss": 0.72078669, "learning_rate": 3.1097030754809456e-06, "loss": 0.74190122, "num_input_tokens_seen": 119087680, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.44726562, "step": 5545, "time_per_iteration": 2.4689018726348877 }, { "auxiliary_loss_clip": 0.0106316, "auxiliary_loss_mlp": 0.01050276, "balance_loss_clip": 1.01954389, "balance_loss_mlp": 1.01910865, "epoch": 0.33344355929655795, "flos": 16946316408960.0, "grad_norm": 1.6401444080112273, "language_loss": 0.70850277, "learning_rate": 3.1093790420384894e-06, "loss": 0.72963715, "num_input_tokens_seen": 119105820, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.44140625, "step": 5546, "time_per_iteration": 2.3480796813964844 }, { "auxiliary_loss_clip": 0.01068035, "auxiliary_loss_mlp": 0.01050286, "balance_loss_clip": 1.01748013, "balance_loss_mlp": 1.01914144, "epoch": 0.3335036825492259, "flos": 27888410707200.0, "grad_norm": 1.8541341515863892, "language_loss": 0.66176319, "learning_rate": 3.1090549665282702e-06, "loss": 0.68294644, "num_input_tokens_seen": 119126630, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.49023438, "step": 5547, "time_per_iteration": 2.4429309368133545 }, { "auxiliary_loss_clip": 0.01064187, "auxiliary_loss_mlp": 0.01044231, "balance_loss_clip": 1.01527524, "balance_loss_mlp": 1.01977515, "epoch": 0.3335638058018939, "flos": 16178646234240.0, "grad_norm": 2.3799267170328116, "language_loss": 0.86656547, "learning_rate": 3.1087308489625742e-06, "loss": 0.88764966, "num_input_tokens_seen": 119143375, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4453125, "step": 5548, "time_per_iteration": 2.3469579219818115 }, { "auxiliary_loss_clip": 0.01066262, "auxiliary_loss_mlp": 0.01048664, "balance_loss_clip": 1.014642, "balance_loss_mlp": 1.01813841, "epoch": 0.33362392905456184, "flos": 39894517733760.0, "grad_norm": 3.0900609617376693, "language_loss": 0.76219141, "learning_rate": 3.1084066893536945e-06, "loss": 0.78334063, "num_input_tokens_seen": 119166450, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.48046875, "step": 5549, "time_per_iteration": 2.597487449645996 }, { "auxiliary_loss_clip": 0.01067226, "auxiliary_loss_mlp": 0.01050627, "balance_loss_clip": 1.01746321, "balance_loss_mlp": 1.0211978, "epoch": 0.3336840523072298, "flos": 44269588938240.0, "grad_norm": 1.9986789310884376, "language_loss": 0.70765281, "learning_rate": 3.108082487713921e-06, "loss": 0.72883129, "num_input_tokens_seen": 119189645, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.4609375, "step": 5550, "time_per_iteration": 2.556265354156494 }, { "auxiliary_loss_clip": 0.01066262, "auxiliary_loss_mlp": 0.0104822, "balance_loss_clip": 1.01527071, "balance_loss_mlp": 1.01987147, "epoch": 0.33374417555989777, "flos": 15084782426880.0, "grad_norm": 2.1305881467545884, "language_loss": 0.62159312, "learning_rate": 3.1077582440555495e-06, "loss": 0.64273798, "num_input_tokens_seen": 119208045, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.46484375, "step": 5551, "time_per_iteration": 2.370605707168579 }, { "auxiliary_loss_clip": 0.01063264, "auxiliary_loss_mlp": 0.01045457, "balance_loss_clip": 1.01412892, "balance_loss_mlp": 1.018139, "epoch": 0.33380429881256574, "flos": 15848333061120.0, "grad_norm": 1.7851044608370739, "language_loss": 0.71472001, "learning_rate": 3.1074339583908746e-06, "loss": 0.73580718, "num_input_tokens_seen": 119224910, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.45117188, "step": 5552, "time_per_iteration": 2.346456527709961 }, { "auxiliary_loss_clip": 0.01067, "auxiliary_loss_mlp": 0.01049714, "balance_loss_clip": 1.01793289, "balance_loss_mlp": 1.02008259, "epoch": 0.33386442206523376, "flos": 13479475979520.0, "grad_norm": 1.9740342489108995, "language_loss": 0.85231888, "learning_rate": 3.107109630732192e-06, "loss": 0.87348604, "num_input_tokens_seen": 119243290, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.46875, "step": 5553, "time_per_iteration": 3.800049304962158 }, { "auxiliary_loss_clip": 0.0106815, "auxiliary_loss_mlp": 0.01052184, "balance_loss_clip": 1.01955664, "balance_loss_mlp": 1.02085578, "epoch": 0.3339245453179017, "flos": 16689739760640.0, "grad_norm": 2.152275254503383, "language_loss": 0.82703221, "learning_rate": 3.1067852610918017e-06, "loss": 0.84823561, "num_input_tokens_seen": 119261195, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.47265625, "step": 5554, "time_per_iteration": 2.3561229705810547 }, { "auxiliary_loss_clip": 0.0106671, "auxiliary_loss_mlp": 0.010531, "balance_loss_clip": 1.01826715, "balance_loss_mlp": 1.01989329, "epoch": 0.3339846685705697, "flos": 24609402725760.0, "grad_norm": 1.5597031850965786, "language_loss": 0.83123964, "learning_rate": 3.1064608494820032e-06, "loss": 0.85243773, "num_input_tokens_seen": 119282845, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.46875, "step": 5555, "time_per_iteration": 3.8779356479644775 }, { "auxiliary_loss_clip": 0.01065097, "auxiliary_loss_mlp": 0.01052509, "balance_loss_clip": 1.02067995, "balance_loss_mlp": 1.01899779, "epoch": 0.33404479182323765, "flos": 30952562981760.0, "grad_norm": 1.844870268208161, "language_loss": 0.75452471, "learning_rate": 3.106136395915099e-06, "loss": 0.77570069, "num_input_tokens_seen": 119304430, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.4609375, "step": 5556, "time_per_iteration": 2.508586883544922 }, { "auxiliary_loss_clip": 0.01064384, "auxiliary_loss_mlp": 0.01052706, "balance_loss_clip": 1.01999497, "balance_loss_mlp": 1.01947927, "epoch": 0.3341049150759056, "flos": 23512187427840.0, "grad_norm": 1.4398053396018582, "language_loss": 0.83800936, "learning_rate": 3.105811900403391e-06, "loss": 0.85918033, "num_input_tokens_seen": 119323830, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.44921875, "step": 5557, "time_per_iteration": 3.798022508621216 }, { "auxiliary_loss_clip": 0.01067183, "auxiliary_loss_mlp": 0.01055834, "balance_loss_clip": 1.02309871, "balance_loss_mlp": 1.02065444, "epoch": 0.3341650383285736, "flos": 24025620015360.0, "grad_norm": 1.6513518840894525, "language_loss": 0.81562614, "learning_rate": 3.1054873629591855e-06, "loss": 0.83685625, "num_input_tokens_seen": 119346340, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.46484375, "step": 5558, "time_per_iteration": 2.4696714878082275 }, { "auxiliary_loss_clip": 0.01067048, "auxiliary_loss_mlp": 0.01047962, "balance_loss_clip": 1.01508427, "balance_loss_mlp": 1.02057874, "epoch": 0.33422516158124155, "flos": 24900752954880.0, "grad_norm": 1.877649582523763, "language_loss": 0.82343793, "learning_rate": 3.105162783594788e-06, "loss": 0.84458804, "num_input_tokens_seen": 119367285, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.46484375, "step": 5559, "time_per_iteration": 3.8448479175567627 }, { "auxiliary_loss_clip": 0.01065012, "auxiliary_loss_mlp": 0.01045515, "balance_loss_clip": 1.01452065, "balance_loss_mlp": 1.01985836, "epoch": 0.3342852848339095, "flos": 18332403229440.0, "grad_norm": 1.7383749147130878, "language_loss": 0.72301978, "learning_rate": 3.1048381623225074e-06, "loss": 0.74412507, "num_input_tokens_seen": 119385370, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.45117188, "step": 5560, "time_per_iteration": 2.348094940185547 }, { "auxiliary_loss_clip": 0.01069369, "auxiliary_loss_mlp": 0.01059877, "balance_loss_clip": 1.02549708, "balance_loss_mlp": 1.02116024, "epoch": 0.3343454080865775, "flos": 30045170079360.0, "grad_norm": 1.4026019237757872, "language_loss": 0.76227593, "learning_rate": 3.1045134991546526e-06, "loss": 0.78356838, "num_input_tokens_seen": 119409150, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.48242188, "step": 5561, "time_per_iteration": 2.4864394664764404 }, { "auxiliary_loss_clip": 0.01065767, "auxiliary_loss_mlp": 0.01053471, "balance_loss_clip": 1.02064061, "balance_loss_mlp": 1.01939368, "epoch": 0.33440553133924544, "flos": 16397900772480.0, "grad_norm": 1.978325194487784, "language_loss": 0.70770085, "learning_rate": 3.1041887941035355e-06, "loss": 0.72889322, "num_input_tokens_seen": 119426475, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.46289062, "step": 5562, "time_per_iteration": 2.357445478439331 }, { "auxiliary_loss_clip": 0.01063417, "auxiliary_loss_mlp": 0.01049156, "balance_loss_clip": 1.01887655, "balance_loss_mlp": 1.01929855, "epoch": 0.3344656545919134, "flos": 24240964481280.0, "grad_norm": 1.5967967096923024, "language_loss": 0.66440094, "learning_rate": 3.1038640471814685e-06, "loss": 0.68552661, "num_input_tokens_seen": 119446900, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.44140625, "step": 5563, "time_per_iteration": 2.4313278198242188 }, { "auxiliary_loss_clip": 0.01070596, "auxiliary_loss_mlp": 0.01062736, "balance_loss_clip": 1.02244306, "balance_loss_mlp": 1.02152395, "epoch": 0.3345257778445814, "flos": 52116911832960.0, "grad_norm": 1.4314163691201478, "language_loss": 0.7496143, "learning_rate": 3.103539258400766e-06, "loss": 0.77094758, "num_input_tokens_seen": 119470945, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 0.49023438, "step": 5564, "time_per_iteration": 2.6400632858276367 }, { "auxiliary_loss_clip": 0.01012921, "auxiliary_loss_mlp": 0.01017718, "balance_loss_clip": 1.01426136, "balance_loss_mlp": 1.00325263, "epoch": 0.33458590109724934, "flos": 68045614577280.0, "grad_norm": 0.7912708442023627, "language_loss": 0.55605054, "learning_rate": 3.103214427773745e-06, "loss": 0.57635689, "num_input_tokens_seen": 119529925, "router_z_loss_clip": 0.03466797, "router_z_loss_mlp": 0.09667969, "step": 5565, "time_per_iteration": 2.9798686504364014 }, { "auxiliary_loss_clip": 0.01065345, "auxiliary_loss_mlp": 0.01047019, "balance_loss_clip": 1.01564276, "balance_loss_mlp": 1.02047253, "epoch": 0.3346460243499173, "flos": 37413275385600.0, "grad_norm": 1.720286915989064, "language_loss": 0.66047359, "learning_rate": 3.102889555312721e-06, "loss": 0.68159723, "num_input_tokens_seen": 119550700, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.44921875, "step": 5566, "time_per_iteration": 2.4966275691986084 }, { "auxiliary_loss_clip": 0.01067502, "auxiliary_loss_mlp": 0.01054935, "balance_loss_clip": 1.02055478, "balance_loss_mlp": 1.02218676, "epoch": 0.3347061476025853, "flos": 18696372819840.0, "grad_norm": 1.8465678724378116, "language_loss": 0.79194248, "learning_rate": 3.102564641030016e-06, "loss": 0.81316686, "num_input_tokens_seen": 119569295, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.453125, "step": 5567, "time_per_iteration": 2.379993438720703 }, { "auxiliary_loss_clip": 0.01068202, "auxiliary_loss_mlp": 0.01054158, "balance_loss_clip": 1.0196588, "balance_loss_mlp": 1.02079844, "epoch": 0.3347662708552533, "flos": 13916972626560.0, "grad_norm": 1.6636252602609993, "language_loss": 0.78767353, "learning_rate": 3.102239684937949e-06, "loss": 0.80889714, "num_input_tokens_seen": 119587375, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.47460938, "step": 5568, "time_per_iteration": 2.367408275604248 }, { "auxiliary_loss_clip": 0.01069453, "auxiliary_loss_mlp": 0.01054087, "balance_loss_clip": 1.01975489, "balance_loss_mlp": 1.02062917, "epoch": 0.33482639410792125, "flos": 19749528115200.0, "grad_norm": 2.410536661939314, "language_loss": 0.72736979, "learning_rate": 3.101914687048842e-06, "loss": 0.74860513, "num_input_tokens_seen": 119604530, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.48828125, "step": 5569, "time_per_iteration": 2.3954734802246094 }, { "auxiliary_loss_clip": 0.01068192, "auxiliary_loss_mlp": 0.01054216, "balance_loss_clip": 1.01785684, "balance_loss_mlp": 1.01997232, "epoch": 0.3348865173605892, "flos": 16102186623360.0, "grad_norm": 2.0483152476769706, "language_loss": 0.91174716, "learning_rate": 3.10158964737502e-06, "loss": 0.93297124, "num_input_tokens_seen": 119621025, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.48242188, "step": 5570, "time_per_iteration": 2.350863456726074 }, { "auxiliary_loss_clip": 0.01067499, "auxiliary_loss_mlp": 0.0105263, "balance_loss_clip": 1.01839328, "balance_loss_mlp": 1.02037358, "epoch": 0.3349466406132572, "flos": 25007796783360.0, "grad_norm": 1.5514983945285732, "language_loss": 0.81122887, "learning_rate": 3.101264565928808e-06, "loss": 0.83243018, "num_input_tokens_seen": 119641725, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.47070312, "step": 5571, "time_per_iteration": 2.4733152389526367 }, { "auxiliary_loss_clip": 0.01013143, "auxiliary_loss_mlp": 0.0101139, "balance_loss_clip": 1.00762308, "balance_loss_mlp": 1.00299728, "epoch": 0.33500676386592515, "flos": 54316647089280.0, "grad_norm": 0.9013404278223167, "language_loss": 0.56114805, "learning_rate": 3.1009394427225335e-06, "loss": 0.58139336, "num_input_tokens_seen": 119693560, "router_z_loss_clip": 0.03759766, "router_z_loss_mlp": 0.1015625, "step": 5572, "time_per_iteration": 2.9561569690704346 }, { "auxiliary_loss_clip": 0.0106837, "auxiliary_loss_mlp": 0.01059071, "balance_loss_clip": 1.02526331, "balance_loss_mlp": 1.02196646, "epoch": 0.3350668871185931, "flos": 26796117911040.0, "grad_norm": 1.9449858012547798, "language_loss": 0.79893881, "learning_rate": 3.1006142777685257e-06, "loss": 0.8202132, "num_input_tokens_seen": 119712935, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.46289062, "step": 5573, "time_per_iteration": 2.4576141834259033 }, { "auxiliary_loss_clip": 0.01071479, "auxiliary_loss_mlp": 0.01066208, "balance_loss_clip": 1.02784634, "balance_loss_mlp": 1.02226162, "epoch": 0.3351270103712611, "flos": 33509112865920.0, "grad_norm": 2.3134797752167184, "language_loss": 0.74609244, "learning_rate": 3.1002890710791133e-06, "loss": 0.76746935, "num_input_tokens_seen": 119731680, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.4921875, "step": 5574, "time_per_iteration": 2.471407651901245 }, { "auxiliary_loss_clip": 0.01065267, "auxiliary_loss_mlp": 0.01056802, "balance_loss_clip": 1.02516413, "balance_loss_mlp": 1.02083969, "epoch": 0.33518713362392905, "flos": 26505012061440.0, "grad_norm": 1.5849993303176617, "language_loss": 0.89587009, "learning_rate": 3.0999638226666287e-06, "loss": 0.91709083, "num_input_tokens_seen": 119752155, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.44335938, "step": 5575, "time_per_iteration": 2.4425432682037354 }, { "auxiliary_loss_clip": 0.01072843, "auxiliary_loss_mlp": 0.01062257, "balance_loss_clip": 1.02325165, "balance_loss_mlp": 1.02178812, "epoch": 0.335247256876597, "flos": 17231557150080.0, "grad_norm": 2.1406495349784174, "language_loss": 0.8409152, "learning_rate": 3.0996385325434063e-06, "loss": 0.86226618, "num_input_tokens_seen": 119769195, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.51171875, "step": 5576, "time_per_iteration": 2.3642022609710693 }, { "auxiliary_loss_clip": 0.01072025, "auxiliary_loss_mlp": 0.01054056, "balance_loss_clip": 1.01621926, "balance_loss_mlp": 1.0221715, "epoch": 0.335307380129265, "flos": 25628203376640.0, "grad_norm": 2.2366104411912704, "language_loss": 0.74545348, "learning_rate": 3.0993132007217806e-06, "loss": 0.76671433, "num_input_tokens_seen": 119786810, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5, "step": 5577, "time_per_iteration": 2.439854860305786 }, { "auxiliary_loss_clip": 0.01071105, "auxiliary_loss_mlp": 0.01061037, "balance_loss_clip": 1.02515507, "balance_loss_mlp": 1.0225606, "epoch": 0.33536750338193294, "flos": 19679143080960.0, "grad_norm": 1.6990164250579898, "language_loss": 0.82839006, "learning_rate": 3.0989878272140883e-06, "loss": 0.84971148, "num_input_tokens_seen": 119805395, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.48632812, "step": 5578, "time_per_iteration": 2.382715940475464 }, { "auxiliary_loss_clip": 0.01065315, "auxiliary_loss_mlp": 0.01056542, "balance_loss_clip": 1.02230477, "balance_loss_mlp": 1.0210346, "epoch": 0.3354276266346009, "flos": 18331635179520.0, "grad_norm": 1.8691328426394411, "language_loss": 0.72553027, "learning_rate": 3.0986624120326676e-06, "loss": 0.74674881, "num_input_tokens_seen": 119823135, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.44335938, "step": 5579, "time_per_iteration": 2.393550157546997 }, { "auxiliary_loss_clip": 0.01070876, "auxiliary_loss_mlp": 0.01058872, "balance_loss_clip": 1.02184534, "balance_loss_mlp": 1.02159524, "epoch": 0.3354877498872689, "flos": 17857584472320.0, "grad_norm": 1.945729865758927, "language_loss": 0.82978308, "learning_rate": 3.0983369551898573e-06, "loss": 0.85108054, "num_input_tokens_seen": 119842265, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.4921875, "step": 5580, "time_per_iteration": 2.373728036880493 }, { "auxiliary_loss_clip": 0.01070101, "auxiliary_loss_mlp": 0.0106308, "balance_loss_clip": 1.02479017, "balance_loss_mlp": 1.02109623, "epoch": 0.3355478731399369, "flos": 24716586199680.0, "grad_norm": 1.50146117954155, "language_loss": 0.78759801, "learning_rate": 3.0980114566980003e-06, "loss": 0.8089298, "num_input_tokens_seen": 119862500, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.48828125, "step": 5581, "time_per_iteration": 2.4463448524475098 }, { "auxiliary_loss_clip": 0.01072286, "auxiliary_loss_mlp": 0.01068025, "balance_loss_clip": 1.02763629, "balance_loss_mlp": 1.02129281, "epoch": 0.33560799639260486, "flos": 16872928997760.0, "grad_norm": 2.316754631207495, "language_loss": 0.75798225, "learning_rate": 3.0976859165694384e-06, "loss": 0.77938539, "num_input_tokens_seen": 119880160, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 0.51171875, "step": 5582, "time_per_iteration": 2.3552191257476807 }, { "auxiliary_loss_clip": 0.01069807, "auxiliary_loss_mlp": 0.01068062, "balance_loss_clip": 1.0304637, "balance_loss_mlp": 1.02032948, "epoch": 0.3356681196452728, "flos": 18332507963520.0, "grad_norm": 1.6817101311945357, "language_loss": 0.84439367, "learning_rate": 3.0973603348165166e-06, "loss": 0.86577237, "num_input_tokens_seen": 119899040, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.49414062, "step": 5583, "time_per_iteration": 2.3987197875976562 }, { "auxiliary_loss_clip": 0.01066963, "auxiliary_loss_mlp": 0.01072437, "balance_loss_clip": 1.03806901, "balance_loss_mlp": 1.02014875, "epoch": 0.3357282428979408, "flos": 34749192913920.0, "grad_norm": 1.6573257988027668, "language_loss": 0.78526652, "learning_rate": 3.097034711451581e-06, "loss": 0.80666053, "num_input_tokens_seen": 119921120, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.46679688, "step": 5584, "time_per_iteration": 2.505084991455078 }, { "auxiliary_loss_clip": 0.0106953, "auxiliary_loss_mlp": 0.01059566, "balance_loss_clip": 1.02563953, "balance_loss_mlp": 1.02039647, "epoch": 0.33578836615060875, "flos": 21579011602560.0, "grad_norm": 1.564355042238551, "language_loss": 0.77799487, "learning_rate": 3.0967090464869795e-06, "loss": 0.79928583, "num_input_tokens_seen": 119940165, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.4921875, "step": 5585, "time_per_iteration": 2.404996395111084 }, { "auxiliary_loss_clip": 0.01064999, "auxiliary_loss_mlp": 0.01057631, "balance_loss_clip": 1.0217731, "balance_loss_mlp": 1.02014709, "epoch": 0.3358484894032767, "flos": 24529277422080.0, "grad_norm": 1.5912428176731959, "language_loss": 0.78817058, "learning_rate": 3.0963833399350608e-06, "loss": 0.80939686, "num_input_tokens_seen": 119959730, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.44921875, "step": 5586, "time_per_iteration": 2.408074140548706 }, { "auxiliary_loss_clip": 0.01075294, "auxiliary_loss_mlp": 0.01063938, "balance_loss_clip": 1.02447999, "balance_loss_mlp": 1.02501559, "epoch": 0.3359086126559447, "flos": 22454493655680.0, "grad_norm": 1.793191053025723, "language_loss": 0.82842308, "learning_rate": 3.0960575918081756e-06, "loss": 0.84981537, "num_input_tokens_seen": 119979315, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.50390625, "step": 5587, "time_per_iteration": 2.423187494277954 }, { "auxiliary_loss_clip": 0.01065544, "auxiliary_loss_mlp": 0.01044502, "balance_loss_clip": 1.01472366, "balance_loss_mlp": 1.02197158, "epoch": 0.33596873590861265, "flos": 16542790381440.0, "grad_norm": 1.8442974788881963, "language_loss": 0.69347405, "learning_rate": 3.095731802118677e-06, "loss": 0.71457446, "num_input_tokens_seen": 119996140, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4375, "step": 5588, "time_per_iteration": 2.354553461074829 }, { "auxiliary_loss_clip": 0.01069308, "auxiliary_loss_mlp": 0.01051025, "balance_loss_clip": 1.0170505, "balance_loss_mlp": 1.02263761, "epoch": 0.3360288591612806, "flos": 31174470783360.0, "grad_norm": 1.8814749055754945, "language_loss": 0.71817869, "learning_rate": 3.095405970878919e-06, "loss": 0.73938203, "num_input_tokens_seen": 120017720, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.46679688, "step": 5589, "time_per_iteration": 2.5201454162597656 }, { "auxiliary_loss_clip": 0.01069013, "auxiliary_loss_mlp": 0.01048457, "balance_loss_clip": 1.01462543, "balance_loss_mlp": 1.02132547, "epoch": 0.3360889824139486, "flos": 23695760689920.0, "grad_norm": 1.6048324799078835, "language_loss": 0.68367332, "learning_rate": 3.0950800981012567e-06, "loss": 0.70484805, "num_input_tokens_seen": 120036335, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.4765625, "step": 5590, "time_per_iteration": 2.4083657264709473 }, { "auxiliary_loss_clip": 0.0106836, "auxiliary_loss_mlp": 0.01057016, "balance_loss_clip": 1.02411413, "balance_loss_mlp": 1.02254176, "epoch": 0.33614910566661654, "flos": 19317093615360.0, "grad_norm": 1.9948221248738829, "language_loss": 0.76425999, "learning_rate": 3.094754183798047e-06, "loss": 0.78551376, "num_input_tokens_seen": 120056120, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.45703125, "step": 5591, "time_per_iteration": 2.4222261905670166 }, { "auxiliary_loss_clip": 0.01066895, "auxiliary_loss_mlp": 0.01048738, "balance_loss_clip": 1.01583624, "balance_loss_mlp": 1.02092147, "epoch": 0.3362092289192845, "flos": 16471323095040.0, "grad_norm": 2.137944518183399, "language_loss": 0.71465892, "learning_rate": 3.0944282279816493e-06, "loss": 0.73581517, "num_input_tokens_seen": 120073650, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.45898438, "step": 5592, "time_per_iteration": 3.776217460632324 }, { "auxiliary_loss_clip": 0.0106547, "auxiliary_loss_mlp": 0.01046484, "balance_loss_clip": 1.01634765, "balance_loss_mlp": 1.02133238, "epoch": 0.33626935217195253, "flos": 24242430758400.0, "grad_norm": 2.085771832003528, "language_loss": 0.78196812, "learning_rate": 3.094102230664423e-06, "loss": 0.80308765, "num_input_tokens_seen": 120093260, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.44140625, "step": 5593, "time_per_iteration": 2.452629804611206 }, { "auxiliary_loss_clip": 0.01068651, "auxiliary_loss_mlp": 0.01056807, "balance_loss_clip": 1.01911354, "balance_loss_mlp": 1.02012658, "epoch": 0.3363294754246205, "flos": 19717756911360.0, "grad_norm": 1.932849782080291, "language_loss": 0.7406249, "learning_rate": 3.093776191858731e-06, "loss": 0.76187944, "num_input_tokens_seen": 120111830, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.48632812, "step": 5594, "time_per_iteration": 2.367388963699341 }, { "auxiliary_loss_clip": 0.01070265, "auxiliary_loss_mlp": 0.01056695, "balance_loss_clip": 1.02314973, "balance_loss_mlp": 1.02195191, "epoch": 0.33638959867728846, "flos": 22595333546880.0, "grad_norm": 1.7973973089832094, "language_loss": 0.81084824, "learning_rate": 3.0934501115769363e-06, "loss": 0.8321178, "num_input_tokens_seen": 120130470, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.484375, "step": 5595, "time_per_iteration": 3.814136028289795 }, { "auxiliary_loss_clip": 0.01067146, "auxiliary_loss_mlp": 0.01049942, "balance_loss_clip": 1.0208075, "balance_loss_mlp": 1.02215672, "epoch": 0.3364497219299564, "flos": 20993727703680.0, "grad_norm": 1.5575724463033636, "language_loss": 0.82913584, "learning_rate": 3.0931239898314037e-06, "loss": 0.85030675, "num_input_tokens_seen": 120150735, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.45117188, "step": 5596, "time_per_iteration": 2.4350147247314453 }, { "auxiliary_loss_clip": 0.01067292, "auxiliary_loss_mlp": 0.01052038, "balance_loss_clip": 1.02178288, "balance_loss_mlp": 1.02142727, "epoch": 0.3365098451826244, "flos": 25227435346560.0, "grad_norm": 1.6620991452001994, "language_loss": 0.7710458, "learning_rate": 3.0927978266344995e-06, "loss": 0.79223913, "num_input_tokens_seen": 120173230, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.45703125, "step": 5597, "time_per_iteration": 3.790078639984131 }, { "auxiliary_loss_clip": 0.01063078, "auxiliary_loss_mlp": 0.01042993, "balance_loss_clip": 1.01506233, "balance_loss_mlp": 1.01916647, "epoch": 0.33656996843529235, "flos": 24570544515840.0, "grad_norm": 1.7724394713429623, "language_loss": 0.80304623, "learning_rate": 3.0924716219985916e-06, "loss": 0.82410693, "num_input_tokens_seen": 120191860, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.43945312, "step": 5598, "time_per_iteration": 2.442047357559204 }, { "auxiliary_loss_clip": 0.01070528, "auxiliary_loss_mlp": 0.01054346, "balance_loss_clip": 1.01906013, "balance_loss_mlp": 1.02067888, "epoch": 0.3366300916879603, "flos": 44089436989440.0, "grad_norm": 1.8238151171857462, "language_loss": 0.66568685, "learning_rate": 3.0921453759360514e-06, "loss": 0.68693566, "num_input_tokens_seen": 120219195, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.49609375, "step": 5599, "time_per_iteration": 4.015202283859253 }, { "auxiliary_loss_clip": 0.01071841, "auxiliary_loss_mlp": 0.01061639, "balance_loss_clip": 1.02220416, "balance_loss_mlp": 1.02139378, "epoch": 0.3366902149406283, "flos": 13879057023360.0, "grad_norm": 2.649585378524128, "language_loss": 0.83834624, "learning_rate": 3.091819088459249e-06, "loss": 0.85968101, "num_input_tokens_seen": 120232950, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.50390625, "step": 5600, "time_per_iteration": 2.3532330989837646 }, { "auxiliary_loss_clip": 0.01067335, "auxiliary_loss_mlp": 0.0106036, "balance_loss_clip": 1.02268982, "balance_loss_mlp": 1.01973832, "epoch": 0.33675033819329625, "flos": 16252173290880.0, "grad_norm": 2.2368321115076104, "language_loss": 0.85485125, "learning_rate": 3.0914927595805573e-06, "loss": 0.8761282, "num_input_tokens_seen": 120248865, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.4765625, "step": 5601, "time_per_iteration": 2.361980438232422 }, { "auxiliary_loss_clip": 0.01064328, "auxiliary_loss_mlp": 0.01041418, "balance_loss_clip": 1.01469076, "balance_loss_mlp": 1.02177763, "epoch": 0.3368104614459642, "flos": 17054861425920.0, "grad_norm": 1.5130828270610155, "language_loss": 0.84979594, "learning_rate": 3.0911663893123507e-06, "loss": 0.8708533, "num_input_tokens_seen": 120267820, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.42578125, "step": 5602, "time_per_iteration": 2.3786814212799072 }, { "auxiliary_loss_clip": 0.0106685, "auxiliary_loss_mlp": 0.01047838, "balance_loss_clip": 1.01656997, "balance_loss_mlp": 1.02177846, "epoch": 0.3368705846986322, "flos": 17857654295040.0, "grad_norm": 1.8340634698263256, "language_loss": 0.71117705, "learning_rate": 3.0908399776670048e-06, "loss": 0.73232388, "num_input_tokens_seen": 120286540, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.45117188, "step": 5603, "time_per_iteration": 2.359121322631836 }, { "auxiliary_loss_clip": 0.01068584, "auxiliary_loss_mlp": 0.01047422, "balance_loss_clip": 1.01734543, "balance_loss_mlp": 1.02123737, "epoch": 0.33693070795130015, "flos": 22928404717440.0, "grad_norm": 1.666676119656041, "language_loss": 0.84509242, "learning_rate": 3.090513524656898e-06, "loss": 0.86625254, "num_input_tokens_seen": 120307305, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.47265625, "step": 5604, "time_per_iteration": 2.4440155029296875 }, { "auxiliary_loss_clip": 0.0106661, "auxiliary_loss_mlp": 0.01045017, "balance_loss_clip": 1.01367652, "balance_loss_mlp": 1.02001548, "epoch": 0.3369908312039681, "flos": 22016368604160.0, "grad_norm": 1.4998229855372456, "language_loss": 0.74902987, "learning_rate": 3.090187030294409e-06, "loss": 0.77014619, "num_input_tokens_seen": 120327845, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.46679688, "step": 5605, "time_per_iteration": 2.385272741317749 }, { "auxiliary_loss_clip": 0.01068266, "auxiliary_loss_mlp": 0.01056058, "balance_loss_clip": 1.02242887, "balance_loss_mlp": 1.02030778, "epoch": 0.33705095445663613, "flos": 11801166145920.0, "grad_norm": 3.209363442309294, "language_loss": 0.86183006, "learning_rate": 3.089860494591919e-06, "loss": 0.88307327, "num_input_tokens_seen": 120343255, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.48046875, "step": 5606, "time_per_iteration": 2.375382423400879 }, { "auxiliary_loss_clip": 0.01064294, "auxiliary_loss_mlp": 0.0104647, "balance_loss_clip": 1.01633382, "balance_loss_mlp": 1.01952648, "epoch": 0.3371110777093041, "flos": 25045223627520.0, "grad_norm": 1.4629039632044774, "language_loss": 0.69309455, "learning_rate": 3.089533917561809e-06, "loss": 0.71420217, "num_input_tokens_seen": 120361745, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.44726562, "step": 5607, "time_per_iteration": 2.403574228286743 }, { "auxiliary_loss_clip": 0.01068252, "auxiliary_loss_mlp": 0.01042887, "balance_loss_clip": 1.01223803, "balance_loss_mlp": 1.02122915, "epoch": 0.33717120096197206, "flos": 26577805979520.0, "grad_norm": 1.7680212763639795, "language_loss": 0.72287518, "learning_rate": 3.089207299216464e-06, "loss": 0.74398655, "num_input_tokens_seen": 120380565, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.47070312, "step": 5608, "time_per_iteration": 2.4223086833953857 }, { "auxiliary_loss_clip": 0.01067747, "auxiliary_loss_mlp": 0.01046577, "balance_loss_clip": 1.01710856, "balance_loss_mlp": 1.02257061, "epoch": 0.33723132421464, "flos": 15157646167680.0, "grad_norm": 1.8903261776455096, "language_loss": 0.80904281, "learning_rate": 3.088880639568269e-06, "loss": 0.83018601, "num_input_tokens_seen": 120399235, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.45117188, "step": 5609, "time_per_iteration": 2.395296096801758 }, { "auxiliary_loss_clip": 0.01066684, "auxiliary_loss_mlp": 0.01051494, "balance_loss_clip": 1.02086902, "balance_loss_mlp": 1.02178288, "epoch": 0.337291447467308, "flos": 23435099412480.0, "grad_norm": 1.61559855405847, "language_loss": 0.83324283, "learning_rate": 3.0885539386296114e-06, "loss": 0.8544246, "num_input_tokens_seen": 120420095, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.44921875, "step": 5610, "time_per_iteration": 2.4401659965515137 }, { "auxiliary_loss_clip": 0.01064321, "auxiliary_loss_mlp": 0.01047878, "balance_loss_clip": 1.01770651, "balance_loss_mlp": 1.0214057, "epoch": 0.33735157071997596, "flos": 17237212790400.0, "grad_norm": 1.7756487139104977, "language_loss": 0.83714342, "learning_rate": 3.088227196412879e-06, "loss": 0.8582654, "num_input_tokens_seen": 120437690, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.42773438, "step": 5611, "time_per_iteration": 2.3652403354644775 }, { "auxiliary_loss_clip": 0.01067025, "auxiliary_loss_mlp": 0.01049316, "balance_loss_clip": 1.01826215, "balance_loss_mlp": 1.02191639, "epoch": 0.3374116939726439, "flos": 28256115813120.0, "grad_norm": 1.7052895778472068, "language_loss": 0.81135404, "learning_rate": 3.0879004129304626e-06, "loss": 0.83251745, "num_input_tokens_seen": 120459240, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.45117188, "step": 5612, "time_per_iteration": 2.4766154289245605 }, { "auxiliary_loss_clip": 0.01066627, "auxiliary_loss_mlp": 0.01050076, "balance_loss_clip": 1.0199635, "balance_loss_mlp": 1.02038646, "epoch": 0.3374718172253119, "flos": 35917910409600.0, "grad_norm": 2.4930242637816953, "language_loss": 0.71199965, "learning_rate": 3.087573588194753e-06, "loss": 0.73316664, "num_input_tokens_seen": 120481090, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.46289062, "step": 5613, "time_per_iteration": 2.4944190979003906 }, { "auxiliary_loss_clip": 0.01067754, "auxiliary_loss_mlp": 0.01048724, "balance_loss_clip": 1.01527417, "balance_loss_mlp": 1.02140534, "epoch": 0.33753194047797985, "flos": 18185698229760.0, "grad_norm": 1.7273037168813863, "language_loss": 0.8061527, "learning_rate": 3.087246722218144e-06, "loss": 0.82731748, "num_input_tokens_seen": 120500045, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.46484375, "step": 5614, "time_per_iteration": 2.395639181137085 }, { "auxiliary_loss_clip": 0.01068172, "auxiliary_loss_mlp": 0.01054816, "balance_loss_clip": 1.02055562, "balance_loss_mlp": 1.02133751, "epoch": 0.3375920637306478, "flos": 23147798901120.0, "grad_norm": 1.6763883247726639, "language_loss": 0.92505437, "learning_rate": 3.086919815013031e-06, "loss": 0.94628423, "num_input_tokens_seen": 120521125, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.46875, "step": 5615, "time_per_iteration": 2.4026145935058594 }, { "auxiliary_loss_clip": 0.01062714, "auxiliary_loss_mlp": 0.01051161, "balance_loss_clip": 1.0212512, "balance_loss_mlp": 1.01903152, "epoch": 0.3376521869833158, "flos": 23111105195520.0, "grad_norm": 1.5814366093274828, "language_loss": 0.82083392, "learning_rate": 3.086592866591809e-06, "loss": 0.84197259, "num_input_tokens_seen": 120539180, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.4375, "step": 5616, "time_per_iteration": 2.4318456649780273 }, { "auxiliary_loss_clip": 0.01069082, "auxiliary_loss_mlp": 0.01055931, "balance_loss_clip": 1.01897621, "balance_loss_mlp": 1.02055013, "epoch": 0.33771231023598375, "flos": 19273766751360.0, "grad_norm": 4.239232846691606, "language_loss": 0.84659141, "learning_rate": 3.0862658769668774e-06, "loss": 0.8678416, "num_input_tokens_seen": 120556280, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 0.484375, "step": 5617, "time_per_iteration": 2.3806350231170654 }, { "auxiliary_loss_clip": 0.01066782, "auxiliary_loss_mlp": 0.01056426, "balance_loss_clip": 1.02475262, "balance_loss_mlp": 1.02082992, "epoch": 0.3377724334886517, "flos": 18149213992320.0, "grad_norm": 2.5899141139806807, "language_loss": 0.81293839, "learning_rate": 3.0859388461506343e-06, "loss": 0.83417046, "num_input_tokens_seen": 120575395, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.4609375, "step": 5618, "time_per_iteration": 2.424600601196289 }, { "auxiliary_loss_clip": 0.01067773, "auxiliary_loss_mlp": 0.01050949, "balance_loss_clip": 1.01866674, "balance_loss_mlp": 1.0205704, "epoch": 0.3378325567413197, "flos": 25774803642240.0, "grad_norm": 2.130488424054777, "language_loss": 0.72706228, "learning_rate": 3.085611774155481e-06, "loss": 0.74824953, "num_input_tokens_seen": 120596075, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.47265625, "step": 5619, "time_per_iteration": 2.4351589679718018 }, { "auxiliary_loss_clip": 0.01065439, "auxiliary_loss_mlp": 0.01058509, "balance_loss_clip": 1.02433121, "balance_loss_mlp": 1.01956177, "epoch": 0.3378926799939877, "flos": 21316255643520.0, "grad_norm": 2.5600653154588264, "language_loss": 0.73385012, "learning_rate": 3.085284660993821e-06, "loss": 0.75508964, "num_input_tokens_seen": 120614195, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.45898438, "step": 5620, "time_per_iteration": 2.4205617904663086 }, { "auxiliary_loss_clip": 0.01068037, "auxiliary_loss_mlp": 0.01049534, "balance_loss_clip": 1.01751447, "balance_loss_mlp": 1.02213919, "epoch": 0.33795280324665566, "flos": 24898867741440.0, "grad_norm": 1.7986644355415815, "language_loss": 0.69128573, "learning_rate": 3.084957506678058e-06, "loss": 0.71246147, "num_input_tokens_seen": 120634475, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.45898438, "step": 5621, "time_per_iteration": 2.419731378555298 }, { "auxiliary_loss_clip": 0.01064212, "auxiliary_loss_mlp": 0.01048039, "balance_loss_clip": 1.01631784, "balance_loss_mlp": 1.01956904, "epoch": 0.33801292649932363, "flos": 24752791146240.0, "grad_norm": 1.6316057400667148, "language_loss": 0.83597463, "learning_rate": 3.0846303112205975e-06, "loss": 0.85709715, "num_input_tokens_seen": 120654980, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.4453125, "step": 5622, "time_per_iteration": 2.461059331893921 }, { "auxiliary_loss_clip": 0.01064603, "auxiliary_loss_mlp": 0.01048109, "balance_loss_clip": 1.01865292, "balance_loss_mlp": 1.01957572, "epoch": 0.3380730497519916, "flos": 26722765411200.0, "grad_norm": 1.439254976643121, "language_loss": 0.74579406, "learning_rate": 3.0843030746338464e-06, "loss": 0.76692116, "num_input_tokens_seen": 120676245, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.44921875, "step": 5623, "time_per_iteration": 2.418384552001953 }, { "auxiliary_loss_clip": 0.01012283, "auxiliary_loss_mlp": 0.0101313, "balance_loss_clip": 1.00974464, "balance_loss_mlp": 1.00300384, "epoch": 0.33813317300465956, "flos": 70032241560960.0, "grad_norm": 0.754273690806335, "language_loss": 0.55062795, "learning_rate": 3.083975796930215e-06, "loss": 0.57088208, "num_input_tokens_seen": 120741965, "router_z_loss_clip": 0.03393555, "router_z_loss_mlp": 0.09277344, "step": 5624, "time_per_iteration": 3.166423797607422 }, { "auxiliary_loss_clip": 0.01069638, "auxiliary_loss_mlp": 0.01054681, "balance_loss_clip": 1.02193403, "balance_loss_mlp": 1.02202594, "epoch": 0.3381932962573275, "flos": 24096179606400.0, "grad_norm": 2.195090536368631, "language_loss": 0.74645597, "learning_rate": 3.083648478122111e-06, "loss": 0.76769918, "num_input_tokens_seen": 120760410, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.4765625, "step": 5625, "time_per_iteration": 2.4176554679870605 }, { "auxiliary_loss_clip": 0.01068571, "auxiliary_loss_mlp": 0.01054101, "balance_loss_clip": 1.01752734, "balance_loss_mlp": 1.02091706, "epoch": 0.3382534195099955, "flos": 19277327710080.0, "grad_norm": 4.200518995108873, "language_loss": 0.72725612, "learning_rate": 3.0833211182219497e-06, "loss": 0.74848282, "num_input_tokens_seen": 120777705, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.4765625, "step": 5626, "time_per_iteration": 2.430955410003662 }, { "auxiliary_loss_clip": 0.01066147, "auxiliary_loss_mlp": 0.01043645, "balance_loss_clip": 1.01250744, "balance_loss_mlp": 1.02172852, "epoch": 0.33831354276266346, "flos": 25225131196800.0, "grad_norm": 1.4524140757776258, "language_loss": 0.81593943, "learning_rate": 3.0829937172421425e-06, "loss": 0.83703732, "num_input_tokens_seen": 120798660, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.4453125, "step": 5627, "time_per_iteration": 2.4581594467163086 }, { "auxiliary_loss_clip": 0.01073324, "auxiliary_loss_mlp": 0.01052834, "balance_loss_clip": 1.0170238, "balance_loss_mlp": 1.02345395, "epoch": 0.3383736660153314, "flos": 23110895727360.0, "grad_norm": 2.1426097959603077, "language_loss": 0.81893432, "learning_rate": 3.0826662751951055e-06, "loss": 0.84019589, "num_input_tokens_seen": 120816705, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.5, "step": 5628, "time_per_iteration": 2.4344191551208496 }, { "auxiliary_loss_clip": 0.01069328, "auxiliary_loss_mlp": 0.01053932, "balance_loss_clip": 1.01690578, "balance_loss_mlp": 1.02131104, "epoch": 0.3384337892679994, "flos": 23476017392640.0, "grad_norm": 2.0113061686234697, "language_loss": 0.80134434, "learning_rate": 3.082338792093254e-06, "loss": 0.82257688, "num_input_tokens_seen": 120835375, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.47851562, "step": 5629, "time_per_iteration": 2.4255802631378174 }, { "auxiliary_loss_clip": 0.01070134, "auxiliary_loss_mlp": 0.01060229, "balance_loss_clip": 1.02184343, "balance_loss_mlp": 1.02171016, "epoch": 0.33849391252066735, "flos": 19424835671040.0, "grad_norm": 2.207572320126079, "language_loss": 0.86414099, "learning_rate": 3.0820112679490074e-06, "loss": 0.88544464, "num_input_tokens_seen": 120854260, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.484375, "step": 5630, "time_per_iteration": 2.449812173843384 }, { "auxiliary_loss_clip": 0.01069622, "auxiliary_loss_mlp": 0.01053561, "balance_loss_clip": 1.02068329, "balance_loss_mlp": 1.02207804, "epoch": 0.3385540357733353, "flos": 21063903269760.0, "grad_norm": 2.3032729434987393, "language_loss": 0.73639035, "learning_rate": 3.0816837027747857e-06, "loss": 0.75762212, "num_input_tokens_seen": 120871590, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.4765625, "step": 5631, "time_per_iteration": 3.7972166538238525 }, { "auxiliary_loss_clip": 0.01017539, "auxiliary_loss_mlp": 0.01014062, "balance_loss_clip": 1.0106523, "balance_loss_mlp": 1.00779164, "epoch": 0.3386141590260033, "flos": 69205220208000.0, "grad_norm": 0.8673833218218914, "language_loss": 0.56141651, "learning_rate": 3.0813560965830084e-06, "loss": 0.58173251, "num_input_tokens_seen": 120925550, "router_z_loss_clip": 0.03417969, "router_z_loss_mlp": 0.09765625, "step": 5632, "time_per_iteration": 3.1092751026153564 }, { "auxiliary_loss_clip": 0.010678, "auxiliary_loss_mlp": 0.01051723, "balance_loss_clip": 1.01963222, "balance_loss_mlp": 1.02113211, "epoch": 0.3386742822786713, "flos": 25518331728000.0, "grad_norm": 1.5801291992127109, "language_loss": 0.8149032, "learning_rate": 3.0810284493861005e-06, "loss": 0.83609843, "num_input_tokens_seen": 120947620, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.46679688, "step": 5633, "time_per_iteration": 2.4536216259002686 }, { "auxiliary_loss_clip": 0.01069064, "auxiliary_loss_mlp": 0.01059529, "balance_loss_clip": 1.02511287, "balance_loss_mlp": 1.02189565, "epoch": 0.33873440553133927, "flos": 23621989253760.0, "grad_norm": 2.0767955283823034, "language_loss": 0.61774224, "learning_rate": 3.0807007611964855e-06, "loss": 0.63902819, "num_input_tokens_seen": 120965205, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.47265625, "step": 5634, "time_per_iteration": 2.397555112838745 }, { "auxiliary_loss_clip": 0.01066134, "auxiliary_loss_mlp": 0.01050971, "balance_loss_clip": 1.01930952, "balance_loss_mlp": 1.01997304, "epoch": 0.33879452878400723, "flos": 17088029084160.0, "grad_norm": 1.7449384144245945, "language_loss": 0.93973994, "learning_rate": 3.080373032026589e-06, "loss": 0.96091104, "num_input_tokens_seen": 120983560, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.4609375, "step": 5635, "time_per_iteration": 3.814988851547241 }, { "auxiliary_loss_clip": 0.01066924, "auxiliary_loss_mlp": 0.01050911, "balance_loss_clip": 1.02070332, "balance_loss_mlp": 1.02181721, "epoch": 0.3388546520366752, "flos": 15741149587200.0, "grad_norm": 2.5639829651667614, "language_loss": 0.76754224, "learning_rate": 3.0800452618888386e-06, "loss": 0.78872055, "num_input_tokens_seen": 121001400, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.45117188, "step": 5636, "time_per_iteration": 2.3698232173919678 }, { "auxiliary_loss_clip": 0.01065879, "auxiliary_loss_mlp": 0.0106257, "balance_loss_clip": 1.02764153, "balance_loss_mlp": 1.01992273, "epoch": 0.33891477528934316, "flos": 22417660304640.0, "grad_norm": 1.5083877657105533, "language_loss": 0.84587693, "learning_rate": 3.0797174507956637e-06, "loss": 0.86716139, "num_input_tokens_seen": 121021760, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.45898438, "step": 5637, "time_per_iteration": 3.897296905517578 }, { "auxiliary_loss_clip": 0.0106821, "auxiliary_loss_mlp": 0.01058224, "balance_loss_clip": 1.02377272, "balance_loss_mlp": 1.0213871, "epoch": 0.3389748985420111, "flos": 17273871584640.0, "grad_norm": 1.6994407656914603, "language_loss": 0.71319056, "learning_rate": 3.079389598759495e-06, "loss": 0.73445487, "num_input_tokens_seen": 121041070, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.46875, "step": 5638, "time_per_iteration": 3.8198401927948 }, { "auxiliary_loss_clip": 0.01067422, "auxiliary_loss_mlp": 0.01057321, "balance_loss_clip": 1.02601635, "balance_loss_mlp": 1.02179754, "epoch": 0.3390350217946791, "flos": 27743765477760.0, "grad_norm": 2.3277485940039213, "language_loss": 0.8196165, "learning_rate": 3.079061705792765e-06, "loss": 0.84086394, "num_input_tokens_seen": 121060890, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.45703125, "step": 5639, "time_per_iteration": 2.4499642848968506 }, { "auxiliary_loss_clip": 0.01069732, "auxiliary_loss_mlp": 0.01064092, "balance_loss_clip": 1.03095198, "balance_loss_mlp": 1.02131987, "epoch": 0.33909514504734706, "flos": 20338756997760.0, "grad_norm": 2.2520731126705176, "language_loss": 0.7039336, "learning_rate": 3.078733771907907e-06, "loss": 0.72527182, "num_input_tokens_seen": 121079135, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.484375, "step": 5640, "time_per_iteration": 2.421077251434326 }, { "auxiliary_loss_clip": 0.01069683, "auxiliary_loss_mlp": 0.01062253, "balance_loss_clip": 1.02736056, "balance_loss_mlp": 1.02330172, "epoch": 0.339155268300015, "flos": 14829148385280.0, "grad_norm": 1.8612771365746288, "language_loss": 0.71280992, "learning_rate": 3.0784057971173554e-06, "loss": 0.73412931, "num_input_tokens_seen": 121097685, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.46484375, "step": 5641, "time_per_iteration": 2.380199670791626 }, { "auxiliary_loss_clip": 0.01070265, "auxiliary_loss_mlp": 0.0106036, "balance_loss_clip": 1.02750587, "balance_loss_mlp": 1.02320087, "epoch": 0.339215391552683, "flos": 26066747364480.0, "grad_norm": 2.325122979416618, "language_loss": 0.89288127, "learning_rate": 3.0780777814335483e-06, "loss": 0.91418755, "num_input_tokens_seen": 121115640, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.47070312, "step": 5642, "time_per_iteration": 2.45731258392334 }, { "auxiliary_loss_clip": 0.01066293, "auxiliary_loss_mlp": 0.01042587, "balance_loss_clip": 1.0155865, "balance_loss_mlp": 1.02445638, "epoch": 0.33927551480535095, "flos": 14573828545920.0, "grad_norm": 1.8994795941211735, "language_loss": 0.85544008, "learning_rate": 3.077749724868924e-06, "loss": 0.87652886, "num_input_tokens_seen": 121132485, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.41796875, "step": 5643, "time_per_iteration": 2.367480516433716 }, { "auxiliary_loss_clip": 0.01073961, "auxiliary_loss_mlp": 0.01061869, "balance_loss_clip": 1.02705956, "balance_loss_mlp": 1.02678633, "epoch": 0.3393356380580189, "flos": 23804445352320.0, "grad_norm": 1.8958805459253072, "language_loss": 0.78257638, "learning_rate": 3.077421627435922e-06, "loss": 0.80393469, "num_input_tokens_seen": 121152935, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.47265625, "step": 5644, "time_per_iteration": 2.476710081100464 }, { "auxiliary_loss_clip": 0.01070941, "auxiliary_loss_mlp": 0.01055974, "balance_loss_clip": 1.02261889, "balance_loss_mlp": 1.0243212, "epoch": 0.3393957613106869, "flos": 17346909882240.0, "grad_norm": 2.8005990176777504, "language_loss": 0.65766782, "learning_rate": 3.0770934891469832e-06, "loss": 0.67893696, "num_input_tokens_seen": 121169835, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.46484375, "step": 5645, "time_per_iteration": 2.385627269744873 }, { "auxiliary_loss_clip": 0.01069287, "auxiliary_loss_mlp": 0.01051144, "balance_loss_clip": 1.0192678, "balance_loss_mlp": 1.02371526, "epoch": 0.3394558845633549, "flos": 28432846448640.0, "grad_norm": 1.8165372734160639, "language_loss": 0.78374553, "learning_rate": 3.076765310014552e-06, "loss": 0.80494988, "num_input_tokens_seen": 121190290, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.45507812, "step": 5646, "time_per_iteration": 2.5007402896881104 }, { "auxiliary_loss_clip": 0.01074071, "auxiliary_loss_mlp": 0.01058163, "balance_loss_clip": 1.02232838, "balance_loss_mlp": 1.02495623, "epoch": 0.33951600781602287, "flos": 22085950677120.0, "grad_norm": 2.2243244255691095, "language_loss": 0.80648029, "learning_rate": 3.0764370900510727e-06, "loss": 0.8278026, "num_input_tokens_seen": 121209060, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.4921875, "step": 5647, "time_per_iteration": 2.391749858856201 }, { "auxiliary_loss_clip": 0.01073609, "auxiliary_loss_mlp": 0.01048573, "balance_loss_clip": 1.01641011, "balance_loss_mlp": 1.02645135, "epoch": 0.33957613106869083, "flos": 23877134536320.0, "grad_norm": 1.865436779866164, "language_loss": 0.78433573, "learning_rate": 3.0761088292689904e-06, "loss": 0.80555761, "num_input_tokens_seen": 121227480, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.47265625, "step": 5648, "time_per_iteration": 2.447050094604492 }, { "auxiliary_loss_clip": 0.01022441, "auxiliary_loss_mlp": 0.01013518, "balance_loss_clip": 1.01027572, "balance_loss_mlp": 1.01255059, "epoch": 0.3396362543213588, "flos": 71239014305280.0, "grad_norm": 0.7829316428918759, "language_loss": 0.56472409, "learning_rate": 3.075780527680754e-06, "loss": 0.5850836, "num_input_tokens_seen": 121291305, "router_z_loss_clip": 0.0324707, "router_z_loss_mlp": 0.09863281, "step": 5649, "time_per_iteration": 3.081279993057251 }, { "auxiliary_loss_clip": 0.01071077, "auxiliary_loss_mlp": 0.01070466, "balance_loss_clip": 1.03726625, "balance_loss_mlp": 1.02389264, "epoch": 0.33969637757402676, "flos": 25920426389760.0, "grad_norm": 1.7252809558863162, "language_loss": 0.86639726, "learning_rate": 3.0754521852988117e-06, "loss": 0.88781273, "num_input_tokens_seen": 121312740, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.47265625, "step": 5650, "time_per_iteration": 2.4688503742218018 }, { "auxiliary_loss_clip": 0.01067434, "auxiliary_loss_mlp": 0.01047888, "balance_loss_clip": 1.01744235, "balance_loss_mlp": 1.02256405, "epoch": 0.33975650082669473, "flos": 35260286440320.0, "grad_norm": 2.2058824262148247, "language_loss": 0.72950113, "learning_rate": 3.0751238021356152e-06, "loss": 0.75065434, "num_input_tokens_seen": 121334220, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.44921875, "step": 5651, "time_per_iteration": 2.5045197010040283 }, { "auxiliary_loss_clip": 0.01070161, "auxiliary_loss_mlp": 0.01056861, "balance_loss_clip": 1.02460337, "balance_loss_mlp": 1.0234189, "epoch": 0.3398166240793627, "flos": 16646273251200.0, "grad_norm": 1.8469861320367245, "language_loss": 0.82596618, "learning_rate": 3.074795378203616e-06, "loss": 0.84723639, "num_input_tokens_seen": 121351870, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.46679688, "step": 5652, "time_per_iteration": 2.3887338638305664 }, { "auxiliary_loss_clip": 0.01072574, "auxiliary_loss_mlp": 0.01067266, "balance_loss_clip": 1.03121686, "balance_loss_mlp": 1.0240922, "epoch": 0.33987674733203066, "flos": 24061022000640.0, "grad_norm": 1.7994202073844106, "language_loss": 0.78551811, "learning_rate": 3.0744669135152685e-06, "loss": 0.80691648, "num_input_tokens_seen": 121373400, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.484375, "step": 5653, "time_per_iteration": 2.427478790283203 }, { "auxiliary_loss_clip": 0.01068922, "auxiliary_loss_mlp": 0.01057257, "balance_loss_clip": 1.0269779, "balance_loss_mlp": 1.02255452, "epoch": 0.3399368705846986, "flos": 13250132058240.0, "grad_norm": 2.6312825690648634, "language_loss": 0.88596958, "learning_rate": 3.0741384080830278e-06, "loss": 0.90723133, "num_input_tokens_seen": 121385225, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.46289062, "step": 5654, "time_per_iteration": 2.393951654434204 }, { "auxiliary_loss_clip": 0.01065915, "auxiliary_loss_mlp": 0.01065719, "balance_loss_clip": 1.03303134, "balance_loss_mlp": 1.01994693, "epoch": 0.3399969938373666, "flos": 27011706756480.0, "grad_norm": 2.1108059507937167, "language_loss": 0.66965902, "learning_rate": 3.073809861919351e-06, "loss": 0.69097543, "num_input_tokens_seen": 121404735, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.45898438, "step": 5655, "time_per_iteration": 2.417910099029541 }, { "auxiliary_loss_clip": 0.01067532, "auxiliary_loss_mlp": 0.01071993, "balance_loss_clip": 1.03913879, "balance_loss_mlp": 1.02195704, "epoch": 0.34005711709003456, "flos": 28548792673920.0, "grad_norm": 1.3816214381408023, "language_loss": 0.78117597, "learning_rate": 3.073481275036697e-06, "loss": 0.8025713, "num_input_tokens_seen": 121426780, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.45507812, "step": 5656, "time_per_iteration": 2.5124590396881104 }, { "auxiliary_loss_clip": 0.01070652, "auxiliary_loss_mlp": 0.01060815, "balance_loss_clip": 1.02520728, "balance_loss_mlp": 1.0212419, "epoch": 0.3401172403427025, "flos": 21615914776320.0, "grad_norm": 1.6427287200198972, "language_loss": 0.84516001, "learning_rate": 3.073152647447525e-06, "loss": 0.86647463, "num_input_tokens_seen": 121447245, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.49414062, "step": 5657, "time_per_iteration": 2.3979270458221436 }, { "auxiliary_loss_clip": 0.01068475, "auxiliary_loss_mlp": 0.01062294, "balance_loss_clip": 1.03056073, "balance_loss_mlp": 1.02290452, "epoch": 0.3401773635953705, "flos": 25884570556800.0, "grad_norm": 1.7059449772311954, "language_loss": 0.86982334, "learning_rate": 3.0728239791642976e-06, "loss": 0.89113104, "num_input_tokens_seen": 121468165, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.45703125, "step": 5658, "time_per_iteration": 2.45029354095459 }, { "auxiliary_loss_clip": 0.01016332, "auxiliary_loss_mlp": 0.01032621, "balance_loss_clip": 1.02861536, "balance_loss_mlp": 1.0067929, "epoch": 0.3402374868480385, "flos": 65504704982400.0, "grad_norm": 0.831706680058992, "language_loss": 0.60245812, "learning_rate": 3.072495270199477e-06, "loss": 0.62294769, "num_input_tokens_seen": 121523795, "router_z_loss_clip": 0.04003906, "router_z_loss_mlp": 0.09570312, "step": 5659, "time_per_iteration": 2.963164806365967 }, { "auxiliary_loss_clip": 0.01067021, "auxiliary_loss_mlp": 0.01051549, "balance_loss_clip": 1.02217627, "balance_loss_mlp": 1.02377653, "epoch": 0.34029761010070647, "flos": 24059450989440.0, "grad_norm": 1.7271249737596355, "language_loss": 0.69158816, "learning_rate": 3.0721665205655284e-06, "loss": 0.7127738, "num_input_tokens_seen": 121542950, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.43359375, "step": 5660, "time_per_iteration": 2.4333102703094482 }, { "auxiliary_loss_clip": 0.01071509, "auxiliary_loss_mlp": 0.01061874, "balance_loss_clip": 1.02806616, "balance_loss_mlp": 1.02509403, "epoch": 0.34035773335337444, "flos": 27598491843840.0, "grad_norm": 1.6713153413618167, "language_loss": 0.68250728, "learning_rate": 3.071837730274918e-06, "loss": 0.70384109, "num_input_tokens_seen": 121562765, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.46484375, "step": 5661, "time_per_iteration": 2.431236982345581 }, { "auxiliary_loss_clip": 0.0107096, "auxiliary_loss_mlp": 0.01045404, "balance_loss_clip": 1.01536345, "balance_loss_mlp": 1.02653146, "epoch": 0.3404178566060424, "flos": 20811760364160.0, "grad_norm": 1.8036056685809438, "language_loss": 0.80335987, "learning_rate": 3.071508899340113e-06, "loss": 0.82452357, "num_input_tokens_seen": 121581610, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.4453125, "step": 5662, "time_per_iteration": 2.4718222618103027 }, { "auxiliary_loss_clip": 0.01073863, "auxiliary_loss_mlp": 0.01059443, "balance_loss_clip": 1.02646971, "balance_loss_mlp": 1.02678442, "epoch": 0.34047797985871037, "flos": 26832357768960.0, "grad_norm": 2.1693058631748676, "language_loss": 0.75115919, "learning_rate": 3.0711800277735833e-06, "loss": 0.77249223, "num_input_tokens_seen": 121601885, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.47070312, "step": 5663, "time_per_iteration": 2.435824394226074 }, { "auxiliary_loss_clip": 0.01071343, "auxiliary_loss_mlp": 0.01049049, "balance_loss_clip": 1.02082026, "balance_loss_mlp": 1.02670598, "epoch": 0.34053810311137833, "flos": 19681621787520.0, "grad_norm": 1.6851841233484264, "language_loss": 0.87545007, "learning_rate": 3.0708511155877997e-06, "loss": 0.89665401, "num_input_tokens_seen": 121621335, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.4453125, "step": 5664, "time_per_iteration": 2.4426486492156982 }, { "auxiliary_loss_clip": 0.01076556, "auxiliary_loss_mlp": 0.01053028, "balance_loss_clip": 1.02112758, "balance_loss_mlp": 1.02924585, "epoch": 0.3405982263640463, "flos": 21724669261440.0, "grad_norm": 1.9516805675259707, "language_loss": 0.71003103, "learning_rate": 3.070522162795235e-06, "loss": 0.73132682, "num_input_tokens_seen": 121641310, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.47265625, "step": 5665, "time_per_iteration": 2.4141886234283447 }, { "auxiliary_loss_clip": 0.01074553, "auxiliary_loss_mlp": 0.01053789, "balance_loss_clip": 1.01819277, "balance_loss_mlp": 1.02785206, "epoch": 0.34065834961671426, "flos": 18040634064000.0, "grad_norm": 2.2830985657942633, "language_loss": 0.74162149, "learning_rate": 3.0701931694083626e-06, "loss": 0.76290488, "num_input_tokens_seen": 121659625, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.46875, "step": 5666, "time_per_iteration": 2.3984804153442383 }, { "auxiliary_loss_clip": 0.01079469, "auxiliary_loss_mlp": 0.01060069, "balance_loss_clip": 1.02723908, "balance_loss_mlp": 1.03018582, "epoch": 0.3407184728693822, "flos": 21396276213120.0, "grad_norm": 1.4622782622882156, "language_loss": 0.74294615, "learning_rate": 3.0698641354396576e-06, "loss": 0.76434153, "num_input_tokens_seen": 121679205, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.4921875, "step": 5667, "time_per_iteration": 2.4133548736572266 }, { "auxiliary_loss_clip": 0.01032171, "auxiliary_loss_mlp": 0.01031692, "balance_loss_clip": 1.02830684, "balance_loss_mlp": 1.02114749, "epoch": 0.3407785961220502, "flos": 68684559102720.0, "grad_norm": 0.8570098718066301, "language_loss": 0.63295496, "learning_rate": 3.069535060901597e-06, "loss": 0.65359354, "num_input_tokens_seen": 121751085, "router_z_loss_clip": 0.03393555, "router_z_loss_mlp": 0.11035156, "step": 5668, "time_per_iteration": 3.2349441051483154 }, { "auxiliary_loss_clip": 0.01075616, "auxiliary_loss_mlp": 0.010616, "balance_loss_clip": 1.02857888, "balance_loss_mlp": 1.02784371, "epoch": 0.34083871937471816, "flos": 14063503069440.0, "grad_norm": 2.4082240318282517, "language_loss": 0.74911129, "learning_rate": 3.0692059458066596e-06, "loss": 0.77048349, "num_input_tokens_seen": 121768565, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.4765625, "step": 5669, "time_per_iteration": 2.4131181240081787 }, { "auxiliary_loss_clip": 0.01074809, "auxiliary_loss_mlp": 0.0106156, "balance_loss_clip": 1.02746582, "balance_loss_mlp": 1.02590704, "epoch": 0.3408988426273861, "flos": 17084677593600.0, "grad_norm": 3.1800991339803932, "language_loss": 0.81805015, "learning_rate": 3.0688767901673265e-06, "loss": 0.83941382, "num_input_tokens_seen": 121784925, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.49023438, "step": 5670, "time_per_iteration": 2.411785364151001 }, { "auxiliary_loss_clip": 0.0107399, "auxiliary_loss_mlp": 0.0106258, "balance_loss_clip": 1.02998769, "balance_loss_mlp": 1.02552009, "epoch": 0.3409589658800541, "flos": 24023420599680.0, "grad_norm": 1.6774302659132672, "language_loss": 0.78817052, "learning_rate": 3.068547593996078e-06, "loss": 0.80953622, "num_input_tokens_seen": 121804425, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.484375, "step": 5671, "time_per_iteration": 3.8471426963806152 }, { "auxiliary_loss_clip": 0.01070905, "auxiliary_loss_mlp": 0.01053152, "balance_loss_clip": 1.01878381, "balance_loss_mlp": 1.02437437, "epoch": 0.34101908913272205, "flos": 21140956373760.0, "grad_norm": 1.916541119153878, "language_loss": 0.76110888, "learning_rate": 3.0682183573053974e-06, "loss": 0.78234941, "num_input_tokens_seen": 121825145, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.46484375, "step": 5672, "time_per_iteration": 2.4507548809051514 }, { "auxiliary_loss_clip": 0.01073286, "auxiliary_loss_mlp": 0.01048609, "balance_loss_clip": 1.01587367, "balance_loss_mlp": 1.02528203, "epoch": 0.3410792123853901, "flos": 15701209125120.0, "grad_norm": 2.2857001510481694, "language_loss": 0.7534306, "learning_rate": 3.06788908010777e-06, "loss": 0.77464956, "num_input_tokens_seen": 121842185, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.48046875, "step": 5673, "time_per_iteration": 2.3886961936950684 }, { "auxiliary_loss_clip": 0.0107036, "auxiliary_loss_mlp": 0.01055648, "balance_loss_clip": 1.022174, "balance_loss_mlp": 1.02424264, "epoch": 0.34113933563805804, "flos": 23034994698240.0, "grad_norm": 2.011370358154867, "language_loss": 0.80454427, "learning_rate": 3.067559762415682e-06, "loss": 0.82580435, "num_input_tokens_seen": 121862260, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.4609375, "step": 5674, "time_per_iteration": 3.8167154788970947 }, { "auxiliary_loss_clip": 0.01017504, "auxiliary_loss_mlp": 0.01006454, "balance_loss_clip": 1.00297332, "balance_loss_mlp": 1.00820589, "epoch": 0.341199458890726, "flos": 69611294764800.0, "grad_norm": 0.797325223601645, "language_loss": 0.56101996, "learning_rate": 3.0672304042416198e-06, "loss": 0.58125955, "num_input_tokens_seen": 121923560, "router_z_loss_clip": 0.03491211, "router_z_loss_mlp": 0.09326172, "step": 5675, "time_per_iteration": 3.194430351257324 }, { "auxiliary_loss_clip": 0.01068318, "auxiliary_loss_mlp": 0.01059604, "balance_loss_clip": 1.02777493, "balance_loss_mlp": 1.0226016, "epoch": 0.34125958214339397, "flos": 22345250411520.0, "grad_norm": 5.524830132421128, "language_loss": 0.80056775, "learning_rate": 3.0669010055980734e-06, "loss": 0.82184696, "num_input_tokens_seen": 121943515, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.45703125, "step": 5676, "time_per_iteration": 2.405494213104248 }, { "auxiliary_loss_clip": 0.01068177, "auxiliary_loss_mlp": 0.0104833, "balance_loss_clip": 1.01733565, "balance_loss_mlp": 1.02274084, "epoch": 0.34131970539606193, "flos": 21870850590720.0, "grad_norm": 1.7354259748701053, "language_loss": 0.87189418, "learning_rate": 3.0665715664975357e-06, "loss": 0.89305925, "num_input_tokens_seen": 121962540, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.453125, "step": 5677, "time_per_iteration": 3.8039214611053467 }, { "auxiliary_loss_clip": 0.01069292, "auxiliary_loss_mlp": 0.0104861, "balance_loss_clip": 1.01561296, "balance_loss_mlp": 1.02379203, "epoch": 0.3413798286487299, "flos": 24934583928960.0, "grad_norm": 1.8113056525966331, "language_loss": 0.81562585, "learning_rate": 3.0662420869524966e-06, "loss": 0.83680487, "num_input_tokens_seen": 121979830, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.45507812, "step": 5678, "time_per_iteration": 3.8676421642303467 }, { "auxiliary_loss_clip": 0.01070979, "auxiliary_loss_mlp": 0.01049337, "balance_loss_clip": 1.01827061, "balance_loss_mlp": 1.02445126, "epoch": 0.34143995190139786, "flos": 25373197739520.0, "grad_norm": 1.7756478912855396, "language_loss": 0.76115233, "learning_rate": 3.0659125669754506e-06, "loss": 0.78235555, "num_input_tokens_seen": 121999055, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.46484375, "step": 5679, "time_per_iteration": 2.4393506050109863 }, { "auxiliary_loss_clip": 0.0102782, "auxiliary_loss_mlp": 0.01012724, "balance_loss_clip": 1.00957739, "balance_loss_mlp": 1.01853764, "epoch": 0.34150007515406583, "flos": 67778876856960.0, "grad_norm": 0.729788562271646, "language_loss": 0.59580165, "learning_rate": 3.0655830065788923e-06, "loss": 0.61620712, "num_input_tokens_seen": 122067015, "router_z_loss_clip": 0.03149414, "router_z_loss_mlp": 0.09277344, "step": 5680, "time_per_iteration": 3.1339969635009766 }, { "auxiliary_loss_clip": 0.01071407, "auxiliary_loss_mlp": 0.01041712, "balance_loss_clip": 1.01093256, "balance_loss_mlp": 1.02642381, "epoch": 0.3415601984067338, "flos": 20301399976320.0, "grad_norm": 1.9160044382698322, "language_loss": 0.73369193, "learning_rate": 3.0652534057753206e-06, "loss": 0.75482309, "num_input_tokens_seen": 122085295, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.44921875, "step": 5681, "time_per_iteration": 2.4488778114318848 }, { "auxiliary_loss_clip": 0.01072563, "auxiliary_loss_mlp": 0.0105102, "balance_loss_clip": 1.01869011, "balance_loss_mlp": 1.02797818, "epoch": 0.34162032165940176, "flos": 26029983836160.0, "grad_norm": 2.76735905707216, "language_loss": 0.73649359, "learning_rate": 3.064923764577233e-06, "loss": 0.75772941, "num_input_tokens_seen": 122104020, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.4453125, "step": 5682, "time_per_iteration": 2.470813512802124 }, { "auxiliary_loss_clip": 0.01073476, "auxiliary_loss_mlp": 0.01052725, "balance_loss_clip": 1.01941812, "balance_loss_mlp": 1.02635622, "epoch": 0.3416804449120697, "flos": 28802087654400.0, "grad_norm": 2.9247902728938744, "language_loss": 0.85380483, "learning_rate": 3.0645940829971295e-06, "loss": 0.87506682, "num_input_tokens_seen": 122125080, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.47265625, "step": 5683, "time_per_iteration": 2.473719596862793 }, { "auxiliary_loss_clip": 0.01077857, "auxiliary_loss_mlp": 0.01051109, "balance_loss_clip": 1.01639509, "balance_loss_mlp": 1.02911544, "epoch": 0.3417405681647377, "flos": 22600500428160.0, "grad_norm": 1.563380151474871, "language_loss": 0.72129476, "learning_rate": 3.0642643610475116e-06, "loss": 0.74258447, "num_input_tokens_seen": 122146350, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.48828125, "step": 5684, "time_per_iteration": 2.436532497406006 }, { "auxiliary_loss_clip": 0.01073216, "auxiliary_loss_mlp": 0.01042415, "balance_loss_clip": 1.01394773, "balance_loss_mlp": 1.02911651, "epoch": 0.34180069141740566, "flos": 24715119922560.0, "grad_norm": 1.7446194262066692, "language_loss": 0.76059675, "learning_rate": 3.0639345987408823e-06, "loss": 0.78175306, "num_input_tokens_seen": 122168085, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.44140625, "step": 5685, "time_per_iteration": 2.541807174682617 }, { "auxiliary_loss_clip": 0.01075378, "auxiliary_loss_mlp": 0.01047884, "balance_loss_clip": 1.01920199, "balance_loss_mlp": 1.03128076, "epoch": 0.3418608146700737, "flos": 30517440307200.0, "grad_norm": 1.8372544284945331, "language_loss": 0.72438145, "learning_rate": 3.0636047960897468e-06, "loss": 0.74561405, "num_input_tokens_seen": 122191040, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.44140625, "step": 5686, "time_per_iteration": 2.5034778118133545 }, { "auxiliary_loss_clip": 0.01075006, "auxiliary_loss_mlp": 0.01062817, "balance_loss_clip": 1.02859199, "balance_loss_mlp": 1.02733314, "epoch": 0.34192093792274164, "flos": 15121441221120.0, "grad_norm": 3.9435679580528413, "language_loss": 0.79914653, "learning_rate": 3.06327495310661e-06, "loss": 0.82052475, "num_input_tokens_seen": 122209225, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.4765625, "step": 5687, "time_per_iteration": 2.423860788345337 }, { "auxiliary_loss_clip": 0.01075471, "auxiliary_loss_mlp": 0.01054631, "balance_loss_clip": 1.0225873, "balance_loss_mlp": 1.03092527, "epoch": 0.3419810611754096, "flos": 13186973675520.0, "grad_norm": 2.081531358766658, "language_loss": 0.88487405, "learning_rate": 3.062945069803981e-06, "loss": 0.90617508, "num_input_tokens_seen": 122226160, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.4453125, "step": 5688, "time_per_iteration": 2.380075216293335 }, { "auxiliary_loss_clip": 0.01079016, "auxiliary_loss_mlp": 0.01053839, "balance_loss_clip": 1.01876736, "balance_loss_mlp": 1.02955294, "epoch": 0.34204118442807757, "flos": 19535265901440.0, "grad_norm": 1.9700002392103135, "language_loss": 0.81622171, "learning_rate": 3.0626151461943684e-06, "loss": 0.83755022, "num_input_tokens_seen": 122243115, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.49414062, "step": 5689, "time_per_iteration": 2.4419405460357666 }, { "auxiliary_loss_clip": 0.01073218, "auxiliary_loss_mlp": 0.01058868, "balance_loss_clip": 1.02320051, "balance_loss_mlp": 1.02693951, "epoch": 0.34210130768074554, "flos": 15193955848320.0, "grad_norm": 1.753873517267955, "language_loss": 0.74820369, "learning_rate": 3.0622851822902834e-06, "loss": 0.76952451, "num_input_tokens_seen": 122261105, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.46289062, "step": 5690, "time_per_iteration": 2.387691020965576 }, { "auxiliary_loss_clip": 0.01072095, "auxiliary_loss_mlp": 0.0104857, "balance_loss_clip": 1.01788509, "balance_loss_mlp": 1.02629852, "epoch": 0.3421614309334135, "flos": 24935072688000.0, "grad_norm": 1.882885931429003, "language_loss": 0.77677071, "learning_rate": 3.061955178104237e-06, "loss": 0.79797745, "num_input_tokens_seen": 122279995, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.45703125, "step": 5691, "time_per_iteration": 2.424591302871704 }, { "auxiliary_loss_clip": 0.01070661, "auxiliary_loss_mlp": 0.010558, "balance_loss_clip": 1.02628386, "balance_loss_mlp": 1.02651513, "epoch": 0.34222155418608147, "flos": 21907544296320.0, "grad_norm": 1.50400519121594, "language_loss": 0.70109326, "learning_rate": 3.0616251336487447e-06, "loss": 0.72235787, "num_input_tokens_seen": 122299070, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.44140625, "step": 5692, "time_per_iteration": 2.430199146270752 }, { "auxiliary_loss_clip": 0.01072907, "auxiliary_loss_mlp": 0.01064409, "balance_loss_clip": 1.03203154, "balance_loss_mlp": 1.02596974, "epoch": 0.34228167743874943, "flos": 18113078868480.0, "grad_norm": 2.023127776983481, "language_loss": 0.74997973, "learning_rate": 3.06129504893632e-06, "loss": 0.77135289, "num_input_tokens_seen": 122316800, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.46875, "step": 5693, "time_per_iteration": 2.37575626373291 }, { "auxiliary_loss_clip": 0.01067926, "auxiliary_loss_mlp": 0.01056191, "balance_loss_clip": 1.02774811, "balance_loss_mlp": 1.02339828, "epoch": 0.3423418006914174, "flos": 21287521728000.0, "grad_norm": 1.801020675033579, "language_loss": 0.77356243, "learning_rate": 3.0609649239794813e-06, "loss": 0.79480368, "num_input_tokens_seen": 122335275, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4453125, "step": 5694, "time_per_iteration": 2.444194793701172 }, { "auxiliary_loss_clip": 0.01067155, "auxiliary_loss_mlp": 0.01049272, "balance_loss_clip": 1.02138901, "balance_loss_mlp": 1.02357984, "epoch": 0.34240192394408536, "flos": 19822601324160.0, "grad_norm": 1.617828695510483, "language_loss": 0.80668491, "learning_rate": 3.060634758790747e-06, "loss": 0.82784915, "num_input_tokens_seen": 122353215, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.43554688, "step": 5695, "time_per_iteration": 2.3903141021728516 }, { "auxiliary_loss_clip": 0.01069399, "auxiliary_loss_mlp": 0.01061295, "balance_loss_clip": 1.02919149, "balance_loss_mlp": 1.02337098, "epoch": 0.3424620471967533, "flos": 24534374480640.0, "grad_norm": 1.9407192578553727, "language_loss": 0.75209624, "learning_rate": 3.060304553382635e-06, "loss": 0.77340317, "num_input_tokens_seen": 122372495, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.4609375, "step": 5696, "time_per_iteration": 2.452543020248413 }, { "auxiliary_loss_clip": 0.01068536, "auxiliary_loss_mlp": 0.01054603, "balance_loss_clip": 1.02388322, "balance_loss_mlp": 1.0227567, "epoch": 0.3425221704494213, "flos": 25847702294400.0, "grad_norm": 1.6339911130147682, "language_loss": 0.72258162, "learning_rate": 3.0599743077676685e-06, "loss": 0.74381304, "num_input_tokens_seen": 122394600, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.45703125, "step": 5697, "time_per_iteration": 2.4322593212127686 }, { "auxiliary_loss_clip": 0.0106746, "auxiliary_loss_mlp": 0.01050245, "balance_loss_clip": 1.02113461, "balance_loss_mlp": 1.02403378, "epoch": 0.34258229370208926, "flos": 21539524988160.0, "grad_norm": 2.095406030491964, "language_loss": 0.83368838, "learning_rate": 3.05964402195837e-06, "loss": 0.85486543, "num_input_tokens_seen": 122414700, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.43359375, "step": 5698, "time_per_iteration": 2.4209043979644775 }, { "auxiliary_loss_clip": 0.01068482, "auxiliary_loss_mlp": 0.01054536, "balance_loss_clip": 1.02263522, "balance_loss_mlp": 1.02297163, "epoch": 0.3426424169547573, "flos": 23651840332800.0, "grad_norm": 2.844284802878053, "language_loss": 0.70506597, "learning_rate": 3.0593136959672645e-06, "loss": 0.72629613, "num_input_tokens_seen": 122432760, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.45507812, "step": 5699, "time_per_iteration": 2.458503246307373 }, { "auxiliary_loss_clip": 0.01070266, "auxiliary_loss_mlp": 0.01050252, "balance_loss_clip": 1.02052093, "balance_loss_mlp": 1.02520561, "epoch": 0.34270254020742524, "flos": 24643722458880.0, "grad_norm": 3.2437080643096254, "language_loss": 0.73953211, "learning_rate": 3.058983329806877e-06, "loss": 0.7607373, "num_input_tokens_seen": 122449105, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.44921875, "step": 5700, "time_per_iteration": 2.516770839691162 }, { "auxiliary_loss_clip": 0.0107061, "auxiliary_loss_mlp": 0.01052983, "balance_loss_clip": 1.02261996, "balance_loss_mlp": 1.02603209, "epoch": 0.3427626634600932, "flos": 20995682739840.0, "grad_norm": 2.6073895956425246, "language_loss": 0.8302384, "learning_rate": 3.0586529234897354e-06, "loss": 0.85147434, "num_input_tokens_seen": 122468700, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.4453125, "step": 5701, "time_per_iteration": 2.3982882499694824 }, { "auxiliary_loss_clip": 0.01074766, "auxiliary_loss_mlp": 0.01047813, "balance_loss_clip": 1.01812983, "balance_loss_mlp": 1.02850592, "epoch": 0.3428227867127612, "flos": 21432725539200.0, "grad_norm": 1.7224087955386915, "language_loss": 0.73097426, "learning_rate": 3.0583224770283694e-06, "loss": 0.75220001, "num_input_tokens_seen": 122488160, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.46289062, "step": 5702, "time_per_iteration": 2.4410970211029053 }, { "auxiliary_loss_clip": 0.01023833, "auxiliary_loss_mlp": 0.01036867, "balance_loss_clip": 1.03360093, "balance_loss_mlp": 1.01420283, "epoch": 0.34288290996542914, "flos": 55728709827840.0, "grad_norm": 0.7937176343867128, "language_loss": 0.57630271, "learning_rate": 3.057991990435309e-06, "loss": 0.5969097, "num_input_tokens_seen": 122542890, "router_z_loss_clip": 0.03271484, "router_z_loss_mlp": 0.09667969, "step": 5703, "time_per_iteration": 2.9813315868377686 }, { "auxiliary_loss_clip": 0.01076218, "auxiliary_loss_mlp": 0.01051576, "balance_loss_clip": 1.02009332, "balance_loss_mlp": 1.02995694, "epoch": 0.3429430332180971, "flos": 20155777228800.0, "grad_norm": 1.8007491647591978, "language_loss": 0.76080173, "learning_rate": 3.057661463723086e-06, "loss": 0.7820797, "num_input_tokens_seen": 122561770, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.46289062, "step": 5704, "time_per_iteration": 2.427797555923462 }, { "auxiliary_loss_clip": 0.01076174, "auxiliary_loss_mlp": 0.01044375, "balance_loss_clip": 1.01559758, "balance_loss_mlp": 1.03126025, "epoch": 0.34300315647076507, "flos": 17964942503040.0, "grad_norm": 1.698680485094175, "language_loss": 0.74181914, "learning_rate": 3.0573308969042346e-06, "loss": 0.76302463, "num_input_tokens_seen": 122580580, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.44921875, "step": 5705, "time_per_iteration": 2.416372060775757 }, { "auxiliary_loss_clip": 0.01077133, "auxiliary_loss_mlp": 0.01044367, "balance_loss_clip": 1.01320565, "balance_loss_mlp": 1.03102636, "epoch": 0.34306327972343303, "flos": 22085845943040.0, "grad_norm": 1.7612681891900592, "language_loss": 0.80691528, "learning_rate": 3.057000289991289e-06, "loss": 0.82813025, "num_input_tokens_seen": 122599810, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.4609375, "step": 5706, "time_per_iteration": 2.4324662685394287 }, { "auxiliary_loss_clip": 0.01080749, "auxiliary_loss_mlp": 0.01054268, "balance_loss_clip": 1.01943517, "balance_loss_mlp": 1.0324111, "epoch": 0.343123402976101, "flos": 18441681384960.0, "grad_norm": 2.9293442679527617, "language_loss": 0.84360284, "learning_rate": 3.056669642996787e-06, "loss": 0.86495298, "num_input_tokens_seen": 122616035, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.484375, "step": 5707, "time_per_iteration": 2.390139579772949 }, { "auxiliary_loss_clip": 0.01079273, "auxiliary_loss_mlp": 0.0105328, "balance_loss_clip": 1.02307272, "balance_loss_mlp": 1.03323352, "epoch": 0.34318352622876896, "flos": 17162778038400.0, "grad_norm": 1.5513188976544074, "language_loss": 0.76151133, "learning_rate": 3.056338955933266e-06, "loss": 0.78283679, "num_input_tokens_seen": 122633785, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.4609375, "step": 5708, "time_per_iteration": 2.4183433055877686 }, { "auxiliary_loss_clip": 0.01074602, "auxiliary_loss_mlp": 0.01059314, "balance_loss_clip": 1.02996433, "balance_loss_mlp": 1.02974343, "epoch": 0.34324364948143693, "flos": 26686944489600.0, "grad_norm": 1.6775795919288319, "language_loss": 0.82250297, "learning_rate": 3.0560082288132662e-06, "loss": 0.84384203, "num_input_tokens_seen": 122652100, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.44921875, "step": 5709, "time_per_iteration": 2.4513614177703857 }, { "auxiliary_loss_clip": 0.01078512, "auxiliary_loss_mlp": 0.0106648, "balance_loss_clip": 1.03086019, "balance_loss_mlp": 1.03223443, "epoch": 0.3433037727341049, "flos": 21250513820160.0, "grad_norm": 3.2113395377003258, "language_loss": 0.79954803, "learning_rate": 3.055677461649329e-06, "loss": 0.82099795, "num_input_tokens_seen": 122669720, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.4609375, "step": 5710, "time_per_iteration": 2.477811813354492 }, { "auxiliary_loss_clip": 0.01079013, "auxiliary_loss_mlp": 0.01065758, "balance_loss_clip": 1.0324986, "balance_loss_mlp": 1.03029358, "epoch": 0.34336389598677286, "flos": 20628431481600.0, "grad_norm": 1.9849187756445519, "language_loss": 0.72390556, "learning_rate": 3.055346654453996e-06, "loss": 0.74535328, "num_input_tokens_seen": 122688715, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.48828125, "step": 5711, "time_per_iteration": 3.8559608459472656 }, { "auxiliary_loss_clip": 0.01074369, "auxiliary_loss_mlp": 0.010637, "balance_loss_clip": 1.03275323, "balance_loss_mlp": 1.02920246, "epoch": 0.3434240192394409, "flos": 14537693422080.0, "grad_norm": 1.6276711220056086, "language_loss": 0.67746156, "learning_rate": 3.055015807239812e-06, "loss": 0.69884229, "num_input_tokens_seen": 122706970, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.45117188, "step": 5712, "time_per_iteration": 2.378652811050415 }, { "auxiliary_loss_clip": 0.01029077, "auxiliary_loss_mlp": 0.01010886, "balance_loss_clip": 1.00784576, "balance_loss_mlp": 1.01965177, "epoch": 0.34348414249210885, "flos": 58048828784640.0, "grad_norm": 0.8559329663354557, "language_loss": 0.5811196, "learning_rate": 3.0546849200193226e-06, "loss": 0.60151923, "num_input_tokens_seen": 122758095, "router_z_loss_clip": 0.03039551, "router_z_loss_mlp": 0.09472656, "step": 5713, "time_per_iteration": 4.43656325340271 }, { "auxiliary_loss_clip": 0.01072563, "auxiliary_loss_mlp": 0.01065244, "balance_loss_clip": 1.03471446, "balance_loss_mlp": 1.02657461, "epoch": 0.3435442657447768, "flos": 20703389904000.0, "grad_norm": 2.5304577626493634, "language_loss": 0.82612526, "learning_rate": 3.054353992805076e-06, "loss": 0.8475033, "num_input_tokens_seen": 122777815, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.4609375, "step": 5714, "time_per_iteration": 2.4125444889068604 }, { "auxiliary_loss_clip": 0.01070811, "auxiliary_loss_mlp": 0.01058082, "balance_loss_clip": 1.02603829, "balance_loss_mlp": 1.0245235, "epoch": 0.3436043889974448, "flos": 22929137856000.0, "grad_norm": 2.0735953514594745, "language_loss": 0.73683095, "learning_rate": 3.05402302560962e-06, "loss": 0.75811982, "num_input_tokens_seen": 122797555, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.4609375, "step": 5715, "time_per_iteration": 2.4785869121551514 }, { "auxiliary_loss_clip": 0.0102073, "auxiliary_loss_mlp": 0.01006763, "balance_loss_clip": 1.00402153, "balance_loss_mlp": 1.01126695, "epoch": 0.34366451225011274, "flos": 58399914216960.0, "grad_norm": 0.922837362982081, "language_loss": 0.66045725, "learning_rate": 3.053692018445505e-06, "loss": 0.68073219, "num_input_tokens_seen": 122863955, "router_z_loss_clip": 0.02746582, "router_z_loss_mlp": 0.09472656, "step": 5716, "time_per_iteration": 4.523667335510254 }, { "auxiliary_loss_clip": 0.01069183, "auxiliary_loss_mlp": 0.01055385, "balance_loss_clip": 1.02504623, "balance_loss_mlp": 1.02408946, "epoch": 0.3437246355027807, "flos": 15595387194240.0, "grad_norm": 2.0030742370389962, "language_loss": 0.7576679, "learning_rate": 3.0533609713252838e-06, "loss": 0.7789135, "num_input_tokens_seen": 122883000, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.45117188, "step": 5717, "time_per_iteration": 3.808115243911743 }, { "auxiliary_loss_clip": 0.01069097, "auxiliary_loss_mlp": 0.01057396, "balance_loss_clip": 1.02582955, "balance_loss_mlp": 1.0228976, "epoch": 0.34378475875544867, "flos": 27671041382400.0, "grad_norm": 1.8088632674885232, "language_loss": 0.76923823, "learning_rate": 3.0530298842615077e-06, "loss": 0.79050314, "num_input_tokens_seen": 122903265, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.46289062, "step": 5718, "time_per_iteration": 2.453612804412842 }, { "auxiliary_loss_clip": 0.01069587, "auxiliary_loss_mlp": 0.01061462, "balance_loss_clip": 1.02813113, "balance_loss_mlp": 1.0231185, "epoch": 0.34384488200811664, "flos": 31430139736320.0, "grad_norm": 9.302290551672355, "language_loss": 0.65685511, "learning_rate": 3.052698757266734e-06, "loss": 0.67816561, "num_input_tokens_seen": 122923860, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.46484375, "step": 5719, "time_per_iteration": 2.5195231437683105 }, { "auxiliary_loss_clip": 0.01072763, "auxiliary_loss_mlp": 0.01054272, "balance_loss_clip": 1.01974893, "balance_loss_mlp": 1.02417529, "epoch": 0.3439050052607846, "flos": 24898763007360.0, "grad_norm": 1.7430904047433307, "language_loss": 0.7511543, "learning_rate": 3.0523675903535183e-06, "loss": 0.7724247, "num_input_tokens_seen": 122945305, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.484375, "step": 5720, "time_per_iteration": 2.4391825199127197 }, { "auxiliary_loss_clip": 0.01070441, "auxiliary_loss_mlp": 0.01057806, "balance_loss_clip": 1.02404559, "balance_loss_mlp": 1.02413976, "epoch": 0.34396512851345257, "flos": 18149109258240.0, "grad_norm": 1.6832609888496959, "language_loss": 0.74779928, "learning_rate": 3.0520363835344173e-06, "loss": 0.76908171, "num_input_tokens_seen": 122962535, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.46289062, "step": 5721, "time_per_iteration": 2.4479684829711914 }, { "auxiliary_loss_clip": 0.01075736, "auxiliary_loss_mlp": 0.01067164, "balance_loss_clip": 1.03287947, "balance_loss_mlp": 1.02701187, "epoch": 0.34402525176612053, "flos": 16033512245760.0, "grad_norm": 3.0327519998443653, "language_loss": 0.81278598, "learning_rate": 3.051705136821992e-06, "loss": 0.83421493, "num_input_tokens_seen": 122979750, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.48632812, "step": 5722, "time_per_iteration": 2.3867363929748535 }, { "auxiliary_loss_clip": 0.0107583, "auxiliary_loss_mlp": 0.01049477, "balance_loss_clip": 1.01893604, "balance_loss_mlp": 1.03023899, "epoch": 0.3440853750187885, "flos": 21177580256640.0, "grad_norm": 1.6366154329359717, "language_loss": 0.82716632, "learning_rate": 3.051373850228801e-06, "loss": 0.84841943, "num_input_tokens_seen": 122998955, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.45507812, "step": 5723, "time_per_iteration": 2.5139505863189697 }, { "auxiliary_loss_clip": 0.01078332, "auxiliary_loss_mlp": 0.01055039, "balance_loss_clip": 1.02290034, "balance_loss_mlp": 1.03017139, "epoch": 0.34414549827145646, "flos": 12677032224000.0, "grad_norm": 2.083228577428286, "language_loss": 0.82787812, "learning_rate": 3.0510425237674096e-06, "loss": 0.84921181, "num_input_tokens_seen": 123016165, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.48046875, "step": 5724, "time_per_iteration": 2.381873369216919 }, { "auxiliary_loss_clip": 0.01078119, "auxiliary_loss_mlp": 0.01057606, "balance_loss_clip": 1.02310681, "balance_loss_mlp": 1.03059959, "epoch": 0.3442056215241244, "flos": 31283190357120.0, "grad_norm": 1.7440029003371, "language_loss": 0.71366781, "learning_rate": 3.05071115745038e-06, "loss": 0.73502505, "num_input_tokens_seen": 123036900, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.4765625, "step": 5725, "time_per_iteration": 2.4994454383850098 }, { "auxiliary_loss_clip": 0.01082876, "auxiliary_loss_mlp": 0.01058896, "balance_loss_clip": 1.0211072, "balance_loss_mlp": 1.03221977, "epoch": 0.34426574477679245, "flos": 23366180655360.0, "grad_norm": 1.460834838701929, "language_loss": 0.712883, "learning_rate": 3.0503797512902773e-06, "loss": 0.73430073, "num_input_tokens_seen": 123057480, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.5078125, "step": 5726, "time_per_iteration": 2.431016206741333 }, { "auxiliary_loss_clip": 0.01076561, "auxiliary_loss_mlp": 0.01048587, "balance_loss_clip": 1.01756871, "balance_loss_mlp": 1.03028727, "epoch": 0.3443258680294604, "flos": 24534269746560.0, "grad_norm": 2.2547706066433606, "language_loss": 0.74597371, "learning_rate": 3.0500483052996703e-06, "loss": 0.76722515, "num_input_tokens_seen": 123076890, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.46289062, "step": 5727, "time_per_iteration": 2.532609224319458 }, { "auxiliary_loss_clip": 0.01078869, "auxiliary_loss_mlp": 0.0105453, "balance_loss_clip": 1.02084136, "balance_loss_mlp": 1.03089511, "epoch": 0.3443859912821284, "flos": 20229094817280.0, "grad_norm": 2.1010649462035835, "language_loss": 0.90179914, "learning_rate": 3.0497168194911257e-06, "loss": 0.92313313, "num_input_tokens_seen": 123092530, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.48046875, "step": 5728, "time_per_iteration": 2.411522150039673 }, { "auxiliary_loss_clip": 0.0107991, "auxiliary_loss_mlp": 0.01050189, "balance_loss_clip": 1.01654816, "balance_loss_mlp": 1.03278947, "epoch": 0.34444611453479634, "flos": 24315364321920.0, "grad_norm": 2.0049914189739235, "language_loss": 0.71876788, "learning_rate": 3.0493852938772143e-06, "loss": 0.74006891, "num_input_tokens_seen": 123110560, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.47070312, "step": 5729, "time_per_iteration": 2.4435479640960693 }, { "auxiliary_loss_clip": 0.01078138, "auxiliary_loss_mlp": 0.01050385, "balance_loss_clip": 1.01702988, "balance_loss_mlp": 1.03160262, "epoch": 0.3445062377874643, "flos": 16982451532800.0, "grad_norm": 2.6200547134870136, "language_loss": 0.75906301, "learning_rate": 3.0490537284705078e-06, "loss": 0.7803483, "num_input_tokens_seen": 123128655, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.46679688, "step": 5730, "time_per_iteration": 2.3883657455444336 }, { "auxiliary_loss_clip": 0.01079128, "auxiliary_loss_mlp": 0.01052007, "balance_loss_clip": 1.01591098, "balance_loss_mlp": 1.03010893, "epoch": 0.3445663610401323, "flos": 20301679267200.0, "grad_norm": 2.3128803750875604, "language_loss": 0.81545889, "learning_rate": 3.048722123283578e-06, "loss": 0.83677024, "num_input_tokens_seen": 123145130, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.49023438, "step": 5731, "time_per_iteration": 2.390544891357422 }, { "auxiliary_loss_clip": 0.01078223, "auxiliary_loss_mlp": 0.01058427, "balance_loss_clip": 1.02266455, "balance_loss_mlp": 1.03126907, "epoch": 0.34462648429280024, "flos": 15887191271040.0, "grad_norm": 1.9655230609446368, "language_loss": 0.79704016, "learning_rate": 3.0483904783290006e-06, "loss": 0.8184067, "num_input_tokens_seen": 123162265, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.46875, "step": 5732, "time_per_iteration": 2.4063196182250977 }, { "auxiliary_loss_clip": 0.01052602, "auxiliary_loss_mlp": 0.01036573, "balance_loss_clip": 1.03360462, "balance_loss_mlp": 1.04219079, "epoch": 0.3446866075454682, "flos": 59307760967040.0, "grad_norm": 0.7667424997685337, "language_loss": 0.53699392, "learning_rate": 3.0480587936193505e-06, "loss": 0.55788565, "num_input_tokens_seen": 123218620, "router_z_loss_clip": 0.02966309, "router_z_loss_mlp": 0.10400391, "step": 5733, "time_per_iteration": 3.0551936626434326 }, { "auxiliary_loss_clip": 0.01079617, "auxiliary_loss_mlp": 0.01056818, "balance_loss_clip": 1.02081633, "balance_loss_mlp": 1.0309391, "epoch": 0.34474673079813617, "flos": 22342771704960.0, "grad_norm": 1.6793597951256611, "language_loss": 0.84750199, "learning_rate": 3.047727069167207e-06, "loss": 0.86886632, "num_input_tokens_seen": 123237325, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.48632812, "step": 5734, "time_per_iteration": 2.4198031425476074 }, { "auxiliary_loss_clip": 0.01076383, "auxiliary_loss_mlp": 0.01054733, "balance_loss_clip": 1.01913655, "balance_loss_mlp": 1.02739084, "epoch": 0.34480685405080413, "flos": 27668981612160.0, "grad_norm": 2.3465816006428497, "language_loss": 0.94312125, "learning_rate": 3.0473953049851478e-06, "loss": 0.9644323, "num_input_tokens_seen": 123258650, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.48828125, "step": 5735, "time_per_iteration": 2.4868457317352295 }, { "auxiliary_loss_clip": 0.01077561, "auxiliary_loss_mlp": 0.01068621, "balance_loss_clip": 1.03238153, "balance_loss_mlp": 1.02946627, "epoch": 0.3448669773034721, "flos": 22454912592000.0, "grad_norm": 1.7959881318236572, "language_loss": 0.78277695, "learning_rate": 3.0470635010857533e-06, "loss": 0.8042388, "num_input_tokens_seen": 123277155, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.48046875, "step": 5736, "time_per_iteration": 2.4215283393859863 }, { "auxiliary_loss_clip": 0.01075911, "auxiliary_loss_mlp": 0.01062152, "balance_loss_clip": 1.025769, "balance_loss_mlp": 1.02862477, "epoch": 0.34492710055614006, "flos": 24935037776640.0, "grad_norm": 1.5975385550673242, "language_loss": 0.80321562, "learning_rate": 3.0467316574816064e-06, "loss": 0.82459617, "num_input_tokens_seen": 123297640, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.47265625, "step": 5737, "time_per_iteration": 2.468062162399292 }, { "auxiliary_loss_clip": 0.0107668, "auxiliary_loss_mlp": 0.0107292, "balance_loss_clip": 1.0340817, "balance_loss_mlp": 1.02607155, "epoch": 0.34498722380880803, "flos": 20119781750400.0, "grad_norm": 2.1946822040083647, "language_loss": 0.73147941, "learning_rate": 3.0463997741852893e-06, "loss": 0.75297546, "num_input_tokens_seen": 123314370, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.50390625, "step": 5738, "time_per_iteration": 2.397984027862549 }, { "auxiliary_loss_clip": 0.01074202, "auxiliary_loss_mlp": 0.01078899, "balance_loss_clip": 1.03927386, "balance_loss_mlp": 1.02428329, "epoch": 0.34504734706147605, "flos": 28436896166400.0, "grad_norm": 2.5218704687822426, "language_loss": 0.83132577, "learning_rate": 3.046067851209389e-06, "loss": 0.85285676, "num_input_tokens_seen": 123336085, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 0.5, "step": 5739, "time_per_iteration": 2.485677480697632 }, { "auxiliary_loss_clip": 0.01074047, "auxiliary_loss_mlp": 0.01076587, "balance_loss_clip": 1.04060912, "balance_loss_mlp": 1.02599335, "epoch": 0.345107470314144, "flos": 22673364168960.0, "grad_norm": 1.9571167893742825, "language_loss": 0.83833277, "learning_rate": 3.0457358885664898e-06, "loss": 0.85983908, "num_input_tokens_seen": 123354460, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.48046875, "step": 5740, "time_per_iteration": 2.443359375 }, { "auxiliary_loss_clip": 0.01072804, "auxiliary_loss_mlp": 0.01075323, "balance_loss_clip": 1.0393455, "balance_loss_mlp": 1.02458811, "epoch": 0.345167593566812, "flos": 20629688290560.0, "grad_norm": 2.308397724449736, "language_loss": 0.78307104, "learning_rate": 3.045403886269181e-06, "loss": 0.80455232, "num_input_tokens_seen": 123373420, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.48242188, "step": 5741, "time_per_iteration": 2.394364356994629 }, { "auxiliary_loss_clip": 0.01073642, "auxiliary_loss_mlp": 0.01079198, "balance_loss_clip": 1.04425764, "balance_loss_mlp": 1.02366412, "epoch": 0.34522771681947995, "flos": 26213138161920.0, "grad_norm": 1.772978624274225, "language_loss": 0.77958041, "learning_rate": 3.045071844330053e-06, "loss": 0.80110878, "num_input_tokens_seen": 123394730, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.5, "step": 5742, "time_per_iteration": 2.5062801837921143 }, { "auxiliary_loss_clip": 0.01072817, "auxiliary_loss_mlp": 0.01086373, "balance_loss_clip": 1.05163503, "balance_loss_mlp": 1.02536476, "epoch": 0.3452878400721479, "flos": 19061354839680.0, "grad_norm": 2.305358306195952, "language_loss": 0.78119999, "learning_rate": 3.0447397627616955e-06, "loss": 0.80279189, "num_input_tokens_seen": 123412895, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.47460938, "step": 5743, "time_per_iteration": 2.4018890857696533 }, { "auxiliary_loss_clip": 0.01075258, "auxiliary_loss_mlp": 0.01085101, "balance_loss_clip": 1.05062604, "balance_loss_mlp": 1.0273031, "epoch": 0.3453479633248159, "flos": 27928455903360.0, "grad_norm": 1.6742534608801989, "language_loss": 0.709185, "learning_rate": 3.0444076415767016e-06, "loss": 0.73078859, "num_input_tokens_seen": 123432320, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.47851562, "step": 5744, "time_per_iteration": 2.5147030353546143 }, { "auxiliary_loss_clip": 0.0107611, "auxiliary_loss_mlp": 0.01081701, "balance_loss_clip": 1.04698765, "balance_loss_mlp": 1.02934241, "epoch": 0.34540808657748384, "flos": 19605197088000.0, "grad_norm": 1.9752462198108303, "language_loss": 0.81346893, "learning_rate": 3.044075480787665e-06, "loss": 0.83504701, "num_input_tokens_seen": 123450980, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.46875, "step": 5745, "time_per_iteration": 2.421937942504883 }, { "auxiliary_loss_clip": 0.0108057, "auxiliary_loss_mlp": 0.01073966, "balance_loss_clip": 1.03472209, "balance_loss_mlp": 1.03019392, "epoch": 0.3454682098301518, "flos": 20410643220480.0, "grad_norm": 2.439711401400575, "language_loss": 0.90261161, "learning_rate": 3.043743280407182e-06, "loss": 0.92415696, "num_input_tokens_seen": 123469365, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.50390625, "step": 5746, "time_per_iteration": 2.4659440517425537 }, { "auxiliary_loss_clip": 0.01079751, "auxiliary_loss_mlp": 0.01069853, "balance_loss_clip": 1.03280246, "balance_loss_mlp": 1.03012323, "epoch": 0.34552833308281977, "flos": 21324040876800.0, "grad_norm": 1.8753451880101872, "language_loss": 0.66763258, "learning_rate": 3.043411040447849e-06, "loss": 0.68912858, "num_input_tokens_seen": 123489425, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.49609375, "step": 5747, "time_per_iteration": 2.4393560886383057 }, { "auxiliary_loss_clip": 0.01077987, "auxiliary_loss_mlp": 0.01056311, "balance_loss_clip": 1.02369571, "balance_loss_mlp": 1.03147793, "epoch": 0.34558845633548774, "flos": 36242253740160.0, "grad_norm": 1.7358477885173567, "language_loss": 0.74648219, "learning_rate": 3.043078760922264e-06, "loss": 0.76782519, "num_input_tokens_seen": 123509970, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.46484375, "step": 5748, "time_per_iteration": 2.5488250255584717 }, { "auxiliary_loss_clip": 0.01078609, "auxiliary_loss_mlp": 0.01047378, "balance_loss_clip": 1.01590705, "balance_loss_mlp": 1.03302026, "epoch": 0.3456485795881557, "flos": 22449606065280.0, "grad_norm": 3.126749824787166, "language_loss": 0.7760011, "learning_rate": 3.042746441843029e-06, "loss": 0.797261, "num_input_tokens_seen": 123531055, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.45703125, "step": 5749, "time_per_iteration": 2.4612433910369873 }, { "auxiliary_loss_clip": 0.01043318, "auxiliary_loss_mlp": 0.01058894, "balance_loss_clip": 1.05603337, "balance_loss_mlp": 1.03312016, "epoch": 0.34570870284082367, "flos": 62001135936000.0, "grad_norm": 0.9103711331807868, "language_loss": 0.62736475, "learning_rate": 3.0424140832227437e-06, "loss": 0.64838684, "num_input_tokens_seen": 123584720, "router_z_loss_clip": 0.02856445, "router_z_loss_mlp": 0.10205078, "step": 5750, "time_per_iteration": 2.9870095252990723 }, { "auxiliary_loss_clip": 0.0108357, "auxiliary_loss_mlp": 0.01045768, "balance_loss_clip": 1.01477313, "balance_loss_mlp": 1.03786445, "epoch": 0.34576882609349163, "flos": 22781141136000.0, "grad_norm": 1.5930095724477433, "language_loss": 0.81987906, "learning_rate": 3.042081685074012e-06, "loss": 0.84117246, "num_input_tokens_seen": 123604465, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.45703125, "step": 5751, "time_per_iteration": 3.8663382530212402 }, { "auxiliary_loss_clip": 0.01088125, "auxiliary_loss_mlp": 0.01060104, "balance_loss_clip": 1.02684498, "balance_loss_mlp": 1.04108405, "epoch": 0.34582894934615965, "flos": 12348010771200.0, "grad_norm": 2.111542051075509, "language_loss": 0.85897046, "learning_rate": 3.041749247409439e-06, "loss": 0.88045275, "num_input_tokens_seen": 123622320, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.46875, "step": 5752, "time_per_iteration": 2.413193702697754 }, { "auxiliary_loss_clip": 0.01057248, "auxiliary_loss_mlp": 0.0100654, "balance_loss_clip": 1.00348806, "balance_loss_mlp": 1.04603124, "epoch": 0.3458890725988276, "flos": 70164563080320.0, "grad_norm": 0.7438335388390868, "language_loss": 0.63166505, "learning_rate": 3.0414167702416296e-06, "loss": 0.65230292, "num_input_tokens_seen": 123678010, "router_z_loss_clip": 0.03051758, "router_z_loss_mlp": 0.11230469, "step": 5753, "time_per_iteration": 4.324312925338745 }, { "auxiliary_loss_clip": 0.0109198, "auxiliary_loss_mlp": 0.01068534, "balance_loss_clip": 1.03410602, "balance_loss_mlp": 1.04394913, "epoch": 0.3459491958514956, "flos": 17091624954240.0, "grad_norm": 1.8442619947680616, "language_loss": 0.72211492, "learning_rate": 3.0410842535831914e-06, "loss": 0.74372005, "num_input_tokens_seen": 123696830, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.48046875, "step": 5754, "time_per_iteration": 2.4320058822631836 }, { "auxiliary_loss_clip": 0.01090691, "auxiliary_loss_mlp": 0.01073566, "balance_loss_clip": 1.03870964, "balance_loss_mlp": 1.04110157, "epoch": 0.34600931910416355, "flos": 16650113500800.0, "grad_norm": 1.6592739180145857, "language_loss": 0.74655104, "learning_rate": 3.0407516974467343e-06, "loss": 0.7681936, "num_input_tokens_seen": 123714360, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.49609375, "step": 5755, "time_per_iteration": 2.4063165187835693 }, { "auxiliary_loss_clip": 0.01083943, "auxiliary_loss_mlp": 0.01076311, "balance_loss_clip": 1.04414856, "balance_loss_mlp": 1.03727865, "epoch": 0.3460694423568315, "flos": 38544635859840.0, "grad_norm": 1.471286216084281, "language_loss": 0.73397279, "learning_rate": 3.040419101844869e-06, "loss": 0.7555753, "num_input_tokens_seen": 123739250, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.46679688, "step": 5756, "time_per_iteration": 4.073963403701782 }, { "auxiliary_loss_clip": 0.01049918, "auxiliary_loss_mlp": 0.01049141, "balance_loss_clip": 1.04630411, "balance_loss_mlp": 1.03916097, "epoch": 0.3461295656094995, "flos": 72077837564160.0, "grad_norm": 0.7599659908252645, "language_loss": 0.62765598, "learning_rate": 3.040086466790207e-06, "loss": 0.64864659, "num_input_tokens_seen": 123802845, "router_z_loss_clip": 0.02832031, "router_z_loss_mlp": 0.10742188, "step": 5757, "time_per_iteration": 4.416916847229004 }, { "auxiliary_loss_clip": 0.01044301, "auxiliary_loss_mlp": 0.01036619, "balance_loss_clip": 1.03380609, "balance_loss_mlp": 1.03401721, "epoch": 0.34618968886216744, "flos": 65457118932480.0, "grad_norm": 0.8976355954556915, "language_loss": 0.59244919, "learning_rate": 3.039753792295362e-06, "loss": 0.61325836, "num_input_tokens_seen": 123861805, "router_z_loss_clip": 0.02807617, "router_z_loss_mlp": 0.10253906, "step": 5758, "time_per_iteration": 3.0120491981506348 }, { "auxiliary_loss_clip": 0.01074336, "auxiliary_loss_mlp": 0.010656, "balance_loss_clip": 1.03417647, "balance_loss_mlp": 1.02886248, "epoch": 0.3462498121148354, "flos": 23471548738560.0, "grad_norm": 1.6856442066214543, "language_loss": 0.72538048, "learning_rate": 3.0394210783729487e-06, "loss": 0.74677992, "num_input_tokens_seen": 123881820, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.45507812, "step": 5759, "time_per_iteration": 2.4543159008026123 }, { "auxiliary_loss_clip": 0.01074283, "auxiliary_loss_mlp": 0.01059362, "balance_loss_clip": 1.02696061, "balance_loss_mlp": 1.02851164, "epoch": 0.3463099353675034, "flos": 24169636840320.0, "grad_norm": 2.119169691937801, "language_loss": 0.84741378, "learning_rate": 3.0390883250355836e-06, "loss": 0.86875021, "num_input_tokens_seen": 123903700, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.45703125, "step": 5760, "time_per_iteration": 2.444492816925049 }, { "auxiliary_loss_clip": 0.01022241, "auxiliary_loss_mlp": 0.01003536, "balance_loss_clip": 1.00027001, "balance_loss_mlp": 1.01296687, "epoch": 0.34637005862017134, "flos": 63697915745280.0, "grad_norm": 0.8444779811170003, "language_loss": 0.5655418, "learning_rate": 3.0387555322958865e-06, "loss": 0.58579957, "num_input_tokens_seen": 123960075, "router_z_loss_clip": 0.03271484, "router_z_loss_mlp": 0.09277344, "step": 5761, "time_per_iteration": 3.095231294631958 }, { "auxiliary_loss_clip": 0.01070759, "auxiliary_loss_mlp": 0.01052832, "balance_loss_clip": 1.02118158, "balance_loss_mlp": 1.02599728, "epoch": 0.3464301818728393, "flos": 13144868709120.0, "grad_norm": 2.311630279129401, "language_loss": 0.96298701, "learning_rate": 3.038422700166474e-06, "loss": 0.98422289, "num_input_tokens_seen": 123975805, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.44726562, "step": 5762, "time_per_iteration": 2.400195837020874 }, { "auxiliary_loss_clip": 0.01075261, "auxiliary_loss_mlp": 0.01058748, "balance_loss_clip": 1.02639437, "balance_loss_mlp": 1.02718556, "epoch": 0.34649030512550727, "flos": 29313879408000.0, "grad_norm": 1.580712593362088, "language_loss": 0.70794535, "learning_rate": 3.0380898286599692e-06, "loss": 0.72928542, "num_input_tokens_seen": 123997530, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.48046875, "step": 5763, "time_per_iteration": 2.517936944961548 }, { "auxiliary_loss_clip": 0.01079221, "auxiliary_loss_mlp": 0.01051539, "balance_loss_clip": 1.01737392, "balance_loss_mlp": 1.03104484, "epoch": 0.34655042837817523, "flos": 23729801132160.0, "grad_norm": 1.6192029529189438, "language_loss": 0.85072184, "learning_rate": 3.0377569177889945e-06, "loss": 0.87202948, "num_input_tokens_seen": 124016375, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.48046875, "step": 5764, "time_per_iteration": 2.428978443145752 }, { "auxiliary_loss_clip": 0.01081456, "auxiliary_loss_mlp": 0.01049992, "balance_loss_clip": 1.01918805, "balance_loss_mlp": 1.03518891, "epoch": 0.34661055163084326, "flos": 22053132132480.0, "grad_norm": 2.348373604582191, "language_loss": 0.70069182, "learning_rate": 3.0374239675661722e-06, "loss": 0.72200632, "num_input_tokens_seen": 124033975, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.46289062, "step": 5765, "time_per_iteration": 2.4405624866485596 }, { "auxiliary_loss_clip": 0.01084855, "auxiliary_loss_mlp": 0.0104835, "balance_loss_clip": 1.01869082, "balance_loss_mlp": 1.03973961, "epoch": 0.3466706748835112, "flos": 21798126495360.0, "grad_norm": 1.8973113223679967, "language_loss": 0.78063786, "learning_rate": 3.03709097800413e-06, "loss": 0.80196989, "num_input_tokens_seen": 124051930, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.44921875, "step": 5766, "time_per_iteration": 2.435934066772461 }, { "auxiliary_loss_clip": 0.01090271, "auxiliary_loss_mlp": 0.01044599, "balance_loss_clip": 1.01679921, "balance_loss_mlp": 1.04554296, "epoch": 0.3467307981361792, "flos": 19460726415360.0, "grad_norm": 1.4759685346577198, "language_loss": 0.74633801, "learning_rate": 3.0367579491154943e-06, "loss": 0.76768667, "num_input_tokens_seen": 124071220, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.44726562, "step": 5767, "time_per_iteration": 2.4651293754577637 }, { "auxiliary_loss_clip": 0.01092903, "auxiliary_loss_mlp": 0.01052664, "balance_loss_clip": 1.02138352, "balance_loss_mlp": 1.04622996, "epoch": 0.34679092138884715, "flos": 24826283291520.0, "grad_norm": 1.901868111213925, "language_loss": 0.79603958, "learning_rate": 3.036424880912893e-06, "loss": 0.81749523, "num_input_tokens_seen": 124090140, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.46679688, "step": 5768, "time_per_iteration": 2.4725663661956787 }, { "auxiliary_loss_clip": 0.01110797, "auxiliary_loss_mlp": 0.01026365, "balance_loss_clip": 1.02305067, "balance_loss_mlp": 1.09988856, "epoch": 0.3468510446415151, "flos": 63233116548480.0, "grad_norm": 0.790209355758782, "language_loss": 0.57625508, "learning_rate": 3.036091773408956e-06, "loss": 0.59762669, "num_input_tokens_seen": 124152025, "router_z_loss_clip": 0.03320312, "router_z_loss_mlp": 0.109375, "step": 5769, "time_per_iteration": 3.080239772796631 }, { "auxiliary_loss_clip": 0.01101225, "auxiliary_loss_mlp": 0.01052133, "balance_loss_clip": 1.01708555, "balance_loss_mlp": 1.04926467, "epoch": 0.3469111678941831, "flos": 12120168038400.0, "grad_norm": 3.5835534670198808, "language_loss": 0.88215768, "learning_rate": 3.0357586266163154e-06, "loss": 0.90369129, "num_input_tokens_seen": 124165795, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.51953125, "step": 5770, "time_per_iteration": 2.401235342025757 }, { "auxiliary_loss_clip": 0.01120012, "auxiliary_loss_mlp": 0.01008597, "balance_loss_clip": 1.00502121, "balance_loss_mlp": 1.10899377, "epoch": 0.34697129114685105, "flos": 65931134728320.0, "grad_norm": 0.7877526445140526, "language_loss": 0.59856343, "learning_rate": 3.0354254405476036e-06, "loss": 0.61984956, "num_input_tokens_seen": 124222925, "router_z_loss_clip": 0.03564453, "router_z_loss_mlp": 0.11035156, "step": 5771, "time_per_iteration": 2.8367111682891846 }, { "auxiliary_loss_clip": 0.01090666, "auxiliary_loss_mlp": 0.01058775, "balance_loss_clip": 1.02878189, "balance_loss_mlp": 1.0451349, "epoch": 0.347031414399519, "flos": 34452920183040.0, "grad_norm": 2.0748850720451593, "language_loss": 0.72962284, "learning_rate": 3.0350922152154557e-06, "loss": 0.75111723, "num_input_tokens_seen": 124240915, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.45507812, "step": 5772, "time_per_iteration": 2.5532376766204834 }, { "auxiliary_loss_clip": 0.01089597, "auxiliary_loss_mlp": 0.01069608, "balance_loss_clip": 1.03822029, "balance_loss_mlp": 1.04147112, "epoch": 0.347091537652187, "flos": 26942892733440.0, "grad_norm": 1.5255470421047013, "language_loss": 0.77800405, "learning_rate": 3.034758950632507e-06, "loss": 0.79959607, "num_input_tokens_seen": 124262770, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.48046875, "step": 5773, "time_per_iteration": 2.489081621170044 }, { "auxiliary_loss_clip": 0.01081958, "auxiliary_loss_mlp": 0.01078766, "balance_loss_clip": 1.04641247, "balance_loss_mlp": 1.03408098, "epoch": 0.34715166090485494, "flos": 21141165841920.0, "grad_norm": 2.232845841914769, "language_loss": 0.72458911, "learning_rate": 3.034425646811396e-06, "loss": 0.74619639, "num_input_tokens_seen": 124280950, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.47851562, "step": 5774, "time_per_iteration": 2.458618402481079 }, { "auxiliary_loss_clip": 0.01078337, "auxiliary_loss_mlp": 0.01098602, "balance_loss_clip": 1.06900287, "balance_loss_mlp": 1.03329992, "epoch": 0.3472117841575229, "flos": 23476855265280.0, "grad_norm": 1.762891794177019, "language_loss": 0.77353138, "learning_rate": 3.0340923037647602e-06, "loss": 0.79530078, "num_input_tokens_seen": 124299540, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.44921875, "step": 5775, "time_per_iteration": 2.4566147327423096 }, { "auxiliary_loss_clip": 0.01078285, "auxiliary_loss_mlp": 0.01103004, "balance_loss_clip": 1.06768227, "balance_loss_mlp": 1.02934825, "epoch": 0.34727190741019087, "flos": 17491869313920.0, "grad_norm": 2.008549467131087, "language_loss": 0.80387831, "learning_rate": 3.0337589215052404e-06, "loss": 0.82569122, "num_input_tokens_seen": 124316285, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.48828125, "step": 5776, "time_per_iteration": 2.408928155899048 }, { "auxiliary_loss_clip": 0.01048063, "auxiliary_loss_mlp": 0.01088732, "balance_loss_clip": 1.08451223, "balance_loss_mlp": 1.0369215, "epoch": 0.34733203066285884, "flos": 65261848498560.0, "grad_norm": 0.9117067832794445, "language_loss": 0.63429564, "learning_rate": 3.033425500045478e-06, "loss": 0.65566361, "num_input_tokens_seen": 124376650, "router_z_loss_clip": 0.04223633, "router_z_loss_mlp": 0.11132812, "step": 5777, "time_per_iteration": 3.0589051246643066 }, { "auxiliary_loss_clip": 0.01073588, "auxiliary_loss_mlp": 0.01097974, "balance_loss_clip": 1.06631231, "balance_loss_mlp": 1.02584147, "epoch": 0.3473921539155268, "flos": 28657442424960.0, "grad_norm": 1.7667876768086608, "language_loss": 0.66678029, "learning_rate": 3.033092039398119e-06, "loss": 0.68849593, "num_input_tokens_seen": 124396475, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.4765625, "step": 5778, "time_per_iteration": 2.5368399620056152 }, { "auxiliary_loss_clip": 0.01076264, "auxiliary_loss_mlp": 0.01095614, "balance_loss_clip": 1.06276, "balance_loss_mlp": 1.02797508, "epoch": 0.3474522771681948, "flos": 40835497230720.0, "grad_norm": 1.801942470518592, "language_loss": 0.74106085, "learning_rate": 3.0327585395758046e-06, "loss": 0.76277965, "num_input_tokens_seen": 124416480, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.48242188, "step": 5779, "time_per_iteration": 2.5849602222442627 }, { "auxiliary_loss_clip": 0.01079472, "auxiliary_loss_mlp": 0.01089844, "balance_loss_clip": 1.05701399, "balance_loss_mlp": 1.0308435, "epoch": 0.3475124004208628, "flos": 24607412778240.0, "grad_norm": 1.8460594367499052, "language_loss": 0.64182055, "learning_rate": 3.0324250005911837e-06, "loss": 0.66351372, "num_input_tokens_seen": 124435950, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.484375, "step": 5780, "time_per_iteration": 2.546299934387207 }, { "auxiliary_loss_clip": 0.01082602, "auxiliary_loss_mlp": 0.01081822, "balance_loss_clip": 1.051054, "balance_loss_mlp": 1.03695893, "epoch": 0.34757252367353075, "flos": 22710197520000.0, "grad_norm": 1.6784908181188403, "language_loss": 0.72763121, "learning_rate": 3.0320914224569033e-06, "loss": 0.74927545, "num_input_tokens_seen": 124455410, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.45703125, "step": 5781, "time_per_iteration": 2.4660961627960205 }, { "auxiliary_loss_clip": 0.01092704, "auxiliary_loss_mlp": 0.01078364, "balance_loss_clip": 1.04415083, "balance_loss_mlp": 1.04488182, "epoch": 0.3476326469261987, "flos": 19827174712320.0, "grad_norm": 2.0576391041474738, "language_loss": 0.79391694, "learning_rate": 3.031757805185612e-06, "loss": 0.81562757, "num_input_tokens_seen": 124474870, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.47851562, "step": 5782, "time_per_iteration": 2.467360496520996 }, { "auxiliary_loss_clip": 0.01094587, "auxiliary_loss_mlp": 0.01064523, "balance_loss_clip": 1.03486419, "balance_loss_mlp": 1.0485729, "epoch": 0.3476927701788667, "flos": 19937081272320.0, "grad_norm": 3.7160584077882852, "language_loss": 0.63997865, "learning_rate": 3.0314241487899622e-06, "loss": 0.66156977, "num_input_tokens_seen": 124494105, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4609375, "step": 5783, "time_per_iteration": 2.438946485519409 }, { "auxiliary_loss_clip": 0.01105588, "auxiliary_loss_mlp": 0.01053358, "balance_loss_clip": 1.02422309, "balance_loss_mlp": 1.05944991, "epoch": 0.34775289343153465, "flos": 20734218501120.0, "grad_norm": 2.1068858006369204, "language_loss": 0.89118379, "learning_rate": 3.031090453282605e-06, "loss": 0.91277325, "num_input_tokens_seen": 124512030, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.4609375, "step": 5784, "time_per_iteration": 2.5052058696746826 }, { "auxiliary_loss_clip": 0.01110561, "auxiliary_loss_mlp": 0.01047225, "balance_loss_clip": 1.01605225, "balance_loss_mlp": 1.06364048, "epoch": 0.3478130166842026, "flos": 19353822232320.0, "grad_norm": 1.7733480970983828, "language_loss": 0.83180451, "learning_rate": 3.0307567186761946e-06, "loss": 0.85338241, "num_input_tokens_seen": 124530980, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.46875, "step": 5785, "time_per_iteration": 2.4215545654296875 }, { "auxiliary_loss_clip": 0.01115716, "auxiliary_loss_mlp": 0.01054244, "balance_loss_clip": 1.02553821, "balance_loss_mlp": 1.0663892, "epoch": 0.3478731399368706, "flos": 22050199578240.0, "grad_norm": 1.711744055374755, "language_loss": 0.81426966, "learning_rate": 3.0304229449833862e-06, "loss": 0.83596927, "num_input_tokens_seen": 124549330, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.49414062, "step": 5786, "time_per_iteration": 2.468496799468994 }, { "auxiliary_loss_clip": 0.01117511, "auxiliary_loss_mlp": 0.0105036, "balance_loss_clip": 1.02009261, "balance_loss_mlp": 1.07008278, "epoch": 0.34793326318953854, "flos": 18040459507200.0, "grad_norm": 2.188406086539704, "language_loss": 0.76106763, "learning_rate": 3.030089132216836e-06, "loss": 0.78274632, "num_input_tokens_seen": 124567200, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.47460938, "step": 5787, "time_per_iteration": 2.425279140472412 }, { "auxiliary_loss_clip": 0.01119925, "auxiliary_loss_mlp": 0.01055549, "balance_loss_clip": 1.02679563, "balance_loss_mlp": 1.0695622, "epoch": 0.3479933864422065, "flos": 29313390648960.0, "grad_norm": 1.5681236698017509, "language_loss": 0.83004665, "learning_rate": 3.029755280389203e-06, "loss": 0.8518014, "num_input_tokens_seen": 124587025, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.5, "step": 5788, "time_per_iteration": 2.531289577484131 }, { "auxiliary_loss_clip": 0.01124634, "auxiliary_loss_mlp": 0.0106496, "balance_loss_clip": 1.03170121, "balance_loss_mlp": 1.07165122, "epoch": 0.3480535096948745, "flos": 20119677016320.0, "grad_norm": 2.361560001324782, "language_loss": 0.87768996, "learning_rate": 3.029421389513147e-06, "loss": 0.8995859, "num_input_tokens_seen": 124605860, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.53125, "step": 5789, "time_per_iteration": 2.459383726119995 }, { "auxiliary_loss_clip": 0.01125235, "auxiliary_loss_mlp": 0.01085725, "balance_loss_clip": 1.05265641, "balance_loss_mlp": 1.07369626, "epoch": 0.34811363294754244, "flos": 18548061897600.0, "grad_norm": 1.854966163884327, "language_loss": 0.860726, "learning_rate": 3.029087459601328e-06, "loss": 0.88283557, "num_input_tokens_seen": 124624270, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.515625, "step": 5790, "time_per_iteration": 2.484607696533203 }, { "auxiliary_loss_clip": 0.01122616, "auxiliary_loss_mlp": 0.01084353, "balance_loss_clip": 1.05338287, "balance_loss_mlp": 1.07115054, "epoch": 0.3481737562002104, "flos": 26869086385920.0, "grad_norm": 2.0005381945502663, "language_loss": 0.82757109, "learning_rate": 3.0287534906664097e-06, "loss": 0.84964073, "num_input_tokens_seen": 124644005, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.515625, "step": 5791, "time_per_iteration": 3.9457123279571533 }, { "auxiliary_loss_clip": 0.01118577, "auxiliary_loss_mlp": 0.01073195, "balance_loss_clip": 1.03936386, "balance_loss_mlp": 1.0673548, "epoch": 0.3482338794528784, "flos": 28907525560320.0, "grad_norm": 1.6859826228784331, "language_loss": 0.78889602, "learning_rate": 3.028419482721056e-06, "loss": 0.81081378, "num_input_tokens_seen": 124663020, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.51171875, "step": 5792, "time_per_iteration": 3.9383633136749268 }, { "auxiliary_loss_clip": 0.01111965, "auxiliary_loss_mlp": 0.01078371, "balance_loss_clip": 1.04695928, "balance_loss_mlp": 1.06287336, "epoch": 0.3482940027055464, "flos": 22199662575360.0, "grad_norm": 1.4940570846892904, "language_loss": 0.82832783, "learning_rate": 3.0280854357779325e-06, "loss": 0.85023117, "num_input_tokens_seen": 124682975, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.4921875, "step": 5793, "time_per_iteration": 2.513511896133423 }, { "auxiliary_loss_clip": 0.01114223, "auxiliary_loss_mlp": 0.01076586, "balance_loss_clip": 1.04361308, "balance_loss_mlp": 1.06441689, "epoch": 0.34835412595821436, "flos": 20301679267200.0, "grad_norm": 1.8946959068397815, "language_loss": 0.77535689, "learning_rate": 3.027751349849706e-06, "loss": 0.79726493, "num_input_tokens_seen": 124701340, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.49609375, "step": 5794, "time_per_iteration": 2.465667247772217 }, { "auxiliary_loss_clip": 0.01109408, "auxiliary_loss_mlp": 0.01069368, "balance_loss_clip": 1.03863549, "balance_loss_mlp": 1.06124735, "epoch": 0.3484142492108823, "flos": 20448628646400.0, "grad_norm": 1.963909151380665, "language_loss": 0.57904178, "learning_rate": 3.0274172249490456e-06, "loss": 0.60082948, "num_input_tokens_seen": 124719165, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.48046875, "step": 5795, "time_per_iteration": 2.496779441833496 }, { "auxiliary_loss_clip": 0.01104478, "auxiliary_loss_mlp": 0.0104781, "balance_loss_clip": 1.01984334, "balance_loss_mlp": 1.05808794, "epoch": 0.3484743724635503, "flos": 24351778736640.0, "grad_norm": 1.639251149015611, "language_loss": 0.83856297, "learning_rate": 3.0270830610886213e-06, "loss": 0.86008584, "num_input_tokens_seen": 124738670, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.46484375, "step": 5796, "time_per_iteration": 3.887704849243164 }, { "auxiliary_loss_clip": 0.01099486, "auxiliary_loss_mlp": 0.01056848, "balance_loss_clip": 1.02728462, "balance_loss_mlp": 1.05388904, "epoch": 0.34853449571621825, "flos": 24351848559360.0, "grad_norm": 2.392264744977286, "language_loss": 0.84519941, "learning_rate": 3.0267488582811033e-06, "loss": 0.86676276, "num_input_tokens_seen": 124758760, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.45507812, "step": 5797, "time_per_iteration": 3.8614859580993652 }, { "auxiliary_loss_clip": 0.01095737, "auxiliary_loss_mlp": 0.0105148, "balance_loss_clip": 1.02054513, "balance_loss_mlp": 1.05125594, "epoch": 0.3485946189688862, "flos": 27266572748160.0, "grad_norm": 1.629144811364495, "language_loss": 0.74175072, "learning_rate": 3.026414616539167e-06, "loss": 0.76322287, "num_input_tokens_seen": 124777765, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4453125, "step": 5798, "time_per_iteration": 2.5441410541534424 }, { "auxiliary_loss_clip": 0.0108839, "auxiliary_loss_mlp": 0.01050674, "balance_loss_clip": 1.02058578, "balance_loss_mlp": 1.04271555, "epoch": 0.3486547422215542, "flos": 20155672494720.0, "grad_norm": 1.839228890646014, "language_loss": 0.77217484, "learning_rate": 3.026080335875485e-06, "loss": 0.79356539, "num_input_tokens_seen": 124796775, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.45703125, "step": 5799, "time_per_iteration": 2.4297337532043457 }, { "auxiliary_loss_clip": 0.01088434, "auxiliary_loss_mlp": 0.01050925, "balance_loss_clip": 1.02045512, "balance_loss_mlp": 1.04215097, "epoch": 0.34871486547422215, "flos": 20229304285440.0, "grad_norm": 1.9143831638052071, "language_loss": 0.76959896, "learning_rate": 3.025746016302734e-06, "loss": 0.7909925, "num_input_tokens_seen": 124815825, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.46289062, "step": 5800, "time_per_iteration": 2.513524293899536 }, { "auxiliary_loss_clip": 0.01085408, "auxiliary_loss_mlp": 0.01051368, "balance_loss_clip": 1.01906276, "balance_loss_mlp": 1.03784585, "epoch": 0.3487749887268901, "flos": 44051591208960.0, "grad_norm": 2.968128044711718, "language_loss": 0.68710959, "learning_rate": 3.025411657833591e-06, "loss": 0.70847732, "num_input_tokens_seen": 124838420, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.47460938, "step": 5801, "time_per_iteration": 2.623863458633423 }, { "auxiliary_loss_clip": 0.01079768, "auxiliary_loss_mlp": 0.01053664, "balance_loss_clip": 1.02097714, "balance_loss_mlp": 1.03553104, "epoch": 0.3488351119795581, "flos": 23294015141760.0, "grad_norm": 2.0251315136323544, "language_loss": 0.77438146, "learning_rate": 3.025077260480735e-06, "loss": 0.79571581, "num_input_tokens_seen": 124857320, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.44335938, "step": 5802, "time_per_iteration": 2.5143930912017822 }, { "auxiliary_loss_clip": 0.01074823, "auxiliary_loss_mlp": 0.01046755, "balance_loss_clip": 1.01695275, "balance_loss_mlp": 1.03140593, "epoch": 0.34889523523222604, "flos": 19933904338560.0, "grad_norm": 1.8092280897434303, "language_loss": 0.7998361, "learning_rate": 3.0247428242568474e-06, "loss": 0.8210519, "num_input_tokens_seen": 124875685, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.43359375, "step": 5803, "time_per_iteration": 2.445662260055542 }, { "auxiliary_loss_clip": 0.01077751, "auxiliary_loss_mlp": 0.01052983, "balance_loss_clip": 1.0201292, "balance_loss_mlp": 1.02976847, "epoch": 0.348955358484894, "flos": 30444855857280.0, "grad_norm": 1.7566057106547157, "language_loss": 0.68784267, "learning_rate": 3.0244083491746085e-06, "loss": 0.70915002, "num_input_tokens_seen": 124895960, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.48046875, "step": 5804, "time_per_iteration": 2.5335896015167236 }, { "auxiliary_loss_clip": 0.01072207, "auxiliary_loss_mlp": 0.01049267, "balance_loss_clip": 1.02099025, "balance_loss_mlp": 1.02983963, "epoch": 0.349015481737562, "flos": 17999122590720.0, "grad_norm": 1.9277410812636442, "language_loss": 0.78667593, "learning_rate": 3.024073835246702e-06, "loss": 0.80789071, "num_input_tokens_seen": 124914140, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.42382812, "step": 5805, "time_per_iteration": 2.3964343070983887 }, { "auxiliary_loss_clip": 0.01075776, "auxiliary_loss_mlp": 0.01048611, "balance_loss_clip": 1.01554191, "balance_loss_mlp": 1.03013408, "epoch": 0.34907560499023, "flos": 27197269966080.0, "grad_norm": 2.8325112695221346, "language_loss": 0.69236374, "learning_rate": 3.023739282485814e-06, "loss": 0.71360755, "num_input_tokens_seen": 124934180, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.45703125, "step": 5806, "time_per_iteration": 2.5256264209747314 }, { "auxiliary_loss_clip": 0.01072899, "auxiliary_loss_mlp": 0.01043246, "balance_loss_clip": 1.01295519, "balance_loss_mlp": 1.02797508, "epoch": 0.34913572824289796, "flos": 30225566407680.0, "grad_norm": 1.4595041924837142, "language_loss": 0.73442417, "learning_rate": 3.023404690904629e-06, "loss": 0.75558555, "num_input_tokens_seen": 124956060, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.44921875, "step": 5807, "time_per_iteration": 2.4796314239501953 }, { "auxiliary_loss_clip": 0.01070951, "auxiliary_loss_mlp": 0.0105087, "balance_loss_clip": 1.0178256, "balance_loss_mlp": 1.02415967, "epoch": 0.3491958514955659, "flos": 29970595681920.0, "grad_norm": 1.9628529179786274, "language_loss": 0.7554847, "learning_rate": 3.0230700605158364e-06, "loss": 0.77670294, "num_input_tokens_seen": 124976070, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.46875, "step": 5808, "time_per_iteration": 2.5145537853240967 }, { "auxiliary_loss_clip": 0.01069236, "auxiliary_loss_mlp": 0.01050388, "balance_loss_clip": 1.01960826, "balance_loss_mlp": 1.02499926, "epoch": 0.3492559747482339, "flos": 22782188476800.0, "grad_norm": 1.8314000773603696, "language_loss": 0.85023177, "learning_rate": 3.0227353913321238e-06, "loss": 0.87142801, "num_input_tokens_seen": 124996995, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.44140625, "step": 5809, "time_per_iteration": 2.429583787918091 }, { "auxiliary_loss_clip": 0.0106765, "auxiliary_loss_mlp": 0.01049657, "balance_loss_clip": 1.02028346, "balance_loss_mlp": 1.02416658, "epoch": 0.34931609800090185, "flos": 26066817187200.0, "grad_norm": 1.9821434360238217, "language_loss": 0.8134715, "learning_rate": 3.0224006833661835e-06, "loss": 0.83464456, "num_input_tokens_seen": 125015600, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.43359375, "step": 5810, "time_per_iteration": 2.487069845199585 }, { "auxiliary_loss_clip": 0.01069497, "auxiliary_loss_mlp": 0.01051567, "balance_loss_clip": 1.01949978, "balance_loss_mlp": 1.02412069, "epoch": 0.3493762212535698, "flos": 29240736376320.0, "grad_norm": 1.728257900291536, "language_loss": 0.76588452, "learning_rate": 3.0220659366307057e-06, "loss": 0.78709519, "num_input_tokens_seen": 125035290, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.45507812, "step": 5811, "time_per_iteration": 2.4658031463623047 }, { "auxiliary_loss_clip": 0.01072955, "auxiliary_loss_mlp": 0.01050297, "balance_loss_clip": 1.01857531, "balance_loss_mlp": 1.0242238, "epoch": 0.3494363445062378, "flos": 27124825161600.0, "grad_norm": 1.6241625702323288, "language_loss": 0.80598879, "learning_rate": 3.021731151138386e-06, "loss": 0.82722133, "num_input_tokens_seen": 125057130, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.48828125, "step": 5812, "time_per_iteration": 2.461359977722168 }, { "auxiliary_loss_clip": 0.01069946, "auxiliary_loss_mlp": 0.01049701, "balance_loss_clip": 1.01884985, "balance_loss_mlp": 1.02379894, "epoch": 0.34949646775890575, "flos": 12275391409920.0, "grad_norm": 1.95677368743276, "language_loss": 0.7167778, "learning_rate": 3.021396326901918e-06, "loss": 0.73797429, "num_input_tokens_seen": 125073720, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4609375, "step": 5813, "time_per_iteration": 2.373339891433716 }, { "auxiliary_loss_clip": 0.01067811, "auxiliary_loss_mlp": 0.01047476, "balance_loss_clip": 1.01590896, "balance_loss_mlp": 1.02293003, "epoch": 0.3495565910115737, "flos": 17164558517760.0, "grad_norm": 2.0369109248681463, "language_loss": 0.77316213, "learning_rate": 3.0210614639339998e-06, "loss": 0.79431504, "num_input_tokens_seen": 125090635, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.44921875, "step": 5814, "time_per_iteration": 2.3924319744110107 }, { "auxiliary_loss_clip": 0.01072467, "auxiliary_loss_mlp": 0.01059153, "balance_loss_clip": 1.02396214, "balance_loss_mlp": 1.02405119, "epoch": 0.3496167142642417, "flos": 26464547928960.0, "grad_norm": 1.5191922182525877, "language_loss": 0.85942668, "learning_rate": 3.020726562247328e-06, "loss": 0.88074291, "num_input_tokens_seen": 125110070, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.484375, "step": 5815, "time_per_iteration": 2.4504432678222656 }, { "auxiliary_loss_clip": 0.0106857, "auxiliary_loss_mlp": 0.01052559, "balance_loss_clip": 1.02092075, "balance_loss_mlp": 1.02305543, "epoch": 0.34967683751690964, "flos": 17414048160000.0, "grad_norm": 1.9438861608842157, "language_loss": 0.79244387, "learning_rate": 3.0203916218546024e-06, "loss": 0.81365514, "num_input_tokens_seen": 125125730, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.45507812, "step": 5816, "time_per_iteration": 2.3882343769073486 }, { "auxiliary_loss_clip": 0.01074945, "auxiliary_loss_mlp": 0.01055248, "balance_loss_clip": 1.02244151, "balance_loss_mlp": 1.02699399, "epoch": 0.3497369607695776, "flos": 22598964328320.0, "grad_norm": 2.259040377787045, "language_loss": 0.61393118, "learning_rate": 3.0200566427685246e-06, "loss": 0.63523316, "num_input_tokens_seen": 125146195, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.48046875, "step": 5817, "time_per_iteration": 2.4290599822998047 }, { "auxiliary_loss_clip": 0.01016321, "auxiliary_loss_mlp": 0.01021094, "balance_loss_clip": 1.01785171, "balance_loss_mlp": 1.00670481, "epoch": 0.34979708402224563, "flos": 68526193708800.0, "grad_norm": 0.8927622231140444, "language_loss": 0.59962922, "learning_rate": 3.0197216250017975e-06, "loss": 0.62000334, "num_input_tokens_seen": 125207790, "router_z_loss_clip": 0.0324707, "router_z_loss_mlp": 0.09619141, "step": 5818, "time_per_iteration": 3.125664234161377 }, { "auxiliary_loss_clip": 0.01067554, "auxiliary_loss_mlp": 0.01055638, "balance_loss_clip": 1.02209258, "balance_loss_mlp": 1.02396178, "epoch": 0.3498572072749136, "flos": 18988630744320.0, "grad_norm": 2.3915474440455147, "language_loss": 0.84068668, "learning_rate": 3.019386568567123e-06, "loss": 0.86191857, "num_input_tokens_seen": 125226220, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.4375, "step": 5819, "time_per_iteration": 2.383964776992798 }, { "auxiliary_loss_clip": 0.01069921, "auxiliary_loss_mlp": 0.01051929, "balance_loss_clip": 1.01928961, "balance_loss_mlp": 1.02367663, "epoch": 0.34991733052758156, "flos": 27817641648000.0, "grad_norm": 1.704943036394434, "language_loss": 0.71330804, "learning_rate": 3.0190514734772083e-06, "loss": 0.73452652, "num_input_tokens_seen": 125247485, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.46289062, "step": 5820, "time_per_iteration": 2.480193853378296 }, { "auxiliary_loss_clip": 0.01068976, "auxiliary_loss_mlp": 0.01059052, "balance_loss_clip": 1.02760458, "balance_loss_mlp": 1.02294505, "epoch": 0.3499774537802495, "flos": 33582779568000.0, "grad_norm": 1.689992856138994, "language_loss": 0.70676184, "learning_rate": 3.018716339744759e-06, "loss": 0.72804213, "num_input_tokens_seen": 125268625, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.4609375, "step": 5821, "time_per_iteration": 2.49688982963562 }, { "auxiliary_loss_clip": 0.01077178, "auxiliary_loss_mlp": 0.0107155, "balance_loss_clip": 1.03235364, "balance_loss_mlp": 1.02698457, "epoch": 0.3500375770329175, "flos": 23475633367680.0, "grad_norm": 2.0218095178282103, "language_loss": 0.75640047, "learning_rate": 3.0183811673824842e-06, "loss": 0.7778877, "num_input_tokens_seen": 125287530, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.50390625, "step": 5822, "time_per_iteration": 2.4625141620635986 }, { "auxiliary_loss_clip": 0.01072347, "auxiliary_loss_mlp": 0.01066652, "balance_loss_clip": 1.02874398, "balance_loss_mlp": 1.0242672, "epoch": 0.35009770028558546, "flos": 19025045159040.0, "grad_norm": 1.7937322377805607, "language_loss": 0.79493153, "learning_rate": 3.018045956403094e-06, "loss": 0.81632149, "num_input_tokens_seen": 125307020, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.48046875, "step": 5823, "time_per_iteration": 2.381380081176758 }, { "auxiliary_loss_clip": 0.01013318, "auxiliary_loss_mlp": 0.01009023, "balance_loss_clip": 1.00561333, "balance_loss_mlp": 1.00342298, "epoch": 0.3501578235382534, "flos": 68348555377920.0, "grad_norm": 0.716615717363725, "language_loss": 0.59391475, "learning_rate": 3.017710706819298e-06, "loss": 0.61413813, "num_input_tokens_seen": 125370445, "router_z_loss_clip": 0.03417969, "router_z_loss_mlp": 0.09863281, "step": 5824, "time_per_iteration": 3.0502030849456787 }, { "auxiliary_loss_clip": 0.01068371, "auxiliary_loss_mlp": 0.01056095, "balance_loss_clip": 1.02113152, "balance_loss_mlp": 1.02188325, "epoch": 0.3502179467909214, "flos": 21249850504320.0, "grad_norm": 2.5098443285060106, "language_loss": 0.86593747, "learning_rate": 3.017375418643811e-06, "loss": 0.88718212, "num_input_tokens_seen": 125388900, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.46484375, "step": 5825, "time_per_iteration": 2.406557559967041 }, { "auxiliary_loss_clip": 0.01069702, "auxiliary_loss_mlp": 0.01063073, "balance_loss_clip": 1.03045774, "balance_loss_mlp": 1.02350545, "epoch": 0.35027807004358935, "flos": 11942285328000.0, "grad_norm": 2.171718314278295, "language_loss": 0.8445484, "learning_rate": 3.0170400918893464e-06, "loss": 0.8658762, "num_input_tokens_seen": 125402675, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.4609375, "step": 5826, "time_per_iteration": 2.36505389213562 }, { "auxiliary_loss_clip": 0.01072013, "auxiliary_loss_mlp": 0.01064306, "balance_loss_clip": 1.03105879, "balance_loss_mlp": 1.02581215, "epoch": 0.3503381932962573, "flos": 21469838181120.0, "grad_norm": 1.6082830086500342, "language_loss": 0.81813109, "learning_rate": 3.0167047265686186e-06, "loss": 0.83949435, "num_input_tokens_seen": 125421360, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.4609375, "step": 5827, "time_per_iteration": 2.4067065715789795 }, { "auxiliary_loss_clip": 0.01070342, "auxiliary_loss_mlp": 0.01061582, "balance_loss_clip": 1.03019381, "balance_loss_mlp": 1.02428186, "epoch": 0.3503983165489253, "flos": 21250059972480.0, "grad_norm": 2.103535874486085, "language_loss": 0.72731251, "learning_rate": 3.0163693226943467e-06, "loss": 0.74863172, "num_input_tokens_seen": 125440000, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.4609375, "step": 5828, "time_per_iteration": 2.442513942718506 }, { "auxiliary_loss_clip": 0.01073836, "auxiliary_loss_mlp": 0.01074403, "balance_loss_clip": 1.03551662, "balance_loss_mlp": 1.02555621, "epoch": 0.35045843980159325, "flos": 27814569448320.0, "grad_norm": 21.868245058164973, "language_loss": 0.80895257, "learning_rate": 3.016033880279248e-06, "loss": 0.83043492, "num_input_tokens_seen": 125460390, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.48242188, "step": 5829, "time_per_iteration": 2.436824321746826 }, { "auxiliary_loss_clip": 0.01077436, "auxiliary_loss_mlp": 0.01069214, "balance_loss_clip": 1.02997041, "balance_loss_mlp": 1.02777731, "epoch": 0.3505185630542612, "flos": 25919972542080.0, "grad_norm": 2.4841017402925143, "language_loss": 0.73498297, "learning_rate": 3.0156983993360417e-06, "loss": 0.75644946, "num_input_tokens_seen": 125478410, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 0.49609375, "step": 5830, "time_per_iteration": 3.869079828262329 }, { "auxiliary_loss_clip": 0.0107125, "auxiliary_loss_mlp": 0.0105768, "balance_loss_clip": 1.02509999, "balance_loss_mlp": 1.02551258, "epoch": 0.35057868630692923, "flos": 20520724337280.0, "grad_norm": 2.1028258510182884, "language_loss": 0.89524508, "learning_rate": 3.0153628798774513e-06, "loss": 0.91653442, "num_input_tokens_seen": 125495975, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.45703125, "step": 5831, "time_per_iteration": 2.4045944213867188 }, { "auxiliary_loss_clip": 0.01072337, "auxiliary_loss_mlp": 0.01057534, "balance_loss_clip": 1.0254662, "balance_loss_mlp": 1.02552593, "epoch": 0.3506388095595972, "flos": 20447616216960.0, "grad_norm": 2.4926967062826866, "language_loss": 0.79624629, "learning_rate": 3.0150273219161985e-06, "loss": 0.81754494, "num_input_tokens_seen": 125515035, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.46875, "step": 5832, "time_per_iteration": 3.8219618797302246 }, { "auxiliary_loss_clip": 0.01073874, "auxiliary_loss_mlp": 0.01069771, "balance_loss_clip": 1.03396058, "balance_loss_mlp": 1.02619267, "epoch": 0.35069893281226516, "flos": 23108626488960.0, "grad_norm": 2.050430462970009, "language_loss": 0.73158526, "learning_rate": 3.014691725465008e-06, "loss": 0.75302184, "num_input_tokens_seen": 125535555, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.4765625, "step": 5833, "time_per_iteration": 2.4380710124969482 }, { "auxiliary_loss_clip": 0.01070897, "auxiliary_loss_mlp": 0.01053767, "balance_loss_clip": 1.02198529, "balance_loss_mlp": 1.02632165, "epoch": 0.35075905606493313, "flos": 27270762111360.0, "grad_norm": 1.3183549200075686, "language_loss": 0.81812167, "learning_rate": 3.014356090536606e-06, "loss": 0.83936834, "num_input_tokens_seen": 125558195, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.4453125, "step": 5834, "time_per_iteration": 4.001646518707275 }, { "auxiliary_loss_clip": 0.01075107, "auxiliary_loss_mlp": 0.01056426, "balance_loss_clip": 1.02344048, "balance_loss_mlp": 1.02957249, "epoch": 0.3508191793176011, "flos": 19127794890240.0, "grad_norm": 2.7909438498610744, "language_loss": 0.85360861, "learning_rate": 3.0140204171437183e-06, "loss": 0.87492394, "num_input_tokens_seen": 125575375, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.45507812, "step": 5835, "time_per_iteration": 2.38238263130188 }, { "auxiliary_loss_clip": 0.01072686, "auxiliary_loss_mlp": 0.01053783, "balance_loss_clip": 1.02293134, "balance_loss_mlp": 1.02703309, "epoch": 0.35087930257026906, "flos": 25556386976640.0, "grad_norm": 1.6237561918683274, "language_loss": 0.77774358, "learning_rate": 3.0136847052990754e-06, "loss": 0.79900825, "num_input_tokens_seen": 125596745, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.45703125, "step": 5836, "time_per_iteration": 3.8332507610321045 }, { "auxiliary_loss_clip": 0.01076074, "auxiliary_loss_mlp": 0.01049787, "balance_loss_clip": 1.01596761, "balance_loss_mlp": 1.02941525, "epoch": 0.350939425822937, "flos": 18003277042560.0, "grad_norm": 2.1377329023050295, "language_loss": 0.7868551, "learning_rate": 3.0133489550154074e-06, "loss": 0.80811375, "num_input_tokens_seen": 125613980, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.46679688, "step": 5837, "time_per_iteration": 2.3806662559509277 }, { "auxiliary_loss_clip": 0.01073167, "auxiliary_loss_mlp": 0.01047668, "balance_loss_clip": 1.01486123, "balance_loss_mlp": 1.02905583, "epoch": 0.350999549075605, "flos": 22272107379840.0, "grad_norm": 1.68650829982697, "language_loss": 0.69786578, "learning_rate": 3.0130131663054442e-06, "loss": 0.71907413, "num_input_tokens_seen": 125632100, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.44140625, "step": 5838, "time_per_iteration": 2.4404561519622803 }, { "auxiliary_loss_clip": 0.01073304, "auxiliary_loss_mlp": 0.01045845, "balance_loss_clip": 1.01380134, "balance_loss_mlp": 1.02742851, "epoch": 0.35105967232827295, "flos": 14391407358720.0, "grad_norm": 2.0484376735249374, "language_loss": 0.84954381, "learning_rate": 3.0126773391819215e-06, "loss": 0.87073529, "num_input_tokens_seen": 125649190, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.45703125, "step": 5839, "time_per_iteration": 2.372685432434082 }, { "auxiliary_loss_clip": 0.01075219, "auxiliary_loss_mlp": 0.01055769, "balance_loss_clip": 1.02125764, "balance_loss_mlp": 1.02731419, "epoch": 0.3511197955809409, "flos": 25081184194560.0, "grad_norm": 2.0777803660218988, "language_loss": 0.60660422, "learning_rate": 3.012341473657572e-06, "loss": 0.62791407, "num_input_tokens_seen": 125668680, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.48046875, "step": 5840, "time_per_iteration": 2.4933981895446777 }, { "auxiliary_loss_clip": 0.01074103, "auxiliary_loss_mlp": 0.01060412, "balance_loss_clip": 1.02557862, "balance_loss_mlp": 1.02763915, "epoch": 0.3511799188336089, "flos": 25882999545600.0, "grad_norm": 2.5483035567991013, "language_loss": 0.8878755, "learning_rate": 3.0120055697451322e-06, "loss": 0.90922064, "num_input_tokens_seen": 125686935, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.46484375, "step": 5841, "time_per_iteration": 2.4199652671813965 }, { "auxiliary_loss_clip": 0.0107551, "auxiliary_loss_mlp": 0.01062296, "balance_loss_clip": 1.0269146, "balance_loss_mlp": 1.0269978, "epoch": 0.35124004208627685, "flos": 20082704019840.0, "grad_norm": 1.8144776608022941, "language_loss": 0.76243138, "learning_rate": 3.0116696274573406e-06, "loss": 0.78380942, "num_input_tokens_seen": 125707180, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.48632812, "step": 5842, "time_per_iteration": 2.463029623031616 }, { "auxiliary_loss_clip": 0.0107189, "auxiliary_loss_mlp": 0.01063571, "balance_loss_clip": 1.0289526, "balance_loss_mlp": 1.02532101, "epoch": 0.3513001653389448, "flos": 17782521315840.0, "grad_norm": 2.0559801936783555, "language_loss": 0.70819163, "learning_rate": 3.0113336468069346e-06, "loss": 0.72954619, "num_input_tokens_seen": 125722780, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.46484375, "step": 5843, "time_per_iteration": 2.371574640274048 }, { "auxiliary_loss_clip": 0.01068952, "auxiliary_loss_mlp": 0.01057795, "balance_loss_clip": 1.02450013, "balance_loss_mlp": 1.02398062, "epoch": 0.3513602885916128, "flos": 29385870364800.0, "grad_norm": 3.106905195881391, "language_loss": 0.66746396, "learning_rate": 3.010997627806655e-06, "loss": 0.68873143, "num_input_tokens_seen": 125742110, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.44921875, "step": 5844, "time_per_iteration": 2.508284568786621 }, { "auxiliary_loss_clip": 0.01069272, "auxiliary_loss_mlp": 0.01064867, "balance_loss_clip": 1.03170264, "balance_loss_mlp": 1.02366662, "epoch": 0.3514204118442808, "flos": 16178960436480.0, "grad_norm": 2.029065472002873, "language_loss": 0.77351749, "learning_rate": 3.010661570469245e-06, "loss": 0.79485881, "num_input_tokens_seen": 125759980, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.45507812, "step": 5845, "time_per_iteration": 2.3708672523498535 }, { "auxiliary_loss_clip": 0.01069303, "auxiliary_loss_mlp": 0.01071077, "balance_loss_clip": 1.03877163, "balance_loss_mlp": 1.02419877, "epoch": 0.35148053509694877, "flos": 23833737849600.0, "grad_norm": 2.754368169222888, "language_loss": 0.755198, "learning_rate": 3.0103254748074465e-06, "loss": 0.77660179, "num_input_tokens_seen": 125772660, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.45117188, "step": 5846, "time_per_iteration": 2.4591095447540283 }, { "auxiliary_loss_clip": 0.01069659, "auxiliary_loss_mlp": 0.0106874, "balance_loss_clip": 1.03464627, "balance_loss_mlp": 1.0242362, "epoch": 0.35154065834961673, "flos": 20990376213120.0, "grad_norm": 1.7124426127153407, "language_loss": 0.76363039, "learning_rate": 3.0099893408340046e-06, "loss": 0.78501433, "num_input_tokens_seen": 125791935, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.45507812, "step": 5847, "time_per_iteration": 2.4986507892608643 }, { "auxiliary_loss_clip": 0.01066618, "auxiliary_loss_mlp": 0.01067266, "balance_loss_clip": 1.03541338, "balance_loss_mlp": 1.02045465, "epoch": 0.3516007816022847, "flos": 33254072317440.0, "grad_norm": 2.3514699440812414, "language_loss": 0.73501819, "learning_rate": 3.009653168561666e-06, "loss": 0.75635707, "num_input_tokens_seen": 125813455, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.4609375, "step": 5848, "time_per_iteration": 2.530108690261841 }, { "auxiliary_loss_clip": 0.01071461, "auxiliary_loss_mlp": 0.01065392, "balance_loss_clip": 1.0310955, "balance_loss_mlp": 1.02405024, "epoch": 0.35166090485495266, "flos": 11726207723520.0, "grad_norm": 2.3659354834879416, "language_loss": 0.91719878, "learning_rate": 3.009316958003178e-06, "loss": 0.93856728, "num_input_tokens_seen": 125827660, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.47460938, "step": 5849, "time_per_iteration": 2.350961446762085 }, { "auxiliary_loss_clip": 0.01066368, "auxiliary_loss_mlp": 0.01061013, "balance_loss_clip": 1.03090048, "balance_loss_mlp": 1.02238429, "epoch": 0.3517210281076206, "flos": 22637333779200.0, "grad_norm": 1.9798091018719801, "language_loss": 0.76457191, "learning_rate": 3.0089807091712897e-06, "loss": 0.7858457, "num_input_tokens_seen": 125846655, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.43945312, "step": 5850, "time_per_iteration": 2.462162494659424 }, { "auxiliary_loss_clip": 0.01066568, "auxiliary_loss_mlp": 0.01063041, "balance_loss_clip": 1.03145015, "balance_loss_mlp": 1.02296233, "epoch": 0.3517811513602886, "flos": 21321736727040.0, "grad_norm": 1.4271543594638207, "language_loss": 0.77033454, "learning_rate": 3.0086444220787515e-06, "loss": 0.79163063, "num_input_tokens_seen": 125866290, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.43554688, "step": 5851, "time_per_iteration": 2.4188969135284424 }, { "auxiliary_loss_clip": 0.01070571, "auxiliary_loss_mlp": 0.01067216, "balance_loss_clip": 1.0351249, "balance_loss_mlp": 1.02505326, "epoch": 0.35184127461295656, "flos": 21031817863680.0, "grad_norm": 1.9630680247349028, "language_loss": 0.88776636, "learning_rate": 3.0083080967383165e-06, "loss": 0.90914416, "num_input_tokens_seen": 125884620, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.45507812, "step": 5852, "time_per_iteration": 2.4202194213867188 }, { "auxiliary_loss_clip": 0.01067408, "auxiliary_loss_mlp": 0.01046332, "balance_loss_clip": 1.01792479, "balance_loss_mlp": 1.02460563, "epoch": 0.3519013978656245, "flos": 22454179453440.0, "grad_norm": 2.5511955901605523, "language_loss": 0.69199812, "learning_rate": 3.007971733162737e-06, "loss": 0.7131356, "num_input_tokens_seen": 125902430, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.42773438, "step": 5853, "time_per_iteration": 2.4069316387176514 }, { "auxiliary_loss_clip": 0.01068539, "auxiliary_loss_mlp": 0.01054725, "balance_loss_clip": 1.02454162, "balance_loss_mlp": 1.02428567, "epoch": 0.3519615211182925, "flos": 13114459048320.0, "grad_norm": 1.6479695697027004, "language_loss": 0.82341397, "learning_rate": 3.0076353313647686e-06, "loss": 0.84464669, "num_input_tokens_seen": 125920570, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.44335938, "step": 5854, "time_per_iteration": 2.386979103088379 }, { "auxiliary_loss_clip": 0.01067326, "auxiliary_loss_mlp": 0.01046318, "balance_loss_clip": 1.01709974, "balance_loss_mlp": 1.02433443, "epoch": 0.35202164437096045, "flos": 19134148757760.0, "grad_norm": 3.05291086643193, "language_loss": 0.74650729, "learning_rate": 3.0072988913571666e-06, "loss": 0.76764369, "num_input_tokens_seen": 125939800, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4296875, "step": 5855, "time_per_iteration": 2.3978748321533203 }, { "auxiliary_loss_clip": 0.01067845, "auxiliary_loss_mlp": 0.01051706, "balance_loss_clip": 1.02220201, "balance_loss_mlp": 1.02574563, "epoch": 0.3520817676236284, "flos": 26540972628480.0, "grad_norm": 2.1274468151501935, "language_loss": 0.72918022, "learning_rate": 3.006962413152691e-06, "loss": 0.75037575, "num_input_tokens_seen": 125958720, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.421875, "step": 5856, "time_per_iteration": 2.453897476196289 }, { "auxiliary_loss_clip": 0.01071365, "auxiliary_loss_mlp": 0.01057696, "balance_loss_clip": 1.02429318, "balance_loss_mlp": 1.02664888, "epoch": 0.3521418908762964, "flos": 44891776010880.0, "grad_norm": 1.5980651001252413, "language_loss": 0.63332146, "learning_rate": 3.0066258967640987e-06, "loss": 0.65461206, "num_input_tokens_seen": 125984310, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.44726562, "step": 5857, "time_per_iteration": 2.622370719909668 }, { "auxiliary_loss_clip": 0.01071055, "auxiliary_loss_mlp": 0.0104439, "balance_loss_clip": 1.01344371, "balance_loss_mlp": 1.02710831, "epoch": 0.3522020141289644, "flos": 20186536003200.0, "grad_norm": 2.063636088411625, "language_loss": 0.73862195, "learning_rate": 3.006289342204152e-06, "loss": 0.75977635, "num_input_tokens_seen": 126002410, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.43945312, "step": 5858, "time_per_iteration": 2.453098773956299 }, { "auxiliary_loss_clip": 0.01071276, "auxiliary_loss_mlp": 0.01042684, "balance_loss_clip": 1.014539, "balance_loss_mlp": 1.02655339, "epoch": 0.35226213738163237, "flos": 27562670922240.0, "grad_norm": 1.6638805473600786, "language_loss": 0.78389305, "learning_rate": 3.0059527494856126e-06, "loss": 0.80503261, "num_input_tokens_seen": 126022490, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.44726562, "step": 5859, "time_per_iteration": 2.5030131340026855 }, { "auxiliary_loss_clip": 0.0107748, "auxiliary_loss_mlp": 0.01058835, "balance_loss_clip": 1.02714896, "balance_loss_mlp": 1.03051591, "epoch": 0.35232226063430033, "flos": 22965203157120.0, "grad_norm": 1.722573559506577, "language_loss": 0.73505527, "learning_rate": 3.0056161186212435e-06, "loss": 0.75641841, "num_input_tokens_seen": 126042895, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.46875, "step": 5860, "time_per_iteration": 2.4506642818450928 }, { "auxiliary_loss_clip": 0.01074685, "auxiliary_loss_mlp": 0.0105284, "balance_loss_clip": 1.02191687, "balance_loss_mlp": 1.02744627, "epoch": 0.3523823838869683, "flos": 19167386238720.0, "grad_norm": 2.396963391109648, "language_loss": 0.69064307, "learning_rate": 3.005279449623811e-06, "loss": 0.71191823, "num_input_tokens_seen": 126060130, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.47265625, "step": 5861, "time_per_iteration": 2.390152931213379 }, { "auxiliary_loss_clip": 0.01072299, "auxiliary_loss_mlp": 0.01050618, "balance_loss_clip": 1.02279496, "balance_loss_mlp": 1.02963996, "epoch": 0.35244250713963626, "flos": 17930029276800.0, "grad_norm": 2.0247560652829937, "language_loss": 0.67655236, "learning_rate": 3.0049427425060815e-06, "loss": 0.69778156, "num_input_tokens_seen": 126077850, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.42578125, "step": 5862, "time_per_iteration": 2.4151179790496826 }, { "auxiliary_loss_clip": 0.01072004, "auxiliary_loss_mlp": 0.01057379, "balance_loss_clip": 1.02642059, "balance_loss_mlp": 1.02720714, "epoch": 0.35250263039230423, "flos": 21431503641600.0, "grad_norm": 1.9731807861699378, "language_loss": 0.78101951, "learning_rate": 3.0046059972808215e-06, "loss": 0.80231333, "num_input_tokens_seen": 126095985, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.44921875, "step": 5863, "time_per_iteration": 2.413172483444214 }, { "auxiliary_loss_clip": 0.01071304, "auxiliary_loss_mlp": 0.01053211, "balance_loss_clip": 1.02482724, "balance_loss_mlp": 1.02740407, "epoch": 0.3525627536449722, "flos": 27415651720320.0, "grad_norm": 1.878992995111161, "language_loss": 0.76735032, "learning_rate": 3.0042692139608024e-06, "loss": 0.7885955, "num_input_tokens_seen": 126116070, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.4375, "step": 5864, "time_per_iteration": 2.4646921157836914 }, { "auxiliary_loss_clip": 0.01069451, "auxiliary_loss_mlp": 0.01063094, "balance_loss_clip": 1.03262401, "balance_loss_mlp": 1.02553535, "epoch": 0.35262287689764016, "flos": 24788681890560.0, "grad_norm": 2.3340722691844595, "language_loss": 0.81399935, "learning_rate": 3.003932392558793e-06, "loss": 0.83532482, "num_input_tokens_seen": 126135205, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.43945312, "step": 5865, "time_per_iteration": 2.4382455348968506 }, { "auxiliary_loss_clip": 0.01074619, "auxiliary_loss_mlp": 0.01063684, "balance_loss_clip": 1.02980423, "balance_loss_mlp": 1.02789545, "epoch": 0.3526830001503081, "flos": 17820646387200.0, "grad_norm": 2.230431138181942, "language_loss": 0.83339953, "learning_rate": 3.0035955330875677e-06, "loss": 0.85478258, "num_input_tokens_seen": 126151895, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.46679688, "step": 5866, "time_per_iteration": 2.384157419204712 }, { "auxiliary_loss_clip": 0.01075429, "auxiliary_loss_mlp": 0.01068242, "balance_loss_clip": 1.03171659, "balance_loss_mlp": 1.02688789, "epoch": 0.3527431234029761, "flos": 18077118301440.0, "grad_norm": 2.110367290972529, "language_loss": 0.86571938, "learning_rate": 3.0032586355598986e-06, "loss": 0.88715613, "num_input_tokens_seen": 126168515, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.48632812, "step": 5867, "time_per_iteration": 2.4504923820495605 }, { "auxiliary_loss_clip": 0.01070498, "auxiliary_loss_mlp": 0.01058947, "balance_loss_clip": 1.02763033, "balance_loss_mlp": 1.02542102, "epoch": 0.35280324665564405, "flos": 19426336859520.0, "grad_norm": 2.003807251238924, "language_loss": 0.75284964, "learning_rate": 3.0029216999885613e-06, "loss": 0.77414405, "num_input_tokens_seen": 126186460, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.45117188, "step": 5868, "time_per_iteration": 2.3886945247650146 }, { "auxiliary_loss_clip": 0.01069903, "auxiliary_loss_mlp": 0.01060353, "balance_loss_clip": 1.0277133, "balance_loss_mlp": 1.02362132, "epoch": 0.352863369908312, "flos": 21503040750720.0, "grad_norm": 2.5811910562553564, "language_loss": 0.63547289, "learning_rate": 3.0025847263863327e-06, "loss": 0.65677547, "num_input_tokens_seen": 126206170, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.46484375, "step": 5869, "time_per_iteration": 2.4295918941497803 }, { "auxiliary_loss_clip": 0.01065218, "auxiliary_loss_mlp": 0.01055632, "balance_loss_clip": 1.02550793, "balance_loss_mlp": 1.02060056, "epoch": 0.35292349316098, "flos": 22308417060480.0, "grad_norm": 1.9411310933428056, "language_loss": 0.76186657, "learning_rate": 3.0022477147659917e-06, "loss": 0.78307509, "num_input_tokens_seen": 126225605, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.4453125, "step": 5870, "time_per_iteration": 3.831289291381836 }, { "auxiliary_loss_clip": 0.01064778, "auxiliary_loss_mlp": 0.01048144, "balance_loss_clip": 1.017555, "balance_loss_mlp": 1.02045274, "epoch": 0.352983616413648, "flos": 33108344835840.0, "grad_norm": 1.4832416753881874, "language_loss": 0.73011786, "learning_rate": 3.001910665140316e-06, "loss": 0.75124705, "num_input_tokens_seen": 126250230, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.44335938, "step": 5871, "time_per_iteration": 2.5230624675750732 }, { "auxiliary_loss_clip": 0.01063679, "auxiliary_loss_mlp": 0.01044158, "balance_loss_clip": 1.01565492, "balance_loss_mlp": 1.02080894, "epoch": 0.35304373966631597, "flos": 18695639681280.0, "grad_norm": 2.8041596327889367, "language_loss": 0.75396693, "learning_rate": 3.0015735775220873e-06, "loss": 0.77504534, "num_input_tokens_seen": 126268315, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4296875, "step": 5872, "time_per_iteration": 3.785813093185425 }, { "auxiliary_loss_clip": 0.01063378, "auxiliary_loss_mlp": 0.0105097, "balance_loss_clip": 1.02085757, "balance_loss_mlp": 1.020383, "epoch": 0.35310386291898394, "flos": 23363911416960.0, "grad_norm": 2.0325381720787665, "language_loss": 0.83384073, "learning_rate": 3.001236451924089e-06, "loss": 0.85498422, "num_input_tokens_seen": 126288390, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.4296875, "step": 5873, "time_per_iteration": 2.4024715423583984 }, { "auxiliary_loss_clip": 0.01068663, "auxiliary_loss_mlp": 0.01048845, "balance_loss_clip": 1.01751697, "balance_loss_mlp": 1.0218302, "epoch": 0.3531639861716519, "flos": 24460812512640.0, "grad_norm": 2.0341476463751844, "language_loss": 0.67914867, "learning_rate": 3.000899288359104e-06, "loss": 0.70032376, "num_input_tokens_seen": 126305750, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.46875, "step": 5874, "time_per_iteration": 3.811715602874756 }, { "auxiliary_loss_clip": 0.01013987, "auxiliary_loss_mlp": 0.01005445, "balance_loss_clip": 1.00213075, "balance_loss_mlp": 1.00461388, "epoch": 0.35322410942431987, "flos": 70309347955200.0, "grad_norm": 1.2716897174479562, "language_loss": 0.61611688, "learning_rate": 3.000562086839917e-06, "loss": 0.63631123, "num_input_tokens_seen": 126362495, "router_z_loss_clip": 0.03320312, "router_z_loss_mlp": 0.09375, "step": 5875, "time_per_iteration": 2.958010673522949 }, { "auxiliary_loss_clip": 0.01067565, "auxiliary_loss_mlp": 0.01050895, "balance_loss_clip": 1.02044868, "balance_loss_mlp": 1.02245986, "epoch": 0.35328423267698783, "flos": 19820087706240.0, "grad_norm": 1.795337470128282, "language_loss": 0.80736089, "learning_rate": 3.0002248473793163e-06, "loss": 0.82854551, "num_input_tokens_seen": 126378320, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.45117188, "step": 5876, "time_per_iteration": 3.7829110622406006 }, { "auxiliary_loss_clip": 0.01014476, "auxiliary_loss_mlp": 0.01006788, "balance_loss_clip": 1.00333107, "balance_loss_mlp": 1.00486827, "epoch": 0.3533443559296558, "flos": 60823516043520.0, "grad_norm": 0.6767174665494686, "language_loss": 0.56788683, "learning_rate": 2.999887569990088e-06, "loss": 0.58809948, "num_input_tokens_seen": 126442735, "router_z_loss_clip": 0.03466797, "router_z_loss_mlp": 0.09619141, "step": 5877, "time_per_iteration": 3.101242780685425 }, { "auxiliary_loss_clip": 0.01070625, "auxiliary_loss_mlp": 0.01053692, "balance_loss_clip": 1.02174437, "balance_loss_mlp": 1.02403033, "epoch": 0.35340447918232376, "flos": 24754571625600.0, "grad_norm": 1.416096287835821, "language_loss": 0.73383081, "learning_rate": 2.999550254685024e-06, "loss": 0.75507402, "num_input_tokens_seen": 126463090, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.46484375, "step": 5878, "time_per_iteration": 2.492274284362793 }, { "auxiliary_loss_clip": 0.01067711, "auxiliary_loss_mlp": 0.01060141, "balance_loss_clip": 1.02604675, "balance_loss_mlp": 1.02101851, "epoch": 0.3534646024349917, "flos": 21795298675200.0, "grad_norm": 1.7827743698549832, "language_loss": 0.79380178, "learning_rate": 2.9992129014769136e-06, "loss": 0.81508034, "num_input_tokens_seen": 126482105, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.46679688, "step": 5879, "time_per_iteration": 2.3939504623413086 }, { "auxiliary_loss_clip": 0.01071349, "auxiliary_loss_mlp": 0.01062873, "balance_loss_clip": 1.02393889, "balance_loss_mlp": 1.02296233, "epoch": 0.3535247256876597, "flos": 20011062176640.0, "grad_norm": 2.3373779713219376, "language_loss": 0.65776545, "learning_rate": 2.9988755103785493e-06, "loss": 0.67910767, "num_input_tokens_seen": 126502125, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.484375, "step": 5880, "time_per_iteration": 2.4295570850372314 }, { "auxiliary_loss_clip": 0.01068663, "auxiliary_loss_mlp": 0.01047985, "balance_loss_clip": 1.01446283, "balance_loss_mlp": 1.02235603, "epoch": 0.35358484894032766, "flos": 18186920127360.0, "grad_norm": 2.6210819054728267, "language_loss": 0.68265188, "learning_rate": 2.998538081402727e-06, "loss": 0.70381838, "num_input_tokens_seen": 126521950, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.46289062, "step": 5881, "time_per_iteration": 2.3914356231689453 }, { "auxiliary_loss_clip": 0.01063686, "auxiliary_loss_mlp": 0.01040784, "balance_loss_clip": 1.01287663, "balance_loss_mlp": 1.02084339, "epoch": 0.3536449721929956, "flos": 22819266207360.0, "grad_norm": 1.4117962647762172, "language_loss": 0.76876795, "learning_rate": 2.998200614562239e-06, "loss": 0.78981268, "num_input_tokens_seen": 126542445, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.42773438, "step": 5882, "time_per_iteration": 2.423184394836426 }, { "auxiliary_loss_clip": 0.01071949, "auxiliary_loss_mlp": 0.01055425, "balance_loss_clip": 1.02397776, "balance_loss_mlp": 1.02431321, "epoch": 0.3537050954456636, "flos": 26431135891200.0, "grad_norm": 2.184619424078453, "language_loss": 0.71764052, "learning_rate": 2.9978631098698847e-06, "loss": 0.73891431, "num_input_tokens_seen": 126560690, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.4765625, "step": 5883, "time_per_iteration": 2.4260873794555664 }, { "auxiliary_loss_clip": 0.01073587, "auxiliary_loss_mlp": 0.0105591, "balance_loss_clip": 1.02298474, "balance_loss_mlp": 1.02380657, "epoch": 0.3537652186983316, "flos": 17196329721600.0, "grad_norm": 1.9924791416319723, "language_loss": 0.79631013, "learning_rate": 2.9975255673384614e-06, "loss": 0.81760514, "num_input_tokens_seen": 126577620, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.49609375, "step": 5884, "time_per_iteration": 2.386765718460083 }, { "auxiliary_loss_clip": 0.01069176, "auxiliary_loss_mlp": 0.01045403, "balance_loss_clip": 1.01540947, "balance_loss_mlp": 1.02257681, "epoch": 0.3538253419509996, "flos": 19535754660480.0, "grad_norm": 2.05622164937663, "language_loss": 0.75919867, "learning_rate": 2.9971879869807673e-06, "loss": 0.78034449, "num_input_tokens_seen": 126596235, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.46679688, "step": 5885, "time_per_iteration": 2.404128074645996 }, { "auxiliary_loss_clip": 0.01070696, "auxiliary_loss_mlp": 0.01052158, "balance_loss_clip": 1.01882744, "balance_loss_mlp": 1.02305651, "epoch": 0.35388546520366754, "flos": 12127813626240.0, "grad_norm": 2.4076030117275615, "language_loss": 0.86429131, "learning_rate": 2.996850368809606e-06, "loss": 0.8855198, "num_input_tokens_seen": 126612830, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.4765625, "step": 5886, "time_per_iteration": 2.394165515899658 }, { "auxiliary_loss_clip": 0.01070238, "auxiliary_loss_mlp": 0.01055166, "balance_loss_clip": 1.01971364, "balance_loss_mlp": 1.02368748, "epoch": 0.3539455884563355, "flos": 19677257867520.0, "grad_norm": 2.7399385378990067, "language_loss": 0.79963672, "learning_rate": 2.9965127128377787e-06, "loss": 0.82089078, "num_input_tokens_seen": 126630910, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.46484375, "step": 5887, "time_per_iteration": 2.423200845718384 }, { "auxiliary_loss_clip": 0.01066792, "auxiliary_loss_mlp": 0.01054028, "balance_loss_clip": 1.01964796, "balance_loss_mlp": 1.02080846, "epoch": 0.35400571170900347, "flos": 18071218281600.0, "grad_norm": 1.8209362484386444, "language_loss": 0.66795528, "learning_rate": 2.996175019078089e-06, "loss": 0.68916351, "num_input_tokens_seen": 126648365, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.4609375, "step": 5888, "time_per_iteration": 2.356095552444458 }, { "auxiliary_loss_clip": 0.01068605, "auxiliary_loss_mlp": 0.01047242, "balance_loss_clip": 1.01608098, "balance_loss_mlp": 1.02273107, "epoch": 0.35406583496167143, "flos": 26066852098560.0, "grad_norm": 1.8228429992549593, "language_loss": 0.78310168, "learning_rate": 2.9958372875433437e-06, "loss": 0.80426019, "num_input_tokens_seen": 126667500, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.45898438, "step": 5889, "time_per_iteration": 2.4676947593688965 }, { "auxiliary_loss_clip": 0.01068735, "auxiliary_loss_mlp": 0.01050947, "balance_loss_clip": 1.01749706, "balance_loss_mlp": 1.02378702, "epoch": 0.3541259582143394, "flos": 19791423613440.0, "grad_norm": 1.831387166224742, "language_loss": 0.82527131, "learning_rate": 2.9954995182463478e-06, "loss": 0.84646815, "num_input_tokens_seen": 126686820, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.44921875, "step": 5890, "time_per_iteration": 2.394465446472168 }, { "auxiliary_loss_clip": 0.01066286, "auxiliary_loss_mlp": 0.01048276, "balance_loss_clip": 1.01678038, "balance_loss_mlp": 1.02164626, "epoch": 0.35418608146700736, "flos": 24021011715840.0, "grad_norm": 2.030284473592816, "language_loss": 0.81008446, "learning_rate": 2.99516171119991e-06, "loss": 0.83123004, "num_input_tokens_seen": 126706965, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.44726562, "step": 5891, "time_per_iteration": 2.424584150314331 }, { "auxiliary_loss_clip": 0.01066491, "auxiliary_loss_mlp": 0.01053575, "balance_loss_clip": 1.02031589, "balance_loss_mlp": 1.02117062, "epoch": 0.35424620471967533, "flos": 12384948856320.0, "grad_norm": 2.4270771381580407, "language_loss": 0.74889505, "learning_rate": 2.9948238664168415e-06, "loss": 0.77009571, "num_input_tokens_seen": 126724015, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.453125, "step": 5892, "time_per_iteration": 2.3653953075408936 }, { "auxiliary_loss_clip": 0.01070666, "auxiliary_loss_mlp": 0.01055943, "balance_loss_clip": 1.0208478, "balance_loss_mlp": 1.02411246, "epoch": 0.3543063279723433, "flos": 19672859036160.0, "grad_norm": 2.3215454876851522, "language_loss": 0.67737931, "learning_rate": 2.9944859839099518e-06, "loss": 0.69864547, "num_input_tokens_seen": 126737565, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.46484375, "step": 5893, "time_per_iteration": 2.3852875232696533 }, { "auxiliary_loss_clip": 0.01067166, "auxiliary_loss_mlp": 0.0105581, "balance_loss_clip": 1.01992798, "balance_loss_mlp": 1.02145123, "epoch": 0.35436645122501126, "flos": 21908102878080.0, "grad_norm": 3.581656501081007, "language_loss": 0.70871371, "learning_rate": 2.9941480636920533e-06, "loss": 0.72994345, "num_input_tokens_seen": 126756095, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.45703125, "step": 5894, "time_per_iteration": 2.4012954235076904 }, { "auxiliary_loss_clip": 0.0106782, "auxiliary_loss_mlp": 0.01042819, "balance_loss_clip": 1.01274228, "balance_loss_mlp": 1.02299309, "epoch": 0.3544265744776792, "flos": 21718629596160.0, "grad_norm": 1.6136607431682202, "language_loss": 0.75712818, "learning_rate": 2.9938101057759615e-06, "loss": 0.7782346, "num_input_tokens_seen": 126775455, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.44726562, "step": 5895, "time_per_iteration": 2.398312568664551 }, { "auxiliary_loss_clip": 0.01066601, "auxiliary_loss_mlp": 0.01045776, "balance_loss_clip": 1.01423311, "balance_loss_mlp": 1.0211575, "epoch": 0.3544866977303472, "flos": 21212214192000.0, "grad_norm": 1.779760119110029, "language_loss": 0.8460359, "learning_rate": 2.993472110174491e-06, "loss": 0.86715961, "num_input_tokens_seen": 126792320, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.453125, "step": 5896, "time_per_iteration": 2.3839147090911865 }, { "auxiliary_loss_clip": 0.01065845, "auxiliary_loss_mlp": 0.01058741, "balance_loss_clip": 1.02303791, "balance_loss_mlp": 1.02094567, "epoch": 0.35454682098301515, "flos": 29310213715200.0, "grad_norm": 1.648376702765252, "language_loss": 0.71419418, "learning_rate": 2.9931340769004576e-06, "loss": 0.73544002, "num_input_tokens_seen": 126813680, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.44921875, "step": 5897, "time_per_iteration": 2.4598047733306885 }, { "auxiliary_loss_clip": 0.01063499, "auxiliary_loss_mlp": 0.01044979, "balance_loss_clip": 1.01295972, "balance_loss_mlp": 1.01949406, "epoch": 0.3546069442356832, "flos": 24315434144640.0, "grad_norm": 1.7744692632218764, "language_loss": 0.827389, "learning_rate": 2.9927960059666816e-06, "loss": 0.84847385, "num_input_tokens_seen": 126834395, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.43945312, "step": 5898, "time_per_iteration": 2.4591777324676514 }, { "auxiliary_loss_clip": 0.01064279, "auxiliary_loss_mlp": 0.01055333, "balance_loss_clip": 1.02456534, "balance_loss_mlp": 1.02034807, "epoch": 0.35466706748835114, "flos": 22856169381120.0, "grad_norm": 1.523384857366134, "language_loss": 0.75781089, "learning_rate": 2.9924578973859804e-06, "loss": 0.77900702, "num_input_tokens_seen": 126855145, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.43945312, "step": 5899, "time_per_iteration": 2.4753365516662598 }, { "auxiliary_loss_clip": 0.01066842, "auxiliary_loss_mlp": 0.0104949, "balance_loss_clip": 1.01804292, "balance_loss_mlp": 1.02061927, "epoch": 0.3547271907410191, "flos": 28328839908480.0, "grad_norm": 1.7560839000387838, "language_loss": 0.80802709, "learning_rate": 2.9921197511711763e-06, "loss": 0.82919037, "num_input_tokens_seen": 126873790, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.46289062, "step": 5900, "time_per_iteration": 2.4402267932891846 }, { "auxiliary_loss_clip": 0.01066134, "auxiliary_loss_mlp": 0.01049795, "balance_loss_clip": 1.01615405, "balance_loss_mlp": 1.02062726, "epoch": 0.35478731399368707, "flos": 23512955477760.0, "grad_norm": 1.9008015261539222, "language_loss": 0.82821137, "learning_rate": 2.991781567335093e-06, "loss": 0.8493706, "num_input_tokens_seen": 126892865, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.45507812, "step": 5901, "time_per_iteration": 2.4487879276275635 }, { "auxiliary_loss_clip": 0.01069999, "auxiliary_loss_mlp": 0.01048107, "balance_loss_clip": 1.01510942, "balance_loss_mlp": 1.02154148, "epoch": 0.35484743724635504, "flos": 18623334522240.0, "grad_norm": 2.162189409690262, "language_loss": 0.77455759, "learning_rate": 2.9914433458905525e-06, "loss": 0.79573864, "num_input_tokens_seen": 126911935, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.484375, "step": 5902, "time_per_iteration": 2.4125750064849854 }, { "auxiliary_loss_clip": 0.01064323, "auxiliary_loss_mlp": 0.01053135, "balance_loss_clip": 1.01875532, "balance_loss_mlp": 1.01922405, "epoch": 0.354907560499023, "flos": 17383533765120.0, "grad_norm": 1.8517856797034262, "language_loss": 0.7217384, "learning_rate": 2.991105086850381e-06, "loss": 0.74291301, "num_input_tokens_seen": 126930040, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.45117188, "step": 5903, "time_per_iteration": 2.392615556716919 }, { "auxiliary_loss_clip": 0.01067691, "auxiliary_loss_mlp": 0.01049086, "balance_loss_clip": 1.0161128, "balance_loss_mlp": 1.02037311, "epoch": 0.35496768375169097, "flos": 19207536168960.0, "grad_norm": 2.3880845475986865, "language_loss": 0.76929057, "learning_rate": 2.9907667902274053e-06, "loss": 0.79045832, "num_input_tokens_seen": 126948390, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.47265625, "step": 5904, "time_per_iteration": 2.380796194076538 }, { "auxiliary_loss_clip": 0.01066593, "auxiliary_loss_mlp": 0.01056219, "balance_loss_clip": 1.02464008, "balance_loss_mlp": 1.02081418, "epoch": 0.35502780700435893, "flos": 18331809736320.0, "grad_norm": 2.155687167612292, "language_loss": 0.80951416, "learning_rate": 2.9904284560344536e-06, "loss": 0.83074224, "num_input_tokens_seen": 126964905, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.45703125, "step": 5905, "time_per_iteration": 2.3985402584075928 }, { "auxiliary_loss_clip": 0.01058806, "auxiliary_loss_mlp": 0.01042133, "balance_loss_clip": 1.01496482, "balance_loss_mlp": 1.01815021, "epoch": 0.3550879302570269, "flos": 15447704676480.0, "grad_norm": 2.3482271372688923, "language_loss": 0.73329389, "learning_rate": 2.990090084284356e-06, "loss": 0.75430334, "num_input_tokens_seen": 126982000, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40625, "step": 5906, "time_per_iteration": 2.3453664779663086 }, { "auxiliary_loss_clip": 0.01067315, "auxiliary_loss_mlp": 0.01051317, "balance_loss_clip": 1.0180341, "balance_loss_mlp": 1.02065253, "epoch": 0.35514805350969486, "flos": 21978173710080.0, "grad_norm": 2.017282851695568, "language_loss": 0.76692712, "learning_rate": 2.9897516749899426e-06, "loss": 0.78811342, "num_input_tokens_seen": 126998390, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.46679688, "step": 5907, "time_per_iteration": 2.408691644668579 }, { "auxiliary_loss_clip": 0.01064809, "auxiliary_loss_mlp": 0.01047079, "balance_loss_clip": 1.01644278, "balance_loss_mlp": 1.02040839, "epoch": 0.3552081767623628, "flos": 29860654210560.0, "grad_norm": 1.7290802599883692, "language_loss": 0.76374829, "learning_rate": 2.989413228164047e-06, "loss": 0.78486717, "num_input_tokens_seen": 127020220, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.4453125, "step": 5908, "time_per_iteration": 2.4615933895111084 }, { "auxiliary_loss_clip": 0.01064956, "auxiliary_loss_mlp": 0.01047343, "balance_loss_clip": 1.01546645, "balance_loss_mlp": 1.02050781, "epoch": 0.3552683000150308, "flos": 26431066068480.0, "grad_norm": 1.7696360803915674, "language_loss": 0.69150472, "learning_rate": 2.989074743819502e-06, "loss": 0.71262771, "num_input_tokens_seen": 127038585, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.4453125, "step": 5909, "time_per_iteration": 2.4923958778381348 }, { "auxiliary_loss_clip": 0.01062111, "auxiliary_loss_mlp": 0.01044917, "balance_loss_clip": 1.01649714, "balance_loss_mlp": 1.02016985, "epoch": 0.35532842326769876, "flos": 19785139568640.0, "grad_norm": 2.3952664861992257, "language_loss": 0.80038774, "learning_rate": 2.988736221969144e-06, "loss": 0.82145804, "num_input_tokens_seen": 127056215, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.41992188, "step": 5910, "time_per_iteration": 3.8053572177886963 }, { "auxiliary_loss_clip": 0.01066024, "auxiliary_loss_mlp": 0.01054541, "balance_loss_clip": 1.01741898, "balance_loss_mlp": 1.02028179, "epoch": 0.3553885465203668, "flos": 17238295042560.0, "grad_norm": 1.7258018015921595, "language_loss": 0.72651649, "learning_rate": 2.98839766262581e-06, "loss": 0.74772215, "num_input_tokens_seen": 127075825, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.45703125, "step": 5911, "time_per_iteration": 2.4054253101348877 }, { "auxiliary_loss_clip": 0.01064874, "auxiliary_loss_mlp": 0.01044717, "balance_loss_clip": 1.01529646, "balance_loss_mlp": 1.0211798, "epoch": 0.35544866977303474, "flos": 14933608773120.0, "grad_norm": 2.1147236413616812, "language_loss": 0.88431484, "learning_rate": 2.9880590658023366e-06, "loss": 0.90541077, "num_input_tokens_seen": 127091205, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.4375, "step": 5912, "time_per_iteration": 3.868826389312744 }, { "auxiliary_loss_clip": 0.01065692, "auxiliary_loss_mlp": 0.01047948, "balance_loss_clip": 1.01626253, "balance_loss_mlp": 1.02080011, "epoch": 0.3555087930257027, "flos": 19755009198720.0, "grad_norm": 2.1107034595955763, "language_loss": 0.78161645, "learning_rate": 2.9877204315115646e-06, "loss": 0.80275291, "num_input_tokens_seen": 127109210, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.44921875, "step": 5913, "time_per_iteration": 2.3935678005218506 }, { "auxiliary_loss_clip": 0.01063648, "auxiliary_loss_mlp": 0.01046729, "balance_loss_clip": 1.01565111, "balance_loss_mlp": 1.02022147, "epoch": 0.3555689162783707, "flos": 21067219848960.0, "grad_norm": 1.4940040987155385, "language_loss": 0.83378738, "learning_rate": 2.9873817597663353e-06, "loss": 0.85489118, "num_input_tokens_seen": 127128400, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.43359375, "step": 5914, "time_per_iteration": 3.8625292778015137 }, { "auxiliary_loss_clip": 0.01065104, "auxiliary_loss_mlp": 0.01053634, "balance_loss_clip": 1.02168608, "balance_loss_mlp": 1.02026892, "epoch": 0.35562903953103864, "flos": 33068334551040.0, "grad_norm": 2.2037554311518894, "language_loss": 0.72017616, "learning_rate": 2.98704305057949e-06, "loss": 0.74136353, "num_input_tokens_seen": 127149965, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.44921875, "step": 5915, "time_per_iteration": 3.8841097354888916 }, { "auxiliary_loss_clip": 0.01063432, "auxiliary_loss_mlp": 0.01049177, "balance_loss_clip": 1.01820672, "balance_loss_mlp": 1.01886773, "epoch": 0.3556891627837066, "flos": 20556824549760.0, "grad_norm": 1.7044241663243156, "language_loss": 0.76911569, "learning_rate": 2.9867043039638737e-06, "loss": 0.79024184, "num_input_tokens_seen": 127169865, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.4453125, "step": 5916, "time_per_iteration": 2.371776819229126 }, { "auxiliary_loss_clip": 0.01065847, "auxiliary_loss_mlp": 0.01045247, "balance_loss_clip": 1.01420522, "balance_loss_mlp": 1.02047861, "epoch": 0.35574928603637457, "flos": 20702307651840.0, "grad_norm": 2.59399549557887, "language_loss": 0.89360362, "learning_rate": 2.986365519932332e-06, "loss": 0.91471457, "num_input_tokens_seen": 127188075, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.453125, "step": 5917, "time_per_iteration": 2.432219982147217 }, { "auxiliary_loss_clip": 0.01064752, "auxiliary_loss_mlp": 0.0104503, "balance_loss_clip": 1.01473927, "balance_loss_mlp": 1.02012825, "epoch": 0.35580940928904253, "flos": 15193711468800.0, "grad_norm": 2.3542577751938683, "language_loss": 0.77640092, "learning_rate": 2.98602669849771e-06, "loss": 0.79749876, "num_input_tokens_seen": 127206065, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.4453125, "step": 5918, "time_per_iteration": 2.353813648223877 }, { "auxiliary_loss_clip": 0.01019447, "auxiliary_loss_mlp": 0.01008188, "balance_loss_clip": 1.00506461, "balance_loss_mlp": 1.0099169, "epoch": 0.3558695325417105, "flos": 58636312099200.0, "grad_norm": 0.9117779080903505, "language_loss": 0.63950503, "learning_rate": 2.985687839672857e-06, "loss": 0.65978134, "num_input_tokens_seen": 127257885, "router_z_loss_clip": 0.03125, "router_z_loss_mlp": 0.09570312, "step": 5919, "time_per_iteration": 2.793416738510132 }, { "auxiliary_loss_clip": 0.01066521, "auxiliary_loss_mlp": 0.0105378, "balance_loss_clip": 1.02185559, "balance_loss_mlp": 1.02011609, "epoch": 0.35592965579437846, "flos": 22017136654080.0, "grad_norm": 2.262989004329835, "language_loss": 0.75350797, "learning_rate": 2.9853489434706223e-06, "loss": 0.77471101, "num_input_tokens_seen": 127275550, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.46289062, "step": 5920, "time_per_iteration": 2.4035110473632812 }, { "auxiliary_loss_clip": 0.01062656, "auxiliary_loss_mlp": 0.01042159, "balance_loss_clip": 1.01292932, "balance_loss_mlp": 1.01898956, "epoch": 0.35598977904704643, "flos": 23366564680320.0, "grad_norm": 1.8774481929967304, "language_loss": 0.78455412, "learning_rate": 2.985010009903857e-06, "loss": 0.80560231, "num_input_tokens_seen": 127295110, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4375, "step": 5921, "time_per_iteration": 2.437350273132324 }, { "auxiliary_loss_clip": 0.01063537, "auxiliary_loss_mlp": 0.01052253, "balance_loss_clip": 1.02290416, "balance_loss_mlp": 1.01961923, "epoch": 0.3560499022997144, "flos": 17784371617920.0, "grad_norm": 1.989483671326524, "language_loss": 0.68982154, "learning_rate": 2.9846710389854133e-06, "loss": 0.7109794, "num_input_tokens_seen": 127312865, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.43945312, "step": 5922, "time_per_iteration": 2.3625378608703613 }, { "auxiliary_loss_clip": 0.01065749, "auxiliary_loss_mlp": 0.01046342, "balance_loss_clip": 1.01471591, "balance_loss_mlp": 1.02030969, "epoch": 0.35611002555238236, "flos": 20739420293760.0, "grad_norm": 3.0009698723860914, "language_loss": 0.80234349, "learning_rate": 2.9843320307281454e-06, "loss": 0.82346433, "num_input_tokens_seen": 127331710, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.45507812, "step": 5923, "time_per_iteration": 2.429516077041626 }, { "auxiliary_loss_clip": 0.01064949, "auxiliary_loss_mlp": 0.01051573, "balance_loss_clip": 1.02264106, "balance_loss_mlp": 1.0201757, "epoch": 0.3561701488050504, "flos": 19461250085760.0, "grad_norm": 1.7010726793893929, "language_loss": 0.86465871, "learning_rate": 2.983992985144908e-06, "loss": 0.88582397, "num_input_tokens_seen": 127350950, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.44726562, "step": 5924, "time_per_iteration": 2.3667945861816406 }, { "auxiliary_loss_clip": 0.01063377, "auxiliary_loss_mlp": 0.0104622, "balance_loss_clip": 1.01604843, "balance_loss_mlp": 1.01984954, "epoch": 0.35623027205771834, "flos": 30773598019200.0, "grad_norm": 2.206200186468841, "language_loss": 0.79515958, "learning_rate": 2.9836539022485578e-06, "loss": 0.81625557, "num_input_tokens_seen": 127369385, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.43359375, "step": 5925, "time_per_iteration": 2.473376750946045 }, { "auxiliary_loss_clip": 0.01062794, "auxiliary_loss_mlp": 0.01051672, "balance_loss_clip": 1.02124977, "balance_loss_mlp": 1.01800299, "epoch": 0.3562903953103863, "flos": 16980182294400.0, "grad_norm": 1.8541525807553443, "language_loss": 0.76972282, "learning_rate": 2.9833147820519535e-06, "loss": 0.79086751, "num_input_tokens_seen": 127386965, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.44921875, "step": 5926, "time_per_iteration": 2.3773396015167236 }, { "auxiliary_loss_clip": 0.01065259, "auxiliary_loss_mlp": 0.01049155, "balance_loss_clip": 1.01863766, "balance_loss_mlp": 1.02068722, "epoch": 0.3563505185630543, "flos": 23838765085440.0, "grad_norm": 1.879992644150106, "language_loss": 0.70775855, "learning_rate": 2.9829756245679544e-06, "loss": 0.7289027, "num_input_tokens_seen": 127406075, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.4453125, "step": 5927, "time_per_iteration": 2.410907030105591 }, { "auxiliary_loss_clip": 0.01062549, "auxiliary_loss_mlp": 0.01047819, "balance_loss_clip": 1.01787353, "balance_loss_mlp": 1.01940703, "epoch": 0.35641064181572224, "flos": 22272351759360.0, "grad_norm": 2.2760989515387333, "language_loss": 0.80555177, "learning_rate": 2.9826364298094212e-06, "loss": 0.82665545, "num_input_tokens_seen": 127425350, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.43164062, "step": 5928, "time_per_iteration": 2.393364429473877 }, { "auxiliary_loss_clip": 0.0106311, "auxiliary_loss_mlp": 0.01044961, "balance_loss_clip": 1.01348972, "balance_loss_mlp": 1.01903343, "epoch": 0.3564707650683902, "flos": 23000186206080.0, "grad_norm": 1.3298212415451185, "language_loss": 0.82470512, "learning_rate": 2.982297197789215e-06, "loss": 0.84578586, "num_input_tokens_seen": 127446335, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.44140625, "step": 5929, "time_per_iteration": 2.4448800086975098 }, { "auxiliary_loss_clip": 0.0106153, "auxiliary_loss_mlp": 0.01044831, "balance_loss_clip": 1.01760328, "balance_loss_mlp": 1.01900959, "epoch": 0.35653088832105817, "flos": 14683385992320.0, "grad_norm": 1.6011721813081803, "language_loss": 0.71885234, "learning_rate": 2.981957928520201e-06, "loss": 0.73991597, "num_input_tokens_seen": 127462795, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.42578125, "step": 5930, "time_per_iteration": 2.4034078121185303 }, { "auxiliary_loss_clip": 0.01068014, "auxiliary_loss_mlp": 0.01052118, "balance_loss_clip": 1.02048016, "balance_loss_mlp": 1.02144051, "epoch": 0.35659101157372614, "flos": 23475947569920.0, "grad_norm": 2.0355608899358772, "language_loss": 0.69793952, "learning_rate": 2.981618622015244e-06, "loss": 0.71914077, "num_input_tokens_seen": 127482675, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.46484375, "step": 5931, "time_per_iteration": 2.389711380004883 }, { "auxiliary_loss_clip": 0.01063117, "auxiliary_loss_mlp": 0.01050125, "balance_loss_clip": 1.02072823, "balance_loss_mlp": 1.0196166, "epoch": 0.3566511348263941, "flos": 26577456865920.0, "grad_norm": 1.6999782461956683, "language_loss": 0.69833702, "learning_rate": 2.981279278287211e-06, "loss": 0.71946949, "num_input_tokens_seen": 127502275, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.43554688, "step": 5932, "time_per_iteration": 2.458024501800537 }, { "auxiliary_loss_clip": 0.01064145, "auxiliary_loss_mlp": 0.0104097, "balance_loss_clip": 1.01333737, "balance_loss_mlp": 1.02085912, "epoch": 0.35671125807906207, "flos": 13114179757440.0, "grad_norm": 2.1869938108248035, "language_loss": 0.80576915, "learning_rate": 2.980939897348969e-06, "loss": 0.82682031, "num_input_tokens_seen": 127520195, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.43359375, "step": 5933, "time_per_iteration": 2.356184482574463 }, { "auxiliary_loss_clip": 0.01068207, "auxiliary_loss_mlp": 0.01048756, "balance_loss_clip": 1.01857209, "balance_loss_mlp": 1.02263999, "epoch": 0.35677138133173003, "flos": 32999171414400.0, "grad_norm": 1.729261809243827, "language_loss": 0.71065176, "learning_rate": 2.980600479213388e-06, "loss": 0.73182142, "num_input_tokens_seen": 127544495, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.45507812, "step": 5934, "time_per_iteration": 2.5117685794830322 }, { "auxiliary_loss_clip": 0.0107058, "auxiliary_loss_mlp": 0.01051117, "balance_loss_clip": 1.01623583, "balance_loss_mlp": 1.02258396, "epoch": 0.356831504584398, "flos": 20776777315200.0, "grad_norm": 1.9215553334968103, "language_loss": 0.72177815, "learning_rate": 2.9802610238933384e-06, "loss": 0.74299514, "num_input_tokens_seen": 127563810, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.48046875, "step": 5935, "time_per_iteration": 2.4047012329101562 }, { "auxiliary_loss_clip": 0.01065027, "auxiliary_loss_mlp": 0.01046347, "balance_loss_clip": 1.01551914, "balance_loss_mlp": 1.02043939, "epoch": 0.35689162783706596, "flos": 12164786622720.0, "grad_norm": 2.2996971793763237, "language_loss": 0.79758334, "learning_rate": 2.979921531401692e-06, "loss": 0.81869704, "num_input_tokens_seen": 127579065, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4453125, "step": 5936, "time_per_iteration": 2.371316909790039 }, { "auxiliary_loss_clip": 0.01064579, "auxiliary_loss_mlp": 0.01049369, "balance_loss_clip": 1.01926875, "balance_loss_mlp": 1.02088523, "epoch": 0.356951751089734, "flos": 23840371008000.0, "grad_norm": 1.468998668375143, "language_loss": 0.66043937, "learning_rate": 2.9795820017513242e-06, "loss": 0.68157887, "num_input_tokens_seen": 127599105, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.4375, "step": 5937, "time_per_iteration": 2.399121046066284 }, { "auxiliary_loss_clip": 0.01067614, "auxiliary_loss_mlp": 0.01049209, "balance_loss_clip": 1.01808393, "balance_loss_mlp": 1.0220623, "epoch": 0.35701187434240195, "flos": 11721564512640.0, "grad_norm": 3.302620549499157, "language_loss": 0.79807794, "learning_rate": 2.9792424349551073e-06, "loss": 0.81924617, "num_input_tokens_seen": 127614940, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.45507812, "step": 5938, "time_per_iteration": 2.3717987537384033 }, { "auxiliary_loss_clip": 0.01068421, "auxiliary_loss_mlp": 0.01055643, "balance_loss_clip": 1.02582908, "balance_loss_mlp": 1.02283466, "epoch": 0.3570719975950699, "flos": 24897750577920.0, "grad_norm": 1.5444790169734557, "language_loss": 0.80613828, "learning_rate": 2.9789028310259202e-06, "loss": 0.82737899, "num_input_tokens_seen": 127634960, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.45703125, "step": 5939, "time_per_iteration": 2.4104559421539307 }, { "auxiliary_loss_clip": 0.01069795, "auxiliary_loss_mlp": 0.01049544, "balance_loss_clip": 1.01671338, "balance_loss_mlp": 1.02136254, "epoch": 0.3571321208477379, "flos": 25993639244160.0, "grad_norm": 1.694199067265966, "language_loss": 0.80435586, "learning_rate": 2.9785631899766395e-06, "loss": 0.82554924, "num_input_tokens_seen": 127654545, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.484375, "step": 5940, "time_per_iteration": 2.448390007019043 }, { "auxiliary_loss_clip": 0.0106845, "auxiliary_loss_mlp": 0.01046072, "balance_loss_clip": 1.01445735, "balance_loss_mlp": 1.0222789, "epoch": 0.35719224410040584, "flos": 14500790248320.0, "grad_norm": 1.9205955218097863, "language_loss": 0.74857092, "learning_rate": 2.9782235118201443e-06, "loss": 0.76971614, "num_input_tokens_seen": 127672320, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.4609375, "step": 5941, "time_per_iteration": 2.4040355682373047 }, { "auxiliary_loss_clip": 0.01067902, "auxiliary_loss_mlp": 0.01049161, "balance_loss_clip": 1.01726043, "balance_loss_mlp": 1.02213752, "epoch": 0.3572523673530738, "flos": 31174121669760.0, "grad_norm": 5.281039865402096, "language_loss": 0.66156113, "learning_rate": 2.9778837965693154e-06, "loss": 0.68273181, "num_input_tokens_seen": 127693315, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.45703125, "step": 5942, "time_per_iteration": 2.5005507469177246 }, { "auxiliary_loss_clip": 0.01065252, "auxiliary_loss_mlp": 0.01049139, "balance_loss_clip": 1.01700008, "balance_loss_mlp": 1.02080858, "epoch": 0.3573124906057418, "flos": 15851056147200.0, "grad_norm": 2.0383366769961357, "language_loss": 0.7514838, "learning_rate": 2.9775440442370354e-06, "loss": 0.77262765, "num_input_tokens_seen": 127711570, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.4453125, "step": 5943, "time_per_iteration": 2.3491477966308594 }, { "auxiliary_loss_clip": 0.01012011, "auxiliary_loss_mlp": 0.01002918, "balance_loss_clip": 1.00016475, "balance_loss_mlp": 1.00254035, "epoch": 0.35737261385840974, "flos": 60819989996160.0, "grad_norm": 0.7858810965743278, "language_loss": 0.60776722, "learning_rate": 2.9772042548361867e-06, "loss": 0.62791651, "num_input_tokens_seen": 127772475, "router_z_loss_clip": 0.02758789, "router_z_loss_mlp": 0.09472656, "step": 5944, "time_per_iteration": 3.1353371143341064 }, { "auxiliary_loss_clip": 0.01063852, "auxiliary_loss_mlp": 0.01045954, "balance_loss_clip": 1.01678348, "balance_loss_mlp": 1.02031922, "epoch": 0.3574327371110777, "flos": 18842763617280.0, "grad_norm": 1.804717507623184, "language_loss": 0.74034417, "learning_rate": 2.976864428379655e-06, "loss": 0.76144218, "num_input_tokens_seen": 127790940, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.43554688, "step": 5945, "time_per_iteration": 2.3500185012817383 }, { "auxiliary_loss_clip": 0.01064433, "auxiliary_loss_mlp": 0.0105176, "balance_loss_clip": 1.02133822, "balance_loss_mlp": 1.02019489, "epoch": 0.35749286036374567, "flos": 23548566931200.0, "grad_norm": 1.6594313822798994, "language_loss": 0.82639319, "learning_rate": 2.976524564880326e-06, "loss": 0.84755504, "num_input_tokens_seen": 127808275, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.44140625, "step": 5946, "time_per_iteration": 2.4454147815704346 }, { "auxiliary_loss_clip": 0.01068131, "auxiliary_loss_mlp": 0.01049886, "balance_loss_clip": 1.01980948, "balance_loss_mlp": 1.02235746, "epoch": 0.35755298361641363, "flos": 21104437224960.0, "grad_norm": 1.5457970269799857, "language_loss": 0.70786452, "learning_rate": 2.9761846643510882e-06, "loss": 0.72904474, "num_input_tokens_seen": 127828840, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.45703125, "step": 5947, "time_per_iteration": 2.391890287399292 }, { "auxiliary_loss_clip": 0.01060558, "auxiliary_loss_mlp": 0.01053816, "balance_loss_clip": 1.02458596, "balance_loss_mlp": 1.01871848, "epoch": 0.3576131068690816, "flos": 19244020406400.0, "grad_norm": 1.7034024497160882, "language_loss": 0.77138948, "learning_rate": 2.9758447268048297e-06, "loss": 0.79253322, "num_input_tokens_seen": 127846240, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.41796875, "step": 5948, "time_per_iteration": 2.394029378890991 }, { "auxiliary_loss_clip": 0.01063668, "auxiliary_loss_mlp": 0.01046409, "balance_loss_clip": 1.01697636, "balance_loss_mlp": 1.01908684, "epoch": 0.35767323012174956, "flos": 28653532352640.0, "grad_norm": 1.8775238969478159, "language_loss": 0.72110635, "learning_rate": 2.9755047522544415e-06, "loss": 0.74220711, "num_input_tokens_seen": 127866880, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.4453125, "step": 5949, "time_per_iteration": 2.428607225418091 }, { "auxiliary_loss_clip": 0.01063862, "auxiliary_loss_mlp": 0.01054838, "balance_loss_clip": 1.02312791, "balance_loss_mlp": 1.01895857, "epoch": 0.35773335337441753, "flos": 17084607770880.0, "grad_norm": 1.870643689759122, "language_loss": 0.78834623, "learning_rate": 2.9751647407128154e-06, "loss": 0.80953324, "num_input_tokens_seen": 127883560, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.44921875, "step": 5950, "time_per_iteration": 3.821986675262451 }, { "auxiliary_loss_clip": 0.01065551, "auxiliary_loss_mlp": 0.01046991, "balance_loss_clip": 1.01548374, "balance_loss_mlp": 1.01914477, "epoch": 0.35779347662708555, "flos": 15887680030080.0, "grad_norm": 1.6445095727594887, "language_loss": 0.74353653, "learning_rate": 2.9748246921928445e-06, "loss": 0.76466197, "num_input_tokens_seen": 127902330, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.46484375, "step": 5951, "time_per_iteration": 3.7664127349853516 }, { "auxiliary_loss_clip": 0.01065693, "auxiliary_loss_mlp": 0.01048004, "balance_loss_clip": 1.01617503, "balance_loss_mlp": 1.01916647, "epoch": 0.3578535998797535, "flos": 28657547159040.0, "grad_norm": 2.4147427324980577, "language_loss": 0.70516396, "learning_rate": 2.9744846067074236e-06, "loss": 0.7263009, "num_input_tokens_seen": 127922325, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.46679688, "step": 5952, "time_per_iteration": 2.4257729053497314 }, { "auxiliary_loss_clip": 0.01063868, "auxiliary_loss_mlp": 0.01045514, "balance_loss_clip": 1.01680827, "balance_loss_mlp": 1.01978588, "epoch": 0.3579137231324215, "flos": 37850911678080.0, "grad_norm": 1.6566585885803995, "language_loss": 0.71127021, "learning_rate": 2.974144484269449e-06, "loss": 0.732364, "num_input_tokens_seen": 127942635, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.44140625, "step": 5953, "time_per_iteration": 2.558886766433716 }, { "auxiliary_loss_clip": 0.01063992, "auxiliary_loss_mlp": 0.01046988, "balance_loss_clip": 1.01587427, "balance_loss_mlp": 1.01936126, "epoch": 0.35797384638508944, "flos": 22345739170560.0, "grad_norm": 1.6991887591867438, "language_loss": 0.68051183, "learning_rate": 2.9738043248918175e-06, "loss": 0.70162165, "num_input_tokens_seen": 127962520, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.4453125, "step": 5954, "time_per_iteration": 3.8225338459014893 }, { "auxiliary_loss_clip": 0.01063925, "auxiliary_loss_mlp": 0.01053193, "balance_loss_clip": 1.02275908, "balance_loss_mlp": 1.01980245, "epoch": 0.3580339696377574, "flos": 13588858869120.0, "grad_norm": 2.6541281938852355, "language_loss": 0.76800513, "learning_rate": 2.9734641285874282e-06, "loss": 0.78917634, "num_input_tokens_seen": 127981180, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.44140625, "step": 5955, "time_per_iteration": 3.7609376907348633 }, { "auxiliary_loss_clip": 0.01062277, "auxiliary_loss_mlp": 0.01041863, "balance_loss_clip": 1.01405144, "balance_loss_mlp": 1.01980114, "epoch": 0.3580940928904254, "flos": 23767123242240.0, "grad_norm": 1.7661992695853792, "language_loss": 0.76903951, "learning_rate": 2.973123895369182e-06, "loss": 0.7900809, "num_input_tokens_seen": 127999725, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.42382812, "step": 5956, "time_per_iteration": 2.420297622680664 }, { "auxiliary_loss_clip": 0.01063856, "auxiliary_loss_mlp": 0.0104426, "balance_loss_clip": 1.016186, "balance_loss_mlp": 1.02039599, "epoch": 0.35815421614309334, "flos": 19462856008320.0, "grad_norm": 1.8226764865554828, "language_loss": 0.7470327, "learning_rate": 2.9727836252499805e-06, "loss": 0.76811385, "num_input_tokens_seen": 128018885, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.43359375, "step": 5957, "time_per_iteration": 2.363208293914795 }, { "auxiliary_loss_clip": 0.01065878, "auxiliary_loss_mlp": 0.01047364, "balance_loss_clip": 1.01622605, "balance_loss_mlp": 1.02104878, "epoch": 0.3582143393957613, "flos": 23367053439360.0, "grad_norm": 1.895376695454308, "language_loss": 0.73304701, "learning_rate": 2.972443318242726e-06, "loss": 0.75417936, "num_input_tokens_seen": 128037875, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.44726562, "step": 5958, "time_per_iteration": 2.4336812496185303 }, { "auxiliary_loss_clip": 0.01062786, "auxiliary_loss_mlp": 0.01042242, "balance_loss_clip": 1.01260674, "balance_loss_mlp": 1.01990092, "epoch": 0.35827446264842927, "flos": 26322067203840.0, "grad_norm": 1.724359855474124, "language_loss": 0.89889646, "learning_rate": 2.972102974360324e-06, "loss": 0.91994679, "num_input_tokens_seen": 128056045, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.42773438, "step": 5959, "time_per_iteration": 2.427905559539795 }, { "auxiliary_loss_clip": 0.01065615, "auxiliary_loss_mlp": 0.01046396, "balance_loss_clip": 1.01554418, "balance_loss_mlp": 1.02170694, "epoch": 0.35833458590109724, "flos": 30445274793600.0, "grad_norm": 2.1434773633272775, "language_loss": 0.59805429, "learning_rate": 2.971762593615679e-06, "loss": 0.61917436, "num_input_tokens_seen": 128077815, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.43945312, "step": 5960, "time_per_iteration": 2.472266435623169 }, { "auxiliary_loss_clip": 0.01066156, "auxiliary_loss_mlp": 0.01049976, "balance_loss_clip": 1.01616859, "balance_loss_mlp": 1.02083147, "epoch": 0.3583947091537652, "flos": 14829008739840.0, "grad_norm": 2.7142633682281456, "language_loss": 0.77716482, "learning_rate": 2.9714221760216993e-06, "loss": 0.79832619, "num_input_tokens_seen": 128095460, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.453125, "step": 5961, "time_per_iteration": 2.3718082904815674 }, { "auxiliary_loss_clip": 0.01067374, "auxiliary_loss_mlp": 0.0104379, "balance_loss_clip": 1.01186538, "balance_loss_mlp": 1.02274656, "epoch": 0.35845483240643317, "flos": 34239216551040.0, "grad_norm": 1.718158708216115, "language_loss": 0.72130698, "learning_rate": 2.971081721591294e-06, "loss": 0.74241865, "num_input_tokens_seen": 128118605, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.44726562, "step": 5962, "time_per_iteration": 2.536874294281006 }, { "auxiliary_loss_clip": 0.0106359, "auxiliary_loss_mlp": 0.01044537, "balance_loss_clip": 1.01788211, "balance_loss_mlp": 1.02134705, "epoch": 0.35851495565910113, "flos": 20959023945600.0, "grad_norm": 1.9110352712841838, "language_loss": 0.75610709, "learning_rate": 2.9707412303373716e-06, "loss": 0.77718836, "num_input_tokens_seen": 128139205, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.421875, "step": 5963, "time_per_iteration": 2.386296272277832 }, { "auxiliary_loss_clip": 0.0106565, "auxiliary_loss_mlp": 0.01043935, "balance_loss_clip": 1.01377559, "balance_loss_mlp": 1.02154005, "epoch": 0.35857507891176915, "flos": 22308766174080.0, "grad_norm": 2.812090751538277, "language_loss": 0.80012524, "learning_rate": 2.9704007022728447e-06, "loss": 0.82122111, "num_input_tokens_seen": 128158765, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.44140625, "step": 5964, "time_per_iteration": 2.445861339569092 }, { "auxiliary_loss_clip": 0.01066114, "auxiliary_loss_mlp": 0.01053269, "balance_loss_clip": 1.02046275, "balance_loss_mlp": 1.02024233, "epoch": 0.3586352021644371, "flos": 23366739237120.0, "grad_norm": 1.7965609813693006, "language_loss": 0.67619348, "learning_rate": 2.970060137410626e-06, "loss": 0.69738734, "num_input_tokens_seen": 128177850, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.45898438, "step": 5965, "time_per_iteration": 2.406238317489624 }, { "auxiliary_loss_clip": 0.01065276, "auxiliary_loss_mlp": 0.01052499, "balance_loss_clip": 1.02014589, "balance_loss_mlp": 1.02156281, "epoch": 0.3586953254171051, "flos": 27848156042880.0, "grad_norm": 1.8118013127259531, "language_loss": 0.79856896, "learning_rate": 2.9697195357636294e-06, "loss": 0.81974673, "num_input_tokens_seen": 128196925, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.4375, "step": 5966, "time_per_iteration": 2.4770190715789795 }, { "auxiliary_loss_clip": 0.01064513, "auxiliary_loss_mlp": 0.01063297, "balance_loss_clip": 1.02916765, "balance_loss_mlp": 1.01951218, "epoch": 0.35875544866977305, "flos": 19499479891200.0, "grad_norm": 1.8985582454095418, "language_loss": 0.92483246, "learning_rate": 2.9693788973447715e-06, "loss": 0.94611055, "num_input_tokens_seen": 128213955, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.44921875, "step": 5967, "time_per_iteration": 2.3709988594055176 }, { "auxiliary_loss_clip": 0.01067191, "auxiliary_loss_mlp": 0.01060639, "balance_loss_clip": 1.02821398, "balance_loss_mlp": 1.0209744, "epoch": 0.358815571922441, "flos": 21470047649280.0, "grad_norm": 2.194333888765889, "language_loss": 0.82124805, "learning_rate": 2.9690382221669682e-06, "loss": 0.84252632, "num_input_tokens_seen": 128232980, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.46289062, "step": 5968, "time_per_iteration": 2.4276368618011475 }, { "auxiliary_loss_clip": 0.01066325, "auxiliary_loss_mlp": 0.01066301, "balance_loss_clip": 1.03165925, "balance_loss_mlp": 1.01982462, "epoch": 0.358875695175109, "flos": 21834331441920.0, "grad_norm": 2.4259509985793333, "language_loss": 0.85823482, "learning_rate": 2.9686975102431384e-06, "loss": 0.87956107, "num_input_tokens_seen": 128252795, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.46484375, "step": 5969, "time_per_iteration": 2.381028652191162 }, { "auxiliary_loss_clip": 0.01064737, "auxiliary_loss_mlp": 0.01045604, "balance_loss_clip": 1.01558721, "balance_loss_mlp": 1.0206995, "epoch": 0.35893581842777694, "flos": 32010361488000.0, "grad_norm": 1.695335481041504, "language_loss": 0.72954571, "learning_rate": 2.968356761586202e-06, "loss": 0.75064909, "num_input_tokens_seen": 128273115, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.43945312, "step": 5970, "time_per_iteration": 2.5112342834472656 }, { "auxiliary_loss_clip": 0.01061994, "auxiliary_loss_mlp": 0.01046148, "balance_loss_clip": 1.0158453, "balance_loss_mlp": 1.01860046, "epoch": 0.3589959416804449, "flos": 20484763770240.0, "grad_norm": 1.7517814846282334, "language_loss": 0.81432605, "learning_rate": 2.9680159762090805e-06, "loss": 0.8354075, "num_input_tokens_seen": 128292220, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.43359375, "step": 5971, "time_per_iteration": 2.3918800354003906 }, { "auxiliary_loss_clip": 0.01065662, "auxiliary_loss_mlp": 0.01046927, "balance_loss_clip": 1.01526546, "balance_loss_mlp": 1.01912987, "epoch": 0.3590560649331129, "flos": 16179728486400.0, "grad_norm": 2.048734967973474, "language_loss": 0.80042195, "learning_rate": 2.967675154124696e-06, "loss": 0.82154787, "num_input_tokens_seen": 128310305, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.46484375, "step": 5972, "time_per_iteration": 2.3790743350982666 }, { "auxiliary_loss_clip": 0.01063872, "auxiliary_loss_mlp": 0.01049268, "balance_loss_clip": 1.01751053, "balance_loss_mlp": 1.01885724, "epoch": 0.35911618818578084, "flos": 20374368451200.0, "grad_norm": 2.189375427553425, "language_loss": 0.82083088, "learning_rate": 2.9673342953459722e-06, "loss": 0.84196228, "num_input_tokens_seen": 128328305, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.44921875, "step": 5973, "time_per_iteration": 2.3881454467773438 }, { "auxiliary_loss_clip": 0.01014969, "auxiliary_loss_mlp": 0.0100434, "balance_loss_clip": 1.00158679, "balance_loss_mlp": 1.00470555, "epoch": 0.3591763114384488, "flos": 41234308358400.0, "grad_norm": 0.9160861868809462, "language_loss": 0.56810904, "learning_rate": 2.9669933998858355e-06, "loss": 0.58830214, "num_input_tokens_seen": 128378380, "router_z_loss_clip": 0.02758789, "router_z_loss_mlp": 0.10253906, "step": 5974, "time_per_iteration": 2.908655881881714 }, { "auxiliary_loss_clip": 0.0106577, "auxiliary_loss_mlp": 0.0105349, "balance_loss_clip": 1.02275801, "balance_loss_mlp": 1.02006531, "epoch": 0.35923643469111677, "flos": 18694522517760.0, "grad_norm": 1.8223911727452269, "language_loss": 0.70773208, "learning_rate": 2.9666524677572114e-06, "loss": 0.72892463, "num_input_tokens_seen": 128394315, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.45703125, "step": 5975, "time_per_iteration": 2.361236572265625 }, { "auxiliary_loss_clip": 0.01064672, "auxiliary_loss_mlp": 0.01053179, "balance_loss_clip": 1.02353132, "balance_loss_mlp": 1.01989007, "epoch": 0.35929655794378473, "flos": 25008774301440.0, "grad_norm": 1.7324167990027837, "language_loss": 0.80952322, "learning_rate": 2.96631149897303e-06, "loss": 0.83070171, "num_input_tokens_seen": 128414515, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.44726562, "step": 5976, "time_per_iteration": 2.424760580062866 }, { "auxiliary_loss_clip": 0.01063806, "auxiliary_loss_mlp": 0.01052682, "balance_loss_clip": 1.02037668, "balance_loss_mlp": 1.0189147, "epoch": 0.35935668119645275, "flos": 14974701310080.0, "grad_norm": 1.9522397669206484, "language_loss": 0.79895121, "learning_rate": 2.9659704935462194e-06, "loss": 0.82011604, "num_input_tokens_seen": 128430615, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.44921875, "step": 5977, "time_per_iteration": 2.3556342124938965 }, { "auxiliary_loss_clip": 0.0106475, "auxiliary_loss_mlp": 0.01050065, "balance_loss_clip": 1.02140665, "balance_loss_mlp": 1.02046311, "epoch": 0.3594168044491207, "flos": 21177091497600.0, "grad_norm": 2.734496974948363, "language_loss": 0.81951249, "learning_rate": 2.9656294514897102e-06, "loss": 0.84066057, "num_input_tokens_seen": 128449480, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.44140625, "step": 5978, "time_per_iteration": 2.4259114265441895 }, { "auxiliary_loss_clip": 0.0106587, "auxiliary_loss_mlp": 0.01054655, "balance_loss_clip": 1.02228928, "balance_loss_mlp": 1.02003312, "epoch": 0.3594769277017887, "flos": 27670936648320.0, "grad_norm": 1.5229743504862585, "language_loss": 0.68062621, "learning_rate": 2.965288372816436e-06, "loss": 0.70183146, "num_input_tokens_seen": 128471465, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.45703125, "step": 5979, "time_per_iteration": 2.4538497924804688 }, { "auxiliary_loss_clip": 0.01065174, "auxiliary_loss_mlp": 0.01060736, "balance_loss_clip": 1.03061152, "balance_loss_mlp": 1.02009785, "epoch": 0.35953705095445665, "flos": 23001233546880.0, "grad_norm": 2.315164296365949, "language_loss": 0.68840331, "learning_rate": 2.9649472575393296e-06, "loss": 0.70966244, "num_input_tokens_seen": 128490645, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.45117188, "step": 5980, "time_per_iteration": 2.416557550430298 }, { "auxiliary_loss_clip": 0.01069336, "auxiliary_loss_mlp": 0.01059082, "balance_loss_clip": 1.02494025, "balance_loss_mlp": 1.02145433, "epoch": 0.3595971742071246, "flos": 25512990289920.0, "grad_norm": 1.6848261917683713, "language_loss": 0.72430789, "learning_rate": 2.964606105671327e-06, "loss": 0.74559212, "num_input_tokens_seen": 128510225, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.47851562, "step": 5981, "time_per_iteration": 2.4358015060424805 }, { "auxiliary_loss_clip": 0.0106748, "auxiliary_loss_mlp": 0.01060661, "balance_loss_clip": 1.02635241, "balance_loss_mlp": 1.02184319, "epoch": 0.3596572974597926, "flos": 29861247703680.0, "grad_norm": 1.700784967385501, "language_loss": 0.72277397, "learning_rate": 2.9642649172253635e-06, "loss": 0.74405539, "num_input_tokens_seen": 128530195, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.45703125, "step": 5982, "time_per_iteration": 2.4497992992401123 }, { "auxiliary_loss_clip": 0.0106492, "auxiliary_loss_mlp": 0.01052304, "balance_loss_clip": 1.02307439, "balance_loss_mlp": 1.02184844, "epoch": 0.35971742071246054, "flos": 23111419397760.0, "grad_norm": 1.8796246492144724, "language_loss": 0.76857209, "learning_rate": 2.9639236922143786e-06, "loss": 0.78974438, "num_input_tokens_seen": 128549990, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4296875, "step": 5983, "time_per_iteration": 2.4587509632110596 }, { "auxiliary_loss_clip": 0.01070278, "auxiliary_loss_mlp": 0.01053895, "balance_loss_clip": 1.0179894, "balance_loss_mlp": 1.02212763, "epoch": 0.3597775439651285, "flos": 16724478430080.0, "grad_norm": 1.7392171900075237, "language_loss": 0.77354467, "learning_rate": 2.96358243065131e-06, "loss": 0.79478633, "num_input_tokens_seen": 128567925, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.48242188, "step": 5984, "time_per_iteration": 2.357228994369507 }, { "auxiliary_loss_clip": 0.01066758, "auxiliary_loss_mlp": 0.01054138, "balance_loss_clip": 1.02172458, "balance_loss_mlp": 1.02158594, "epoch": 0.3598376672177965, "flos": 19718455138560.0, "grad_norm": 1.7184436444566886, "language_loss": 0.87551719, "learning_rate": 2.9632411325490993e-06, "loss": 0.89672613, "num_input_tokens_seen": 128585655, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.453125, "step": 5985, "time_per_iteration": 2.4332563877105713 }, { "auxiliary_loss_clip": 0.01066234, "auxiliary_loss_mlp": 0.01047461, "balance_loss_clip": 1.0178256, "balance_loss_mlp": 1.02180851, "epoch": 0.35989779047046444, "flos": 17310565290240.0, "grad_norm": 1.588851209134777, "language_loss": 0.73748875, "learning_rate": 2.9628997979206884e-06, "loss": 0.75862569, "num_input_tokens_seen": 128604820, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4453125, "step": 5986, "time_per_iteration": 2.3704192638397217 }, { "auxiliary_loss_clip": 0.01070095, "auxiliary_loss_mlp": 0.01055168, "balance_loss_clip": 1.01890421, "balance_loss_mlp": 1.0218972, "epoch": 0.3599579137231324, "flos": 22710127697280.0, "grad_norm": 1.7906121574254825, "language_loss": 0.75163788, "learning_rate": 2.9625584267790204e-06, "loss": 0.77289051, "num_input_tokens_seen": 128623070, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.48046875, "step": 5987, "time_per_iteration": 2.4077744483947754 }, { "auxiliary_loss_clip": 0.01069798, "auxiliary_loss_mlp": 0.01048609, "balance_loss_clip": 1.0147301, "balance_loss_mlp": 1.02218843, "epoch": 0.36001803697580037, "flos": 20958814477440.0, "grad_norm": 1.7819949714729768, "language_loss": 0.71182537, "learning_rate": 2.9622170191370404e-06, "loss": 0.73300946, "num_input_tokens_seen": 128642430, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.4765625, "step": 5988, "time_per_iteration": 2.37961745262146 }, { "auxiliary_loss_clip": 0.01070509, "auxiliary_loss_mlp": 0.01049388, "balance_loss_clip": 1.01623583, "balance_loss_mlp": 1.02223706, "epoch": 0.36007816022846834, "flos": 20484519390720.0, "grad_norm": 2.5930777769272395, "language_loss": 0.74443376, "learning_rate": 2.9618755750076953e-06, "loss": 0.76563275, "num_input_tokens_seen": 128661285, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.48242188, "step": 5989, "time_per_iteration": 2.392566680908203 }, { "auxiliary_loss_clip": 0.01063388, "auxiliary_loss_mlp": 0.01050184, "balance_loss_clip": 1.01940393, "balance_loss_mlp": 1.01888919, "epoch": 0.36013828348113636, "flos": 27999993012480.0, "grad_norm": 1.4835589710238526, "language_loss": 0.80874628, "learning_rate": 2.961534094403931e-06, "loss": 0.82988203, "num_input_tokens_seen": 128682210, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4453125, "step": 5990, "time_per_iteration": 3.9099249839782715 }, { "auxiliary_loss_clip": 0.01067808, "auxiliary_loss_mlp": 0.01052296, "balance_loss_clip": 1.01809454, "balance_loss_mlp": 1.02162313, "epoch": 0.3601984067338043, "flos": 20081202831360.0, "grad_norm": 1.7890676150293896, "language_loss": 0.85160816, "learning_rate": 2.961192577338698e-06, "loss": 0.87280917, "num_input_tokens_seen": 128700445, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.4609375, "step": 5991, "time_per_iteration": 2.374112844467163 }, { "auxiliary_loss_clip": 0.01068916, "auxiliary_loss_mlp": 0.01055656, "balance_loss_clip": 1.01917791, "balance_loss_mlp": 1.02115655, "epoch": 0.3602585299864723, "flos": 18616806097920.0, "grad_norm": 2.0679591543757243, "language_loss": 0.77003568, "learning_rate": 2.9608510238249463e-06, "loss": 0.79128146, "num_input_tokens_seen": 128716855, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.4765625, "step": 5992, "time_per_iteration": 2.407437562942505 }, { "auxiliary_loss_clip": 0.01063593, "auxiliary_loss_mlp": 0.01053041, "balance_loss_clip": 1.02025867, "balance_loss_mlp": 1.01918459, "epoch": 0.36031865323914025, "flos": 19571994518400.0, "grad_norm": 2.0842862370685435, "language_loss": 0.79251701, "learning_rate": 2.960509433875627e-06, "loss": 0.81368327, "num_input_tokens_seen": 128735835, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.44335938, "step": 5993, "time_per_iteration": 2.4100444316864014 }, { "auxiliary_loss_clip": 0.010701, "auxiliary_loss_mlp": 0.01055436, "balance_loss_clip": 1.01993513, "balance_loss_mlp": 1.02180481, "epoch": 0.3603787764918082, "flos": 17489739720960.0, "grad_norm": 2.3676338883182977, "language_loss": 0.75961196, "learning_rate": 2.9601678075036943e-06, "loss": 0.78086734, "num_input_tokens_seen": 128752465, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.48242188, "step": 5994, "time_per_iteration": 3.905600070953369 }, { "auxiliary_loss_clip": 0.01067679, "auxiliary_loss_mlp": 0.01056967, "balance_loss_clip": 1.02108502, "balance_loss_mlp": 1.02043438, "epoch": 0.3604388997444762, "flos": 15522488542080.0, "grad_norm": 1.7629234950078223, "language_loss": 0.71027625, "learning_rate": 2.9598261447221024e-06, "loss": 0.73152268, "num_input_tokens_seen": 128770865, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.47265625, "step": 5995, "time_per_iteration": 3.783785820007324 }, { "auxiliary_loss_clip": 0.01069679, "auxiliary_loss_mlp": 0.01057622, "balance_loss_clip": 1.02033377, "balance_loss_mlp": 1.02088356, "epoch": 0.36049902299714415, "flos": 17309936885760.0, "grad_norm": 1.8406735319765823, "language_loss": 0.83927298, "learning_rate": 2.9594844455438057e-06, "loss": 0.86054599, "num_input_tokens_seen": 128789730, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.48828125, "step": 5996, "time_per_iteration": 2.3577911853790283 }, { "auxiliary_loss_clip": 0.01066854, "auxiliary_loss_mlp": 0.01052389, "balance_loss_clip": 1.01954734, "balance_loss_mlp": 1.02068782, "epoch": 0.3605591462498121, "flos": 17055070894080.0, "grad_norm": 1.5776884139582494, "language_loss": 0.74332714, "learning_rate": 2.959142709981763e-06, "loss": 0.76451951, "num_input_tokens_seen": 128806610, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.4609375, "step": 5997, "time_per_iteration": 2.374495267868042 }, { "auxiliary_loss_clip": 0.01064596, "auxiliary_loss_mlp": 0.01052594, "balance_loss_clip": 1.02004993, "balance_loss_mlp": 1.01939976, "epoch": 0.3606192695024801, "flos": 16835921089920.0, "grad_norm": 2.651191356984711, "language_loss": 0.71166515, "learning_rate": 2.9588009380489337e-06, "loss": 0.73283702, "num_input_tokens_seen": 128824830, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.453125, "step": 5998, "time_per_iteration": 2.3559012413024902 }, { "auxiliary_loss_clip": 0.01066578, "auxiliary_loss_mlp": 0.01049578, "balance_loss_clip": 1.01567507, "balance_loss_mlp": 1.02029157, "epoch": 0.36067939275514804, "flos": 12128860967040.0, "grad_norm": 2.4768192273040612, "language_loss": 0.79070413, "learning_rate": 2.9584591297582758e-06, "loss": 0.81186563, "num_input_tokens_seen": 128838170, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.46289062, "step": 5999, "time_per_iteration": 2.358144521713257 }, { "auxiliary_loss_clip": 0.01068347, "auxiliary_loss_mlp": 0.01056508, "balance_loss_clip": 1.02391613, "balance_loss_mlp": 1.02092934, "epoch": 0.360739516007816, "flos": 18040459507200.0, "grad_norm": 1.743335082232669, "language_loss": 0.79624999, "learning_rate": 2.9581172851227516e-06, "loss": 0.81749856, "num_input_tokens_seen": 128855625, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.47265625, "step": 6000, "time_per_iteration": 2.4855520725250244 }, { "auxiliary_loss_clip": 0.01065314, "auxiliary_loss_mlp": 0.01052557, "balance_loss_clip": 1.01839125, "balance_loss_mlp": 1.01975477, "epoch": 0.360799639260484, "flos": 18548864858880.0, "grad_norm": 1.6347947278559263, "language_loss": 0.79332066, "learning_rate": 2.9577754041553243e-06, "loss": 0.81449938, "num_input_tokens_seen": 128873540, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.45507812, "step": 6001, "time_per_iteration": 2.3546643257141113 }, { "auxiliary_loss_clip": 0.01065188, "auxiliary_loss_mlp": 0.01042542, "balance_loss_clip": 1.01195335, "balance_loss_mlp": 1.02053404, "epoch": 0.36085976251315194, "flos": 19681028294400.0, "grad_norm": 2.7908463790166635, "language_loss": 0.84233183, "learning_rate": 2.9574334868689575e-06, "loss": 0.8634091, "num_input_tokens_seen": 128889925, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.4453125, "step": 6002, "time_per_iteration": 2.3881120681762695 }, { "auxiliary_loss_clip": 0.01062484, "auxiliary_loss_mlp": 0.01047577, "balance_loss_clip": 1.01669002, "balance_loss_mlp": 1.01905847, "epoch": 0.3609198857658199, "flos": 24198021642240.0, "grad_norm": 2.3355286927910193, "language_loss": 0.92213798, "learning_rate": 2.9570915332766165e-06, "loss": 0.94323862, "num_input_tokens_seen": 128906890, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.43359375, "step": 6003, "time_per_iteration": 2.379826784133911 }, { "auxiliary_loss_clip": 0.01021841, "auxiliary_loss_mlp": 0.01006049, "balance_loss_clip": 1.00261593, "balance_loss_mlp": 1.01119137, "epoch": 0.3609800090184879, "flos": 57112946346240.0, "grad_norm": 0.9664460917388455, "language_loss": 0.53413963, "learning_rate": 2.9567495433912693e-06, "loss": 0.55441856, "num_input_tokens_seen": 128965940, "router_z_loss_clip": 0.03442383, "router_z_loss_mlp": 0.10644531, "step": 6004, "time_per_iteration": 2.9869070053100586 }, { "auxiliary_loss_clip": 0.01068101, "auxiliary_loss_mlp": 0.01053937, "balance_loss_clip": 1.01617146, "balance_loss_mlp": 1.02059591, "epoch": 0.3610401322711559, "flos": 20810259175680.0, "grad_norm": 1.7901569410440712, "language_loss": 0.78828871, "learning_rate": 2.956407517225883e-06, "loss": 0.80950916, "num_input_tokens_seen": 128985835, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 0.47265625, "step": 6005, "time_per_iteration": 2.408513069152832 }, { "auxiliary_loss_clip": 0.01066106, "auxiliary_loss_mlp": 0.01047764, "balance_loss_clip": 1.01514864, "balance_loss_mlp": 1.02110124, "epoch": 0.36110025552382385, "flos": 13698311581440.0, "grad_norm": 1.9245382792614767, "language_loss": 0.799564, "learning_rate": 2.956065454793429e-06, "loss": 0.82070267, "num_input_tokens_seen": 129003120, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.44921875, "step": 6006, "time_per_iteration": 2.3796803951263428 }, { "auxiliary_loss_clip": 0.01068586, "auxiliary_loss_mlp": 0.01056608, "balance_loss_clip": 1.01855588, "balance_loss_mlp": 1.02148604, "epoch": 0.3611603787764918, "flos": 22453935073920.0, "grad_norm": 1.8154335021057288, "language_loss": 0.85952306, "learning_rate": 2.955723356106876e-06, "loss": 0.88077497, "num_input_tokens_seen": 129021645, "router_z_loss_clip": 0.38085938, "router_z_loss_mlp": 0.47070312, "step": 6007, "time_per_iteration": 2.400137186050415 }, { "auxiliary_loss_clip": 0.01071155, "auxiliary_loss_mlp": 0.01052984, "balance_loss_clip": 1.01452708, "balance_loss_mlp": 1.02083087, "epoch": 0.3612205020291598, "flos": 20885601623040.0, "grad_norm": 3.2168944122961833, "language_loss": 0.74254429, "learning_rate": 2.955381221179198e-06, "loss": 0.76378572, "num_input_tokens_seen": 129038375, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 0.5, "step": 6008, "time_per_iteration": 2.4336371421813965 }, { "auxiliary_loss_clip": 0.01065501, "auxiliary_loss_mlp": 0.01053115, "balance_loss_clip": 1.01880622, "balance_loss_mlp": 1.02067256, "epoch": 0.36128062528182775, "flos": 15741079764480.0, "grad_norm": 2.8965197859536005, "language_loss": 0.84822464, "learning_rate": 2.955039050023368e-06, "loss": 0.86941087, "num_input_tokens_seen": 129056235, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.44921875, "step": 6009, "time_per_iteration": 2.3686957359313965 }, { "auxiliary_loss_clip": 0.01067681, "auxiliary_loss_mlp": 0.01051751, "balance_loss_clip": 1.01813424, "balance_loss_mlp": 1.02122915, "epoch": 0.3613407485344957, "flos": 16763546108160.0, "grad_norm": 1.874431776735353, "language_loss": 0.78563225, "learning_rate": 2.954696842652362e-06, "loss": 0.80682659, "num_input_tokens_seen": 129072405, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.46484375, "step": 6010, "time_per_iteration": 2.3846652507781982 }, { "auxiliary_loss_clip": 0.01067735, "auxiliary_loss_mlp": 0.01049969, "balance_loss_clip": 1.0168761, "balance_loss_mlp": 1.02130866, "epoch": 0.3614008717871637, "flos": 20370283822080.0, "grad_norm": 1.60495570058458, "language_loss": 0.83838862, "learning_rate": 2.9543545990791554e-06, "loss": 0.85956562, "num_input_tokens_seen": 129090225, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.46289062, "step": 6011, "time_per_iteration": 2.4071991443634033 }, { "auxiliary_loss_clip": 0.01070637, "auxiliary_loss_mlp": 0.01056015, "balance_loss_clip": 1.01937056, "balance_loss_mlp": 1.0223186, "epoch": 0.36146099503983165, "flos": 22775764786560.0, "grad_norm": 1.9155660515828825, "language_loss": 0.6373263, "learning_rate": 2.954012319316727e-06, "loss": 0.65859282, "num_input_tokens_seen": 129107685, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.48242188, "step": 6012, "time_per_iteration": 2.4455463886260986 }, { "auxiliary_loss_clip": 0.01062819, "auxiliary_loss_mlp": 0.01057649, "balance_loss_clip": 1.02582002, "balance_loss_mlp": 1.01888442, "epoch": 0.3615211182924996, "flos": 22995717552000.0, "grad_norm": 1.854027394348883, "language_loss": 0.85233831, "learning_rate": 2.9536700033780565e-06, "loss": 0.87354302, "num_input_tokens_seen": 129125315, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.43945312, "step": 6013, "time_per_iteration": 2.40134596824646 }, { "auxiliary_loss_clip": 0.01064926, "auxiliary_loss_mlp": 0.01050441, "balance_loss_clip": 1.01532197, "balance_loss_mlp": 1.01918006, "epoch": 0.3615812415451676, "flos": 16647320592000.0, "grad_norm": 1.6979708046258233, "language_loss": 0.92642605, "learning_rate": 2.9533276512761228e-06, "loss": 0.94757968, "num_input_tokens_seen": 129141600, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.45703125, "step": 6014, "time_per_iteration": 2.3910655975341797 }, { "auxiliary_loss_clip": 0.01063371, "auxiliary_loss_mlp": 0.01048159, "balance_loss_clip": 1.01518571, "balance_loss_mlp": 1.01853967, "epoch": 0.36164136479783554, "flos": 21319153286400.0, "grad_norm": 1.8960365431521347, "language_loss": 0.75032294, "learning_rate": 2.95298526302391e-06, "loss": 0.77143824, "num_input_tokens_seen": 129160665, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.44921875, "step": 6015, "time_per_iteration": 2.4150164127349854 }, { "auxiliary_loss_clip": 0.01067279, "auxiliary_loss_mlp": 0.0105135, "balance_loss_clip": 1.01830471, "balance_loss_mlp": 1.01988709, "epoch": 0.3617014880505035, "flos": 24168449854080.0, "grad_norm": 1.8021535183297066, "language_loss": 0.67053616, "learning_rate": 2.9526428386344e-06, "loss": 0.69172251, "num_input_tokens_seen": 129179220, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.47460938, "step": 6016, "time_per_iteration": 2.4747092723846436 }, { "auxiliary_loss_clip": 0.01069097, "auxiliary_loss_mlp": 0.01059103, "balance_loss_clip": 1.01962101, "balance_loss_mlp": 1.0212388, "epoch": 0.3617616113031715, "flos": 39013414951680.0, "grad_norm": 1.8483925414323463, "language_loss": 0.73043448, "learning_rate": 2.9523003781205785e-06, "loss": 0.75171649, "num_input_tokens_seen": 129200385, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 0.47851562, "step": 6017, "time_per_iteration": 2.552084445953369 }, { "auxiliary_loss_clip": 0.01068711, "auxiliary_loss_mlp": 0.01052887, "balance_loss_clip": 1.01807785, "balance_loss_mlp": 1.02026224, "epoch": 0.3618217345558395, "flos": 12130013041920.0, "grad_norm": 2.0411449496127942, "language_loss": 0.75059456, "learning_rate": 2.9519578814954307e-06, "loss": 0.77181053, "num_input_tokens_seen": 129217395, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.48632812, "step": 6018, "time_per_iteration": 2.3631505966186523 }, { "auxiliary_loss_clip": 0.01062903, "auxiliary_loss_mlp": 0.01050017, "balance_loss_clip": 1.016662, "balance_loss_mlp": 1.01910746, "epoch": 0.36188185780850746, "flos": 24933885701760.0, "grad_norm": 1.8574084016516483, "language_loss": 0.70084006, "learning_rate": 2.9516153487719448e-06, "loss": 0.72196925, "num_input_tokens_seen": 129238940, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.4375, "step": 6019, "time_per_iteration": 2.449065923690796 }, { "auxiliary_loss_clip": 0.01070801, "auxiliary_loss_mlp": 0.01054784, "balance_loss_clip": 1.01928365, "balance_loss_mlp": 1.02175879, "epoch": 0.3619419810611754, "flos": 20957802048000.0, "grad_norm": 1.5689644611582465, "language_loss": 0.77784818, "learning_rate": 2.95127277996311e-06, "loss": 0.79910398, "num_input_tokens_seen": 129258240, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.49023438, "step": 6020, "time_per_iteration": 2.4063119888305664 }, { "auxiliary_loss_clip": 0.01068636, "auxiliary_loss_mlp": 0.01058916, "balance_loss_clip": 1.02103162, "balance_loss_mlp": 1.02161789, "epoch": 0.3620021043138434, "flos": 22527776332800.0, "grad_norm": 1.9400272119151518, "language_loss": 0.74886525, "learning_rate": 2.9509301750819156e-06, "loss": 0.77014077, "num_input_tokens_seen": 129279040, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 0.47070312, "step": 6021, "time_per_iteration": 2.4168219566345215 }, { "auxiliary_loss_clip": 0.01066231, "auxiliary_loss_mlp": 0.01053458, "balance_loss_clip": 1.0185771, "balance_loss_mlp": 1.01976502, "epoch": 0.36206222756651135, "flos": 15595771219200.0, "grad_norm": 2.5642410008152576, "language_loss": 0.82155049, "learning_rate": 2.9505875341413533e-06, "loss": 0.84274733, "num_input_tokens_seen": 129295415, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.46484375, "step": 6022, "time_per_iteration": 2.415053606033325 }, { "auxiliary_loss_clip": 0.01065803, "auxiliary_loss_mlp": 0.01052201, "balance_loss_clip": 1.01939464, "balance_loss_mlp": 1.02084279, "epoch": 0.3621223508191793, "flos": 23586028686720.0, "grad_norm": 1.5152122707145106, "language_loss": 0.82923388, "learning_rate": 2.950244857154417e-06, "loss": 0.85041398, "num_input_tokens_seen": 129312620, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.44921875, "step": 6023, "time_per_iteration": 2.4238691329956055 }, { "auxiliary_loss_clip": 0.01069081, "auxiliary_loss_mlp": 0.0105603, "balance_loss_clip": 1.02031541, "balance_loss_mlp": 1.020661, "epoch": 0.3621824740718473, "flos": 22308801085440.0, "grad_norm": 1.826065443629867, "language_loss": 0.81824923, "learning_rate": 2.9499021441341e-06, "loss": 0.83950031, "num_input_tokens_seen": 129331825, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.48632812, "step": 6024, "time_per_iteration": 2.418917655944824 }, { "auxiliary_loss_clip": 0.01064743, "auxiliary_loss_mlp": 0.01049872, "balance_loss_clip": 1.01842403, "balance_loss_mlp": 1.0209198, "epoch": 0.36224259732451525, "flos": 16762708235520.0, "grad_norm": 2.295432912708728, "language_loss": 0.75984889, "learning_rate": 2.9495593950933997e-06, "loss": 0.78099501, "num_input_tokens_seen": 129350400, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.43945312, "step": 6025, "time_per_iteration": 2.3947601318359375 }, { "auxiliary_loss_clip": 0.01065909, "auxiliary_loss_mlp": 0.01050213, "balance_loss_clip": 1.01880181, "balance_loss_mlp": 1.02104068, "epoch": 0.3623027205771832, "flos": 23148601862400.0, "grad_norm": 1.5840849188232182, "language_loss": 0.73856223, "learning_rate": 2.9492166100453107e-06, "loss": 0.75972342, "num_input_tokens_seen": 129371155, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.44921875, "step": 6026, "time_per_iteration": 2.4243900775909424 }, { "auxiliary_loss_clip": 0.01072416, "auxiliary_loss_mlp": 0.01063836, "balance_loss_clip": 1.02561736, "balance_loss_mlp": 1.02213848, "epoch": 0.3623628438298512, "flos": 28547884978560.0, "grad_norm": 1.985710687317433, "language_loss": 0.80281603, "learning_rate": 2.948873789002833e-06, "loss": 0.82417858, "num_input_tokens_seen": 129391230, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.50390625, "step": 6027, "time_per_iteration": 2.4597713947296143 }, { "auxiliary_loss_clip": 0.01069112, "auxiliary_loss_mlp": 0.0105266, "balance_loss_clip": 1.01656389, "balance_loss_mlp": 1.02115369, "epoch": 0.36242296708251914, "flos": 25483732704000.0, "grad_norm": 1.8167003319544601, "language_loss": 0.69465673, "learning_rate": 2.9485309319789667e-06, "loss": 0.71587443, "num_input_tokens_seen": 129410065, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.48046875, "step": 6028, "time_per_iteration": 2.4455742835998535 }, { "auxiliary_loss_clip": 0.01065494, "auxiliary_loss_mlp": 0.01046516, "balance_loss_clip": 1.01545012, "balance_loss_mlp": 1.02092052, "epoch": 0.3624830903351871, "flos": 16289425578240.0, "grad_norm": 2.5719377544118025, "language_loss": 0.8673234, "learning_rate": 2.9481880389867117e-06, "loss": 0.88844353, "num_input_tokens_seen": 129428655, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.4453125, "step": 6029, "time_per_iteration": 5.243697643280029 }, { "auxiliary_loss_clip": 0.0106436, "auxiliary_loss_mlp": 0.01046669, "balance_loss_clip": 1.01600814, "balance_loss_mlp": 1.02086985, "epoch": 0.36254321358785513, "flos": 18295325498880.0, "grad_norm": 1.564412714912511, "language_loss": 0.73721516, "learning_rate": 2.9478451100390714e-06, "loss": 0.75832546, "num_input_tokens_seen": 129447845, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.43554688, "step": 6030, "time_per_iteration": 2.355849266052246 }, { "auxiliary_loss_clip": 0.01069514, "auxiliary_loss_mlp": 0.01054315, "balance_loss_clip": 1.01705015, "balance_loss_mlp": 1.02105188, "epoch": 0.3626033368405231, "flos": 14864445636480.0, "grad_norm": 2.1051685295351374, "language_loss": 0.76701778, "learning_rate": 2.94750214514905e-06, "loss": 0.78825611, "num_input_tokens_seen": 129463275, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.484375, "step": 6031, "time_per_iteration": 2.413341522216797 }, { "auxiliary_loss_clip": 0.01064157, "auxiliary_loss_mlp": 0.01049704, "balance_loss_clip": 1.01719582, "balance_loss_mlp": 1.01966405, "epoch": 0.36266346009319106, "flos": 22305589240320.0, "grad_norm": 3.468221423645251, "language_loss": 0.74852788, "learning_rate": 2.9471591443296516e-06, "loss": 0.76966655, "num_input_tokens_seen": 129483205, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.4453125, "step": 6032, "time_per_iteration": 2.3793890476226807 }, { "auxiliary_loss_clip": 0.01066467, "auxiliary_loss_mlp": 0.01047569, "balance_loss_clip": 1.01600266, "balance_loss_mlp": 1.02003443, "epoch": 0.362723583345859, "flos": 18221379505920.0, "grad_norm": 3.4707212765275077, "language_loss": 0.79304862, "learning_rate": 2.946816107593884e-06, "loss": 0.81418902, "num_input_tokens_seen": 129499885, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.46484375, "step": 6033, "time_per_iteration": 2.3979406356811523 }, { "auxiliary_loss_clip": 0.01014312, "auxiliary_loss_mlp": 0.01011718, "balance_loss_clip": 1.00804591, "balance_loss_mlp": 1.00398135, "epoch": 0.362783706598527, "flos": 68495818959360.0, "grad_norm": 0.7970406028189138, "language_loss": 0.648929, "learning_rate": 2.9464730349547547e-06, "loss": 0.66918921, "num_input_tokens_seen": 129561885, "router_z_loss_clip": 0.03662109, "router_z_loss_mlp": 0.10351562, "step": 6034, "time_per_iteration": 4.411875247955322 }, { "auxiliary_loss_clip": 0.01063474, "auxiliary_loss_mlp": 0.01049987, "balance_loss_clip": 1.01665616, "balance_loss_mlp": 1.018538, "epoch": 0.36284382985119495, "flos": 26575432007040.0, "grad_norm": 1.64374318272035, "language_loss": 0.90672064, "learning_rate": 2.946129926425273e-06, "loss": 0.92785531, "num_input_tokens_seen": 129582325, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.44921875, "step": 6035, "time_per_iteration": 3.8427653312683105 }, { "auxiliary_loss_clip": 0.01067381, "auxiliary_loss_mlp": 0.01048833, "balance_loss_clip": 1.01457191, "balance_loss_mlp": 1.02002573, "epoch": 0.3629039531038629, "flos": 20155742317440.0, "grad_norm": 2.251950160992578, "language_loss": 0.74839061, "learning_rate": 2.9457867820184496e-06, "loss": 0.76955271, "num_input_tokens_seen": 129600350, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.47460938, "step": 6036, "time_per_iteration": 2.397188425064087 }, { "auxiliary_loss_clip": 0.01067143, "auxiliary_loss_mlp": 0.01056364, "balance_loss_clip": 1.02055371, "balance_loss_mlp": 1.02056324, "epoch": 0.3629640763565309, "flos": 18624696065280.0, "grad_norm": 1.9179781810052645, "language_loss": 0.76948333, "learning_rate": 2.945443601747297e-06, "loss": 0.79071844, "num_input_tokens_seen": 129618425, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.46679688, "step": 6037, "time_per_iteration": 2.3758556842803955 }, { "auxiliary_loss_clip": 0.0106298, "auxiliary_loss_mlp": 0.01054975, "balance_loss_clip": 1.02311003, "balance_loss_mlp": 1.02056813, "epoch": 0.36302419960919885, "flos": 19570493329920.0, "grad_norm": 1.8352873530613996, "language_loss": 0.7927835, "learning_rate": 2.945100385624828e-06, "loss": 0.81396306, "num_input_tokens_seen": 129636750, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.42382812, "step": 6038, "time_per_iteration": 2.4023067951202393 }, { "auxiliary_loss_clip": 0.01014351, "auxiliary_loss_mlp": 0.0100412, "balance_loss_clip": 1.00080621, "balance_loss_mlp": 1.00455213, "epoch": 0.3630843228618668, "flos": 63794239920000.0, "grad_norm": 0.8418757327163344, "language_loss": 0.63619882, "learning_rate": 2.9447571336640573e-06, "loss": 0.65638357, "num_input_tokens_seen": 129699030, "router_z_loss_clip": 0.03320312, "router_z_loss_mlp": 0.09814453, "step": 6039, "time_per_iteration": 3.102839231491089 }, { "auxiliary_loss_clip": 0.01066149, "auxiliary_loss_mlp": 0.01056744, "balance_loss_clip": 1.02496231, "balance_loss_mlp": 1.02185643, "epoch": 0.3631444461145348, "flos": 21834087062400.0, "grad_norm": 2.1931429674313483, "language_loss": 0.72609472, "learning_rate": 2.944413845878002e-06, "loss": 0.74732363, "num_input_tokens_seen": 129717135, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.44335938, "step": 6040, "time_per_iteration": 2.3959462642669678 }, { "auxiliary_loss_clip": 0.01067871, "auxiliary_loss_mlp": 0.01052749, "balance_loss_clip": 1.01839328, "balance_loss_mlp": 1.02096641, "epoch": 0.36320456936720275, "flos": 21721073391360.0, "grad_norm": 1.6664630938019536, "language_loss": 0.82849962, "learning_rate": 2.9440705222796783e-06, "loss": 0.84970582, "num_input_tokens_seen": 129735940, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.46875, "step": 6041, "time_per_iteration": 2.4016969203948975 }, { "auxiliary_loss_clip": 0.01068418, "auxiliary_loss_mlp": 0.01052964, "balance_loss_clip": 1.01949024, "balance_loss_mlp": 1.02072299, "epoch": 0.3632646926198707, "flos": 17018132808960.0, "grad_norm": 2.006842735356281, "language_loss": 0.86020947, "learning_rate": 2.943727162882107e-06, "loss": 0.88142323, "num_input_tokens_seen": 129752790, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.4765625, "step": 6042, "time_per_iteration": 2.4044528007507324 }, { "auxiliary_loss_clip": 0.01064587, "auxiliary_loss_mlp": 0.01055291, "balance_loss_clip": 1.02353382, "balance_loss_mlp": 1.02026761, "epoch": 0.36332481587253873, "flos": 23330045531520.0, "grad_norm": 1.9146668490934713, "language_loss": 0.79420626, "learning_rate": 2.9433837676983064e-06, "loss": 0.81540513, "num_input_tokens_seen": 129773655, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.44335938, "step": 6043, "time_per_iteration": 2.409888505935669 }, { "auxiliary_loss_clip": 0.01062983, "auxiliary_loss_mlp": 0.01058853, "balance_loss_clip": 1.02530777, "balance_loss_mlp": 1.02033389, "epoch": 0.3633849391252067, "flos": 10742774146560.0, "grad_norm": 2.0777489442279045, "language_loss": 0.67665374, "learning_rate": 2.943040336741298e-06, "loss": 0.69787204, "num_input_tokens_seen": 129791605, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.42578125, "step": 6044, "time_per_iteration": 2.3758041858673096 }, { "auxiliary_loss_clip": 0.01064969, "auxiliary_loss_mlp": 0.01050608, "balance_loss_clip": 1.01858842, "balance_loss_mlp": 1.02146935, "epoch": 0.36344506237787466, "flos": 25847946673920.0, "grad_norm": 1.7118556862556429, "language_loss": 0.82494754, "learning_rate": 2.9426968700241066e-06, "loss": 0.84610331, "num_input_tokens_seen": 129811075, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.43554688, "step": 6045, "time_per_iteration": 2.451549768447876 }, { "auxiliary_loss_clip": 0.01066653, "auxiliary_loss_mlp": 0.01052679, "balance_loss_clip": 1.01996815, "balance_loss_mlp": 1.02171302, "epoch": 0.3635051856305426, "flos": 30152737578240.0, "grad_norm": 3.0986418979590997, "language_loss": 0.67059064, "learning_rate": 2.942353367559755e-06, "loss": 0.69178396, "num_input_tokens_seen": 129833755, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.44921875, "step": 6046, "time_per_iteration": 2.4797322750091553 }, { "auxiliary_loss_clip": 0.01065578, "auxiliary_loss_mlp": 0.01050735, "balance_loss_clip": 1.0187633, "balance_loss_mlp": 1.02054262, "epoch": 0.3635653088832106, "flos": 22197358425600.0, "grad_norm": 1.5701601978803301, "language_loss": 0.78448278, "learning_rate": 2.9420098293612692e-06, "loss": 0.80564582, "num_input_tokens_seen": 129854475, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.44921875, "step": 6047, "time_per_iteration": 2.439427137374878 }, { "auxiliary_loss_clip": 0.01069475, "auxiliary_loss_mlp": 0.01058973, "balance_loss_clip": 1.02361584, "balance_loss_mlp": 1.02075171, "epoch": 0.36362543213587856, "flos": 24785993715840.0, "grad_norm": 1.59592476816687, "language_loss": 0.80765581, "learning_rate": 2.9416662554416767e-06, "loss": 0.82894033, "num_input_tokens_seen": 129873530, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.48632812, "step": 6048, "time_per_iteration": 2.449002265930176 }, { "auxiliary_loss_clip": 0.01012747, "auxiliary_loss_mlp": 0.01013999, "balance_loss_clip": 1.0102793, "balance_loss_mlp": 1.00319743, "epoch": 0.3636855553885465, "flos": 62522877427200.0, "grad_norm": 0.768398638911346, "language_loss": 0.52602255, "learning_rate": 2.9413226458140054e-06, "loss": 0.54629004, "num_input_tokens_seen": 129940400, "router_z_loss_clip": 0.03710938, "router_z_loss_mlp": 0.09570312, "step": 6049, "time_per_iteration": 3.1021018028259277 }, { "auxiliary_loss_clip": 0.010673, "auxiliary_loss_mlp": 0.01053669, "balance_loss_clip": 1.02136385, "balance_loss_mlp": 1.0226531, "epoch": 0.3637456786412145, "flos": 24059520812160.0, "grad_norm": 1.8215225142766682, "language_loss": 0.88060814, "learning_rate": 2.9409790004912845e-06, "loss": 0.9018178, "num_input_tokens_seen": 129958635, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.44726562, "step": 6050, "time_per_iteration": 2.4546291828155518 }, { "auxiliary_loss_clip": 0.01065596, "auxiliary_loss_mlp": 0.0104751, "balance_loss_clip": 1.01704037, "balance_loss_mlp": 1.02206826, "epoch": 0.36380580189388245, "flos": 16690542721920.0, "grad_norm": 1.7905212725623627, "language_loss": 0.79699004, "learning_rate": 2.940635319486546e-06, "loss": 0.81812114, "num_input_tokens_seen": 129977685, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.43554688, "step": 6051, "time_per_iteration": 2.452287197113037 }, { "auxiliary_loss_clip": 0.01065189, "auxiliary_loss_mlp": 0.010461, "balance_loss_clip": 1.01722789, "balance_loss_mlp": 1.02158749, "epoch": 0.3638659251465504, "flos": 25113060132480.0, "grad_norm": 1.8044377090517456, "language_loss": 0.84169936, "learning_rate": 2.940291602812822e-06, "loss": 0.86281222, "num_input_tokens_seen": 129997530, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.43554688, "step": 6052, "time_per_iteration": 2.464834690093994 }, { "auxiliary_loss_clip": 0.01062815, "auxiliary_loss_mlp": 0.01041724, "balance_loss_clip": 1.01497316, "balance_loss_mlp": 1.02169895, "epoch": 0.3639260483992184, "flos": 23001896862720.0, "grad_norm": 1.6479627544398416, "language_loss": 0.73463857, "learning_rate": 2.939947850483145e-06, "loss": 0.75568402, "num_input_tokens_seen": 130017955, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.41015625, "step": 6053, "time_per_iteration": 2.4610178470611572 }, { "auxiliary_loss_clip": 0.01014951, "auxiliary_loss_mlp": 0.01003151, "balance_loss_clip": 1.00002789, "balance_loss_mlp": 1.00536585, "epoch": 0.36398617165188635, "flos": 70712839071360.0, "grad_norm": 0.7678438056466644, "language_loss": 0.61335182, "learning_rate": 2.9396040625105532e-06, "loss": 0.63353288, "num_input_tokens_seen": 130074275, "router_z_loss_clip": 0.03125, "router_z_loss_mlp": 0.09570312, "step": 6054, "time_per_iteration": 3.069837808609009 }, { "auxiliary_loss_clip": 0.01066553, "auxiliary_loss_mlp": 0.01059686, "balance_loss_clip": 1.02603269, "balance_loss_mlp": 1.02161443, "epoch": 0.3640462949045543, "flos": 22234401244800.0, "grad_norm": 1.9277682401823124, "language_loss": 0.77414942, "learning_rate": 2.9392602389080802e-06, "loss": 0.79541183, "num_input_tokens_seen": 130091375, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.44921875, "step": 6055, "time_per_iteration": 2.4518473148345947 }, { "auxiliary_loss_clip": 0.01064281, "auxiliary_loss_mlp": 0.01055458, "balance_loss_clip": 1.02424932, "balance_loss_mlp": 1.02099764, "epoch": 0.3641064181572223, "flos": 21542457542400.0, "grad_norm": 1.770214631831233, "language_loss": 0.76862454, "learning_rate": 2.938916379688765e-06, "loss": 0.78982198, "num_input_tokens_seen": 130111595, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.43164062, "step": 6056, "time_per_iteration": 2.400132894515991 }, { "auxiliary_loss_clip": 0.01064352, "auxiliary_loss_mlp": 0.01054139, "balance_loss_clip": 1.02443159, "balance_loss_mlp": 1.02101016, "epoch": 0.3641665414098903, "flos": 22272212113920.0, "grad_norm": 2.1215672124157017, "language_loss": 0.81835747, "learning_rate": 2.9385724848656468e-06, "loss": 0.83954245, "num_input_tokens_seen": 130131440, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.43359375, "step": 6057, "time_per_iteration": 2.414478302001953 }, { "auxiliary_loss_clip": 0.01065362, "auxiliary_loss_mlp": 0.01058418, "balance_loss_clip": 1.02744746, "balance_loss_mlp": 1.02142787, "epoch": 0.36422666466255826, "flos": 28328420972160.0, "grad_norm": 2.2866156848367676, "language_loss": 0.8198843, "learning_rate": 2.9382285544517647e-06, "loss": 0.84112215, "num_input_tokens_seen": 130151375, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.43945312, "step": 6058, "time_per_iteration": 2.4605324268341064 }, { "auxiliary_loss_clip": 0.01065325, "auxiliary_loss_mlp": 0.01064226, "balance_loss_clip": 1.03474534, "balance_loss_mlp": 1.02128923, "epoch": 0.36428678791522623, "flos": 24169357549440.0, "grad_norm": 1.6914433938816842, "language_loss": 0.86271828, "learning_rate": 2.9378845884601636e-06, "loss": 0.88401377, "num_input_tokens_seen": 130169960, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.44140625, "step": 6059, "time_per_iteration": 2.432072401046753 }, { "auxiliary_loss_clip": 0.01064811, "auxiliary_loss_mlp": 0.01060814, "balance_loss_clip": 1.02874684, "balance_loss_mlp": 1.02048981, "epoch": 0.3643469111678942, "flos": 22527357396480.0, "grad_norm": 1.4735670381864518, "language_loss": 0.88610202, "learning_rate": 2.937540586903884e-06, "loss": 0.90735829, "num_input_tokens_seen": 130189800, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.44335938, "step": 6060, "time_per_iteration": 2.415597915649414 }, { "auxiliary_loss_clip": 0.01065619, "auxiliary_loss_mlp": 0.01061941, "balance_loss_clip": 1.03197145, "balance_loss_mlp": 1.0219543, "epoch": 0.36440703442056216, "flos": 19425603720960.0, "grad_norm": 2.078039531231747, "language_loss": 0.68328512, "learning_rate": 2.937196549795971e-06, "loss": 0.70456076, "num_input_tokens_seen": 130206370, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.4375, "step": 6061, "time_per_iteration": 2.4130678176879883 }, { "auxiliary_loss_clip": 0.01065735, "auxiliary_loss_mlp": 0.01057996, "balance_loss_clip": 1.02673936, "balance_loss_mlp": 1.02128601, "epoch": 0.3644671576732301, "flos": 18039551811840.0, "grad_norm": 3.502665948180738, "language_loss": 0.78082329, "learning_rate": 2.9368524771494718e-06, "loss": 0.8020606, "num_input_tokens_seen": 130224445, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.4453125, "step": 6062, "time_per_iteration": 2.4070374965667725 }, { "auxiliary_loss_clip": 0.0106352, "auxiliary_loss_mlp": 0.01058834, "balance_loss_clip": 1.02748227, "balance_loss_mlp": 1.02019298, "epoch": 0.3645272809258981, "flos": 21541759315200.0, "grad_norm": 2.1379264398893145, "language_loss": 0.73841053, "learning_rate": 2.936508368977432e-06, "loss": 0.75963408, "num_input_tokens_seen": 130245380, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.43359375, "step": 6063, "time_per_iteration": 2.457726001739502 }, { "auxiliary_loss_clip": 0.01061718, "auxiliary_loss_mlp": 0.01056195, "balance_loss_clip": 1.0258801, "balance_loss_mlp": 1.01906681, "epoch": 0.36458740417856605, "flos": 22745774062080.0, "grad_norm": 1.8443569761998375, "language_loss": 0.69214797, "learning_rate": 2.936164225292901e-06, "loss": 0.71332711, "num_input_tokens_seen": 130265575, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.42578125, "step": 6064, "time_per_iteration": 2.4396913051605225 }, { "auxiliary_loss_clip": 0.0106547, "auxiliary_loss_mlp": 0.01062452, "balance_loss_clip": 1.03263831, "balance_loss_mlp": 1.02045178, "epoch": 0.364647527431234, "flos": 26139471459840.0, "grad_norm": 1.7593490485803371, "language_loss": 0.76793051, "learning_rate": 2.9358200461089297e-06, "loss": 0.78920984, "num_input_tokens_seen": 130286195, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.44921875, "step": 6065, "time_per_iteration": 2.455273151397705 }, { "auxiliary_loss_clip": 0.01066681, "auxiliary_loss_mlp": 0.01055564, "balance_loss_clip": 1.0236398, "balance_loss_mlp": 1.02045679, "epoch": 0.364707650683902, "flos": 31028568744960.0, "grad_norm": 1.925013973970139, "language_loss": 0.76474226, "learning_rate": 2.9354758314385676e-06, "loss": 0.78596473, "num_input_tokens_seen": 130306095, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.4609375, "step": 6066, "time_per_iteration": 2.506743907928467 }, { "auxiliary_loss_clip": 0.0106359, "auxiliary_loss_mlp": 0.0105237, "balance_loss_clip": 1.02513051, "balance_loss_mlp": 1.02125239, "epoch": 0.36476777393656995, "flos": 19571889784320.0, "grad_norm": 2.010062039731278, "language_loss": 0.77890325, "learning_rate": 2.9351315812948684e-06, "loss": 0.80006289, "num_input_tokens_seen": 130324685, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.42382812, "step": 6067, "time_per_iteration": 2.3956353664398193 }, { "auxiliary_loss_clip": 0.01062706, "auxiliary_loss_mlp": 0.01043702, "balance_loss_clip": 1.0182147, "balance_loss_mlp": 1.02154708, "epoch": 0.3648278971892379, "flos": 17747887380480.0, "grad_norm": 1.8467826870386332, "language_loss": 0.72651887, "learning_rate": 2.934787295690886e-06, "loss": 0.74758291, "num_input_tokens_seen": 130343855, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.41015625, "step": 6068, "time_per_iteration": 3.824467182159424 }, { "auxiliary_loss_clip": 0.01067341, "auxiliary_loss_mlp": 0.01048243, "balance_loss_clip": 1.01939392, "balance_loss_mlp": 1.02337158, "epoch": 0.3648880204419059, "flos": 17930203833600.0, "grad_norm": 1.8473229036917926, "language_loss": 0.75718153, "learning_rate": 2.9344429746396755e-06, "loss": 0.77833736, "num_input_tokens_seen": 130362320, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.43945312, "step": 6069, "time_per_iteration": 3.840327501296997 }, { "auxiliary_loss_clip": 0.01069561, "auxiliary_loss_mlp": 0.0104597, "balance_loss_clip": 1.0148797, "balance_loss_mlp": 1.02440143, "epoch": 0.3649481436945739, "flos": 22637159222400.0, "grad_norm": 1.7536621086701136, "language_loss": 0.67510307, "learning_rate": 2.9340986181542945e-06, "loss": 0.69625837, "num_input_tokens_seen": 130383165, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.45117188, "step": 6070, "time_per_iteration": 2.4036340713500977 }, { "auxiliary_loss_clip": 0.01067361, "auxiliary_loss_mlp": 0.01039308, "balance_loss_clip": 1.01290286, "balance_loss_mlp": 1.02502704, "epoch": 0.36500826694724187, "flos": 21578592666240.0, "grad_norm": 1.6255840910848005, "language_loss": 0.75975692, "learning_rate": 2.9337542262477994e-06, "loss": 0.78082359, "num_input_tokens_seen": 130402425, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.42382812, "step": 6071, "time_per_iteration": 2.471506357192993 }, { "auxiliary_loss_clip": 0.01067391, "auxiliary_loss_mlp": 0.01044211, "balance_loss_clip": 1.01377654, "balance_loss_mlp": 1.02469277, "epoch": 0.36506839019990983, "flos": 13771664081280.0, "grad_norm": 1.8518747771017972, "language_loss": 0.90138894, "learning_rate": 2.9334097989332506e-06, "loss": 0.92250496, "num_input_tokens_seen": 130419440, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.42578125, "step": 6072, "time_per_iteration": 2.3868484497070312 }, { "auxiliary_loss_clip": 0.01067377, "auxiliary_loss_mlp": 0.01050373, "balance_loss_clip": 1.02161932, "balance_loss_mlp": 1.02386093, "epoch": 0.3651285134525778, "flos": 17274011230080.0, "grad_norm": 2.146757077798453, "language_loss": 0.74609745, "learning_rate": 2.9330653362237094e-06, "loss": 0.76727498, "num_input_tokens_seen": 130438495, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.43554688, "step": 6073, "time_per_iteration": 2.4388935565948486 }, { "auxiliary_loss_clip": 0.01068004, "auxiliary_loss_mlp": 0.01060515, "balance_loss_clip": 1.02869785, "balance_loss_mlp": 1.02340364, "epoch": 0.36518863670524576, "flos": 21906915891840.0, "grad_norm": 2.0135079940506193, "language_loss": 0.68521392, "learning_rate": 2.932720838132236e-06, "loss": 0.7064991, "num_input_tokens_seen": 130455575, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.4453125, "step": 6074, "time_per_iteration": 5.213298082351685 }, { "auxiliary_loss_clip": 0.01067531, "auxiliary_loss_mlp": 0.01048505, "balance_loss_clip": 1.01960886, "balance_loss_mlp": 1.02430725, "epoch": 0.3652487599579137, "flos": 27121054734720.0, "grad_norm": 1.528138869056988, "language_loss": 0.73739088, "learning_rate": 2.9323763046718954e-06, "loss": 0.75855124, "num_input_tokens_seen": 130476385, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.43359375, "step": 6075, "time_per_iteration": 2.488754987716675 }, { "auxiliary_loss_clip": 0.01072305, "auxiliary_loss_mlp": 0.01058836, "balance_loss_clip": 1.02536249, "balance_loss_mlp": 1.02454519, "epoch": 0.3653088832105817, "flos": 19754555351040.0, "grad_norm": 3.574055218905957, "language_loss": 0.90777487, "learning_rate": 2.9320317358557524e-06, "loss": 0.92908633, "num_input_tokens_seen": 130493630, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.47851562, "step": 6076, "time_per_iteration": 2.4288127422332764 }, { "auxiliary_loss_clip": 0.01064791, "auxiliary_loss_mlp": 0.01052711, "balance_loss_clip": 1.02143073, "balance_loss_mlp": 1.02200627, "epoch": 0.36536900646324966, "flos": 13114179757440.0, "grad_norm": 2.0519411467333835, "language_loss": 0.7189377, "learning_rate": 2.931687131696872e-06, "loss": 0.74011272, "num_input_tokens_seen": 130510735, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.42773438, "step": 6077, "time_per_iteration": 2.39399790763855 }, { "auxiliary_loss_clip": 0.01020268, "auxiliary_loss_mlp": 0.01005211, "balance_loss_clip": 1.0018971, "balance_loss_mlp": 1.0110836, "epoch": 0.3654291297159176, "flos": 71096743048320.0, "grad_norm": 0.7567554185115096, "language_loss": 0.61759639, "learning_rate": 2.9313424922083224e-06, "loss": 0.63785118, "num_input_tokens_seen": 130577050, "router_z_loss_clip": 0.03320312, "router_z_loss_mlp": 0.09179688, "step": 6078, "time_per_iteration": 3.1324706077575684 }, { "auxiliary_loss_clip": 0.01062852, "auxiliary_loss_mlp": 0.01057118, "balance_loss_clip": 1.02631402, "balance_loss_mlp": 1.01915991, "epoch": 0.3654892529685856, "flos": 23616508170240.0, "grad_norm": 4.845880266774484, "language_loss": 0.79115009, "learning_rate": 2.930997817403173e-06, "loss": 0.8123498, "num_input_tokens_seen": 130593780, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4375, "step": 6079, "time_per_iteration": 2.3886826038360596 }, { "auxiliary_loss_clip": 0.01066096, "auxiliary_loss_mlp": 0.01057785, "balance_loss_clip": 1.0267663, "balance_loss_mlp": 1.02121508, "epoch": 0.36554937622125355, "flos": 43469135130240.0, "grad_norm": 1.8721465913692896, "language_loss": 0.63794357, "learning_rate": 2.9306531072944913e-06, "loss": 0.65918243, "num_input_tokens_seen": 130615510, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.44921875, "step": 6080, "time_per_iteration": 2.5787179470062256 }, { "auxiliary_loss_clip": 0.01067919, "auxiliary_loss_mlp": 0.01060622, "balance_loss_clip": 1.02621806, "balance_loss_mlp": 1.020715, "epoch": 0.3656094994739215, "flos": 23293526382720.0, "grad_norm": 2.354454420305791, "language_loss": 0.69951957, "learning_rate": 2.930308361895352e-06, "loss": 0.72080493, "num_input_tokens_seen": 130635410, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.47265625, "step": 6081, "time_per_iteration": 2.4036571979522705 }, { "auxiliary_loss_clip": 0.01067844, "auxiliary_loss_mlp": 0.01065918, "balance_loss_clip": 1.03408909, "balance_loss_mlp": 1.02130568, "epoch": 0.3656696227265895, "flos": 24570823806720.0, "grad_norm": 1.8596062752508533, "language_loss": 0.76019931, "learning_rate": 2.9299635812188257e-06, "loss": 0.78153694, "num_input_tokens_seen": 130657725, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.46484375, "step": 6082, "time_per_iteration": 2.458595037460327 }, { "auxiliary_loss_clip": 0.01066317, "auxiliary_loss_mlp": 0.01051314, "balance_loss_clip": 1.02120209, "balance_loss_mlp": 1.02165902, "epoch": 0.3657297459792575, "flos": 27927129271680.0, "grad_norm": 1.6370117728677909, "language_loss": 0.83628601, "learning_rate": 2.929618765277987e-06, "loss": 0.85746229, "num_input_tokens_seen": 130678360, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.4453125, "step": 6083, "time_per_iteration": 2.455702304840088 }, { "auxiliary_loss_clip": 0.01013406, "auxiliary_loss_mlp": 0.01010645, "balance_loss_clip": 1.00737834, "balance_loss_mlp": 1.00347567, "epoch": 0.36578986923192547, "flos": 67389631441920.0, "grad_norm": 0.8125260517007118, "language_loss": 0.59365356, "learning_rate": 2.9292739140859125e-06, "loss": 0.6138941, "num_input_tokens_seen": 130742110, "router_z_loss_clip": 0.03271484, "router_z_loss_mlp": 0.09960938, "step": 6084, "time_per_iteration": 3.097857713699341 }, { "auxiliary_loss_clip": 0.01066287, "auxiliary_loss_mlp": 0.01057579, "balance_loss_clip": 1.0263344, "balance_loss_mlp": 1.02203441, "epoch": 0.36584999248459343, "flos": 20226546288000.0, "grad_norm": 1.6638076004526672, "language_loss": 0.73885995, "learning_rate": 2.9289290276556767e-06, "loss": 0.76009858, "num_input_tokens_seen": 130759870, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.44335938, "step": 6085, "time_per_iteration": 2.404226779937744 }, { "auxiliary_loss_clip": 0.01065458, "auxiliary_loss_mlp": 0.0104677, "balance_loss_clip": 1.01799321, "balance_loss_mlp": 1.02243543, "epoch": 0.3659101157372614, "flos": 19061459573760.0, "grad_norm": 2.0052961671392913, "language_loss": 0.79932994, "learning_rate": 2.9285841060003604e-06, "loss": 0.82045227, "num_input_tokens_seen": 130778510, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4296875, "step": 6086, "time_per_iteration": 2.4030630588531494 }, { "auxiliary_loss_clip": 0.01064246, "auxiliary_loss_mlp": 0.01052117, "balance_loss_clip": 1.02273238, "balance_loss_mlp": 1.02186811, "epoch": 0.36597023898992936, "flos": 30809384029440.0, "grad_norm": 1.8043978069176132, "language_loss": 0.78781724, "learning_rate": 2.9282391491330416e-06, "loss": 0.80898082, "num_input_tokens_seen": 130798535, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.42382812, "step": 6087, "time_per_iteration": 2.501413106918335 }, { "auxiliary_loss_clip": 0.01069796, "auxiliary_loss_mlp": 0.01046868, "balance_loss_clip": 1.01526546, "balance_loss_mlp": 1.02309918, "epoch": 0.36603036224259733, "flos": 20520759248640.0, "grad_norm": 3.2317345540405995, "language_loss": 0.72018051, "learning_rate": 2.9278941570668002e-06, "loss": 0.74134719, "num_input_tokens_seen": 130816655, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.46484375, "step": 6088, "time_per_iteration": 2.4131362438201904 }, { "auxiliary_loss_clip": 0.01075078, "auxiliary_loss_mlp": 0.01054079, "balance_loss_clip": 1.0186497, "balance_loss_mlp": 1.02518821, "epoch": 0.3660904854952653, "flos": 38327790205440.0, "grad_norm": 1.5664606467148123, "language_loss": 0.80397844, "learning_rate": 2.92754912981472e-06, "loss": 0.82527, "num_input_tokens_seen": 130841225, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.5, "step": 6089, "time_per_iteration": 2.5901448726654053 }, { "auxiliary_loss_clip": 0.01069924, "auxiliary_loss_mlp": 0.01048608, "balance_loss_clip": 1.01720786, "balance_loss_mlp": 1.02541947, "epoch": 0.36615060874793326, "flos": 21834471087360.0, "grad_norm": 1.7808666210710236, "language_loss": 0.72054511, "learning_rate": 2.927204067389884e-06, "loss": 0.74173039, "num_input_tokens_seen": 130861050, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.4453125, "step": 6090, "time_per_iteration": 2.461784839630127 }, { "auxiliary_loss_clip": 0.01071652, "auxiliary_loss_mlp": 0.01058014, "balance_loss_clip": 1.0275923, "balance_loss_mlp": 1.02820945, "epoch": 0.3662107320006012, "flos": 16580601250560.0, "grad_norm": 1.8750627795033012, "language_loss": 0.75042987, "learning_rate": 2.9268589698053763e-06, "loss": 0.77172649, "num_input_tokens_seen": 130879775, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.43554688, "step": 6091, "time_per_iteration": 2.408586263656616 }, { "auxiliary_loss_clip": 0.01071457, "auxiliary_loss_mlp": 0.01054375, "balance_loss_clip": 1.02348769, "balance_loss_mlp": 1.02661419, "epoch": 0.3662708552532692, "flos": 20957348200320.0, "grad_norm": 1.9661640758421983, "language_loss": 0.74014294, "learning_rate": 2.926513837074284e-06, "loss": 0.76140124, "num_input_tokens_seen": 130898070, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.44921875, "step": 6092, "time_per_iteration": 2.4520299434661865 }, { "auxiliary_loss_clip": 0.01073715, "auxiliary_loss_mlp": 0.01051263, "balance_loss_clip": 1.0200783, "balance_loss_mlp": 1.02811217, "epoch": 0.36633097850593715, "flos": 21901783921920.0, "grad_norm": 1.8855275427413294, "language_loss": 0.79772562, "learning_rate": 2.9261686692096942e-06, "loss": 0.81897539, "num_input_tokens_seen": 130915250, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.45703125, "step": 6093, "time_per_iteration": 2.4207775592803955 }, { "auxiliary_loss_clip": 0.0107309, "auxiliary_loss_mlp": 0.01057832, "balance_loss_clip": 1.02738595, "balance_loss_mlp": 1.02599955, "epoch": 0.3663911017586051, "flos": 32852745705600.0, "grad_norm": 1.8846376262521234, "language_loss": 0.75919402, "learning_rate": 2.925823466224696e-06, "loss": 0.78050327, "num_input_tokens_seen": 130936995, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.47265625, "step": 6094, "time_per_iteration": 2.5181753635406494 }, { "auxiliary_loss_clip": 0.01073283, "auxiliary_loss_mlp": 0.01053115, "balance_loss_clip": 1.0231936, "balance_loss_mlp": 1.02656221, "epoch": 0.3664512250112731, "flos": 27270517731840.0, "grad_norm": 1.546973379311129, "language_loss": 0.80133414, "learning_rate": 2.9254782281323785e-06, "loss": 0.82259816, "num_input_tokens_seen": 130957970, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.46679688, "step": 6095, "time_per_iteration": 2.4765758514404297 }, { "auxiliary_loss_clip": 0.01073861, "auxiliary_loss_mlp": 0.01055009, "balance_loss_clip": 1.01965189, "balance_loss_mlp": 1.02663636, "epoch": 0.3665113482639411, "flos": 17783498833920.0, "grad_norm": 2.904092921047173, "language_loss": 0.75969583, "learning_rate": 2.925132954945834e-06, "loss": 0.78098452, "num_input_tokens_seen": 130974915, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.47265625, "step": 6096, "time_per_iteration": 2.4183590412139893 }, { "auxiliary_loss_clip": 0.01070694, "auxiliary_loss_mlp": 0.01056408, "balance_loss_clip": 1.02271914, "balance_loss_mlp": 1.02400792, "epoch": 0.36657147151660907, "flos": 27853392746880.0, "grad_norm": 1.9658426794854906, "language_loss": 0.68755102, "learning_rate": 2.924787646678155e-06, "loss": 0.70882207, "num_input_tokens_seen": 130995745, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.46679688, "step": 6097, "time_per_iteration": 2.493474245071411 }, { "auxiliary_loss_clip": 0.01071263, "auxiliary_loss_mlp": 0.01062378, "balance_loss_clip": 1.03073978, "balance_loss_mlp": 1.0246048, "epoch": 0.36663159476927704, "flos": 25372848625920.0, "grad_norm": 1.7046302990297402, "language_loss": 0.77896994, "learning_rate": 2.9244423033424365e-06, "loss": 0.80030632, "num_input_tokens_seen": 131015545, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.46679688, "step": 6098, "time_per_iteration": 2.4577386379241943 }, { "auxiliary_loss_clip": 0.0106765, "auxiliary_loss_mlp": 0.01054776, "balance_loss_clip": 1.0256654, "balance_loss_mlp": 1.023561, "epoch": 0.366691718021945, "flos": 21356265928320.0, "grad_norm": 2.2363802359625504, "language_loss": 0.74572456, "learning_rate": 2.9240969249517723e-06, "loss": 0.76694882, "num_input_tokens_seen": 131033990, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.44140625, "step": 6099, "time_per_iteration": 2.425905704498291 }, { "auxiliary_loss_clip": 0.01064477, "auxiliary_loss_mlp": 0.01050915, "balance_loss_clip": 1.02020669, "balance_loss_mlp": 1.02275848, "epoch": 0.36675184127461297, "flos": 16799436852480.0, "grad_norm": 1.6181954774365455, "language_loss": 0.85991752, "learning_rate": 2.9237515115192602e-06, "loss": 0.88107145, "num_input_tokens_seen": 131050710, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.41796875, "step": 6100, "time_per_iteration": 2.397919178009033 }, { "auxiliary_loss_clip": 0.01068904, "auxiliary_loss_mlp": 0.01052414, "balance_loss_clip": 1.02107441, "balance_loss_mlp": 1.02219105, "epoch": 0.36681196452728093, "flos": 21905484526080.0, "grad_norm": 2.808386622479719, "language_loss": 0.72471547, "learning_rate": 2.9234060630579992e-06, "loss": 0.74592859, "num_input_tokens_seen": 131071435, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.46679688, "step": 6101, "time_per_iteration": 2.51114821434021 }, { "auxiliary_loss_clip": 0.01067583, "auxiliary_loss_mlp": 0.01062445, "balance_loss_clip": 1.02799356, "balance_loss_mlp": 1.02173615, "epoch": 0.3668720877799489, "flos": 17711472965760.0, "grad_norm": 2.7559493690168946, "language_loss": 0.77778792, "learning_rate": 2.9230605795810865e-06, "loss": 0.79908824, "num_input_tokens_seen": 131088775, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.45703125, "step": 6102, "time_per_iteration": 2.4034926891326904 }, { "auxiliary_loss_clip": 0.01069802, "auxiliary_loss_mlp": 0.01051508, "balance_loss_clip": 1.01789141, "balance_loss_mlp": 1.02251482, "epoch": 0.36693221103261686, "flos": 47043717615360.0, "grad_norm": 1.575081330549062, "language_loss": 0.71712977, "learning_rate": 2.922715061101625e-06, "loss": 0.73834288, "num_input_tokens_seen": 131112800, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.47265625, "step": 6103, "time_per_iteration": 2.6397671699523926 }, { "auxiliary_loss_clip": 0.01066075, "auxiliary_loss_mlp": 0.01056262, "balance_loss_clip": 1.02471948, "balance_loss_mlp": 1.02061212, "epoch": 0.3669923342852848, "flos": 15960020100480.0, "grad_norm": 2.040876330325197, "language_loss": 0.73165011, "learning_rate": 2.922369507632716e-06, "loss": 0.75287348, "num_input_tokens_seen": 131131150, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.45507812, "step": 6104, "time_per_iteration": 2.3996357917785645 }, { "auxiliary_loss_clip": 0.0106516, "auxiliary_loss_mlp": 0.01053556, "balance_loss_clip": 1.02210832, "balance_loss_mlp": 1.02086663, "epoch": 0.3670524575379528, "flos": 19973460775680.0, "grad_norm": 1.9476178729516682, "language_loss": 0.83299804, "learning_rate": 2.9220239191874617e-06, "loss": 0.85418522, "num_input_tokens_seen": 131150365, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.44335938, "step": 6105, "time_per_iteration": 2.423210859298706 }, { "auxiliary_loss_clip": 0.0106967, "auxiliary_loss_mlp": 0.01059242, "balance_loss_clip": 1.02455223, "balance_loss_mlp": 1.02255261, "epoch": 0.36711258079062076, "flos": 25701765344640.0, "grad_norm": 2.2218104234480704, "language_loss": 0.8215847, "learning_rate": 2.9216782957789692e-06, "loss": 0.84287381, "num_input_tokens_seen": 131169310, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.47070312, "step": 6106, "time_per_iteration": 2.4490904808044434 }, { "auxiliary_loss_clip": 0.01013243, "auxiliary_loss_mlp": 0.01011934, "balance_loss_clip": 1.00864351, "balance_loss_mlp": 1.00414026, "epoch": 0.3671727040432887, "flos": 60769364791680.0, "grad_norm": 0.706685652765088, "language_loss": 0.59255385, "learning_rate": 2.9213326374203426e-06, "loss": 0.6128056, "num_input_tokens_seen": 131232900, "router_z_loss_clip": 0.03295898, "router_z_loss_mlp": 0.09082031, "step": 6107, "time_per_iteration": 4.4786036014556885 }, { "auxiliary_loss_clip": 0.01066048, "auxiliary_loss_mlp": 0.010472, "balance_loss_clip": 1.01642048, "balance_loss_mlp": 1.02150452, "epoch": 0.3672328272959567, "flos": 18660307518720.0, "grad_norm": 1.8724020790845193, "language_loss": 0.75217694, "learning_rate": 2.92098694412469e-06, "loss": 0.77330947, "num_input_tokens_seen": 131250920, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4453125, "step": 6108, "time_per_iteration": 3.8907783031463623 }, { "auxiliary_loss_clip": 0.01066283, "auxiliary_loss_mlp": 0.01050172, "balance_loss_clip": 1.0179379, "balance_loss_mlp": 1.02109504, "epoch": 0.3672929505486247, "flos": 15048158544000.0, "grad_norm": 2.35039636279293, "language_loss": 0.74537027, "learning_rate": 2.9206412159051213e-06, "loss": 0.76653481, "num_input_tokens_seen": 131267910, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.45117188, "step": 6109, "time_per_iteration": 2.420531988143921 }, { "auxiliary_loss_clip": 0.01066192, "auxiliary_loss_mlp": 0.01048881, "balance_loss_clip": 1.01915002, "balance_loss_mlp": 1.0228982, "epoch": 0.3673530738012927, "flos": 20588456108160.0, "grad_norm": 2.343074016976726, "language_loss": 0.5441153, "learning_rate": 2.920295452774744e-06, "loss": 0.56526601, "num_input_tokens_seen": 131287150, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.43359375, "step": 6110, "time_per_iteration": 2.396131753921509 }, { "auxiliary_loss_clip": 0.01067051, "auxiliary_loss_mlp": 0.01043259, "balance_loss_clip": 1.01431537, "balance_loss_mlp": 1.02320945, "epoch": 0.36741319705396064, "flos": 21688743605760.0, "grad_norm": 1.406488697497103, "language_loss": 0.81175441, "learning_rate": 2.919949654746672e-06, "loss": 0.83285749, "num_input_tokens_seen": 131308225, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4375, "step": 6111, "time_per_iteration": 2.456507682800293 }, { "auxiliary_loss_clip": 0.01067763, "auxiliary_loss_mlp": 0.0104822, "balance_loss_clip": 1.0192759, "balance_loss_mlp": 1.02509117, "epoch": 0.3674733203066286, "flos": 29860898590080.0, "grad_norm": 1.5883862628222674, "language_loss": 0.73711699, "learning_rate": 2.9196038218340163e-06, "loss": 0.75827682, "num_input_tokens_seen": 131332115, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.42578125, "step": 6112, "time_per_iteration": 2.4635517597198486 }, { "auxiliary_loss_clip": 0.01071587, "auxiliary_loss_mlp": 0.01046301, "balance_loss_clip": 1.01689196, "balance_loss_mlp": 1.02714002, "epoch": 0.36753344355929657, "flos": 18256118175360.0, "grad_norm": 1.6118316632136163, "language_loss": 0.86402845, "learning_rate": 2.919257954049892e-06, "loss": 0.8852073, "num_input_tokens_seen": 131351885, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4453125, "step": 6113, "time_per_iteration": 3.9492545127868652 }, { "auxiliary_loss_clip": 0.01073459, "auxiliary_loss_mlp": 0.01051643, "balance_loss_clip": 1.02138758, "balance_loss_mlp": 1.02738047, "epoch": 0.36759356681196453, "flos": 25299984885120.0, "grad_norm": 1.8439952285035381, "language_loss": 0.80146039, "learning_rate": 2.918912051407413e-06, "loss": 0.82271147, "num_input_tokens_seen": 131370245, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.4609375, "step": 6114, "time_per_iteration": 3.8554916381835938 }, { "auxiliary_loss_clip": 0.01075693, "auxiliary_loss_mlp": 0.01052801, "balance_loss_clip": 1.02078152, "balance_loss_mlp": 1.02884316, "epoch": 0.3676536900646325, "flos": 21031887686400.0, "grad_norm": 1.919928596833533, "language_loss": 0.68252921, "learning_rate": 2.918566113919698e-06, "loss": 0.70381415, "num_input_tokens_seen": 131388115, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.46679688, "step": 6115, "time_per_iteration": 2.443690538406372 }, { "auxiliary_loss_clip": 0.01068969, "auxiliary_loss_mlp": 0.01038347, "balance_loss_clip": 1.01108384, "balance_loss_mlp": 1.02687156, "epoch": 0.36771381331730046, "flos": 16287610187520.0, "grad_norm": 2.161560255759521, "language_loss": 0.78492749, "learning_rate": 2.9182201415998636e-06, "loss": 0.80600071, "num_input_tokens_seen": 131404595, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.421875, "step": 6116, "time_per_iteration": 2.408116340637207 }, { "auxiliary_loss_clip": 0.01071934, "auxiliary_loss_mlp": 0.01049733, "balance_loss_clip": 1.02146888, "balance_loss_mlp": 1.02833533, "epoch": 0.36777393656996843, "flos": 22308870908160.0, "grad_norm": 1.9128547754386835, "language_loss": 0.64435792, "learning_rate": 2.9178741344610286e-06, "loss": 0.66557455, "num_input_tokens_seen": 131423760, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.43554688, "step": 6117, "time_per_iteration": 2.445099353790283 }, { "auxiliary_loss_clip": 0.01070943, "auxiliary_loss_mlp": 0.01043999, "balance_loss_clip": 1.01642549, "balance_loss_mlp": 1.02850389, "epoch": 0.3678340598226364, "flos": 26832846528000.0, "grad_norm": 1.6978635103553648, "language_loss": 0.7515465, "learning_rate": 2.9175280925163156e-06, "loss": 0.77269584, "num_input_tokens_seen": 131444955, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.42578125, "step": 6118, "time_per_iteration": 2.4761810302734375 }, { "auxiliary_loss_clip": 0.01072155, "auxiliary_loss_mlp": 0.01054387, "balance_loss_clip": 1.02191472, "balance_loss_mlp": 1.02498412, "epoch": 0.36789418307530436, "flos": 21760664739840.0, "grad_norm": 1.9251888962679011, "language_loss": 0.74467659, "learning_rate": 2.9171820157788445e-06, "loss": 0.7659421, "num_input_tokens_seen": 131465720, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.47265625, "step": 6119, "time_per_iteration": 2.412123441696167 }, { "auxiliary_loss_clip": 0.0107344, "auxiliary_loss_mlp": 0.01048515, "balance_loss_clip": 1.01930904, "balance_loss_mlp": 1.02902484, "epoch": 0.3679543063279723, "flos": 15923291483520.0, "grad_norm": 1.7331851609408127, "language_loss": 0.81716973, "learning_rate": 2.9168359042617404e-06, "loss": 0.83838928, "num_input_tokens_seen": 131483080, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4453125, "step": 6120, "time_per_iteration": 2.4039199352264404 }, { "auxiliary_loss_clip": 0.01069377, "auxiliary_loss_mlp": 0.01055912, "balance_loss_clip": 1.02603865, "balance_loss_mlp": 1.02523565, "epoch": 0.3680144295806403, "flos": 24274516164480.0, "grad_norm": 1.8786069124218971, "language_loss": 0.66439641, "learning_rate": 2.916489757978126e-06, "loss": 0.68564928, "num_input_tokens_seen": 131502545, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.44140625, "step": 6121, "time_per_iteration": 2.424571990966797 }, { "auxiliary_loss_clip": 0.01070621, "auxiliary_loss_mlp": 0.01054846, "balance_loss_clip": 1.02357697, "balance_loss_mlp": 1.02467358, "epoch": 0.36807455283330826, "flos": 26102952311040.0, "grad_norm": 2.0375896004438205, "language_loss": 0.72793376, "learning_rate": 2.9161435769411286e-06, "loss": 0.74918854, "num_input_tokens_seen": 131522155, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.45898438, "step": 6122, "time_per_iteration": 2.471782922744751 }, { "auxiliary_loss_clip": 0.01065249, "auxiliary_loss_mlp": 0.0105587, "balance_loss_clip": 1.02528083, "balance_loss_mlp": 1.02309883, "epoch": 0.3681346760859763, "flos": 24643827192960.0, "grad_norm": 3.401637180253066, "language_loss": 0.70576191, "learning_rate": 2.915797361163875e-06, "loss": 0.72697306, "num_input_tokens_seen": 131543865, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.421875, "step": 6123, "time_per_iteration": 2.4221949577331543 }, { "auxiliary_loss_clip": 0.01067776, "auxiliary_loss_mlp": 0.01050086, "balance_loss_clip": 1.01830518, "balance_loss_mlp": 1.0222702, "epoch": 0.36819479933864424, "flos": 23877239270400.0, "grad_norm": 3.0376166548059675, "language_loss": 0.76262015, "learning_rate": 2.9154511106594933e-06, "loss": 0.78379881, "num_input_tokens_seen": 131562155, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.45507812, "step": 6124, "time_per_iteration": 2.444472551345825 }, { "auxiliary_loss_clip": 0.01066465, "auxiliary_loss_mlp": 0.01064279, "balance_loss_clip": 1.03112662, "balance_loss_mlp": 1.02261627, "epoch": 0.3682549225913122, "flos": 25552895840640.0, "grad_norm": 2.0459387767283244, "language_loss": 0.75789881, "learning_rate": 2.915104825441114e-06, "loss": 0.77920628, "num_input_tokens_seen": 131581695, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.4375, "step": 6125, "time_per_iteration": 2.434241771697998 }, { "auxiliary_loss_clip": 0.01066792, "auxiliary_loss_mlp": 0.01058901, "balance_loss_clip": 1.02552223, "balance_loss_mlp": 1.02175546, "epoch": 0.36831504584398017, "flos": 16945653093120.0, "grad_norm": 1.900138156787967, "language_loss": 0.79959518, "learning_rate": 2.9147585055218686e-06, "loss": 0.8208521, "num_input_tokens_seen": 131599465, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.44921875, "step": 6126, "time_per_iteration": 2.3881282806396484 }, { "auxiliary_loss_clip": 0.01066667, "auxiliary_loss_mlp": 0.01057804, "balance_loss_clip": 1.0228281, "balance_loss_mlp": 1.01907766, "epoch": 0.36837516909664814, "flos": 19864042974720.0, "grad_norm": 3.004124935259161, "language_loss": 0.6774466, "learning_rate": 2.914412150914888e-06, "loss": 0.69869131, "num_input_tokens_seen": 131618330, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.4765625, "step": 6127, "time_per_iteration": 2.3881824016571045 }, { "auxiliary_loss_clip": 0.01069792, "auxiliary_loss_mlp": 0.01059393, "balance_loss_clip": 1.02665782, "balance_loss_mlp": 1.02393103, "epoch": 0.3684352923493161, "flos": 37625652385920.0, "grad_norm": 1.9940336169421327, "language_loss": 0.7143752, "learning_rate": 2.9140657616333074e-06, "loss": 0.73566705, "num_input_tokens_seen": 131638960, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.45898438, "step": 6128, "time_per_iteration": 2.5547914505004883 }, { "auxiliary_loss_clip": 0.01067875, "auxiliary_loss_mlp": 0.01054994, "balance_loss_clip": 1.02156758, "balance_loss_mlp": 1.02192438, "epoch": 0.36849541560198407, "flos": 14464620213120.0, "grad_norm": 1.7559817398920063, "language_loss": 0.77116317, "learning_rate": 2.9137193376902614e-06, "loss": 0.79239184, "num_input_tokens_seen": 131657440, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.4609375, "step": 6129, "time_per_iteration": 2.3613719940185547 }, { "auxiliary_loss_clip": 0.01064785, "auxiliary_loss_mlp": 0.01059236, "balance_loss_clip": 1.02918303, "balance_loss_mlp": 1.01999426, "epoch": 0.36855553885465203, "flos": 25769706583680.0, "grad_norm": 1.837245159408968, "language_loss": 0.86047435, "learning_rate": 2.9133728790988868e-06, "loss": 0.88171458, "num_input_tokens_seen": 131678035, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.44921875, "step": 6130, "time_per_iteration": 2.461215019226074 }, { "auxiliary_loss_clip": 0.01013877, "auxiliary_loss_mlp": 0.01009453, "balance_loss_clip": 1.00611496, "balance_loss_mlp": 1.00453305, "epoch": 0.36861566210732, "flos": 65045701071360.0, "grad_norm": 0.8202653869065714, "language_loss": 0.60382074, "learning_rate": 2.913026385872321e-06, "loss": 0.62405401, "num_input_tokens_seen": 131742470, "router_z_loss_clip": 0.03344727, "router_z_loss_mlp": 0.09375, "step": 6131, "time_per_iteration": 3.1161012649536133 }, { "auxiliary_loss_clip": 0.01064969, "auxiliary_loss_mlp": 0.01047773, "balance_loss_clip": 1.01625443, "balance_loss_mlp": 1.02088976, "epoch": 0.36867578535998796, "flos": 30953226297600.0, "grad_norm": 1.5890844020343478, "language_loss": 0.74248856, "learning_rate": 2.9126798580237034e-06, "loss": 0.76361597, "num_input_tokens_seen": 131764570, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.44140625, "step": 6132, "time_per_iteration": 2.4971365928649902 }, { "auxiliary_loss_clip": 0.01070685, "auxiliary_loss_mlp": 0.01048888, "balance_loss_clip": 1.01457906, "balance_loss_mlp": 1.02374256, "epoch": 0.3687359086126559, "flos": 28836756501120.0, "grad_norm": 1.78950213190559, "language_loss": 0.74688691, "learning_rate": 2.9123332955661736e-06, "loss": 0.76808262, "num_input_tokens_seen": 131785720, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.46875, "step": 6133, "time_per_iteration": 2.472233772277832 }, { "auxiliary_loss_clip": 0.01066005, "auxiliary_loss_mlp": 0.01044548, "balance_loss_clip": 1.01438856, "balance_loss_mlp": 1.02338886, "epoch": 0.3687960318653239, "flos": 21395752542720.0, "grad_norm": 1.6083685145791635, "language_loss": 0.72312057, "learning_rate": 2.911986698512874e-06, "loss": 0.7442261, "num_input_tokens_seen": 131804430, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.42578125, "step": 6134, "time_per_iteration": 2.4402964115142822 }, { "auxiliary_loss_clip": 0.01068561, "auxiliary_loss_mlp": 0.01049015, "balance_loss_clip": 1.01604187, "balance_loss_mlp": 1.02413654, "epoch": 0.36885615511799186, "flos": 20265020472960.0, "grad_norm": 1.7253574883360732, "language_loss": 0.76449239, "learning_rate": 2.9116400668769477e-06, "loss": 0.78566819, "num_input_tokens_seen": 131822060, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.4453125, "step": 6135, "time_per_iteration": 2.4217441082000732 }, { "auxiliary_loss_clip": 0.01018866, "auxiliary_loss_mlp": 0.01007223, "balance_loss_clip": 1.00381374, "balance_loss_mlp": 1.00975716, "epoch": 0.3689162783706599, "flos": 63085922789760.0, "grad_norm": 0.8264676951619004, "language_loss": 0.58846605, "learning_rate": 2.9112934006715376e-06, "loss": 0.60872698, "num_input_tokens_seen": 131880715, "router_z_loss_clip": 0.03417969, "router_z_loss_mlp": 0.09082031, "step": 6136, "time_per_iteration": 3.0040581226348877 }, { "auxiliary_loss_clip": 0.01068511, "auxiliary_loss_mlp": 0.01044753, "balance_loss_clip": 1.01330566, "balance_loss_mlp": 1.02447867, "epoch": 0.36897640162332784, "flos": 10961225723520.0, "grad_norm": 2.059980659289682, "language_loss": 0.80070698, "learning_rate": 2.9109466999097918e-06, "loss": 0.82183969, "num_input_tokens_seen": 131895850, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.44140625, "step": 6137, "time_per_iteration": 2.387873649597168 }, { "auxiliary_loss_clip": 0.01069178, "auxiliary_loss_mlp": 0.01049547, "balance_loss_clip": 1.01788521, "balance_loss_mlp": 1.02424884, "epoch": 0.3690365248759958, "flos": 20703250258560.0, "grad_norm": 1.9211346136341438, "language_loss": 0.76243854, "learning_rate": 2.9105999646048552e-06, "loss": 0.78362572, "num_input_tokens_seen": 131915775, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.44921875, "step": 6138, "time_per_iteration": 2.45884108543396 }, { "auxiliary_loss_clip": 0.01071555, "auxiliary_loss_mlp": 0.01056292, "balance_loss_clip": 1.02265108, "balance_loss_mlp": 1.02439022, "epoch": 0.3690966481286638, "flos": 31825182303360.0, "grad_norm": 2.1887521067975233, "language_loss": 0.67017686, "learning_rate": 2.9102531947698764e-06, "loss": 0.6914553, "num_input_tokens_seen": 131935715, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.47265625, "step": 6139, "time_per_iteration": 2.50856351852417 }, { "auxiliary_loss_clip": 0.01065991, "auxiliary_loss_mlp": 0.01062953, "balance_loss_clip": 1.03092098, "balance_loss_mlp": 1.02283525, "epoch": 0.36915677138133174, "flos": 13114109934720.0, "grad_norm": 1.887048012794129, "language_loss": 0.73099154, "learning_rate": 2.909906390418006e-06, "loss": 0.75228095, "num_input_tokens_seen": 131954120, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.43164062, "step": 6140, "time_per_iteration": 2.4058003425598145 }, { "auxiliary_loss_clip": 0.01016156, "auxiliary_loss_mlp": 0.01014575, "balance_loss_clip": 1.01126099, "balance_loss_mlp": 1.00699437, "epoch": 0.3692168946339997, "flos": 68683372116480.0, "grad_norm": 0.7705507448124861, "language_loss": 0.59369546, "learning_rate": 2.9095595515623934e-06, "loss": 0.61400276, "num_input_tokens_seen": 132017485, "router_z_loss_clip": 0.03320312, "router_z_loss_mlp": 0.09179688, "step": 6141, "time_per_iteration": 3.1289751529693604 }, { "auxiliary_loss_clip": 0.01064302, "auxiliary_loss_mlp": 0.01056852, "balance_loss_clip": 1.02726436, "balance_loss_mlp": 1.02057505, "epoch": 0.36927701788666767, "flos": 22016787540480.0, "grad_norm": 1.692762143847503, "language_loss": 0.76384413, "learning_rate": 2.909212678216192e-06, "loss": 0.7850557, "num_input_tokens_seen": 132036760, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.4375, "step": 6142, "time_per_iteration": 2.4079134464263916 }, { "auxiliary_loss_clip": 0.01063982, "auxiliary_loss_mlp": 0.0105425, "balance_loss_clip": 1.02631891, "balance_loss_mlp": 1.02086949, "epoch": 0.36933714113933563, "flos": 21834505998720.0, "grad_norm": 1.8443137075397662, "language_loss": 0.77675653, "learning_rate": 2.908865770392555e-06, "loss": 0.79793882, "num_input_tokens_seen": 132056935, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.43164062, "step": 6143, "time_per_iteration": 2.3961758613586426 }, { "auxiliary_loss_clip": 0.01062538, "auxiliary_loss_mlp": 0.0105256, "balance_loss_clip": 1.02484393, "balance_loss_mlp": 1.02068448, "epoch": 0.3693972643920036, "flos": 23690698542720.0, "grad_norm": 2.438947204468492, "language_loss": 0.83103943, "learning_rate": 2.9085188281046364e-06, "loss": 0.85219043, "num_input_tokens_seen": 132077285, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41796875, "step": 6144, "time_per_iteration": 2.4417450428009033 }, { "auxiliary_loss_clip": 0.01063981, "auxiliary_loss_mlp": 0.0105621, "balance_loss_clip": 1.02674174, "balance_loss_mlp": 1.01956797, "epoch": 0.36945738764467156, "flos": 22855645710720.0, "grad_norm": 2.696169316757798, "language_loss": 0.78866816, "learning_rate": 2.908171851365593e-06, "loss": 0.80987, "num_input_tokens_seen": 132095520, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.44335938, "step": 6145, "time_per_iteration": 2.38777232170105 }, { "auxiliary_loss_clip": 0.01065749, "auxiliary_loss_mlp": 0.01052962, "balance_loss_clip": 1.02256346, "balance_loss_mlp": 1.02118909, "epoch": 0.36951751089733953, "flos": 16615060629120.0, "grad_norm": 2.170351949388902, "language_loss": 0.78648913, "learning_rate": 2.9078248401885815e-06, "loss": 0.8076762, "num_input_tokens_seen": 132112810, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.4453125, "step": 6146, "time_per_iteration": 2.3977911472320557 }, { "auxiliary_loss_clip": 0.01067246, "auxiliary_loss_mlp": 0.01058278, "balance_loss_clip": 1.02628219, "balance_loss_mlp": 1.02149916, "epoch": 0.3695776341500075, "flos": 18913602499200.0, "grad_norm": 1.9516697308121533, "language_loss": 0.82439345, "learning_rate": 2.907477794586761e-06, "loss": 0.84564865, "num_input_tokens_seen": 132131615, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.45703125, "step": 6147, "time_per_iteration": 3.825615644454956 }, { "auxiliary_loss_clip": 0.01066073, "auxiliary_loss_mlp": 0.01052283, "balance_loss_clip": 1.02294588, "balance_loss_mlp": 1.02146327, "epoch": 0.36963775740267546, "flos": 20807571000960.0, "grad_norm": 1.8434205428015917, "language_loss": 0.84942818, "learning_rate": 2.9071307145732926e-06, "loss": 0.87061167, "num_input_tokens_seen": 132149585, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4453125, "step": 6148, "time_per_iteration": 3.8347694873809814 }, { "auxiliary_loss_clip": 0.01065107, "auxiliary_loss_mlp": 0.01052444, "balance_loss_clip": 1.02371466, "balance_loss_mlp": 1.02251434, "epoch": 0.3696978806553435, "flos": 26060847344640.0, "grad_norm": 2.0440454156607113, "language_loss": 0.76464391, "learning_rate": 2.9067836001613357e-06, "loss": 0.78581935, "num_input_tokens_seen": 132165555, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.42578125, "step": 6149, "time_per_iteration": 2.4530186653137207 }, { "auxiliary_loss_clip": 0.01069623, "auxiliary_loss_mlp": 0.01055621, "balance_loss_clip": 1.02302885, "balance_loss_mlp": 1.0249449, "epoch": 0.36975800390801145, "flos": 26832706882560.0, "grad_norm": 1.901272129384223, "language_loss": 0.72442937, "learning_rate": 2.906436451364054e-06, "loss": 0.74568176, "num_input_tokens_seen": 132185100, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.44726562, "step": 6150, "time_per_iteration": 2.4892961978912354 }, { "auxiliary_loss_clip": 0.01070689, "auxiliary_loss_mlp": 0.01059403, "balance_loss_clip": 1.02855134, "balance_loss_mlp": 1.02547276, "epoch": 0.3698181271606794, "flos": 21141549866880.0, "grad_norm": 1.8746921540941004, "language_loss": 0.82638478, "learning_rate": 2.906089268194611e-06, "loss": 0.84768569, "num_input_tokens_seen": 132203930, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.453125, "step": 6151, "time_per_iteration": 2.458369493484497 }, { "auxiliary_loss_clip": 0.01036833, "auxiliary_loss_mlp": 0.01038527, "balance_loss_clip": 1.03569031, "balance_loss_mlp": 1.02756858, "epoch": 0.3698782504133474, "flos": 66739478503680.0, "grad_norm": 0.8037791212956692, "language_loss": 0.6317476, "learning_rate": 2.9057420506661726e-06, "loss": 0.65250123, "num_input_tokens_seen": 132263845, "router_z_loss_clip": 0.02832031, "router_z_loss_mlp": 0.09277344, "step": 6152, "time_per_iteration": 3.153697967529297 }, { "auxiliary_loss_clip": 0.01068656, "auxiliary_loss_mlp": 0.01045781, "balance_loss_clip": 1.01647902, "balance_loss_mlp": 1.0263834, "epoch": 0.36993837366601534, "flos": 24310511642880.0, "grad_norm": 1.7676397683107228, "language_loss": 0.7143712, "learning_rate": 2.9053947987919044e-06, "loss": 0.73551559, "num_input_tokens_seen": 132282350, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.421875, "step": 6153, "time_per_iteration": 5.215890407562256 }, { "auxiliary_loss_clip": 0.01072028, "auxiliary_loss_mlp": 0.01044425, "balance_loss_clip": 1.01484919, "balance_loss_mlp": 1.02768803, "epoch": 0.3699984969186833, "flos": 24348147955200.0, "grad_norm": 1.6785630045355642, "language_loss": 0.73570907, "learning_rate": 2.9050475125849755e-06, "loss": 0.75687361, "num_input_tokens_seen": 132301930, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.44335938, "step": 6154, "time_per_iteration": 2.4871866703033447 }, { "auxiliary_loss_clip": 0.01069503, "auxiliary_loss_mlp": 0.01050396, "balance_loss_clip": 1.02144027, "balance_loss_mlp": 1.02600348, "epoch": 0.37005862017135127, "flos": 19828117319040.0, "grad_norm": 1.6499191852251607, "language_loss": 0.68577218, "learning_rate": 2.9047001920585534e-06, "loss": 0.70697117, "num_input_tokens_seen": 132320915, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.43554688, "step": 6155, "time_per_iteration": 2.405717611312866 }, { "auxiliary_loss_clip": 0.01070544, "auxiliary_loss_mlp": 0.01047866, "balance_loss_clip": 1.01854074, "balance_loss_mlp": 1.02645636, "epoch": 0.37011874342401924, "flos": 19572762568320.0, "grad_norm": 1.862080410167843, "language_loss": 0.68977058, "learning_rate": 2.9043528372258097e-06, "loss": 0.71095473, "num_input_tokens_seen": 132340415, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.44140625, "step": 6156, "time_per_iteration": 2.4118590354919434 }, { "auxiliary_loss_clip": 0.0106766, "auxiliary_loss_mlp": 0.01054312, "balance_loss_clip": 1.02591658, "balance_loss_mlp": 1.02505362, "epoch": 0.3701788666766872, "flos": 20373356021760.0, "grad_norm": 1.7599078946132636, "language_loss": 0.82606161, "learning_rate": 2.904005448099916e-06, "loss": 0.8472814, "num_input_tokens_seen": 132358600, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.42578125, "step": 6157, "time_per_iteration": 2.419261932373047 }, { "auxiliary_loss_clip": 0.01070154, "auxiliary_loss_mlp": 0.01069087, "balance_loss_clip": 1.03330064, "balance_loss_mlp": 1.02394986, "epoch": 0.37023898992935517, "flos": 15340032443520.0, "grad_norm": 2.4842314951334363, "language_loss": 0.77984583, "learning_rate": 2.9036580246940444e-06, "loss": 0.80123818, "num_input_tokens_seen": 132373160, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.4609375, "step": 6158, "time_per_iteration": 2.3943405151367188 }, { "auxiliary_loss_clip": 0.01069614, "auxiliary_loss_mlp": 0.01066183, "balance_loss_clip": 1.03380537, "balance_loss_mlp": 1.02360225, "epoch": 0.37029911318202313, "flos": 19572902213760.0, "grad_norm": 2.067656315048353, "language_loss": 0.70281291, "learning_rate": 2.9033105670213708e-06, "loss": 0.72417092, "num_input_tokens_seen": 132392345, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.4609375, "step": 6159, "time_per_iteration": 2.4557595252990723 }, { "auxiliary_loss_clip": 0.01066414, "auxiliary_loss_mlp": 0.01063875, "balance_loss_clip": 1.03501463, "balance_loss_mlp": 1.02234006, "epoch": 0.3703592364346911, "flos": 26212160643840.0, "grad_norm": 1.702158640034723, "language_loss": 0.71766269, "learning_rate": 2.9029630750950697e-06, "loss": 0.73896563, "num_input_tokens_seen": 132412620, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.44140625, "step": 6160, "time_per_iteration": 2.4414467811584473 }, { "auxiliary_loss_clip": 0.01064062, "auxiliary_loss_mlp": 0.01060588, "balance_loss_clip": 1.03009439, "balance_loss_mlp": 1.02208185, "epoch": 0.37041935968735906, "flos": 20047267123200.0, "grad_norm": 1.6897245268620424, "language_loss": 0.80329818, "learning_rate": 2.9026155489283176e-06, "loss": 0.82454467, "num_input_tokens_seen": 132431570, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.41992188, "step": 6161, "time_per_iteration": 2.3780417442321777 }, { "auxiliary_loss_clip": 0.01065215, "auxiliary_loss_mlp": 0.01077084, "balance_loss_clip": 1.04392004, "balance_loss_mlp": 1.0211904, "epoch": 0.3704794829400271, "flos": 24132663843840.0, "grad_norm": 2.553293121484324, "language_loss": 0.80170417, "learning_rate": 2.902267988534295e-06, "loss": 0.82312715, "num_input_tokens_seen": 132451525, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.43945312, "step": 6162, "time_per_iteration": 2.4441370964050293 }, { "auxiliary_loss_clip": 0.01068416, "auxiliary_loss_mlp": 0.01068081, "balance_loss_clip": 1.03634763, "balance_loss_mlp": 1.02306557, "epoch": 0.37053960619269505, "flos": 14865981736320.0, "grad_norm": 2.9065930936496804, "language_loss": 0.81483763, "learning_rate": 2.9019203939261783e-06, "loss": 0.83620262, "num_input_tokens_seen": 132469875, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.453125, "step": 6163, "time_per_iteration": 2.3540284633636475 }, { "auxiliary_loss_clip": 0.01068208, "auxiliary_loss_mlp": 0.01072616, "balance_loss_clip": 1.04014349, "balance_loss_mlp": 1.02285171, "epoch": 0.370599729445363, "flos": 21360420380160.0, "grad_norm": 1.676608654993983, "language_loss": 0.6990751, "learning_rate": 2.9015727651171507e-06, "loss": 0.72048342, "num_input_tokens_seen": 132488360, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.453125, "step": 6164, "time_per_iteration": 2.416917562484741 }, { "auxiliary_loss_clip": 0.0106999, "auxiliary_loss_mlp": 0.01072617, "balance_loss_clip": 1.0403111, "balance_loss_mlp": 1.02353787, "epoch": 0.370659852698031, "flos": 26827958937600.0, "grad_norm": 2.5611454280670856, "language_loss": 0.85089707, "learning_rate": 2.9012251021203935e-06, "loss": 0.87232316, "num_input_tokens_seen": 132508630, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.46484375, "step": 6165, "time_per_iteration": 2.4528987407684326 }, { "auxiliary_loss_clip": 0.010713, "auxiliary_loss_mlp": 0.01064463, "balance_loss_clip": 1.02803266, "balance_loss_mlp": 1.02353621, "epoch": 0.37071997595069894, "flos": 19098013633920.0, "grad_norm": 2.1772146820153058, "language_loss": 0.70892346, "learning_rate": 2.9008774049490896e-06, "loss": 0.73028111, "num_input_tokens_seen": 132527465, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.4765625, "step": 6166, "time_per_iteration": 2.4417853355407715 }, { "auxiliary_loss_clip": 0.01014822, "auxiliary_loss_mlp": 0.01029997, "balance_loss_clip": 1.02684963, "balance_loss_mlp": 1.00522506, "epoch": 0.3707800992033669, "flos": 52175809163520.0, "grad_norm": 0.8050277017671205, "language_loss": 0.57130075, "learning_rate": 2.9005296736164244e-06, "loss": 0.59174895, "num_input_tokens_seen": 132579940, "router_z_loss_clip": 0.03149414, "router_z_loss_mlp": 0.09570312, "step": 6167, "time_per_iteration": 2.8692944049835205 }, { "auxiliary_loss_clip": 0.01066752, "auxiliary_loss_mlp": 0.01054727, "balance_loss_clip": 1.02368486, "balance_loss_mlp": 1.0230602, "epoch": 0.3708402224560349, "flos": 19900806503040.0, "grad_norm": 1.826747107734393, "language_loss": 0.7775805, "learning_rate": 2.900181908135584e-06, "loss": 0.79879528, "num_input_tokens_seen": 132598390, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.43554688, "step": 6168, "time_per_iteration": 2.4361259937286377 }, { "auxiliary_loss_clip": 0.01069444, "auxiliary_loss_mlp": 0.0105672, "balance_loss_clip": 1.02541566, "balance_loss_mlp": 1.02393234, "epoch": 0.37090034570870284, "flos": 20006698256640.0, "grad_norm": 3.58720840676508, "language_loss": 0.75917268, "learning_rate": 2.899834108519755e-06, "loss": 0.78043431, "num_input_tokens_seen": 132616920, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.45507812, "step": 6169, "time_per_iteration": 2.3874013423919678 }, { "auxiliary_loss_clip": 0.01070083, "auxiliary_loss_mlp": 0.01044277, "balance_loss_clip": 1.0154407, "balance_loss_mlp": 1.02533138, "epoch": 0.3709604689613708, "flos": 24133536627840.0, "grad_norm": 2.0425189723282027, "language_loss": 0.80549508, "learning_rate": 2.899486274782127e-06, "loss": 0.8266387, "num_input_tokens_seen": 132637660, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.44726562, "step": 6170, "time_per_iteration": 2.4548234939575195 }, { "auxiliary_loss_clip": 0.01074999, "auxiliary_loss_mlp": 0.01053695, "balance_loss_clip": 1.0211513, "balance_loss_mlp": 1.02823782, "epoch": 0.37102059221403877, "flos": 23875004943360.0, "grad_norm": 1.481214911072211, "language_loss": 0.77842087, "learning_rate": 2.8991384069358885e-06, "loss": 0.79970783, "num_input_tokens_seen": 132657635, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.46875, "step": 6171, "time_per_iteration": 2.429335832595825 }, { "auxiliary_loss_clip": 0.01075572, "auxiliary_loss_mlp": 0.01049152, "balance_loss_clip": 1.01715672, "balance_loss_mlp": 1.02919888, "epoch": 0.37108071546670673, "flos": 14500406223360.0, "grad_norm": 1.7748763089769146, "language_loss": 0.82144058, "learning_rate": 2.898790504994232e-06, "loss": 0.84268779, "num_input_tokens_seen": 132674455, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.46484375, "step": 6172, "time_per_iteration": 2.460134983062744 }, { "auxiliary_loss_clip": 0.01077774, "auxiliary_loss_mlp": 0.01059347, "balance_loss_clip": 1.02639771, "balance_loss_mlp": 1.03037548, "epoch": 0.3711408387193747, "flos": 34561360465920.0, "grad_norm": 1.8940442935869428, "language_loss": 0.61105078, "learning_rate": 2.89844256897035e-06, "loss": 0.63242197, "num_input_tokens_seen": 132695140, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.47460938, "step": 6173, "time_per_iteration": 2.548811435699463 }, { "auxiliary_loss_clip": 0.01075866, "auxiliary_loss_mlp": 0.01053197, "balance_loss_clip": 1.02046227, "balance_loss_mlp": 1.02948356, "epoch": 0.37120096197204266, "flos": 17309762328960.0, "grad_norm": 1.8924876567783964, "language_loss": 0.82736403, "learning_rate": 2.898094598877435e-06, "loss": 0.84865463, "num_input_tokens_seen": 132712470, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.46484375, "step": 6174, "time_per_iteration": 2.450430154800415 }, { "auxiliary_loss_clip": 0.01073677, "auxiliary_loss_mlp": 0.01053547, "balance_loss_clip": 1.02348268, "balance_loss_mlp": 1.02922392, "epoch": 0.37126108522471063, "flos": 30662748852480.0, "grad_norm": 1.806166983091659, "language_loss": 0.81665623, "learning_rate": 2.8977465947286826e-06, "loss": 0.83792841, "num_input_tokens_seen": 132732945, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.44335938, "step": 6175, "time_per_iteration": 2.4967219829559326 }, { "auxiliary_loss_clip": 0.01080572, "auxiliary_loss_mlp": 0.01065571, "balance_loss_clip": 1.03316975, "balance_loss_mlp": 1.0340277, "epoch": 0.37132120847737865, "flos": 25154466871680.0, "grad_norm": 1.7661680279857317, "language_loss": 0.90479302, "learning_rate": 2.89739855653729e-06, "loss": 0.92625445, "num_input_tokens_seen": 132752470, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.46484375, "step": 6176, "time_per_iteration": 2.503720998764038 }, { "auxiliary_loss_clip": 0.01076777, "auxiliary_loss_mlp": 0.01051675, "balance_loss_clip": 1.02141929, "balance_loss_mlp": 1.02992129, "epoch": 0.3713813317300466, "flos": 21212458571520.0, "grad_norm": 1.4794812519409979, "language_loss": 0.74540699, "learning_rate": 2.8970504843164546e-06, "loss": 0.76669151, "num_input_tokens_seen": 132771485, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.46875, "step": 6177, "time_per_iteration": 2.4569990634918213 }, { "auxiliary_loss_clip": 0.01075954, "auxiliary_loss_mlp": 0.0105859, "balance_loss_clip": 1.02842999, "balance_loss_mlp": 1.02974284, "epoch": 0.3714414549827146, "flos": 21615565662720.0, "grad_norm": 1.7282440854251964, "language_loss": 0.76534122, "learning_rate": 2.896702378079374e-06, "loss": 0.78668672, "num_input_tokens_seen": 132791465, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.46289062, "step": 6178, "time_per_iteration": 2.451380491256714 }, { "auxiliary_loss_clip": 0.01075186, "auxiliary_loss_mlp": 0.01076778, "balance_loss_clip": 1.04499674, "balance_loss_mlp": 1.02925539, "epoch": 0.37150157823538255, "flos": 19971331182720.0, "grad_norm": 1.8118013172328051, "language_loss": 0.73222107, "learning_rate": 2.8963542378392502e-06, "loss": 0.75374073, "num_input_tokens_seen": 132810160, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.45898438, "step": 6179, "time_per_iteration": 2.438295602798462 }, { "auxiliary_loss_clip": 0.010731, "auxiliary_loss_mlp": 0.0105813, "balance_loss_clip": 1.02625299, "balance_loss_mlp": 1.02685177, "epoch": 0.3715617014880505, "flos": 24859485861120.0, "grad_norm": 3.2335674360587956, "language_loss": 0.70807952, "learning_rate": 2.896006063609283e-06, "loss": 0.72939181, "num_input_tokens_seen": 132831265, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.46289062, "step": 6180, "time_per_iteration": 2.4974782466888428 }, { "auxiliary_loss_clip": 0.01071482, "auxiliary_loss_mlp": 0.01057924, "balance_loss_clip": 1.02685809, "balance_loss_mlp": 1.0259943, "epoch": 0.3716218247407185, "flos": 20448035153280.0, "grad_norm": 1.705477140120759, "language_loss": 0.79165876, "learning_rate": 2.8956578554026767e-06, "loss": 0.81295282, "num_input_tokens_seen": 132850005, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.45507812, "step": 6181, "time_per_iteration": 2.4232723712921143 }, { "auxiliary_loss_clip": 0.0106951, "auxiliary_loss_mlp": 0.01064755, "balance_loss_clip": 1.03131676, "balance_loss_mlp": 1.02476525, "epoch": 0.37168194799338644, "flos": 24132349641600.0, "grad_norm": 1.9530702750784004, "language_loss": 0.79755151, "learning_rate": 2.8953096132326343e-06, "loss": 0.81889415, "num_input_tokens_seen": 132865790, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.44921875, "step": 6182, "time_per_iteration": 2.4527652263641357 }, { "auxiliary_loss_clip": 0.01015908, "auxiliary_loss_mlp": 0.01009171, "balance_loss_clip": 1.00557065, "balance_loss_mlp": 1.00645137, "epoch": 0.3717420712460544, "flos": 67405481199360.0, "grad_norm": 0.7891171117585289, "language_loss": 0.57511938, "learning_rate": 2.894961337112362e-06, "loss": 0.59537017, "num_input_tokens_seen": 132921775, "router_z_loss_clip": 0.03588867, "router_z_loss_mlp": 0.09472656, "step": 6183, "time_per_iteration": 3.006164312362671 }, { "auxiliary_loss_clip": 0.01072675, "auxiliary_loss_mlp": 0.01065878, "balance_loss_clip": 1.02930439, "balance_loss_mlp": 1.02358079, "epoch": 0.37180219449872237, "flos": 22375974274560.0, "grad_norm": 1.70237307689659, "language_loss": 0.78278875, "learning_rate": 2.894613027055066e-06, "loss": 0.8041743, "num_input_tokens_seen": 132941060, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.49023438, "step": 6184, "time_per_iteration": 2.521357536315918 }, { "auxiliary_loss_clip": 0.01065785, "auxiliary_loss_mlp": 0.01052073, "balance_loss_clip": 1.02006483, "balance_loss_mlp": 1.02190566, "epoch": 0.37186231775139034, "flos": 21868860643200.0, "grad_norm": 2.1164873875871613, "language_loss": 0.74055469, "learning_rate": 2.894264683073954e-06, "loss": 0.76173329, "num_input_tokens_seen": 132961850, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.4375, "step": 6185, "time_per_iteration": 2.4144961833953857 }, { "auxiliary_loss_clip": 0.01065697, "auxiliary_loss_mlp": 0.01046739, "balance_loss_clip": 1.01350331, "balance_loss_mlp": 1.02150226, "epoch": 0.3719224410040583, "flos": 22414238991360.0, "grad_norm": 1.5021778050344743, "language_loss": 0.78418303, "learning_rate": 2.8939163051822363e-06, "loss": 0.80530733, "num_input_tokens_seen": 132981625, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.44140625, "step": 6186, "time_per_iteration": 3.9281699657440186 }, { "auxiliary_loss_clip": 0.01068419, "auxiliary_loss_mlp": 0.01052937, "balance_loss_clip": 1.01855719, "balance_loss_mlp": 1.0211997, "epoch": 0.37198256425672627, "flos": 25150172774400.0, "grad_norm": 2.030387640240308, "language_loss": 0.85434473, "learning_rate": 2.8935678933931224e-06, "loss": 0.87555826, "num_input_tokens_seen": 133001225, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.47265625, "step": 6187, "time_per_iteration": 2.4561898708343506 }, { "auxiliary_loss_clip": 0.01066283, "auxiliary_loss_mlp": 0.01049363, "balance_loss_clip": 1.01846373, "balance_loss_mlp": 1.02114487, "epoch": 0.37204268750939423, "flos": 21137360503680.0, "grad_norm": 2.1211545290682072, "language_loss": 0.85168588, "learning_rate": 2.893219447719824e-06, "loss": 0.87284237, "num_input_tokens_seen": 133018820, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.45117188, "step": 6188, "time_per_iteration": 3.7974941730499268 }, { "auxiliary_loss_clip": 0.01066926, "auxiliary_loss_mlp": 0.01048974, "balance_loss_clip": 1.01807451, "balance_loss_mlp": 1.02209294, "epoch": 0.37210281076206225, "flos": 21505763836800.0, "grad_norm": 1.719364336120611, "language_loss": 0.6657846, "learning_rate": 2.8928709681755548e-06, "loss": 0.68694365, "num_input_tokens_seen": 133040205, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.44726562, "step": 6189, "time_per_iteration": 2.4921469688415527 }, { "auxiliary_loss_clip": 0.01070079, "auxiliary_loss_mlp": 0.01054802, "balance_loss_clip": 1.02224588, "balance_loss_mlp": 1.02409148, "epoch": 0.3721629340147302, "flos": 17346874970880.0, "grad_norm": 2.1507482791206867, "language_loss": 0.85157299, "learning_rate": 2.8925224547735293e-06, "loss": 0.87282181, "num_input_tokens_seen": 133058095, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.45898438, "step": 6190, "time_per_iteration": 2.363616943359375 }, { "auxiliary_loss_clip": 0.01070091, "auxiliary_loss_mlp": 0.01049482, "balance_loss_clip": 1.01742673, "balance_loss_mlp": 1.02306008, "epoch": 0.3722230572673982, "flos": 16431557189760.0, "grad_norm": 2.534085884848953, "language_loss": 0.91087818, "learning_rate": 2.8921739075269633e-06, "loss": 0.93207389, "num_input_tokens_seen": 133071530, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.47070312, "step": 6191, "time_per_iteration": 2.4337618350982666 }, { "auxiliary_loss_clip": 0.01071639, "auxiliary_loss_mlp": 0.01047524, "balance_loss_clip": 1.01240516, "balance_loss_mlp": 1.02384233, "epoch": 0.37228318052006615, "flos": 22673608548480.0, "grad_norm": 1.7144899418570887, "language_loss": 0.75094247, "learning_rate": 2.891825326449073e-06, "loss": 0.77213413, "num_input_tokens_seen": 133091410, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.47851562, "step": 6192, "time_per_iteration": 3.870932102203369 }, { "auxiliary_loss_clip": 0.01069855, "auxiliary_loss_mlp": 0.01052129, "balance_loss_clip": 1.0201571, "balance_loss_mlp": 1.02460933, "epoch": 0.3723433037727341, "flos": 25264303608960.0, "grad_norm": 2.380520786741067, "language_loss": 0.81311119, "learning_rate": 2.8914767115530766e-06, "loss": 0.83433104, "num_input_tokens_seen": 133110365, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.453125, "step": 6193, "time_per_iteration": 3.841954231262207 }, { "auxiliary_loss_clip": 0.0107215, "auxiliary_loss_mlp": 0.0105188, "balance_loss_clip": 1.02096868, "balance_loss_mlp": 1.02594995, "epoch": 0.3724034270254021, "flos": 10523903633280.0, "grad_norm": 1.927325504245247, "language_loss": 0.85983491, "learning_rate": 2.891128062852194e-06, "loss": 0.88107526, "num_input_tokens_seen": 133128255, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4609375, "step": 6194, "time_per_iteration": 2.3971354961395264 }, { "auxiliary_loss_clip": 0.01070545, "auxiliary_loss_mlp": 0.01052761, "balance_loss_clip": 1.02187419, "balance_loss_mlp": 1.02504909, "epoch": 0.37246355027807004, "flos": 20265195029760.0, "grad_norm": 2.918751222440559, "language_loss": 0.79629201, "learning_rate": 2.890779380359646e-06, "loss": 0.81752515, "num_input_tokens_seen": 133143975, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.45507812, "step": 6195, "time_per_iteration": 2.4235358238220215 }, { "auxiliary_loss_clip": 0.01071988, "auxiliary_loss_mlp": 0.01053634, "balance_loss_clip": 1.01911092, "balance_loss_mlp": 1.02802038, "epoch": 0.372523673530738, "flos": 19499549713920.0, "grad_norm": 4.008457467251298, "language_loss": 0.80612642, "learning_rate": 2.890430664088655e-06, "loss": 0.82738268, "num_input_tokens_seen": 133162935, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.43945312, "step": 6196, "time_per_iteration": 2.40454363822937 }, { "auxiliary_loss_clip": 0.0107053, "auxiliary_loss_mlp": 0.0105237, "balance_loss_clip": 1.02178097, "balance_loss_mlp": 1.02627885, "epoch": 0.372583796783406, "flos": 16763301728640.0, "grad_norm": 1.9988524397205056, "language_loss": 0.86537284, "learning_rate": 2.890081914052443e-06, "loss": 0.88660192, "num_input_tokens_seen": 133181180, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.44335938, "step": 6197, "time_per_iteration": 2.404353618621826 }, { "auxiliary_loss_clip": 0.01068681, "auxiliary_loss_mlp": 0.01052242, "balance_loss_clip": 1.01900601, "balance_loss_mlp": 1.02486157, "epoch": 0.37264392003607394, "flos": 22636879931520.0, "grad_norm": 1.5328579089843617, "language_loss": 0.65693605, "learning_rate": 2.889733130264237e-06, "loss": 0.67814523, "num_input_tokens_seen": 133199615, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.4375, "step": 6198, "time_per_iteration": 2.431446075439453 }, { "auxiliary_loss_clip": 0.0106899, "auxiliary_loss_mlp": 0.01053639, "balance_loss_clip": 1.02226329, "balance_loss_mlp": 1.02549064, "epoch": 0.3727040432887419, "flos": 19972134144000.0, "grad_norm": 1.3916492545597519, "language_loss": 0.75027448, "learning_rate": 2.889384312737261e-06, "loss": 0.77150077, "num_input_tokens_seen": 133219650, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.43554688, "step": 6199, "time_per_iteration": 2.4463441371917725 }, { "auxiliary_loss_clip": 0.01070019, "auxiliary_loss_mlp": 0.01047823, "balance_loss_clip": 1.0168761, "balance_loss_mlp": 1.02628052, "epoch": 0.37276416654140987, "flos": 63896991517440.0, "grad_norm": 1.8408988952734409, "language_loss": 0.82236296, "learning_rate": 2.889035461484742e-06, "loss": 0.84354138, "num_input_tokens_seen": 133245675, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4375, "step": 6200, "time_per_iteration": 2.863687753677368 }, { "auxiliary_loss_clip": 0.01069151, "auxiliary_loss_mlp": 0.0105778, "balance_loss_clip": 1.02416325, "balance_loss_mlp": 1.02312398, "epoch": 0.37282428979407783, "flos": 39784401705600.0, "grad_norm": 1.8531203086096317, "language_loss": 0.61748129, "learning_rate": 2.88868657651991e-06, "loss": 0.63875061, "num_input_tokens_seen": 133266905, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.4609375, "step": 6201, "time_per_iteration": 2.581199884414673 }, { "auxiliary_loss_clip": 0.01071133, "auxiliary_loss_mlp": 0.01053403, "balance_loss_clip": 1.01899946, "balance_loss_mlp": 1.02455878, "epoch": 0.37288441304674586, "flos": 22707998104320.0, "grad_norm": 1.6353477671011873, "language_loss": 0.74941546, "learning_rate": 2.8883376578559934e-06, "loss": 0.77066076, "num_input_tokens_seen": 133286865, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.46484375, "step": 6202, "time_per_iteration": 2.449441909790039 }, { "auxiliary_loss_clip": 0.01068406, "auxiliary_loss_mlp": 0.01056429, "balance_loss_clip": 1.02557731, "balance_loss_mlp": 1.0239985, "epoch": 0.3729445362994138, "flos": 18769306383360.0, "grad_norm": 1.7628705184126339, "language_loss": 0.75196481, "learning_rate": 2.8879887055062243e-06, "loss": 0.77321315, "num_input_tokens_seen": 133305295, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4453125, "step": 6203, "time_per_iteration": 2.409721612930298 }, { "auxiliary_loss_clip": 0.01064768, "auxiliary_loss_mlp": 0.01049483, "balance_loss_clip": 1.02018142, "balance_loss_mlp": 1.02085614, "epoch": 0.3730046595520818, "flos": 22455087148800.0, "grad_norm": 2.116806731411721, "language_loss": 0.83118474, "learning_rate": 2.8876397194838353e-06, "loss": 0.85232723, "num_input_tokens_seen": 133324625, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.43945312, "step": 6204, "time_per_iteration": 2.498588800430298 }, { "auxiliary_loss_clip": 0.01067886, "auxiliary_loss_mlp": 0.01053162, "balance_loss_clip": 1.02161956, "balance_loss_mlp": 1.02205193, "epoch": 0.37306478280474975, "flos": 24315224676480.0, "grad_norm": 1.616420142614673, "language_loss": 0.76209074, "learning_rate": 2.8872906998020577e-06, "loss": 0.78330123, "num_input_tokens_seen": 133344625, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.45898438, "step": 6205, "time_per_iteration": 2.430117607116699 }, { "auxiliary_loss_clip": 0.01065792, "auxiliary_loss_mlp": 0.01052314, "balance_loss_clip": 1.01977026, "balance_loss_mlp": 1.0208869, "epoch": 0.3731249060574177, "flos": 15814257707520.0, "grad_norm": 1.8836810814841336, "language_loss": 0.795555, "learning_rate": 2.886941646474128e-06, "loss": 0.81673604, "num_input_tokens_seen": 133363605, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.44921875, "step": 6206, "time_per_iteration": 2.409475803375244 }, { "auxiliary_loss_clip": 0.01066102, "auxiliary_loss_mlp": 0.01051745, "balance_loss_clip": 1.01929665, "balance_loss_mlp": 1.02007437, "epoch": 0.3731850293100857, "flos": 19827069978240.0, "grad_norm": 1.936852968841416, "language_loss": 0.94352925, "learning_rate": 2.886592559513283e-06, "loss": 0.96470767, "num_input_tokens_seen": 133379405, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.4609375, "step": 6207, "time_per_iteration": 2.3767848014831543 }, { "auxiliary_loss_clip": 0.01067998, "auxiliary_loss_mlp": 0.01045903, "balance_loss_clip": 1.01400304, "balance_loss_mlp": 1.02196062, "epoch": 0.37324515256275365, "flos": 19061354839680.0, "grad_norm": 2.358150721094156, "language_loss": 0.8432225, "learning_rate": 2.886243438932759e-06, "loss": 0.86436152, "num_input_tokens_seen": 133397585, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.4609375, "step": 6208, "time_per_iteration": 2.467061996459961 }, { "auxiliary_loss_clip": 0.01066499, "auxiliary_loss_mlp": 0.01054054, "balance_loss_clip": 1.02237976, "balance_loss_mlp": 1.02011347, "epoch": 0.3733052758154216, "flos": 20703285169920.0, "grad_norm": 2.2976712733657085, "language_loss": 0.74257135, "learning_rate": 2.8858942847457953e-06, "loss": 0.7637769, "num_input_tokens_seen": 133415365, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.46484375, "step": 6209, "time_per_iteration": 2.3781378269195557 }, { "auxiliary_loss_clip": 0.01066112, "auxiliary_loss_mlp": 0.01057862, "balance_loss_clip": 1.02132392, "balance_loss_mlp": 1.01994395, "epoch": 0.3733653990680896, "flos": 20192470934400.0, "grad_norm": 1.648909382213249, "language_loss": 0.7185986, "learning_rate": 2.8855450969656305e-06, "loss": 0.73983836, "num_input_tokens_seen": 133435700, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.4609375, "step": 6210, "time_per_iteration": 2.4245078563690186 }, { "auxiliary_loss_clip": 0.01066611, "auxiliary_loss_mlp": 0.0104618, "balance_loss_clip": 1.01392162, "balance_loss_mlp": 1.02027512, "epoch": 0.37342552232075754, "flos": 20338617352320.0, "grad_norm": 1.5695027775139112, "language_loss": 0.79041934, "learning_rate": 2.8851958756055073e-06, "loss": 0.81154728, "num_input_tokens_seen": 133455180, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.46484375, "step": 6211, "time_per_iteration": 2.3790764808654785 }, { "auxiliary_loss_clip": 0.01067016, "auxiliary_loss_mlp": 0.01054076, "balance_loss_clip": 1.02062559, "balance_loss_mlp": 1.02109897, "epoch": 0.3734856455734255, "flos": 35516409240960.0, "grad_norm": 2.922005530314484, "language_loss": 0.74687624, "learning_rate": 2.884846620678668e-06, "loss": 0.76808715, "num_input_tokens_seen": 133476715, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.45898438, "step": 6212, "time_per_iteration": 2.52748966217041 }, { "auxiliary_loss_clip": 0.01072929, "auxiliary_loss_mlp": 0.01060486, "balance_loss_clip": 1.02217174, "balance_loss_mlp": 1.02215147, "epoch": 0.37354576882609347, "flos": 21141235664640.0, "grad_norm": 1.9390132899422765, "language_loss": 0.83572477, "learning_rate": 2.884497332198356e-06, "loss": 0.85705894, "num_input_tokens_seen": 133494550, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 0.5078125, "step": 6213, "time_per_iteration": 2.3797342777252197 }, { "auxiliary_loss_clip": 0.01068745, "auxiliary_loss_mlp": 0.01049528, "balance_loss_clip": 1.01579142, "balance_loss_mlp": 1.02218342, "epoch": 0.37360589207876144, "flos": 21505728925440.0, "grad_norm": 2.2015986392589806, "language_loss": 0.79071647, "learning_rate": 2.8841480101778167e-06, "loss": 0.81189919, "num_input_tokens_seen": 133512640, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.46679688, "step": 6214, "time_per_iteration": 2.4351308345794678 }, { "auxiliary_loss_clip": 0.0106597, "auxiliary_loss_mlp": 0.01055971, "balance_loss_clip": 1.02407086, "balance_loss_mlp": 1.02061081, "epoch": 0.37366601533142946, "flos": 38434275452160.0, "grad_norm": 1.6022821623426917, "language_loss": 0.85672796, "learning_rate": 2.883798654630296e-06, "loss": 0.87794733, "num_input_tokens_seen": 133535540, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.453125, "step": 6215, "time_per_iteration": 2.5300703048706055 }, { "auxiliary_loss_clip": 0.01069724, "auxiliary_loss_mlp": 0.01060652, "balance_loss_clip": 1.02391219, "balance_loss_mlp": 1.02218485, "epoch": 0.3737261385840974, "flos": 18440215107840.0, "grad_norm": 1.8902648735577556, "language_loss": 0.6948837, "learning_rate": 2.8834492655690423e-06, "loss": 0.71618748, "num_input_tokens_seen": 133555795, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.47460938, "step": 6216, "time_per_iteration": 2.4300568103790283 }, { "auxiliary_loss_clip": 0.01069329, "auxiliary_loss_mlp": 0.01051706, "balance_loss_clip": 1.01894784, "balance_loss_mlp": 1.02228165, "epoch": 0.3737862618367654, "flos": 22928753831040.0, "grad_norm": 2.0437986155403447, "language_loss": 0.68222743, "learning_rate": 2.883099843007303e-06, "loss": 0.70343781, "num_input_tokens_seen": 133575905, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.47070312, "step": 6217, "time_per_iteration": 2.4007680416107178 }, { "auxiliary_loss_clip": 0.01070756, "auxiliary_loss_mlp": 0.01051156, "balance_loss_clip": 1.01789713, "balance_loss_mlp": 1.02337146, "epoch": 0.37384638508943335, "flos": 15408881377920.0, "grad_norm": 1.6306091636435542, "language_loss": 0.81944299, "learning_rate": 2.88275038695833e-06, "loss": 0.840662, "num_input_tokens_seen": 133592585, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.47265625, "step": 6218, "time_per_iteration": 2.4123613834381104 }, { "auxiliary_loss_clip": 0.01063802, "auxiliary_loss_mlp": 0.01054473, "balance_loss_clip": 1.02283525, "balance_loss_mlp": 1.02094841, "epoch": 0.3739065083421013, "flos": 24279648134400.0, "grad_norm": 1.3990121445651011, "language_loss": 0.79888797, "learning_rate": 2.8824008974353736e-06, "loss": 0.82007074, "num_input_tokens_seen": 133615070, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.4296875, "step": 6219, "time_per_iteration": 2.4346718788146973 }, { "auxiliary_loss_clip": 0.01067102, "auxiliary_loss_mlp": 0.01058699, "balance_loss_clip": 1.02372289, "balance_loss_mlp": 1.02153301, "epoch": 0.3739666315947693, "flos": 23001722305920.0, "grad_norm": 1.7930942315456824, "language_loss": 0.77719218, "learning_rate": 2.8820513744516866e-06, "loss": 0.79845023, "num_input_tokens_seen": 133633490, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.45703125, "step": 6220, "time_per_iteration": 2.459007740020752 }, { "auxiliary_loss_clip": 0.01068263, "auxiliary_loss_mlp": 0.01055042, "balance_loss_clip": 1.02175879, "balance_loss_mlp": 1.021487, "epoch": 0.37402675484743725, "flos": 19390097001600.0, "grad_norm": 2.1427071880354718, "language_loss": 0.8469767, "learning_rate": 2.8817018180205235e-06, "loss": 0.86820972, "num_input_tokens_seen": 133653425, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.46679688, "step": 6221, "time_per_iteration": 2.3790395259857178 }, { "auxiliary_loss_clip": 0.01066232, "auxiliary_loss_mlp": 0.01056569, "balance_loss_clip": 1.02309513, "balance_loss_mlp": 1.02072048, "epoch": 0.3740868781001052, "flos": 17125281371520.0, "grad_norm": 1.7437742802669203, "language_loss": 0.77330327, "learning_rate": 2.8813522281551387e-06, "loss": 0.79453129, "num_input_tokens_seen": 133670220, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.45507812, "step": 6222, "time_per_iteration": 2.412693500518799 }, { "auxiliary_loss_clip": 0.01069078, "auxiliary_loss_mlp": 0.01050343, "balance_loss_clip": 1.0172739, "balance_loss_mlp": 1.02309752, "epoch": 0.3741470013527732, "flos": 20042589000960.0, "grad_norm": 1.7792024918747102, "language_loss": 0.71955836, "learning_rate": 2.881002604868789e-06, "loss": 0.74075258, "num_input_tokens_seen": 133688910, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.45898438, "step": 6223, "time_per_iteration": 2.3846848011016846 }, { "auxiliary_loss_clip": 0.01066467, "auxiliary_loss_mlp": 0.01051638, "balance_loss_clip": 1.01947546, "balance_loss_mlp": 1.02137601, "epoch": 0.37420712460544114, "flos": 36895967637120.0, "grad_norm": 2.0209762566288303, "language_loss": 0.70235574, "learning_rate": 2.8806529481747325e-06, "loss": 0.72353673, "num_input_tokens_seen": 133708690, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.45117188, "step": 6224, "time_per_iteration": 2.5586984157562256 }, { "auxiliary_loss_clip": 0.01065223, "auxiliary_loss_mlp": 0.0105116, "balance_loss_clip": 1.01800776, "balance_loss_mlp": 1.02149105, "epoch": 0.3742672478581091, "flos": 22200081511680.0, "grad_norm": 1.7284532789267233, "language_loss": 0.71420431, "learning_rate": 2.880303258086228e-06, "loss": 0.73536813, "num_input_tokens_seen": 133728095, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.4375, "step": 6225, "time_per_iteration": 2.3968114852905273 }, { "auxiliary_loss_clip": 0.01064218, "auxiliary_loss_mlp": 0.01044722, "balance_loss_clip": 1.01203465, "balance_loss_mlp": 1.02008033, "epoch": 0.3743273711107771, "flos": 24680381253120.0, "grad_norm": 1.84504222583044, "language_loss": 0.80361867, "learning_rate": 2.879953534616536e-06, "loss": 0.8247081, "num_input_tokens_seen": 133745590, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.44140625, "step": 6226, "time_per_iteration": 2.4245471954345703 }, { "auxiliary_loss_clip": 0.01065888, "auxiliary_loss_mlp": 0.01049089, "balance_loss_clip": 1.01728392, "balance_loss_mlp": 1.02037835, "epoch": 0.37438749436344504, "flos": 24458543274240.0, "grad_norm": 1.9515622136634245, "language_loss": 0.69754744, "learning_rate": 2.879603777778917e-06, "loss": 0.71869719, "num_input_tokens_seen": 133766155, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.45507812, "step": 6227, "time_per_iteration": 5.318897247314453 }, { "auxiliary_loss_clip": 0.01065307, "auxiliary_loss_mlp": 0.01053124, "balance_loss_clip": 1.01919687, "balance_loss_mlp": 1.02176023, "epoch": 0.374447617616113, "flos": 21797672647680.0, "grad_norm": 1.6704511929756602, "language_loss": 0.84708637, "learning_rate": 2.879253987586635e-06, "loss": 0.86827064, "num_input_tokens_seen": 133783185, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.43554688, "step": 6228, "time_per_iteration": 2.376373767852783 }, { "auxiliary_loss_clip": 0.01064895, "auxiliary_loss_mlp": 0.01052046, "balance_loss_clip": 1.02045572, "balance_loss_mlp": 1.0213387, "epoch": 0.374507740868781, "flos": 17967211741440.0, "grad_norm": 2.630225515341044, "language_loss": 0.75861514, "learning_rate": 2.8789041640529535e-06, "loss": 0.77978456, "num_input_tokens_seen": 133800975, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.43554688, "step": 6229, "time_per_iteration": 2.380281686782837 }, { "auxiliary_loss_clip": 0.01066783, "auxiliary_loss_mlp": 0.01056664, "balance_loss_clip": 1.02068615, "balance_loss_mlp": 1.0202297, "epoch": 0.374567864121449, "flos": 16104944620800.0, "grad_norm": 1.880821784624589, "language_loss": 0.85410851, "learning_rate": 2.8785543071911383e-06, "loss": 0.87534297, "num_input_tokens_seen": 133818020, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.46484375, "step": 6230, "time_per_iteration": 2.3458127975463867 }, { "auxiliary_loss_clip": 0.01066891, "auxiliary_loss_mlp": 0.0105432, "balance_loss_clip": 1.01934433, "balance_loss_mlp": 1.02089047, "epoch": 0.37462798737411696, "flos": 25772045644800.0, "grad_norm": 2.1184858002424014, "language_loss": 0.74262863, "learning_rate": 2.878204417014456e-06, "loss": 0.76384079, "num_input_tokens_seen": 133840690, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.4609375, "step": 6231, "time_per_iteration": 2.491569995880127 }, { "auxiliary_loss_clip": 0.01067218, "auxiliary_loss_mlp": 0.01060761, "balance_loss_clip": 1.0249027, "balance_loss_mlp": 1.02164984, "epoch": 0.3746881106267849, "flos": 16653569725440.0, "grad_norm": 2.028678012703331, "language_loss": 0.74439406, "learning_rate": 2.8778544935361735e-06, "loss": 0.76567388, "num_input_tokens_seen": 133858350, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.45507812, "step": 6232, "time_per_iteration": 3.8391807079315186 }, { "auxiliary_loss_clip": 0.01067235, "auxiliary_loss_mlp": 0.01047392, "balance_loss_clip": 1.01353693, "balance_loss_mlp": 1.02025926, "epoch": 0.3747482338794529, "flos": 26176758658560.0, "grad_norm": 1.6132503418436666, "language_loss": 0.78033262, "learning_rate": 2.877504536769561e-06, "loss": 0.80147892, "num_input_tokens_seen": 133879775, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.47070312, "step": 6233, "time_per_iteration": 2.4672532081604004 }, { "auxiliary_loss_clip": 0.01066919, "auxiliary_loss_mlp": 0.01059634, "balance_loss_clip": 1.02602935, "balance_loss_mlp": 1.01988816, "epoch": 0.37480835713212085, "flos": 12020246127360.0, "grad_norm": 1.7446080938443214, "language_loss": 0.70766717, "learning_rate": 2.8771545467278883e-06, "loss": 0.72893268, "num_input_tokens_seen": 133898295, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.47070312, "step": 6234, "time_per_iteration": 2.3572475910186768 }, { "auxiliary_loss_clip": 0.01067511, "auxiliary_loss_mlp": 0.01055624, "balance_loss_clip": 1.02353251, "balance_loss_mlp": 1.02117097, "epoch": 0.3748684803847888, "flos": 19678340119680.0, "grad_norm": 2.682285120805443, "language_loss": 0.84310681, "learning_rate": 2.8768045234244276e-06, "loss": 0.86433816, "num_input_tokens_seen": 133915230, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.46289062, "step": 6235, "time_per_iteration": 2.378056764602661 }, { "auxiliary_loss_clip": 0.01068504, "auxiliary_loss_mlp": 0.01057277, "balance_loss_clip": 1.02218223, "balance_loss_mlp": 1.02149773, "epoch": 0.3749286036374568, "flos": 20520165755520.0, "grad_norm": 1.812605518058928, "language_loss": 0.79172707, "learning_rate": 2.8764544668724517e-06, "loss": 0.81298494, "num_input_tokens_seen": 133934110, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.47070312, "step": 6236, "time_per_iteration": 2.384197473526001 }, { "auxiliary_loss_clip": 0.01070716, "auxiliary_loss_mlp": 0.01057921, "balance_loss_clip": 1.0189395, "balance_loss_mlp": 1.02155304, "epoch": 0.37498872689012475, "flos": 20703564460800.0, "grad_norm": 2.182206234020845, "language_loss": 0.76136672, "learning_rate": 2.876104377085234e-06, "loss": 0.78265309, "num_input_tokens_seen": 133952395, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 0.4921875, "step": 6237, "time_per_iteration": 2.3894240856170654 }, { "auxiliary_loss_clip": 0.01067021, "auxiliary_loss_mlp": 0.01055146, "balance_loss_clip": 1.01964521, "balance_loss_mlp": 1.0206399, "epoch": 0.3750488501427927, "flos": 21573914544000.0, "grad_norm": 2.06132737624561, "language_loss": 0.94167602, "learning_rate": 2.8757542540760508e-06, "loss": 0.96289778, "num_input_tokens_seen": 133969635, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.46484375, "step": 6238, "time_per_iteration": 2.374831199645996 }, { "auxiliary_loss_clip": 0.01065824, "auxiliary_loss_mlp": 0.01051387, "balance_loss_clip": 1.01698279, "balance_loss_mlp": 1.02043951, "epoch": 0.3751089733954607, "flos": 15922977281280.0, "grad_norm": 2.0178993440246527, "language_loss": 0.72488832, "learning_rate": 2.8754040978581777e-06, "loss": 0.74606037, "num_input_tokens_seen": 133987215, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.453125, "step": 6239, "time_per_iteration": 2.393449306488037 }, { "auxiliary_loss_clip": 0.0106838, "auxiliary_loss_mlp": 0.01052677, "balance_loss_clip": 1.01824927, "balance_loss_mlp": 1.02094388, "epoch": 0.37516909664812864, "flos": 36283136808960.0, "grad_norm": 1.6856725075254644, "language_loss": 0.66573024, "learning_rate": 2.875053908444895e-06, "loss": 0.68694079, "num_input_tokens_seen": 134009250, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.47460938, "step": 6240, "time_per_iteration": 2.514086961746216 }, { "auxiliary_loss_clip": 0.0106825, "auxiliary_loss_mlp": 0.01049535, "balance_loss_clip": 1.01696682, "balance_loss_mlp": 1.02086878, "epoch": 0.3752292199007966, "flos": 13515087432960.0, "grad_norm": 1.8421040015712367, "language_loss": 0.77276742, "learning_rate": 2.8747036858494795e-06, "loss": 0.79394531, "num_input_tokens_seen": 134026875, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.47265625, "step": 6241, "time_per_iteration": 2.394563913345337 }, { "auxiliary_loss_clip": 0.01068751, "auxiliary_loss_mlp": 0.01057852, "balance_loss_clip": 1.02070618, "balance_loss_mlp": 1.02104306, "epoch": 0.3752893431534646, "flos": 27196885941120.0, "grad_norm": 2.1482945059354583, "language_loss": 0.8506968, "learning_rate": 2.874353430085213e-06, "loss": 0.87196279, "num_input_tokens_seen": 134047185, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.4765625, "step": 6242, "time_per_iteration": 2.426455020904541 }, { "auxiliary_loss_clip": 0.01067429, "auxiliary_loss_mlp": 0.01059545, "balance_loss_clip": 1.02649975, "balance_loss_mlp": 1.02035677, "epoch": 0.3753494664061326, "flos": 30006381692160.0, "grad_norm": 2.3595082021661753, "language_loss": 0.7003125, "learning_rate": 2.8740031411653766e-06, "loss": 0.72158229, "num_input_tokens_seen": 134067330, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.47070312, "step": 6243, "time_per_iteration": 2.5027406215667725 }, { "auxiliary_loss_clip": 0.01067423, "auxiliary_loss_mlp": 0.01060538, "balance_loss_clip": 1.02503717, "balance_loss_mlp": 1.02149236, "epoch": 0.37540958965880056, "flos": 24460812512640.0, "grad_norm": 1.7974587483423823, "language_loss": 0.85199177, "learning_rate": 2.8736528191032535e-06, "loss": 0.87327135, "num_input_tokens_seen": 134085525, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.45898438, "step": 6244, "time_per_iteration": 2.4073851108551025 }, { "auxiliary_loss_clip": 0.01065205, "auxiliary_loss_mlp": 0.01053455, "balance_loss_clip": 1.02148342, "balance_loss_mlp": 1.02065992, "epoch": 0.3754697129114685, "flos": 16507458218880.0, "grad_norm": 2.268234925644364, "language_loss": 0.84269309, "learning_rate": 2.8733024639121277e-06, "loss": 0.86387968, "num_input_tokens_seen": 134101855, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.4453125, "step": 6245, "time_per_iteration": 2.364856243133545 }, { "auxiliary_loss_clip": 0.01067242, "auxiliary_loss_mlp": 0.01055673, "balance_loss_clip": 1.02041113, "balance_loss_mlp": 1.02091718, "epoch": 0.3755298361641365, "flos": 19389887533440.0, "grad_norm": 1.9609093072141877, "language_loss": 0.65281802, "learning_rate": 2.8729520756052853e-06, "loss": 0.67404717, "num_input_tokens_seen": 134119360, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.46289062, "step": 6246, "time_per_iteration": 2.3741250038146973 }, { "auxiliary_loss_clip": 0.01068272, "auxiliary_loss_mlp": 0.01059176, "balance_loss_clip": 1.02167261, "balance_loss_mlp": 1.02070451, "epoch": 0.37558995941680445, "flos": 14719521116160.0, "grad_norm": 2.364650680035281, "language_loss": 0.76789892, "learning_rate": 2.8726016541960124e-06, "loss": 0.78917336, "num_input_tokens_seen": 134137475, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 0.47460938, "step": 6247, "time_per_iteration": 2.3882367610931396 }, { "auxiliary_loss_clip": 0.01066752, "auxiliary_loss_mlp": 0.01048748, "balance_loss_clip": 1.01551223, "balance_loss_mlp": 1.02072394, "epoch": 0.3756500826694724, "flos": 21688813428480.0, "grad_norm": 2.783516448399697, "language_loss": 0.56067353, "learning_rate": 2.872251199697598e-06, "loss": 0.58182847, "num_input_tokens_seen": 134154580, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.4609375, "step": 6248, "time_per_iteration": 2.380462884902954 }, { "auxiliary_loss_clip": 0.01066415, "auxiliary_loss_mlp": 0.01056665, "balance_loss_clip": 1.02154636, "balance_loss_mlp": 1.0211885, "epoch": 0.3757102059221404, "flos": 26504453479680.0, "grad_norm": 1.7391372630195248, "language_loss": 0.84835106, "learning_rate": 2.8719007121233297e-06, "loss": 0.86958188, "num_input_tokens_seen": 134174285, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.45117188, "step": 6249, "time_per_iteration": 2.4346978664398193 }, { "auxiliary_loss_clip": 0.01064622, "auxiliary_loss_mlp": 0.01046631, "balance_loss_clip": 1.01517153, "balance_loss_mlp": 1.0202204, "epoch": 0.37577032917480835, "flos": 37336676129280.0, "grad_norm": 1.4910373581303633, "language_loss": 0.70093608, "learning_rate": 2.8715501914864993e-06, "loss": 0.72204864, "num_input_tokens_seen": 134195940, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.4453125, "step": 6250, "time_per_iteration": 2.5270063877105713 }, { "auxiliary_loss_clip": 0.01067681, "auxiliary_loss_mlp": 0.01049659, "balance_loss_clip": 1.01907039, "balance_loss_mlp": 1.02197146, "epoch": 0.3758304524274763, "flos": 21907509384960.0, "grad_norm": 2.0057259287560316, "language_loss": 0.79832339, "learning_rate": 2.8711996378003987e-06, "loss": 0.81949675, "num_input_tokens_seen": 134212235, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.45703125, "step": 6251, "time_per_iteration": 2.4000701904296875 }, { "auxiliary_loss_clip": 0.01067087, "auxiliary_loss_mlp": 0.01050939, "balance_loss_clip": 1.01677406, "balance_loss_mlp": 1.02132511, "epoch": 0.3758905756801443, "flos": 36568028436480.0, "grad_norm": 1.8276540268372234, "language_loss": 0.60398042, "learning_rate": 2.8708490510783203e-06, "loss": 0.62516069, "num_input_tokens_seen": 134233810, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.45703125, "step": 6252, "time_per_iteration": 2.564404010772705 }, { "auxiliary_loss_clip": 0.01068993, "auxiliary_loss_mlp": 0.01055509, "balance_loss_clip": 1.01981783, "balance_loss_mlp": 1.02141726, "epoch": 0.37595069893281224, "flos": 24527811144960.0, "grad_norm": 1.7305884944017336, "language_loss": 0.9003861, "learning_rate": 2.8704984313335584e-06, "loss": 0.9216311, "num_input_tokens_seen": 134252020, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.4765625, "step": 6253, "time_per_iteration": 2.450216770172119 }, { "auxiliary_loss_clip": 0.01066228, "auxiliary_loss_mlp": 0.01047064, "balance_loss_clip": 1.01548564, "balance_loss_mlp": 1.02182281, "epoch": 0.3760108221854802, "flos": 16434105719040.0, "grad_norm": 2.0215580701775373, "language_loss": 0.78036916, "learning_rate": 2.8701477785794097e-06, "loss": 0.80150211, "num_input_tokens_seen": 134269495, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.44335938, "step": 6254, "time_per_iteration": 2.3608360290527344 }, { "auxiliary_loss_clip": 0.01070555, "auxiliary_loss_mlp": 0.01056218, "balance_loss_clip": 1.02236271, "balance_loss_mlp": 1.02344608, "epoch": 0.37607094543814823, "flos": 13770896031360.0, "grad_norm": 2.1698051231002755, "language_loss": 0.64051855, "learning_rate": 2.869797092829169e-06, "loss": 0.66178632, "num_input_tokens_seen": 134287035, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.47070312, "step": 6255, "time_per_iteration": 2.3979666233062744 }, { "auxiliary_loss_clip": 0.01068029, "auxiliary_loss_mlp": 0.01056319, "balance_loss_clip": 1.02258229, "balance_loss_mlp": 1.02013636, "epoch": 0.3761310686908162, "flos": 19857095614080.0, "grad_norm": 3.1374547950796194, "language_loss": 0.75517309, "learning_rate": 2.869446374096135e-06, "loss": 0.77641654, "num_input_tokens_seen": 134304840, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.47851562, "step": 6256, "time_per_iteration": 2.4308218955993652 }, { "auxiliary_loss_clip": 0.01068214, "auxiliary_loss_mlp": 0.01053159, "balance_loss_clip": 1.01992345, "balance_loss_mlp": 1.02096319, "epoch": 0.37619119194348416, "flos": 12749965787520.0, "grad_norm": 2.042367298924999, "language_loss": 0.72261637, "learning_rate": 2.8690956223936088e-06, "loss": 0.74383008, "num_input_tokens_seen": 134323180, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.47265625, "step": 6257, "time_per_iteration": 2.37766170501709 }, { "auxiliary_loss_clip": 0.01066802, "auxiliary_loss_mlp": 0.01046198, "balance_loss_clip": 1.01317692, "balance_loss_mlp": 1.02140546, "epoch": 0.3762513151961521, "flos": 17529575448960.0, "grad_norm": 1.641680476477804, "language_loss": 0.8535673, "learning_rate": 2.868744837734889e-06, "loss": 0.87469733, "num_input_tokens_seen": 134341390, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.453125, "step": 6258, "time_per_iteration": 2.454693078994751 }, { "auxiliary_loss_clip": 0.01065369, "auxiliary_loss_mlp": 0.01051164, "balance_loss_clip": 1.02128983, "balance_loss_mlp": 1.02125382, "epoch": 0.3763114384488201, "flos": 23616438347520.0, "grad_norm": 1.525372446921158, "language_loss": 0.81626534, "learning_rate": 2.868394020133277e-06, "loss": 0.83743072, "num_input_tokens_seen": 134360425, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.44140625, "step": 6259, "time_per_iteration": 2.4088242053985596 }, { "auxiliary_loss_clip": 0.01068338, "auxiliary_loss_mlp": 0.01057692, "balance_loss_clip": 1.02312112, "balance_loss_mlp": 1.02101123, "epoch": 0.37637156170148806, "flos": 25405911550080.0, "grad_norm": 2.1853350900028494, "language_loss": 0.72949708, "learning_rate": 2.8680431696020783e-06, "loss": 0.75075734, "num_input_tokens_seen": 134379775, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.47265625, "step": 6260, "time_per_iteration": 2.468419075012207 }, { "auxiliary_loss_clip": 0.01066877, "auxiliary_loss_mlp": 0.01047965, "balance_loss_clip": 1.0153017, "balance_loss_mlp": 1.0202136, "epoch": 0.376431684954156, "flos": 23439777534720.0, "grad_norm": 1.612588053038594, "language_loss": 0.79255712, "learning_rate": 2.867692286154594e-06, "loss": 0.81370556, "num_input_tokens_seen": 134400315, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.46679688, "step": 6261, "time_per_iteration": 2.416405439376831 }, { "auxiliary_loss_clip": 0.01066632, "auxiliary_loss_mlp": 0.01053955, "balance_loss_clip": 1.02026653, "balance_loss_mlp": 1.01977158, "epoch": 0.376491808206824, "flos": 34203046515840.0, "grad_norm": 1.6210896241644364, "language_loss": 0.81544989, "learning_rate": 2.867341369804132e-06, "loss": 0.8366558, "num_input_tokens_seen": 134422875, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.46875, "step": 6262, "time_per_iteration": 2.536407470703125 }, { "auxiliary_loss_clip": 0.01065226, "auxiliary_loss_mlp": 0.01052748, "balance_loss_clip": 1.01981068, "balance_loss_mlp": 1.02061236, "epoch": 0.37655193145949195, "flos": 35184315588480.0, "grad_norm": 1.6092611316804604, "language_loss": 0.81939054, "learning_rate": 2.866990420563998e-06, "loss": 0.84057027, "num_input_tokens_seen": 134443025, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.4453125, "step": 6263, "time_per_iteration": 2.532015085220337 }, { "auxiliary_loss_clip": 0.01067178, "auxiliary_loss_mlp": 0.01056067, "balance_loss_clip": 1.0240953, "balance_loss_mlp": 1.02186108, "epoch": 0.3766120547121599, "flos": 16760962667520.0, "grad_norm": 1.795921989462818, "language_loss": 0.81230104, "learning_rate": 2.866639438447501e-06, "loss": 0.83353353, "num_input_tokens_seen": 134460945, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.453125, "step": 6264, "time_per_iteration": 2.4235222339630127 }, { "auxiliary_loss_clip": 0.01064574, "auxiliary_loss_mlp": 0.01051125, "balance_loss_clip": 1.01877165, "balance_loss_mlp": 1.01965404, "epoch": 0.3766721779648279, "flos": 23549230247040.0, "grad_norm": 1.9345790281617024, "language_loss": 0.75485885, "learning_rate": 2.8662884234679497e-06, "loss": 0.77601588, "num_input_tokens_seen": 134480440, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.44921875, "step": 6265, "time_per_iteration": 2.3970727920532227 }, { "auxiliary_loss_clip": 0.01065541, "auxiliary_loss_mlp": 0.01055233, "balance_loss_clip": 1.02538347, "balance_loss_mlp": 1.02141476, "epoch": 0.37673230121749585, "flos": 29128001996160.0, "grad_norm": 1.652327539800267, "language_loss": 0.69216037, "learning_rate": 2.865937375638654e-06, "loss": 0.71336812, "num_input_tokens_seen": 134501110, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.44140625, "step": 6266, "time_per_iteration": 3.908853530883789 }, { "auxiliary_loss_clip": 0.01069156, "auxiliary_loss_mlp": 0.01054988, "balance_loss_clip": 1.02034581, "balance_loss_mlp": 1.02082372, "epoch": 0.3767924244701638, "flos": 28145545937280.0, "grad_norm": 2.864214598475721, "language_loss": 0.64740586, "learning_rate": 2.8655862949729264e-06, "loss": 0.66864735, "num_input_tokens_seen": 134522460, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.484375, "step": 6267, "time_per_iteration": 3.8403568267822266 }, { "auxiliary_loss_clip": 0.01013508, "auxiliary_loss_mlp": 0.0100422, "balance_loss_clip": 1.00123954, "balance_loss_mlp": 1.00409913, "epoch": 0.37685254772283183, "flos": 60794153723520.0, "grad_norm": 0.7207785014702951, "language_loss": 0.58892345, "learning_rate": 2.8652351814840795e-06, "loss": 0.6091007, "num_input_tokens_seen": 134589545, "router_z_loss_clip": 0.02978516, "router_z_loss_mlp": 0.09423828, "step": 6268, "time_per_iteration": 3.1662817001342773 }, { "auxiliary_loss_clip": 0.01065428, "auxiliary_loss_mlp": 0.01051691, "balance_loss_clip": 1.01833642, "balance_loss_mlp": 1.01957476, "epoch": 0.3769126709754998, "flos": 26031310467840.0, "grad_norm": 1.575821130084188, "language_loss": 0.65905333, "learning_rate": 2.8648840351854283e-06, "loss": 0.68022454, "num_input_tokens_seen": 134610550, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.45898438, "step": 6269, "time_per_iteration": 2.4698076248168945 }, { "auxiliary_loss_clip": 0.01065254, "auxiliary_loss_mlp": 0.01056281, "balance_loss_clip": 1.02290201, "balance_loss_mlp": 1.02268052, "epoch": 0.37697279422816776, "flos": 23578941680640.0, "grad_norm": 1.7048816638798925, "language_loss": 0.72268957, "learning_rate": 2.8645328560902874e-06, "loss": 0.74390495, "num_input_tokens_seen": 134630485, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.42578125, "step": 6270, "time_per_iteration": 2.4572460651397705 }, { "auxiliary_loss_clip": 0.01012341, "auxiliary_loss_mlp": 0.01008164, "balance_loss_clip": 1.00520718, "balance_loss_mlp": 1.00295258, "epoch": 0.3770329174808357, "flos": 64742550802560.0, "grad_norm": 0.6989844473959727, "language_loss": 0.56116152, "learning_rate": 2.8641816442119746e-06, "loss": 0.5813666, "num_input_tokens_seen": 134693510, "router_z_loss_clip": 0.02954102, "router_z_loss_mlp": 0.09375, "step": 6271, "time_per_iteration": 4.364473819732666 }, { "auxiliary_loss_clip": 0.01062511, "auxiliary_loss_mlp": 0.01049644, "balance_loss_clip": 1.0158366, "balance_loss_mlp": 1.01886916, "epoch": 0.3770930407335037, "flos": 21834226707840.0, "grad_norm": 1.829100297405524, "language_loss": 0.80801922, "learning_rate": 2.8638303995638066e-06, "loss": 0.82914078, "num_input_tokens_seen": 134713115, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.4375, "step": 6272, "time_per_iteration": 3.8443620204925537 }, { "auxiliary_loss_clip": 0.01063756, "auxiliary_loss_mlp": 0.01045077, "balance_loss_clip": 1.0154655, "balance_loss_mlp": 1.02059281, "epoch": 0.37715316398617166, "flos": 22746786491520.0, "grad_norm": 1.7378585241649005, "language_loss": 0.75885665, "learning_rate": 2.863479122159103e-06, "loss": 0.77994502, "num_input_tokens_seen": 134732635, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.43164062, "step": 6273, "time_per_iteration": 2.4371728897094727 }, { "auxiliary_loss_clip": 0.01066972, "auxiliary_loss_mlp": 0.01053564, "balance_loss_clip": 1.02289104, "balance_loss_mlp": 1.0222609, "epoch": 0.3772132872388396, "flos": 18913637410560.0, "grad_norm": 1.4299582561236608, "language_loss": 0.72915286, "learning_rate": 2.8631278120111858e-06, "loss": 0.75035816, "num_input_tokens_seen": 134750695, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.44726562, "step": 6274, "time_per_iteration": 2.369401454925537 }, { "auxiliary_loss_clip": 0.01067994, "auxiliary_loss_mlp": 0.01049089, "balance_loss_clip": 1.01709306, "balance_loss_mlp": 1.0217042, "epoch": 0.3772734104915076, "flos": 17345303959680.0, "grad_norm": 2.1638136669224934, "language_loss": 0.85311675, "learning_rate": 2.8627764691333742e-06, "loss": 0.87428761, "num_input_tokens_seen": 134768935, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.46289062, "step": 6275, "time_per_iteration": 2.3967461585998535 }, { "auxiliary_loss_clip": 0.01065522, "auxiliary_loss_mlp": 0.01037064, "balance_loss_clip": 1.01074302, "balance_loss_mlp": 1.02305365, "epoch": 0.37733353374417555, "flos": 32341023774720.0, "grad_norm": 1.4934556151906513, "language_loss": 0.76426077, "learning_rate": 2.8624250935389935e-06, "loss": 0.78528666, "num_input_tokens_seen": 134791260, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.42382812, "step": 6276, "time_per_iteration": 2.4768617153167725 }, { "auxiliary_loss_clip": 0.01068419, "auxiliary_loss_mlp": 0.0104794, "balance_loss_clip": 1.014943, "balance_loss_mlp": 1.02219319, "epoch": 0.3773936569968435, "flos": 23359756965120.0, "grad_norm": 1.9532391330712027, "language_loss": 0.87457734, "learning_rate": 2.862073685241366e-06, "loss": 0.89574099, "num_input_tokens_seen": 134808350, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.46289062, "step": 6277, "time_per_iteration": 2.4229183197021484 }, { "auxiliary_loss_clip": 0.01066599, "auxiliary_loss_mlp": 0.01043107, "balance_loss_clip": 1.01323342, "balance_loss_mlp": 1.02369452, "epoch": 0.3774537802495115, "flos": 21465823374720.0, "grad_norm": 1.8100537727044947, "language_loss": 0.79400361, "learning_rate": 2.861722244253818e-06, "loss": 0.81510067, "num_input_tokens_seen": 134826005, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.4296875, "step": 6278, "time_per_iteration": 2.3939003944396973 }, { "auxiliary_loss_clip": 0.01068895, "auxiliary_loss_mlp": 0.01048901, "balance_loss_clip": 1.01642823, "balance_loss_mlp": 1.02259314, "epoch": 0.37751390350217945, "flos": 24972534443520.0, "grad_norm": 1.6771018995456737, "language_loss": 0.84281611, "learning_rate": 2.8613707705896767e-06, "loss": 0.86399406, "num_input_tokens_seen": 134844995, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.46289062, "step": 6279, "time_per_iteration": 2.4570884704589844 }, { "auxiliary_loss_clip": 0.010672, "auxiliary_loss_mlp": 0.01049945, "balance_loss_clip": 1.02082193, "balance_loss_mlp": 1.02332246, "epoch": 0.3775740267548474, "flos": 27817851116160.0, "grad_norm": 2.1072148208776853, "language_loss": 0.75894618, "learning_rate": 2.861019264262269e-06, "loss": 0.78011763, "num_input_tokens_seen": 134865285, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.4375, "step": 6280, "time_per_iteration": 2.4425361156463623 }, { "auxiliary_loss_clip": 0.01065651, "auxiliary_loss_mlp": 0.01049658, "balance_loss_clip": 1.02071357, "balance_loss_mlp": 1.02315283, "epoch": 0.3776341500075154, "flos": 22564120924800.0, "grad_norm": 1.5529264121633932, "language_loss": 0.76904577, "learning_rate": 2.8606677252849242e-06, "loss": 0.79019886, "num_input_tokens_seen": 134886535, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.42578125, "step": 6281, "time_per_iteration": 2.44450306892395 }, { "auxiliary_loss_clip": 0.01067442, "auxiliary_loss_mlp": 0.0104868, "balance_loss_clip": 1.01787674, "balance_loss_mlp": 1.02366865, "epoch": 0.3776942732601834, "flos": 23076087235200.0, "grad_norm": 1.4406665332336894, "language_loss": 0.85086787, "learning_rate": 2.860316153670974e-06, "loss": 0.87202913, "num_input_tokens_seen": 134907435, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4375, "step": 6282, "time_per_iteration": 2.4288699626922607 }, { "auxiliary_loss_clip": 0.0106464, "auxiliary_loss_mlp": 0.01046487, "balance_loss_clip": 1.01620793, "balance_loss_mlp": 1.02193642, "epoch": 0.37775439651285136, "flos": 21723377541120.0, "grad_norm": 1.784319296612173, "language_loss": 0.70224154, "learning_rate": 2.8599645494337484e-06, "loss": 0.72335279, "num_input_tokens_seen": 134925360, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.42578125, "step": 6283, "time_per_iteration": 2.438530445098877 }, { "auxiliary_loss_clip": 0.01065457, "auxiliary_loss_mlp": 0.01055359, "balance_loss_clip": 1.02285075, "balance_loss_mlp": 1.02174926, "epoch": 0.37781451976551933, "flos": 23986622160000.0, "grad_norm": 1.62608186544779, "language_loss": 0.77940738, "learning_rate": 2.859612912586581e-06, "loss": 0.80061555, "num_input_tokens_seen": 134944205, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.4375, "step": 6284, "time_per_iteration": 2.4105803966522217 }, { "auxiliary_loss_clip": 0.01070207, "auxiliary_loss_mlp": 0.01053734, "balance_loss_clip": 1.01964045, "balance_loss_mlp": 1.02249491, "epoch": 0.3778746430181873, "flos": 13727324787840.0, "grad_norm": 2.07146892832885, "language_loss": 0.87627763, "learning_rate": 2.8592612431428055e-06, "loss": 0.89751709, "num_input_tokens_seen": 134960255, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.4765625, "step": 6285, "time_per_iteration": 2.3789777755737305 }, { "auxiliary_loss_clip": 0.01066609, "auxiliary_loss_mlp": 0.01054929, "balance_loss_clip": 1.01975036, "balance_loss_mlp": 1.02094889, "epoch": 0.37793476627085526, "flos": 19459574340480.0, "grad_norm": 1.8795040463874788, "language_loss": 0.85952216, "learning_rate": 2.858909541115758e-06, "loss": 0.88073754, "num_input_tokens_seen": 134978605, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.45703125, "step": 6286, "time_per_iteration": 2.3936920166015625 }, { "auxiliary_loss_clip": 0.01064656, "auxiliary_loss_mlp": 0.01055532, "balance_loss_clip": 1.02413249, "balance_loss_mlp": 1.0206778, "epoch": 0.3779948895235232, "flos": 10706254997760.0, "grad_norm": 1.9317425163782684, "language_loss": 0.83554327, "learning_rate": 2.858557806518775e-06, "loss": 0.85674518, "num_input_tokens_seen": 134995020, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.43945312, "step": 6287, "time_per_iteration": 2.387052297592163 }, { "auxiliary_loss_clip": 0.01065703, "auxiliary_loss_mlp": 0.01054144, "balance_loss_clip": 1.02244627, "balance_loss_mlp": 1.02066159, "epoch": 0.3780550127761912, "flos": 22308905819520.0, "grad_norm": 2.225176955727695, "language_loss": 0.74023616, "learning_rate": 2.8582060393651927e-06, "loss": 0.76143461, "num_input_tokens_seen": 135012620, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.44921875, "step": 6288, "time_per_iteration": 2.410839557647705 }, { "auxiliary_loss_clip": 0.01066777, "auxiliary_loss_mlp": 0.01047668, "balance_loss_clip": 1.01618457, "balance_loss_mlp": 1.02139544, "epoch": 0.37811513602885916, "flos": 28949351235840.0, "grad_norm": 1.7953625808178562, "language_loss": 0.76793075, "learning_rate": 2.857854239668352e-06, "loss": 0.78907514, "num_input_tokens_seen": 135033365, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.45507812, "step": 6289, "time_per_iteration": 2.4465088844299316 }, { "auxiliary_loss_clip": 0.01066954, "auxiliary_loss_mlp": 0.01051539, "balance_loss_clip": 1.01999593, "balance_loss_mlp": 1.02171993, "epoch": 0.3781752592815271, "flos": 23111803422720.0, "grad_norm": 1.7761964396691463, "language_loss": 0.75377709, "learning_rate": 2.857502407441593e-06, "loss": 0.77496207, "num_input_tokens_seen": 135052185, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.453125, "step": 6290, "time_per_iteration": 2.4401636123657227 }, { "auxiliary_loss_clip": 0.0106696, "auxiliary_loss_mlp": 0.010547, "balance_loss_clip": 1.01850796, "balance_loss_mlp": 1.0200181, "epoch": 0.3782353825341951, "flos": 19754904464640.0, "grad_norm": 3.465297990544801, "language_loss": 0.8114723, "learning_rate": 2.8571505426982566e-06, "loss": 0.83268893, "num_input_tokens_seen": 135070425, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.46875, "step": 6291, "time_per_iteration": 2.3626456260681152 }, { "auxiliary_loss_clip": 0.01065365, "auxiliary_loss_mlp": 0.01055074, "balance_loss_clip": 1.01983547, "balance_loss_mlp": 1.01926041, "epoch": 0.37829550578686305, "flos": 22049850464640.0, "grad_norm": 1.8276864339882875, "language_loss": 0.77511525, "learning_rate": 2.8567986454516854e-06, "loss": 0.79631966, "num_input_tokens_seen": 135090525, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.4609375, "step": 6292, "time_per_iteration": 2.4162251949310303 }, { "auxiliary_loss_clip": 0.0106677, "auxiliary_loss_mlp": 0.01056524, "balance_loss_clip": 1.02104688, "balance_loss_mlp": 1.0207665, "epoch": 0.378355629039531, "flos": 16469472792960.0, "grad_norm": 2.007667866964689, "language_loss": 0.71129817, "learning_rate": 2.856446715715224e-06, "loss": 0.73253107, "num_input_tokens_seen": 135109575, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.45898438, "step": 6293, "time_per_iteration": 2.351641893386841 }, { "auxiliary_loss_clip": 0.0106412, "auxiliary_loss_mlp": 0.0104938, "balance_loss_clip": 1.01719332, "balance_loss_mlp": 1.0199219, "epoch": 0.378415752292199, "flos": 19973809889280.0, "grad_norm": 2.3552797735134123, "language_loss": 0.72523773, "learning_rate": 2.8560947535022173e-06, "loss": 0.74637282, "num_input_tokens_seen": 135127000, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.44140625, "step": 6294, "time_per_iteration": 2.387784481048584 }, { "auxiliary_loss_clip": 0.01068907, "auxiliary_loss_mlp": 0.01052765, "balance_loss_clip": 1.01793194, "balance_loss_mlp": 1.02053368, "epoch": 0.378475875544867, "flos": 14646517729920.0, "grad_norm": 2.8926078395787704, "language_loss": 0.84911764, "learning_rate": 2.855742758826011e-06, "loss": 0.87033439, "num_input_tokens_seen": 135145285, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.484375, "step": 6295, "time_per_iteration": 2.347059488296509 }, { "auxiliary_loss_clip": 0.01067038, "auxiliary_loss_mlp": 0.01051783, "balance_loss_clip": 1.01726007, "balance_loss_mlp": 1.02031887, "epoch": 0.37853599879753497, "flos": 26649796936320.0, "grad_norm": 1.6359105726619436, "language_loss": 0.72792768, "learning_rate": 2.8553907316999547e-06, "loss": 0.74911594, "num_input_tokens_seen": 135165240, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.46679688, "step": 6296, "time_per_iteration": 2.4955921173095703 }, { "auxiliary_loss_clip": 0.01064683, "auxiliary_loss_mlp": 0.01053214, "balance_loss_clip": 1.02202868, "balance_loss_mlp": 1.02049005, "epoch": 0.37859612205020293, "flos": 17310984226560.0, "grad_norm": 1.9054788308708104, "language_loss": 0.78887182, "learning_rate": 2.855038672137396e-06, "loss": 0.81005079, "num_input_tokens_seen": 135184045, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.44140625, "step": 6297, "time_per_iteration": 2.367610454559326 }, { "auxiliary_loss_clip": 0.010675, "auxiliary_loss_mlp": 0.01055963, "balance_loss_clip": 1.02229857, "balance_loss_mlp": 1.02071762, "epoch": 0.3786562453028709, "flos": 18219494292480.0, "grad_norm": 1.6094739209044648, "language_loss": 0.80276668, "learning_rate": 2.854686580151684e-06, "loss": 0.82400131, "num_input_tokens_seen": 135202365, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.46875, "step": 6298, "time_per_iteration": 2.3968329429626465 }, { "auxiliary_loss_clip": 0.01064484, "auxiliary_loss_mlp": 0.01054407, "balance_loss_clip": 1.02176762, "balance_loss_mlp": 1.02078462, "epoch": 0.37871636855553886, "flos": 21213820114560.0, "grad_norm": 1.6364420352955638, "language_loss": 0.85778964, "learning_rate": 2.8543344557561722e-06, "loss": 0.87897861, "num_input_tokens_seen": 135220955, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.4375, "step": 6299, "time_per_iteration": 2.381861448287964 }, { "auxiliary_loss_clip": 0.01067072, "auxiliary_loss_mlp": 0.0105289, "balance_loss_clip": 1.01812816, "balance_loss_mlp": 1.02078271, "epoch": 0.3787764918082068, "flos": 20951867116800.0, "grad_norm": 2.0093958790635846, "language_loss": 0.77738166, "learning_rate": 2.8539822989642116e-06, "loss": 0.7985813, "num_input_tokens_seen": 135239715, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.46289062, "step": 6300, "time_per_iteration": 2.410543918609619 }, { "auxiliary_loss_clip": 0.01068992, "auxiliary_loss_mlp": 0.01061714, "balance_loss_clip": 1.02251828, "balance_loss_mlp": 1.02061212, "epoch": 0.3788366150608748, "flos": 17307143976960.0, "grad_norm": 2.514136677066058, "language_loss": 0.84479249, "learning_rate": 2.8536301097891577e-06, "loss": 0.86609948, "num_input_tokens_seen": 135257035, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 0.484375, "step": 6301, "time_per_iteration": 2.3485982418060303 }, { "auxiliary_loss_clip": 0.01066779, "auxiliary_loss_mlp": 0.0105212, "balance_loss_clip": 1.01843143, "balance_loss_mlp": 1.02045894, "epoch": 0.37889673831354276, "flos": 24310092706560.0, "grad_norm": 1.729840790087598, "language_loss": 0.69753867, "learning_rate": 2.8532778882443636e-06, "loss": 0.71872759, "num_input_tokens_seen": 135275720, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.46289062, "step": 6302, "time_per_iteration": 2.4267046451568604 }, { "auxiliary_loss_clip": 0.01066386, "auxiliary_loss_mlp": 0.01045826, "balance_loss_clip": 1.01418757, "balance_loss_mlp": 1.02226102, "epoch": 0.3789568615662107, "flos": 26682510746880.0, "grad_norm": 2.4602512519607322, "language_loss": 0.69484949, "learning_rate": 2.8529256343431867e-06, "loss": 0.71597165, "num_input_tokens_seen": 135294140, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.44140625, "step": 6303, "time_per_iteration": 2.451859474182129 }, { "auxiliary_loss_clip": 0.01066019, "auxiliary_loss_mlp": 0.01048618, "balance_loss_clip": 1.01633608, "balance_loss_mlp": 1.02030981, "epoch": 0.3790169848188787, "flos": 23584108561920.0, "grad_norm": 2.743955822820057, "language_loss": 0.78649014, "learning_rate": 2.8525733480989846e-06, "loss": 0.8076365, "num_input_tokens_seen": 135314845, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.45703125, "step": 6304, "time_per_iteration": 2.4545810222625732 }, { "auxiliary_loss_clip": 0.01071037, "auxiliary_loss_mlp": 0.01054656, "balance_loss_clip": 1.01782084, "balance_loss_mlp": 1.02212691, "epoch": 0.37907710807154665, "flos": 18436584326400.0, "grad_norm": 2.036611620209349, "language_loss": 0.81321007, "learning_rate": 2.8522210295251146e-06, "loss": 0.83446705, "num_input_tokens_seen": 135333055, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.48828125, "step": 6305, "time_per_iteration": 3.807636260986328 }, { "auxiliary_loss_clip": 0.01016841, "auxiliary_loss_mlp": 0.01021233, "balance_loss_clip": 1.0180856, "balance_loss_mlp": 1.00679088, "epoch": 0.3791372313242146, "flos": 50104411799040.0, "grad_norm": 0.9894978101182651, "language_loss": 0.64617914, "learning_rate": 2.8518686786349387e-06, "loss": 0.66655988, "num_input_tokens_seen": 135387865, "router_z_loss_clip": 0.03149414, "router_z_loss_mlp": 0.10058594, "step": 6306, "time_per_iteration": 2.956925868988037 }, { "auxiliary_loss_clip": 0.01068416, "auxiliary_loss_mlp": 0.01059392, "balance_loss_clip": 1.02300882, "balance_loss_mlp": 1.02123976, "epoch": 0.3791973545768826, "flos": 24315399233280.0, "grad_norm": 1.7029137520996735, "language_loss": 0.74525696, "learning_rate": 2.851516295441817e-06, "loss": 0.76653504, "num_input_tokens_seen": 135409095, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.47265625, "step": 6307, "time_per_iteration": 3.844367504119873 }, { "auxiliary_loss_clip": 0.01070184, "auxiliary_loss_mlp": 0.01054925, "balance_loss_clip": 1.02049732, "balance_loss_mlp": 1.02183902, "epoch": 0.3792574778295506, "flos": 21578837045760.0, "grad_norm": 1.506832752334691, "language_loss": 0.79649723, "learning_rate": 2.851163879959112e-06, "loss": 0.81774831, "num_input_tokens_seen": 135429585, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.48242188, "step": 6308, "time_per_iteration": 2.3809924125671387 }, { "auxiliary_loss_clip": 0.01066287, "auxiliary_loss_mlp": 0.0106189, "balance_loss_clip": 1.02729523, "balance_loss_mlp": 1.02055907, "epoch": 0.37931760108221857, "flos": 22271653532160.0, "grad_norm": 2.0892643323818576, "language_loss": 0.74139762, "learning_rate": 2.8508114322001876e-06, "loss": 0.7626794, "num_input_tokens_seen": 135446320, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.45703125, "step": 6309, "time_per_iteration": 2.409688711166382 }, { "auxiliary_loss_clip": 0.01065771, "auxiliary_loss_mlp": 0.01061145, "balance_loss_clip": 1.02929246, "balance_loss_mlp": 1.02093077, "epoch": 0.37937772433488653, "flos": 19681970901120.0, "grad_norm": 1.5477468705862467, "language_loss": 0.79913521, "learning_rate": 2.8504589521784083e-06, "loss": 0.82040441, "num_input_tokens_seen": 135465720, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.44921875, "step": 6310, "time_per_iteration": 2.382229804992676 }, { "auxiliary_loss_clip": 0.01065702, "auxiliary_loss_mlp": 0.01050081, "balance_loss_clip": 1.01903915, "balance_loss_mlp": 1.02001595, "epoch": 0.3794378475875545, "flos": 19098362747520.0, "grad_norm": 2.8061987353218973, "language_loss": 0.77711737, "learning_rate": 2.8501064399071403e-06, "loss": 0.79827523, "num_input_tokens_seen": 135485155, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.45703125, "step": 6311, "time_per_iteration": 3.869875431060791 }, { "auxiliary_loss_clip": 0.01067398, "auxiliary_loss_mlp": 0.0104899, "balance_loss_clip": 1.01770926, "balance_loss_mlp": 1.02162623, "epoch": 0.37949797084022246, "flos": 20338617352320.0, "grad_norm": 1.4563280323269752, "language_loss": 0.71684313, "learning_rate": 2.8497538953997504e-06, "loss": 0.73800695, "num_input_tokens_seen": 135502675, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.45703125, "step": 6312, "time_per_iteration": 3.8048007488250732 }, { "auxiliary_loss_clip": 0.01019048, "auxiliary_loss_mlp": 0.01005988, "balance_loss_clip": 1.0020299, "balance_loss_mlp": 1.00866461, "epoch": 0.37955809409289043, "flos": 63969050430720.0, "grad_norm": 0.7757654654775705, "language_loss": 0.56200629, "learning_rate": 2.849401318669608e-06, "loss": 0.58225667, "num_input_tokens_seen": 135562005, "router_z_loss_clip": 0.03955078, "router_z_loss_mlp": 0.10400391, "step": 6313, "time_per_iteration": 2.9740657806396484 }, { "auxiliary_loss_clip": 0.01065328, "auxiliary_loss_mlp": 0.01059606, "balance_loss_clip": 1.02927935, "balance_loss_mlp": 1.02100158, "epoch": 0.3796182173455584, "flos": 31539313157760.0, "grad_norm": 1.6808608542223547, "language_loss": 0.72655118, "learning_rate": 2.849048709730083e-06, "loss": 0.74780047, "num_input_tokens_seen": 135582600, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.44335938, "step": 6314, "time_per_iteration": 2.470212697982788 }, { "auxiliary_loss_clip": 0.01069784, "auxiliary_loss_mlp": 0.01060839, "balance_loss_clip": 1.02412236, "balance_loss_mlp": 1.02158046, "epoch": 0.37967834059822636, "flos": 12129978130560.0, "grad_norm": 1.729500347840395, "language_loss": 0.75076461, "learning_rate": 2.848696068594545e-06, "loss": 0.77207088, "num_input_tokens_seen": 135600280, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 0.48242188, "step": 6315, "time_per_iteration": 2.3571832180023193 }, { "auxiliary_loss_clip": 0.01067338, "auxiliary_loss_mlp": 0.0107055, "balance_loss_clip": 1.03812551, "balance_loss_mlp": 1.02247238, "epoch": 0.3797384638508943, "flos": 39347009792640.0, "grad_norm": 1.8409880623758668, "language_loss": 0.72363824, "learning_rate": 2.8483433952763677e-06, "loss": 0.74501717, "num_input_tokens_seen": 135621560, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.44921875, "step": 6316, "time_per_iteration": 2.5326409339904785 }, { "auxiliary_loss_clip": 0.01067257, "auxiliary_loss_mlp": 0.01060036, "balance_loss_clip": 1.02808833, "balance_loss_mlp": 1.02164793, "epoch": 0.3797985871035623, "flos": 34052710734720.0, "grad_norm": 1.6586920180608467, "language_loss": 0.66374815, "learning_rate": 2.847990689788923e-06, "loss": 0.6850211, "num_input_tokens_seen": 135641745, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.45703125, "step": 6317, "time_per_iteration": 2.4765305519104004 }, { "auxiliary_loss_clip": 0.0106471, "auxiliary_loss_mlp": 0.01057707, "balance_loss_clip": 1.02616405, "balance_loss_mlp": 1.0201869, "epoch": 0.37985871035623026, "flos": 23221046666880.0, "grad_norm": 1.9335744307232055, "language_loss": 0.86988914, "learning_rate": 2.8476379521455877e-06, "loss": 0.89111328, "num_input_tokens_seen": 135660650, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.4453125, "step": 6318, "time_per_iteration": 2.4248201847076416 }, { "auxiliary_loss_clip": 0.01068041, "auxiliary_loss_mlp": 0.01062417, "balance_loss_clip": 1.02798951, "balance_loss_mlp": 1.02132106, "epoch": 0.3799188336088982, "flos": 18113951652480.0, "grad_norm": 1.9416959683633455, "language_loss": 0.78793848, "learning_rate": 2.8472851823597354e-06, "loss": 0.80924296, "num_input_tokens_seen": 135679980, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.46875, "step": 6319, "time_per_iteration": 2.3741204738616943 }, { "auxiliary_loss_clip": 0.01067666, "auxiliary_loss_mlp": 0.01068127, "balance_loss_clip": 1.03632212, "balance_loss_mlp": 1.02185655, "epoch": 0.3799789568615662, "flos": 21870815679360.0, "grad_norm": 4.013086761829991, "language_loss": 0.65197408, "learning_rate": 2.846932380444744e-06, "loss": 0.67333198, "num_input_tokens_seen": 135699400, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.45898438, "step": 6320, "time_per_iteration": 2.4420294761657715 }, { "auxiliary_loss_clip": 0.01066248, "auxiliary_loss_mlp": 0.0105637, "balance_loss_clip": 1.02528059, "balance_loss_mlp": 1.02136958, "epoch": 0.3800390801142342, "flos": 32961570013440.0, "grad_norm": 1.80919709956783, "language_loss": 0.72368693, "learning_rate": 2.846579546413992e-06, "loss": 0.7449131, "num_input_tokens_seen": 135723455, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.44921875, "step": 6321, "time_per_iteration": 2.4924166202545166 }, { "auxiliary_loss_clip": 0.01070213, "auxiliary_loss_mlp": 0.01053551, "balance_loss_clip": 1.01955295, "balance_loss_mlp": 1.02235079, "epoch": 0.38009920336690217, "flos": 26905849914240.0, "grad_norm": 1.8886251520325552, "language_loss": 0.76338959, "learning_rate": 2.846226680280859e-06, "loss": 0.7846272, "num_input_tokens_seen": 135744335, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.47851562, "step": 6322, "time_per_iteration": 2.467200517654419 }, { "auxiliary_loss_clip": 0.01066934, "auxiliary_loss_mlp": 0.01049909, "balance_loss_clip": 1.0186044, "balance_loss_mlp": 1.02279329, "epoch": 0.38015932661957014, "flos": 22487905693440.0, "grad_norm": 2.3352981935432036, "language_loss": 0.86752677, "learning_rate": 2.845873782058725e-06, "loss": 0.88869512, "num_input_tokens_seen": 135761440, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.44140625, "step": 6323, "time_per_iteration": 2.391192674636841 }, { "auxiliary_loss_clip": 0.01070798, "auxiliary_loss_mlp": 0.01053538, "balance_loss_clip": 1.01794207, "balance_loss_mlp": 1.02416015, "epoch": 0.3802194498722381, "flos": 21979919278080.0, "grad_norm": 1.8540548598746636, "language_loss": 0.7446003, "learning_rate": 2.845520851760973e-06, "loss": 0.76584363, "num_input_tokens_seen": 135779955, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.46679688, "step": 6324, "time_per_iteration": 2.4184179306030273 }, { "auxiliary_loss_clip": 0.0107192, "auxiliary_loss_mlp": 0.01044339, "balance_loss_clip": 1.01196134, "balance_loss_mlp": 1.02524018, "epoch": 0.38027957312490607, "flos": 21323796497280.0, "grad_norm": 1.642924366701546, "language_loss": 0.85864735, "learning_rate": 2.8451678894009847e-06, "loss": 0.87980992, "num_input_tokens_seen": 135799840, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.46679688, "step": 6325, "time_per_iteration": 2.4248766899108887 }, { "auxiliary_loss_clip": 0.01070572, "auxiliary_loss_mlp": 0.01047374, "balance_loss_clip": 1.01733363, "balance_loss_mlp": 1.02503896, "epoch": 0.38033969637757403, "flos": 16690298342400.0, "grad_norm": 1.7444616707184535, "language_loss": 0.81557178, "learning_rate": 2.8448148949921465e-06, "loss": 0.83675122, "num_input_tokens_seen": 135817880, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.45703125, "step": 6326, "time_per_iteration": 2.4223783016204834 }, { "auxiliary_loss_clip": 0.01070561, "auxiliary_loss_mlp": 0.01054069, "balance_loss_clip": 1.02209687, "balance_loss_mlp": 1.0242914, "epoch": 0.380399819630242, "flos": 36209365372800.0, "grad_norm": 1.8677358331334701, "language_loss": 0.7480104, "learning_rate": 2.844461868547842e-06, "loss": 0.76925671, "num_input_tokens_seen": 135838940, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.46289062, "step": 6327, "time_per_iteration": 2.545905590057373 }, { "auxiliary_loss_clip": 0.01069646, "auxiliary_loss_mlp": 0.01053487, "balance_loss_clip": 1.02089477, "balance_loss_mlp": 1.02468538, "epoch": 0.38045994288290996, "flos": 21287766107520.0, "grad_norm": 1.5200753606149444, "language_loss": 0.8396244, "learning_rate": 2.844108810081459e-06, "loss": 0.8608557, "num_input_tokens_seen": 135858325, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.44921875, "step": 6328, "time_per_iteration": 2.4522266387939453 }, { "auxiliary_loss_clip": 0.0106837, "auxiliary_loss_mlp": 0.01046664, "balance_loss_clip": 1.0169692, "balance_loss_mlp": 1.02366161, "epoch": 0.38052006613557793, "flos": 20921841480960.0, "grad_norm": 1.7701961905439585, "language_loss": 0.62636852, "learning_rate": 2.843755719606385e-06, "loss": 0.64751887, "num_input_tokens_seen": 135878430, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.44726562, "step": 6329, "time_per_iteration": 2.403658151626587 }, { "auxiliary_loss_clip": 0.01069017, "auxiliary_loss_mlp": 0.01054443, "balance_loss_clip": 1.02220857, "balance_loss_mlp": 1.0234493, "epoch": 0.3805801893882459, "flos": 20989817631360.0, "grad_norm": 1.818396707629304, "language_loss": 0.56721675, "learning_rate": 2.8434025971360104e-06, "loss": 0.58845139, "num_input_tokens_seen": 135894755, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.45507812, "step": 6330, "time_per_iteration": 2.4120686054229736 }, { "auxiliary_loss_clip": 0.0106849, "auxiliary_loss_mlp": 0.01051497, "balance_loss_clip": 1.02301836, "balance_loss_mlp": 1.02549982, "epoch": 0.38064031264091386, "flos": 25557364494720.0, "grad_norm": 5.9765305694747735, "language_loss": 0.66909832, "learning_rate": 2.8430494426837243e-06, "loss": 0.6902982, "num_input_tokens_seen": 135918275, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4296875, "step": 6331, "time_per_iteration": 2.466381549835205 }, { "auxiliary_loss_clip": 0.01070467, "auxiliary_loss_mlp": 0.01064851, "balance_loss_clip": 1.03094769, "balance_loss_mlp": 1.02485907, "epoch": 0.3807004358935818, "flos": 15084956983680.0, "grad_norm": 1.5624522512316206, "language_loss": 0.77715003, "learning_rate": 2.842696256262919e-06, "loss": 0.79850316, "num_input_tokens_seen": 135937430, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.45703125, "step": 6332, "time_per_iteration": 2.4440338611602783 }, { "auxiliary_loss_clip": 0.01068318, "auxiliary_loss_mlp": 0.01054128, "balance_loss_clip": 1.02175117, "balance_loss_mlp": 1.02208877, "epoch": 0.3807605591462498, "flos": 16398459354240.0, "grad_norm": 1.7021448613142254, "language_loss": 0.83089787, "learning_rate": 2.842343037886987e-06, "loss": 0.85212231, "num_input_tokens_seen": 135954210, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.46289062, "step": 6333, "time_per_iteration": 2.412381172180176 }, { "auxiliary_loss_clip": 0.01065961, "auxiliary_loss_mlp": 0.0105219, "balance_loss_clip": 1.02106452, "balance_loss_mlp": 1.02142429, "epoch": 0.3808206823989178, "flos": 29055871393920.0, "grad_norm": 2.0185880754559298, "language_loss": 0.87206191, "learning_rate": 2.8419897875693226e-06, "loss": 0.89324343, "num_input_tokens_seen": 135974425, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.4453125, "step": 6334, "time_per_iteration": 2.506929874420166 }, { "auxiliary_loss_clip": 0.01067474, "auxiliary_loss_mlp": 0.01053722, "balance_loss_clip": 1.02126169, "balance_loss_mlp": 1.0223372, "epoch": 0.3808808056515858, "flos": 15704944640640.0, "grad_norm": 1.7016586030709253, "language_loss": 0.80686384, "learning_rate": 2.841636505323321e-06, "loss": 0.82807589, "num_input_tokens_seen": 135991985, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.45117188, "step": 6335, "time_per_iteration": 2.3895583152770996 }, { "auxiliary_loss_clip": 0.01067994, "auxiliary_loss_mlp": 0.01047479, "balance_loss_clip": 1.01454115, "balance_loss_mlp": 1.02215719, "epoch": 0.38094092890425374, "flos": 20703529549440.0, "grad_norm": 1.8434398832748637, "language_loss": 0.74351168, "learning_rate": 2.8412831911623795e-06, "loss": 0.76466638, "num_input_tokens_seen": 136010015, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.45703125, "step": 6336, "time_per_iteration": 2.4231884479522705 }, { "auxiliary_loss_clip": 0.01064282, "auxiliary_loss_mlp": 0.01051236, "balance_loss_clip": 1.02179146, "balance_loss_mlp": 1.020666, "epoch": 0.3810010521569217, "flos": 20666905666560.0, "grad_norm": 3.390208048128217, "language_loss": 0.7059328, "learning_rate": 2.840929845099894e-06, "loss": 0.72708797, "num_input_tokens_seen": 136028440, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.43554688, "step": 6337, "time_per_iteration": 2.376833438873291 }, { "auxiliary_loss_clip": 0.01065661, "auxiliary_loss_mlp": 0.01048866, "balance_loss_clip": 1.01853919, "balance_loss_mlp": 1.02183032, "epoch": 0.38106117540958967, "flos": 31826404200960.0, "grad_norm": 1.9207580719580042, "language_loss": 0.64978153, "learning_rate": 2.8405764671492652e-06, "loss": 0.67092681, "num_input_tokens_seen": 136048360, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.4375, "step": 6338, "time_per_iteration": 2.503746747970581 }, { "auxiliary_loss_clip": 0.01066746, "auxiliary_loss_mlp": 0.0105043, "balance_loss_clip": 1.01907802, "balance_loss_mlp": 1.02170396, "epoch": 0.38112129866225763, "flos": 16902012026880.0, "grad_norm": 2.1501939640328933, "language_loss": 0.70759183, "learning_rate": 2.8402230573238923e-06, "loss": 0.72876358, "num_input_tokens_seen": 136065500, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.44921875, "step": 6339, "time_per_iteration": 2.398677349090576 }, { "auxiliary_loss_clip": 0.01066686, "auxiliary_loss_mlp": 0.01048542, "balance_loss_clip": 1.01809633, "balance_loss_mlp": 1.02118564, "epoch": 0.3811814219149256, "flos": 20886160204800.0, "grad_norm": 2.252975005128591, "language_loss": 0.70024508, "learning_rate": 2.839869615637177e-06, "loss": 0.7213974, "num_input_tokens_seen": 136084060, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.45507812, "step": 6340, "time_per_iteration": 2.4322092533111572 }, { "auxiliary_loss_clip": 0.0106751, "auxiliary_loss_mlp": 0.01047294, "balance_loss_clip": 1.01544166, "balance_loss_mlp": 1.0207907, "epoch": 0.38124154516759357, "flos": 16689879406080.0, "grad_norm": 1.9345164811928526, "language_loss": 0.90834975, "learning_rate": 2.839516142102522e-06, "loss": 0.92949778, "num_input_tokens_seen": 136102310, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.46679688, "step": 6341, "time_per_iteration": 2.4129791259765625 }, { "auxiliary_loss_clip": 0.01067292, "auxiliary_loss_mlp": 0.01048122, "balance_loss_clip": 1.01705551, "balance_loss_mlp": 1.02125537, "epoch": 0.38130166842026153, "flos": 19680958471680.0, "grad_norm": 1.6718687298097696, "language_loss": 0.76592928, "learning_rate": 2.83916263673333e-06, "loss": 0.78708345, "num_input_tokens_seen": 136120725, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.4609375, "step": 6342, "time_per_iteration": 2.4632701873779297 }, { "auxiliary_loss_clip": 0.01064351, "auxiliary_loss_mlp": 0.01050392, "balance_loss_clip": 1.01954079, "balance_loss_mlp": 1.01911342, "epoch": 0.3813617916729295, "flos": 22197393336960.0, "grad_norm": 1.7168407437481723, "language_loss": 0.8495968, "learning_rate": 2.838809099543007e-06, "loss": 0.87074423, "num_input_tokens_seen": 136139105, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.453125, "step": 6343, "time_per_iteration": 2.4077558517456055 }, { "auxiliary_loss_clip": 0.01064875, "auxiliary_loss_mlp": 0.01050682, "balance_loss_clip": 1.02037871, "balance_loss_mlp": 1.0199101, "epoch": 0.38142191492559746, "flos": 19095953863680.0, "grad_norm": 1.4913530285958556, "language_loss": 0.77976036, "learning_rate": 2.838455530544959e-06, "loss": 0.80091596, "num_input_tokens_seen": 136158265, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.44921875, "step": 6344, "time_per_iteration": 2.401063919067383 }, { "auxiliary_loss_clip": 0.01066349, "auxiliary_loss_mlp": 0.01049828, "balance_loss_clip": 1.01799917, "balance_loss_mlp": 1.02163112, "epoch": 0.3814820381782654, "flos": 24096598542720.0, "grad_norm": 2.140447811961153, "language_loss": 0.75411963, "learning_rate": 2.838101929752593e-06, "loss": 0.77528143, "num_input_tokens_seen": 136176100, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.44726562, "step": 6345, "time_per_iteration": 3.8701794147491455 }, { "auxiliary_loss_clip": 0.01065358, "auxiliary_loss_mlp": 0.01045182, "balance_loss_clip": 1.01690507, "balance_loss_mlp": 1.0220145, "epoch": 0.3815421614309334, "flos": 15777598913280.0, "grad_norm": 3.3563392758260253, "language_loss": 0.71975738, "learning_rate": 2.8377482971793187e-06, "loss": 0.74086279, "num_input_tokens_seen": 136195125, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.43359375, "step": 6346, "time_per_iteration": 3.846802234649658 }, { "auxiliary_loss_clip": 0.01067732, "auxiliary_loss_mlp": 0.01044372, "balance_loss_clip": 1.01408148, "balance_loss_mlp": 1.02243328, "epoch": 0.38160228468360136, "flos": 19898781644160.0, "grad_norm": 1.8221601250934318, "language_loss": 0.76291704, "learning_rate": 2.8373946328385437e-06, "loss": 0.78403813, "num_input_tokens_seen": 136213885, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.453125, "step": 6347, "time_per_iteration": 2.411039113998413 }, { "auxiliary_loss_clip": 0.01067187, "auxiliary_loss_mlp": 0.01052115, "balance_loss_clip": 1.02054811, "balance_loss_mlp": 1.02209723, "epoch": 0.3816624079362694, "flos": 19280050796160.0, "grad_norm": 1.6194568762565813, "language_loss": 0.75829351, "learning_rate": 2.8370409367436813e-06, "loss": 0.77948654, "num_input_tokens_seen": 136232700, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.45117188, "step": 6348, "time_per_iteration": 2.382446765899658 }, { "auxiliary_loss_clip": 0.01066113, "auxiliary_loss_mlp": 0.01044759, "balance_loss_clip": 1.0140264, "balance_loss_mlp": 1.02146482, "epoch": 0.38172253118893734, "flos": 21176532915840.0, "grad_norm": 1.827558476096686, "language_loss": 0.88350892, "learning_rate": 2.836687208908142e-06, "loss": 0.90461761, "num_input_tokens_seen": 136248975, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.4453125, "step": 6349, "time_per_iteration": 2.384216785430908 }, { "auxiliary_loss_clip": 0.01065671, "auxiliary_loss_mlp": 0.01055158, "balance_loss_clip": 1.02264929, "balance_loss_mlp": 1.02216959, "epoch": 0.3817826544416053, "flos": 17528283728640.0, "grad_norm": 1.8437953100507407, "language_loss": 0.78058434, "learning_rate": 2.836333449345341e-06, "loss": 0.80179262, "num_input_tokens_seen": 136266710, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.43359375, "step": 6350, "time_per_iteration": 3.775536298751831 }, { "auxiliary_loss_clip": 0.01068095, "auxiliary_loss_mlp": 0.01048643, "balance_loss_clip": 1.01800644, "balance_loss_mlp": 1.02307534, "epoch": 0.38184277769427327, "flos": 16325595613440.0, "grad_norm": 2.028221769568398, "language_loss": 0.76954812, "learning_rate": 2.8359796580686907e-06, "loss": 0.79071558, "num_input_tokens_seen": 136284445, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.44921875, "step": 6351, "time_per_iteration": 3.7839794158935547 }, { "auxiliary_loss_clip": 0.01068139, "auxiliary_loss_mlp": 0.01053785, "balance_loss_clip": 1.02181244, "balance_loss_mlp": 1.02281117, "epoch": 0.38190290094694124, "flos": 30442202593920.0, "grad_norm": 1.9078827029400325, "language_loss": 0.76561964, "learning_rate": 2.8356258350916085e-06, "loss": 0.78683889, "num_input_tokens_seen": 136305730, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.453125, "step": 6352, "time_per_iteration": 2.495309829711914 }, { "auxiliary_loss_clip": 0.01064367, "auxiliary_loss_mlp": 0.01039388, "balance_loss_clip": 1.0118866, "balance_loss_mlp": 1.02204013, "epoch": 0.3819630241996092, "flos": 14209055994240.0, "grad_norm": 1.8314766206282933, "language_loss": 0.6526643, "learning_rate": 2.8352719804275104e-06, "loss": 0.67370188, "num_input_tokens_seen": 136323850, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.42382812, "step": 6353, "time_per_iteration": 2.3915152549743652 }, { "auxiliary_loss_clip": 0.01064658, "auxiliary_loss_mlp": 0.01044713, "balance_loss_clip": 1.01567388, "balance_loss_mlp": 1.02162826, "epoch": 0.38202314745227717, "flos": 25008529921920.0, "grad_norm": 8.083305430002785, "language_loss": 0.84286642, "learning_rate": 2.834918094089816e-06, "loss": 0.86396009, "num_input_tokens_seen": 136344880, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.4296875, "step": 6354, "time_per_iteration": 2.4214165210723877 }, { "auxiliary_loss_clip": 0.01064604, "auxiliary_loss_mlp": 0.01041256, "balance_loss_clip": 1.01474345, "balance_loss_mlp": 1.02247119, "epoch": 0.38208327070494513, "flos": 20813436109440.0, "grad_norm": 1.7648439249317445, "language_loss": 0.81733942, "learning_rate": 2.834564176091943e-06, "loss": 0.83839804, "num_input_tokens_seen": 136366060, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.421875, "step": 6355, "time_per_iteration": 2.4377002716064453 }, { "auxiliary_loss_clip": 0.01066532, "auxiliary_loss_mlp": 0.01047284, "balance_loss_clip": 1.01698065, "balance_loss_mlp": 1.0225842, "epoch": 0.3821433939576131, "flos": 22636635552000.0, "grad_norm": 1.9760656026346997, "language_loss": 0.76074558, "learning_rate": 2.8342102264473125e-06, "loss": 0.78188372, "num_input_tokens_seen": 136385625, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.43945312, "step": 6356, "time_per_iteration": 2.414358377456665 }, { "auxiliary_loss_clip": 0.01064973, "auxiliary_loss_mlp": 0.01044075, "balance_loss_clip": 1.01422513, "balance_loss_mlp": 1.02153397, "epoch": 0.38220351721028106, "flos": 26868667449600.0, "grad_norm": 1.9228274079777237, "language_loss": 0.82766181, "learning_rate": 2.833856245169348e-06, "loss": 0.84875226, "num_input_tokens_seen": 136405750, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.43359375, "step": 6357, "time_per_iteration": 2.4571309089660645 }, { "auxiliary_loss_clip": 0.01066866, "auxiliary_loss_mlp": 0.0105301, "balance_loss_clip": 1.01750898, "balance_loss_mlp": 1.02091551, "epoch": 0.38226364046294903, "flos": 23366355212160.0, "grad_norm": 2.1738762692089484, "language_loss": 0.7922101, "learning_rate": 2.8335022322714695e-06, "loss": 0.81340885, "num_input_tokens_seen": 136426085, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.45898438, "step": 6358, "time_per_iteration": 2.3965306282043457 }, { "auxiliary_loss_clip": 0.01064735, "auxiliary_loss_mlp": 0.01050168, "balance_loss_clip": 1.02019906, "balance_loss_mlp": 1.02125216, "epoch": 0.382323763715617, "flos": 19645207372800.0, "grad_norm": 2.7366478665744753, "language_loss": 0.8048383, "learning_rate": 2.8331481877671036e-06, "loss": 0.82598734, "num_input_tokens_seen": 136442670, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.43554688, "step": 6359, "time_per_iteration": 2.4002082347869873 }, { "auxiliary_loss_clip": 0.0106536, "auxiliary_loss_mlp": 0.01049465, "balance_loss_clip": 1.01751685, "balance_loss_mlp": 1.02070129, "epoch": 0.38238388696828496, "flos": 54122776842240.0, "grad_norm": 1.6147639843334827, "language_loss": 0.70607865, "learning_rate": 2.8327941116696754e-06, "loss": 0.72722685, "num_input_tokens_seen": 136465730, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.4453125, "step": 6360, "time_per_iteration": 2.681946039199829 }, { "auxiliary_loss_clip": 0.01063898, "auxiliary_loss_mlp": 0.01039781, "balance_loss_clip": 1.01103926, "balance_loss_mlp": 1.01981962, "epoch": 0.382444010220953, "flos": 24935037776640.0, "grad_norm": 1.814162532659825, "language_loss": 0.79235154, "learning_rate": 2.83244000399261e-06, "loss": 0.81338835, "num_input_tokens_seen": 136487215, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.44140625, "step": 6361, "time_per_iteration": 2.4558351039886475 }, { "auxiliary_loss_clip": 0.01061235, "auxiliary_loss_mlp": 0.01043838, "balance_loss_clip": 1.01630092, "balance_loss_mlp": 1.02028894, "epoch": 0.38250413347362094, "flos": 42335784708480.0, "grad_norm": 1.486384485879323, "language_loss": 0.66602397, "learning_rate": 2.832085864749337e-06, "loss": 0.68707472, "num_input_tokens_seen": 136510365, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41015625, "step": 6362, "time_per_iteration": 2.599665403366089 }, { "auxiliary_loss_clip": 0.01061194, "auxiliary_loss_mlp": 0.01042405, "balance_loss_clip": 1.01136279, "balance_loss_mlp": 1.01952636, "epoch": 0.3825642567262889, "flos": 16288308414720.0, "grad_norm": 1.8905823481182726, "language_loss": 0.83128035, "learning_rate": 2.8317316939532848e-06, "loss": 0.85231626, "num_input_tokens_seen": 136527100, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.41601562, "step": 6363, "time_per_iteration": 2.4060213565826416 }, { "auxiliary_loss_clip": 0.01059851, "auxiliary_loss_mlp": 0.01044997, "balance_loss_clip": 1.01452672, "balance_loss_mlp": 1.01922321, "epoch": 0.3826243799789569, "flos": 45653197052160.0, "grad_norm": 1.693829331783628, "language_loss": 0.6074121, "learning_rate": 2.8313774916178825e-06, "loss": 0.62846053, "num_input_tokens_seen": 136550870, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.40625, "step": 6364, "time_per_iteration": 2.6088473796844482 }, { "auxiliary_loss_clip": 0.0106466, "auxiliary_loss_mlp": 0.01048511, "balance_loss_clip": 1.01723003, "balance_loss_mlp": 1.02094114, "epoch": 0.38268450323162484, "flos": 25300403821440.0, "grad_norm": 2.46118969117612, "language_loss": 0.69980389, "learning_rate": 2.8310232577565635e-06, "loss": 0.72093558, "num_input_tokens_seen": 136569895, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.4375, "step": 6365, "time_per_iteration": 2.455159902572632 }, { "auxiliary_loss_clip": 0.01064206, "auxiliary_loss_mlp": 0.01049877, "balance_loss_clip": 1.01779735, "balance_loss_mlp": 1.01979184, "epoch": 0.3827446264842928, "flos": 21834924935040.0, "grad_norm": 2.09147588833453, "language_loss": 0.74673223, "learning_rate": 2.830668992382758e-06, "loss": 0.76787305, "num_input_tokens_seen": 136588585, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.4453125, "step": 6366, "time_per_iteration": 2.421412944793701 }, { "auxiliary_loss_clip": 0.01063964, "auxiliary_loss_mlp": 0.0104503, "balance_loss_clip": 1.01377344, "balance_loss_mlp": 1.02028751, "epoch": 0.38280474973696077, "flos": 25733536548480.0, "grad_norm": 2.709213940654289, "language_loss": 0.70052671, "learning_rate": 2.830314695509902e-06, "loss": 0.72161663, "num_input_tokens_seen": 136606640, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.4375, "step": 6367, "time_per_iteration": 2.4637439250946045 }, { "auxiliary_loss_clip": 0.01061162, "auxiliary_loss_mlp": 0.01048802, "balance_loss_clip": 1.01737869, "balance_loss_mlp": 1.01924586, "epoch": 0.38286487298962874, "flos": 24894887846400.0, "grad_norm": 2.06428370367374, "language_loss": 0.65331328, "learning_rate": 2.82996036715143e-06, "loss": 0.67441297, "num_input_tokens_seen": 136624940, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.41796875, "step": 6368, "time_per_iteration": 2.4155375957489014 }, { "auxiliary_loss_clip": 0.01062633, "auxiliary_loss_mlp": 0.01053532, "balance_loss_clip": 1.02229881, "balance_loss_mlp": 1.01918411, "epoch": 0.3829249962422967, "flos": 28542578451840.0, "grad_norm": 1.3597876252514884, "language_loss": 0.69298339, "learning_rate": 2.8296060073207763e-06, "loss": 0.71414495, "num_input_tokens_seen": 136645540, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.43359375, "step": 6369, "time_per_iteration": 2.462728977203369 }, { "auxiliary_loss_clip": 0.01064682, "auxiliary_loss_mlp": 0.01046875, "balance_loss_clip": 1.01481915, "balance_loss_mlp": 1.02048624, "epoch": 0.38298511949496467, "flos": 21470117472000.0, "grad_norm": 1.7324415173371763, "language_loss": 0.79319, "learning_rate": 2.8292516160313804e-06, "loss": 0.8143056, "num_input_tokens_seen": 136664530, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.44140625, "step": 6370, "time_per_iteration": 2.3887529373168945 }, { "auxiliary_loss_clip": 0.01062374, "auxiliary_loss_mlp": 0.01052446, "balance_loss_clip": 1.02076054, "balance_loss_mlp": 1.01989603, "epoch": 0.38304524274763263, "flos": 31678826417280.0, "grad_norm": 3.0926876599712254, "language_loss": 0.65513623, "learning_rate": 2.8288971932966805e-06, "loss": 0.67628443, "num_input_tokens_seen": 136682315, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.42382812, "step": 6371, "time_per_iteration": 2.457451343536377 }, { "auxiliary_loss_clip": 0.01067739, "auxiliary_loss_mlp": 0.01057504, "balance_loss_clip": 1.0241611, "balance_loss_mlp": 1.02112949, "epoch": 0.3831053660003006, "flos": 25075807845120.0, "grad_norm": 2.6250462774194467, "language_loss": 0.73929954, "learning_rate": 2.8285427391301155e-06, "loss": 0.76055205, "num_input_tokens_seen": 136701185, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.46679688, "step": 6372, "time_per_iteration": 2.4331464767456055 }, { "auxiliary_loss_clip": 0.01064349, "auxiliary_loss_mlp": 0.01046202, "balance_loss_clip": 1.01592314, "balance_loss_mlp": 1.01972926, "epoch": 0.38316548925296856, "flos": 23257880017920.0, "grad_norm": 1.7543730173292114, "language_loss": 0.86057878, "learning_rate": 2.8281882535451266e-06, "loss": 0.8816843, "num_input_tokens_seen": 136721265, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.4453125, "step": 6373, "time_per_iteration": 2.413870096206665 }, { "auxiliary_loss_clip": 0.01066679, "auxiliary_loss_mlp": 0.01056717, "balance_loss_clip": 1.02521038, "balance_loss_mlp": 1.02103448, "epoch": 0.3832256125056366, "flos": 34422021763200.0, "grad_norm": 1.9653266640598404, "language_loss": 0.75857466, "learning_rate": 2.8278337365551567e-06, "loss": 0.77980864, "num_input_tokens_seen": 136741885, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.45703125, "step": 6374, "time_per_iteration": 2.4860427379608154 }, { "auxiliary_loss_clip": 0.01067166, "auxiliary_loss_mlp": 0.01052605, "balance_loss_clip": 1.02022767, "balance_loss_mlp": 1.0206573, "epoch": 0.38328573575830455, "flos": 21761677169280.0, "grad_norm": 2.3615810988998924, "language_loss": 0.77953351, "learning_rate": 2.8274791881736485e-06, "loss": 0.80073118, "num_input_tokens_seen": 136760905, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.46484375, "step": 6375, "time_per_iteration": 2.4259936809539795 }, { "auxiliary_loss_clip": 0.01065004, "auxiliary_loss_mlp": 0.01043809, "balance_loss_clip": 1.01389921, "balance_loss_mlp": 1.02151334, "epoch": 0.3833458590109725, "flos": 17379169845120.0, "grad_norm": 2.1398987950061974, "language_loss": 0.7457844, "learning_rate": 2.8271246084140457e-06, "loss": 0.76687258, "num_input_tokens_seen": 136777240, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.43554688, "step": 6376, "time_per_iteration": 2.3516883850097656 }, { "auxiliary_loss_clip": 0.01061277, "auxiliary_loss_mlp": 0.01048802, "balance_loss_clip": 1.01885676, "balance_loss_mlp": 1.01860666, "epoch": 0.3834059822636405, "flos": 29423262297600.0, "grad_norm": 1.501776080683288, "language_loss": 0.68600667, "learning_rate": 2.826769997289796e-06, "loss": 0.70710742, "num_input_tokens_seen": 136801040, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.42578125, "step": 6377, "time_per_iteration": 2.4720356464385986 }, { "auxiliary_loss_clip": 0.01064729, "auxiliary_loss_mlp": 0.01048289, "balance_loss_clip": 1.01610267, "balance_loss_mlp": 1.01992869, "epoch": 0.38346610551630844, "flos": 21469663624320.0, "grad_norm": 3.3393269197829283, "language_loss": 0.74764234, "learning_rate": 2.826415354814344e-06, "loss": 0.76877254, "num_input_tokens_seen": 136819495, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.44921875, "step": 6378, "time_per_iteration": 2.3663418292999268 }, { "auxiliary_loss_clip": 0.01063408, "auxiliary_loss_mlp": 0.0104844, "balance_loss_clip": 1.01808965, "balance_loss_mlp": 1.01897883, "epoch": 0.3835262287689764, "flos": 27560052570240.0, "grad_norm": 1.8980896128201121, "language_loss": 0.71280247, "learning_rate": 2.8260606810011396e-06, "loss": 0.73392093, "num_input_tokens_seen": 136838840, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.4453125, "step": 6379, "time_per_iteration": 2.4543209075927734 }, { "auxiliary_loss_clip": 0.01062376, "auxiliary_loss_mlp": 0.01048972, "balance_loss_clip": 1.0188241, "balance_loss_mlp": 1.01961577, "epoch": 0.3835863520216444, "flos": 15522802744320.0, "grad_norm": 1.9968826945430231, "language_loss": 0.84540856, "learning_rate": 2.8257059758636315e-06, "loss": 0.86652207, "num_input_tokens_seen": 136854425, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.42773438, "step": 6380, "time_per_iteration": 2.365417957305908 }, { "auxiliary_loss_clip": 0.01062078, "auxiliary_loss_mlp": 0.01042791, "balance_loss_clip": 1.01375151, "balance_loss_mlp": 1.01904714, "epoch": 0.38364647527431234, "flos": 21903948426240.0, "grad_norm": 1.449300061590241, "language_loss": 0.82186115, "learning_rate": 2.8253512394152697e-06, "loss": 0.84290981, "num_input_tokens_seen": 136874355, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4296875, "step": 6381, "time_per_iteration": 2.434833288192749 }, { "auxiliary_loss_clip": 0.01016491, "auxiliary_loss_mlp": 0.01008767, "balance_loss_clip": 1.00449908, "balance_loss_mlp": 1.00632811, "epoch": 0.3837065985269803, "flos": 65531902798080.0, "grad_norm": 0.8237663286722084, "language_loss": 0.60560811, "learning_rate": 2.8249964716695068e-06, "loss": 0.62586069, "num_input_tokens_seen": 136937475, "router_z_loss_clip": 0.04272461, "router_z_loss_mlp": 0.1015625, "step": 6382, "time_per_iteration": 3.013780355453491 }, { "auxiliary_loss_clip": 0.01066526, "auxiliary_loss_mlp": 0.01046938, "balance_loss_clip": 1.01561022, "balance_loss_mlp": 1.01986933, "epoch": 0.38376672177964827, "flos": 28255347763200.0, "grad_norm": 2.3073193012700344, "language_loss": 0.68002689, "learning_rate": 2.824641672639794e-06, "loss": 0.7011615, "num_input_tokens_seen": 136955805, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.46679688, "step": 6383, "time_per_iteration": 2.475337266921997 }, { "auxiliary_loss_clip": 0.01065204, "auxiliary_loss_mlp": 0.01049381, "balance_loss_clip": 1.01814806, "balance_loss_mlp": 1.020123, "epoch": 0.38382684503231623, "flos": 20630316695040.0, "grad_norm": 2.0372074969148035, "language_loss": 0.76438439, "learning_rate": 2.824286842339587e-06, "loss": 0.78553027, "num_input_tokens_seen": 136975240, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.45117188, "step": 6384, "time_per_iteration": 2.4091949462890625 }, { "auxiliary_loss_clip": 0.01062888, "auxiliary_loss_mlp": 0.01045624, "balance_loss_clip": 1.01433158, "balance_loss_mlp": 1.01988971, "epoch": 0.3838869682849842, "flos": 19604917797120.0, "grad_norm": 1.7954730707304243, "language_loss": 0.77515501, "learning_rate": 2.823931980782341e-06, "loss": 0.79624009, "num_input_tokens_seen": 136994985, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.4296875, "step": 6385, "time_per_iteration": 3.9195058345794678 }, { "auxiliary_loss_clip": 0.01016162, "auxiliary_loss_mlp": 0.01005397, "balance_loss_clip": 1.0012958, "balance_loss_mlp": 1.00570953, "epoch": 0.38394709153765216, "flos": 56553428897280.0, "grad_norm": 0.8940279882877806, "language_loss": 0.67136705, "learning_rate": 2.82357708798151e-06, "loss": 0.69158268, "num_input_tokens_seen": 137046290, "router_z_loss_clip": 0.04101562, "router_z_loss_mlp": 0.10449219, "step": 6386, "time_per_iteration": 4.315301895141602 }, { "auxiliary_loss_clip": 0.0106181, "auxiliary_loss_mlp": 0.01049645, "balance_loss_clip": 1.02198839, "balance_loss_mlp": 1.0198437, "epoch": 0.3840072147903202, "flos": 15887819675520.0, "grad_norm": 1.96451650797348, "language_loss": 0.73763233, "learning_rate": 2.8232221639505547e-06, "loss": 0.75874692, "num_input_tokens_seen": 137064725, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41992188, "step": 6387, "time_per_iteration": 2.3827855587005615 }, { "auxiliary_loss_clip": 0.01063424, "auxiliary_loss_mlp": 0.01050768, "balance_loss_clip": 1.02053678, "balance_loss_mlp": 1.02133405, "epoch": 0.38406733804298815, "flos": 28216838666880.0, "grad_norm": 1.6233387617232438, "language_loss": 0.82276624, "learning_rate": 2.822867208702932e-06, "loss": 0.84390819, "num_input_tokens_seen": 137086030, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.421875, "step": 6388, "time_per_iteration": 2.490025043487549 }, { "auxiliary_loss_clip": 0.0106012, "auxiliary_loss_mlp": 0.0104649, "balance_loss_clip": 1.01877344, "balance_loss_mlp": 1.0183084, "epoch": 0.3841274612956561, "flos": 18222601403520.0, "grad_norm": 1.7551562563109815, "language_loss": 0.77066195, "learning_rate": 2.8225122222521026e-06, "loss": 0.79172796, "num_input_tokens_seen": 137105400, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41796875, "step": 6389, "time_per_iteration": 2.3614163398742676 }, { "auxiliary_loss_clip": 0.01066274, "auxiliary_loss_mlp": 0.01055613, "balance_loss_clip": 1.02235329, "balance_loss_mlp": 1.02040482, "epoch": 0.3841875845483241, "flos": 19791842549760.0, "grad_norm": 1.6580719878320376, "language_loss": 0.77902436, "learning_rate": 2.8221572046115273e-06, "loss": 0.80024326, "num_input_tokens_seen": 137124985, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.45898438, "step": 6390, "time_per_iteration": 3.7473785877227783 }, { "auxiliary_loss_clip": 0.01064379, "auxiliary_loss_mlp": 0.0105425, "balance_loss_clip": 1.01996541, "balance_loss_mlp": 1.0193913, "epoch": 0.38424770780099204, "flos": 29897522472960.0, "grad_norm": 2.1421026135593744, "language_loss": 0.71170557, "learning_rate": 2.821802155794668e-06, "loss": 0.73289186, "num_input_tokens_seen": 137146745, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.44921875, "step": 6391, "time_per_iteration": 3.989543914794922 }, { "auxiliary_loss_clip": 0.0106406, "auxiliary_loss_mlp": 0.01056971, "balance_loss_clip": 1.02185225, "balance_loss_mlp": 1.01903033, "epoch": 0.38430783105366, "flos": 20812668059520.0, "grad_norm": 1.9508558970128071, "language_loss": 0.8540138, "learning_rate": 2.8214470758149884e-06, "loss": 0.87522411, "num_input_tokens_seen": 137163195, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.44921875, "step": 6392, "time_per_iteration": 2.444394111633301 }, { "auxiliary_loss_clip": 0.01062602, "auxiliary_loss_mlp": 0.0104663, "balance_loss_clip": 1.01649404, "balance_loss_mlp": 1.01866114, "epoch": 0.384367954306328, "flos": 10997814695040.0, "grad_norm": 2.188232238702554, "language_loss": 0.63300234, "learning_rate": 2.8210919646859536e-06, "loss": 0.65409464, "num_input_tokens_seen": 137179330, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.43945312, "step": 6393, "time_per_iteration": 2.399193286895752 }, { "auxiliary_loss_clip": 0.01067034, "auxiliary_loss_mlp": 0.01053708, "balance_loss_clip": 1.01851773, "balance_loss_mlp": 1.01971102, "epoch": 0.38442807755899594, "flos": 25336853147520.0, "grad_norm": 2.198181211409453, "language_loss": 0.7290647, "learning_rate": 2.820736822421029e-06, "loss": 0.75027215, "num_input_tokens_seen": 137198655, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.47265625, "step": 6394, "time_per_iteration": 2.3978912830352783 }, { "auxiliary_loss_clip": 0.01068591, "auxiliary_loss_mlp": 0.01057534, "balance_loss_clip": 1.02053189, "balance_loss_mlp": 1.02128005, "epoch": 0.3844882008116639, "flos": 21068686126080.0, "grad_norm": 7.7055853384041555, "language_loss": 0.8282401, "learning_rate": 2.8203816490336822e-06, "loss": 0.84950137, "num_input_tokens_seen": 137217120, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 0.47265625, "step": 6395, "time_per_iteration": 2.412632703781128 }, { "auxiliary_loss_clip": 0.01065883, "auxiliary_loss_mlp": 0.01051509, "balance_loss_clip": 1.01827383, "balance_loss_mlp": 1.02153254, "epoch": 0.38454832406433187, "flos": 17962393973760.0, "grad_norm": 2.0689666648180225, "language_loss": 0.72166789, "learning_rate": 2.8200264445373813e-06, "loss": 0.74284184, "num_input_tokens_seen": 137234410, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.44335938, "step": 6396, "time_per_iteration": 2.3330984115600586 }, { "auxiliary_loss_clip": 0.01015853, "auxiliary_loss_mlp": 0.01013449, "balance_loss_clip": 1.00860918, "balance_loss_mlp": 1.00461471, "epoch": 0.38460844731699984, "flos": 67921392493440.0, "grad_norm": 0.8987857639002326, "language_loss": 0.59740496, "learning_rate": 2.8196712089455954e-06, "loss": 0.61769801, "num_input_tokens_seen": 137294940, "router_z_loss_clip": 0.04833984, "router_z_loss_mlp": 0.11230469, "step": 6397, "time_per_iteration": 3.1016032695770264 }, { "auxiliary_loss_clip": 0.01064135, "auxiliary_loss_mlp": 0.0104959, "balance_loss_clip": 1.01556718, "balance_loss_mlp": 1.01905894, "epoch": 0.3846685705696678, "flos": 25847876851200.0, "grad_norm": 1.7570372434564208, "language_loss": 0.86077309, "learning_rate": 2.819315942271794e-06, "loss": 0.88191026, "num_input_tokens_seen": 137315035, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.45117188, "step": 6398, "time_per_iteration": 2.4448041915893555 }, { "auxiliary_loss_clip": 0.01062352, "auxiliary_loss_mlp": 0.01042263, "balance_loss_clip": 1.01318753, "balance_loss_mlp": 1.01913118, "epoch": 0.38472869382233577, "flos": 16289251021440.0, "grad_norm": 1.9322418579900946, "language_loss": 0.81198251, "learning_rate": 2.8189606445294515e-06, "loss": 0.83302867, "num_input_tokens_seen": 137333155, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.43164062, "step": 6399, "time_per_iteration": 2.422811269760132 }, { "auxiliary_loss_clip": 0.01065067, "auxiliary_loss_mlp": 0.0104777, "balance_loss_clip": 1.01401043, "balance_loss_mlp": 1.02025008, "epoch": 0.38478881707500373, "flos": 19352146487040.0, "grad_norm": 2.0272011162936083, "language_loss": 0.69000775, "learning_rate": 2.818605315732038e-06, "loss": 0.7111361, "num_input_tokens_seen": 137351515, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.44921875, "step": 6400, "time_per_iteration": 2.388669490814209 }, { "auxiliary_loss_clip": 0.01067918, "auxiliary_loss_mlp": 0.01052425, "balance_loss_clip": 1.02050054, "balance_loss_mlp": 1.02183008, "epoch": 0.38484894032767175, "flos": 24859765152000.0, "grad_norm": 1.9174719971968093, "language_loss": 0.74230522, "learning_rate": 2.81824995589303e-06, "loss": 0.76350868, "num_input_tokens_seen": 137371255, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.4609375, "step": 6401, "time_per_iteration": 2.443650245666504 }, { "auxiliary_loss_clip": 0.01063204, "auxiliary_loss_mlp": 0.01050424, "balance_loss_clip": 1.01654434, "balance_loss_mlp": 1.01913059, "epoch": 0.3849090635803397, "flos": 14500929893760.0, "grad_norm": 1.833380167793557, "language_loss": 0.74343777, "learning_rate": 2.8178945650259012e-06, "loss": 0.76457405, "num_input_tokens_seen": 137388980, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.44140625, "step": 6402, "time_per_iteration": 2.4122135639190674 }, { "auxiliary_loss_clip": 0.01061513, "auxiliary_loss_mlp": 0.01049036, "balance_loss_clip": 1.01820862, "balance_loss_mlp": 1.0192889, "epoch": 0.3849691868330077, "flos": 18514859328000.0, "grad_norm": 2.307765745820898, "language_loss": 0.83897817, "learning_rate": 2.817539143144128e-06, "loss": 0.8600837, "num_input_tokens_seen": 137406885, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.421875, "step": 6403, "time_per_iteration": 2.475731611251831 }, { "auxiliary_loss_clip": 0.0106236, "auxiliary_loss_mlp": 0.01046195, "balance_loss_clip": 1.01510513, "balance_loss_mlp": 1.01974082, "epoch": 0.38502931008567565, "flos": 21615321283200.0, "grad_norm": 2.00314150572524, "language_loss": 0.84062684, "learning_rate": 2.817183690261189e-06, "loss": 0.8617124, "num_input_tokens_seen": 137425535, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.42578125, "step": 6404, "time_per_iteration": 2.3955297470092773 }, { "auxiliary_loss_clip": 0.01064429, "auxiliary_loss_mlp": 0.0104549, "balance_loss_clip": 1.01425719, "balance_loss_mlp": 1.01970232, "epoch": 0.3850894333383436, "flos": 25414045896960.0, "grad_norm": 1.5542454540892872, "language_loss": 0.71014583, "learning_rate": 2.816828206390563e-06, "loss": 0.73124504, "num_input_tokens_seen": 137447700, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.44726562, "step": 6405, "time_per_iteration": 2.5049026012420654 }, { "auxiliary_loss_clip": 0.01062797, "auxiliary_loss_mlp": 0.01047251, "balance_loss_clip": 1.0183785, "balance_loss_mlp": 1.01973033, "epoch": 0.3851495565910116, "flos": 20226895401600.0, "grad_norm": 2.3360123992548116, "language_loss": 0.80900347, "learning_rate": 2.816472691545729e-06, "loss": 0.83010393, "num_input_tokens_seen": 137462245, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.43164062, "step": 6406, "time_per_iteration": 2.419074296951294 }, { "auxiliary_loss_clip": 0.01064878, "auxiliary_loss_mlp": 0.01048844, "balance_loss_clip": 1.01584697, "balance_loss_mlp": 1.02037525, "epoch": 0.38520967984367954, "flos": 16507528041600.0, "grad_norm": 2.201573896813602, "language_loss": 0.86056358, "learning_rate": 2.8161171457401694e-06, "loss": 0.88170075, "num_input_tokens_seen": 137476455, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.4453125, "step": 6407, "time_per_iteration": 2.4199435710906982 }, { "auxiliary_loss_clip": 0.0101579, "auxiliary_loss_mlp": 0.01010922, "balance_loss_clip": 1.00696397, "balance_loss_mlp": 1.00430369, "epoch": 0.3852698030963475, "flos": 61310553776640.0, "grad_norm": 0.8598289708519227, "language_loss": 0.64987934, "learning_rate": 2.815761568987365e-06, "loss": 0.67014647, "num_input_tokens_seen": 137539845, "router_z_loss_clip": 0.03955078, "router_z_loss_mlp": 0.11523438, "step": 6408, "time_per_iteration": 3.090538501739502 }, { "auxiliary_loss_clip": 0.01065761, "auxiliary_loss_mlp": 0.01052847, "balance_loss_clip": 1.02104235, "balance_loss_mlp": 1.02089024, "epoch": 0.3853299263490155, "flos": 22891920480000.0, "grad_norm": 1.4716750777352787, "language_loss": 0.742993, "learning_rate": 2.8154059613008e-06, "loss": 0.76417911, "num_input_tokens_seen": 137559880, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.44921875, "step": 6409, "time_per_iteration": 2.455472469329834 }, { "auxiliary_loss_clip": 0.01067954, "auxiliary_loss_mlp": 0.01058466, "balance_loss_clip": 1.02577877, "balance_loss_mlp": 1.02062023, "epoch": 0.38539004960168344, "flos": 20046464161920.0, "grad_norm": 2.0554562544717947, "language_loss": 0.72029173, "learning_rate": 2.81505032269396e-06, "loss": 0.74155593, "num_input_tokens_seen": 137578225, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.47265625, "step": 6410, "time_per_iteration": 2.3855443000793457 }, { "auxiliary_loss_clip": 0.01015199, "auxiliary_loss_mlp": 0.01004319, "balance_loss_clip": 1.00038493, "balance_loss_mlp": 1.00300407, "epoch": 0.3854501728543514, "flos": 68726978271360.0, "grad_norm": 0.6849419074217806, "language_loss": 0.60400832, "learning_rate": 2.81469465318033e-06, "loss": 0.62420356, "num_input_tokens_seen": 137645770, "router_z_loss_clip": 0.03930664, "router_z_loss_mlp": 0.12207031, "step": 6411, "time_per_iteration": 3.1318328380584717 }, { "auxiliary_loss_clip": 0.01063728, "auxiliary_loss_mlp": 0.01043414, "balance_loss_clip": 1.01506639, "balance_loss_mlp": 1.02028, "epoch": 0.38551029610701937, "flos": 20483995720320.0, "grad_norm": 2.010700687131897, "language_loss": 0.79516947, "learning_rate": 2.814338952773397e-06, "loss": 0.81624091, "num_input_tokens_seen": 137664090, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.43554688, "step": 6412, "time_per_iteration": 2.409669876098633 }, { "auxiliary_loss_clip": 0.01067574, "auxiliary_loss_mlp": 0.01053677, "balance_loss_clip": 1.02034593, "balance_loss_mlp": 1.02153361, "epoch": 0.38557041935968733, "flos": 23470815600000.0, "grad_norm": 2.074192521123628, "language_loss": 0.79563963, "learning_rate": 2.8139832214866493e-06, "loss": 0.81685209, "num_input_tokens_seen": 137683190, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.4609375, "step": 6413, "time_per_iteration": 2.4261488914489746 }, { "auxiliary_loss_clip": 0.01014819, "auxiliary_loss_mlp": 0.01006729, "balance_loss_clip": 1.00269938, "balance_loss_mlp": 1.00273275, "epoch": 0.38563054261235535, "flos": 63963639169920.0, "grad_norm": 0.8078189058373881, "language_loss": 0.61315024, "learning_rate": 2.813627459333576e-06, "loss": 0.63336563, "num_input_tokens_seen": 137737315, "router_z_loss_clip": 0.0402832, "router_z_loss_mlp": 0.12109375, "step": 6414, "time_per_iteration": 2.9398422241210938 }, { "auxiliary_loss_clip": 0.01066688, "auxiliary_loss_mlp": 0.01055756, "balance_loss_clip": 1.02538133, "balance_loss_mlp": 1.02129841, "epoch": 0.3856906658650233, "flos": 23986657071360.0, "grad_norm": 2.5914086214084824, "language_loss": 0.79332387, "learning_rate": 2.8132716663276685e-06, "loss": 0.81454837, "num_input_tokens_seen": 137753535, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.453125, "step": 6415, "time_per_iteration": 2.402998924255371 }, { "auxiliary_loss_clip": 0.01058947, "auxiliary_loss_mlp": 0.01043443, "balance_loss_clip": 1.01755035, "balance_loss_mlp": 1.01924753, "epoch": 0.3857507891176913, "flos": 25006330506240.0, "grad_norm": 1.6884870856894691, "language_loss": 0.80835587, "learning_rate": 2.8129158424824173e-06, "loss": 0.82937974, "num_input_tokens_seen": 137773405, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.39648438, "step": 6416, "time_per_iteration": 2.4967572689056396 }, { "auxiliary_loss_clip": 0.01063913, "auxiliary_loss_mlp": 0.01048788, "balance_loss_clip": 1.0198561, "balance_loss_mlp": 1.02030325, "epoch": 0.38581091237035925, "flos": 21535894206720.0, "grad_norm": 2.0235730532194913, "language_loss": 0.80235428, "learning_rate": 2.8125599878113155e-06, "loss": 0.82348126, "num_input_tokens_seen": 137790810, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.43554688, "step": 6417, "time_per_iteration": 2.37949538230896 }, { "auxiliary_loss_clip": 0.01062091, "auxiliary_loss_mlp": 0.0104825, "balance_loss_clip": 1.02047443, "balance_loss_mlp": 1.01885247, "epoch": 0.3858710356230272, "flos": 17382940272000.0, "grad_norm": 7.623268048276753, "language_loss": 0.82308674, "learning_rate": 2.8122041023278583e-06, "loss": 0.84419012, "num_input_tokens_seen": 137810265, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.43164062, "step": 6418, "time_per_iteration": 2.4818711280822754 }, { "auxiliary_loss_clip": 0.01061157, "auxiliary_loss_mlp": 0.01044826, "balance_loss_clip": 1.01573896, "balance_loss_mlp": 1.01881564, "epoch": 0.3859311588756952, "flos": 20338547529600.0, "grad_norm": 1.6871021047957437, "language_loss": 0.81322068, "learning_rate": 2.8118481860455407e-06, "loss": 0.83428049, "num_input_tokens_seen": 137828580, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.42382812, "step": 6419, "time_per_iteration": 2.396074056625366 }, { "auxiliary_loss_clip": 0.010614, "auxiliary_loss_mlp": 0.01048545, "balance_loss_clip": 1.01662064, "balance_loss_mlp": 1.0191462, "epoch": 0.38599128212836314, "flos": 26320007433600.0, "grad_norm": 2.1384364006340584, "language_loss": 0.69733268, "learning_rate": 2.8114922389778573e-06, "loss": 0.71843207, "num_input_tokens_seen": 137846145, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.421875, "step": 6420, "time_per_iteration": 2.5066535472869873 }, { "auxiliary_loss_clip": 0.01060387, "auxiliary_loss_mlp": 0.01046765, "balance_loss_clip": 1.0205276, "balance_loss_mlp": 1.0198915, "epoch": 0.3860514053810311, "flos": 13552968124800.0, "grad_norm": 2.0898081661168764, "language_loss": 0.82438052, "learning_rate": 2.8111362611383076e-06, "loss": 0.84545207, "num_input_tokens_seen": 137863705, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40625, "step": 6421, "time_per_iteration": 2.3562872409820557 }, { "auxiliary_loss_clip": 0.0106149, "auxiliary_loss_mlp": 0.01041813, "balance_loss_clip": 1.01444244, "balance_loss_mlp": 1.01890802, "epoch": 0.3861115286336991, "flos": 20953368305280.0, "grad_norm": 2.0316399267435936, "language_loss": 0.73737895, "learning_rate": 2.8107802525403886e-06, "loss": 0.758412, "num_input_tokens_seen": 137880285, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.42578125, "step": 6422, "time_per_iteration": 2.397467851638794 }, { "auxiliary_loss_clip": 0.0105944, "auxiliary_loss_mlp": 0.01047089, "balance_loss_clip": 1.01993275, "balance_loss_mlp": 1.01859713, "epoch": 0.38617165188636704, "flos": 16361765648640.0, "grad_norm": 1.5579358128845253, "language_loss": 0.68014836, "learning_rate": 2.8104242131976025e-06, "loss": 0.7012136, "num_input_tokens_seen": 137898335, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40820312, "step": 6423, "time_per_iteration": 2.3771145343780518 }, { "auxiliary_loss_clip": 0.01063185, "auxiliary_loss_mlp": 0.01054819, "balance_loss_clip": 1.02616072, "balance_loss_mlp": 1.01971579, "epoch": 0.386231775139035, "flos": 34785851708160.0, "grad_norm": 1.7898353472915558, "language_loss": 0.70504463, "learning_rate": 2.810068143123449e-06, "loss": 0.72622466, "num_input_tokens_seen": 137918605, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.43359375, "step": 6424, "time_per_iteration": 2.4902095794677734 }, { "auxiliary_loss_clip": 0.01060568, "auxiliary_loss_mlp": 0.01048735, "balance_loss_clip": 1.02024412, "balance_loss_mlp": 1.01890373, "epoch": 0.38629189839170297, "flos": 21725088197760.0, "grad_norm": 1.4461908066186457, "language_loss": 0.7344408, "learning_rate": 2.809712042331429e-06, "loss": 0.75553381, "num_input_tokens_seen": 137938245, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41601562, "step": 6425, "time_per_iteration": 5.260988473892212 }, { "auxiliary_loss_clip": 0.01065668, "auxiliary_loss_mlp": 0.01047084, "balance_loss_clip": 1.01642299, "balance_loss_mlp": 1.02040005, "epoch": 0.38635202164437094, "flos": 27922520972160.0, "grad_norm": 2.347155200877132, "language_loss": 0.81012344, "learning_rate": 2.8093559108350484e-06, "loss": 0.83125091, "num_input_tokens_seen": 137956770, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.453125, "step": 6426, "time_per_iteration": 2.481254816055298 }, { "auxiliary_loss_clip": 0.01062005, "auxiliary_loss_mlp": 0.01045331, "balance_loss_clip": 1.01635098, "balance_loss_mlp": 1.01971936, "epoch": 0.38641214489703896, "flos": 23585505016320.0, "grad_norm": 2.049402789011015, "language_loss": 0.76495999, "learning_rate": 2.80899974864781e-06, "loss": 0.78603327, "num_input_tokens_seen": 137977040, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.421875, "step": 6427, "time_per_iteration": 2.4035422801971436 }, { "auxiliary_loss_clip": 0.01060366, "auxiliary_loss_mlp": 0.01045904, "balance_loss_clip": 1.01767468, "balance_loss_mlp": 1.01917183, "epoch": 0.3864722681497069, "flos": 12640408341120.0, "grad_norm": 2.0717810882806846, "language_loss": 0.70686519, "learning_rate": 2.8086435557832203e-06, "loss": 0.72792786, "num_input_tokens_seen": 137993545, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41210938, "step": 6428, "time_per_iteration": 2.4081335067749023 }, { "auxiliary_loss_clip": 0.01063402, "auxiliary_loss_mlp": 0.01048087, "balance_loss_clip": 1.01990557, "balance_loss_mlp": 1.02087831, "epoch": 0.3865323914023749, "flos": 17598075269760.0, "grad_norm": 1.9747152575837872, "language_loss": 0.85430783, "learning_rate": 2.8082873322547863e-06, "loss": 0.87542272, "num_input_tokens_seen": 138010140, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.42578125, "step": 6429, "time_per_iteration": 2.3632004261016846 }, { "auxiliary_loss_clip": 0.01062031, "auxiliary_loss_mlp": 0.01043135, "balance_loss_clip": 1.0148114, "balance_loss_mlp": 1.02033734, "epoch": 0.38659251465504285, "flos": 18477956154240.0, "grad_norm": 1.9152502264010145, "language_loss": 0.82584089, "learning_rate": 2.807931078076015e-06, "loss": 0.8468926, "num_input_tokens_seen": 138028880, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.41601562, "step": 6430, "time_per_iteration": 3.9454731941223145 }, { "auxiliary_loss_clip": 0.01017662, "auxiliary_loss_mlp": 0.01005693, "balance_loss_clip": 1.00166416, "balance_loss_mlp": 1.00750947, "epoch": 0.3866526379077108, "flos": 64162259228160.0, "grad_norm": 0.7214581572232176, "language_loss": 0.58885682, "learning_rate": 2.807574793260416e-06, "loss": 0.60909033, "num_input_tokens_seen": 138098090, "router_z_loss_clip": 0.0402832, "router_z_loss_mlp": 0.1015625, "step": 6431, "time_per_iteration": 3.095679759979248 }, { "auxiliary_loss_clip": 0.01063823, "auxiliary_loss_mlp": 0.01049117, "balance_loss_clip": 1.0164777, "balance_loss_mlp": 1.01969934, "epoch": 0.3867127611603788, "flos": 14387532197760.0, "grad_norm": 1.9066484634153322, "language_loss": 0.79923284, "learning_rate": 2.8072184778215004e-06, "loss": 0.82036221, "num_input_tokens_seen": 138114735, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.44140625, "step": 6432, "time_per_iteration": 2.4057750701904297 }, { "auxiliary_loss_clip": 0.01065408, "auxiliary_loss_mlp": 0.0104321, "balance_loss_clip": 1.01394439, "balance_loss_mlp": 1.02040148, "epoch": 0.38677288441304675, "flos": 20009735544960.0, "grad_norm": 2.1716257499306804, "language_loss": 0.82504368, "learning_rate": 2.806862131772779e-06, "loss": 0.84612989, "num_input_tokens_seen": 138130480, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.44921875, "step": 6433, "time_per_iteration": 2.4107301235198975 }, { "auxiliary_loss_clip": 0.01063587, "auxiliary_loss_mlp": 0.01050401, "balance_loss_clip": 1.01727307, "balance_loss_mlp": 1.02024972, "epoch": 0.3868330076657147, "flos": 22235797699200.0, "grad_norm": 1.6875365109040825, "language_loss": 0.72012973, "learning_rate": 2.806505755127765e-06, "loss": 0.74126959, "num_input_tokens_seen": 138150640, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.43359375, "step": 6434, "time_per_iteration": 2.4042165279388428 }, { "auxiliary_loss_clip": 0.01066107, "auxiliary_loss_mlp": 0.01049667, "balance_loss_clip": 1.01810014, "balance_loss_mlp": 1.02002609, "epoch": 0.3868931309183827, "flos": 16726503288960.0, "grad_norm": 1.9219448438613114, "language_loss": 0.79497194, "learning_rate": 2.806149347899972e-06, "loss": 0.81612968, "num_input_tokens_seen": 138169700, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.4609375, "step": 6435, "time_per_iteration": 2.39243745803833 }, { "auxiliary_loss_clip": 0.01060914, "auxiliary_loss_mlp": 0.01044371, "balance_loss_clip": 1.0163691, "balance_loss_mlp": 1.01896596, "epoch": 0.38695325417105064, "flos": 22673608548480.0, "grad_norm": 1.7717351618753494, "language_loss": 0.81013906, "learning_rate": 2.805792910102915e-06, "loss": 0.8311919, "num_input_tokens_seen": 138185835, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41796875, "step": 6436, "time_per_iteration": 2.392469882965088 }, { "auxiliary_loss_clip": 0.01060357, "auxiliary_loss_mlp": 0.01044926, "balance_loss_clip": 1.01829481, "balance_loss_mlp": 1.01910722, "epoch": 0.3870133774237186, "flos": 23110930638720.0, "grad_norm": 1.6637151325115598, "language_loss": 0.77513492, "learning_rate": 2.8054364417501093e-06, "loss": 0.79618776, "num_input_tokens_seen": 138204080, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.4140625, "step": 6437, "time_per_iteration": 2.4074625968933105 }, { "auxiliary_loss_clip": 0.01061726, "auxiliary_loss_mlp": 0.01047955, "balance_loss_clip": 1.02163339, "balance_loss_mlp": 1.02015519, "epoch": 0.3870735006763866, "flos": 17674744348800.0, "grad_norm": 2.01287512020745, "language_loss": 0.82794023, "learning_rate": 2.805079942855074e-06, "loss": 0.84903705, "num_input_tokens_seen": 138220710, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.41601562, "step": 6438, "time_per_iteration": 2.3617351055145264 }, { "auxiliary_loss_clip": 0.01062094, "auxiliary_loss_mlp": 0.01050344, "balance_loss_clip": 1.02262819, "balance_loss_mlp": 1.0194459, "epoch": 0.38713362392905454, "flos": 23294643546240.0, "grad_norm": 1.4012944113029324, "language_loss": 0.76252091, "learning_rate": 2.804723413431326e-06, "loss": 0.78364527, "num_input_tokens_seen": 138241720, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.42578125, "step": 6439, "time_per_iteration": 2.430793285369873 }, { "auxiliary_loss_clip": 0.0105733, "auxiliary_loss_mlp": 0.01043878, "balance_loss_clip": 1.0182004, "balance_loss_mlp": 1.01809251, "epoch": 0.38719374718172256, "flos": 21030177029760.0, "grad_norm": 1.4636289750574472, "language_loss": 0.7507655, "learning_rate": 2.8043668534923855e-06, "loss": 0.77177763, "num_input_tokens_seen": 138261885, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.39257812, "step": 6440, "time_per_iteration": 2.3811075687408447 }, { "auxiliary_loss_clip": 0.01061714, "auxiliary_loss_mlp": 0.01050436, "balance_loss_clip": 1.02056265, "balance_loss_mlp": 1.01837993, "epoch": 0.3872538704343905, "flos": 19608758046720.0, "grad_norm": 1.8543697247829694, "language_loss": 0.82964098, "learning_rate": 2.804010263051774e-06, "loss": 0.85076249, "num_input_tokens_seen": 138280255, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.43359375, "step": 6441, "time_per_iteration": 2.3872947692871094 }, { "auxiliary_loss_clip": 0.01060987, "auxiliary_loss_mlp": 0.01057104, "balance_loss_clip": 1.03041279, "balance_loss_mlp": 1.01961088, "epoch": 0.3873139936870585, "flos": 17529086689920.0, "grad_norm": 21.527129005084454, "language_loss": 0.82202709, "learning_rate": 2.8036536421230118e-06, "loss": 0.84320801, "num_input_tokens_seen": 138296675, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.4140625, "step": 6442, "time_per_iteration": 2.361820936203003 }, { "auxiliary_loss_clip": 0.01060249, "auxiliary_loss_mlp": 0.01044017, "balance_loss_clip": 1.01529944, "balance_loss_mlp": 1.01914263, "epoch": 0.38737411693972645, "flos": 17785558604160.0, "grad_norm": 1.673484587668203, "language_loss": 0.85736901, "learning_rate": 2.803296990719624e-06, "loss": 0.87841165, "num_input_tokens_seen": 138314985, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.41015625, "step": 6443, "time_per_iteration": 2.3715953826904297 }, { "auxiliary_loss_clip": 0.01012298, "auxiliary_loss_mlp": 0.0100887, "balance_loss_clip": 1.00465024, "balance_loss_mlp": 1.0031724, "epoch": 0.3874342401923944, "flos": 58301984119680.0, "grad_norm": 0.7634005985629908, "language_loss": 0.50225204, "learning_rate": 2.8029403088551327e-06, "loss": 0.52246368, "num_input_tokens_seen": 138373275, "router_z_loss_clip": 0.04223633, "router_z_loss_mlp": 0.09082031, "step": 6444, "time_per_iteration": 3.030442714691162 }, { "auxiliary_loss_clip": 0.0105651, "auxiliary_loss_mlp": 0.0103998, "balance_loss_clip": 1.01579249, "balance_loss_mlp": 1.01795161, "epoch": 0.3874943634450624, "flos": 17710984206720.0, "grad_norm": 1.490345437053535, "language_loss": 0.80078948, "learning_rate": 2.802583596543065e-06, "loss": 0.82175446, "num_input_tokens_seen": 138391145, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.38671875, "step": 6445, "time_per_iteration": 2.3960797786712646 }, { "auxiliary_loss_clip": 0.01061226, "auxiliary_loss_mlp": 0.01041364, "balance_loss_clip": 1.01571035, "balance_loss_mlp": 1.02110052, "epoch": 0.38755448669773035, "flos": 19243845849600.0, "grad_norm": 1.831930497781827, "language_loss": 0.82948619, "learning_rate": 2.8022268537969474e-06, "loss": 0.85051215, "num_input_tokens_seen": 138409875, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.40234375, "step": 6446, "time_per_iteration": 2.3920023441314697 }, { "auxiliary_loss_clip": 0.01060827, "auxiliary_loss_mlp": 0.01044929, "balance_loss_clip": 1.01817894, "balance_loss_mlp": 1.01972294, "epoch": 0.3876146099503983, "flos": 20593238964480.0, "grad_norm": 1.6757828249897224, "language_loss": 0.78865302, "learning_rate": 2.801870080630306e-06, "loss": 0.80971056, "num_input_tokens_seen": 138428965, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.41015625, "step": 6447, "time_per_iteration": 2.409745216369629 }, { "auxiliary_loss_clip": 0.01061149, "auxiliary_loss_mlp": 0.01040037, "balance_loss_clip": 1.01409721, "balance_loss_mlp": 1.02108371, "epoch": 0.3876747332030663, "flos": 19280120618880.0, "grad_norm": 1.8894455545854087, "language_loss": 0.78239703, "learning_rate": 2.801513277056671e-06, "loss": 0.80340886, "num_input_tokens_seen": 138448090, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.40039062, "step": 6448, "time_per_iteration": 2.3877058029174805 }, { "auxiliary_loss_clip": 0.01060076, "auxiliary_loss_mlp": 0.01039273, "balance_loss_clip": 1.01472807, "balance_loss_mlp": 1.0203433, "epoch": 0.38773485645573424, "flos": 18945094412160.0, "grad_norm": 1.7702356710381366, "language_loss": 0.76773614, "learning_rate": 2.8011564430895725e-06, "loss": 0.78872955, "num_input_tokens_seen": 138466105, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3984375, "step": 6449, "time_per_iteration": 2.38065242767334 }, { "auxiliary_loss_clip": 0.01062292, "auxiliary_loss_mlp": 0.01046772, "balance_loss_clip": 1.01891279, "balance_loss_mlp": 1.01953363, "epoch": 0.3877949797084022, "flos": 23070361772160.0, "grad_norm": 1.6732670503903657, "language_loss": 0.79767632, "learning_rate": 2.800799578742542e-06, "loss": 0.81876695, "num_input_tokens_seen": 138485160, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.42773438, "step": 6450, "time_per_iteration": 2.422934055328369 }, { "auxiliary_loss_clip": 0.01065663, "auxiliary_loss_mlp": 0.01051454, "balance_loss_clip": 1.02260482, "balance_loss_mlp": 1.02066088, "epoch": 0.3878551029610702, "flos": 29094275756160.0, "grad_norm": 3.0251453218986195, "language_loss": 0.7911284, "learning_rate": 2.8004426840291106e-06, "loss": 0.81229955, "num_input_tokens_seen": 138504135, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.44921875, "step": 6451, "time_per_iteration": 2.4713737964630127 }, { "auxiliary_loss_clip": 0.01060095, "auxiliary_loss_mlp": 0.01045733, "balance_loss_clip": 1.02047253, "balance_loss_mlp": 1.0198648, "epoch": 0.38791522621373814, "flos": 20995333626240.0, "grad_norm": 1.8219948832117312, "language_loss": 0.77993232, "learning_rate": 2.800085758962812e-06, "loss": 0.80099064, "num_input_tokens_seen": 138523955, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.40234375, "step": 6452, "time_per_iteration": 2.4121556282043457 }, { "auxiliary_loss_clip": 0.01063254, "auxiliary_loss_mlp": 0.01049153, "balance_loss_clip": 1.02198517, "balance_loss_mlp": 1.02200794, "epoch": 0.3879753494664061, "flos": 15485934481920.0, "grad_norm": 1.470247511791582, "language_loss": 0.80659413, "learning_rate": 2.799728803557182e-06, "loss": 0.82771814, "num_input_tokens_seen": 138541655, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.4140625, "step": 6453, "time_per_iteration": 2.475649356842041 }, { "auxiliary_loss_clip": 0.0106681, "auxiliary_loss_mlp": 0.01049711, "balance_loss_clip": 1.01807261, "balance_loss_mlp": 1.0214504, "epoch": 0.3880354727190741, "flos": 22052887752960.0, "grad_norm": 1.5891469118744868, "language_loss": 0.72584623, "learning_rate": 2.7993718178257555e-06, "loss": 0.74701148, "num_input_tokens_seen": 138560860, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.453125, "step": 6454, "time_per_iteration": 2.4295310974121094 }, { "auxiliary_loss_clip": 0.01067298, "auxiliary_loss_mlp": 0.01047969, "balance_loss_clip": 1.01644993, "balance_loss_mlp": 1.02258372, "epoch": 0.3880955959717421, "flos": 20339245756800.0, "grad_norm": 1.7023499961137403, "language_loss": 0.78772491, "learning_rate": 2.7990148017820694e-06, "loss": 0.80887765, "num_input_tokens_seen": 138580200, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.44726562, "step": 6455, "time_per_iteration": 2.5071732997894287 }, { "auxiliary_loss_clip": 0.01061406, "auxiliary_loss_mlp": 0.01053657, "balance_loss_clip": 1.02521348, "balance_loss_mlp": 1.0200671, "epoch": 0.38815571922441006, "flos": 23074306755840.0, "grad_norm": 1.5222459092012772, "language_loss": 0.77230108, "learning_rate": 2.798657755439662e-06, "loss": 0.79345179, "num_input_tokens_seen": 138598315, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.4140625, "step": 6456, "time_per_iteration": 2.3809947967529297 }, { "auxiliary_loss_clip": 0.01064584, "auxiliary_loss_mlp": 0.01044533, "balance_loss_clip": 1.01672161, "balance_loss_mlp": 1.02071691, "epoch": 0.388215842477078, "flos": 20775904531200.0, "grad_norm": 2.316604987255037, "language_loss": 0.6326412, "learning_rate": 2.7983006788120726e-06, "loss": 0.6537323, "num_input_tokens_seen": 138615695, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.43945312, "step": 6457, "time_per_iteration": 2.376450300216675 }, { "auxiliary_loss_clip": 0.01062619, "auxiliary_loss_mlp": 0.01055382, "balance_loss_clip": 1.02326655, "balance_loss_mlp": 1.0182606, "epoch": 0.388275965729746, "flos": 20447162369280.0, "grad_norm": 2.1638512303771344, "language_loss": 0.81780559, "learning_rate": 2.797943571912841e-06, "loss": 0.83898562, "num_input_tokens_seen": 138633180, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.44335938, "step": 6458, "time_per_iteration": 2.4131245613098145 }, { "auxiliary_loss_clip": 0.01063265, "auxiliary_loss_mlp": 0.01043214, "balance_loss_clip": 1.01395965, "balance_loss_mlp": 1.01965165, "epoch": 0.38833608898241395, "flos": 27891133793280.0, "grad_norm": 2.045223033388776, "language_loss": 0.83118916, "learning_rate": 2.797586434755509e-06, "loss": 0.85225391, "num_input_tokens_seen": 138654785, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.43554688, "step": 6459, "time_per_iteration": 2.43587589263916 }, { "auxiliary_loss_clip": 0.01062843, "auxiliary_loss_mlp": 0.01043114, "balance_loss_clip": 1.0150516, "balance_loss_mlp": 1.01989114, "epoch": 0.3883962122350819, "flos": 18075442556160.0, "grad_norm": 1.5853541933325563, "language_loss": 0.63265228, "learning_rate": 2.7972292673536202e-06, "loss": 0.65371192, "num_input_tokens_seen": 138673330, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.4296875, "step": 6460, "time_per_iteration": 2.3841888904571533 }, { "auxiliary_loss_clip": 0.01061872, "auxiliary_loss_mlp": 0.01046753, "balance_loss_clip": 1.01885831, "balance_loss_mlp": 1.01979089, "epoch": 0.3884563354877499, "flos": 23621151381120.0, "grad_norm": 1.577150847689544, "language_loss": 0.87101662, "learning_rate": 2.796872069720717e-06, "loss": 0.8921029, "num_input_tokens_seen": 138694185, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.421875, "step": 6461, "time_per_iteration": 2.4171931743621826 }, { "auxiliary_loss_clip": 0.01064089, "auxiliary_loss_mlp": 0.01049527, "balance_loss_clip": 1.01762712, "balance_loss_mlp": 1.02028394, "epoch": 0.38851645874041785, "flos": 27452310514560.0, "grad_norm": 2.371371523605077, "language_loss": 0.7390269, "learning_rate": 2.7965148418703456e-06, "loss": 0.76016307, "num_input_tokens_seen": 138714625, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.4375, "step": 6462, "time_per_iteration": 2.439781427383423 }, { "auxiliary_loss_clip": 0.01062176, "auxiliary_loss_mlp": 0.01047616, "balance_loss_clip": 1.01819491, "balance_loss_mlp": 1.01860368, "epoch": 0.3885765819930858, "flos": 25226911676160.0, "grad_norm": 1.9216882191753863, "language_loss": 0.77695143, "learning_rate": 2.796157583816052e-06, "loss": 0.79804939, "num_input_tokens_seen": 138733585, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.43554688, "step": 6463, "time_per_iteration": 2.413299083709717 }, { "auxiliary_loss_clip": 0.010653, "auxiliary_loss_mlp": 0.01054726, "balance_loss_clip": 1.01986969, "balance_loss_mlp": 1.02054679, "epoch": 0.3886367052457538, "flos": 16945653093120.0, "grad_norm": 2.017860276909241, "language_loss": 0.7214905, "learning_rate": 2.795800295571382e-06, "loss": 0.7426908, "num_input_tokens_seen": 138752335, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.4453125, "step": 6464, "time_per_iteration": 2.391571044921875 }, { "auxiliary_loss_clip": 0.01063279, "auxiliary_loss_mlp": 0.01044569, "balance_loss_clip": 1.01632857, "balance_loss_mlp": 1.02201366, "epoch": 0.38869682849842174, "flos": 27153140140800.0, "grad_norm": 2.442741168952276, "language_loss": 0.70621943, "learning_rate": 2.7954429771498858e-06, "loss": 0.7272979, "num_input_tokens_seen": 138768450, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.4140625, "step": 6465, "time_per_iteration": 3.8800346851348877 }, { "auxiliary_loss_clip": 0.01063244, "auxiliary_loss_mlp": 0.0104923, "balance_loss_clip": 1.01773429, "balance_loss_mlp": 1.01934361, "epoch": 0.3887569517510897, "flos": 21062716283520.0, "grad_norm": 2.2732253091596526, "language_loss": 0.79298919, "learning_rate": 2.7950856285651117e-06, "loss": 0.81411386, "num_input_tokens_seen": 138786775, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.43945312, "step": 6466, "time_per_iteration": 2.3741443157196045 }, { "auxiliary_loss_clip": 0.0106486, "auxiliary_loss_mlp": 0.01048039, "balance_loss_clip": 1.01673412, "balance_loss_mlp": 1.02086878, "epoch": 0.38881707500375773, "flos": 29496091127040.0, "grad_norm": 1.5520402516450096, "language_loss": 0.70784736, "learning_rate": 2.794728249830611e-06, "loss": 0.72897637, "num_input_tokens_seen": 138810100, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.43945312, "step": 6467, "time_per_iteration": 2.485211133956909 }, { "auxiliary_loss_clip": 0.01063962, "auxiliary_loss_mlp": 0.01054622, "balance_loss_clip": 1.02014625, "balance_loss_mlp": 1.01937342, "epoch": 0.3888771982564257, "flos": 17487470482560.0, "grad_norm": 2.3776768765907765, "language_loss": 0.84457862, "learning_rate": 2.794370840959936e-06, "loss": 0.8657645, "num_input_tokens_seen": 138825140, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.4453125, "step": 6468, "time_per_iteration": 2.3377246856689453 }, { "auxiliary_loss_clip": 0.01064021, "auxiliary_loss_mlp": 0.01044626, "balance_loss_clip": 1.01533604, "balance_loss_mlp": 1.01981914, "epoch": 0.38893732150909366, "flos": 21941410181760.0, "grad_norm": 1.710475754605988, "language_loss": 0.84923679, "learning_rate": 2.7940134019666383e-06, "loss": 0.8703233, "num_input_tokens_seen": 138844115, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.44140625, "step": 6469, "time_per_iteration": 3.780565023422241 }, { "auxiliary_loss_clip": 0.01061598, "auxiliary_loss_mlp": 0.01051014, "balance_loss_clip": 1.01768279, "balance_loss_mlp": 1.01834941, "epoch": 0.3889974447617616, "flos": 24275319125760.0, "grad_norm": 1.836174943494901, "language_loss": 0.76081049, "learning_rate": 2.793655932864273e-06, "loss": 0.78193659, "num_input_tokens_seen": 138860860, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.43359375, "step": 6470, "time_per_iteration": 3.917071580886841 }, { "auxiliary_loss_clip": 0.01064658, "auxiliary_loss_mlp": 0.01053632, "balance_loss_clip": 1.0184536, "balance_loss_mlp": 1.01891446, "epoch": 0.3890575680144296, "flos": 25665909511680.0, "grad_norm": 1.585689080283161, "language_loss": 0.75694549, "learning_rate": 2.7932984336663953e-06, "loss": 0.77812839, "num_input_tokens_seen": 138881910, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.45703125, "step": 6471, "time_per_iteration": 2.443952798843384 }, { "auxiliary_loss_clip": 0.01064861, "auxiliary_loss_mlp": 0.01044133, "balance_loss_clip": 1.01132679, "balance_loss_mlp": 1.02125216, "epoch": 0.38911769126709755, "flos": 22854214344960.0, "grad_norm": 1.589670907747918, "language_loss": 0.6808368, "learning_rate": 2.792940904386562e-06, "loss": 0.70192671, "num_input_tokens_seen": 138900975, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.43554688, "step": 6472, "time_per_iteration": 2.413602113723755 }, { "auxiliary_loss_clip": 0.0106575, "auxiliary_loss_mlp": 0.01054168, "balance_loss_clip": 1.02169502, "balance_loss_mlp": 1.02150154, "epoch": 0.3891778145197655, "flos": 25446340771200.0, "grad_norm": 1.6883131244629637, "language_loss": 0.77341211, "learning_rate": 2.7925833450383293e-06, "loss": 0.79461133, "num_input_tokens_seen": 138920795, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.44140625, "step": 6473, "time_per_iteration": 2.4130804538726807 }, { "auxiliary_loss_clip": 0.01066656, "auxiliary_loss_mlp": 0.01049772, "balance_loss_clip": 1.01737142, "balance_loss_mlp": 1.02219963, "epoch": 0.3892379377724335, "flos": 14027088654720.0, "grad_norm": 2.038268362633952, "language_loss": 0.73478693, "learning_rate": 2.792225755635257e-06, "loss": 0.75595123, "num_input_tokens_seen": 138938770, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.4453125, "step": 6474, "time_per_iteration": 2.371166706085205 }, { "auxiliary_loss_clip": 0.01065157, "auxiliary_loss_mlp": 0.01048632, "balance_loss_clip": 1.01592112, "balance_loss_mlp": 1.02054715, "epoch": 0.38929806102510145, "flos": 20156405633280.0, "grad_norm": 1.9418069518315269, "language_loss": 0.69615209, "learning_rate": 2.7918681361909046e-06, "loss": 0.71728992, "num_input_tokens_seen": 138958880, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.4453125, "step": 6475, "time_per_iteration": 2.383227586746216 }, { "auxiliary_loss_clip": 0.01070534, "auxiliary_loss_mlp": 0.01054867, "balance_loss_clip": 1.01958108, "balance_loss_mlp": 1.02297306, "epoch": 0.3893581842777694, "flos": 22162864135680.0, "grad_norm": 1.981392822593474, "language_loss": 0.77203846, "learning_rate": 2.7915104867188332e-06, "loss": 0.79329252, "num_input_tokens_seen": 138977240, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.4765625, "step": 6476, "time_per_iteration": 2.4091298580169678 }, { "auxiliary_loss_clip": 0.01017403, "auxiliary_loss_mlp": 0.01026375, "balance_loss_clip": 1.02236938, "balance_loss_mlp": 1.00620866, "epoch": 0.3894183075304374, "flos": 67298367548160.0, "grad_norm": 0.7978657656102623, "language_loss": 0.58288419, "learning_rate": 2.7911528072326055e-06, "loss": 0.60332203, "num_input_tokens_seen": 139039035, "router_z_loss_clip": 0.04003906, "router_z_loss_mlp": 0.11181641, "step": 6477, "time_per_iteration": 3.0182688236236572 }, { "auxiliary_loss_clip": 0.01065399, "auxiliary_loss_mlp": 0.01052489, "balance_loss_clip": 1.01806128, "balance_loss_mlp": 1.02070808, "epoch": 0.38947843078310534, "flos": 18546630531840.0, "grad_norm": 1.9136510163879519, "language_loss": 0.79007065, "learning_rate": 2.7907950977457832e-06, "loss": 0.81124949, "num_input_tokens_seen": 139055560, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.44726562, "step": 6478, "time_per_iteration": 2.3737242221832275 }, { "auxiliary_loss_clip": 0.01065334, "auxiliary_loss_mlp": 0.01045453, "balance_loss_clip": 1.01445854, "balance_loss_mlp": 1.02079141, "epoch": 0.3895385540357733, "flos": 14605145902080.0, "grad_norm": 2.122791233193536, "language_loss": 0.8431468, "learning_rate": 2.7904373582719317e-06, "loss": 0.86425465, "num_input_tokens_seen": 139071865, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.4453125, "step": 6479, "time_per_iteration": 2.3437395095825195 }, { "auxiliary_loss_clip": 0.01063692, "auxiliary_loss_mlp": 0.01043869, "balance_loss_clip": 1.0154022, "balance_loss_mlp": 1.02081919, "epoch": 0.38959867728844133, "flos": 19974159002880.0, "grad_norm": 9.338544083907466, "language_loss": 0.81787723, "learning_rate": 2.790079588824617e-06, "loss": 0.8389529, "num_input_tokens_seen": 139089640, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4296875, "step": 6480, "time_per_iteration": 2.3887434005737305 }, { "auxiliary_loss_clip": 0.01064235, "auxiliary_loss_mlp": 0.01046701, "balance_loss_clip": 1.0175066, "balance_loss_mlp": 1.02198362, "epoch": 0.3896588005411093, "flos": 22671094930560.0, "grad_norm": 1.8766277634015556, "language_loss": 0.84878016, "learning_rate": 2.7897217894174038e-06, "loss": 0.8698895, "num_input_tokens_seen": 139109365, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.421875, "step": 6481, "time_per_iteration": 2.403028964996338 }, { "auxiliary_loss_clip": 0.01063689, "auxiliary_loss_mlp": 0.01045317, "balance_loss_clip": 1.01719522, "balance_loss_mlp": 1.02153039, "epoch": 0.38971892379377726, "flos": 20994984512640.0, "grad_norm": 1.7776804059305822, "language_loss": 0.76262754, "learning_rate": 2.789363960063863e-06, "loss": 0.78371763, "num_input_tokens_seen": 139128260, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.421875, "step": 6482, "time_per_iteration": 2.3985133171081543 }, { "auxiliary_loss_clip": 0.01065119, "auxiliary_loss_mlp": 0.01057507, "balance_loss_clip": 1.02763295, "balance_loss_mlp": 1.02162635, "epoch": 0.3897790470464452, "flos": 22527392307840.0, "grad_norm": 2.1898915449714624, "language_loss": 0.80150783, "learning_rate": 2.78900610077756e-06, "loss": 0.82273412, "num_input_tokens_seen": 139147315, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.43554688, "step": 6483, "time_per_iteration": 2.40327525138855 }, { "auxiliary_loss_clip": 0.01063722, "auxiliary_loss_mlp": 0.01049425, "balance_loss_clip": 1.01692903, "balance_loss_mlp": 1.02003539, "epoch": 0.3898391702991132, "flos": 26208809153280.0, "grad_norm": 1.4963497281179574, "language_loss": 0.80808854, "learning_rate": 2.788648211572067e-06, "loss": 0.82922006, "num_input_tokens_seen": 139167270, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.4375, "step": 6484, "time_per_iteration": 2.411647319793701 }, { "auxiliary_loss_clip": 0.01064651, "auxiliary_loss_mlp": 0.01051736, "balance_loss_clip": 1.01702237, "balance_loss_mlp": 1.02113569, "epoch": 0.38989929355178116, "flos": 21064601496960.0, "grad_norm": 1.6161260121067504, "language_loss": 0.78641796, "learning_rate": 2.7882902924609557e-06, "loss": 0.80758184, "num_input_tokens_seen": 139185970, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.43554688, "step": 6485, "time_per_iteration": 2.405679941177368 }, { "auxiliary_loss_clip": 0.01066839, "auxiliary_loss_mlp": 0.01053611, "balance_loss_clip": 1.02130556, "balance_loss_mlp": 1.02238488, "epoch": 0.3899594168044491, "flos": 25482929742720.0, "grad_norm": 2.4859184371205116, "language_loss": 0.87100697, "learning_rate": 2.7879323434577965e-06, "loss": 0.89221144, "num_input_tokens_seen": 139203730, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.4453125, "step": 6486, "time_per_iteration": 2.413532257080078 }, { "auxiliary_loss_clip": 0.01067162, "auxiliary_loss_mlp": 0.01059892, "balance_loss_clip": 1.02749085, "balance_loss_mlp": 1.02110052, "epoch": 0.3900195400571171, "flos": 31138021457280.0, "grad_norm": 2.1206154685395116, "language_loss": 0.87121594, "learning_rate": 2.7875743645761645e-06, "loss": 0.89248651, "num_input_tokens_seen": 139222560, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.4609375, "step": 6487, "time_per_iteration": 2.480565071105957 }, { "auxiliary_loss_clip": 0.010635, "auxiliary_loss_mlp": 0.01059299, "balance_loss_clip": 1.02539551, "balance_loss_mlp": 1.02053165, "epoch": 0.39007966330978505, "flos": 20228885349120.0, "grad_norm": 1.5151998717978257, "language_loss": 0.74351346, "learning_rate": 2.787216355829633e-06, "loss": 0.76474148, "num_input_tokens_seen": 139242165, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.4296875, "step": 6488, "time_per_iteration": 2.3809475898742676 }, { "auxiliary_loss_clip": 0.01065706, "auxiliary_loss_mlp": 0.01062761, "balance_loss_clip": 1.03010917, "balance_loss_mlp": 1.02093172, "epoch": 0.390139786562453, "flos": 22527636687360.0, "grad_norm": 1.9876581268588358, "language_loss": 0.69809145, "learning_rate": 2.786858317231779e-06, "loss": 0.71937609, "num_input_tokens_seen": 139262525, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.44726562, "step": 6489, "time_per_iteration": 2.4261162281036377 }, { "auxiliary_loss_clip": 0.01060831, "auxiliary_loss_mlp": 0.01052062, "balance_loss_clip": 1.0225693, "balance_loss_mlp": 1.01978946, "epoch": 0.390199909815121, "flos": 26431694472960.0, "grad_norm": 1.7758799186442942, "language_loss": 0.81930304, "learning_rate": 2.7865002487961788e-06, "loss": 0.84043193, "num_input_tokens_seen": 139282835, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.41015625, "step": 6490, "time_per_iteration": 2.421757221221924 }, { "auxiliary_loss_clip": 0.01063689, "auxiliary_loss_mlp": 0.01056076, "balance_loss_clip": 1.02543926, "balance_loss_mlp": 1.02028477, "epoch": 0.39026003306778895, "flos": 17273627205120.0, "grad_norm": 1.7921643596368593, "language_loss": 0.90527201, "learning_rate": 2.7861421505364104e-06, "loss": 0.92646968, "num_input_tokens_seen": 139299490, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.43359375, "step": 6491, "time_per_iteration": 2.3892433643341064 }, { "auxiliary_loss_clip": 0.01064391, "auxiliary_loss_mlp": 0.0106275, "balance_loss_clip": 1.03011012, "balance_loss_mlp": 1.01965189, "epoch": 0.3903201563204569, "flos": 24531756128640.0, "grad_norm": 4.035657799360741, "language_loss": 0.79826295, "learning_rate": 2.7857840224660523e-06, "loss": 0.81953442, "num_input_tokens_seen": 139317865, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.44726562, "step": 6492, "time_per_iteration": 2.408308267593384 }, { "auxiliary_loss_clip": 0.01065573, "auxiliary_loss_mlp": 0.01052015, "balance_loss_clip": 1.02094889, "balance_loss_mlp": 1.02097201, "epoch": 0.39038027957312493, "flos": 23766843951360.0, "grad_norm": 1.905768494451364, "language_loss": 0.7530489, "learning_rate": 2.7854258645986857e-06, "loss": 0.77422476, "num_input_tokens_seen": 139339840, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.4453125, "step": 6493, "time_per_iteration": 2.4645166397094727 }, { "auxiliary_loss_clip": 0.01067379, "auxiliary_loss_mlp": 0.0105772, "balance_loss_clip": 1.02126575, "balance_loss_mlp": 1.02089214, "epoch": 0.3904404028257929, "flos": 14099742927360.0, "grad_norm": 1.9813766373597879, "language_loss": 0.78167391, "learning_rate": 2.7850676769478916e-06, "loss": 0.80292487, "num_input_tokens_seen": 139357555, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.46484375, "step": 6494, "time_per_iteration": 2.350229501724243 }, { "auxiliary_loss_clip": 0.01069138, "auxiliary_loss_mlp": 0.01061735, "balance_loss_clip": 1.02530468, "balance_loss_mlp": 1.02042484, "epoch": 0.39050052607846086, "flos": 16909099032960.0, "grad_norm": 3.77379751661921, "language_loss": 0.76744872, "learning_rate": 2.7847094595272525e-06, "loss": 0.78875744, "num_input_tokens_seen": 139374455, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.48632812, "step": 6495, "time_per_iteration": 2.400339126586914 }, { "auxiliary_loss_clip": 0.01065756, "auxiliary_loss_mlp": 0.01059378, "balance_loss_clip": 1.02568924, "balance_loss_mlp": 1.02132487, "epoch": 0.39056064933112883, "flos": 25914735838080.0, "grad_norm": 1.6599506049462327, "language_loss": 0.69175953, "learning_rate": 2.784351212350352e-06, "loss": 0.71301091, "num_input_tokens_seen": 139394770, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.4453125, "step": 6496, "time_per_iteration": 2.427006483078003 }, { "auxiliary_loss_clip": 0.01018877, "auxiliary_loss_mlp": 0.01011377, "balance_loss_clip": 1.00720513, "balance_loss_mlp": 1.00844216, "epoch": 0.3906207725837968, "flos": 60025471119360.0, "grad_norm": 0.6649028229824571, "language_loss": 0.54045177, "learning_rate": 2.783992935430775e-06, "loss": 0.5607543, "num_input_tokens_seen": 139454760, "router_z_loss_clip": 0.04174805, "router_z_loss_mlp": 0.10449219, "step": 6497, "time_per_iteration": 3.1214606761932373 }, { "auxiliary_loss_clip": 0.01066919, "auxiliary_loss_mlp": 0.0104874, "balance_loss_clip": 1.01629138, "balance_loss_mlp": 1.02242422, "epoch": 0.39068089583646476, "flos": 21067638785280.0, "grad_norm": 2.339866419286431, "language_loss": 0.70322049, "learning_rate": 2.7836346287821068e-06, "loss": 0.72437704, "num_input_tokens_seen": 139472645, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.4453125, "step": 6498, "time_per_iteration": 2.381258487701416 }, { "auxiliary_loss_clip": 0.0102036, "auxiliary_loss_mlp": 0.01017649, "balance_loss_clip": 1.01414406, "balance_loss_mlp": 1.00987601, "epoch": 0.3907410190891327, "flos": 70441911987840.0, "grad_norm": 0.745130211315678, "language_loss": 0.51837468, "learning_rate": 2.783276292417936e-06, "loss": 0.5387547, "num_input_tokens_seen": 139536730, "router_z_loss_clip": 0.03515625, "router_z_loss_mlp": 0.10498047, "step": 6499, "time_per_iteration": 3.0878586769104004 }, { "auxiliary_loss_clip": 0.01068982, "auxiliary_loss_mlp": 0.01054842, "balance_loss_clip": 1.01931739, "balance_loss_mlp": 1.02294576, "epoch": 0.3908011423418007, "flos": 27961274448000.0, "grad_norm": 1.847627525839178, "language_loss": 0.74523389, "learning_rate": 2.7829179263518487e-06, "loss": 0.76647216, "num_input_tokens_seen": 139557540, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.4609375, "step": 6500, "time_per_iteration": 2.4462313652038574 }, { "auxiliary_loss_clip": 0.01068146, "auxiliary_loss_mlp": 0.01045851, "balance_loss_clip": 1.01430869, "balance_loss_mlp": 1.02328134, "epoch": 0.39086126559446865, "flos": 24460952158080.0, "grad_norm": 3.2205083920803794, "language_loss": 0.7015481, "learning_rate": 2.7825595305974354e-06, "loss": 0.72268808, "num_input_tokens_seen": 139576875, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.44921875, "step": 6501, "time_per_iteration": 2.444197177886963 }, { "auxiliary_loss_clip": 0.01068597, "auxiliary_loss_mlp": 0.01052196, "balance_loss_clip": 1.01757717, "balance_loss_mlp": 1.02382815, "epoch": 0.3909213888471366, "flos": 16940730591360.0, "grad_norm": 1.713059225427031, "language_loss": 0.79409224, "learning_rate": 2.782201105168287e-06, "loss": 0.81530023, "num_input_tokens_seen": 139594295, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.44921875, "step": 6502, "time_per_iteration": 2.3609795570373535 }, { "auxiliary_loss_clip": 0.01064731, "auxiliary_loss_mlp": 0.01048644, "balance_loss_clip": 1.02046239, "balance_loss_mlp": 1.02351642, "epoch": 0.3909815120998046, "flos": 29277115879680.0, "grad_norm": 2.4703372250825386, "language_loss": 0.81788051, "learning_rate": 2.7818426500779932e-06, "loss": 0.83901429, "num_input_tokens_seen": 139614080, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41210938, "step": 6503, "time_per_iteration": 2.454406976699829 }, { "auxiliary_loss_clip": 0.01065397, "auxiliary_loss_mlp": 0.0104662, "balance_loss_clip": 1.01773596, "balance_loss_mlp": 1.0238595, "epoch": 0.39104163535247255, "flos": 18950296204800.0, "grad_norm": 1.8273875549905592, "language_loss": 0.73159319, "learning_rate": 2.7814841653401485e-06, "loss": 0.75271332, "num_input_tokens_seen": 139632755, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41601562, "step": 6504, "time_per_iteration": 2.3722448348999023 }, { "auxiliary_loss_clip": 0.01065497, "auxiliary_loss_mlp": 0.01057317, "balance_loss_clip": 1.02648997, "balance_loss_mlp": 1.02201724, "epoch": 0.3911017586051405, "flos": 26322137026560.0, "grad_norm": 1.4651605436365642, "language_loss": 0.8394556, "learning_rate": 2.7811256509683454e-06, "loss": 0.86068368, "num_input_tokens_seen": 139654205, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.43359375, "step": 6505, "time_per_iteration": 5.270334482192993 }, { "auxiliary_loss_clip": 0.01065294, "auxiliary_loss_mlp": 0.01057143, "balance_loss_clip": 1.02316856, "balance_loss_mlp": 1.02294075, "epoch": 0.3911618818578085, "flos": 21834680555520.0, "grad_norm": 1.8705428989368649, "language_loss": 0.73101103, "learning_rate": 2.7807671069761797e-06, "loss": 0.75223541, "num_input_tokens_seen": 139673595, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.421875, "step": 6506, "time_per_iteration": 2.430527925491333 }, { "auxiliary_loss_clip": 0.01062762, "auxiliary_loss_mlp": 0.01056336, "balance_loss_clip": 1.0260334, "balance_loss_mlp": 1.02117515, "epoch": 0.3912220051104765, "flos": 16358833094400.0, "grad_norm": 1.9569521034959418, "language_loss": 0.76305038, "learning_rate": 2.7804085333772477e-06, "loss": 0.78424138, "num_input_tokens_seen": 139690565, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.4140625, "step": 6507, "time_per_iteration": 2.365361213684082 }, { "auxiliary_loss_clip": 0.01018651, "auxiliary_loss_mlp": 0.01015946, "balance_loss_clip": 1.01227391, "balance_loss_mlp": 1.00903916, "epoch": 0.39128212836314447, "flos": 71047620898560.0, "grad_norm": 0.7633167338955782, "language_loss": 0.56603694, "learning_rate": 2.7800499301851446e-06, "loss": 0.58638293, "num_input_tokens_seen": 139749420, "router_z_loss_clip": 0.03662109, "router_z_loss_mlp": 0.09619141, "step": 6508, "time_per_iteration": 4.528662919998169 }, { "auxiliary_loss_clip": 0.0106536, "auxiliary_loss_mlp": 0.01058723, "balance_loss_clip": 1.02817011, "balance_loss_mlp": 1.02144194, "epoch": 0.39134225161581243, "flos": 20331146321280.0, "grad_norm": 1.9468153009481834, "language_loss": 0.78137577, "learning_rate": 2.779691297413471e-06, "loss": 0.80261666, "num_input_tokens_seen": 139766265, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.43945312, "step": 6509, "time_per_iteration": 3.87225604057312 }, { "auxiliary_loss_clip": 0.01065372, "auxiliary_loss_mlp": 0.01058082, "balance_loss_clip": 1.02308249, "balance_loss_mlp": 1.01997685, "epoch": 0.3914023748684804, "flos": 17017469493120.0, "grad_norm": 2.680423175968471, "language_loss": 0.84814018, "learning_rate": 2.779332635075825e-06, "loss": 0.86937475, "num_input_tokens_seen": 139782400, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.45507812, "step": 6510, "time_per_iteration": 2.371619462966919 }, { "auxiliary_loss_clip": 0.01065645, "auxiliary_loss_mlp": 0.01061922, "balance_loss_clip": 1.02861524, "balance_loss_mlp": 1.02097833, "epoch": 0.39146249812114836, "flos": 18404254540800.0, "grad_norm": 1.8569065734046586, "language_loss": 0.78263903, "learning_rate": 2.7789739431858073e-06, "loss": 0.80391473, "num_input_tokens_seen": 139801435, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.44726562, "step": 6511, "time_per_iteration": 2.3658549785614014 }, { "auxiliary_loss_clip": 0.01015642, "auxiliary_loss_mlp": 0.0101112, "balance_loss_clip": 1.00740111, "balance_loss_mlp": 1.00523329, "epoch": 0.3915226213738163, "flos": 67633638134400.0, "grad_norm": 0.7250172145191213, "language_loss": 0.57859558, "learning_rate": 2.7786152217570196e-06, "loss": 0.59886318, "num_input_tokens_seen": 139869700, "router_z_loss_clip": 0.03710938, "router_z_loss_mlp": 0.10400391, "step": 6512, "time_per_iteration": 3.135610580444336 }, { "auxiliary_loss_clip": 0.01065327, "auxiliary_loss_mlp": 0.01055265, "balance_loss_clip": 1.02104032, "balance_loss_mlp": 1.02036214, "epoch": 0.3915827446264843, "flos": 26358132504960.0, "grad_norm": 1.7976585700534344, "language_loss": 0.70892614, "learning_rate": 2.7782564708030647e-06, "loss": 0.73013204, "num_input_tokens_seen": 139890140, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.44921875, "step": 6513, "time_per_iteration": 2.4213638305664062 }, { "auxiliary_loss_clip": 0.01068529, "auxiliary_loss_mlp": 0.01057854, "balance_loss_clip": 1.02149534, "balance_loss_mlp": 1.02130175, "epoch": 0.39164286787915226, "flos": 21942841547520.0, "grad_norm": 3.1550097343905494, "language_loss": 0.77031869, "learning_rate": 2.7778976903375464e-06, "loss": 0.79158252, "num_input_tokens_seen": 139908020, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 0.47265625, "step": 6514, "time_per_iteration": 2.402256727218628 }, { "auxiliary_loss_clip": 0.01065418, "auxiliary_loss_mlp": 0.01046307, "balance_loss_clip": 1.01512182, "balance_loss_mlp": 1.02167869, "epoch": 0.3917029911318202, "flos": 16398878290560.0, "grad_norm": 1.7655216203092243, "language_loss": 0.79040438, "learning_rate": 2.7775388803740693e-06, "loss": 0.81152165, "num_input_tokens_seen": 139926180, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.4375, "step": 6515, "time_per_iteration": 2.368178606033325 }, { "auxiliary_loss_clip": 0.01063275, "auxiliary_loss_mlp": 0.01052499, "balance_loss_clip": 1.02411485, "balance_loss_mlp": 1.020854, "epoch": 0.3917631143844882, "flos": 26210554721280.0, "grad_norm": 1.4053218442219149, "language_loss": 0.80476403, "learning_rate": 2.7771800409262406e-06, "loss": 0.82592177, "num_input_tokens_seen": 139947420, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.42578125, "step": 6516, "time_per_iteration": 2.464479446411133 }, { "auxiliary_loss_clip": 0.01066978, "auxiliary_loss_mlp": 0.01053551, "balance_loss_clip": 1.02215147, "balance_loss_mlp": 1.02210891, "epoch": 0.39182323763715615, "flos": 18547468404480.0, "grad_norm": 2.259430313709165, "language_loss": 0.72581738, "learning_rate": 2.7768211720076665e-06, "loss": 0.74702269, "num_input_tokens_seen": 139965800, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.44921875, "step": 6517, "time_per_iteration": 2.378713369369507 }, { "auxiliary_loss_clip": 0.01067495, "auxiliary_loss_mlp": 0.01058022, "balance_loss_clip": 1.02609801, "balance_loss_mlp": 1.02266753, "epoch": 0.3918833608898241, "flos": 34312115203200.0, "grad_norm": 1.5355293754854469, "language_loss": 0.73323423, "learning_rate": 2.776462273631956e-06, "loss": 0.75448942, "num_input_tokens_seen": 139988140, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.44726562, "step": 6518, "time_per_iteration": 2.52559494972229 }, { "auxiliary_loss_clip": 0.01073175, "auxiliary_loss_mlp": 0.0105611, "balance_loss_clip": 1.02411425, "balance_loss_mlp": 1.02783775, "epoch": 0.3919434841424921, "flos": 36938107514880.0, "grad_norm": 1.6846018961978952, "language_loss": 0.62759113, "learning_rate": 2.7761033458127177e-06, "loss": 0.648884, "num_input_tokens_seen": 140010060, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.453125, "step": 6519, "time_per_iteration": 2.5262880325317383 }, { "auxiliary_loss_clip": 0.01074142, "auxiliary_loss_mlp": 0.01059864, "balance_loss_clip": 1.0251981, "balance_loss_mlp": 1.02731013, "epoch": 0.3920036073951601, "flos": 23507963153280.0, "grad_norm": 2.1804448650297745, "language_loss": 0.68827713, "learning_rate": 2.775744388563563e-06, "loss": 0.7096172, "num_input_tokens_seen": 140029400, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.46875, "step": 6520, "time_per_iteration": 2.4251515865325928 }, { "auxiliary_loss_clip": 0.01068122, "auxiliary_loss_mlp": 0.01046746, "balance_loss_clip": 1.01584673, "balance_loss_mlp": 1.02508068, "epoch": 0.39206373064782807, "flos": 18405092413440.0, "grad_norm": 1.7258386007102553, "language_loss": 0.80427516, "learning_rate": 2.775385401898104e-06, "loss": 0.82542384, "num_input_tokens_seen": 140048940, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4296875, "step": 6521, "time_per_iteration": 2.3862552642822266 }, { "auxiliary_loss_clip": 0.01076154, "auxiliary_loss_mlp": 0.01064924, "balance_loss_clip": 1.02303374, "balance_loss_mlp": 1.02695382, "epoch": 0.39212385390049603, "flos": 12312224760960.0, "grad_norm": 2.048282458413658, "language_loss": 0.71372569, "learning_rate": 2.775026385829952e-06, "loss": 0.73513651, "num_input_tokens_seen": 140066380, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 0.4921875, "step": 6522, "time_per_iteration": 2.388674020767212 }, { "auxiliary_loss_clip": 0.01076296, "auxiliary_loss_mlp": 0.01048816, "balance_loss_clip": 1.01666546, "balance_loss_mlp": 1.02987278, "epoch": 0.392183977153164, "flos": 19718140936320.0, "grad_norm": 1.836509255022637, "language_loss": 0.78349835, "learning_rate": 2.774667340372722e-06, "loss": 0.80474949, "num_input_tokens_seen": 140085275, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.46484375, "step": 6523, "time_per_iteration": 2.409184694290161 }, { "auxiliary_loss_clip": 0.01074029, "auxiliary_loss_mlp": 0.01046302, "balance_loss_clip": 1.01499748, "balance_loss_mlp": 1.02878642, "epoch": 0.39224410040583196, "flos": 33143537352960.0, "grad_norm": 2.543986864836236, "language_loss": 0.63268036, "learning_rate": 2.7743082655400293e-06, "loss": 0.6538837, "num_input_tokens_seen": 140105105, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.453125, "step": 6524, "time_per_iteration": 2.518162727355957 }, { "auxiliary_loss_clip": 0.01072112, "auxiliary_loss_mlp": 0.01048132, "balance_loss_clip": 1.01556361, "balance_loss_mlp": 1.02774715, "epoch": 0.39230422365849993, "flos": 27781192321920.0, "grad_norm": 1.6132919817899067, "language_loss": 0.75229025, "learning_rate": 2.773949161345489e-06, "loss": 0.77349263, "num_input_tokens_seen": 140125645, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.4453125, "step": 6525, "time_per_iteration": 2.4731082916259766 }, { "auxiliary_loss_clip": 0.01072116, "auxiliary_loss_mlp": 0.01046973, "balance_loss_clip": 1.01670611, "balance_loss_mlp": 1.02754474, "epoch": 0.3923643469111679, "flos": 17930657681280.0, "grad_norm": 3.891026016617497, "language_loss": 0.82787216, "learning_rate": 2.773590027802719e-06, "loss": 0.84906304, "num_input_tokens_seen": 140141925, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.4453125, "step": 6526, "time_per_iteration": 2.3894612789154053 }, { "auxiliary_loss_clip": 0.01069784, "auxiliary_loss_mlp": 0.01050232, "balance_loss_clip": 1.01997709, "balance_loss_mlp": 1.02579427, "epoch": 0.39242447016383586, "flos": 24058438560000.0, "grad_norm": 1.549040156973907, "language_loss": 0.70617598, "learning_rate": 2.7732308649253383e-06, "loss": 0.72737616, "num_input_tokens_seen": 140160965, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.43945312, "step": 6527, "time_per_iteration": 2.4535739421844482 }, { "auxiliary_loss_clip": 0.01069722, "auxiliary_loss_mlp": 0.01052525, "balance_loss_clip": 1.02091098, "balance_loss_mlp": 1.02624989, "epoch": 0.3924845934165038, "flos": 10663486715520.0, "grad_norm": 2.4071139762776514, "language_loss": 0.84321284, "learning_rate": 2.772871672726965e-06, "loss": 0.86443532, "num_input_tokens_seen": 140177780, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.43359375, "step": 6528, "time_per_iteration": 2.3656504154205322 }, { "auxiliary_loss_clip": 0.01066769, "auxiliary_loss_mlp": 0.01049761, "balance_loss_clip": 1.01670432, "balance_loss_mlp": 1.02447867, "epoch": 0.3925447166691718, "flos": 31244646349440.0, "grad_norm": 1.5835867669188777, "language_loss": 0.69740188, "learning_rate": 2.7725124512212205e-06, "loss": 0.71856719, "num_input_tokens_seen": 140201660, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.421875, "step": 6529, "time_per_iteration": 2.514498710632324 }, { "auxiliary_loss_clip": 0.01068873, "auxiliary_loss_mlp": 0.01053961, "balance_loss_clip": 1.02089214, "balance_loss_mlp": 1.02411973, "epoch": 0.39260483992183975, "flos": 29414010787200.0, "grad_norm": 2.404158504139493, "language_loss": 0.82459617, "learning_rate": 2.7721532004217267e-06, "loss": 0.84582448, "num_input_tokens_seen": 140218585, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.44726562, "step": 6530, "time_per_iteration": 2.468825578689575 }, { "auxiliary_loss_clip": 0.01065382, "auxiliary_loss_mlp": 0.01042137, "balance_loss_clip": 1.01369369, "balance_loss_mlp": 1.02207088, "epoch": 0.3926649631745077, "flos": 22856658140160.0, "grad_norm": 1.555331128386134, "language_loss": 0.77287424, "learning_rate": 2.7717939203421063e-06, "loss": 0.79394937, "num_input_tokens_seen": 140239905, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.43359375, "step": 6531, "time_per_iteration": 2.4373130798339844 }, { "auxiliary_loss_clip": 0.01029265, "auxiliary_loss_mlp": 0.01005532, "balance_loss_clip": 1.00154996, "balance_loss_mlp": 1.01824152, "epoch": 0.3927250864271757, "flos": 63890880163200.0, "grad_norm": 0.8132591450334073, "language_loss": 0.60378397, "learning_rate": 2.7714346109959822e-06, "loss": 0.62413192, "num_input_tokens_seen": 140293820, "router_z_loss_clip": 0.03979492, "router_z_loss_mlp": 0.11035156, "step": 6532, "time_per_iteration": 2.9106557369232178 }, { "auxiliary_loss_clip": 0.01023298, "auxiliary_loss_mlp": 0.01004377, "balance_loss_clip": 1.00034738, "balance_loss_mlp": 1.0128324, "epoch": 0.3927852096798437, "flos": 68906117790720.0, "grad_norm": 0.7811912494360661, "language_loss": 0.5556798, "learning_rate": 2.771075272396981e-06, "loss": 0.57595646, "num_input_tokens_seen": 140360420, "router_z_loss_clip": 0.0402832, "router_z_loss_mlp": 0.10449219, "step": 6533, "time_per_iteration": 3.127985715866089 }, { "auxiliary_loss_clip": 0.01068973, "auxiliary_loss_mlp": 0.0106684, "balance_loss_clip": 1.03348517, "balance_loss_mlp": 1.0219028, "epoch": 0.39284533293251167, "flos": 29714682349440.0, "grad_norm": 2.0820089297517947, "language_loss": 0.77479672, "learning_rate": 2.7707159045587284e-06, "loss": 0.79615492, "num_input_tokens_seen": 140381950, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.47070312, "step": 6534, "time_per_iteration": 2.509692430496216 }, { "auxiliary_loss_clip": 0.01065511, "auxiliary_loss_mlp": 0.01060534, "balance_loss_clip": 1.02872872, "balance_loss_mlp": 1.01934063, "epoch": 0.39290545618517964, "flos": 18551029363200.0, "grad_norm": 2.2317081209241865, "language_loss": 0.79783463, "learning_rate": 2.770356507494851e-06, "loss": 0.81909513, "num_input_tokens_seen": 140399410, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.4609375, "step": 6535, "time_per_iteration": 2.365178108215332 }, { "auxiliary_loss_clip": 0.01062861, "auxiliary_loss_mlp": 0.01071195, "balance_loss_clip": 1.04034352, "balance_loss_mlp": 1.0204258, "epoch": 0.3929655794378476, "flos": 26248295767680.0, "grad_norm": 2.8957527935117597, "language_loss": 0.69908768, "learning_rate": 2.769997081218978e-06, "loss": 0.72042823, "num_input_tokens_seen": 140419055, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.42578125, "step": 6536, "time_per_iteration": 2.447791337966919 }, { "auxiliary_loss_clip": 0.01061467, "auxiliary_loss_mlp": 0.01070226, "balance_loss_clip": 1.04147291, "balance_loss_mlp": 1.01985812, "epoch": 0.39302570269051557, "flos": 29276662032000.0, "grad_norm": 1.8313046263949275, "language_loss": 0.69843125, "learning_rate": 2.769637625744738e-06, "loss": 0.71974826, "num_input_tokens_seen": 140438800, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.41601562, "step": 6537, "time_per_iteration": 2.4622740745544434 }, { "auxiliary_loss_clip": 0.01066783, "auxiliary_loss_mlp": 0.01066824, "balance_loss_clip": 1.03509057, "balance_loss_mlp": 1.02134025, "epoch": 0.39308582594318353, "flos": 17346490945920.0, "grad_norm": 1.9421015415431033, "language_loss": 0.80258363, "learning_rate": 2.769278141085763e-06, "loss": 0.82391971, "num_input_tokens_seen": 140456880, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.45507812, "step": 6538, "time_per_iteration": 2.409656286239624 }, { "auxiliary_loss_clip": 0.01018697, "auxiliary_loss_mlp": 0.01094811, "balance_loss_clip": 1.09032917, "balance_loss_mlp": 1.00879931, "epoch": 0.3931459491958515, "flos": 61004296396800.0, "grad_norm": 0.8654711586514772, "language_loss": 0.61946416, "learning_rate": 2.768918627255683e-06, "loss": 0.64059925, "num_input_tokens_seen": 140507510, "router_z_loss_clip": 0.04492188, "router_z_loss_mlp": 0.09863281, "step": 6539, "time_per_iteration": 2.8049373626708984 }, { "auxiliary_loss_clip": 0.01071715, "auxiliary_loss_mlp": 0.01064878, "balance_loss_clip": 1.02892494, "balance_loss_mlp": 1.02645707, "epoch": 0.39320607244851946, "flos": 39014567026560.0, "grad_norm": 2.5241478292576383, "language_loss": 0.70182884, "learning_rate": 2.7685590842681315e-06, "loss": 0.72319472, "num_input_tokens_seen": 140528740, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.45117188, "step": 6540, "time_per_iteration": 2.592240810394287 }, { "auxiliary_loss_clip": 0.01072617, "auxiliary_loss_mlp": 0.01058964, "balance_loss_clip": 1.02622914, "balance_loss_mlp": 1.02794695, "epoch": 0.3932661957011874, "flos": 24678635685120.0, "grad_norm": 1.659816541787921, "language_loss": 0.73444378, "learning_rate": 2.7681995121367433e-06, "loss": 0.75575966, "num_input_tokens_seen": 140547560, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.4453125, "step": 6541, "time_per_iteration": 2.4225590229034424 }, { "auxiliary_loss_clip": 0.01036949, "auxiliary_loss_mlp": 0.01037312, "balance_loss_clip": 1.03340161, "balance_loss_mlp": 1.02602625, "epoch": 0.3933263189538554, "flos": 70093375084800.0, "grad_norm": 0.8324111915141185, "language_loss": 0.60442382, "learning_rate": 2.7678399108751516e-06, "loss": 0.62516642, "num_input_tokens_seen": 140601175, "router_z_loss_clip": 0.0390625, "router_z_loss_mlp": 0.109375, "step": 6542, "time_per_iteration": 2.930344820022583 }, { "auxiliary_loss_clip": 0.01076687, "auxiliary_loss_mlp": 0.01055091, "balance_loss_clip": 1.02156878, "balance_loss_mlp": 1.0320996, "epoch": 0.39338644220652336, "flos": 22927985781120.0, "grad_norm": 1.8288865705365573, "language_loss": 0.83388746, "learning_rate": 2.7674802804969947e-06, "loss": 0.85520518, "num_input_tokens_seen": 140622200, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.44726562, "step": 6543, "time_per_iteration": 2.428105354309082 }, { "auxiliary_loss_clip": 0.01079313, "auxiliary_loss_mlp": 0.01051058, "balance_loss_clip": 1.01860917, "balance_loss_mlp": 1.03357482, "epoch": 0.3934465654591913, "flos": 30846810873600.0, "grad_norm": 1.7394342392752344, "language_loss": 0.70243013, "learning_rate": 2.767120621015908e-06, "loss": 0.72373384, "num_input_tokens_seen": 140643125, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.45703125, "step": 6544, "time_per_iteration": 3.9572207927703857 }, { "auxiliary_loss_clip": 0.01085088, "auxiliary_loss_mlp": 0.01065572, "balance_loss_clip": 1.02940428, "balance_loss_mlp": 1.03706145, "epoch": 0.3935066887118593, "flos": 29235394938240.0, "grad_norm": 1.8536981061983677, "language_loss": 0.77226883, "learning_rate": 2.76676093244553e-06, "loss": 0.7937755, "num_input_tokens_seen": 140662500, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 0.48046875, "step": 6545, "time_per_iteration": 3.8932409286499023 }, { "auxiliary_loss_clip": 0.01079897, "auxiliary_loss_mlp": 0.01060819, "balance_loss_clip": 1.0324235, "balance_loss_mlp": 1.03780901, "epoch": 0.3935668119645273, "flos": 19134288403200.0, "grad_norm": 1.4925852760802736, "language_loss": 0.75923312, "learning_rate": 2.7664012147995015e-06, "loss": 0.78064024, "num_input_tokens_seen": 140681960, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.421875, "step": 6546, "time_per_iteration": 2.419640302658081 }, { "auxiliary_loss_clip": 0.01088462, "auxiliary_loss_mlp": 0.01069903, "balance_loss_clip": 1.03547525, "balance_loss_mlp": 1.03926778, "epoch": 0.3936269352171953, "flos": 18515103707520.0, "grad_norm": 1.6171623248792848, "language_loss": 0.82558358, "learning_rate": 2.7660414680914617e-06, "loss": 0.84716725, "num_input_tokens_seen": 140699170, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.4921875, "step": 6547, "time_per_iteration": 2.447960376739502 }, { "auxiliary_loss_clip": 0.01082476, "auxiliary_loss_mlp": 0.0107131, "balance_loss_clip": 1.04048276, "balance_loss_mlp": 1.0359056, "epoch": 0.39368705846986324, "flos": 15631906343040.0, "grad_norm": 1.6847975719523272, "language_loss": 0.85392725, "learning_rate": 2.7656816923350525e-06, "loss": 0.87546515, "num_input_tokens_seen": 140714920, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.46484375, "step": 6548, "time_per_iteration": 3.834223508834839 }, { "auxiliary_loss_clip": 0.01082718, "auxiliary_loss_mlp": 0.01065795, "balance_loss_clip": 1.03683925, "balance_loss_mlp": 1.03807116, "epoch": 0.3937471817225312, "flos": 21324739104000.0, "grad_norm": 1.697264349363501, "language_loss": 0.73799264, "learning_rate": 2.7653218875439174e-06, "loss": 0.75947785, "num_input_tokens_seen": 140734595, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4453125, "step": 6549, "time_per_iteration": 3.9004571437835693 }, { "auxiliary_loss_clip": 0.01082471, "auxiliary_loss_mlp": 0.01081933, "balance_loss_clip": 1.04819679, "balance_loss_mlp": 1.03641891, "epoch": 0.39380730497519917, "flos": 20775660151680.0, "grad_norm": 1.5486394088261988, "language_loss": 0.78909743, "learning_rate": 2.764962053731699e-06, "loss": 0.81074148, "num_input_tokens_seen": 140754050, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.4609375, "step": 6550, "time_per_iteration": 2.5636725425720215 }, { "auxiliary_loss_clip": 0.01078609, "auxiliary_loss_mlp": 0.01075102, "balance_loss_clip": 1.04358304, "balance_loss_mlp": 1.03345394, "epoch": 0.39386742822786713, "flos": 21608897592960.0, "grad_norm": 1.632783132423678, "language_loss": 0.82528687, "learning_rate": 2.7646021909120434e-06, "loss": 0.84682393, "num_input_tokens_seen": 140771440, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.45117188, "step": 6551, "time_per_iteration": 2.4190211296081543 }, { "auxiliary_loss_clip": 0.01076897, "auxiliary_loss_mlp": 0.0107367, "balance_loss_clip": 1.04231787, "balance_loss_mlp": 1.03125238, "epoch": 0.3939275514805351, "flos": 12414031885440.0, "grad_norm": 2.7831094535263365, "language_loss": 0.81305736, "learning_rate": 2.764242299098596e-06, "loss": 0.83456302, "num_input_tokens_seen": 140786715, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.45703125, "step": 6552, "time_per_iteration": 2.4468345642089844 }, { "auxiliary_loss_clip": 0.01076632, "auxiliary_loss_mlp": 0.01074589, "balance_loss_clip": 1.04368973, "balance_loss_mlp": 1.03079081, "epoch": 0.39398767473320306, "flos": 18551029363200.0, "grad_norm": 1.6505830511648856, "language_loss": 0.72442591, "learning_rate": 2.763882378305003e-06, "loss": 0.74593806, "num_input_tokens_seen": 140804950, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.45703125, "step": 6553, "time_per_iteration": 2.4525063037872314 }, { "auxiliary_loss_clip": 0.01071137, "auxiliary_loss_mlp": 0.01067366, "balance_loss_clip": 1.03780234, "balance_loss_mlp": 1.02853072, "epoch": 0.39404779798587103, "flos": 29307769920000.0, "grad_norm": 1.7805703118469085, "language_loss": 0.6557681, "learning_rate": 2.7635224285449144e-06, "loss": 0.67715317, "num_input_tokens_seen": 140822800, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.42578125, "step": 6554, "time_per_iteration": 2.5265610218048096 }, { "auxiliary_loss_clip": 0.01071209, "auxiliary_loss_mlp": 0.01062809, "balance_loss_clip": 1.03285122, "balance_loss_mlp": 1.02812088, "epoch": 0.394107921238539, "flos": 34895618622720.0, "grad_norm": 2.9321606981703665, "language_loss": 0.80692804, "learning_rate": 2.7631624498319796e-06, "loss": 0.82826823, "num_input_tokens_seen": 140842940, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.43164062, "step": 6555, "time_per_iteration": 2.5256388187408447 }, { "auxiliary_loss_clip": 0.01070059, "auxiliary_loss_mlp": 0.01057643, "balance_loss_clip": 1.02455068, "balance_loss_mlp": 1.02589285, "epoch": 0.39416804449120696, "flos": 25080276499200.0, "grad_norm": 2.0033417116030408, "language_loss": 0.73507702, "learning_rate": 2.7628024421798473e-06, "loss": 0.75635409, "num_input_tokens_seen": 140863060, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.44140625, "step": 6556, "time_per_iteration": 2.467241048812866 }, { "auxiliary_loss_clip": 0.01067864, "auxiliary_loss_mlp": 0.01047626, "balance_loss_clip": 1.01735902, "balance_loss_mlp": 1.02381575, "epoch": 0.3942281677438749, "flos": 32305272675840.0, "grad_norm": 2.065561051975645, "language_loss": 0.8508631, "learning_rate": 2.7624424056021705e-06, "loss": 0.87201798, "num_input_tokens_seen": 140883795, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.44140625, "step": 6557, "time_per_iteration": 2.4944376945495605 }, { "auxiliary_loss_clip": 0.01065514, "auxiliary_loss_mlp": 0.01052338, "balance_loss_clip": 1.02199924, "balance_loss_mlp": 1.02270913, "epoch": 0.3942882909965429, "flos": 24935456712960.0, "grad_norm": 2.7669281501085212, "language_loss": 0.81785136, "learning_rate": 2.7620823401126004e-06, "loss": 0.83902991, "num_input_tokens_seen": 140903055, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.4296875, "step": 6558, "time_per_iteration": 2.451183319091797 }, { "auxiliary_loss_clip": 0.01064383, "auxiliary_loss_mlp": 0.01044569, "balance_loss_clip": 1.01592278, "balance_loss_mlp": 1.02202511, "epoch": 0.39434841424921085, "flos": 11873994975360.0, "grad_norm": 1.7227336889277227, "language_loss": 0.72334582, "learning_rate": 2.761722245724792e-06, "loss": 0.74443531, "num_input_tokens_seen": 140920685, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.42382812, "step": 6559, "time_per_iteration": 2.3705270290374756 }, { "auxiliary_loss_clip": 0.01068841, "auxiliary_loss_mlp": 0.0104949, "balance_loss_clip": 1.01732707, "balance_loss_mlp": 1.02288556, "epoch": 0.3944085375018789, "flos": 16360718307840.0, "grad_norm": 1.9595917626263695, "language_loss": 0.82373452, "learning_rate": 2.7613621224524003e-06, "loss": 0.84491777, "num_input_tokens_seen": 140937320, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.45898438, "step": 6560, "time_per_iteration": 2.3939342498779297 }, { "auxiliary_loss_clip": 0.0106892, "auxiliary_loss_mlp": 0.01047847, "balance_loss_clip": 1.01717424, "balance_loss_mlp": 1.02552021, "epoch": 0.39446866075454684, "flos": 10632623207040.0, "grad_norm": 2.4484280641843235, "language_loss": 0.8492763, "learning_rate": 2.7610019703090803e-06, "loss": 0.870444, "num_input_tokens_seen": 140954855, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.43359375, "step": 6561, "time_per_iteration": 2.373145818710327 }, { "auxiliary_loss_clip": 0.01066775, "auxiliary_loss_mlp": 0.01045742, "balance_loss_clip": 1.01601124, "balance_loss_mlp": 1.02282631, "epoch": 0.3945287840072148, "flos": 18186501191040.0, "grad_norm": 2.3351495841689793, "language_loss": 0.80979526, "learning_rate": 2.7606417893084887e-06, "loss": 0.83092046, "num_input_tokens_seen": 140973250, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.43945312, "step": 6562, "time_per_iteration": 2.4006145000457764 }, { "auxiliary_loss_clip": 0.01063859, "auxiliary_loss_mlp": 0.01047529, "balance_loss_clip": 1.01783442, "balance_loss_mlp": 1.02289224, "epoch": 0.39458890725988277, "flos": 23038765125120.0, "grad_norm": 1.5439078273832556, "language_loss": 0.8278231, "learning_rate": 2.7602815794642853e-06, "loss": 0.84893703, "num_input_tokens_seen": 140993050, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.41015625, "step": 6563, "time_per_iteration": 2.418553352355957 }, { "auxiliary_loss_clip": 0.01066371, "auxiliary_loss_mlp": 0.01050327, "balance_loss_clip": 1.01998782, "balance_loss_mlp": 1.02343273, "epoch": 0.39464903051255074, "flos": 17158274472960.0, "grad_norm": 2.9893835273915235, "language_loss": 0.71003211, "learning_rate": 2.759921340790127e-06, "loss": 0.73119909, "num_input_tokens_seen": 141010815, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.4296875, "step": 6564, "time_per_iteration": 2.454163074493408 }, { "auxiliary_loss_clip": 0.01070352, "auxiliary_loss_mlp": 0.01062002, "balance_loss_clip": 1.02867138, "balance_loss_mlp": 1.0244534, "epoch": 0.3947091537652187, "flos": 15888064055040.0, "grad_norm": 2.1402695405660563, "language_loss": 0.84875494, "learning_rate": 2.759561073299676e-06, "loss": 0.87007856, "num_input_tokens_seen": 141028720, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.45898438, "step": 6565, "time_per_iteration": 2.3947956562042236 }, { "auxiliary_loss_clip": 0.01070209, "auxiliary_loss_mlp": 0.0104877, "balance_loss_clip": 1.01971853, "balance_loss_mlp": 1.02660036, "epoch": 0.39476927701788667, "flos": 18544675495680.0, "grad_norm": 1.817875420176955, "language_loss": 0.84947002, "learning_rate": 2.7592007770065937e-06, "loss": 0.87065983, "num_input_tokens_seen": 141046025, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.4375, "step": 6566, "time_per_iteration": 2.40604829788208 }, { "auxiliary_loss_clip": 0.01072871, "auxiliary_loss_mlp": 0.01061948, "balance_loss_clip": 1.02922487, "balance_loss_mlp": 1.02579725, "epoch": 0.39482940027055463, "flos": 22274551175040.0, "grad_norm": 1.792203061019, "language_loss": 0.78539824, "learning_rate": 2.7588404519245403e-06, "loss": 0.80674648, "num_input_tokens_seen": 141066865, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.47070312, "step": 6567, "time_per_iteration": 2.4227511882781982 }, { "auxiliary_loss_clip": 0.01067169, "auxiliary_loss_mlp": 0.01043774, "balance_loss_clip": 1.01516354, "balance_loss_mlp": 1.0261308, "epoch": 0.3948895235232226, "flos": 14756738492160.0, "grad_norm": 1.6791064890871699, "language_loss": 0.80573606, "learning_rate": 2.758480098067182e-06, "loss": 0.82684547, "num_input_tokens_seen": 141084210, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41015625, "step": 6568, "time_per_iteration": 2.4492266178131104 }, { "auxiliary_loss_clip": 0.01068932, "auxiliary_loss_mlp": 0.01045383, "balance_loss_clip": 1.01575983, "balance_loss_mlp": 1.02648163, "epoch": 0.39494964677589056, "flos": 22564644595200.0, "grad_norm": 1.6943293293798425, "language_loss": 0.85734987, "learning_rate": 2.7581197154481816e-06, "loss": 0.87849295, "num_input_tokens_seen": 141103895, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.42578125, "step": 6569, "time_per_iteration": 2.4201362133026123 }, { "auxiliary_loss_clip": 0.01069003, "auxiliary_loss_mlp": 0.01043649, "balance_loss_clip": 1.01658821, "balance_loss_mlp": 1.02787292, "epoch": 0.3950097700285585, "flos": 22962165868800.0, "grad_norm": 2.083954393445318, "language_loss": 0.75638384, "learning_rate": 2.7577593040812066e-06, "loss": 0.7775104, "num_input_tokens_seen": 141124000, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.41015625, "step": 6570, "time_per_iteration": 2.4737815856933594 }, { "auxiliary_loss_clip": 0.01069484, "auxiliary_loss_mlp": 0.01044668, "balance_loss_clip": 1.01543772, "balance_loss_mlp": 1.02606809, "epoch": 0.3950698932812265, "flos": 20594181571200.0, "grad_norm": 1.7439517526373394, "language_loss": 0.81426352, "learning_rate": 2.757398863979922e-06, "loss": 0.83540505, "num_input_tokens_seen": 141142535, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.43359375, "step": 6571, "time_per_iteration": 2.3850979804992676 }, { "auxiliary_loss_clip": 0.0106694, "auxiliary_loss_mlp": 0.01049727, "balance_loss_clip": 1.02085412, "balance_loss_mlp": 1.02534926, "epoch": 0.39513001653389446, "flos": 20374752476160.0, "grad_norm": 1.6006710131683901, "language_loss": 0.79383641, "learning_rate": 2.757038395157997e-06, "loss": 0.8150031, "num_input_tokens_seen": 141161575, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41601562, "step": 6572, "time_per_iteration": 2.456850528717041 }, { "auxiliary_loss_clip": 0.01068801, "auxiliary_loss_mlp": 0.01049416, "balance_loss_clip": 1.01894605, "balance_loss_mlp": 1.02446175, "epoch": 0.3951901397865625, "flos": 26462592892800.0, "grad_norm": 1.5860161399962769, "language_loss": 0.75712323, "learning_rate": 2.7566778976291002e-06, "loss": 0.77830535, "num_input_tokens_seen": 141181150, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.44335938, "step": 6573, "time_per_iteration": 2.4480621814727783 }, { "auxiliary_loss_clip": 0.01065861, "auxiliary_loss_mlp": 0.01044393, "balance_loss_clip": 1.01769054, "balance_loss_mlp": 1.02328563, "epoch": 0.39525026303923044, "flos": 43836595856640.0, "grad_norm": 1.4404098466532091, "language_loss": 0.68860579, "learning_rate": 2.7563173714069017e-06, "loss": 0.70970833, "num_input_tokens_seen": 141206310, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.42578125, "step": 6574, "time_per_iteration": 2.6452977657318115 }, { "auxiliary_loss_clip": 0.01067, "auxiliary_loss_mlp": 0.01046943, "balance_loss_clip": 1.01740265, "balance_loss_mlp": 1.0232861, "epoch": 0.3953103862918984, "flos": 18039831102720.0, "grad_norm": 2.6816395077370307, "language_loss": 0.72924733, "learning_rate": 2.755956816505072e-06, "loss": 0.75038677, "num_input_tokens_seen": 141223925, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.4375, "step": 6575, "time_per_iteration": 2.3808021545410156 }, { "auxiliary_loss_clip": 0.01065391, "auxiliary_loss_mlp": 0.01053307, "balance_loss_clip": 1.0223608, "balance_loss_mlp": 1.02096105, "epoch": 0.3953705095445664, "flos": 16975259792640.0, "grad_norm": 1.9528032543563638, "language_loss": 0.74665582, "learning_rate": 2.7555962329372845e-06, "loss": 0.76784283, "num_input_tokens_seen": 141239010, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.44335938, "step": 6576, "time_per_iteration": 2.379528522491455 }, { "auxiliary_loss_clip": 0.01063702, "auxiliary_loss_mlp": 0.01046341, "balance_loss_clip": 1.01767135, "balance_loss_mlp": 1.02088308, "epoch": 0.39543063279723434, "flos": 17410452289920.0, "grad_norm": 2.7240335072642594, "language_loss": 0.85033059, "learning_rate": 2.7552356207172124e-06, "loss": 0.87143099, "num_input_tokens_seen": 141252255, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.4296875, "step": 6577, "time_per_iteration": 2.3428261280059814 }, { "auxiliary_loss_clip": 0.01065288, "auxiliary_loss_mlp": 0.01045182, "balance_loss_clip": 1.01478398, "balance_loss_mlp": 1.02270341, "epoch": 0.3954907560499023, "flos": 22783096172160.0, "grad_norm": 2.5255267154004595, "language_loss": 0.92764366, "learning_rate": 2.75487497985853e-06, "loss": 0.94874835, "num_input_tokens_seen": 141269325, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.42578125, "step": 6578, "time_per_iteration": 2.409799337387085 }, { "auxiliary_loss_clip": 0.01067617, "auxiliary_loss_mlp": 0.01050018, "balance_loss_clip": 1.01625824, "balance_loss_mlp": 1.02160263, "epoch": 0.39555087930257027, "flos": 21943330306560.0, "grad_norm": 2.1172474247901523, "language_loss": 0.79952013, "learning_rate": 2.7545143103749117e-06, "loss": 0.82069647, "num_input_tokens_seen": 141288505, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.4609375, "step": 6579, "time_per_iteration": 2.406332492828369 }, { "auxiliary_loss_clip": 0.01064794, "auxiliary_loss_mlp": 0.01046765, "balance_loss_clip": 1.01609206, "balance_loss_mlp": 1.02042818, "epoch": 0.39561100255523823, "flos": 20403800593920.0, "grad_norm": 2.3676734356794813, "language_loss": 0.69703901, "learning_rate": 2.754153612280037e-06, "loss": 0.71815467, "num_input_tokens_seen": 141303680, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.44335938, "step": 6580, "time_per_iteration": 2.3831260204315186 }, { "auxiliary_loss_clip": 0.0106254, "auxiliary_loss_mlp": 0.01045415, "balance_loss_clip": 1.01406264, "balance_loss_mlp": 1.01950717, "epoch": 0.3956711258079062, "flos": 27963334218240.0, "grad_norm": 1.7621189519290987, "language_loss": 0.59903175, "learning_rate": 2.7537928855875797e-06, "loss": 0.62011129, "num_input_tokens_seen": 141324090, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.4296875, "step": 6581, "time_per_iteration": 2.4207732677459717 }, { "auxiliary_loss_clip": 0.01065278, "auxiliary_loss_mlp": 0.01044462, "balance_loss_clip": 1.01322925, "balance_loss_mlp": 1.02157676, "epoch": 0.39573124906057416, "flos": 14427437748480.0, "grad_norm": 1.7655546588486442, "language_loss": 0.71159625, "learning_rate": 2.7534321303112224e-06, "loss": 0.73269367, "num_input_tokens_seen": 141342235, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.4375, "step": 6582, "time_per_iteration": 2.3809916973114014 }, { "auxiliary_loss_clip": 0.01063624, "auxiliary_loss_mlp": 0.01050858, "balance_loss_clip": 1.01811123, "balance_loss_mlp": 1.01963735, "epoch": 0.39579137231324213, "flos": 18732717411840.0, "grad_norm": 1.8816742438369298, "language_loss": 0.78480732, "learning_rate": 2.753071346464642e-06, "loss": 0.80595219, "num_input_tokens_seen": 141361195, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.43945312, "step": 6583, "time_per_iteration": 3.7931673526763916 }, { "auxiliary_loss_clip": 0.01061317, "auxiliary_loss_mlp": 0.01051703, "balance_loss_clip": 1.02194881, "balance_loss_mlp": 1.01921272, "epoch": 0.3958514955659101, "flos": 17675442576000.0, "grad_norm": 1.9834482023032796, "language_loss": 0.67008197, "learning_rate": 2.7527105340615207e-06, "loss": 0.69121218, "num_input_tokens_seen": 141378275, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.421875, "step": 6584, "time_per_iteration": 3.7370786666870117 }, { "auxiliary_loss_clip": 0.01065446, "auxiliary_loss_mlp": 0.01052741, "balance_loss_clip": 1.01855195, "balance_loss_mlp": 1.01965845, "epoch": 0.39591161881857806, "flos": 29307979388160.0, "grad_norm": 2.1315734869060146, "language_loss": 0.73751235, "learning_rate": 2.7523496931155413e-06, "loss": 0.75869423, "num_input_tokens_seen": 141396960, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.45898438, "step": 6585, "time_per_iteration": 2.417539596557617 }, { "auxiliary_loss_clip": 0.01063896, "auxiliary_loss_mlp": 0.01048317, "balance_loss_clip": 1.01722777, "balance_loss_mlp": 1.01979494, "epoch": 0.3959717420712461, "flos": 25770753924480.0, "grad_norm": 1.745163099221485, "language_loss": 0.73861855, "learning_rate": 2.7519888236403856e-06, "loss": 0.75974071, "num_input_tokens_seen": 141417320, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.44140625, "step": 6586, "time_per_iteration": 2.4298925399780273 }, { "auxiliary_loss_clip": 0.01063657, "auxiliary_loss_mlp": 0.01051219, "balance_loss_clip": 1.01836538, "balance_loss_mlp": 1.01952624, "epoch": 0.39603186532391405, "flos": 20922714264960.0, "grad_norm": 1.6030298734055297, "language_loss": 0.72658145, "learning_rate": 2.7516279256497382e-06, "loss": 0.74773026, "num_input_tokens_seen": 141435985, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.44140625, "step": 6587, "time_per_iteration": 3.851867914199829 }, { "auxiliary_loss_clip": 0.01020052, "auxiliary_loss_mlp": 0.01015401, "balance_loss_clip": 1.01125228, "balance_loss_mlp": 1.00839615, "epoch": 0.396091988576582, "flos": 54878261086080.0, "grad_norm": 0.8640067631226273, "language_loss": 0.61373407, "learning_rate": 2.751266999157285e-06, "loss": 0.63408858, "num_input_tokens_seen": 141486075, "router_z_loss_clip": 0.04150391, "router_z_loss_mlp": 0.11621094, "step": 6588, "time_per_iteration": 2.842393636703491 }, { "auxiliary_loss_clip": 0.0106439, "auxiliary_loss_mlp": 0.01044387, "balance_loss_clip": 1.01215315, "balance_loss_mlp": 1.02043581, "epoch": 0.39615211182925, "flos": 20701888715520.0, "grad_norm": 1.6630146949221762, "language_loss": 0.82335514, "learning_rate": 2.7509060441767115e-06, "loss": 0.84444296, "num_input_tokens_seen": 141505280, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.43945312, "step": 6589, "time_per_iteration": 3.83420729637146 }, { "auxiliary_loss_clip": 0.01063081, "auxiliary_loss_mlp": 0.01045678, "balance_loss_clip": 1.01344395, "balance_loss_mlp": 1.01952791, "epoch": 0.39621223508191794, "flos": 20993308767360.0, "grad_norm": 3.4851546388631713, "language_loss": 0.72603256, "learning_rate": 2.7505450607217057e-06, "loss": 0.74712014, "num_input_tokens_seen": 141523930, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.43554688, "step": 6590, "time_per_iteration": 2.3977856636047363 }, { "auxiliary_loss_clip": 0.01064059, "auxiliary_loss_mlp": 0.01056332, "balance_loss_clip": 1.0261246, "balance_loss_mlp": 1.02020049, "epoch": 0.3962723583345859, "flos": 23367681843840.0, "grad_norm": 1.7348302769586301, "language_loss": 0.76967812, "learning_rate": 2.750184048805956e-06, "loss": 0.79088199, "num_input_tokens_seen": 141541320, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.4375, "step": 6591, "time_per_iteration": 2.412048816680908 }, { "auxiliary_loss_clip": 0.01064102, "auxiliary_loss_mlp": 0.01057956, "balance_loss_clip": 1.02506638, "balance_loss_mlp": 1.02006912, "epoch": 0.39633248158725387, "flos": 25114526409600.0, "grad_norm": 1.6818799987233088, "language_loss": 0.7962997, "learning_rate": 2.749823008443152e-06, "loss": 0.81752032, "num_input_tokens_seen": 141561880, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.43945312, "step": 6592, "time_per_iteration": 2.4393246173858643 }, { "auxiliary_loss_clip": 0.0105936, "auxiliary_loss_mlp": 0.01040753, "balance_loss_clip": 1.01377559, "balance_loss_mlp": 1.01853168, "epoch": 0.39639260483992184, "flos": 39786007628160.0, "grad_norm": 1.6995618594594704, "language_loss": 0.69781876, "learning_rate": 2.7494619396469843e-06, "loss": 0.71881998, "num_input_tokens_seen": 141586460, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40820312, "step": 6593, "time_per_iteration": 2.5917296409606934 }, { "auxiliary_loss_clip": 0.01064913, "auxiliary_loss_mlp": 0.01055266, "balance_loss_clip": 1.02265, "balance_loss_mlp": 1.0190289, "epoch": 0.3964527280925898, "flos": 17346106920960.0, "grad_norm": 1.6058404737130039, "language_loss": 0.79207981, "learning_rate": 2.7491008424311452e-06, "loss": 0.81328166, "num_input_tokens_seen": 141605955, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.45898438, "step": 6594, "time_per_iteration": 2.403973340988159 }, { "auxiliary_loss_clip": 0.01018565, "auxiliary_loss_mlp": 0.01003331, "balance_loss_clip": 0.99937326, "balance_loss_mlp": 1.00725555, "epoch": 0.39651285134525777, "flos": 71714182176000.0, "grad_norm": 0.9521354805012647, "language_loss": 0.63132745, "learning_rate": 2.7487397168093265e-06, "loss": 0.65154636, "num_input_tokens_seen": 141673140, "router_z_loss_clip": 0.03955078, "router_z_loss_mlp": 0.11328125, "step": 6595, "time_per_iteration": 3.067514181137085 }, { "auxiliary_loss_clip": 0.0106516, "auxiliary_loss_mlp": 0.01055747, "balance_loss_clip": 1.02105713, "balance_loss_mlp": 1.0193845, "epoch": 0.39657297459792573, "flos": 25774524351360.0, "grad_norm": 2.888005343230197, "language_loss": 0.65235054, "learning_rate": 2.748378562795223e-06, "loss": 0.67355967, "num_input_tokens_seen": 141692955, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.45703125, "step": 6596, "time_per_iteration": 2.483163595199585 }, { "auxiliary_loss_clip": 0.01061756, "auxiliary_loss_mlp": 0.01059854, "balance_loss_clip": 1.02800155, "balance_loss_mlp": 1.01951718, "epoch": 0.3966330978505937, "flos": 20265090295680.0, "grad_norm": 2.1416483215073843, "language_loss": 0.80095077, "learning_rate": 2.7480173804025293e-06, "loss": 0.82216686, "num_input_tokens_seen": 141710680, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.421875, "step": 6597, "time_per_iteration": 2.368825674057007 }, { "auxiliary_loss_clip": 0.01065532, "auxiliary_loss_mlp": 0.01050178, "balance_loss_clip": 1.01689446, "balance_loss_mlp": 1.02022827, "epoch": 0.39669322110326166, "flos": 20630142138240.0, "grad_norm": 2.0175675053984925, "language_loss": 0.69873852, "learning_rate": 2.747656169644941e-06, "loss": 0.71989566, "num_input_tokens_seen": 141729860, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.453125, "step": 6598, "time_per_iteration": 2.398622512817383 }, { "auxiliary_loss_clip": 0.01062887, "auxiliary_loss_mlp": 0.01049057, "balance_loss_clip": 1.02097154, "balance_loss_mlp": 1.01974797, "epoch": 0.3967533443559297, "flos": 21724983463680.0, "grad_norm": 1.6927891508059618, "language_loss": 0.80700386, "learning_rate": 2.747294930536157e-06, "loss": 0.82812333, "num_input_tokens_seen": 141749060, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.43164062, "step": 6599, "time_per_iteration": 2.4118435382843018 }, { "auxiliary_loss_clip": 0.01062633, "auxiliary_loss_mlp": 0.01045232, "balance_loss_clip": 1.01156747, "balance_loss_mlp": 1.01963603, "epoch": 0.39681346760859765, "flos": 25482964654080.0, "grad_norm": 2.403904920255821, "language_loss": 0.73786384, "learning_rate": 2.7469336630898737e-06, "loss": 0.75894248, "num_input_tokens_seen": 141769860, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.4296875, "step": 6600, "time_per_iteration": 2.445113182067871 }, { "auxiliary_loss_clip": 0.01061558, "auxiliary_loss_mlp": 0.01044575, "balance_loss_clip": 1.0151782, "balance_loss_mlp": 1.01867366, "epoch": 0.3968735908612656, "flos": 20958535186560.0, "grad_norm": 2.522297849904298, "language_loss": 0.87663019, "learning_rate": 2.746572367319791e-06, "loss": 0.89769149, "num_input_tokens_seen": 141788465, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4296875, "step": 6601, "time_per_iteration": 2.397015333175659 }, { "auxiliary_loss_clip": 0.01067934, "auxiliary_loss_mlp": 0.01052573, "balance_loss_clip": 1.01683438, "balance_loss_mlp": 1.0208385, "epoch": 0.3969337141139336, "flos": 10706324820480.0, "grad_norm": 2.05385878219445, "language_loss": 0.72069091, "learning_rate": 2.7462110432396095e-06, "loss": 0.74189597, "num_input_tokens_seen": 141804955, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.47070312, "step": 6602, "time_per_iteration": 2.3832638263702393 }, { "auxiliary_loss_clip": 0.01061466, "auxiliary_loss_mlp": 0.01055742, "balance_loss_clip": 1.02222013, "balance_loss_mlp": 1.01904726, "epoch": 0.39699383736660154, "flos": 17593013122560.0, "grad_norm": 2.529332073398103, "language_loss": 0.8586241, "learning_rate": 2.7458496908630305e-06, "loss": 0.87979615, "num_input_tokens_seen": 141820025, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.42382812, "step": 6603, "time_per_iteration": 2.3587827682495117 }, { "auxiliary_loss_clip": 0.01058196, "auxiliary_loss_mlp": 0.01044211, "balance_loss_clip": 1.0159111, "balance_loss_mlp": 1.01753426, "epoch": 0.3970539606192695, "flos": 17784965111040.0, "grad_norm": 1.5095053369022962, "language_loss": 0.74195206, "learning_rate": 2.7454883102037563e-06, "loss": 0.76297605, "num_input_tokens_seen": 141838735, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.40625, "step": 6604, "time_per_iteration": 2.4264731407165527 }, { "auxiliary_loss_clip": 0.01057908, "auxiliary_loss_mlp": 0.01041742, "balance_loss_clip": 1.01292944, "balance_loss_mlp": 1.01868033, "epoch": 0.3971140838719375, "flos": 24788367688320.0, "grad_norm": 1.5797461932060621, "language_loss": 0.83840752, "learning_rate": 2.745126901275491e-06, "loss": 0.85940397, "num_input_tokens_seen": 141858090, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.39257812, "step": 6605, "time_per_iteration": 2.448962926864624 }, { "auxiliary_loss_clip": 0.01058024, "auxiliary_loss_mlp": 0.01042564, "balance_loss_clip": 1.01537263, "balance_loss_mlp": 1.01738191, "epoch": 0.39717420712460544, "flos": 24242430758400.0, "grad_norm": 1.539654948755483, "language_loss": 0.75507736, "learning_rate": 2.7447654640919383e-06, "loss": 0.77608323, "num_input_tokens_seen": 141877540, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40625, "step": 6606, "time_per_iteration": 2.4119200706481934 }, { "auxiliary_loss_clip": 0.01062614, "auxiliary_loss_mlp": 0.01045223, "balance_loss_clip": 1.01517034, "balance_loss_mlp": 1.01998138, "epoch": 0.3972343303772734, "flos": 25883523216000.0, "grad_norm": 1.6809790421586417, "language_loss": 0.74940026, "learning_rate": 2.744403998666805e-06, "loss": 0.77047861, "num_input_tokens_seen": 141897315, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.42578125, "step": 6607, "time_per_iteration": 2.412935733795166 }, { "auxiliary_loss_clip": 0.01062384, "auxiliary_loss_mlp": 0.01050068, "balance_loss_clip": 1.02052832, "balance_loss_mlp": 1.01872778, "epoch": 0.39729445362994137, "flos": 45621984430080.0, "grad_norm": 1.5057463649613718, "language_loss": 0.69050252, "learning_rate": 2.744042505013797e-06, "loss": 0.71162701, "num_input_tokens_seen": 141919580, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.4375, "step": 6608, "time_per_iteration": 2.622328758239746 }, { "auxiliary_loss_clip": 0.01065055, "auxiliary_loss_mlp": 0.0105457, "balance_loss_clip": 1.02041674, "balance_loss_mlp": 1.01983213, "epoch": 0.39735457688260933, "flos": 20192924782080.0, "grad_norm": 1.8150786087737145, "language_loss": 0.75552428, "learning_rate": 2.7436809831466233e-06, "loss": 0.77672052, "num_input_tokens_seen": 141937045, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.453125, "step": 6609, "time_per_iteration": 2.3879308700561523 }, { "auxiliary_loss_clip": 0.01061251, "auxiliary_loss_mlp": 0.0104455, "balance_loss_clip": 1.01540375, "balance_loss_mlp": 1.01859832, "epoch": 0.3974147001352773, "flos": 23330045531520.0, "grad_norm": 1.487290648028129, "language_loss": 0.73015839, "learning_rate": 2.7433194330789927e-06, "loss": 0.75121641, "num_input_tokens_seen": 141956695, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.42578125, "step": 6610, "time_per_iteration": 2.405268907546997 }, { "auxiliary_loss_clip": 0.01057412, "auxiliary_loss_mlp": 0.0103986, "balance_loss_clip": 1.01324058, "balance_loss_mlp": 1.01736987, "epoch": 0.39747482338794526, "flos": 21687591530880.0, "grad_norm": 1.6675355648339696, "language_loss": 0.79952741, "learning_rate": 2.7429578548246133e-06, "loss": 0.82050014, "num_input_tokens_seen": 141975935, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40039062, "step": 6611, "time_per_iteration": 2.4055910110473633 }, { "auxiliary_loss_clip": 0.0106243, "auxiliary_loss_mlp": 0.01047336, "balance_loss_clip": 1.01843989, "balance_loss_mlp": 1.01948881, "epoch": 0.3975349466406133, "flos": 30987511119360.0, "grad_norm": 1.7960885725342326, "language_loss": 0.80192322, "learning_rate": 2.7425962483971985e-06, "loss": 0.82302088, "num_input_tokens_seen": 141995750, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4296875, "step": 6612, "time_per_iteration": 2.470108985900879 }, { "auxiliary_loss_clip": 0.01016214, "auxiliary_loss_mlp": 0.0100397, "balance_loss_clip": 1.00025082, "balance_loss_mlp": 1.00533175, "epoch": 0.39759506989328125, "flos": 63680702578560.0, "grad_norm": 0.8485069416470725, "language_loss": 0.65178192, "learning_rate": 2.742234613810459e-06, "loss": 0.67198372, "num_input_tokens_seen": 142057655, "router_z_loss_clip": 0.03710938, "router_z_loss_mlp": 0.10888672, "step": 6613, "time_per_iteration": 2.9407365322113037 }, { "auxiliary_loss_clip": 0.01060653, "auxiliary_loss_mlp": 0.01047536, "balance_loss_clip": 1.01867509, "balance_loss_mlp": 1.01868594, "epoch": 0.3976551931459492, "flos": 23694713349120.0, "grad_norm": 2.2888343184325515, "language_loss": 0.73217654, "learning_rate": 2.741872951078109e-06, "loss": 0.75325841, "num_input_tokens_seen": 142076020, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41992188, "step": 6614, "time_per_iteration": 2.409714698791504 }, { "auxiliary_loss_clip": 0.01061139, "auxiliary_loss_mlp": 0.01045096, "balance_loss_clip": 1.01623535, "balance_loss_mlp": 1.01927805, "epoch": 0.3977153163986172, "flos": 15668739694080.0, "grad_norm": 1.649331351457274, "language_loss": 0.82666487, "learning_rate": 2.741511260213862e-06, "loss": 0.84772718, "num_input_tokens_seen": 142093790, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41796875, "step": 6615, "time_per_iteration": 2.3759772777557373 }, { "auxiliary_loss_clip": 0.01058492, "auxiliary_loss_mlp": 0.0103929, "balance_loss_clip": 1.01095414, "balance_loss_mlp": 1.01707745, "epoch": 0.39777543965128515, "flos": 14063817271680.0, "grad_norm": 2.112959454538708, "language_loss": 0.69551677, "learning_rate": 2.741149541231434e-06, "loss": 0.71649456, "num_input_tokens_seen": 142110545, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.41210938, "step": 6616, "time_per_iteration": 2.347687244415283 }, { "auxiliary_loss_clip": 0.01063002, "auxiliary_loss_mlp": 0.01048748, "balance_loss_clip": 1.01765847, "balance_loss_mlp": 1.01954603, "epoch": 0.3978355629039531, "flos": 23366355212160.0, "grad_norm": 2.155867540555182, "language_loss": 0.86156642, "learning_rate": 2.740787794144541e-06, "loss": 0.88268393, "num_input_tokens_seen": 142128695, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.43554688, "step": 6617, "time_per_iteration": 2.421766996383667 }, { "auxiliary_loss_clip": 0.01057197, "auxiliary_loss_mlp": 0.01048302, "balance_loss_clip": 1.02226651, "balance_loss_mlp": 1.01773536, "epoch": 0.3978956861566211, "flos": 19061773776000.0, "grad_norm": 1.7062232940647544, "language_loss": 0.73496479, "learning_rate": 2.7404260189669e-06, "loss": 0.75601977, "num_input_tokens_seen": 142148375, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39453125, "step": 6618, "time_per_iteration": 2.3934054374694824 }, { "auxiliary_loss_clip": 0.01060914, "auxiliary_loss_mlp": 0.01042497, "balance_loss_clip": 1.01257575, "balance_loss_mlp": 1.01908362, "epoch": 0.39795580940928904, "flos": 30226369368960.0, "grad_norm": 1.6438761070863968, "language_loss": 0.67033458, "learning_rate": 2.740064215712231e-06, "loss": 0.6913687, "num_input_tokens_seen": 142169735, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.41796875, "step": 6619, "time_per_iteration": 2.481961250305176 }, { "auxiliary_loss_clip": 0.01014121, "auxiliary_loss_mlp": 0.01016175, "balance_loss_clip": 1.01269388, "balance_loss_mlp": 1.00349927, "epoch": 0.398015932661957, "flos": 69843885442560.0, "grad_norm": 0.7744232988815002, "language_loss": 0.58320642, "learning_rate": 2.7397023843942527e-06, "loss": 0.60350931, "num_input_tokens_seen": 142229520, "router_z_loss_clip": 0.03491211, "router_z_loss_mlp": 0.10644531, "step": 6620, "time_per_iteration": 2.9675285816192627 }, { "auxiliary_loss_clip": 0.01059901, "auxiliary_loss_mlp": 0.01041655, "balance_loss_clip": 1.01565564, "balance_loss_mlp": 1.01853514, "epoch": 0.39807605591462497, "flos": 20156719835520.0, "grad_norm": 1.5353362550086451, "language_loss": 0.80468434, "learning_rate": 2.739340525026686e-06, "loss": 0.82569987, "num_input_tokens_seen": 142247660, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.4140625, "step": 6621, "time_per_iteration": 2.3846840858459473 }, { "auxiliary_loss_clip": 0.01060415, "auxiliary_loss_mlp": 0.01042116, "balance_loss_clip": 1.01430392, "balance_loss_mlp": 1.01901197, "epoch": 0.39813617916729294, "flos": 21140711994240.0, "grad_norm": 1.8014180249784413, "language_loss": 0.8001368, "learning_rate": 2.738978637623252e-06, "loss": 0.8211621, "num_input_tokens_seen": 142266990, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.4140625, "step": 6622, "time_per_iteration": 2.3750078678131104 }, { "auxiliary_loss_clip": 0.01060166, "auxiliary_loss_mlp": 0.01044838, "balance_loss_clip": 1.01501179, "balance_loss_mlp": 1.0180825, "epoch": 0.3981963024199609, "flos": 18987513580800.0, "grad_norm": 1.7673162268155533, "language_loss": 0.76781547, "learning_rate": 2.738616722197674e-06, "loss": 0.78886551, "num_input_tokens_seen": 142287170, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.421875, "step": 6623, "time_per_iteration": 3.832249402999878 }, { "auxiliary_loss_clip": 0.01060062, "auxiliary_loss_mlp": 0.01047762, "balance_loss_clip": 1.01799536, "balance_loss_mlp": 1.01857662, "epoch": 0.39825642567262887, "flos": 16574352117120.0, "grad_norm": 1.7729362276481357, "language_loss": 0.81874955, "learning_rate": 2.7382547787636766e-06, "loss": 0.83982778, "num_input_tokens_seen": 142305405, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.41601562, "step": 6624, "time_per_iteration": 3.7764759063720703 }, { "auxiliary_loss_clip": 0.01064795, "auxiliary_loss_mlp": 0.01054688, "balance_loss_clip": 1.02195334, "balance_loss_mlp": 1.02065337, "epoch": 0.39831654892529683, "flos": 22198754880000.0, "grad_norm": 2.372502480895591, "language_loss": 0.8499375, "learning_rate": 2.7378928073349832e-06, "loss": 0.87113237, "num_input_tokens_seen": 142322710, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.44140625, "step": 6625, "time_per_iteration": 2.3927953243255615 }, { "auxiliary_loss_clip": 0.01060421, "auxiliary_loss_mlp": 0.01045541, "balance_loss_clip": 1.01719284, "balance_loss_mlp": 1.01901603, "epoch": 0.39837667217796485, "flos": 10487209927680.0, "grad_norm": 2.0360359494721583, "language_loss": 0.88454735, "learning_rate": 2.737530807925321e-06, "loss": 0.90560693, "num_input_tokens_seen": 142338535, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.4140625, "step": 6626, "time_per_iteration": 3.7187352180480957 }, { "auxiliary_loss_clip": 0.01062255, "auxiliary_loss_mlp": 0.01043573, "balance_loss_clip": 1.01181602, "balance_loss_mlp": 1.01987875, "epoch": 0.3984367954306328, "flos": 17964383921280.0, "grad_norm": 2.33475797686895, "language_loss": 0.84539515, "learning_rate": 2.737168780548417e-06, "loss": 0.86645347, "num_input_tokens_seen": 142354570, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.42382812, "step": 6627, "time_per_iteration": 2.368659257888794 }, { "auxiliary_loss_clip": 0.01059556, "auxiliary_loss_mlp": 0.01046328, "balance_loss_clip": 1.01838541, "balance_loss_mlp": 1.01868248, "epoch": 0.3984969186833008, "flos": 22709953140480.0, "grad_norm": 1.5788651830278866, "language_loss": 0.84025139, "learning_rate": 2.736806725217998e-06, "loss": 0.86131024, "num_input_tokens_seen": 142374395, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41015625, "step": 6628, "time_per_iteration": 2.3978657722473145 }, { "auxiliary_loss_clip": 0.01061941, "auxiliary_loss_mlp": 0.01044061, "balance_loss_clip": 1.01574898, "balance_loss_mlp": 1.01930296, "epoch": 0.39855704193596875, "flos": 23404619928960.0, "grad_norm": 1.69412254065451, "language_loss": 0.7223711, "learning_rate": 2.7364446419477945e-06, "loss": 0.74343109, "num_input_tokens_seen": 142396040, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.42578125, "step": 6629, "time_per_iteration": 3.824693441390991 }, { "auxiliary_loss_clip": 0.0105923, "auxiliary_loss_mlp": 0.01045725, "balance_loss_clip": 1.01630449, "balance_loss_mlp": 1.01898694, "epoch": 0.3986171651886367, "flos": 21250862933760.0, "grad_norm": 1.6238591893026357, "language_loss": 0.81588125, "learning_rate": 2.7360825307515366e-06, "loss": 0.83693081, "num_input_tokens_seen": 142415495, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.40234375, "step": 6630, "time_per_iteration": 2.385049343109131 }, { "auxiliary_loss_clip": 0.01062793, "auxiliary_loss_mlp": 0.01042597, "balance_loss_clip": 1.0134145, "balance_loss_mlp": 1.02047205, "epoch": 0.3986772884413047, "flos": 12457882419840.0, "grad_norm": 1.8881750720265338, "language_loss": 0.75819683, "learning_rate": 2.7357203916429555e-06, "loss": 0.77925068, "num_input_tokens_seen": 142431865, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.42382812, "step": 6631, "time_per_iteration": 2.4107179641723633 }, { "auxiliary_loss_clip": 0.01061633, "auxiliary_loss_mlp": 0.01046131, "balance_loss_clip": 1.01609027, "balance_loss_mlp": 1.01885915, "epoch": 0.39873741169397264, "flos": 19645102638720.0, "grad_norm": 1.7429534159278208, "language_loss": 0.72554779, "learning_rate": 2.735358224635783e-06, "loss": 0.74662542, "num_input_tokens_seen": 142450595, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.42773438, "step": 6632, "time_per_iteration": 2.3726062774658203 }, { "auxiliary_loss_clip": 0.01059155, "auxiliary_loss_mlp": 0.01042889, "balance_loss_clip": 1.01495862, "balance_loss_mlp": 1.01751578, "epoch": 0.3987975349466406, "flos": 21683821104000.0, "grad_norm": 1.8342556236667038, "language_loss": 0.76040411, "learning_rate": 2.7349960297437533e-06, "loss": 0.78142458, "num_input_tokens_seen": 142466650, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41601562, "step": 6633, "time_per_iteration": 2.389531135559082 }, { "auxiliary_loss_clip": 0.01061003, "auxiliary_loss_mlp": 0.01041923, "balance_loss_clip": 1.01368248, "balance_loss_mlp": 1.01840782, "epoch": 0.3988576581993086, "flos": 23912955457920.0, "grad_norm": 1.8908097974129379, "language_loss": 0.82414973, "learning_rate": 2.7346338069806e-06, "loss": 0.84517902, "num_input_tokens_seen": 142486165, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.42578125, "step": 6634, "time_per_iteration": 2.4019882678985596 }, { "auxiliary_loss_clip": 0.01062658, "auxiliary_loss_mlp": 0.01049663, "balance_loss_clip": 1.0190506, "balance_loss_mlp": 1.02001548, "epoch": 0.39891778145197654, "flos": 18148934701440.0, "grad_norm": 1.856461988518296, "language_loss": 0.76246297, "learning_rate": 2.7342715563600597e-06, "loss": 0.78358614, "num_input_tokens_seen": 142505035, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.42578125, "step": 6635, "time_per_iteration": 2.4229767322540283 }, { "auxiliary_loss_clip": 0.01064688, "auxiliary_loss_mlp": 0.01051002, "balance_loss_clip": 1.01852894, "balance_loss_mlp": 1.01884282, "epoch": 0.3989779047046445, "flos": 22594356028800.0, "grad_norm": 2.14017694516777, "language_loss": 0.67673868, "learning_rate": 2.733909277895868e-06, "loss": 0.69789559, "num_input_tokens_seen": 142521870, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.45703125, "step": 6636, "time_per_iteration": 2.3729288578033447 }, { "auxiliary_loss_clip": 0.01059258, "auxiliary_loss_mlp": 0.01047713, "balance_loss_clip": 1.01669466, "balance_loss_mlp": 1.01782525, "epoch": 0.39903802795731247, "flos": 18076245517440.0, "grad_norm": 1.8355359461641265, "language_loss": 0.82903039, "learning_rate": 2.733546971601763e-06, "loss": 0.8501001, "num_input_tokens_seen": 142540455, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.4140625, "step": 6637, "time_per_iteration": 2.40867280960083 }, { "auxiliary_loss_clip": 0.01012921, "auxiliary_loss_mlp": 0.01004277, "balance_loss_clip": 1.0010581, "balance_loss_mlp": 1.00210822, "epoch": 0.39909815120998043, "flos": 70437722624640.0, "grad_norm": 0.7187733083083574, "language_loss": 0.53270769, "learning_rate": 2.733184637491484e-06, "loss": 0.55287969, "num_input_tokens_seen": 142599665, "router_z_loss_clip": 0.03222656, "router_z_loss_mlp": 0.10839844, "step": 6638, "time_per_iteration": 3.065455198287964 }, { "auxiliary_loss_clip": 0.01063395, "auxiliary_loss_mlp": 0.01046579, "balance_loss_clip": 1.01495266, "balance_loss_mlp": 1.0190227, "epoch": 0.39915827446264845, "flos": 18548341188480.0, "grad_norm": 1.4403640887065177, "language_loss": 0.76329911, "learning_rate": 2.732822275578769e-06, "loss": 0.78439879, "num_input_tokens_seen": 142618845, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.4453125, "step": 6639, "time_per_iteration": 2.4108400344848633 }, { "auxiliary_loss_clip": 0.01057899, "auxiliary_loss_mlp": 0.01046083, "balance_loss_clip": 1.01874816, "balance_loss_mlp": 1.01728892, "epoch": 0.3992183977153164, "flos": 29895986373120.0, "grad_norm": 1.5952850712210533, "language_loss": 0.77374399, "learning_rate": 2.7324598858773603e-06, "loss": 0.79478383, "num_input_tokens_seen": 142640885, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40625, "step": 6640, "time_per_iteration": 2.443411350250244 }, { "auxiliary_loss_clip": 0.01061827, "auxiliary_loss_mlp": 0.01044727, "balance_loss_clip": 1.01482987, "balance_loss_mlp": 1.01845527, "epoch": 0.3992785209679844, "flos": 22563981279360.0, "grad_norm": 2.3523172322104444, "language_loss": 0.83485568, "learning_rate": 2.7320974684009996e-06, "loss": 0.85592121, "num_input_tokens_seen": 142659340, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.43359375, "step": 6641, "time_per_iteration": 2.4087867736816406 }, { "auxiliary_loss_clip": 0.01063632, "auxiliary_loss_mlp": 0.01048348, "balance_loss_clip": 1.01686502, "balance_loss_mlp": 1.01995063, "epoch": 0.39933864422065235, "flos": 19681656698880.0, "grad_norm": 2.083821120885025, "language_loss": 0.78257334, "learning_rate": 2.7317350231634288e-06, "loss": 0.80369318, "num_input_tokens_seen": 142677085, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.43554688, "step": 6642, "time_per_iteration": 2.361539125442505 }, { "auxiliary_loss_clip": 0.01061425, "auxiliary_loss_mlp": 0.01049502, "balance_loss_clip": 1.01912725, "balance_loss_mlp": 1.01841474, "epoch": 0.3993987674733203, "flos": 23037403582080.0, "grad_norm": 2.794269615223449, "language_loss": 0.74068701, "learning_rate": 2.731372550178393e-06, "loss": 0.76179624, "num_input_tokens_seen": 142694595, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.4296875, "step": 6643, "time_per_iteration": 2.420633554458618 }, { "auxiliary_loss_clip": 0.01061618, "auxiliary_loss_mlp": 0.01041254, "balance_loss_clip": 1.01056993, "balance_loss_mlp": 1.01819181, "epoch": 0.3994588907259883, "flos": 19389817710720.0, "grad_norm": 1.501398667070195, "language_loss": 0.67748117, "learning_rate": 2.7310100494596375e-06, "loss": 0.69850993, "num_input_tokens_seen": 142714175, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.43359375, "step": 6644, "time_per_iteration": 2.3755807876586914 }, { "auxiliary_loss_clip": 0.01059236, "auxiliary_loss_mlp": 0.01047198, "balance_loss_clip": 1.01526237, "balance_loss_mlp": 1.01709998, "epoch": 0.39951901397865625, "flos": 13733573921280.0, "grad_norm": 2.129494547185246, "language_loss": 0.79627711, "learning_rate": 2.730647521020907e-06, "loss": 0.81734145, "num_input_tokens_seen": 142730955, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.41992188, "step": 6645, "time_per_iteration": 2.3878259658813477 }, { "auxiliary_loss_clip": 0.0106142, "auxiliary_loss_mlp": 0.01050201, "balance_loss_clip": 1.01918268, "balance_loss_mlp": 1.01861644, "epoch": 0.3995791372313242, "flos": 23585330459520.0, "grad_norm": 1.4852694359003202, "language_loss": 0.71291232, "learning_rate": 2.73028496487595e-06, "loss": 0.73402858, "num_input_tokens_seen": 142751200, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.4296875, "step": 6646, "time_per_iteration": 2.408247470855713 }, { "auxiliary_loss_clip": 0.01061675, "auxiliary_loss_mlp": 0.01045586, "balance_loss_clip": 1.015414, "balance_loss_mlp": 1.01782763, "epoch": 0.3996392604839922, "flos": 21354974208000.0, "grad_norm": 1.9324502662986602, "language_loss": 0.73321855, "learning_rate": 2.729922381038513e-06, "loss": 0.75429112, "num_input_tokens_seen": 142770170, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.4375, "step": 6647, "time_per_iteration": 2.3924667835235596 }, { "auxiliary_loss_clip": 0.01058236, "auxiliary_loss_mlp": 0.01044029, "balance_loss_clip": 1.01408339, "balance_loss_mlp": 1.01758623, "epoch": 0.39969938373666014, "flos": 26031031176960.0, "grad_norm": 1.4220041875021372, "language_loss": 0.75240505, "learning_rate": 2.7295597695223463e-06, "loss": 0.77342772, "num_input_tokens_seen": 142792680, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.40625, "step": 6648, "time_per_iteration": 2.4563000202178955 }, { "auxiliary_loss_clip": 0.01063399, "auxiliary_loss_mlp": 0.01044925, "balance_loss_clip": 1.0144316, "balance_loss_mlp": 1.01935792, "epoch": 0.3997595069893281, "flos": 20115452741760.0, "grad_norm": 1.7164734913746131, "language_loss": 0.67341286, "learning_rate": 2.7291971303412006e-06, "loss": 0.6944961, "num_input_tokens_seen": 142810510, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.44140625, "step": 6649, "time_per_iteration": 2.3850913047790527 }, { "auxiliary_loss_clip": 0.01064868, "auxiliary_loss_mlp": 0.01051622, "balance_loss_clip": 1.02123582, "balance_loss_mlp": 1.02200627, "epoch": 0.39981963024199607, "flos": 27782134928640.0, "grad_norm": 1.6936213033411966, "language_loss": 0.76815593, "learning_rate": 2.728834463508826e-06, "loss": 0.78932077, "num_input_tokens_seen": 142832455, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.4296875, "step": 6650, "time_per_iteration": 2.447040557861328 }, { "auxiliary_loss_clip": 0.0106214, "auxiliary_loss_mlp": 0.01042866, "balance_loss_clip": 1.0142678, "balance_loss_mlp": 1.01903677, "epoch": 0.39987975349466404, "flos": 21943365217920.0, "grad_norm": 1.4696525256074258, "language_loss": 0.72997165, "learning_rate": 2.728471769038975e-06, "loss": 0.75102168, "num_input_tokens_seen": 142852590, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4296875, "step": 6651, "time_per_iteration": 2.4140660762786865 }, { "auxiliary_loss_clip": 0.01063015, "auxiliary_loss_mlp": 0.01047976, "balance_loss_clip": 1.01818514, "balance_loss_mlp": 1.01865208, "epoch": 0.39993987674733206, "flos": 20703354992640.0, "grad_norm": 1.9810109388719117, "language_loss": 0.75106704, "learning_rate": 2.728109046945403e-06, "loss": 0.77217692, "num_input_tokens_seen": 142870595, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4453125, "step": 6652, "time_per_iteration": 2.3832647800445557 }, { "auxiliary_loss_clip": 0.01015664, "auxiliary_loss_mlp": 0.01013048, "balance_loss_clip": 1.00944829, "balance_loss_mlp": 1.00527608, "epoch": 0.4, "flos": 61522407106560.0, "grad_norm": 0.8477509527979804, "language_loss": 0.60769397, "learning_rate": 2.727746297241862e-06, "loss": 0.62798107, "num_input_tokens_seen": 142925805, "router_z_loss_clip": 0.03588867, "router_z_loss_mlp": 0.10351562, "step": 6653, "time_per_iteration": 2.9542505741119385 }, { "auxiliary_loss_clip": 0.01058531, "auxiliary_loss_mlp": 0.01043904, "balance_loss_clip": 1.01627088, "balance_loss_mlp": 1.01855612, "epoch": 0.400060123252668, "flos": 14501418652800.0, "grad_norm": 1.9650202821663447, "language_loss": 0.67988634, "learning_rate": 2.7273835199421085e-06, "loss": 0.70091069, "num_input_tokens_seen": 142943145, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40039062, "step": 6654, "time_per_iteration": 2.363396406173706 }, { "auxiliary_loss_clip": 0.01061348, "auxiliary_loss_mlp": 0.01050447, "balance_loss_clip": 1.02137232, "balance_loss_mlp": 1.01789975, "epoch": 0.40012024650533595, "flos": 19092462727680.0, "grad_norm": 1.9434368748815518, "language_loss": 0.91249776, "learning_rate": 2.7270207150599e-06, "loss": 0.93361568, "num_input_tokens_seen": 142956925, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.43359375, "step": 6655, "time_per_iteration": 2.4396133422851562 }, { "auxiliary_loss_clip": 0.0105877, "auxiliary_loss_mlp": 0.01042609, "balance_loss_clip": 1.01736104, "balance_loss_mlp": 1.01855183, "epoch": 0.4001803697580039, "flos": 29349735240960.0, "grad_norm": 1.6275160134194933, "language_loss": 0.74300581, "learning_rate": 2.7266578826089917e-06, "loss": 0.76401961, "num_input_tokens_seen": 142978040, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40234375, "step": 6656, "time_per_iteration": 2.469726324081421 }, { "auxiliary_loss_clip": 0.01062456, "auxiliary_loss_mlp": 0.01046747, "balance_loss_clip": 1.01775527, "balance_loss_mlp": 1.01910734, "epoch": 0.4002404930106719, "flos": 20919083483520.0, "grad_norm": 1.5268899967431677, "language_loss": 0.74756992, "learning_rate": 2.726295022603144e-06, "loss": 0.76866192, "num_input_tokens_seen": 142998390, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.43359375, "step": 6657, "time_per_iteration": 2.536707878112793 }, { "auxiliary_loss_clip": 0.01061028, "auxiliary_loss_mlp": 0.01053543, "balance_loss_clip": 1.01984262, "balance_loss_mlp": 1.01825309, "epoch": 0.40030061626333985, "flos": 28404391824000.0, "grad_norm": 1.4589387601450203, "language_loss": 0.80331975, "learning_rate": 2.725932135056117e-06, "loss": 0.82446545, "num_input_tokens_seen": 143021505, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.42773438, "step": 6658, "time_per_iteration": 2.4572863578796387 }, { "auxiliary_loss_clip": 0.01061116, "auxiliary_loss_mlp": 0.01044251, "balance_loss_clip": 1.01504517, "balance_loss_mlp": 1.01810861, "epoch": 0.4003607395160078, "flos": 25920426389760.0, "grad_norm": 1.758528267437464, "language_loss": 0.78267652, "learning_rate": 2.72556921998167e-06, "loss": 0.80373019, "num_input_tokens_seen": 143041375, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4296875, "step": 6659, "time_per_iteration": 2.4424755573272705 }, { "auxiliary_loss_clip": 0.0105584, "auxiliary_loss_mlp": 0.01036649, "balance_loss_clip": 1.0117228, "balance_loss_mlp": 1.01660311, "epoch": 0.4004208627686758, "flos": 20767840007040.0, "grad_norm": 1.6280929567521922, "language_loss": 0.73679483, "learning_rate": 2.7252062773935662e-06, "loss": 0.75771976, "num_input_tokens_seen": 143058725, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.39257812, "step": 6660, "time_per_iteration": 2.379701852798462 }, { "auxiliary_loss_clip": 0.01059942, "auxiliary_loss_mlp": 0.01045612, "balance_loss_clip": 1.01831353, "balance_loss_mlp": 1.01852059, "epoch": 0.40048098602134374, "flos": 24680067050880.0, "grad_norm": 1.854562849094713, "language_loss": 0.7245481, "learning_rate": 2.7248433073055674e-06, "loss": 0.74560368, "num_input_tokens_seen": 143076995, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.4140625, "step": 6661, "time_per_iteration": 2.4408788681030273 }, { "auxiliary_loss_clip": 0.01062383, "auxiliary_loss_mlp": 0.01049114, "balance_loss_clip": 1.01805949, "balance_loss_mlp": 1.01861656, "epoch": 0.4005411092740117, "flos": 23184562429440.0, "grad_norm": 1.7400620002660434, "language_loss": 0.77217388, "learning_rate": 2.724480309731437e-06, "loss": 0.79328889, "num_input_tokens_seen": 143096780, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.4375, "step": 6662, "time_per_iteration": 2.410400867462158 }, { "auxiliary_loss_clip": 0.01062741, "auxiliary_loss_mlp": 0.01049083, "balance_loss_clip": 1.01894653, "balance_loss_mlp": 1.01894116, "epoch": 0.4006012325266797, "flos": 17521580747520.0, "grad_norm": 1.9669718289794274, "language_loss": 0.68117326, "learning_rate": 2.7241172846849417e-06, "loss": 0.70229149, "num_input_tokens_seen": 143112590, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.4375, "step": 6663, "time_per_iteration": 4.121229648590088 }, { "auxiliary_loss_clip": 0.01060141, "auxiliary_loss_mlp": 0.01047696, "balance_loss_clip": 1.01894307, "balance_loss_mlp": 1.01758814, "epoch": 0.40066135577934764, "flos": 19856397386880.0, "grad_norm": 2.422715111093911, "language_loss": 0.87837064, "learning_rate": 2.7237542321798455e-06, "loss": 0.89944905, "num_input_tokens_seen": 143130220, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.42578125, "step": 6664, "time_per_iteration": 2.378819704055786 }, { "auxiliary_loss_clip": 0.01061525, "auxiliary_loss_mlp": 0.01045442, "balance_loss_clip": 1.01673639, "balance_loss_mlp": 1.01947939, "epoch": 0.40072147903201566, "flos": 18149039435520.0, "grad_norm": 2.0512983477917177, "language_loss": 0.86302537, "learning_rate": 2.723391152229917e-06, "loss": 0.88409507, "num_input_tokens_seen": 143147160, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.421875, "step": 6665, "time_per_iteration": 2.3858020305633545 }, { "auxiliary_loss_clip": 0.01064611, "auxiliary_loss_mlp": 0.01048257, "balance_loss_clip": 1.01580858, "balance_loss_mlp": 1.02013707, "epoch": 0.4007816022846836, "flos": 18660272607360.0, "grad_norm": 1.532348077686855, "language_loss": 0.79039949, "learning_rate": 2.7230280448489236e-06, "loss": 0.81152815, "num_input_tokens_seen": 143164605, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.4453125, "step": 6666, "time_per_iteration": 3.910855770111084 }, { "auxiliary_loss_clip": 0.01063841, "auxiliary_loss_mlp": 0.01046017, "balance_loss_clip": 1.01571417, "balance_loss_mlp": 1.02116966, "epoch": 0.4008417255373516, "flos": 25701974812800.0, "grad_norm": 2.224874818192285, "language_loss": 0.75637043, "learning_rate": 2.7226649100506333e-06, "loss": 0.77746898, "num_input_tokens_seen": 143183965, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.42773438, "step": 6667, "time_per_iteration": 2.454721689224243 }, { "auxiliary_loss_clip": 0.01065316, "auxiliary_loss_mlp": 0.01052418, "balance_loss_clip": 1.01953983, "balance_loss_mlp": 1.02074289, "epoch": 0.40090184879001955, "flos": 22857461101440.0, "grad_norm": 1.4613830830131478, "language_loss": 0.76566958, "learning_rate": 2.7223017478488183e-06, "loss": 0.78684688, "num_input_tokens_seen": 143204965, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.4453125, "step": 6668, "time_per_iteration": 3.9102590084075928 }, { "auxiliary_loss_clip": 0.01062541, "auxiliary_loss_mlp": 0.0104991, "balance_loss_clip": 1.01972628, "balance_loss_mlp": 1.02057731, "epoch": 0.4009619720426875, "flos": 29058559568640.0, "grad_norm": 1.769861034234263, "language_loss": 0.83535975, "learning_rate": 2.721938558257248e-06, "loss": 0.85648429, "num_input_tokens_seen": 143225015, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.41992188, "step": 6669, "time_per_iteration": 2.4540603160858154 }, { "auxiliary_loss_clip": 0.01012869, "auxiliary_loss_mlp": 0.01003825, "balance_loss_clip": 0.99979627, "balance_loss_mlp": 1.00334835, "epoch": 0.4010220952953555, "flos": 66056332464000.0, "grad_norm": 0.7023188949266852, "language_loss": 0.53446221, "learning_rate": 2.721575341289695e-06, "loss": 0.55462909, "num_input_tokens_seen": 143294925, "router_z_loss_clip": 0.0402832, "router_z_loss_mlp": 0.09570312, "step": 6670, "time_per_iteration": 3.3570563793182373 }, { "auxiliary_loss_clip": 0.01061437, "auxiliary_loss_mlp": 0.01043771, "balance_loss_clip": 1.01527977, "balance_loss_mlp": 1.01935291, "epoch": 0.40108221854802345, "flos": 29641539317760.0, "grad_norm": 1.6111001709736401, "language_loss": 0.89767575, "learning_rate": 2.7212120969599333e-06, "loss": 0.91872787, "num_input_tokens_seen": 143314170, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41992188, "step": 6671, "time_per_iteration": 2.476142644882202 }, { "auxiliary_loss_clip": 0.01062571, "auxiliary_loss_mlp": 0.0104744, "balance_loss_clip": 1.01710141, "balance_loss_mlp": 1.01998568, "epoch": 0.4011423418006914, "flos": 19928772368640.0, "grad_norm": 1.737198903830294, "language_loss": 0.79913384, "learning_rate": 2.720848825281736e-06, "loss": 0.820234, "num_input_tokens_seen": 143330050, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.42578125, "step": 6672, "time_per_iteration": 2.35787034034729 }, { "auxiliary_loss_clip": 0.01059416, "auxiliary_loss_mlp": 0.01041914, "balance_loss_clip": 1.01397097, "balance_loss_mlp": 1.01866913, "epoch": 0.4012024650533594, "flos": 20083262601600.0, "grad_norm": 2.0319766543297653, "language_loss": 0.65071714, "learning_rate": 2.72048552626888e-06, "loss": 0.6717304, "num_input_tokens_seen": 143348650, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40820312, "step": 6673, "time_per_iteration": 2.4052648544311523 }, { "auxiliary_loss_clip": 0.01061146, "auxiliary_loss_mlp": 0.01048163, "balance_loss_clip": 1.01816988, "balance_loss_mlp": 1.01838088, "epoch": 0.40126258830602735, "flos": 21694469068800.0, "grad_norm": 1.6722654332631037, "language_loss": 0.80894881, "learning_rate": 2.7201221999351402e-06, "loss": 0.83004189, "num_input_tokens_seen": 143370275, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.42773438, "step": 6674, "time_per_iteration": 2.433069944381714 }, { "auxiliary_loss_clip": 0.01062327, "auxiliary_loss_mlp": 0.01046322, "balance_loss_clip": 1.01600671, "balance_loss_mlp": 1.01843572, "epoch": 0.4013227115586953, "flos": 12019582811520.0, "grad_norm": 3.7031268573787806, "language_loss": 0.84638011, "learning_rate": 2.719758846294294e-06, "loss": 0.86746663, "num_input_tokens_seen": 143385390, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.43945312, "step": 6675, "time_per_iteration": 2.3563990592956543 }, { "auxiliary_loss_clip": 0.01058879, "auxiliary_loss_mlp": 0.01047005, "balance_loss_clip": 1.017012, "balance_loss_mlp": 1.01753294, "epoch": 0.4013828348113633, "flos": 25446340771200.0, "grad_norm": 1.6749227630775092, "language_loss": 0.9465239, "learning_rate": 2.71939546536012e-06, "loss": 0.96758276, "num_input_tokens_seen": 143404215, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.4140625, "step": 6676, "time_per_iteration": 2.435750961303711 }, { "auxiliary_loss_clip": 0.01067185, "auxiliary_loss_mlp": 0.01048868, "balance_loss_clip": 1.01501238, "balance_loss_mlp": 1.02103448, "epoch": 0.40144295806403124, "flos": 18582102339840.0, "grad_norm": 1.8065313939949859, "language_loss": 0.80436206, "learning_rate": 2.719032057146399e-06, "loss": 0.82552266, "num_input_tokens_seen": 143422245, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 0.4609375, "step": 6677, "time_per_iteration": 2.355731964111328 }, { "auxiliary_loss_clip": 0.01061757, "auxiliary_loss_mlp": 0.01046663, "balance_loss_clip": 1.01910138, "balance_loss_mlp": 1.01956284, "epoch": 0.4015030813166992, "flos": 22929102944640.0, "grad_norm": 1.849750971415185, "language_loss": 0.85047531, "learning_rate": 2.71866862166691e-06, "loss": 0.8715595, "num_input_tokens_seen": 143443130, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.421875, "step": 6678, "time_per_iteration": 2.422074317932129 }, { "auxiliary_loss_clip": 0.01061434, "auxiliary_loss_mlp": 0.01049981, "balance_loss_clip": 1.02079833, "balance_loss_mlp": 1.0200417, "epoch": 0.4015632045693672, "flos": 20594007014400.0, "grad_norm": 2.616815365461533, "language_loss": 0.64778596, "learning_rate": 2.718305158935434e-06, "loss": 0.66890013, "num_input_tokens_seen": 143461385, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.4140625, "step": 6679, "time_per_iteration": 2.3820815086364746 }, { "auxiliary_loss_clip": 0.01058955, "auxiliary_loss_mlp": 0.01042188, "balance_loss_clip": 1.01391125, "balance_loss_mlp": 1.01791775, "epoch": 0.4016233278220352, "flos": 23437857409920.0, "grad_norm": 1.4673847173390842, "language_loss": 0.79642493, "learning_rate": 2.7179416689657554e-06, "loss": 0.81743634, "num_input_tokens_seen": 143481750, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.41015625, "step": 6680, "time_per_iteration": 2.4193241596221924 }, { "auxiliary_loss_clip": 0.0106556, "auxiliary_loss_mlp": 0.01058581, "balance_loss_clip": 1.02777719, "balance_loss_mlp": 1.01996183, "epoch": 0.40168345107470316, "flos": 21430072275840.0, "grad_norm": 1.6660204425943363, "language_loss": 0.77281785, "learning_rate": 2.7175781517716556e-06, "loss": 0.79405922, "num_input_tokens_seen": 143501540, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.45703125, "step": 6681, "time_per_iteration": 2.43605637550354 }, { "auxiliary_loss_clip": 0.01059974, "auxiliary_loss_mlp": 0.01045526, "balance_loss_clip": 1.01661801, "balance_loss_mlp": 1.01739991, "epoch": 0.4017435743273711, "flos": 22856099558400.0, "grad_norm": 1.7524689865711462, "language_loss": 0.65529764, "learning_rate": 2.7172146073669213e-06, "loss": 0.67635268, "num_input_tokens_seen": 143520530, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.42578125, "step": 6682, "time_per_iteration": 2.4062209129333496 }, { "auxiliary_loss_clip": 0.01063755, "auxiliary_loss_mlp": 0.01051125, "balance_loss_clip": 1.02036881, "balance_loss_mlp": 1.01944768, "epoch": 0.4018036975800391, "flos": 28621028010240.0, "grad_norm": 1.8342176293817907, "language_loss": 0.74788964, "learning_rate": 2.716851035765337e-06, "loss": 0.76903838, "num_input_tokens_seen": 143540210, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.44335938, "step": 6683, "time_per_iteration": 2.4276769161224365 }, { "auxiliary_loss_clip": 0.01060575, "auxiliary_loss_mlp": 0.01045355, "balance_loss_clip": 1.01619673, "balance_loss_mlp": 1.01759434, "epoch": 0.40186382083270705, "flos": 26650006404480.0, "grad_norm": 1.5599264722987427, "language_loss": 0.74250078, "learning_rate": 2.7164874369806896e-06, "loss": 0.76356006, "num_input_tokens_seen": 143560940, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.4296875, "step": 6684, "time_per_iteration": 2.5580732822418213 }, { "auxiliary_loss_clip": 0.01012966, "auxiliary_loss_mlp": 0.01008228, "balance_loss_clip": 1.00469911, "balance_loss_mlp": 1.00378013, "epoch": 0.401923944085375, "flos": 59257102717440.0, "grad_norm": 0.8103456544754818, "language_loss": 0.60447168, "learning_rate": 2.716123811026767e-06, "loss": 0.62468362, "num_input_tokens_seen": 143624015, "router_z_loss_clip": 0.03540039, "router_z_loss_mlp": 0.09179688, "step": 6685, "time_per_iteration": 3.1436448097229004 }, { "auxiliary_loss_clip": 0.01062068, "auxiliary_loss_mlp": 0.01046789, "balance_loss_clip": 1.01565158, "balance_loss_mlp": 1.01835144, "epoch": 0.401984067338043, "flos": 16981858039680.0, "grad_norm": 1.600130865015991, "language_loss": 0.71402645, "learning_rate": 2.715760157917357e-06, "loss": 0.73511505, "num_input_tokens_seen": 143642750, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.4375, "step": 6686, "time_per_iteration": 2.3819172382354736 }, { "auxiliary_loss_clip": 0.01060265, "auxiliary_loss_mlp": 0.01050836, "balance_loss_clip": 1.02104568, "balance_loss_mlp": 1.018471, "epoch": 0.40204419059071095, "flos": 24971347457280.0, "grad_norm": 1.3590088221860444, "language_loss": 0.75248116, "learning_rate": 2.7153964776662504e-06, "loss": 0.77359217, "num_input_tokens_seen": 143664515, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.41796875, "step": 6687, "time_per_iteration": 2.4135968685150146 }, { "auxiliary_loss_clip": 0.01063911, "auxiliary_loss_mlp": 0.01046681, "balance_loss_clip": 1.01628256, "balance_loss_mlp": 1.01979017, "epoch": 0.4021043138433789, "flos": 23476331594880.0, "grad_norm": 1.7668070233092341, "language_loss": 0.71786571, "learning_rate": 2.7150327702872385e-06, "loss": 0.73897159, "num_input_tokens_seen": 143683135, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.44140625, "step": 6688, "time_per_iteration": 2.420823574066162 }, { "auxiliary_loss_clip": 0.0106573, "auxiliary_loss_mlp": 0.01049107, "balance_loss_clip": 1.01670575, "balance_loss_mlp": 1.02037179, "epoch": 0.4021644370960469, "flos": 25994581850880.0, "grad_norm": 1.8283297126128062, "language_loss": 0.65821147, "learning_rate": 2.7146690357941112e-06, "loss": 0.67935979, "num_input_tokens_seen": 143703985, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.453125, "step": 6689, "time_per_iteration": 2.427699565887451 }, { "auxiliary_loss_clip": 0.01062261, "auxiliary_loss_mlp": 0.01040541, "balance_loss_clip": 1.0101788, "balance_loss_mlp": 1.0184983, "epoch": 0.40222456034871484, "flos": 13587183123840.0, "grad_norm": 2.2101844232240353, "language_loss": 0.75618124, "learning_rate": 2.7143052742006632e-06, "loss": 0.77720922, "num_input_tokens_seen": 143719245, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.4375, "step": 6690, "time_per_iteration": 2.3774843215942383 }, { "auxiliary_loss_clip": 0.01058973, "auxiliary_loss_mlp": 0.01048189, "balance_loss_clip": 1.0186491, "balance_loss_mlp": 1.01700807, "epoch": 0.4022846836013828, "flos": 24276925048320.0, "grad_norm": 1.6139898094195189, "language_loss": 0.75693083, "learning_rate": 2.7139414855206872e-06, "loss": 0.7780025, "num_input_tokens_seen": 143739575, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.41992188, "step": 6691, "time_per_iteration": 2.4119131565093994 }, { "auxiliary_loss_clip": 0.01063984, "auxiliary_loss_mlp": 0.01048119, "balance_loss_clip": 1.01613498, "balance_loss_mlp": 1.01987576, "epoch": 0.40234480685405083, "flos": 20150715081600.0, "grad_norm": 1.6454589150618726, "language_loss": 0.74889487, "learning_rate": 2.7135776697679785e-06, "loss": 0.77001584, "num_input_tokens_seen": 143758515, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.44140625, "step": 6692, "time_per_iteration": 2.3972322940826416 }, { "auxiliary_loss_clip": 0.0106072, "auxiliary_loss_mlp": 0.01044935, "balance_loss_clip": 1.0152638, "balance_loss_mlp": 1.01783752, "epoch": 0.4024049301067188, "flos": 22929102944640.0, "grad_norm": 1.7758819025610326, "language_loss": 0.84984052, "learning_rate": 2.7132138269563333e-06, "loss": 0.87089705, "num_input_tokens_seen": 143776770, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4296875, "step": 6693, "time_per_iteration": 2.3834784030914307 }, { "auxiliary_loss_clip": 0.01063207, "auxiliary_loss_mlp": 0.01048585, "balance_loss_clip": 1.01997447, "balance_loss_mlp": 1.02016997, "epoch": 0.40246505335938676, "flos": 36026944185600.0, "grad_norm": 1.6776854000515748, "language_loss": 0.71443915, "learning_rate": 2.7128499570995483e-06, "loss": 0.73555708, "num_input_tokens_seen": 143798450, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.4296875, "step": 6694, "time_per_iteration": 2.752451181411743 }, { "auxiliary_loss_clip": 0.01060607, "auxiliary_loss_mlp": 0.01049091, "balance_loss_clip": 1.01887155, "balance_loss_mlp": 1.01810813, "epoch": 0.4025251766120547, "flos": 20593273875840.0, "grad_norm": 2.1437230271760286, "language_loss": 0.69517159, "learning_rate": 2.7124860602114212e-06, "loss": 0.7162686, "num_input_tokens_seen": 143816995, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.42382812, "step": 6695, "time_per_iteration": 2.3652024269104004 }, { "auxiliary_loss_clip": 0.01061684, "auxiliary_loss_mlp": 0.01044508, "balance_loss_clip": 1.01503897, "balance_loss_mlp": 1.01857162, "epoch": 0.4025852998647227, "flos": 64521657296640.0, "grad_norm": 1.8443421129552275, "language_loss": 0.81325698, "learning_rate": 2.7121221363057515e-06, "loss": 0.83431894, "num_input_tokens_seen": 143842090, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.43164062, "step": 6696, "time_per_iteration": 2.7932941913604736 }, { "auxiliary_loss_clip": 0.0106314, "auxiliary_loss_mlp": 0.01052118, "balance_loss_clip": 1.02071881, "balance_loss_mlp": 1.01995003, "epoch": 0.40264542311739066, "flos": 20885252509440.0, "grad_norm": 1.7468942609580878, "language_loss": 0.72382659, "learning_rate": 2.7117581853963393e-06, "loss": 0.7449792, "num_input_tokens_seen": 143860800, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.43164062, "step": 6697, "time_per_iteration": 2.4722185134887695 }, { "auxiliary_loss_clip": 0.01060841, "auxiliary_loss_mlp": 0.01051341, "balance_loss_clip": 1.02156281, "balance_loss_mlp": 1.01876044, "epoch": 0.4027055463700586, "flos": 26248993994880.0, "grad_norm": 1.8903056939822591, "language_loss": 0.63284361, "learning_rate": 2.711394207496984e-06, "loss": 0.65396547, "num_input_tokens_seen": 143878950, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.421875, "step": 6698, "time_per_iteration": 2.522446870803833 }, { "auxiliary_loss_clip": 0.01061489, "auxiliary_loss_mlp": 0.01046534, "balance_loss_clip": 1.01431179, "balance_loss_mlp": 1.01772869, "epoch": 0.4027656696227266, "flos": 20630351606400.0, "grad_norm": 1.9319210419453947, "language_loss": 0.7880013, "learning_rate": 2.711030202621491e-06, "loss": 0.80908155, "num_input_tokens_seen": 143898385, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.4375, "step": 6699, "time_per_iteration": 2.3892745971679688 }, { "auxiliary_loss_clip": 0.01058093, "auxiliary_loss_mlp": 0.01042806, "balance_loss_clip": 1.01462543, "balance_loss_mlp": 1.01688492, "epoch": 0.40282579287539455, "flos": 22345180588800.0, "grad_norm": 1.578395550803258, "language_loss": 0.82228863, "learning_rate": 2.7106661707836605e-06, "loss": 0.84329766, "num_input_tokens_seen": 143918795, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41015625, "step": 6700, "time_per_iteration": 2.390028715133667 }, { "auxiliary_loss_clip": 0.01062217, "auxiliary_loss_mlp": 0.01053219, "balance_loss_clip": 1.01857686, "balance_loss_mlp": 1.01672196, "epoch": 0.4028859161280625, "flos": 29273799300480.0, "grad_norm": 1.7721704359051866, "language_loss": 0.75794613, "learning_rate": 2.7103021119972977e-06, "loss": 0.77910054, "num_input_tokens_seen": 143938245, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.453125, "step": 6701, "time_per_iteration": 2.4299838542938232 }, { "auxiliary_loss_clip": 0.01060404, "auxiliary_loss_mlp": 0.01045315, "balance_loss_clip": 1.01608467, "balance_loss_mlp": 1.0181638, "epoch": 0.4029460393807305, "flos": 28621028010240.0, "grad_norm": 1.8557372655085609, "language_loss": 0.67097676, "learning_rate": 2.709938026276208e-06, "loss": 0.69203395, "num_input_tokens_seen": 143960995, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.421875, "step": 6702, "time_per_iteration": 3.937166929244995 }, { "auxiliary_loss_clip": 0.01061606, "auxiliary_loss_mlp": 0.01050927, "balance_loss_clip": 1.01854944, "balance_loss_mlp": 1.01784968, "epoch": 0.40300616263339845, "flos": 22600814630400.0, "grad_norm": 3.109393997213965, "language_loss": 0.67323166, "learning_rate": 2.7095739136341964e-06, "loss": 0.69435698, "num_input_tokens_seen": 143979910, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.4375, "step": 6703, "time_per_iteration": 3.834017515182495 }, { "auxiliary_loss_clip": 0.01063482, "auxiliary_loss_mlp": 0.01049791, "balance_loss_clip": 1.01977408, "balance_loss_mlp": 1.01997495, "epoch": 0.4030662858860664, "flos": 25519134689280.0, "grad_norm": 1.7722878102181687, "language_loss": 0.83611345, "learning_rate": 2.709209774085071e-06, "loss": 0.8572461, "num_input_tokens_seen": 144000095, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.43359375, "step": 6704, "time_per_iteration": 2.409428119659424 }, { "auxiliary_loss_clip": 0.01064339, "auxiliary_loss_mlp": 0.0105334, "balance_loss_clip": 1.0230608, "balance_loss_mlp": 1.01950598, "epoch": 0.40312640913873443, "flos": 23585574839040.0, "grad_norm": 1.7398558872793886, "language_loss": 0.75493693, "learning_rate": 2.7088456076426407e-06, "loss": 0.77611375, "num_input_tokens_seen": 144019695, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.44726562, "step": 6705, "time_per_iteration": 3.710819721221924 }, { "auxiliary_loss_clip": 0.01058289, "auxiliary_loss_mlp": 0.01050146, "balance_loss_clip": 1.02146459, "balance_loss_mlp": 1.01717186, "epoch": 0.4031865323914024, "flos": 20010014835840.0, "grad_norm": 1.6180606022751252, "language_loss": 0.6816681, "learning_rate": 2.708481414320713e-06, "loss": 0.70275247, "num_input_tokens_seen": 144038525, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.41210938, "step": 6706, "time_per_iteration": 2.406973123550415 }, { "auxiliary_loss_clip": 0.01060699, "auxiliary_loss_mlp": 0.01045492, "balance_loss_clip": 1.01728725, "balance_loss_mlp": 1.01852119, "epoch": 0.40324665564407036, "flos": 21870361831680.0, "grad_norm": 1.2991496258271822, "language_loss": 0.71952873, "learning_rate": 2.7081171941330992e-06, "loss": 0.74059069, "num_input_tokens_seen": 144059485, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.421875, "step": 6707, "time_per_iteration": 2.4201419353485107 }, { "auxiliary_loss_clip": 0.01055321, "auxiliary_loss_mlp": 0.01036925, "balance_loss_clip": 1.01081824, "balance_loss_mlp": 1.0168705, "epoch": 0.4033067788967383, "flos": 23877588384000.0, "grad_norm": 1.5051597102784322, "language_loss": 0.81151378, "learning_rate": 2.707752947093611e-06, "loss": 0.8324362, "num_input_tokens_seen": 144080265, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.38476562, "step": 6708, "time_per_iteration": 3.8970463275909424 }, { "auxiliary_loss_clip": 0.01062213, "auxiliary_loss_mlp": 0.01053932, "balance_loss_clip": 1.02452302, "balance_loss_mlp": 1.01734126, "epoch": 0.4033669021494063, "flos": 17418970661760.0, "grad_norm": 2.032306469775412, "language_loss": 0.84709668, "learning_rate": 2.70738867321606e-06, "loss": 0.86825812, "num_input_tokens_seen": 144098040, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.44921875, "step": 6709, "time_per_iteration": 2.4405786991119385 }, { "auxiliary_loss_clip": 0.01062191, "auxiliary_loss_mlp": 0.01049572, "balance_loss_clip": 1.01829171, "balance_loss_mlp": 1.01900291, "epoch": 0.40342702540207426, "flos": 29599434351360.0, "grad_norm": 1.4977425945501732, "language_loss": 0.7200681, "learning_rate": 2.70702437251426e-06, "loss": 0.74118578, "num_input_tokens_seen": 144118265, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.43164062, "step": 6710, "time_per_iteration": 2.4417598247528076 }, { "auxiliary_loss_clip": 0.01058866, "auxiliary_loss_mlp": 0.01045216, "balance_loss_clip": 1.01594961, "balance_loss_mlp": 1.01725817, "epoch": 0.4034871486547422, "flos": 11283998042880.0, "grad_norm": 2.270819229349762, "language_loss": 0.86221033, "learning_rate": 2.7066600450020236e-06, "loss": 0.88325119, "num_input_tokens_seen": 144133865, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.41601562, "step": 6711, "time_per_iteration": 2.344531536102295 }, { "auxiliary_loss_clip": 0.01061794, "auxiliary_loss_mlp": 0.01053103, "balance_loss_clip": 1.02146494, "balance_loss_mlp": 1.01878953, "epoch": 0.4035472719074102, "flos": 15552130152960.0, "grad_norm": 3.22884150947739, "language_loss": 0.79094589, "learning_rate": 2.706295690693168e-06, "loss": 0.81209487, "num_input_tokens_seen": 144150125, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.4296875, "step": 6712, "time_per_iteration": 2.3570449352264404 }, { "auxiliary_loss_clip": 0.01060291, "auxiliary_loss_mlp": 0.01047087, "balance_loss_clip": 1.01796365, "balance_loss_mlp": 1.01825631, "epoch": 0.40360739516007815, "flos": 24673398981120.0, "grad_norm": 1.8306544944107408, "language_loss": 0.80552226, "learning_rate": 2.7059313096015096e-06, "loss": 0.82659608, "num_input_tokens_seen": 144169295, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.41992188, "step": 6713, "time_per_iteration": 2.3951334953308105 }, { "auxiliary_loss_clip": 0.0106136, "auxiliary_loss_mlp": 0.01049404, "balance_loss_clip": 1.01869571, "balance_loss_mlp": 1.01819503, "epoch": 0.4036675184127461, "flos": 17303338638720.0, "grad_norm": 1.6365174335071342, "language_loss": 0.89647961, "learning_rate": 2.705566901740865e-06, "loss": 0.91758728, "num_input_tokens_seen": 144185790, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.43164062, "step": 6714, "time_per_iteration": 2.368803024291992 }, { "auxiliary_loss_clip": 0.0105921, "auxiliary_loss_mlp": 0.01046106, "balance_loss_clip": 1.01868796, "balance_loss_mlp": 1.01810348, "epoch": 0.4037276416654141, "flos": 19863030545280.0, "grad_norm": 1.7477482617658588, "language_loss": 0.70868188, "learning_rate": 2.7052024671250527e-06, "loss": 0.72973502, "num_input_tokens_seen": 144205190, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41210938, "step": 6715, "time_per_iteration": 2.394578218460083 }, { "auxiliary_loss_clip": 0.01061869, "auxiliary_loss_mlp": 0.0104645, "balance_loss_clip": 1.01651692, "balance_loss_mlp": 1.01832211, "epoch": 0.40378776491808205, "flos": 18295290587520.0, "grad_norm": 1.9710862158104563, "language_loss": 0.79374021, "learning_rate": 2.704838005767892e-06, "loss": 0.81482339, "num_input_tokens_seen": 144222705, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.43554688, "step": 6716, "time_per_iteration": 2.378110408782959 }, { "auxiliary_loss_clip": 0.01059086, "auxiliary_loss_mlp": 0.01041401, "balance_loss_clip": 1.01419687, "balance_loss_mlp": 1.01771152, "epoch": 0.40384788817075, "flos": 15048472746240.0, "grad_norm": 1.847326448961894, "language_loss": 0.77407616, "learning_rate": 2.7044735176832037e-06, "loss": 0.79508102, "num_input_tokens_seen": 144239545, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.4140625, "step": 6717, "time_per_iteration": 2.3410379886627197 }, { "auxiliary_loss_clip": 0.01011772, "auxiliary_loss_mlp": 0.01004774, "balance_loss_clip": 1.00100696, "balance_loss_mlp": 1.00273681, "epoch": 0.40390801142341803, "flos": 61926805918080.0, "grad_norm": 0.9285321132764148, "language_loss": 0.6090163, "learning_rate": 2.7041090028848084e-06, "loss": 0.62918174, "num_input_tokens_seen": 144288145, "router_z_loss_clip": 0.03759766, "router_z_loss_mlp": 0.09033203, "step": 6718, "time_per_iteration": 2.8976173400878906 }, { "auxiliary_loss_clip": 0.01063473, "auxiliary_loss_mlp": 0.01047068, "balance_loss_clip": 1.01457119, "balance_loss_mlp": 1.0184443, "epoch": 0.403968134676086, "flos": 22737919006080.0, "grad_norm": 7.192007474702263, "language_loss": 0.75924873, "learning_rate": 2.7037444613865306e-06, "loss": 0.78035414, "num_input_tokens_seen": 144302315, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.45117188, "step": 6719, "time_per_iteration": 2.3622887134552 }, { "auxiliary_loss_clip": 0.01061629, "auxiliary_loss_mlp": 0.01051115, "balance_loss_clip": 1.02205181, "balance_loss_mlp": 1.01902914, "epoch": 0.40402825792875396, "flos": 19783603468800.0, "grad_norm": 2.0206800155719926, "language_loss": 0.83143884, "learning_rate": 2.7033798932021906e-06, "loss": 0.8525663, "num_input_tokens_seen": 144318990, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.42578125, "step": 6720, "time_per_iteration": 2.3720524311065674 }, { "auxiliary_loss_clip": 0.01059936, "auxiliary_loss_mlp": 0.01040851, "balance_loss_clip": 1.01171672, "balance_loss_mlp": 1.01796269, "epoch": 0.40408838118142193, "flos": 19608269287680.0, "grad_norm": 1.9210080049731184, "language_loss": 0.77692401, "learning_rate": 2.7030152983456153e-06, "loss": 0.79793191, "num_input_tokens_seen": 144335765, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.41796875, "step": 6721, "time_per_iteration": 2.354743719100952 }, { "auxiliary_loss_clip": 0.01056744, "auxiliary_loss_mlp": 0.01035482, "balance_loss_clip": 1.01170027, "balance_loss_mlp": 1.01782584, "epoch": 0.4041485044340899, "flos": 24424886856960.0, "grad_norm": 1.7519820076202277, "language_loss": 0.73954356, "learning_rate": 2.7026506768306304e-06, "loss": 0.76046586, "num_input_tokens_seen": 144355825, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.38867188, "step": 6722, "time_per_iteration": 2.433443784713745 }, { "auxiliary_loss_clip": 0.01058143, "auxiliary_loss_mlp": 0.01037932, "balance_loss_clip": 1.0118494, "balance_loss_mlp": 1.01751614, "epoch": 0.40420862768675786, "flos": 16759356744960.0, "grad_norm": 1.844964166386102, "language_loss": 0.67557895, "learning_rate": 2.7022860286710602e-06, "loss": 0.69653964, "num_input_tokens_seen": 144374320, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40625, "step": 6723, "time_per_iteration": 2.4324264526367188 }, { "auxiliary_loss_clip": 0.01061719, "auxiliary_loss_mlp": 0.01053155, "balance_loss_clip": 1.02310228, "balance_loss_mlp": 1.0191679, "epoch": 0.4042687509394258, "flos": 22490489134080.0, "grad_norm": 1.5372515738737929, "language_loss": 0.74636304, "learning_rate": 2.701921353880734e-06, "loss": 0.76751179, "num_input_tokens_seen": 144394325, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.42578125, "step": 6724, "time_per_iteration": 2.397710084915161 }, { "auxiliary_loss_clip": 0.01055316, "auxiliary_loss_mlp": 0.01041669, "balance_loss_clip": 1.01764846, "balance_loss_mlp": 1.01704979, "epoch": 0.4043288741920938, "flos": 30334844563200.0, "grad_norm": 1.773990244045234, "language_loss": 0.76103783, "learning_rate": 2.7015566524734787e-06, "loss": 0.78200769, "num_input_tokens_seen": 144412765, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3828125, "step": 6725, "time_per_iteration": 2.434135675430298 }, { "auxiliary_loss_clip": 0.01058596, "auxiliary_loss_mlp": 0.01045195, "balance_loss_clip": 1.01503479, "balance_loss_mlp": 1.01705885, "epoch": 0.40438899744476176, "flos": 46346711765760.0, "grad_norm": 1.4551007173092978, "language_loss": 0.77629346, "learning_rate": 2.701191924463126e-06, "loss": 0.79733133, "num_input_tokens_seen": 144435400, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.4140625, "step": 6726, "time_per_iteration": 2.6206133365631104 }, { "auxiliary_loss_clip": 0.01061209, "auxiliary_loss_mlp": 0.01043895, "balance_loss_clip": 1.01616669, "balance_loss_mlp": 1.01832843, "epoch": 0.4044491206974297, "flos": 13332701157120.0, "grad_norm": 1.9969705625644383, "language_loss": 0.83355892, "learning_rate": 2.7008271698635054e-06, "loss": 0.85460997, "num_input_tokens_seen": 144452925, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.4296875, "step": 6727, "time_per_iteration": 2.346367359161377 }, { "auxiliary_loss_clip": 0.01057767, "auxiliary_loss_mlp": 0.01041109, "balance_loss_clip": 1.01415539, "balance_loss_mlp": 1.01656651, "epoch": 0.4045092439500977, "flos": 12092935311360.0, "grad_norm": 2.185046239203533, "language_loss": 0.87300706, "learning_rate": 2.700462388688447e-06, "loss": 0.89399576, "num_input_tokens_seen": 144470195, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.41210938, "step": 6728, "time_per_iteration": 2.379528045654297 }, { "auxiliary_loss_clip": 0.0105822, "auxiliary_loss_mlp": 0.01041539, "balance_loss_clip": 1.01407361, "balance_loss_mlp": 1.01813388, "epoch": 0.40456936720276565, "flos": 21178592686080.0, "grad_norm": 1.931874867860244, "language_loss": 0.8283723, "learning_rate": 2.700097580951786e-06, "loss": 0.84936994, "num_input_tokens_seen": 144490320, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.40039062, "step": 6729, "time_per_iteration": 2.3803186416625977 }, { "auxiliary_loss_clip": 0.01058593, "auxiliary_loss_mlp": 0.01045891, "balance_loss_clip": 1.01873517, "balance_loss_mlp": 1.01837921, "epoch": 0.4046294904554336, "flos": 23914142444160.0, "grad_norm": 2.1374502504587496, "language_loss": 0.75254869, "learning_rate": 2.6997327466673533e-06, "loss": 0.77359354, "num_input_tokens_seen": 144508990, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40234375, "step": 6730, "time_per_iteration": 2.4174606800079346 }, { "auxiliary_loss_clip": 0.01057879, "auxiliary_loss_mlp": 0.01042804, "balance_loss_clip": 1.0158273, "balance_loss_mlp": 1.01726818, "epoch": 0.4046896137081016, "flos": 38069712368640.0, "grad_norm": 1.605361533243142, "language_loss": 0.69351548, "learning_rate": 2.699367885848985e-06, "loss": 0.7145223, "num_input_tokens_seen": 144529550, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40625, "step": 6731, "time_per_iteration": 2.5135505199432373 }, { "auxiliary_loss_clip": 0.01059303, "auxiliary_loss_mlp": 0.01045186, "balance_loss_clip": 1.01901984, "balance_loss_mlp": 1.01863468, "epoch": 0.4047497369607696, "flos": 23616298702080.0, "grad_norm": 1.595830247697664, "language_loss": 0.75338024, "learning_rate": 2.699002998510517e-06, "loss": 0.77442515, "num_input_tokens_seen": 144549310, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40625, "step": 6732, "time_per_iteration": 2.4827959537506104 }, { "auxiliary_loss_clip": 0.01057101, "auxiliary_loss_mlp": 0.01037213, "balance_loss_clip": 1.01219058, "balance_loss_mlp": 1.01701641, "epoch": 0.40480986021343757, "flos": 12822759705600.0, "grad_norm": 1.5899965227819073, "language_loss": 0.78608429, "learning_rate": 2.6986380846657852e-06, "loss": 0.80702746, "num_input_tokens_seen": 144567430, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.40234375, "step": 6733, "time_per_iteration": 2.358292818069458 }, { "auxiliary_loss_clip": 0.01061553, "auxiliary_loss_mlp": 0.01047136, "balance_loss_clip": 1.01635599, "balance_loss_mlp": 1.01883173, "epoch": 0.40486998346610553, "flos": 23767646912640.0, "grad_norm": 1.8900399297436081, "language_loss": 0.77946436, "learning_rate": 2.698273144328627e-06, "loss": 0.8005513, "num_input_tokens_seen": 144585975, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.42773438, "step": 6734, "time_per_iteration": 2.414019823074341 }, { "auxiliary_loss_clip": 0.01061464, "auxiliary_loss_mlp": 0.01046387, "balance_loss_clip": 1.01753867, "balance_loss_mlp": 1.01818848, "epoch": 0.4049301067187735, "flos": 22855715533440.0, "grad_norm": 2.242730709232734, "language_loss": 0.66661137, "learning_rate": 2.6979081775128805e-06, "loss": 0.68768984, "num_input_tokens_seen": 144605225, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.43359375, "step": 6735, "time_per_iteration": 2.4102447032928467 }, { "auxiliary_loss_clip": 0.01055584, "auxiliary_loss_mlp": 0.0103783, "balance_loss_clip": 1.0130465, "balance_loss_mlp": 1.01661885, "epoch": 0.40499022997144146, "flos": 22782886704000.0, "grad_norm": 1.6592205777843074, "language_loss": 0.84601605, "learning_rate": 2.697543184232387e-06, "loss": 0.86695015, "num_input_tokens_seen": 144624145, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38867188, "step": 6736, "time_per_iteration": 2.4836626052856445 }, { "auxiliary_loss_clip": 0.0106144, "auxiliary_loss_mlp": 0.01046602, "balance_loss_clip": 1.0168829, "balance_loss_mlp": 1.01845288, "epoch": 0.4050503532241094, "flos": 23038241454720.0, "grad_norm": 1.6406257085431104, "language_loss": 0.76354748, "learning_rate": 2.6971781645009863e-06, "loss": 0.78462791, "num_input_tokens_seen": 144644470, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4296875, "step": 6737, "time_per_iteration": 2.417987585067749 }, { "auxiliary_loss_clip": 0.01058898, "auxiliary_loss_mlp": 0.01041396, "balance_loss_clip": 1.01391852, "balance_loss_mlp": 1.01850486, "epoch": 0.4051104764767774, "flos": 16647006389760.0, "grad_norm": 2.117880713577063, "language_loss": 0.73722482, "learning_rate": 2.696813118332519e-06, "loss": 0.75822771, "num_input_tokens_seen": 144661055, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.40429688, "step": 6738, "time_per_iteration": 2.343068838119507 }, { "auxiliary_loss_clip": 0.0105814, "auxiliary_loss_mlp": 0.01037575, "balance_loss_clip": 1.01244545, "balance_loss_mlp": 1.0179857, "epoch": 0.40517059972944536, "flos": 16358134867200.0, "grad_norm": 1.7585958932197852, "language_loss": 0.75609535, "learning_rate": 2.696448045740828e-06, "loss": 0.77705252, "num_input_tokens_seen": 144677935, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.40234375, "step": 6739, "time_per_iteration": 2.378783941268921 }, { "auxiliary_loss_clip": 0.01060038, "auxiliary_loss_mlp": 0.01037669, "balance_loss_clip": 1.01084638, "balance_loss_mlp": 1.01929736, "epoch": 0.4052307229821133, "flos": 28802122565760.0, "grad_norm": 1.6793368597970977, "language_loss": 0.75017583, "learning_rate": 2.6960829467397576e-06, "loss": 0.77115285, "num_input_tokens_seen": 144697725, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40820312, "step": 6740, "time_per_iteration": 2.432659149169922 }, { "auxiliary_loss_clip": 0.01057639, "auxiliary_loss_mlp": 0.01038918, "balance_loss_clip": 1.01223946, "balance_loss_mlp": 1.01797771, "epoch": 0.4052908462347813, "flos": 21396799883520.0, "grad_norm": 1.4459962157539432, "language_loss": 0.78558034, "learning_rate": 2.695717821343153e-06, "loss": 0.80654597, "num_input_tokens_seen": 144718805, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.3984375, "step": 6741, "time_per_iteration": 2.4358673095703125 }, { "auxiliary_loss_clip": 0.01059625, "auxiliary_loss_mlp": 0.01038995, "balance_loss_clip": 1.012972, "balance_loss_mlp": 1.01760733, "epoch": 0.40535096948744925, "flos": 22417974506880.0, "grad_norm": 3.1775111946841745, "language_loss": 0.72376901, "learning_rate": 2.6953526695648577e-06, "loss": 0.74475515, "num_input_tokens_seen": 144737105, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.41992188, "step": 6742, "time_per_iteration": 3.801206588745117 }, { "auxiliary_loss_clip": 0.01061003, "auxiliary_loss_mlp": 0.01042532, "balance_loss_clip": 1.01387429, "balance_loss_mlp": 1.01931119, "epoch": 0.4054110927401172, "flos": 17010068284800.0, "grad_norm": 2.2186683237312246, "language_loss": 0.73817056, "learning_rate": 2.6949874914187202e-06, "loss": 0.75920594, "num_input_tokens_seen": 144751350, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.41601562, "step": 6743, "time_per_iteration": 3.7643866539001465 }, { "auxiliary_loss_clip": 0.01062294, "auxiliary_loss_mlp": 0.01044554, "balance_loss_clip": 1.01489472, "balance_loss_mlp": 1.01859844, "epoch": 0.4054712159927852, "flos": 21613820094720.0, "grad_norm": 2.5912123632401816, "language_loss": 0.72802448, "learning_rate": 2.694622286918588e-06, "loss": 0.74909306, "num_input_tokens_seen": 144770030, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4375, "step": 6744, "time_per_iteration": 2.3709847927093506 }, { "auxiliary_loss_clip": 0.01058081, "auxiliary_loss_mlp": 0.01037235, "balance_loss_clip": 1.01155686, "balance_loss_mlp": 1.01771688, "epoch": 0.4055313392454532, "flos": 25811357702400.0, "grad_norm": 1.493238959990269, "language_loss": 0.8127141, "learning_rate": 2.6942570560783076e-06, "loss": 0.83366728, "num_input_tokens_seen": 144790965, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40429688, "step": 6745, "time_per_iteration": 3.875493288040161 }, { "auxiliary_loss_clip": 0.01058935, "auxiliary_loss_mlp": 0.01040223, "balance_loss_clip": 1.01422369, "balance_loss_mlp": 1.01906276, "epoch": 0.40559146249812117, "flos": 14136227164800.0, "grad_norm": 1.783338820930679, "language_loss": 0.67705888, "learning_rate": 2.693891798911731e-06, "loss": 0.6980505, "num_input_tokens_seen": 144807755, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3984375, "step": 6746, "time_per_iteration": 2.32987904548645 }, { "auxiliary_loss_clip": 0.01057412, "auxiliary_loss_mlp": 0.01037829, "balance_loss_clip": 1.01225829, "balance_loss_mlp": 1.01760125, "epoch": 0.40565158575078913, "flos": 41353852320000.0, "grad_norm": 1.5094765456128498, "language_loss": 0.58264172, "learning_rate": 2.6935265154327075e-06, "loss": 0.60359418, "num_input_tokens_seen": 144832405, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.3984375, "step": 6747, "time_per_iteration": 3.9971392154693604 }, { "auxiliary_loss_clip": 0.01059427, "auxiliary_loss_mlp": 0.01043957, "balance_loss_clip": 1.01830363, "balance_loss_mlp": 1.01860774, "epoch": 0.4057117090034571, "flos": 28543381413120.0, "grad_norm": 1.9353398198231206, "language_loss": 0.8605926, "learning_rate": 2.693161205655089e-06, "loss": 0.88162649, "num_input_tokens_seen": 144853890, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.40820312, "step": 6748, "time_per_iteration": 2.5066874027252197 }, { "auxiliary_loss_clip": 0.01060312, "auxiliary_loss_mlp": 0.01043636, "balance_loss_clip": 1.01636124, "balance_loss_mlp": 1.01883328, "epoch": 0.40577183225612506, "flos": 18003102485760.0, "grad_norm": 1.7668454995603449, "language_loss": 0.83229339, "learning_rate": 2.6927958695927287e-06, "loss": 0.85333288, "num_input_tokens_seen": 144871395, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.4140625, "step": 6749, "time_per_iteration": 2.3854219913482666 }, { "auxiliary_loss_clip": 0.01060183, "auxiliary_loss_mlp": 0.01042842, "balance_loss_clip": 1.0170809, "balance_loss_mlp": 1.01904726, "epoch": 0.40583195550879303, "flos": 19535719749120.0, "grad_norm": 1.496016886610126, "language_loss": 0.76384306, "learning_rate": 2.6924305072594784e-06, "loss": 0.78487337, "num_input_tokens_seen": 144890975, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.41210938, "step": 6750, "time_per_iteration": 2.3812036514282227 }, { "auxiliary_loss_clip": 0.01061521, "auxiliary_loss_mlp": 0.01044882, "balance_loss_clip": 1.01631904, "balance_loss_mlp": 1.01812327, "epoch": 0.405892078761461, "flos": 22308382149120.0, "grad_norm": 2.222324231890562, "language_loss": 0.75204474, "learning_rate": 2.692065118669195e-06, "loss": 0.77310872, "num_input_tokens_seen": 144908170, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.43359375, "step": 6751, "time_per_iteration": 2.4254021644592285 }, { "auxiliary_loss_clip": 0.01059473, "auxiliary_loss_mlp": 0.01046835, "balance_loss_clip": 1.02028704, "balance_loss_mlp": 1.01861429, "epoch": 0.40595220201412896, "flos": 25483209033600.0, "grad_norm": 1.588514380329064, "language_loss": 0.69151771, "learning_rate": 2.6916997038357326e-06, "loss": 0.7125808, "num_input_tokens_seen": 144928020, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40820312, "step": 6752, "time_per_iteration": 2.4194719791412354 }, { "auxiliary_loss_clip": 0.01061338, "auxiliary_loss_mlp": 0.01046255, "balance_loss_clip": 1.01771617, "balance_loss_mlp": 1.01822042, "epoch": 0.4060123252667969, "flos": 49854155973120.0, "grad_norm": 2.079540196368654, "language_loss": 0.7243607, "learning_rate": 2.691334262772948e-06, "loss": 0.74543667, "num_input_tokens_seen": 144951240, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.43164062, "step": 6753, "time_per_iteration": 2.6267988681793213 }, { "auxiliary_loss_clip": 0.01061425, "auxiliary_loss_mlp": 0.01045332, "balance_loss_clip": 1.01595902, "balance_loss_mlp": 1.01819479, "epoch": 0.4060724485194649, "flos": 21134602506240.0, "grad_norm": 1.809914920535624, "language_loss": 0.73310083, "learning_rate": 2.690968795494699e-06, "loss": 0.75416839, "num_input_tokens_seen": 144969100, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.43359375, "step": 6754, "time_per_iteration": 2.356799602508545 }, { "auxiliary_loss_clip": 0.01060532, "auxiliary_loss_mlp": 0.01044699, "balance_loss_clip": 1.01595795, "balance_loss_mlp": 1.01785851, "epoch": 0.40613257177213286, "flos": 21757103781120.0, "grad_norm": 1.7232179734965538, "language_loss": 0.84392726, "learning_rate": 2.690603302014844e-06, "loss": 0.86497962, "num_input_tokens_seen": 144987065, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.42578125, "step": 6755, "time_per_iteration": 2.392775535583496 }, { "auxiliary_loss_clip": 0.01062632, "auxiliary_loss_mlp": 0.01049896, "balance_loss_clip": 1.02016509, "balance_loss_mlp": 1.01894057, "epoch": 0.4061926950248008, "flos": 25553943181440.0, "grad_norm": 1.5802576290960288, "language_loss": 0.72240341, "learning_rate": 2.6902377823472426e-06, "loss": 0.74352872, "num_input_tokens_seen": 145007310, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4375, "step": 6756, "time_per_iteration": 2.413966178894043 }, { "auxiliary_loss_clip": 0.01062418, "auxiliary_loss_mlp": 0.01048207, "balance_loss_clip": 1.0202527, "balance_loss_mlp": 1.01859176, "epoch": 0.4062528182774688, "flos": 23694678437760.0, "grad_norm": 1.7831995292535898, "language_loss": 0.80722737, "learning_rate": 2.689872236505755e-06, "loss": 0.82833362, "num_input_tokens_seen": 145026210, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.4375, "step": 6757, "time_per_iteration": 2.42838716506958 }, { "auxiliary_loss_clip": 0.01061884, "auxiliary_loss_mlp": 0.01040187, "balance_loss_clip": 1.01152921, "balance_loss_mlp": 1.01967764, "epoch": 0.4063129415301368, "flos": 21724948552320.0, "grad_norm": 2.55780635764178, "language_loss": 0.79599732, "learning_rate": 2.6895066645042437e-06, "loss": 0.81701803, "num_input_tokens_seen": 145045475, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.421875, "step": 6758, "time_per_iteration": 2.379997968673706 }, { "auxiliary_loss_clip": 0.01059067, "auxiliary_loss_mlp": 0.01041799, "balance_loss_clip": 1.01420176, "balance_loss_mlp": 1.01745605, "epoch": 0.40637306478280477, "flos": 12786729315840.0, "grad_norm": 1.8381279469803526, "language_loss": 0.91089523, "learning_rate": 2.6891410663565703e-06, "loss": 0.9319039, "num_input_tokens_seen": 145062260, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41601562, "step": 6759, "time_per_iteration": 2.3989579677581787 }, { "auxiliary_loss_clip": 0.01061198, "auxiliary_loss_mlp": 0.01048346, "balance_loss_clip": 1.01931882, "balance_loss_mlp": 1.01946926, "epoch": 0.40643318803547274, "flos": 24023350776960.0, "grad_norm": 1.8285783698118152, "language_loss": 0.66315007, "learning_rate": 2.688775442076598e-06, "loss": 0.68424547, "num_input_tokens_seen": 145082470, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.41796875, "step": 6760, "time_per_iteration": 2.3963887691497803 }, { "auxiliary_loss_clip": 0.01061744, "auxiliary_loss_mlp": 0.01045555, "balance_loss_clip": 1.01644397, "balance_loss_mlp": 1.01862264, "epoch": 0.4064933112881407, "flos": 25591265291520.0, "grad_norm": 2.029493864358895, "language_loss": 0.76041901, "learning_rate": 2.688409791678193e-06, "loss": 0.78149199, "num_input_tokens_seen": 145105685, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.43164062, "step": 6761, "time_per_iteration": 2.5706796646118164 }, { "auxiliary_loss_clip": 0.01056238, "auxiliary_loss_mlp": 0.01038526, "balance_loss_clip": 1.01294374, "balance_loss_mlp": 1.01724434, "epoch": 0.40655343454080867, "flos": 22053236866560.0, "grad_norm": 1.5425735839443133, "language_loss": 0.70640212, "learning_rate": 2.6880441151752185e-06, "loss": 0.72734976, "num_input_tokens_seen": 145125590, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.390625, "step": 6762, "time_per_iteration": 2.381429433822632 }, { "auxiliary_loss_clip": 0.01061012, "auxiliary_loss_mlp": 0.01044464, "balance_loss_clip": 1.01488817, "balance_loss_mlp": 1.01871347, "epoch": 0.40661355779347663, "flos": 26467689951360.0, "grad_norm": 1.3990169506038654, "language_loss": 0.74575359, "learning_rate": 2.6876784125815433e-06, "loss": 0.76680833, "num_input_tokens_seen": 145146810, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.42382812, "step": 6763, "time_per_iteration": 2.452731132507324 }, { "auxiliary_loss_clip": 0.01064226, "auxiliary_loss_mlp": 0.01042303, "balance_loss_clip": 1.0117023, "balance_loss_mlp": 1.01987922, "epoch": 0.4066736810461446, "flos": 13260291264000.0, "grad_norm": 1.7421632316578726, "language_loss": 0.70188379, "learning_rate": 2.687312683911033e-06, "loss": 0.72294903, "num_input_tokens_seen": 145163130, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.44335938, "step": 6764, "time_per_iteration": 2.3399744033813477 }, { "auxiliary_loss_clip": 0.01065199, "auxiliary_loss_mlp": 0.01051737, "balance_loss_clip": 1.02039683, "balance_loss_mlp": 1.02046573, "epoch": 0.40673380429881256, "flos": 28802366945280.0, "grad_norm": 2.196932128319159, "language_loss": 0.92828751, "learning_rate": 2.686946929177557e-06, "loss": 0.94945693, "num_input_tokens_seen": 145181420, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.44726562, "step": 6765, "time_per_iteration": 2.4781062602996826 }, { "auxiliary_loss_clip": 0.0106436, "auxiliary_loss_mlp": 0.01052963, "balance_loss_clip": 1.02056217, "balance_loss_mlp": 1.01881289, "epoch": 0.4067939275514805, "flos": 12494506302720.0, "grad_norm": 2.248035761685446, "language_loss": 0.80729765, "learning_rate": 2.6865811483949855e-06, "loss": 0.82847083, "num_input_tokens_seen": 145198545, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.45703125, "step": 6766, "time_per_iteration": 2.3476390838623047 }, { "auxiliary_loss_clip": 0.01062233, "auxiliary_loss_mlp": 0.01044296, "balance_loss_clip": 1.01418424, "balance_loss_mlp": 1.01811528, "epoch": 0.4068540508041485, "flos": 18769515851520.0, "grad_norm": 1.791998091660492, "language_loss": 0.77996147, "learning_rate": 2.6862153415771867e-06, "loss": 0.8010267, "num_input_tokens_seen": 145215835, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.44140625, "step": 6767, "time_per_iteration": 2.3910014629364014 }, { "auxiliary_loss_clip": 0.01061193, "auxiliary_loss_mlp": 0.01050294, "balance_loss_clip": 1.02045548, "balance_loss_mlp": 1.01970029, "epoch": 0.40691417405681646, "flos": 28511540386560.0, "grad_norm": 1.6876588172892986, "language_loss": 0.78399885, "learning_rate": 2.685849508738034e-06, "loss": 0.80511367, "num_input_tokens_seen": 145236555, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4140625, "step": 6768, "time_per_iteration": 2.4395647048950195 }, { "auxiliary_loss_clip": 0.01061894, "auxiliary_loss_mlp": 0.01044099, "balance_loss_clip": 1.01448774, "balance_loss_mlp": 1.01981759, "epoch": 0.4069742973094844, "flos": 20812982261760.0, "grad_norm": 1.8659744906486369, "language_loss": 0.88509721, "learning_rate": 2.6854836498913995e-06, "loss": 0.90615714, "num_input_tokens_seen": 145254595, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.421875, "step": 6769, "time_per_iteration": 2.3833093643188477 }, { "auxiliary_loss_clip": 0.0106052, "auxiliary_loss_mlp": 0.01046393, "balance_loss_clip": 1.02051234, "balance_loss_mlp": 1.01997995, "epoch": 0.4070344205621524, "flos": 21469209776640.0, "grad_norm": 1.6828611398483073, "language_loss": 0.81921172, "learning_rate": 2.685117765051156e-06, "loss": 0.84028083, "num_input_tokens_seen": 145274005, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40429688, "step": 6770, "time_per_iteration": 2.380537509918213 }, { "auxiliary_loss_clip": 0.01066695, "auxiliary_loss_mlp": 0.01046281, "balance_loss_clip": 1.01449966, "balance_loss_mlp": 1.02081633, "epoch": 0.4070945438148204, "flos": 26828936455680.0, "grad_norm": 1.5709361951049785, "language_loss": 0.81246626, "learning_rate": 2.6847518542311783e-06, "loss": 0.83359611, "num_input_tokens_seen": 145294850, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.45703125, "step": 6771, "time_per_iteration": 2.437577724456787 }, { "auxiliary_loss_clip": 0.01060716, "auxiliary_loss_mlp": 0.01046405, "balance_loss_clip": 1.0181402, "balance_loss_mlp": 1.01875019, "epoch": 0.4071546670674884, "flos": 26353105269120.0, "grad_norm": 1.4520322368434146, "language_loss": 0.77761745, "learning_rate": 2.6843859174453417e-06, "loss": 0.79868865, "num_input_tokens_seen": 145317050, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.41992188, "step": 6772, "time_per_iteration": 2.4378464221954346 }, { "auxiliary_loss_clip": 0.01060908, "auxiliary_loss_mlp": 0.01047238, "balance_loss_clip": 1.018628, "balance_loss_mlp": 1.01747561, "epoch": 0.40721479032015634, "flos": 17894417823360.0, "grad_norm": 1.6737339253995835, "language_loss": 0.82739937, "learning_rate": 2.6840199547075218e-06, "loss": 0.84848088, "num_input_tokens_seen": 145334480, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.43359375, "step": 6773, "time_per_iteration": 2.373967170715332 }, { "auxiliary_loss_clip": 0.01012123, "auxiliary_loss_mlp": 0.01005771, "balance_loss_clip": 1.00252843, "balance_loss_mlp": 1.00315297, "epoch": 0.4072749135728243, "flos": 49851745223040.0, "grad_norm": 0.8191802058667056, "language_loss": 0.64458442, "learning_rate": 2.683653966031597e-06, "loss": 0.66476333, "num_input_tokens_seen": 145388695, "router_z_loss_clip": 0.0324707, "router_z_loss_mlp": 0.08984375, "step": 6774, "time_per_iteration": 2.9694011211395264 }, { "auxiliary_loss_clip": 0.0106338, "auxiliary_loss_mlp": 0.01049917, "balance_loss_clip": 1.01851737, "balance_loss_mlp": 1.01855028, "epoch": 0.40733503682549227, "flos": 27562391631360.0, "grad_norm": 1.755950378974841, "language_loss": 0.74283135, "learning_rate": 2.683287951431446e-06, "loss": 0.7639643, "num_input_tokens_seen": 145408240, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.44921875, "step": 6775, "time_per_iteration": 2.5026276111602783 }, { "auxiliary_loss_clip": 0.01062374, "auxiliary_loss_mlp": 0.01051466, "balance_loss_clip": 1.02236652, "balance_loss_mlp": 1.0178808, "epoch": 0.40739516007816023, "flos": 22125891139200.0, "grad_norm": 1.3569417248987956, "language_loss": 0.79103756, "learning_rate": 2.6829219109209474e-06, "loss": 0.81217599, "num_input_tokens_seen": 145428395, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.4453125, "step": 6776, "time_per_iteration": 2.389362335205078 }, { "auxiliary_loss_clip": 0.01064488, "auxiliary_loss_mlp": 0.01041583, "balance_loss_clip": 1.01106513, "balance_loss_mlp": 1.01958525, "epoch": 0.4074552833308282, "flos": 23841104146560.0, "grad_norm": 2.3353930905530644, "language_loss": 0.81172508, "learning_rate": 2.682555844513981e-06, "loss": 0.83278579, "num_input_tokens_seen": 145448290, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.44921875, "step": 6777, "time_per_iteration": 2.4420011043548584 }, { "auxiliary_loss_clip": 0.01011575, "auxiliary_loss_mlp": 0.01005488, "balance_loss_clip": 1.00229347, "balance_loss_mlp": 1.00300074, "epoch": 0.40751540658349616, "flos": 57996702391680.0, "grad_norm": 0.6848869639511033, "language_loss": 0.53337395, "learning_rate": 2.6821897522244286e-06, "loss": 0.55354458, "num_input_tokens_seen": 145509785, "router_z_loss_clip": 0.03198242, "router_z_loss_mlp": 0.0859375, "step": 6778, "time_per_iteration": 3.062209367752075 }, { "auxiliary_loss_clip": 0.01060975, "auxiliary_loss_mlp": 0.01048254, "balance_loss_clip": 1.01728368, "balance_loss_mlp": 1.01796782, "epoch": 0.40757552983616413, "flos": 21213610646400.0, "grad_norm": 1.8422266861439953, "language_loss": 0.8412841, "learning_rate": 2.6818236340661718e-06, "loss": 0.86237645, "num_input_tokens_seen": 145528620, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4296875, "step": 6779, "time_per_iteration": 2.453190565109253 }, { "auxiliary_loss_clip": 0.01060089, "auxiliary_loss_mlp": 0.01045478, "balance_loss_clip": 1.01461482, "balance_loss_mlp": 1.01803613, "epoch": 0.4076356530888321, "flos": 26832322857600.0, "grad_norm": 1.471276927691935, "language_loss": 0.77408624, "learning_rate": 2.6814574900530957e-06, "loss": 0.79514194, "num_input_tokens_seen": 145547775, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.41992188, "step": 6780, "time_per_iteration": 2.4125490188598633 }, { "auxiliary_loss_clip": 0.01058805, "auxiliary_loss_mlp": 0.01044299, "balance_loss_clip": 1.0175004, "balance_loss_mlp": 1.01780283, "epoch": 0.40769577634150006, "flos": 12202213466880.0, "grad_norm": 2.475739249448689, "language_loss": 0.68168634, "learning_rate": 2.6810913201990827e-06, "loss": 0.70271736, "num_input_tokens_seen": 145564465, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.41015625, "step": 6781, "time_per_iteration": 3.7963500022888184 }, { "auxiliary_loss_clip": 0.01061173, "auxiliary_loss_mlp": 0.01050508, "balance_loss_clip": 1.02130198, "balance_loss_mlp": 1.01860213, "epoch": 0.407755899594168, "flos": 33653897740800.0, "grad_norm": 1.5829730633435086, "language_loss": 0.71932757, "learning_rate": 2.6807251245180183e-06, "loss": 0.74044436, "num_input_tokens_seen": 145585965, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.42578125, "step": 6782, "time_per_iteration": 2.476768732070923 }, { "auxiliary_loss_clip": 0.01063209, "auxiliary_loss_mlp": 0.01043802, "balance_loss_clip": 1.01390469, "balance_loss_mlp": 1.01948833, "epoch": 0.407816022846836, "flos": 20156300899200.0, "grad_norm": 1.58075326422288, "language_loss": 0.83490396, "learning_rate": 2.6803589030237897e-06, "loss": 0.85597414, "num_input_tokens_seen": 145605000, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.4375, "step": 6783, "time_per_iteration": 3.797112464904785 }, { "auxiliary_loss_clip": 0.01061679, "auxiliary_loss_mlp": 0.01048575, "balance_loss_clip": 1.01746106, "balance_loss_mlp": 1.01863551, "epoch": 0.40787614609950396, "flos": 21177754813440.0, "grad_norm": 1.5323948204903912, "language_loss": 0.82029247, "learning_rate": 2.679992655730283e-06, "loss": 0.84139496, "num_input_tokens_seen": 145623740, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.4296875, "step": 6784, "time_per_iteration": 3.7212843894958496 }, { "auxiliary_loss_clip": 0.01066226, "auxiliary_loss_mlp": 0.01050761, "balance_loss_clip": 1.01633346, "balance_loss_mlp": 1.01932633, "epoch": 0.407936269352172, "flos": 20519642085120.0, "grad_norm": 2.3011985904092724, "language_loss": 0.67729902, "learning_rate": 2.679626382651386e-06, "loss": 0.69846892, "num_input_tokens_seen": 145643515, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.46875, "step": 6785, "time_per_iteration": 2.381941556930542 }, { "auxiliary_loss_clip": 0.01061151, "auxiliary_loss_mlp": 0.01043017, "balance_loss_clip": 1.0140729, "balance_loss_mlp": 1.0191462, "epoch": 0.40799639260483994, "flos": 20117826714240.0, "grad_norm": 1.9588399885314052, "language_loss": 0.80781126, "learning_rate": 2.679260083800989e-06, "loss": 0.82885295, "num_input_tokens_seen": 145660890, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41992188, "step": 6786, "time_per_iteration": 3.8912861347198486 }, { "auxiliary_loss_clip": 0.01061866, "auxiliary_loss_mlp": 0.01049825, "balance_loss_clip": 1.02228785, "balance_loss_mlp": 1.01898587, "epoch": 0.4080565158575079, "flos": 20996241321600.0, "grad_norm": 1.6008802814860668, "language_loss": 0.82139266, "learning_rate": 2.678893759192982e-06, "loss": 0.84250957, "num_input_tokens_seen": 145680070, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.42773438, "step": 6787, "time_per_iteration": 2.3949897289276123 }, { "auxiliary_loss_clip": 0.01059558, "auxiliary_loss_mlp": 0.01041081, "balance_loss_clip": 1.01135063, "balance_loss_mlp": 1.01853299, "epoch": 0.40811663911017587, "flos": 19316709590400.0, "grad_norm": 1.6537731180825717, "language_loss": 0.69187158, "learning_rate": 2.678527408841255e-06, "loss": 0.71287793, "num_input_tokens_seen": 145698010, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.41015625, "step": 6788, "time_per_iteration": 2.363983154296875 }, { "auxiliary_loss_clip": 0.01060173, "auxiliary_loss_mlp": 0.01047151, "balance_loss_clip": 1.01690733, "balance_loss_mlp": 1.0187062, "epoch": 0.40817676236284384, "flos": 40623783546240.0, "grad_norm": 1.8608151108829818, "language_loss": 0.67489529, "learning_rate": 2.678161032759701e-06, "loss": 0.69596851, "num_input_tokens_seen": 145722215, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.4140625, "step": 6789, "time_per_iteration": 2.5769643783569336 }, { "auxiliary_loss_clip": 0.01062392, "auxiliary_loss_mlp": 0.01041824, "balance_loss_clip": 1.01206899, "balance_loss_mlp": 1.01908708, "epoch": 0.4082368856155118, "flos": 20521038539520.0, "grad_norm": 1.7397700748324245, "language_loss": 0.62699461, "learning_rate": 2.6777946309622123e-06, "loss": 0.64803678, "num_input_tokens_seen": 145741090, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.43359375, "step": 6790, "time_per_iteration": 2.3711435794830322 }, { "auxiliary_loss_clip": 0.01063778, "auxiliary_loss_mlp": 0.01049141, "balance_loss_clip": 1.01813483, "balance_loss_mlp": 1.02082491, "epoch": 0.40829700886817977, "flos": 11427211906560.0, "grad_norm": 2.5702063709109515, "language_loss": 0.71760309, "learning_rate": 2.677428203462683e-06, "loss": 0.73873228, "num_input_tokens_seen": 145754985, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.4296875, "step": 6791, "time_per_iteration": 2.3717784881591797 }, { "auxiliary_loss_clip": 0.01011338, "auxiliary_loss_mlp": 0.01013647, "balance_loss_clip": 1.01042819, "balance_loss_mlp": 1.00255942, "epoch": 0.40835713212084773, "flos": 67327380754560.0, "grad_norm": 0.75727990237952, "language_loss": 0.59818858, "learning_rate": 2.6770617502750093e-06, "loss": 0.61843842, "num_input_tokens_seen": 145815260, "router_z_loss_clip": 0.03222656, "router_z_loss_mlp": 0.08789062, "step": 6792, "time_per_iteration": 2.9931931495666504 }, { "auxiliary_loss_clip": 0.01063951, "auxiliary_loss_mlp": 0.01053575, "balance_loss_clip": 1.02262855, "balance_loss_mlp": 1.02037716, "epoch": 0.4084172553735157, "flos": 21760944030720.0, "grad_norm": 1.6174485141118269, "language_loss": 0.81983435, "learning_rate": 2.6766952714130857e-06, "loss": 0.84100962, "num_input_tokens_seen": 145832665, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.43554688, "step": 6793, "time_per_iteration": 2.4188411235809326 }, { "auxiliary_loss_clip": 0.01064137, "auxiliary_loss_mlp": 0.01052692, "balance_loss_clip": 1.02104151, "balance_loss_mlp": 1.01925719, "epoch": 0.40847737862618366, "flos": 27416035745280.0, "grad_norm": 2.0875358288474337, "language_loss": 0.86006355, "learning_rate": 2.6763287668908094e-06, "loss": 0.8812319, "num_input_tokens_seen": 145850240, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.44921875, "step": 6794, "time_per_iteration": 2.428314685821533 }, { "auxiliary_loss_clip": 0.01062777, "auxiliary_loss_mlp": 0.01050147, "balance_loss_clip": 1.01847315, "balance_loss_mlp": 1.01918232, "epoch": 0.4085375018788516, "flos": 18586291703040.0, "grad_norm": 1.5421340811220356, "language_loss": 0.80392683, "learning_rate": 2.6759622367220788e-06, "loss": 0.82505608, "num_input_tokens_seen": 145869545, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.43554688, "step": 6795, "time_per_iteration": 2.4042773246765137 }, { "auxiliary_loss_clip": 0.0106684, "auxiliary_loss_mlp": 0.01056083, "balance_loss_clip": 1.02172685, "balance_loss_mlp": 1.01975918, "epoch": 0.4085976251315196, "flos": 15410941148160.0, "grad_norm": 2.3042246371347272, "language_loss": 0.71610636, "learning_rate": 2.675595680920792e-06, "loss": 0.73733556, "num_input_tokens_seen": 145884025, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.47070312, "step": 6796, "time_per_iteration": 2.3477187156677246 }, { "auxiliary_loss_clip": 0.01061364, "auxiliary_loss_mlp": 0.01048321, "balance_loss_clip": 1.01885295, "balance_loss_mlp": 1.01893032, "epoch": 0.40865774838418756, "flos": 21251142224640.0, "grad_norm": 1.6162502838956696, "language_loss": 0.79360318, "learning_rate": 2.6752290995008498e-06, "loss": 0.81470007, "num_input_tokens_seen": 145903210, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.42382812, "step": 6797, "time_per_iteration": 2.428269863128662 }, { "auxiliary_loss_clip": 0.01062424, "auxiliary_loss_mlp": 0.01050811, "balance_loss_clip": 1.01964998, "balance_loss_mlp": 1.01885986, "epoch": 0.4087178716368556, "flos": 13771384790400.0, "grad_norm": 1.9984674687327404, "language_loss": 0.87323153, "learning_rate": 2.6748624924761523e-06, "loss": 0.89436388, "num_input_tokens_seen": 145920985, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.43554688, "step": 6798, "time_per_iteration": 2.345200777053833 }, { "auxiliary_loss_clip": 0.01059976, "auxiliary_loss_mlp": 0.01047376, "balance_loss_clip": 1.01924264, "balance_loss_mlp": 1.01932037, "epoch": 0.40877799488952354, "flos": 23620662622080.0, "grad_norm": 1.4749691008674646, "language_loss": 0.85057884, "learning_rate": 2.674495859860601e-06, "loss": 0.87165236, "num_input_tokens_seen": 145940350, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40625, "step": 6799, "time_per_iteration": 2.570819139480591 }, { "auxiliary_loss_clip": 0.01064471, "auxiliary_loss_mlp": 0.0105396, "balance_loss_clip": 1.02148783, "balance_loss_mlp": 1.01979423, "epoch": 0.4088381181421915, "flos": 20917861585920.0, "grad_norm": 4.89926270316154, "language_loss": 0.85365546, "learning_rate": 2.6741292016681e-06, "loss": 0.87483972, "num_input_tokens_seen": 145957460, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.44726562, "step": 6800, "time_per_iteration": 2.391497850418091 }, { "auxiliary_loss_clip": 0.0106414, "auxiliary_loss_mlp": 0.01049511, "balance_loss_clip": 1.01658499, "balance_loss_mlp": 1.01982999, "epoch": 0.4088982413948595, "flos": 13296740590080.0, "grad_norm": 1.9672785846341585, "language_loss": 0.76572549, "learning_rate": 2.6737625179125514e-06, "loss": 0.78686202, "num_input_tokens_seen": 145975285, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.44335938, "step": 6801, "time_per_iteration": 2.415107011795044 }, { "auxiliary_loss_clip": 0.01064618, "auxiliary_loss_mlp": 0.01052902, "balance_loss_clip": 1.02176428, "balance_loss_mlp": 1.02022004, "epoch": 0.40895836464752744, "flos": 15266784677760.0, "grad_norm": 1.8483921193768464, "language_loss": 0.81740606, "learning_rate": 2.673395808607861e-06, "loss": 0.83858132, "num_input_tokens_seen": 145989150, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.4453125, "step": 6802, "time_per_iteration": 2.333857774734497 }, { "auxiliary_loss_clip": 0.01062396, "auxiliary_loss_mlp": 0.01051105, "balance_loss_clip": 1.01872742, "balance_loss_mlp": 1.01839364, "epoch": 0.4090184879001954, "flos": 14500545868800.0, "grad_norm": 2.0448293928706414, "language_loss": 0.77267683, "learning_rate": 2.673029073767934e-06, "loss": 0.79381186, "num_input_tokens_seen": 146006980, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.44140625, "step": 6803, "time_per_iteration": 2.3563039302825928 }, { "auxiliary_loss_clip": 0.01063839, "auxiliary_loss_mlp": 0.01050495, "balance_loss_clip": 1.02038276, "balance_loss_mlp": 1.01959276, "epoch": 0.40907861115286337, "flos": 13880732768640.0, "grad_norm": 2.0057673584024487, "language_loss": 0.8003239, "learning_rate": 2.6726623134066764e-06, "loss": 0.82146722, "num_input_tokens_seen": 146025125, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.44140625, "step": 6804, "time_per_iteration": 2.398557424545288 }, { "auxiliary_loss_clip": 0.01065624, "auxiliary_loss_mlp": 0.01045788, "balance_loss_clip": 1.01407814, "balance_loss_mlp": 1.01924384, "epoch": 0.40913873440553133, "flos": 28036372515840.0, "grad_norm": 1.7826767445094973, "language_loss": 0.77320707, "learning_rate": 2.672295527537998e-06, "loss": 0.79432118, "num_input_tokens_seen": 146044990, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.46484375, "step": 6805, "time_per_iteration": 2.4431025981903076 }, { "auxiliary_loss_clip": 0.01064172, "auxiliary_loss_mlp": 0.01047192, "balance_loss_clip": 1.01660275, "balance_loss_mlp": 1.02039647, "epoch": 0.4091988576581993, "flos": 21617066851200.0, "grad_norm": 1.595091393049744, "language_loss": 0.80356628, "learning_rate": 2.671928716175804e-06, "loss": 0.82467985, "num_input_tokens_seen": 146066045, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.4375, "step": 6806, "time_per_iteration": 2.4364919662475586 }, { "auxiliary_loss_clip": 0.01064306, "auxiliary_loss_mlp": 0.01045408, "balance_loss_clip": 1.0114336, "balance_loss_mlp": 1.01963019, "epoch": 0.40925898091086726, "flos": 25223036515200.0, "grad_norm": 2.1469923815840883, "language_loss": 0.73694289, "learning_rate": 2.671561879334007e-06, "loss": 0.75804001, "num_input_tokens_seen": 146086280, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.4453125, "step": 6807, "time_per_iteration": 2.420684814453125 }, { "auxiliary_loss_clip": 0.01012141, "auxiliary_loss_mlp": 0.0100519, "balance_loss_clip": 1.00199521, "balance_loss_mlp": 1.0032872, "epoch": 0.40931910416353523, "flos": 68927380675200.0, "grad_norm": 0.8225381655621209, "language_loss": 0.58833492, "learning_rate": 2.6711950170265155e-06, "loss": 0.60850823, "num_input_tokens_seen": 146148840, "router_z_loss_clip": 0.03198242, "router_z_loss_mlp": 0.08886719, "step": 6808, "time_per_iteration": 3.1178836822509766 }, { "auxiliary_loss_clip": 0.01061368, "auxiliary_loss_mlp": 0.01047378, "balance_loss_clip": 1.01860046, "balance_loss_mlp": 1.01904786, "epoch": 0.4093792274162032, "flos": 20188630684800.0, "grad_norm": 1.5203462041899707, "language_loss": 0.55954403, "learning_rate": 2.670828129267242e-06, "loss": 0.58063149, "num_input_tokens_seen": 146166195, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.421875, "step": 6809, "time_per_iteration": 2.4253292083740234 }, { "auxiliary_loss_clip": 0.01064688, "auxiliary_loss_mlp": 0.01040071, "balance_loss_clip": 1.01074505, "balance_loss_mlp": 1.02055764, "epoch": 0.40943935066887116, "flos": 25227574992000.0, "grad_norm": 2.0639836128379043, "language_loss": 0.84996992, "learning_rate": 2.6704612160700983e-06, "loss": 0.87101746, "num_input_tokens_seen": 146185045, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.44140625, "step": 6810, "time_per_iteration": 2.410400390625 }, { "auxiliary_loss_clip": 0.01066675, "auxiliary_loss_mlp": 0.01054396, "balance_loss_clip": 1.01934898, "balance_loss_mlp": 1.01993322, "epoch": 0.4094994739215392, "flos": 23254284147840.0, "grad_norm": 2.0931004247645477, "language_loss": 0.79787683, "learning_rate": 2.670094277448999e-06, "loss": 0.81908751, "num_input_tokens_seen": 146204655, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.46679688, "step": 6811, "time_per_iteration": 2.4300496578216553 }, { "auxiliary_loss_clip": 0.01062557, "auxiliary_loss_mlp": 0.01046694, "balance_loss_clip": 1.01426935, "balance_loss_mlp": 1.01850212, "epoch": 0.40955959717420715, "flos": 17381264526720.0, "grad_norm": 1.4944164024388253, "language_loss": 0.71396232, "learning_rate": 2.669727313417857e-06, "loss": 0.73505485, "num_input_tokens_seen": 146222000, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.44140625, "step": 6812, "time_per_iteration": 2.4540069103240967 }, { "auxiliary_loss_clip": 0.0106329, "auxiliary_loss_mlp": 0.01052775, "balance_loss_clip": 1.02099347, "balance_loss_mlp": 1.01920903, "epoch": 0.4096197204268751, "flos": 25081254017280.0, "grad_norm": 1.5278273305104828, "language_loss": 0.68133152, "learning_rate": 2.6693603239905872e-06, "loss": 0.70249218, "num_input_tokens_seen": 146242630, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.44140625, "step": 6813, "time_per_iteration": 2.434086561203003 }, { "auxiliary_loss_clip": 0.0106371, "auxiliary_loss_mlp": 0.01046132, "balance_loss_clip": 1.01435065, "balance_loss_mlp": 1.01951337, "epoch": 0.4096798436795431, "flos": 30585591014400.0, "grad_norm": 1.983238248587552, "language_loss": 0.75298792, "learning_rate": 2.6689933091811087e-06, "loss": 0.77408636, "num_input_tokens_seen": 146263070, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.44140625, "step": 6814, "time_per_iteration": 2.474334955215454 }, { "auxiliary_loss_clip": 0.01064273, "auxiliary_loss_mlp": 0.01051539, "balance_loss_clip": 1.01806498, "balance_loss_mlp": 1.01907182, "epoch": 0.40973996693221104, "flos": 24132489287040.0, "grad_norm": 2.052459265585176, "language_loss": 0.67934561, "learning_rate": 2.6686262690033357e-06, "loss": 0.70050371, "num_input_tokens_seen": 146282890, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.453125, "step": 6815, "time_per_iteration": 2.44216251373291 }, { "auxiliary_loss_clip": 0.01063058, "auxiliary_loss_mlp": 0.01057203, "balance_loss_clip": 1.02623296, "balance_loss_mlp": 1.0199666, "epoch": 0.409800090184879, "flos": 23987809146240.0, "grad_norm": 1.6110630095695289, "language_loss": 0.77602434, "learning_rate": 2.668259203471188e-06, "loss": 0.79722691, "num_input_tokens_seen": 146301755, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4296875, "step": 6816, "time_per_iteration": 2.4137892723083496 }, { "auxiliary_loss_clip": 0.01061921, "auxiliary_loss_mlp": 0.01056473, "balance_loss_clip": 1.02531195, "balance_loss_mlp": 1.01910782, "epoch": 0.40986021343754697, "flos": 16142755489920.0, "grad_norm": 2.0466049100470824, "language_loss": 0.82873017, "learning_rate": 2.6678921125985843e-06, "loss": 0.84991407, "num_input_tokens_seen": 146316835, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.4296875, "step": 6817, "time_per_iteration": 2.3611483573913574 }, { "auxiliary_loss_clip": 0.0106434, "auxiliary_loss_mlp": 0.01049856, "balance_loss_clip": 1.01454592, "balance_loss_mlp": 1.01865613, "epoch": 0.40992033669021494, "flos": 24789659408640.0, "grad_norm": 1.584356061447555, "language_loss": 0.81667244, "learning_rate": 2.667524996399444e-06, "loss": 0.83781439, "num_input_tokens_seen": 146336650, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 0.45703125, "step": 6818, "time_per_iteration": 2.435990810394287 }, { "auxiliary_loss_clip": 0.01062159, "auxiliary_loss_mlp": 0.01046037, "balance_loss_clip": 1.01711679, "balance_loss_mlp": 1.01797116, "epoch": 0.4099804599428829, "flos": 29640631622400.0, "grad_norm": 1.4712255719436849, "language_loss": 0.67170054, "learning_rate": 2.66715785488769e-06, "loss": 0.69278252, "num_input_tokens_seen": 146357640, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.44140625, "step": 6819, "time_per_iteration": 2.472405195236206 }, { "auxiliary_loss_clip": 0.01066955, "auxiliary_loss_mlp": 0.01058595, "balance_loss_clip": 1.02414358, "balance_loss_mlp": 1.0199244, "epoch": 0.41004058319555087, "flos": 24825445418880.0, "grad_norm": 1.6015480178874426, "language_loss": 0.86720759, "learning_rate": 2.6667906880772428e-06, "loss": 0.88846314, "num_input_tokens_seen": 146379325, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.47070312, "step": 6820, "time_per_iteration": 2.441683292388916 }, { "auxiliary_loss_clip": 0.01060374, "auxiliary_loss_mlp": 0.01046067, "balance_loss_clip": 1.01757574, "balance_loss_mlp": 1.01861548, "epoch": 0.41010070644821883, "flos": 25736329457280.0, "grad_norm": 1.5714177748038813, "language_loss": 0.72728515, "learning_rate": 2.6664234959820256e-06, "loss": 0.74834955, "num_input_tokens_seen": 146398635, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41796875, "step": 6821, "time_per_iteration": 3.8891398906707764 }, { "auxiliary_loss_clip": 0.01062548, "auxiliary_loss_mlp": 0.01050078, "balance_loss_clip": 1.02115834, "balance_loss_mlp": 1.01884604, "epoch": 0.4101608297008868, "flos": 22344971120640.0, "grad_norm": 1.8565807483905061, "language_loss": 0.75654542, "learning_rate": 2.6660562786159634e-06, "loss": 0.77767169, "num_input_tokens_seen": 146417585, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4375, "step": 6822, "time_per_iteration": 3.7820310592651367 }, { "auxiliary_loss_clip": 0.01062723, "auxiliary_loss_mlp": 0.01050796, "balance_loss_clip": 1.02155375, "balance_loss_mlp": 1.01891339, "epoch": 0.41022095295355476, "flos": 21943993622400.0, "grad_norm": 2.017679989095914, "language_loss": 0.77601057, "learning_rate": 2.6656890359929796e-06, "loss": 0.79714584, "num_input_tokens_seen": 146437035, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4375, "step": 6823, "time_per_iteration": 2.428248882293701 }, { "auxiliary_loss_clip": 0.01066619, "auxiliary_loss_mlp": 0.01052481, "balance_loss_clip": 1.01781487, "balance_loss_mlp": 1.01959169, "epoch": 0.4102810762062228, "flos": 27449377960320.0, "grad_norm": 1.6429081943805641, "language_loss": 0.74115437, "learning_rate": 2.665321768127001e-06, "loss": 0.76234537, "num_input_tokens_seen": 146457370, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.46875, "step": 6824, "time_per_iteration": 3.8870837688446045 }, { "auxiliary_loss_clip": 0.01065438, "auxiliary_loss_mlp": 0.01048741, "balance_loss_clip": 1.01672125, "balance_loss_mlp": 1.01903057, "epoch": 0.41034119945889075, "flos": 24498099711360.0, "grad_norm": 2.167302937304581, "language_loss": 0.73626065, "learning_rate": 2.6649544750319548e-06, "loss": 0.75740242, "num_input_tokens_seen": 146478105, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.46484375, "step": 6825, "time_per_iteration": 2.4103164672851562 }, { "auxiliary_loss_clip": 0.01062169, "auxiliary_loss_mlp": 0.01042948, "balance_loss_clip": 1.01547003, "balance_loss_mlp": 1.01952124, "epoch": 0.4104013227115587, "flos": 24351499445760.0, "grad_norm": 1.9675360724555868, "language_loss": 0.85874903, "learning_rate": 2.664587156721768e-06, "loss": 0.8798002, "num_input_tokens_seen": 146497835, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.42578125, "step": 6826, "time_per_iteration": 3.886806011199951 }, { "auxiliary_loss_clip": 0.01059774, "auxiliary_loss_mlp": 0.0104352, "balance_loss_clip": 1.0138607, "balance_loss_mlp": 1.01924527, "epoch": 0.4104614459642267, "flos": 23728299943680.0, "grad_norm": 1.7119857450170477, "language_loss": 0.67204154, "learning_rate": 2.6642198132103696e-06, "loss": 0.69307452, "num_input_tokens_seen": 146517735, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.40625, "step": 6827, "time_per_iteration": 2.3925068378448486 }, { "auxiliary_loss_clip": 0.01062307, "auxiliary_loss_mlp": 0.01047136, "balance_loss_clip": 1.01833487, "balance_loss_mlp": 1.02006698, "epoch": 0.41052156921689464, "flos": 22126868657280.0, "grad_norm": 1.4479917564285445, "language_loss": 0.73789746, "learning_rate": 2.663852444511689e-06, "loss": 0.75899184, "num_input_tokens_seen": 146537640, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.421875, "step": 6828, "time_per_iteration": 2.4186601638793945 }, { "auxiliary_loss_clip": 0.01064876, "auxiliary_loss_mlp": 0.01051173, "balance_loss_clip": 1.02082193, "balance_loss_mlp": 1.02009714, "epoch": 0.4105816924695626, "flos": 20083332424320.0, "grad_norm": 6.643626070223331, "language_loss": 0.85293156, "learning_rate": 2.6634850506396574e-06, "loss": 0.87409204, "num_input_tokens_seen": 146554695, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.44921875, "step": 6829, "time_per_iteration": 2.3661324977874756 }, { "auxiliary_loss_clip": 0.01060786, "auxiliary_loss_mlp": 0.01044687, "balance_loss_clip": 1.01698279, "balance_loss_mlp": 1.01943994, "epoch": 0.4106418157222306, "flos": 18075826581120.0, "grad_norm": 1.539160306468367, "language_loss": 0.90236974, "learning_rate": 2.663117631608206e-06, "loss": 0.92342454, "num_input_tokens_seen": 146573740, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.4140625, "step": 6830, "time_per_iteration": 2.387516498565674 }, { "auxiliary_loss_clip": 0.01062403, "auxiliary_loss_mlp": 0.0104541, "balance_loss_clip": 1.01671624, "balance_loss_mlp": 1.01948524, "epoch": 0.41070193897489854, "flos": 21646917930240.0, "grad_norm": 1.6699164100247337, "language_loss": 0.6651032, "learning_rate": 2.662750187431268e-06, "loss": 0.68618137, "num_input_tokens_seen": 146592885, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.4296875, "step": 6831, "time_per_iteration": 2.3889496326446533 }, { "auxiliary_loss_clip": 0.01061345, "auxiliary_loss_mlp": 0.01038824, "balance_loss_clip": 1.01092923, "balance_loss_mlp": 1.019346, "epoch": 0.4107620622275665, "flos": 26647073850240.0, "grad_norm": 1.7606193525450733, "language_loss": 0.70623219, "learning_rate": 2.662382718122776e-06, "loss": 0.72723389, "num_input_tokens_seen": 146611995, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41992188, "step": 6832, "time_per_iteration": 2.4610888957977295 }, { "auxiliary_loss_clip": 0.01061171, "auxiliary_loss_mlp": 0.01040806, "balance_loss_clip": 1.01477075, "balance_loss_mlp": 1.01991701, "epoch": 0.41082218548023447, "flos": 18733310904960.0, "grad_norm": 1.8457383348601888, "language_loss": 0.75205815, "learning_rate": 2.662015223696666e-06, "loss": 0.77307796, "num_input_tokens_seen": 146628045, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.41210938, "step": 6833, "time_per_iteration": 2.394274950027466 }, { "auxiliary_loss_clip": 0.01065306, "auxiliary_loss_mlp": 0.01045024, "balance_loss_clip": 1.01572216, "balance_loss_mlp": 1.02032351, "epoch": 0.41088230873290243, "flos": 22892653618560.0, "grad_norm": 1.6200315330802235, "language_loss": 0.74133486, "learning_rate": 2.6616477041668713e-06, "loss": 0.76243818, "num_input_tokens_seen": 146648355, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.44921875, "step": 6834, "time_per_iteration": 2.414973258972168 }, { "auxiliary_loss_clip": 0.01062809, "auxiliary_loss_mlp": 0.01046306, "balance_loss_clip": 1.01743388, "balance_loss_mlp": 1.01877093, "epoch": 0.4109424319855704, "flos": 24275912618880.0, "grad_norm": 2.5396849588679644, "language_loss": 0.72563553, "learning_rate": 2.661280159547329e-06, "loss": 0.74672675, "num_input_tokens_seen": 146668370, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.43945312, "step": 6835, "time_per_iteration": 2.432098627090454 }, { "auxiliary_loss_clip": 0.01064125, "auxiliary_loss_mlp": 0.01045808, "balance_loss_clip": 1.01471841, "balance_loss_mlp": 1.02053297, "epoch": 0.41100255523823837, "flos": 12968312630400.0, "grad_norm": 1.8967255257128466, "language_loss": 0.89060462, "learning_rate": 2.660912589851978e-06, "loss": 0.91170394, "num_input_tokens_seen": 146686665, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.43554688, "step": 6836, "time_per_iteration": 2.409686803817749 }, { "auxiliary_loss_clip": 0.01061826, "auxiliary_loss_mlp": 0.01045498, "balance_loss_clip": 1.0179013, "balance_loss_mlp": 1.01968765, "epoch": 0.4110626784909064, "flos": 23144621967360.0, "grad_norm": 2.010308768167035, "language_loss": 0.70250452, "learning_rate": 2.6605449950947547e-06, "loss": 0.72357768, "num_input_tokens_seen": 146706570, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.421875, "step": 6837, "time_per_iteration": 2.4810755252838135 }, { "auxiliary_loss_clip": 0.01064152, "auxiliary_loss_mlp": 0.01046871, "balance_loss_clip": 1.01535249, "balance_loss_mlp": 1.0195961, "epoch": 0.41112280174357435, "flos": 22746297732480.0, "grad_norm": 1.679286883657385, "language_loss": 0.7641747, "learning_rate": 2.660177375289599e-06, "loss": 0.785285, "num_input_tokens_seen": 146723425, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.4453125, "step": 6838, "time_per_iteration": 2.4240612983703613 }, { "auxiliary_loss_clip": 0.01061588, "auxiliary_loss_mlp": 0.01047658, "balance_loss_clip": 1.01749766, "balance_loss_mlp": 1.01894641, "epoch": 0.4111829249962423, "flos": 21101434848000.0, "grad_norm": 1.8298889752443506, "language_loss": 0.83098042, "learning_rate": 2.659809730450451e-06, "loss": 0.85207283, "num_input_tokens_seen": 146741640, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.42578125, "step": 6839, "time_per_iteration": 2.3772270679473877 }, { "auxiliary_loss_clip": 0.01060564, "auxiliary_loss_mlp": 0.01051033, "balance_loss_clip": 1.02423477, "balance_loss_mlp": 1.01865697, "epoch": 0.4112430482489103, "flos": 21504751407360.0, "grad_norm": 1.7161224014367062, "language_loss": 0.82021427, "learning_rate": 2.6594420605912523e-06, "loss": 0.84133017, "num_input_tokens_seen": 146759195, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.41992188, "step": 6840, "time_per_iteration": 2.3982417583465576 }, { "auxiliary_loss_clip": 0.01060007, "auxiliary_loss_mlp": 0.01043061, "balance_loss_clip": 1.01641786, "balance_loss_mlp": 1.01840067, "epoch": 0.41130317150157825, "flos": 19569096875520.0, "grad_norm": 1.6912461456717376, "language_loss": 0.67917228, "learning_rate": 2.6590743657259442e-06, "loss": 0.70020294, "num_input_tokens_seen": 146774990, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.41601562, "step": 6841, "time_per_iteration": 2.3581559658050537 }, { "auxiliary_loss_clip": 0.01011919, "auxiliary_loss_mlp": 0.01006241, "balance_loss_clip": 1.00242615, "balance_loss_mlp": 1.00345182, "epoch": 0.4113632947542462, "flos": 62379593740800.0, "grad_norm": 0.7784957605423053, "language_loss": 0.59733307, "learning_rate": 2.65870664586847e-06, "loss": 0.61751473, "num_input_tokens_seen": 146839610, "router_z_loss_clip": 0.03808594, "router_z_loss_mlp": 0.08496094, "step": 6842, "time_per_iteration": 3.1188442707061768 }, { "auxiliary_loss_clip": 0.01058778, "auxiliary_loss_mlp": 0.01045651, "balance_loss_clip": 1.01956832, "balance_loss_mlp": 1.01891458, "epoch": 0.4114234180069142, "flos": 13917740676480.0, "grad_norm": 1.8507635090734271, "language_loss": 0.71458292, "learning_rate": 2.6583389010327742e-06, "loss": 0.73562717, "num_input_tokens_seen": 146857360, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3984375, "step": 6843, "time_per_iteration": 2.3552639484405518 }, { "auxiliary_loss_clip": 0.01011148, "auxiliary_loss_mlp": 0.01003666, "balance_loss_clip": 1.00030434, "balance_loss_mlp": 1.00311923, "epoch": 0.41148354125958214, "flos": 64925111635200.0, "grad_norm": 0.7091461734915483, "language_loss": 0.53678071, "learning_rate": 2.6579711312328013e-06, "loss": 0.55692887, "num_input_tokens_seen": 146917055, "router_z_loss_clip": 0.03369141, "router_z_loss_mlp": 0.08007812, "step": 6844, "time_per_iteration": 3.0310726165771484 }, { "auxiliary_loss_clip": 0.01060167, "auxiliary_loss_mlp": 0.01046832, "balance_loss_clip": 1.02122617, "balance_loss_mlp": 1.01837373, "epoch": 0.4115436645122501, "flos": 18727934555520.0, "grad_norm": 1.7246607406836638, "language_loss": 0.6764735, "learning_rate": 2.6576033364824967e-06, "loss": 0.69754356, "num_input_tokens_seen": 146935215, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.41796875, "step": 6845, "time_per_iteration": 2.355976104736328 }, { "auxiliary_loss_clip": 0.01060547, "auxiliary_loss_mlp": 0.01048549, "balance_loss_clip": 1.02122593, "balance_loss_mlp": 1.01975322, "epoch": 0.41160378776491807, "flos": 16251998734080.0, "grad_norm": 1.829155277457834, "language_loss": 0.71735072, "learning_rate": 2.657235516795808e-06, "loss": 0.73844165, "num_input_tokens_seen": 146951970, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40820312, "step": 6846, "time_per_iteration": 2.376189947128296 }, { "auxiliary_loss_clip": 0.01059713, "auxiliary_loss_mlp": 0.01053702, "balance_loss_clip": 1.02450812, "balance_loss_mlp": 1.01814067, "epoch": 0.41166391101758604, "flos": 27968640744960.0, "grad_norm": 1.5441236701952108, "language_loss": 0.66539121, "learning_rate": 2.6568676721866826e-06, "loss": 0.6865254, "num_input_tokens_seen": 146975615, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.41601562, "step": 6847, "time_per_iteration": 2.474001169204712 }, { "auxiliary_loss_clip": 0.01060394, "auxiliary_loss_mlp": 0.01050942, "balance_loss_clip": 1.02351189, "balance_loss_mlp": 1.01850748, "epoch": 0.411724034270254, "flos": 34129868572800.0, "grad_norm": 1.3692571601459331, "language_loss": 0.71301413, "learning_rate": 2.656499802669069e-06, "loss": 0.73412746, "num_input_tokens_seen": 146998855, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41992188, "step": 6848, "time_per_iteration": 2.493251323699951 }, { "auxiliary_loss_clip": 0.01010018, "auxiliary_loss_mlp": 0.0100748, "balance_loss_clip": 1.00364113, "balance_loss_mlp": 1.00187349, "epoch": 0.41178415752292197, "flos": 67920100773120.0, "grad_norm": 0.8957345610649452, "language_loss": 0.56340384, "learning_rate": 2.6561319082569174e-06, "loss": 0.58357882, "num_input_tokens_seen": 147062710, "router_z_loss_clip": 0.03833008, "router_z_loss_mlp": 0.08105469, "step": 6849, "time_per_iteration": 3.1800320148468018 }, { "auxiliary_loss_clip": 0.01062069, "auxiliary_loss_mlp": 0.01048689, "balance_loss_clip": 1.0206871, "balance_loss_mlp": 1.0203588, "epoch": 0.41184428077558993, "flos": 34312499228160.0, "grad_norm": 1.5415271242590203, "language_loss": 0.76932454, "learning_rate": 2.6557639889641783e-06, "loss": 0.7904321, "num_input_tokens_seen": 147086075, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41796875, "step": 6850, "time_per_iteration": 2.5250630378723145 }, { "auxiliary_loss_clip": 0.01058719, "auxiliary_loss_mlp": 0.01044594, "balance_loss_clip": 1.01835608, "balance_loss_mlp": 1.01854467, "epoch": 0.41190440402825795, "flos": 35442672716160.0, "grad_norm": 1.730016362075321, "language_loss": 0.68961453, "learning_rate": 2.6553960448048025e-06, "loss": 0.71064764, "num_input_tokens_seen": 147107590, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40234375, "step": 6851, "time_per_iteration": 2.5043740272521973 }, { "auxiliary_loss_clip": 0.01064678, "auxiliary_loss_mlp": 0.01060292, "balance_loss_clip": 1.0268178, "balance_loss_mlp": 1.0198673, "epoch": 0.4119645272809259, "flos": 20848838094720.0, "grad_norm": 3.386491720986252, "language_loss": 0.82797015, "learning_rate": 2.655028075792743e-06, "loss": 0.8492198, "num_input_tokens_seen": 147123715, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.44921875, "step": 6852, "time_per_iteration": 2.394551992416382 }, { "auxiliary_loss_clip": 0.01065077, "auxiliary_loss_mlp": 0.01054153, "balance_loss_clip": 1.02182341, "balance_loss_mlp": 1.01997864, "epoch": 0.4120246505335939, "flos": 27560855531520.0, "grad_norm": 2.1019546168811405, "language_loss": 0.7939347, "learning_rate": 2.6546600819419537e-06, "loss": 0.81512702, "num_input_tokens_seen": 147144290, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.45117188, "step": 6853, "time_per_iteration": 2.4236204624176025 }, { "auxiliary_loss_clip": 0.01066493, "auxiliary_loss_mlp": 0.01053512, "balance_loss_clip": 1.02039576, "balance_loss_mlp": 1.02066755, "epoch": 0.41208477378626185, "flos": 37813938681600.0, "grad_norm": 1.661838399317321, "language_loss": 0.67861915, "learning_rate": 2.6542920632663883e-06, "loss": 0.69981921, "num_input_tokens_seen": 147166340, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.45898438, "step": 6854, "time_per_iteration": 2.5552010536193848 }, { "auxiliary_loss_clip": 0.01062176, "auxiliary_loss_mlp": 0.01044732, "balance_loss_clip": 1.01687288, "balance_loss_mlp": 1.02025592, "epoch": 0.4121448970389298, "flos": 23439637889280.0, "grad_norm": 1.7829995866128785, "language_loss": 0.8520242, "learning_rate": 2.6539240197800023e-06, "loss": 0.87309331, "num_input_tokens_seen": 147184025, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41796875, "step": 6855, "time_per_iteration": 2.3911221027374268 }, { "auxiliary_loss_clip": 0.01059323, "auxiliary_loss_mlp": 0.01044764, "balance_loss_clip": 1.01729822, "balance_loss_mlp": 1.01837885, "epoch": 0.4122050202915978, "flos": 21324215433600.0, "grad_norm": 1.6422147421281972, "language_loss": 0.8034156, "learning_rate": 2.6535559514967517e-06, "loss": 0.82445645, "num_input_tokens_seen": 147202730, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41015625, "step": 6856, "time_per_iteration": 2.4088330268859863 }, { "auxiliary_loss_clip": 0.01064637, "auxiliary_loss_mlp": 0.01044611, "balance_loss_clip": 1.01588178, "balance_loss_mlp": 1.0209043, "epoch": 0.41226514354426574, "flos": 17305468231680.0, "grad_norm": 2.61216854830912, "language_loss": 0.8142494, "learning_rate": 2.6531878584305935e-06, "loss": 0.83534193, "num_input_tokens_seen": 147215315, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.4375, "step": 6857, "time_per_iteration": 2.340552568435669 }, { "auxiliary_loss_clip": 0.01064536, "auxiliary_loss_mlp": 0.01042935, "balance_loss_clip": 1.01333523, "balance_loss_mlp": 1.01994634, "epoch": 0.4123252667969337, "flos": 17637910997760.0, "grad_norm": 1.7699545814321902, "language_loss": 0.72143382, "learning_rate": 2.6528197405954873e-06, "loss": 0.74250853, "num_input_tokens_seen": 147233330, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4453125, "step": 6858, "time_per_iteration": 2.3817138671875 }, { "auxiliary_loss_clip": 0.01060955, "auxiliary_loss_mlp": 0.01045346, "balance_loss_clip": 1.01733184, "balance_loss_mlp": 1.01953876, "epoch": 0.4123853900496017, "flos": 46423101553920.0, "grad_norm": 1.5299996464339505, "language_loss": 0.60564846, "learning_rate": 2.652451598005391e-06, "loss": 0.62671149, "num_input_tokens_seen": 147257780, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.4140625, "step": 6859, "time_per_iteration": 2.5990231037139893 }, { "auxiliary_loss_clip": 0.01062274, "auxiliary_loss_mlp": 0.01045282, "balance_loss_clip": 1.01698184, "balance_loss_mlp": 1.01925504, "epoch": 0.41244551330226964, "flos": 17674220678400.0, "grad_norm": 4.9951363418552965, "language_loss": 0.75393903, "learning_rate": 2.652083430674264e-06, "loss": 0.77501458, "num_input_tokens_seen": 147276055, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.4296875, "step": 6860, "time_per_iteration": 2.3921658992767334 }, { "auxiliary_loss_clip": 0.01061158, "auxiliary_loss_mlp": 0.01041714, "balance_loss_clip": 1.01479626, "balance_loss_mlp": 1.01976669, "epoch": 0.4125056365549376, "flos": 18692846772480.0, "grad_norm": 2.033921765860662, "language_loss": 0.75122398, "learning_rate": 2.651715238616068e-06, "loss": 0.7722528, "num_input_tokens_seen": 147293200, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.4140625, "step": 6861, "time_per_iteration": 3.80245304107666 }, { "auxiliary_loss_clip": 0.01062095, "auxiliary_loss_mlp": 0.01043019, "balance_loss_clip": 1.01537478, "balance_loss_mlp": 1.02097607, "epoch": 0.41256575980760557, "flos": 17894313089280.0, "grad_norm": 13.05125102721179, "language_loss": 0.80844444, "learning_rate": 2.651347021844765e-06, "loss": 0.82949555, "num_input_tokens_seen": 147310640, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41210938, "step": 6862, "time_per_iteration": 3.8166825771331787 }, { "auxiliary_loss_clip": 0.01061694, "auxiliary_loss_mlp": 0.01048624, "balance_loss_clip": 1.02077699, "balance_loss_mlp": 1.01967061, "epoch": 0.41262588306027354, "flos": 21980233480320.0, "grad_norm": 1.6160959060446645, "language_loss": 0.77764046, "learning_rate": 2.650978780374318e-06, "loss": 0.79874361, "num_input_tokens_seen": 147329435, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.421875, "step": 6863, "time_per_iteration": 3.7784807682037354 }, { "auxiliary_loss_clip": 0.01011823, "auxiliary_loss_mlp": 0.01009009, "balance_loss_clip": 1.00571907, "balance_loss_mlp": 1.00329733, "epoch": 0.41268600631294156, "flos": 53347284656640.0, "grad_norm": 0.7012403835506678, "language_loss": 0.52745116, "learning_rate": 2.650610514218691e-06, "loss": 0.54765952, "num_input_tokens_seen": 147385805, "router_z_loss_clip": 0.03295898, "router_z_loss_mlp": 0.08496094, "step": 6864, "time_per_iteration": 2.975092649459839 }, { "auxiliary_loss_clip": 0.01064617, "auxiliary_loss_mlp": 0.01052666, "balance_loss_clip": 1.02113497, "balance_loss_mlp": 1.01954734, "epoch": 0.4127461295656095, "flos": 24384317990400.0, "grad_norm": 1.665530575754607, "language_loss": 0.73737884, "learning_rate": 2.6502422233918468e-06, "loss": 0.7585516, "num_input_tokens_seen": 147405160, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.45117188, "step": 6865, "time_per_iteration": 2.3937106132507324 }, { "auxiliary_loss_clip": 0.0101096, "auxiliary_loss_mlp": 0.01012989, "balance_loss_clip": 1.00905526, "balance_loss_mlp": 1.00220299, "epoch": 0.4128062528182775, "flos": 71701928288640.0, "grad_norm": 0.9293256228967738, "language_loss": 0.66686225, "learning_rate": 2.649873907907753e-06, "loss": 0.68710172, "num_input_tokens_seen": 147460245, "router_z_loss_clip": 0.03930664, "router_z_loss_mlp": 0.08789062, "step": 6866, "time_per_iteration": 4.365203619003296 }, { "auxiliary_loss_clip": 0.01059642, "auxiliary_loss_mlp": 0.01047732, "balance_loss_clip": 1.01944351, "balance_loss_mlp": 1.01756561, "epoch": 0.41286637607094545, "flos": 17848402784640.0, "grad_norm": 2.0524904868874434, "language_loss": 0.82679236, "learning_rate": 2.649505567780375e-06, "loss": 0.84786606, "num_input_tokens_seen": 147476200, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.421875, "step": 6867, "time_per_iteration": 2.342517137527466 }, { "auxiliary_loss_clip": 0.01063498, "auxiliary_loss_mlp": 0.01053104, "balance_loss_clip": 1.0235045, "balance_loss_mlp": 1.02028799, "epoch": 0.4129264993236134, "flos": 25548566832000.0, "grad_norm": 2.1466294765926963, "language_loss": 0.79783857, "learning_rate": 2.6491372030236815e-06, "loss": 0.81900465, "num_input_tokens_seen": 147494315, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.43164062, "step": 6868, "time_per_iteration": 2.4033260345458984 }, { "auxiliary_loss_clip": 0.01012483, "auxiliary_loss_mlp": 0.01020829, "balance_loss_clip": 1.01746738, "balance_loss_mlp": 1.00373089, "epoch": 0.4129866225762814, "flos": 65411732298240.0, "grad_norm": 0.8640014064247609, "language_loss": 0.57907015, "learning_rate": 2.64876881365164e-06, "loss": 0.59940326, "num_input_tokens_seen": 147543665, "router_z_loss_clip": 0.03369141, "router_z_loss_mlp": 0.08740234, "step": 6869, "time_per_iteration": 2.8236522674560547 }, { "auxiliary_loss_clip": 0.01058701, "auxiliary_loss_mlp": 0.01039625, "balance_loss_clip": 1.01256418, "balance_loss_mlp": 1.01801646, "epoch": 0.41304674582894935, "flos": 28875719445120.0, "grad_norm": 1.6817207501873805, "language_loss": 0.76479036, "learning_rate": 2.64840039967822e-06, "loss": 0.78577358, "num_input_tokens_seen": 147564870, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40625, "step": 6870, "time_per_iteration": 2.447242498397827 }, { "auxiliary_loss_clip": 0.01064739, "auxiliary_loss_mlp": 0.01050048, "balance_loss_clip": 1.02085376, "balance_loss_mlp": 1.02142859, "epoch": 0.4131068690816173, "flos": 22890908050560.0, "grad_norm": 1.4539915026127357, "language_loss": 0.84408987, "learning_rate": 2.6480319611173912e-06, "loss": 0.86523777, "num_input_tokens_seen": 147584840, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.43359375, "step": 6871, "time_per_iteration": 2.412222385406494 }, { "auxiliary_loss_clip": 0.01066632, "auxiliary_loss_mlp": 0.01052859, "balance_loss_clip": 1.02459431, "balance_loss_mlp": 1.02307129, "epoch": 0.4131669923342853, "flos": 26064059189760.0, "grad_norm": 1.9435468515144003, "language_loss": 0.70145893, "learning_rate": 2.6476634979831263e-06, "loss": 0.72265387, "num_input_tokens_seen": 147604635, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.43554688, "step": 6872, "time_per_iteration": 2.4217400550842285 }, { "auxiliary_loss_clip": 0.01064255, "auxiliary_loss_mlp": 0.01047931, "balance_loss_clip": 1.020751, "balance_loss_mlp": 1.02192116, "epoch": 0.41322711558695324, "flos": 19243566558720.0, "grad_norm": 1.8016787641662964, "language_loss": 0.77036691, "learning_rate": 2.6472950102893964e-06, "loss": 0.79148877, "num_input_tokens_seen": 147620700, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.42382812, "step": 6873, "time_per_iteration": 2.435338020324707 }, { "auxiliary_loss_clip": 0.01066304, "auxiliary_loss_mlp": 0.01044827, "balance_loss_clip": 1.01691985, "balance_loss_mlp": 1.02318895, "epoch": 0.4132872388396212, "flos": 22673364168960.0, "grad_norm": 1.8082373268695555, "language_loss": 0.84685111, "learning_rate": 2.6469264980501746e-06, "loss": 0.86796248, "num_input_tokens_seen": 147639490, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.43164062, "step": 6874, "time_per_iteration": 2.434215545654297 }, { "auxiliary_loss_clip": 0.0106552, "auxiliary_loss_mlp": 0.01050264, "balance_loss_clip": 1.02029467, "balance_loss_mlp": 1.02233219, "epoch": 0.4133473620922892, "flos": 20149353538560.0, "grad_norm": 1.587686525395317, "language_loss": 0.73360229, "learning_rate": 2.646557961279436e-06, "loss": 0.75476015, "num_input_tokens_seen": 147657205, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.43164062, "step": 6875, "time_per_iteration": 2.375777006149292 }, { "auxiliary_loss_clip": 0.01062955, "auxiliary_loss_mlp": 0.01047832, "balance_loss_clip": 1.02248788, "balance_loss_mlp": 1.02296412, "epoch": 0.41340748534495714, "flos": 24241627797120.0, "grad_norm": 1.4436590747716789, "language_loss": 0.83021325, "learning_rate": 2.646189399991154e-06, "loss": 0.8513211, "num_input_tokens_seen": 147677005, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40039062, "step": 6876, "time_per_iteration": 2.4261913299560547 }, { "auxiliary_loss_clip": 0.01065909, "auxiliary_loss_mlp": 0.01056401, "balance_loss_clip": 1.02628827, "balance_loss_mlp": 1.02136803, "epoch": 0.41346760859762516, "flos": 14391302624640.0, "grad_norm": 2.1708243837018077, "language_loss": 0.6633414, "learning_rate": 2.6458208141993048e-06, "loss": 0.68456447, "num_input_tokens_seen": 147693435, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.4453125, "step": 6877, "time_per_iteration": 2.3520143032073975 }, { "auxiliary_loss_clip": 0.01063247, "auxiliary_loss_mlp": 0.01047475, "balance_loss_clip": 1.02017605, "balance_loss_mlp": 1.02178693, "epoch": 0.4135277318502931, "flos": 22490908070400.0, "grad_norm": 1.7996539414602961, "language_loss": 0.77676094, "learning_rate": 2.6454522039178668e-06, "loss": 0.79786813, "num_input_tokens_seen": 147714000, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.4140625, "step": 6878, "time_per_iteration": 2.4239308834075928 }, { "auxiliary_loss_clip": 0.01063991, "auxiliary_loss_mlp": 0.01048841, "balance_loss_clip": 1.02063584, "balance_loss_mlp": 1.02185726, "epoch": 0.4135878551029611, "flos": 22417660304640.0, "grad_norm": 1.839806438391506, "language_loss": 0.81721663, "learning_rate": 2.6450835691608154e-06, "loss": 0.83834493, "num_input_tokens_seen": 147731010, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.421875, "step": 6879, "time_per_iteration": 2.3978376388549805 }, { "auxiliary_loss_clip": 0.0106166, "auxiliary_loss_mlp": 0.01046011, "balance_loss_clip": 1.01846135, "balance_loss_mlp": 1.02045822, "epoch": 0.41364797835562905, "flos": 27051996332160.0, "grad_norm": 1.5923888476707455, "language_loss": 0.85499871, "learning_rate": 2.6447149099421315e-06, "loss": 0.87607551, "num_input_tokens_seen": 147750880, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41015625, "step": 6880, "time_per_iteration": 2.4505865573883057 }, { "auxiliary_loss_clip": 0.01065058, "auxiliary_loss_mlp": 0.01049936, "balance_loss_clip": 1.02089691, "balance_loss_mlp": 1.02089071, "epoch": 0.413708101608297, "flos": 22966459966080.0, "grad_norm": 1.6314582594679947, "language_loss": 0.71768993, "learning_rate": 2.6443462262757927e-06, "loss": 0.73883986, "num_input_tokens_seen": 147771360, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.44140625, "step": 6881, "time_per_iteration": 2.398024082183838 }, { "auxiliary_loss_clip": 0.01060537, "auxiliary_loss_mlp": 0.01050141, "balance_loss_clip": 1.02422464, "balance_loss_mlp": 1.02015412, "epoch": 0.413768224860965, "flos": 13333155004800.0, "grad_norm": 1.7587442927129842, "language_loss": 0.82967865, "learning_rate": 2.6439775181757805e-06, "loss": 0.85078537, "num_input_tokens_seen": 147787440, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.40234375, "step": 6882, "time_per_iteration": 2.4494967460632324 }, { "auxiliary_loss_clip": 0.01064799, "auxiliary_loss_mlp": 0.01055403, "balance_loss_clip": 1.02222729, "balance_loss_mlp": 1.02079749, "epoch": 0.41382834811363295, "flos": 20812912439040.0, "grad_norm": 2.5343687883453825, "language_loss": 0.71863604, "learning_rate": 2.643608785656077e-06, "loss": 0.739838, "num_input_tokens_seen": 147805720, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.43945312, "step": 6883, "time_per_iteration": 2.466644763946533 }, { "auxiliary_loss_clip": 0.0106151, "auxiliary_loss_mlp": 0.01056083, "balance_loss_clip": 1.02682853, "balance_loss_mlp": 1.01956558, "epoch": 0.4138884713663009, "flos": 20666102705280.0, "grad_norm": 1.7736203822176544, "language_loss": 0.77130461, "learning_rate": 2.643240028730663e-06, "loss": 0.79248053, "num_input_tokens_seen": 147824605, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.41992188, "step": 6884, "time_per_iteration": 2.405782461166382 }, { "auxiliary_loss_clip": 0.01062786, "auxiliary_loss_mlp": 0.01045574, "balance_loss_clip": 1.01527083, "balance_loss_mlp": 1.01913142, "epoch": 0.4139485946189689, "flos": 29055417546240.0, "grad_norm": 1.5626390475427796, "language_loss": 0.76859319, "learning_rate": 2.642871247413523e-06, "loss": 0.78967673, "num_input_tokens_seen": 147845445, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.4375, "step": 6885, "time_per_iteration": 2.4410901069641113 }, { "auxiliary_loss_clip": 0.01062931, "auxiliary_loss_mlp": 0.01054969, "balance_loss_clip": 1.02440381, "balance_loss_mlp": 1.01955366, "epoch": 0.41400871787163684, "flos": 24424572654720.0, "grad_norm": 2.130401022305573, "language_loss": 0.71113849, "learning_rate": 2.6425024417186414e-06, "loss": 0.73231751, "num_input_tokens_seen": 147865580, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.43359375, "step": 6886, "time_per_iteration": 2.4358346462249756 }, { "auxiliary_loss_clip": 0.0106454, "auxiliary_loss_mlp": 0.01053808, "balance_loss_clip": 1.02202702, "balance_loss_mlp": 1.02086961, "epoch": 0.4140688411243048, "flos": 19463030565120.0, "grad_norm": 1.7176309536534093, "language_loss": 0.76261556, "learning_rate": 2.642133611660002e-06, "loss": 0.78379905, "num_input_tokens_seen": 147885230, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.4375, "step": 6887, "time_per_iteration": 2.454051971435547 }, { "auxiliary_loss_clip": 0.01063071, "auxiliary_loss_mlp": 0.01048414, "balance_loss_clip": 1.02041149, "balance_loss_mlp": 1.02013755, "epoch": 0.4141289643769728, "flos": 19312764606720.0, "grad_norm": 1.9058903326208385, "language_loss": 0.73120916, "learning_rate": 2.641764757251592e-06, "loss": 0.75232404, "num_input_tokens_seen": 147903035, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.4296875, "step": 6888, "time_per_iteration": 2.393204689025879 }, { "auxiliary_loss_clip": 0.01061387, "auxiliary_loss_mlp": 0.01046198, "balance_loss_clip": 1.01727819, "balance_loss_mlp": 1.01945519, "epoch": 0.41418908762964074, "flos": 16725979618560.0, "grad_norm": 1.9878321060371102, "language_loss": 0.77252376, "learning_rate": 2.6413958785073976e-06, "loss": 0.79359961, "num_input_tokens_seen": 147918745, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41992188, "step": 6889, "time_per_iteration": 2.3531088829040527 }, { "auxiliary_loss_clip": 0.01062163, "auxiliary_loss_mlp": 0.01047318, "balance_loss_clip": 1.01714659, "balance_loss_mlp": 1.0203712, "epoch": 0.41424921088230876, "flos": 25295795521920.0, "grad_norm": 3.593537180988896, "language_loss": 0.80528754, "learning_rate": 2.6410269754414074e-06, "loss": 0.82638234, "num_input_tokens_seen": 147938265, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.41796875, "step": 6890, "time_per_iteration": 2.451902151107788 }, { "auxiliary_loss_clip": 0.01063141, "auxiliary_loss_mlp": 0.01047521, "balance_loss_clip": 1.01693201, "balance_loss_mlp": 1.02172852, "epoch": 0.4143093341349767, "flos": 20959442881920.0, "grad_norm": 2.0975203473686324, "language_loss": 0.75383955, "learning_rate": 2.6406580480676113e-06, "loss": 0.77494615, "num_input_tokens_seen": 147957320, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.4140625, "step": 6891, "time_per_iteration": 2.3856749534606934 }, { "auxiliary_loss_clip": 0.01066628, "auxiliary_loss_mlp": 0.01055267, "balance_loss_clip": 1.02464175, "balance_loss_mlp": 1.02186, "epoch": 0.4143694573876447, "flos": 22016612983680.0, "grad_norm": 1.8553218036592682, "language_loss": 0.85299969, "learning_rate": 2.6402890963999963e-06, "loss": 0.87421858, "num_input_tokens_seen": 147977045, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.44726562, "step": 6892, "time_per_iteration": 2.4220194816589355 }, { "auxiliary_loss_clip": 0.01063601, "auxiliary_loss_mlp": 0.01041785, "balance_loss_clip": 1.01368725, "balance_loss_mlp": 1.02252388, "epoch": 0.41442958064031266, "flos": 35696002608000.0, "grad_norm": 1.4790262044469062, "language_loss": 0.71161705, "learning_rate": 2.6399201204525554e-06, "loss": 0.7326709, "num_input_tokens_seen": 147996905, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41015625, "step": 6893, "time_per_iteration": 2.5084493160247803 }, { "auxiliary_loss_clip": 0.0106662, "auxiliary_loss_mlp": 0.01045534, "balance_loss_clip": 1.01521873, "balance_loss_mlp": 1.02327943, "epoch": 0.4144897038929806, "flos": 28292495316480.0, "grad_norm": 1.3831329945201825, "language_loss": 0.73698288, "learning_rate": 2.639551120239279e-06, "loss": 0.75810438, "num_input_tokens_seen": 148017875, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.43359375, "step": 6894, "time_per_iteration": 2.4806461334228516 }, { "auxiliary_loss_clip": 0.01065729, "auxiliary_loss_mlp": 0.01046596, "balance_loss_clip": 1.01762772, "balance_loss_mlp": 1.02226341, "epoch": 0.4145498271456486, "flos": 11647513785600.0, "grad_norm": 2.839650379137775, "language_loss": 0.64590549, "learning_rate": 2.63918209577416e-06, "loss": 0.66702873, "num_input_tokens_seen": 148032300, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.43554688, "step": 6895, "time_per_iteration": 2.341264009475708 }, { "auxiliary_loss_clip": 0.01064825, "auxiliary_loss_mlp": 0.01046264, "balance_loss_clip": 1.01692605, "balance_loss_mlp": 1.02164841, "epoch": 0.41460995039831655, "flos": 27234382608000.0, "grad_norm": 1.4937804998986777, "language_loss": 0.71696943, "learning_rate": 2.638813047071192e-06, "loss": 0.73808032, "num_input_tokens_seen": 148053260, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.43164062, "step": 6896, "time_per_iteration": 2.4765000343322754 }, { "auxiliary_loss_clip": 0.01064947, "auxiliary_loss_mlp": 0.0105638, "balance_loss_clip": 1.02395523, "balance_loss_mlp": 1.0217123, "epoch": 0.4146700736509845, "flos": 25921159528320.0, "grad_norm": 1.7229300115172874, "language_loss": 0.74450654, "learning_rate": 2.6384439741443696e-06, "loss": 0.76571977, "num_input_tokens_seen": 148072965, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.43164062, "step": 6897, "time_per_iteration": 2.437869071960449 }, { "auxiliary_loss_clip": 0.01064097, "auxiliary_loss_mlp": 0.01056546, "balance_loss_clip": 1.02778089, "balance_loss_mlp": 1.02117944, "epoch": 0.4147301969036525, "flos": 26832043566720.0, "grad_norm": 1.4476523972964352, "language_loss": 0.85262531, "learning_rate": 2.6380748770076873e-06, "loss": 0.87383175, "num_input_tokens_seen": 148093240, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.4296875, "step": 6898, "time_per_iteration": 2.4773736000061035 }, { "auxiliary_loss_clip": 0.01064409, "auxiliary_loss_mlp": 0.01049593, "balance_loss_clip": 1.02010036, "balance_loss_mlp": 1.02067161, "epoch": 0.41479032015632045, "flos": 20297385169920.0, "grad_norm": 1.6688845313277039, "language_loss": 0.76346159, "learning_rate": 2.6377057556751416e-06, "loss": 0.78460163, "num_input_tokens_seen": 148110925, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.4375, "step": 6899, "time_per_iteration": 2.3939030170440674 }, { "auxiliary_loss_clip": 0.01068182, "auxiliary_loss_mlp": 0.01050495, "balance_loss_clip": 1.01914322, "balance_loss_mlp": 1.02233624, "epoch": 0.4148504434089884, "flos": 25263814849920.0, "grad_norm": 1.6575461117754335, "language_loss": 0.77418244, "learning_rate": 2.6373366101607306e-06, "loss": 0.79536927, "num_input_tokens_seen": 148130670, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.45898438, "step": 6900, "time_per_iteration": 3.9573097229003906 }, { "auxiliary_loss_clip": 0.01065814, "auxiliary_loss_mlp": 0.01053394, "balance_loss_clip": 1.02145743, "balance_loss_mlp": 1.0222249, "epoch": 0.4149105666616564, "flos": 12821502896640.0, "grad_norm": 1.967445832138476, "language_loss": 0.81893873, "learning_rate": 2.6369674404784503e-06, "loss": 0.84013075, "num_input_tokens_seen": 148148350, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.43554688, "step": 6901, "time_per_iteration": 3.776214122772217 }, { "auxiliary_loss_clip": 0.01061042, "auxiliary_loss_mlp": 0.01043188, "balance_loss_clip": 1.01549554, "balance_loss_mlp": 1.01982188, "epoch": 0.41497068991432434, "flos": 16762952615040.0, "grad_norm": 1.9030436346250224, "language_loss": 0.71318185, "learning_rate": 2.6365982466423014e-06, "loss": 0.73422414, "num_input_tokens_seen": 148167550, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.4140625, "step": 6902, "time_per_iteration": 3.7447848320007324 }, { "auxiliary_loss_clip": 0.01060343, "auxiliary_loss_mlp": 0.01049146, "balance_loss_clip": 1.02240729, "balance_loss_mlp": 1.01966166, "epoch": 0.4150308131669923, "flos": 18000030286080.0, "grad_norm": 1.6150654586690232, "language_loss": 0.8517645, "learning_rate": 2.6362290286662834e-06, "loss": 0.87285936, "num_input_tokens_seen": 148184740, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.40625, "step": 6903, "time_per_iteration": 2.3958587646484375 }, { "auxiliary_loss_clip": 0.01065899, "auxiliary_loss_mlp": 0.01055967, "balance_loss_clip": 1.02244508, "balance_loss_mlp": 1.02011108, "epoch": 0.41509093641966033, "flos": 30043459422720.0, "grad_norm": 1.7973931327707913, "language_loss": 0.69822544, "learning_rate": 2.6358597865643968e-06, "loss": 0.71944404, "num_input_tokens_seen": 148204605, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.45703125, "step": 6904, "time_per_iteration": 2.4511280059814453 }, { "auxiliary_loss_clip": 0.01063123, "auxiliary_loss_mlp": 0.01047007, "balance_loss_clip": 1.0169543, "balance_loss_mlp": 1.01959109, "epoch": 0.4151510596723283, "flos": 24278845173120.0, "grad_norm": 1.5892736519392996, "language_loss": 0.79032946, "learning_rate": 2.635490520350643e-06, "loss": 0.81143069, "num_input_tokens_seen": 148224675, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.43554688, "step": 6905, "time_per_iteration": 3.8604774475097656 }, { "auxiliary_loss_clip": 0.01061465, "auxiliary_loss_mlp": 0.01044464, "balance_loss_clip": 1.01404142, "balance_loss_mlp": 1.01880229, "epoch": 0.41521118292499626, "flos": 23475109697280.0, "grad_norm": 1.5300259607081832, "language_loss": 0.70135194, "learning_rate": 2.635121230039025e-06, "loss": 0.72241127, "num_input_tokens_seen": 148243375, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.42773438, "step": 6906, "time_per_iteration": 2.4253618717193604 }, { "auxiliary_loss_clip": 0.01059864, "auxiliary_loss_mlp": 0.01041654, "balance_loss_clip": 1.01489186, "balance_loss_mlp": 1.01840711, "epoch": 0.4152713061776642, "flos": 22124459773440.0, "grad_norm": 2.0702398993043536, "language_loss": 0.68991339, "learning_rate": 2.6347519156435467e-06, "loss": 0.71092856, "num_input_tokens_seen": 148261140, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.4140625, "step": 6907, "time_per_iteration": 2.376469135284424 }, { "auxiliary_loss_clip": 0.01062518, "auxiliary_loss_mlp": 0.01047907, "balance_loss_clip": 1.01918983, "balance_loss_mlp": 1.019719, "epoch": 0.4153314294303322, "flos": 21250339263360.0, "grad_norm": 1.8350866762654374, "language_loss": 0.78569794, "learning_rate": 2.6343825771782123e-06, "loss": 0.80680221, "num_input_tokens_seen": 148279655, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.42773438, "step": 6908, "time_per_iteration": 2.4065663814544678 }, { "auxiliary_loss_clip": 0.01018276, "auxiliary_loss_mlp": 0.01005428, "balance_loss_clip": 1.00182796, "balance_loss_mlp": 1.00955594, "epoch": 0.41539155268300015, "flos": 57917554606080.0, "grad_norm": 0.7566920961874242, "language_loss": 0.64952564, "learning_rate": 2.634013214657026e-06, "loss": 0.66976261, "num_input_tokens_seen": 148339005, "router_z_loss_clip": 0.03588867, "router_z_loss_mlp": 0.08691406, "step": 6909, "time_per_iteration": 2.9775915145874023 }, { "auxiliary_loss_clip": 0.0106231, "auxiliary_loss_mlp": 0.01049493, "balance_loss_clip": 1.01938128, "balance_loss_mlp": 1.0195744, "epoch": 0.4154516759356681, "flos": 21902726528640.0, "grad_norm": 1.420516524311748, "language_loss": 0.87988496, "learning_rate": 2.633643828093996e-06, "loss": 0.90100306, "num_input_tokens_seen": 148358715, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.42773438, "step": 6910, "time_per_iteration": 2.4264187812805176 }, { "auxiliary_loss_clip": 0.01019923, "auxiliary_loss_mlp": 0.01006107, "balance_loss_clip": 1.00243545, "balance_loss_mlp": 1.01099062, "epoch": 0.4155117991883361, "flos": 67830584313600.0, "grad_norm": 0.7976033667776528, "language_loss": 0.62121093, "learning_rate": 2.633274417503128e-06, "loss": 0.64147127, "num_input_tokens_seen": 148417280, "router_z_loss_clip": 0.03662109, "router_z_loss_mlp": 0.08935547, "step": 6911, "time_per_iteration": 3.013174533843994 }, { "auxiliary_loss_clip": 0.01067989, "auxiliary_loss_mlp": 0.01049597, "balance_loss_clip": 1.01707649, "balance_loss_mlp": 1.02172911, "epoch": 0.41557192244100405, "flos": 14281815000960.0, "grad_norm": 2.241779656589657, "language_loss": 0.89365095, "learning_rate": 2.6329049828984312e-06, "loss": 0.91482687, "num_input_tokens_seen": 148432610, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.4609375, "step": 6912, "time_per_iteration": 2.3594298362731934 }, { "auxiliary_loss_clip": 0.01063133, "auxiliary_loss_mlp": 0.01044083, "balance_loss_clip": 1.01747584, "balance_loss_mlp": 1.02122688, "epoch": 0.415632045693672, "flos": 24460812512640.0, "grad_norm": 2.237092431997439, "language_loss": 0.64928246, "learning_rate": 2.632535524293914e-06, "loss": 0.6703546, "num_input_tokens_seen": 148451510, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.41992188, "step": 6913, "time_per_iteration": 2.402801752090454 }, { "auxiliary_loss_clip": 0.01061316, "auxiliary_loss_mlp": 0.01050074, "balance_loss_clip": 1.02358592, "balance_loss_mlp": 1.02000427, "epoch": 0.41569216894634, "flos": 20114405400960.0, "grad_norm": 1.7335303559870685, "language_loss": 0.75827253, "learning_rate": 2.632166041703586e-06, "loss": 0.7793864, "num_input_tokens_seen": 148469945, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.41210938, "step": 6914, "time_per_iteration": 2.404083490371704 }, { "auxiliary_loss_clip": 0.01063047, "auxiliary_loss_mlp": 0.01049494, "balance_loss_clip": 1.01549566, "balance_loss_mlp": 1.01926279, "epoch": 0.41575229219900794, "flos": 23797882016640.0, "grad_norm": 1.7162487386587615, "language_loss": 0.89432567, "learning_rate": 2.631796535141458e-06, "loss": 0.91545105, "num_input_tokens_seen": 148486655, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.4375, "step": 6915, "time_per_iteration": 2.3877437114715576 }, { "auxiliary_loss_clip": 0.01063824, "auxiliary_loss_mlp": 0.01045794, "balance_loss_clip": 1.01736271, "balance_loss_mlp": 1.02136683, "epoch": 0.4158124154516759, "flos": 23107230034560.0, "grad_norm": 2.538602610590582, "language_loss": 0.72488058, "learning_rate": 2.6314270046215426e-06, "loss": 0.74597669, "num_input_tokens_seen": 148505035, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.42382812, "step": 6916, "time_per_iteration": 2.423398971557617 }, { "auxiliary_loss_clip": 0.0106489, "auxiliary_loss_mlp": 0.01050041, "balance_loss_clip": 1.01854563, "balance_loss_mlp": 1.02012575, "epoch": 0.41587253870434393, "flos": 24241837265280.0, "grad_norm": 1.3628526943838877, "language_loss": 0.7326014, "learning_rate": 2.631057450157852e-06, "loss": 0.75375074, "num_input_tokens_seen": 148525575, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.44726562, "step": 6917, "time_per_iteration": 2.4208035469055176 }, { "auxiliary_loss_clip": 0.01061702, "auxiliary_loss_mlp": 0.01045546, "balance_loss_clip": 1.01682878, "balance_loss_mlp": 1.01900792, "epoch": 0.4159326619570119, "flos": 23880381292800.0, "grad_norm": 1.7303363505996443, "language_loss": 0.82210159, "learning_rate": 2.6306878717643988e-06, "loss": 0.8431741, "num_input_tokens_seen": 148547270, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.42578125, "step": 6918, "time_per_iteration": 2.453056812286377 }, { "auxiliary_loss_clip": 0.01066038, "auxiliary_loss_mlp": 0.01048787, "balance_loss_clip": 1.01729238, "balance_loss_mlp": 1.02257776, "epoch": 0.41599278520967986, "flos": 40624900709760.0, "grad_norm": 1.3783153946509965, "language_loss": 0.71293807, "learning_rate": 2.6303182694551995e-06, "loss": 0.73408639, "num_input_tokens_seen": 148572100, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.43359375, "step": 6919, "time_per_iteration": 2.562023401260376 }, { "auxiliary_loss_clip": 0.0106526, "auxiliary_loss_mlp": 0.01045984, "balance_loss_clip": 1.01322508, "balance_loss_mlp": 1.0214045, "epoch": 0.4160529084623478, "flos": 18221972999040.0, "grad_norm": 1.8095487424691885, "language_loss": 0.83189595, "learning_rate": 2.6299486432442677e-06, "loss": 0.85300839, "num_input_tokens_seen": 148591245, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.4375, "step": 6920, "time_per_iteration": 2.388762950897217 }, { "auxiliary_loss_clip": 0.01063993, "auxiliary_loss_mlp": 0.01047981, "balance_loss_clip": 1.01546121, "balance_loss_mlp": 1.01981688, "epoch": 0.4161130317150158, "flos": 13661129116800.0, "grad_norm": 1.9646291772722262, "language_loss": 0.67244309, "learning_rate": 2.6295789931456195e-06, "loss": 0.69356287, "num_input_tokens_seen": 148607980, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.44140625, "step": 6921, "time_per_iteration": 2.352733612060547 }, { "auxiliary_loss_clip": 0.01063074, "auxiliary_loss_mlp": 0.01048716, "balance_loss_clip": 1.01940227, "balance_loss_mlp": 1.01996231, "epoch": 0.41617315496768376, "flos": 16177633804800.0, "grad_norm": 1.8839636030783153, "language_loss": 0.82285655, "learning_rate": 2.629209319173274e-06, "loss": 0.84397447, "num_input_tokens_seen": 148624490, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.43164062, "step": 6922, "time_per_iteration": 2.3805439472198486 }, { "auxiliary_loss_clip": 0.01063574, "auxiliary_loss_mlp": 0.01046771, "balance_loss_clip": 1.01491857, "balance_loss_mlp": 1.01946306, "epoch": 0.4162332782203517, "flos": 26212125732480.0, "grad_norm": 1.4953594480890096, "language_loss": 0.68678361, "learning_rate": 2.628839621341247e-06, "loss": 0.70788705, "num_input_tokens_seen": 148646490, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.44140625, "step": 6923, "time_per_iteration": 2.428374767303467 }, { "auxiliary_loss_clip": 0.01064066, "auxiliary_loss_mlp": 0.0105478, "balance_loss_clip": 1.0230701, "balance_loss_mlp": 1.02075958, "epoch": 0.4162934014730197, "flos": 28182728401920.0, "grad_norm": 2.0125392768408967, "language_loss": 0.77214724, "learning_rate": 2.6284698996635593e-06, "loss": 0.79333568, "num_input_tokens_seen": 148668580, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.43359375, "step": 6924, "time_per_iteration": 2.5279664993286133 }, { "auxiliary_loss_clip": 0.01063304, "auxiliary_loss_mlp": 0.01046227, "balance_loss_clip": 1.01766431, "balance_loss_mlp": 1.02008808, "epoch": 0.41635352472568765, "flos": 19864287354240.0, "grad_norm": 1.8552226924705366, "language_loss": 0.75004172, "learning_rate": 2.62810015415423e-06, "loss": 0.77113712, "num_input_tokens_seen": 148688410, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.43359375, "step": 6925, "time_per_iteration": 2.4691524505615234 }, { "auxiliary_loss_clip": 0.01060736, "auxiliary_loss_mlp": 0.01043972, "balance_loss_clip": 1.01579046, "balance_loss_mlp": 1.01885724, "epoch": 0.4164136479783556, "flos": 14934586291200.0, "grad_norm": 1.8266046498928117, "language_loss": 0.85557997, "learning_rate": 2.6277303848272792e-06, "loss": 0.87662703, "num_input_tokens_seen": 148704855, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41992188, "step": 6926, "time_per_iteration": 2.422821044921875 }, { "auxiliary_loss_clip": 0.0106049, "auxiliary_loss_mlp": 0.01039539, "balance_loss_clip": 1.01394486, "balance_loss_mlp": 1.01951051, "epoch": 0.4164737712310236, "flos": 21756649933440.0, "grad_norm": 1.7440688893500638, "language_loss": 0.87908185, "learning_rate": 2.6273605916967302e-06, "loss": 0.90008211, "num_input_tokens_seen": 148723065, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.41015625, "step": 6927, "time_per_iteration": 2.374929189682007 }, { "auxiliary_loss_clip": 0.01061158, "auxiliary_loss_mlp": 0.01044341, "balance_loss_clip": 1.01509917, "balance_loss_mlp": 1.01925635, "epoch": 0.41653389448369155, "flos": 20739106091520.0, "grad_norm": 2.2338879109268674, "language_loss": 0.74933928, "learning_rate": 2.626990774776604e-06, "loss": 0.77039433, "num_input_tokens_seen": 148741780, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.41992188, "step": 6928, "time_per_iteration": 2.399108648300171 }, { "auxiliary_loss_clip": 0.01058716, "auxiliary_loss_mlp": 0.01044189, "balance_loss_clip": 1.01656818, "balance_loss_mlp": 1.01741755, "epoch": 0.4165940177363595, "flos": 24971731482240.0, "grad_norm": 1.7947413159961685, "language_loss": 0.79321742, "learning_rate": 2.6266209340809254e-06, "loss": 0.81424642, "num_input_tokens_seen": 148759795, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.4140625, "step": 6929, "time_per_iteration": 2.397709846496582 }, { "auxiliary_loss_clip": 0.01061636, "auxiliary_loss_mlp": 0.01045714, "balance_loss_clip": 1.01659131, "balance_loss_mlp": 1.01907539, "epoch": 0.41665414098902753, "flos": 20520689425920.0, "grad_norm": 1.8611203523020974, "language_loss": 0.72217715, "learning_rate": 2.6262510696237182e-06, "loss": 0.74325061, "num_input_tokens_seen": 148778680, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.42578125, "step": 6930, "time_per_iteration": 2.4103097915649414 }, { "auxiliary_loss_clip": 0.01061718, "auxiliary_loss_mlp": 0.01046481, "balance_loss_clip": 1.01878905, "balance_loss_mlp": 1.01916623, "epoch": 0.4167142642416955, "flos": 19681901078400.0, "grad_norm": 1.5894622593943415, "language_loss": 0.82529783, "learning_rate": 2.625881181419007e-06, "loss": 0.84637982, "num_input_tokens_seen": 148796470, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.42578125, "step": 6931, "time_per_iteration": 2.4070539474487305 }, { "auxiliary_loss_clip": 0.01059731, "auxiliary_loss_mlp": 0.01045924, "balance_loss_clip": 1.01737344, "balance_loss_mlp": 1.0183028, "epoch": 0.41677438749436346, "flos": 23762759322240.0, "grad_norm": 1.6383696271321837, "language_loss": 0.80206114, "learning_rate": 2.6255112694808193e-06, "loss": 0.82311767, "num_input_tokens_seen": 148815300, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4140625, "step": 6932, "time_per_iteration": 2.3880603313446045 }, { "auxiliary_loss_clip": 0.01062431, "auxiliary_loss_mlp": 0.01046766, "balance_loss_clip": 1.01959801, "balance_loss_mlp": 1.02058983, "epoch": 0.41683451074703143, "flos": 30408720733440.0, "grad_norm": 1.7604895218220933, "language_loss": 0.84176856, "learning_rate": 2.6251413338231813e-06, "loss": 0.8628605, "num_input_tokens_seen": 148834315, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.41796875, "step": 6933, "time_per_iteration": 2.470787525177002 }, { "auxiliary_loss_clip": 0.0106387, "auxiliary_loss_mlp": 0.01045791, "balance_loss_clip": 1.01476097, "balance_loss_mlp": 1.02015567, "epoch": 0.4168946339996994, "flos": 21505694014080.0, "grad_norm": 1.8222445909321046, "language_loss": 0.78148651, "learning_rate": 2.624771374460121e-06, "loss": 0.8025831, "num_input_tokens_seen": 148852420, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.4375, "step": 6934, "time_per_iteration": 2.412191867828369 }, { "auxiliary_loss_clip": 0.01063345, "auxiliary_loss_mlp": 0.01042493, "balance_loss_clip": 1.01409721, "balance_loss_mlp": 1.02156067, "epoch": 0.41695475725236736, "flos": 17637736440960.0, "grad_norm": 1.7425597720308887, "language_loss": 0.68579435, "learning_rate": 2.624401391405668e-06, "loss": 0.70685267, "num_input_tokens_seen": 148869305, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.41796875, "step": 6935, "time_per_iteration": 2.4219818115234375 }, { "auxiliary_loss_clip": 0.0106253, "auxiliary_loss_mlp": 0.01045529, "balance_loss_clip": 1.01794374, "balance_loss_mlp": 1.02041507, "epoch": 0.4170148805050353, "flos": 15668006555520.0, "grad_norm": 2.1053048073310716, "language_loss": 0.76210892, "learning_rate": 2.6240313846738513e-06, "loss": 0.78318954, "num_input_tokens_seen": 148886395, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.421875, "step": 6936, "time_per_iteration": 2.3394296169281006 }, { "auxiliary_loss_clip": 0.01061098, "auxiliary_loss_mlp": 0.01041147, "balance_loss_clip": 1.01426506, "balance_loss_mlp": 1.01969647, "epoch": 0.4170750037577033, "flos": 15158239660800.0, "grad_norm": 1.8511482196411164, "language_loss": 0.76561791, "learning_rate": 2.6236613542787024e-06, "loss": 0.78664041, "num_input_tokens_seen": 148905235, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.4140625, "step": 6937, "time_per_iteration": 2.426276206970215 }, { "auxiliary_loss_clip": 0.01061107, "auxiliary_loss_mlp": 0.01045111, "balance_loss_clip": 1.01754975, "balance_loss_mlp": 1.01999855, "epoch": 0.41713512701037125, "flos": 28766999871360.0, "grad_norm": 1.4437855912180177, "language_loss": 0.85099089, "learning_rate": 2.6232913002342518e-06, "loss": 0.87205309, "num_input_tokens_seen": 148928130, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41015625, "step": 6938, "time_per_iteration": 2.5360188484191895 }, { "auxiliary_loss_clip": 0.01063655, "auxiliary_loss_mlp": 0.01043706, "balance_loss_clip": 1.01615691, "balance_loss_mlp": 1.02024555, "epoch": 0.4171952502630392, "flos": 28255731788160.0, "grad_norm": 1.7864956405659764, "language_loss": 0.75179386, "learning_rate": 2.6229212225545334e-06, "loss": 0.77286744, "num_input_tokens_seen": 148948790, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.43554688, "step": 6939, "time_per_iteration": 2.5022788047790527 }, { "auxiliary_loss_clip": 0.01062433, "auxiliary_loss_mlp": 0.01046449, "balance_loss_clip": 1.0188756, "balance_loss_mlp": 1.01933146, "epoch": 0.4172553735157072, "flos": 24570544515840.0, "grad_norm": 1.4663211926028332, "language_loss": 0.7575112, "learning_rate": 2.622551121253579e-06, "loss": 0.77859998, "num_input_tokens_seen": 148967690, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.43164062, "step": 6940, "time_per_iteration": 3.8669564723968506 }, { "auxiliary_loss_clip": 0.01061931, "auxiliary_loss_mlp": 0.01047962, "balance_loss_clip": 1.01991189, "balance_loss_mlp": 1.02009702, "epoch": 0.41731549676837515, "flos": 27044769680640.0, "grad_norm": 1.7656908593088827, "language_loss": 0.72513461, "learning_rate": 2.622180996345424e-06, "loss": 0.74623358, "num_input_tokens_seen": 148987150, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41796875, "step": 6941, "time_per_iteration": 5.289955377578735 }, { "auxiliary_loss_clip": 0.01065504, "auxiliary_loss_mlp": 0.01052722, "balance_loss_clip": 1.02200174, "balance_loss_mlp": 1.02100301, "epoch": 0.4173756200210431, "flos": 28393045632000.0, "grad_norm": 1.8598881220408374, "language_loss": 0.74919146, "learning_rate": 2.621810847844104e-06, "loss": 0.77037382, "num_input_tokens_seen": 149004895, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.4453125, "step": 6942, "time_per_iteration": 2.423283815383911 }, { "auxiliary_loss_clip": 0.01067122, "auxiliary_loss_mlp": 0.01053558, "balance_loss_clip": 1.02301657, "balance_loss_mlp": 1.02170777, "epoch": 0.41743574327371114, "flos": 22520654415360.0, "grad_norm": 2.021604697133654, "language_loss": 0.73830318, "learning_rate": 2.6214406757636534e-06, "loss": 0.75950998, "num_input_tokens_seen": 149020970, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.453125, "step": 6943, "time_per_iteration": 2.4139246940612793 }, { "auxiliary_loss_clip": 0.01065325, "auxiliary_loss_mlp": 0.01051217, "balance_loss_clip": 1.0206635, "balance_loss_mlp": 1.020648, "epoch": 0.4174958665263791, "flos": 30112238534400.0, "grad_norm": 1.7742853564056418, "language_loss": 0.6469385, "learning_rate": 2.621070480118111e-06, "loss": 0.66810393, "num_input_tokens_seen": 149041795, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.44726562, "step": 6944, "time_per_iteration": 3.8718364238739014 }, { "auxiliary_loss_clip": 0.01064069, "auxiliary_loss_mlp": 0.01044175, "balance_loss_clip": 1.01595855, "balance_loss_mlp": 1.0206126, "epoch": 0.41755598977904707, "flos": 25262313661440.0, "grad_norm": 1.5217779472690336, "language_loss": 0.71582866, "learning_rate": 2.620700260921513e-06, "loss": 0.73691112, "num_input_tokens_seen": 149063700, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.43359375, "step": 6945, "time_per_iteration": 2.458113670349121 }, { "auxiliary_loss_clip": 0.01060709, "auxiliary_loss_mlp": 0.01051669, "balance_loss_clip": 1.02232003, "balance_loss_mlp": 1.01874399, "epoch": 0.41761611303171503, "flos": 19827558737280.0, "grad_norm": 1.705515089135438, "language_loss": 0.82492703, "learning_rate": 2.620330018187899e-06, "loss": 0.8460508, "num_input_tokens_seen": 149082410, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.41796875, "step": 6946, "time_per_iteration": 2.3652594089508057 }, { "auxiliary_loss_clip": 0.01062765, "auxiliary_loss_mlp": 0.01045743, "balance_loss_clip": 1.01797891, "balance_loss_mlp": 1.01946449, "epoch": 0.417676236284383, "flos": 15522348896640.0, "grad_norm": 2.0339594832603334, "language_loss": 0.78808093, "learning_rate": 2.6199597519313086e-06, "loss": 0.80916607, "num_input_tokens_seen": 149098745, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.43359375, "step": 6947, "time_per_iteration": 2.3816962242126465 }, { "auxiliary_loss_clip": 0.01064254, "auxiliary_loss_mlp": 0.01047978, "balance_loss_clip": 1.01877129, "balance_loss_mlp": 1.01970959, "epoch": 0.41773635953705096, "flos": 32523130759680.0, "grad_norm": 1.5640097634199697, "language_loss": 0.72925097, "learning_rate": 2.6195894621657825e-06, "loss": 0.75037324, "num_input_tokens_seen": 149122255, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4453125, "step": 6948, "time_per_iteration": 2.466956615447998 }, { "auxiliary_loss_clip": 0.01059232, "auxiliary_loss_mlp": 0.01044452, "balance_loss_clip": 1.0160203, "balance_loss_mlp": 1.01813805, "epoch": 0.4177964827897189, "flos": 23439812446080.0, "grad_norm": 1.5117055401685704, "language_loss": 0.7749927, "learning_rate": 2.619219148905362e-06, "loss": 0.79602957, "num_input_tokens_seen": 149142845, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41015625, "step": 6949, "time_per_iteration": 2.481131076812744 }, { "auxiliary_loss_clip": 0.01065132, "auxiliary_loss_mlp": 0.01052354, "balance_loss_clip": 1.02165782, "balance_loss_mlp": 1.02054036, "epoch": 0.4178566060423869, "flos": 22747764009600.0, "grad_norm": 1.547902210265985, "language_loss": 0.82839572, "learning_rate": 2.6188488121640888e-06, "loss": 0.84957051, "num_input_tokens_seen": 149163375, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.4453125, "step": 6950, "time_per_iteration": 2.3883023262023926 }, { "auxiliary_loss_clip": 0.01059505, "auxiliary_loss_mlp": 0.01041451, "balance_loss_clip": 1.01529646, "balance_loss_mlp": 1.0192802, "epoch": 0.41791672929505486, "flos": 26031554847360.0, "grad_norm": 1.3379075853303293, "language_loss": 0.77186656, "learning_rate": 2.618478451956007e-06, "loss": 0.79287612, "num_input_tokens_seen": 149185610, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40234375, "step": 6951, "time_per_iteration": 2.4552152156829834 }, { "auxiliary_loss_clip": 0.01064416, "auxiliary_loss_mlp": 0.0104673, "balance_loss_clip": 1.01685572, "balance_loss_mlp": 1.01920116, "epoch": 0.4179768525477228, "flos": 19567805155200.0, "grad_norm": 1.6856666494322161, "language_loss": 0.74500126, "learning_rate": 2.61810806829516e-06, "loss": 0.76611274, "num_input_tokens_seen": 149203990, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.45117188, "step": 6952, "time_per_iteration": 2.3558571338653564 }, { "auxiliary_loss_clip": 0.01063492, "auxiliary_loss_mlp": 0.01048452, "balance_loss_clip": 1.01934099, "balance_loss_mlp": 1.02009487, "epoch": 0.4180369758003908, "flos": 17782905340800.0, "grad_norm": 2.2522734516358227, "language_loss": 0.741467, "learning_rate": 2.617737661195593e-06, "loss": 0.76258636, "num_input_tokens_seen": 149221385, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.43359375, "step": 6953, "time_per_iteration": 2.3688621520996094 }, { "auxiliary_loss_clip": 0.01061203, "auxiliary_loss_mlp": 0.01045158, "balance_loss_clip": 1.0147357, "balance_loss_mlp": 1.02012384, "epoch": 0.41809709905305875, "flos": 20959582527360.0, "grad_norm": 1.6108852155326008, "language_loss": 0.77560413, "learning_rate": 2.617367230671353e-06, "loss": 0.7966677, "num_input_tokens_seen": 149241175, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.41015625, "step": 6954, "time_per_iteration": 2.3741347789764404 }, { "auxiliary_loss_clip": 0.01062678, "auxiliary_loss_mlp": 0.0105148, "balance_loss_clip": 1.02145123, "balance_loss_mlp": 1.02016246, "epoch": 0.4181572223057267, "flos": 22016543160960.0, "grad_norm": 2.0901331358541184, "language_loss": 0.85908985, "learning_rate": 2.616996776736485e-06, "loss": 0.8802315, "num_input_tokens_seen": 149259115, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.42382812, "step": 6955, "time_per_iteration": 2.4504058361053467 }, { "auxiliary_loss_clip": 0.01060306, "auxiliary_loss_mlp": 0.01039041, "balance_loss_clip": 1.01175404, "balance_loss_mlp": 1.01913953, "epoch": 0.4182173455583947, "flos": 26244455518080.0, "grad_norm": 1.5504858464025972, "language_loss": 0.84279883, "learning_rate": 2.616626299405037e-06, "loss": 0.8637923, "num_input_tokens_seen": 149278705, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41210938, "step": 6956, "time_per_iteration": 2.4231624603271484 }, { "auxiliary_loss_clip": 0.0106526, "auxiliary_loss_mlp": 0.01052808, "balance_loss_clip": 1.02305305, "balance_loss_mlp": 1.02148998, "epoch": 0.4182774688110627, "flos": 14790778934400.0, "grad_norm": 2.109289019397376, "language_loss": 0.7336092, "learning_rate": 2.616255798691059e-06, "loss": 0.75478989, "num_input_tokens_seen": 149294040, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4375, "step": 6957, "time_per_iteration": 2.4056179523468018 }, { "auxiliary_loss_clip": 0.01064112, "auxiliary_loss_mlp": 0.01049293, "balance_loss_clip": 1.02107596, "balance_loss_mlp": 1.02120018, "epoch": 0.41833759206373067, "flos": 20410992334080.0, "grad_norm": 2.100268717896933, "language_loss": 0.77243382, "learning_rate": 2.6158852746085982e-06, "loss": 0.79356784, "num_input_tokens_seen": 149310385, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.4296875, "step": 6958, "time_per_iteration": 2.3812100887298584 }, { "auxiliary_loss_clip": 0.01063659, "auxiliary_loss_mlp": 0.01039916, "balance_loss_clip": 1.01243806, "balance_loss_mlp": 1.02061176, "epoch": 0.41839771531639863, "flos": 23655296557440.0, "grad_norm": 1.855385230747985, "language_loss": 0.78230143, "learning_rate": 2.6155147271717066e-06, "loss": 0.80333722, "num_input_tokens_seen": 149328235, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.4296875, "step": 6959, "time_per_iteration": 2.4208006858825684 }, { "auxiliary_loss_clip": 0.01063782, "auxiliary_loss_mlp": 0.01047854, "balance_loss_clip": 1.017169, "balance_loss_mlp": 1.0199157, "epoch": 0.4184578385690666, "flos": 19753158896640.0, "grad_norm": 1.8059121675539462, "language_loss": 0.77932799, "learning_rate": 2.6151441563944347e-06, "loss": 0.80044436, "num_input_tokens_seen": 149347465, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.43945312, "step": 6960, "time_per_iteration": 2.370515823364258 }, { "auxiliary_loss_clip": 0.01060812, "auxiliary_loss_mlp": 0.01034037, "balance_loss_clip": 1.00939608, "balance_loss_mlp": 1.02130902, "epoch": 0.41851796182173456, "flos": 20192366200320.0, "grad_norm": 2.0444204351759425, "language_loss": 0.77315831, "learning_rate": 2.614773562290835e-06, "loss": 0.79410684, "num_input_tokens_seen": 149366685, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.39453125, "step": 6961, "time_per_iteration": 2.5369179248809814 }, { "auxiliary_loss_clip": 0.01024771, "auxiliary_loss_mlp": 0.01001799, "balance_loss_clip": 0.99841332, "balance_loss_mlp": 1.01558852, "epoch": 0.41857808507440253, "flos": 59015537953920.0, "grad_norm": 0.772597990464078, "language_loss": 0.54675245, "learning_rate": 2.61440294487496e-06, "loss": 0.56701815, "num_input_tokens_seen": 149422925, "router_z_loss_clip": 0.03393555, "router_z_loss_mlp": 0.09179688, "step": 6962, "time_per_iteration": 2.9491982460021973 }, { "auxiliary_loss_clip": 0.01066381, "auxiliary_loss_mlp": 0.0104864, "balance_loss_clip": 1.01752651, "balance_loss_mlp": 1.02145314, "epoch": 0.4186382083270705, "flos": 18477816508800.0, "grad_norm": 1.8946214904360785, "language_loss": 0.87171221, "learning_rate": 2.614032304160864e-06, "loss": 0.89286244, "num_input_tokens_seen": 149440820, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.44921875, "step": 6963, "time_per_iteration": 2.4910085201263428 }, { "auxiliary_loss_clip": 0.01064487, "auxiliary_loss_mlp": 0.010464, "balance_loss_clip": 1.01912534, "balance_loss_mlp": 1.02204406, "epoch": 0.41869833157973846, "flos": 21577719882240.0, "grad_norm": 1.5286151617973818, "language_loss": 0.71507204, "learning_rate": 2.6136616401626014e-06, "loss": 0.73618096, "num_input_tokens_seen": 149461060, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.42382812, "step": 6964, "time_per_iteration": 2.3960442543029785 }, { "auxiliary_loss_clip": 0.01060647, "auxiliary_loss_mlp": 0.01046904, "balance_loss_clip": 1.01941478, "balance_loss_mlp": 1.02002192, "epoch": 0.4187584548324064, "flos": 35515955393280.0, "grad_norm": 1.8059326409105234, "language_loss": 0.72195196, "learning_rate": 2.6132909528942273e-06, "loss": 0.74302751, "num_input_tokens_seen": 149483115, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.40625, "step": 6965, "time_per_iteration": 2.568942070007324 }, { "auxiliary_loss_clip": 0.010587, "auxiliary_loss_mlp": 0.01044193, "balance_loss_clip": 1.01791954, "balance_loss_mlp": 1.01772296, "epoch": 0.4188185780850744, "flos": 18655035903360.0, "grad_norm": 1.494466307953777, "language_loss": 0.72715104, "learning_rate": 2.6129202423697997e-06, "loss": 0.74817991, "num_input_tokens_seen": 149501495, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.41015625, "step": 6966, "time_per_iteration": 2.4081456661224365 }, { "auxiliary_loss_clip": 0.01064488, "auxiliary_loss_mlp": 0.01050314, "balance_loss_clip": 1.02070272, "balance_loss_mlp": 1.01992154, "epoch": 0.41887870133774235, "flos": 40331839824000.0, "grad_norm": 2.3078952410206, "language_loss": 0.72610259, "learning_rate": 2.612549508603375e-06, "loss": 0.74725062, "num_input_tokens_seen": 149523170, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4453125, "step": 6967, "time_per_iteration": 2.591635227203369 }, { "auxiliary_loss_clip": 0.01011459, "auxiliary_loss_mlp": 0.01008652, "balance_loss_clip": 1.00526643, "balance_loss_mlp": 1.00294423, "epoch": 0.4189388245904103, "flos": 61368545278080.0, "grad_norm": 0.6851648707951838, "language_loss": 0.46343893, "learning_rate": 2.612178751609011e-06, "loss": 0.48364002, "num_input_tokens_seen": 149583955, "router_z_loss_clip": 0.03393555, "router_z_loss_mlp": 0.08496094, "step": 6968, "time_per_iteration": 3.0140902996063232 }, { "auxiliary_loss_clip": 0.01063515, "auxiliary_loss_mlp": 0.01048976, "balance_loss_clip": 1.01797009, "balance_loss_mlp": 1.01885319, "epoch": 0.4189989478430783, "flos": 28214499605760.0, "grad_norm": 1.5796098643700078, "language_loss": 0.76588833, "learning_rate": 2.6118079714007685e-06, "loss": 0.78701317, "num_input_tokens_seen": 149604440, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.44726562, "step": 6969, "time_per_iteration": 2.450751543045044 }, { "auxiliary_loss_clip": 0.01060708, "auxiliary_loss_mlp": 0.01051136, "balance_loss_clip": 1.02245462, "balance_loss_mlp": 1.01856804, "epoch": 0.4190590710957463, "flos": 24564888875520.0, "grad_norm": 1.4778441490816057, "language_loss": 0.81217384, "learning_rate": 2.611437167992705e-06, "loss": 0.83329225, "num_input_tokens_seen": 149623745, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.421875, "step": 6970, "time_per_iteration": 2.401111602783203 }, { "auxiliary_loss_clip": 0.01060219, "auxiliary_loss_mlp": 0.0105032, "balance_loss_clip": 1.02134085, "balance_loss_mlp": 1.01853585, "epoch": 0.41911919434841427, "flos": 21724948552320.0, "grad_norm": 2.3290168556379123, "language_loss": 0.84543395, "learning_rate": 2.6110663413988835e-06, "loss": 0.8665393, "num_input_tokens_seen": 149643025, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41601562, "step": 6971, "time_per_iteration": 2.409578800201416 }, { "auxiliary_loss_clip": 0.01062331, "auxiliary_loss_mlp": 0.01046496, "balance_loss_clip": 1.01779032, "balance_loss_mlp": 1.0206356, "epoch": 0.41917931760108224, "flos": 17600623799040.0, "grad_norm": 2.1112468824618604, "language_loss": 0.76417398, "learning_rate": 2.6106954916333648e-06, "loss": 0.78526223, "num_input_tokens_seen": 149660695, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.41796875, "step": 6972, "time_per_iteration": 2.3574726581573486 }, { "auxiliary_loss_clip": 0.01061725, "auxiliary_loss_mlp": 0.01047896, "balance_loss_clip": 1.02001309, "balance_loss_mlp": 1.01988935, "epoch": 0.4192394408537502, "flos": 37815160579200.0, "grad_norm": 1.5729468666016286, "language_loss": 0.7393887, "learning_rate": 2.610324618710212e-06, "loss": 0.76048493, "num_input_tokens_seen": 149682040, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41796875, "step": 6973, "time_per_iteration": 2.5703933238983154 }, { "auxiliary_loss_clip": 0.01068677, "auxiliary_loss_mlp": 0.01045491, "balance_loss_clip": 1.01607049, "balance_loss_mlp": 1.0227493, "epoch": 0.41929956410641817, "flos": 23106741275520.0, "grad_norm": 2.2760616564474137, "language_loss": 0.75783718, "learning_rate": 2.609953722643489e-06, "loss": 0.77897882, "num_input_tokens_seen": 149700855, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.45898438, "step": 6974, "time_per_iteration": 2.411020040512085 }, { "auxiliary_loss_clip": 0.01063611, "auxiliary_loss_mlp": 0.01045151, "balance_loss_clip": 1.01532531, "balance_loss_mlp": 1.02172327, "epoch": 0.41935968735908613, "flos": 22523552058240.0, "grad_norm": 1.6799829352751552, "language_loss": 0.7412554, "learning_rate": 2.609582803447259e-06, "loss": 0.76234305, "num_input_tokens_seen": 149717360, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.41796875, "step": 6975, "time_per_iteration": 2.4225268363952637 }, { "auxiliary_loss_clip": 0.01063206, "auxiliary_loss_mlp": 0.01049139, "balance_loss_clip": 1.02002764, "balance_loss_mlp": 1.02099538, "epoch": 0.4194198106117541, "flos": 26869226031360.0, "grad_norm": 1.4556015215150344, "language_loss": 0.81380975, "learning_rate": 2.6092118611355885e-06, "loss": 0.83493316, "num_input_tokens_seen": 149738975, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.421875, "step": 6976, "time_per_iteration": 2.5013701915740967 }, { "auxiliary_loss_clip": 0.01063651, "auxiliary_loss_mlp": 0.01045915, "balance_loss_clip": 1.01649404, "balance_loss_mlp": 1.02036476, "epoch": 0.41947993386442206, "flos": 19901365084800.0, "grad_norm": 1.8857980186826795, "language_loss": 0.70160383, "learning_rate": 2.6088408957225425e-06, "loss": 0.72269952, "num_input_tokens_seen": 149757055, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.43359375, "step": 6977, "time_per_iteration": 2.405846357345581 }, { "auxiliary_loss_clip": 0.01065397, "auxiliary_loss_mlp": 0.01044726, "balance_loss_clip": 1.01544857, "balance_loss_mlp": 1.02267611, "epoch": 0.41954005711709, "flos": 17382940272000.0, "grad_norm": 2.319037137216687, "language_loss": 0.82640183, "learning_rate": 2.6084699072221898e-06, "loss": 0.84750301, "num_input_tokens_seen": 149772885, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.42578125, "step": 6978, "time_per_iteration": 2.386373996734619 }, { "auxiliary_loss_clip": 0.01064446, "auxiliary_loss_mlp": 0.01047184, "balance_loss_clip": 1.0162847, "balance_loss_mlp": 1.01992953, "epoch": 0.419600180369758, "flos": 25002315699840.0, "grad_norm": 1.6225564972976276, "language_loss": 0.8387785, "learning_rate": 2.6080988956485964e-06, "loss": 0.85989475, "num_input_tokens_seen": 149791515, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4453125, "step": 6979, "time_per_iteration": 3.8772037029266357 }, { "auxiliary_loss_clip": 0.01063812, "auxiliary_loss_mlp": 0.01042012, "balance_loss_clip": 1.01182783, "balance_loss_mlp": 1.02083588, "epoch": 0.41966030362242596, "flos": 17382835537920.0, "grad_norm": 1.7133137237807547, "language_loss": 0.84760332, "learning_rate": 2.6077278610158325e-06, "loss": 0.86866152, "num_input_tokens_seen": 149807250, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.4296875, "step": 6980, "time_per_iteration": 2.370277166366577 }, { "auxiliary_loss_clip": 0.01066021, "auxiliary_loss_mlp": 0.01046224, "balance_loss_clip": 1.01755476, "balance_loss_mlp": 1.02182448, "epoch": 0.4197204268750939, "flos": 22155288370560.0, "grad_norm": 2.2909717757559287, "language_loss": 0.80730355, "learning_rate": 2.6073568033379665e-06, "loss": 0.828426, "num_input_tokens_seen": 149821640, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.44140625, "step": 6981, "time_per_iteration": 3.872154474258423 }, { "auxiliary_loss_clip": 0.01060206, "auxiliary_loss_mlp": 0.01038021, "balance_loss_clip": 1.0104003, "balance_loss_mlp": 1.01860726, "epoch": 0.4197805501277619, "flos": 22083227591040.0, "grad_norm": 1.6185662692359881, "language_loss": 0.85261506, "learning_rate": 2.6069857226290696e-06, "loss": 0.87359726, "num_input_tokens_seen": 149840545, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41601562, "step": 6982, "time_per_iteration": 2.438020944595337 }, { "auxiliary_loss_clip": 0.01065383, "auxiliary_loss_mlp": 0.01052645, "balance_loss_clip": 1.02177012, "balance_loss_mlp": 1.02082002, "epoch": 0.4198406733804299, "flos": 26430996245760.0, "grad_norm": 1.8423761448485036, "language_loss": 0.57318807, "learning_rate": 2.606614618903214e-06, "loss": 0.59436834, "num_input_tokens_seen": 149860375, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4453125, "step": 6983, "time_per_iteration": 2.422132730484009 }, { "auxiliary_loss_clip": 0.01062134, "auxiliary_loss_mlp": 0.01045417, "balance_loss_clip": 1.01680636, "balance_loss_mlp": 1.02054477, "epoch": 0.4199007966330979, "flos": 12530222490240.0, "grad_norm": 1.7195865543058393, "language_loss": 0.84566033, "learning_rate": 2.606243492174471e-06, "loss": 0.86673582, "num_input_tokens_seen": 149877850, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41601562, "step": 6984, "time_per_iteration": 3.8606343269348145 }, { "auxiliary_loss_clip": 0.01061905, "auxiliary_loss_mlp": 0.01044071, "balance_loss_clip": 1.01556802, "balance_loss_mlp": 1.01999664, "epoch": 0.41996091988576584, "flos": 21761851726080.0, "grad_norm": 1.7707844950745832, "language_loss": 0.80654454, "learning_rate": 2.605872342456914e-06, "loss": 0.82760429, "num_input_tokens_seen": 149896110, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41796875, "step": 6985, "time_per_iteration": 2.4222891330718994 }, { "auxiliary_loss_clip": 0.0106457, "auxiliary_loss_mlp": 0.01049479, "balance_loss_clip": 1.01738811, "balance_loss_mlp": 1.0194484, "epoch": 0.4200210431384338, "flos": 26540728248960.0, "grad_norm": 1.553942388199177, "language_loss": 0.79432082, "learning_rate": 2.6055011697646173e-06, "loss": 0.81546134, "num_input_tokens_seen": 149916495, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.45117188, "step": 6986, "time_per_iteration": 2.461432695388794 }, { "auxiliary_loss_clip": 0.01058067, "auxiliary_loss_mlp": 0.01041596, "balance_loss_clip": 1.01537049, "balance_loss_mlp": 1.01814258, "epoch": 0.42008116639110177, "flos": 26794651633920.0, "grad_norm": 1.657994269881226, "language_loss": 0.73325145, "learning_rate": 2.605129974111655e-06, "loss": 0.75424814, "num_input_tokens_seen": 149936445, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3984375, "step": 6987, "time_per_iteration": 2.429535150527954 }, { "auxiliary_loss_clip": 0.01063268, "auxiliary_loss_mlp": 0.01045899, "balance_loss_clip": 1.01644289, "balance_loss_mlp": 1.02033055, "epoch": 0.42014128964376973, "flos": 32085983226240.0, "grad_norm": 1.5497007706244421, "language_loss": 0.76079369, "learning_rate": 2.604758755512104e-06, "loss": 0.78188539, "num_input_tokens_seen": 149959430, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.4296875, "step": 6988, "time_per_iteration": 2.6145107746124268 }, { "auxiliary_loss_clip": 0.01063363, "auxiliary_loss_mlp": 0.01050654, "balance_loss_clip": 1.02030313, "balance_loss_mlp": 1.01929331, "epoch": 0.4202014128964377, "flos": 26465979294720.0, "grad_norm": 1.487907087922204, "language_loss": 0.75290453, "learning_rate": 2.60438751398004e-06, "loss": 0.77404475, "num_input_tokens_seen": 149980365, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.44140625, "step": 6989, "time_per_iteration": 2.44108510017395 }, { "auxiliary_loss_clip": 0.01060713, "auxiliary_loss_mlp": 0.01047814, "balance_loss_clip": 1.01856017, "balance_loss_mlp": 1.01807451, "epoch": 0.42026153614910566, "flos": 13400537662080.0, "grad_norm": 1.9392142728509727, "language_loss": 0.72382712, "learning_rate": 2.6040162495295404e-06, "loss": 0.74491239, "num_input_tokens_seen": 149997375, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.42578125, "step": 6990, "time_per_iteration": 2.399506092071533 }, { "auxiliary_loss_clip": 0.01012237, "auxiliary_loss_mlp": 0.01003781, "balance_loss_clip": 1.00056255, "balance_loss_mlp": 1.00379598, "epoch": 0.42032165940177363, "flos": 60247413832320.0, "grad_norm": 0.8271413923703486, "language_loss": 0.6055088, "learning_rate": 2.603644962174685e-06, "loss": 0.625669, "num_input_tokens_seen": 150051230, "router_z_loss_clip": 0.03222656, "router_z_loss_mlp": 0.08447266, "step": 6991, "time_per_iteration": 2.904433012008667 }, { "auxiliary_loss_clip": 0.01064613, "auxiliary_loss_mlp": 0.01046616, "balance_loss_clip": 1.01690865, "balance_loss_mlp": 1.02047431, "epoch": 0.4203817826544416, "flos": 24534060278400.0, "grad_norm": 1.4900120713827931, "language_loss": 0.8417905, "learning_rate": 2.6032736519295517e-06, "loss": 0.86290276, "num_input_tokens_seen": 150071135, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.44140625, "step": 6992, "time_per_iteration": 2.4334983825683594 }, { "auxiliary_loss_clip": 0.01011422, "auxiliary_loss_mlp": 0.01002517, "balance_loss_clip": 0.99936998, "balance_loss_mlp": 1.00279546, "epoch": 0.42044190590710956, "flos": 58817965236480.0, "grad_norm": 0.8160422846959353, "language_loss": 0.65585411, "learning_rate": 2.6029023188082217e-06, "loss": 0.67599344, "num_input_tokens_seen": 150125220, "router_z_loss_clip": 0.03149414, "router_z_loss_mlp": 0.08642578, "step": 6993, "time_per_iteration": 3.0211689472198486 }, { "auxiliary_loss_clip": 0.01065149, "auxiliary_loss_mlp": 0.01047797, "balance_loss_clip": 1.01475227, "balance_loss_mlp": 1.01917362, "epoch": 0.4205020291597775, "flos": 16435118148480.0, "grad_norm": 2.284467228835431, "language_loss": 0.84618032, "learning_rate": 2.6025309628247746e-06, "loss": 0.86730981, "num_input_tokens_seen": 150142300, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.4609375, "step": 6994, "time_per_iteration": 2.350649118423462 }, { "auxiliary_loss_clip": 0.01062557, "auxiliary_loss_mlp": 0.01044291, "balance_loss_clip": 1.01552534, "balance_loss_mlp": 1.02152348, "epoch": 0.4205621524124455, "flos": 18404673477120.0, "grad_norm": 1.513573976131421, "language_loss": 0.80076993, "learning_rate": 2.6021595839932934e-06, "loss": 0.82183838, "num_input_tokens_seen": 150161345, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41015625, "step": 6995, "time_per_iteration": 2.388667345046997 }, { "auxiliary_loss_clip": 0.0105826, "auxiliary_loss_mlp": 0.01045336, "balance_loss_clip": 1.01906216, "balance_loss_mlp": 1.01777363, "epoch": 0.4206222756651135, "flos": 25518925221120.0, "grad_norm": 1.3689152432876912, "language_loss": 0.81123638, "learning_rate": 2.60178818232786e-06, "loss": 0.83227229, "num_input_tokens_seen": 150182420, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40429688, "step": 6996, "time_per_iteration": 2.398460626602173 }, { "auxiliary_loss_clip": 0.01062082, "auxiliary_loss_mlp": 0.01040601, "balance_loss_clip": 1.01214528, "balance_loss_mlp": 1.01966763, "epoch": 0.4206823989177815, "flos": 15303443472000.0, "grad_norm": 1.8568914992380874, "language_loss": 0.76873147, "learning_rate": 2.601416757842559e-06, "loss": 0.78975832, "num_input_tokens_seen": 150200175, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.42578125, "step": 6997, "time_per_iteration": 2.381293535232544 }, { "auxiliary_loss_clip": 0.01062774, "auxiliary_loss_mlp": 0.01048781, "balance_loss_clip": 1.01965857, "balance_loss_mlp": 1.02018356, "epoch": 0.42074252217044944, "flos": 15553352050560.0, "grad_norm": 1.7529189448676497, "language_loss": 0.75935477, "learning_rate": 2.6010453105514743e-06, "loss": 0.78047031, "num_input_tokens_seen": 150217100, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.42578125, "step": 6998, "time_per_iteration": 2.371706485748291 }, { "auxiliary_loss_clip": 0.01064084, "auxiliary_loss_mlp": 0.01047084, "balance_loss_clip": 1.01682878, "balance_loss_mlp": 1.02066755, "epoch": 0.4208026454231174, "flos": 26144533607040.0, "grad_norm": 1.8321725656563574, "language_loss": 0.77930397, "learning_rate": 2.60067384046869e-06, "loss": 0.80041564, "num_input_tokens_seen": 150239830, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.43359375, "step": 6999, "time_per_iteration": 2.4788832664489746 }, { "auxiliary_loss_clip": 0.01061759, "auxiliary_loss_mlp": 0.01046425, "balance_loss_clip": 1.01819611, "balance_loss_mlp": 1.02035689, "epoch": 0.42086276867578537, "flos": 23548985867520.0, "grad_norm": 1.7182368435645439, "language_loss": 0.65083271, "learning_rate": 2.600302347608295e-06, "loss": 0.67191452, "num_input_tokens_seen": 150260690, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.4140625, "step": 7000, "time_per_iteration": 2.42244029045105 }, { "auxiliary_loss_clip": 0.01062435, "auxiliary_loss_mlp": 0.01044243, "balance_loss_clip": 1.01539421, "balance_loss_mlp": 1.02013206, "epoch": 0.42092289192845334, "flos": 18112450464000.0, "grad_norm": 1.575725068459106, "language_loss": 0.77526605, "learning_rate": 2.5999308319843743e-06, "loss": 0.79633284, "num_input_tokens_seen": 150279885, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.421875, "step": 7001, "time_per_iteration": 2.4829678535461426 }, { "auxiliary_loss_clip": 0.01063289, "auxiliary_loss_mlp": 0.0103973, "balance_loss_clip": 1.01326549, "balance_loss_mlp": 1.02142596, "epoch": 0.4209830151811213, "flos": 20005685827200.0, "grad_norm": 1.5829303879582668, "language_loss": 0.87409461, "learning_rate": 2.5995592936110154e-06, "loss": 0.89512479, "num_input_tokens_seen": 150297390, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.41796875, "step": 7002, "time_per_iteration": 2.427133321762085 }, { "auxiliary_loss_clip": 0.01061203, "auxiliary_loss_mlp": 0.01041203, "balance_loss_clip": 1.01378441, "balance_loss_mlp": 1.01958513, "epoch": 0.42104313843378927, "flos": 21977929330560.0, "grad_norm": 1.9552841722307295, "language_loss": 0.68808049, "learning_rate": 2.5991877325023096e-06, "loss": 0.7091046, "num_input_tokens_seen": 150317390, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41601562, "step": 7003, "time_per_iteration": 2.377445936203003 }, { "auxiliary_loss_clip": 0.01065063, "auxiliary_loss_mlp": 0.01047119, "balance_loss_clip": 1.01662493, "balance_loss_mlp": 1.02161098, "epoch": 0.42110326168645723, "flos": 25442884546560.0, "grad_norm": 1.887311179039325, "language_loss": 0.7955147, "learning_rate": 2.598816148672344e-06, "loss": 0.81663656, "num_input_tokens_seen": 150337455, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.43359375, "step": 7004, "time_per_iteration": 2.449510335922241 }, { "auxiliary_loss_clip": 0.01060524, "auxiliary_loss_mlp": 0.01049462, "balance_loss_clip": 1.02211523, "balance_loss_mlp": 1.02017879, "epoch": 0.4211633849391252, "flos": 17821588993920.0, "grad_norm": 1.557279867399258, "language_loss": 0.69982749, "learning_rate": 2.59844454213521e-06, "loss": 0.72092736, "num_input_tokens_seen": 150355385, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40234375, "step": 7005, "time_per_iteration": 2.365572214126587 }, { "auxiliary_loss_clip": 0.01063077, "auxiliary_loss_mlp": 0.01041854, "balance_loss_clip": 1.01674843, "balance_loss_mlp": 1.02084613, "epoch": 0.42122350819179316, "flos": 16281710167680.0, "grad_norm": 1.7585008069990156, "language_loss": 0.74249858, "learning_rate": 2.5980729129049994e-06, "loss": 0.7635479, "num_input_tokens_seen": 150371750, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.421875, "step": 7006, "time_per_iteration": 2.3916828632354736 }, { "auxiliary_loss_clip": 0.01062326, "auxiliary_loss_mlp": 0.01046503, "balance_loss_clip": 1.02002716, "balance_loss_mlp": 1.02000809, "epoch": 0.4212836314444611, "flos": 19644858259200.0, "grad_norm": 2.281167554569502, "language_loss": 0.72638774, "learning_rate": 2.5977012609958033e-06, "loss": 0.74747598, "num_input_tokens_seen": 150389955, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.421875, "step": 7007, "time_per_iteration": 2.360018253326416 }, { "auxiliary_loss_clip": 0.01062487, "auxiliary_loss_mlp": 0.01046625, "balance_loss_clip": 1.02061307, "balance_loss_mlp": 1.01995039, "epoch": 0.4213437546971291, "flos": 18368049594240.0, "grad_norm": 1.8228566319821353, "language_loss": 0.83360696, "learning_rate": 2.5973295864217166e-06, "loss": 0.85469806, "num_input_tokens_seen": 150405780, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.42382812, "step": 7008, "time_per_iteration": 2.393033742904663 }, { "auxiliary_loss_clip": 0.01062047, "auxiliary_loss_mlp": 0.01047116, "balance_loss_clip": 1.01891077, "balance_loss_mlp": 1.01977515, "epoch": 0.42140387794979706, "flos": 27703406079360.0, "grad_norm": 1.8363189849385069, "language_loss": 0.72701329, "learning_rate": 2.596957889196831e-06, "loss": 0.74810487, "num_input_tokens_seen": 150425615, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.421875, "step": 7009, "time_per_iteration": 2.417668581008911 }, { "auxiliary_loss_clip": 0.01058967, "auxiliary_loss_mlp": 0.01045258, "balance_loss_clip": 1.0193541, "balance_loss_mlp": 1.01754022, "epoch": 0.4214640012024651, "flos": 28145825228160.0, "grad_norm": 2.2468253968349585, "language_loss": 0.67614365, "learning_rate": 2.596586169335243e-06, "loss": 0.69718593, "num_input_tokens_seen": 150445765, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.4140625, "step": 7010, "time_per_iteration": 2.4362387657165527 }, { "auxiliary_loss_clip": 0.0105871, "auxiliary_loss_mlp": 0.01052448, "balance_loss_clip": 1.02531552, "balance_loss_mlp": 1.01784563, "epoch": 0.42152412445513304, "flos": 22996311045120.0, "grad_norm": 1.402197107947976, "language_loss": 0.7320298, "learning_rate": 2.5962144268510477e-06, "loss": 0.7531414, "num_input_tokens_seen": 150464405, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40820312, "step": 7011, "time_per_iteration": 2.3694143295288086 }, { "auxiliary_loss_clip": 0.01010296, "auxiliary_loss_mlp": 0.01020274, "balance_loss_clip": 1.01710355, "balance_loss_mlp": 1.0018754, "epoch": 0.421584247707801, "flos": 63746549136000.0, "grad_norm": 0.8115876951864184, "language_loss": 0.54451984, "learning_rate": 2.5958426617583417e-06, "loss": 0.56482559, "num_input_tokens_seen": 150520430, "router_z_loss_clip": 0.03173828, "router_z_loss_mlp": 0.08398438, "step": 7012, "time_per_iteration": 2.9188554286956787 }, { "auxiliary_loss_clip": 0.01060346, "auxiliary_loss_mlp": 0.01046754, "balance_loss_clip": 1.01997972, "balance_loss_mlp": 1.01871061, "epoch": 0.421644370960469, "flos": 24313514019840.0, "grad_norm": 1.3181305507484915, "language_loss": 0.79576981, "learning_rate": 2.5954708740712215e-06, "loss": 0.81684077, "num_input_tokens_seen": 150542610, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.41601562, "step": 7013, "time_per_iteration": 2.404327630996704 }, { "auxiliary_loss_clip": 0.01061422, "auxiliary_loss_mlp": 0.01042726, "balance_loss_clip": 1.0139246, "balance_loss_mlp": 1.01845253, "epoch": 0.42170449421313694, "flos": 23439568066560.0, "grad_norm": 1.6364213695517416, "language_loss": 0.82028139, "learning_rate": 2.595099063803787e-06, "loss": 0.8413229, "num_input_tokens_seen": 150560970, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4296875, "step": 7014, "time_per_iteration": 2.4892189502716064 }, { "auxiliary_loss_clip": 0.01061255, "auxiliary_loss_mlp": 0.01049952, "balance_loss_clip": 1.02153254, "balance_loss_mlp": 1.01925647, "epoch": 0.4217646174658049, "flos": 23694364235520.0, "grad_norm": 1.4861353046465025, "language_loss": 0.79063839, "learning_rate": 2.5947272309701354e-06, "loss": 0.81175047, "num_input_tokens_seen": 150582615, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41992188, "step": 7015, "time_per_iteration": 2.423894166946411 }, { "auxiliary_loss_clip": 0.01064278, "auxiliary_loss_mlp": 0.0105011, "balance_loss_clip": 1.01987886, "balance_loss_mlp": 1.02072239, "epoch": 0.42182474071847287, "flos": 24970439761920.0, "grad_norm": 1.3070260407391678, "language_loss": 0.83131444, "learning_rate": 2.594355375584368e-06, "loss": 0.85245836, "num_input_tokens_seen": 150603640, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.43554688, "step": 7016, "time_per_iteration": 2.4258899688720703 }, { "auxiliary_loss_clip": 0.01065702, "auxiliary_loss_mlp": 0.01050262, "balance_loss_clip": 1.01786137, "balance_loss_mlp": 1.0220964, "epoch": 0.42188486397114083, "flos": 22855540976640.0, "grad_norm": 1.8213109615779581, "language_loss": 0.6962707, "learning_rate": 2.593983497660586e-06, "loss": 0.71743035, "num_input_tokens_seen": 150622490, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.43554688, "step": 7017, "time_per_iteration": 2.377586841583252 }, { "auxiliary_loss_clip": 0.01019591, "auxiliary_loss_mlp": 0.01007234, "balance_loss_clip": 1.00401568, "balance_loss_mlp": 1.01088893, "epoch": 0.4219449872238088, "flos": 66972139004160.0, "grad_norm": 0.6872751907513095, "language_loss": 0.59599411, "learning_rate": 2.5936115972128895e-06, "loss": 0.61626244, "num_input_tokens_seen": 150689545, "router_z_loss_clip": 0.03222656, "router_z_loss_mlp": 0.08691406, "step": 7018, "time_per_iteration": 4.4762938022613525 }, { "auxiliary_loss_clip": 0.01065003, "auxiliary_loss_mlp": 0.01042177, "balance_loss_clip": 1.01438904, "balance_loss_mlp": 1.02191567, "epoch": 0.42200511047647676, "flos": 13114528871040.0, "grad_norm": 1.8959677751689596, "language_loss": 0.77679139, "learning_rate": 2.593239674255382e-06, "loss": 0.79786319, "num_input_tokens_seen": 150707610, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.43164062, "step": 7019, "time_per_iteration": 2.403743028640747 }, { "auxiliary_loss_clip": 0.0106316, "auxiliary_loss_mlp": 0.01047833, "balance_loss_clip": 1.01892447, "balance_loss_mlp": 1.02111983, "epoch": 0.42206523372914473, "flos": 13990325126400.0, "grad_norm": 1.946900206351706, "language_loss": 0.70781988, "learning_rate": 2.592867728802166e-06, "loss": 0.72892982, "num_input_tokens_seen": 150724530, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41992188, "step": 7020, "time_per_iteration": 5.127791166305542 }, { "auxiliary_loss_clip": 0.01062347, "auxiliary_loss_mlp": 0.01050086, "balance_loss_clip": 1.02203631, "balance_loss_mlp": 1.02077115, "epoch": 0.4221253569818127, "flos": 21941305447680.0, "grad_norm": 1.6199512194630543, "language_loss": 0.82108384, "learning_rate": 2.592495760867347e-06, "loss": 0.84220821, "num_input_tokens_seen": 150742870, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41601562, "step": 7021, "time_per_iteration": 2.388517379760742 }, { "auxiliary_loss_clip": 0.01064728, "auxiliary_loss_mlp": 0.01053881, "balance_loss_clip": 1.02425718, "balance_loss_mlp": 1.02158654, "epoch": 0.42218548023448066, "flos": 32191351309440.0, "grad_norm": 1.4524928738282323, "language_loss": 0.70925677, "learning_rate": 2.5921237704650293e-06, "loss": 0.73044288, "num_input_tokens_seen": 150765500, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4296875, "step": 7022, "time_per_iteration": 2.499687671661377 }, { "auxiliary_loss_clip": 0.01061413, "auxiliary_loss_mlp": 0.01048484, "balance_loss_clip": 1.02370083, "balance_loss_mlp": 1.02220476, "epoch": 0.4222456034871487, "flos": 30117614883840.0, "grad_norm": 1.6354897982622338, "language_loss": 0.68709046, "learning_rate": 2.5917517576093188e-06, "loss": 0.70818943, "num_input_tokens_seen": 150784945, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.390625, "step": 7023, "time_per_iteration": 2.4555563926696777 }, { "auxiliary_loss_clip": 0.01064604, "auxiliary_loss_mlp": 0.0106252, "balance_loss_clip": 1.03083456, "balance_loss_mlp": 1.02218473, "epoch": 0.42230572673981664, "flos": 22126798834560.0, "grad_norm": 1.5692864532023039, "language_loss": 0.70869625, "learning_rate": 2.591379722314322e-06, "loss": 0.72996753, "num_input_tokens_seen": 150803120, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.42382812, "step": 7024, "time_per_iteration": 3.8536741733551025 }, { "auxiliary_loss_clip": 0.01064715, "auxiliary_loss_mlp": 0.01060384, "balance_loss_clip": 1.03173828, "balance_loss_mlp": 1.02153623, "epoch": 0.4223658499924846, "flos": 22053970005120.0, "grad_norm": 1.4715617742923865, "language_loss": 0.77560329, "learning_rate": 2.591007664594147e-06, "loss": 0.79685432, "num_input_tokens_seen": 150823135, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.43164062, "step": 7025, "time_per_iteration": 2.4837124347686768 }, { "auxiliary_loss_clip": 0.01059983, "auxiliary_loss_mlp": 0.01058261, "balance_loss_clip": 1.02886426, "balance_loss_mlp": 1.01972938, "epoch": 0.4224259732451526, "flos": 20409735525120.0, "grad_norm": 1.8295413582628264, "language_loss": 0.81112993, "learning_rate": 2.5906355844629024e-06, "loss": 0.83231235, "num_input_tokens_seen": 150842070, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.40234375, "step": 7026, "time_per_iteration": 2.4410085678100586 }, { "auxiliary_loss_clip": 0.01018295, "auxiliary_loss_mlp": 0.01026169, "balance_loss_clip": 1.02249777, "balance_loss_mlp": 1.00927055, "epoch": 0.42248609649782054, "flos": 62843380508160.0, "grad_norm": 0.7631552107929441, "language_loss": 0.62097669, "learning_rate": 2.5902634819346966e-06, "loss": 0.64142132, "num_input_tokens_seen": 150907450, "router_z_loss_clip": 0.03662109, "router_z_loss_mlp": 0.09033203, "step": 7027, "time_per_iteration": 3.111438751220703 }, { "auxiliary_loss_clip": 0.01060924, "auxiliary_loss_mlp": 0.01053044, "balance_loss_clip": 1.02520859, "balance_loss_mlp": 1.01867223, "epoch": 0.4225462197504885, "flos": 26248749615360.0, "grad_norm": 2.077638704611507, "language_loss": 0.72302473, "learning_rate": 2.5898913570236414e-06, "loss": 0.74416441, "num_input_tokens_seen": 150928040, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.421875, "step": 7028, "time_per_iteration": 2.433840274810791 }, { "auxiliary_loss_clip": 0.01063489, "auxiliary_loss_mlp": 0.0105842, "balance_loss_clip": 1.02685344, "balance_loss_mlp": 1.02018905, "epoch": 0.42260634300315647, "flos": 20520898894080.0, "grad_norm": 1.9173418893414949, "language_loss": 0.83507717, "learning_rate": 2.589519209743846e-06, "loss": 0.8562963, "num_input_tokens_seen": 150945760, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.43359375, "step": 7029, "time_per_iteration": 2.409543514251709 }, { "auxiliary_loss_clip": 0.01064928, "auxiliary_loss_mlp": 0.01051178, "balance_loss_clip": 1.02071965, "balance_loss_mlp": 1.02011323, "epoch": 0.42266646625582444, "flos": 24315573790080.0, "grad_norm": 2.0046448697178505, "language_loss": 0.76663566, "learning_rate": 2.589147040109424e-06, "loss": 0.78779674, "num_input_tokens_seen": 150965665, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.44726562, "step": 7030, "time_per_iteration": 2.4128429889678955 }, { "auxiliary_loss_clip": 0.010608, "auxiliary_loss_mlp": 0.01052915, "balance_loss_clip": 1.02158689, "balance_loss_mlp": 1.01831996, "epoch": 0.4227265895084924, "flos": 24203083789440.0, "grad_norm": 2.6018484797626518, "language_loss": 0.87779951, "learning_rate": 2.588774848134486e-06, "loss": 0.89893675, "num_input_tokens_seen": 150982260, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.42382812, "step": 7031, "time_per_iteration": 2.404933452606201 }, { "auxiliary_loss_clip": 0.01063034, "auxiliary_loss_mlp": 0.01050742, "balance_loss_clip": 1.01841271, "balance_loss_mlp": 1.02001095, "epoch": 0.42278671276116037, "flos": 16908819742080.0, "grad_norm": 2.006021257473933, "language_loss": 0.75040734, "learning_rate": 2.5884026338331473e-06, "loss": 0.77154511, "num_input_tokens_seen": 150999990, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.4296875, "step": 7032, "time_per_iteration": 2.3725264072418213 }, { "auxiliary_loss_clip": 0.01063727, "auxiliary_loss_mlp": 0.01047932, "balance_loss_clip": 1.01988173, "balance_loss_mlp": 1.01990557, "epoch": 0.42284683601382833, "flos": 25409891445120.0, "grad_norm": 1.4706723715567316, "language_loss": 0.71996534, "learning_rate": 2.5880303972195222e-06, "loss": 0.74108195, "num_input_tokens_seen": 151021105, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.4375, "step": 7033, "time_per_iteration": 2.4294841289520264 }, { "auxiliary_loss_clip": 0.0106351, "auxiliary_loss_mlp": 0.01044739, "balance_loss_clip": 1.01541352, "balance_loss_mlp": 1.02075684, "epoch": 0.4229069592664963, "flos": 23039184061440.0, "grad_norm": 1.7790354904096468, "language_loss": 0.91906285, "learning_rate": 2.5876581383077256e-06, "loss": 0.94014537, "num_input_tokens_seen": 151040665, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.42773438, "step": 7034, "time_per_iteration": 2.489312171936035 }, { "auxiliary_loss_clip": 0.01064055, "auxiliary_loss_mlp": 0.01041555, "balance_loss_clip": 1.01462579, "balance_loss_mlp": 1.02187049, "epoch": 0.42296708251916426, "flos": 26066258605440.0, "grad_norm": 1.5861115438622075, "language_loss": 0.78666127, "learning_rate": 2.5872858571118723e-06, "loss": 0.80771732, "num_input_tokens_seen": 151061240, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.421875, "step": 7035, "time_per_iteration": 2.4245872497558594 }, { "auxiliary_loss_clip": 0.01066887, "auxiliary_loss_mlp": 0.01052887, "balance_loss_clip": 1.02208376, "balance_loss_mlp": 1.02399528, "epoch": 0.4230272057718323, "flos": 19457514570240.0, "grad_norm": 1.8551081219813903, "language_loss": 0.83682704, "learning_rate": 2.5869135536460817e-06, "loss": 0.85802484, "num_input_tokens_seen": 151076870, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4296875, "step": 7036, "time_per_iteration": 2.3810019493103027 }, { "auxiliary_loss_clip": 0.01066751, "auxiliary_loss_mlp": 0.01042755, "balance_loss_clip": 1.014467, "balance_loss_mlp": 1.02627826, "epoch": 0.42308732902450025, "flos": 22382188496640.0, "grad_norm": 1.6054841089416787, "language_loss": 0.71532083, "learning_rate": 2.58654122792447e-06, "loss": 0.73641586, "num_input_tokens_seen": 151095110, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.40429688, "step": 7037, "time_per_iteration": 2.426670789718628 }, { "auxiliary_loss_clip": 0.01068135, "auxiliary_loss_mlp": 0.0104905, "balance_loss_clip": 1.01803136, "balance_loss_mlp": 1.02519238, "epoch": 0.4231474522771682, "flos": 20994391019520.0, "grad_norm": 1.5717391903505562, "language_loss": 0.79313028, "learning_rate": 2.586168879961155e-06, "loss": 0.81430209, "num_input_tokens_seen": 151114355, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.4296875, "step": 7038, "time_per_iteration": 2.51074481010437 }, { "auxiliary_loss_clip": 0.01072304, "auxiliary_loss_mlp": 0.01063183, "balance_loss_clip": 1.02734828, "balance_loss_mlp": 1.02565825, "epoch": 0.4232075755298362, "flos": 14974980600960.0, "grad_norm": 2.1605150038157994, "language_loss": 0.69722795, "learning_rate": 2.585796509770259e-06, "loss": 0.71858281, "num_input_tokens_seen": 151131505, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 0.46679688, "step": 7039, "time_per_iteration": 2.361018180847168 }, { "auxiliary_loss_clip": 0.01070883, "auxiliary_loss_mlp": 0.01050383, "balance_loss_clip": 1.01947236, "balance_loss_mlp": 1.02539158, "epoch": 0.42326769878250414, "flos": 24531581571840.0, "grad_norm": 1.8570197629135705, "language_loss": 0.76564413, "learning_rate": 2.5854241173658996e-06, "loss": 0.78685677, "num_input_tokens_seen": 151151555, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.45507812, "step": 7040, "time_per_iteration": 2.4396746158599854 }, { "auxiliary_loss_clip": 0.01071144, "auxiliary_loss_mlp": 0.01044436, "balance_loss_clip": 1.0138824, "balance_loss_mlp": 1.02726865, "epoch": 0.4233278220351721, "flos": 26869086385920.0, "grad_norm": 1.764787542755636, "language_loss": 0.66907573, "learning_rate": 2.5850517027621996e-06, "loss": 0.69023144, "num_input_tokens_seen": 151172385, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.43945312, "step": 7041, "time_per_iteration": 2.436676025390625 }, { "auxiliary_loss_clip": 0.01068226, "auxiliary_loss_mlp": 0.01045347, "balance_loss_clip": 1.01509142, "balance_loss_mlp": 1.02408457, "epoch": 0.4233879452878401, "flos": 42813256728960.0, "grad_norm": 1.7470933615250035, "language_loss": 0.7536785, "learning_rate": 2.5846792659732803e-06, "loss": 0.77481425, "num_input_tokens_seen": 151194930, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.44140625, "step": 7042, "time_per_iteration": 2.588362693786621 }, { "auxiliary_loss_clip": 0.01064891, "auxiliary_loss_mlp": 0.01048438, "balance_loss_clip": 1.02166319, "balance_loss_mlp": 1.02335656, "epoch": 0.42344806854050804, "flos": 25227819371520.0, "grad_norm": 1.2610364600599953, "language_loss": 0.82831442, "learning_rate": 2.5843068070132643e-06, "loss": 0.84944767, "num_input_tokens_seen": 151217905, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.4140625, "step": 7043, "time_per_iteration": 2.441255569458008 }, { "auxiliary_loss_clip": 0.01068063, "auxiliary_loss_mlp": 0.01053393, "balance_loss_clip": 1.02189827, "balance_loss_mlp": 1.02510273, "epoch": 0.423508191793176, "flos": 22777859468160.0, "grad_norm": 2.2912522014569174, "language_loss": 0.66761816, "learning_rate": 2.5839343258962763e-06, "loss": 0.68883276, "num_input_tokens_seen": 151234580, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.4296875, "step": 7044, "time_per_iteration": 2.408329486846924 }, { "auxiliary_loss_clip": 0.01067938, "auxiliary_loss_mlp": 0.01052533, "balance_loss_clip": 1.02048945, "balance_loss_mlp": 1.02383447, "epoch": 0.42356831504584397, "flos": 34636179242880.0, "grad_norm": 3.141586302742389, "language_loss": 0.76606703, "learning_rate": 2.5835618226364393e-06, "loss": 0.78727174, "num_input_tokens_seen": 151254765, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.44140625, "step": 7045, "time_per_iteration": 2.499405860900879 }, { "auxiliary_loss_clip": 0.01066042, "auxiliary_loss_mlp": 0.0104476, "balance_loss_clip": 1.0176158, "balance_loss_mlp": 1.02466428, "epoch": 0.42362843829851193, "flos": 17595980588160.0, "grad_norm": 2.200874456923916, "language_loss": 0.82526577, "learning_rate": 2.5831892972478797e-06, "loss": 0.8463738, "num_input_tokens_seen": 151269045, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.4140625, "step": 7046, "time_per_iteration": 2.388711929321289 }, { "auxiliary_loss_clip": 0.01065554, "auxiliary_loss_mlp": 0.01049799, "balance_loss_clip": 1.02011561, "balance_loss_mlp": 1.02195263, "epoch": 0.4236885615511799, "flos": 22564574772480.0, "grad_norm": 2.153288002744184, "language_loss": 0.77982473, "learning_rate": 2.5828167497447242e-06, "loss": 0.8009783, "num_input_tokens_seen": 151287530, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4375, "step": 7047, "time_per_iteration": 2.393549680709839 }, { "auxiliary_loss_clip": 0.01063904, "auxiliary_loss_mlp": 0.01052025, "balance_loss_clip": 1.02356982, "balance_loss_mlp": 1.02246678, "epoch": 0.42374868480384786, "flos": 26468004153600.0, "grad_norm": 1.676196671168699, "language_loss": 0.69262099, "learning_rate": 2.582444180141098e-06, "loss": 0.71378028, "num_input_tokens_seen": 151308905, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4140625, "step": 7048, "time_per_iteration": 2.466933250427246 }, { "auxiliary_loss_clip": 0.01063365, "auxiliary_loss_mlp": 0.01048911, "balance_loss_clip": 1.01946676, "balance_loss_mlp": 1.02057207, "epoch": 0.4238088080565159, "flos": 20369341215360.0, "grad_norm": 1.731967014770645, "language_loss": 0.79367673, "learning_rate": 2.5820715884511307e-06, "loss": 0.81479955, "num_input_tokens_seen": 151326525, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.42773438, "step": 7049, "time_per_iteration": 2.375509738922119 }, { "auxiliary_loss_clip": 0.01063665, "auxiliary_loss_mlp": 0.0105995, "balance_loss_clip": 1.02819228, "balance_loss_mlp": 1.02015054, "epoch": 0.42386893130918385, "flos": 21171226389120.0, "grad_norm": 1.7579478155730783, "language_loss": 0.84000552, "learning_rate": 2.5816989746889504e-06, "loss": 0.8612417, "num_input_tokens_seen": 151344675, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.43359375, "step": 7050, "time_per_iteration": 2.4404566287994385 }, { "auxiliary_loss_clip": 0.01060572, "auxiliary_loss_mlp": 0.01052409, "balance_loss_clip": 1.02305937, "balance_loss_mlp": 1.01820457, "epoch": 0.4239290545618518, "flos": 17674674526080.0, "grad_norm": 2.156134022157503, "language_loss": 0.75179565, "learning_rate": 2.581326338868687e-06, "loss": 0.77292544, "num_input_tokens_seen": 151360730, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.42382812, "step": 7051, "time_per_iteration": 2.3480515480041504 }, { "auxiliary_loss_clip": 0.01059738, "auxiliary_loss_mlp": 0.01049414, "balance_loss_clip": 1.01925421, "balance_loss_mlp": 1.0189091, "epoch": 0.4239891778145198, "flos": 24313409285760.0, "grad_norm": 1.3920812039497787, "language_loss": 0.8712281, "learning_rate": 2.5809536810044706e-06, "loss": 0.89231962, "num_input_tokens_seen": 151380445, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.40820312, "step": 7052, "time_per_iteration": 2.528925895690918 }, { "auxiliary_loss_clip": 0.01062917, "auxiliary_loss_mlp": 0.01051664, "balance_loss_clip": 1.02210057, "balance_loss_mlp": 1.0198946, "epoch": 0.42404930106718774, "flos": 20557383131520.0, "grad_norm": 1.4915131728520232, "language_loss": 0.73435211, "learning_rate": 2.5805810011104323e-06, "loss": 0.75549793, "num_input_tokens_seen": 151399325, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.4296875, "step": 7053, "time_per_iteration": 2.3708138465881348 }, { "auxiliary_loss_clip": 0.01061124, "auxiliary_loss_mlp": 0.01046855, "balance_loss_clip": 1.01872194, "balance_loss_mlp": 1.01989055, "epoch": 0.4241094243198557, "flos": 22307020606080.0, "grad_norm": 2.1553594058614305, "language_loss": 0.83547509, "learning_rate": 2.580208299200704e-06, "loss": 0.85655487, "num_input_tokens_seen": 151417240, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41210938, "step": 7054, "time_per_iteration": 2.435666084289551 }, { "auxiliary_loss_clip": 0.01013222, "auxiliary_loss_mlp": 0.01007898, "balance_loss_clip": 1.00420284, "balance_loss_mlp": 1.00473595, "epoch": 0.4241695475725237, "flos": 70609111822080.0, "grad_norm": 0.7823861302483855, "language_loss": 0.60518193, "learning_rate": 2.5798355752894183e-06, "loss": 0.62539309, "num_input_tokens_seen": 151476015, "router_z_loss_clip": 0.03686523, "router_z_loss_mlp": 0.08496094, "step": 7055, "time_per_iteration": 2.9726390838623047 }, { "auxiliary_loss_clip": 0.01062567, "auxiliary_loss_mlp": 0.01051945, "balance_loss_clip": 1.02043796, "balance_loss_mlp": 1.0195874, "epoch": 0.42422967082519164, "flos": 14026599895680.0, "grad_norm": 2.629165747488401, "language_loss": 0.78747523, "learning_rate": 2.5794628293907107e-06, "loss": 0.80862033, "num_input_tokens_seen": 151492035, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.4296875, "step": 7056, "time_per_iteration": 2.391479969024658 }, { "auxiliary_loss_clip": 0.01066979, "auxiliary_loss_mlp": 0.01043711, "balance_loss_clip": 1.01200187, "balance_loss_mlp": 1.0214833, "epoch": 0.4242897940778596, "flos": 22344447450240.0, "grad_norm": 1.7773338764018567, "language_loss": 0.8555789, "learning_rate": 2.579090061518714e-06, "loss": 0.87668586, "num_input_tokens_seen": 151508970, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.45507812, "step": 7057, "time_per_iteration": 2.3921215534210205 }, { "auxiliary_loss_clip": 0.01066243, "auxiliary_loss_mlp": 0.01047801, "balance_loss_clip": 1.01477969, "balance_loss_mlp": 1.02182913, "epoch": 0.42434991733052757, "flos": 22594914610560.0, "grad_norm": 2.27188235587229, "language_loss": 0.84127378, "learning_rate": 2.5787172716875642e-06, "loss": 0.86241418, "num_input_tokens_seen": 151525295, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.44335938, "step": 7058, "time_per_iteration": 3.8619606494903564 }, { "auxiliary_loss_clip": 0.0106199, "auxiliary_loss_mlp": 0.01044703, "balance_loss_clip": 1.01811957, "balance_loss_mlp": 1.02049279, "epoch": 0.42441004058319554, "flos": 20010398860800.0, "grad_norm": 1.9325642564035053, "language_loss": 0.81690085, "learning_rate": 2.5783444599113973e-06, "loss": 0.83796775, "num_input_tokens_seen": 151544435, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.41601562, "step": 7059, "time_per_iteration": 3.8843259811401367 }, { "auxiliary_loss_clip": 0.01065092, "auxiliary_loss_mlp": 0.01049759, "balance_loss_clip": 1.01741779, "balance_loss_mlp": 1.02074051, "epoch": 0.4244701638358635, "flos": 11144205492480.0, "grad_norm": 2.677350905218339, "language_loss": 0.71509683, "learning_rate": 2.57797162620435e-06, "loss": 0.73624533, "num_input_tokens_seen": 151559520, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.44335938, "step": 7060, "time_per_iteration": 3.8871965408325195 }, { "auxiliary_loss_clip": 0.01064138, "auxiliary_loss_mlp": 0.01043725, "balance_loss_clip": 1.01487625, "balance_loss_mlp": 1.02203369, "epoch": 0.42453028708853147, "flos": 23986622160000.0, "grad_norm": 1.5612158493837427, "language_loss": 0.76720655, "learning_rate": 2.577598770580562e-06, "loss": 0.78828514, "num_input_tokens_seen": 151579790, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.421875, "step": 7061, "time_per_iteration": 2.448136806488037 }, { "auxiliary_loss_clip": 0.01065418, "auxiliary_loss_mlp": 0.01046285, "balance_loss_clip": 1.01601768, "balance_loss_mlp": 1.02209723, "epoch": 0.42459041034119943, "flos": 18405336792960.0, "grad_norm": 2.0189664063784742, "language_loss": 0.74846333, "learning_rate": 2.5772258930541693e-06, "loss": 0.76958036, "num_input_tokens_seen": 151598285, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.43359375, "step": 7062, "time_per_iteration": 2.4076755046844482 }, { "auxiliary_loss_clip": 0.0106515, "auxiliary_loss_mlp": 0.01056968, "balance_loss_clip": 1.02690351, "balance_loss_mlp": 1.02178645, "epoch": 0.42465053359386745, "flos": 20956999086720.0, "grad_norm": 1.8102841021843734, "language_loss": 0.67076635, "learning_rate": 2.5768529936393137e-06, "loss": 0.69198751, "num_input_tokens_seen": 151615430, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.43359375, "step": 7063, "time_per_iteration": 2.3859846591949463 }, { "auxiliary_loss_clip": 0.01060866, "auxiliary_loss_mlp": 0.01040528, "balance_loss_clip": 1.01345563, "balance_loss_mlp": 1.02040422, "epoch": 0.4247106568465354, "flos": 33104888611200.0, "grad_norm": 1.5007505672677646, "language_loss": 0.79559779, "learning_rate": 2.5764800723501354e-06, "loss": 0.81661177, "num_input_tokens_seen": 151637030, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40429688, "step": 7064, "time_per_iteration": 3.8425300121307373 }, { "auxiliary_loss_clip": 0.01062855, "auxiliary_loss_mlp": 0.01045189, "balance_loss_clip": 1.01545787, "balance_loss_mlp": 1.02071512, "epoch": 0.4247707800992034, "flos": 20045905580160.0, "grad_norm": 1.8425283830935937, "language_loss": 0.76477951, "learning_rate": 2.5761071292007736e-06, "loss": 0.78586, "num_input_tokens_seen": 151655745, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.421875, "step": 7065, "time_per_iteration": 2.4308419227600098 }, { "auxiliary_loss_clip": 0.0106381, "auxiliary_loss_mlp": 0.01049499, "balance_loss_clip": 1.0203644, "balance_loss_mlp": 1.02170467, "epoch": 0.42483090335187135, "flos": 22383968976000.0, "grad_norm": 1.3603584040310257, "language_loss": 0.73052561, "learning_rate": 2.5757341642053725e-06, "loss": 0.75165868, "num_input_tokens_seen": 151678040, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.41992188, "step": 7066, "time_per_iteration": 2.4468142986297607 }, { "auxiliary_loss_clip": 0.01065623, "auxiliary_loss_mlp": 0.01053483, "balance_loss_clip": 1.02238166, "balance_loss_mlp": 1.02180481, "epoch": 0.4248910266045393, "flos": 21355881903360.0, "grad_norm": 1.9969188210026327, "language_loss": 0.81012046, "learning_rate": 2.5753611773780745e-06, "loss": 0.83131158, "num_input_tokens_seen": 151696410, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.4375, "step": 7067, "time_per_iteration": 2.3843860626220703 }, { "auxiliary_loss_clip": 0.01015624, "auxiliary_loss_mlp": 0.0100753, "balance_loss_clip": 1.00435925, "balance_loss_mlp": 1.00684357, "epoch": 0.4249511498572073, "flos": 64004976086400.0, "grad_norm": 0.9175460588093126, "language_loss": 0.63614625, "learning_rate": 2.574988168733022e-06, "loss": 0.65637779, "num_input_tokens_seen": 151756365, "router_z_loss_clip": 0.03173828, "router_z_loss_mlp": 0.08789062, "step": 7068, "time_per_iteration": 2.985718011856079 }, { "auxiliary_loss_clip": 0.01062157, "auxiliary_loss_mlp": 0.01045676, "balance_loss_clip": 1.01546812, "balance_loss_mlp": 1.01923656, "epoch": 0.42501127310987524, "flos": 19606104783360.0, "grad_norm": 1.8443240715664695, "language_loss": 0.74089432, "learning_rate": 2.574615138284361e-06, "loss": 0.76197267, "num_input_tokens_seen": 151775165, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.4296875, "step": 7069, "time_per_iteration": 2.3807029724121094 }, { "auxiliary_loss_clip": 0.01063208, "auxiliary_loss_mlp": 0.01052373, "balance_loss_clip": 1.02065122, "balance_loss_mlp": 1.02005816, "epoch": 0.4250713963625432, "flos": 19461354819840.0, "grad_norm": 1.964308186991792, "language_loss": 0.80826068, "learning_rate": 2.5742420860462364e-06, "loss": 0.82941651, "num_input_tokens_seen": 151792620, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.43164062, "step": 7070, "time_per_iteration": 2.419203996658325 }, { "auxiliary_loss_clip": 0.01060628, "auxiliary_loss_mlp": 0.01044638, "balance_loss_clip": 1.01490688, "balance_loss_mlp": 1.01827979, "epoch": 0.4251315196152112, "flos": 25336538945280.0, "grad_norm": 1.9562490555233887, "language_loss": 0.71506941, "learning_rate": 2.573869012032795e-06, "loss": 0.73612207, "num_input_tokens_seen": 151812850, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.42382812, "step": 7071, "time_per_iteration": 2.421995162963867 }, { "auxiliary_loss_clip": 0.01059944, "auxiliary_loss_mlp": 0.01046108, "balance_loss_clip": 1.01714027, "balance_loss_mlp": 1.01812482, "epoch": 0.42519164286787914, "flos": 26357992859520.0, "grad_norm": 2.4142923390016158, "language_loss": 0.73412502, "learning_rate": 2.5734959162581824e-06, "loss": 0.75518548, "num_input_tokens_seen": 151831785, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41796875, "step": 7072, "time_per_iteration": 2.451519727706909 }, { "auxiliary_loss_clip": 0.01062465, "auxiliary_loss_mlp": 0.01046591, "balance_loss_clip": 1.01763546, "balance_loss_mlp": 1.0194397, "epoch": 0.4252517661205471, "flos": 26029879102080.0, "grad_norm": 1.6702541698407491, "language_loss": 0.83087534, "learning_rate": 2.5731227987365475e-06, "loss": 0.8519659, "num_input_tokens_seen": 151853885, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4296875, "step": 7073, "time_per_iteration": 2.439826488494873 }, { "auxiliary_loss_clip": 0.01058865, "auxiliary_loss_mlp": 0.01042011, "balance_loss_clip": 1.0133884, "balance_loss_mlp": 1.01802742, "epoch": 0.42531188937321507, "flos": 12712818234240.0, "grad_norm": 2.2250197823485287, "language_loss": 0.93070495, "learning_rate": 2.5727496594820386e-06, "loss": 0.95171368, "num_input_tokens_seen": 151871780, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.40820312, "step": 7074, "time_per_iteration": 2.375411033630371 }, { "auxiliary_loss_clip": 0.0106205, "auxiliary_loss_mlp": 0.01042703, "balance_loss_clip": 1.01188707, "balance_loss_mlp": 1.01801777, "epoch": 0.42537201262588303, "flos": 22090558976640.0, "grad_norm": 1.694292793572092, "language_loss": 0.64918876, "learning_rate": 2.572376498508805e-06, "loss": 0.67023629, "num_input_tokens_seen": 151891600, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.44140625, "step": 7075, "time_per_iteration": 2.396697998046875 }, { "auxiliary_loss_clip": 0.01058887, "auxiliary_loss_mlp": 0.01040959, "balance_loss_clip": 1.01429224, "balance_loss_mlp": 1.01906228, "epoch": 0.42543213587855105, "flos": 23002001596800.0, "grad_norm": 1.598573831565425, "language_loss": 0.75430012, "learning_rate": 2.5720033158309973e-06, "loss": 0.7752986, "num_input_tokens_seen": 151911330, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.3984375, "step": 7076, "time_per_iteration": 2.4554989337921143 }, { "auxiliary_loss_clip": 0.01063253, "auxiliary_loss_mlp": 0.01044711, "balance_loss_clip": 1.01400328, "balance_loss_mlp": 1.01862991, "epoch": 0.425492259131219, "flos": 25081288928640.0, "grad_norm": 1.7186593960114798, "language_loss": 0.80787182, "learning_rate": 2.571630111462766e-06, "loss": 0.82895148, "num_input_tokens_seen": 151930355, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.4453125, "step": 7077, "time_per_iteration": 2.401122808456421 }, { "auxiliary_loss_clip": 0.0105808, "auxiliary_loss_mlp": 0.01043299, "balance_loss_clip": 1.01870573, "balance_loss_mlp": 1.01858425, "epoch": 0.425552382383887, "flos": 22815844894080.0, "grad_norm": 1.5438464987871954, "language_loss": 0.74048656, "learning_rate": 2.571256885418265e-06, "loss": 0.76150036, "num_input_tokens_seen": 151949695, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.39453125, "step": 7078, "time_per_iteration": 2.5226354598999023 }, { "auxiliary_loss_clip": 0.01062239, "auxiliary_loss_mlp": 0.01046344, "balance_loss_clip": 1.01946187, "balance_loss_mlp": 1.021384, "epoch": 0.42561250563655495, "flos": 13552723745280.0, "grad_norm": 2.419177192250445, "language_loss": 0.81715393, "learning_rate": 2.5708836377116445e-06, "loss": 0.83823973, "num_input_tokens_seen": 151967640, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40820312, "step": 7079, "time_per_iteration": 2.3789761066436768 }, { "auxiliary_loss_clip": 0.0106202, "auxiliary_loss_mlp": 0.01039922, "balance_loss_clip": 1.0137198, "balance_loss_mlp": 1.0214057, "epoch": 0.4256726288892229, "flos": 46976404780800.0, "grad_norm": 1.3535396153281427, "language_loss": 0.72908199, "learning_rate": 2.5705103683570592e-06, "loss": 0.75010139, "num_input_tokens_seen": 151994020, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40625, "step": 7080, "time_per_iteration": 2.668379306793213 }, { "auxiliary_loss_clip": 0.01061495, "auxiliary_loss_mlp": 0.01047877, "balance_loss_clip": 1.02225852, "balance_loss_mlp": 1.02045369, "epoch": 0.4257327521418909, "flos": 23585330459520.0, "grad_norm": 2.135285399700532, "language_loss": 0.81375611, "learning_rate": 2.5701370773686646e-06, "loss": 0.83484983, "num_input_tokens_seen": 152013415, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.41015625, "step": 7081, "time_per_iteration": 2.388793468475342 }, { "auxiliary_loss_clip": 0.01059828, "auxiliary_loss_mlp": 0.01042503, "balance_loss_clip": 1.01662314, "balance_loss_mlp": 1.02069521, "epoch": 0.42579287539455885, "flos": 18988979857920.0, "grad_norm": 1.6297901112165836, "language_loss": 0.82100552, "learning_rate": 2.5697637647606138e-06, "loss": 0.84202886, "num_input_tokens_seen": 152030860, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.390625, "step": 7082, "time_per_iteration": 2.396122932434082 }, { "auxiliary_loss_clip": 0.01062068, "auxiliary_loss_mlp": 0.01049833, "balance_loss_clip": 1.02220047, "balance_loss_mlp": 1.0212698, "epoch": 0.4258529986472268, "flos": 25190741640960.0, "grad_norm": 2.0556936076843515, "language_loss": 0.71151543, "learning_rate": 2.569390430547065e-06, "loss": 0.73263443, "num_input_tokens_seen": 152050395, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40625, "step": 7083, "time_per_iteration": 2.4003679752349854 }, { "auxiliary_loss_clip": 0.01014954, "auxiliary_loss_mlp": 0.01005378, "balance_loss_clip": 1.00175393, "balance_loss_mlp": 1.00656188, "epoch": 0.4259131218998948, "flos": 69964614524160.0, "grad_norm": 0.8774692434108863, "language_loss": 0.6715073, "learning_rate": 2.569017074742173e-06, "loss": 0.69171059, "num_input_tokens_seen": 152113555, "router_z_loss_clip": 0.03613281, "router_z_loss_mlp": 0.08398438, "step": 7084, "time_per_iteration": 3.105396270751953 }, { "auxiliary_loss_clip": 0.01063076, "auxiliary_loss_mlp": 0.01047029, "balance_loss_clip": 1.01853848, "balance_loss_mlp": 1.02255225, "epoch": 0.42597324515256274, "flos": 18003975269760.0, "grad_norm": 1.904409565292695, "language_loss": 0.79889947, "learning_rate": 2.5686436973600964e-06, "loss": 0.82000053, "num_input_tokens_seen": 152131575, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40625, "step": 7085, "time_per_iteration": 2.5312464237213135 }, { "auxiliary_loss_clip": 0.01066105, "auxiliary_loss_mlp": 0.01050729, "balance_loss_clip": 1.02029479, "balance_loss_mlp": 1.02152443, "epoch": 0.4260333684052307, "flos": 15157890547200.0, "grad_norm": 1.9391301518000674, "language_loss": 0.77232021, "learning_rate": 2.568270298414995e-06, "loss": 0.79348856, "num_input_tokens_seen": 152149435, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.4453125, "step": 7086, "time_per_iteration": 2.4556844234466553 }, { "auxiliary_loss_clip": 0.01064217, "auxiliary_loss_mlp": 0.01046835, "balance_loss_clip": 1.0195483, "balance_loss_mlp": 1.02262855, "epoch": 0.42609349165789867, "flos": 14938461452160.0, "grad_norm": 1.7527787026326156, "language_loss": 0.81814086, "learning_rate": 2.5678968779210255e-06, "loss": 0.8392514, "num_input_tokens_seen": 152166860, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41601562, "step": 7087, "time_per_iteration": 2.350301742553711 }, { "auxiliary_loss_clip": 0.01062127, "auxiliary_loss_mlp": 0.01044841, "balance_loss_clip": 1.01817369, "balance_loss_mlp": 1.02112556, "epoch": 0.42615361491056664, "flos": 23730848472960.0, "grad_norm": 2.0794903303860983, "language_loss": 0.6773566, "learning_rate": 2.5675234358923505e-06, "loss": 0.69842637, "num_input_tokens_seen": 152187475, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.41015625, "step": 7088, "time_per_iteration": 2.48347806930542 }, { "auxiliary_loss_clip": 0.01061464, "auxiliary_loss_mlp": 0.01050458, "balance_loss_clip": 1.02366018, "balance_loss_mlp": 1.01881576, "epoch": 0.42621373816323466, "flos": 24935282156160.0, "grad_norm": 1.8877935459084403, "language_loss": 0.69795668, "learning_rate": 2.56714997234313e-06, "loss": 0.71907592, "num_input_tokens_seen": 152207235, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.42578125, "step": 7089, "time_per_iteration": 2.410597085952759 }, { "auxiliary_loss_clip": 0.01061285, "auxiliary_loss_mlp": 0.01044185, "balance_loss_clip": 1.01757789, "balance_loss_mlp": 1.01907182, "epoch": 0.4262738614159026, "flos": 13552130252160.0, "grad_norm": 3.1986335662398906, "language_loss": 0.76215214, "learning_rate": 2.566776487287525e-06, "loss": 0.78320682, "num_input_tokens_seen": 152224240, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.421875, "step": 7090, "time_per_iteration": 2.3808205127716064 }, { "auxiliary_loss_clip": 0.01063235, "auxiliary_loss_mlp": 0.01049118, "balance_loss_clip": 1.0208652, "balance_loss_mlp": 1.01976097, "epoch": 0.4263339846685706, "flos": 29747605628160.0, "grad_norm": 1.9111605278408594, "language_loss": 0.76090419, "learning_rate": 2.5664029807396994e-06, "loss": 0.78202772, "num_input_tokens_seen": 152242595, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.43359375, "step": 7091, "time_per_iteration": 2.426913022994995 }, { "auxiliary_loss_clip": 0.01056609, "auxiliary_loss_mlp": 0.01045436, "balance_loss_clip": 1.02147496, "balance_loss_mlp": 1.01806998, "epoch": 0.42639410792123855, "flos": 16833337649280.0, "grad_norm": 1.6738369077042294, "language_loss": 0.84012449, "learning_rate": 2.5660294527138156e-06, "loss": 0.86114502, "num_input_tokens_seen": 152260840, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.38476562, "step": 7092, "time_per_iteration": 2.4408488273620605 }, { "auxiliary_loss_clip": 0.01062806, "auxiliary_loss_mlp": 0.01049811, "balance_loss_clip": 1.02081919, "balance_loss_mlp": 1.01854312, "epoch": 0.4264542311739065, "flos": 28761972635520.0, "grad_norm": 1.5659470454292643, "language_loss": 0.75080383, "learning_rate": 2.565655903224038e-06, "loss": 0.77192998, "num_input_tokens_seen": 152280580, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.44140625, "step": 7093, "time_per_iteration": 2.49782133102417 }, { "auxiliary_loss_clip": 0.01060441, "auxiliary_loss_mlp": 0.01046648, "balance_loss_clip": 1.01746607, "balance_loss_mlp": 1.01952362, "epoch": 0.4265143544265745, "flos": 24712571393280.0, "grad_norm": 2.1937760679972853, "language_loss": 0.71988213, "learning_rate": 2.565282332284532e-06, "loss": 0.74095297, "num_input_tokens_seen": 152298455, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.41015625, "step": 7094, "time_per_iteration": 2.462940216064453 }, { "auxiliary_loss_clip": 0.01060849, "auxiliary_loss_mlp": 0.01048355, "balance_loss_clip": 1.02047205, "balance_loss_mlp": 1.01934028, "epoch": 0.42657447767924245, "flos": 21865055304960.0, "grad_norm": 2.2828912983415184, "language_loss": 0.83650643, "learning_rate": 2.564908739909464e-06, "loss": 0.85759842, "num_input_tokens_seen": 152316995, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41601562, "step": 7095, "time_per_iteration": 2.4193615913391113 }, { "auxiliary_loss_clip": 0.01061816, "auxiliary_loss_mlp": 0.01052013, "balance_loss_clip": 1.02311683, "balance_loss_mlp": 1.01983249, "epoch": 0.4266346009319104, "flos": 21469174865280.0, "grad_norm": 1.722246782821041, "language_loss": 0.82179749, "learning_rate": 2.5645351261129996e-06, "loss": 0.8429358, "num_input_tokens_seen": 152334800, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41992188, "step": 7096, "time_per_iteration": 2.4393861293792725 }, { "auxiliary_loss_clip": 0.0106395, "auxiliary_loss_mlp": 0.0104862, "balance_loss_clip": 1.02098739, "balance_loss_mlp": 1.02017319, "epoch": 0.4266947241845784, "flos": 25518226993920.0, "grad_norm": 1.8654285385093228, "language_loss": 0.66829956, "learning_rate": 2.5641614909093066e-06, "loss": 0.68942529, "num_input_tokens_seen": 152355175, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.4375, "step": 7097, "time_per_iteration": 4.726633310317993 }, { "auxiliary_loss_clip": 0.01059198, "auxiliary_loss_mlp": 0.0104365, "balance_loss_clip": 1.01687574, "balance_loss_mlp": 1.01845396, "epoch": 0.42675484743724634, "flos": 26540030021760.0, "grad_norm": 1.6238429899422295, "language_loss": 0.75274056, "learning_rate": 2.5637878343125535e-06, "loss": 0.77376902, "num_input_tokens_seen": 152377245, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.40820312, "step": 7098, "time_per_iteration": 3.9148941040039062 }, { "auxiliary_loss_clip": 0.01058527, "auxiliary_loss_mlp": 0.01043573, "balance_loss_clip": 1.01597643, "balance_loss_mlp": 1.0183754, "epoch": 0.4268149706899143, "flos": 23111593954560.0, "grad_norm": 1.6479332541172997, "language_loss": 0.76729417, "learning_rate": 2.5634141563369086e-06, "loss": 0.78831518, "num_input_tokens_seen": 152396985, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.40234375, "step": 7099, "time_per_iteration": 3.7686681747436523 }, { "auxiliary_loss_clip": 0.01062218, "auxiliary_loss_mlp": 0.01047503, "balance_loss_clip": 1.01736653, "balance_loss_mlp": 1.01984274, "epoch": 0.4268750939425823, "flos": 22705554309120.0, "grad_norm": 2.0361918164121526, "language_loss": 0.84480894, "learning_rate": 2.5630404569965432e-06, "loss": 0.86590618, "num_input_tokens_seen": 152415590, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.42382812, "step": 7100, "time_per_iteration": 2.459355354309082 }, { "auxiliary_loss_clip": 0.01061399, "auxiliary_loss_mlp": 0.01041649, "balance_loss_clip": 1.01424277, "balance_loss_mlp": 1.01882601, "epoch": 0.42693521719525024, "flos": 25373686498560.0, "grad_norm": 1.3548182291888482, "language_loss": 0.82980454, "learning_rate": 2.562666736305627e-06, "loss": 0.85083508, "num_input_tokens_seen": 152436735, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.42578125, "step": 7101, "time_per_iteration": 2.4331889152526855 }, { "auxiliary_loss_clip": 0.01064775, "auxiliary_loss_mlp": 0.01046256, "balance_loss_clip": 1.01701379, "balance_loss_mlp": 1.0209738, "epoch": 0.42699534044791826, "flos": 18149702751360.0, "grad_norm": 1.8121704532004443, "language_loss": 0.74265373, "learning_rate": 2.5622929942783314e-06, "loss": 0.76376402, "num_input_tokens_seen": 152455685, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4375, "step": 7102, "time_per_iteration": 2.423921585083008 }, { "auxiliary_loss_clip": 0.01060683, "auxiliary_loss_mlp": 0.01043843, "balance_loss_clip": 1.01479173, "balance_loss_mlp": 1.01936769, "epoch": 0.4270554637005862, "flos": 13697578442880.0, "grad_norm": 1.7522531121775522, "language_loss": 0.85040694, "learning_rate": 2.5619192309288297e-06, "loss": 0.87145221, "num_input_tokens_seen": 152473500, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.4140625, "step": 7103, "time_per_iteration": 2.3918521404266357 }, { "auxiliary_loss_clip": 0.01063273, "auxiliary_loss_mlp": 0.01046311, "balance_loss_clip": 1.01700974, "balance_loss_mlp": 1.01947224, "epoch": 0.4271155869532542, "flos": 17492637363840.0, "grad_norm": 2.2472432002383935, "language_loss": 0.74861735, "learning_rate": 2.561545446271294e-06, "loss": 0.76971316, "num_input_tokens_seen": 152491320, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4375, "step": 7104, "time_per_iteration": 3.892261028289795 }, { "auxiliary_loss_clip": 0.01061929, "auxiliary_loss_mlp": 0.01033556, "balance_loss_clip": 1.00860524, "balance_loss_mlp": 1.01997232, "epoch": 0.42717571020592215, "flos": 32450930334720.0, "grad_norm": 2.1509854232229855, "language_loss": 0.77646971, "learning_rate": 2.5611716403198987e-06, "loss": 0.79742444, "num_input_tokens_seen": 152511970, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.41992188, "step": 7105, "time_per_iteration": 2.52921199798584 }, { "auxiliary_loss_clip": 0.01062473, "auxiliary_loss_mlp": 0.01041721, "balance_loss_clip": 1.01439786, "balance_loss_mlp": 1.0203737, "epoch": 0.4272358334585901, "flos": 16252138379520.0, "grad_norm": 1.8074749579327214, "language_loss": 0.78500015, "learning_rate": 2.560797813088819e-06, "loss": 0.80604208, "num_input_tokens_seen": 152530515, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.421875, "step": 7106, "time_per_iteration": 2.472628116607666 }, { "auxiliary_loss_clip": 0.01061019, "auxiliary_loss_mlp": 0.01047547, "balance_loss_clip": 1.01897264, "balance_loss_mlp": 1.01906157, "epoch": 0.4272959567112581, "flos": 24199138805760.0, "grad_norm": 1.7775745604894322, "language_loss": 0.82101858, "learning_rate": 2.560423964592229e-06, "loss": 0.8421042, "num_input_tokens_seen": 152549295, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41992188, "step": 7107, "time_per_iteration": 2.4683425426483154 }, { "auxiliary_loss_clip": 0.01061762, "auxiliary_loss_mlp": 0.010461, "balance_loss_clip": 1.01640475, "balance_loss_mlp": 1.02129269, "epoch": 0.42735607996392605, "flos": 27962286877440.0, "grad_norm": 1.4157054898646022, "language_loss": 0.68688524, "learning_rate": 2.5600500948443075e-06, "loss": 0.70796388, "num_input_tokens_seen": 152570725, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.40429688, "step": 7108, "time_per_iteration": 2.5705130100250244 }, { "auxiliary_loss_clip": 0.01061002, "auxiliary_loss_mlp": 0.01048554, "balance_loss_clip": 1.02077818, "balance_loss_mlp": 1.01825678, "epoch": 0.427416203216594, "flos": 20294766817920.0, "grad_norm": 1.6961118895184641, "language_loss": 0.73211497, "learning_rate": 2.5596762038592294e-06, "loss": 0.75321054, "num_input_tokens_seen": 152588950, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.42773438, "step": 7109, "time_per_iteration": 2.4466779232025146 }, { "auxiliary_loss_clip": 0.01061423, "auxiliary_loss_mlp": 0.01049083, "balance_loss_clip": 1.01777923, "balance_loss_mlp": 1.01841557, "epoch": 0.427476326469262, "flos": 26942718176640.0, "grad_norm": 1.8632046963716977, "language_loss": 0.65896255, "learning_rate": 2.559302291651174e-06, "loss": 0.68006766, "num_input_tokens_seen": 152608965, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.4296875, "step": 7110, "time_per_iteration": 2.489095449447632 }, { "auxiliary_loss_clip": 0.01060198, "auxiliary_loss_mlp": 0.01048181, "balance_loss_clip": 1.0189991, "balance_loss_mlp": 1.01820326, "epoch": 0.42753644972192995, "flos": 25701660610560.0, "grad_norm": 1.7535767076756146, "language_loss": 0.77772814, "learning_rate": 2.5589283582343197e-06, "loss": 0.79881191, "num_input_tokens_seen": 152630220, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.41992188, "step": 7111, "time_per_iteration": 2.4730191230773926 }, { "auxiliary_loss_clip": 0.01061006, "auxiliary_loss_mlp": 0.01041131, "balance_loss_clip": 1.01358175, "balance_loss_mlp": 1.0191921, "epoch": 0.4275965729745979, "flos": 18766513474560.0, "grad_norm": 1.9849111933941892, "language_loss": 0.74744594, "learning_rate": 2.558554403622845e-06, "loss": 0.76846743, "num_input_tokens_seen": 152648835, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41796875, "step": 7112, "time_per_iteration": 2.504270315170288 }, { "auxiliary_loss_clip": 0.01058503, "auxiliary_loss_mlp": 0.01044157, "balance_loss_clip": 1.01754975, "balance_loss_mlp": 1.01816535, "epoch": 0.4276566962272659, "flos": 23763422638080.0, "grad_norm": 1.5414804027061388, "language_loss": 0.72324634, "learning_rate": 2.5581804278309323e-06, "loss": 0.74427295, "num_input_tokens_seen": 152668375, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40234375, "step": 7113, "time_per_iteration": 2.412998914718628 }, { "auxiliary_loss_clip": 0.01062782, "auxiliary_loss_mlp": 0.01056898, "balance_loss_clip": 1.02692854, "balance_loss_mlp": 1.02003765, "epoch": 0.42771681947993384, "flos": 22491396829440.0, "grad_norm": 1.6227575222862194, "language_loss": 0.6312803, "learning_rate": 2.5578064308727617e-06, "loss": 0.65247715, "num_input_tokens_seen": 152689725, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.42773438, "step": 7114, "time_per_iteration": 2.567538022994995 }, { "auxiliary_loss_clip": 0.01063329, "auxiliary_loss_mlp": 0.01051805, "balance_loss_clip": 1.02016711, "balance_loss_mlp": 1.0188458, "epoch": 0.42777694273260186, "flos": 25043582793600.0, "grad_norm": 1.547263451445419, "language_loss": 0.65540814, "learning_rate": 2.5574324127625153e-06, "loss": 0.67655945, "num_input_tokens_seen": 152709375, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.4453125, "step": 7115, "time_per_iteration": 2.42500901222229 }, { "auxiliary_loss_clip": 0.01058509, "auxiliary_loss_mlp": 0.01046475, "balance_loss_clip": 1.02034485, "balance_loss_mlp": 1.0169549, "epoch": 0.4278370659852698, "flos": 18660516986880.0, "grad_norm": 1.4779445658233834, "language_loss": 0.74453264, "learning_rate": 2.5570583735143753e-06, "loss": 0.7655825, "num_input_tokens_seen": 152727510, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.41601562, "step": 7116, "time_per_iteration": 2.434450387954712 }, { "auxiliary_loss_clip": 0.01058403, "auxiliary_loss_mlp": 0.01043095, "balance_loss_clip": 1.0175606, "balance_loss_mlp": 1.01825488, "epoch": 0.4278971892379378, "flos": 27307036880640.0, "grad_norm": 1.592921566738255, "language_loss": 0.7042945, "learning_rate": 2.5566843131425275e-06, "loss": 0.72530949, "num_input_tokens_seen": 152746670, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.40234375, "step": 7117, "time_per_iteration": 2.4244508743286133 }, { "auxiliary_loss_clip": 0.01061835, "auxiliary_loss_mlp": 0.01046854, "balance_loss_clip": 1.01706374, "balance_loss_mlp": 1.01961851, "epoch": 0.42795731249060576, "flos": 12888082592640.0, "grad_norm": 2.9072234490366187, "language_loss": 0.71343875, "learning_rate": 2.5563102316611536e-06, "loss": 0.73452562, "num_input_tokens_seen": 152760545, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.421875, "step": 7118, "time_per_iteration": 2.3739075660705566 }, { "auxiliary_loss_clip": 0.01058772, "auxiliary_loss_mlp": 0.0104573, "balance_loss_clip": 1.01666677, "balance_loss_mlp": 1.01742363, "epoch": 0.4280174357432737, "flos": 33400044178560.0, "grad_norm": 1.8259791501328313, "language_loss": 0.75648296, "learning_rate": 2.55593612908444e-06, "loss": 0.77752799, "num_input_tokens_seen": 152780970, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.4140625, "step": 7119, "time_per_iteration": 2.484738826751709 }, { "auxiliary_loss_clip": 0.01060439, "auxiliary_loss_mlp": 0.01047697, "balance_loss_clip": 1.01980233, "balance_loss_mlp": 1.01857078, "epoch": 0.4280775589959417, "flos": 18258143034240.0, "grad_norm": 1.7700519237003003, "language_loss": 0.75620431, "learning_rate": 2.555562005426573e-06, "loss": 0.7772857, "num_input_tokens_seen": 152798475, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41796875, "step": 7120, "time_per_iteration": 2.412067174911499 }, { "auxiliary_loss_clip": 0.01059003, "auxiliary_loss_mlp": 0.01044439, "balance_loss_clip": 1.01774788, "balance_loss_mlp": 1.01814771, "epoch": 0.42813768224860965, "flos": 21470187294720.0, "grad_norm": 1.5302823349098866, "language_loss": 0.77668488, "learning_rate": 2.5551878607017385e-06, "loss": 0.79771924, "num_input_tokens_seen": 152817555, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.41015625, "step": 7121, "time_per_iteration": 2.3772034645080566 }, { "auxiliary_loss_clip": 0.01060078, "auxiliary_loss_mlp": 0.01038425, "balance_loss_clip": 1.01489282, "balance_loss_mlp": 1.01925576, "epoch": 0.4281978055012776, "flos": 15668355669120.0, "grad_norm": 2.287238292921327, "language_loss": 0.87033945, "learning_rate": 2.554813694924126e-06, "loss": 0.89132446, "num_input_tokens_seen": 152836295, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.40820312, "step": 7122, "time_per_iteration": 2.3753767013549805 }, { "auxiliary_loss_clip": 0.01058566, "auxiliary_loss_mlp": 0.01040699, "balance_loss_clip": 1.01590323, "balance_loss_mlp": 1.01916027, "epoch": 0.4282579287539456, "flos": 17711054029440.0, "grad_norm": 1.7620258611752178, "language_loss": 0.82466829, "learning_rate": 2.554439508107921e-06, "loss": 0.84566092, "num_input_tokens_seen": 152854950, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.39453125, "step": 7123, "time_per_iteration": 2.3760831356048584 }, { "auxiliary_loss_clip": 0.01058911, "auxiliary_loss_mlp": 0.01045342, "balance_loss_clip": 1.01821017, "balance_loss_mlp": 1.01853704, "epoch": 0.42831805200661355, "flos": 19280155530240.0, "grad_norm": 1.516239405562046, "language_loss": 0.81649786, "learning_rate": 2.5540653002673153e-06, "loss": 0.83754039, "num_input_tokens_seen": 152873995, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40234375, "step": 7124, "time_per_iteration": 2.3926174640655518 }, { "auxiliary_loss_clip": 0.01060774, "auxiliary_loss_mlp": 0.0105078, "balance_loss_clip": 1.01986945, "balance_loss_mlp": 1.0190258, "epoch": 0.4283781752592815, "flos": 19791598170240.0, "grad_norm": 1.8247720408406791, "language_loss": 0.81441242, "learning_rate": 2.553691071416498e-06, "loss": 0.8355279, "num_input_tokens_seen": 152892925, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.41796875, "step": 7125, "time_per_iteration": 2.377126932144165 }, { "auxiliary_loss_clip": 0.01060287, "auxiliary_loss_mlp": 0.01041036, "balance_loss_clip": 1.01557243, "balance_loss_mlp": 1.02008247, "epoch": 0.4284382985119495, "flos": 16507144016640.0, "grad_norm": 1.823112887700415, "language_loss": 0.75970936, "learning_rate": 2.553316821569659e-06, "loss": 0.7807225, "num_input_tokens_seen": 152910935, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40234375, "step": 7126, "time_per_iteration": 2.3913590908050537 }, { "auxiliary_loss_clip": 0.01059152, "auxiliary_loss_mlp": 0.01045574, "balance_loss_clip": 1.01784647, "balance_loss_mlp": 1.01850414, "epoch": 0.42849842176461744, "flos": 23329661506560.0, "grad_norm": 1.5858936275459181, "language_loss": 0.82768387, "learning_rate": 2.5529425507409913e-06, "loss": 0.84873116, "num_input_tokens_seen": 152931030, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40625, "step": 7127, "time_per_iteration": 2.4677815437316895 }, { "auxiliary_loss_clip": 0.010599, "auxiliary_loss_mlp": 0.01042765, "balance_loss_clip": 1.01557362, "balance_loss_mlp": 1.01836705, "epoch": 0.4285585450172854, "flos": 17273487559680.0, "grad_norm": 2.105098437501118, "language_loss": 0.76515102, "learning_rate": 2.5525682589446867e-06, "loss": 0.78617764, "num_input_tokens_seen": 152948085, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.4140625, "step": 7128, "time_per_iteration": 2.3807613849639893 }, { "auxiliary_loss_clip": 0.01062259, "auxiliary_loss_mlp": 0.01043738, "balance_loss_clip": 1.01474667, "balance_loss_mlp": 1.01887703, "epoch": 0.42861866826995343, "flos": 24278461148160.0, "grad_norm": 2.040571365091949, "language_loss": 0.74581587, "learning_rate": 2.552193946194937e-06, "loss": 0.76687574, "num_input_tokens_seen": 152966265, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.43359375, "step": 7129, "time_per_iteration": 2.392146587371826 }, { "auxiliary_loss_clip": 0.01062202, "auxiliary_loss_mlp": 0.01041078, "balance_loss_clip": 1.01305199, "balance_loss_mlp": 1.02048707, "epoch": 0.4286787915226214, "flos": 24351953293440.0, "grad_norm": 1.6331817519804848, "language_loss": 0.79797363, "learning_rate": 2.5518196125059394e-06, "loss": 0.81900644, "num_input_tokens_seen": 152986775, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41796875, "step": 7130, "time_per_iteration": 2.4554696083068848 }, { "auxiliary_loss_clip": 0.01064477, "auxiliary_loss_mlp": 0.01044237, "balance_loss_clip": 1.0155549, "balance_loss_mlp": 1.02146149, "epoch": 0.42873891477528936, "flos": 15449101130880.0, "grad_norm": 1.9166529364148959, "language_loss": 0.7500149, "learning_rate": 2.551445257891886e-06, "loss": 0.77110207, "num_input_tokens_seen": 153003595, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.4296875, "step": 7131, "time_per_iteration": 2.364858627319336 }, { "auxiliary_loss_clip": 0.0106088, "auxiliary_loss_mlp": 0.01043958, "balance_loss_clip": 1.01617062, "balance_loss_mlp": 1.01890492, "epoch": 0.4287990380279573, "flos": 17638609224960.0, "grad_norm": 2.435784632563269, "language_loss": 0.7900629, "learning_rate": 2.551070882366973e-06, "loss": 0.81111127, "num_input_tokens_seen": 153021960, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41992188, "step": 7132, "time_per_iteration": 2.4228222370147705 }, { "auxiliary_loss_clip": 0.01062946, "auxiliary_loss_mlp": 0.01045109, "balance_loss_clip": 1.0167129, "balance_loss_mlp": 1.02145684, "epoch": 0.4288591612806253, "flos": 27161099930880.0, "grad_norm": 1.576375223862875, "language_loss": 0.7978583, "learning_rate": 2.550696485945397e-06, "loss": 0.81893885, "num_input_tokens_seen": 153042110, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4140625, "step": 7133, "time_per_iteration": 2.5782976150512695 }, { "auxiliary_loss_clip": 0.01061538, "auxiliary_loss_mlp": 0.01046652, "balance_loss_clip": 1.01936507, "balance_loss_mlp": 1.01898241, "epoch": 0.42891928453329325, "flos": 17162289279360.0, "grad_norm": 1.9201273333221922, "language_loss": 0.7650423, "learning_rate": 2.550322068641355e-06, "loss": 0.78612411, "num_input_tokens_seen": 153058925, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.42578125, "step": 7134, "time_per_iteration": 2.433131217956543 }, { "auxiliary_loss_clip": 0.01056983, "auxiliary_loss_mlp": 0.01041437, "balance_loss_clip": 1.01631927, "balance_loss_mlp": 1.01750576, "epoch": 0.4289794077859612, "flos": 18186047343360.0, "grad_norm": 1.7525280747535679, "language_loss": 0.85356522, "learning_rate": 2.5499476304690455e-06, "loss": 0.87454945, "num_input_tokens_seen": 153078070, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.39648438, "step": 7135, "time_per_iteration": 2.4047629833221436 }, { "auxiliary_loss_clip": 0.01056205, "auxiliary_loss_mlp": 0.01038329, "balance_loss_clip": 1.01255584, "balance_loss_mlp": 1.01750565, "epoch": 0.4290395310386292, "flos": 28255627054080.0, "grad_norm": 1.8399752202663024, "language_loss": 0.77339071, "learning_rate": 2.549573171442666e-06, "loss": 0.79433608, "num_input_tokens_seen": 153096680, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38671875, "step": 7136, "time_per_iteration": 2.4801571369171143 }, { "auxiliary_loss_clip": 0.010579, "auxiliary_loss_mlp": 0.01044859, "balance_loss_clip": 1.01776242, "balance_loss_mlp": 1.01652992, "epoch": 0.42909965429129715, "flos": 16215165383040.0, "grad_norm": 1.9659011682676815, "language_loss": 0.81303084, "learning_rate": 2.5491986915764175e-06, "loss": 0.83405846, "num_input_tokens_seen": 153113305, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.4140625, "step": 7137, "time_per_iteration": 4.447194576263428 }, { "auxiliary_loss_clip": 0.01061197, "auxiliary_loss_mlp": 0.01043331, "balance_loss_clip": 1.0155673, "balance_loss_mlp": 1.01933408, "epoch": 0.4291597775439651, "flos": 23111733600000.0, "grad_norm": 1.9829509210855931, "language_loss": 0.77919817, "learning_rate": 2.548824190884499e-06, "loss": 0.8002435, "num_input_tokens_seen": 153132735, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41796875, "step": 7138, "time_per_iteration": 3.839500904083252 }, { "auxiliary_loss_clip": 0.01011458, "auxiliary_loss_mlp": 0.01005674, "balance_loss_clip": 1.00197804, "balance_loss_mlp": 1.00279164, "epoch": 0.4292199007966331, "flos": 67543004511360.0, "grad_norm": 0.7758363582394228, "language_loss": 0.56287479, "learning_rate": 2.548449669381113e-06, "loss": 0.58304608, "num_input_tokens_seen": 153187925, "router_z_loss_clip": 0.03686523, "router_z_loss_mlp": 0.08691406, "step": 7139, "time_per_iteration": 4.3483545780181885 }, { "auxiliary_loss_clip": 0.01057991, "auxiliary_loss_mlp": 0.01043574, "balance_loss_clip": 1.02064979, "balance_loss_mlp": 1.01921797, "epoch": 0.42928002404930105, "flos": 22998824663040.0, "grad_norm": 1.6347169872899117, "language_loss": 0.81658506, "learning_rate": 2.5480751270804595e-06, "loss": 0.83760071, "num_input_tokens_seen": 153206990, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.38671875, "step": 7140, "time_per_iteration": 2.4201200008392334 }, { "auxiliary_loss_clip": 0.0105867, "auxiliary_loss_mlp": 0.01045357, "balance_loss_clip": 1.01982188, "balance_loss_mlp": 1.01835072, "epoch": 0.429340147301969, "flos": 11544170561280.0, "grad_norm": 1.8816753064993963, "language_loss": 0.83117628, "learning_rate": 2.5477005639967424e-06, "loss": 0.85221654, "num_input_tokens_seen": 153222345, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.40234375, "step": 7141, "time_per_iteration": 2.374645948410034 }, { "auxiliary_loss_clip": 0.01060505, "auxiliary_loss_mlp": 0.01054649, "balance_loss_clip": 1.02715933, "balance_loss_mlp": 1.01884484, "epoch": 0.42940027055463703, "flos": 25263814849920.0, "grad_norm": 1.7134811576447517, "language_loss": 0.87975895, "learning_rate": 2.547325980144166e-06, "loss": 0.9009105, "num_input_tokens_seen": 153240570, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41796875, "step": 7142, "time_per_iteration": 3.8696744441986084 }, { "auxiliary_loss_clip": 0.01056735, "auxiliary_loss_mlp": 0.01040765, "balance_loss_clip": 1.01703024, "balance_loss_mlp": 1.01845837, "epoch": 0.429460393807305, "flos": 23803886770560.0, "grad_norm": 1.9709071273517011, "language_loss": 0.79539752, "learning_rate": 2.5469513755369323e-06, "loss": 0.81637251, "num_input_tokens_seen": 153259575, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3828125, "step": 7143, "time_per_iteration": 2.445742130279541 }, { "auxiliary_loss_clip": 0.01057564, "auxiliary_loss_mlp": 0.01042416, "balance_loss_clip": 1.01648831, "balance_loss_mlp": 1.01793182, "epoch": 0.42952051705997296, "flos": 13917426474240.0, "grad_norm": 2.1595880893253883, "language_loss": 0.78495872, "learning_rate": 2.5465767501892484e-06, "loss": 0.80595851, "num_input_tokens_seen": 153276650, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39648438, "step": 7144, "time_per_iteration": 2.383176803588867 }, { "auxiliary_loss_clip": 0.01058242, "auxiliary_loss_mlp": 0.01042123, "balance_loss_clip": 1.01674318, "balance_loss_mlp": 1.01807857, "epoch": 0.4295806403126409, "flos": 26759179825920.0, "grad_norm": 2.2225051820734776, "language_loss": 0.75637913, "learning_rate": 2.54620210411532e-06, "loss": 0.77738279, "num_input_tokens_seen": 153298025, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40234375, "step": 7145, "time_per_iteration": 2.4220123291015625 }, { "auxiliary_loss_clip": 0.01060058, "auxiliary_loss_mlp": 0.01041441, "balance_loss_clip": 1.01587045, "balance_loss_mlp": 1.01918614, "epoch": 0.4296407635653089, "flos": 20951762382720.0, "grad_norm": 1.9627279618844375, "language_loss": 0.80693102, "learning_rate": 2.545827437329352e-06, "loss": 0.82794607, "num_input_tokens_seen": 153315775, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.40820312, "step": 7146, "time_per_iteration": 2.400641679763794 }, { "auxiliary_loss_clip": 0.01057657, "auxiliary_loss_mlp": 0.01041293, "balance_loss_clip": 1.01565111, "balance_loss_mlp": 1.01829696, "epoch": 0.42970088681797686, "flos": 15851405260800.0, "grad_norm": 2.0803092586269076, "language_loss": 0.84250683, "learning_rate": 2.5454527498455532e-06, "loss": 0.8634963, "num_input_tokens_seen": 153332765, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.39453125, "step": 7147, "time_per_iteration": 2.353229522705078 }, { "auxiliary_loss_clip": 0.01061551, "auxiliary_loss_mlp": 0.01042822, "balance_loss_clip": 1.01598752, "balance_loss_mlp": 1.01940441, "epoch": 0.4297610100706448, "flos": 22381525180800.0, "grad_norm": 1.8943487407854593, "language_loss": 0.88104403, "learning_rate": 2.545078041678131e-06, "loss": 0.90208781, "num_input_tokens_seen": 153350760, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.421875, "step": 7148, "time_per_iteration": 2.4030771255493164 }, { "auxiliary_loss_clip": 0.01058267, "auxiliary_loss_mlp": 0.01043513, "balance_loss_clip": 1.01753712, "balance_loss_mlp": 1.01820922, "epoch": 0.4298211333233128, "flos": 27924510919680.0, "grad_norm": 1.7633007639421525, "language_loss": 0.79594195, "learning_rate": 2.5447033128412957e-06, "loss": 0.81695974, "num_input_tokens_seen": 153370765, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.40039062, "step": 7149, "time_per_iteration": 2.4749510288238525 }, { "auxiliary_loss_clip": 0.01058168, "auxiliary_loss_mlp": 0.01040104, "balance_loss_clip": 1.0145936, "balance_loss_mlp": 1.01839256, "epoch": 0.42988125657598075, "flos": 24424467920640.0, "grad_norm": 1.6130407582481165, "language_loss": 0.81004262, "learning_rate": 2.544328563349256e-06, "loss": 0.83102536, "num_input_tokens_seen": 153390725, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.3984375, "step": 7150, "time_per_iteration": 2.476376533508301 }, { "auxiliary_loss_clip": 0.01062195, "auxiliary_loss_mlp": 0.01047968, "balance_loss_clip": 1.02001309, "balance_loss_mlp": 1.01983654, "epoch": 0.4299413798286487, "flos": 15849310579200.0, "grad_norm": 2.013932123516498, "language_loss": 0.77224398, "learning_rate": 2.5439537932162222e-06, "loss": 0.79334563, "num_input_tokens_seen": 153408010, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.421875, "step": 7151, "time_per_iteration": 2.4122672080993652 }, { "auxiliary_loss_clip": 0.01063376, "auxiliary_loss_mlp": 0.01045113, "balance_loss_clip": 1.01573992, "balance_loss_mlp": 1.02054584, "epoch": 0.4300015030813167, "flos": 22308417060480.0, "grad_norm": 1.9354616669465323, "language_loss": 0.71799934, "learning_rate": 2.543579002456406e-06, "loss": 0.73908424, "num_input_tokens_seen": 153426865, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.42773438, "step": 7152, "time_per_iteration": 2.5281565189361572 }, { "auxiliary_loss_clip": 0.01059217, "auxiliary_loss_mlp": 0.01043961, "balance_loss_clip": 1.01775932, "balance_loss_mlp": 1.01833737, "epoch": 0.43006162633398465, "flos": 34896212115840.0, "grad_norm": 1.5804167269229072, "language_loss": 0.72615719, "learning_rate": 2.54320419108402e-06, "loss": 0.74718904, "num_input_tokens_seen": 153449410, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.41015625, "step": 7153, "time_per_iteration": 2.5258703231811523 }, { "auxiliary_loss_clip": 0.0106078, "auxiliary_loss_mlp": 0.01038173, "balance_loss_clip": 1.01046896, "balance_loss_mlp": 1.01917744, "epoch": 0.4301217495866526, "flos": 15960648504960.0, "grad_norm": 2.458239342696999, "language_loss": 0.80617976, "learning_rate": 2.542829359113276e-06, "loss": 0.8271693, "num_input_tokens_seen": 153467910, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41601562, "step": 7154, "time_per_iteration": 2.4534530639648438 }, { "auxiliary_loss_clip": 0.01058466, "auxiliary_loss_mlp": 0.0104033, "balance_loss_clip": 1.01629686, "balance_loss_mlp": 1.01960647, "epoch": 0.43018187283932063, "flos": 18769376206080.0, "grad_norm": 1.5948911335845621, "language_loss": 0.801525, "learning_rate": 2.542454506558389e-06, "loss": 0.82251292, "num_input_tokens_seen": 153487100, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.38867188, "step": 7155, "time_per_iteration": 2.4419100284576416 }, { "auxiliary_loss_clip": 0.01058429, "auxiliary_loss_mlp": 0.01036431, "balance_loss_clip": 1.01244593, "balance_loss_mlp": 1.01834023, "epoch": 0.4302419960919886, "flos": 20150819815680.0, "grad_norm": 1.7430846704680216, "language_loss": 0.90161943, "learning_rate": 2.5420796334335723e-06, "loss": 0.92256808, "num_input_tokens_seen": 153505565, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.40039062, "step": 7156, "time_per_iteration": 2.592782497406006 }, { "auxiliary_loss_clip": 0.01061063, "auxiliary_loss_mlp": 0.01040159, "balance_loss_clip": 1.01115537, "balance_loss_mlp": 1.01872396, "epoch": 0.43030211934465656, "flos": 26431519916160.0, "grad_norm": 1.676717078979573, "language_loss": 0.84601462, "learning_rate": 2.541704739753042e-06, "loss": 0.86702681, "num_input_tokens_seen": 153526130, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.42382812, "step": 7157, "time_per_iteration": 2.5139517784118652 }, { "auxiliary_loss_clip": 0.01062675, "auxiliary_loss_mlp": 0.01043914, "balance_loss_clip": 1.01649606, "balance_loss_mlp": 1.01995885, "epoch": 0.43036224259732453, "flos": 24388088417280.0, "grad_norm": 2.129035916271821, "language_loss": 0.73637521, "learning_rate": 2.5413298255310132e-06, "loss": 0.7574411, "num_input_tokens_seen": 153546370, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.42578125, "step": 7158, "time_per_iteration": 2.5387179851531982 }, { "auxiliary_loss_clip": 0.01060926, "auxiliary_loss_mlp": 0.01042507, "balance_loss_clip": 1.01618516, "balance_loss_mlp": 1.01961267, "epoch": 0.4304223658499925, "flos": 17200763464320.0, "grad_norm": 1.8098031999358877, "language_loss": 0.84843552, "learning_rate": 2.5409548907817034e-06, "loss": 0.86946988, "num_input_tokens_seen": 153562800, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.4140625, "step": 7159, "time_per_iteration": 2.433806896209717 }, { "auxiliary_loss_clip": 0.01059515, "auxiliary_loss_mlp": 0.01043443, "balance_loss_clip": 1.01650155, "balance_loss_mlp": 1.01845002, "epoch": 0.43048248910266046, "flos": 14902116860160.0, "grad_norm": 2.210293602418018, "language_loss": 0.85060197, "learning_rate": 2.54057993551933e-06, "loss": 0.87163156, "num_input_tokens_seen": 153578395, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.41015625, "step": 7160, "time_per_iteration": 2.4539196491241455 }, { "auxiliary_loss_clip": 0.0106126, "auxiliary_loss_mlp": 0.01054342, "balance_loss_clip": 1.02227449, "balance_loss_mlp": 1.01831985, "epoch": 0.4305426123553284, "flos": 21578767223040.0, "grad_norm": 6.990140104024967, "language_loss": 0.78853214, "learning_rate": 2.5402049597581116e-06, "loss": 0.80968809, "num_input_tokens_seen": 153596880, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.4296875, "step": 7161, "time_per_iteration": 2.4627578258514404 }, { "auxiliary_loss_clip": 0.01059881, "auxiliary_loss_mlp": 0.01042902, "balance_loss_clip": 1.01665235, "balance_loss_mlp": 1.01854086, "epoch": 0.4306027356079964, "flos": 22600186225920.0, "grad_norm": 2.634567561112199, "language_loss": 0.74913383, "learning_rate": 2.5398299635122662e-06, "loss": 0.77016163, "num_input_tokens_seen": 153616570, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.4140625, "step": 7162, "time_per_iteration": 2.477551221847534 }, { "auxiliary_loss_clip": 0.01011676, "auxiliary_loss_mlp": 0.01006347, "balance_loss_clip": 1.00315273, "balance_loss_mlp": 1.00316954, "epoch": 0.43066285886066435, "flos": 70669128182400.0, "grad_norm": 1.436855119596756, "language_loss": 0.59181809, "learning_rate": 2.5394549467960147e-06, "loss": 0.61199832, "num_input_tokens_seen": 153671450, "router_z_loss_clip": 0.03198242, "router_z_loss_mlp": 0.08496094, "step": 7163, "time_per_iteration": 2.963192939758301 }, { "auxiliary_loss_clip": 0.01058024, "auxiliary_loss_mlp": 0.01040789, "balance_loss_clip": 1.01543319, "balance_loss_mlp": 1.01747406, "epoch": 0.4307229821133323, "flos": 26719483743360.0, "grad_norm": 1.5604910230556561, "language_loss": 0.79966348, "learning_rate": 2.5390799096235783e-06, "loss": 0.82065165, "num_input_tokens_seen": 153691405, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40625, "step": 7164, "time_per_iteration": 2.5151240825653076 }, { "auxiliary_loss_clip": 0.0106107, "auxiliary_loss_mlp": 0.01050916, "balance_loss_clip": 1.02271104, "balance_loss_mlp": 1.01814938, "epoch": 0.4307831053660003, "flos": 26175920785920.0, "grad_norm": 1.7272694827222066, "language_loss": 0.70325226, "learning_rate": 2.538704852009177e-06, "loss": 0.72437209, "num_input_tokens_seen": 153711555, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.4296875, "step": 7165, "time_per_iteration": 2.521815776824951 }, { "auxiliary_loss_clip": 0.01060342, "auxiliary_loss_mlp": 0.01054658, "balance_loss_clip": 1.02672708, "balance_loss_mlp": 1.0187974, "epoch": 0.43084322861866825, "flos": 18909517870080.0, "grad_norm": 2.014244628065655, "language_loss": 0.76137173, "learning_rate": 2.538329773967034e-06, "loss": 0.78252178, "num_input_tokens_seen": 153730095, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.4140625, "step": 7166, "time_per_iteration": 2.5475032329559326 }, { "auxiliary_loss_clip": 0.01059422, "auxiliary_loss_mlp": 0.0104709, "balance_loss_clip": 1.02097154, "balance_loss_mlp": 1.01898384, "epoch": 0.4309033518713362, "flos": 26431694472960.0, "grad_norm": 1.607737102827125, "language_loss": 0.73255843, "learning_rate": 2.537954675511372e-06, "loss": 0.7536236, "num_input_tokens_seen": 153749320, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40429688, "step": 7167, "time_per_iteration": 2.509458541870117 }, { "auxiliary_loss_clip": 0.010574, "auxiliary_loss_mlp": 0.01041263, "balance_loss_clip": 1.01665807, "balance_loss_mlp": 1.01848531, "epoch": 0.43096347512400424, "flos": 21212284014720.0, "grad_norm": 1.5314434970669786, "language_loss": 0.79610252, "learning_rate": 2.537579556656414e-06, "loss": 0.81708908, "num_input_tokens_seen": 153767825, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.390625, "step": 7168, "time_per_iteration": 2.5042614936828613 }, { "auxiliary_loss_clip": 0.01060783, "auxiliary_loss_mlp": 0.01048838, "balance_loss_clip": 1.02063274, "balance_loss_mlp": 1.01871157, "epoch": 0.4310235983766722, "flos": 16539334156800.0, "grad_norm": 2.675383762337884, "language_loss": 0.83623576, "learning_rate": 2.537204417416387e-06, "loss": 0.85733193, "num_input_tokens_seen": 153785350, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.421875, "step": 7169, "time_per_iteration": 2.4548728466033936 }, { "auxiliary_loss_clip": 0.01011194, "auxiliary_loss_mlp": 0.01010358, "balance_loss_clip": 1.00738931, "balance_loss_mlp": 1.00257242, "epoch": 0.43108372162934017, "flos": 64772506615680.0, "grad_norm": 0.672636339054128, "language_loss": 0.608257, "learning_rate": 2.5368292578055132e-06, "loss": 0.62847251, "num_input_tokens_seen": 153856400, "router_z_loss_clip": 0.02966309, "router_z_loss_mlp": 0.0859375, "step": 7170, "time_per_iteration": 3.2654716968536377 }, { "auxiliary_loss_clip": 0.01060257, "auxiliary_loss_mlp": 0.01043186, "balance_loss_clip": 1.01619685, "balance_loss_mlp": 1.0187881, "epoch": 0.43114384488200813, "flos": 13443236121600.0, "grad_norm": 1.6245605107089953, "language_loss": 0.77982366, "learning_rate": 2.536454077838021e-06, "loss": 0.80085814, "num_input_tokens_seen": 153875230, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.4140625, "step": 7171, "time_per_iteration": 2.433366537094116 }, { "auxiliary_loss_clip": 0.01059686, "auxiliary_loss_mlp": 0.01043754, "balance_loss_clip": 1.01644325, "balance_loss_mlp": 1.01877165, "epoch": 0.4312039681346761, "flos": 26285478232320.0, "grad_norm": 1.889531474753538, "language_loss": 0.78238022, "learning_rate": 2.5360788775281357e-06, "loss": 0.8034147, "num_input_tokens_seen": 153894740, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40820312, "step": 7172, "time_per_iteration": 2.523635149002075 }, { "auxiliary_loss_clip": 0.01061639, "auxiliary_loss_mlp": 0.01047697, "balance_loss_clip": 1.01752543, "balance_loss_mlp": 1.01831985, "epoch": 0.43126409138734406, "flos": 20375625260160.0, "grad_norm": 1.90557236436243, "language_loss": 0.78335935, "learning_rate": 2.535703656890086e-06, "loss": 0.80445278, "num_input_tokens_seen": 153913230, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.43359375, "step": 7173, "time_per_iteration": 2.4714770317077637 }, { "auxiliary_loss_clip": 0.0106029, "auxiliary_loss_mlp": 0.01045729, "balance_loss_clip": 1.01747632, "balance_loss_mlp": 1.01912737, "epoch": 0.431324214640012, "flos": 22122120712320.0, "grad_norm": 1.536180315555645, "language_loss": 0.77190536, "learning_rate": 2.5353284159381e-06, "loss": 0.79296553, "num_input_tokens_seen": 153933250, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41210938, "step": 7174, "time_per_iteration": 2.523775100708008 }, { "auxiliary_loss_clip": 0.01061735, "auxiliary_loss_mlp": 0.01043893, "balance_loss_clip": 1.01380479, "balance_loss_mlp": 1.01865101, "epoch": 0.43138433789268, "flos": 15230125883520.0, "grad_norm": 1.4468362805532222, "language_loss": 0.83705318, "learning_rate": 2.534953154686407e-06, "loss": 0.85810941, "num_input_tokens_seen": 153951325, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.4296875, "step": 7175, "time_per_iteration": 2.529812812805176 }, { "auxiliary_loss_clip": 0.01063985, "auxiliary_loss_mlp": 0.01052126, "balance_loss_clip": 1.02134585, "balance_loss_mlp": 1.02019048, "epoch": 0.43144446114534796, "flos": 18149318726400.0, "grad_norm": 2.22916184166232, "language_loss": 0.76551402, "learning_rate": 2.5345778731492366e-06, "loss": 0.7866751, "num_input_tokens_seen": 153966975, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4375, "step": 7176, "time_per_iteration": 2.432892322540283 }, { "auxiliary_loss_clip": 0.01061544, "auxiliary_loss_mlp": 0.01046379, "balance_loss_clip": 1.01626635, "balance_loss_mlp": 1.01913822, "epoch": 0.4315045843980159, "flos": 22928753831040.0, "grad_norm": 4.330548958783723, "language_loss": 0.75277901, "learning_rate": 2.534202571340819e-06, "loss": 0.77385825, "num_input_tokens_seen": 153986695, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.42382812, "step": 7177, "time_per_iteration": 5.944704055786133 }, { "auxiliary_loss_clip": 0.01066685, "auxiliary_loss_mlp": 0.01052038, "balance_loss_clip": 1.01889825, "balance_loss_mlp": 1.02030897, "epoch": 0.4315647076506839, "flos": 22125786405120.0, "grad_norm": 1.928211966503039, "language_loss": 0.82920355, "learning_rate": 2.533827249275387e-06, "loss": 0.85039079, "num_input_tokens_seen": 154004710, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.46484375, "step": 7178, "time_per_iteration": 3.848681688308716 }, { "auxiliary_loss_clip": 0.01059171, "auxiliary_loss_mlp": 0.01044689, "balance_loss_clip": 1.01767659, "balance_loss_mlp": 1.01929593, "epoch": 0.43162483090335185, "flos": 26869889347200.0, "grad_norm": 1.47402342841166, "language_loss": 0.84706205, "learning_rate": 2.5334519069671725e-06, "loss": 0.86810064, "num_input_tokens_seen": 154024320, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.3984375, "step": 7179, "time_per_iteration": 2.4304518699645996 }, { "auxiliary_loss_clip": 0.01062299, "auxiliary_loss_mlp": 0.010453, "balance_loss_clip": 1.01682091, "balance_loss_mlp": 1.02073288, "epoch": 0.4316849541560198, "flos": 13912399238400.0, "grad_norm": 1.6322716524806418, "language_loss": 0.76931638, "learning_rate": 2.5330765444304075e-06, "loss": 0.7903924, "num_input_tokens_seen": 154041755, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41601562, "step": 7180, "time_per_iteration": 2.4406208992004395 }, { "auxiliary_loss_clip": 0.0106167, "auxiliary_loss_mlp": 0.0104904, "balance_loss_clip": 1.0178076, "balance_loss_mlp": 1.01847363, "epoch": 0.4317450774086878, "flos": 16434245364480.0, "grad_norm": 1.682701197060865, "language_loss": 0.83005786, "learning_rate": 2.5327011616793274e-06, "loss": 0.851165, "num_input_tokens_seen": 154056775, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.43164062, "step": 7181, "time_per_iteration": 2.3584885597229004 }, { "auxiliary_loss_clip": 0.01063739, "auxiliary_loss_mlp": 0.01049316, "balance_loss_clip": 1.01705837, "balance_loss_mlp": 1.01964617, "epoch": 0.4318052006613558, "flos": 20554031640960.0, "grad_norm": 1.5094432165405487, "language_loss": 0.90043747, "learning_rate": 2.532325758728165e-06, "loss": 0.92156792, "num_input_tokens_seen": 154075015, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.44140625, "step": 7182, "time_per_iteration": 3.885211229324341 }, { "auxiliary_loss_clip": 0.01058701, "auxiliary_loss_mlp": 0.01043735, "balance_loss_clip": 1.01526737, "balance_loss_mlp": 1.01796961, "epoch": 0.43186532391402377, "flos": 22818952005120.0, "grad_norm": 1.6107760234146897, "language_loss": 0.77142203, "learning_rate": 2.5319503355911566e-06, "loss": 0.79244637, "num_input_tokens_seen": 154095170, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40820312, "step": 7183, "time_per_iteration": 2.542673349380493 }, { "auxiliary_loss_clip": 0.01060462, "auxiliary_loss_mlp": 0.01049021, "balance_loss_clip": 1.01785946, "balance_loss_mlp": 1.0174005, "epoch": 0.43192544716669173, "flos": 25555409458560.0, "grad_norm": 1.503850797554251, "language_loss": 0.78890949, "learning_rate": 2.5315748922825393e-06, "loss": 0.81000435, "num_input_tokens_seen": 154116895, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.4296875, "step": 7184, "time_per_iteration": 2.487062931060791 }, { "auxiliary_loss_clip": 0.01057042, "auxiliary_loss_mlp": 0.01044986, "balance_loss_clip": 1.01860487, "balance_loss_mlp": 1.01724219, "epoch": 0.4319855704193597, "flos": 30953400854400.0, "grad_norm": 1.8917905370775512, "language_loss": 0.74628729, "learning_rate": 2.5311994288165474e-06, "loss": 0.76730758, "num_input_tokens_seen": 154138395, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.3984375, "step": 7185, "time_per_iteration": 2.51365065574646 }, { "auxiliary_loss_clip": 0.01063694, "auxiliary_loss_mlp": 0.01058742, "balance_loss_clip": 1.02677011, "balance_loss_mlp": 1.0186938, "epoch": 0.43204569367202766, "flos": 24237717724800.0, "grad_norm": 2.2189323072832132, "language_loss": 0.77298737, "learning_rate": 2.530823945207421e-06, "loss": 0.79421169, "num_input_tokens_seen": 154156775, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.44921875, "step": 7186, "time_per_iteration": 2.443582057952881 }, { "auxiliary_loss_clip": 0.01061089, "auxiliary_loss_mlp": 0.01045666, "balance_loss_clip": 1.01635289, "balance_loss_mlp": 1.01857829, "epoch": 0.43210581692469563, "flos": 18405930286080.0, "grad_norm": 2.2955046663005354, "language_loss": 0.7707836, "learning_rate": 2.5304484414693962e-06, "loss": 0.79185116, "num_input_tokens_seen": 154177500, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.42578125, "step": 7187, "time_per_iteration": 2.4538910388946533 }, { "auxiliary_loss_clip": 0.01009344, "auxiliary_loss_mlp": 0.01004828, "balance_loss_clip": 1.0013473, "balance_loss_mlp": 1.00089359, "epoch": 0.4321659401773636, "flos": 49829155706880.0, "grad_norm": 0.854052341629861, "language_loss": 0.68349922, "learning_rate": 2.530072917616714e-06, "loss": 0.70364094, "num_input_tokens_seen": 154237110, "router_z_loss_clip": 0.03491211, "router_z_loss_mlp": 0.08496094, "step": 7188, "time_per_iteration": 3.0745813846588135 }, { "auxiliary_loss_clip": 0.01056901, "auxiliary_loss_mlp": 0.01044129, "balance_loss_clip": 1.01730692, "balance_loss_mlp": 1.01699722, "epoch": 0.43222606343003156, "flos": 17127620432640.0, "grad_norm": 1.7871946743936769, "language_loss": 0.79783064, "learning_rate": 2.529697373663614e-06, "loss": 0.81884098, "num_input_tokens_seen": 154253910, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.3984375, "step": 7189, "time_per_iteration": 2.458501100540161 }, { "auxiliary_loss_clip": 0.01062223, "auxiliary_loss_mlp": 0.01047777, "balance_loss_clip": 1.01699722, "balance_loss_mlp": 1.01783729, "epoch": 0.4322861866826995, "flos": 22748776439040.0, "grad_norm": 1.7844175418296926, "language_loss": 0.73600197, "learning_rate": 2.5293218096243364e-06, "loss": 0.75710195, "num_input_tokens_seen": 154274770, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.44335938, "step": 7190, "time_per_iteration": 2.4763271808624268 }, { "auxiliary_loss_clip": 0.01056632, "auxiliary_loss_mlp": 0.01046138, "balance_loss_clip": 1.01718211, "balance_loss_mlp": 1.01623988, "epoch": 0.4323463099353675, "flos": 27890679945600.0, "grad_norm": 1.3473584153746445, "language_loss": 0.80786061, "learning_rate": 2.5289462255131223e-06, "loss": 0.8288883, "num_input_tokens_seen": 154295035, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.40429688, "step": 7191, "time_per_iteration": 2.481743335723877 }, { "auxiliary_loss_clip": 0.0105859, "auxiliary_loss_mlp": 0.01042337, "balance_loss_clip": 1.01398897, "balance_loss_mlp": 1.01802862, "epoch": 0.43240643318803546, "flos": 21613715360640.0, "grad_norm": 1.6006910870569688, "language_loss": 0.77242219, "learning_rate": 2.5285706213442146e-06, "loss": 0.79343152, "num_input_tokens_seen": 154314905, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.40625, "step": 7192, "time_per_iteration": 2.465409994125366 }, { "auxiliary_loss_clip": 0.01060704, "auxiliary_loss_mlp": 0.01049474, "balance_loss_clip": 1.01900446, "balance_loss_mlp": 1.01840234, "epoch": 0.4324665564407034, "flos": 17557646048640.0, "grad_norm": 1.771776424319067, "language_loss": 0.79882407, "learning_rate": 2.5281949971318557e-06, "loss": 0.81992579, "num_input_tokens_seen": 154331740, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.421875, "step": 7193, "time_per_iteration": 2.4294402599334717 }, { "auxiliary_loss_clip": 0.01060215, "auxiliary_loss_mlp": 0.01050129, "balance_loss_clip": 1.01928973, "balance_loss_mlp": 1.01840568, "epoch": 0.4325266796933714, "flos": 18401531454720.0, "grad_norm": 2.1180400700083273, "language_loss": 0.76332742, "learning_rate": 2.5278193528902897e-06, "loss": 0.78443086, "num_input_tokens_seen": 154348740, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.41796875, "step": 7194, "time_per_iteration": 2.3969926834106445 }, { "auxiliary_loss_clip": 0.01061712, "auxiliary_loss_mlp": 0.01050459, "balance_loss_clip": 1.01944077, "balance_loss_mlp": 1.01951957, "epoch": 0.4325868029460394, "flos": 22563701988480.0, "grad_norm": 1.823426272341479, "language_loss": 0.61335123, "learning_rate": 2.5274436886337613e-06, "loss": 0.63447291, "num_input_tokens_seen": 154368835, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.421875, "step": 7195, "time_per_iteration": 2.4437122344970703 }, { "auxiliary_loss_clip": 0.01063468, "auxiliary_loss_mlp": 0.01042883, "balance_loss_clip": 1.01265216, "balance_loss_mlp": 1.02007484, "epoch": 0.43264692619870737, "flos": 14604796788480.0, "grad_norm": 2.229463416741991, "language_loss": 0.66996223, "learning_rate": 2.527068004376515e-06, "loss": 0.69102573, "num_input_tokens_seen": 154384620, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.43359375, "step": 7196, "time_per_iteration": 2.375047445297241 }, { "auxiliary_loss_clip": 0.01065763, "auxiliary_loss_mlp": 0.01053091, "balance_loss_clip": 1.02194118, "balance_loss_mlp": 1.01920664, "epoch": 0.43270704945137534, "flos": 21500736600960.0, "grad_norm": 2.050825528987171, "language_loss": 0.73649091, "learning_rate": 2.526692300132797e-06, "loss": 0.7576794, "num_input_tokens_seen": 154402865, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.46679688, "step": 7197, "time_per_iteration": 2.4208762645721436 }, { "auxiliary_loss_clip": 0.01059823, "auxiliary_loss_mlp": 0.01045879, "balance_loss_clip": 1.01847267, "balance_loss_mlp": 1.01877964, "epoch": 0.4327671727040433, "flos": 25154711251200.0, "grad_norm": 1.9107183219017396, "language_loss": 0.73398054, "learning_rate": 2.5263165759168547e-06, "loss": 0.75503755, "num_input_tokens_seen": 154423625, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41015625, "step": 7198, "time_per_iteration": 2.4871480464935303 }, { "auxiliary_loss_clip": 0.01059328, "auxiliary_loss_mlp": 0.01037733, "balance_loss_clip": 1.01021981, "balance_loss_mlp": 1.01825786, "epoch": 0.43282729595671127, "flos": 25445991657600.0, "grad_norm": 1.346488812381435, "language_loss": 0.81792063, "learning_rate": 2.525940831742934e-06, "loss": 0.83889127, "num_input_tokens_seen": 154444775, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41015625, "step": 7199, "time_per_iteration": 2.5029296875 }, { "auxiliary_loss_clip": 0.01061514, "auxiliary_loss_mlp": 0.01044115, "balance_loss_clip": 1.01557624, "balance_loss_mlp": 1.01964283, "epoch": 0.43288741920937923, "flos": 24125192812800.0, "grad_norm": 2.218615200353538, "language_loss": 0.70411575, "learning_rate": 2.525565067625286e-06, "loss": 0.72517204, "num_input_tokens_seen": 154460815, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41992188, "step": 7200, "time_per_iteration": 2.456690788269043 }, { "auxiliary_loss_clip": 0.01061782, "auxiliary_loss_mlp": 0.01054576, "balance_loss_clip": 1.02274716, "balance_loss_mlp": 1.01867318, "epoch": 0.4329475424620472, "flos": 19204045032960.0, "grad_norm": 1.7914641704922911, "language_loss": 0.89010668, "learning_rate": 2.525189283578157e-06, "loss": 0.91127026, "num_input_tokens_seen": 154479145, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.43164062, "step": 7201, "time_per_iteration": 2.4084606170654297 }, { "auxiliary_loss_clip": 0.0106546, "auxiliary_loss_mlp": 0.01054522, "balance_loss_clip": 1.02178681, "balance_loss_mlp": 1.02121615, "epoch": 0.43300766571471516, "flos": 22637263956480.0, "grad_norm": 2.381333491361938, "language_loss": 0.66418135, "learning_rate": 2.5248134796157974e-06, "loss": 0.68538117, "num_input_tokens_seen": 154498905, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.44140625, "step": 7202, "time_per_iteration": 2.447852849960327 }, { "auxiliary_loss_clip": 0.01060691, "auxiliary_loss_mlp": 0.01042699, "balance_loss_clip": 1.01585269, "balance_loss_mlp": 1.01925397, "epoch": 0.4330677889673831, "flos": 22120165676160.0, "grad_norm": 1.7299329415608007, "language_loss": 0.83219254, "learning_rate": 2.5244376557524586e-06, "loss": 0.85322642, "num_input_tokens_seen": 154517270, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.4140625, "step": 7203, "time_per_iteration": 2.415625810623169 }, { "auxiliary_loss_clip": 0.01063301, "auxiliary_loss_mlp": 0.01052236, "balance_loss_clip": 1.02059793, "balance_loss_mlp": 1.01806355, "epoch": 0.4331279122200511, "flos": 23220418262400.0, "grad_norm": 2.0087591890587273, "language_loss": 0.82338387, "learning_rate": 2.5240618120023912e-06, "loss": 0.84453923, "num_input_tokens_seen": 154535945, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.453125, "step": 7204, "time_per_iteration": 2.469393730163574 }, { "auxiliary_loss_clip": 0.01061017, "auxiliary_loss_mlp": 0.01043336, "balance_loss_clip": 1.01473737, "balance_loss_mlp": 1.01893735, "epoch": 0.43318803547271906, "flos": 18258771438720.0, "grad_norm": 4.717251897415797, "language_loss": 0.75741363, "learning_rate": 2.5236859483798468e-06, "loss": 0.77845716, "num_input_tokens_seen": 154554935, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.421875, "step": 7205, "time_per_iteration": 2.401212692260742 }, { "auxiliary_loss_clip": 0.01059834, "auxiliary_loss_mlp": 0.01043937, "balance_loss_clip": 1.01620924, "balance_loss_mlp": 1.01990962, "epoch": 0.433248158725387, "flos": 27417152908800.0, "grad_norm": 1.8506772376123721, "language_loss": 0.76048994, "learning_rate": 2.5233100648990803e-06, "loss": 0.7815277, "num_input_tokens_seen": 154576065, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.3984375, "step": 7206, "time_per_iteration": 2.5068812370300293 }, { "auxiliary_loss_clip": 0.01059857, "auxiliary_loss_mlp": 0.01041992, "balance_loss_clip": 1.01347733, "balance_loss_mlp": 1.01804495, "epoch": 0.433308281978055, "flos": 23216996949120.0, "grad_norm": 1.94484167270054, "language_loss": 0.80228627, "learning_rate": 2.522934161574342e-06, "loss": 0.82330477, "num_input_tokens_seen": 154595110, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41796875, "step": 7207, "time_per_iteration": 2.411655902862549 }, { "auxiliary_loss_clip": 0.0106218, "auxiliary_loss_mlp": 0.01049595, "balance_loss_clip": 1.019292, "balance_loss_mlp": 1.01832068, "epoch": 0.433368405230723, "flos": 15851475083520.0, "grad_norm": 5.116659620132036, "language_loss": 0.81459206, "learning_rate": 2.5225582384198888e-06, "loss": 0.83570981, "num_input_tokens_seen": 154612255, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.43945312, "step": 7208, "time_per_iteration": 2.4860317707061768 }, { "auxiliary_loss_clip": 0.01059918, "auxiliary_loss_mlp": 0.01043107, "balance_loss_clip": 1.01553392, "balance_loss_mlp": 1.01764214, "epoch": 0.433428528483391, "flos": 19025080070400.0, "grad_norm": 2.6398422258695855, "language_loss": 0.72500366, "learning_rate": 2.5221822954499744e-06, "loss": 0.74603391, "num_input_tokens_seen": 154630440, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.421875, "step": 7209, "time_per_iteration": 2.4088857173919678 }, { "auxiliary_loss_clip": 0.01061194, "auxiliary_loss_mlp": 0.01047985, "balance_loss_clip": 1.01792073, "balance_loss_mlp": 1.0191524, "epoch": 0.43348865173605894, "flos": 24717074958720.0, "grad_norm": 1.532202535407639, "language_loss": 0.82138592, "learning_rate": 2.5218063326788557e-06, "loss": 0.84247768, "num_input_tokens_seen": 154652515, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.41992188, "step": 7210, "time_per_iteration": 2.464977264404297 }, { "auxiliary_loss_clip": 0.01058841, "auxiliary_loss_mlp": 0.01045724, "balance_loss_clip": 1.01868773, "balance_loss_mlp": 1.01768875, "epoch": 0.4335487749887269, "flos": 22089581458560.0, "grad_norm": 1.8177881228333863, "language_loss": 0.82796288, "learning_rate": 2.5214303501207885e-06, "loss": 0.84900856, "num_input_tokens_seen": 154670965, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.41210938, "step": 7211, "time_per_iteration": 2.4403293132781982 }, { "auxiliary_loss_clip": 0.01057834, "auxiliary_loss_mlp": 0.01041384, "balance_loss_clip": 1.01460946, "balance_loss_mlp": 1.01763844, "epoch": 0.43360889824139487, "flos": 22381839383040.0, "grad_norm": 2.0611927925642344, "language_loss": 0.76518184, "learning_rate": 2.521054347790029e-06, "loss": 0.786174, "num_input_tokens_seen": 154689980, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.40234375, "step": 7212, "time_per_iteration": 2.424225330352783 }, { "auxiliary_loss_clip": 0.01060665, "auxiliary_loss_mlp": 0.01043484, "balance_loss_clip": 1.01682878, "balance_loss_mlp": 1.01937795, "epoch": 0.43366902149406283, "flos": 17527376033280.0, "grad_norm": 1.6365430360657236, "language_loss": 0.77523905, "learning_rate": 2.5206783257008375e-06, "loss": 0.79628056, "num_input_tokens_seen": 154706570, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.41210938, "step": 7213, "time_per_iteration": 2.386165142059326 }, { "auxiliary_loss_clip": 0.01060344, "auxiliary_loss_mlp": 0.01049196, "balance_loss_clip": 1.02132452, "balance_loss_mlp": 1.01850772, "epoch": 0.4337291447467308, "flos": 19021763491200.0, "grad_norm": 1.5836433618532024, "language_loss": 0.65649271, "learning_rate": 2.520302283867471e-06, "loss": 0.67758811, "num_input_tokens_seen": 154725210, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41796875, "step": 7214, "time_per_iteration": 2.4387238025665283 }, { "auxiliary_loss_clip": 0.01056902, "auxiliary_loss_mlp": 0.01042883, "balance_loss_clip": 1.01601291, "balance_loss_mlp": 1.01781058, "epoch": 0.43378926799939876, "flos": 27232846508160.0, "grad_norm": 1.630806901420469, "language_loss": 0.72219521, "learning_rate": 2.519926222304191e-06, "loss": 0.74319303, "num_input_tokens_seen": 154745945, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.390625, "step": 7215, "time_per_iteration": 2.510660171508789 }, { "auxiliary_loss_clip": 0.01058631, "auxiliary_loss_mlp": 0.01045001, "balance_loss_clip": 1.01840496, "balance_loss_mlp": 1.01872492, "epoch": 0.43384939125206673, "flos": 15960194657280.0, "grad_norm": 1.8139245939597084, "language_loss": 0.76345122, "learning_rate": 2.519550141025255e-06, "loss": 0.78448755, "num_input_tokens_seen": 154763580, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.3984375, "step": 7216, "time_per_iteration": 6.073709964752197 }, { "auxiliary_loss_clip": 0.01065188, "auxiliary_loss_mlp": 0.01049183, "balance_loss_clip": 1.0156858, "balance_loss_mlp": 1.01917851, "epoch": 0.4339095145047347, "flos": 21792296298240.0, "grad_norm": 3.157210619327897, "language_loss": 0.779917, "learning_rate": 2.519174040044927e-06, "loss": 0.80106074, "num_input_tokens_seen": 154776825, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.4609375, "step": 7217, "time_per_iteration": 3.816547155380249 }, { "auxiliary_loss_clip": 0.01061293, "auxiliary_loss_mlp": 0.01048951, "balance_loss_clip": 1.02066267, "balance_loss_mlp": 1.01893473, "epoch": 0.43396963775740266, "flos": 14208986171520.0, "grad_norm": 1.9355293790525843, "language_loss": 0.75286162, "learning_rate": 2.5187979193774664e-06, "loss": 0.77396405, "num_input_tokens_seen": 154794025, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.42382812, "step": 7218, "time_per_iteration": 2.3996498584747314 }, { "auxiliary_loss_clip": 0.01061731, "auxiliary_loss_mlp": 0.01047325, "balance_loss_clip": 1.01702178, "balance_loss_mlp": 1.01853693, "epoch": 0.4340297610100706, "flos": 19718036202240.0, "grad_norm": 1.8558112183635016, "language_loss": 0.70679045, "learning_rate": 2.5184217790371367e-06, "loss": 0.72788101, "num_input_tokens_seen": 154813105, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.43164062, "step": 7219, "time_per_iteration": 2.4120004177093506 }, { "auxiliary_loss_clip": 0.01060087, "auxiliary_loss_mlp": 0.01046817, "balance_loss_clip": 1.01834977, "balance_loss_mlp": 1.01921701, "epoch": 0.4340898842627386, "flos": 18952495620480.0, "grad_norm": 1.5914494276078401, "language_loss": 0.78566885, "learning_rate": 2.518045619038202e-06, "loss": 0.8067379, "num_input_tokens_seen": 154833525, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40820312, "step": 7220, "time_per_iteration": 2.4493658542633057 }, { "auxiliary_loss_clip": 0.01059169, "auxiliary_loss_mlp": 0.01044121, "balance_loss_clip": 1.01667893, "balance_loss_mlp": 1.01862669, "epoch": 0.4341500075154066, "flos": 22017206476800.0, "grad_norm": 1.872346508453308, "language_loss": 0.70273894, "learning_rate": 2.5176694393949243e-06, "loss": 0.72377187, "num_input_tokens_seen": 154853090, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40625, "step": 7221, "time_per_iteration": 2.4338510036468506 }, { "auxiliary_loss_clip": 0.01061318, "auxiliary_loss_mlp": 0.01045162, "balance_loss_clip": 1.01777935, "balance_loss_mlp": 1.01930952, "epoch": 0.4342101307680746, "flos": 23581455298560.0, "grad_norm": 1.7656287341704238, "language_loss": 0.66270709, "learning_rate": 2.51729324012157e-06, "loss": 0.68377185, "num_input_tokens_seen": 154872055, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41992188, "step": 7222, "time_per_iteration": 3.8299214839935303 }, { "auxiliary_loss_clip": 0.01058185, "auxiliary_loss_mlp": 0.01040285, "balance_loss_clip": 1.01322496, "balance_loss_mlp": 1.01776028, "epoch": 0.43427025402074254, "flos": 17967002273280.0, "grad_norm": 3.1536880394021227, "language_loss": 0.7477265, "learning_rate": 2.5169170212324053e-06, "loss": 0.76871121, "num_input_tokens_seen": 154886645, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40429688, "step": 7223, "time_per_iteration": 2.4220056533813477 }, { "auxiliary_loss_clip": 0.01061551, "auxiliary_loss_mlp": 0.01043474, "balance_loss_clip": 1.0161159, "balance_loss_mlp": 1.01880467, "epoch": 0.4343303772734105, "flos": 26285198941440.0, "grad_norm": 1.9788595466805834, "language_loss": 0.94854051, "learning_rate": 2.516540782741694e-06, "loss": 0.96959078, "num_input_tokens_seen": 154906775, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.42773438, "step": 7224, "time_per_iteration": 2.4639101028442383 }, { "auxiliary_loss_clip": 0.01058807, "auxiliary_loss_mlp": 0.01042647, "balance_loss_clip": 1.01471651, "balance_loss_mlp": 1.01823735, "epoch": 0.43439050052607847, "flos": 26832741793920.0, "grad_norm": 1.7781925718811329, "language_loss": 0.61970675, "learning_rate": 2.5161645246637056e-06, "loss": 0.6407212, "num_input_tokens_seen": 154926990, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40625, "step": 7225, "time_per_iteration": 2.496264696121216 }, { "auxiliary_loss_clip": 0.01059499, "auxiliary_loss_mlp": 0.01041309, "balance_loss_clip": 1.01559544, "balance_loss_mlp": 1.01895368, "epoch": 0.43445062377874644, "flos": 21396590415360.0, "grad_norm": 2.000241194240365, "language_loss": 0.79472637, "learning_rate": 2.5157882470127054e-06, "loss": 0.81573451, "num_input_tokens_seen": 154946210, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40625, "step": 7226, "time_per_iteration": 2.4356067180633545 }, { "auxiliary_loss_clip": 0.01058314, "auxiliary_loss_mlp": 0.01043053, "balance_loss_clip": 1.01745832, "balance_loss_mlp": 1.01889741, "epoch": 0.4345107470314144, "flos": 19900911237120.0, "grad_norm": 1.7456541532261833, "language_loss": 0.86092103, "learning_rate": 2.515411949802964e-06, "loss": 0.8819347, "num_input_tokens_seen": 154964995, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.39453125, "step": 7227, "time_per_iteration": 2.405559778213501 }, { "auxiliary_loss_clip": 0.01058849, "auxiliary_loss_mlp": 0.01051053, "balance_loss_clip": 1.02274096, "balance_loss_mlp": 1.01833773, "epoch": 0.43457087028408237, "flos": 26431415182080.0, "grad_norm": 1.9781709709021398, "language_loss": 0.78416884, "learning_rate": 2.5150356330487498e-06, "loss": 0.80526781, "num_input_tokens_seen": 154984775, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.40429688, "step": 7228, "time_per_iteration": 2.495698928833008 }, { "auxiliary_loss_clip": 0.0105861, "auxiliary_loss_mlp": 0.01046994, "balance_loss_clip": 1.01845503, "balance_loss_mlp": 1.01819026, "epoch": 0.43463099353675033, "flos": 31867461826560.0, "grad_norm": 1.543915600521893, "language_loss": 0.81778544, "learning_rate": 2.5146592967643324e-06, "loss": 0.8388415, "num_input_tokens_seen": 155008125, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40429688, "step": 7229, "time_per_iteration": 2.503937244415283 }, { "auxiliary_loss_clip": 0.01060675, "auxiliary_loss_mlp": 0.01043539, "balance_loss_clip": 1.01407075, "balance_loss_mlp": 1.01830864, "epoch": 0.4346911167894183, "flos": 24570125579520.0, "grad_norm": 1.9287491933985998, "language_loss": 0.82994664, "learning_rate": 2.5142829409639834e-06, "loss": 0.85098881, "num_input_tokens_seen": 155027885, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.42382812, "step": 7230, "time_per_iteration": 2.4554226398468018 }, { "auxiliary_loss_clip": 0.0106428, "auxiliary_loss_mlp": 0.01050917, "balance_loss_clip": 1.02177012, "balance_loss_mlp": 1.0202167, "epoch": 0.43475124004208626, "flos": 17089774652160.0, "grad_norm": 2.149355720167987, "language_loss": 0.78819335, "learning_rate": 2.513906565661973e-06, "loss": 0.80934536, "num_input_tokens_seen": 155043375, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.44140625, "step": 7231, "time_per_iteration": 2.379403591156006 }, { "auxiliary_loss_clip": 0.01057082, "auxiliary_loss_mlp": 0.01043184, "balance_loss_clip": 1.0178405, "balance_loss_mlp": 1.01744628, "epoch": 0.4348113632947542, "flos": 26103406158720.0, "grad_norm": 1.447883877632418, "language_loss": 0.69807744, "learning_rate": 2.513530170872575e-06, "loss": 0.71908009, "num_input_tokens_seen": 155062930, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.39648438, "step": 7232, "time_per_iteration": 2.5072898864746094 }, { "auxiliary_loss_clip": 0.01061449, "auxiliary_loss_mlp": 0.0104434, "balance_loss_clip": 1.0161109, "balance_loss_mlp": 1.01860964, "epoch": 0.4348714865474222, "flos": 34199171354880.0, "grad_norm": 1.8029140613846586, "language_loss": 0.72855532, "learning_rate": 2.5131537566100605e-06, "loss": 0.74961323, "num_input_tokens_seen": 155084980, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.42773438, "step": 7233, "time_per_iteration": 2.5074596405029297 }, { "auxiliary_loss_clip": 0.01062286, "auxiliary_loss_mlp": 0.01048536, "balance_loss_clip": 1.01865005, "balance_loss_mlp": 1.01869917, "epoch": 0.43493160980009016, "flos": 31536206046720.0, "grad_norm": 2.0916033044491527, "language_loss": 0.75744772, "learning_rate": 2.5127773228887053e-06, "loss": 0.77855593, "num_input_tokens_seen": 155107260, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.43554688, "step": 7234, "time_per_iteration": 2.56392240524292 }, { "auxiliary_loss_clip": 0.01062475, "auxiliary_loss_mlp": 0.01053943, "balance_loss_clip": 1.02266216, "balance_loss_mlp": 1.01934361, "epoch": 0.4349917330527582, "flos": 24060184128000.0, "grad_norm": 2.374468395817092, "language_loss": 0.60553217, "learning_rate": 2.512400869722782e-06, "loss": 0.62669629, "num_input_tokens_seen": 155126720, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.4296875, "step": 7235, "time_per_iteration": 2.4698903560638428 }, { "auxiliary_loss_clip": 0.01061427, "auxiliary_loss_mlp": 0.01045381, "balance_loss_clip": 1.01491106, "balance_loss_mlp": 1.01803446, "epoch": 0.43505185630542614, "flos": 30517998888960.0, "grad_norm": 2.1403308154972596, "language_loss": 0.78222287, "learning_rate": 2.512024397126566e-06, "loss": 0.80329096, "num_input_tokens_seen": 155148640, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.43359375, "step": 7236, "time_per_iteration": 2.5081276893615723 }, { "auxiliary_loss_clip": 0.01059458, "auxiliary_loss_mlp": 0.0104344, "balance_loss_clip": 1.01593864, "balance_loss_mlp": 1.01916075, "epoch": 0.4351119795580941, "flos": 15734446606080.0, "grad_norm": 2.384284804746426, "language_loss": 0.82004988, "learning_rate": 2.5116479051143345e-06, "loss": 0.84107888, "num_input_tokens_seen": 155165870, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.40234375, "step": 7237, "time_per_iteration": 2.4114208221435547 }, { "auxiliary_loss_clip": 0.01059296, "auxiliary_loss_mlp": 0.01043619, "balance_loss_clip": 1.01618862, "balance_loss_mlp": 1.0182364, "epoch": 0.4351721028107621, "flos": 18730832198400.0, "grad_norm": 1.670508583618334, "language_loss": 0.64055276, "learning_rate": 2.5112713937003623e-06, "loss": 0.66158187, "num_input_tokens_seen": 155185315, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41015625, "step": 7238, "time_per_iteration": 2.456557512283325 }, { "auxiliary_loss_clip": 0.01058131, "auxiliary_loss_mlp": 0.01049012, "balance_loss_clip": 1.01988912, "balance_loss_mlp": 1.01758504, "epoch": 0.43523222606343004, "flos": 25225759601280.0, "grad_norm": 1.6868226932697041, "language_loss": 0.8666333, "learning_rate": 2.510894862898928e-06, "loss": 0.88770473, "num_input_tokens_seen": 155205790, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.40625, "step": 7239, "time_per_iteration": 2.44221568107605 }, { "auxiliary_loss_clip": 0.01061168, "auxiliary_loss_mlp": 0.01039808, "balance_loss_clip": 1.01049483, "balance_loss_mlp": 1.01986575, "epoch": 0.435292349316098, "flos": 22708137749760.0, "grad_norm": 1.6837704325941958, "language_loss": 0.74158537, "learning_rate": 2.510518312724309e-06, "loss": 0.76259506, "num_input_tokens_seen": 155226475, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4140625, "step": 7240, "time_per_iteration": 2.43851637840271 }, { "auxiliary_loss_clip": 0.01063228, "auxiliary_loss_mlp": 0.01046302, "balance_loss_clip": 1.01425886, "balance_loss_mlp": 1.02019656, "epoch": 0.43535247256876597, "flos": 25774698908160.0, "grad_norm": 1.734747043323094, "language_loss": 0.82662988, "learning_rate": 2.5101417431907842e-06, "loss": 0.84772515, "num_input_tokens_seen": 155247110, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.4296875, "step": 7241, "time_per_iteration": 2.4432718753814697 }, { "auxiliary_loss_clip": 0.0106462, "auxiliary_loss_mlp": 0.01047445, "balance_loss_clip": 1.01568794, "balance_loss_mlp": 1.0196979, "epoch": 0.43541259582143393, "flos": 17527236387840.0, "grad_norm": 2.54591015861134, "language_loss": 0.80753809, "learning_rate": 2.5097651543126345e-06, "loss": 0.8286587, "num_input_tokens_seen": 155261335, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.44921875, "step": 7242, "time_per_iteration": 2.3900856971740723 }, { "auxiliary_loss_clip": 0.0106271, "auxiliary_loss_mlp": 0.01047184, "balance_loss_clip": 1.01542628, "balance_loss_mlp": 1.01829815, "epoch": 0.4354727190741019, "flos": 15194304961920.0, "grad_norm": 2.0901314103983664, "language_loss": 0.7074796, "learning_rate": 2.509388546104138e-06, "loss": 0.72857851, "num_input_tokens_seen": 155278510, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.4453125, "step": 7243, "time_per_iteration": 2.378183126449585 }, { "auxiliary_loss_clip": 0.01058985, "auxiliary_loss_mlp": 0.01040949, "balance_loss_clip": 1.01494932, "balance_loss_mlp": 1.01884246, "epoch": 0.43553284232676986, "flos": 16648472666880.0, "grad_norm": 1.5931986632256858, "language_loss": 0.82405639, "learning_rate": 2.5090119185795766e-06, "loss": 0.8450557, "num_input_tokens_seen": 155296450, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.40234375, "step": 7244, "time_per_iteration": 2.404832124710083 }, { "auxiliary_loss_clip": 0.01060492, "auxiliary_loss_mlp": 0.01042462, "balance_loss_clip": 1.01455522, "balance_loss_mlp": 1.0183332, "epoch": 0.43559296557943783, "flos": 23399837072640.0, "grad_norm": 1.6496200814887945, "language_loss": 0.74305522, "learning_rate": 2.508635271753234e-06, "loss": 0.76408476, "num_input_tokens_seen": 155316080, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.421875, "step": 7245, "time_per_iteration": 2.405592918395996 }, { "auxiliary_loss_clip": 0.01062047, "auxiliary_loss_mlp": 0.01051283, "balance_loss_clip": 1.02190971, "balance_loss_mlp": 1.01910305, "epoch": 0.4356530888321058, "flos": 22417974506880.0, "grad_norm": 1.593495521317607, "language_loss": 0.78758073, "learning_rate": 2.508258605639389e-06, "loss": 0.80871403, "num_input_tokens_seen": 155336765, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.4296875, "step": 7246, "time_per_iteration": 2.518554449081421 }, { "auxiliary_loss_clip": 0.01061627, "auxiliary_loss_mlp": 0.01050251, "balance_loss_clip": 1.01975763, "balance_loss_mlp": 1.01879787, "epoch": 0.43571321208477376, "flos": 21615076903680.0, "grad_norm": 1.9461150110039138, "language_loss": 0.87172788, "learning_rate": 2.5078819202523275e-06, "loss": 0.89284664, "num_input_tokens_seen": 155356440, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.4296875, "step": 7247, "time_per_iteration": 2.4161739349365234 }, { "auxiliary_loss_clip": 0.01062232, "auxiliary_loss_mlp": 0.01056235, "balance_loss_clip": 1.02555084, "balance_loss_mlp": 1.01948786, "epoch": 0.4357733353374418, "flos": 23986238135040.0, "grad_norm": 1.7127224920227593, "language_loss": 0.73643243, "learning_rate": 2.507505215606333e-06, "loss": 0.75761712, "num_input_tokens_seen": 155377070, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.42773438, "step": 7248, "time_per_iteration": 2.4894182682037354 }, { "auxiliary_loss_clip": 0.01062292, "auxiliary_loss_mlp": 0.01049511, "balance_loss_clip": 1.01861167, "balance_loss_mlp": 1.01942813, "epoch": 0.43583345859010975, "flos": 25263570470400.0, "grad_norm": 1.4506900692625024, "language_loss": 0.87911606, "learning_rate": 2.5071284917156893e-06, "loss": 0.9002341, "num_input_tokens_seen": 155398415, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.42773438, "step": 7249, "time_per_iteration": 2.4342713356018066 }, { "auxiliary_loss_clip": 0.01063931, "auxiliary_loss_mlp": 0.01054223, "balance_loss_clip": 1.02362192, "balance_loss_mlp": 1.02054429, "epoch": 0.4358935818427777, "flos": 23695167196800.0, "grad_norm": 1.7341307479292052, "language_loss": 0.82988262, "learning_rate": 2.506751748594683e-06, "loss": 0.85106421, "num_input_tokens_seen": 155415625, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.43359375, "step": 7250, "time_per_iteration": 2.468912124633789 }, { "auxiliary_loss_clip": 0.0106233, "auxiliary_loss_mlp": 0.01046136, "balance_loss_clip": 1.01363957, "balance_loss_mlp": 1.01948023, "epoch": 0.4359537050954457, "flos": 29531562935040.0, "grad_norm": 1.668448177763121, "language_loss": 0.86354482, "learning_rate": 2.5063749862575988e-06, "loss": 0.88462949, "num_input_tokens_seen": 155435505, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.42773438, "step": 7251, "time_per_iteration": 2.4663045406341553 }, { "auxiliary_loss_clip": 0.01058821, "auxiliary_loss_mlp": 0.01050543, "balance_loss_clip": 1.01954865, "balance_loss_mlp": 1.01794744, "epoch": 0.43601382834811364, "flos": 22710162608640.0, "grad_norm": 1.4579019776635065, "language_loss": 0.70383686, "learning_rate": 2.5059982047187245e-06, "loss": 0.72493052, "num_input_tokens_seen": 155455425, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.40820312, "step": 7252, "time_per_iteration": 2.455044746398926 }, { "auxiliary_loss_clip": 0.0105955, "auxiliary_loss_mlp": 0.01044216, "balance_loss_clip": 1.01659513, "balance_loss_mlp": 1.01825416, "epoch": 0.4360739516007816, "flos": 19097734343040.0, "grad_norm": 1.6226356929420411, "language_loss": 0.85019135, "learning_rate": 2.505621403992348e-06, "loss": 0.87122905, "num_input_tokens_seen": 155474250, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.4140625, "step": 7253, "time_per_iteration": 2.4109578132629395 }, { "auxiliary_loss_clip": 0.01061716, "auxiliary_loss_mlp": 0.0104483, "balance_loss_clip": 1.01494384, "balance_loss_mlp": 1.01878774, "epoch": 0.43613407485344957, "flos": 23403293297280.0, "grad_norm": 1.5778030614518577, "language_loss": 0.71406627, "learning_rate": 2.505244584092757e-06, "loss": 0.73513174, "num_input_tokens_seen": 155494685, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.4296875, "step": 7254, "time_per_iteration": 2.498340606689453 }, { "auxiliary_loss_clip": 0.01059761, "auxiliary_loss_mlp": 0.01047884, "balance_loss_clip": 1.01626956, "balance_loss_mlp": 1.01814997, "epoch": 0.43619419810611754, "flos": 22636705374720.0, "grad_norm": 2.4086388282572457, "language_loss": 0.82595444, "learning_rate": 2.5048677450342406e-06, "loss": 0.84703088, "num_input_tokens_seen": 155513040, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.41601562, "step": 7255, "time_per_iteration": 2.410710096359253 }, { "auxiliary_loss_clip": 0.0105984, "auxiliary_loss_mlp": 0.01045207, "balance_loss_clip": 1.01540458, "balance_loss_mlp": 1.01738286, "epoch": 0.4362543213587855, "flos": 20046918009600.0, "grad_norm": 1.708505657555692, "language_loss": 0.78488672, "learning_rate": 2.504490886831089e-06, "loss": 0.80593717, "num_input_tokens_seen": 155530100, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.42382812, "step": 7256, "time_per_iteration": 5.403707027435303 }, { "auxiliary_loss_clip": 0.01060241, "auxiliary_loss_mlp": 0.01051405, "balance_loss_clip": 1.01883674, "balance_loss_mlp": 1.01891041, "epoch": 0.43631444461145347, "flos": 21360245823360.0, "grad_norm": 1.4857585139889533, "language_loss": 0.77192998, "learning_rate": 2.5041140094975922e-06, "loss": 0.79304641, "num_input_tokens_seen": 155549375, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.4140625, "step": 7257, "time_per_iteration": 2.4434125423431396 }, { "auxiliary_loss_clip": 0.01060308, "auxiliary_loss_mlp": 0.01042871, "balance_loss_clip": 1.01194823, "balance_loss_mlp": 1.01795578, "epoch": 0.43637456786412143, "flos": 22417450836480.0, "grad_norm": 1.6665422210036394, "language_loss": 0.74079013, "learning_rate": 2.5037371130480417e-06, "loss": 0.76182193, "num_input_tokens_seen": 155569395, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.42382812, "step": 7258, "time_per_iteration": 2.4162564277648926 }, { "auxiliary_loss_clip": 0.01060732, "auxiliary_loss_mlp": 0.01041306, "balance_loss_clip": 1.01242161, "balance_loss_mlp": 1.01800013, "epoch": 0.4364346911167894, "flos": 28547570776320.0, "grad_norm": 1.7953296279094395, "language_loss": 0.78327072, "learning_rate": 2.5033601974967297e-06, "loss": 0.80429107, "num_input_tokens_seen": 155589090, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.42773438, "step": 7259, "time_per_iteration": 2.5225110054016113 }, { "auxiliary_loss_clip": 0.0101256, "auxiliary_loss_mlp": 0.01004058, "balance_loss_clip": 1.00095892, "balance_loss_mlp": 1.00358367, "epoch": 0.43649481436945736, "flos": 62656211376000.0, "grad_norm": 0.7489872958787197, "language_loss": 0.57088745, "learning_rate": 2.5029832628579483e-06, "loss": 0.59105361, "num_input_tokens_seen": 155648660, "router_z_loss_clip": 0.03100586, "router_z_loss_mlp": 0.08984375, "step": 7260, "time_per_iteration": 2.994781494140625 }, { "auxiliary_loss_clip": 0.01061481, "auxiliary_loss_mlp": 0.01047373, "balance_loss_clip": 1.01454258, "balance_loss_mlp": 1.01811182, "epoch": 0.4365549376221254, "flos": 30590792807040.0, "grad_norm": 1.9268872618588564, "language_loss": 0.72241974, "learning_rate": 2.5026063091459907e-06, "loss": 0.74350828, "num_input_tokens_seen": 155669945, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.43359375, "step": 7261, "time_per_iteration": 3.986769199371338 }, { "auxiliary_loss_clip": 0.01060178, "auxiliary_loss_mlp": 0.01049843, "balance_loss_clip": 1.02013636, "balance_loss_mlp": 1.0173018, "epoch": 0.43661506087479335, "flos": 17164907631360.0, "grad_norm": 1.9553467228589583, "language_loss": 0.70968747, "learning_rate": 2.5022293363751522e-06, "loss": 0.73078763, "num_input_tokens_seen": 155688555, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4296875, "step": 7262, "time_per_iteration": 2.4204537868499756 }, { "auxiliary_loss_clip": 0.01058474, "auxiliary_loss_mlp": 0.01042026, "balance_loss_clip": 1.01591969, "balance_loss_mlp": 1.01983654, "epoch": 0.4366751841274613, "flos": 22046603708160.0, "grad_norm": 1.6113552833893432, "language_loss": 0.80773258, "learning_rate": 2.501852344559726e-06, "loss": 0.82873762, "num_input_tokens_seen": 155705370, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38476562, "step": 7263, "time_per_iteration": 2.417114496231079 }, { "auxiliary_loss_clip": 0.01060358, "auxiliary_loss_mlp": 0.01045613, "balance_loss_clip": 1.01722884, "balance_loss_mlp": 1.01953745, "epoch": 0.4367353073801293, "flos": 15996399603840.0, "grad_norm": 1.7059403925342342, "language_loss": 0.76571941, "learning_rate": 2.50147533371401e-06, "loss": 0.78677905, "num_input_tokens_seen": 155721890, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40820312, "step": 7264, "time_per_iteration": 2.389117956161499 }, { "auxiliary_loss_clip": 0.01059457, "auxiliary_loss_mlp": 0.01043522, "balance_loss_clip": 1.01612723, "balance_loss_mlp": 1.01800597, "epoch": 0.43679543063279724, "flos": 38215998432000.0, "grad_norm": 1.8590905006112874, "language_loss": 0.6354726, "learning_rate": 2.501098303852298e-06, "loss": 0.65650237, "num_input_tokens_seen": 155743970, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.4140625, "step": 7265, "time_per_iteration": 2.5082695484161377 }, { "auxiliary_loss_clip": 0.01057677, "auxiliary_loss_mlp": 0.0103946, "balance_loss_clip": 1.01130247, "balance_loss_mlp": 1.01746821, "epoch": 0.4368555538854652, "flos": 15192384837120.0, "grad_norm": 2.027301064648061, "language_loss": 0.74146211, "learning_rate": 2.5007212549888884e-06, "loss": 0.76243347, "num_input_tokens_seen": 155761830, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40234375, "step": 7266, "time_per_iteration": 2.38924503326416 }, { "auxiliary_loss_clip": 0.01061225, "auxiliary_loss_mlp": 0.01043457, "balance_loss_clip": 1.0153234, "balance_loss_mlp": 1.01921248, "epoch": 0.4369156771381332, "flos": 23068162356480.0, "grad_norm": 1.9705221364745482, "language_loss": 0.83578223, "learning_rate": 2.5003441871380794e-06, "loss": 0.85682905, "num_input_tokens_seen": 155779610, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41992188, "step": 7267, "time_per_iteration": 2.3830809593200684 }, { "auxiliary_loss_clip": 0.01059695, "auxiliary_loss_mlp": 0.01039123, "balance_loss_clip": 1.01157355, "balance_loss_mlp": 1.01840496, "epoch": 0.43697580039080114, "flos": 23439952091520.0, "grad_norm": 1.837530162600504, "language_loss": 0.75864649, "learning_rate": 2.4999671003141674e-06, "loss": 0.77963459, "num_input_tokens_seen": 155798765, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41210938, "step": 7268, "time_per_iteration": 2.408071279525757 }, { "auxiliary_loss_clip": 0.01062703, "auxiliary_loss_mlp": 0.01051037, "balance_loss_clip": 1.02147353, "balance_loss_mlp": 1.01936841, "epoch": 0.4370359236434691, "flos": 18513707253120.0, "grad_norm": 2.1361051460597342, "language_loss": 0.81477088, "learning_rate": 2.499589994531454e-06, "loss": 0.83590829, "num_input_tokens_seen": 155817750, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.43359375, "step": 7269, "time_per_iteration": 2.3714592456817627 }, { "auxiliary_loss_clip": 0.01058846, "auxiliary_loss_mlp": 0.0104367, "balance_loss_clip": 1.01654959, "balance_loss_mlp": 1.01870382, "epoch": 0.43709604689613707, "flos": 23221360869120.0, "grad_norm": 1.784599587098049, "language_loss": 0.76114619, "learning_rate": 2.499212869804237e-06, "loss": 0.78217131, "num_input_tokens_seen": 155836490, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40234375, "step": 7270, "time_per_iteration": 2.4248766899108887 }, { "auxiliary_loss_clip": 0.01060923, "auxiliary_loss_mlp": 0.01041744, "balance_loss_clip": 1.01347947, "balance_loss_mlp": 1.01905251, "epoch": 0.43715617014880503, "flos": 23802629961600.0, "grad_norm": 1.716008088922179, "language_loss": 0.80516469, "learning_rate": 2.4988357261468182e-06, "loss": 0.82619131, "num_input_tokens_seen": 155856225, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.41992188, "step": 7271, "time_per_iteration": 2.415276288986206 }, { "auxiliary_loss_clip": 0.01010171, "auxiliary_loss_mlp": 0.01003401, "balance_loss_clip": 0.99989665, "balance_loss_mlp": 1.0016911, "epoch": 0.437216293401473, "flos": 61937768085120.0, "grad_norm": 0.7031852598678011, "language_loss": 0.54979455, "learning_rate": 2.4984585635734993e-06, "loss": 0.56993032, "num_input_tokens_seen": 155916770, "router_z_loss_clip": 0.03515625, "router_z_loss_mlp": 0.08496094, "step": 7272, "time_per_iteration": 3.0944581031799316 }, { "auxiliary_loss_clip": 0.01061785, "auxiliary_loss_mlp": 0.01050825, "balance_loss_clip": 1.0203315, "balance_loss_mlp": 1.01994061, "epoch": 0.43727641665414096, "flos": 21981141175680.0, "grad_norm": 1.6543342685010045, "language_loss": 0.71012402, "learning_rate": 2.498081382098581e-06, "loss": 0.73125005, "num_input_tokens_seen": 155936490, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.41796875, "step": 7273, "time_per_iteration": 2.4050798416137695 }, { "auxiliary_loss_clip": 0.01061417, "auxiliary_loss_mlp": 0.01047344, "balance_loss_clip": 1.0182569, "balance_loss_mlp": 1.01903784, "epoch": 0.437336539906809, "flos": 39529291334400.0, "grad_norm": 1.7713239548930133, "language_loss": 0.78059328, "learning_rate": 2.497704181736367e-06, "loss": 0.80168086, "num_input_tokens_seen": 155957595, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.42382812, "step": 7274, "time_per_iteration": 2.567021608352661 }, { "auxiliary_loss_clip": 0.01057245, "auxiliary_loss_mlp": 0.01036555, "balance_loss_clip": 1.0117352, "balance_loss_mlp": 1.01700246, "epoch": 0.43739666315947695, "flos": 17456188037760.0, "grad_norm": 1.9197336378327285, "language_loss": 0.81434011, "learning_rate": 2.49732696250116e-06, "loss": 0.83527815, "num_input_tokens_seen": 155975710, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.40234375, "step": 7275, "time_per_iteration": 2.353828191757202 }, { "auxiliary_loss_clip": 0.01060431, "auxiliary_loss_mlp": 0.01047543, "balance_loss_clip": 1.02011275, "balance_loss_mlp": 1.01939559, "epoch": 0.4374567864121449, "flos": 16357925399040.0, "grad_norm": 2.2118297283344703, "language_loss": 0.81908619, "learning_rate": 2.496949724407266e-06, "loss": 0.84016591, "num_input_tokens_seen": 155993090, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41015625, "step": 7276, "time_per_iteration": 2.3761255741119385 }, { "auxiliary_loss_clip": 0.01063986, "auxiliary_loss_mlp": 0.01050863, "balance_loss_clip": 1.01895118, "balance_loss_mlp": 1.019315, "epoch": 0.4375169096648129, "flos": 30586324152960.0, "grad_norm": 2.411715766094461, "language_loss": 0.74420345, "learning_rate": 2.496572467468988e-06, "loss": 0.76535201, "num_input_tokens_seen": 156013685, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.44726562, "step": 7277, "time_per_iteration": 2.4802350997924805 }, { "auxiliary_loss_clip": 0.01058731, "auxiliary_loss_mlp": 0.01047108, "balance_loss_clip": 1.01830697, "balance_loss_mlp": 1.01705575, "epoch": 0.43757703291748085, "flos": 30554273658240.0, "grad_norm": 2.5585596810012765, "language_loss": 0.74098837, "learning_rate": 2.4961951917006317e-06, "loss": 0.76204681, "num_input_tokens_seen": 156034300, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41601562, "step": 7278, "time_per_iteration": 2.4695401191711426 }, { "auxiliary_loss_clip": 0.01057975, "auxiliary_loss_mlp": 0.01049646, "balance_loss_clip": 1.0235033, "balance_loss_mlp": 1.01795959, "epoch": 0.4376371561701488, "flos": 21396311124480.0, "grad_norm": 1.603053283743268, "language_loss": 0.67597657, "learning_rate": 2.4958178971165046e-06, "loss": 0.69705278, "num_input_tokens_seen": 156053805, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40039062, "step": 7279, "time_per_iteration": 2.399232864379883 }, { "auxiliary_loss_clip": 0.01063974, "auxiliary_loss_mlp": 0.01055379, "balance_loss_clip": 1.02582741, "balance_loss_mlp": 1.02049923, "epoch": 0.4376972794228168, "flos": 23403258385920.0, "grad_norm": 1.8943765127530712, "language_loss": 0.83247763, "learning_rate": 2.4954405837309126e-06, "loss": 0.85367119, "num_input_tokens_seen": 156073295, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.43359375, "step": 7280, "time_per_iteration": 2.4058315753936768 }, { "auxiliary_loss_clip": 0.01055755, "auxiliary_loss_mlp": 0.01045938, "balance_loss_clip": 1.01939011, "balance_loss_mlp": 1.01612926, "epoch": 0.43775740267548474, "flos": 22891850657280.0, "grad_norm": 1.5736777842160419, "language_loss": 0.78024936, "learning_rate": 2.4950632515581653e-06, "loss": 0.80126625, "num_input_tokens_seen": 156094540, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39648438, "step": 7281, "time_per_iteration": 2.3885397911071777 }, { "auxiliary_loss_clip": 0.01058442, "auxiliary_loss_mlp": 0.01041627, "balance_loss_clip": 1.01521039, "balance_loss_mlp": 1.01738262, "epoch": 0.4378175259281527, "flos": 23293282003200.0, "grad_norm": 1.9251414261803406, "language_loss": 0.77112889, "learning_rate": 2.494685900612569e-06, "loss": 0.79212958, "num_input_tokens_seen": 156114070, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.41015625, "step": 7282, "time_per_iteration": 2.379964828491211 }, { "auxiliary_loss_clip": 0.01061244, "auxiliary_loss_mlp": 0.01046939, "balance_loss_clip": 1.01677895, "balance_loss_mlp": 1.01918983, "epoch": 0.43787764918082067, "flos": 23875807904640.0, "grad_norm": 1.7869918120781367, "language_loss": 0.86286801, "learning_rate": 2.4943085309084333e-06, "loss": 0.88394988, "num_input_tokens_seen": 156132130, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.41992188, "step": 7283, "time_per_iteration": 2.4463982582092285 }, { "auxiliary_loss_clip": 0.010624, "auxiliary_loss_mlp": 0.01044162, "balance_loss_clip": 1.01334596, "balance_loss_mlp": 1.01871395, "epoch": 0.43793777243348864, "flos": 23987006184960.0, "grad_norm": 1.7024830483412288, "language_loss": 0.81518173, "learning_rate": 2.49393114246007e-06, "loss": 0.83624732, "num_input_tokens_seen": 156150820, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.43554688, "step": 7284, "time_per_iteration": 2.393005132675171 }, { "auxiliary_loss_clip": 0.01059121, "auxiliary_loss_mlp": 0.01050588, "balance_loss_clip": 1.0230031, "balance_loss_mlp": 1.01762199, "epoch": 0.4379978956861566, "flos": 18623090142720.0, "grad_norm": 2.18000639063845, "language_loss": 0.81613672, "learning_rate": 2.493553735281787e-06, "loss": 0.83723378, "num_input_tokens_seen": 156170125, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.4140625, "step": 7285, "time_per_iteration": 2.4198246002197266 }, { "auxiliary_loss_clip": 0.01059808, "auxiliary_loss_mlp": 0.01042095, "balance_loss_clip": 1.01298428, "balance_loss_mlp": 1.01850593, "epoch": 0.43805801893882457, "flos": 21980303303040.0, "grad_norm": 2.0600247713932007, "language_loss": 0.75408256, "learning_rate": 2.493176309387897e-06, "loss": 0.77510154, "num_input_tokens_seen": 156187320, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.41210938, "step": 7286, "time_per_iteration": 2.375105381011963 }, { "auxiliary_loss_clip": 0.01060861, "auxiliary_loss_mlp": 0.01047226, "balance_loss_clip": 1.01768601, "balance_loss_mlp": 1.01805401, "epoch": 0.43811814219149253, "flos": 26392207858560.0, "grad_norm": 2.193043740158527, "language_loss": 0.74713695, "learning_rate": 2.492798864792712e-06, "loss": 0.7682178, "num_input_tokens_seen": 156207455, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.4296875, "step": 7287, "time_per_iteration": 2.442220687866211 }, { "auxiliary_loss_clip": 0.01062136, "auxiliary_loss_mlp": 0.01053082, "balance_loss_clip": 1.02330399, "balance_loss_mlp": 1.01982665, "epoch": 0.43817826544416055, "flos": 17492358072960.0, "grad_norm": 1.7552469550869914, "language_loss": 0.84033799, "learning_rate": 2.492421401510545e-06, "loss": 0.86149019, "num_input_tokens_seen": 156226560, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.421875, "step": 7288, "time_per_iteration": 2.370328903198242 }, { "auxiliary_loss_clip": 0.01062616, "auxiliary_loss_mlp": 0.01046688, "balance_loss_clip": 1.01676702, "balance_loss_mlp": 1.01872134, "epoch": 0.4382383886968285, "flos": 21579919297920.0, "grad_norm": 1.3935986503864533, "language_loss": 0.84869266, "learning_rate": 2.4920439195557093e-06, "loss": 0.86978567, "num_input_tokens_seen": 156246740, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.43945312, "step": 7289, "time_per_iteration": 2.4336354732513428 }, { "auxiliary_loss_clip": 0.01063014, "auxiliary_loss_mlp": 0.01048464, "balance_loss_clip": 1.01870966, "balance_loss_mlp": 1.01828504, "epoch": 0.4382985119494965, "flos": 27922625706240.0, "grad_norm": 2.1227753501590816, "language_loss": 0.79161209, "learning_rate": 2.4916664189425183e-06, "loss": 0.81272686, "num_input_tokens_seen": 156266440, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.44726562, "step": 7290, "time_per_iteration": 2.4175989627838135 }, { "auxiliary_loss_clip": 0.01058603, "auxiliary_loss_mlp": 0.01044435, "balance_loss_clip": 1.0162065, "balance_loss_mlp": 1.01783335, "epoch": 0.43835863520216445, "flos": 24935666181120.0, "grad_norm": 2.1057694776811986, "language_loss": 0.79284644, "learning_rate": 2.491288899685288e-06, "loss": 0.81387681, "num_input_tokens_seen": 156286900, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.40625, "step": 7291, "time_per_iteration": 2.4486818313598633 }, { "auxiliary_loss_clip": 0.01060667, "auxiliary_loss_mlp": 0.01042758, "balance_loss_clip": 1.01494634, "balance_loss_mlp": 1.01905727, "epoch": 0.4384187584548324, "flos": 33508903397760.0, "grad_norm": 1.7019848218104956, "language_loss": 0.66320062, "learning_rate": 2.4909113617983325e-06, "loss": 0.68423486, "num_input_tokens_seen": 156307690, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41601562, "step": 7292, "time_per_iteration": 2.4842960834503174 }, { "auxiliary_loss_clip": 0.01060596, "auxiliary_loss_mlp": 0.01038921, "balance_loss_clip": 1.01150262, "balance_loss_mlp": 1.01874924, "epoch": 0.4384788817075004, "flos": 23949928454400.0, "grad_norm": 1.8268222825076799, "language_loss": 0.75726479, "learning_rate": 2.49053380529597e-06, "loss": 0.77825999, "num_input_tokens_seen": 156326620, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41796875, "step": 7293, "time_per_iteration": 2.435281276702881 }, { "auxiliary_loss_clip": 0.01060472, "auxiliary_loss_mlp": 0.01045556, "balance_loss_clip": 1.01464498, "balance_loss_mlp": 1.01798761, "epoch": 0.43853900496016834, "flos": 19097524874880.0, "grad_norm": 1.8274360074500626, "language_loss": 0.8059845, "learning_rate": 2.490156230192516e-06, "loss": 0.82704473, "num_input_tokens_seen": 156345495, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.42578125, "step": 7294, "time_per_iteration": 2.3682472705841064 }, { "auxiliary_loss_clip": 0.0106281, "auxiliary_loss_mlp": 0.01046122, "balance_loss_clip": 1.01798916, "balance_loss_mlp": 1.02033865, "epoch": 0.4385991282128363, "flos": 13224505253760.0, "grad_norm": 1.8557812731595573, "language_loss": 0.74372649, "learning_rate": 2.4897786365022883e-06, "loss": 0.76481581, "num_input_tokens_seen": 156363155, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.42382812, "step": 7295, "time_per_iteration": 3.766171455383301 }, { "auxiliary_loss_clip": 0.0106277, "auxiliary_loss_mlp": 0.01047104, "balance_loss_clip": 1.01577628, "balance_loss_mlp": 1.01941538, "epoch": 0.4386592514655043, "flos": 14318997465600.0, "grad_norm": 1.6637092409755259, "language_loss": 0.76261288, "learning_rate": 2.4894010242396063e-06, "loss": 0.78371161, "num_input_tokens_seen": 156380940, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.43359375, "step": 7296, "time_per_iteration": 5.186948776245117 }, { "auxiliary_loss_clip": 0.01060549, "auxiliary_loss_mlp": 0.01046952, "balance_loss_clip": 1.01765037, "balance_loss_mlp": 1.01831114, "epoch": 0.43871937471817224, "flos": 22783305640320.0, "grad_norm": 2.1633159758761944, "language_loss": 0.70417559, "learning_rate": 2.4890233934187873e-06, "loss": 0.7252506, "num_input_tokens_seen": 156400415, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.421875, "step": 7297, "time_per_iteration": 2.4623565673828125 }, { "auxiliary_loss_clip": 0.01059312, "auxiliary_loss_mlp": 0.01039056, "balance_loss_clip": 1.01174498, "balance_loss_mlp": 1.01789522, "epoch": 0.4387794979708402, "flos": 28071111185280.0, "grad_norm": 1.5128305863123008, "language_loss": 0.71367806, "learning_rate": 2.4886457440541535e-06, "loss": 0.73466176, "num_input_tokens_seen": 156421120, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.4140625, "step": 7298, "time_per_iteration": 2.4492974281311035 }, { "auxiliary_loss_clip": 0.01059407, "auxiliary_loss_mlp": 0.01044445, "balance_loss_clip": 1.01466656, "balance_loss_mlp": 1.01866531, "epoch": 0.43883962122350817, "flos": 26248365590400.0, "grad_norm": 2.157175793690281, "language_loss": 0.73466778, "learning_rate": 2.4882680761600238e-06, "loss": 0.75570631, "num_input_tokens_seen": 156441535, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.40820312, "step": 7299, "time_per_iteration": 2.44999098777771 }, { "auxiliary_loss_clip": 0.01064173, "auxiliary_loss_mlp": 0.01048258, "balance_loss_clip": 1.01392543, "balance_loss_mlp": 1.01941276, "epoch": 0.43889974447617613, "flos": 25882615520640.0, "grad_norm": 1.9281355343772062, "language_loss": 0.78611612, "learning_rate": 2.487890389750719e-06, "loss": 0.80724043, "num_input_tokens_seen": 156462015, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.44726562, "step": 7300, "time_per_iteration": 2.4579555988311768 }, { "auxiliary_loss_clip": 0.01061881, "auxiliary_loss_mlp": 0.01049477, "balance_loss_clip": 1.01779127, "balance_loss_mlp": 1.01855087, "epoch": 0.43895986772884416, "flos": 25045433095680.0, "grad_norm": 1.893186658732785, "language_loss": 0.73207045, "learning_rate": 2.4875126848405626e-06, "loss": 0.75318402, "num_input_tokens_seen": 156482165, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.43359375, "step": 7301, "time_per_iteration": 3.858041524887085 }, { "auxiliary_loss_clip": 0.01064124, "auxiliary_loss_mlp": 0.01047316, "balance_loss_clip": 1.01441419, "balance_loss_mlp": 1.01956987, "epoch": 0.4390199909815121, "flos": 25993394864640.0, "grad_norm": 2.15830940212214, "language_loss": 0.72258341, "learning_rate": 2.4871349614438757e-06, "loss": 0.74369776, "num_input_tokens_seen": 156503170, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.4453125, "step": 7302, "time_per_iteration": 2.4330086708068848 }, { "auxiliary_loss_clip": 0.01062367, "auxiliary_loss_mlp": 0.01047285, "balance_loss_clip": 1.0174228, "balance_loss_mlp": 1.01902533, "epoch": 0.4390801142341801, "flos": 29020993079040.0, "grad_norm": 1.777201791060803, "language_loss": 0.83281636, "learning_rate": 2.486757219574983e-06, "loss": 0.85391283, "num_input_tokens_seen": 156523005, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.43359375, "step": 7303, "time_per_iteration": 2.476213216781616 }, { "auxiliary_loss_clip": 0.01066237, "auxiliary_loss_mlp": 0.01054685, "balance_loss_clip": 1.02125835, "balance_loss_mlp": 1.01949286, "epoch": 0.43914023748684805, "flos": 33437121909120.0, "grad_norm": 1.8658270369586951, "language_loss": 0.70212865, "learning_rate": 2.4863794592482067e-06, "loss": 0.72333789, "num_input_tokens_seen": 156544440, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.46679688, "step": 7304, "time_per_iteration": 2.4966511726379395 }, { "auxiliary_loss_clip": 0.01059228, "auxiliary_loss_mlp": 0.01048494, "balance_loss_clip": 1.02070618, "balance_loss_mlp": 1.01787901, "epoch": 0.439200360739516, "flos": 34530427134720.0, "grad_norm": 1.533887665136781, "language_loss": 0.79193187, "learning_rate": 2.486001680477873e-06, "loss": 0.81300908, "num_input_tokens_seen": 156565410, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.4140625, "step": 7305, "time_per_iteration": 2.491532564163208 }, { "auxiliary_loss_clip": 0.01061454, "auxiliary_loss_mlp": 0.01055533, "balance_loss_clip": 1.02385926, "balance_loss_mlp": 1.01898372, "epoch": 0.439260483992184, "flos": 21906776246400.0, "grad_norm": 1.674203335379242, "language_loss": 0.70174617, "learning_rate": 2.485623883278308e-06, "loss": 0.72291613, "num_input_tokens_seen": 156584210, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.42578125, "step": 7306, "time_per_iteration": 2.386427640914917 }, { "auxiliary_loss_clip": 0.01062823, "auxiliary_loss_mlp": 0.01044375, "balance_loss_clip": 1.01373804, "balance_loss_mlp": 1.01957369, "epoch": 0.43932060724485195, "flos": 20995368537600.0, "grad_norm": 1.652152664364772, "language_loss": 0.64685053, "learning_rate": 2.4852460676638344e-06, "loss": 0.66792256, "num_input_tokens_seen": 156602730, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.43359375, "step": 7307, "time_per_iteration": 2.4130983352661133 }, { "auxiliary_loss_clip": 0.0106559, "auxiliary_loss_mlp": 0.01044963, "balance_loss_clip": 1.01438546, "balance_loss_mlp": 1.02007794, "epoch": 0.4393807304975199, "flos": 17746141812480.0, "grad_norm": 2.276553847959972, "language_loss": 0.73295546, "learning_rate": 2.4848682336487828e-06, "loss": 0.75406098, "num_input_tokens_seen": 156619405, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.45507812, "step": 7308, "time_per_iteration": 2.341179609298706 }, { "auxiliary_loss_clip": 0.01063698, "auxiliary_loss_mlp": 0.01051046, "balance_loss_clip": 1.01847792, "balance_loss_mlp": 1.01827371, "epoch": 0.4394408537501879, "flos": 22527427219200.0, "grad_norm": 8.952840129246773, "language_loss": 0.77506065, "learning_rate": 2.4844903812474787e-06, "loss": 0.79620802, "num_input_tokens_seen": 156638165, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.453125, "step": 7309, "time_per_iteration": 2.4719150066375732 }, { "auxiliary_loss_clip": 0.01058909, "auxiliary_loss_mlp": 0.01041548, "balance_loss_clip": 1.01495266, "balance_loss_mlp": 1.01835704, "epoch": 0.43950097700285584, "flos": 23439533155200.0, "grad_norm": 1.6806576653859169, "language_loss": 0.7269522, "learning_rate": 2.484112510474251e-06, "loss": 0.74795681, "num_input_tokens_seen": 156658845, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40625, "step": 7310, "time_per_iteration": 2.400636911392212 }, { "auxiliary_loss_clip": 0.01061952, "auxiliary_loss_mlp": 0.01049514, "balance_loss_clip": 1.01810265, "balance_loss_mlp": 1.01891768, "epoch": 0.4395611002555238, "flos": 23179709750400.0, "grad_norm": 1.8975296324059103, "language_loss": 0.77321887, "learning_rate": 2.483734621343429e-06, "loss": 0.79433352, "num_input_tokens_seen": 156677275, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.4296875, "step": 7311, "time_per_iteration": 2.427537441253662 }, { "auxiliary_loss_clip": 0.01062786, "auxiliary_loss_mlp": 0.0104552, "balance_loss_clip": 1.0146333, "balance_loss_mlp": 1.01949108, "epoch": 0.43962122350819177, "flos": 22126275164160.0, "grad_norm": 2.260130338078128, "language_loss": 0.82983363, "learning_rate": 2.483356713869341e-06, "loss": 0.85091662, "num_input_tokens_seen": 156695815, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.43164062, "step": 7312, "time_per_iteration": 2.3986408710479736 }, { "auxiliary_loss_clip": 0.01060939, "auxiliary_loss_mlp": 0.01039506, "balance_loss_clip": 1.0100373, "balance_loss_mlp": 1.01815271, "epoch": 0.43968134676085974, "flos": 17419599066240.0, "grad_norm": 1.8122897199524053, "language_loss": 0.87158227, "learning_rate": 2.482978788066318e-06, "loss": 0.89258671, "num_input_tokens_seen": 156714385, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.42773438, "step": 7313, "time_per_iteration": 2.3987982273101807 }, { "auxiliary_loss_clip": 0.0106187, "auxiliary_loss_mlp": 0.01044383, "balance_loss_clip": 1.01436567, "balance_loss_mlp": 1.01843023, "epoch": 0.43974147001352776, "flos": 18951657747840.0, "grad_norm": 1.9158231644092658, "language_loss": 0.68759257, "learning_rate": 2.4826008439486904e-06, "loss": 0.70865512, "num_input_tokens_seen": 156732615, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.43554688, "step": 7314, "time_per_iteration": 2.357686996459961 }, { "auxiliary_loss_clip": 0.01064398, "auxiliary_loss_mlp": 0.01052042, "balance_loss_clip": 1.01999843, "balance_loss_mlp": 1.02096105, "epoch": 0.4398015932661957, "flos": 18952495620480.0, "grad_norm": 1.8207796307461122, "language_loss": 0.77882272, "learning_rate": 2.4822228815307915e-06, "loss": 0.79998714, "num_input_tokens_seen": 156750920, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.43359375, "step": 7315, "time_per_iteration": 2.3925325870513916 }, { "auxiliary_loss_clip": 0.01060088, "auxiliary_loss_mlp": 0.0103984, "balance_loss_clip": 1.01384044, "balance_loss_mlp": 1.01868296, "epoch": 0.4398617165188637, "flos": 24198964248960.0, "grad_norm": 2.7186508942090275, "language_loss": 0.76055741, "learning_rate": 2.4818449008269523e-06, "loss": 0.78155667, "num_input_tokens_seen": 156768520, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.4140625, "step": 7316, "time_per_iteration": 2.4064395427703857 }, { "auxiliary_loss_clip": 0.01062092, "auxiliary_loss_mlp": 0.01043289, "balance_loss_clip": 1.01446414, "balance_loss_mlp": 1.01988757, "epoch": 0.43992183977153165, "flos": 22235588231040.0, "grad_norm": 2.784521056576409, "language_loss": 0.66906399, "learning_rate": 2.481466901851506e-06, "loss": 0.69011784, "num_input_tokens_seen": 156788700, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.421875, "step": 7317, "time_per_iteration": 2.4330952167510986 }, { "auxiliary_loss_clip": 0.01064463, "auxiliary_loss_mlp": 0.01047617, "balance_loss_clip": 1.01823235, "balance_loss_mlp": 1.02083564, "epoch": 0.4399819630241996, "flos": 18696477553920.0, "grad_norm": 1.7363703215128456, "language_loss": 0.80803013, "learning_rate": 2.4810888846187865e-06, "loss": 0.82915097, "num_input_tokens_seen": 156806470, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4375, "step": 7318, "time_per_iteration": 2.3593099117279053 }, { "auxiliary_loss_clip": 0.01064284, "auxiliary_loss_mlp": 0.01047454, "balance_loss_clip": 1.01796126, "balance_loss_mlp": 1.01984119, "epoch": 0.4400420862768676, "flos": 23878216788480.0, "grad_norm": 1.7009335364060372, "language_loss": 0.81300569, "learning_rate": 2.4807108491431283e-06, "loss": 0.83412302, "num_input_tokens_seen": 156825895, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.4453125, "step": 7319, "time_per_iteration": 2.43990421295166 }, { "auxiliary_loss_clip": 0.01063863, "auxiliary_loss_mlp": 0.01044079, "balance_loss_clip": 1.01511168, "balance_loss_mlp": 1.02011383, "epoch": 0.44010220952953555, "flos": 28036372515840.0, "grad_norm": 1.6103783448450437, "language_loss": 0.80287182, "learning_rate": 2.4803327954388667e-06, "loss": 0.82395124, "num_input_tokens_seen": 156845990, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4375, "step": 7320, "time_per_iteration": 2.480468511581421 }, { "auxiliary_loss_clip": 0.01062322, "auxiliary_loss_mlp": 0.01044179, "balance_loss_clip": 1.01686788, "balance_loss_mlp": 1.02007961, "epoch": 0.4401623327822035, "flos": 23767856380800.0, "grad_norm": 1.4526986808522624, "language_loss": 0.71004057, "learning_rate": 2.4799547235203376e-06, "loss": 0.73110551, "num_input_tokens_seen": 156866685, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.421875, "step": 7321, "time_per_iteration": 2.4319217205047607 }, { "auxiliary_loss_clip": 0.01014292, "auxiliary_loss_mlp": 0.01005455, "balance_loss_clip": 1.00233173, "balance_loss_mlp": 1.00562501, "epoch": 0.4402224560348715, "flos": 70770793795200.0, "grad_norm": 0.8794083617603583, "language_loss": 0.56999892, "learning_rate": 2.4795766334018763e-06, "loss": 0.59019637, "num_input_tokens_seen": 156923450, "router_z_loss_clip": 0.03125, "router_z_loss_mlp": 0.08691406, "step": 7322, "time_per_iteration": 3.1158978939056396 }, { "auxiliary_loss_clip": 0.01061648, "auxiliary_loss_mlp": 0.01041649, "balance_loss_clip": 1.01488709, "balance_loss_mlp": 1.0194962, "epoch": 0.44028257928753944, "flos": 22890733493760.0, "grad_norm": 1.4309498576008288, "language_loss": 0.77373135, "learning_rate": 2.479198525097822e-06, "loss": 0.79476428, "num_input_tokens_seen": 156944795, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.421875, "step": 7323, "time_per_iteration": 2.4308297634124756 }, { "auxiliary_loss_clip": 0.01061381, "auxiliary_loss_mlp": 0.0105159, "balance_loss_clip": 1.02009487, "balance_loss_mlp": 1.01864946, "epoch": 0.4403427025402074, "flos": 17894766936960.0, "grad_norm": 2.171901754457414, "language_loss": 0.81565857, "learning_rate": 2.478820398622511e-06, "loss": 0.83678836, "num_input_tokens_seen": 156962755, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.42773438, "step": 7324, "time_per_iteration": 2.352919816970825 }, { "auxiliary_loss_clip": 0.01013656, "auxiliary_loss_mlp": 0.01014096, "balance_loss_clip": 1.01094854, "balance_loss_mlp": 1.00473452, "epoch": 0.4404028257928754, "flos": 69558993815040.0, "grad_norm": 0.6764434934057291, "language_loss": 0.54537606, "learning_rate": 2.478442253990283e-06, "loss": 0.56565356, "num_input_tokens_seen": 157028095, "router_z_loss_clip": 0.03149414, "router_z_loss_mlp": 0.08935547, "step": 7325, "time_per_iteration": 3.0427229404449463 }, { "auxiliary_loss_clip": 0.01060683, "auxiliary_loss_mlp": 0.01040221, "balance_loss_clip": 1.01380444, "balance_loss_mlp": 1.01963806, "epoch": 0.44046294904554334, "flos": 20922609530880.0, "grad_norm": 1.4879926269902644, "language_loss": 0.70613664, "learning_rate": 2.4780640912154766e-06, "loss": 0.72714567, "num_input_tokens_seen": 157048365, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.41015625, "step": 7326, "time_per_iteration": 2.3984360694885254 }, { "auxiliary_loss_clip": 0.01059514, "auxiliary_loss_mlp": 0.01043568, "balance_loss_clip": 1.01558912, "balance_loss_mlp": 1.01823974, "epoch": 0.44052307229821136, "flos": 23622338367360.0, "grad_norm": 1.4620135726747976, "language_loss": 0.77451193, "learning_rate": 2.477685910312432e-06, "loss": 0.79554272, "num_input_tokens_seen": 157069130, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41210938, "step": 7327, "time_per_iteration": 2.4216556549072266 }, { "auxiliary_loss_clip": 0.01059044, "auxiliary_loss_mlp": 0.01041075, "balance_loss_clip": 1.01297712, "balance_loss_mlp": 1.01744676, "epoch": 0.4405831955508793, "flos": 17596853372160.0, "grad_norm": 2.2507353585766054, "language_loss": 0.85296571, "learning_rate": 2.4773077112954897e-06, "loss": 0.87396693, "num_input_tokens_seen": 157084940, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41601562, "step": 7328, "time_per_iteration": 2.3294596672058105 }, { "auxiliary_loss_clip": 0.0105934, "auxiliary_loss_mlp": 0.01044639, "balance_loss_clip": 1.01760232, "balance_loss_mlp": 1.01749432, "epoch": 0.4406433188035473, "flos": 21462506795520.0, "grad_norm": 1.8601428665869066, "language_loss": 0.78443682, "learning_rate": 2.4769294941789908e-06, "loss": 0.80547655, "num_input_tokens_seen": 157102770, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.41796875, "step": 7329, "time_per_iteration": 2.4241573810577393 }, { "auxiliary_loss_clip": 0.01063714, "auxiliary_loss_mlp": 0.01042906, "balance_loss_clip": 1.01241195, "balance_loss_mlp": 1.01930189, "epoch": 0.44070344205621526, "flos": 22672491384960.0, "grad_norm": 1.4837961079016122, "language_loss": 0.74657083, "learning_rate": 2.476551258977278e-06, "loss": 0.76763701, "num_input_tokens_seen": 157122035, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.4453125, "step": 7330, "time_per_iteration": 2.3878095149993896 }, { "auxiliary_loss_clip": 0.01061405, "auxiliary_loss_mlp": 0.01046814, "balance_loss_clip": 1.01782179, "balance_loss_mlp": 1.01933599, "epoch": 0.4407635653088832, "flos": 23440056825600.0, "grad_norm": 1.8981351777463733, "language_loss": 0.75422025, "learning_rate": 2.4761730057046936e-06, "loss": 0.77530235, "num_input_tokens_seen": 157142800, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.41992188, "step": 7331, "time_per_iteration": 2.447476863861084 }, { "auxiliary_loss_clip": 0.0105809, "auxiliary_loss_mlp": 0.01042159, "balance_loss_clip": 1.01422882, "balance_loss_mlp": 1.01781368, "epoch": 0.4408236885615512, "flos": 24020243665920.0, "grad_norm": 1.359323039423722, "language_loss": 0.77433515, "learning_rate": 2.475794734375581e-06, "loss": 0.79533762, "num_input_tokens_seen": 157163295, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40234375, "step": 7332, "time_per_iteration": 2.4144577980041504 }, { "auxiliary_loss_clip": 0.01060155, "auxiliary_loss_mlp": 0.01048809, "balance_loss_clip": 1.02074742, "balance_loss_mlp": 1.01889884, "epoch": 0.44088381181421915, "flos": 12676019794560.0, "grad_norm": 1.6650089871565255, "language_loss": 0.75312299, "learning_rate": 2.475416445004285e-06, "loss": 0.77421266, "num_input_tokens_seen": 157180890, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41210938, "step": 7333, "time_per_iteration": 2.4148573875427246 }, { "auxiliary_loss_clip": 0.01058481, "auxiliary_loss_mlp": 0.01041437, "balance_loss_clip": 1.01257634, "balance_loss_mlp": 1.01890635, "epoch": 0.4409439350668871, "flos": 24568764036480.0, "grad_norm": 1.7876222631169818, "language_loss": 0.81414127, "learning_rate": 2.4750381376051493e-06, "loss": 0.83514041, "num_input_tokens_seen": 157200580, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.39453125, "step": 7334, "time_per_iteration": 3.8585569858551025 }, { "auxiliary_loss_clip": 0.01067814, "auxiliary_loss_mlp": 0.01053851, "balance_loss_clip": 1.01796842, "balance_loss_mlp": 1.02036905, "epoch": 0.4410040583195551, "flos": 22667638705920.0, "grad_norm": 2.089981064346398, "language_loss": 0.77407479, "learning_rate": 2.47465981219252e-06, "loss": 0.79529154, "num_input_tokens_seen": 157218345, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.47460938, "step": 7335, "time_per_iteration": 3.870195150375366 }, { "auxiliary_loss_clip": 0.01062123, "auxiliary_loss_mlp": 0.01052869, "balance_loss_clip": 1.02188611, "balance_loss_mlp": 1.0193125, "epoch": 0.44106418157222305, "flos": 10851773011200.0, "grad_norm": 1.9683759228334394, "language_loss": 0.73479533, "learning_rate": 2.4742814687807423e-06, "loss": 0.75594521, "num_input_tokens_seen": 157234395, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.42773438, "step": 7336, "time_per_iteration": 3.7258658409118652 }, { "auxiliary_loss_clip": 0.01063475, "auxiliary_loss_mlp": 0.01057381, "balance_loss_clip": 1.02465832, "balance_loss_mlp": 1.0188024, "epoch": 0.441124304824891, "flos": 21725611868160.0, "grad_norm": 2.486735629668177, "language_loss": 0.65754402, "learning_rate": 2.473903107384165e-06, "loss": 0.67875254, "num_input_tokens_seen": 157254805, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.44726562, "step": 7337, "time_per_iteration": 2.3925702571868896 }, { "auxiliary_loss_clip": 0.01015682, "auxiliary_loss_mlp": 0.01010602, "balance_loss_clip": 1.00750268, "balance_loss_mlp": 1.00650883, "epoch": 0.441184428077559, "flos": 63216950722560.0, "grad_norm": 0.7526140761932489, "language_loss": 0.52748525, "learning_rate": 2.473524728017134e-06, "loss": 0.54774809, "num_input_tokens_seen": 157317870, "router_z_loss_clip": 0.03088379, "router_z_loss_mlp": 0.09179688, "step": 7338, "time_per_iteration": 3.059663772583008 }, { "auxiliary_loss_clip": 0.01064901, "auxiliary_loss_mlp": 0.01055744, "balance_loss_clip": 1.02117348, "balance_loss_mlp": 1.01876521, "epoch": 0.44124455133022694, "flos": 21176916940800.0, "grad_norm": 1.9110692821300688, "language_loss": 0.72349161, "learning_rate": 2.473146330693997e-06, "loss": 0.74469805, "num_input_tokens_seen": 157336505, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 0.4609375, "step": 7339, "time_per_iteration": 2.3849451541900635 }, { "auxiliary_loss_clip": 0.01060784, "auxiliary_loss_mlp": 0.01047063, "balance_loss_clip": 1.0206939, "balance_loss_mlp": 1.02078032, "epoch": 0.4413046745828949, "flos": 17456886264960.0, "grad_norm": 1.4442040090961115, "language_loss": 0.70734489, "learning_rate": 2.472767915429105e-06, "loss": 0.72842336, "num_input_tokens_seen": 157354995, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40039062, "step": 7340, "time_per_iteration": 2.422581195831299 }, { "auxiliary_loss_clip": 0.0101631, "auxiliary_loss_mlp": 0.01007701, "balance_loss_clip": 1.00476873, "balance_loss_mlp": 1.00728893, "epoch": 0.4413647978355629, "flos": 61583470807680.0, "grad_norm": 0.911749609981512, "language_loss": 0.64028013, "learning_rate": 2.4723894822368054e-06, "loss": 0.66052026, "num_input_tokens_seen": 157404260, "router_z_loss_clip": 0.02929688, "router_z_loss_mlp": 0.09033203, "step": 7341, "time_per_iteration": 4.282296657562256 }, { "auxiliary_loss_clip": 0.01062079, "auxiliary_loss_mlp": 0.01052875, "balance_loss_clip": 1.02340662, "balance_loss_mlp": 1.01968455, "epoch": 0.4414249210882309, "flos": 27525767748480.0, "grad_norm": 2.0064868041555384, "language_loss": 0.75096887, "learning_rate": 2.47201103113145e-06, "loss": 0.77211845, "num_input_tokens_seen": 157423045, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.42382812, "step": 7342, "time_per_iteration": 2.4320425987243652 }, { "auxiliary_loss_clip": 0.01060122, "auxiliary_loss_mlp": 0.01050015, "balance_loss_clip": 1.01927066, "balance_loss_mlp": 1.01759934, "epoch": 0.44148504434089886, "flos": 23512850743680.0, "grad_norm": 1.6877123454569463, "language_loss": 0.81236267, "learning_rate": 2.4716325621273886e-06, "loss": 0.83346397, "num_input_tokens_seen": 157441815, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.42578125, "step": 7343, "time_per_iteration": 2.4582061767578125 }, { "auxiliary_loss_clip": 0.01059931, "auxiliary_loss_mlp": 0.01045009, "balance_loss_clip": 1.01637459, "balance_loss_mlp": 1.01812673, "epoch": 0.4415451675935668, "flos": 21579500361600.0, "grad_norm": 1.569953266894175, "language_loss": 0.77560043, "learning_rate": 2.4712540752389725e-06, "loss": 0.79664981, "num_input_tokens_seen": 157460470, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.41796875, "step": 7344, "time_per_iteration": 2.406813621520996 }, { "auxiliary_loss_clip": 0.01014775, "auxiliary_loss_mlp": 0.01007858, "balance_loss_clip": 1.00486541, "balance_loss_mlp": 1.00593352, "epoch": 0.4416052908462348, "flos": 59003458623360.0, "grad_norm": 0.8037914184343714, "language_loss": 0.6386956, "learning_rate": 2.470875570480556e-06, "loss": 0.65892196, "num_input_tokens_seen": 157512655, "router_z_loss_clip": 0.02990723, "router_z_loss_mlp": 0.08886719, "step": 7345, "time_per_iteration": 2.812398910522461 }, { "auxiliary_loss_clip": 0.01062908, "auxiliary_loss_mlp": 0.01051137, "balance_loss_clip": 1.02035761, "balance_loss_mlp": 1.01935863, "epoch": 0.44166541409890275, "flos": 26356491671040.0, "grad_norm": 1.6660603291529221, "language_loss": 0.86785954, "learning_rate": 2.470497047866489e-06, "loss": 0.88899994, "num_input_tokens_seen": 157533700, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.43554688, "step": 7346, "time_per_iteration": 2.417098045349121 }, { "auxiliary_loss_clip": 0.01062412, "auxiliary_loss_mlp": 0.01061264, "balance_loss_clip": 1.02950644, "balance_loss_mlp": 1.01974368, "epoch": 0.4417255373515707, "flos": 20191667973120.0, "grad_norm": 1.6509689029097487, "language_loss": 0.81412232, "learning_rate": 2.470118507411128e-06, "loss": 0.8353591, "num_input_tokens_seen": 157551105, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.42578125, "step": 7347, "time_per_iteration": 2.465620517730713 }, { "auxiliary_loss_clip": 0.01062283, "auxiliary_loss_mlp": 0.01062751, "balance_loss_clip": 1.02999198, "balance_loss_mlp": 1.01926613, "epoch": 0.4417856606042387, "flos": 17887121349120.0, "grad_norm": 1.9092763332397331, "language_loss": 0.84725559, "learning_rate": 2.4697399491288263e-06, "loss": 0.86850584, "num_input_tokens_seen": 157568285, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.4296875, "step": 7348, "time_per_iteration": 2.485146999359131 }, { "auxiliary_loss_clip": 0.01062859, "auxiliary_loss_mlp": 0.01051407, "balance_loss_clip": 1.02211714, "balance_loss_mlp": 1.01907039, "epoch": 0.44184578385690665, "flos": 27962810547840.0, "grad_norm": 1.7556654308338602, "language_loss": 0.7311331, "learning_rate": 2.469361373033938e-06, "loss": 0.75227571, "num_input_tokens_seen": 157590405, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4375, "step": 7349, "time_per_iteration": 2.525250196456909 }, { "auxiliary_loss_clip": 0.01062412, "auxiliary_loss_mlp": 0.01053862, "balance_loss_clip": 1.02086473, "balance_loss_mlp": 1.01888788, "epoch": 0.4419059071095746, "flos": 23366774148480.0, "grad_norm": 1.7278907622273492, "language_loss": 0.75466394, "learning_rate": 2.468982779140819e-06, "loss": 0.77582669, "num_input_tokens_seen": 157607420, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.43554688, "step": 7350, "time_per_iteration": 2.3771934509277344 }, { "auxiliary_loss_clip": 0.01061467, "auxiliary_loss_mlp": 0.01059402, "balance_loss_clip": 1.02847934, "balance_loss_mlp": 1.01827776, "epoch": 0.4419660303622426, "flos": 15011290281600.0, "grad_norm": 2.734310648942757, "language_loss": 0.8342129, "learning_rate": 2.468604167463827e-06, "loss": 0.85542166, "num_input_tokens_seen": 157624990, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.43164062, "step": 7351, "time_per_iteration": 2.428382396697998 }, { "auxiliary_loss_clip": 0.01058465, "auxiliary_loss_mlp": 0.01048665, "balance_loss_clip": 1.02316618, "balance_loss_mlp": 1.01837564, "epoch": 0.44202615361491054, "flos": 25370649210240.0, "grad_norm": 1.437596158834641, "language_loss": 0.7388941, "learning_rate": 2.4682255380173176e-06, "loss": 0.75996542, "num_input_tokens_seen": 157645300, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.40039062, "step": 7352, "time_per_iteration": 2.4401159286499023 }, { "auxiliary_loss_clip": 0.01059659, "auxiliary_loss_mlp": 0.01050322, "balance_loss_clip": 1.02173555, "balance_loss_mlp": 1.01816583, "epoch": 0.4420862768675785, "flos": 24679962316800.0, "grad_norm": 1.6983210640906514, "language_loss": 0.88690233, "learning_rate": 2.467846890815649e-06, "loss": 0.90800214, "num_input_tokens_seen": 157664060, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4140625, "step": 7353, "time_per_iteration": 2.494453191757202 }, { "auxiliary_loss_clip": 0.01064295, "auxiliary_loss_mlp": 0.01050626, "balance_loss_clip": 1.02201629, "balance_loss_mlp": 1.02082181, "epoch": 0.44214640012024653, "flos": 19527655224960.0, "grad_norm": 2.2693578461490787, "language_loss": 0.77128625, "learning_rate": 2.4674682258731795e-06, "loss": 0.79243541, "num_input_tokens_seen": 157680905, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.43554688, "step": 7354, "time_per_iteration": 2.380127191543579 }, { "auxiliary_loss_clip": 0.01058582, "auxiliary_loss_mlp": 0.01048298, "balance_loss_clip": 1.0205816, "balance_loss_mlp": 1.01829302, "epoch": 0.4422065233729145, "flos": 47555649014400.0, "grad_norm": 2.1187763497857475, "language_loss": 0.65882498, "learning_rate": 2.467089543204268e-06, "loss": 0.67989373, "num_input_tokens_seen": 157701980, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40234375, "step": 7355, "time_per_iteration": 2.6712300777435303 }, { "auxiliary_loss_clip": 0.01065068, "auxiliary_loss_mlp": 0.0105054, "balance_loss_clip": 1.01973593, "balance_loss_mlp": 1.01985526, "epoch": 0.44226664662558246, "flos": 19280050796160.0, "grad_norm": 1.894133097203628, "language_loss": 0.79750907, "learning_rate": 2.466710842823274e-06, "loss": 0.81866527, "num_input_tokens_seen": 157720555, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.453125, "step": 7356, "time_per_iteration": 2.393824577331543 }, { "auxiliary_loss_clip": 0.01064556, "auxiliary_loss_mlp": 0.01052989, "balance_loss_clip": 1.02347243, "balance_loss_mlp": 1.02060246, "epoch": 0.4423267698782504, "flos": 17820855855360.0, "grad_norm": 1.7472717161802904, "language_loss": 0.78438509, "learning_rate": 2.4663321247445577e-06, "loss": 0.80556059, "num_input_tokens_seen": 157739160, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.43945312, "step": 7357, "time_per_iteration": 2.4396588802337646 }, { "auxiliary_loss_clip": 0.01061957, "auxiliary_loss_mlp": 0.01040502, "balance_loss_clip": 1.01239192, "balance_loss_mlp": 1.02046371, "epoch": 0.4423868931309184, "flos": 29203169886720.0, "grad_norm": 1.5635766792460084, "language_loss": 0.74247235, "learning_rate": 2.465953388982481e-06, "loss": 0.76349694, "num_input_tokens_seen": 157760020, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.4140625, "step": 7358, "time_per_iteration": 2.459625244140625 }, { "auxiliary_loss_clip": 0.01067013, "auxiliary_loss_mlp": 0.01038733, "balance_loss_clip": 1.01029003, "balance_loss_mlp": 1.02352333, "epoch": 0.44244701638358636, "flos": 29711924352000.0, "grad_norm": 1.8976403948468035, "language_loss": 0.76914799, "learning_rate": 2.465574635551405e-06, "loss": 0.79020542, "num_input_tokens_seen": 157780435, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.43359375, "step": 7359, "time_per_iteration": 2.5101490020751953 }, { "auxiliary_loss_clip": 0.01066191, "auxiliary_loss_mlp": 0.01044259, "balance_loss_clip": 1.01526725, "balance_loss_mlp": 1.02272367, "epoch": 0.4425071396362543, "flos": 22928928387840.0, "grad_norm": 1.723749922386189, "language_loss": 0.71154416, "learning_rate": 2.4651958644656923e-06, "loss": 0.73264867, "num_input_tokens_seen": 157799420, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.43359375, "step": 7360, "time_per_iteration": 2.4012649059295654 }, { "auxiliary_loss_clip": 0.01065704, "auxiliary_loss_mlp": 0.01044257, "balance_loss_clip": 1.01569426, "balance_loss_mlp": 1.02263474, "epoch": 0.4425672628889223, "flos": 19791318879360.0, "grad_norm": 3.01416642953967, "language_loss": 0.70705867, "learning_rate": 2.4648170757397053e-06, "loss": 0.7281583, "num_input_tokens_seen": 157817025, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4296875, "step": 7361, "time_per_iteration": 2.419052839279175 }, { "auxiliary_loss_clip": 0.01065608, "auxiliary_loss_mlp": 0.01049454, "balance_loss_clip": 1.01642084, "balance_loss_mlp": 1.02130497, "epoch": 0.44262738614159025, "flos": 13661373496320.0, "grad_norm": 1.9578112824084006, "language_loss": 0.83736676, "learning_rate": 2.464438269387809e-06, "loss": 0.85851741, "num_input_tokens_seen": 157834345, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.44335938, "step": 7362, "time_per_iteration": 2.36337947845459 }, { "auxiliary_loss_clip": 0.01068803, "auxiliary_loss_mlp": 0.01050443, "balance_loss_clip": 1.01625323, "balance_loss_mlp": 1.02229428, "epoch": 0.4426875093942582, "flos": 14209335285120.0, "grad_norm": 1.7067309282926402, "language_loss": 0.75399995, "learning_rate": 2.464059445424366e-06, "loss": 0.77519232, "num_input_tokens_seen": 157852290, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.46484375, "step": 7363, "time_per_iteration": 2.3842103481292725 }, { "auxiliary_loss_clip": 0.01015649, "auxiliary_loss_mlp": 0.01015515, "balance_loss_clip": 1.01227236, "balance_loss_mlp": 1.00637484, "epoch": 0.4427476326469262, "flos": 70113763319040.0, "grad_norm": 0.6836030913050111, "language_loss": 0.55691677, "learning_rate": 2.463680603863743e-06, "loss": 0.57722843, "num_input_tokens_seen": 157923060, "router_z_loss_clip": 0.0324707, "router_z_loss_mlp": 0.09277344, "step": 7364, "time_per_iteration": 3.1010994911193848 }, { "auxiliary_loss_clip": 0.01060932, "auxiliary_loss_mlp": 0.01044304, "balance_loss_clip": 1.01544309, "balance_loss_mlp": 1.0191493, "epoch": 0.44280775589959415, "flos": 25443966798720.0, "grad_norm": 1.5942693731321327, "language_loss": 0.75484025, "learning_rate": 2.463301744720305e-06, "loss": 0.77589262, "num_input_tokens_seen": 157944110, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41796875, "step": 7365, "time_per_iteration": 2.465895175933838 }, { "auxiliary_loss_clip": 0.010624, "auxiliary_loss_mlp": 0.01048847, "balance_loss_clip": 1.02171493, "balance_loss_mlp": 1.01994729, "epoch": 0.4428678791522621, "flos": 22856099558400.0, "grad_norm": 1.6045562687198986, "language_loss": 0.75788999, "learning_rate": 2.4629228680084184e-06, "loss": 0.77900243, "num_input_tokens_seen": 157964295, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.42382812, "step": 7366, "time_per_iteration": 2.3839659690856934 }, { "auxiliary_loss_clip": 0.01064761, "auxiliary_loss_mlp": 0.0104419, "balance_loss_clip": 1.01478064, "balance_loss_mlp": 1.02195454, "epoch": 0.44292800240493013, "flos": 25811252968320.0, "grad_norm": 2.0302094076946586, "language_loss": 0.74680662, "learning_rate": 2.46254397374245e-06, "loss": 0.76789612, "num_input_tokens_seen": 157983970, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4296875, "step": 7367, "time_per_iteration": 2.4520552158355713 }, { "auxiliary_loss_clip": 0.0106513, "auxiliary_loss_mlp": 0.0104984, "balance_loss_clip": 1.0191195, "balance_loss_mlp": 1.02164972, "epoch": 0.4429881256575981, "flos": 32415493438080.0, "grad_norm": 1.564151857456975, "language_loss": 0.74756664, "learning_rate": 2.4621650619367677e-06, "loss": 0.76871634, "num_input_tokens_seen": 158006515, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.43359375, "step": 7368, "time_per_iteration": 2.4760656356811523 }, { "auxiliary_loss_clip": 0.01059269, "auxiliary_loss_mlp": 0.01046991, "balance_loss_clip": 1.02015662, "balance_loss_mlp": 1.01879716, "epoch": 0.44304824891026606, "flos": 22162619756160.0, "grad_norm": 1.6375422527064407, "language_loss": 0.80791867, "learning_rate": 2.4617861326057403e-06, "loss": 0.82898122, "num_input_tokens_seen": 158025565, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.40625, "step": 7369, "time_per_iteration": 2.4084253311157227 }, { "auxiliary_loss_clip": 0.01059266, "auxiliary_loss_mlp": 0.01046954, "balance_loss_clip": 1.02051306, "balance_loss_mlp": 1.01949835, "epoch": 0.443108372162934, "flos": 25337376817920.0, "grad_norm": 1.8887486311064874, "language_loss": 0.73691005, "learning_rate": 2.461407185763737e-06, "loss": 0.75797224, "num_input_tokens_seen": 158045620, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.3984375, "step": 7370, "time_per_iteration": 2.4032037258148193 }, { "auxiliary_loss_clip": 0.01062227, "auxiliary_loss_mlp": 0.01050748, "balance_loss_clip": 1.02018285, "balance_loss_mlp": 1.01888037, "epoch": 0.443168495415602, "flos": 23329836063360.0, "grad_norm": 2.00278626441293, "language_loss": 0.72219491, "learning_rate": 2.461028221425126e-06, "loss": 0.7433247, "num_input_tokens_seen": 158063505, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.43359375, "step": 7371, "time_per_iteration": 2.4172191619873047 }, { "auxiliary_loss_clip": 0.01060177, "auxiliary_loss_mlp": 0.0104596, "balance_loss_clip": 1.01880443, "balance_loss_mlp": 1.01870549, "epoch": 0.44322861866826996, "flos": 21870431654400.0, "grad_norm": 2.5219169774899943, "language_loss": 0.70279497, "learning_rate": 2.4606492396042786e-06, "loss": 0.72385633, "num_input_tokens_seen": 158080335, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.4140625, "step": 7372, "time_per_iteration": 2.371110677719116 }, { "auxiliary_loss_clip": 0.01061333, "auxiliary_loss_mlp": 0.01054311, "balance_loss_clip": 1.02527189, "balance_loss_mlp": 1.01839924, "epoch": 0.4432887419209379, "flos": 20083367335680.0, "grad_norm": 1.6467661260459605, "language_loss": 0.8382954, "learning_rate": 2.4602702403155664e-06, "loss": 0.85945189, "num_input_tokens_seen": 158098955, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.4296875, "step": 7373, "time_per_iteration": 3.6988110542297363 }, { "auxiliary_loss_clip": 0.01013895, "auxiliary_loss_mlp": 0.01006346, "balance_loss_clip": 1.00265026, "balance_loss_mlp": 1.00397801, "epoch": 0.4433488651736059, "flos": 70032032092800.0, "grad_norm": 0.7628464334091587, "language_loss": 0.55240899, "learning_rate": 2.4598912235733604e-06, "loss": 0.57261139, "num_input_tokens_seen": 158164110, "router_z_loss_clip": 0.03686523, "router_z_loss_mlp": 0.09912109, "step": 7374, "time_per_iteration": 3.049039363861084 }, { "auxiliary_loss_clip": 0.01059385, "auxiliary_loss_mlp": 0.01055692, "balance_loss_clip": 1.02622366, "balance_loss_mlp": 1.01790833, "epoch": 0.44340898842627385, "flos": 16281745079040.0, "grad_norm": 2.258315707337408, "language_loss": 0.84714651, "learning_rate": 2.4595121893920327e-06, "loss": 0.86829728, "num_input_tokens_seen": 158179850, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.4140625, "step": 7375, "time_per_iteration": 5.263845682144165 }, { "auxiliary_loss_clip": 0.01061826, "auxiliary_loss_mlp": 0.0104829, "balance_loss_clip": 1.01877403, "balance_loss_mlp": 1.01886892, "epoch": 0.4434691116789418, "flos": 16611220379520.0, "grad_norm": 1.753460545681197, "language_loss": 0.84483445, "learning_rate": 2.4591331377859578e-06, "loss": 0.86593556, "num_input_tokens_seen": 158196590, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.4296875, "step": 7376, "time_per_iteration": 2.349104166030884 }, { "auxiliary_loss_clip": 0.01060311, "auxiliary_loss_mlp": 0.01048215, "balance_loss_clip": 1.0195334, "balance_loss_mlp": 1.01923358, "epoch": 0.4435292349316098, "flos": 19062227623680.0, "grad_norm": 1.5761601048074176, "language_loss": 0.79088855, "learning_rate": 2.4587540687695077e-06, "loss": 0.81197381, "num_input_tokens_seen": 158216355, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.41015625, "step": 7377, "time_per_iteration": 2.3644888401031494 }, { "auxiliary_loss_clip": 0.0105814, "auxiliary_loss_mlp": 0.01042313, "balance_loss_clip": 1.01520491, "balance_loss_mlp": 1.01868999, "epoch": 0.44358935818427775, "flos": 21250269440640.0, "grad_norm": 2.1335934662876106, "language_loss": 0.77791131, "learning_rate": 2.458374982357057e-06, "loss": 0.7989158, "num_input_tokens_seen": 158235825, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.39453125, "step": 7378, "time_per_iteration": 2.400728702545166 }, { "auxiliary_loss_clip": 0.0106021, "auxiliary_loss_mlp": 0.01050818, "balance_loss_clip": 1.02111101, "balance_loss_mlp": 1.01847577, "epoch": 0.4436494814369457, "flos": 12494471391360.0, "grad_norm": 3.9384389089010137, "language_loss": 0.7177223, "learning_rate": 2.457995878562982e-06, "loss": 0.73883259, "num_input_tokens_seen": 158254230, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.41796875, "step": 7379, "time_per_iteration": 2.353121519088745 }, { "auxiliary_loss_clip": 0.01062327, "auxiliary_loss_mlp": 0.01050506, "balance_loss_clip": 1.02094221, "balance_loss_mlp": 1.02018976, "epoch": 0.44370960468961373, "flos": 23658717870720.0, "grad_norm": 1.92187761155757, "language_loss": 0.74544013, "learning_rate": 2.457616757401656e-06, "loss": 0.76656842, "num_input_tokens_seen": 158273400, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.421875, "step": 7380, "time_per_iteration": 3.9661660194396973 }, { "auxiliary_loss_clip": 0.01060903, "auxiliary_loss_mlp": 0.01050217, "balance_loss_clip": 1.01946115, "balance_loss_mlp": 1.0191586, "epoch": 0.4437697279422817, "flos": 32415458526720.0, "grad_norm": 1.5432808209276452, "language_loss": 0.66076767, "learning_rate": 2.457237618887458e-06, "loss": 0.68187886, "num_input_tokens_seen": 158296840, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.41796875, "step": 7381, "time_per_iteration": 2.494781255722046 }, { "auxiliary_loss_clip": 0.0106209, "auxiliary_loss_mlp": 0.0104981, "balance_loss_clip": 1.02159274, "balance_loss_mlp": 1.02031851, "epoch": 0.44382985119494966, "flos": 18111926793600.0, "grad_norm": 1.9444067520392052, "language_loss": 0.81389618, "learning_rate": 2.456858463034763e-06, "loss": 0.83501518, "num_input_tokens_seen": 158314935, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41796875, "step": 7382, "time_per_iteration": 2.361161947250366 }, { "auxiliary_loss_clip": 0.01063373, "auxiliary_loss_mlp": 0.01055597, "balance_loss_clip": 1.02610445, "balance_loss_mlp": 1.02054524, "epoch": 0.44388997444761763, "flos": 30772829969280.0, "grad_norm": 1.6252828821653198, "language_loss": 0.65851176, "learning_rate": 2.456479289857949e-06, "loss": 0.67970145, "num_input_tokens_seen": 158334620, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.42773438, "step": 7383, "time_per_iteration": 2.472119092941284 }, { "auxiliary_loss_clip": 0.01065371, "auxiliary_loss_mlp": 0.01048482, "balance_loss_clip": 1.01586664, "balance_loss_mlp": 1.02106249, "epoch": 0.4439500977002856, "flos": 20338128593280.0, "grad_norm": 2.1443905949071893, "language_loss": 0.77766377, "learning_rate": 2.4561000993713953e-06, "loss": 0.79880226, "num_input_tokens_seen": 158350550, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.44335938, "step": 7384, "time_per_iteration": 2.3564555644989014 }, { "auxiliary_loss_clip": 0.01062086, "auxiliary_loss_mlp": 0.01052784, "balance_loss_clip": 1.02231407, "balance_loss_mlp": 1.02045798, "epoch": 0.44401022095295356, "flos": 20370318733440.0, "grad_norm": 1.4853305932763385, "language_loss": 0.82183659, "learning_rate": 2.4557208915894796e-06, "loss": 0.84298521, "num_input_tokens_seen": 158369555, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.41601562, "step": 7385, "time_per_iteration": 2.4059641361236572 }, { "auxiliary_loss_clip": 0.01064057, "auxiliary_loss_mlp": 0.01050025, "balance_loss_clip": 1.01886392, "balance_loss_mlp": 1.02074432, "epoch": 0.4440703442056215, "flos": 20229583576320.0, "grad_norm": 1.7714162826237845, "language_loss": 0.83016968, "learning_rate": 2.455341666526582e-06, "loss": 0.85131049, "num_input_tokens_seen": 158388045, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.43359375, "step": 7386, "time_per_iteration": 2.364975929260254 }, { "auxiliary_loss_clip": 0.01067995, "auxiliary_loss_mlp": 0.01046131, "balance_loss_clip": 1.01310968, "balance_loss_mlp": 1.02298546, "epoch": 0.4441304674582895, "flos": 39493121299200.0, "grad_norm": 1.8437513052332828, "language_loss": 0.70450169, "learning_rate": 2.4549624241970832e-06, "loss": 0.72564292, "num_input_tokens_seen": 158410115, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.44921875, "step": 7387, "time_per_iteration": 2.561840534210205 }, { "auxiliary_loss_clip": 0.01063731, "auxiliary_loss_mlp": 0.01052763, "balance_loss_clip": 1.02169752, "balance_loss_mlp": 1.02063203, "epoch": 0.44419059071095746, "flos": 14828799271680.0, "grad_norm": 1.877917070581985, "language_loss": 0.72565764, "learning_rate": 2.4545831646153628e-06, "loss": 0.7468226, "num_input_tokens_seen": 158427765, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.43164062, "step": 7388, "time_per_iteration": 2.360121011734009 }, { "auxiliary_loss_clip": 0.01065576, "auxiliary_loss_mlp": 0.01042158, "balance_loss_clip": 1.01164079, "balance_loss_mlp": 1.02220988, "epoch": 0.4442507139636254, "flos": 22636740286080.0, "grad_norm": 1.6623496588440339, "language_loss": 0.70542103, "learning_rate": 2.4542038877958044e-06, "loss": 0.72649837, "num_input_tokens_seen": 158446375, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.43359375, "step": 7389, "time_per_iteration": 2.418391704559326 }, { "auxiliary_loss_clip": 0.01063165, "auxiliary_loss_mlp": 0.01045114, "balance_loss_clip": 1.01842284, "balance_loss_mlp": 1.02063465, "epoch": 0.4443108372162934, "flos": 38289176375040.0, "grad_norm": 1.8125730157072082, "language_loss": 0.75250775, "learning_rate": 2.453824593752788e-06, "loss": 0.77359056, "num_input_tokens_seen": 158467260, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.42578125, "step": 7390, "time_per_iteration": 2.513326406478882 }, { "auxiliary_loss_clip": 0.01062439, "auxiliary_loss_mlp": 0.0104219, "balance_loss_clip": 1.01139784, "balance_loss_mlp": 1.02112341, "epoch": 0.44437096046896135, "flos": 17748027025920.0, "grad_norm": 1.8447632521418622, "language_loss": 0.83333737, "learning_rate": 2.4534452825006988e-06, "loss": 0.85438365, "num_input_tokens_seen": 158486720, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4140625, "step": 7391, "time_per_iteration": 2.375920534133911 }, { "auxiliary_loss_clip": 0.01063194, "auxiliary_loss_mlp": 0.01047431, "balance_loss_clip": 1.01946473, "balance_loss_mlp": 1.02130628, "epoch": 0.4444310837216293, "flos": 13731583973760.0, "grad_norm": 2.753426804903021, "language_loss": 0.74865478, "learning_rate": 2.4530659540539185e-06, "loss": 0.76976097, "num_input_tokens_seen": 158502530, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41992188, "step": 7392, "time_per_iteration": 2.3652689456939697 }, { "auxiliary_loss_clip": 0.01061207, "auxiliary_loss_mlp": 0.01049526, "balance_loss_clip": 1.02173829, "balance_loss_mlp": 1.01933861, "epoch": 0.44449120697429734, "flos": 25009053592320.0, "grad_norm": 1.5493224362014473, "language_loss": 0.80716985, "learning_rate": 2.4526866084268313e-06, "loss": 0.82827717, "num_input_tokens_seen": 158522715, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41796875, "step": 7393, "time_per_iteration": 2.4181606769561768 }, { "auxiliary_loss_clip": 0.01064793, "auxiliary_loss_mlp": 0.01045462, "balance_loss_clip": 1.01358509, "balance_loss_mlp": 1.02082825, "epoch": 0.4445513302269653, "flos": 32670324518400.0, "grad_norm": 1.812638860839533, "language_loss": 0.81585932, "learning_rate": 2.4523072456338226e-06, "loss": 0.83696187, "num_input_tokens_seen": 158543615, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.43945312, "step": 7394, "time_per_iteration": 2.484042167663574 }, { "auxiliary_loss_clip": 0.01059622, "auxiliary_loss_mlp": 0.01043584, "balance_loss_clip": 1.0170598, "balance_loss_mlp": 1.01826406, "epoch": 0.44461145347963327, "flos": 11655019728000.0, "grad_norm": 2.1098196255938166, "language_loss": 0.81020403, "learning_rate": 2.4519278656892785e-06, "loss": 0.831236, "num_input_tokens_seen": 158560330, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.4140625, "step": 7395, "time_per_iteration": 2.355422258377075 }, { "auxiliary_loss_clip": 0.01061341, "auxiliary_loss_mlp": 0.01047422, "balance_loss_clip": 1.01940763, "balance_loss_mlp": 1.01942027, "epoch": 0.44467157673230123, "flos": 20885706357120.0, "grad_norm": 1.8054886468750466, "language_loss": 0.69759274, "learning_rate": 2.451548468607584e-06, "loss": 0.71868038, "num_input_tokens_seen": 158579735, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41992188, "step": 7396, "time_per_iteration": 2.4111058712005615 }, { "auxiliary_loss_clip": 0.01060973, "auxiliary_loss_mlp": 0.01044856, "balance_loss_clip": 1.01563811, "balance_loss_mlp": 1.01789892, "epoch": 0.4447316999849692, "flos": 18545303900160.0, "grad_norm": 1.7186724628272425, "language_loss": 0.81673181, "learning_rate": 2.451169054403126e-06, "loss": 0.83779001, "num_input_tokens_seen": 158597075, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4296875, "step": 7397, "time_per_iteration": 2.3499839305877686 }, { "auxiliary_loss_clip": 0.01060885, "auxiliary_loss_mlp": 0.01045054, "balance_loss_clip": 1.01619387, "balance_loss_mlp": 1.0190537, "epoch": 0.44479182323763716, "flos": 23767926203520.0, "grad_norm": 1.6328011961286737, "language_loss": 0.68048555, "learning_rate": 2.450789623090293e-06, "loss": 0.70154494, "num_input_tokens_seen": 158616650, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41796875, "step": 7398, "time_per_iteration": 2.4421751499176025 }, { "auxiliary_loss_clip": 0.01059671, "auxiliary_loss_mlp": 0.01048535, "balance_loss_clip": 1.02261901, "balance_loss_mlp": 1.01852977, "epoch": 0.44485194649030513, "flos": 16542930026880.0, "grad_norm": 1.7065551656218512, "language_loss": 0.71188271, "learning_rate": 2.450410174683472e-06, "loss": 0.73296481, "num_input_tokens_seen": 158634515, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.41210938, "step": 7399, "time_per_iteration": 2.377603530883789 }, { "auxiliary_loss_clip": 0.01058962, "auxiliary_loss_mlp": 0.01044105, "balance_loss_clip": 1.01735413, "balance_loss_mlp": 1.01855385, "epoch": 0.4449120697429731, "flos": 22599872023680.0, "grad_norm": 1.7119984809684277, "language_loss": 0.73393667, "learning_rate": 2.4500307091970514e-06, "loss": 0.75496733, "num_input_tokens_seen": 158653760, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.40429688, "step": 7400, "time_per_iteration": 2.3969647884368896 }, { "auxiliary_loss_clip": 0.01061037, "auxiliary_loss_mlp": 0.01048284, "balance_loss_clip": 1.01974511, "balance_loss_mlp": 1.01887512, "epoch": 0.44497219299564106, "flos": 20004010081920.0, "grad_norm": 1.5764505486063307, "language_loss": 0.86159092, "learning_rate": 2.449651226645422e-06, "loss": 0.88268411, "num_input_tokens_seen": 158672190, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.421875, "step": 7401, "time_per_iteration": 2.362785816192627 }, { "auxiliary_loss_clip": 0.01057367, "auxiliary_loss_mlp": 0.01043757, "balance_loss_clip": 1.01699448, "balance_loss_mlp": 1.01826108, "epoch": 0.445032316248309, "flos": 25593045770880.0, "grad_norm": 1.4781262823600751, "language_loss": 0.84788442, "learning_rate": 2.449271727042973e-06, "loss": 0.86889565, "num_input_tokens_seen": 158694115, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.390625, "step": 7402, "time_per_iteration": 2.434708595275879 }, { "auxiliary_loss_clip": 0.01061354, "auxiliary_loss_mlp": 0.01042525, "balance_loss_clip": 1.01377177, "balance_loss_mlp": 1.01943016, "epoch": 0.445092439500977, "flos": 21249396656640.0, "grad_norm": 1.7459370908241383, "language_loss": 0.78125179, "learning_rate": 2.4488922104040947e-06, "loss": 0.80229056, "num_input_tokens_seen": 158711000, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.41992188, "step": 7403, "time_per_iteration": 2.3605964183807373 }, { "auxiliary_loss_clip": 0.01013687, "auxiliary_loss_mlp": 0.01007713, "balance_loss_clip": 1.00486398, "balance_loss_mlp": 1.0045979, "epoch": 0.44515256275364495, "flos": 57762051943680.0, "grad_norm": 0.7503272162185524, "language_loss": 0.60141772, "learning_rate": 2.4485126767431793e-06, "loss": 0.62163174, "num_input_tokens_seen": 158769675, "router_z_loss_clip": 0.02844238, "router_z_loss_mlp": 0.09082031, "step": 7404, "time_per_iteration": 3.0382485389709473 }, { "auxiliary_loss_clip": 0.0106436, "auxiliary_loss_mlp": 0.01050617, "balance_loss_clip": 1.01702356, "balance_loss_mlp": 1.01967871, "epoch": 0.4452126860063129, "flos": 15595107903360.0, "grad_norm": 1.5107107827549195, "language_loss": 0.83027613, "learning_rate": 2.4481331260746177e-06, "loss": 0.85142595, "num_input_tokens_seen": 158788215, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.44726562, "step": 7405, "time_per_iteration": 2.370190382003784 }, { "auxiliary_loss_clip": 0.01061335, "auxiliary_loss_mlp": 0.01042525, "balance_loss_clip": 1.01428485, "balance_loss_mlp": 1.01855719, "epoch": 0.4452728092589809, "flos": 21616298801280.0, "grad_norm": 1.9100722170479782, "language_loss": 0.76371241, "learning_rate": 2.4477535584128036e-06, "loss": 0.78475106, "num_input_tokens_seen": 158809090, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.42773438, "step": 7406, "time_per_iteration": 2.39978289604187 }, { "auxiliary_loss_clip": 0.01058877, "auxiliary_loss_mlp": 0.01040046, "balance_loss_clip": 1.01405871, "balance_loss_mlp": 1.01890898, "epoch": 0.4453329325116489, "flos": 29496195861120.0, "grad_norm": 1.9245763869952761, "language_loss": 0.67185366, "learning_rate": 2.447373973772129e-06, "loss": 0.6928429, "num_input_tokens_seen": 158828320, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.40039062, "step": 7407, "time_per_iteration": 2.427783250808716 }, { "auxiliary_loss_clip": 0.01065897, "auxiliary_loss_mlp": 0.0104308, "balance_loss_clip": 1.01547122, "balance_loss_mlp": 1.02301908, "epoch": 0.44539305576431687, "flos": 21360071266560.0, "grad_norm": 1.547990100049085, "language_loss": 0.69185454, "learning_rate": 2.4469943721669887e-06, "loss": 0.71294427, "num_input_tokens_seen": 158847040, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.4296875, "step": 7408, "time_per_iteration": 2.394376039505005 }, { "auxiliary_loss_clip": 0.01062424, "auxiliary_loss_mlp": 0.0105325, "balance_loss_clip": 1.01972818, "balance_loss_mlp": 1.01946771, "epoch": 0.44545317901698483, "flos": 41426017833600.0, "grad_norm": 1.4146912970739978, "language_loss": 0.72651309, "learning_rate": 2.4466147536117776e-06, "loss": 0.74766982, "num_input_tokens_seen": 158870490, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.4296875, "step": 7409, "time_per_iteration": 2.5373780727386475 }, { "auxiliary_loss_clip": 0.01063965, "auxiliary_loss_mlp": 0.01047219, "balance_loss_clip": 1.01697576, "balance_loss_mlp": 1.02045476, "epoch": 0.4455133022696528, "flos": 22053900182400.0, "grad_norm": 1.8702441018025147, "language_loss": 0.65390164, "learning_rate": 2.4462351181208895e-06, "loss": 0.67501342, "num_input_tokens_seen": 158889920, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.43554688, "step": 7410, "time_per_iteration": 2.4029409885406494 }, { "auxiliary_loss_clip": 0.01067903, "auxiliary_loss_mlp": 0.01048224, "balance_loss_clip": 1.0177784, "balance_loss_mlp": 1.02222455, "epoch": 0.44557342552232077, "flos": 23475842835840.0, "grad_norm": 1.8106774757251416, "language_loss": 0.75821018, "learning_rate": 2.4458554657087217e-06, "loss": 0.77937138, "num_input_tokens_seen": 158909580, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.45703125, "step": 7411, "time_per_iteration": 2.39731502532959 }, { "auxiliary_loss_clip": 0.01060767, "auxiliary_loss_mlp": 0.01039352, "balance_loss_clip": 1.012959, "balance_loss_mlp": 1.02131057, "epoch": 0.44563354877498873, "flos": 19133694910080.0, "grad_norm": 1.7669878540799737, "language_loss": 0.79891396, "learning_rate": 2.4454757963896695e-06, "loss": 0.81991518, "num_input_tokens_seen": 158924600, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.39453125, "step": 7412, "time_per_iteration": 2.382634401321411 }, { "auxiliary_loss_clip": 0.01064134, "auxiliary_loss_mlp": 0.0105076, "balance_loss_clip": 1.01975346, "balance_loss_mlp": 1.02069306, "epoch": 0.4456936720276567, "flos": 13620699895680.0, "grad_norm": 2.401567271118526, "language_loss": 0.815467, "learning_rate": 2.4450961101781304e-06, "loss": 0.83661592, "num_input_tokens_seen": 158939345, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.43359375, "step": 7413, "time_per_iteration": 3.703620433807373 }, { "auxiliary_loss_clip": 0.010607, "auxiliary_loss_mlp": 0.01038756, "balance_loss_clip": 1.01167154, "balance_loss_mlp": 1.01992774, "epoch": 0.44575379528032466, "flos": 14713027603200.0, "grad_norm": 1.974671180344838, "language_loss": 0.78090119, "learning_rate": 2.4447164070885026e-06, "loss": 0.80189574, "num_input_tokens_seen": 158955855, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40625, "step": 7414, "time_per_iteration": 3.7568609714508057 }, { "auxiliary_loss_clip": 0.01061112, "auxiliary_loss_mlp": 0.01042407, "balance_loss_clip": 1.0142498, "balance_loss_mlp": 1.01993585, "epoch": 0.4458139185329926, "flos": 24169532106240.0, "grad_norm": 1.4685668184123488, "language_loss": 0.84332961, "learning_rate": 2.4443366871351837e-06, "loss": 0.86436486, "num_input_tokens_seen": 158976315, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41210938, "step": 7415, "time_per_iteration": 3.776383399963379 }, { "auxiliary_loss_clip": 0.01061792, "auxiliary_loss_mlp": 0.01043531, "balance_loss_clip": 1.01560044, "balance_loss_mlp": 1.0196383, "epoch": 0.4458740417856606, "flos": 21761153498880.0, "grad_norm": 1.7445033093781286, "language_loss": 0.85369706, "learning_rate": 2.4439569503325732e-06, "loss": 0.87475026, "num_input_tokens_seen": 158996725, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.421875, "step": 7416, "time_per_iteration": 2.4076192378997803 }, { "auxiliary_loss_clip": 0.01064266, "auxiliary_loss_mlp": 0.01051588, "balance_loss_clip": 1.02016461, "balance_loss_mlp": 1.02020216, "epoch": 0.44593416503832856, "flos": 21067743519360.0, "grad_norm": 1.5892124605191462, "language_loss": 0.81942344, "learning_rate": 2.4435771966950706e-06, "loss": 0.84058201, "num_input_tokens_seen": 159017255, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.44140625, "step": 7417, "time_per_iteration": 2.4322540760040283 }, { "auxiliary_loss_clip": 0.01064531, "auxiliary_loss_mlp": 0.0104738, "balance_loss_clip": 1.01793551, "balance_loss_mlp": 1.0216589, "epoch": 0.4459942882909965, "flos": 22599418176000.0, "grad_norm": 2.0086329281685447, "language_loss": 0.82794309, "learning_rate": 2.443197426237077e-06, "loss": 0.8490622, "num_input_tokens_seen": 159035010, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.4296875, "step": 7418, "time_per_iteration": 2.3648276329040527 }, { "auxiliary_loss_clip": 0.01061693, "auxiliary_loss_mlp": 0.01045824, "balance_loss_clip": 1.01759553, "balance_loss_mlp": 1.01971757, "epoch": 0.4460544115436645, "flos": 26504278922880.0, "grad_norm": 1.7334981462282826, "language_loss": 0.78695917, "learning_rate": 2.442817638972991e-06, "loss": 0.80803442, "num_input_tokens_seen": 159055345, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41992188, "step": 7419, "time_per_iteration": 2.4358415603637695 }, { "auxiliary_loss_clip": 0.01061916, "auxiliary_loss_mlp": 0.01045842, "balance_loss_clip": 1.02039087, "balance_loss_mlp": 1.02051115, "epoch": 0.4461145347963325, "flos": 17603032682880.0, "grad_norm": 1.498351492273621, "language_loss": 0.73411441, "learning_rate": 2.4424378349172176e-06, "loss": 0.75519204, "num_input_tokens_seen": 159074225, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.4140625, "step": 7420, "time_per_iteration": 3.8257832527160645 }, { "auxiliary_loss_clip": 0.01060458, "auxiliary_loss_mlp": 0.01045493, "balance_loss_clip": 1.01813459, "balance_loss_mlp": 1.02025211, "epoch": 0.44617465804900047, "flos": 27267061507200.0, "grad_norm": 1.9074377438836017, "language_loss": 0.75608492, "learning_rate": 2.442058014084156e-06, "loss": 0.77714443, "num_input_tokens_seen": 159095415, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40234375, "step": 7421, "time_per_iteration": 2.4191856384277344 }, { "auxiliary_loss_clip": 0.01058341, "auxiliary_loss_mlp": 0.01048275, "balance_loss_clip": 1.02221572, "balance_loss_mlp": 1.01877809, "epoch": 0.44623478130166844, "flos": 17785418958720.0, "grad_norm": 1.8564371978519814, "language_loss": 0.773754, "learning_rate": 2.44167817648821e-06, "loss": 0.79482013, "num_input_tokens_seen": 159114615, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39648438, "step": 7422, "time_per_iteration": 2.384601593017578 }, { "auxiliary_loss_clip": 0.0106133, "auxiliary_loss_mlp": 0.01043086, "balance_loss_clip": 1.01732504, "balance_loss_mlp": 1.01936913, "epoch": 0.4462949045543364, "flos": 23001896862720.0, "grad_norm": 1.3601470128279411, "language_loss": 0.66094363, "learning_rate": 2.441298322143784e-06, "loss": 0.68198782, "num_input_tokens_seen": 159134370, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.41796875, "step": 7423, "time_per_iteration": 2.3729543685913086 }, { "auxiliary_loss_clip": 0.0105632, "auxiliary_loss_mlp": 0.01044312, "balance_loss_clip": 1.01888514, "balance_loss_mlp": 1.01747656, "epoch": 0.44635502780700437, "flos": 17819180110080.0, "grad_norm": 1.481183099833573, "language_loss": 0.80978894, "learning_rate": 2.4409184510652807e-06, "loss": 0.83079529, "num_input_tokens_seen": 159152540, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38867188, "step": 7424, "time_per_iteration": 2.428053617477417 }, { "auxiliary_loss_clip": 0.01057828, "auxiliary_loss_mlp": 0.01047929, "balance_loss_clip": 1.02340746, "balance_loss_mlp": 1.01832557, "epoch": 0.44641515105967233, "flos": 26686804844160.0, "grad_norm": 1.4083217951576967, "language_loss": 0.81129557, "learning_rate": 2.4405385632671063e-06, "loss": 0.83235312, "num_input_tokens_seen": 159173425, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.39453125, "step": 7425, "time_per_iteration": 2.4148483276367188 }, { "auxiliary_loss_clip": 0.01060138, "auxiliary_loss_mlp": 0.01050037, "balance_loss_clip": 1.02551568, "balance_loss_mlp": 1.01915646, "epoch": 0.4464752743123403, "flos": 18912415512960.0, "grad_norm": 5.76676989632695, "language_loss": 0.78022385, "learning_rate": 2.4401586587636655e-06, "loss": 0.80132556, "num_input_tokens_seen": 159191210, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.41015625, "step": 7426, "time_per_iteration": 2.3825652599334717 }, { "auxiliary_loss_clip": 0.01061218, "auxiliary_loss_mlp": 0.01050749, "balance_loss_clip": 1.02418876, "balance_loss_mlp": 1.01914215, "epoch": 0.44653539756500826, "flos": 29569024690560.0, "grad_norm": 1.7061085577252628, "language_loss": 0.65887141, "learning_rate": 2.4397787375693634e-06, "loss": 0.67999113, "num_input_tokens_seen": 159211755, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.421875, "step": 7427, "time_per_iteration": 2.4287450313568115 }, { "auxiliary_loss_clip": 0.0105894, "auxiliary_loss_mlp": 0.01045897, "balance_loss_clip": 1.0190506, "balance_loss_mlp": 1.0188005, "epoch": 0.44659552081767623, "flos": 21467952967680.0, "grad_norm": 1.7038854891567468, "language_loss": 0.76216054, "learning_rate": 2.439398799698608e-06, "loss": 0.78320897, "num_input_tokens_seen": 159230315, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40234375, "step": 7428, "time_per_iteration": 2.417336940765381 }, { "auxiliary_loss_clip": 0.0105944, "auxiliary_loss_mlp": 0.0104489, "balance_loss_clip": 1.01908123, "balance_loss_mlp": 1.01823556, "epoch": 0.4466556440703442, "flos": 17930902060800.0, "grad_norm": 1.7557531392474424, "language_loss": 0.79142976, "learning_rate": 2.439018845165806e-06, "loss": 0.81247306, "num_input_tokens_seen": 159249810, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.41210938, "step": 7429, "time_per_iteration": 2.3588950634002686 }, { "auxiliary_loss_clip": 0.01063797, "auxiliary_loss_mlp": 0.01045346, "balance_loss_clip": 1.01729655, "balance_loss_mlp": 1.02087939, "epoch": 0.44671576732301216, "flos": 21106322438400.0, "grad_norm": 1.7622683032664104, "language_loss": 0.91483247, "learning_rate": 2.438638873985366e-06, "loss": 0.93592393, "num_input_tokens_seen": 159271715, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.4296875, "step": 7430, "time_per_iteration": 2.419955015182495 }, { "auxiliary_loss_clip": 0.01065109, "auxiliary_loss_mlp": 0.01050251, "balance_loss_clip": 1.02012658, "balance_loss_mlp": 1.02098382, "epoch": 0.4467758905756801, "flos": 23507928241920.0, "grad_norm": 1.672524802184418, "language_loss": 0.80359221, "learning_rate": 2.4382588861716954e-06, "loss": 0.82474583, "num_input_tokens_seen": 159290690, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.44140625, "step": 7431, "time_per_iteration": 2.403712272644043 }, { "auxiliary_loss_clip": 0.01063703, "auxiliary_loss_mlp": 0.01044198, "balance_loss_clip": 1.0157907, "balance_loss_mlp": 1.02051532, "epoch": 0.4468360138283481, "flos": 18733031614080.0, "grad_norm": 2.032434057676348, "language_loss": 0.81158793, "learning_rate": 2.437878881739204e-06, "loss": 0.83266687, "num_input_tokens_seen": 159309400, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.43164062, "step": 7432, "time_per_iteration": 2.348212957382202 }, { "auxiliary_loss_clip": 0.01062998, "auxiliary_loss_mlp": 0.01041481, "balance_loss_clip": 1.01442015, "balance_loss_mlp": 1.02033091, "epoch": 0.4468961370810161, "flos": 23476017392640.0, "grad_norm": 1.8671244700388248, "language_loss": 0.78401721, "learning_rate": 2.437498860702301e-06, "loss": 0.80506194, "num_input_tokens_seen": 159327425, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.42773438, "step": 7433, "time_per_iteration": 2.4316258430480957 }, { "auxiliary_loss_clip": 0.01059148, "auxiliary_loss_mlp": 0.01041502, "balance_loss_clip": 1.01798224, "balance_loss_mlp": 1.02039838, "epoch": 0.4469562603336841, "flos": 30073903994880.0, "grad_norm": 2.188048877768603, "language_loss": 0.77925396, "learning_rate": 2.437118823075398e-06, "loss": 0.80026042, "num_input_tokens_seen": 159345805, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.38671875, "step": 7434, "time_per_iteration": 2.442866325378418 }, { "auxiliary_loss_clip": 0.01064535, "auxiliary_loss_mlp": 0.01038968, "balance_loss_clip": 1.01168156, "balance_loss_mlp": 1.02152491, "epoch": 0.44701638358635204, "flos": 22455296616960.0, "grad_norm": 1.7041198966513709, "language_loss": 0.65700853, "learning_rate": 2.436738768872905e-06, "loss": 0.6780436, "num_input_tokens_seen": 159364595, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.4296875, "step": 7435, "time_per_iteration": 2.417102098464966 }, { "auxiliary_loss_clip": 0.0106235, "auxiliary_loss_mlp": 0.01045884, "balance_loss_clip": 1.0161531, "balance_loss_mlp": 1.02101243, "epoch": 0.44707650683902, "flos": 24056797726080.0, "grad_norm": 2.06497543312855, "language_loss": 0.84771824, "learning_rate": 2.4363586981092346e-06, "loss": 0.86880058, "num_input_tokens_seen": 159385265, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4140625, "step": 7436, "time_per_iteration": 2.4159233570098877 }, { "auxiliary_loss_clip": 0.01066766, "auxiliary_loss_mlp": 0.01042745, "balance_loss_clip": 1.01429009, "balance_loss_mlp": 1.02281952, "epoch": 0.44713663009168797, "flos": 23765866433280.0, "grad_norm": 1.6726615927272321, "language_loss": 0.80789399, "learning_rate": 2.435978610798798e-06, "loss": 0.82898903, "num_input_tokens_seen": 159405080, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.43945312, "step": 7437, "time_per_iteration": 2.464812994003296 }, { "auxiliary_loss_clip": 0.01064903, "auxiliary_loss_mlp": 0.0104221, "balance_loss_clip": 1.01386225, "balance_loss_mlp": 1.02102959, "epoch": 0.44719675334435594, "flos": 24498099711360.0, "grad_norm": 1.8139162809730864, "language_loss": 0.73439437, "learning_rate": 2.435598506956009e-06, "loss": 0.75546551, "num_input_tokens_seen": 159424595, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.43945312, "step": 7438, "time_per_iteration": 2.403141498565674 }, { "auxiliary_loss_clip": 0.01062635, "auxiliary_loss_mlp": 0.01046939, "balance_loss_clip": 1.01940143, "balance_loss_mlp": 1.02087259, "epoch": 0.4472568765970239, "flos": 29780668552320.0, "grad_norm": 1.6677403775960167, "language_loss": 0.68405747, "learning_rate": 2.4352183865952808e-06, "loss": 0.70515317, "num_input_tokens_seen": 159443865, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41796875, "step": 7439, "time_per_iteration": 2.469871759414673 }, { "auxiliary_loss_clip": 0.01065755, "auxiliary_loss_mlp": 0.01047284, "balance_loss_clip": 1.0179944, "balance_loss_mlp": 1.02197158, "epoch": 0.44731699984969187, "flos": 24642011802240.0, "grad_norm": 1.759812683303084, "language_loss": 0.75411522, "learning_rate": 2.4348382497310285e-06, "loss": 0.77524567, "num_input_tokens_seen": 159464525, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4375, "step": 7440, "time_per_iteration": 2.433894634246826 }, { "auxiliary_loss_clip": 0.0106427, "auxiliary_loss_mlp": 0.01042972, "balance_loss_clip": 1.01572061, "balance_loss_mlp": 1.02146316, "epoch": 0.44737712310235983, "flos": 29454544742400.0, "grad_norm": 1.7978331368072171, "language_loss": 0.75141722, "learning_rate": 2.4344580963776655e-06, "loss": 0.77248967, "num_input_tokens_seen": 159486385, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.42773438, "step": 7441, "time_per_iteration": 2.48005747795105 }, { "auxiliary_loss_clip": 0.01063659, "auxiliary_loss_mlp": 0.0104959, "balance_loss_clip": 1.02043176, "balance_loss_mlp": 1.02035904, "epoch": 0.4474372463550278, "flos": 24895760630400.0, "grad_norm": 2.1019543279721553, "language_loss": 0.76137096, "learning_rate": 2.4340779265496082e-06, "loss": 0.78250337, "num_input_tokens_seen": 159503880, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.43359375, "step": 7442, "time_per_iteration": 2.3905279636383057 }, { "auxiliary_loss_clip": 0.01065136, "auxiliary_loss_mlp": 0.01047085, "balance_loss_clip": 1.01792622, "balance_loss_mlp": 1.02093756, "epoch": 0.44749736960769576, "flos": 33180231058560.0, "grad_norm": 1.9839847683388172, "language_loss": 0.75856185, "learning_rate": 2.433697740261273e-06, "loss": 0.77968407, "num_input_tokens_seen": 159522980, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.44140625, "step": 7443, "time_per_iteration": 2.5045106410980225 }, { "auxiliary_loss_clip": 0.01062318, "auxiliary_loss_mlp": 0.01048801, "balance_loss_clip": 1.02007103, "balance_loss_mlp": 1.01941752, "epoch": 0.4475574928603637, "flos": 21070676073600.0, "grad_norm": 1.492686661711205, "language_loss": 0.78180754, "learning_rate": 2.4333175375270748e-06, "loss": 0.80291873, "num_input_tokens_seen": 159543340, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4296875, "step": 7444, "time_per_iteration": 2.4029104709625244 }, { "auxiliary_loss_clip": 0.0105925, "auxiliary_loss_mlp": 0.01045266, "balance_loss_clip": 1.01614332, "balance_loss_mlp": 1.01844728, "epoch": 0.4476176161130317, "flos": 21861703814400.0, "grad_norm": 2.4355008600319787, "language_loss": 0.85946393, "learning_rate": 2.4329373183614333e-06, "loss": 0.88050902, "num_input_tokens_seen": 159558210, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.40820312, "step": 7445, "time_per_iteration": 2.390942096710205 }, { "auxiliary_loss_clip": 0.01062238, "auxiliary_loss_mlp": 0.01044261, "balance_loss_clip": 1.01560259, "balance_loss_mlp": 1.01954746, "epoch": 0.4476777393656997, "flos": 22527566864640.0, "grad_norm": 2.1418163189396875, "language_loss": 0.66012031, "learning_rate": 2.432557082778765e-06, "loss": 0.68118525, "num_input_tokens_seen": 159577920, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.42578125, "step": 7446, "time_per_iteration": 2.379326820373535 }, { "auxiliary_loss_clip": 0.01014453, "auxiliary_loss_mlp": 0.01009643, "balance_loss_clip": 1.00712788, "balance_loss_mlp": 1.00603938, "epoch": 0.4477378626183677, "flos": 49014283507200.0, "grad_norm": 0.7394995978133688, "language_loss": 0.50406832, "learning_rate": 2.4321768307934884e-06, "loss": 0.52430928, "num_input_tokens_seen": 159632295, "router_z_loss_clip": 0.02514648, "router_z_loss_mlp": 0.08398438, "step": 7447, "time_per_iteration": 2.917346239089966 }, { "auxiliary_loss_clip": 0.01012499, "auxiliary_loss_mlp": 0.01007694, "balance_loss_clip": 1.00505912, "balance_loss_mlp": 1.00400996, "epoch": 0.44779798587103564, "flos": 56538868993920.0, "grad_norm": 0.7463159587188575, "language_loss": 0.59377801, "learning_rate": 2.4317965624200235e-06, "loss": 0.61397994, "num_input_tokens_seen": 159698435, "router_z_loss_clip": 0.02636719, "router_z_loss_mlp": 0.08496094, "step": 7448, "time_per_iteration": 3.0910396575927734 }, { "auxiliary_loss_clip": 0.01060034, "auxiliary_loss_mlp": 0.01044537, "balance_loss_clip": 1.01826298, "balance_loss_mlp": 1.01884913, "epoch": 0.4478581091237036, "flos": 46496803167360.0, "grad_norm": 1.9700735084543406, "language_loss": 0.59584785, "learning_rate": 2.431416277672789e-06, "loss": 0.61689359, "num_input_tokens_seen": 159722150, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.41210938, "step": 7449, "time_per_iteration": 2.6133785247802734 }, { "auxiliary_loss_clip": 0.01059414, "auxiliary_loss_mlp": 0.01047134, "balance_loss_clip": 1.02040768, "balance_loss_mlp": 1.01825356, "epoch": 0.4479182323763716, "flos": 20813296464000.0, "grad_norm": 3.239418436180509, "language_loss": 0.81973028, "learning_rate": 2.4310359765662065e-06, "loss": 0.84079581, "num_input_tokens_seen": 159740550, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.41015625, "step": 7450, "time_per_iteration": 2.387289047241211 }, { "auxiliary_loss_clip": 0.01058619, "auxiliary_loss_mlp": 0.01045275, "balance_loss_clip": 1.01900172, "balance_loss_mlp": 1.01862407, "epoch": 0.44797835562903954, "flos": 14245121295360.0, "grad_norm": 1.9783622904797427, "language_loss": 0.81283045, "learning_rate": 2.430655659114697e-06, "loss": 0.83386934, "num_input_tokens_seen": 159758245, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40039062, "step": 7451, "time_per_iteration": 2.382765769958496 }, { "auxiliary_loss_clip": 0.01011691, "auxiliary_loss_mlp": 0.01007989, "balance_loss_clip": 1.00534296, "balance_loss_mlp": 1.00326574, "epoch": 0.4480384788817075, "flos": 63531414138240.0, "grad_norm": 0.827160850581408, "language_loss": 0.62832248, "learning_rate": 2.430275325332681e-06, "loss": 0.6485194, "num_input_tokens_seen": 159826790, "router_z_loss_clip": 0.02648926, "router_z_loss_mlp": 0.08398438, "step": 7452, "time_per_iteration": 4.529294490814209 }, { "auxiliary_loss_clip": 0.0106111, "auxiliary_loss_mlp": 0.01050297, "balance_loss_clip": 1.02063775, "balance_loss_mlp": 1.01897693, "epoch": 0.44809860213437547, "flos": 21651561141120.0, "grad_norm": 1.831648228212988, "language_loss": 0.64371121, "learning_rate": 2.429894975234582e-06, "loss": 0.6648252, "num_input_tokens_seen": 159845805, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.421875, "step": 7453, "time_per_iteration": 2.376901149749756 }, { "auxiliary_loss_clip": 0.01013009, "auxiliary_loss_mlp": 0.01006127, "balance_loss_clip": 1.00309908, "balance_loss_mlp": 1.0045588, "epoch": 0.44815872538704343, "flos": 69187308814080.0, "grad_norm": 0.7909282751716382, "language_loss": 0.57159221, "learning_rate": 2.4295146088348224e-06, "loss": 0.59178364, "num_input_tokens_seen": 159898860, "router_z_loss_clip": 0.03027344, "router_z_loss_mlp": 0.08447266, "step": 7454, "time_per_iteration": 5.593378067016602 }, { "auxiliary_loss_clip": 0.01060608, "auxiliary_loss_mlp": 0.01050567, "balance_loss_clip": 1.02405524, "balance_loss_mlp": 1.01958728, "epoch": 0.4482188486397114, "flos": 12597640058880.0, "grad_norm": 2.3634121033443685, "language_loss": 0.76583117, "learning_rate": 2.4291342261478255e-06, "loss": 0.7869429, "num_input_tokens_seen": 159911555, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.41015625, "step": 7455, "time_per_iteration": 2.370837926864624 }, { "auxiliary_loss_clip": 0.01060937, "auxiliary_loss_mlp": 0.01055281, "balance_loss_clip": 1.02743351, "balance_loss_mlp": 1.01954722, "epoch": 0.44827897189237936, "flos": 34056760452480.0, "grad_norm": 1.724433408342534, "language_loss": 0.77031076, "learning_rate": 2.428753827188016e-06, "loss": 0.79147291, "num_input_tokens_seen": 159931470, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.4140625, "step": 7456, "time_per_iteration": 2.489419460296631 }, { "auxiliary_loss_clip": 0.01061787, "auxiliary_loss_mlp": 0.01049695, "balance_loss_clip": 1.02356446, "balance_loss_mlp": 1.02134752, "epoch": 0.44833909514504733, "flos": 25146472170240.0, "grad_norm": 2.0334951594428476, "language_loss": 0.77779019, "learning_rate": 2.428373411969818e-06, "loss": 0.7989049, "num_input_tokens_seen": 159946115, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40429688, "step": 7457, "time_per_iteration": 2.4270012378692627 }, { "auxiliary_loss_clip": 0.01061817, "auxiliary_loss_mlp": 0.01043085, "balance_loss_clip": 1.01416516, "balance_loss_mlp": 1.02041936, "epoch": 0.4483992183977153, "flos": 16179065170560.0, "grad_norm": 2.126938931187814, "language_loss": 0.69004887, "learning_rate": 2.4279929805076576e-06, "loss": 0.71109784, "num_input_tokens_seen": 159963915, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4140625, "step": 7458, "time_per_iteration": 2.3531150817871094 }, { "auxiliary_loss_clip": 0.01063861, "auxiliary_loss_mlp": 0.01052192, "balance_loss_clip": 1.02333188, "balance_loss_mlp": 1.01998734, "epoch": 0.44845934165038326, "flos": 17745164294400.0, "grad_norm": 1.516548979560439, "language_loss": 0.72714198, "learning_rate": 2.427612532815961e-06, "loss": 0.74830246, "num_input_tokens_seen": 159982140, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4375, "step": 7459, "time_per_iteration": 2.4357681274414062 }, { "auxiliary_loss_clip": 0.01060758, "auxiliary_loss_mlp": 0.01051066, "balance_loss_clip": 1.02383876, "balance_loss_mlp": 1.01954532, "epoch": 0.4485194649030513, "flos": 21834820200960.0, "grad_norm": 3.2218251676191842, "language_loss": 0.70643497, "learning_rate": 2.427232068909154e-06, "loss": 0.72755313, "num_input_tokens_seen": 160002280, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.41210938, "step": 7460, "time_per_iteration": 3.8474650382995605 }, { "auxiliary_loss_clip": 0.01063723, "auxiliary_loss_mlp": 0.01055455, "balance_loss_clip": 1.02850187, "balance_loss_mlp": 1.02148438, "epoch": 0.44857958815571924, "flos": 20083472069760.0, "grad_norm": 2.011588832194262, "language_loss": 0.79714096, "learning_rate": 2.4268515888016635e-06, "loss": 0.81833273, "num_input_tokens_seen": 160020260, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.421875, "step": 7461, "time_per_iteration": 2.386831521987915 }, { "auxiliary_loss_clip": 0.01064398, "auxiliary_loss_mlp": 0.01047517, "balance_loss_clip": 1.01949072, "balance_loss_mlp": 1.0213151, "epoch": 0.4486397114083872, "flos": 27052275623040.0, "grad_norm": 3.3465856170604398, "language_loss": 0.69754052, "learning_rate": 2.4264710925079184e-06, "loss": 0.7186597, "num_input_tokens_seen": 160040240, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.43164062, "step": 7462, "time_per_iteration": 2.479278802871704 }, { "auxiliary_loss_clip": 0.0102157, "auxiliary_loss_mlp": 0.01021977, "balance_loss_clip": 1.01940238, "balance_loss_mlp": 1.01300263, "epoch": 0.4486998346610552, "flos": 67318164155520.0, "grad_norm": 1.950634939162188, "language_loss": 0.54497659, "learning_rate": 2.4260905800423462e-06, "loss": 0.56541204, "num_input_tokens_seen": 160093865, "router_z_loss_clip": 0.02575684, "router_z_loss_mlp": 0.0859375, "step": 7463, "time_per_iteration": 3.0324108600616455 }, { "auxiliary_loss_clip": 0.0106194, "auxiliary_loss_mlp": 0.01042193, "balance_loss_clip": 1.01539493, "balance_loss_mlp": 1.02131641, "epoch": 0.44875995791372314, "flos": 27635569574400.0, "grad_norm": 2.0350491602530942, "language_loss": 0.77395082, "learning_rate": 2.4257100514193775e-06, "loss": 0.79499215, "num_input_tokens_seen": 160113590, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.40625, "step": 7464, "time_per_iteration": 2.483410120010376 }, { "auxiliary_loss_clip": 0.01063165, "auxiliary_loss_mlp": 0.01040989, "balance_loss_clip": 1.01655126, "balance_loss_mlp": 1.02321362, "epoch": 0.4488200811663911, "flos": 13005111070080.0, "grad_norm": 1.7632963398374, "language_loss": 0.76112831, "learning_rate": 2.425329506653441e-06, "loss": 0.78216988, "num_input_tokens_seen": 160131795, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.40039062, "step": 7465, "time_per_iteration": 2.414313316345215 }, { "auxiliary_loss_clip": 0.01067352, "auxiliary_loss_mlp": 0.0105169, "balance_loss_clip": 1.02119672, "balance_loss_mlp": 1.02281857, "epoch": 0.44888020441905907, "flos": 27488759840640.0, "grad_norm": 1.9759589488394178, "language_loss": 0.81716955, "learning_rate": 2.424948945758966e-06, "loss": 0.83835995, "num_input_tokens_seen": 160150635, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.4453125, "step": 7466, "time_per_iteration": 2.475558042526245 }, { "auxiliary_loss_clip": 0.01064494, "auxiliary_loss_mlp": 0.01041367, "balance_loss_clip": 1.01640475, "balance_loss_mlp": 1.02355838, "epoch": 0.44894032767172704, "flos": 18258701616000.0, "grad_norm": 2.257730549369128, "language_loss": 0.81741452, "learning_rate": 2.4245683687503844e-06, "loss": 0.83847308, "num_input_tokens_seen": 160168615, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.41015625, "step": 7467, "time_per_iteration": 2.357679843902588 }, { "auxiliary_loss_clip": 0.0106217, "auxiliary_loss_mlp": 0.01041482, "balance_loss_clip": 1.01791418, "balance_loss_mlp": 1.02341485, "epoch": 0.449000450924395, "flos": 21578767223040.0, "grad_norm": 1.6218531905722746, "language_loss": 0.76381624, "learning_rate": 2.424187775642129e-06, "loss": 0.7848528, "num_input_tokens_seen": 160187295, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.38671875, "step": 7468, "time_per_iteration": 2.403348207473755 }, { "auxiliary_loss_clip": 0.01064981, "auxiliary_loss_mlp": 0.01041045, "balance_loss_clip": 1.01716709, "balance_loss_mlp": 1.02384937, "epoch": 0.44906057417706297, "flos": 17966932450560.0, "grad_norm": 1.6417705449681603, "language_loss": 0.71900392, "learning_rate": 2.4238071664486297e-06, "loss": 0.74006414, "num_input_tokens_seen": 160205115, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.41210938, "step": 7469, "time_per_iteration": 2.381255626678467 }, { "auxiliary_loss_clip": 0.0106224, "auxiliary_loss_mlp": 0.01046374, "balance_loss_clip": 1.01959944, "balance_loss_mlp": 1.02101719, "epoch": 0.44912069742973093, "flos": 20046324516480.0, "grad_norm": 1.6446459304072352, "language_loss": 0.72535467, "learning_rate": 2.4234265411843203e-06, "loss": 0.74644083, "num_input_tokens_seen": 160222580, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.41210938, "step": 7470, "time_per_iteration": 2.4082746505737305 }, { "auxiliary_loss_clip": 0.0106462, "auxiliary_loss_mlp": 0.01045524, "balance_loss_clip": 1.01814163, "balance_loss_mlp": 1.02308202, "epoch": 0.4491808206823989, "flos": 21032446268160.0, "grad_norm": 1.7836943497017024, "language_loss": 0.78435224, "learning_rate": 2.423045899863634e-06, "loss": 0.80545366, "num_input_tokens_seen": 160241520, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41601562, "step": 7471, "time_per_iteration": 2.381223440170288 }, { "auxiliary_loss_clip": 0.01061788, "auxiliary_loss_mlp": 0.01059628, "balance_loss_clip": 1.03069615, "balance_loss_mlp": 1.02044487, "epoch": 0.44924094393506686, "flos": 22966006118400.0, "grad_norm": 1.6957215060082014, "language_loss": 0.71949792, "learning_rate": 2.4226652425010048e-06, "loss": 0.74071217, "num_input_tokens_seen": 160261815, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4140625, "step": 7472, "time_per_iteration": 2.4694647789001465 }, { "auxiliary_loss_clip": 0.01017179, "auxiliary_loss_mlp": 0.01007838, "balance_loss_clip": 1.00497687, "balance_loss_mlp": 1.00875127, "epoch": 0.4493010671877349, "flos": 59230323838080.0, "grad_norm": 0.7878824221482037, "language_loss": 0.61807913, "learning_rate": 2.4222845691108676e-06, "loss": 0.63832927, "num_input_tokens_seen": 160317070, "router_z_loss_clip": 0.02856445, "router_z_loss_mlp": 0.08398438, "step": 7473, "time_per_iteration": 2.9801182746887207 }, { "auxiliary_loss_clip": 0.01060488, "auxiliary_loss_mlp": 0.01047885, "balance_loss_clip": 1.02210021, "balance_loss_mlp": 1.01947725, "epoch": 0.44936119044040285, "flos": 18003905447040.0, "grad_norm": 1.9129298958746344, "language_loss": 0.7905125, "learning_rate": 2.421903879707657e-06, "loss": 0.81159621, "num_input_tokens_seen": 160334980, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.41015625, "step": 7474, "time_per_iteration": 2.4021730422973633 }, { "auxiliary_loss_clip": 0.01057973, "auxiliary_loss_mlp": 0.01058705, "balance_loss_clip": 1.03121495, "balance_loss_mlp": 1.0182879, "epoch": 0.4494213136930708, "flos": 21250758199680.0, "grad_norm": 1.6572647697526866, "language_loss": 0.73197532, "learning_rate": 2.4215231743058086e-06, "loss": 0.75314212, "num_input_tokens_seen": 160354500, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.3984375, "step": 7475, "time_per_iteration": 2.3916754722595215 }, { "auxiliary_loss_clip": 0.01059435, "auxiliary_loss_mlp": 0.01058973, "balance_loss_clip": 1.03219891, "balance_loss_mlp": 1.01768494, "epoch": 0.4494814369457388, "flos": 27417432199680.0, "grad_norm": 1.8982123997833262, "language_loss": 0.78000748, "learning_rate": 2.4211424529197594e-06, "loss": 0.80119157, "num_input_tokens_seen": 160373650, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.41796875, "step": 7476, "time_per_iteration": 2.45082426071167 }, { "auxiliary_loss_clip": 0.01061794, "auxiliary_loss_mlp": 0.01068399, "balance_loss_clip": 1.03804803, "balance_loss_mlp": 1.01924014, "epoch": 0.44954156019840674, "flos": 22853027358720.0, "grad_norm": 2.3153479449573022, "language_loss": 0.73268032, "learning_rate": 2.4207617155639464e-06, "loss": 0.75398231, "num_input_tokens_seen": 160393430, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.42578125, "step": 7477, "time_per_iteration": 2.405015468597412 }, { "auxiliary_loss_clip": 0.01063634, "auxiliary_loss_mlp": 0.01055901, "balance_loss_clip": 1.02607441, "balance_loss_mlp": 1.02017474, "epoch": 0.4496016834510747, "flos": 17200623818880.0, "grad_norm": 1.9902788312258757, "language_loss": 0.70298278, "learning_rate": 2.4203809622528062e-06, "loss": 0.72417808, "num_input_tokens_seen": 160410545, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.43554688, "step": 7478, "time_per_iteration": 2.411987781524658 }, { "auxiliary_loss_clip": 0.0105736, "auxiliary_loss_mlp": 0.010594, "balance_loss_clip": 1.03255463, "balance_loss_mlp": 1.01874053, "epoch": 0.4496618067037427, "flos": 18915627358080.0, "grad_norm": 1.8224918742826195, "language_loss": 0.9009285, "learning_rate": 2.420000193000779e-06, "loss": 0.92209613, "num_input_tokens_seen": 160428105, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.38671875, "step": 7479, "time_per_iteration": 2.372098207473755 }, { "auxiliary_loss_clip": 0.01063053, "auxiliary_loss_mlp": 0.01060557, "balance_loss_clip": 1.03250718, "balance_loss_mlp": 1.02134788, "epoch": 0.44972192995641064, "flos": 21030630877440.0, "grad_norm": 1.665484434548596, "language_loss": 0.76902831, "learning_rate": 2.419619407822302e-06, "loss": 0.79026437, "num_input_tokens_seen": 160448815, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41796875, "step": 7480, "time_per_iteration": 2.429140567779541 }, { "auxiliary_loss_clip": 0.01065452, "auxiliary_loss_mlp": 0.01057482, "balance_loss_clip": 1.02763176, "balance_loss_mlp": 1.02212262, "epoch": 0.4497820532090786, "flos": 20776044176640.0, "grad_norm": 2.1280579417771657, "language_loss": 0.81380546, "learning_rate": 2.419238606731815e-06, "loss": 0.83503479, "num_input_tokens_seen": 160465940, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.43359375, "step": 7481, "time_per_iteration": 2.4074056148529053 }, { "auxiliary_loss_clip": 0.01060795, "auxiliary_loss_mlp": 0.01044928, "balance_loss_clip": 1.01870203, "balance_loss_mlp": 1.02140367, "epoch": 0.44984217646174657, "flos": 33801196233600.0, "grad_norm": 1.7390548316514016, "language_loss": 0.69721901, "learning_rate": 2.418857789743758e-06, "loss": 0.71827614, "num_input_tokens_seen": 160486710, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.39453125, "step": 7482, "time_per_iteration": 2.5256571769714355 }, { "auxiliary_loss_clip": 0.01064254, "auxiliary_loss_mlp": 0.01048692, "balance_loss_clip": 1.01990283, "balance_loss_mlp": 1.02246571, "epoch": 0.44990229971441453, "flos": 15517600951680.0, "grad_norm": 4.112418058019232, "language_loss": 0.85994375, "learning_rate": 2.418476956872571e-06, "loss": 0.88107324, "num_input_tokens_seen": 160503405, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41796875, "step": 7483, "time_per_iteration": 2.354069232940674 }, { "auxiliary_loss_clip": 0.01068223, "auxiliary_loss_mlp": 0.01048286, "balance_loss_clip": 1.01977134, "balance_loss_mlp": 1.02437997, "epoch": 0.4499624229670825, "flos": 29860619299200.0, "grad_norm": 1.738103393695953, "language_loss": 0.81887078, "learning_rate": 2.4180961081326967e-06, "loss": 0.84003592, "num_input_tokens_seen": 160525080, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4375, "step": 7484, "time_per_iteration": 2.502774953842163 }, { "auxiliary_loss_clip": 0.01071005, "auxiliary_loss_mlp": 0.0104998, "balance_loss_clip": 1.01502812, "balance_loss_mlp": 1.02381635, "epoch": 0.45002254621975046, "flos": 18512729735040.0, "grad_norm": 2.609101713777786, "language_loss": 0.76085693, "learning_rate": 2.4177152435385754e-06, "loss": 0.78206676, "num_input_tokens_seen": 160540895, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 0.47265625, "step": 7485, "time_per_iteration": 2.401819944381714 }, { "auxiliary_loss_clip": 0.01025166, "auxiliary_loss_mlp": 0.01023606, "balance_loss_clip": 1.0210551, "balance_loss_mlp": 1.01624537, "epoch": 0.4500826694724185, "flos": 70417334390400.0, "grad_norm": 0.807937363010597, "language_loss": 0.5890553, "learning_rate": 2.4173343631046504e-06, "loss": 0.60954297, "num_input_tokens_seen": 160598270, "router_z_loss_clip": 0.0255127, "router_z_loss_mlp": 0.08886719, "step": 7486, "time_per_iteration": 3.0681605339050293 }, { "auxiliary_loss_clip": 0.01067933, "auxiliary_loss_mlp": 0.01047795, "balance_loss_clip": 1.01856518, "balance_loss_mlp": 1.02484572, "epoch": 0.45014279272508645, "flos": 15777982938240.0, "grad_norm": 1.9189894107674152, "language_loss": 0.84294266, "learning_rate": 2.4169534668453654e-06, "loss": 0.86409998, "num_input_tokens_seen": 160614720, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4296875, "step": 7487, "time_per_iteration": 2.3868765830993652 }, { "auxiliary_loss_clip": 0.01069, "auxiliary_loss_mlp": 0.01043834, "balance_loss_clip": 1.01582026, "balance_loss_mlp": 1.02709186, "epoch": 0.4502029159777544, "flos": 21798475608960.0, "grad_norm": 1.5297381873543638, "language_loss": 0.78511143, "learning_rate": 2.4165725547751622e-06, "loss": 0.80623972, "num_input_tokens_seen": 160635170, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41796875, "step": 7488, "time_per_iteration": 2.4674696922302246 }, { "auxiliary_loss_clip": 0.01072994, "auxiliary_loss_mlp": 0.01045573, "balance_loss_clip": 1.01295733, "balance_loss_mlp": 1.02663648, "epoch": 0.4502630392304224, "flos": 28766685669120.0, "grad_norm": 2.1089852196065686, "language_loss": 0.73376125, "learning_rate": 2.4161916269084858e-06, "loss": 0.75494695, "num_input_tokens_seen": 160654490, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.46289062, "step": 7489, "time_per_iteration": 2.448042154312134 }, { "auxiliary_loss_clip": 0.01072346, "auxiliary_loss_mlp": 0.01054469, "balance_loss_clip": 1.01939726, "balance_loss_mlp": 1.02700043, "epoch": 0.45032316248309034, "flos": 15843480382080.0, "grad_norm": 2.2370465351909483, "language_loss": 0.70779502, "learning_rate": 2.4158106832597817e-06, "loss": 0.72906315, "num_input_tokens_seen": 160669400, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 0.453125, "step": 7490, "time_per_iteration": 2.4084415435791016 }, { "auxiliary_loss_clip": 0.01031972, "auxiliary_loss_mlp": 0.01004894, "balance_loss_clip": 1.00240278, "balance_loss_mlp": 1.02197397, "epoch": 0.4503832857357583, "flos": 57850311594240.0, "grad_norm": 0.7854790760003224, "language_loss": 0.56746721, "learning_rate": 2.415429723843495e-06, "loss": 0.58783585, "num_input_tokens_seen": 160733820, "router_z_loss_clip": 0.02490234, "router_z_loss_mlp": 0.10009766, "step": 7491, "time_per_iteration": 4.339057922363281 }, { "auxiliary_loss_clip": 0.0106863, "auxiliary_loss_mlp": 0.01052204, "balance_loss_clip": 1.02546573, "balance_loss_mlp": 1.0264523, "epoch": 0.4504434089884263, "flos": 23876959979520.0, "grad_norm": 1.742570288445148, "language_loss": 0.80823934, "learning_rate": 2.4150487486740713e-06, "loss": 0.82944769, "num_input_tokens_seen": 160753175, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.421875, "step": 7492, "time_per_iteration": 2.481872081756592 }, { "auxiliary_loss_clip": 0.01069872, "auxiliary_loss_mlp": 0.01063411, "balance_loss_clip": 1.03379965, "balance_loss_mlp": 1.02482629, "epoch": 0.45050353224109424, "flos": 17784127238400.0, "grad_norm": 2.3968384721121043, "language_loss": 0.9345969, "learning_rate": 2.4146677577659573e-06, "loss": 0.95592976, "num_input_tokens_seen": 160768310, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.44921875, "step": 7493, "time_per_iteration": 3.7811665534973145 }, { "auxiliary_loss_clip": 0.01027315, "auxiliary_loss_mlp": 0.0103349, "balance_loss_clip": 1.03116536, "balance_loss_mlp": 1.01757753, "epoch": 0.4505636554937622, "flos": 65060330797440.0, "grad_norm": 0.8257773623827361, "language_loss": 0.62939882, "learning_rate": 2.4142867511336e-06, "loss": 0.65000689, "num_input_tokens_seen": 160827370, "router_z_loss_clip": 0.02319336, "router_z_loss_mlp": 0.09716797, "step": 7494, "time_per_iteration": 4.562364816665649 }, { "auxiliary_loss_clip": 0.01066032, "auxiliary_loss_mlp": 0.01055043, "balance_loss_clip": 1.02701688, "balance_loss_mlp": 1.02344131, "epoch": 0.45062377874643017, "flos": 22198999259520.0, "grad_norm": 1.4160153824057204, "language_loss": 0.82280254, "learning_rate": 2.4139057287914484e-06, "loss": 0.84401333, "num_input_tokens_seen": 160849140, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.42578125, "step": 7495, "time_per_iteration": 2.431205987930298 }, { "auxiliary_loss_clip": 0.01066117, "auxiliary_loss_mlp": 0.0105786, "balance_loss_clip": 1.02739, "balance_loss_mlp": 1.02221024, "epoch": 0.45068390199909814, "flos": 37668769781760.0, "grad_norm": 1.7182156437282932, "language_loss": 0.86489022, "learning_rate": 2.41352469075395e-06, "loss": 0.88612998, "num_input_tokens_seen": 160871280, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.4375, "step": 7496, "time_per_iteration": 2.570542812347412 }, { "auxiliary_loss_clip": 0.01065489, "auxiliary_loss_mlp": 0.01052528, "balance_loss_clip": 1.02037668, "balance_loss_mlp": 1.02118182, "epoch": 0.4507440252517661, "flos": 22301609345280.0, "grad_norm": 2.0406691434005944, "language_loss": 0.76899946, "learning_rate": 2.4131436370355534e-06, "loss": 0.79017961, "num_input_tokens_seen": 160888625, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.44335938, "step": 7497, "time_per_iteration": 2.3763365745544434 }, { "auxiliary_loss_clip": 0.010652, "auxiliary_loss_mlp": 0.01053849, "balance_loss_clip": 1.02440405, "balance_loss_mlp": 1.02122402, "epoch": 0.45080414850443407, "flos": 13187532257280.0, "grad_norm": 1.9457869926186593, "language_loss": 0.75808609, "learning_rate": 2.4127625676507088e-06, "loss": 0.77927649, "num_input_tokens_seen": 160907040, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.43945312, "step": 7498, "time_per_iteration": 2.449838399887085 }, { "auxiliary_loss_clip": 0.01063421, "auxiliary_loss_mlp": 0.01050808, "balance_loss_clip": 1.01863396, "balance_loss_mlp": 1.02036822, "epoch": 0.4508642717571021, "flos": 21943853976960.0, "grad_norm": 2.542104794700752, "language_loss": 0.71806896, "learning_rate": 2.4123814826138663e-06, "loss": 0.7392112, "num_input_tokens_seen": 160927115, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.4296875, "step": 7499, "time_per_iteration": 2.402391195297241 }, { "auxiliary_loss_clip": 0.0106586, "auxiliary_loss_mlp": 0.01056884, "balance_loss_clip": 1.02654552, "balance_loss_mlp": 1.02161193, "epoch": 0.45092439500977005, "flos": 23366355212160.0, "grad_norm": 2.1771482242174267, "language_loss": 0.78594398, "learning_rate": 2.412000381939477e-06, "loss": 0.8071714, "num_input_tokens_seen": 160944405, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.44140625, "step": 7500, "time_per_iteration": 3.9439070224761963 }, { "auxiliary_loss_clip": 0.01064382, "auxiliary_loss_mlp": 0.01044894, "balance_loss_clip": 1.01341081, "balance_loss_mlp": 1.02159595, "epoch": 0.450984518262438, "flos": 20772029370240.0, "grad_norm": 1.9239433131708068, "language_loss": 0.6347084, "learning_rate": 2.411619265641992e-06, "loss": 0.65580118, "num_input_tokens_seen": 160961345, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.4296875, "step": 7501, "time_per_iteration": 2.3783490657806396 }, { "auxiliary_loss_clip": 0.01066746, "auxiliary_loss_mlp": 0.01052206, "balance_loss_clip": 1.02118731, "balance_loss_mlp": 1.02185822, "epoch": 0.451044641515106, "flos": 17706550464000.0, "grad_norm": 2.459238839527523, "language_loss": 0.8586114, "learning_rate": 2.411238133735863e-06, "loss": 0.87980092, "num_input_tokens_seen": 160977330, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.44921875, "step": 7502, "time_per_iteration": 2.4061007499694824 }, { "auxiliary_loss_clip": 0.01064241, "auxiliary_loss_mlp": 0.01042701, "balance_loss_clip": 1.01267207, "balance_loss_mlp": 1.02265012, "epoch": 0.45110476476777395, "flos": 20593657900800.0, "grad_norm": 1.3701952291222161, "language_loss": 0.80600768, "learning_rate": 2.4108569862355418e-06, "loss": 0.82707709, "num_input_tokens_seen": 160997280, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.41601562, "step": 7503, "time_per_iteration": 2.395280361175537 }, { "auxiliary_loss_clip": 0.01066021, "auxiliary_loss_mlp": 0.01043705, "balance_loss_clip": 1.01604843, "balance_loss_mlp": 1.02451015, "epoch": 0.4511648880204419, "flos": 16033128220800.0, "grad_norm": 2.3016941142101417, "language_loss": 0.81983912, "learning_rate": 2.410475823155484e-06, "loss": 0.8409363, "num_input_tokens_seen": 161014235, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.4140625, "step": 7504, "time_per_iteration": 2.426525592803955 }, { "auxiliary_loss_clip": 0.01066357, "auxiliary_loss_mlp": 0.01041463, "balance_loss_clip": 1.01317477, "balance_loss_mlp": 1.02408779, "epoch": 0.4512250112731099, "flos": 23977929231360.0, "grad_norm": 1.6788049799273335, "language_loss": 0.6470964, "learning_rate": 2.4100946445101405e-06, "loss": 0.66817462, "num_input_tokens_seen": 161032360, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.421875, "step": 7505, "time_per_iteration": 2.3967859745025635 }, { "auxiliary_loss_clip": 0.01027602, "auxiliary_loss_mlp": 0.01005706, "balance_loss_clip": 1.00332224, "balance_loss_mlp": 1.01846719, "epoch": 0.45128513452577784, "flos": 71458652868480.0, "grad_norm": 0.8353640560549044, "language_loss": 0.58959502, "learning_rate": 2.409713450313968e-06, "loss": 0.60992813, "num_input_tokens_seen": 161091360, "router_z_loss_clip": 0.02380371, "router_z_loss_mlp": 0.09130859, "step": 7506, "time_per_iteration": 3.1318719387054443 }, { "auxiliary_loss_clip": 0.01069628, "auxiliary_loss_mlp": 0.01049037, "balance_loss_clip": 1.01887751, "balance_loss_mlp": 1.02651286, "epoch": 0.4513452577784458, "flos": 22089756015360.0, "grad_norm": 1.624289331125538, "language_loss": 0.80226862, "learning_rate": 2.40933224058142e-06, "loss": 0.82345533, "num_input_tokens_seen": 161110825, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.4296875, "step": 7507, "time_per_iteration": 2.3989713191986084 }, { "auxiliary_loss_clip": 0.01070123, "auxiliary_loss_mlp": 0.01050366, "balance_loss_clip": 1.01767898, "balance_loss_mlp": 1.02617681, "epoch": 0.4514053810311138, "flos": 24275354037120.0, "grad_norm": 1.513531296535427, "language_loss": 0.75116217, "learning_rate": 2.4089510153269526e-06, "loss": 0.772367, "num_input_tokens_seen": 161130685, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.43945312, "step": 7508, "time_per_iteration": 2.4558751583099365 }, { "auxiliary_loss_clip": 0.01066236, "auxiliary_loss_mlp": 0.01045452, "balance_loss_clip": 1.01772428, "balance_loss_mlp": 1.02508795, "epoch": 0.45146550428378174, "flos": 17886039096960.0, "grad_norm": 2.3790295099153083, "language_loss": 0.80915803, "learning_rate": 2.4085697745650217e-06, "loss": 0.83027494, "num_input_tokens_seen": 161147555, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41210938, "step": 7509, "time_per_iteration": 2.3785064220428467 }, { "auxiliary_loss_clip": 0.01068017, "auxiliary_loss_mlp": 0.01048743, "balance_loss_clip": 1.01950121, "balance_loss_mlp": 1.02527213, "epoch": 0.4515256275364497, "flos": 24242291112960.0, "grad_norm": 1.8872486767345809, "language_loss": 0.75019985, "learning_rate": 2.4081885183100837e-06, "loss": 0.77136743, "num_input_tokens_seen": 161166255, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.42773438, "step": 7510, "time_per_iteration": 2.487206220626831 }, { "auxiliary_loss_clip": 0.01069483, "auxiliary_loss_mlp": 0.01045785, "balance_loss_clip": 1.01531506, "balance_loss_mlp": 1.02554691, "epoch": 0.45158575078911767, "flos": 20630002492800.0, "grad_norm": 1.7782379598352216, "language_loss": 0.78783864, "learning_rate": 2.4078072465765964e-06, "loss": 0.80899131, "num_input_tokens_seen": 161184720, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.43945312, "step": 7511, "time_per_iteration": 2.402622699737549 }, { "auxiliary_loss_clip": 0.01066936, "auxiliary_loss_mlp": 0.01041811, "balance_loss_clip": 1.01261628, "balance_loss_mlp": 1.02255726, "epoch": 0.45164587404178563, "flos": 23326728952320.0, "grad_norm": 1.6629933873265164, "language_loss": 0.80061173, "learning_rate": 2.4074259593790174e-06, "loss": 0.8216992, "num_input_tokens_seen": 161204360, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.44335938, "step": 7512, "time_per_iteration": 2.440854787826538 }, { "auxiliary_loss_clip": 0.01071546, "auxiliary_loss_mlp": 0.01053005, "balance_loss_clip": 1.01941216, "balance_loss_mlp": 1.02508426, "epoch": 0.45170599729445365, "flos": 23804829377280.0, "grad_norm": 2.0873175716152046, "language_loss": 0.88352227, "learning_rate": 2.4070446567318053e-06, "loss": 0.90476775, "num_input_tokens_seen": 161223575, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.46484375, "step": 7513, "time_per_iteration": 2.4164586067199707 }, { "auxiliary_loss_clip": 0.01061937, "auxiliary_loss_mlp": 0.01040559, "balance_loss_clip": 1.01553655, "balance_loss_mlp": 1.02298594, "epoch": 0.4517661205471216, "flos": 23511838314240.0, "grad_norm": 1.7516360184333193, "language_loss": 0.68372703, "learning_rate": 2.406663338649419e-06, "loss": 0.70475203, "num_input_tokens_seen": 161243805, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.390625, "step": 7514, "time_per_iteration": 2.448357343673706 }, { "auxiliary_loss_clip": 0.01067267, "auxiliary_loss_mlp": 0.01049516, "balance_loss_clip": 1.01671004, "balance_loss_mlp": 1.02399337, "epoch": 0.4518262437997896, "flos": 23512815832320.0, "grad_norm": 1.8136106769651723, "language_loss": 0.70960569, "learning_rate": 2.406282005146318e-06, "loss": 0.73077357, "num_input_tokens_seen": 161261450, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.43359375, "step": 7515, "time_per_iteration": 2.411487102508545 }, { "auxiliary_loss_clip": 0.01067135, "auxiliary_loss_mlp": 0.01055349, "balance_loss_clip": 1.02262652, "balance_loss_mlp": 1.02185559, "epoch": 0.45188636705245755, "flos": 14567369944320.0, "grad_norm": 2.2380827510852037, "language_loss": 0.82904404, "learning_rate": 2.405900656236963e-06, "loss": 0.8502689, "num_input_tokens_seen": 161276965, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.453125, "step": 7516, "time_per_iteration": 2.379821300506592 }, { "auxiliary_loss_clip": 0.01061956, "auxiliary_loss_mlp": 0.01051877, "balance_loss_clip": 1.02303994, "balance_loss_mlp": 1.02099013, "epoch": 0.4519464903051255, "flos": 19900527212160.0, "grad_norm": 1.776936438771301, "language_loss": 0.6733954, "learning_rate": 2.4055192919358137e-06, "loss": 0.69453371, "num_input_tokens_seen": 161295375, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41015625, "step": 7517, "time_per_iteration": 2.3759145736694336 }, { "auxiliary_loss_clip": 0.01063887, "auxiliary_loss_mlp": 0.01040174, "balance_loss_clip": 1.01359022, "balance_loss_mlp": 1.02303934, "epoch": 0.4520066135577935, "flos": 18843357110400.0, "grad_norm": 1.854733095290723, "language_loss": 0.64080125, "learning_rate": 2.405137912257333e-06, "loss": 0.66184187, "num_input_tokens_seen": 161313010, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40820312, "step": 7518, "time_per_iteration": 2.4072892665863037 }, { "auxiliary_loss_clip": 0.01063657, "auxiliary_loss_mlp": 0.01047478, "balance_loss_clip": 1.0197736, "balance_loss_mlp": 1.02139246, "epoch": 0.45206673681046144, "flos": 48212609667840.0, "grad_norm": 1.3158955196885416, "language_loss": 0.60289198, "learning_rate": 2.404756517215982e-06, "loss": 0.62400329, "num_input_tokens_seen": 161336690, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.42382812, "step": 7519, "time_per_iteration": 2.6101343631744385 }, { "auxiliary_loss_clip": 0.01065322, "auxiliary_loss_mlp": 0.01049539, "balance_loss_clip": 1.01878345, "balance_loss_mlp": 1.0223496, "epoch": 0.4521268600631294, "flos": 23841034323840.0, "grad_norm": 1.5500488275541031, "language_loss": 0.7340861, "learning_rate": 2.404375106826223e-06, "loss": 0.75523472, "num_input_tokens_seen": 161357845, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4296875, "step": 7520, "time_per_iteration": 2.4613234996795654 }, { "auxiliary_loss_clip": 0.01063357, "auxiliary_loss_mlp": 0.01050845, "balance_loss_clip": 1.0214839, "balance_loss_mlp": 1.02024734, "epoch": 0.4521869833157974, "flos": 18842623971840.0, "grad_norm": 2.3132486496994433, "language_loss": 0.76998031, "learning_rate": 2.4039936811025194e-06, "loss": 0.79112238, "num_input_tokens_seen": 161375160, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.43164062, "step": 7521, "time_per_iteration": 2.357624053955078 }, { "auxiliary_loss_clip": 0.01065544, "auxiliary_loss_mlp": 0.01051777, "balance_loss_clip": 1.02214146, "balance_loss_mlp": 1.02081573, "epoch": 0.45224710656846534, "flos": 19787164427520.0, "grad_norm": 1.7082416112724692, "language_loss": 0.68585873, "learning_rate": 2.4036122400593343e-06, "loss": 0.70703197, "num_input_tokens_seen": 161393690, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.44726562, "step": 7522, "time_per_iteration": 2.415496587753296 }, { "auxiliary_loss_clip": 0.0106102, "auxiliary_loss_mlp": 0.01051075, "balance_loss_clip": 1.02298903, "balance_loss_mlp": 1.01891303, "epoch": 0.4523072298211333, "flos": 28254893915520.0, "grad_norm": 1.5460426862177024, "language_loss": 0.62509149, "learning_rate": 2.403230783711134e-06, "loss": 0.64621246, "num_input_tokens_seen": 161415015, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.421875, "step": 7523, "time_per_iteration": 2.43674898147583 }, { "auxiliary_loss_clip": 0.01065695, "auxiliary_loss_mlp": 0.01053911, "balance_loss_clip": 1.02308369, "balance_loss_mlp": 1.02156031, "epoch": 0.45236735307380127, "flos": 11180270793600.0, "grad_norm": 2.2580041357568055, "language_loss": 0.8005116, "learning_rate": 2.4028493120723813e-06, "loss": 0.82170773, "num_input_tokens_seen": 161432940, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.44140625, "step": 7524, "time_per_iteration": 2.388261556625366 }, { "auxiliary_loss_clip": 0.01063154, "auxiliary_loss_mlp": 0.01051813, "balance_loss_clip": 1.02388227, "balance_loss_mlp": 1.02094293, "epoch": 0.45242747632646924, "flos": 22600290960000.0, "grad_norm": 1.9958813143792098, "language_loss": 0.64987016, "learning_rate": 2.4024678251575417e-06, "loss": 0.67101979, "num_input_tokens_seen": 161452215, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.421875, "step": 7525, "time_per_iteration": 2.40183424949646 }, { "auxiliary_loss_clip": 0.01061798, "auxiliary_loss_mlp": 0.01049067, "balance_loss_clip": 1.02179217, "balance_loss_mlp": 1.02022898, "epoch": 0.45248759957913726, "flos": 18255385036800.0, "grad_norm": 1.5839960717519521, "language_loss": 0.80485368, "learning_rate": 2.402086322981083e-06, "loss": 0.82596231, "num_input_tokens_seen": 161469520, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.4140625, "step": 7526, "time_per_iteration": 2.384517192840576 }, { "auxiliary_loss_clip": 0.01060406, "auxiliary_loss_mlp": 0.0104888, "balance_loss_clip": 1.02108049, "balance_loss_mlp": 1.01951313, "epoch": 0.4525477228318052, "flos": 22449152217600.0, "grad_norm": 1.62904524332469, "language_loss": 0.81982303, "learning_rate": 2.40170480555747e-06, "loss": 0.84091592, "num_input_tokens_seen": 161487335, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40820312, "step": 7527, "time_per_iteration": 2.3906197547912598 }, { "auxiliary_loss_clip": 0.01063523, "auxiliary_loss_mlp": 0.01043764, "balance_loss_clip": 1.01589298, "balance_loss_mlp": 1.02164245, "epoch": 0.4526078460844732, "flos": 29643529265280.0, "grad_norm": 1.9612117783958356, "language_loss": 0.66406012, "learning_rate": 2.4013232729011706e-06, "loss": 0.68513298, "num_input_tokens_seen": 161510095, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41796875, "step": 7528, "time_per_iteration": 2.5078327655792236 }, { "auxiliary_loss_clip": 0.01060165, "auxiliary_loss_mlp": 0.01045031, "balance_loss_clip": 1.01782775, "balance_loss_mlp": 1.01976204, "epoch": 0.45266796933714115, "flos": 23038625479680.0, "grad_norm": 1.5443909066694728, "language_loss": 0.76207799, "learning_rate": 2.4009417250266525e-06, "loss": 0.78312999, "num_input_tokens_seen": 161528725, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40429688, "step": 7529, "time_per_iteration": 2.4032959938049316 }, { "auxiliary_loss_clip": 0.01062269, "auxiliary_loss_mlp": 0.01045888, "balance_loss_clip": 1.0177784, "balance_loss_mlp": 1.02080059, "epoch": 0.4527280925898091, "flos": 14427542482560.0, "grad_norm": 2.429139085087504, "language_loss": 0.74597615, "learning_rate": 2.400560161948384e-06, "loss": 0.76705778, "num_input_tokens_seen": 161547195, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.4140625, "step": 7530, "time_per_iteration": 2.366192102432251 }, { "auxiliary_loss_clip": 0.01063731, "auxiliary_loss_mlp": 0.01043386, "balance_loss_clip": 1.01528788, "balance_loss_mlp": 1.02165318, "epoch": 0.4527882158424771, "flos": 22924529556480.0, "grad_norm": 1.8486309003832744, "language_loss": 0.77156699, "learning_rate": 2.400178583680834e-06, "loss": 0.79263824, "num_input_tokens_seen": 161565565, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41992188, "step": 7531, "time_per_iteration": 3.754228115081787 }, { "auxiliary_loss_clip": 0.01060354, "auxiliary_loss_mlp": 0.01041549, "balance_loss_clip": 1.01371408, "balance_loss_mlp": 1.02066374, "epoch": 0.45284833909514505, "flos": 25554187560960.0, "grad_norm": 2.630467720386154, "language_loss": 0.6859082, "learning_rate": 2.3997969902384717e-06, "loss": 0.70692718, "num_input_tokens_seen": 161586630, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.39648438, "step": 7532, "time_per_iteration": 2.449387788772583 }, { "auxiliary_loss_clip": 0.01062368, "auxiliary_loss_mlp": 0.01037863, "balance_loss_clip": 1.01062393, "balance_loss_mlp": 1.02224588, "epoch": 0.452908462347813, "flos": 18149039435520.0, "grad_norm": 5.213538689237841, "language_loss": 0.78910911, "learning_rate": 2.399415381635768e-06, "loss": 0.8101114, "num_input_tokens_seen": 161603815, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40039062, "step": 7533, "time_per_iteration": 3.7315876483917236 }, { "auxiliary_loss_clip": 0.01068391, "auxiliary_loss_mlp": 0.01048682, "balance_loss_clip": 1.0167582, "balance_loss_mlp": 1.0218339, "epoch": 0.452968585600481, "flos": 19061738864640.0, "grad_norm": 1.802402658000889, "language_loss": 0.85077262, "learning_rate": 2.3990337578871927e-06, "loss": 0.87194335, "num_input_tokens_seen": 161622900, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.46679688, "step": 7534, "time_per_iteration": 2.4087021350860596 }, { "auxiliary_loss_clip": 0.01065911, "auxiliary_loss_mlp": 0.01041273, "balance_loss_clip": 1.01290154, "balance_loss_mlp": 1.0240314, "epoch": 0.45302870885314894, "flos": 22050723248640.0, "grad_norm": 1.7093093689011205, "language_loss": 0.77556741, "learning_rate": 2.3986521190072176e-06, "loss": 0.7966392, "num_input_tokens_seen": 161641700, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41796875, "step": 7535, "time_per_iteration": 3.817349672317505 }, { "auxiliary_loss_clip": 0.01062766, "auxiliary_loss_mlp": 0.01040412, "balance_loss_clip": 1.01454377, "balance_loss_mlp": 1.02259207, "epoch": 0.4530888321058169, "flos": 20375171412480.0, "grad_norm": 1.5338947837655712, "language_loss": 0.82336545, "learning_rate": 2.3982704650103138e-06, "loss": 0.84439719, "num_input_tokens_seen": 161661955, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40039062, "step": 7536, "time_per_iteration": 2.3797943592071533 }, { "auxiliary_loss_clip": 0.01063912, "auxiliary_loss_mlp": 0.01040934, "balance_loss_clip": 1.01387334, "balance_loss_mlp": 1.02163112, "epoch": 0.4531489553584849, "flos": 14829532410240.0, "grad_norm": 1.6289766735434226, "language_loss": 0.77005279, "learning_rate": 2.3978887959109544e-06, "loss": 0.79110122, "num_input_tokens_seen": 161679245, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.421875, "step": 7537, "time_per_iteration": 2.408872127532959 }, { "auxiliary_loss_clip": 0.01064275, "auxiliary_loss_mlp": 0.01040608, "balance_loss_clip": 1.01450133, "balance_loss_mlp": 1.02171504, "epoch": 0.45320907861115284, "flos": 21943888888320.0, "grad_norm": 2.0384824522201086, "language_loss": 0.76922905, "learning_rate": 2.3975071117236118e-06, "loss": 0.79027784, "num_input_tokens_seen": 161698795, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.42578125, "step": 7538, "time_per_iteration": 2.40981125831604 }, { "auxiliary_loss_clip": 0.01017276, "auxiliary_loss_mlp": 0.01023723, "balance_loss_clip": 1.02080202, "balance_loss_mlp": 1.00921988, "epoch": 0.45326920186382086, "flos": 66247760782080.0, "grad_norm": 0.8062914014356619, "language_loss": 0.62353498, "learning_rate": 2.3971254124627593e-06, "loss": 0.64394498, "num_input_tokens_seen": 161761980, "router_z_loss_clip": 0.0291748, "router_z_loss_mlp": 0.08056641, "step": 7539, "time_per_iteration": 4.464643478393555 }, { "auxiliary_loss_clip": 0.01062909, "auxiliary_loss_mlp": 0.01041335, "balance_loss_clip": 1.01495385, "balance_loss_mlp": 1.02255869, "epoch": 0.4533293251164888, "flos": 14683351080960.0, "grad_norm": 1.7206702170265986, "language_loss": 0.67246509, "learning_rate": 2.396743698142872e-06, "loss": 0.69350743, "num_input_tokens_seen": 161779455, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40234375, "step": 7540, "time_per_iteration": 2.411545753479004 }, { "auxiliary_loss_clip": 0.01066391, "auxiliary_loss_mlp": 0.01058013, "balance_loss_clip": 1.02649426, "balance_loss_mlp": 1.02211607, "epoch": 0.4533894483691568, "flos": 22600116403200.0, "grad_norm": 1.9064700828604393, "language_loss": 0.86361408, "learning_rate": 2.396361968778424e-06, "loss": 0.88485807, "num_input_tokens_seen": 161798980, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.44140625, "step": 7541, "time_per_iteration": 2.4239509105682373 }, { "auxiliary_loss_clip": 0.01063443, "auxiliary_loss_mlp": 0.01052241, "balance_loss_clip": 1.02587235, "balance_loss_mlp": 1.02195084, "epoch": 0.45344957162182475, "flos": 34750170432000.0, "grad_norm": 1.695000076332881, "language_loss": 0.7811783, "learning_rate": 2.395980224383889e-06, "loss": 0.8023352, "num_input_tokens_seen": 161819745, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.41601562, "step": 7542, "time_per_iteration": 2.534183979034424 }, { "auxiliary_loss_clip": 0.01063376, "auxiliary_loss_mlp": 0.01049391, "balance_loss_clip": 1.02222371, "balance_loss_mlp": 1.02179372, "epoch": 0.4535096948744927, "flos": 23549090601600.0, "grad_norm": 2.028592681332716, "language_loss": 0.81631964, "learning_rate": 2.395598464973746e-06, "loss": 0.83744735, "num_input_tokens_seen": 161838575, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41601562, "step": 7543, "time_per_iteration": 2.431640863418579 }, { "auxiliary_loss_clip": 0.01064203, "auxiliary_loss_mlp": 0.01060723, "balance_loss_clip": 1.03461576, "balance_loss_mlp": 1.02202845, "epoch": 0.4535698181271607, "flos": 25556352065280.0, "grad_norm": 1.7051442945146436, "language_loss": 0.77486491, "learning_rate": 2.395216690562469e-06, "loss": 0.79611409, "num_input_tokens_seen": 161858590, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.421875, "step": 7544, "time_per_iteration": 2.4908459186553955 }, { "auxiliary_loss_clip": 0.010633, "auxiliary_loss_mlp": 0.01063235, "balance_loss_clip": 1.03616285, "balance_loss_mlp": 1.02143908, "epoch": 0.45362994137982865, "flos": 24862942085760.0, "grad_norm": 1.6798603648020651, "language_loss": 0.76961583, "learning_rate": 2.3948349011645355e-06, "loss": 0.79088116, "num_input_tokens_seen": 161878390, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.41796875, "step": 7545, "time_per_iteration": 2.421607255935669 }, { "auxiliary_loss_clip": 0.01062926, "auxiliary_loss_mlp": 0.01051353, "balance_loss_clip": 1.02634287, "balance_loss_mlp": 1.02091742, "epoch": 0.4536900646324966, "flos": 30805578691200.0, "grad_norm": 1.6139418750589096, "language_loss": 0.72845864, "learning_rate": 2.394453096794423e-06, "loss": 0.74960148, "num_input_tokens_seen": 161898610, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.41992188, "step": 7546, "time_per_iteration": 2.503974437713623 }, { "auxiliary_loss_clip": 0.01062174, "auxiliary_loss_mlp": 0.0106323, "balance_loss_clip": 1.0345248, "balance_loss_mlp": 1.01973009, "epoch": 0.4537501878851646, "flos": 23403188563200.0, "grad_norm": 1.5735791045972793, "language_loss": 0.77323651, "learning_rate": 2.394071277466609e-06, "loss": 0.79449058, "num_input_tokens_seen": 161918210, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.42578125, "step": 7547, "time_per_iteration": 2.4390709400177 }, { "auxiliary_loss_clip": 0.01061625, "auxiliary_loss_mlp": 0.01061811, "balance_loss_clip": 1.03235483, "balance_loss_mlp": 1.0198431, "epoch": 0.45381031113783254, "flos": 18148341208320.0, "grad_norm": 2.3845396588392993, "language_loss": 0.70914245, "learning_rate": 2.393689443195573e-06, "loss": 0.73037684, "num_input_tokens_seen": 161936950, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.41796875, "step": 7548, "time_per_iteration": 2.3879354000091553 }, { "auxiliary_loss_clip": 0.01059379, "auxiliary_loss_mlp": 0.01064733, "balance_loss_clip": 1.03879273, "balance_loss_mlp": 1.01851368, "epoch": 0.4538704343905005, "flos": 25335526515840.0, "grad_norm": 2.1761699459910853, "language_loss": 0.73978591, "learning_rate": 2.393307593995794e-06, "loss": 0.7610271, "num_input_tokens_seen": 161955550, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.40820312, "step": 7549, "time_per_iteration": 2.434638261795044 }, { "auxiliary_loss_clip": 0.01058528, "auxiliary_loss_mlp": 0.01054712, "balance_loss_clip": 1.0286293, "balance_loss_mlp": 1.01839447, "epoch": 0.4539305576431685, "flos": 28730166520320.0, "grad_norm": 1.4053676913984436, "language_loss": 0.66369891, "learning_rate": 2.392925729881751e-06, "loss": 0.68483126, "num_input_tokens_seen": 161976760, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40039062, "step": 7550, "time_per_iteration": 2.4462249279022217 }, { "auxiliary_loss_clip": 0.01058783, "auxiliary_loss_mlp": 0.01052471, "balance_loss_clip": 1.02585125, "balance_loss_mlp": 1.01892221, "epoch": 0.45399068089583644, "flos": 22491292095360.0, "grad_norm": 1.607829597010775, "language_loss": 0.69687402, "learning_rate": 2.3925438508679263e-06, "loss": 0.71798658, "num_input_tokens_seen": 161996120, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.3984375, "step": 7551, "time_per_iteration": 2.4201266765594482 }, { "auxiliary_loss_clip": 0.01060958, "auxiliary_loss_mlp": 0.01050634, "balance_loss_clip": 1.02368093, "balance_loss_mlp": 1.01808548, "epoch": 0.45405080414850446, "flos": 12892655980800.0, "grad_norm": 1.767975966681464, "language_loss": 0.80474919, "learning_rate": 2.392161956968798e-06, "loss": 0.82586509, "num_input_tokens_seen": 162011125, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.4296875, "step": 7552, "time_per_iteration": 2.3344578742980957 }, { "auxiliary_loss_clip": 0.01011773, "auxiliary_loss_mlp": 0.01005534, "balance_loss_clip": 1.00312567, "balance_loss_mlp": 1.00369716, "epoch": 0.4541109274011724, "flos": 59764146526080.0, "grad_norm": 0.8189387899927604, "language_loss": 0.57869726, "learning_rate": 2.39178004819885e-06, "loss": 0.59887034, "num_input_tokens_seen": 162068705, "router_z_loss_clip": 0.02404785, "router_z_loss_mlp": 0.08105469, "step": 7553, "time_per_iteration": 2.9818027019500732 }, { "auxiliary_loss_clip": 0.01058432, "auxiliary_loss_mlp": 0.0104301, "balance_loss_clip": 1.01789212, "balance_loss_mlp": 1.01884305, "epoch": 0.4541710506538404, "flos": 28510493045760.0, "grad_norm": 1.374811528453148, "language_loss": 0.77982134, "learning_rate": 2.3913981245725626e-06, "loss": 0.80083573, "num_input_tokens_seen": 162089655, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.39648438, "step": 7554, "time_per_iteration": 2.4302072525024414 }, { "auxiliary_loss_clip": 0.01065392, "auxiliary_loss_mlp": 0.01044467, "balance_loss_clip": 1.01460481, "balance_loss_mlp": 1.02152681, "epoch": 0.45423117390650836, "flos": 17674639614720.0, "grad_norm": 2.5767865161365298, "language_loss": 0.78525281, "learning_rate": 2.3910161861044194e-06, "loss": 0.80635142, "num_input_tokens_seen": 162108465, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.43945312, "step": 7555, "time_per_iteration": 2.3951215744018555 }, { "auxiliary_loss_clip": 0.01061327, "auxiliary_loss_mlp": 0.01037632, "balance_loss_clip": 1.01185906, "balance_loss_mlp": 1.02034211, "epoch": 0.4542912971591763, "flos": 28071355564800.0, "grad_norm": 1.4496442320092022, "language_loss": 0.73375082, "learning_rate": 2.390634232808903e-06, "loss": 0.75474042, "num_input_tokens_seen": 162129910, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.41015625, "step": 7556, "time_per_iteration": 2.4454572200775146 }, { "auxiliary_loss_clip": 0.01065257, "auxiliary_loss_mlp": 0.01042165, "balance_loss_clip": 1.01076519, "balance_loss_mlp": 1.02113295, "epoch": 0.4543514204118443, "flos": 22670745816960.0, "grad_norm": 1.8180800008287303, "language_loss": 0.64687508, "learning_rate": 2.3902522647004982e-06, "loss": 0.66794926, "num_input_tokens_seen": 162148840, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.44140625, "step": 7557, "time_per_iteration": 2.412522792816162 }, { "auxiliary_loss_clip": 0.01018487, "auxiliary_loss_mlp": 0.01004049, "balance_loss_clip": 1.00129485, "balance_loss_mlp": 1.00995886, "epoch": 0.45441154366451225, "flos": 58213303666560.0, "grad_norm": 0.6929412655031698, "language_loss": 0.57646191, "learning_rate": 2.3898702817936875e-06, "loss": 0.5966872, "num_input_tokens_seen": 162208500, "router_z_loss_clip": 0.02758789, "router_z_loss_mlp": 0.08496094, "step": 7558, "time_per_iteration": 2.9754416942596436 }, { "auxiliary_loss_clip": 0.01065701, "auxiliary_loss_mlp": 0.0105064, "balance_loss_clip": 1.01774967, "balance_loss_mlp": 1.02225137, "epoch": 0.4544716669171802, "flos": 16763336640000.0, "grad_norm": 3.085064424633043, "language_loss": 0.5941056, "learning_rate": 2.3894882841029573e-06, "loss": 0.61526906, "num_input_tokens_seen": 162224650, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.43554688, "step": 7559, "time_per_iteration": 2.36574125289917 }, { "auxiliary_loss_clip": 0.01061778, "auxiliary_loss_mlp": 0.01047297, "balance_loss_clip": 1.02003407, "balance_loss_mlp": 1.02090061, "epoch": 0.4545317901698482, "flos": 15924303912960.0, "grad_norm": 1.8276368582948233, "language_loss": 0.73037326, "learning_rate": 2.389106271642792e-06, "loss": 0.75146401, "num_input_tokens_seen": 162242930, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40820312, "step": 7560, "time_per_iteration": 2.4502646923065186 }, { "auxiliary_loss_clip": 0.010648, "auxiliary_loss_mlp": 0.01047639, "balance_loss_clip": 1.01622701, "balance_loss_mlp": 1.02125943, "epoch": 0.45459191342251615, "flos": 17638783781760.0, "grad_norm": 2.8251203747107474, "language_loss": 0.7071203, "learning_rate": 2.3887242444276775e-06, "loss": 0.72824472, "num_input_tokens_seen": 162261455, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.43554688, "step": 7561, "time_per_iteration": 2.3899810314178467 }, { "auxiliary_loss_clip": 0.01057716, "auxiliary_loss_mlp": 0.01037013, "balance_loss_clip": 1.0113945, "balance_loss_mlp": 1.01873469, "epoch": 0.4546520366751841, "flos": 16175783502720.0, "grad_norm": 1.990346769810943, "language_loss": 0.86210561, "learning_rate": 2.3883422024721015e-06, "loss": 0.88305283, "num_input_tokens_seen": 162279725, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38867188, "step": 7562, "time_per_iteration": 2.372159004211426 }, { "auxiliary_loss_clip": 0.01059113, "auxiliary_loss_mlp": 0.01049142, "balance_loss_clip": 1.02127051, "balance_loss_mlp": 1.01989353, "epoch": 0.4547121599278521, "flos": 19750540544640.0, "grad_norm": 1.816953152115645, "language_loss": 0.90390152, "learning_rate": 2.38796014579055e-06, "loss": 0.9249841, "num_input_tokens_seen": 162297865, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.390625, "step": 7563, "time_per_iteration": 2.3999404907226562 }, { "auxiliary_loss_clip": 0.0106171, "auxiliary_loss_mlp": 0.01045492, "balance_loss_clip": 1.01653564, "balance_loss_mlp": 1.01944625, "epoch": 0.45477228318052004, "flos": 19936452867840.0, "grad_norm": 1.9939108152979832, "language_loss": 0.73590481, "learning_rate": 2.3875780743975097e-06, "loss": 0.75697684, "num_input_tokens_seen": 162316010, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.421875, "step": 7564, "time_per_iteration": 2.3637027740478516 }, { "auxiliary_loss_clip": 0.01061394, "auxiliary_loss_mlp": 0.01044982, "balance_loss_clip": 1.01489329, "balance_loss_mlp": 1.01829994, "epoch": 0.454832406433188, "flos": 21287242437120.0, "grad_norm": 1.9964359278868038, "language_loss": 0.69341028, "learning_rate": 2.3871959883074713e-06, "loss": 0.71447408, "num_input_tokens_seen": 162336115, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.43164062, "step": 7565, "time_per_iteration": 2.4022116661071777 }, { "auxiliary_loss_clip": 0.01060489, "auxiliary_loss_mlp": 0.01041244, "balance_loss_clip": 1.01299179, "balance_loss_mlp": 1.01997042, "epoch": 0.45489252968585603, "flos": 24497576040960.0, "grad_norm": 2.187256620095692, "language_loss": 0.81252712, "learning_rate": 2.386813887534922e-06, "loss": 0.83354449, "num_input_tokens_seen": 162355705, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.40625, "step": 7566, "time_per_iteration": 2.401824474334717 }, { "auxiliary_loss_clip": 0.01061344, "auxiliary_loss_mlp": 0.01042364, "balance_loss_clip": 1.01241922, "balance_loss_mlp": 1.01971722, "epoch": 0.454952652938524, "flos": 17091520220160.0, "grad_norm": 1.6854513653180536, "language_loss": 0.75133491, "learning_rate": 2.3864317720943508e-06, "loss": 0.77237195, "num_input_tokens_seen": 162374055, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.41601562, "step": 7567, "time_per_iteration": 2.3912529945373535 }, { "auxiliary_loss_clip": 0.01063197, "auxiliary_loss_mlp": 0.0104734, "balance_loss_clip": 1.01832485, "balance_loss_mlp": 1.01989758, "epoch": 0.45501277619119196, "flos": 27629320440960.0, "grad_norm": 1.4186883110526398, "language_loss": 0.81505972, "learning_rate": 2.386049642000249e-06, "loss": 0.83616507, "num_input_tokens_seen": 162393560, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.43359375, "step": 7568, "time_per_iteration": 2.4388492107391357 }, { "auxiliary_loss_clip": 0.01063974, "auxiliary_loss_mlp": 0.01053732, "balance_loss_clip": 1.02175987, "balance_loss_mlp": 1.01916337, "epoch": 0.4550728994438599, "flos": 19973635332480.0, "grad_norm": 1.7751440893209678, "language_loss": 0.81376028, "learning_rate": 2.3856674972671055e-06, "loss": 0.83493733, "num_input_tokens_seen": 162413170, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.44921875, "step": 7569, "time_per_iteration": 2.385939359664917 }, { "auxiliary_loss_clip": 0.010636, "auxiliary_loss_mlp": 0.01052147, "balance_loss_clip": 1.02242851, "balance_loss_mlp": 1.02007115, "epoch": 0.4551330226965279, "flos": 26065700023680.0, "grad_norm": 1.3588551377855331, "language_loss": 0.76063496, "learning_rate": 2.385285337909412e-06, "loss": 0.7817924, "num_input_tokens_seen": 162434080, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.43359375, "step": 7570, "time_per_iteration": 2.429088592529297 }, { "auxiliary_loss_clip": 0.0105966, "auxiliary_loss_mlp": 0.01046912, "balance_loss_clip": 1.01831388, "balance_loss_mlp": 1.0194478, "epoch": 0.45519314594919585, "flos": 32779707408000.0, "grad_norm": 1.6901056792684868, "language_loss": 0.75211442, "learning_rate": 2.3849031639416596e-06, "loss": 0.77318013, "num_input_tokens_seen": 162455445, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40234375, "step": 7571, "time_per_iteration": 3.8648102283477783 }, { "auxiliary_loss_clip": 0.01057397, "auxiliary_loss_mlp": 0.01039125, "balance_loss_clip": 1.01273251, "balance_loss_mlp": 1.01805949, "epoch": 0.4552532692018638, "flos": 19171645424640.0, "grad_norm": 1.482503826940187, "language_loss": 0.82003731, "learning_rate": 2.3845209753783414e-06, "loss": 0.84100252, "num_input_tokens_seen": 162474940, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.39453125, "step": 7572, "time_per_iteration": 3.858579635620117 }, { "auxiliary_loss_clip": 0.01062283, "auxiliary_loss_mlp": 0.01055682, "balance_loss_clip": 1.02332854, "balance_loss_mlp": 1.01813948, "epoch": 0.4553133924545318, "flos": 26026073763840.0, "grad_norm": 1.879501608298425, "language_loss": 0.74073493, "learning_rate": 2.3841387722339486e-06, "loss": 0.76191455, "num_input_tokens_seen": 162493340, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.44140625, "step": 7573, "time_per_iteration": 2.3973395824432373 }, { "auxiliary_loss_clip": 0.0106437, "auxiliary_loss_mlp": 0.01049186, "balance_loss_clip": 1.01497269, "balance_loss_mlp": 1.02009666, "epoch": 0.45537351570719975, "flos": 30660305057280.0, "grad_norm": 2.063658661293185, "language_loss": 0.75367129, "learning_rate": 2.3837565545229748e-06, "loss": 0.7748068, "num_input_tokens_seen": 162514360, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 0.44335938, "step": 7574, "time_per_iteration": 3.8338615894317627 }, { "auxiliary_loss_clip": 0.0106248, "auxiliary_loss_mlp": 0.01047302, "balance_loss_clip": 1.01832235, "balance_loss_mlp": 1.01946664, "epoch": 0.4554336389598677, "flos": 24352232584320.0, "grad_norm": 1.7330775576866297, "language_loss": 0.72254074, "learning_rate": 2.383374322259915e-06, "loss": 0.74363852, "num_input_tokens_seen": 162535240, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.4296875, "step": 7575, "time_per_iteration": 2.449084997177124 }, { "auxiliary_loss_clip": 0.01060649, "auxiliary_loss_mlp": 0.01045362, "balance_loss_clip": 1.018695, "balance_loss_mlp": 1.01874948, "epoch": 0.4554937622125357, "flos": 20556894372480.0, "grad_norm": 1.835831977234608, "language_loss": 0.74933523, "learning_rate": 2.3829920754592617e-06, "loss": 0.77039534, "num_input_tokens_seen": 162553880, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.41796875, "step": 7576, "time_per_iteration": 2.358691930770874 }, { "auxiliary_loss_clip": 0.01058994, "auxiliary_loss_mlp": 0.01048946, "balance_loss_clip": 1.02087271, "balance_loss_mlp": 1.01832414, "epoch": 0.45555388546520365, "flos": 22819650232320.0, "grad_norm": 2.134846264786994, "language_loss": 0.68237299, "learning_rate": 2.382609814135511e-06, "loss": 0.70345241, "num_input_tokens_seen": 162574485, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40625, "step": 7577, "time_per_iteration": 2.3962113857269287 }, { "auxiliary_loss_clip": 0.01061387, "auxiliary_loss_mlp": 0.01054252, "balance_loss_clip": 1.02183938, "balance_loss_mlp": 1.01928806, "epoch": 0.4556140087178716, "flos": 21724913640960.0, "grad_norm": 10.014538859408013, "language_loss": 0.76007283, "learning_rate": 2.382227538303157e-06, "loss": 0.78122926, "num_input_tokens_seen": 162595130, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.421875, "step": 7578, "time_per_iteration": 2.381537914276123 }, { "auxiliary_loss_clip": 0.01059374, "auxiliary_loss_mlp": 0.0104129, "balance_loss_clip": 1.01344252, "balance_loss_mlp": 1.01935494, "epoch": 0.45567413197053963, "flos": 25993325041920.0, "grad_norm": 2.011837873609319, "language_loss": 0.72267008, "learning_rate": 2.381845247976697e-06, "loss": 0.74367666, "num_input_tokens_seen": 162615720, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40039062, "step": 7579, "time_per_iteration": 3.8259294033050537 }, { "auxiliary_loss_clip": 0.01058967, "auxiliary_loss_mlp": 0.01041817, "balance_loss_clip": 1.01534104, "balance_loss_mlp": 1.01890194, "epoch": 0.4557342552232076, "flos": 21536697168000.0, "grad_norm": 1.9618503421512454, "language_loss": 0.79557502, "learning_rate": 2.381462943170627e-06, "loss": 0.81658286, "num_input_tokens_seen": 162635825, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40234375, "step": 7580, "time_per_iteration": 2.3910229206085205 }, { "auxiliary_loss_clip": 0.01060695, "auxiliary_loss_mlp": 0.01043396, "balance_loss_clip": 1.01554894, "balance_loss_mlp": 1.01964641, "epoch": 0.45579437847587556, "flos": 40000479310080.0, "grad_norm": 1.5560266412279595, "language_loss": 0.69535816, "learning_rate": 2.381080623899444e-06, "loss": 0.71639907, "num_input_tokens_seen": 162659130, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41015625, "step": 7581, "time_per_iteration": 2.590573787689209 }, { "auxiliary_loss_clip": 0.01059472, "auxiliary_loss_mlp": 0.01044283, "balance_loss_clip": 1.01703203, "balance_loss_mlp": 1.01878095, "epoch": 0.4558545017285435, "flos": 31137183584640.0, "grad_norm": 2.217065787033831, "language_loss": 0.73471355, "learning_rate": 2.3806982901776455e-06, "loss": 0.75575113, "num_input_tokens_seen": 162681665, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40625, "step": 7582, "time_per_iteration": 2.4482460021972656 }, { "auxiliary_loss_clip": 0.01063592, "auxiliary_loss_mlp": 0.01049974, "balance_loss_clip": 1.01750159, "balance_loss_mlp": 1.02049029, "epoch": 0.4559146249812115, "flos": 21724704172800.0, "grad_norm": 1.7539031666427538, "language_loss": 0.73730284, "learning_rate": 2.380315942019729e-06, "loss": 0.75843853, "num_input_tokens_seen": 162702040, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.4296875, "step": 7583, "time_per_iteration": 2.4571034908294678 }, { "auxiliary_loss_clip": 0.01063981, "auxiliary_loss_mlp": 0.0105267, "balance_loss_clip": 1.02167606, "balance_loss_mlp": 1.01955557, "epoch": 0.45597474823387946, "flos": 23804829377280.0, "grad_norm": 3.235966591266685, "language_loss": 0.73899925, "learning_rate": 2.379933579440195e-06, "loss": 0.76016575, "num_input_tokens_seen": 162722375, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.4453125, "step": 7584, "time_per_iteration": 2.4091238975524902 }, { "auxiliary_loss_clip": 0.01062235, "auxiliary_loss_mlp": 0.01041801, "balance_loss_clip": 1.01447773, "balance_loss_mlp": 1.02083194, "epoch": 0.4560348714865474, "flos": 31904295177600.0, "grad_norm": 1.5277985422426574, "language_loss": 0.68786895, "learning_rate": 2.379551202453541e-06, "loss": 0.70890927, "num_input_tokens_seen": 162746095, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.4140625, "step": 7585, "time_per_iteration": 2.4985082149505615 }, { "auxiliary_loss_clip": 0.01060292, "auxiliary_loss_mlp": 0.01042792, "balance_loss_clip": 1.01414585, "balance_loss_mlp": 1.01996493, "epoch": 0.4560949947392154, "flos": 22047895428480.0, "grad_norm": 1.7361174709269103, "language_loss": 0.77246082, "learning_rate": 2.379168811074267e-06, "loss": 0.7934916, "num_input_tokens_seen": 162766330, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.40234375, "step": 7586, "time_per_iteration": 2.397373914718628 }, { "auxiliary_loss_clip": 0.01058128, "auxiliary_loss_mlp": 0.01038131, "balance_loss_clip": 1.01408625, "balance_loss_mlp": 1.01885104, "epoch": 0.45615511799188335, "flos": 24570649249920.0, "grad_norm": 1.6794587565867642, "language_loss": 0.79138744, "learning_rate": 2.3787864053168747e-06, "loss": 0.81235003, "num_input_tokens_seen": 162784755, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.39257812, "step": 7587, "time_per_iteration": 2.4596943855285645 }, { "auxiliary_loss_clip": 0.01063069, "auxiliary_loss_mlp": 0.01049842, "balance_loss_clip": 1.02131486, "balance_loss_mlp": 1.01965117, "epoch": 0.4562152412445513, "flos": 18329784877440.0, "grad_norm": 1.8487835072057834, "language_loss": 0.71118236, "learning_rate": 2.378403985195863e-06, "loss": 0.73231143, "num_input_tokens_seen": 162803850, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.43359375, "step": 7588, "time_per_iteration": 2.3714163303375244 }, { "auxiliary_loss_clip": 0.01059172, "auxiliary_loss_mlp": 0.01037444, "balance_loss_clip": 1.01163566, "balance_loss_mlp": 1.01910365, "epoch": 0.4562753644972193, "flos": 13515680926080.0, "grad_norm": 1.7631616088138158, "language_loss": 0.79918408, "learning_rate": 2.378021550725735e-06, "loss": 0.82015026, "num_input_tokens_seen": 162820775, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40039062, "step": 7589, "time_per_iteration": 2.4145472049713135 }, { "auxiliary_loss_clip": 0.010613, "auxiliary_loss_mlp": 0.01044454, "balance_loss_clip": 1.01785874, "balance_loss_mlp": 1.01991022, "epoch": 0.45633548774988725, "flos": 29638502029440.0, "grad_norm": 2.4656556069908757, "language_loss": 0.63906503, "learning_rate": 2.377639101920992e-06, "loss": 0.66012257, "num_input_tokens_seen": 162839695, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.4140625, "step": 7590, "time_per_iteration": 2.444105625152588 }, { "auxiliary_loss_clip": 0.01060569, "auxiliary_loss_mlp": 0.01045514, "balance_loss_clip": 1.01853657, "balance_loss_mlp": 1.01978672, "epoch": 0.4563956110025552, "flos": 22232411297280.0, "grad_norm": 1.7216062935071048, "language_loss": 0.73730582, "learning_rate": 2.377256638796135e-06, "loss": 0.7583667, "num_input_tokens_seen": 162856095, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40820312, "step": 7591, "time_per_iteration": 2.4021365642547607 }, { "auxiliary_loss_clip": 0.01063799, "auxiliary_loss_mlp": 0.01044425, "balance_loss_clip": 1.01724482, "balance_loss_mlp": 1.02140081, "epoch": 0.45645573425522323, "flos": 17091101283840.0, "grad_norm": 2.3331162102960885, "language_loss": 0.7872467, "learning_rate": 2.3768741613656695e-06, "loss": 0.80832887, "num_input_tokens_seen": 162874070, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.42382812, "step": 7592, "time_per_iteration": 2.342581033706665 }, { "auxiliary_loss_clip": 0.01060247, "auxiliary_loss_mlp": 0.01043497, "balance_loss_clip": 1.01615071, "balance_loss_mlp": 1.01889586, "epoch": 0.4565158575078912, "flos": 20331495434880.0, "grad_norm": 2.027895791899318, "language_loss": 0.71038616, "learning_rate": 2.376491669644098e-06, "loss": 0.73142362, "num_input_tokens_seen": 162891000, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.4140625, "step": 7593, "time_per_iteration": 2.3955235481262207 }, { "auxiliary_loss_clip": 0.01055686, "auxiliary_loss_mlp": 0.01042145, "balance_loss_clip": 1.01779103, "balance_loss_mlp": 1.01791883, "epoch": 0.45657598076055916, "flos": 23982013860480.0, "grad_norm": 1.8585867894698758, "language_loss": 0.84834862, "learning_rate": 2.3761091636459248e-06, "loss": 0.86932695, "num_input_tokens_seen": 162910120, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37890625, "step": 7594, "time_per_iteration": 2.3982200622558594 }, { "auxiliary_loss_clip": 0.01011818, "auxiliary_loss_mlp": 0.01003378, "balance_loss_clip": 1.00093424, "balance_loss_mlp": 1.00377011, "epoch": 0.45663610401322713, "flos": 69361211629440.0, "grad_norm": 0.7811502830930908, "language_loss": 0.52802372, "learning_rate": 2.375726643385654e-06, "loss": 0.54817569, "num_input_tokens_seen": 162963720, "router_z_loss_clip": 0.02441406, "router_z_loss_mlp": 0.08007812, "step": 7595, "time_per_iteration": 3.0359461307525635 }, { "auxiliary_loss_clip": 0.01063223, "auxiliary_loss_mlp": 0.01044139, "balance_loss_clip": 1.01524234, "balance_loss_mlp": 1.01977324, "epoch": 0.4566962272658951, "flos": 15148464480000.0, "grad_norm": 2.2313334549397204, "language_loss": 0.8945027, "learning_rate": 2.3753441088777915e-06, "loss": 0.91557634, "num_input_tokens_seen": 162975760, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.43554688, "step": 7596, "time_per_iteration": 2.3227362632751465 }, { "auxiliary_loss_clip": 0.01061244, "auxiliary_loss_mlp": 0.01045915, "balance_loss_clip": 1.01803219, "balance_loss_mlp": 1.01888835, "epoch": 0.45675635051856306, "flos": 18696477553920.0, "grad_norm": 1.4996810455330196, "language_loss": 0.78092468, "learning_rate": 2.374961560136843e-06, "loss": 0.80199629, "num_input_tokens_seen": 162994865, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.42382812, "step": 7597, "time_per_iteration": 2.3933229446411133 }, { "auxiliary_loss_clip": 0.01060592, "auxiliary_loss_mlp": 0.01044486, "balance_loss_clip": 1.01643634, "balance_loss_mlp": 1.01873112, "epoch": 0.456816473771231, "flos": 19097315406720.0, "grad_norm": 1.6232572055955337, "language_loss": 0.79308867, "learning_rate": 2.374578997177314e-06, "loss": 0.81413949, "num_input_tokens_seen": 163014730, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41992188, "step": 7598, "time_per_iteration": 2.3688669204711914 }, { "auxiliary_loss_clip": 0.01056706, "auxiliary_loss_mlp": 0.01036036, "balance_loss_clip": 1.01106155, "balance_loss_mlp": 1.01717114, "epoch": 0.456876597023899, "flos": 28948792654080.0, "grad_norm": 2.0769349949547022, "language_loss": 0.72700226, "learning_rate": 2.374196420013712e-06, "loss": 0.74792969, "num_input_tokens_seen": 163033405, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.39453125, "step": 7599, "time_per_iteration": 2.445981025695801 }, { "auxiliary_loss_clip": 0.01056254, "auxiliary_loss_mlp": 0.01042967, "balance_loss_clip": 1.01742029, "balance_loss_mlp": 1.01674163, "epoch": 0.45693672027656695, "flos": 23288499146880.0, "grad_norm": 3.226841599092553, "language_loss": 0.7097019, "learning_rate": 2.373813828660544e-06, "loss": 0.73069412, "num_input_tokens_seen": 163051400, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.39453125, "step": 7600, "time_per_iteration": 2.3679611682891846 }, { "auxiliary_loss_clip": 0.01058438, "auxiliary_loss_mlp": 0.01043846, "balance_loss_clip": 1.01834762, "balance_loss_mlp": 1.01847243, "epoch": 0.4569968435292349, "flos": 20557173663360.0, "grad_norm": 1.7740166001843118, "language_loss": 0.80736661, "learning_rate": 2.373431223132319e-06, "loss": 0.82838941, "num_input_tokens_seen": 163069250, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40039062, "step": 7601, "time_per_iteration": 2.3913936614990234 }, { "auxiliary_loss_clip": 0.01060042, "auxiliary_loss_mlp": 0.01045829, "balance_loss_clip": 1.02074718, "balance_loss_mlp": 1.01895809, "epoch": 0.4570569667819029, "flos": 41280988579200.0, "grad_norm": 1.6827401025545106, "language_loss": 0.7319445, "learning_rate": 2.3730486034435448e-06, "loss": 0.75300324, "num_input_tokens_seen": 163091755, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.41015625, "step": 7602, "time_per_iteration": 2.535433769226074 }, { "auxiliary_loss_clip": 0.01059539, "auxiliary_loss_mlp": 0.01050032, "balance_loss_clip": 1.02149343, "balance_loss_mlp": 1.0189774, "epoch": 0.45711709003457085, "flos": 26030367861120.0, "grad_norm": 1.9845510835016633, "language_loss": 0.75118232, "learning_rate": 2.372665969608729e-06, "loss": 0.77227801, "num_input_tokens_seen": 163111600, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40625, "step": 7603, "time_per_iteration": 2.4382476806640625 }, { "auxiliary_loss_clip": 0.01057818, "auxiliary_loss_mlp": 0.01053064, "balance_loss_clip": 1.02603889, "balance_loss_mlp": 1.01768446, "epoch": 0.4571772132872388, "flos": 22157138672640.0, "grad_norm": 2.0895261253163455, "language_loss": 0.84190595, "learning_rate": 2.372283321642383e-06, "loss": 0.8630147, "num_input_tokens_seen": 163127350, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40039062, "step": 7604, "time_per_iteration": 2.3670825958251953 }, { "auxiliary_loss_clip": 0.01065413, "auxiliary_loss_mlp": 0.01054225, "balance_loss_clip": 1.02335012, "balance_loss_mlp": 1.02102017, "epoch": 0.45723733653990684, "flos": 23877728029440.0, "grad_norm": 2.05827111641087, "language_loss": 0.87572074, "learning_rate": 2.371900659559016e-06, "loss": 0.8969171, "num_input_tokens_seen": 163145855, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4453125, "step": 7605, "time_per_iteration": 2.3866117000579834 }, { "auxiliary_loss_clip": 0.01060637, "auxiliary_loss_mlp": 0.01042794, "balance_loss_clip": 1.01635289, "balance_loss_mlp": 1.01864743, "epoch": 0.4572974597925748, "flos": 16870904138880.0, "grad_norm": 1.8144405181312455, "language_loss": 0.7511003, "learning_rate": 2.371517983373138e-06, "loss": 0.7721346, "num_input_tokens_seen": 163163830, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.41992188, "step": 7606, "time_per_iteration": 2.3695812225341797 }, { "auxiliary_loss_clip": 0.01061164, "auxiliary_loss_mlp": 0.01042237, "balance_loss_clip": 1.01438928, "balance_loss_mlp": 1.0190742, "epoch": 0.45735758304524277, "flos": 13770651651840.0, "grad_norm": 2.034643272951108, "language_loss": 0.81941718, "learning_rate": 2.371135293099262e-06, "loss": 0.84045118, "num_input_tokens_seen": 163180700, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41992188, "step": 7607, "time_per_iteration": 2.3556575775146484 }, { "auxiliary_loss_clip": 0.01061068, "auxiliary_loss_mlp": 0.01046162, "balance_loss_clip": 1.01830292, "balance_loss_mlp": 1.02012479, "epoch": 0.45741770629791073, "flos": 21099828925440.0, "grad_norm": 1.7336368565476437, "language_loss": 0.81407315, "learning_rate": 2.3707525887518982e-06, "loss": 0.83514547, "num_input_tokens_seen": 163199450, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40820312, "step": 7608, "time_per_iteration": 2.398467779159546 }, { "auxiliary_loss_clip": 0.01057934, "auxiliary_loss_mlp": 0.01039612, "balance_loss_clip": 1.01376772, "balance_loss_mlp": 1.01694822, "epoch": 0.4574778295505787, "flos": 23111768511360.0, "grad_norm": 1.6206119650206945, "language_loss": 0.69531363, "learning_rate": 2.370369870345559e-06, "loss": 0.7162891, "num_input_tokens_seen": 163217875, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.41015625, "step": 7609, "time_per_iteration": 2.3712785243988037 }, { "auxiliary_loss_clip": 0.01059942, "auxiliary_loss_mlp": 0.01046399, "balance_loss_clip": 1.02018452, "balance_loss_mlp": 1.01918185, "epoch": 0.45753795280324666, "flos": 24351778736640.0, "grad_norm": 1.804244782556104, "language_loss": 0.82632053, "learning_rate": 2.369987137894757e-06, "loss": 0.84738392, "num_input_tokens_seen": 163237430, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40820312, "step": 7610, "time_per_iteration": 2.41630220413208 }, { "auxiliary_loss_clip": 0.01061698, "auxiliary_loss_mlp": 0.01045349, "balance_loss_clip": 1.0168817, "balance_loss_mlp": 1.01871932, "epoch": 0.4575980760559146, "flos": 16652871498240.0, "grad_norm": 2.6771548215740855, "language_loss": 0.83400738, "learning_rate": 2.3696043914140057e-06, "loss": 0.85507792, "num_input_tokens_seen": 163253905, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4296875, "step": 7611, "time_per_iteration": 5.056148290634155 }, { "auxiliary_loss_clip": 0.01061539, "auxiliary_loss_mlp": 0.01046669, "balance_loss_clip": 1.01798773, "balance_loss_mlp": 1.01969194, "epoch": 0.4576581993085826, "flos": 35910160087680.0, "grad_norm": 1.7711288939672405, "language_loss": 0.75233096, "learning_rate": 2.369221630917819e-06, "loss": 0.77341306, "num_input_tokens_seen": 163274285, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.41796875, "step": 7612, "time_per_iteration": 2.49027681350708 }, { "auxiliary_loss_clip": 0.01058126, "auxiliary_loss_mlp": 0.01038345, "balance_loss_clip": 1.01258361, "balance_loss_mlp": 1.01804602, "epoch": 0.45771832256125056, "flos": 20079492174720.0, "grad_norm": 1.848481831453945, "language_loss": 0.86142892, "learning_rate": 2.368838856420711e-06, "loss": 0.8823936, "num_input_tokens_seen": 163293150, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40234375, "step": 7613, "time_per_iteration": 2.3878791332244873 }, { "auxiliary_loss_clip": 0.01062908, "auxiliary_loss_mlp": 0.01042004, "balance_loss_clip": 1.01422787, "balance_loss_mlp": 1.02091157, "epoch": 0.4577784458139185, "flos": 10743542196480.0, "grad_norm": 2.169837578846919, "language_loss": 0.77329063, "learning_rate": 2.3684560679371965e-06, "loss": 0.79433972, "num_input_tokens_seen": 163310065, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41992188, "step": 7614, "time_per_iteration": 3.80096697807312 }, { "auxiliary_loss_clip": 0.01061079, "auxiliary_loss_mlp": 0.01043943, "balance_loss_clip": 1.01703739, "balance_loss_mlp": 1.02024269, "epoch": 0.4578385690665865, "flos": 21906217664640.0, "grad_norm": 1.5408649038362092, "language_loss": 0.7531724, "learning_rate": 2.368073265481791e-06, "loss": 0.77422261, "num_input_tokens_seen": 163329415, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40820312, "step": 7615, "time_per_iteration": 2.3691372871398926 }, { "auxiliary_loss_clip": 0.01011812, "auxiliary_loss_mlp": 0.0101266, "balance_loss_clip": 1.01010847, "balance_loss_mlp": 1.00328565, "epoch": 0.45789869231925445, "flos": 64755574606080.0, "grad_norm": 0.7929262380307466, "language_loss": 0.57771432, "learning_rate": 2.3676904490690105e-06, "loss": 0.59795904, "num_input_tokens_seen": 163385875, "router_z_loss_clip": 0.0255127, "router_z_loss_mlp": 0.08496094, "step": 7616, "time_per_iteration": 2.9558324813842773 }, { "auxiliary_loss_clip": 0.01059394, "auxiliary_loss_mlp": 0.01043402, "balance_loss_clip": 1.0164727, "balance_loss_mlp": 1.0181725, "epoch": 0.4579588155719224, "flos": 16143069692160.0, "grad_norm": 1.5137194189612533, "language_loss": 0.72087508, "learning_rate": 2.3673076187133704e-06, "loss": 0.74190301, "num_input_tokens_seen": 163405170, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.41210938, "step": 7617, "time_per_iteration": 2.3521227836608887 }, { "auxiliary_loss_clip": 0.01059546, "auxiliary_loss_mlp": 0.0103633, "balance_loss_clip": 1.01112926, "balance_loss_mlp": 1.01940811, "epoch": 0.45801893882459044, "flos": 21394530645120.0, "grad_norm": 2.0287878278241243, "language_loss": 0.7770983, "learning_rate": 2.36692477442939e-06, "loss": 0.79805708, "num_input_tokens_seen": 163423155, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.40234375, "step": 7618, "time_per_iteration": 2.3814167976379395 }, { "auxiliary_loss_clip": 0.01062869, "auxiliary_loss_mlp": 0.01044488, "balance_loss_clip": 1.01591396, "balance_loss_mlp": 1.02017009, "epoch": 0.4580790620772584, "flos": 19535545192320.0, "grad_norm": 1.8061431895305433, "language_loss": 0.78766, "learning_rate": 2.366541916231585e-06, "loss": 0.80873358, "num_input_tokens_seen": 163442450, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.42578125, "step": 7619, "time_per_iteration": 3.767815589904785 }, { "auxiliary_loss_clip": 0.0105877, "auxiliary_loss_mlp": 0.01042514, "balance_loss_clip": 1.01759958, "balance_loss_mlp": 1.0191834, "epoch": 0.45813918532992637, "flos": 16580147402880.0, "grad_norm": 1.7793254827838023, "language_loss": 0.72770739, "learning_rate": 2.366159044134473e-06, "loss": 0.74872029, "num_input_tokens_seen": 163459810, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.39453125, "step": 7620, "time_per_iteration": 2.370732545852661 }, { "auxiliary_loss_clip": 0.01058109, "auxiliary_loss_mlp": 0.01038386, "balance_loss_clip": 1.01349545, "balance_loss_mlp": 1.0183847, "epoch": 0.45819930858259433, "flos": 42228671057280.0, "grad_norm": 1.6928584031994731, "language_loss": 0.78981167, "learning_rate": 2.3657761581525748e-06, "loss": 0.81077659, "num_input_tokens_seen": 163482970, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.3984375, "step": 7621, "time_per_iteration": 2.5454959869384766 }, { "auxiliary_loss_clip": 0.01009818, "auxiliary_loss_mlp": 0.01003715, "balance_loss_clip": 1.00110412, "balance_loss_mlp": 1.00182438, "epoch": 0.4582594318352623, "flos": 63711705732480.0, "grad_norm": 0.7850306508084729, "language_loss": 0.65066141, "learning_rate": 2.3653932583004063e-06, "loss": 0.67079669, "num_input_tokens_seen": 163545330, "router_z_loss_clip": 0.02612305, "router_z_loss_mlp": 0.08007812, "step": 7622, "time_per_iteration": 3.0070178508758545 }, { "auxiliary_loss_clip": 0.01062607, "auxiliary_loss_mlp": 0.01041869, "balance_loss_clip": 1.01110125, "balance_loss_mlp": 1.01954579, "epoch": 0.45831955508793026, "flos": 26868772183680.0, "grad_norm": 1.7081261105395746, "language_loss": 0.80907011, "learning_rate": 2.3650103445924903e-06, "loss": 0.8301149, "num_input_tokens_seen": 163564620, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4296875, "step": 7623, "time_per_iteration": 2.4171340465545654 }, { "auxiliary_loss_clip": 0.01060623, "auxiliary_loss_mlp": 0.0104322, "balance_loss_clip": 1.0141449, "balance_loss_mlp": 1.01814699, "epoch": 0.45837967834059823, "flos": 18732961791360.0, "grad_norm": 2.185298001555558, "language_loss": 0.72116017, "learning_rate": 2.3646274170433452e-06, "loss": 0.74219865, "num_input_tokens_seen": 163581010, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.42382812, "step": 7624, "time_per_iteration": 2.3985538482666016 }, { "auxiliary_loss_clip": 0.01060244, "auxiliary_loss_mlp": 0.01044027, "balance_loss_clip": 1.0153569, "balance_loss_mlp": 1.01787329, "epoch": 0.4584398015932662, "flos": 21177056586240.0, "grad_norm": 1.789770086953154, "language_loss": 0.7426964, "learning_rate": 2.364244475667491e-06, "loss": 0.76373911, "num_input_tokens_seen": 163599955, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.42382812, "step": 7625, "time_per_iteration": 2.379972457885742 }, { "auxiliary_loss_clip": 0.01058787, "auxiliary_loss_mlp": 0.01047727, "balance_loss_clip": 1.01980877, "balance_loss_mlp": 1.01818991, "epoch": 0.45849992484593416, "flos": 19789084552320.0, "grad_norm": 2.16751493550557, "language_loss": 0.79168308, "learning_rate": 2.363861520479451e-06, "loss": 0.81274825, "num_input_tokens_seen": 163618545, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40625, "step": 7626, "time_per_iteration": 2.392838716506958 }, { "auxiliary_loss_clip": 0.01061693, "auxiliary_loss_mlp": 0.01047822, "balance_loss_clip": 1.01691151, "balance_loss_mlp": 1.01852036, "epoch": 0.4585600480986021, "flos": 18222287201280.0, "grad_norm": 2.04518428941276, "language_loss": 0.8645786, "learning_rate": 2.3634785514937445e-06, "loss": 0.88567376, "num_input_tokens_seen": 163636055, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.43164062, "step": 7627, "time_per_iteration": 2.331693649291992 }, { "auxiliary_loss_clip": 0.01063703, "auxiliary_loss_mlp": 0.01051944, "balance_loss_clip": 1.01653886, "balance_loss_mlp": 1.01875734, "epoch": 0.4586201713512701, "flos": 29020958167680.0, "grad_norm": 1.5458735696990489, "language_loss": 0.70731962, "learning_rate": 2.3630955687248953e-06, "loss": 0.72847605, "num_input_tokens_seen": 163657485, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 0.44921875, "step": 7628, "time_per_iteration": 2.43790602684021 }, { "auxiliary_loss_clip": 0.0105915, "auxiliary_loss_mlp": 0.01040991, "balance_loss_clip": 1.0128814, "balance_loss_mlp": 1.01757836, "epoch": 0.45868029460393805, "flos": 23403467854080.0, "grad_norm": 1.48292274360582, "language_loss": 0.78972363, "learning_rate": 2.3627125721874265e-06, "loss": 0.81072497, "num_input_tokens_seen": 163676030, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41601562, "step": 7629, "time_per_iteration": 2.4037492275238037 }, { "auxiliary_loss_clip": 0.01064378, "auxiliary_loss_mlp": 0.01048281, "balance_loss_clip": 1.01663113, "balance_loss_mlp": 1.01858902, "epoch": 0.458740417856606, "flos": 18221030392320.0, "grad_norm": 2.7075777117511466, "language_loss": 0.8125459, "learning_rate": 2.3623295618958595e-06, "loss": 0.83367252, "num_input_tokens_seen": 163694490, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.45703125, "step": 7630, "time_per_iteration": 2.3722639083862305 }, { "auxiliary_loss_clip": 0.01062811, "auxiliary_loss_mlp": 0.01051262, "balance_loss_clip": 1.01907539, "balance_loss_mlp": 1.01846266, "epoch": 0.458800541109274, "flos": 34567330308480.0, "grad_norm": 1.8666396289257838, "language_loss": 0.73632479, "learning_rate": 2.3619465378647198e-06, "loss": 0.75746548, "num_input_tokens_seen": 163717035, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.4453125, "step": 7631, "time_per_iteration": 2.4682650566101074 }, { "auxiliary_loss_clip": 0.0106263, "auxiliary_loss_mlp": 0.01052644, "balance_loss_clip": 1.02127957, "balance_loss_mlp": 1.01934266, "epoch": 0.458860664361942, "flos": 17711158763520.0, "grad_norm": 2.1365234652590637, "language_loss": 0.73511523, "learning_rate": 2.361563500108531e-06, "loss": 0.75626802, "num_input_tokens_seen": 163734525, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.43359375, "step": 7632, "time_per_iteration": 2.396500825881958 }, { "auxiliary_loss_clip": 0.01063565, "auxiliary_loss_mlp": 0.01048166, "balance_loss_clip": 1.01535964, "balance_loss_mlp": 1.0180341, "epoch": 0.45892078761460997, "flos": 18440913335040.0, "grad_norm": 2.461462919057188, "language_loss": 0.71278334, "learning_rate": 2.3611804486418178e-06, "loss": 0.73390067, "num_input_tokens_seen": 163752860, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.45507812, "step": 7633, "time_per_iteration": 2.349790334701538 }, { "auxiliary_loss_clip": 0.01062441, "auxiliary_loss_mlp": 0.01055773, "balance_loss_clip": 1.02482665, "balance_loss_mlp": 1.01929379, "epoch": 0.45898091086727794, "flos": 22671897891840.0, "grad_norm": 1.5036837676908137, "language_loss": 0.82565147, "learning_rate": 2.3607973834791062e-06, "loss": 0.84683359, "num_input_tokens_seen": 163772495, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.43164062, "step": 7634, "time_per_iteration": 2.4247701168060303 }, { "auxiliary_loss_clip": 0.01064401, "auxiliary_loss_mlp": 0.0105527, "balance_loss_clip": 1.01881623, "balance_loss_mlp": 1.01875639, "epoch": 0.4590410341199459, "flos": 21651875343360.0, "grad_norm": 5.120830366455669, "language_loss": 0.82799065, "learning_rate": 2.3604143046349216e-06, "loss": 0.84918737, "num_input_tokens_seen": 163791475, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 0.45703125, "step": 7635, "time_per_iteration": 2.3572170734405518 }, { "auxiliary_loss_clip": 0.01061608, "auxiliary_loss_mlp": 0.01060603, "balance_loss_clip": 1.03057456, "balance_loss_mlp": 1.01916265, "epoch": 0.45910115737261387, "flos": 36533987994240.0, "grad_norm": 1.5108601783086366, "language_loss": 0.66209006, "learning_rate": 2.3600312121237905e-06, "loss": 0.68331218, "num_input_tokens_seen": 163812995, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.42578125, "step": 7636, "time_per_iteration": 2.523144483566284 }, { "auxiliary_loss_clip": 0.01061144, "auxiliary_loss_mlp": 0.01042301, "balance_loss_clip": 1.01414418, "balance_loss_mlp": 1.01966608, "epoch": 0.45916128062528183, "flos": 24418882103040.0, "grad_norm": 1.4665530940942548, "language_loss": 0.81398308, "learning_rate": 2.3596481059602395e-06, "loss": 0.83501756, "num_input_tokens_seen": 163833945, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.4140625, "step": 7637, "time_per_iteration": 2.409820318222046 }, { "auxiliary_loss_clip": 0.01064537, "auxiliary_loss_mlp": 0.01046665, "balance_loss_clip": 1.0142405, "balance_loss_mlp": 1.01931977, "epoch": 0.4592214038779498, "flos": 23220837198720.0, "grad_norm": 1.6578607587424679, "language_loss": 0.76383936, "learning_rate": 2.3592649861587965e-06, "loss": 0.78495145, "num_input_tokens_seen": 163853885, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.453125, "step": 7638, "time_per_iteration": 2.4298181533813477 }, { "auxiliary_loss_clip": 0.01060398, "auxiliary_loss_mlp": 0.01053038, "balance_loss_clip": 1.02098298, "balance_loss_mlp": 1.01837587, "epoch": 0.45928152713061776, "flos": 19171121754240.0, "grad_norm": 1.7229400916486581, "language_loss": 0.75006276, "learning_rate": 2.358881852733989e-06, "loss": 0.77119714, "num_input_tokens_seen": 163871855, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.41992188, "step": 7639, "time_per_iteration": 2.3341550827026367 }, { "auxiliary_loss_clip": 0.01061717, "auxiliary_loss_mlp": 0.01047964, "balance_loss_clip": 1.01661181, "balance_loss_mlp": 1.01809335, "epoch": 0.4593416503832857, "flos": 22413715320960.0, "grad_norm": 1.7656305632229226, "language_loss": 0.69466132, "learning_rate": 2.358498705700346e-06, "loss": 0.71575814, "num_input_tokens_seen": 163891450, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.43554688, "step": 7640, "time_per_iteration": 2.4109485149383545 }, { "auxiliary_loss_clip": 0.01063456, "auxiliary_loss_mlp": 0.01052766, "balance_loss_clip": 1.02108002, "balance_loss_mlp": 1.01819742, "epoch": 0.4594017736359537, "flos": 18879212943360.0, "grad_norm": 1.85300086816382, "language_loss": 0.76823634, "learning_rate": 2.3581155450723958e-06, "loss": 0.78939855, "num_input_tokens_seen": 163909345, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.453125, "step": 7641, "time_per_iteration": 2.352515697479248 }, { "auxiliary_loss_clip": 0.01062886, "auxiliary_loss_mlp": 0.01046741, "balance_loss_clip": 1.01370871, "balance_loss_mlp": 1.01848102, "epoch": 0.45946189688862166, "flos": 20517617226240.0, "grad_norm": 2.782012021907221, "language_loss": 0.76368207, "learning_rate": 2.357732370864668e-06, "loss": 0.78477836, "num_input_tokens_seen": 163926940, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.4453125, "step": 7642, "time_per_iteration": 2.4027509689331055 }, { "auxiliary_loss_clip": 0.01011666, "auxiliary_loss_mlp": 0.01007238, "balance_loss_clip": 1.00471127, "balance_loss_mlp": 1.00339127, "epoch": 0.4595220201412896, "flos": 61403249036160.0, "grad_norm": 0.8396385840465036, "language_loss": 0.58259141, "learning_rate": 2.357349183091694e-06, "loss": 0.60278046, "num_input_tokens_seen": 163977785, "router_z_loss_clip": 0.02526855, "router_z_loss_mlp": 0.08300781, "step": 7643, "time_per_iteration": 2.7522642612457275 }, { "auxiliary_loss_clip": 0.0106424, "auxiliary_loss_mlp": 0.01051437, "balance_loss_clip": 1.01839197, "balance_loss_mlp": 1.01814687, "epoch": 0.4595821433939576, "flos": 23329836063360.0, "grad_norm": 1.4743489910457728, "language_loss": 0.93959427, "learning_rate": 2.3569659817680016e-06, "loss": 0.96075106, "num_input_tokens_seen": 163996630, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.4609375, "step": 7644, "time_per_iteration": 2.4043054580688477 }, { "auxiliary_loss_clip": 0.01063857, "auxiliary_loss_mlp": 0.01047967, "balance_loss_clip": 1.01535177, "balance_loss_mlp": 1.01848757, "epoch": 0.4596422666466256, "flos": 14281500798720.0, "grad_norm": 2.0799827875031767, "language_loss": 0.84561551, "learning_rate": 2.3565827669081243e-06, "loss": 0.86673379, "num_input_tokens_seen": 164013190, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 0.453125, "step": 7645, "time_per_iteration": 2.33394455909729 }, { "auxiliary_loss_clip": 0.0100959, "auxiliary_loss_mlp": 0.01007081, "balance_loss_clip": 1.0043031, "balance_loss_mlp": 1.00165439, "epoch": 0.4597023898992936, "flos": 65724029343360.0, "grad_norm": 0.7834073321176499, "language_loss": 0.59890699, "learning_rate": 2.356199538526593e-06, "loss": 0.61907375, "num_input_tokens_seen": 164074030, "router_z_loss_clip": 0.02783203, "router_z_loss_mlp": 0.07910156, "step": 7646, "time_per_iteration": 2.944260358810425 }, { "auxiliary_loss_clip": 0.01062275, "auxiliary_loss_mlp": 0.01043162, "balance_loss_clip": 1.01312137, "balance_loss_mlp": 1.01878631, "epoch": 0.45976251315196154, "flos": 26905849914240.0, "grad_norm": 1.564301884401858, "language_loss": 0.73685789, "learning_rate": 2.355816296637939e-06, "loss": 0.75791228, "num_input_tokens_seen": 164095515, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.43554688, "step": 7647, "time_per_iteration": 2.4041519165039062 }, { "auxiliary_loss_clip": 0.01063284, "auxiliary_loss_mlp": 0.0104828, "balance_loss_clip": 1.01775026, "balance_loss_mlp": 1.01946545, "epoch": 0.4598226364046295, "flos": 26616768923520.0, "grad_norm": 1.6791709403390798, "language_loss": 0.68000674, "learning_rate": 2.3554330412566957e-06, "loss": 0.7011224, "num_input_tokens_seen": 164117270, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.4375, "step": 7648, "time_per_iteration": 2.436443567276001 }, { "auxiliary_loss_clip": 0.01062548, "auxiliary_loss_mlp": 0.01048077, "balance_loss_clip": 1.01665306, "balance_loss_mlp": 1.01888943, "epoch": 0.45988275965729747, "flos": 24386657051520.0, "grad_norm": 1.7325451442904258, "language_loss": 0.79666764, "learning_rate": 2.3550497723973953e-06, "loss": 0.81777382, "num_input_tokens_seen": 164137850, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.4375, "step": 7649, "time_per_iteration": 2.398726463317871 }, { "auxiliary_loss_clip": 0.01062495, "auxiliary_loss_mlp": 0.01046868, "balance_loss_clip": 1.01744676, "balance_loss_mlp": 1.0202316, "epoch": 0.45994288290996543, "flos": 24534653771520.0, "grad_norm": 1.9541120109094536, "language_loss": 0.70800239, "learning_rate": 2.3546664900745726e-06, "loss": 0.729096, "num_input_tokens_seen": 164157960, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.421875, "step": 7650, "time_per_iteration": 3.7953343391418457 }, { "auxiliary_loss_clip": 0.01065938, "auxiliary_loss_mlp": 0.01051906, "balance_loss_clip": 1.01604831, "balance_loss_mlp": 1.01963496, "epoch": 0.4600030061626334, "flos": 14829357853440.0, "grad_norm": 1.9806342668653707, "language_loss": 0.85886681, "learning_rate": 2.354283194302761e-06, "loss": 0.88004518, "num_input_tokens_seen": 164174590, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.46289062, "step": 7651, "time_per_iteration": 3.7432727813720703 }, { "auxiliary_loss_clip": 0.01060872, "auxiliary_loss_mlp": 0.01045176, "balance_loss_clip": 1.01496863, "balance_loss_mlp": 1.01840699, "epoch": 0.46006312941530136, "flos": 18112869400320.0, "grad_norm": 1.6921433007267233, "language_loss": 0.76261151, "learning_rate": 2.3538998850964948e-06, "loss": 0.78367198, "num_input_tokens_seen": 164192935, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.42382812, "step": 7652, "time_per_iteration": 2.3489830493927 }, { "auxiliary_loss_clip": 0.01060988, "auxiliary_loss_mlp": 0.01045086, "balance_loss_clip": 1.01593959, "balance_loss_mlp": 1.01834929, "epoch": 0.46012325266796933, "flos": 21975520446720.0, "grad_norm": 1.8641937943133977, "language_loss": 0.76947695, "learning_rate": 2.3535165624703097e-06, "loss": 0.79053771, "num_input_tokens_seen": 164213160, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.42578125, "step": 7653, "time_per_iteration": 3.834712266921997 }, { "auxiliary_loss_clip": 0.01067405, "auxiliary_loss_mlp": 0.01048067, "balance_loss_clip": 1.01473594, "balance_loss_mlp": 1.02177024, "epoch": 0.4601833759206373, "flos": 15267168702720.0, "grad_norm": 2.0952528042894794, "language_loss": 0.67590415, "learning_rate": 2.353133226438741e-06, "loss": 0.69705886, "num_input_tokens_seen": 164229330, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.45703125, "step": 7654, "time_per_iteration": 2.3489742279052734 }, { "auxiliary_loss_clip": 0.01060338, "auxiliary_loss_mlp": 0.01047123, "balance_loss_clip": 1.01839375, "balance_loss_mlp": 1.01836538, "epoch": 0.46024349917330526, "flos": 27087782342400.0, "grad_norm": 1.6670386250789198, "language_loss": 0.80507815, "learning_rate": 2.3527498770163248e-06, "loss": 0.82615274, "num_input_tokens_seen": 164248240, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.421875, "step": 7655, "time_per_iteration": 2.419264793395996 }, { "auxiliary_loss_clip": 0.01059178, "auxiliary_loss_mlp": 0.01046335, "balance_loss_clip": 1.01706934, "balance_loss_mlp": 1.01776731, "epoch": 0.4603036224259732, "flos": 24461755119360.0, "grad_norm": 1.5823434977786461, "language_loss": 0.68747461, "learning_rate": 2.3523665142175985e-06, "loss": 0.70852977, "num_input_tokens_seen": 164268020, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4140625, "step": 7656, "time_per_iteration": 2.3946292400360107 }, { "auxiliary_loss_clip": 0.01062804, "auxiliary_loss_mlp": 0.01049971, "balance_loss_clip": 1.019418, "balance_loss_mlp": 1.01955247, "epoch": 0.4603637456786412, "flos": 28108084181760.0, "grad_norm": 1.5597807719779306, "language_loss": 0.81988567, "learning_rate": 2.351983138057098e-06, "loss": 0.84101337, "num_input_tokens_seen": 164287305, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.43359375, "step": 7657, "time_per_iteration": 2.4348127841949463 }, { "auxiliary_loss_clip": 0.0106059, "auxiliary_loss_mlp": 0.01042859, "balance_loss_clip": 1.01164961, "balance_loss_mlp": 1.01795316, "epoch": 0.4604238689313092, "flos": 24347903575680.0, "grad_norm": 2.0213099870495426, "language_loss": 0.71669883, "learning_rate": 2.3515997485493623e-06, "loss": 0.7377333, "num_input_tokens_seen": 164306835, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.42578125, "step": 7658, "time_per_iteration": 2.386352062225342 }, { "auxiliary_loss_clip": 0.01010643, "auxiliary_loss_mlp": 0.0100841, "balance_loss_clip": 1.00603759, "balance_loss_mlp": 1.00214195, "epoch": 0.4604839921839772, "flos": 53603477280000.0, "grad_norm": 0.9768039702030216, "language_loss": 0.62233955, "learning_rate": 2.351216345708928e-06, "loss": 0.64253008, "num_input_tokens_seen": 164367095, "router_z_loss_clip": 0.02368164, "router_z_loss_mlp": 0.08496094, "step": 7659, "time_per_iteration": 4.465997695922852 }, { "auxiliary_loss_clip": 0.01061234, "auxiliary_loss_mlp": 0.01052728, "balance_loss_clip": 1.02347445, "balance_loss_mlp": 1.02039206, "epoch": 0.46054411543664514, "flos": 31247090144640.0, "grad_norm": 1.5450528965580168, "language_loss": 0.69146204, "learning_rate": 2.350832929550336e-06, "loss": 0.71260166, "num_input_tokens_seen": 164388895, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.40820312, "step": 7660, "time_per_iteration": 2.523466110229492 }, { "auxiliary_loss_clip": 0.01059799, "auxiliary_loss_mlp": 0.0105093, "balance_loss_clip": 1.02090085, "balance_loss_mlp": 1.0177896, "epoch": 0.4606042386893131, "flos": 24091850597760.0, "grad_norm": 1.6188750548987032, "language_loss": 0.77859831, "learning_rate": 2.3504495000881227e-06, "loss": 0.79970562, "num_input_tokens_seen": 164409080, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.41992188, "step": 7661, "time_per_iteration": 2.404334783554077 }, { "auxiliary_loss_clip": 0.01060999, "auxiliary_loss_mlp": 0.01042603, "balance_loss_clip": 1.01480389, "balance_loss_mlp": 1.01966918, "epoch": 0.46066436194198107, "flos": 26577247397760.0, "grad_norm": 1.8889096207113918, "language_loss": 0.75287342, "learning_rate": 2.3500660573368305e-06, "loss": 0.77390939, "num_input_tokens_seen": 164427585, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41210938, "step": 7662, "time_per_iteration": 2.4249329566955566 }, { "auxiliary_loss_clip": 0.01064028, "auxiliary_loss_mlp": 0.01046763, "balance_loss_clip": 1.01648355, "balance_loss_mlp": 1.01876152, "epoch": 0.46072448519464904, "flos": 17774910639360.0, "grad_norm": 2.477153087973142, "language_loss": 0.81325567, "learning_rate": 2.349682601310998e-06, "loss": 0.83436358, "num_input_tokens_seen": 164438455, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.453125, "step": 7663, "time_per_iteration": 2.308344602584839 }, { "auxiliary_loss_clip": 0.01058906, "auxiliary_loss_mlp": 0.0104335, "balance_loss_clip": 1.01721907, "balance_loss_mlp": 1.01836646, "epoch": 0.460784608447317, "flos": 15085201363200.0, "grad_norm": 1.7934225103637977, "language_loss": 0.74696875, "learning_rate": 2.3492991320251653e-06, "loss": 0.7679913, "num_input_tokens_seen": 164456830, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40429688, "step": 7664, "time_per_iteration": 2.4094228744506836 }, { "auxiliary_loss_clip": 0.01061743, "auxiliary_loss_mlp": 0.01050042, "balance_loss_clip": 1.02078807, "balance_loss_mlp": 1.01915848, "epoch": 0.46084473169998497, "flos": 18587269221120.0, "grad_norm": 1.47343522674296, "language_loss": 0.73796451, "learning_rate": 2.3489156494938753e-06, "loss": 0.75908232, "num_input_tokens_seen": 164475375, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.42578125, "step": 7665, "time_per_iteration": 2.345404863357544 }, { "auxiliary_loss_clip": 0.01059196, "auxiliary_loss_mlp": 0.01043869, "balance_loss_clip": 1.01503253, "balance_loss_mlp": 1.01765633, "epoch": 0.46090485495265293, "flos": 19493928984960.0, "grad_norm": 1.7989892762261108, "language_loss": 0.78770357, "learning_rate": 2.348532153731669e-06, "loss": 0.80873418, "num_input_tokens_seen": 164492040, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4140625, "step": 7666, "time_per_iteration": 2.356403350830078 }, { "auxiliary_loss_clip": 0.01058037, "auxiliary_loss_mlp": 0.01042689, "balance_loss_clip": 1.01336384, "balance_loss_mlp": 1.01700056, "epoch": 0.4609649782053209, "flos": 33363525029760.0, "grad_norm": 1.3889926790483555, "language_loss": 0.7494669, "learning_rate": 2.348148644753088e-06, "loss": 0.7704742, "num_input_tokens_seen": 164513665, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.41015625, "step": 7667, "time_per_iteration": 2.474224328994751 }, { "auxiliary_loss_clip": 0.01058388, "auxiliary_loss_mlp": 0.01043133, "balance_loss_clip": 1.01576209, "balance_loss_mlp": 1.01702714, "epoch": 0.46102510145798886, "flos": 23768030937600.0, "grad_norm": 1.3712990062022, "language_loss": 0.77223027, "learning_rate": 2.347765122572676e-06, "loss": 0.79324543, "num_input_tokens_seen": 164533890, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.4140625, "step": 7668, "time_per_iteration": 2.3919124603271484 }, { "auxiliary_loss_clip": 0.01057571, "auxiliary_loss_mlp": 0.01041727, "balance_loss_clip": 1.01618063, "balance_loss_mlp": 1.01799047, "epoch": 0.4610852247106568, "flos": 23293700939520.0, "grad_norm": 1.5792744580482982, "language_loss": 0.78558874, "learning_rate": 2.347381587204975e-06, "loss": 0.8065818, "num_input_tokens_seen": 164553815, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.39453125, "step": 7669, "time_per_iteration": 2.413553237915039 }, { "auxiliary_loss_clip": 0.01061122, "auxiliary_loss_mlp": 0.01044074, "balance_loss_clip": 1.01497483, "balance_loss_mlp": 1.01951885, "epoch": 0.4611453479633248, "flos": 25446270948480.0, "grad_norm": 1.731573701298637, "language_loss": 0.84048313, "learning_rate": 2.34699803866453e-06, "loss": 0.86153507, "num_input_tokens_seen": 164573125, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.41796875, "step": 7670, "time_per_iteration": 2.4130802154541016 }, { "auxiliary_loss_clip": 0.0105846, "auxiliary_loss_mlp": 0.01044136, "balance_loss_clip": 1.01761222, "balance_loss_mlp": 1.01795208, "epoch": 0.4612054712159928, "flos": 21138617312640.0, "grad_norm": 1.5463473670581702, "language_loss": 0.64363927, "learning_rate": 2.3466144769658845e-06, "loss": 0.66466522, "num_input_tokens_seen": 164592575, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40429688, "step": 7671, "time_per_iteration": 2.394991159439087 }, { "auxiliary_loss_clip": 0.01009781, "auxiliary_loss_mlp": 0.01004605, "balance_loss_clip": 1.00173247, "balance_loss_mlp": 1.00176406, "epoch": 0.4612655944686608, "flos": 69955851772800.0, "grad_norm": 0.7019538810597213, "language_loss": 0.55946207, "learning_rate": 2.346230902123583e-06, "loss": 0.57960594, "num_input_tokens_seen": 164659795, "router_z_loss_clip": 0.02868652, "router_z_loss_mlp": 0.08007812, "step": 7672, "time_per_iteration": 3.112905263900757 }, { "auxiliary_loss_clip": 0.01060729, "auxiliary_loss_mlp": 0.01046606, "balance_loss_clip": 1.01682758, "balance_loss_mlp": 1.01834154, "epoch": 0.46132571772132874, "flos": 16836200380800.0, "grad_norm": 1.87682454826404, "language_loss": 0.72567713, "learning_rate": 2.3458473141521715e-06, "loss": 0.74675047, "num_input_tokens_seen": 164678735, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.42382812, "step": 7673, "time_per_iteration": 2.3370959758758545 }, { "auxiliary_loss_clip": 0.01060442, "auxiliary_loss_mlp": 0.01042072, "balance_loss_clip": 1.01405752, "balance_loss_mlp": 1.01972985, "epoch": 0.4613858409739967, "flos": 35807480179200.0, "grad_norm": 1.8743863304483863, "language_loss": 0.71922696, "learning_rate": 2.345463713066195e-06, "loss": 0.74025208, "num_input_tokens_seen": 164700885, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40625, "step": 7674, "time_per_iteration": 2.516575336456299 }, { "auxiliary_loss_clip": 0.01059705, "auxiliary_loss_mlp": 0.01041918, "balance_loss_clip": 1.01224649, "balance_loss_mlp": 1.01746237, "epoch": 0.4614459642266647, "flos": 35265174030720.0, "grad_norm": 1.619378595854543, "language_loss": 0.67201805, "learning_rate": 2.3450800988801996e-06, "loss": 0.69303429, "num_input_tokens_seen": 164726960, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.421875, "step": 7675, "time_per_iteration": 2.5457570552825928 }, { "auxiliary_loss_clip": 0.0100992, "auxiliary_loss_mlp": 0.01005098, "balance_loss_clip": 1.00248718, "balance_loss_mlp": 1.00206184, "epoch": 0.46150608747933264, "flos": 66701493077760.0, "grad_norm": 0.7299214303106868, "language_loss": 0.58669358, "learning_rate": 2.3446964716087327e-06, "loss": 0.60684377, "num_input_tokens_seen": 164788525, "router_z_loss_clip": 0.02612305, "router_z_loss_mlp": 0.07861328, "step": 7676, "time_per_iteration": 3.0389010906219482 }, { "auxiliary_loss_clip": 0.01010756, "auxiliary_loss_mlp": 0.01005114, "balance_loss_clip": 1.00257456, "balance_loss_mlp": 1.00285244, "epoch": 0.4615662107320006, "flos": 55827409841280.0, "grad_norm": 0.7923994863483977, "language_loss": 0.6277504, "learning_rate": 2.344312831266341e-06, "loss": 0.64790916, "num_input_tokens_seen": 164843525, "router_z_loss_clip": 0.02539062, "router_z_loss_mlp": 0.07910156, "step": 7677, "time_per_iteration": 2.9186391830444336 }, { "auxiliary_loss_clip": 0.01059187, "auxiliary_loss_mlp": 0.01041322, "balance_loss_clip": 1.01517916, "balance_loss_mlp": 1.01883626, "epoch": 0.46162633398466857, "flos": 15482443345920.0, "grad_norm": 2.1757796344568723, "language_loss": 0.77656114, "learning_rate": 2.3439291778675718e-06, "loss": 0.79756629, "num_input_tokens_seen": 164859895, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40429688, "step": 7678, "time_per_iteration": 2.3908486366271973 }, { "auxiliary_loss_clip": 0.01062353, "auxiliary_loss_mlp": 0.01041578, "balance_loss_clip": 1.01276493, "balance_loss_mlp": 1.02057958, "epoch": 0.46168645723733653, "flos": 20010398860800.0, "grad_norm": 2.261113559921667, "language_loss": 0.67602885, "learning_rate": 2.343545511426974e-06, "loss": 0.69706815, "num_input_tokens_seen": 164878030, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.41796875, "step": 7679, "time_per_iteration": 2.3506360054016113 }, { "auxiliary_loss_clip": 0.01060606, "auxiliary_loss_mlp": 0.01049096, "balance_loss_clip": 1.02356148, "balance_loss_mlp": 1.01986957, "epoch": 0.4617465804900045, "flos": 20297629549440.0, "grad_norm": 2.370558510213089, "language_loss": 0.71851933, "learning_rate": 2.3431618319590963e-06, "loss": 0.73961639, "num_input_tokens_seen": 164895710, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.40820312, "step": 7680, "time_per_iteration": 2.385673761367798 }, { "auxiliary_loss_clip": 0.01065588, "auxiliary_loss_mlp": 0.01051335, "balance_loss_clip": 1.02197361, "balance_loss_mlp": 1.02152634, "epoch": 0.46180670374267246, "flos": 22345215500160.0, "grad_norm": 1.890668783587243, "language_loss": 0.65564239, "learning_rate": 2.342778139478487e-06, "loss": 0.6768117, "num_input_tokens_seen": 164913365, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.44140625, "step": 7681, "time_per_iteration": 2.3830442428588867 }, { "auxiliary_loss_clip": 0.01058409, "auxiliary_loss_mlp": 0.01046292, "balance_loss_clip": 1.01927876, "balance_loss_mlp": 1.01890743, "epoch": 0.46186682699534043, "flos": 19894836660480.0, "grad_norm": 1.5360773257238376, "language_loss": 0.68287838, "learning_rate": 2.342394433999697e-06, "loss": 0.70392537, "num_input_tokens_seen": 164931620, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.39453125, "step": 7682, "time_per_iteration": 2.4170122146606445 }, { "auxiliary_loss_clip": 0.01061421, "auxiliary_loss_mlp": 0.01056368, "balance_loss_clip": 1.02760339, "balance_loss_mlp": 1.02001143, "epoch": 0.4619269502480084, "flos": 31502235427200.0, "grad_norm": 2.227603967223568, "language_loss": 0.75753176, "learning_rate": 2.342010715537275e-06, "loss": 0.77870965, "num_input_tokens_seen": 164950905, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.4140625, "step": 7683, "time_per_iteration": 2.4309606552124023 }, { "auxiliary_loss_clip": 0.01059883, "auxiliary_loss_mlp": 0.01049756, "balance_loss_clip": 1.02195692, "balance_loss_mlp": 1.01962864, "epoch": 0.46198707350067636, "flos": 25008320453760.0, "grad_norm": 1.9371825872341872, "language_loss": 0.77618849, "learning_rate": 2.3416269841057726e-06, "loss": 0.79728496, "num_input_tokens_seen": 164970950, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40234375, "step": 7684, "time_per_iteration": 2.430478096008301 }, { "auxiliary_loss_clip": 0.01064731, "auxiliary_loss_mlp": 0.01068618, "balance_loss_clip": 1.03973353, "balance_loss_mlp": 1.02013838, "epoch": 0.4620471967533444, "flos": 18291485249280.0, "grad_norm": 2.0823064843299774, "language_loss": 0.80327106, "learning_rate": 2.3412432397197412e-06, "loss": 0.82460457, "num_input_tokens_seen": 164989855, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4453125, "step": 7685, "time_per_iteration": 2.3590614795684814 }, { "auxiliary_loss_clip": 0.01059987, "auxiliary_loss_mlp": 0.01048015, "balance_loss_clip": 1.02099061, "balance_loss_mlp": 1.01940513, "epoch": 0.46210732000601235, "flos": 33983687243520.0, "grad_norm": 2.182972644051508, "language_loss": 0.67955768, "learning_rate": 2.340859482393731e-06, "loss": 0.7006377, "num_input_tokens_seen": 165012290, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40625, "step": 7686, "time_per_iteration": 2.5280306339263916 }, { "auxiliary_loss_clip": 0.01061872, "auxiliary_loss_mlp": 0.01055669, "balance_loss_clip": 1.02678442, "balance_loss_mlp": 1.018888, "epoch": 0.4621674432586803, "flos": 25008250631040.0, "grad_norm": 2.106244347642457, "language_loss": 0.7517575, "learning_rate": 2.340475712142296e-06, "loss": 0.77293289, "num_input_tokens_seen": 165030810, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4296875, "step": 7687, "time_per_iteration": 2.3905930519104004 }, { "auxiliary_loss_clip": 0.01061993, "auxiliary_loss_mlp": 0.01052546, "balance_loss_clip": 1.02499628, "balance_loss_mlp": 1.02004194, "epoch": 0.4622275665113483, "flos": 22013052024960.0, "grad_norm": 2.747064401471875, "language_loss": 0.76409316, "learning_rate": 2.3400919289799873e-06, "loss": 0.7852385, "num_input_tokens_seen": 165050205, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41992188, "step": 7688, "time_per_iteration": 2.400160789489746 }, { "auxiliary_loss_clip": 0.01058016, "auxiliary_loss_mlp": 0.01054212, "balance_loss_clip": 1.02740204, "balance_loss_mlp": 1.01738143, "epoch": 0.46228768976401624, "flos": 24057740332800.0, "grad_norm": 1.6287726889385674, "language_loss": 0.8009401, "learning_rate": 2.3397081329213585e-06, "loss": 0.82206237, "num_input_tokens_seen": 165069370, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.40625, "step": 7689, "time_per_iteration": 3.6335947513580322 }, { "auxiliary_loss_clip": 0.01063244, "auxiliary_loss_mlp": 0.01051281, "balance_loss_clip": 1.02054858, "balance_loss_mlp": 1.01947033, "epoch": 0.4623478130166842, "flos": 26650180961280.0, "grad_norm": 2.3943845148046434, "language_loss": 0.57749492, "learning_rate": 2.339324323980964e-06, "loss": 0.5986402, "num_input_tokens_seen": 165089610, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4375, "step": 7690, "time_per_iteration": 2.470726251602173 }, { "auxiliary_loss_clip": 0.01059342, "auxiliary_loss_mlp": 0.01052902, "balance_loss_clip": 1.0259254, "balance_loss_mlp": 1.01798975, "epoch": 0.46240793626935217, "flos": 20557383131520.0, "grad_norm": 1.92670327346222, "language_loss": 0.84115297, "learning_rate": 2.3389405021733562e-06, "loss": 0.86227536, "num_input_tokens_seen": 165109050, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.4140625, "step": 7691, "time_per_iteration": 3.7698986530303955 }, { "auxiliary_loss_clip": 0.01060796, "auxiliary_loss_mlp": 0.01049967, "balance_loss_clip": 1.02293086, "balance_loss_mlp": 1.01965714, "epoch": 0.46246805952202014, "flos": 22454947503360.0, "grad_norm": 1.5368733351370816, "language_loss": 0.76215923, "learning_rate": 2.338556667513091e-06, "loss": 0.7832669, "num_input_tokens_seen": 165130130, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.41210938, "step": 7692, "time_per_iteration": 3.9141244888305664 }, { "auxiliary_loss_clip": 0.01062271, "auxiliary_loss_mlp": 0.01049786, "balance_loss_clip": 1.01993632, "balance_loss_mlp": 1.01964116, "epoch": 0.4625281827746881, "flos": 35039914738560.0, "grad_norm": 1.5133440548305996, "language_loss": 0.75203216, "learning_rate": 2.338172820014723e-06, "loss": 0.77315271, "num_input_tokens_seen": 165152685, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.42578125, "step": 7693, "time_per_iteration": 2.5443947315216064 }, { "auxiliary_loss_clip": 0.01063309, "auxiliary_loss_mlp": 0.0104736, "balance_loss_clip": 1.01914263, "balance_loss_mlp": 1.02149534, "epoch": 0.46258830602735607, "flos": 21067603873920.0, "grad_norm": 1.4585410889391353, "language_loss": 0.86225206, "learning_rate": 2.337788959692808e-06, "loss": 0.88335878, "num_input_tokens_seen": 165173315, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.41796875, "step": 7694, "time_per_iteration": 2.3847227096557617 }, { "auxiliary_loss_clip": 0.01061944, "auxiliary_loss_mlp": 0.010458, "balance_loss_clip": 1.0186919, "balance_loss_mlp": 1.02085781, "epoch": 0.46264842928002403, "flos": 26176025520000.0, "grad_norm": 1.9104033688223259, "language_loss": 0.79886782, "learning_rate": 2.337405086561902e-06, "loss": 0.81994528, "num_input_tokens_seen": 165192395, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.41015625, "step": 7695, "time_per_iteration": 2.454000234603882 }, { "auxiliary_loss_clip": 0.01061023, "auxiliary_loss_mlp": 0.01045295, "balance_loss_clip": 1.01898599, "balance_loss_mlp": 1.01997781, "epoch": 0.462708552532692, "flos": 16763266817280.0, "grad_norm": 1.97390896321549, "language_loss": 0.73658907, "learning_rate": 2.3370212006365606e-06, "loss": 0.75765216, "num_input_tokens_seen": 165211355, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.41015625, "step": 7696, "time_per_iteration": 2.381009340286255 }, { "auxiliary_loss_clip": 0.01062951, "auxiliary_loss_mlp": 0.01044103, "balance_loss_clip": 1.01612425, "balance_loss_mlp": 1.02176893, "epoch": 0.46276867578535996, "flos": 15559531361280.0, "grad_norm": 2.482232221662205, "language_loss": 0.70685434, "learning_rate": 2.3366373019313423e-06, "loss": 0.72792488, "num_input_tokens_seen": 165229380, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41210938, "step": 7697, "time_per_iteration": 2.3951914310455322 }, { "auxiliary_loss_clip": 0.01062014, "auxiliary_loss_mlp": 0.01039804, "balance_loss_clip": 1.0136373, "balance_loss_mlp": 1.02193022, "epoch": 0.462828799038028, "flos": 22414413548160.0, "grad_norm": 1.8223132282354166, "language_loss": 0.8530032, "learning_rate": 2.3362533904608025e-06, "loss": 0.87402141, "num_input_tokens_seen": 165247200, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40039062, "step": 7698, "time_per_iteration": 3.85278582572937 }, { "auxiliary_loss_clip": 0.01063638, "auxiliary_loss_mlp": 0.01038975, "balance_loss_clip": 1.01204515, "balance_loss_mlp": 1.02225828, "epoch": 0.46288892229069595, "flos": 21068511569280.0, "grad_norm": 2.0397901865622745, "language_loss": 0.72687161, "learning_rate": 2.335869466239502e-06, "loss": 0.74789774, "num_input_tokens_seen": 165265825, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.4140625, "step": 7699, "time_per_iteration": 2.3980605602264404 }, { "auxiliary_loss_clip": 0.01064537, "auxiliary_loss_mlp": 0.0104751, "balance_loss_clip": 1.01762414, "balance_loss_mlp": 1.02153587, "epoch": 0.4629490455433639, "flos": 23184562429440.0, "grad_norm": 1.8110953944847128, "language_loss": 0.72656739, "learning_rate": 2.335485529281996e-06, "loss": 0.74768794, "num_input_tokens_seen": 165284380, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.4296875, "step": 7700, "time_per_iteration": 2.402522563934326 }, { "auxiliary_loss_clip": 0.01061705, "auxiliary_loss_mlp": 0.01042833, "balance_loss_clip": 1.01427007, "balance_loss_mlp": 1.02075362, "epoch": 0.4630091687960319, "flos": 18834768915840.0, "grad_norm": 5.84540738630768, "language_loss": 0.73579967, "learning_rate": 2.3351015796028467e-06, "loss": 0.756845, "num_input_tokens_seen": 165300320, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41015625, "step": 7701, "time_per_iteration": 2.397334575653076 }, { "auxiliary_loss_clip": 0.01067739, "auxiliary_loss_mlp": 0.01044157, "balance_loss_clip": 1.01502204, "balance_loss_mlp": 1.02386963, "epoch": 0.46306929204869984, "flos": 38905568161920.0, "grad_norm": 3.797362018590303, "language_loss": 0.65673232, "learning_rate": 2.3347176172166114e-06, "loss": 0.67785126, "num_input_tokens_seen": 165318130, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.43945312, "step": 7702, "time_per_iteration": 2.5191195011138916 }, { "auxiliary_loss_clip": 0.01063107, "auxiliary_loss_mlp": 0.01039007, "balance_loss_clip": 1.01195908, "balance_loss_mlp": 1.02171767, "epoch": 0.4631294153013678, "flos": 19643217425280.0, "grad_norm": 2.020100199358354, "language_loss": 0.74693066, "learning_rate": 2.33433364213785e-06, "loss": 0.76795185, "num_input_tokens_seen": 165336225, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.4140625, "step": 7703, "time_per_iteration": 2.4120192527770996 }, { "auxiliary_loss_clip": 0.01064731, "auxiliary_loss_mlp": 0.01050493, "balance_loss_clip": 1.01871192, "balance_loss_mlp": 1.02170181, "epoch": 0.4631895385540358, "flos": 24607098576000.0, "grad_norm": 1.5794271138002782, "language_loss": 0.6981045, "learning_rate": 2.3339496543811243e-06, "loss": 0.7192567, "num_input_tokens_seen": 165355005, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.4296875, "step": 7704, "time_per_iteration": 2.403841972351074 }, { "auxiliary_loss_clip": 0.01063504, "auxiliary_loss_mlp": 0.01047391, "balance_loss_clip": 1.01880431, "balance_loss_mlp": 1.02057266, "epoch": 0.46324966180670374, "flos": 26318995004160.0, "grad_norm": 2.072470200963084, "language_loss": 0.82362366, "learning_rate": 2.3335656539609934e-06, "loss": 0.84473258, "num_input_tokens_seen": 165374910, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4296875, "step": 7705, "time_per_iteration": 2.4477264881134033 }, { "auxiliary_loss_clip": 0.0106365, "auxiliary_loss_mlp": 0.0104771, "balance_loss_clip": 1.01809812, "balance_loss_mlp": 1.02010608, "epoch": 0.4633097850593717, "flos": 19239621575040.0, "grad_norm": 2.416199490240897, "language_loss": 0.78609657, "learning_rate": 2.3331816408920196e-06, "loss": 0.80721009, "num_input_tokens_seen": 165392590, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.43554688, "step": 7706, "time_per_iteration": 2.363382577896118 }, { "auxiliary_loss_clip": 0.01059658, "auxiliary_loss_mlp": 0.01051072, "balance_loss_clip": 1.02215242, "balance_loss_mlp": 1.01965928, "epoch": 0.46336990831203967, "flos": 22782083742720.0, "grad_norm": 1.8433642272977389, "language_loss": 0.71520853, "learning_rate": 2.3327976151887654e-06, "loss": 0.73631585, "num_input_tokens_seen": 165411195, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.40039062, "step": 7707, "time_per_iteration": 2.410961627960205 }, { "auxiliary_loss_clip": 0.01063291, "auxiliary_loss_mlp": 0.01055759, "balance_loss_clip": 1.02438319, "balance_loss_mlp": 1.01885915, "epoch": 0.46343003156470763, "flos": 38209260539520.0, "grad_norm": 1.9605414941144852, "language_loss": 0.62307763, "learning_rate": 2.332413576865791e-06, "loss": 0.64426804, "num_input_tokens_seen": 165430150, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.4453125, "step": 7708, "time_per_iteration": 2.4999494552612305 }, { "auxiliary_loss_clip": 0.01062186, "auxiliary_loss_mlp": 0.01055731, "balance_loss_clip": 1.02600026, "balance_loss_mlp": 1.01976109, "epoch": 0.4634901548173756, "flos": 31937288279040.0, "grad_norm": 2.407911473774212, "language_loss": 0.78146595, "learning_rate": 2.3320295259376614e-06, "loss": 0.80264515, "num_input_tokens_seen": 165450595, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.42382812, "step": 7709, "time_per_iteration": 2.472043991088867 }, { "auxiliary_loss_clip": 0.01063394, "auxiliary_loss_mlp": 0.01054934, "balance_loss_clip": 1.02396369, "balance_loss_mlp": 1.01973319, "epoch": 0.46355027807004356, "flos": 20081551944960.0, "grad_norm": 1.7252107878954535, "language_loss": 0.78522819, "learning_rate": 2.3316454624189385e-06, "loss": 0.8064115, "num_input_tokens_seen": 165469515, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.4375, "step": 7710, "time_per_iteration": 2.374325752258301 }, { "auxiliary_loss_clip": 0.01064597, "auxiliary_loss_mlp": 0.01052578, "balance_loss_clip": 1.0206058, "balance_loss_mlp": 1.02014267, "epoch": 0.4636104013227116, "flos": 24060219039360.0, "grad_norm": 2.343754833877412, "language_loss": 0.7461611, "learning_rate": 2.3312613863241865e-06, "loss": 0.76733285, "num_input_tokens_seen": 165488125, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.4453125, "step": 7711, "time_per_iteration": 2.467454671859741 }, { "auxiliary_loss_clip": 0.01061971, "auxiliary_loss_mlp": 0.01056806, "balance_loss_clip": 1.02646708, "balance_loss_mlp": 1.02045226, "epoch": 0.46367052457537955, "flos": 23913514039680.0, "grad_norm": 1.2824809445630814, "language_loss": 0.72637415, "learning_rate": 2.33087729766797e-06, "loss": 0.74756193, "num_input_tokens_seen": 165509225, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.41601562, "step": 7712, "time_per_iteration": 2.4148452281951904 }, { "auxiliary_loss_clip": 0.01065983, "auxiliary_loss_mlp": 0.01065809, "balance_loss_clip": 1.03219175, "balance_loss_mlp": 1.02013326, "epoch": 0.4637306478280475, "flos": 26395314969600.0, "grad_norm": 1.6689263978641424, "language_loss": 0.74379748, "learning_rate": 2.3304931964648524e-06, "loss": 0.76511538, "num_input_tokens_seen": 165529945, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.45898438, "step": 7713, "time_per_iteration": 2.4980030059814453 }, { "auxiliary_loss_clip": 0.01063818, "auxiliary_loss_mlp": 0.01056634, "balance_loss_clip": 1.02416134, "balance_loss_mlp": 1.0190475, "epoch": 0.4637907710807155, "flos": 21979639987200.0, "grad_norm": 1.8692476723845421, "language_loss": 0.60609603, "learning_rate": 2.3301090827294e-06, "loss": 0.62730056, "num_input_tokens_seen": 165550690, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.44726562, "step": 7714, "time_per_iteration": 2.385563611984253 }, { "auxiliary_loss_clip": 0.01059588, "auxiliary_loss_mlp": 0.01056283, "balance_loss_clip": 1.02447832, "balance_loss_mlp": 1.01804137, "epoch": 0.46385089433338345, "flos": 12421468005120.0, "grad_norm": 4.365879384273636, "language_loss": 0.70870769, "learning_rate": 2.3297249564761784e-06, "loss": 0.72986639, "num_input_tokens_seen": 165567775, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.41601562, "step": 7715, "time_per_iteration": 2.385610342025757 }, { "auxiliary_loss_clip": 0.01066641, "auxiliary_loss_mlp": 0.01052573, "balance_loss_clip": 1.02064919, "balance_loss_mlp": 1.0205164, "epoch": 0.4639110175860514, "flos": 23914596291840.0, "grad_norm": 2.9956151696584064, "language_loss": 0.69622141, "learning_rate": 2.3293408177197527e-06, "loss": 0.71741354, "num_input_tokens_seen": 165587010, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.4609375, "step": 7716, "time_per_iteration": 2.3896665573120117 }, { "auxiliary_loss_clip": 0.01064866, "auxiliary_loss_mlp": 0.01051327, "balance_loss_clip": 1.01933169, "balance_loss_mlp": 1.019701, "epoch": 0.4639711408387194, "flos": 25299251746560.0, "grad_norm": 1.605371228142037, "language_loss": 0.82143563, "learning_rate": 2.328956666474691e-06, "loss": 0.8425976, "num_input_tokens_seen": 165607850, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.453125, "step": 7717, "time_per_iteration": 2.4502758979797363 }, { "auxiliary_loss_clip": 0.01065031, "auxiliary_loss_mlp": 0.01046272, "balance_loss_clip": 1.0146811, "balance_loss_mlp": 1.02015507, "epoch": 0.46403126409138734, "flos": 21210852648960.0, "grad_norm": 1.7405919977552307, "language_loss": 0.74390274, "learning_rate": 2.3285725027555593e-06, "loss": 0.76501578, "num_input_tokens_seen": 165627175, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.44921875, "step": 7718, "time_per_iteration": 2.378511905670166 }, { "auxiliary_loss_clip": 0.01062829, "auxiliary_loss_mlp": 0.01054168, "balance_loss_clip": 1.02357841, "balance_loss_mlp": 1.01924551, "epoch": 0.4640913873440553, "flos": 35844104062080.0, "grad_norm": 1.7298828723218822, "language_loss": 0.71640426, "learning_rate": 2.3281883265769254e-06, "loss": 0.73757422, "num_input_tokens_seen": 165648340, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.43554688, "step": 7719, "time_per_iteration": 2.5230586528778076 }, { "auxiliary_loss_clip": 0.01067119, "auxiliary_loss_mlp": 0.01051906, "balance_loss_clip": 1.01878953, "balance_loss_mlp": 1.02075315, "epoch": 0.46415151059672327, "flos": 19165361379840.0, "grad_norm": 2.5785232113690895, "language_loss": 0.87531042, "learning_rate": 2.327804137953357e-06, "loss": 0.89650071, "num_input_tokens_seen": 165667195, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.46289062, "step": 7720, "time_per_iteration": 2.363158702850342 }, { "auxiliary_loss_clip": 0.01014888, "auxiliary_loss_mlp": 0.0100487, "balance_loss_clip": 1.00231934, "balance_loss_mlp": 1.00667787, "epoch": 0.46421163384939124, "flos": 58909401820800.0, "grad_norm": 0.7286691291723035, "language_loss": 0.55092412, "learning_rate": 2.3274199368994226e-06, "loss": 0.57112163, "num_input_tokens_seen": 165726760, "router_z_loss_clip": 0.0255127, "router_z_loss_mlp": 0.08203125, "step": 7721, "time_per_iteration": 3.057159185409546 }, { "auxiliary_loss_clip": 0.01063572, "auxiliary_loss_mlp": 0.0105306, "balance_loss_clip": 1.02001524, "balance_loss_mlp": 1.0210247, "epoch": 0.4642717571020592, "flos": 20156300899200.0, "grad_norm": 2.2341439918799986, "language_loss": 0.8064093, "learning_rate": 2.3270357234296918e-06, "loss": 0.82757568, "num_input_tokens_seen": 165745005, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.42578125, "step": 7722, "time_per_iteration": 2.3673644065856934 }, { "auxiliary_loss_clip": 0.01066621, "auxiliary_loss_mlp": 0.01046684, "balance_loss_clip": 1.0127095, "balance_loss_mlp": 1.02177405, "epoch": 0.46433188035472717, "flos": 25045014159360.0, "grad_norm": 2.197113484047951, "language_loss": 0.78682667, "learning_rate": 2.3266514975587332e-06, "loss": 0.80795974, "num_input_tokens_seen": 165765750, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 0.44921875, "step": 7723, "time_per_iteration": 2.4375202655792236 }, { "auxiliary_loss_clip": 0.01063149, "auxiliary_loss_mlp": 0.010496, "balance_loss_clip": 1.01665092, "balance_loss_mlp": 1.01938748, "epoch": 0.4643920036073952, "flos": 28074357941760.0, "grad_norm": 1.5947324440446287, "language_loss": 0.69365025, "learning_rate": 2.326267259301118e-06, "loss": 0.71477771, "num_input_tokens_seen": 165787515, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.4375, "step": 7724, "time_per_iteration": 2.437103033065796 }, { "auxiliary_loss_clip": 0.01064569, "auxiliary_loss_mlp": 0.01046423, "balance_loss_clip": 1.01720488, "balance_loss_mlp": 1.0215894, "epoch": 0.46445212686006315, "flos": 18368363796480.0, "grad_norm": 2.153775602909375, "language_loss": 0.69129717, "learning_rate": 2.325883008671415e-06, "loss": 0.71240711, "num_input_tokens_seen": 165806675, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4296875, "step": 7725, "time_per_iteration": 2.404132127761841 }, { "auxiliary_loss_clip": 0.01061175, "auxiliary_loss_mlp": 0.01046078, "balance_loss_clip": 1.01756334, "balance_loss_mlp": 1.02018833, "epoch": 0.4645122501127311, "flos": 31720302979200.0, "grad_norm": 1.7071924832533731, "language_loss": 0.66462874, "learning_rate": 2.3254987456841955e-06, "loss": 0.68570125, "num_input_tokens_seen": 165829835, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41015625, "step": 7726, "time_per_iteration": 2.458108901977539 }, { "auxiliary_loss_clip": 0.01064198, "auxiliary_loss_mlp": 0.01057885, "balance_loss_clip": 1.02789223, "balance_loss_mlp": 1.02132297, "epoch": 0.4645723733653991, "flos": 23767681824000.0, "grad_norm": 1.8202246219584926, "language_loss": 0.75917971, "learning_rate": 2.3251144703540307e-06, "loss": 0.78040057, "num_input_tokens_seen": 165849380, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.4296875, "step": 7727, "time_per_iteration": 2.4059643745422363 }, { "auxiliary_loss_clip": 0.01064471, "auxiliary_loss_mlp": 0.01051123, "balance_loss_clip": 1.02112961, "balance_loss_mlp": 1.02076662, "epoch": 0.46463249661806705, "flos": 33144130846080.0, "grad_norm": 1.963373560009747, "language_loss": 0.80413675, "learning_rate": 2.3247301826954936e-06, "loss": 0.82529265, "num_input_tokens_seen": 165868620, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.4375, "step": 7728, "time_per_iteration": 2.4513487815856934 }, { "auxiliary_loss_clip": 0.01063946, "auxiliary_loss_mlp": 0.0105348, "balance_loss_clip": 1.02067339, "balance_loss_mlp": 1.02091074, "epoch": 0.464692619870735, "flos": 18295046208000.0, "grad_norm": 1.7256566729337512, "language_loss": 0.76694024, "learning_rate": 2.324345882723155e-06, "loss": 0.78811449, "num_input_tokens_seen": 165885915, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.4296875, "step": 7729, "time_per_iteration": 3.691314458847046 }, { "auxiliary_loss_clip": 0.01063767, "auxiliary_loss_mlp": 0.01058449, "balance_loss_clip": 1.02721667, "balance_loss_mlp": 1.02081048, "epoch": 0.464752743123403, "flos": 22636949754240.0, "grad_norm": 1.6607633307326528, "language_loss": 0.81447208, "learning_rate": 2.323961570451588e-06, "loss": 0.83569425, "num_input_tokens_seen": 165905465, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.4296875, "step": 7730, "time_per_iteration": 2.38246488571167 }, { "auxiliary_loss_clip": 0.01060608, "auxiliary_loss_mlp": 0.01059458, "balance_loss_clip": 1.02841544, "balance_loss_mlp": 1.01895368, "epoch": 0.46481286637607094, "flos": 20411097068160.0, "grad_norm": 1.566540440584837, "language_loss": 0.78260458, "learning_rate": 2.3235772458953655e-06, "loss": 0.80380523, "num_input_tokens_seen": 165924640, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.41796875, "step": 7731, "time_per_iteration": 3.768162727355957 }, { "auxiliary_loss_clip": 0.01060589, "auxiliary_loss_mlp": 0.01045408, "balance_loss_clip": 1.01870561, "balance_loss_mlp": 1.01968622, "epoch": 0.4648729896287389, "flos": 34274025043200.0, "grad_norm": 1.688239820417337, "language_loss": 0.66610718, "learning_rate": 2.323192909069061e-06, "loss": 0.68716717, "num_input_tokens_seen": 165945765, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.41015625, "step": 7732, "time_per_iteration": 3.838547945022583 }, { "auxiliary_loss_clip": 0.01063776, "auxiliary_loss_mlp": 0.01057396, "balance_loss_clip": 1.02156186, "balance_loss_mlp": 1.01896298, "epoch": 0.4649331128814069, "flos": 21320794120320.0, "grad_norm": 3.3408094665503043, "language_loss": 0.73912078, "learning_rate": 2.32280855998725e-06, "loss": 0.76033247, "num_input_tokens_seen": 165964025, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 0.44921875, "step": 7733, "time_per_iteration": 2.374772548675537 }, { "auxiliary_loss_clip": 0.01010184, "auxiliary_loss_mlp": 0.01015728, "balance_loss_clip": 1.01285481, "balance_loss_mlp": 1.00250912, "epoch": 0.46499323613407484, "flos": 58305754546560.0, "grad_norm": 1.2600022761213459, "language_loss": 0.52085185, "learning_rate": 2.3224241986645057e-06, "loss": 0.54111099, "num_input_tokens_seen": 166021950, "router_z_loss_clip": 0.02868652, "router_z_loss_mlp": 0.07666016, "step": 7734, "time_per_iteration": 2.9554405212402344 }, { "auxiliary_loss_clip": 0.01061063, "auxiliary_loss_mlp": 0.0104802, "balance_loss_clip": 1.0166204, "balance_loss_mlp": 1.01855171, "epoch": 0.4650533593867428, "flos": 10888885653120.0, "grad_norm": 2.037897062118562, "language_loss": 0.77063197, "learning_rate": 2.3220398251154035e-06, "loss": 0.79172277, "num_input_tokens_seen": 166039675, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.42578125, "step": 7735, "time_per_iteration": 2.331630229949951 }, { "auxiliary_loss_clip": 0.01060078, "auxiliary_loss_mlp": 0.01055329, "balance_loss_clip": 1.02519262, "balance_loss_mlp": 1.0188632, "epoch": 0.46511348263941077, "flos": 19973565509760.0, "grad_norm": 2.303358795571282, "language_loss": 0.71072978, "learning_rate": 2.321655439354519e-06, "loss": 0.73188382, "num_input_tokens_seen": 166057745, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.41210938, "step": 7736, "time_per_iteration": 2.404212474822998 }, { "auxiliary_loss_clip": 0.01059184, "auxiliary_loss_mlp": 0.0104864, "balance_loss_clip": 1.02118635, "balance_loss_mlp": 1.01852643, "epoch": 0.46517360589207873, "flos": 19677502247040.0, "grad_norm": 1.5584611025866622, "language_loss": 0.73010939, "learning_rate": 2.321271041396427e-06, "loss": 0.75118762, "num_input_tokens_seen": 166076440, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.40820312, "step": 7737, "time_per_iteration": 2.3608734607696533 }, { "auxiliary_loss_clip": 0.01063431, "auxiliary_loss_mlp": 0.01052656, "balance_loss_clip": 1.02129221, "balance_loss_mlp": 1.02106786, "epoch": 0.46523372914474675, "flos": 16871742011520.0, "grad_norm": 1.9097089420678, "language_loss": 0.84505951, "learning_rate": 2.3208866312557065e-06, "loss": 0.86622036, "num_input_tokens_seen": 166092520, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.42382812, "step": 7738, "time_per_iteration": 3.7910706996917725 }, { "auxiliary_loss_clip": 0.01010856, "auxiliary_loss_mlp": 0.01018455, "balance_loss_clip": 1.01577246, "balance_loss_mlp": 1.0031625, "epoch": 0.4652938523974147, "flos": 53435963243520.0, "grad_norm": 0.7670022416207088, "language_loss": 0.57852137, "learning_rate": 2.320502208946932e-06, "loss": 0.59881449, "num_input_tokens_seen": 166156285, "router_z_loss_clip": 0.02685547, "router_z_loss_mlp": 0.07714844, "step": 7739, "time_per_iteration": 3.0808160305023193 }, { "auxiliary_loss_clip": 0.01060667, "auxiliary_loss_mlp": 0.01057472, "balance_loss_clip": 1.02944613, "balance_loss_mlp": 1.0195415, "epoch": 0.4653539756500827, "flos": 15230405174400.0, "grad_norm": 1.7329680802560934, "language_loss": 0.86130267, "learning_rate": 2.3201177744846815e-06, "loss": 0.88248408, "num_input_tokens_seen": 166173455, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41015625, "step": 7740, "time_per_iteration": 2.343319892883301 }, { "auxiliary_loss_clip": 0.01062243, "auxiliary_loss_mlp": 0.01050439, "balance_loss_clip": 1.01954007, "balance_loss_mlp": 1.02116334, "epoch": 0.46541409890275065, "flos": 23731127763840.0, "grad_norm": 1.6479007503068148, "language_loss": 0.76378405, "learning_rate": 2.3197333278835327e-06, "loss": 0.78491092, "num_input_tokens_seen": 166194370, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.41015625, "step": 7741, "time_per_iteration": 2.4303243160247803 }, { "auxiliary_loss_clip": 0.01067115, "auxiliary_loss_mlp": 0.01050415, "balance_loss_clip": 1.02035046, "balance_loss_mlp": 1.02211905, "epoch": 0.4654742221554186, "flos": 20846359388160.0, "grad_norm": 2.1694290513296877, "language_loss": 0.81759471, "learning_rate": 2.319348869158064e-06, "loss": 0.83877003, "num_input_tokens_seen": 166213195, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.45117188, "step": 7742, "time_per_iteration": 2.389361619949341 }, { "auxiliary_loss_clip": 0.01065799, "auxiliary_loss_mlp": 0.01054087, "balance_loss_clip": 1.02297306, "balance_loss_mlp": 1.02185822, "epoch": 0.4655343454080866, "flos": 20703773928960.0, "grad_norm": 1.6044049654842119, "language_loss": 0.73701543, "learning_rate": 2.3189643983228555e-06, "loss": 0.75821435, "num_input_tokens_seen": 166231350, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.43945312, "step": 7743, "time_per_iteration": 2.4023377895355225 }, { "auxiliary_loss_clip": 0.01063307, "auxiliary_loss_mlp": 0.01046971, "balance_loss_clip": 1.01758623, "balance_loss_mlp": 1.02071619, "epoch": 0.46559446866075455, "flos": 18988840212480.0, "grad_norm": 2.0952100409196093, "language_loss": 0.72614896, "learning_rate": 2.318579915392483e-06, "loss": 0.74725169, "num_input_tokens_seen": 166250530, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.42578125, "step": 7744, "time_per_iteration": 2.384129047393799 }, { "auxiliary_loss_clip": 0.01062438, "auxiliary_loss_mlp": 0.01039248, "balance_loss_clip": 1.01482213, "balance_loss_mlp": 1.02252662, "epoch": 0.4656545919134225, "flos": 34494920415360.0, "grad_norm": 1.4858152459723244, "language_loss": 0.85575444, "learning_rate": 2.31819542038153e-06, "loss": 0.87677127, "num_input_tokens_seen": 166272545, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.3984375, "step": 7745, "time_per_iteration": 2.51816463470459 }, { "auxiliary_loss_clip": 0.01064294, "auxiliary_loss_mlp": 0.01043372, "balance_loss_clip": 1.01447582, "balance_loss_mlp": 1.02382565, "epoch": 0.4657147151660905, "flos": 24309569036160.0, "grad_norm": 1.5945846523857399, "language_loss": 0.73737228, "learning_rate": 2.317810913304574e-06, "loss": 0.7584489, "num_input_tokens_seen": 166292135, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.40429688, "step": 7746, "time_per_iteration": 2.4029455184936523 }, { "auxiliary_loss_clip": 0.01063042, "auxiliary_loss_mlp": 0.01045925, "balance_loss_clip": 1.01941335, "balance_loss_mlp": 1.02306592, "epoch": 0.46577483841875844, "flos": 58793038525440.0, "grad_norm": 1.505215186546429, "language_loss": 0.71050096, "learning_rate": 2.3174263941761963e-06, "loss": 0.73159063, "num_input_tokens_seen": 166316710, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40039062, "step": 7747, "time_per_iteration": 2.7494115829467773 }, { "auxiliary_loss_clip": 0.01061836, "auxiliary_loss_mlp": 0.01048918, "balance_loss_clip": 1.02170205, "balance_loss_mlp": 1.02175891, "epoch": 0.4658349616714264, "flos": 31320617201280.0, "grad_norm": 1.58263099299657, "language_loss": 0.68362588, "learning_rate": 2.317041863010978e-06, "loss": 0.70473343, "num_input_tokens_seen": 166338535, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40039062, "step": 7748, "time_per_iteration": 2.4643189907073975 }, { "auxiliary_loss_clip": 0.01065293, "auxiliary_loss_mlp": 0.01050598, "balance_loss_clip": 1.01950788, "balance_loss_mlp": 1.02212608, "epoch": 0.46589508492409437, "flos": 14859627868800.0, "grad_norm": 2.025016069604226, "language_loss": 0.6630739, "learning_rate": 2.3166573198235007e-06, "loss": 0.68423277, "num_input_tokens_seen": 166355540, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.43359375, "step": 7749, "time_per_iteration": 2.3929193019866943 }, { "auxiliary_loss_clip": 0.01065433, "auxiliary_loss_mlp": 0.01048806, "balance_loss_clip": 1.01975453, "balance_loss_mlp": 1.02284026, "epoch": 0.46595520817676234, "flos": 12895169598720.0, "grad_norm": 1.9886957123345985, "language_loss": 0.7543211, "learning_rate": 2.3162727646283456e-06, "loss": 0.77546352, "num_input_tokens_seen": 166372635, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.42578125, "step": 7750, "time_per_iteration": 2.3537771701812744 }, { "auxiliary_loss_clip": 0.01064314, "auxiliary_loss_mlp": 0.01041836, "balance_loss_clip": 1.01531219, "balance_loss_mlp": 1.02130795, "epoch": 0.46601533142943036, "flos": 32852780616960.0, "grad_norm": 2.020092971078369, "language_loss": 0.75695062, "learning_rate": 2.3158881974400963e-06, "loss": 0.7780121, "num_input_tokens_seen": 166393175, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.43164062, "step": 7751, "time_per_iteration": 2.493079423904419 }, { "auxiliary_loss_clip": 0.01065635, "auxiliary_loss_mlp": 0.01051298, "balance_loss_clip": 1.02188873, "balance_loss_mlp": 1.02236414, "epoch": 0.4660754546820983, "flos": 19966687971840.0, "grad_norm": 1.9821187078305789, "language_loss": 0.74980283, "learning_rate": 2.3155036182733345e-06, "loss": 0.77097207, "num_input_tokens_seen": 166408630, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.43359375, "step": 7752, "time_per_iteration": 2.348349094390869 }, { "auxiliary_loss_clip": 0.01063773, "auxiliary_loss_mlp": 0.0105543, "balance_loss_clip": 1.02659297, "balance_loss_mlp": 1.02042866, "epoch": 0.4661355779347663, "flos": 26686944489600.0, "grad_norm": 2.2402111051636253, "language_loss": 0.71250015, "learning_rate": 2.315119027142644e-06, "loss": 0.73369217, "num_input_tokens_seen": 166428170, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.43359375, "step": 7753, "time_per_iteration": 2.4525625705718994 }, { "auxiliary_loss_clip": 0.01059783, "auxiliary_loss_mlp": 0.01044328, "balance_loss_clip": 1.01849484, "balance_loss_mlp": 1.02001381, "epoch": 0.46619570118743425, "flos": 20958395541120.0, "grad_norm": 2.9785662837282287, "language_loss": 0.74098778, "learning_rate": 2.3147344240626076e-06, "loss": 0.76202893, "num_input_tokens_seen": 166446705, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3984375, "step": 7754, "time_per_iteration": 2.3645730018615723 }, { "auxiliary_loss_clip": 0.01065109, "auxiliary_loss_mlp": 0.01052445, "balance_loss_clip": 1.02321458, "balance_loss_mlp": 1.02102852, "epoch": 0.4662558244401022, "flos": 24424921768320.0, "grad_norm": 2.089828090416352, "language_loss": 0.80062532, "learning_rate": 2.3143498090478114e-06, "loss": 0.82180095, "num_input_tokens_seen": 166466750, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.44140625, "step": 7755, "time_per_iteration": 2.4428043365478516 }, { "auxiliary_loss_clip": 0.01057903, "auxiliary_loss_mlp": 0.01051967, "balance_loss_clip": 1.0245136, "balance_loss_mlp": 1.0179708, "epoch": 0.4663159476927702, "flos": 20594391039360.0, "grad_norm": 1.5952910574245276, "language_loss": 0.73218393, "learning_rate": 2.3139651821128382e-06, "loss": 0.75328261, "num_input_tokens_seen": 166485400, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.40039062, "step": 7756, "time_per_iteration": 2.3651952743530273 }, { "auxiliary_loss_clip": 0.01059477, "auxiliary_loss_mlp": 0.0104534, "balance_loss_clip": 1.0180409, "balance_loss_mlp": 1.01927757, "epoch": 0.46637607094543815, "flos": 25660812453120.0, "grad_norm": 1.6253400368963045, "language_loss": 0.78906095, "learning_rate": 2.313580543272274e-06, "loss": 0.81010914, "num_input_tokens_seen": 166505730, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40234375, "step": 7757, "time_per_iteration": 2.4442760944366455 }, { "auxiliary_loss_clip": 0.01057763, "auxiliary_loss_mlp": 0.01049617, "balance_loss_clip": 1.02173448, "balance_loss_mlp": 1.01706851, "epoch": 0.4664361941981061, "flos": 24272875330560.0, "grad_norm": 1.910035993011496, "language_loss": 0.67820388, "learning_rate": 2.313195892540705e-06, "loss": 0.6992777, "num_input_tokens_seen": 166523770, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40625, "step": 7758, "time_per_iteration": 2.4202370643615723 }, { "auxiliary_loss_clip": 0.01060399, "auxiliary_loss_mlp": 0.01050115, "balance_loss_clip": 1.02212429, "balance_loss_mlp": 1.01894784, "epoch": 0.4664963174507741, "flos": 18404882945280.0, "grad_norm": 1.7201659397462121, "language_loss": 0.76238203, "learning_rate": 2.3128112299327147e-06, "loss": 0.7834872, "num_input_tokens_seen": 166542935, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.4140625, "step": 7759, "time_per_iteration": 2.4078283309936523 }, { "auxiliary_loss_clip": 0.0105965, "auxiliary_loss_mlp": 0.01051883, "balance_loss_clip": 1.02490568, "balance_loss_mlp": 1.0189147, "epoch": 0.46655644070344204, "flos": 22454039808000.0, "grad_norm": 1.5974373699986775, "language_loss": 0.78132343, "learning_rate": 2.312426555462893e-06, "loss": 0.8024388, "num_input_tokens_seen": 166563935, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40820312, "step": 7760, "time_per_iteration": 2.4319489002227783 }, { "auxiliary_loss_clip": 0.01057208, "auxiliary_loss_mlp": 0.01046074, "balance_loss_clip": 1.01990795, "balance_loss_mlp": 1.01800978, "epoch": 0.46661656395611, "flos": 13807554825600.0, "grad_norm": 1.6902864471619454, "language_loss": 0.76029134, "learning_rate": 2.3120418691458237e-06, "loss": 0.78132421, "num_input_tokens_seen": 166582175, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.390625, "step": 7761, "time_per_iteration": 2.3493776321411133 }, { "auxiliary_loss_clip": 0.01061971, "auxiliary_loss_mlp": 0.01053256, "balance_loss_clip": 1.02195215, "balance_loss_mlp": 1.01953053, "epoch": 0.466676687208778, "flos": 21651107293440.0, "grad_norm": 1.6775834476930387, "language_loss": 0.79803693, "learning_rate": 2.3116571709960956e-06, "loss": 0.81918919, "num_input_tokens_seen": 166601870, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.42382812, "step": 7762, "time_per_iteration": 2.401060104370117 }, { "auxiliary_loss_clip": 0.010115, "auxiliary_loss_mlp": 0.01013796, "balance_loss_clip": 1.01104236, "balance_loss_mlp": 1.00389266, "epoch": 0.46673681046144594, "flos": 68530941653760.0, "grad_norm": 0.8213144378579171, "language_loss": 0.59910226, "learning_rate": 2.311272461028297e-06, "loss": 0.6193552, "num_input_tokens_seen": 166668960, "router_z_loss_clip": 0.02758789, "router_z_loss_mlp": 0.07617188, "step": 7763, "time_per_iteration": 3.1485788822174072 }, { "auxiliary_loss_clip": 0.0106285, "auxiliary_loss_mlp": 0.01045583, "balance_loss_clip": 1.01503003, "balance_loss_mlp": 1.01999879, "epoch": 0.46679693371411396, "flos": 15813559480320.0, "grad_norm": 2.2742388297086094, "language_loss": 0.80113053, "learning_rate": 2.3108877392570146e-06, "loss": 0.82221484, "num_input_tokens_seen": 166686110, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.4296875, "step": 7764, "time_per_iteration": 2.350856065750122 }, { "auxiliary_loss_clip": 0.01060644, "auxiliary_loss_mlp": 0.01042896, "balance_loss_clip": 1.01818383, "balance_loss_mlp": 1.02149522, "epoch": 0.4668570569667819, "flos": 18513602519040.0, "grad_norm": 1.7937899392023746, "language_loss": 0.73363829, "learning_rate": 2.310503005696839e-06, "loss": 0.75467366, "num_input_tokens_seen": 166703930, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.39257812, "step": 7765, "time_per_iteration": 2.376589775085449 }, { "auxiliary_loss_clip": 0.01062561, "auxiliary_loss_mlp": 0.01046108, "balance_loss_clip": 1.0172112, "balance_loss_mlp": 1.02068698, "epoch": 0.4669171802194499, "flos": 19205685866880.0, "grad_norm": 3.288266368161227, "language_loss": 0.79914534, "learning_rate": 2.3101182603623576e-06, "loss": 0.82023203, "num_input_tokens_seen": 166719940, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41796875, "step": 7766, "time_per_iteration": 2.352276563644409 }, { "auxiliary_loss_clip": 0.01061471, "auxiliary_loss_mlp": 0.01046643, "balance_loss_clip": 1.0189743, "balance_loss_mlp": 1.02104187, "epoch": 0.46697730347211786, "flos": 12275321587200.0, "grad_norm": 2.9695605022363205, "language_loss": 0.6661284, "learning_rate": 2.3097335032681607e-06, "loss": 0.68720949, "num_input_tokens_seen": 166738285, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40429688, "step": 7767, "time_per_iteration": 2.3489773273468018 }, { "auxiliary_loss_clip": 0.01065729, "auxiliary_loss_mlp": 0.01043733, "balance_loss_clip": 1.01625514, "balance_loss_mlp": 1.02327025, "epoch": 0.4670374267247858, "flos": 23585609750400.0, "grad_norm": 2.046740949835028, "language_loss": 0.75515103, "learning_rate": 2.3093487344288393e-06, "loss": 0.77624559, "num_input_tokens_seen": 166758170, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.42578125, "step": 7768, "time_per_iteration": 2.4242918491363525 }, { "auxiliary_loss_clip": 0.01064529, "auxiliary_loss_mlp": 0.010399, "balance_loss_clip": 1.01491404, "balance_loss_mlp": 1.02302027, "epoch": 0.4670975499774538, "flos": 15990359938560.0, "grad_norm": 1.7072447625060694, "language_loss": 0.71459734, "learning_rate": 2.308963953858982e-06, "loss": 0.7356416, "num_input_tokens_seen": 166775750, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.4140625, "step": 7769, "time_per_iteration": 3.642825126647949 }, { "auxiliary_loss_clip": 0.01062159, "auxiliary_loss_mlp": 0.01038752, "balance_loss_clip": 1.01300299, "balance_loss_mlp": 1.02084136, "epoch": 0.46715767323012175, "flos": 15376691237760.0, "grad_norm": 3.2356923136944884, "language_loss": 0.83394587, "learning_rate": 2.3085791615731803e-06, "loss": 0.85495496, "num_input_tokens_seen": 166791720, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.4140625, "step": 7770, "time_per_iteration": 3.704324960708618 }, { "auxiliary_loss_clip": 0.01016936, "auxiliary_loss_mlp": 0.01006257, "balance_loss_clip": 1.00364602, "balance_loss_mlp": 1.00895143, "epoch": 0.4672177964827897, "flos": 60249124488960.0, "grad_norm": 0.8004469076539501, "language_loss": 0.55693418, "learning_rate": 2.3081943575860265e-06, "loss": 0.57716614, "num_input_tokens_seen": 166856360, "router_z_loss_clip": 0.02612305, "router_z_loss_mlp": 0.08007812, "step": 7771, "time_per_iteration": 4.359293460845947 }, { "auxiliary_loss_clip": 0.01062585, "auxiliary_loss_mlp": 0.01048482, "balance_loss_clip": 1.02408004, "balance_loss_mlp": 1.02246726, "epoch": 0.4672779197354577, "flos": 27634906258560.0, "grad_norm": 2.2655550019433814, "language_loss": 0.67073905, "learning_rate": 2.3078095419121117e-06, "loss": 0.69184971, "num_input_tokens_seen": 166875925, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.40234375, "step": 7772, "time_per_iteration": 2.453249931335449 }, { "auxiliary_loss_clip": 0.01061236, "auxiliary_loss_mlp": 0.01050693, "balance_loss_clip": 1.02486038, "balance_loss_mlp": 1.02130628, "epoch": 0.46733804298812565, "flos": 31392922360320.0, "grad_norm": 2.4313814338856523, "language_loss": 0.64693952, "learning_rate": 2.3074247145660283e-06, "loss": 0.66805887, "num_input_tokens_seen": 166896520, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3984375, "step": 7773, "time_per_iteration": 2.4556703567504883 }, { "auxiliary_loss_clip": 0.0106096, "auxiliary_loss_mlp": 0.01051724, "balance_loss_clip": 1.02412701, "balance_loss_mlp": 1.01996052, "epoch": 0.4673981662407936, "flos": 19499584625280.0, "grad_norm": 1.7950671575375483, "language_loss": 0.81768447, "learning_rate": 2.3070398755623685e-06, "loss": 0.83881134, "num_input_tokens_seen": 166915370, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41015625, "step": 7774, "time_per_iteration": 2.395684242248535 }, { "auxiliary_loss_clip": 0.010629, "auxiliary_loss_mlp": 0.01051148, "balance_loss_clip": 1.0236938, "balance_loss_mlp": 1.02128112, "epoch": 0.4674582894934616, "flos": 20520794160000.0, "grad_norm": 1.7850005108627631, "language_loss": 0.79257977, "learning_rate": 2.306655024915726e-06, "loss": 0.81372023, "num_input_tokens_seen": 166934875, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41601562, "step": 7775, "time_per_iteration": 2.3757708072662354 }, { "auxiliary_loss_clip": 0.01060608, "auxiliary_loss_mlp": 0.01054369, "balance_loss_clip": 1.02488828, "balance_loss_mlp": 1.02059889, "epoch": 0.46751841274612954, "flos": 22089860749440.0, "grad_norm": 1.793971568873115, "language_loss": 0.70931977, "learning_rate": 2.306270162640694e-06, "loss": 0.73046958, "num_input_tokens_seen": 166954285, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.40039062, "step": 7776, "time_per_iteration": 2.4042041301727295 }, { "auxiliary_loss_clip": 0.0106286, "auxiliary_loss_mlp": 0.01052545, "balance_loss_clip": 1.02695072, "balance_loss_mlp": 1.02126336, "epoch": 0.46757853599879756, "flos": 26978853300480.0, "grad_norm": 1.389042603820999, "language_loss": 0.74478555, "learning_rate": 2.3058852887518678e-06, "loss": 0.76593965, "num_input_tokens_seen": 166975975, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.41601562, "step": 7777, "time_per_iteration": 2.415757656097412 }, { "auxiliary_loss_clip": 0.01060533, "auxiliary_loss_mlp": 0.01055913, "balance_loss_clip": 1.03000855, "balance_loss_mlp": 1.0186249, "epoch": 0.4676386592514655, "flos": 24132908223360.0, "grad_norm": 1.8890180696293926, "language_loss": 0.71421552, "learning_rate": 2.3055004032638394e-06, "loss": 0.73537993, "num_input_tokens_seen": 166996140, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.41796875, "step": 7778, "time_per_iteration": 3.861370325088501 }, { "auxiliary_loss_clip": 0.01060938, "auxiliary_loss_mlp": 0.01045741, "balance_loss_clip": 1.01723814, "balance_loss_mlp": 1.01942217, "epoch": 0.4676987825041335, "flos": 25482545717760.0, "grad_norm": 1.6841242781431371, "language_loss": 0.74807322, "learning_rate": 2.305115506191206e-06, "loss": 0.76914001, "num_input_tokens_seen": 167016105, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4140625, "step": 7779, "time_per_iteration": 2.423671007156372 }, { "auxiliary_loss_clip": 0.01059824, "auxiliary_loss_mlp": 0.01053717, "balance_loss_clip": 1.02561975, "balance_loss_mlp": 1.01956666, "epoch": 0.46775890575680146, "flos": 21944203090560.0, "grad_norm": 1.4726677163064938, "language_loss": 0.73534739, "learning_rate": 2.304730597548562e-06, "loss": 0.75648272, "num_input_tokens_seen": 167036185, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40234375, "step": 7780, "time_per_iteration": 2.415987014770508 }, { "auxiliary_loss_clip": 0.01061991, "auxiliary_loss_mlp": 0.01047539, "balance_loss_clip": 1.01923823, "balance_loss_mlp": 1.01835346, "epoch": 0.4678190290094694, "flos": 25227225878400.0, "grad_norm": 2.1084711443683033, "language_loss": 0.74959117, "learning_rate": 2.3043456773505023e-06, "loss": 0.77068651, "num_input_tokens_seen": 167054515, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.4375, "step": 7781, "time_per_iteration": 2.3927628993988037 }, { "auxiliary_loss_clip": 0.01059705, "auxiliary_loss_mlp": 0.01045885, "balance_loss_clip": 1.01684546, "balance_loss_mlp": 1.01778555, "epoch": 0.4678791522621374, "flos": 32267042870400.0, "grad_norm": 1.9137160181325696, "language_loss": 0.63836265, "learning_rate": 2.3039607456116252e-06, "loss": 0.65941852, "num_input_tokens_seen": 167077245, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.41796875, "step": 7782, "time_per_iteration": 2.4727392196655273 }, { "auxiliary_loss_clip": 0.0106424, "auxiliary_loss_mlp": 0.01047014, "balance_loss_clip": 1.01812983, "balance_loss_mlp": 1.01988888, "epoch": 0.46793927551480535, "flos": 27045432996480.0, "grad_norm": 2.0700181493198593, "language_loss": 0.64627683, "learning_rate": 2.3035758023465254e-06, "loss": 0.66738939, "num_input_tokens_seen": 167097235, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.44335938, "step": 7783, "time_per_iteration": 2.4182987213134766 }, { "auxiliary_loss_clip": 0.01064562, "auxiliary_loss_mlp": 0.01050912, "balance_loss_clip": 1.02090693, "balance_loss_mlp": 1.02011967, "epoch": 0.4679993987674733, "flos": 17456432417280.0, "grad_norm": 2.474503611536118, "language_loss": 0.69663328, "learning_rate": 2.303190847569801e-06, "loss": 0.71778804, "num_input_tokens_seen": 167113155, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.44335938, "step": 7784, "time_per_iteration": 2.3916144371032715 }, { "auxiliary_loss_clip": 0.01061346, "auxiliary_loss_mlp": 0.01044983, "balance_loss_clip": 1.01763642, "balance_loss_mlp": 1.02065396, "epoch": 0.4680595220201413, "flos": 17164174492800.0, "grad_norm": 2.7222036535210345, "language_loss": 0.85957915, "learning_rate": 2.3028058812960497e-06, "loss": 0.88064241, "num_input_tokens_seen": 167131765, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40625, "step": 7785, "time_per_iteration": 2.363743305206299 }, { "auxiliary_loss_clip": 0.01064041, "auxiliary_loss_mlp": 0.01050585, "balance_loss_clip": 1.02103329, "balance_loss_mlp": 1.02117395, "epoch": 0.46811964527280925, "flos": 11326801236480.0, "grad_norm": 1.9248623707658652, "language_loss": 0.78930551, "learning_rate": 2.3024209035398678e-06, "loss": 0.81045175, "num_input_tokens_seen": 167149030, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4296875, "step": 7786, "time_per_iteration": 2.4428436756134033 }, { "auxiliary_loss_clip": 0.01058889, "auxiliary_loss_mlp": 0.01037003, "balance_loss_clip": 1.01221967, "balance_loss_mlp": 1.01956677, "epoch": 0.4681797685254772, "flos": 24277693098240.0, "grad_norm": 1.7604738604904435, "language_loss": 0.75292897, "learning_rate": 2.302035914315856e-06, "loss": 0.77388787, "num_input_tokens_seen": 167167375, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.39453125, "step": 7787, "time_per_iteration": 2.3883590698242188 }, { "auxiliary_loss_clip": 0.0106369, "auxiliary_loss_mlp": 0.01048068, "balance_loss_clip": 1.02037585, "balance_loss_mlp": 1.0221467, "epoch": 0.4682398917781452, "flos": 31649010249600.0, "grad_norm": 1.67846293910167, "language_loss": 0.6696822, "learning_rate": 2.3016509136386116e-06, "loss": 0.69079977, "num_input_tokens_seen": 167188065, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41601562, "step": 7788, "time_per_iteration": 2.4894304275512695 }, { "auxiliary_loss_clip": 0.01062401, "auxiliary_loss_mlp": 0.01040737, "balance_loss_clip": 1.01548874, "balance_loss_mlp": 1.02156472, "epoch": 0.46830001503081314, "flos": 28109515547520.0, "grad_norm": 2.7033969479384563, "language_loss": 0.65038407, "learning_rate": 2.3012659015227343e-06, "loss": 0.67141545, "num_input_tokens_seen": 167209675, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.40820312, "step": 7789, "time_per_iteration": 2.4446873664855957 }, { "auxiliary_loss_clip": 0.01019191, "auxiliary_loss_mlp": 0.01012611, "balance_loss_clip": 1.0099889, "balance_loss_mlp": 1.01134038, "epoch": 0.4683601382834811, "flos": 57878661484800.0, "grad_norm": 0.7014177006080612, "language_loss": 0.6190331, "learning_rate": 2.300880877982825e-06, "loss": 0.63935113, "num_input_tokens_seen": 167273940, "router_z_loss_clip": 0.02624512, "router_z_loss_mlp": 0.078125, "step": 7790, "time_per_iteration": 3.112109661102295 }, { "auxiliary_loss_clip": 0.01063111, "auxiliary_loss_mlp": 0.01039682, "balance_loss_clip": 1.01455307, "balance_loss_mlp": 1.02358985, "epoch": 0.46842026153614913, "flos": 21870850590720.0, "grad_norm": 1.6788595373636737, "language_loss": 0.80050588, "learning_rate": 2.3004958430334808e-06, "loss": 0.8215338, "num_input_tokens_seen": 167292730, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.39453125, "step": 7791, "time_per_iteration": 2.3955464363098145 }, { "auxiliary_loss_clip": 0.01063238, "auxiliary_loss_mlp": 0.01046635, "balance_loss_clip": 1.01889455, "balance_loss_mlp": 1.02193964, "epoch": 0.4684803847888171, "flos": 24899635791360.0, "grad_norm": 1.5662915910041637, "language_loss": 0.75373697, "learning_rate": 2.3001107966893052e-06, "loss": 0.77483571, "num_input_tokens_seen": 167313460, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41210938, "step": 7792, "time_per_iteration": 2.4700374603271484 }, { "auxiliary_loss_clip": 0.01060798, "auxiliary_loss_mlp": 0.01038489, "balance_loss_clip": 1.01338339, "balance_loss_mlp": 1.02095056, "epoch": 0.46854050804148506, "flos": 26250425360640.0, "grad_norm": 2.2396379676281657, "language_loss": 0.68921649, "learning_rate": 2.299725738964898e-06, "loss": 0.71020937, "num_input_tokens_seen": 167335385, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3984375, "step": 7793, "time_per_iteration": 2.4273033142089844 }, { "auxiliary_loss_clip": 0.01062898, "auxiliary_loss_mlp": 0.01040166, "balance_loss_clip": 1.01478648, "balance_loss_mlp": 1.02228665, "epoch": 0.468600631294153, "flos": 21578732311680.0, "grad_norm": 1.687992561788393, "language_loss": 0.75001907, "learning_rate": 2.2993406698748607e-06, "loss": 0.77104974, "num_input_tokens_seen": 167353625, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40625, "step": 7794, "time_per_iteration": 2.418278455734253 }, { "auxiliary_loss_clip": 0.01064155, "auxiliary_loss_mlp": 0.01042154, "balance_loss_clip": 1.01483154, "balance_loss_mlp": 1.02222824, "epoch": 0.468660754546821, "flos": 25884430911360.0, "grad_norm": 2.2692688428806975, "language_loss": 0.64738476, "learning_rate": 2.2989555894337953e-06, "loss": 0.66844779, "num_input_tokens_seen": 167374565, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41796875, "step": 7795, "time_per_iteration": 2.420742988586426 }, { "auxiliary_loss_clip": 0.01061066, "auxiliary_loss_mlp": 0.01042541, "balance_loss_clip": 1.01582646, "balance_loss_mlp": 1.02060652, "epoch": 0.46872087779948896, "flos": 35473710781440.0, "grad_norm": 1.6186563108088952, "language_loss": 0.69756842, "learning_rate": 2.298570497656304e-06, "loss": 0.71860445, "num_input_tokens_seen": 167395010, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.40625, "step": 7796, "time_per_iteration": 2.5142431259155273 }, { "auxiliary_loss_clip": 0.01063076, "auxiliary_loss_mlp": 0.01043243, "balance_loss_clip": 1.01808953, "balance_loss_mlp": 1.02168667, "epoch": 0.4687810010521569, "flos": 26395210235520.0, "grad_norm": 1.6388877722670203, "language_loss": 0.71434015, "learning_rate": 2.2981853945569894e-06, "loss": 0.73540336, "num_input_tokens_seen": 167415285, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.4140625, "step": 7797, "time_per_iteration": 2.4260106086730957 }, { "auxiliary_loss_clip": 0.01063726, "auxiliary_loss_mlp": 0.01049316, "balance_loss_clip": 1.02040744, "balance_loss_mlp": 1.02202451, "epoch": 0.4688411243048249, "flos": 19971785030400.0, "grad_norm": 1.954974415853462, "language_loss": 0.68431717, "learning_rate": 2.297800280150454e-06, "loss": 0.70544755, "num_input_tokens_seen": 167432405, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41796875, "step": 7798, "time_per_iteration": 2.4706454277038574 }, { "auxiliary_loss_clip": 0.01018268, "auxiliary_loss_mlp": 0.01002047, "balance_loss_clip": 0.99954391, "balance_loss_mlp": 1.01013756, "epoch": 0.46890124755749285, "flos": 63973728552960.0, "grad_norm": 0.9294963703961346, "language_loss": 0.64680713, "learning_rate": 2.2974151544513033e-06, "loss": 0.66701031, "num_input_tokens_seen": 167499365, "router_z_loss_clip": 0.02502441, "router_z_loss_mlp": 0.08105469, "step": 7799, "time_per_iteration": 3.1854240894317627 }, { "auxiliary_loss_clip": 0.01061554, "auxiliary_loss_mlp": 0.01043608, "balance_loss_clip": 1.01590335, "balance_loss_mlp": 1.02108741, "epoch": 0.4689613708101608, "flos": 23767856380800.0, "grad_norm": 1.3337267406610476, "language_loss": 0.73268652, "learning_rate": 2.2970300174741395e-06, "loss": 0.75373811, "num_input_tokens_seen": 167520390, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40429688, "step": 7800, "time_per_iteration": 2.4434688091278076 }, { "auxiliary_loss_clip": 0.01058901, "auxiliary_loss_mlp": 0.01044107, "balance_loss_clip": 1.01920414, "balance_loss_mlp": 1.0198276, "epoch": 0.4690214940628288, "flos": 24787599638400.0, "grad_norm": 3.1813351888853427, "language_loss": 0.73504543, "learning_rate": 2.296644869233568e-06, "loss": 0.7560755, "num_input_tokens_seen": 167539865, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.390625, "step": 7801, "time_per_iteration": 2.417325496673584 }, { "auxiliary_loss_clip": 0.01065723, "auxiliary_loss_mlp": 0.01052232, "balance_loss_clip": 1.02077246, "balance_loss_mlp": 1.02125835, "epoch": 0.46908161731549675, "flos": 18076350251520.0, "grad_norm": 1.8351219924079774, "language_loss": 0.65107512, "learning_rate": 2.2962597097441936e-06, "loss": 0.67225468, "num_input_tokens_seen": 167558190, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.4453125, "step": 7802, "time_per_iteration": 2.3957178592681885 }, { "auxiliary_loss_clip": 0.01061129, "auxiliary_loss_mlp": 0.01052377, "balance_loss_clip": 1.02505374, "balance_loss_mlp": 1.0185926, "epoch": 0.4691417405681647, "flos": 25702149369600.0, "grad_norm": 2.125524417458925, "language_loss": 0.74776351, "learning_rate": 2.2958745390206206e-06, "loss": 0.76889861, "num_input_tokens_seen": 167577685, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.42578125, "step": 7803, "time_per_iteration": 2.4061081409454346 }, { "auxiliary_loss_clip": 0.01058735, "auxiliary_loss_mlp": 0.01057211, "balance_loss_clip": 1.03152144, "balance_loss_mlp": 1.0185411, "epoch": 0.46920186382083273, "flos": 17456083303680.0, "grad_norm": 2.6816017453627414, "language_loss": 0.79472256, "learning_rate": 2.2954893570774558e-06, "loss": 0.81588203, "num_input_tokens_seen": 167596390, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40234375, "step": 7804, "time_per_iteration": 2.378934383392334 }, { "auxiliary_loss_clip": 0.01059265, "auxiliary_loss_mlp": 0.01059238, "balance_loss_clip": 1.03160524, "balance_loss_mlp": 1.01773214, "epoch": 0.4692619870735007, "flos": 20338407884160.0, "grad_norm": 1.9509879593078754, "language_loss": 0.78403449, "learning_rate": 2.295104163929305e-06, "loss": 0.80521953, "num_input_tokens_seen": 167614980, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41601562, "step": 7805, "time_per_iteration": 2.363140106201172 }, { "auxiliary_loss_clip": 0.01064621, "auxiliary_loss_mlp": 0.01053264, "balance_loss_clip": 1.02397394, "balance_loss_mlp": 1.02036357, "epoch": 0.46932211032616866, "flos": 29495288165760.0, "grad_norm": 1.61192931744742, "language_loss": 0.83707368, "learning_rate": 2.2947189595907742e-06, "loss": 0.85825253, "num_input_tokens_seen": 167635895, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.44140625, "step": 7806, "time_per_iteration": 2.4540414810180664 }, { "auxiliary_loss_clip": 0.01061937, "auxiliary_loss_mlp": 0.01060603, "balance_loss_clip": 1.03167117, "balance_loss_mlp": 1.01981521, "epoch": 0.4693822335788366, "flos": 36209749397760.0, "grad_norm": 3.237683308904276, "language_loss": 0.78777957, "learning_rate": 2.294333744076472e-06, "loss": 0.8090049, "num_input_tokens_seen": 167657440, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.421875, "step": 7807, "time_per_iteration": 2.4886929988861084 }, { "auxiliary_loss_clip": 0.01061042, "auxiliary_loss_mlp": 0.01048526, "balance_loss_clip": 1.01966572, "balance_loss_mlp": 1.01963115, "epoch": 0.4694423568315046, "flos": 20337954036480.0, "grad_norm": 2.630599248059758, "language_loss": 0.51941347, "learning_rate": 2.2939485174010035e-06, "loss": 0.54050916, "num_input_tokens_seen": 167675025, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4140625, "step": 7808, "time_per_iteration": 3.5892951488494873 }, { "auxiliary_loss_clip": 0.01013969, "auxiliary_loss_mlp": 0.01011676, "balance_loss_clip": 1.00925624, "balance_loss_mlp": 1.00530887, "epoch": 0.46950248008417256, "flos": 64323489219840.0, "grad_norm": 0.7946352889237837, "language_loss": 0.57825267, "learning_rate": 2.293563279578978e-06, "loss": 0.59850907, "num_input_tokens_seen": 167729635, "router_z_loss_clip": 0.02416992, "router_z_loss_mlp": 0.08691406, "step": 7809, "time_per_iteration": 2.869509220123291 }, { "auxiliary_loss_clip": 0.01064775, "auxiliary_loss_mlp": 0.01057394, "balance_loss_clip": 1.02699566, "balance_loss_mlp": 1.02146876, "epoch": 0.4695626033368405, "flos": 19199331999360.0, "grad_norm": 1.9682407570279368, "language_loss": 0.73312759, "learning_rate": 2.2931780306250045e-06, "loss": 0.75434929, "num_input_tokens_seen": 167745135, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.43359375, "step": 7810, "time_per_iteration": 5.289606094360352 }, { "auxiliary_loss_clip": 0.01063802, "auxiliary_loss_mlp": 0.01048677, "balance_loss_clip": 1.01970887, "balance_loss_mlp": 1.02234745, "epoch": 0.4696227265895085, "flos": 23001338280960.0, "grad_norm": 2.4616524056753937, "language_loss": 0.82589269, "learning_rate": 2.29279277055369e-06, "loss": 0.84701741, "num_input_tokens_seen": 167763875, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4140625, "step": 7811, "time_per_iteration": 2.4006187915802 }, { "auxiliary_loss_clip": 0.01064951, "auxiliary_loss_mlp": 0.01049723, "balance_loss_clip": 1.02125561, "balance_loss_mlp": 1.02149642, "epoch": 0.46968284984217645, "flos": 21869803249920.0, "grad_norm": 1.9474656082962947, "language_loss": 0.81178689, "learning_rate": 2.292407499379644e-06, "loss": 0.83293366, "num_input_tokens_seen": 167784895, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.43554688, "step": 7812, "time_per_iteration": 2.45174503326416 }, { "auxiliary_loss_clip": 0.01063853, "auxiliary_loss_mlp": 0.01040811, "balance_loss_clip": 1.0153594, "balance_loss_mlp": 1.02306271, "epoch": 0.4697429730948444, "flos": 19973949534720.0, "grad_norm": 1.9192964158845418, "language_loss": 0.75676513, "learning_rate": 2.292022217117477e-06, "loss": 0.77781177, "num_input_tokens_seen": 167803185, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40820312, "step": 7813, "time_per_iteration": 2.3654377460479736 }, { "auxiliary_loss_clip": 0.01064599, "auxiliary_loss_mlp": 0.01041836, "balance_loss_clip": 1.01361895, "balance_loss_mlp": 1.02255487, "epoch": 0.4698030963475124, "flos": 15155376929280.0, "grad_norm": 2.4554212054812736, "language_loss": 0.85337442, "learning_rate": 2.291636923781798e-06, "loss": 0.87443876, "num_input_tokens_seen": 167816550, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41992188, "step": 7814, "time_per_iteration": 2.387277364730835 }, { "auxiliary_loss_clip": 0.01064851, "auxiliary_loss_mlp": 0.01046453, "balance_loss_clip": 1.01942801, "balance_loss_mlp": 1.02315056, "epoch": 0.46986321960018035, "flos": 15150489338880.0, "grad_norm": 2.3330959518573184, "language_loss": 0.82538867, "learning_rate": 2.291251619387217e-06, "loss": 0.84650171, "num_input_tokens_seen": 167831845, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.41796875, "step": 7815, "time_per_iteration": 2.3585915565490723 }, { "auxiliary_loss_clip": 0.01067831, "auxiliary_loss_mlp": 0.01046913, "balance_loss_clip": 1.01594281, "balance_loss_mlp": 1.02403307, "epoch": 0.4699233428528483, "flos": 23107893350400.0, "grad_norm": 1.886207091183442, "language_loss": 0.79008448, "learning_rate": 2.2908663039483468e-06, "loss": 0.81123191, "num_input_tokens_seen": 167850360, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.4375, "step": 7816, "time_per_iteration": 2.441071033477783 }, { "auxiliary_loss_clip": 0.01020418, "auxiliary_loss_mlp": 0.01009001, "balance_loss_clip": 1.00639009, "balance_loss_mlp": 1.01148772, "epoch": 0.46998346610551633, "flos": 68101998289920.0, "grad_norm": 0.8515240677606969, "language_loss": 0.5915395, "learning_rate": 2.290480977479796e-06, "loss": 0.61183369, "num_input_tokens_seen": 167908660, "router_z_loss_clip": 0.02612305, "router_z_loss_mlp": 0.08935547, "step": 7817, "time_per_iteration": 3.0076546669006348 }, { "auxiliary_loss_clip": 0.01065779, "auxiliary_loss_mlp": 0.01041136, "balance_loss_clip": 1.01421833, "balance_loss_mlp": 1.02519393, "epoch": 0.4700435893581843, "flos": 24128439569280.0, "grad_norm": 1.7806709751254075, "language_loss": 0.80183744, "learning_rate": 2.2900956399961775e-06, "loss": 0.82290661, "num_input_tokens_seen": 167927905, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40625, "step": 7818, "time_per_iteration": 3.90934419631958 }, { "auxiliary_loss_clip": 0.01064948, "auxiliary_loss_mlp": 0.01044695, "balance_loss_clip": 1.01589417, "balance_loss_mlp": 1.022156, "epoch": 0.47010371261085226, "flos": 20149667740800.0, "grad_norm": 1.8004634963754325, "language_loss": 0.85007942, "learning_rate": 2.289710291512104e-06, "loss": 0.87117589, "num_input_tokens_seen": 167945995, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.42773438, "step": 7819, "time_per_iteration": 2.390690565109253 }, { "auxiliary_loss_clip": 0.01067135, "auxiliary_loss_mlp": 0.01052719, "balance_loss_clip": 1.02087855, "balance_loss_mlp": 1.0218389, "epoch": 0.47016383586352023, "flos": 15121301575680.0, "grad_norm": 7.837933408911312, "language_loss": 0.7843461, "learning_rate": 2.289324932042186e-06, "loss": 0.80554473, "num_input_tokens_seen": 167963380, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.453125, "step": 7820, "time_per_iteration": 2.373380184173584 }, { "auxiliary_loss_clip": 0.01064066, "auxiliary_loss_mlp": 0.01046896, "balance_loss_clip": 1.01929903, "balance_loss_mlp": 1.02341878, "epoch": 0.4702239591161882, "flos": 13552130252160.0, "grad_norm": 2.130901903397475, "language_loss": 0.7666679, "learning_rate": 2.288939561601039e-06, "loss": 0.78777748, "num_input_tokens_seen": 167981740, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.40625, "step": 7821, "time_per_iteration": 2.390869140625 }, { "auxiliary_loss_clip": 0.01063819, "auxiliary_loss_mlp": 0.01044527, "balance_loss_clip": 1.0178833, "balance_loss_mlp": 1.0225215, "epoch": 0.47028408236885616, "flos": 24275458771200.0, "grad_norm": 2.67522461115942, "language_loss": 0.90054327, "learning_rate": 2.2885541802032746e-06, "loss": 0.92162675, "num_input_tokens_seen": 167999380, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.4140625, "step": 7822, "time_per_iteration": 2.4590673446655273 }, { "auxiliary_loss_clip": 0.01063074, "auxiliary_loss_mlp": 0.01052254, "balance_loss_clip": 1.02428699, "balance_loss_mlp": 1.02150655, "epoch": 0.4703442056215241, "flos": 22855820267520.0, "grad_norm": 1.4774753809896473, "language_loss": 0.80569232, "learning_rate": 2.2881687878635055e-06, "loss": 0.82684559, "num_input_tokens_seen": 168018395, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41601562, "step": 7823, "time_per_iteration": 2.41172456741333 }, { "auxiliary_loss_clip": 0.01016544, "auxiliary_loss_mlp": 0.01016124, "balance_loss_clip": 1.01346612, "balance_loss_mlp": 1.0076133, "epoch": 0.4704043288741921, "flos": 69236535697920.0, "grad_norm": 0.7068002695197454, "language_loss": 0.56792927, "learning_rate": 2.2877833845963487e-06, "loss": 0.58825588, "num_input_tokens_seen": 168084080, "router_z_loss_clip": 0.02661133, "router_z_loss_mlp": 0.08886719, "step": 7824, "time_per_iteration": 3.0900912284851074 }, { "auxiliary_loss_clip": 0.01063529, "auxiliary_loss_mlp": 0.01055299, "balance_loss_clip": 1.02549708, "balance_loss_mlp": 1.02036715, "epoch": 0.47046445212686006, "flos": 18040110393600.0, "grad_norm": 1.6573664019394294, "language_loss": 0.81818438, "learning_rate": 2.2873979704164157e-06, "loss": 0.83937263, "num_input_tokens_seen": 168101555, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.43164062, "step": 7825, "time_per_iteration": 2.358977794647217 }, { "auxiliary_loss_clip": 0.01062556, "auxiliary_loss_mlp": 0.01051842, "balance_loss_clip": 1.02320826, "balance_loss_mlp": 1.02103484, "epoch": 0.470524575379528, "flos": 23950312479360.0, "grad_norm": 1.9895817876258772, "language_loss": 0.68394744, "learning_rate": 2.287012545338324e-06, "loss": 0.70509142, "num_input_tokens_seen": 168121530, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.4140625, "step": 7826, "time_per_iteration": 2.4384548664093018 }, { "auxiliary_loss_clip": 0.01063773, "auxiliary_loss_mlp": 0.01054307, "balance_loss_clip": 1.02498198, "balance_loss_mlp": 1.02051139, "epoch": 0.470584698632196, "flos": 18112590109440.0, "grad_norm": 2.3312876414048564, "language_loss": 0.84191203, "learning_rate": 2.2866271093766877e-06, "loss": 0.86309278, "num_input_tokens_seen": 168140335, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.43164062, "step": 7827, "time_per_iteration": 2.361374855041504 }, { "auxiliary_loss_clip": 0.01012544, "auxiliary_loss_mlp": 0.01016209, "balance_loss_clip": 1.01372898, "balance_loss_mlp": 1.00412416, "epoch": 0.47064482188486395, "flos": 57249143026560.0, "grad_norm": 0.8092041398675978, "language_loss": 0.55681646, "learning_rate": 2.286241662546122e-06, "loss": 0.57710397, "num_input_tokens_seen": 168200535, "router_z_loss_clip": 0.02478027, "router_z_loss_mlp": 0.08398438, "step": 7828, "time_per_iteration": 3.0136284828186035 }, { "auxiliary_loss_clip": 0.01059896, "auxiliary_loss_mlp": 0.01047645, "balance_loss_clip": 1.02113247, "balance_loss_mlp": 1.01968753, "epoch": 0.4707049451375319, "flos": 17894103621120.0, "grad_norm": 2.579353567452235, "language_loss": 0.82184583, "learning_rate": 2.285856204861245e-06, "loss": 0.84292126, "num_input_tokens_seen": 168219610, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40234375, "step": 7829, "time_per_iteration": 2.364840030670166 }, { "auxiliary_loss_clip": 0.01060694, "auxiliary_loss_mlp": 0.010406, "balance_loss_clip": 1.01429033, "balance_loss_mlp": 1.02002835, "epoch": 0.47076506839019994, "flos": 25231380330240.0, "grad_norm": 1.267241125002107, "language_loss": 0.76309586, "learning_rate": 2.2854707363366703e-06, "loss": 0.78410882, "num_input_tokens_seen": 168242505, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40625, "step": 7830, "time_per_iteration": 2.462272882461548 }, { "auxiliary_loss_clip": 0.0106076, "auxiliary_loss_mlp": 0.01043897, "balance_loss_clip": 1.01388013, "balance_loss_mlp": 1.01971281, "epoch": 0.4708251916428679, "flos": 13478847575040.0, "grad_norm": 1.8626475272001433, "language_loss": 0.79476857, "learning_rate": 2.2850852569870177e-06, "loss": 0.81581509, "num_input_tokens_seen": 168260220, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.41015625, "step": 7831, "time_per_iteration": 2.359921932220459 }, { "auxiliary_loss_clip": 0.01065733, "auxiliary_loss_mlp": 0.01048935, "balance_loss_clip": 1.01836991, "balance_loss_mlp": 1.02104664, "epoch": 0.47088531489553587, "flos": 30146697912960.0, "grad_norm": 1.8965456425185359, "language_loss": 0.76548743, "learning_rate": 2.2846997668269033e-06, "loss": 0.78663415, "num_input_tokens_seen": 168277360, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.44726562, "step": 7832, "time_per_iteration": 2.4623947143554688 }, { "auxiliary_loss_clip": 0.01061189, "auxiliary_loss_mlp": 0.01044454, "balance_loss_clip": 1.01641607, "balance_loss_mlp": 1.02046347, "epoch": 0.47094543814820383, "flos": 21797218800000.0, "grad_norm": 1.4806490513808277, "language_loss": 0.75598192, "learning_rate": 2.2843142658709454e-06, "loss": 0.7770384, "num_input_tokens_seen": 168296605, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40625, "step": 7833, "time_per_iteration": 2.389000415802002 }, { "auxiliary_loss_clip": 0.01061954, "auxiliary_loss_mlp": 0.0104794, "balance_loss_clip": 1.01888847, "balance_loss_mlp": 1.0203079, "epoch": 0.4710055614008718, "flos": 23001896862720.0, "grad_norm": 1.7392210252883136, "language_loss": 0.7693541, "learning_rate": 2.283928754133762e-06, "loss": 0.79045296, "num_input_tokens_seen": 168316205, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.41601562, "step": 7834, "time_per_iteration": 2.429323196411133 }, { "auxiliary_loss_clip": 0.0106265, "auxiliary_loss_mlp": 0.01046854, "balance_loss_clip": 1.01861322, "balance_loss_mlp": 1.02129984, "epoch": 0.47106568465353976, "flos": 42739694760960.0, "grad_norm": 1.4780861198969002, "language_loss": 0.67282194, "learning_rate": 2.283543231629972e-06, "loss": 0.69391704, "num_input_tokens_seen": 168338935, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.4140625, "step": 7835, "time_per_iteration": 2.5691614151000977 }, { "auxiliary_loss_clip": 0.01018496, "auxiliary_loss_mlp": 0.01003799, "balance_loss_clip": 1.00112867, "balance_loss_mlp": 1.01022196, "epoch": 0.4711258079062077, "flos": 68551157197440.0, "grad_norm": 0.8710584516514523, "language_loss": 0.62270784, "learning_rate": 2.283157698374194e-06, "loss": 0.64293081, "num_input_tokens_seen": 168392800, "router_z_loss_clip": 0.0267334, "router_z_loss_mlp": 0.08300781, "step": 7836, "time_per_iteration": 3.005786418914795 }, { "auxiliary_loss_clip": 0.01068135, "auxiliary_loss_mlp": 0.01046407, "balance_loss_clip": 1.01555538, "balance_loss_mlp": 1.02232933, "epoch": 0.4711859311588757, "flos": 25445433075840.0, "grad_norm": 1.695939790642757, "language_loss": 0.70698118, "learning_rate": 2.2827721543810475e-06, "loss": 0.72812659, "num_input_tokens_seen": 168412940, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.45703125, "step": 7837, "time_per_iteration": 2.4421114921569824 }, { "auxiliary_loss_clip": 0.01063229, "auxiliary_loss_mlp": 0.01043786, "balance_loss_clip": 1.01446104, "balance_loss_mlp": 1.02144003, "epoch": 0.47124605441154366, "flos": 21980792062080.0, "grad_norm": 1.7510343002849857, "language_loss": 0.68025219, "learning_rate": 2.282386599665153e-06, "loss": 0.70132232, "num_input_tokens_seen": 168431995, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.41796875, "step": 7838, "time_per_iteration": 2.4117214679718018 }, { "auxiliary_loss_clip": 0.01064812, "auxiliary_loss_mlp": 0.01047593, "balance_loss_clip": 1.01639605, "balance_loss_mlp": 1.02073324, "epoch": 0.4713061776642116, "flos": 25411462456320.0, "grad_norm": 2.3565392412622175, "language_loss": 0.78492486, "learning_rate": 2.2820010342411304e-06, "loss": 0.80604887, "num_input_tokens_seen": 168454585, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.44140625, "step": 7839, "time_per_iteration": 2.450242042541504 }, { "auxiliary_loss_clip": 0.01059758, "auxiliary_loss_mlp": 0.01042888, "balance_loss_clip": 1.015172, "balance_loss_mlp": 1.01973522, "epoch": 0.4713663009168796, "flos": 26541042451200.0, "grad_norm": 1.8485421048320883, "language_loss": 0.73770714, "learning_rate": 2.2816154581235993e-06, "loss": 0.75873363, "num_input_tokens_seen": 168471265, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40039062, "step": 7840, "time_per_iteration": 2.4319353103637695 }, { "auxiliary_loss_clip": 0.01061336, "auxiliary_loss_mlp": 0.01046601, "balance_loss_clip": 1.01893258, "balance_loss_mlp": 1.02029705, "epoch": 0.47142642416954755, "flos": 23622443101440.0, "grad_norm": 1.7460929192326227, "language_loss": 0.76705557, "learning_rate": 2.2812298713271833e-06, "loss": 0.78813493, "num_input_tokens_seen": 168491360, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41015625, "step": 7841, "time_per_iteration": 2.394404649734497 }, { "auxiliary_loss_clip": 0.0106301, "auxiliary_loss_mlp": 0.01043963, "balance_loss_clip": 1.01758242, "balance_loss_mlp": 1.02188301, "epoch": 0.4714865474222155, "flos": 22309045464960.0, "grad_norm": 1.5163726485814648, "language_loss": 0.71930885, "learning_rate": 2.280844273866501e-06, "loss": 0.74037862, "num_input_tokens_seen": 168511335, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.41210938, "step": 7842, "time_per_iteration": 2.449740171432495 }, { "auxiliary_loss_clip": 0.01063271, "auxiliary_loss_mlp": 0.0104566, "balance_loss_clip": 1.01693022, "balance_loss_mlp": 1.02149248, "epoch": 0.4715466706748835, "flos": 17821449348480.0, "grad_norm": 2.049990938121652, "language_loss": 0.79677904, "learning_rate": 2.280458665756177e-06, "loss": 0.81786835, "num_input_tokens_seen": 168529920, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.41796875, "step": 7843, "time_per_iteration": 2.399085283279419 }, { "auxiliary_loss_clip": 0.01060428, "auxiliary_loss_mlp": 0.01044611, "balance_loss_clip": 1.01869535, "balance_loss_mlp": 1.01936042, "epoch": 0.4716067939275515, "flos": 23658403668480.0, "grad_norm": 1.7096829830547409, "language_loss": 0.750718, "learning_rate": 2.280073047010832e-06, "loss": 0.77176845, "num_input_tokens_seen": 168550595, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.41015625, "step": 7844, "time_per_iteration": 2.4345993995666504 }, { "auxiliary_loss_clip": 0.01059994, "auxiliary_loss_mlp": 0.01046862, "balance_loss_clip": 1.01727462, "balance_loss_mlp": 1.01925492, "epoch": 0.47166691718021947, "flos": 17929226315520.0, "grad_norm": 1.4865601655496228, "language_loss": 0.79794782, "learning_rate": 2.279687417645088e-06, "loss": 0.8190164, "num_input_tokens_seen": 168569765, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.40820312, "step": 7845, "time_per_iteration": 2.363325595855713 }, { "auxiliary_loss_clip": 0.01058967, "auxiliary_loss_mlp": 0.01045455, "balance_loss_clip": 1.02029014, "balance_loss_mlp": 1.01992524, "epoch": 0.47172704043288743, "flos": 26613382521600.0, "grad_norm": 1.366678485527142, "language_loss": 0.74106073, "learning_rate": 2.2793017776735703e-06, "loss": 0.76210493, "num_input_tokens_seen": 168591525, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.390625, "step": 7846, "time_per_iteration": 2.440913200378418 }, { "auxiliary_loss_clip": 0.0105703, "auxiliary_loss_mlp": 0.01041553, "balance_loss_clip": 1.01536238, "balance_loss_mlp": 1.0180552, "epoch": 0.4717871636855554, "flos": 27921613276800.0, "grad_norm": 1.3936492314109212, "language_loss": 0.75225455, "learning_rate": 2.2789161271109e-06, "loss": 0.77324033, "num_input_tokens_seen": 168611235, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.390625, "step": 7847, "time_per_iteration": 3.8726038932800293 }, { "auxiliary_loss_clip": 0.01062518, "auxiliary_loss_mlp": 0.01046895, "balance_loss_clip": 1.01827312, "balance_loss_mlp": 1.02069759, "epoch": 0.47184728693822336, "flos": 14501348830080.0, "grad_norm": 1.7724984047949506, "language_loss": 0.81881046, "learning_rate": 2.278530465971703e-06, "loss": 0.83990461, "num_input_tokens_seen": 168628710, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.41796875, "step": 7848, "time_per_iteration": 2.3496346473693848 }, { "auxiliary_loss_clip": 0.01062889, "auxiliary_loss_mlp": 0.01042369, "balance_loss_clip": 1.01646459, "balance_loss_mlp": 1.02146685, "epoch": 0.47190741019089133, "flos": 17855629436160.0, "grad_norm": 1.7339397575819548, "language_loss": 0.7132417, "learning_rate": 2.2781447942706032e-06, "loss": 0.7342943, "num_input_tokens_seen": 168645645, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.4140625, "step": 7849, "time_per_iteration": 3.850473403930664 }, { "auxiliary_loss_clip": 0.01063384, "auxiliary_loss_mlp": 0.01054175, "balance_loss_clip": 1.02538633, "balance_loss_mlp": 1.01968169, "epoch": 0.4719675334435593, "flos": 17894487646080.0, "grad_norm": 2.243314898675664, "language_loss": 0.7086339, "learning_rate": 2.277759112022224e-06, "loss": 0.72980952, "num_input_tokens_seen": 168664165, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4375, "step": 7850, "time_per_iteration": 3.811210870742798 }, { "auxiliary_loss_clip": 0.01060346, "auxiliary_loss_mlp": 0.01044394, "balance_loss_clip": 1.01695251, "balance_loss_mlp": 1.01882422, "epoch": 0.47202765669622726, "flos": 20703320081280.0, "grad_norm": 1.8576121757386315, "language_loss": 0.76421678, "learning_rate": 2.2773734192411916e-06, "loss": 0.78526413, "num_input_tokens_seen": 168681940, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.4140625, "step": 7851, "time_per_iteration": 2.364367723464966 }, { "auxiliary_loss_clip": 0.01061517, "auxiliary_loss_mlp": 0.01054112, "balance_loss_clip": 1.0267179, "balance_loss_mlp": 1.01981974, "epoch": 0.4720877799488952, "flos": 16359391676160.0, "grad_norm": 1.6796844507535476, "language_loss": 0.77641487, "learning_rate": 2.276987715942132e-06, "loss": 0.79757118, "num_input_tokens_seen": 168698830, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41796875, "step": 7852, "time_per_iteration": 2.3586106300354004 }, { "auxiliary_loss_clip": 0.01060999, "auxiliary_loss_mlp": 0.01040659, "balance_loss_clip": 1.0132643, "balance_loss_mlp": 1.02007806, "epoch": 0.4721479032015632, "flos": 20667115134720.0, "grad_norm": 1.6320181709078498, "language_loss": 0.70009482, "learning_rate": 2.2766020021396696e-06, "loss": 0.72111142, "num_input_tokens_seen": 168718305, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41015625, "step": 7853, "time_per_iteration": 2.369363307952881 }, { "auxiliary_loss_clip": 0.01012204, "auxiliary_loss_mlp": 0.01006129, "balance_loss_clip": 1.00344646, "balance_loss_mlp": 1.00376964, "epoch": 0.47220802645423116, "flos": 67746616894080.0, "grad_norm": 0.6941354192692705, "language_loss": 0.50237358, "learning_rate": 2.276216277848432e-06, "loss": 0.5225569, "num_input_tokens_seen": 168782365, "router_z_loss_clip": 0.02685547, "router_z_loss_mlp": 0.08447266, "step": 7854, "time_per_iteration": 3.141892671585083 }, { "auxiliary_loss_clip": 0.01061296, "auxiliary_loss_mlp": 0.0104694, "balance_loss_clip": 1.01750755, "balance_loss_mlp": 1.01903963, "epoch": 0.4722681497068991, "flos": 20920445026560.0, "grad_norm": 1.7653264717101098, "language_loss": 0.64525592, "learning_rate": 2.2758305430830455e-06, "loss": 0.66633826, "num_input_tokens_seen": 168800485, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.421875, "step": 7855, "time_per_iteration": 2.372880458831787 }, { "auxiliary_loss_clip": 0.0106022, "auxiliary_loss_mlp": 0.01049517, "balance_loss_clip": 1.02101421, "balance_loss_mlp": 1.01897001, "epoch": 0.4723282729595671, "flos": 28291832000640.0, "grad_norm": 2.3226950435079474, "language_loss": 0.76613164, "learning_rate": 2.2754447978581376e-06, "loss": 0.78722906, "num_input_tokens_seen": 168818965, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41210938, "step": 7856, "time_per_iteration": 2.4617667198181152 }, { "auxiliary_loss_clip": 0.01057217, "auxiliary_loss_mlp": 0.01044197, "balance_loss_clip": 1.01775646, "balance_loss_mlp": 1.01784384, "epoch": 0.4723883962122351, "flos": 27123847643520.0, "grad_norm": 1.9377869842617634, "language_loss": 0.76283681, "learning_rate": 2.2750590421883347e-06, "loss": 0.78385091, "num_input_tokens_seen": 168840355, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39453125, "step": 7857, "time_per_iteration": 2.421924352645874 }, { "auxiliary_loss_clip": 0.01057634, "auxiliary_loss_mlp": 0.01048505, "balance_loss_clip": 1.02343571, "balance_loss_mlp": 1.01788068, "epoch": 0.47244851946490307, "flos": 31535996578560.0, "grad_norm": 1.71778160277913, "language_loss": 0.65685982, "learning_rate": 2.2746732760882655e-06, "loss": 0.67792118, "num_input_tokens_seen": 168861765, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3984375, "step": 7858, "time_per_iteration": 3.9005205631256104 }, { "auxiliary_loss_clip": 0.01057568, "auxiliary_loss_mlp": 0.01047294, "balance_loss_clip": 1.02061534, "balance_loss_mlp": 1.0182873, "epoch": 0.47250864271757104, "flos": 20885496888960.0, "grad_norm": 1.4708174141107118, "language_loss": 0.71407509, "learning_rate": 2.2742874995725575e-06, "loss": 0.73512369, "num_input_tokens_seen": 168881310, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.39257812, "step": 7859, "time_per_iteration": 2.4208033084869385 }, { "auxiliary_loss_clip": 0.01063894, "auxiliary_loss_mlp": 0.01048864, "balance_loss_clip": 1.02064705, "balance_loss_mlp": 1.02017546, "epoch": 0.472568765970239, "flos": 20521038539520.0, "grad_norm": 1.7918033355708163, "language_loss": 0.63885432, "learning_rate": 2.2739017126558413e-06, "loss": 0.65998197, "num_input_tokens_seen": 168899470, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.4375, "step": 7860, "time_per_iteration": 2.395272731781006 }, { "auxiliary_loss_clip": 0.01060218, "auxiliary_loss_mlp": 0.01045435, "balance_loss_clip": 1.01893497, "balance_loss_mlp": 1.01941049, "epoch": 0.47262888922290697, "flos": 35803849397760.0, "grad_norm": 1.9952109809236571, "language_loss": 0.72740829, "learning_rate": 2.2735159153527445e-06, "loss": 0.74846476, "num_input_tokens_seen": 168921495, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40820312, "step": 7861, "time_per_iteration": 2.510735511779785 }, { "auxiliary_loss_clip": 0.01060432, "auxiliary_loss_mlp": 0.0104829, "balance_loss_clip": 1.02015686, "balance_loss_mlp": 1.01893723, "epoch": 0.47268901247557493, "flos": 20666696198400.0, "grad_norm": 1.8882236758463984, "language_loss": 0.86304462, "learning_rate": 2.273130107677896e-06, "loss": 0.88413185, "num_input_tokens_seen": 168940515, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.4140625, "step": 7862, "time_per_iteration": 2.4680874347686768 }, { "auxiliary_loss_clip": 0.01061675, "auxiliary_loss_mlp": 0.0105114, "balance_loss_clip": 1.02401996, "balance_loss_mlp": 1.01935112, "epoch": 0.4727491357282429, "flos": 19572273809280.0, "grad_norm": 1.8700632174242922, "language_loss": 0.85424006, "learning_rate": 2.272744289645927e-06, "loss": 0.87536824, "num_input_tokens_seen": 168958340, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.421875, "step": 7863, "time_per_iteration": 2.3868300914764404 }, { "auxiliary_loss_clip": 0.01061941, "auxiliary_loss_mlp": 0.01048966, "balance_loss_clip": 1.0219053, "balance_loss_mlp": 1.01945078, "epoch": 0.47280925898091086, "flos": 18216422092800.0, "grad_norm": 1.8371830289639786, "language_loss": 0.66718853, "learning_rate": 2.272358461271467e-06, "loss": 0.68829751, "num_input_tokens_seen": 168974850, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.42578125, "step": 7864, "time_per_iteration": 2.4481844902038574 }, { "auxiliary_loss_clip": 0.01062084, "auxiliary_loss_mlp": 0.01040618, "balance_loss_clip": 1.01349831, "balance_loss_mlp": 1.02106917, "epoch": 0.4728693822335788, "flos": 17820855855360.0, "grad_norm": 1.7859299693093245, "language_loss": 0.66958207, "learning_rate": 2.271972622569147e-06, "loss": 0.6906091, "num_input_tokens_seen": 168992860, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.41015625, "step": 7865, "time_per_iteration": 2.373434543609619 }, { "auxiliary_loss_clip": 0.0105977, "auxiliary_loss_mlp": 0.0104681, "balance_loss_clip": 1.01829469, "balance_loss_mlp": 1.0187093, "epoch": 0.4729295054862468, "flos": 20594007014400.0, "grad_norm": 1.818020094920685, "language_loss": 0.74843353, "learning_rate": 2.2715867735535976e-06, "loss": 0.7694993, "num_input_tokens_seen": 169010325, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41015625, "step": 7866, "time_per_iteration": 2.401461124420166 }, { "auxiliary_loss_clip": 0.01062694, "auxiliary_loss_mlp": 0.01036372, "balance_loss_clip": 1.00923979, "balance_loss_mlp": 1.02046347, "epoch": 0.47298962873891476, "flos": 23366948705280.0, "grad_norm": 8.746038079266748, "language_loss": 0.84280336, "learning_rate": 2.271200914239451e-06, "loss": 0.86379397, "num_input_tokens_seen": 169029840, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.421875, "step": 7867, "time_per_iteration": 2.397719144821167 }, { "auxiliary_loss_clip": 0.01060193, "auxiliary_loss_mlp": 0.01042467, "balance_loss_clip": 1.01507294, "balance_loss_mlp": 1.01995039, "epoch": 0.4730497519915827, "flos": 22051212007680.0, "grad_norm": 1.554149708412771, "language_loss": 0.80036402, "learning_rate": 2.2708150446413385e-06, "loss": 0.82139051, "num_input_tokens_seen": 169049975, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40234375, "step": 7868, "time_per_iteration": 2.4186081886291504 }, { "auxiliary_loss_clip": 0.01063098, "auxiliary_loss_mlp": 0.01050726, "balance_loss_clip": 1.02012515, "balance_loss_mlp": 1.02063918, "epoch": 0.4731098752442507, "flos": 21068651214720.0, "grad_norm": 2.3728147371392696, "language_loss": 0.7690652, "learning_rate": 2.2704291647738915e-06, "loss": 0.79020345, "num_input_tokens_seen": 169069540, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.42578125, "step": 7869, "time_per_iteration": 2.3728842735290527 }, { "auxiliary_loss_clip": 0.01062723, "auxiliary_loss_mlp": 0.0104432, "balance_loss_clip": 1.01432729, "balance_loss_mlp": 1.02079797, "epoch": 0.4731699984969187, "flos": 22527671598720.0, "grad_norm": 1.4538496652214903, "language_loss": 0.74254495, "learning_rate": 2.2700432746517443e-06, "loss": 0.76361537, "num_input_tokens_seen": 169089940, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.41992188, "step": 7870, "time_per_iteration": 2.4109201431274414 }, { "auxiliary_loss_clip": 0.01064783, "auxiliary_loss_mlp": 0.01048158, "balance_loss_clip": 1.01649642, "balance_loss_mlp": 1.02087641, "epoch": 0.4732301217495867, "flos": 24897017439360.0, "grad_norm": 1.9039865651274053, "language_loss": 0.82800937, "learning_rate": 2.2696573742895292e-06, "loss": 0.8491388, "num_input_tokens_seen": 169109650, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.43945312, "step": 7871, "time_per_iteration": 2.418464422225952 }, { "auxiliary_loss_clip": 0.01062583, "auxiliary_loss_mlp": 0.0104667, "balance_loss_clip": 1.01791668, "balance_loss_mlp": 1.02084756, "epoch": 0.47329024500225464, "flos": 22783305640320.0, "grad_norm": 1.5811866917624622, "language_loss": 0.77068722, "learning_rate": 2.269271463701879e-06, "loss": 0.79177982, "num_input_tokens_seen": 169128990, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.41796875, "step": 7872, "time_per_iteration": 2.4526121616363525 }, { "auxiliary_loss_clip": 0.01061298, "auxiliary_loss_mlp": 0.01047866, "balance_loss_clip": 1.01992309, "balance_loss_mlp": 1.01938295, "epoch": 0.4733503682549226, "flos": 38694238502400.0, "grad_norm": 1.8189947377360534, "language_loss": 0.68725634, "learning_rate": 2.268885542903428e-06, "loss": 0.70834804, "num_input_tokens_seen": 169154645, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41992188, "step": 7873, "time_per_iteration": 2.5501456260681152 }, { "auxiliary_loss_clip": 0.01060465, "auxiliary_loss_mlp": 0.01042354, "balance_loss_clip": 1.01532972, "balance_loss_mlp": 1.01996303, "epoch": 0.47341049150759057, "flos": 22965726827520.0, "grad_norm": 1.5322898657719985, "language_loss": 0.73691487, "learning_rate": 2.26849961190881e-06, "loss": 0.75794309, "num_input_tokens_seen": 169174995, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40625, "step": 7874, "time_per_iteration": 2.4464874267578125 }, { "auxiliary_loss_clip": 0.0106359, "auxiliary_loss_mlp": 0.01047603, "balance_loss_clip": 1.01981497, "balance_loss_mlp": 1.02120256, "epoch": 0.47347061476025853, "flos": 14537588688000.0, "grad_norm": 2.169001746031164, "language_loss": 0.66784632, "learning_rate": 2.26811367073266e-06, "loss": 0.68895829, "num_input_tokens_seen": 169191815, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.42382812, "step": 7875, "time_per_iteration": 2.346041679382324 }, { "auxiliary_loss_clip": 0.01063412, "auxiliary_loss_mlp": 0.01054282, "balance_loss_clip": 1.02524304, "balance_loss_mlp": 1.02167737, "epoch": 0.4735307380129265, "flos": 30261945911040.0, "grad_norm": 2.223903411234512, "language_loss": 0.81933606, "learning_rate": 2.2677277193896125e-06, "loss": 0.84051299, "num_input_tokens_seen": 169210430, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.41796875, "step": 7876, "time_per_iteration": 2.4821507930755615 }, { "auxiliary_loss_clip": 0.01060497, "auxiliary_loss_mlp": 0.01044842, "balance_loss_clip": 1.01779306, "balance_loss_mlp": 1.01910758, "epoch": 0.47359086126559446, "flos": 19390027178880.0, "grad_norm": 2.9075930633100318, "language_loss": 0.80027872, "learning_rate": 2.267341757894304e-06, "loss": 0.8213321, "num_input_tokens_seen": 169229295, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.4140625, "step": 7877, "time_per_iteration": 2.36898136138916 }, { "auxiliary_loss_clip": 0.01062352, "auxiliary_loss_mlp": 0.01045837, "balance_loss_clip": 1.02027822, "balance_loss_mlp": 1.02139211, "epoch": 0.47365098451826243, "flos": 21938477627520.0, "grad_norm": 1.9175271349607586, "language_loss": 0.71843958, "learning_rate": 2.2669557862613685e-06, "loss": 0.7395215, "num_input_tokens_seen": 169247855, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.41015625, "step": 7878, "time_per_iteration": 2.450464963912964 }, { "auxiliary_loss_clip": 0.01058946, "auxiliary_loss_mlp": 0.01042003, "balance_loss_clip": 1.0171479, "balance_loss_mlp": 1.01970458, "epoch": 0.4737111077709304, "flos": 25843966778880.0, "grad_norm": 2.1203891881962313, "language_loss": 0.76207554, "learning_rate": 2.2665698045054425e-06, "loss": 0.78308511, "num_input_tokens_seen": 169268860, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.39257812, "step": 7879, "time_per_iteration": 2.452617645263672 }, { "auxiliary_loss_clip": 0.01011554, "auxiliary_loss_mlp": 0.01008566, "balance_loss_clip": 1.00599074, "balance_loss_mlp": 1.00308561, "epoch": 0.47377123102359836, "flos": 67757790395520.0, "grad_norm": 0.7332812914740752, "language_loss": 0.61345857, "learning_rate": 2.266183812641164e-06, "loss": 0.63365978, "num_input_tokens_seen": 169331855, "router_z_loss_clip": 0.02575684, "router_z_loss_mlp": 0.08496094, "step": 7880, "time_per_iteration": 3.0479912757873535 }, { "auxiliary_loss_clip": 0.01059456, "auxiliary_loss_mlp": 0.01044023, "balance_loss_clip": 1.01511526, "balance_loss_mlp": 1.01948261, "epoch": 0.4738313542762663, "flos": 24314840651520.0, "grad_norm": 1.4991735449154941, "language_loss": 0.6928196, "learning_rate": 2.2657978106831675e-06, "loss": 0.71385437, "num_input_tokens_seen": 169352175, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.40039062, "step": 7881, "time_per_iteration": 2.4042608737945557 }, { "auxiliary_loss_clip": 0.01059299, "auxiliary_loss_mlp": 0.0103898, "balance_loss_clip": 1.01318336, "balance_loss_mlp": 1.01862609, "epoch": 0.4738914775289343, "flos": 20704262688000.0, "grad_norm": 1.5602559070946251, "language_loss": 0.77969015, "learning_rate": 2.265411798646092e-06, "loss": 0.80067289, "num_input_tokens_seen": 169371215, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40625, "step": 7882, "time_per_iteration": 2.433347463607788 }, { "auxiliary_loss_clip": 0.01059974, "auxiliary_loss_mlp": 0.01051473, "balance_loss_clip": 1.02436447, "balance_loss_mlp": 1.01862907, "epoch": 0.4739516007816023, "flos": 25445188696320.0, "grad_norm": 2.715900533342876, "language_loss": 0.77013183, "learning_rate": 2.2650257765445747e-06, "loss": 0.79124624, "num_input_tokens_seen": 169391745, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.4140625, "step": 7883, "time_per_iteration": 2.412372589111328 }, { "auxiliary_loss_clip": 0.01058661, "auxiliary_loss_mlp": 0.01046804, "balance_loss_clip": 1.02129316, "balance_loss_mlp": 1.01879668, "epoch": 0.4740117240342703, "flos": 19973321130240.0, "grad_norm": 1.865736679086033, "language_loss": 0.73615783, "learning_rate": 2.2646397443932525e-06, "loss": 0.75721252, "num_input_tokens_seen": 169409845, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3984375, "step": 7884, "time_per_iteration": 2.402825355529785 }, { "auxiliary_loss_clip": 0.01062584, "auxiliary_loss_mlp": 0.01042892, "balance_loss_clip": 1.01357865, "balance_loss_mlp": 1.01930642, "epoch": 0.47407184728693824, "flos": 15660465701760.0, "grad_norm": 2.118256616332098, "language_loss": 0.83172506, "learning_rate": 2.2642537022067655e-06, "loss": 0.85277987, "num_input_tokens_seen": 169426085, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.43359375, "step": 7885, "time_per_iteration": 2.3320119380950928 }, { "auxiliary_loss_clip": 0.01060269, "auxiliary_loss_mlp": 0.01043466, "balance_loss_clip": 1.01689434, "balance_loss_mlp": 1.01893687, "epoch": 0.4741319705396062, "flos": 18587792891520.0, "grad_norm": 2.09479738543879, "language_loss": 0.74958456, "learning_rate": 2.263867649999751e-06, "loss": 0.7706219, "num_input_tokens_seen": 169444705, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.4140625, "step": 7886, "time_per_iteration": 2.428664207458496 }, { "auxiliary_loss_clip": 0.01063474, "auxiliary_loss_mlp": 0.01052372, "balance_loss_clip": 1.01746738, "balance_loss_mlp": 1.01863647, "epoch": 0.47419209379227417, "flos": 13260256352640.0, "grad_norm": 1.8485934958996502, "language_loss": 0.75198752, "learning_rate": 2.263481587786849e-06, "loss": 0.77314597, "num_input_tokens_seen": 169460850, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 0.44921875, "step": 7887, "time_per_iteration": 3.785902976989746 }, { "auxiliary_loss_clip": 0.01057515, "auxiliary_loss_mlp": 0.01039802, "balance_loss_clip": 1.01485169, "balance_loss_mlp": 1.01777506, "epoch": 0.47425221704494214, "flos": 20043112671360.0, "grad_norm": 1.7798335676387431, "language_loss": 0.78066963, "learning_rate": 2.2630955155826993e-06, "loss": 0.80164284, "num_input_tokens_seen": 169478890, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.39648438, "step": 7888, "time_per_iteration": 3.7688424587249756 }, { "auxiliary_loss_clip": 0.01059762, "auxiliary_loss_mlp": 0.01037722, "balance_loss_clip": 1.01165116, "balance_loss_mlp": 1.01803899, "epoch": 0.4743123402976101, "flos": 27270657377280.0, "grad_norm": 1.5581905272536558, "language_loss": 0.73835409, "learning_rate": 2.2627094334019406e-06, "loss": 0.75932896, "num_input_tokens_seen": 169499690, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.41796875, "step": 7889, "time_per_iteration": 2.433394193649292 }, { "auxiliary_loss_clip": 0.01021805, "auxiliary_loss_mlp": 0.01003993, "balance_loss_clip": 1.00101292, "balance_loss_mlp": 1.01238465, "epoch": 0.47437246355027807, "flos": 55391170003200.0, "grad_norm": 0.7141560876123028, "language_loss": 0.56168926, "learning_rate": 2.262323341259214e-06, "loss": 0.58194721, "num_input_tokens_seen": 169560475, "router_z_loss_clip": 0.02978516, "router_z_loss_mlp": 0.09423828, "step": 7890, "time_per_iteration": 4.476652145385742 }, { "auxiliary_loss_clip": 0.01061518, "auxiliary_loss_mlp": 0.01043781, "balance_loss_clip": 1.01433587, "balance_loss_mlp": 1.01929581, "epoch": 0.47443258680294603, "flos": 23877344004480.0, "grad_norm": 2.0731866835392316, "language_loss": 0.66745013, "learning_rate": 2.2619372391691605e-06, "loss": 0.68850315, "num_input_tokens_seen": 169580110, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.421875, "step": 7891, "time_per_iteration": 2.4517343044281006 }, { "auxiliary_loss_clip": 0.01065677, "auxiliary_loss_mlp": 0.01050057, "balance_loss_clip": 1.01696491, "balance_loss_mlp": 1.0200789, "epoch": 0.474492710055614, "flos": 21976777255680.0, "grad_norm": 2.229793800607644, "language_loss": 0.72340447, "learning_rate": 2.26155112714642e-06, "loss": 0.74456179, "num_input_tokens_seen": 169597510, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.45507812, "step": 7892, "time_per_iteration": 2.3907554149627686 }, { "auxiliary_loss_clip": 0.01019446, "auxiliary_loss_mlp": 0.01006315, "balance_loss_clip": 1.0036447, "balance_loss_mlp": 1.01039839, "epoch": 0.47455283330828196, "flos": 62553845669760.0, "grad_norm": 0.7989489478679904, "language_loss": 0.58658564, "learning_rate": 2.2611650052056355e-06, "loss": 0.60684323, "num_input_tokens_seen": 169660010, "router_z_loss_clip": 0.0267334, "router_z_loss_mlp": 0.09033203, "step": 7893, "time_per_iteration": 3.130575656890869 }, { "auxiliary_loss_clip": 0.01060155, "auxiliary_loss_mlp": 0.01045175, "balance_loss_clip": 1.01876974, "balance_loss_mlp": 1.01935196, "epoch": 0.47461295656094993, "flos": 12092830577280.0, "grad_norm": 1.6434069367865434, "language_loss": 0.78929567, "learning_rate": 2.2607788733614463e-06, "loss": 0.81034899, "num_input_tokens_seen": 169678485, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40820312, "step": 7894, "time_per_iteration": 2.358513355255127 }, { "auxiliary_loss_clip": 0.01060217, "auxiliary_loss_mlp": 0.01046637, "balance_loss_clip": 1.02084053, "balance_loss_mlp": 1.01891112, "epoch": 0.4746730798136179, "flos": 20883576764160.0, "grad_norm": 1.6535621950616135, "language_loss": 0.75888336, "learning_rate": 2.260392731628497e-06, "loss": 0.77995193, "num_input_tokens_seen": 169697335, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.4140625, "step": 7895, "time_per_iteration": 2.36547589302063 }, { "auxiliary_loss_clip": 0.01060439, "auxiliary_loss_mlp": 0.01047422, "balance_loss_clip": 1.0182631, "balance_loss_mlp": 1.01948082, "epoch": 0.4747332030662859, "flos": 19973774977920.0, "grad_norm": 2.5183694286012726, "language_loss": 0.83292663, "learning_rate": 2.260006580021429e-06, "loss": 0.85400522, "num_input_tokens_seen": 169715395, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.41015625, "step": 7896, "time_per_iteration": 2.392159938812256 }, { "auxiliary_loss_clip": 0.01060242, "auxiliary_loss_mlp": 0.01042907, "balance_loss_clip": 1.01310468, "balance_loss_mlp": 1.01865304, "epoch": 0.4747933263189539, "flos": 16033267866240.0, "grad_norm": 2.8830506358675265, "language_loss": 0.77156091, "learning_rate": 2.259620418554886e-06, "loss": 0.79259241, "num_input_tokens_seen": 169733755, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.41601562, "step": 7897, "time_per_iteration": 3.7698278427124023 }, { "auxiliary_loss_clip": 0.01064238, "auxiliary_loss_mlp": 0.01052344, "balance_loss_clip": 1.02261293, "balance_loss_mlp": 1.01994908, "epoch": 0.47485344957162184, "flos": 13954224913920.0, "grad_norm": 2.012769271147644, "language_loss": 0.65536821, "learning_rate": 2.25923424724351e-06, "loss": 0.67653406, "num_input_tokens_seen": 169751390, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.44335938, "step": 7898, "time_per_iteration": 2.3681626319885254 }, { "auxiliary_loss_clip": 0.01060222, "auxiliary_loss_mlp": 0.01048493, "balance_loss_clip": 1.01888096, "balance_loss_mlp": 1.01893878, "epoch": 0.4749135728242898, "flos": 20448035153280.0, "grad_norm": 2.5914080369277577, "language_loss": 0.7112968, "learning_rate": 2.258848066101946e-06, "loss": 0.73238397, "num_input_tokens_seen": 169769500, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4140625, "step": 7899, "time_per_iteration": 2.371290922164917 }, { "auxiliary_loss_clip": 0.01061835, "auxiliary_loss_mlp": 0.0104368, "balance_loss_clip": 1.01523662, "balance_loss_mlp": 1.0202986, "epoch": 0.4749736960769578, "flos": 28948687920000.0, "grad_norm": 1.8220344496806027, "language_loss": 0.69975889, "learning_rate": 2.258461875144837e-06, "loss": 0.72081405, "num_input_tokens_seen": 169789215, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.4140625, "step": 7900, "time_per_iteration": 2.4827263355255127 }, { "auxiliary_loss_clip": 0.0105956, "auxiliary_loss_mlp": 0.01045767, "balance_loss_clip": 1.01931417, "balance_loss_mlp": 1.01913297, "epoch": 0.47503381932962574, "flos": 31937497747200.0, "grad_norm": 1.8101877782906082, "language_loss": 0.72177327, "learning_rate": 2.2580756743868273e-06, "loss": 0.74282652, "num_input_tokens_seen": 169808825, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40429688, "step": 7901, "time_per_iteration": 2.5073821544647217 }, { "auxiliary_loss_clip": 0.0106134, "auxiliary_loss_mlp": 0.01053827, "balance_loss_clip": 1.027637, "balance_loss_mlp": 1.02050233, "epoch": 0.4750939425822937, "flos": 22126170430080.0, "grad_norm": 1.6538023132123763, "language_loss": 0.74812269, "learning_rate": 2.2576894638425636e-06, "loss": 0.76927435, "num_input_tokens_seen": 169827590, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40820312, "step": 7902, "time_per_iteration": 2.5126986503601074 }, { "auxiliary_loss_clip": 0.01058128, "auxiliary_loss_mlp": 0.01039778, "balance_loss_clip": 1.01541197, "balance_loss_mlp": 1.01909876, "epoch": 0.47515406583496167, "flos": 20849047562880.0, "grad_norm": 1.8579596080030094, "language_loss": 0.69843209, "learning_rate": 2.257303243526688e-06, "loss": 0.71941108, "num_input_tokens_seen": 169844925, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.390625, "step": 7903, "time_per_iteration": 2.3729562759399414 }, { "auxiliary_loss_clip": 0.01059522, "auxiliary_loss_mlp": 0.01041967, "balance_loss_clip": 1.0179106, "balance_loss_mlp": 1.01932693, "epoch": 0.47521418908762963, "flos": 17523989631360.0, "grad_norm": 2.3182482308590218, "language_loss": 0.73449039, "learning_rate": 2.256917013453848e-06, "loss": 0.75550532, "num_input_tokens_seen": 169862705, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.40234375, "step": 7904, "time_per_iteration": 2.4128899574279785 }, { "auxiliary_loss_clip": 0.01059321, "auxiliary_loss_mlp": 0.01041228, "balance_loss_clip": 1.01630092, "balance_loss_mlp": 1.01980805, "epoch": 0.4752743123402976, "flos": 20558360649600.0, "grad_norm": 1.7670281386018132, "language_loss": 0.87266958, "learning_rate": 2.25653077363869e-06, "loss": 0.89367509, "num_input_tokens_seen": 169880155, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.39453125, "step": 7905, "time_per_iteration": 2.370866537094116 }, { "auxiliary_loss_clip": 0.01056679, "auxiliary_loss_mlp": 0.01044339, "balance_loss_clip": 1.01985335, "balance_loss_mlp": 1.01829696, "epoch": 0.47533443559296557, "flos": 26359389313920.0, "grad_norm": 1.5983772291181422, "language_loss": 0.83119142, "learning_rate": 2.2561445240958583e-06, "loss": 0.85220158, "num_input_tokens_seen": 169901525, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.3828125, "step": 7906, "time_per_iteration": 2.5027198791503906 }, { "auxiliary_loss_clip": 0.010162, "auxiliary_loss_mlp": 0.01003095, "balance_loss_clip": 1.00018597, "balance_loss_mlp": 1.00767934, "epoch": 0.47539455884563353, "flos": 65946251347200.0, "grad_norm": 0.6692666830014874, "language_loss": 0.59060419, "learning_rate": 2.255758264840002e-06, "loss": 0.61079717, "num_input_tokens_seen": 169970345, "router_z_loss_clip": 0.02905273, "router_z_loss_mlp": 0.08496094, "step": 7907, "time_per_iteration": 3.123603343963623 }, { "auxiliary_loss_clip": 0.01059609, "auxiliary_loss_mlp": 0.01043456, "balance_loss_clip": 1.01793361, "balance_loss_mlp": 1.0195725, "epoch": 0.4754546820983015, "flos": 17237177879040.0, "grad_norm": 1.7792303860108796, "language_loss": 0.81759417, "learning_rate": 2.255371995885765e-06, "loss": 0.83862484, "num_input_tokens_seen": 169986440, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40039062, "step": 7908, "time_per_iteration": 2.3709094524383545 }, { "auxiliary_loss_clip": 0.01059773, "auxiliary_loss_mlp": 0.01045837, "balance_loss_clip": 1.02105343, "balance_loss_mlp": 1.01939416, "epoch": 0.47551480535096946, "flos": 19824940385280.0, "grad_norm": 1.8243332316400749, "language_loss": 0.75678802, "learning_rate": 2.254985717247797e-06, "loss": 0.77784413, "num_input_tokens_seen": 170005705, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.40429688, "step": 7909, "time_per_iteration": 2.383418560028076 }, { "auxiliary_loss_clip": 0.01058313, "auxiliary_loss_mlp": 0.01043443, "balance_loss_clip": 1.01833773, "balance_loss_mlp": 1.01934505, "epoch": 0.4755749286036375, "flos": 22162864135680.0, "grad_norm": 1.6005404906931817, "language_loss": 0.7588768, "learning_rate": 2.2545994289407457e-06, "loss": 0.77989435, "num_input_tokens_seen": 170023415, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.390625, "step": 7910, "time_per_iteration": 2.412743091583252 }, { "auxiliary_loss_clip": 0.01056196, "auxiliary_loss_mlp": 0.0104116, "balance_loss_clip": 1.01738977, "balance_loss_mlp": 1.01784372, "epoch": 0.47563505185630545, "flos": 21647336866560.0, "grad_norm": 1.9028034395074134, "language_loss": 0.79583287, "learning_rate": 2.2542131309792577e-06, "loss": 0.81680644, "num_input_tokens_seen": 170042395, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3828125, "step": 7911, "time_per_iteration": 2.3711490631103516 }, { "auxiliary_loss_clip": 0.01060949, "auxiliary_loss_mlp": 0.01047678, "balance_loss_clip": 1.02130878, "balance_loss_mlp": 1.01942194, "epoch": 0.4756951751089734, "flos": 20627803077120.0, "grad_norm": 1.841054313976103, "language_loss": 0.77002585, "learning_rate": 2.253826823377983e-06, "loss": 0.79111207, "num_input_tokens_seen": 170061610, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.4140625, "step": 7912, "time_per_iteration": 2.4338953495025635 }, { "auxiliary_loss_clip": 0.01056899, "auxiliary_loss_mlp": 0.01046844, "balance_loss_clip": 1.02229881, "balance_loss_mlp": 1.01768708, "epoch": 0.4757552983616414, "flos": 25847597560320.0, "grad_norm": 1.6608778744668011, "language_loss": 0.75815189, "learning_rate": 2.253440506151569e-06, "loss": 0.77918935, "num_input_tokens_seen": 170083505, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.390625, "step": 7913, "time_per_iteration": 2.4204211235046387 }, { "auxiliary_loss_clip": 0.01058687, "auxiliary_loss_mlp": 0.01041167, "balance_loss_clip": 1.01749194, "balance_loss_mlp": 1.01978958, "epoch": 0.47581542161430934, "flos": 18222042821760.0, "grad_norm": 2.0097608947127363, "language_loss": 0.73485464, "learning_rate": 2.253054179314666e-06, "loss": 0.75585318, "num_input_tokens_seen": 170100690, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.38867188, "step": 7914, "time_per_iteration": 2.368048667907715 }, { "auxiliary_loss_clip": 0.01060226, "auxiliary_loss_mlp": 0.01042107, "balance_loss_clip": 1.01539183, "balance_loss_mlp": 1.02011776, "epoch": 0.4758755448669773, "flos": 21578697400320.0, "grad_norm": 1.9439937539422598, "language_loss": 0.65901661, "learning_rate": 2.2526678428819227e-06, "loss": 0.68003988, "num_input_tokens_seen": 170119240, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.40039062, "step": 7915, "time_per_iteration": 2.3776769638061523 }, { "auxiliary_loss_clip": 0.01056588, "auxiliary_loss_mlp": 0.0104079, "balance_loss_clip": 1.01758039, "balance_loss_mlp": 1.01879764, "epoch": 0.47593566811964527, "flos": 15230265528960.0, "grad_norm": 1.6698076935776078, "language_loss": 0.78398323, "learning_rate": 2.2522814968679896e-06, "loss": 0.80495703, "num_input_tokens_seen": 170136450, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.37890625, "step": 7916, "time_per_iteration": 2.3720967769622803 }, { "auxiliary_loss_clip": 0.01056862, "auxiliary_loss_mlp": 0.01041527, "balance_loss_clip": 1.01873469, "balance_loss_mlp": 1.01851726, "epoch": 0.47599579137231324, "flos": 21542178251520.0, "grad_norm": 2.025042098630158, "language_loss": 0.64896262, "learning_rate": 2.2518951412875173e-06, "loss": 0.66994649, "num_input_tokens_seen": 170155295, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3828125, "step": 7917, "time_per_iteration": 2.3721086978912354 }, { "auxiliary_loss_clip": 0.01013841, "auxiliary_loss_mlp": 0.01016819, "balance_loss_clip": 1.01364779, "balance_loss_mlp": 1.00521624, "epoch": 0.4760559146249812, "flos": 64551471598080.0, "grad_norm": 0.8499760382628104, "language_loss": 0.65765858, "learning_rate": 2.2515087761551557e-06, "loss": 0.67796516, "num_input_tokens_seen": 170222325, "router_z_loss_clip": 0.03173828, "router_z_loss_mlp": 0.0859375, "step": 7918, "time_per_iteration": 3.0589041709899902 }, { "auxiliary_loss_clip": 0.01061458, "auxiliary_loss_mlp": 0.01039208, "balance_loss_clip": 1.01441288, "balance_loss_mlp": 1.02099872, "epoch": 0.47611603787764917, "flos": 22232865144960.0, "grad_norm": 1.642339122181158, "language_loss": 0.69426382, "learning_rate": 2.2511224014855563e-06, "loss": 0.71527052, "num_input_tokens_seen": 170241625, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.40429688, "step": 7919, "time_per_iteration": 2.390829563140869 }, { "auxiliary_loss_clip": 0.01060425, "auxiliary_loss_mlp": 0.01039525, "balance_loss_clip": 1.01638699, "balance_loss_mlp": 1.01960862, "epoch": 0.47617616113031713, "flos": 22779011543040.0, "grad_norm": 1.6768083449851492, "language_loss": 0.7566812, "learning_rate": 2.2507360172933694e-06, "loss": 0.77768064, "num_input_tokens_seen": 170262470, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.40820312, "step": 7920, "time_per_iteration": 2.488020896911621 }, { "auxiliary_loss_clip": 0.01063272, "auxiliary_loss_mlp": 0.01041352, "balance_loss_clip": 1.01399398, "balance_loss_mlp": 1.02147782, "epoch": 0.4762362843829851, "flos": 24132663843840.0, "grad_norm": 1.8783873190070004, "language_loss": 0.7823692, "learning_rate": 2.2503496235932487e-06, "loss": 0.80341542, "num_input_tokens_seen": 170283460, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41796875, "step": 7921, "time_per_iteration": 2.416640520095825 }, { "auxiliary_loss_clip": 0.01061202, "auxiliary_loss_mlp": 0.01040119, "balance_loss_clip": 1.01355922, "balance_loss_mlp": 1.02138436, "epoch": 0.47629640763565306, "flos": 22451072342400.0, "grad_norm": 1.492012331170785, "language_loss": 0.79174507, "learning_rate": 2.249963220399845e-06, "loss": 0.81275827, "num_input_tokens_seen": 170304225, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.3984375, "step": 7922, "time_per_iteration": 2.4559710025787354 }, { "auxiliary_loss_clip": 0.0106218, "auxiliary_loss_mlp": 0.01044443, "balance_loss_clip": 1.01698923, "balance_loss_mlp": 1.02121449, "epoch": 0.4763565308883211, "flos": 11180619907200.0, "grad_norm": 1.6958723170596361, "language_loss": 0.73853838, "learning_rate": 2.2495768077278104e-06, "loss": 0.75960457, "num_input_tokens_seen": 170322110, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41015625, "step": 7923, "time_per_iteration": 2.347822666168213 }, { "auxiliary_loss_clip": 0.01060632, "auxiliary_loss_mlp": 0.01040287, "balance_loss_clip": 1.01663613, "balance_loss_mlp": 1.02072465, "epoch": 0.47641665414098905, "flos": 22381071333120.0, "grad_norm": 1.7167714609210076, "language_loss": 0.83854806, "learning_rate": 2.2491903855917992e-06, "loss": 0.85955721, "num_input_tokens_seen": 170340700, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3984375, "step": 7924, "time_per_iteration": 2.4255478382110596 }, { "auxiliary_loss_clip": 0.01064913, "auxiliary_loss_mlp": 0.01049732, "balance_loss_clip": 1.02028751, "balance_loss_mlp": 1.02201819, "epoch": 0.476476777393657, "flos": 25044979248000.0, "grad_norm": 1.707890724411671, "language_loss": 0.8188948, "learning_rate": 2.2488039540064626e-06, "loss": 0.84004128, "num_input_tokens_seen": 170359780, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.4296875, "step": 7925, "time_per_iteration": 2.5074949264526367 }, { "auxiliary_loss_clip": 0.01059375, "auxiliary_loss_mlp": 0.01044527, "balance_loss_clip": 1.01948082, "balance_loss_mlp": 1.02027202, "epoch": 0.476536900646325, "flos": 27268737252480.0, "grad_norm": 1.652447227243215, "language_loss": 0.73637593, "learning_rate": 2.2484175129864558e-06, "loss": 0.75741488, "num_input_tokens_seen": 170381260, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.390625, "step": 7926, "time_per_iteration": 2.510043144226074 }, { "auxiliary_loss_clip": 0.01062578, "auxiliary_loss_mlp": 0.01041378, "balance_loss_clip": 1.01474714, "balance_loss_mlp": 1.02122426, "epoch": 0.47659702389899294, "flos": 25300229264640.0, "grad_norm": 1.8511155592438404, "language_loss": 0.7035659, "learning_rate": 2.248031062546432e-06, "loss": 0.72460544, "num_input_tokens_seen": 170400595, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.4140625, "step": 7927, "time_per_iteration": 3.806177854537964 }, { "auxiliary_loss_clip": 0.01057988, "auxiliary_loss_mlp": 0.01039726, "balance_loss_clip": 1.01701641, "balance_loss_mlp": 1.0196979, "epoch": 0.4766571471516609, "flos": 25991719119360.0, "grad_norm": 1.5316927208180362, "language_loss": 0.68599045, "learning_rate": 2.247644602701045e-06, "loss": 0.70696765, "num_input_tokens_seen": 170421110, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.3828125, "step": 7928, "time_per_iteration": 3.814880609512329 }, { "auxiliary_loss_clip": 0.01061353, "auxiliary_loss_mlp": 0.01047646, "balance_loss_clip": 1.02041864, "balance_loss_mlp": 1.02090621, "epoch": 0.4767172704043289, "flos": 16031347741440.0, "grad_norm": 1.9185391139380377, "language_loss": 0.79401451, "learning_rate": 2.2472581334649496e-06, "loss": 0.81510448, "num_input_tokens_seen": 170436700, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40429688, "step": 7929, "time_per_iteration": 2.3753602504730225 }, { "auxiliary_loss_clip": 0.0105974, "auxiliary_loss_mlp": 0.010439, "balance_loss_clip": 1.01952219, "balance_loss_mlp": 1.02062917, "epoch": 0.47677739365699684, "flos": 39233891387520.0, "grad_norm": 1.8064559585972237, "language_loss": 0.67381382, "learning_rate": 2.2468716548528016e-06, "loss": 0.69485027, "num_input_tokens_seen": 170459555, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.390625, "step": 7930, "time_per_iteration": 3.8612468242645264 }, { "auxiliary_loss_clip": 0.01058361, "auxiliary_loss_mlp": 0.01046235, "balance_loss_clip": 1.02310836, "balance_loss_mlp": 1.0190506, "epoch": 0.4768375169096648, "flos": 24716621111040.0, "grad_norm": 1.8157186513081014, "language_loss": 0.80433702, "learning_rate": 2.2464851668792555e-06, "loss": 0.82538295, "num_input_tokens_seen": 170479175, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.39453125, "step": 7931, "time_per_iteration": 2.427858591079712 }, { "auxiliary_loss_clip": 0.01060011, "auxiliary_loss_mlp": 0.01045125, "balance_loss_clip": 1.01849413, "balance_loss_mlp": 1.02050138, "epoch": 0.47689764016233277, "flos": 22527566864640.0, "grad_norm": 1.807340716065254, "language_loss": 0.77676904, "learning_rate": 2.2460986695589678e-06, "loss": 0.79782039, "num_input_tokens_seen": 170498450, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39453125, "step": 7932, "time_per_iteration": 2.3967626094818115 }, { "auxiliary_loss_clip": 0.01057494, "auxiliary_loss_mlp": 0.01043437, "balance_loss_clip": 1.01806927, "balance_loss_mlp": 1.01998055, "epoch": 0.47695776341500074, "flos": 15119765475840.0, "grad_norm": 1.7504829617371402, "language_loss": 0.81167185, "learning_rate": 2.245712162906593e-06, "loss": 0.83268118, "num_input_tokens_seen": 170516255, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.375, "step": 7933, "time_per_iteration": 2.5742087364196777 }, { "auxiliary_loss_clip": 0.01064573, "auxiliary_loss_mlp": 0.01049864, "balance_loss_clip": 1.01951337, "balance_loss_mlp": 1.02044344, "epoch": 0.4770178866676687, "flos": 14678184199680.0, "grad_norm": 1.915105015780136, "language_loss": 0.75262022, "learning_rate": 2.2453256469367888e-06, "loss": 0.77376461, "num_input_tokens_seen": 170532705, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.44140625, "step": 7934, "time_per_iteration": 2.339354991912842 }, { "auxiliary_loss_clip": 0.01062312, "auxiliary_loss_mlp": 0.01044906, "balance_loss_clip": 1.01744008, "balance_loss_mlp": 1.02021837, "epoch": 0.47707800992033667, "flos": 22564470038400.0, "grad_norm": 1.7894954610540696, "language_loss": 0.81110859, "learning_rate": 2.244939121664211e-06, "loss": 0.83218074, "num_input_tokens_seen": 170551925, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.421875, "step": 7935, "time_per_iteration": 2.4033567905426025 }, { "auxiliary_loss_clip": 0.01064807, "auxiliary_loss_mlp": 0.01050696, "balance_loss_clip": 1.02156138, "balance_loss_mlp": 1.0201534, "epoch": 0.4771381331730047, "flos": 30916951528320.0, "grad_norm": 1.70908421706898, "language_loss": 0.72219789, "learning_rate": 2.2445525871035177e-06, "loss": 0.74335289, "num_input_tokens_seen": 170572320, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.44726562, "step": 7936, "time_per_iteration": 3.949907064437866 }, { "auxiliary_loss_clip": 0.01061948, "auxiliary_loss_mlp": 0.01044688, "balance_loss_clip": 1.01668537, "balance_loss_mlp": 1.01948583, "epoch": 0.47719825642567265, "flos": 25737725911680.0, "grad_norm": 2.0316751095921783, "language_loss": 0.68567377, "learning_rate": 2.2441660432693656e-06, "loss": 0.70674014, "num_input_tokens_seen": 170589470, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.42578125, "step": 7937, "time_per_iteration": 2.4147164821624756 }, { "auxiliary_loss_clip": 0.01012546, "auxiliary_loss_mlp": 0.01008079, "balance_loss_clip": 1.005445, "balance_loss_mlp": 1.00438857, "epoch": 0.4772583796783406, "flos": 66351592765440.0, "grad_norm": 0.7146660475141763, "language_loss": 0.56536514, "learning_rate": 2.2437794901764128e-06, "loss": 0.58557135, "num_input_tokens_seen": 170662265, "router_z_loss_clip": 0.02636719, "router_z_loss_mlp": 0.08154297, "step": 7938, "time_per_iteration": 3.1767678260803223 }, { "auxiliary_loss_clip": 0.01063052, "auxiliary_loss_mlp": 0.01040578, "balance_loss_clip": 1.01201534, "balance_loss_mlp": 1.02129292, "epoch": 0.4773185029310086, "flos": 22050094844160.0, "grad_norm": 1.7128713193066494, "language_loss": 0.90203929, "learning_rate": 2.243392927839317e-06, "loss": 0.92307556, "num_input_tokens_seen": 170679680, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41796875, "step": 7939, "time_per_iteration": 2.3885273933410645 }, { "auxiliary_loss_clip": 0.010623, "auxiliary_loss_mlp": 0.01046321, "balance_loss_clip": 1.0201546, "balance_loss_mlp": 1.02073681, "epoch": 0.47737862618367655, "flos": 16726852402560.0, "grad_norm": 1.9393402592246656, "language_loss": 0.78655005, "learning_rate": 2.2430063562727367e-06, "loss": 0.80763626, "num_input_tokens_seen": 170697340, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.41601562, "step": 7940, "time_per_iteration": 2.367142915725708 }, { "auxiliary_loss_clip": 0.0105999, "auxiliary_loss_mlp": 0.01042567, "balance_loss_clip": 1.0178194, "balance_loss_mlp": 1.02121401, "epoch": 0.4774387494363445, "flos": 19608443844480.0, "grad_norm": 1.6866227812753702, "language_loss": 0.85956085, "learning_rate": 2.2426197754913322e-06, "loss": 0.8805865, "num_input_tokens_seen": 170714905, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.38671875, "step": 7941, "time_per_iteration": 2.390547513961792 }, { "auxiliary_loss_clip": 0.01064488, "auxiliary_loss_mlp": 0.01045942, "balance_loss_clip": 1.01711738, "balance_loss_mlp": 1.02161062, "epoch": 0.4774988726890125, "flos": 16653046055040.0, "grad_norm": 1.77094756236938, "language_loss": 0.77148718, "learning_rate": 2.24223318550976e-06, "loss": 0.79259145, "num_input_tokens_seen": 170731810, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.42773438, "step": 7942, "time_per_iteration": 2.3902204036712646 }, { "auxiliary_loss_clip": 0.0106327, "auxiliary_loss_mlp": 0.01050735, "balance_loss_clip": 1.02350736, "balance_loss_mlp": 1.02199805, "epoch": 0.47755899594168044, "flos": 20484519390720.0, "grad_norm": 1.754314422679348, "language_loss": 0.64877015, "learning_rate": 2.241846586342682e-06, "loss": 0.66991019, "num_input_tokens_seen": 170750270, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.41210938, "step": 7943, "time_per_iteration": 2.3747384548187256 }, { "auxiliary_loss_clip": 0.01064407, "auxiliary_loss_mlp": 0.01047097, "balance_loss_clip": 1.01762831, "balance_loss_mlp": 1.02122486, "epoch": 0.4776191191943484, "flos": 21651735697920.0, "grad_norm": 1.6469644641716175, "language_loss": 0.75208992, "learning_rate": 2.2414599780047577e-06, "loss": 0.77320504, "num_input_tokens_seen": 170769015, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.43164062, "step": 7944, "time_per_iteration": 2.4242780208587646 }, { "auxiliary_loss_clip": 0.0106384, "auxiliary_loss_mlp": 0.01048116, "balance_loss_clip": 1.01938629, "balance_loss_mlp": 1.02157235, "epoch": 0.4776792424470164, "flos": 18769236560640.0, "grad_norm": 1.9983627527367607, "language_loss": 0.69708908, "learning_rate": 2.2410733605106456e-06, "loss": 0.71820867, "num_input_tokens_seen": 170785725, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.421875, "step": 7945, "time_per_iteration": 2.3689916133880615 }, { "auxiliary_loss_clip": 0.01061631, "auxiliary_loss_mlp": 0.01041813, "balance_loss_clip": 1.01478791, "balance_loss_mlp": 1.01950097, "epoch": 0.47773936569968434, "flos": 29714542704000.0, "grad_norm": 2.0508922019754694, "language_loss": 0.7703656, "learning_rate": 2.240686733875009e-06, "loss": 0.79140007, "num_input_tokens_seen": 170804600, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.421875, "step": 7946, "time_per_iteration": 2.4657633304595947 }, { "auxiliary_loss_clip": 0.01066322, "auxiliary_loss_mlp": 0.01055389, "balance_loss_clip": 1.02357221, "balance_loss_mlp": 1.02274418, "epoch": 0.4777994889523523, "flos": 24790357635840.0, "grad_norm": 1.7950525779694382, "language_loss": 0.80673069, "learning_rate": 2.240300098112506e-06, "loss": 0.82794774, "num_input_tokens_seen": 170824230, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.43554688, "step": 7947, "time_per_iteration": 2.417748212814331 }, { "auxiliary_loss_clip": 0.01060142, "auxiliary_loss_mlp": 0.01043863, "balance_loss_clip": 1.01470399, "balance_loss_mlp": 1.01977658, "epoch": 0.47785961220502027, "flos": 17857200447360.0, "grad_norm": 2.0710250440307076, "language_loss": 0.74775869, "learning_rate": 2.2399134532377998e-06, "loss": 0.76879871, "num_input_tokens_seen": 170843365, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.40429688, "step": 7948, "time_per_iteration": 2.3824448585510254 }, { "auxiliary_loss_clip": 0.0106357, "auxiliary_loss_mlp": 0.01040549, "balance_loss_clip": 1.01258254, "balance_loss_mlp": 1.0217768, "epoch": 0.4779197354576883, "flos": 20265509232000.0, "grad_norm": 1.5179100543911805, "language_loss": 0.79379153, "learning_rate": 2.2395267992655514e-06, "loss": 0.81483269, "num_input_tokens_seen": 170863515, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41796875, "step": 7949, "time_per_iteration": 2.3861618041992188 }, { "auxiliary_loss_clip": 0.01060648, "auxiliary_loss_mlp": 0.01041663, "balance_loss_clip": 1.01593721, "balance_loss_mlp": 1.0208683, "epoch": 0.47797985871035625, "flos": 17055629475840.0, "grad_norm": 2.272126439049425, "language_loss": 0.7621702, "learning_rate": 2.2391401362104227e-06, "loss": 0.78319341, "num_input_tokens_seen": 170881245, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3984375, "step": 7950, "time_per_iteration": 2.3618481159210205 }, { "auxiliary_loss_clip": 0.01061488, "auxiliary_loss_mlp": 0.01047126, "balance_loss_clip": 1.01868272, "balance_loss_mlp": 1.02065134, "epoch": 0.4780399819630242, "flos": 31357066527360.0, "grad_norm": 1.5513406557281755, "language_loss": 0.75801873, "learning_rate": 2.2387534640870756e-06, "loss": 0.77910483, "num_input_tokens_seen": 170901285, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40820312, "step": 7951, "time_per_iteration": 2.4895410537719727 }, { "auxiliary_loss_clip": 0.01063456, "auxiliary_loss_mlp": 0.01044256, "balance_loss_clip": 1.01595592, "balance_loss_mlp": 1.02039564, "epoch": 0.4781001052156922, "flos": 24898448805120.0, "grad_norm": 2.0924915635689807, "language_loss": 0.82129991, "learning_rate": 2.238366782910174e-06, "loss": 0.84237707, "num_input_tokens_seen": 170919740, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.43164062, "step": 7952, "time_per_iteration": 2.400447368621826 }, { "auxiliary_loss_clip": 0.01062824, "auxiliary_loss_mlp": 0.01043737, "balance_loss_clip": 1.01495957, "balance_loss_mlp": 1.02025104, "epoch": 0.47816022846836015, "flos": 18696721933440.0, "grad_norm": 2.039262991219003, "language_loss": 0.80140108, "learning_rate": 2.23798009269438e-06, "loss": 0.82246673, "num_input_tokens_seen": 170938510, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.42578125, "step": 7953, "time_per_iteration": 2.380124092102051 }, { "auxiliary_loss_clip": 0.01063391, "auxiliary_loss_mlp": 0.01043345, "balance_loss_clip": 1.0142343, "balance_loss_mlp": 1.02036643, "epoch": 0.4782203517210281, "flos": 11976954174720.0, "grad_norm": 2.0630334579386953, "language_loss": 0.85164297, "learning_rate": 2.2375933934543566e-06, "loss": 0.87271035, "num_input_tokens_seen": 170951170, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.4296875, "step": 7954, "time_per_iteration": 2.3169827461242676 }, { "auxiliary_loss_clip": 0.01062229, "auxiliary_loss_mlp": 0.01044434, "balance_loss_clip": 1.01386893, "balance_loss_mlp": 1.02058673, "epoch": 0.4782804749736961, "flos": 20812458591360.0, "grad_norm": 1.3724927982050883, "language_loss": 0.71145171, "learning_rate": 2.237206685204768e-06, "loss": 0.73251832, "num_input_tokens_seen": 170970990, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.41601562, "step": 7955, "time_per_iteration": 2.398909568786621 }, { "auxiliary_loss_clip": 0.01061837, "auxiliary_loss_mlp": 0.01042603, "balance_loss_clip": 1.01393282, "balance_loss_mlp": 1.01999807, "epoch": 0.47834059822636404, "flos": 23839218933120.0, "grad_norm": 1.5277184697611863, "language_loss": 0.83092427, "learning_rate": 2.2368199679602787e-06, "loss": 0.85196871, "num_input_tokens_seen": 170991215, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.41796875, "step": 7956, "time_per_iteration": 2.391289234161377 }, { "auxiliary_loss_clip": 0.01060208, "auxiliary_loss_mlp": 0.01044911, "balance_loss_clip": 1.01739788, "balance_loss_mlp": 1.02017093, "epoch": 0.478400721479032, "flos": 22632795302400.0, "grad_norm": 1.7793498693584444, "language_loss": 0.85705292, "learning_rate": 2.2364332417355516e-06, "loss": 0.87810415, "num_input_tokens_seen": 171007325, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.40039062, "step": 7957, "time_per_iteration": 2.3942055702209473 }, { "auxiliary_loss_clip": 0.0106039, "auxiliary_loss_mlp": 0.01040046, "balance_loss_clip": 1.01447558, "balance_loss_mlp": 1.01967001, "epoch": 0.4784608447317, "flos": 19353926966400.0, "grad_norm": 1.5830615652275102, "language_loss": 0.80698413, "learning_rate": 2.2360465065452527e-06, "loss": 0.82798851, "num_input_tokens_seen": 171025650, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.40625, "step": 7958, "time_per_iteration": 2.381096839904785 }, { "auxiliary_loss_clip": 0.01059264, "auxiliary_loss_mlp": 0.01044364, "balance_loss_clip": 1.01637328, "balance_loss_mlp": 1.01863146, "epoch": 0.47852096798436794, "flos": 24020069109120.0, "grad_norm": 2.2716173378074718, "language_loss": 0.84527659, "learning_rate": 2.235659762404047e-06, "loss": 0.86631286, "num_input_tokens_seen": 171045045, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40625, "step": 7959, "time_per_iteration": 2.4463672637939453 }, { "auxiliary_loss_clip": 0.01059807, "auxiliary_loss_mlp": 0.0104386, "balance_loss_clip": 1.01715708, "balance_loss_mlp": 1.02043223, "epoch": 0.4785810912370359, "flos": 25665246195840.0, "grad_norm": 2.1401209338887774, "language_loss": 0.74313933, "learning_rate": 2.235273009326599e-06, "loss": 0.76417601, "num_input_tokens_seen": 171062910, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.39453125, "step": 7960, "time_per_iteration": 2.4222304821014404 }, { "auxiliary_loss_clip": 0.01059299, "auxiliary_loss_mlp": 0.01040775, "balance_loss_clip": 1.01470411, "balance_loss_mlp": 1.01977968, "epoch": 0.47864121448970387, "flos": 21431119616640.0, "grad_norm": 1.617371804236592, "language_loss": 0.7816987, "learning_rate": 2.2348862473275745e-06, "loss": 0.80269945, "num_input_tokens_seen": 171080875, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.39453125, "step": 7961, "time_per_iteration": 2.4220728874206543 }, { "auxiliary_loss_clip": 0.01059034, "auxiliary_loss_mlp": 0.01041411, "balance_loss_clip": 1.01412368, "balance_loss_mlp": 1.01900935, "epoch": 0.47870133774237184, "flos": 16142964958080.0, "grad_norm": 1.6092551590337072, "language_loss": 0.79346073, "learning_rate": 2.2344994764216405e-06, "loss": 0.81446517, "num_input_tokens_seen": 171099190, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40039062, "step": 7962, "time_per_iteration": 2.3809974193573 }, { "auxiliary_loss_clip": 0.01061262, "auxiliary_loss_mlp": 0.0104029, "balance_loss_clip": 1.01215696, "balance_loss_mlp": 1.02027321, "epoch": 0.47876146099503986, "flos": 26905570623360.0, "grad_norm": 1.6836193942477087, "language_loss": 0.66574401, "learning_rate": 2.2341126966234635e-06, "loss": 0.68675953, "num_input_tokens_seen": 171119060, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41015625, "step": 7963, "time_per_iteration": 2.5148062705993652 }, { "auxiliary_loss_clip": 0.01058818, "auxiliary_loss_mlp": 0.01041619, "balance_loss_clip": 1.01507103, "balance_loss_mlp": 1.01917386, "epoch": 0.4788215842477078, "flos": 45330354910080.0, "grad_norm": 1.7615332186158594, "language_loss": 0.79694748, "learning_rate": 2.2337259079477083e-06, "loss": 0.81795186, "num_input_tokens_seen": 171141900, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39648438, "step": 7964, "time_per_iteration": 2.6183600425720215 }, { "auxiliary_loss_clip": 0.01061554, "auxiliary_loss_mlp": 0.01045717, "balance_loss_clip": 1.01527071, "balance_loss_mlp": 1.01885378, "epoch": 0.4788817075003758, "flos": 22236076990080.0, "grad_norm": 1.7904226823603722, "language_loss": 0.77887094, "learning_rate": 2.233339110409044e-06, "loss": 0.79994363, "num_input_tokens_seen": 171161045, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.42773438, "step": 7965, "time_per_iteration": 2.4420006275177 }, { "auxiliary_loss_clip": 0.01059836, "auxiliary_loss_mlp": 0.01044794, "balance_loss_clip": 1.01667213, "balance_loss_mlp": 1.01885939, "epoch": 0.47894183075304375, "flos": 16470275754240.0, "grad_norm": 1.635084174245906, "language_loss": 0.76080096, "learning_rate": 2.232952304022137e-06, "loss": 0.78184724, "num_input_tokens_seen": 171179675, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41015625, "step": 7966, "time_per_iteration": 2.384932041168213 }, { "auxiliary_loss_clip": 0.01061666, "auxiliary_loss_mlp": 0.0104384, "balance_loss_clip": 1.01482487, "balance_loss_mlp": 1.0199405, "epoch": 0.4790019540057117, "flos": 24281463525120.0, "grad_norm": 1.6441350949909974, "language_loss": 0.74409735, "learning_rate": 2.232565488801655e-06, "loss": 0.76515245, "num_input_tokens_seen": 171201175, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41796875, "step": 7967, "time_per_iteration": 3.8738393783569336 }, { "auxiliary_loss_clip": 0.0105576, "auxiliary_loss_mlp": 0.01042088, "balance_loss_clip": 1.01611257, "balance_loss_mlp": 1.0180409, "epoch": 0.4790620772583797, "flos": 25665281107200.0, "grad_norm": 1.9835282571599955, "language_loss": 0.8067832, "learning_rate": 2.232178664762267e-06, "loss": 0.82776171, "num_input_tokens_seen": 171221750, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37695312, "step": 7968, "time_per_iteration": 3.81965708732605 }, { "auxiliary_loss_clip": 0.01012443, "auxiliary_loss_mlp": 0.01011891, "balance_loss_clip": 1.00924408, "balance_loss_mlp": 1.00415611, "epoch": 0.47912220051104765, "flos": 69424228500480.0, "grad_norm": 0.7766479473014086, "language_loss": 0.62345088, "learning_rate": 2.2317918319186408e-06, "loss": 0.64369422, "num_input_tokens_seen": 171292235, "router_z_loss_clip": 0.02648926, "router_z_loss_mlp": 0.08300781, "step": 7969, "time_per_iteration": 3.183363676071167 }, { "auxiliary_loss_clip": 0.01058515, "auxiliary_loss_mlp": 0.01044672, "balance_loss_clip": 1.01900673, "balance_loss_mlp": 1.01913476, "epoch": 0.4791823237637156, "flos": 24167821449600.0, "grad_norm": 1.3514346964038169, "language_loss": 0.78355289, "learning_rate": 2.2314049902854446e-06, "loss": 0.80458474, "num_input_tokens_seen": 171312215, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.39453125, "step": 7970, "time_per_iteration": 3.8085100650787354 }, { "auxiliary_loss_clip": 0.01060362, "auxiliary_loss_mlp": 0.01046235, "balance_loss_clip": 1.0174222, "balance_loss_mlp": 1.0193373, "epoch": 0.4792424470163836, "flos": 24750382262400.0, "grad_norm": 1.529345933331152, "language_loss": 0.7093336, "learning_rate": 2.231018139877349e-06, "loss": 0.73039961, "num_input_tokens_seen": 171332975, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41015625, "step": 7971, "time_per_iteration": 2.4606971740722656 }, { "auxiliary_loss_clip": 0.0105912, "auxiliary_loss_mlp": 0.01043842, "balance_loss_clip": 1.0153749, "balance_loss_mlp": 1.01838326, "epoch": 0.47930257026905154, "flos": 23256797765760.0, "grad_norm": 1.3157229223490616, "language_loss": 0.80565399, "learning_rate": 2.230631280709021e-06, "loss": 0.82668364, "num_input_tokens_seen": 171353880, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40820312, "step": 7972, "time_per_iteration": 2.4166030883789062 }, { "auxiliary_loss_clip": 0.0106075, "auxiliary_loss_mlp": 0.01043041, "balance_loss_clip": 1.0149076, "balance_loss_mlp": 1.01902723, "epoch": 0.4793626935217195, "flos": 14063223778560.0, "grad_norm": 2.169576561754285, "language_loss": 0.7101447, "learning_rate": 2.2302444127951327e-06, "loss": 0.73118258, "num_input_tokens_seen": 171370930, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41796875, "step": 7973, "time_per_iteration": 2.4202895164489746 }, { "auxiliary_loss_clip": 0.01058399, "auxiliary_loss_mlp": 0.01040641, "balance_loss_clip": 1.0155952, "balance_loss_mlp": 1.02019405, "epoch": 0.4794228167743875, "flos": 21797777381760.0, "grad_norm": 1.7654522865993942, "language_loss": 0.80228341, "learning_rate": 2.2298575361503523e-06, "loss": 0.82327378, "num_input_tokens_seen": 171387575, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38085938, "step": 7974, "time_per_iteration": 2.4075348377227783 }, { "auxiliary_loss_clip": 0.01013917, "auxiliary_loss_mlp": 0.01006328, "balance_loss_clip": 1.00346696, "balance_loss_mlp": 1.00598717, "epoch": 0.47948294002705544, "flos": 66965436023040.0, "grad_norm": 0.7531833735395026, "language_loss": 0.54119623, "learning_rate": 2.2294706507893517e-06, "loss": 0.56139874, "num_input_tokens_seen": 171449980, "router_z_loss_clip": 0.02856445, "router_z_loss_mlp": 0.07910156, "step": 7975, "time_per_iteration": 3.1026434898376465 }, { "auxiliary_loss_clip": 0.01064634, "auxiliary_loss_mlp": 0.01053456, "balance_loss_clip": 1.02312875, "balance_loss_mlp": 1.01979208, "epoch": 0.47954306327972346, "flos": 12421642561920.0, "grad_norm": 2.325431168621703, "language_loss": 0.90496719, "learning_rate": 2.2290837567268008e-06, "loss": 0.92614806, "num_input_tokens_seen": 171465290, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.44921875, "step": 7976, "time_per_iteration": 3.7761037349700928 }, { "auxiliary_loss_clip": 0.01063131, "auxiliary_loss_mlp": 0.01045752, "balance_loss_clip": 1.01403058, "balance_loss_mlp": 1.02013135, "epoch": 0.4796031865323914, "flos": 18361172056320.0, "grad_norm": 2.3733002177993865, "language_loss": 0.75489306, "learning_rate": 2.2286968539773713e-06, "loss": 0.7759819, "num_input_tokens_seen": 171481130, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.4296875, "step": 7977, "time_per_iteration": 2.4004733562469482 }, { "auxiliary_loss_clip": 0.01058709, "auxiliary_loss_mlp": 0.01040914, "balance_loss_clip": 1.01433015, "balance_loss_mlp": 1.01919246, "epoch": 0.4796633097850594, "flos": 21834017239680.0, "grad_norm": 1.614953109361554, "language_loss": 0.79110324, "learning_rate": 2.228309942555734e-06, "loss": 0.81209946, "num_input_tokens_seen": 171501140, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39453125, "step": 7978, "time_per_iteration": 2.3940184116363525 }, { "auxiliary_loss_clip": 0.01060351, "auxiliary_loss_mlp": 0.01048637, "balance_loss_clip": 1.02089667, "balance_loss_mlp": 1.01930285, "epoch": 0.47972343303772735, "flos": 23436321310080.0, "grad_norm": 1.618265869151182, "language_loss": 0.90060294, "learning_rate": 2.22792302247656e-06, "loss": 0.92169279, "num_input_tokens_seen": 171519835, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41015625, "step": 7979, "time_per_iteration": 2.432506799697876 }, { "auxiliary_loss_clip": 0.01062635, "auxiliary_loss_mlp": 0.01044551, "balance_loss_clip": 1.0158689, "balance_loss_mlp": 1.01988387, "epoch": 0.4797835562903953, "flos": 24898623361920.0, "grad_norm": 1.504373453106211, "language_loss": 0.77306986, "learning_rate": 2.227536093754523e-06, "loss": 0.79414171, "num_input_tokens_seen": 171540980, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.42773438, "step": 7980, "time_per_iteration": 2.4225895404815674 }, { "auxiliary_loss_clip": 0.01064015, "auxiliary_loss_mlp": 0.01041311, "balance_loss_clip": 1.0094465, "balance_loss_mlp": 1.02082705, "epoch": 0.4798436795430633, "flos": 35041555572480.0, "grad_norm": 1.6206162487552431, "language_loss": 0.73755252, "learning_rate": 2.227149156404295e-06, "loss": 0.75860572, "num_input_tokens_seen": 171563600, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 0.43164062, "step": 7981, "time_per_iteration": 2.55383038520813 }, { "auxiliary_loss_clip": 0.01060655, "auxiliary_loss_mlp": 0.01038813, "balance_loss_clip": 1.01227665, "balance_loss_mlp": 1.02098441, "epoch": 0.47990380279573125, "flos": 20589293980800.0, "grad_norm": 1.841161528644146, "language_loss": 0.70978743, "learning_rate": 2.2267622104405473e-06, "loss": 0.73078215, "num_input_tokens_seen": 171580700, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.3984375, "step": 7982, "time_per_iteration": 2.3786492347717285 }, { "auxiliary_loss_clip": 0.01056963, "auxiliary_loss_mlp": 0.01042068, "balance_loss_clip": 1.01770175, "balance_loss_mlp": 1.01982927, "epoch": 0.4799639260483992, "flos": 26358202327680.0, "grad_norm": 1.6737597323687665, "language_loss": 0.7253623, "learning_rate": 2.2263752558779544e-06, "loss": 0.74635255, "num_input_tokens_seen": 171602035, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37109375, "step": 7983, "time_per_iteration": 2.4813530445098877 }, { "auxiliary_loss_clip": 0.0101457, "auxiliary_loss_mlp": 0.01018907, "balance_loss_clip": 1.01626039, "balance_loss_mlp": 1.00590038, "epoch": 0.4800240493010672, "flos": 70975629941760.0, "grad_norm": 0.8086045979361574, "language_loss": 0.59554565, "learning_rate": 2.2259882927311883e-06, "loss": 0.61588043, "num_input_tokens_seen": 171659215, "router_z_loss_clip": 0.02648926, "router_z_loss_mlp": 0.08691406, "step": 7984, "time_per_iteration": 2.981422185897827 }, { "auxiliary_loss_clip": 0.01060068, "auxiliary_loss_mlp": 0.01049052, "balance_loss_clip": 1.02139497, "balance_loss_mlp": 1.01925886, "epoch": 0.48008417255373514, "flos": 17085864579840.0, "grad_norm": 1.747363970224287, "language_loss": 0.68021798, "learning_rate": 2.2256013210149247e-06, "loss": 0.70130926, "num_input_tokens_seen": 171675710, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40820312, "step": 7985, "time_per_iteration": 2.439094066619873 }, { "auxiliary_loss_clip": 0.01062362, "auxiliary_loss_mlp": 0.01046268, "balance_loss_clip": 1.01699018, "balance_loss_mlp": 1.01990592, "epoch": 0.4801442958064031, "flos": 15412547070720.0, "grad_norm": 1.7999700993751266, "language_loss": 0.72314841, "learning_rate": 2.225214340743835e-06, "loss": 0.74423468, "num_input_tokens_seen": 171692510, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.42382812, "step": 7986, "time_per_iteration": 2.357844114303589 }, { "auxiliary_loss_clip": 0.01063716, "auxiliary_loss_mlp": 0.01057923, "balance_loss_clip": 1.02901459, "balance_loss_mlp": 1.01976895, "epoch": 0.4802044190590711, "flos": 11472947654400.0, "grad_norm": 2.406035625379, "language_loss": 0.8104918, "learning_rate": 2.2248273519325956e-06, "loss": 0.83170819, "num_input_tokens_seen": 171710235, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.43945312, "step": 7987, "time_per_iteration": 2.4091708660125732 }, { "auxiliary_loss_clip": 0.01060186, "auxiliary_loss_mlp": 0.01051332, "balance_loss_clip": 1.02292442, "balance_loss_mlp": 1.01883626, "epoch": 0.48026454231173904, "flos": 20950191371520.0, "grad_norm": 2.0826859362368895, "language_loss": 0.7618891, "learning_rate": 2.2244403545958812e-06, "loss": 0.78300428, "num_input_tokens_seen": 171726715, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4140625, "step": 7988, "time_per_iteration": 2.3661227226257324 }, { "auxiliary_loss_clip": 0.01063115, "auxiliary_loss_mlp": 0.01043499, "balance_loss_clip": 1.01290989, "balance_loss_mlp": 1.02055001, "epoch": 0.48032466556440706, "flos": 20447092546560.0, "grad_norm": 2.0915148262583685, "language_loss": 0.80472958, "learning_rate": 2.224053348748365e-06, "loss": 0.82579565, "num_input_tokens_seen": 171743605, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.42578125, "step": 7989, "time_per_iteration": 2.4025328159332275 }, { "auxiliary_loss_clip": 0.01063302, "auxiliary_loss_mlp": 0.01054335, "balance_loss_clip": 1.02218437, "balance_loss_mlp": 1.01922274, "epoch": 0.480384788817075, "flos": 37119376627200.0, "grad_norm": 1.694231167244884, "language_loss": 0.74831814, "learning_rate": 2.223666334404724e-06, "loss": 0.76949453, "num_input_tokens_seen": 171765445, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.44140625, "step": 7990, "time_per_iteration": 2.5351386070251465 }, { "auxiliary_loss_clip": 0.0101136, "auxiliary_loss_mlp": 0.01004103, "balance_loss_clip": 1.00143242, "balance_loss_mlp": 1.00335741, "epoch": 0.480444912069743, "flos": 69549323368320.0, "grad_norm": 0.7715540747618471, "language_loss": 0.59119737, "learning_rate": 2.223279311579633e-06, "loss": 0.61135197, "num_input_tokens_seen": 171830115, "router_z_loss_clip": 0.0267334, "router_z_loss_mlp": 0.08007812, "step": 7991, "time_per_iteration": 3.1314985752105713 }, { "auxiliary_loss_clip": 0.01059167, "auxiliary_loss_mlp": 0.0104813, "balance_loss_clip": 1.01827955, "balance_loss_mlp": 1.0181191, "epoch": 0.48050503532241096, "flos": 29821027950720.0, "grad_norm": 2.0284037342332204, "language_loss": 0.68455309, "learning_rate": 2.222892280287768e-06, "loss": 0.70562601, "num_input_tokens_seen": 171849135, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.41015625, "step": 7992, "time_per_iteration": 2.50402569770813 }, { "auxiliary_loss_clip": 0.01060077, "auxiliary_loss_mlp": 0.0104621, "balance_loss_clip": 1.01821947, "balance_loss_mlp": 1.01792324, "epoch": 0.4805651585750789, "flos": 23947484659200.0, "grad_norm": 1.8645538036855487, "language_loss": 0.77351332, "learning_rate": 2.2225052405438056e-06, "loss": 0.79457617, "num_input_tokens_seen": 171868880, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.421875, "step": 7993, "time_per_iteration": 2.4181883335113525 }, { "auxiliary_loss_clip": 0.01057612, "auxiliary_loss_mlp": 0.01045691, "balance_loss_clip": 1.01787972, "balance_loss_mlp": 1.01833129, "epoch": 0.4806252818277469, "flos": 25664268677760.0, "grad_norm": 1.7133155469880301, "language_loss": 0.80282724, "learning_rate": 2.222118192362422e-06, "loss": 0.82386023, "num_input_tokens_seen": 171889455, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.39257812, "step": 7994, "time_per_iteration": 2.414649486541748 }, { "auxiliary_loss_clip": 0.01061678, "auxiliary_loss_mlp": 0.01043001, "balance_loss_clip": 1.01524913, "balance_loss_mlp": 1.02055192, "epoch": 0.48068540508041485, "flos": 13151152753920.0, "grad_norm": 1.933677776376886, "language_loss": 0.80957055, "learning_rate": 2.2217311357582946e-06, "loss": 0.83061731, "num_input_tokens_seen": 171906070, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41210938, "step": 7995, "time_per_iteration": 2.414883852005005 }, { "auxiliary_loss_clip": 0.01061661, "auxiliary_loss_mlp": 0.01040608, "balance_loss_clip": 1.01339269, "balance_loss_mlp": 1.02064788, "epoch": 0.4807455283330828, "flos": 21175729954560.0, "grad_norm": 1.423762021082029, "language_loss": 0.84352142, "learning_rate": 2.2213440707461e-06, "loss": 0.86454409, "num_input_tokens_seen": 171926515, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41015625, "step": 7996, "time_per_iteration": 2.4238340854644775 }, { "auxiliary_loss_clip": 0.01061499, "auxiliary_loss_mlp": 0.01041805, "balance_loss_clip": 1.01524556, "balance_loss_mlp": 1.02088451, "epoch": 0.4808056515857508, "flos": 12275181941760.0, "grad_norm": 1.629648839991111, "language_loss": 0.81723809, "learning_rate": 2.220956997340516e-06, "loss": 0.83827114, "num_input_tokens_seen": 171943845, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40625, "step": 7997, "time_per_iteration": 2.4251325130462646 }, { "auxiliary_loss_clip": 0.01060761, "auxiliary_loss_mlp": 0.01042452, "balance_loss_clip": 1.016119, "balance_loss_mlp": 1.01979065, "epoch": 0.48086577483841875, "flos": 24824921748480.0, "grad_norm": 1.6476983165119004, "language_loss": 0.73857147, "learning_rate": 2.220569915556221e-06, "loss": 0.75960362, "num_input_tokens_seen": 171964970, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.41015625, "step": 7998, "time_per_iteration": 2.4322750568389893 }, { "auxiliary_loss_clip": 0.0106139, "auxiliary_loss_mlp": 0.01040056, "balance_loss_clip": 1.01313841, "balance_loss_mlp": 1.02010572, "epoch": 0.4809258980910867, "flos": 24464129091840.0, "grad_norm": 1.8193181126772409, "language_loss": 0.72179097, "learning_rate": 2.220182825407892e-06, "loss": 0.74280536, "num_input_tokens_seen": 171986340, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.4140625, "step": 7999, "time_per_iteration": 2.4700052738189697 }, { "auxiliary_loss_clip": 0.0106265, "auxiliary_loss_mlp": 0.01047317, "balance_loss_clip": 1.01924372, "balance_loss_mlp": 1.02112222, "epoch": 0.4809860213437547, "flos": 21214867455360.0, "grad_norm": 1.434568128841026, "language_loss": 0.72275746, "learning_rate": 2.2197957269102083e-06, "loss": 0.74385709, "num_input_tokens_seen": 172007300, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41601562, "step": 8000, "time_per_iteration": 2.399415969848633 }, { "auxiliary_loss_clip": 0.01064335, "auxiliary_loss_mlp": 0.01040203, "balance_loss_clip": 1.0117476, "balance_loss_mlp": 1.0225805, "epoch": 0.48104614459642264, "flos": 37630609799040.0, "grad_norm": 1.4747237513619706, "language_loss": 0.75711292, "learning_rate": 2.2194086200778485e-06, "loss": 0.77815831, "num_input_tokens_seen": 172029585, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41796875, "step": 8001, "time_per_iteration": 2.5292582511901855 }, { "auxiliary_loss_clip": 0.01061589, "auxiliary_loss_mlp": 0.01043308, "balance_loss_clip": 1.0153296, "balance_loss_mlp": 1.01994359, "epoch": 0.48110626784909066, "flos": 18405127324800.0, "grad_norm": 1.7024790864301895, "language_loss": 0.82216793, "learning_rate": 2.219021504925493e-06, "loss": 0.8432169, "num_input_tokens_seen": 172047495, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41796875, "step": 8002, "time_per_iteration": 2.3702197074890137 }, { "auxiliary_loss_clip": 0.01062448, "auxiliary_loss_mlp": 0.01043527, "balance_loss_clip": 1.01285434, "balance_loss_mlp": 1.02093077, "epoch": 0.48116639110175863, "flos": 28438537000320.0, "grad_norm": 1.7963281434255978, "language_loss": 0.73075515, "learning_rate": 2.218634381467819e-06, "loss": 0.75181484, "num_input_tokens_seen": 172067625, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.4140625, "step": 8003, "time_per_iteration": 2.453856945037842 }, { "auxiliary_loss_clip": 0.01059124, "auxiliary_loss_mlp": 0.01044266, "balance_loss_clip": 1.01969695, "balance_loss_mlp": 1.02090263, "epoch": 0.4812265143544266, "flos": 21724180502400.0, "grad_norm": 1.552743945959638, "language_loss": 0.83509409, "learning_rate": 2.218247249719507e-06, "loss": 0.85612798, "num_input_tokens_seen": 172087885, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.3828125, "step": 8004, "time_per_iteration": 2.386887788772583 }, { "auxiliary_loss_clip": 0.0106818, "auxiliary_loss_mlp": 0.0105335, "balance_loss_clip": 1.02031744, "balance_loss_mlp": 1.0224719, "epoch": 0.48128663760709456, "flos": 13223841937920.0, "grad_norm": 1.9072155334844356, "language_loss": 0.80002147, "learning_rate": 2.217860109695239e-06, "loss": 0.82123679, "num_input_tokens_seen": 172105815, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 0.45703125, "step": 8005, "time_per_iteration": 2.374980926513672 }, { "auxiliary_loss_clip": 0.0106099, "auxiliary_loss_mlp": 0.01049052, "balance_loss_clip": 1.02103758, "balance_loss_mlp": 1.0193224, "epoch": 0.4813467608597625, "flos": 24242291112960.0, "grad_norm": 1.872877695950415, "language_loss": 0.72127068, "learning_rate": 2.217472961409692e-06, "loss": 0.74237108, "num_input_tokens_seen": 172126125, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41796875, "step": 8006, "time_per_iteration": 3.8522467613220215 }, { "auxiliary_loss_clip": 0.01061459, "auxiliary_loss_mlp": 0.01046194, "balance_loss_clip": 1.01933587, "balance_loss_mlp": 1.01970983, "epoch": 0.4814068841124305, "flos": 27479473418880.0, "grad_norm": 2.0719733901746764, "language_loss": 0.71288252, "learning_rate": 2.2170858048775495e-06, "loss": 0.73395902, "num_input_tokens_seen": 172141945, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.41796875, "step": 8007, "time_per_iteration": 3.8084208965301514 }, { "auxiliary_loss_clip": 0.01061291, "auxiliary_loss_mlp": 0.0103897, "balance_loss_clip": 1.01275611, "balance_loss_mlp": 1.01967597, "epoch": 0.48146700736509845, "flos": 19571889784320.0, "grad_norm": 1.7628924614841701, "language_loss": 0.72925931, "learning_rate": 2.2166986401134914e-06, "loss": 0.7502619, "num_input_tokens_seen": 172161095, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.41601562, "step": 8008, "time_per_iteration": 2.4252769947052 }, { "auxiliary_loss_clip": 0.01063055, "auxiliary_loss_mlp": 0.01049505, "balance_loss_clip": 1.01936936, "balance_loss_mlp": 1.02068663, "epoch": 0.4815271306177664, "flos": 20626825559040.0, "grad_norm": 1.7728533209613722, "language_loss": 0.62160355, "learning_rate": 2.216311467132199e-06, "loss": 0.64272916, "num_input_tokens_seen": 172178750, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.42382812, "step": 8009, "time_per_iteration": 3.8353755474090576 }, { "auxiliary_loss_clip": 0.01016746, "auxiliary_loss_mlp": 0.01004499, "balance_loss_clip": 1.00157797, "balance_loss_mlp": 1.00861692, "epoch": 0.4815872538704344, "flos": 67687894120320.0, "grad_norm": 0.8830933847516163, "language_loss": 0.61492455, "learning_rate": 2.2159242859483547e-06, "loss": 0.63513708, "num_input_tokens_seen": 172240235, "router_z_loss_clip": 0.0291748, "router_z_loss_mlp": 0.08105469, "step": 8010, "time_per_iteration": 3.0942647457122803 }, { "auxiliary_loss_clip": 0.01060641, "auxiliary_loss_mlp": 0.01052007, "balance_loss_clip": 1.02256203, "balance_loss_mlp": 1.01913714, "epoch": 0.48164737712310235, "flos": 22819650232320.0, "grad_norm": 1.8451646024837405, "language_loss": 0.74504673, "learning_rate": 2.215537096576639e-06, "loss": 0.76617318, "num_input_tokens_seen": 172259875, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.4140625, "step": 8011, "time_per_iteration": 2.4487714767456055 }, { "auxiliary_loss_clip": 0.0105713, "auxiliary_loss_mlp": 0.01041816, "balance_loss_clip": 1.01848674, "balance_loss_mlp": 1.01791811, "epoch": 0.4817075003757703, "flos": 23732698775040.0, "grad_norm": 1.7095045016521477, "language_loss": 0.80160868, "learning_rate": 2.2151498990317354e-06, "loss": 0.82259816, "num_input_tokens_seen": 172280150, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.390625, "step": 8012, "time_per_iteration": 2.4836971759796143 }, { "auxiliary_loss_clip": 0.0105939, "auxiliary_loss_mlp": 0.01050753, "balance_loss_clip": 1.02272701, "balance_loss_mlp": 1.0186826, "epoch": 0.4817676236284383, "flos": 28181681061120.0, "grad_norm": 1.9165683230687593, "language_loss": 0.75377458, "learning_rate": 2.214762693328326e-06, "loss": 0.774876, "num_input_tokens_seen": 172300810, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40625, "step": 8013, "time_per_iteration": 2.4618914127349854 }, { "auxiliary_loss_clip": 0.01057585, "auxiliary_loss_mlp": 0.0104466, "balance_loss_clip": 1.01931596, "balance_loss_mlp": 1.01822019, "epoch": 0.48182774688110624, "flos": 17090821992960.0, "grad_norm": 2.0389655413353562, "language_loss": 0.92383814, "learning_rate": 2.214375479481094e-06, "loss": 0.94486058, "num_input_tokens_seen": 172317930, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.39453125, "step": 8014, "time_per_iteration": 2.343515396118164 }, { "auxiliary_loss_clip": 0.01062754, "auxiliary_loss_mlp": 0.01049924, "balance_loss_clip": 1.01953804, "balance_loss_mlp": 1.01866388, "epoch": 0.4818878701337742, "flos": 12567055841280.0, "grad_norm": 2.071723259852537, "language_loss": 0.76103985, "learning_rate": 2.213988257504722e-06, "loss": 0.7821666, "num_input_tokens_seen": 172336340, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.44140625, "step": 8015, "time_per_iteration": 2.369561195373535 }, { "auxiliary_loss_clip": 0.01062535, "auxiliary_loss_mlp": 0.01045238, "balance_loss_clip": 1.01669931, "balance_loss_mlp": 1.01953793, "epoch": 0.48194799338644223, "flos": 24607342955520.0, "grad_norm": 2.0135890836369392, "language_loss": 0.8156938, "learning_rate": 2.213601027413894e-06, "loss": 0.83677155, "num_input_tokens_seen": 172354315, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4296875, "step": 8016, "time_per_iteration": 3.8353145122528076 }, { "auxiliary_loss_clip": 0.01059663, "auxiliary_loss_mlp": 0.01040311, "balance_loss_clip": 1.01540852, "balance_loss_mlp": 1.02077579, "epoch": 0.4820081166391102, "flos": 21104157934080.0, "grad_norm": 1.7765442418953632, "language_loss": 0.78748012, "learning_rate": 2.2132137892232933e-06, "loss": 0.80847991, "num_input_tokens_seen": 172372695, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.38867188, "step": 8017, "time_per_iteration": 2.405628204345703 }, { "auxiliary_loss_clip": 0.01058742, "auxiliary_loss_mlp": 0.01038338, "balance_loss_clip": 1.01322079, "balance_loss_mlp": 1.01991296, "epoch": 0.48206823989177816, "flos": 25263430824960.0, "grad_norm": 1.8421142344840846, "language_loss": 0.81302094, "learning_rate": 2.2128265429476043e-06, "loss": 0.83399177, "num_input_tokens_seen": 172390905, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38867188, "step": 8018, "time_per_iteration": 2.3974337577819824 }, { "auxiliary_loss_clip": 0.01059309, "auxiliary_loss_mlp": 0.0104333, "balance_loss_clip": 1.01781893, "balance_loss_mlp": 1.01903677, "epoch": 0.4821283631444461, "flos": 24643897015680.0, "grad_norm": 2.931411638672019, "language_loss": 0.77539766, "learning_rate": 2.2124392886015124e-06, "loss": 0.79642409, "num_input_tokens_seen": 172412295, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.40234375, "step": 8019, "time_per_iteration": 2.4267358779907227 }, { "auxiliary_loss_clip": 0.01060731, "auxiliary_loss_mlp": 0.0104359, "balance_loss_clip": 1.01743543, "balance_loss_mlp": 1.01903856, "epoch": 0.4821884863971141, "flos": 23950940883840.0, "grad_norm": 2.0656954819571034, "language_loss": 0.80130744, "learning_rate": 2.212052026199701e-06, "loss": 0.82235068, "num_input_tokens_seen": 172432625, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.41796875, "step": 8020, "time_per_iteration": 2.416184425354004 }, { "auxiliary_loss_clip": 0.0105909, "auxiliary_loss_mlp": 0.01043928, "balance_loss_clip": 1.01988363, "balance_loss_mlp": 1.02049625, "epoch": 0.48224860964978206, "flos": 17159845484160.0, "grad_norm": 1.7813388599644817, "language_loss": 0.71371877, "learning_rate": 2.211664755756855e-06, "loss": 0.73474896, "num_input_tokens_seen": 172450010, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.38671875, "step": 8021, "time_per_iteration": 2.415107488632202 }, { "auxiliary_loss_clip": 0.01065447, "auxiliary_loss_mlp": 0.01047897, "balance_loss_clip": 1.01773679, "balance_loss_mlp": 1.02267277, "epoch": 0.48230873290245, "flos": 23074725692160.0, "grad_norm": 1.9226023024343395, "language_loss": 0.63644552, "learning_rate": 2.2112774772876603e-06, "loss": 0.657579, "num_input_tokens_seen": 172469080, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.4296875, "step": 8022, "time_per_iteration": 2.396003246307373 }, { "auxiliary_loss_clip": 0.01058628, "auxiliary_loss_mlp": 0.0103667, "balance_loss_clip": 1.01320922, "balance_loss_mlp": 1.01982522, "epoch": 0.482368856155118, "flos": 19352530512000.0, "grad_norm": 2.2151863152926263, "language_loss": 0.67269373, "learning_rate": 2.2108901908068028e-06, "loss": 0.69364673, "num_input_tokens_seen": 172484850, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.38867188, "step": 8023, "time_per_iteration": 2.3957715034484863 }, { "auxiliary_loss_clip": 0.01060887, "auxiliary_loss_mlp": 0.01036185, "balance_loss_clip": 1.01115143, "balance_loss_mlp": 1.02069521, "epoch": 0.48242897940778595, "flos": 20078095720320.0, "grad_norm": 2.445648796893174, "language_loss": 0.77758944, "learning_rate": 2.2105028963289683e-06, "loss": 0.7985602, "num_input_tokens_seen": 172503525, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.40234375, "step": 8024, "time_per_iteration": 2.392923355102539 }, { "auxiliary_loss_clip": 0.01061727, "auxiliary_loss_mlp": 0.01043353, "balance_loss_clip": 1.01620865, "balance_loss_mlp": 1.0214386, "epoch": 0.4824891026604539, "flos": 23402874360960.0, "grad_norm": 1.5716824555792122, "language_loss": 0.76492053, "learning_rate": 2.2101155938688423e-06, "loss": 0.78597128, "num_input_tokens_seen": 172524360, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40234375, "step": 8025, "time_per_iteration": 2.464078903198242 }, { "auxiliary_loss_clip": 0.01061643, "auxiliary_loss_mlp": 0.01040582, "balance_loss_clip": 1.01424885, "balance_loss_mlp": 1.02174592, "epoch": 0.4825492259131219, "flos": 20367840026880.0, "grad_norm": 1.8161741688680966, "language_loss": 0.72677803, "learning_rate": 2.209728283441112e-06, "loss": 0.74780035, "num_input_tokens_seen": 172541480, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.3984375, "step": 8026, "time_per_iteration": 2.366476535797119 }, { "auxiliary_loss_clip": 0.01063072, "auxiliary_loss_mlp": 0.01048757, "balance_loss_clip": 1.01881123, "balance_loss_mlp": 1.02096319, "epoch": 0.48260934916578985, "flos": 14318159592960.0, "grad_norm": 1.931193755784965, "language_loss": 0.76832134, "learning_rate": 2.209340965060465e-06, "loss": 0.78943962, "num_input_tokens_seen": 172559005, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.421875, "step": 8027, "time_per_iteration": 2.385864019393921 }, { "auxiliary_loss_clip": 0.01064211, "auxiliary_loss_mlp": 0.01044218, "balance_loss_clip": 1.01926792, "balance_loss_mlp": 1.02253222, "epoch": 0.4826694724184578, "flos": 22120235498880.0, "grad_norm": 3.8439574714607807, "language_loss": 0.68394703, "learning_rate": 2.2089536387415868e-06, "loss": 0.70503134, "num_input_tokens_seen": 172578435, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.41796875, "step": 8028, "time_per_iteration": 2.4028079509735107 }, { "auxiliary_loss_clip": 0.01060827, "auxiliary_loss_mlp": 0.01043876, "balance_loss_clip": 1.0173763, "balance_loss_mlp": 1.02118289, "epoch": 0.48272959567112583, "flos": 16180217245440.0, "grad_norm": 1.575623083158319, "language_loss": 0.73615682, "learning_rate": 2.2085663044991655e-06, "loss": 0.75720388, "num_input_tokens_seen": 172596095, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39648438, "step": 8029, "time_per_iteration": 2.3875839710235596 }, { "auxiliary_loss_clip": 0.01063854, "auxiliary_loss_mlp": 0.01036861, "balance_loss_clip": 1.0091089, "balance_loss_mlp": 1.02205467, "epoch": 0.4827897189237938, "flos": 23179465370880.0, "grad_norm": 1.8068717393750544, "language_loss": 0.85839581, "learning_rate": 2.2081789623478896e-06, "loss": 0.87940294, "num_input_tokens_seen": 172615255, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41796875, "step": 8030, "time_per_iteration": 2.3997063636779785 }, { "auxiliary_loss_clip": 0.0106108, "auxiliary_loss_mlp": 0.01038529, "balance_loss_clip": 1.01349521, "balance_loss_mlp": 1.02089107, "epoch": 0.48284984217646176, "flos": 21651561141120.0, "grad_norm": 1.9310018928010162, "language_loss": 0.74761051, "learning_rate": 2.2077916123024466e-06, "loss": 0.7686066, "num_input_tokens_seen": 172633185, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.40234375, "step": 8031, "time_per_iteration": 2.423278570175171 }, { "auxiliary_loss_clip": 0.01064863, "auxiliary_loss_mlp": 0.01047083, "balance_loss_clip": 1.01711345, "balance_loss_mlp": 1.02136087, "epoch": 0.48290996542912973, "flos": 31466100303360.0, "grad_norm": 3.563608809081684, "language_loss": 0.7276665, "learning_rate": 2.2074042543775245e-06, "loss": 0.74878597, "num_input_tokens_seen": 172654280, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.43554688, "step": 8032, "time_per_iteration": 2.471867322921753 }, { "auxiliary_loss_clip": 0.01061929, "auxiliary_loss_mlp": 0.01044998, "balance_loss_clip": 1.017663, "balance_loss_mlp": 1.02126431, "epoch": 0.4829700886817977, "flos": 24460812512640.0, "grad_norm": 1.3440796598123754, "language_loss": 0.75173253, "learning_rate": 2.2070168885878126e-06, "loss": 0.77280182, "num_input_tokens_seen": 172675545, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40625, "step": 8033, "time_per_iteration": 2.4587459564208984 }, { "auxiliary_loss_clip": 0.01064154, "auxiliary_loss_mlp": 0.01049724, "balance_loss_clip": 1.02155495, "balance_loss_mlp": 1.02120948, "epoch": 0.48303021193446566, "flos": 25700997294720.0, "grad_norm": 1.482503072336305, "language_loss": 0.84330475, "learning_rate": 2.2066295149479996e-06, "loss": 0.8644436, "num_input_tokens_seen": 172696455, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.4296875, "step": 8034, "time_per_iteration": 2.418597936630249 }, { "auxiliary_loss_clip": 0.01059149, "auxiliary_loss_mlp": 0.0104317, "balance_loss_clip": 1.0190897, "balance_loss_mlp": 1.02017319, "epoch": 0.4830903351871336, "flos": 20084170296960.0, "grad_norm": 1.7357396468416317, "language_loss": 0.80176806, "learning_rate": 2.2062421334727744e-06, "loss": 0.82279128, "num_input_tokens_seen": 172716720, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.390625, "step": 8035, "time_per_iteration": 2.4087443351745605 }, { "auxiliary_loss_clip": 0.01061801, "auxiliary_loss_mlp": 0.01047397, "balance_loss_clip": 1.01881099, "balance_loss_mlp": 1.02064776, "epoch": 0.4831504584398016, "flos": 39450806864640.0, "grad_norm": 2.1747378045962793, "language_loss": 0.71372616, "learning_rate": 2.2058547441768267e-06, "loss": 0.7348181, "num_input_tokens_seen": 172737435, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41210938, "step": 8036, "time_per_iteration": 2.523103713989258 }, { "auxiliary_loss_clip": 0.01059952, "auxiliary_loss_mlp": 0.01044207, "balance_loss_clip": 1.0180527, "balance_loss_mlp": 1.01902413, "epoch": 0.48321058169246955, "flos": 20005685827200.0, "grad_norm": 1.9362753290644112, "language_loss": 0.74236047, "learning_rate": 2.205467347074847e-06, "loss": 0.7634021, "num_input_tokens_seen": 172755700, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40820312, "step": 8037, "time_per_iteration": 2.3917489051818848 }, { "auxiliary_loss_clip": 0.01063101, "auxiliary_loss_mlp": 0.01050369, "balance_loss_clip": 1.02199769, "balance_loss_mlp": 1.02010298, "epoch": 0.4832707049451375, "flos": 20740397811840.0, "grad_norm": 2.2643846088141637, "language_loss": 0.71441704, "learning_rate": 2.205079942181525e-06, "loss": 0.73555171, "num_input_tokens_seen": 172775185, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.4296875, "step": 8038, "time_per_iteration": 2.3755311965942383 }, { "auxiliary_loss_clip": 0.01058801, "auxiliary_loss_mlp": 0.01047557, "balance_loss_clip": 1.01943612, "balance_loss_mlp": 1.01869202, "epoch": 0.4833308281978055, "flos": 33144200668800.0, "grad_norm": 1.8644357691772886, "language_loss": 0.80313265, "learning_rate": 2.20469252951155e-06, "loss": 0.82419622, "num_input_tokens_seen": 172796990, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40039062, "step": 8039, "time_per_iteration": 2.512798309326172 }, { "auxiliary_loss_clip": 0.01060613, "auxiliary_loss_mlp": 0.01039484, "balance_loss_clip": 1.01259017, "balance_loss_mlp": 1.01947165, "epoch": 0.48339095145047345, "flos": 19098223102080.0, "grad_norm": 1.5275951104537393, "language_loss": 0.78619558, "learning_rate": 2.2043051090796143e-06, "loss": 0.80719656, "num_input_tokens_seen": 172814915, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.41210938, "step": 8040, "time_per_iteration": 2.365368604660034 }, { "auxiliary_loss_clip": 0.01060257, "auxiliary_loss_mlp": 0.01046849, "balance_loss_clip": 1.01907325, "balance_loss_mlp": 1.01944625, "epoch": 0.4834510747031414, "flos": 34458017241600.0, "grad_norm": 1.4497888540116088, "language_loss": 0.76651305, "learning_rate": 2.203917680900409e-06, "loss": 0.78758407, "num_input_tokens_seen": 172837060, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40820312, "step": 8041, "time_per_iteration": 2.5252156257629395 }, { "auxiliary_loss_clip": 0.01060722, "auxiliary_loss_mlp": 0.01039638, "balance_loss_clip": 1.01191008, "balance_loss_mlp": 1.01999557, "epoch": 0.48351119795580944, "flos": 27379621330560.0, "grad_norm": 3.339787986100837, "language_loss": 0.68387473, "learning_rate": 2.203530244988624e-06, "loss": 0.70487833, "num_input_tokens_seen": 172856545, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40625, "step": 8042, "time_per_iteration": 2.428579330444336 }, { "auxiliary_loss_clip": 0.01012975, "auxiliary_loss_mlp": 0.01006221, "balance_loss_clip": 1.00355053, "balance_loss_mlp": 1.00462937, "epoch": 0.4835713212084774, "flos": 67140770204160.0, "grad_norm": 0.6966085158274868, "language_loss": 0.58610052, "learning_rate": 2.2031428013589517e-06, "loss": 0.60629243, "num_input_tokens_seen": 172923055, "router_z_loss_clip": 0.0267334, "router_z_loss_mlp": 0.08349609, "step": 8043, "time_per_iteration": 3.09352707862854 }, { "auxiliary_loss_clip": 0.01061253, "auxiliary_loss_mlp": 0.01046991, "balance_loss_clip": 1.01804709, "balance_loss_mlp": 1.0189321, "epoch": 0.48363144446114537, "flos": 17966513514240.0, "grad_norm": 2.0704928003168184, "language_loss": 0.73590451, "learning_rate": 2.2027553500260847e-06, "loss": 0.75698698, "num_input_tokens_seen": 172940700, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.421875, "step": 8044, "time_per_iteration": 2.3639345169067383 }, { "auxiliary_loss_clip": 0.01057398, "auxiliary_loss_mlp": 0.01046247, "balance_loss_clip": 1.0176487, "balance_loss_mlp": 1.01789069, "epoch": 0.48369156771381333, "flos": 20592505825920.0, "grad_norm": 1.7359384405607368, "language_loss": 0.76873255, "learning_rate": 2.202367891004714e-06, "loss": 0.78976893, "num_input_tokens_seen": 172961125, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.39453125, "step": 8045, "time_per_iteration": 2.4688022136688232 }, { "auxiliary_loss_clip": 0.01060465, "auxiliary_loss_mlp": 0.01050887, "balance_loss_clip": 1.02013111, "balance_loss_mlp": 1.01780367, "epoch": 0.4837516909664813, "flos": 22673957662080.0, "grad_norm": 1.6054229880531283, "language_loss": 0.70305145, "learning_rate": 2.201980424309533e-06, "loss": 0.72416496, "num_input_tokens_seen": 172980405, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.42578125, "step": 8046, "time_per_iteration": 3.753631353378296 }, { "auxiliary_loss_clip": 0.01059684, "auxiliary_loss_mlp": 0.0105084, "balance_loss_clip": 1.02224159, "balance_loss_mlp": 1.0192275, "epoch": 0.48381181421914926, "flos": 25517493855360.0, "grad_norm": 1.925554935005815, "language_loss": 0.83608121, "learning_rate": 2.2015929499552337e-06, "loss": 0.85718644, "num_input_tokens_seen": 172999105, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40429688, "step": 8047, "time_per_iteration": 3.8124001026153564 }, { "auxiliary_loss_clip": 0.01057859, "auxiliary_loss_mlp": 0.01041969, "balance_loss_clip": 1.01468158, "balance_loss_mlp": 1.01742637, "epoch": 0.4838719374718172, "flos": 24206330545920.0, "grad_norm": 1.7251425102606115, "language_loss": 0.81340367, "learning_rate": 2.2012054679565092e-06, "loss": 0.83440197, "num_input_tokens_seen": 173019935, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40429688, "step": 8048, "time_per_iteration": 3.7799034118652344 }, { "auxiliary_loss_clip": 0.0106176, "auxiliary_loss_mlp": 0.010474, "balance_loss_clip": 1.0178839, "balance_loss_mlp": 1.01931429, "epoch": 0.4839320607244852, "flos": 26723358904320.0, "grad_norm": 1.8285384499786943, "language_loss": 0.8286863, "learning_rate": 2.200817978328054e-06, "loss": 0.84977794, "num_input_tokens_seen": 173039700, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.42578125, "step": 8049, "time_per_iteration": 2.4199976921081543 }, { "auxiliary_loss_clip": 0.0105799, "auxiliary_loss_mlp": 0.01039118, "balance_loss_clip": 1.01603889, "balance_loss_mlp": 1.01972604, "epoch": 0.48399218397715316, "flos": 20447860596480.0, "grad_norm": 1.833263095569868, "language_loss": 0.74039996, "learning_rate": 2.2004304810845602e-06, "loss": 0.76137102, "num_input_tokens_seen": 173059170, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.3828125, "step": 8050, "time_per_iteration": 2.393047571182251 }, { "auxiliary_loss_clip": 0.01012864, "auxiliary_loss_mlp": 0.010129, "balance_loss_clip": 1.01006293, "balance_loss_mlp": 1.00475752, "epoch": 0.4840523072298211, "flos": 67177394087040.0, "grad_norm": 0.7103506286266804, "language_loss": 0.56460959, "learning_rate": 2.200042976240723e-06, "loss": 0.58486724, "num_input_tokens_seen": 173119000, "router_z_loss_clip": 0.02832031, "router_z_loss_mlp": 0.08105469, "step": 8051, "time_per_iteration": 3.056006669998169 }, { "auxiliary_loss_clip": 0.01062354, "auxiliary_loss_mlp": 0.01041988, "balance_loss_clip": 1.01256704, "balance_loss_mlp": 1.02063739, "epoch": 0.4841124304824891, "flos": 22410608209920.0, "grad_norm": 2.315457239549307, "language_loss": 0.76556981, "learning_rate": 2.199655463811236e-06, "loss": 0.78661323, "num_input_tokens_seen": 173137570, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.41796875, "step": 8052, "time_per_iteration": 2.4224650859832764 }, { "auxiliary_loss_clip": 0.01060694, "auxiliary_loss_mlp": 0.01043382, "balance_loss_clip": 1.01554632, "balance_loss_mlp": 1.01906514, "epoch": 0.48417255373515705, "flos": 13843131367680.0, "grad_norm": 2.1340151369139053, "language_loss": 0.67835009, "learning_rate": 2.1992679438107936e-06, "loss": 0.69939089, "num_input_tokens_seen": 173154355, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41601562, "step": 8053, "time_per_iteration": 2.3431830406188965 }, { "auxiliary_loss_clip": 0.01059392, "auxiliary_loss_mlp": 0.01044127, "balance_loss_clip": 1.01585054, "balance_loss_mlp": 1.01886106, "epoch": 0.484232676987825, "flos": 31648346933760.0, "grad_norm": 2.1391783866765084, "language_loss": 0.72056293, "learning_rate": 2.198880416254091e-06, "loss": 0.74159819, "num_input_tokens_seen": 173174845, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.40625, "step": 8054, "time_per_iteration": 2.47769832611084 }, { "auxiliary_loss_clip": 0.01059199, "auxiliary_loss_mlp": 0.01040823, "balance_loss_clip": 1.01236737, "balance_loss_mlp": 1.0183965, "epoch": 0.48429280024049304, "flos": 24094294392960.0, "grad_norm": 1.7760997181835392, "language_loss": 0.70525956, "learning_rate": 2.1984928811558233e-06, "loss": 0.72625983, "num_input_tokens_seen": 173195025, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40820312, "step": 8055, "time_per_iteration": 3.847093343734741 }, { "auxiliary_loss_clip": 0.01063039, "auxiliary_loss_mlp": 0.0104482, "balance_loss_clip": 1.01625717, "balance_loss_mlp": 1.02061236, "epoch": 0.484352923493161, "flos": 17529121601280.0, "grad_norm": 1.9450355446291452, "language_loss": 0.64475727, "learning_rate": 2.198105338530685e-06, "loss": 0.6658358, "num_input_tokens_seen": 173213065, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.42382812, "step": 8056, "time_per_iteration": 2.3498189449310303 }, { "auxiliary_loss_clip": 0.01061323, "auxiliary_loss_mlp": 0.01048363, "balance_loss_clip": 1.01901388, "balance_loss_mlp": 1.01967168, "epoch": 0.48441304674582897, "flos": 29165638308480.0, "grad_norm": 1.9800532263813526, "language_loss": 0.68398285, "learning_rate": 2.1977177883933726e-06, "loss": 0.70507973, "num_input_tokens_seen": 173234545, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.41601562, "step": 8057, "time_per_iteration": 2.4741296768188477 }, { "auxiliary_loss_clip": 0.01059538, "auxiliary_loss_mlp": 0.01040569, "balance_loss_clip": 1.01409256, "balance_loss_mlp": 1.01940477, "epoch": 0.48447316999849693, "flos": 15885829728000.0, "grad_norm": 1.7893056373759102, "language_loss": 0.82688767, "learning_rate": 2.1973302307585827e-06, "loss": 0.84788871, "num_input_tokens_seen": 173252175, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40234375, "step": 8058, "time_per_iteration": 2.388723611831665 }, { "auxiliary_loss_clip": 0.01062737, "auxiliary_loss_mlp": 0.01048815, "balance_loss_clip": 1.02052665, "balance_loss_mlp": 1.02086866, "epoch": 0.4845332932511649, "flos": 24380477740800.0, "grad_norm": 2.1184651071538894, "language_loss": 0.81194675, "learning_rate": 2.1969426656410097e-06, "loss": 0.83306223, "num_input_tokens_seen": 173268790, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.41796875, "step": 8059, "time_per_iteration": 2.426980495452881 }, { "auxiliary_loss_clip": 0.01065121, "auxiliary_loss_mlp": 0.01045581, "balance_loss_clip": 1.01340675, "balance_loss_mlp": 1.02215052, "epoch": 0.48459341650383286, "flos": 37115152352640.0, "grad_norm": 2.576206693789027, "language_loss": 0.68010509, "learning_rate": 2.196555093055352e-06, "loss": 0.70121217, "num_input_tokens_seen": 173288030, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.4296875, "step": 8060, "time_per_iteration": 2.4971303939819336 }, { "auxiliary_loss_clip": 0.01064193, "auxiliary_loss_mlp": 0.01044886, "balance_loss_clip": 1.01726556, "balance_loss_mlp": 1.02245545, "epoch": 0.48465353975650083, "flos": 22965657004800.0, "grad_norm": 1.7006119168574179, "language_loss": 0.68245995, "learning_rate": 2.1961675130163046e-06, "loss": 0.70355076, "num_input_tokens_seen": 173305965, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41796875, "step": 8061, "time_per_iteration": 2.4359328746795654 }, { "auxiliary_loss_clip": 0.01064843, "auxiliary_loss_mlp": 0.01052082, "balance_loss_clip": 1.02249384, "balance_loss_mlp": 1.02270436, "epoch": 0.4847136630091688, "flos": 17706864666240.0, "grad_norm": 2.5435696283075497, "language_loss": 0.83263326, "learning_rate": 2.1957799255385653e-06, "loss": 0.8538025, "num_input_tokens_seen": 173321985, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.421875, "step": 8062, "time_per_iteration": 2.3625669479370117 }, { "auxiliary_loss_clip": 0.01063008, "auxiliary_loss_mlp": 0.01045308, "balance_loss_clip": 1.01842654, "balance_loss_mlp": 1.02211249, "epoch": 0.48477378626183676, "flos": 22017171565440.0, "grad_norm": 1.5345641087261925, "language_loss": 0.75190526, "learning_rate": 2.1953923306368325e-06, "loss": 0.77298844, "num_input_tokens_seen": 173341315, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.41015625, "step": 8063, "time_per_iteration": 2.4452805519104004 }, { "auxiliary_loss_clip": 0.01063247, "auxiliary_loss_mlp": 0.01042106, "balance_loss_clip": 1.01460457, "balance_loss_mlp": 1.02146769, "epoch": 0.4848339095145047, "flos": 27961763207040.0, "grad_norm": 3.5037733619661626, "language_loss": 0.80110419, "learning_rate": 2.1950047283258023e-06, "loss": 0.82215774, "num_input_tokens_seen": 173361055, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41796875, "step": 8064, "time_per_iteration": 2.4386181831359863 }, { "auxiliary_loss_clip": 0.0106256, "auxiliary_loss_mlp": 0.01045424, "balance_loss_clip": 1.01941299, "balance_loss_mlp": 1.02346563, "epoch": 0.4848940327671727, "flos": 21687696264960.0, "grad_norm": 1.8195053785379132, "language_loss": 0.80000329, "learning_rate": 2.194617118620173e-06, "loss": 0.82108313, "num_input_tokens_seen": 173379255, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.390625, "step": 8065, "time_per_iteration": 2.4277796745300293 }, { "auxiliary_loss_clip": 0.01057869, "auxiliary_loss_mlp": 0.01042375, "balance_loss_clip": 1.01719797, "balance_loss_mlp": 1.01993489, "epoch": 0.48495415601984065, "flos": 20630526163200.0, "grad_norm": 1.9725729833189676, "language_loss": 0.76840496, "learning_rate": 2.194229501534644e-06, "loss": 0.78940737, "num_input_tokens_seen": 173398370, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37890625, "step": 8066, "time_per_iteration": 2.384957790374756 }, { "auxiliary_loss_clip": 0.01061273, "auxiliary_loss_mlp": 0.01049005, "balance_loss_clip": 1.02373254, "balance_loss_mlp": 1.02234769, "epoch": 0.4850142792725086, "flos": 25627016390400.0, "grad_norm": 1.7089037807898715, "language_loss": 0.73058748, "learning_rate": 2.193841877083912e-06, "loss": 0.75169027, "num_input_tokens_seen": 173419595, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.390625, "step": 8067, "time_per_iteration": 2.465919017791748 }, { "auxiliary_loss_clip": 0.01063691, "auxiliary_loss_mlp": 0.01041526, "balance_loss_clip": 1.01580095, "balance_loss_mlp": 1.02271903, "epoch": 0.4850744025251766, "flos": 13771105499520.0, "grad_norm": 2.18132537363447, "language_loss": 0.8131094, "learning_rate": 2.1934542452826767e-06, "loss": 0.83416158, "num_input_tokens_seen": 173435390, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.41015625, "step": 8068, "time_per_iteration": 2.3778936862945557 }, { "auxiliary_loss_clip": 0.01057858, "auxiliary_loss_mlp": 0.01038766, "balance_loss_clip": 1.01476908, "balance_loss_mlp": 1.01921439, "epoch": 0.4851345257778446, "flos": 20260447084800.0, "grad_norm": 1.6384515057075235, "language_loss": 0.85551858, "learning_rate": 2.193066606145638e-06, "loss": 0.87648481, "num_input_tokens_seen": 173454095, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.38671875, "step": 8069, "time_per_iteration": 2.3884708881378174 }, { "auxiliary_loss_clip": 0.01059839, "auxiliary_loss_mlp": 0.01042606, "balance_loss_clip": 1.01661849, "balance_loss_mlp": 1.0208931, "epoch": 0.48519464903051257, "flos": 27088445658240.0, "grad_norm": 1.7052585531909965, "language_loss": 0.78706878, "learning_rate": 2.192678959687493e-06, "loss": 0.80809325, "num_input_tokens_seen": 173475300, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.38867188, "step": 8070, "time_per_iteration": 2.4462618827819824 }, { "auxiliary_loss_clip": 0.01059335, "auxiliary_loss_mlp": 0.01047904, "balance_loss_clip": 1.02003312, "balance_loss_mlp": 1.01967001, "epoch": 0.48525477228318054, "flos": 17126328712320.0, "grad_norm": 2.9424992332236273, "language_loss": 0.79881787, "learning_rate": 2.192291305922943e-06, "loss": 0.81989026, "num_input_tokens_seen": 173492005, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.39648438, "step": 8071, "time_per_iteration": 2.3527936935424805 }, { "auxiliary_loss_clip": 0.01059927, "auxiliary_loss_mlp": 0.01047527, "balance_loss_clip": 1.02059793, "balance_loss_mlp": 1.01952791, "epoch": 0.4853148955358485, "flos": 28179167443200.0, "grad_norm": 1.8395574634045238, "language_loss": 0.73164386, "learning_rate": 2.1919036448666873e-06, "loss": 0.75271839, "num_input_tokens_seen": 173511995, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40429688, "step": 8072, "time_per_iteration": 2.4669339656829834 }, { "auxiliary_loss_clip": 0.01061407, "auxiliary_loss_mlp": 0.01055702, "balance_loss_clip": 1.02721095, "balance_loss_mlp": 1.01981699, "epoch": 0.48537501878851647, "flos": 17492358072960.0, "grad_norm": 2.4880003475827874, "language_loss": 0.89084065, "learning_rate": 2.1915159765334262e-06, "loss": 0.91201174, "num_input_tokens_seen": 173530215, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41601562, "step": 8073, "time_per_iteration": 2.3430569171905518 }, { "auxiliary_loss_clip": 0.01056152, "auxiliary_loss_mlp": 0.01045087, "balance_loss_clip": 1.02032733, "balance_loss_mlp": 1.01853752, "epoch": 0.48543514204118443, "flos": 28583601166080.0, "grad_norm": 1.9392747162390804, "language_loss": 0.61998683, "learning_rate": 2.19112830093786e-06, "loss": 0.64099926, "num_input_tokens_seen": 173550920, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37695312, "step": 8074, "time_per_iteration": 2.457200527191162 }, { "auxiliary_loss_clip": 0.0105867, "auxiliary_loss_mlp": 0.01052167, "balance_loss_clip": 1.02486801, "balance_loss_mlp": 1.01763248, "epoch": 0.4854952652938524, "flos": 20958919211520.0, "grad_norm": 1.7517166035769813, "language_loss": 0.74097854, "learning_rate": 2.19074061809469e-06, "loss": 0.76208687, "num_input_tokens_seen": 173569065, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41015625, "step": 8075, "time_per_iteration": 2.3599302768707275 }, { "auxiliary_loss_clip": 0.01056391, "auxiliary_loss_mlp": 0.01044059, "balance_loss_clip": 1.01829815, "balance_loss_mlp": 1.01852012, "epoch": 0.48555538854652036, "flos": 66527243015040.0, "grad_norm": 1.4979667256479785, "language_loss": 0.82537305, "learning_rate": 2.1903529280186163e-06, "loss": 0.84637755, "num_input_tokens_seen": 173596085, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37890625, "step": 8076, "time_per_iteration": 2.7755625247955322 }, { "auxiliary_loss_clip": 0.01057296, "auxiliary_loss_mlp": 0.0104243, "balance_loss_clip": 1.01628745, "balance_loss_mlp": 1.01882005, "epoch": 0.4856155117991883, "flos": 15924059533440.0, "grad_norm": 4.501197650386905, "language_loss": 0.88062614, "learning_rate": 2.1899652307243407e-06, "loss": 0.90162343, "num_input_tokens_seen": 173613900, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38476562, "step": 8077, "time_per_iteration": 2.3368592262268066 }, { "auxiliary_loss_clip": 0.01011103, "auxiliary_loss_mlp": 0.01007815, "balance_loss_clip": 1.00493002, "balance_loss_mlp": 1.00290966, "epoch": 0.4856756350518563, "flos": 71044129762560.0, "grad_norm": 0.9067521289282683, "language_loss": 0.5864867, "learning_rate": 2.189577526226564e-06, "loss": 0.60667586, "num_input_tokens_seen": 173671305, "router_z_loss_clip": 0.02880859, "router_z_loss_mlp": 0.08203125, "step": 8078, "time_per_iteration": 2.975743055343628 }, { "auxiliary_loss_clip": 0.01061316, "auxiliary_loss_mlp": 0.01050173, "balance_loss_clip": 1.02143192, "balance_loss_mlp": 1.01963353, "epoch": 0.48573575830452426, "flos": 29824379441280.0, "grad_norm": 1.6266849230439848, "language_loss": 0.73697048, "learning_rate": 2.1891898145399884e-06, "loss": 0.75808537, "num_input_tokens_seen": 173692070, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41796875, "step": 8079, "time_per_iteration": 2.4719154834747314 }, { "auxiliary_loss_clip": 0.01058907, "auxiliary_loss_mlp": 0.01048035, "balance_loss_clip": 1.02036643, "balance_loss_mlp": 1.01896513, "epoch": 0.4857958815571922, "flos": 17638539402240.0, "grad_norm": 2.0479324251504396, "language_loss": 0.80472726, "learning_rate": 2.1888020956793172e-06, "loss": 0.82579672, "num_input_tokens_seen": 173709785, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.3984375, "step": 8080, "time_per_iteration": 2.383028984069824 }, { "auxiliary_loss_clip": 0.01059277, "auxiliary_loss_mlp": 0.01046492, "balance_loss_clip": 1.01899004, "balance_loss_mlp": 1.01845491, "epoch": 0.4858560048098602, "flos": 21104437224960.0, "grad_norm": 1.9529380928397921, "language_loss": 0.84759653, "learning_rate": 2.188414369659251e-06, "loss": 0.86865419, "num_input_tokens_seen": 173728770, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.40820312, "step": 8081, "time_per_iteration": 2.398174285888672 }, { "auxiliary_loss_clip": 0.01058631, "auxiliary_loss_mlp": 0.0105019, "balance_loss_clip": 1.02304602, "balance_loss_mlp": 1.01811934, "epoch": 0.4859161280625282, "flos": 22089756015360.0, "grad_norm": 1.4376535649861486, "language_loss": 0.84286141, "learning_rate": 2.1880266364944924e-06, "loss": 0.8639496, "num_input_tokens_seen": 173747355, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40625, "step": 8082, "time_per_iteration": 2.4224324226379395 }, { "auxiliary_loss_clip": 0.01058369, "auxiliary_loss_mlp": 0.01042978, "balance_loss_clip": 1.01825428, "balance_loss_mlp": 1.01951754, "epoch": 0.4859762513151962, "flos": 17492497718400.0, "grad_norm": 1.9993672834047864, "language_loss": 0.88750064, "learning_rate": 2.187638896199746e-06, "loss": 0.90851414, "num_input_tokens_seen": 173764825, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.38867188, "step": 8083, "time_per_iteration": 2.359990358352661 }, { "auxiliary_loss_clip": 0.01057636, "auxiliary_loss_mlp": 0.01044478, "balance_loss_clip": 1.01951551, "balance_loss_mlp": 1.01936674, "epoch": 0.48603637456786414, "flos": 18003277042560.0, "grad_norm": 1.9081140289359209, "language_loss": 0.82845592, "learning_rate": 2.1872511487897126e-06, "loss": 0.84947711, "num_input_tokens_seen": 173783215, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3828125, "step": 8084, "time_per_iteration": 2.384178400039673 }, { "auxiliary_loss_clip": 0.01061854, "auxiliary_loss_mlp": 0.01045874, "balance_loss_clip": 1.01774073, "balance_loss_mlp": 1.02001762, "epoch": 0.4860964978205321, "flos": 22490942981760.0, "grad_norm": 2.301997060448946, "language_loss": 0.70044768, "learning_rate": 2.186863394279098e-06, "loss": 0.72152495, "num_input_tokens_seen": 173801905, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41796875, "step": 8085, "time_per_iteration": 3.6870505809783936 }, { "auxiliary_loss_clip": 0.01061918, "auxiliary_loss_mlp": 0.01043019, "balance_loss_clip": 1.01558888, "balance_loss_mlp": 1.02136874, "epoch": 0.48615662107320007, "flos": 23371277713920.0, "grad_norm": 1.534410328821934, "language_loss": 0.78643644, "learning_rate": 2.1864756326826046e-06, "loss": 0.80748576, "num_input_tokens_seen": 173824690, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40625, "step": 8086, "time_per_iteration": 3.8431646823883057 }, { "auxiliary_loss_clip": 0.01060772, "auxiliary_loss_mlp": 0.01041725, "balance_loss_clip": 1.01373518, "balance_loss_mlp": 1.02054751, "epoch": 0.48621674432586803, "flos": 34417518197760.0, "grad_norm": 2.4310641183148367, "language_loss": 0.71677953, "learning_rate": 2.1860878640149355e-06, "loss": 0.73780447, "num_input_tokens_seen": 173844450, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40234375, "step": 8087, "time_per_iteration": 2.478919267654419 }, { "auxiliary_loss_clip": 0.01062563, "auxiliary_loss_mlp": 0.01050302, "balance_loss_clip": 1.02179885, "balance_loss_mlp": 1.019526, "epoch": 0.486276867578536, "flos": 33106215242880.0, "grad_norm": 1.8777814888201116, "language_loss": 0.7463752, "learning_rate": 2.1857000882907974e-06, "loss": 0.76750386, "num_input_tokens_seen": 173864975, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4296875, "step": 8088, "time_per_iteration": 3.9433329105377197 }, { "auxiliary_loss_clip": 0.01059021, "auxiliary_loss_mlp": 0.01050942, "balance_loss_clip": 1.02494216, "balance_loss_mlp": 1.01965523, "epoch": 0.48633699083120396, "flos": 21469628712960.0, "grad_norm": 1.4672822386875704, "language_loss": 0.7678957, "learning_rate": 2.185312305524892e-06, "loss": 0.78899533, "num_input_tokens_seen": 173883805, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39453125, "step": 8089, "time_per_iteration": 2.419008255004883 }, { "auxiliary_loss_clip": 0.01062613, "auxiliary_loss_mlp": 0.01043621, "balance_loss_clip": 1.01482022, "balance_loss_mlp": 1.02160764, "epoch": 0.48639711408387193, "flos": 20083297512960.0, "grad_norm": 1.5465988301845353, "language_loss": 0.85029054, "learning_rate": 2.184924515731926e-06, "loss": 0.87135291, "num_input_tokens_seen": 173903520, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41015625, "step": 8090, "time_per_iteration": 2.410463809967041 }, { "auxiliary_loss_clip": 0.01059474, "auxiliary_loss_mlp": 0.01036974, "balance_loss_clip": 1.01222634, "balance_loss_mlp": 1.02122462, "epoch": 0.4864572373365399, "flos": 20777789744640.0, "grad_norm": 1.510495849341002, "language_loss": 0.77306741, "learning_rate": 2.1845367189266045e-06, "loss": 0.79403192, "num_input_tokens_seen": 173924255, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3828125, "step": 8091, "time_per_iteration": 2.396533966064453 }, { "auxiliary_loss_clip": 0.01060904, "auxiliary_loss_mlp": 0.01041816, "balance_loss_clip": 1.01448107, "balance_loss_mlp": 1.02073884, "epoch": 0.48651736058920786, "flos": 26024328195840.0, "grad_norm": 1.4587298215944733, "language_loss": 0.8071444, "learning_rate": 2.184148915123631e-06, "loss": 0.82817167, "num_input_tokens_seen": 173943285, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40234375, "step": 8092, "time_per_iteration": 2.4713478088378906 }, { "auxiliary_loss_clip": 0.010632, "auxiliary_loss_mlp": 0.01041976, "balance_loss_clip": 1.01429582, "balance_loss_mlp": 1.02187884, "epoch": 0.4865774838418758, "flos": 20484554302080.0, "grad_norm": 1.8965175223986472, "language_loss": 0.72490811, "learning_rate": 2.1837611043377126e-06, "loss": 0.74595988, "num_input_tokens_seen": 173962205, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.4140625, "step": 8093, "time_per_iteration": 2.4030678272247314 }, { "auxiliary_loss_clip": 0.01060147, "auxiliary_loss_mlp": 0.01043732, "balance_loss_clip": 1.01742268, "balance_loss_mlp": 1.02070558, "epoch": 0.4866376070945438, "flos": 23546646806400.0, "grad_norm": 1.7805349735882785, "language_loss": 0.68812656, "learning_rate": 2.1833732865835545e-06, "loss": 0.70916533, "num_input_tokens_seen": 173980945, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.39453125, "step": 8094, "time_per_iteration": 3.8735239505767822 }, { "auxiliary_loss_clip": 0.01062914, "auxiliary_loss_mlp": 0.01055519, "balance_loss_clip": 1.0271945, "balance_loss_mlp": 1.02074194, "epoch": 0.4866977303472118, "flos": 16689669937920.0, "grad_norm": 2.424768384797598, "language_loss": 0.68485612, "learning_rate": 2.1829854618758636e-06, "loss": 0.70604044, "num_input_tokens_seen": 173998860, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.421875, "step": 8095, "time_per_iteration": 2.355020046234131 }, { "auxiliary_loss_clip": 0.01060978, "auxiliary_loss_mlp": 0.01048791, "balance_loss_clip": 1.0209918, "balance_loss_mlp": 1.01989233, "epoch": 0.4867578535998798, "flos": 17895011316480.0, "grad_norm": 2.0170220308569995, "language_loss": 0.80598712, "learning_rate": 2.182597630229345e-06, "loss": 0.8270849, "num_input_tokens_seen": 174016665, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41210938, "step": 8096, "time_per_iteration": 2.440167188644409 }, { "auxiliary_loss_clip": 0.01057541, "auxiliary_loss_mlp": 0.01050655, "balance_loss_clip": 1.02318954, "balance_loss_mlp": 1.01867473, "epoch": 0.48681797685254774, "flos": 22636705374720.0, "grad_norm": 1.8329422382741873, "language_loss": 0.69659472, "learning_rate": 2.1822097916587067e-06, "loss": 0.7176767, "num_input_tokens_seen": 174034800, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.38867188, "step": 8097, "time_per_iteration": 2.386955499649048 }, { "auxiliary_loss_clip": 0.01057859, "auxiliary_loss_mlp": 0.010478, "balance_loss_clip": 1.02042913, "balance_loss_mlp": 1.01834512, "epoch": 0.4868781001052157, "flos": 20885043041280.0, "grad_norm": 1.4748607858451788, "language_loss": 0.72770947, "learning_rate": 2.1818219461786543e-06, "loss": 0.74876606, "num_input_tokens_seen": 174054445, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.39453125, "step": 8098, "time_per_iteration": 2.401756763458252 }, { "auxiliary_loss_clip": 0.01063607, "auxiliary_loss_mlp": 0.01046516, "balance_loss_clip": 1.01738083, "balance_loss_mlp": 1.02042031, "epoch": 0.48693822335788367, "flos": 41973316306560.0, "grad_norm": 1.5982524191001612, "language_loss": 0.67768139, "learning_rate": 2.1814340938038956e-06, "loss": 0.69878256, "num_input_tokens_seen": 174077890, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.43164062, "step": 8099, "time_per_iteration": 2.54843807220459 }, { "auxiliary_loss_clip": 0.01056702, "auxiliary_loss_mlp": 0.01047173, "balance_loss_clip": 1.02203155, "balance_loss_mlp": 1.01751184, "epoch": 0.48699834661055164, "flos": 24242151467520.0, "grad_norm": 1.6529325220241322, "language_loss": 0.684488, "learning_rate": 2.181046234549138e-06, "loss": 0.70552677, "num_input_tokens_seen": 174097460, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.390625, "step": 8100, "time_per_iteration": 2.420623779296875 }, { "auxiliary_loss_clip": 0.01054873, "auxiliary_loss_mlp": 0.01042654, "balance_loss_clip": 1.01787019, "balance_loss_mlp": 1.01639938, "epoch": 0.4870584698632196, "flos": 25922625805440.0, "grad_norm": 1.5637623936031626, "language_loss": 0.7737264, "learning_rate": 2.180658368429088e-06, "loss": 0.7947017, "num_input_tokens_seen": 174120775, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38476562, "step": 8101, "time_per_iteration": 2.4278759956359863 }, { "auxiliary_loss_clip": 0.01011069, "auxiliary_loss_mlp": 0.01011928, "balance_loss_clip": 1.00928152, "balance_loss_mlp": 1.00325632, "epoch": 0.48711859311588757, "flos": 70208588171520.0, "grad_norm": 0.6987598935858327, "language_loss": 0.52428693, "learning_rate": 2.1802704954584565e-06, "loss": 0.54451692, "num_input_tokens_seen": 174189135, "router_z_loss_clip": 0.02648926, "router_z_loss_mlp": 0.078125, "step": 8102, "time_per_iteration": 3.173898696899414 }, { "auxiliary_loss_clip": 0.01058586, "auxiliary_loss_mlp": 0.01045068, "balance_loss_clip": 1.01871049, "balance_loss_mlp": 1.0190202, "epoch": 0.48717871636855553, "flos": 12342320219520.0, "grad_norm": 1.8631496123715359, "language_loss": 0.75103474, "learning_rate": 2.1798826156519484e-06, "loss": 0.77207136, "num_input_tokens_seen": 174203250, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.39453125, "step": 8103, "time_per_iteration": 2.3232340812683105 }, { "auxiliary_loss_clip": 0.01061523, "auxiliary_loss_mlp": 0.01050501, "balance_loss_clip": 1.02205729, "balance_loss_mlp": 1.02022684, "epoch": 0.4872388396212235, "flos": 23476017392640.0, "grad_norm": 1.7650212617703493, "language_loss": 0.64478081, "learning_rate": 2.1794947290242737e-06, "loss": 0.66590106, "num_input_tokens_seen": 174224145, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41210938, "step": 8104, "time_per_iteration": 2.45658016204834 }, { "auxiliary_loss_clip": 0.0105861, "auxiliary_loss_mlp": 0.01042263, "balance_loss_clip": 1.01600075, "balance_loss_mlp": 1.01911306, "epoch": 0.48729896287389146, "flos": 31426334398080.0, "grad_norm": 1.721731285338277, "language_loss": 0.69923645, "learning_rate": 2.1791068355901413e-06, "loss": 0.72024524, "num_input_tokens_seen": 174244435, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.39453125, "step": 8105, "time_per_iteration": 2.4437413215637207 }, { "auxiliary_loss_clip": 0.01057165, "auxiliary_loss_mlp": 0.01037036, "balance_loss_clip": 1.01168013, "balance_loss_mlp": 1.01817691, "epoch": 0.4873590861265594, "flos": 19057060742400.0, "grad_norm": 1.7281528761776228, "language_loss": 0.74703848, "learning_rate": 2.178718935364259e-06, "loss": 0.76798046, "num_input_tokens_seen": 174262710, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.390625, "step": 8106, "time_per_iteration": 2.399144172668457 }, { "auxiliary_loss_clip": 0.01062503, "auxiliary_loss_mlp": 0.01046117, "balance_loss_clip": 1.01553941, "balance_loss_mlp": 1.02061248, "epoch": 0.4874192093792274, "flos": 24347275171200.0, "grad_norm": 1.8912377794399549, "language_loss": 0.78096199, "learning_rate": 2.1783310283613373e-06, "loss": 0.80204821, "num_input_tokens_seen": 174281545, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.41796875, "step": 8107, "time_per_iteration": 2.401186466217041 }, { "auxiliary_loss_clip": 0.01057599, "auxiliary_loss_mlp": 0.0104132, "balance_loss_clip": 1.01565385, "balance_loss_mlp": 1.0197196, "epoch": 0.4874793326318954, "flos": 23111489220480.0, "grad_norm": 1.8239428648996252, "language_loss": 0.76115012, "learning_rate": 2.1779431145960853e-06, "loss": 0.7821393, "num_input_tokens_seen": 174300290, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37890625, "step": 8108, "time_per_iteration": 2.430654525756836 }, { "auxiliary_loss_clip": 0.01059204, "auxiliary_loss_mlp": 0.0103974, "balance_loss_clip": 1.01523042, "balance_loss_mlp": 1.02066612, "epoch": 0.4875394558845634, "flos": 19025149893120.0, "grad_norm": 1.6714873124160845, "language_loss": 0.74079216, "learning_rate": 2.177555194083212e-06, "loss": 0.76178163, "num_input_tokens_seen": 174318490, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.38671875, "step": 8109, "time_per_iteration": 2.375513792037964 }, { "auxiliary_loss_clip": 0.01059649, "auxiliary_loss_mlp": 0.01045333, "balance_loss_clip": 1.01790297, "balance_loss_mlp": 1.02019727, "epoch": 0.48759957913723134, "flos": 21432550982400.0, "grad_norm": 1.8339476738649023, "language_loss": 0.7937634, "learning_rate": 2.177167266837428e-06, "loss": 0.81481314, "num_input_tokens_seen": 174335505, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.39453125, "step": 8110, "time_per_iteration": 2.425811767578125 }, { "auxiliary_loss_clip": 0.01060352, "auxiliary_loss_mlp": 0.01044469, "balance_loss_clip": 1.01802826, "balance_loss_mlp": 1.01995802, "epoch": 0.4876597023898993, "flos": 17747712823680.0, "grad_norm": 1.9448574640947216, "language_loss": 0.7354126, "learning_rate": 2.176779332873444e-06, "loss": 0.75646079, "num_input_tokens_seen": 174353990, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40234375, "step": 8111, "time_per_iteration": 2.353019952774048 }, { "auxiliary_loss_clip": 0.010621, "auxiliary_loss_mlp": 0.01045663, "balance_loss_clip": 1.01683807, "balance_loss_mlp": 1.02169824, "epoch": 0.4877198256425673, "flos": 17018691390720.0, "grad_norm": 1.7307651338923498, "language_loss": 0.77338839, "learning_rate": 2.17639139220597e-06, "loss": 0.79446602, "num_input_tokens_seen": 174373425, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.40429688, "step": 8112, "time_per_iteration": 2.401118040084839 }, { "auxiliary_loss_clip": 0.01064581, "auxiliary_loss_mlp": 0.01044541, "balance_loss_clip": 1.01570439, "balance_loss_mlp": 1.02138877, "epoch": 0.48777994889523524, "flos": 22382956546560.0, "grad_norm": 1.603476162715847, "language_loss": 0.76233637, "learning_rate": 2.1760034448497166e-06, "loss": 0.7834276, "num_input_tokens_seen": 174393070, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.43164062, "step": 8113, "time_per_iteration": 2.397122621536255 }, { "auxiliary_loss_clip": 0.010184, "auxiliary_loss_mlp": 0.01006601, "balance_loss_clip": 1.00377584, "balance_loss_mlp": 1.01040721, "epoch": 0.4878400721479032, "flos": 61238527908480.0, "grad_norm": 0.7856095335632013, "language_loss": 0.48953632, "learning_rate": 2.1756154908193943e-06, "loss": 0.50978637, "num_input_tokens_seen": 174446880, "router_z_loss_clip": 0.02819824, "router_z_loss_mlp": 0.08007812, "step": 8114, "time_per_iteration": 2.978111743927002 }, { "auxiliary_loss_clip": 0.01063286, "auxiliary_loss_mlp": 0.01045969, "balance_loss_clip": 1.01738262, "balance_loss_mlp": 1.02175331, "epoch": 0.48790019540057117, "flos": 24535421821440.0, "grad_norm": 1.4085129442140978, "language_loss": 0.77793092, "learning_rate": 2.1752275301297155e-06, "loss": 0.79902345, "num_input_tokens_seen": 174468485, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.4140625, "step": 8115, "time_per_iteration": 2.422938346862793 }, { "auxiliary_loss_clip": 0.01063013, "auxiliary_loss_mlp": 0.01045988, "balance_loss_clip": 1.01679325, "balance_loss_mlp": 1.02075887, "epoch": 0.48796031865323913, "flos": 21832900076160.0, "grad_norm": 2.072144954818526, "language_loss": 0.73311788, "learning_rate": 2.1748395627953915e-06, "loss": 0.75420785, "num_input_tokens_seen": 174486360, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.421875, "step": 8116, "time_per_iteration": 2.427475929260254 }, { "auxiliary_loss_clip": 0.0105896, "auxiliary_loss_mlp": 0.01045113, "balance_loss_clip": 1.0173372, "balance_loss_mlp": 1.0195266, "epoch": 0.4880204419059071, "flos": 18587897625600.0, "grad_norm": 1.670177532949957, "language_loss": 0.63914466, "learning_rate": 2.1744515888311335e-06, "loss": 0.66018534, "num_input_tokens_seen": 174505075, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.39453125, "step": 8117, "time_per_iteration": 2.3777389526367188 }, { "auxiliary_loss_clip": 0.01058629, "auxiliary_loss_mlp": 0.01041663, "balance_loss_clip": 1.01562738, "balance_loss_mlp": 1.01883805, "epoch": 0.48808056515857506, "flos": 19171156665600.0, "grad_norm": 1.6803930142318335, "language_loss": 0.80434316, "learning_rate": 2.1740636082516533e-06, "loss": 0.82534605, "num_input_tokens_seen": 174523385, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3984375, "step": 8118, "time_per_iteration": 2.421194076538086 }, { "auxiliary_loss_clip": 0.01062077, "auxiliary_loss_mlp": 0.01048497, "balance_loss_clip": 1.02049494, "balance_loss_mlp": 1.02069759, "epoch": 0.48814068841124303, "flos": 20119467548160.0, "grad_norm": 2.6399460646830035, "language_loss": 0.64873171, "learning_rate": 2.1736756210716645e-06, "loss": 0.66983747, "num_input_tokens_seen": 174542200, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.4140625, "step": 8119, "time_per_iteration": 2.3728177547454834 }, { "auxiliary_loss_clip": 0.01058614, "auxiliary_loss_mlp": 0.01037855, "balance_loss_clip": 1.01191497, "balance_loss_mlp": 1.01933479, "epoch": 0.488200811663911, "flos": 22964504929920.0, "grad_norm": 2.166139340929017, "language_loss": 0.72830266, "learning_rate": 2.173287627305878e-06, "loss": 0.74926734, "num_input_tokens_seen": 174563620, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39257812, "step": 8120, "time_per_iteration": 2.4339663982391357 }, { "auxiliary_loss_clip": 0.01060997, "auxiliary_loss_mlp": 0.01045335, "balance_loss_clip": 1.01693928, "balance_loss_mlp": 1.01879168, "epoch": 0.48826093491657896, "flos": 33909322314240.0, "grad_norm": 1.6001621993353154, "language_loss": 0.64905918, "learning_rate": 2.1728996269690075e-06, "loss": 0.6701225, "num_input_tokens_seen": 174586465, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.421875, "step": 8121, "time_per_iteration": 2.4859707355499268 }, { "auxiliary_loss_clip": 0.0106228, "auxiliary_loss_mlp": 0.01043163, "balance_loss_clip": 1.01469553, "balance_loss_mlp": 1.01985073, "epoch": 0.488321058169247, "flos": 23069349342720.0, "grad_norm": 2.056472661197915, "language_loss": 0.84618223, "learning_rate": 2.1725116200757664e-06, "loss": 0.86723655, "num_input_tokens_seen": 174604035, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.42382812, "step": 8122, "time_per_iteration": 2.396568775177002 }, { "auxiliary_loss_clip": 0.01060277, "auxiliary_loss_mlp": 0.01041841, "balance_loss_clip": 1.01392221, "balance_loss_mlp": 1.01854479, "epoch": 0.48838118142191494, "flos": 19316709590400.0, "grad_norm": 1.7044265461870343, "language_loss": 0.86421931, "learning_rate": 2.172123606640866e-06, "loss": 0.88524055, "num_input_tokens_seen": 174621715, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41796875, "step": 8123, "time_per_iteration": 2.358893632888794 }, { "auxiliary_loss_clip": 0.01061028, "auxiliary_loss_mlp": 0.0104393, "balance_loss_clip": 1.01597571, "balance_loss_mlp": 1.01903796, "epoch": 0.4884413046745829, "flos": 25409507420160.0, "grad_norm": 1.4460434426878785, "language_loss": 0.86532879, "learning_rate": 2.1717355866790227e-06, "loss": 0.88637829, "num_input_tokens_seen": 174643835, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41992188, "step": 8124, "time_per_iteration": 2.4393153190612793 }, { "auxiliary_loss_clip": 0.0106151, "auxiliary_loss_mlp": 0.01040951, "balance_loss_clip": 1.01441503, "balance_loss_mlp": 1.01960158, "epoch": 0.4885014279272509, "flos": 20990620592640.0, "grad_norm": 2.091484521835699, "language_loss": 0.8139267, "learning_rate": 2.171347560204948e-06, "loss": 0.83495134, "num_input_tokens_seen": 174660955, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.41992188, "step": 8125, "time_per_iteration": 3.8356614112854004 }, { "auxiliary_loss_clip": 0.01060151, "auxiliary_loss_mlp": 0.01043126, "balance_loss_clip": 1.01478946, "balance_loss_mlp": 1.01894784, "epoch": 0.48856155117991884, "flos": 13770756385920.0, "grad_norm": 2.2162273453194694, "language_loss": 0.73798937, "learning_rate": 2.170959527233356e-06, "loss": 0.75902212, "num_input_tokens_seen": 174678270, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.41210938, "step": 8126, "time_per_iteration": 3.752007246017456 }, { "auxiliary_loss_clip": 0.01059996, "auxiliary_loss_mlp": 0.01044361, "balance_loss_clip": 1.01725304, "balance_loss_mlp": 1.01875424, "epoch": 0.4886216744325868, "flos": 32086402162560.0, "grad_norm": 1.6515808268554515, "language_loss": 0.69848514, "learning_rate": 2.1705714877789633e-06, "loss": 0.71952868, "num_input_tokens_seen": 174698360, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.41210938, "step": 8127, "time_per_iteration": 2.469999313354492 }, { "auxiliary_loss_clip": 0.01060062, "auxiliary_loss_mlp": 0.01039233, "balance_loss_clip": 1.01084948, "balance_loss_mlp": 1.01787996, "epoch": 0.48868179768525477, "flos": 19609037337600.0, "grad_norm": 2.3586792623036077, "language_loss": 0.77954853, "learning_rate": 2.170183441856481e-06, "loss": 0.80054152, "num_input_tokens_seen": 174716755, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.421875, "step": 8128, "time_per_iteration": 3.859971523284912 }, { "auxiliary_loss_clip": 0.01059228, "auxiliary_loss_mlp": 0.01041865, "balance_loss_clip": 1.01478064, "balance_loss_mlp": 1.01954341, "epoch": 0.48874192093792274, "flos": 21285880894080.0, "grad_norm": 2.103170232123045, "language_loss": 0.76854444, "learning_rate": 2.1697953894806265e-06, "loss": 0.78955531, "num_input_tokens_seen": 174735560, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.39648438, "step": 8129, "time_per_iteration": 2.3888838291168213 }, { "auxiliary_loss_clip": 0.01060037, "auxiliary_loss_mlp": 0.01043241, "balance_loss_clip": 1.01378417, "balance_loss_mlp": 1.01907015, "epoch": 0.4888020441905907, "flos": 14172571756800.0, "grad_norm": 2.1013502591445543, "language_loss": 0.66263652, "learning_rate": 2.169407330666114e-06, "loss": 0.68366933, "num_input_tokens_seen": 174752730, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.41015625, "step": 8130, "time_per_iteration": 2.345329999923706 }, { "auxiliary_loss_clip": 0.01057346, "auxiliary_loss_mlp": 0.01044018, "balance_loss_clip": 1.01649261, "balance_loss_mlp": 1.0177108, "epoch": 0.48886216744325867, "flos": 24096738188160.0, "grad_norm": 1.7445132482629475, "language_loss": 0.73151457, "learning_rate": 2.169019265427658e-06, "loss": 0.75252819, "num_input_tokens_seen": 174772520, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.39648438, "step": 8131, "time_per_iteration": 2.413877248764038 }, { "auxiliary_loss_clip": 0.01060871, "auxiliary_loss_mlp": 0.01050328, "balance_loss_clip": 1.02174115, "balance_loss_mlp": 1.01956081, "epoch": 0.48892229069592663, "flos": 38430016266240.0, "grad_norm": 1.4926496599573764, "language_loss": 0.702613, "learning_rate": 2.1686311937799745e-06, "loss": 0.72372496, "num_input_tokens_seen": 174796540, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41210938, "step": 8132, "time_per_iteration": 2.508917808532715 }, { "auxiliary_loss_clip": 0.01058378, "auxiliary_loss_mlp": 0.01039136, "balance_loss_clip": 1.01260018, "balance_loss_mlp": 1.01850069, "epoch": 0.4889824139485946, "flos": 23842151487360.0, "grad_norm": 1.4565060458573194, "language_loss": 0.70995712, "learning_rate": 2.1682431157377797e-06, "loss": 0.73093224, "num_input_tokens_seen": 174817840, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.3984375, "step": 8133, "time_per_iteration": 2.4350242614746094 }, { "auxiliary_loss_clip": 0.01057206, "auxiliary_loss_mlp": 0.01041502, "balance_loss_clip": 1.01608658, "balance_loss_mlp": 1.01826143, "epoch": 0.48904253720126256, "flos": 24424677388800.0, "grad_norm": 1.6225443384002662, "language_loss": 0.71776098, "learning_rate": 2.1678550313157883e-06, "loss": 0.73874807, "num_input_tokens_seen": 174837885, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.390625, "step": 8134, "time_per_iteration": 3.956407308578491 }, { "auxiliary_loss_clip": 0.01062676, "auxiliary_loss_mlp": 0.01044791, "balance_loss_clip": 1.01567972, "balance_loss_mlp": 1.02005744, "epoch": 0.4891026604539306, "flos": 24169532106240.0, "grad_norm": 1.943324556443965, "language_loss": 0.81701458, "learning_rate": 2.167466940528718e-06, "loss": 0.83808923, "num_input_tokens_seen": 174855240, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.42578125, "step": 8135, "time_per_iteration": 2.4218056201934814 }, { "auxiliary_loss_clip": 0.01058166, "auxiliary_loss_mlp": 0.01038986, "balance_loss_clip": 1.01522803, "balance_loss_mlp": 1.01934624, "epoch": 0.48916278370659855, "flos": 21469873092480.0, "grad_norm": 1.8301609115127278, "language_loss": 0.75882578, "learning_rate": 2.1670788433912843e-06, "loss": 0.77979732, "num_input_tokens_seen": 174875145, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.38867188, "step": 8136, "time_per_iteration": 2.401322364807129 }, { "auxiliary_loss_clip": 0.01057548, "auxiliary_loss_mlp": 0.01040358, "balance_loss_clip": 1.01452565, "balance_loss_mlp": 1.01862967, "epoch": 0.4892229069592665, "flos": 22308661440000.0, "grad_norm": 2.0507477659623485, "language_loss": 0.73856843, "learning_rate": 2.166690739918204e-06, "loss": 0.75954747, "num_input_tokens_seen": 174894770, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38867188, "step": 8137, "time_per_iteration": 2.3714816570281982 }, { "auxiliary_loss_clip": 0.01059938, "auxiliary_loss_mlp": 0.01037959, "balance_loss_clip": 1.01327133, "balance_loss_mlp": 1.02017057, "epoch": 0.4892830302119345, "flos": 12786031088640.0, "grad_norm": 2.0077215783291686, "language_loss": 0.77492034, "learning_rate": 2.1663026301241944e-06, "loss": 0.79589933, "num_input_tokens_seen": 174912780, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.3984375, "step": 8138, "time_per_iteration": 2.38264536857605 }, { "auxiliary_loss_clip": 0.01059546, "auxiliary_loss_mlp": 0.01044583, "balance_loss_clip": 1.01917982, "balance_loss_mlp": 1.01975203, "epoch": 0.48934315346460244, "flos": 20812842616320.0, "grad_norm": 1.5701867538500145, "language_loss": 0.75384945, "learning_rate": 2.165914514023972e-06, "loss": 0.77489078, "num_input_tokens_seen": 174931250, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3984375, "step": 8139, "time_per_iteration": 2.3868191242218018 }, { "auxiliary_loss_clip": 0.01058944, "auxiliary_loss_mlp": 0.01043908, "balance_loss_clip": 1.01807523, "balance_loss_mlp": 1.01868844, "epoch": 0.4894032767172704, "flos": 19754520439680.0, "grad_norm": 1.697186804427206, "language_loss": 0.63513237, "learning_rate": 2.165526391632255e-06, "loss": 0.65616089, "num_input_tokens_seen": 174951105, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40234375, "step": 8140, "time_per_iteration": 2.391818046569824 }, { "auxiliary_loss_clip": 0.01060612, "auxiliary_loss_mlp": 0.01044319, "balance_loss_clip": 1.01529145, "balance_loss_mlp": 1.01955295, "epoch": 0.4894633999699384, "flos": 17818097857920.0, "grad_norm": 1.6024904548050614, "language_loss": 0.83456266, "learning_rate": 2.1651382629637608e-06, "loss": 0.85561198, "num_input_tokens_seen": 174969120, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.41015625, "step": 8141, "time_per_iteration": 2.375199794769287 }, { "auxiliary_loss_clip": 0.01059696, "auxiliary_loss_mlp": 0.01042069, "balance_loss_clip": 1.01593792, "balance_loss_mlp": 1.01891637, "epoch": 0.48952352322260634, "flos": 25521962509440.0, "grad_norm": 1.5298406019735393, "language_loss": 0.7323736, "learning_rate": 2.1647501280332066e-06, "loss": 0.75339127, "num_input_tokens_seen": 174991295, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40820312, "step": 8142, "time_per_iteration": 2.416191577911377 }, { "auxiliary_loss_clip": 0.0105703, "auxiliary_loss_mlp": 0.01037453, "balance_loss_clip": 1.01183546, "balance_loss_mlp": 1.01805663, "epoch": 0.4895836464752743, "flos": 29054335294080.0, "grad_norm": 1.8107850355168307, "language_loss": 0.6865229, "learning_rate": 2.1643619868553105e-06, "loss": 0.70746773, "num_input_tokens_seen": 175012830, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38867188, "step": 8143, "time_per_iteration": 2.451817274093628 }, { "auxiliary_loss_clip": 0.01056952, "auxiliary_loss_mlp": 0.01037709, "balance_loss_clip": 1.01356912, "balance_loss_mlp": 1.01808393, "epoch": 0.48964376972794227, "flos": 33545562192000.0, "grad_norm": 1.4563195439832757, "language_loss": 0.76001084, "learning_rate": 2.163973839444793e-06, "loss": 0.78095746, "num_input_tokens_seen": 175035695, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.38867188, "step": 8144, "time_per_iteration": 2.4901671409606934 }, { "auxiliary_loss_clip": 0.01058542, "auxiliary_loss_mlp": 0.01039982, "balance_loss_clip": 1.01518679, "balance_loss_mlp": 1.01942992, "epoch": 0.48970389298061023, "flos": 22052957575680.0, "grad_norm": 1.997204317839989, "language_loss": 0.77532852, "learning_rate": 2.1635856858163695e-06, "loss": 0.79631376, "num_input_tokens_seen": 175056425, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.390625, "step": 8145, "time_per_iteration": 2.420436382293701 }, { "auxiliary_loss_clip": 0.01059815, "auxiliary_loss_mlp": 0.01040108, "balance_loss_clip": 1.01425135, "balance_loss_mlp": 1.01974666, "epoch": 0.4897640162332782, "flos": 20083262601600.0, "grad_norm": 2.658579771504605, "language_loss": 0.81556249, "learning_rate": 2.163197525984761e-06, "loss": 0.83656168, "num_input_tokens_seen": 175074800, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40039062, "step": 8146, "time_per_iteration": 2.361771821975708 }, { "auxiliary_loss_clip": 0.01056258, "auxiliary_loss_mlp": 0.01039236, "balance_loss_clip": 1.01544154, "balance_loss_mlp": 1.01829481, "epoch": 0.48982413948594616, "flos": 23805073756800.0, "grad_norm": 1.5335395060797647, "language_loss": 0.75156212, "learning_rate": 2.162809359964687e-06, "loss": 0.77251703, "num_input_tokens_seen": 175094500, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37890625, "step": 8147, "time_per_iteration": 2.4015231132507324 }, { "auxiliary_loss_clip": 0.01057512, "auxiliary_loss_mlp": 0.01036249, "balance_loss_clip": 1.0115248, "balance_loss_mlp": 1.01877642, "epoch": 0.4898842627386142, "flos": 17638679047680.0, "grad_norm": 2.122172553736516, "language_loss": 0.84629315, "learning_rate": 2.162421187770864e-06, "loss": 0.86723077, "num_input_tokens_seen": 175112920, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38671875, "step": 8148, "time_per_iteration": 2.3507580757141113 }, { "auxiliary_loss_clip": 0.0105364, "auxiliary_loss_mlp": 0.01035405, "balance_loss_clip": 1.01344705, "balance_loss_mlp": 1.016891, "epoch": 0.48994438599128215, "flos": 16616980753920.0, "grad_norm": 2.052022882087746, "language_loss": 0.7555837, "learning_rate": 2.162033009418015e-06, "loss": 0.77647418, "num_input_tokens_seen": 175129910, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.3671875, "step": 8149, "time_per_iteration": 2.3609838485717773 }, { "auxiliary_loss_clip": 0.01061359, "auxiliary_loss_mlp": 0.01042738, "balance_loss_clip": 1.01715565, "balance_loss_mlp": 1.01986074, "epoch": 0.4900045092439501, "flos": 26613626901120.0, "grad_norm": 2.7346469601259393, "language_loss": 0.76905018, "learning_rate": 2.1616448249208567e-06, "loss": 0.79009116, "num_input_tokens_seen": 175148705, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.4140625, "step": 8150, "time_per_iteration": 2.4113337993621826 }, { "auxiliary_loss_clip": 0.01058695, "auxiliary_loss_mlp": 0.01043916, "balance_loss_clip": 1.01811862, "balance_loss_mlp": 1.01863587, "epoch": 0.4900646324966181, "flos": 19901085793920.0, "grad_norm": 1.8817951707232807, "language_loss": 0.7348972, "learning_rate": 2.1612566342941106e-06, "loss": 0.75592327, "num_input_tokens_seen": 175167425, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40039062, "step": 8151, "time_per_iteration": 2.3761820793151855 }, { "auxiliary_loss_clip": 0.01009405, "auxiliary_loss_mlp": 0.01002731, "balance_loss_clip": 1.00035918, "balance_loss_mlp": 1.00150132, "epoch": 0.49012475574928605, "flos": 59186682771840.0, "grad_norm": 0.8253647462615042, "language_loss": 0.54429889, "learning_rate": 2.1608684375524977e-06, "loss": 0.56442022, "num_input_tokens_seen": 175227985, "router_z_loss_clip": 0.02368164, "router_z_loss_mlp": 0.07910156, "step": 8152, "time_per_iteration": 2.9942619800567627 }, { "auxiliary_loss_clip": 0.01057742, "auxiliary_loss_mlp": 0.01040264, "balance_loss_clip": 1.01554012, "balance_loss_mlp": 1.01732433, "epoch": 0.490184879001954, "flos": 45258049751040.0, "grad_norm": 1.6768208377986304, "language_loss": 0.62249172, "learning_rate": 2.1604802347107364e-06, "loss": 0.64347172, "num_input_tokens_seen": 175251895, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.40429688, "step": 8153, "time_per_iteration": 2.5772602558135986 }, { "auxiliary_loss_clip": 0.01057699, "auxiliary_loss_mlp": 0.01038388, "balance_loss_clip": 1.01468897, "balance_loss_mlp": 1.01795018, "epoch": 0.490245002254622, "flos": 28000865796480.0, "grad_norm": 1.6370589119291723, "language_loss": 0.77561718, "learning_rate": 2.160092025783549e-06, "loss": 0.79657805, "num_input_tokens_seen": 175272770, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.3984375, "step": 8154, "time_per_iteration": 2.4156250953674316 }, { "auxiliary_loss_clip": 0.01009132, "auxiliary_loss_mlp": 0.01002908, "balance_loss_clip": 1.00066733, "balance_loss_mlp": 1.00142813, "epoch": 0.49030512550728994, "flos": 58947910917120.0, "grad_norm": 0.9727950300438993, "language_loss": 0.66964078, "learning_rate": 2.1597038107856564e-06, "loss": 0.68976116, "num_input_tokens_seen": 175336320, "router_z_loss_clip": 0.02246094, "router_z_loss_mlp": 0.07714844, "step": 8155, "time_per_iteration": 3.1068997383117676 }, { "auxiliary_loss_clip": 0.01058089, "auxiliary_loss_mlp": 0.01041111, "balance_loss_clip": 1.01710272, "balance_loss_mlp": 1.01878119, "epoch": 0.4903652487599579, "flos": 19790830120320.0, "grad_norm": 2.940843269192393, "language_loss": 0.77508807, "learning_rate": 2.1593155897317784e-06, "loss": 0.79607999, "num_input_tokens_seen": 175353540, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.39257812, "step": 8156, "time_per_iteration": 2.3568928241729736 }, { "auxiliary_loss_clip": 0.0105592, "auxiliary_loss_mlp": 0.01039414, "balance_loss_clip": 1.01494098, "balance_loss_mlp": 1.01700771, "epoch": 0.49042537201262587, "flos": 21761013853440.0, "grad_norm": 2.066935992227891, "language_loss": 0.84754622, "learning_rate": 2.1589273626366377e-06, "loss": 0.86849958, "num_input_tokens_seen": 175370445, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.38867188, "step": 8157, "time_per_iteration": 2.394874095916748 }, { "auxiliary_loss_clip": 0.01055969, "auxiliary_loss_mlp": 0.01042962, "balance_loss_clip": 1.01816607, "balance_loss_mlp": 1.01750171, "epoch": 0.49048549526529384, "flos": 18952041772800.0, "grad_norm": 2.364658509754103, "language_loss": 0.80293334, "learning_rate": 2.158539129514956e-06, "loss": 0.82392263, "num_input_tokens_seen": 175389020, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38476562, "step": 8158, "time_per_iteration": 2.354426383972168 }, { "auxiliary_loss_clip": 0.01060396, "auxiliary_loss_mlp": 0.01045659, "balance_loss_clip": 1.01986194, "balance_loss_mlp": 1.01937056, "epoch": 0.4905456185179618, "flos": 26905186598400.0, "grad_norm": 2.054086097220237, "language_loss": 0.70778286, "learning_rate": 2.158150890381454e-06, "loss": 0.72884345, "num_input_tokens_seen": 175409545, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.41015625, "step": 8159, "time_per_iteration": 2.439462900161743 }, { "auxiliary_loss_clip": 0.01055996, "auxiliary_loss_mlp": 0.01041186, "balance_loss_clip": 1.01636696, "balance_loss_mlp": 1.01696575, "epoch": 0.49060574177062977, "flos": 20411306536320.0, "grad_norm": 1.9521441464888611, "language_loss": 0.73831618, "learning_rate": 2.157762645250854e-06, "loss": 0.75928795, "num_input_tokens_seen": 175429335, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.390625, "step": 8160, "time_per_iteration": 2.356128215789795 }, { "auxiliary_loss_clip": 0.01059439, "auxiliary_loss_mlp": 0.01046612, "balance_loss_clip": 1.01824069, "balance_loss_mlp": 1.0183388, "epoch": 0.4906658650232978, "flos": 17492742097920.0, "grad_norm": 1.8289368882976844, "language_loss": 0.72796524, "learning_rate": 2.1573743941378796e-06, "loss": 0.7490257, "num_input_tokens_seen": 175446955, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.41015625, "step": 8161, "time_per_iteration": 2.3589413166046143 }, { "auxiliary_loss_clip": 0.01056837, "auxiliary_loss_mlp": 0.01045515, "balance_loss_clip": 1.02096963, "balance_loss_mlp": 1.01815271, "epoch": 0.49072598827596575, "flos": 26613242876160.0, "grad_norm": 1.600070159444585, "language_loss": 0.70210409, "learning_rate": 2.1569861370572517e-06, "loss": 0.7231276, "num_input_tokens_seen": 175468195, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.38671875, "step": 8162, "time_per_iteration": 2.3958933353424072 }, { "auxiliary_loss_clip": 0.010605, "auxiliary_loss_mlp": 0.01041619, "balance_loss_clip": 1.01318741, "balance_loss_mlp": 1.01808453, "epoch": 0.4907861115286337, "flos": 20411550915840.0, "grad_norm": 1.8377627708938684, "language_loss": 0.65298927, "learning_rate": 2.1565978740236944e-06, "loss": 0.67401046, "num_input_tokens_seen": 175487455, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.42382812, "step": 8163, "time_per_iteration": 2.3561549186706543 }, { "auxiliary_loss_clip": 0.01055658, "auxiliary_loss_mlp": 0.01038754, "balance_loss_clip": 1.01466203, "balance_loss_mlp": 1.01761353, "epoch": 0.4908462347813017, "flos": 14063398335360.0, "grad_norm": 2.2513844148426827, "language_loss": 0.78545928, "learning_rate": 2.1562096050519293e-06, "loss": 0.8064034, "num_input_tokens_seen": 175504450, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.38085938, "step": 8164, "time_per_iteration": 2.3424456119537354 }, { "auxiliary_loss_clip": 0.01059656, "auxiliary_loss_mlp": 0.01040473, "balance_loss_clip": 1.01293588, "balance_loss_mlp": 1.01826119, "epoch": 0.49090635803396965, "flos": 18734078954880.0, "grad_norm": 1.5587581116206426, "language_loss": 0.78153801, "learning_rate": 2.1558213301566806e-06, "loss": 0.80253935, "num_input_tokens_seen": 175523600, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.4140625, "step": 8165, "time_per_iteration": 5.008649110794067 }, { "auxiliary_loss_clip": 0.01057022, "auxiliary_loss_mlp": 0.01042096, "balance_loss_clip": 1.01709807, "balance_loss_mlp": 1.01797605, "epoch": 0.4909664812866376, "flos": 20557452954240.0, "grad_norm": 1.707293990604302, "language_loss": 0.78386128, "learning_rate": 2.1554330493526716e-06, "loss": 0.80485243, "num_input_tokens_seen": 175542720, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.390625, "step": 8166, "time_per_iteration": 2.4067025184631348 }, { "auxiliary_loss_clip": 0.01009786, "auxiliary_loss_mlp": 0.01004439, "balance_loss_clip": 1.00228119, "balance_loss_mlp": 1.00187361, "epoch": 0.4910266045393056, "flos": 54680686502400.0, "grad_norm": 0.7930940472735298, "language_loss": 0.54189771, "learning_rate": 2.1550447626546253e-06, "loss": 0.56203997, "num_input_tokens_seen": 175598640, "router_z_loss_clip": 0.02160645, "router_z_loss_mlp": 0.07910156, "step": 8167, "time_per_iteration": 4.395890712738037 }, { "auxiliary_loss_clip": 0.01056674, "auxiliary_loss_mlp": 0.01037215, "balance_loss_clip": 1.01228786, "balance_loss_mlp": 1.01771259, "epoch": 0.49108672779197354, "flos": 16245714689280.0, "grad_norm": 1.866298655890613, "language_loss": 0.8730545, "learning_rate": 2.1546564700772665e-06, "loss": 0.89399338, "num_input_tokens_seen": 175615675, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.390625, "step": 8168, "time_per_iteration": 2.333305597305298 }, { "auxiliary_loss_clip": 0.01056552, "auxiliary_loss_mlp": 0.01042329, "balance_loss_clip": 1.0177598, "balance_loss_mlp": 1.01893497, "epoch": 0.4911468510446415, "flos": 19824486537600.0, "grad_norm": 1.685539208713929, "language_loss": 0.74043083, "learning_rate": 2.1542681716353193e-06, "loss": 0.76141965, "num_input_tokens_seen": 175632255, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.375, "step": 8169, "time_per_iteration": 2.377804756164551 }, { "auxiliary_loss_clip": 0.01056392, "auxiliary_loss_mlp": 0.0103262, "balance_loss_clip": 1.00966084, "balance_loss_mlp": 1.01764989, "epoch": 0.4912069742973095, "flos": 21211690521600.0, "grad_norm": 1.5335941734494924, "language_loss": 0.79503214, "learning_rate": 2.1538798673435068e-06, "loss": 0.81592232, "num_input_tokens_seen": 175651625, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.38671875, "step": 8170, "time_per_iteration": 2.3817551136016846 }, { "auxiliary_loss_clip": 0.01057949, "auxiliary_loss_mlp": 0.01040259, "balance_loss_clip": 1.01491547, "balance_loss_mlp": 1.01820052, "epoch": 0.49126709754997744, "flos": 19536103774080.0, "grad_norm": 2.0576197823686733, "language_loss": 0.77556258, "learning_rate": 2.1534915572165545e-06, "loss": 0.79654467, "num_input_tokens_seen": 175669265, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.39648438, "step": 8171, "time_per_iteration": 2.391784429550171 }, { "auxiliary_loss_clip": 0.01059434, "auxiliary_loss_mlp": 0.01041316, "balance_loss_clip": 1.01585281, "balance_loss_mlp": 1.01899576, "epoch": 0.4913272208026454, "flos": 12238872261120.0, "grad_norm": 1.7586130310535992, "language_loss": 0.82804209, "learning_rate": 2.1531032412691875e-06, "loss": 0.84904957, "num_input_tokens_seen": 175686065, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40429688, "step": 8172, "time_per_iteration": 2.3469550609588623 }, { "auxiliary_loss_clip": 0.01011973, "auxiliary_loss_mlp": 0.01008965, "balance_loss_clip": 1.00677204, "balance_loss_mlp": 1.0038892, "epoch": 0.49138734405531337, "flos": 65462739661440.0, "grad_norm": 0.6922227780624317, "language_loss": 0.53407866, "learning_rate": 2.1527149195161295e-06, "loss": 0.55428803, "num_input_tokens_seen": 175748595, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.08105469, "step": 8173, "time_per_iteration": 3.0295400619506836 }, { "auxiliary_loss_clip": 0.01060324, "auxiliary_loss_mlp": 0.01044419, "balance_loss_clip": 1.01677406, "balance_loss_mlp": 1.01982617, "epoch": 0.4914474673079814, "flos": 18438155337600.0, "grad_norm": 1.9509399630197848, "language_loss": 0.64335835, "learning_rate": 2.152326591972107e-06, "loss": 0.66440582, "num_input_tokens_seen": 175766770, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40625, "step": 8174, "time_per_iteration": 3.874443769454956 }, { "auxiliary_loss_clip": 0.01058399, "auxiliary_loss_mlp": 0.01041675, "balance_loss_clip": 1.01603317, "balance_loss_mlp": 1.01862979, "epoch": 0.49150759056064935, "flos": 21684100394880.0, "grad_norm": 1.70332980914012, "language_loss": 0.70608956, "learning_rate": 2.1519382586518445e-06, "loss": 0.7270903, "num_input_tokens_seen": 175783605, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3984375, "step": 8175, "time_per_iteration": 2.3995161056518555 }, { "auxiliary_loss_clip": 0.01058477, "auxiliary_loss_mlp": 0.01040358, "balance_loss_clip": 1.01506162, "balance_loss_mlp": 1.01926398, "epoch": 0.4915677138133173, "flos": 22381350624000.0, "grad_norm": 1.5661662011380222, "language_loss": 0.75732219, "learning_rate": 2.151549919570068e-06, "loss": 0.77831054, "num_input_tokens_seen": 175801390, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.39257812, "step": 8176, "time_per_iteration": 2.396594524383545 }, { "auxiliary_loss_clip": 0.01059683, "auxiliary_loss_mlp": 0.01038428, "balance_loss_clip": 1.01195168, "balance_loss_mlp": 1.01991248, "epoch": 0.4916278370659853, "flos": 18401985302400.0, "grad_norm": 1.9207388945249415, "language_loss": 0.71162635, "learning_rate": 2.1511615747415036e-06, "loss": 0.73260748, "num_input_tokens_seen": 175819830, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39648438, "step": 8177, "time_per_iteration": 2.3862104415893555 }, { "auxiliary_loss_clip": 0.01013131, "auxiliary_loss_mlp": 0.01003184, "balance_loss_clip": 1.00085938, "balance_loss_mlp": 1.00515008, "epoch": 0.49168796031865325, "flos": 66605620884480.0, "grad_norm": 0.6881099443012091, "language_loss": 0.46261072, "learning_rate": 2.150773224180877e-06, "loss": 0.48277387, "num_input_tokens_seen": 175881765, "router_z_loss_clip": 0.02319336, "router_z_loss_mlp": 0.07958984, "step": 8178, "time_per_iteration": 2.994487762451172 }, { "auxiliary_loss_clip": 0.01062561, "auxiliary_loss_mlp": 0.01044393, "balance_loss_clip": 1.0166415, "balance_loss_mlp": 1.02044821, "epoch": 0.4917480835713212, "flos": 20958290807040.0, "grad_norm": 2.032064382773035, "language_loss": 0.66825706, "learning_rate": 2.1503848679029147e-06, "loss": 0.68932664, "num_input_tokens_seen": 175901795, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41992188, "step": 8179, "time_per_iteration": 2.4026243686676025 }, { "auxiliary_loss_clip": 0.01061591, "auxiliary_loss_mlp": 0.01042112, "balance_loss_clip": 1.01551652, "balance_loss_mlp": 1.01939702, "epoch": 0.4918082068239892, "flos": 15772152741120.0, "grad_norm": 2.067148499491524, "language_loss": 0.71206617, "learning_rate": 2.149996505922343e-06, "loss": 0.73310316, "num_input_tokens_seen": 175917770, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.421875, "step": 8180, "time_per_iteration": 2.3385813236236572 }, { "auxiliary_loss_clip": 0.01057598, "auxiliary_loss_mlp": 0.01048606, "balance_loss_clip": 1.02296424, "balance_loss_mlp": 1.01857948, "epoch": 0.49186833007665715, "flos": 24603747085440.0, "grad_norm": 1.6716814887533704, "language_loss": 0.85415155, "learning_rate": 2.1496081382538895e-06, "loss": 0.87521362, "num_input_tokens_seen": 175937000, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.390625, "step": 8181, "time_per_iteration": 2.4223759174346924 }, { "auxiliary_loss_clip": 0.01054489, "auxiliary_loss_mlp": 0.0104115, "balance_loss_clip": 1.01853585, "balance_loss_mlp": 1.01671863, "epoch": 0.4919284533293251, "flos": 22089476724480.0, "grad_norm": 1.908224590801519, "language_loss": 0.74953169, "learning_rate": 2.1492197649122793e-06, "loss": 0.77048808, "num_input_tokens_seen": 175955170, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.37890625, "step": 8182, "time_per_iteration": 2.3814311027526855 }, { "auxiliary_loss_clip": 0.01058169, "auxiliary_loss_mlp": 0.01043639, "balance_loss_clip": 1.01803279, "balance_loss_mlp": 1.0189625, "epoch": 0.4919885765819931, "flos": 23366913793920.0, "grad_norm": 1.7460629112421506, "language_loss": 0.73675412, "learning_rate": 2.1488313859122412e-06, "loss": 0.75777221, "num_input_tokens_seen": 175973725, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.39257812, "step": 8183, "time_per_iteration": 2.4056015014648438 }, { "auxiliary_loss_clip": 0.01059989, "auxiliary_loss_mlp": 0.01044878, "balance_loss_clip": 1.01630306, "balance_loss_mlp": 1.01805937, "epoch": 0.49204869983466104, "flos": 21359442862080.0, "grad_norm": 2.3384715469122948, "language_loss": 0.78907377, "learning_rate": 2.1484430012685015e-06, "loss": 0.81012237, "num_input_tokens_seen": 175993885, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41992188, "step": 8184, "time_per_iteration": 2.3771708011627197 }, { "auxiliary_loss_clip": 0.01058861, "auxiliary_loss_mlp": 0.01046511, "balance_loss_clip": 1.02183437, "balance_loss_mlp": 1.01941299, "epoch": 0.492108823087329, "flos": 21141619689600.0, "grad_norm": 2.419587360844459, "language_loss": 0.72046697, "learning_rate": 2.148054610995789e-06, "loss": 0.74152064, "num_input_tokens_seen": 176014210, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.39453125, "step": 8185, "time_per_iteration": 2.431516647338867 }, { "auxiliary_loss_clip": 0.01058876, "auxiliary_loss_mlp": 0.01051892, "balance_loss_clip": 1.02588093, "balance_loss_mlp": 1.01873326, "epoch": 0.49216894633999697, "flos": 25115503927680.0, "grad_norm": 1.616783326946073, "language_loss": 0.75547659, "learning_rate": 2.147666215108831e-06, "loss": 0.77658427, "num_input_tokens_seen": 176033890, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.40234375, "step": 8186, "time_per_iteration": 2.407060146331787 }, { "auxiliary_loss_clip": 0.01058643, "auxiliary_loss_mlp": 0.01044112, "balance_loss_clip": 1.01848185, "balance_loss_mlp": 1.01809597, "epoch": 0.49222906959266494, "flos": 22636845020160.0, "grad_norm": 2.244494213405238, "language_loss": 0.69259828, "learning_rate": 2.1472778136223545e-06, "loss": 0.71362585, "num_input_tokens_seen": 176052720, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.40429688, "step": 8187, "time_per_iteration": 2.4409854412078857 }, { "auxiliary_loss_clip": 0.01057681, "auxiliary_loss_mlp": 0.01051286, "balance_loss_clip": 1.02542973, "balance_loss_mlp": 1.01781988, "epoch": 0.49228919284533296, "flos": 20409560968320.0, "grad_norm": 1.4235746634950386, "language_loss": 0.67824864, "learning_rate": 2.1468894065510894e-06, "loss": 0.69933832, "num_input_tokens_seen": 176072545, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3984375, "step": 8188, "time_per_iteration": 2.3628766536712646 }, { "auxiliary_loss_clip": 0.0105937, "auxiliary_loss_mlp": 0.01044705, "balance_loss_clip": 1.01963496, "balance_loss_mlp": 1.019894, "epoch": 0.4923493160980009, "flos": 27121229291520.0, "grad_norm": 1.6463921577382392, "language_loss": 0.75319231, "learning_rate": 2.1465009939097623e-06, "loss": 0.7742331, "num_input_tokens_seen": 176091490, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.39453125, "step": 8189, "time_per_iteration": 2.4471213817596436 }, { "auxiliary_loss_clip": 0.01055319, "auxiliary_loss_mlp": 0.01039196, "balance_loss_clip": 1.01618814, "balance_loss_mlp": 1.0178678, "epoch": 0.4924094393506689, "flos": 35735209931520.0, "grad_norm": 1.8768953848888983, "language_loss": 0.65482336, "learning_rate": 2.146112575713104e-06, "loss": 0.67576849, "num_input_tokens_seen": 176113200, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.375, "step": 8190, "time_per_iteration": 2.510164260864258 }, { "auxiliary_loss_clip": 0.01059002, "auxiliary_loss_mlp": 0.0104031, "balance_loss_clip": 1.01700461, "balance_loss_mlp": 1.0202961, "epoch": 0.49246956260333685, "flos": 20411446181760.0, "grad_norm": 1.9522367787871127, "language_loss": 0.72348535, "learning_rate": 2.1457241519758413e-06, "loss": 0.74447846, "num_input_tokens_seen": 176132485, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.38671875, "step": 8191, "time_per_iteration": 2.38382625579834 }, { "auxiliary_loss_clip": 0.01058811, "auxiliary_loss_mlp": 0.01042657, "balance_loss_clip": 1.01788497, "balance_loss_mlp": 1.01925564, "epoch": 0.4925296858560048, "flos": 38975569171200.0, "grad_norm": 1.6071144186086403, "language_loss": 0.72665834, "learning_rate": 2.1453357227127043e-06, "loss": 0.74767303, "num_input_tokens_seen": 176155755, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.39453125, "step": 8192, "time_per_iteration": 2.5199601650238037 }, { "auxiliary_loss_clip": 0.01014084, "auxiliary_loss_mlp": 0.01019631, "balance_loss_clip": 1.01707995, "balance_loss_mlp": 1.00607872, "epoch": 0.4925898091086728, "flos": 64275342721920.0, "grad_norm": 0.7420362473476545, "language_loss": 0.52282012, "learning_rate": 2.1449472879384224e-06, "loss": 0.54315722, "num_input_tokens_seen": 176216295, "router_z_loss_clip": 0.0255127, "router_z_loss_mlp": 0.08007812, "step": 8193, "time_per_iteration": 3.100430488586426 }, { "auxiliary_loss_clip": 0.01058477, "auxiliary_loss_mlp": 0.01043905, "balance_loss_clip": 1.01847744, "balance_loss_mlp": 1.01972675, "epoch": 0.49264993236134075, "flos": 23035343811840.0, "grad_norm": 1.587204482645328, "language_loss": 0.78063792, "learning_rate": 2.1445588476677246e-06, "loss": 0.80166173, "num_input_tokens_seen": 176235925, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38671875, "step": 8194, "time_per_iteration": 2.4033992290496826 }, { "auxiliary_loss_clip": 0.01057854, "auxiliary_loss_mlp": 0.01035911, "balance_loss_clip": 1.01098454, "balance_loss_mlp": 1.01891828, "epoch": 0.4927100556140087, "flos": 24717040047360.0, "grad_norm": 2.095925504159037, "language_loss": 0.71014535, "learning_rate": 2.144170401915341e-06, "loss": 0.73108298, "num_input_tokens_seen": 176253865, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.38867188, "step": 8195, "time_per_iteration": 2.4212963581085205 }, { "auxiliary_loss_clip": 0.01061196, "auxiliary_loss_mlp": 0.01039957, "balance_loss_clip": 1.01575696, "balance_loss_mlp": 1.02058458, "epoch": 0.4927701788666767, "flos": 23504646574080.0, "grad_norm": 2.053252842670129, "language_loss": 0.82147312, "learning_rate": 2.143781950696001e-06, "loss": 0.84248459, "num_input_tokens_seen": 176271525, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.40625, "step": 8196, "time_per_iteration": 2.402113676071167 }, { "auxiliary_loss_clip": 0.01059531, "auxiliary_loss_mlp": 0.01039336, "balance_loss_clip": 1.01334882, "balance_loss_mlp": 1.01924455, "epoch": 0.49283030211934464, "flos": 22927811224320.0, "grad_norm": 2.0380283180650056, "language_loss": 0.71790075, "learning_rate": 2.1433934940244356e-06, "loss": 0.7388894, "num_input_tokens_seen": 176290810, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.40234375, "step": 8197, "time_per_iteration": 2.4418551921844482 }, { "auxiliary_loss_clip": 0.01056619, "auxiliary_loss_mlp": 0.01035087, "balance_loss_clip": 1.0131768, "balance_loss_mlp": 1.01907778, "epoch": 0.4928904253720126, "flos": 16872091125120.0, "grad_norm": 1.803349432958303, "language_loss": 0.86203557, "learning_rate": 2.143005031915374e-06, "loss": 0.88295263, "num_input_tokens_seen": 176309165, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.375, "step": 8198, "time_per_iteration": 2.3887691497802734 }, { "auxiliary_loss_clip": 0.01062613, "auxiliary_loss_mlp": 0.01046215, "balance_loss_clip": 1.02104998, "balance_loss_mlp": 1.02215624, "epoch": 0.4929505486246806, "flos": 14865667534080.0, "grad_norm": 2.0101488196083723, "language_loss": 0.77146447, "learning_rate": 2.1426165643835467e-06, "loss": 0.79255277, "num_input_tokens_seen": 176324960, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.40429688, "step": 8199, "time_per_iteration": 2.410046339035034 }, { "auxiliary_loss_clip": 0.0106521, "auxiliary_loss_mlp": 0.01050747, "balance_loss_clip": 1.0217793, "balance_loss_mlp": 1.02253222, "epoch": 0.49301067187734854, "flos": 23841208880640.0, "grad_norm": 1.734005350232476, "language_loss": 0.60565978, "learning_rate": 2.1422280914436864e-06, "loss": 0.62681937, "num_input_tokens_seen": 176346195, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.42773438, "step": 8200, "time_per_iteration": 2.4281396865844727 }, { "auxiliary_loss_clip": 0.01057032, "auxiliary_loss_mlp": 0.01043238, "balance_loss_clip": 1.02030206, "balance_loss_mlp": 1.02029479, "epoch": 0.49307079513001656, "flos": 22490209843200.0, "grad_norm": 1.3854640134829341, "language_loss": 0.80026418, "learning_rate": 2.1418396131105213e-06, "loss": 0.82126689, "num_input_tokens_seen": 176366735, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.3671875, "step": 8201, "time_per_iteration": 2.474414348602295 }, { "auxiliary_loss_clip": 0.01064648, "auxiliary_loss_mlp": 0.01043463, "balance_loss_clip": 1.01577067, "balance_loss_mlp": 1.02217472, "epoch": 0.4931309183826845, "flos": 15923675508480.0, "grad_norm": 1.9912349292768199, "language_loss": 0.68719321, "learning_rate": 2.141451129398785e-06, "loss": 0.7082743, "num_input_tokens_seen": 176384475, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.42578125, "step": 8202, "time_per_iteration": 2.3576419353485107 }, { "auxiliary_loss_clip": 0.0105939, "auxiliary_loss_mlp": 0.01039888, "balance_loss_clip": 1.0154022, "balance_loss_mlp": 1.02025831, "epoch": 0.4931910416353525, "flos": 27307804930560.0, "grad_norm": 1.9342093569552468, "language_loss": 0.76569545, "learning_rate": 2.1410626403232076e-06, "loss": 0.78668821, "num_input_tokens_seen": 176402645, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.390625, "step": 8203, "time_per_iteration": 2.494159698486328 }, { "auxiliary_loss_clip": 0.0105823, "auxiliary_loss_mlp": 0.0104641, "balance_loss_clip": 1.02187729, "balance_loss_mlp": 1.01929832, "epoch": 0.49325116488802045, "flos": 20805301762560.0, "grad_norm": 2.0748903592297796, "language_loss": 0.81646097, "learning_rate": 2.1406741458985197e-06, "loss": 0.83750737, "num_input_tokens_seen": 176416715, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.38867188, "step": 8204, "time_per_iteration": 3.678524971008301 }, { "auxiliary_loss_clip": 0.01058667, "auxiliary_loss_mlp": 0.01045727, "balance_loss_clip": 1.02204037, "balance_loss_mlp": 1.02051842, "epoch": 0.4933112881406884, "flos": 19864915758720.0, "grad_norm": 1.9999615758946698, "language_loss": 0.66783094, "learning_rate": 2.140285646139455e-06, "loss": 0.68887484, "num_input_tokens_seen": 176435755, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.38085938, "step": 8205, "time_per_iteration": 3.7977397441864014 }, { "auxiliary_loss_clip": 0.01062808, "auxiliary_loss_mlp": 0.01049974, "balance_loss_clip": 1.02015948, "balance_loss_mlp": 1.02042866, "epoch": 0.4933714113933564, "flos": 21827104790400.0, "grad_norm": 1.8410449372653892, "language_loss": 0.67192423, "learning_rate": 2.139897141060744e-06, "loss": 0.69305205, "num_input_tokens_seen": 176453915, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.42382812, "step": 8206, "time_per_iteration": 3.7696752548217773 }, { "auxiliary_loss_clip": 0.01058987, "auxiliary_loss_mlp": 0.01051431, "balance_loss_clip": 1.02668333, "balance_loss_mlp": 1.01902962, "epoch": 0.49343153464602435, "flos": 27888934377600.0, "grad_norm": 1.7751824703527865, "language_loss": 0.77282161, "learning_rate": 2.1395086306771196e-06, "loss": 0.79392576, "num_input_tokens_seen": 176475175, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.3984375, "step": 8207, "time_per_iteration": 2.4647908210754395 }, { "auxiliary_loss_clip": 0.01058454, "auxiliary_loss_mlp": 0.01047279, "balance_loss_clip": 1.02015889, "balance_loss_mlp": 1.01943207, "epoch": 0.4934916578986923, "flos": 24679927405440.0, "grad_norm": 2.068294801362506, "language_loss": 0.61973751, "learning_rate": 2.1391201150033147e-06, "loss": 0.64079487, "num_input_tokens_seen": 176494250, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.390625, "step": 8208, "time_per_iteration": 2.4150383472442627 }, { "auxiliary_loss_clip": 0.01060296, "auxiliary_loss_mlp": 0.01042733, "balance_loss_clip": 1.01521969, "balance_loss_mlp": 1.02012634, "epoch": 0.4935517811513603, "flos": 23403991524480.0, "grad_norm": 2.0389138260299857, "language_loss": 0.80702686, "learning_rate": 2.1387315940540598e-06, "loss": 0.82805717, "num_input_tokens_seen": 176513325, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.40234375, "step": 8209, "time_per_iteration": 2.3893864154815674 }, { "auxiliary_loss_clip": 0.01057415, "auxiliary_loss_mlp": 0.01042129, "balance_loss_clip": 1.01710665, "balance_loss_mlp": 1.01856017, "epoch": 0.49361190440402825, "flos": 21943435040640.0, "grad_norm": 2.5814654532514822, "language_loss": 0.80068839, "learning_rate": 2.138343067844089e-06, "loss": 0.82168382, "num_input_tokens_seen": 176532915, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38867188, "step": 8210, "time_per_iteration": 2.4062931537628174 }, { "auxiliary_loss_clip": 0.01059778, "auxiliary_loss_mlp": 0.01054383, "balance_loss_clip": 1.02545047, "balance_loss_mlp": 1.01803637, "epoch": 0.4936720276566962, "flos": 25114596232320.0, "grad_norm": 2.3068374399036147, "language_loss": 0.82420647, "learning_rate": 2.1379545363881363e-06, "loss": 0.84534812, "num_input_tokens_seen": 176552775, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41796875, "step": 8211, "time_per_iteration": 2.419793128967285 }, { "auxiliary_loss_clip": 0.01059462, "auxiliary_loss_mlp": 0.01047263, "balance_loss_clip": 1.02112007, "balance_loss_mlp": 1.01956129, "epoch": 0.4937321509093642, "flos": 26357748480000.0, "grad_norm": 2.359071535339109, "language_loss": 0.92891288, "learning_rate": 2.137565999700933e-06, "loss": 0.94998014, "num_input_tokens_seen": 176572185, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3984375, "step": 8212, "time_per_iteration": 2.422281503677368 }, { "auxiliary_loss_clip": 0.0105867, "auxiliary_loss_mlp": 0.01045832, "balance_loss_clip": 1.01979685, "balance_loss_mlp": 1.01768172, "epoch": 0.49379227416203214, "flos": 22960420300800.0, "grad_norm": 3.214978919155108, "language_loss": 0.6608308, "learning_rate": 2.1371774577972138e-06, "loss": 0.68187582, "num_input_tokens_seen": 176591490, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.41015625, "step": 8213, "time_per_iteration": 3.9596500396728516 }, { "auxiliary_loss_clip": 0.01057959, "auxiliary_loss_mlp": 0.01041295, "balance_loss_clip": 1.01541507, "balance_loss_mlp": 1.01793206, "epoch": 0.49385239741470016, "flos": 32487728774400.0, "grad_norm": 2.8797496189176037, "language_loss": 0.77202147, "learning_rate": 2.136788910691711e-06, "loss": 0.79301405, "num_input_tokens_seen": 176612715, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.40039062, "step": 8214, "time_per_iteration": 2.502980947494507 }, { "auxiliary_loss_clip": 0.01059831, "auxiliary_loss_mlp": 0.01047782, "balance_loss_clip": 1.0199945, "balance_loss_mlp": 1.01952517, "epoch": 0.4939125206673681, "flos": 22491745943040.0, "grad_norm": 1.7245932757777973, "language_loss": 0.85474902, "learning_rate": 2.1364003583991594e-06, "loss": 0.87582517, "num_input_tokens_seen": 176631950, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40234375, "step": 8215, "time_per_iteration": 2.4322280883789062 }, { "auxiliary_loss_clip": 0.01055726, "auxiliary_loss_mlp": 0.01039409, "balance_loss_clip": 1.01541233, "balance_loss_mlp": 1.0175302, "epoch": 0.4939726439200361, "flos": 31174994453760.0, "grad_norm": 1.7278753821708002, "language_loss": 0.84423363, "learning_rate": 2.136011800934292e-06, "loss": 0.8651849, "num_input_tokens_seen": 176653060, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.3828125, "step": 8216, "time_per_iteration": 2.4457359313964844 }, { "auxiliary_loss_clip": 0.01057039, "auxiliary_loss_mlp": 0.01049341, "balance_loss_clip": 1.02247143, "balance_loss_mlp": 1.01816869, "epoch": 0.49403276717270406, "flos": 22673119789440.0, "grad_norm": 1.4016205944531097, "language_loss": 0.74982679, "learning_rate": 2.1356232383118442e-06, "loss": 0.77089059, "num_input_tokens_seen": 176673895, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.38867188, "step": 8217, "time_per_iteration": 2.4102461338043213 }, { "auxiliary_loss_clip": 0.01056106, "auxiliary_loss_mlp": 0.01042741, "balance_loss_clip": 1.01429737, "balance_loss_mlp": 1.0179292, "epoch": 0.494092890425372, "flos": 20740013786880.0, "grad_norm": 2.874638033780201, "language_loss": 0.79207373, "learning_rate": 2.1352346705465494e-06, "loss": 0.81306219, "num_input_tokens_seen": 176692550, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.3828125, "step": 8218, "time_per_iteration": 2.3693618774414062 }, { "auxiliary_loss_clip": 0.01055265, "auxiliary_loss_mlp": 0.01040624, "balance_loss_clip": 1.0154705, "balance_loss_mlp": 1.01732385, "epoch": 0.49415301367804, "flos": 18368049594240.0, "grad_norm": 2.1609581600777026, "language_loss": 0.77207458, "learning_rate": 2.134846097653142e-06, "loss": 0.79303348, "num_input_tokens_seen": 176709335, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37890625, "step": 8219, "time_per_iteration": 2.3816721439361572 }, { "auxiliary_loss_clip": 0.01058578, "auxiliary_loss_mlp": 0.01044757, "balance_loss_clip": 1.01870966, "balance_loss_mlp": 1.01919842, "epoch": 0.49421313693070795, "flos": 17529645271680.0, "grad_norm": 1.7449219582666309, "language_loss": 0.63312006, "learning_rate": 2.134457519646357e-06, "loss": 0.65415347, "num_input_tokens_seen": 176727715, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.39453125, "step": 8220, "time_per_iteration": 2.345715045928955 }, { "auxiliary_loss_clip": 0.01057716, "auxiliary_loss_mlp": 0.01039801, "balance_loss_clip": 1.01442099, "balance_loss_mlp": 1.0182333, "epoch": 0.4942732601833759, "flos": 20811166871040.0, "grad_norm": 2.6347165693063754, "language_loss": 0.73311687, "learning_rate": 2.1340689365409296e-06, "loss": 0.75409198, "num_input_tokens_seen": 176747530, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.39453125, "step": 8221, "time_per_iteration": 2.379610776901245 }, { "auxiliary_loss_clip": 0.01059522, "auxiliary_loss_mlp": 0.01044359, "balance_loss_clip": 1.0167861, "balance_loss_mlp": 1.02154303, "epoch": 0.4943333834360439, "flos": 15048053809920.0, "grad_norm": 1.683034357917035, "language_loss": 0.80705798, "learning_rate": 2.133680348351595e-06, "loss": 0.82809675, "num_input_tokens_seen": 176765260, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.37890625, "step": 8222, "time_per_iteration": 2.383652925491333 }, { "auxiliary_loss_clip": 0.01058995, "auxiliary_loss_mlp": 0.01043173, "balance_loss_clip": 1.01596963, "balance_loss_mlp": 1.01968575, "epoch": 0.49439350668871185, "flos": 16069507724160.0, "grad_norm": 2.7672905962714838, "language_loss": 0.73846442, "learning_rate": 2.133291755093088e-06, "loss": 0.75948608, "num_input_tokens_seen": 176781770, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.39453125, "step": 8223, "time_per_iteration": 2.3419268131256104 }, { "auxiliary_loss_clip": 0.01059319, "auxiliary_loss_mlp": 0.01048618, "balance_loss_clip": 1.02006686, "balance_loss_mlp": 1.01865244, "epoch": 0.4944536299413798, "flos": 20879212844160.0, "grad_norm": 1.83431303896175, "language_loss": 0.76387358, "learning_rate": 2.132903156780144e-06, "loss": 0.784953, "num_input_tokens_seen": 176800655, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40625, "step": 8224, "time_per_iteration": 2.402845621109009 }, { "auxiliary_loss_clip": 0.01061231, "auxiliary_loss_mlp": 0.01038648, "balance_loss_clip": 1.01034784, "balance_loss_mlp": 1.02050757, "epoch": 0.4945137531940478, "flos": 26607866526720.0, "grad_norm": 7.233911542803125, "language_loss": 0.65572631, "learning_rate": 2.1325145534274997e-06, "loss": 0.67672509, "num_input_tokens_seen": 176820610, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.40625, "step": 8225, "time_per_iteration": 2.4194650650024414 }, { "auxiliary_loss_clip": 0.01058097, "auxiliary_loss_mlp": 0.01045157, "balance_loss_clip": 1.01839447, "balance_loss_mlp": 1.01845479, "epoch": 0.49457387644671574, "flos": 23987006184960.0, "grad_norm": 2.0878148974518727, "language_loss": 0.78442425, "learning_rate": 2.1321259450498893e-06, "loss": 0.80545682, "num_input_tokens_seen": 176840520, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.39648438, "step": 8226, "time_per_iteration": 2.406301259994507 }, { "auxiliary_loss_clip": 0.01059774, "auxiliary_loss_mlp": 0.0104594, "balance_loss_clip": 1.01663876, "balance_loss_mlp": 1.0185287, "epoch": 0.49463399969938376, "flos": 26975466898560.0, "grad_norm": 2.0779689600336533, "language_loss": 0.7191276, "learning_rate": 2.131737331662051e-06, "loss": 0.74018472, "num_input_tokens_seen": 176860265, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.41210938, "step": 8227, "time_per_iteration": 2.409641981124878 }, { "auxiliary_loss_clip": 0.01060873, "auxiliary_loss_mlp": 0.01040328, "balance_loss_clip": 1.01186097, "balance_loss_mlp": 1.01938117, "epoch": 0.49469412295205173, "flos": 29680188059520.0, "grad_norm": 1.6757593977383254, "language_loss": 0.72395384, "learning_rate": 2.131348713278718e-06, "loss": 0.74496585, "num_input_tokens_seen": 176882910, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4140625, "step": 8228, "time_per_iteration": 2.4470648765563965 }, { "auxiliary_loss_clip": 0.01057676, "auxiliary_loss_mlp": 0.0103787, "balance_loss_clip": 1.01232314, "balance_loss_mlp": 1.0190866, "epoch": 0.4947542462047197, "flos": 24130708807680.0, "grad_norm": 2.1803024353828553, "language_loss": 0.84825695, "learning_rate": 2.1309600899146304e-06, "loss": 0.86921239, "num_input_tokens_seen": 176903030, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38671875, "step": 8229, "time_per_iteration": 2.393721580505371 }, { "auxiliary_loss_clip": 0.01058424, "auxiliary_loss_mlp": 0.01039336, "balance_loss_clip": 1.01271617, "balance_loss_mlp": 1.01841962, "epoch": 0.49481436945738766, "flos": 20044090189440.0, "grad_norm": 1.7613105186129552, "language_loss": 0.76071262, "learning_rate": 2.1305714615845227e-06, "loss": 0.78169024, "num_input_tokens_seen": 176919025, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40039062, "step": 8230, "time_per_iteration": 2.372110366821289 }, { "auxiliary_loss_clip": 0.01057751, "auxiliary_loss_mlp": 0.01040236, "balance_loss_clip": 1.01432025, "balance_loss_mlp": 1.01919556, "epoch": 0.4948744927100556, "flos": 15668634960000.0, "grad_norm": 2.3042528525182586, "language_loss": 0.80709779, "learning_rate": 2.1301828283031314e-06, "loss": 0.82807767, "num_input_tokens_seen": 176937945, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.38671875, "step": 8231, "time_per_iteration": 2.343997001647949 }, { "auxiliary_loss_clip": 0.01012371, "auxiliary_loss_mlp": 0.01004054, "balance_loss_clip": 1.0013237, "balance_loss_mlp": 1.0041256, "epoch": 0.4949346159627236, "flos": 68868481478400.0, "grad_norm": 0.7592329598511163, "language_loss": 0.60239762, "learning_rate": 2.1297941900851944e-06, "loss": 0.62256187, "num_input_tokens_seen": 177004575, "router_z_loss_clip": 0.02734375, "router_z_loss_mlp": 0.08251953, "step": 8232, "time_per_iteration": 3.1526379585266113 }, { "auxiliary_loss_clip": 0.01061074, "auxiliary_loss_mlp": 0.01046416, "balance_loss_clip": 1.01737702, "balance_loss_mlp": 1.01851606, "epoch": 0.49499473921539155, "flos": 24789135738240.0, "grad_norm": 1.6230797994646526, "language_loss": 0.70830625, "learning_rate": 2.1294055469454496e-06, "loss": 0.7293812, "num_input_tokens_seen": 177024155, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.42578125, "step": 8233, "time_per_iteration": 2.403683662414551 }, { "auxiliary_loss_clip": 0.01057225, "auxiliary_loss_mlp": 0.01037156, "balance_loss_clip": 1.00985718, "balance_loss_mlp": 1.01788843, "epoch": 0.4950548624680595, "flos": 32706529464960.0, "grad_norm": 1.9456900200172516, "language_loss": 0.68128598, "learning_rate": 2.129016898898633e-06, "loss": 0.7022298, "num_input_tokens_seen": 177046185, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.39453125, "step": 8234, "time_per_iteration": 2.4702811241149902 }, { "auxiliary_loss_clip": 0.01010441, "auxiliary_loss_mlp": 0.01003804, "balance_loss_clip": 1.00137222, "balance_loss_mlp": 1.00242031, "epoch": 0.4951149857207275, "flos": 50079099196800.0, "grad_norm": 0.8018207526563954, "language_loss": 0.58040982, "learning_rate": 2.128628245959482e-06, "loss": 0.60055226, "num_input_tokens_seen": 177099025, "router_z_loss_clip": 0.02429199, "router_z_loss_mlp": 0.08007812, "step": 8235, "time_per_iteration": 2.961571216583252 }, { "auxiliary_loss_clip": 0.01059789, "auxiliary_loss_mlp": 0.01041601, "balance_loss_clip": 1.01511312, "balance_loss_mlp": 1.01846182, "epoch": 0.49517510897339545, "flos": 22235692965120.0, "grad_norm": 1.6477729791355038, "language_loss": 0.78756118, "learning_rate": 2.1282395881427355e-06, "loss": 0.80857503, "num_input_tokens_seen": 177118365, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.4140625, "step": 8236, "time_per_iteration": 2.472034215927124 }, { "auxiliary_loss_clip": 0.01057163, "auxiliary_loss_mlp": 0.010419, "balance_loss_clip": 1.01684213, "balance_loss_mlp": 1.01876819, "epoch": 0.4952352322260634, "flos": 25372953360000.0, "grad_norm": 4.534221281418266, "language_loss": 0.73392206, "learning_rate": 2.1278509254631315e-06, "loss": 0.75491273, "num_input_tokens_seen": 177136415, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3828125, "step": 8237, "time_per_iteration": 2.4135754108428955 }, { "auxiliary_loss_clip": 0.01055241, "auxiliary_loss_mlp": 0.01041299, "balance_loss_clip": 1.01708758, "balance_loss_mlp": 1.01698208, "epoch": 0.4952953554787314, "flos": 24607552423680.0, "grad_norm": 1.7641861101684626, "language_loss": 0.77369076, "learning_rate": 2.127462257935406e-06, "loss": 0.79465616, "num_input_tokens_seen": 177155690, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3828125, "step": 8238, "time_per_iteration": 2.4316391944885254 }, { "auxiliary_loss_clip": 0.0105726, "auxiliary_loss_mlp": 0.01039629, "balance_loss_clip": 1.01472616, "balance_loss_mlp": 1.01778364, "epoch": 0.49535547873139935, "flos": 17310320910720.0, "grad_norm": 2.298793456518699, "language_loss": 0.75660408, "learning_rate": 2.1270735855743008e-06, "loss": 0.77757305, "num_input_tokens_seen": 177173350, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.39453125, "step": 8239, "time_per_iteration": 2.3445820808410645 }, { "auxiliary_loss_clip": 0.01060305, "auxiliary_loss_mlp": 0.01045902, "balance_loss_clip": 1.01810253, "balance_loss_mlp": 1.0190109, "epoch": 0.4954156019840673, "flos": 20739280648320.0, "grad_norm": 2.2227528083099655, "language_loss": 0.81026626, "learning_rate": 2.126684908394552e-06, "loss": 0.83132827, "num_input_tokens_seen": 177191115, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.4140625, "step": 8240, "time_per_iteration": 2.3913705348968506 }, { "auxiliary_loss_clip": 0.01056457, "auxiliary_loss_mlp": 0.01048919, "balance_loss_clip": 1.02470744, "balance_loss_mlp": 1.01866508, "epoch": 0.49547572523673533, "flos": 12819931885440.0, "grad_norm": 2.14093489475366, "language_loss": 0.87083483, "learning_rate": 2.126296226410898e-06, "loss": 0.89188862, "num_input_tokens_seen": 177206155, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37890625, "step": 8241, "time_per_iteration": 2.3323981761932373 }, { "auxiliary_loss_clip": 0.01056568, "auxiliary_loss_mlp": 0.01042327, "balance_loss_clip": 1.01828289, "balance_loss_mlp": 1.01829243, "epoch": 0.4955358484894033, "flos": 15596120332800.0, "grad_norm": 1.8160669957564215, "language_loss": 0.78426272, "learning_rate": 2.1259075396380794e-06, "loss": 0.8052516, "num_input_tokens_seen": 177224815, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3828125, "step": 8242, "time_per_iteration": 2.3722546100616455 }, { "auxiliary_loss_clip": 0.01056333, "auxiliary_loss_mlp": 0.01042565, "balance_loss_clip": 1.01762605, "balance_loss_mlp": 1.018013, "epoch": 0.49559597174207126, "flos": 26463291120000.0, "grad_norm": 1.5853045444858458, "language_loss": 0.68591177, "learning_rate": 2.125518848090833e-06, "loss": 0.70690072, "num_input_tokens_seen": 177244490, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3828125, "step": 8243, "time_per_iteration": 3.659604787826538 }, { "auxiliary_loss_clip": 0.01056146, "auxiliary_loss_mlp": 0.01039028, "balance_loss_clip": 1.01479268, "balance_loss_mlp": 1.01813197, "epoch": 0.4956560949947392, "flos": 23147135585280.0, "grad_norm": 2.120463247138215, "language_loss": 0.6966446, "learning_rate": 2.125130151783901e-06, "loss": 0.71759635, "num_input_tokens_seen": 177264340, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37890625, "step": 8244, "time_per_iteration": 2.382580280303955 }, { "auxiliary_loss_clip": 0.01057607, "auxiliary_loss_mlp": 0.01040231, "balance_loss_clip": 1.01574492, "balance_loss_mlp": 1.01856995, "epoch": 0.4957162182474072, "flos": 20772518129280.0, "grad_norm": 1.764747407804806, "language_loss": 0.77220893, "learning_rate": 2.12474145073202e-06, "loss": 0.79318726, "num_input_tokens_seen": 177283055, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.390625, "step": 8245, "time_per_iteration": 3.8329248428344727 }, { "auxiliary_loss_clip": 0.01056298, "auxiliary_loss_mlp": 0.01041588, "balance_loss_clip": 1.01716232, "balance_loss_mlp": 1.01903617, "epoch": 0.49577634150007516, "flos": 18733206170880.0, "grad_norm": 1.8820548254730949, "language_loss": 0.83288509, "learning_rate": 2.1243527449499306e-06, "loss": 0.85386395, "num_input_tokens_seen": 177301140, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37304688, "step": 8246, "time_per_iteration": 3.8300249576568604 }, { "auxiliary_loss_clip": 0.01059875, "auxiliary_loss_mlp": 0.01048698, "balance_loss_clip": 1.02203107, "balance_loss_mlp": 1.01938438, "epoch": 0.4958364647527431, "flos": 25553070397440.0, "grad_norm": 1.5588775827074368, "language_loss": 0.85856116, "learning_rate": 2.1239640344523733e-06, "loss": 0.8796469, "num_input_tokens_seen": 177323095, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.40429688, "step": 8247, "time_per_iteration": 2.4158501625061035 }, { "auxiliary_loss_clip": 0.01057955, "auxiliary_loss_mlp": 0.01039609, "balance_loss_clip": 1.01450384, "balance_loss_mlp": 1.01824427, "epoch": 0.4958965880054111, "flos": 24424188629760.0, "grad_norm": 1.925415690534848, "language_loss": 0.84879696, "learning_rate": 2.123575319254087e-06, "loss": 0.86977255, "num_input_tokens_seen": 177339845, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3984375, "step": 8248, "time_per_iteration": 2.4266536235809326 }, { "auxiliary_loss_clip": 0.01058265, "auxiliary_loss_mlp": 0.01041548, "balance_loss_clip": 1.01501215, "balance_loss_mlp": 1.01810575, "epoch": 0.49595671125807905, "flos": 25082266446720.0, "grad_norm": 1.9074328057804188, "language_loss": 0.75068861, "learning_rate": 2.123186599369812e-06, "loss": 0.77168673, "num_input_tokens_seen": 177359980, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40039062, "step": 8249, "time_per_iteration": 2.3924858570098877 }, { "auxiliary_loss_clip": 0.01059689, "auxiliary_loss_mlp": 0.0104862, "balance_loss_clip": 1.02339578, "balance_loss_mlp": 1.01899743, "epoch": 0.496016834510747, "flos": 16434943591680.0, "grad_norm": 1.7506248804016296, "language_loss": 0.76848024, "learning_rate": 2.122797874814289e-06, "loss": 0.7895633, "num_input_tokens_seen": 177378580, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.40820312, "step": 8250, "time_per_iteration": 2.346207857131958 }, { "auxiliary_loss_clip": 0.0105748, "auxiliary_loss_mlp": 0.01040635, "balance_loss_clip": 1.01554108, "balance_loss_mlp": 1.01818657, "epoch": 0.496076957763415, "flos": 23436879891840.0, "grad_norm": 1.719138315727723, "language_loss": 0.71238661, "learning_rate": 2.1224091456022585e-06, "loss": 0.7333678, "num_input_tokens_seen": 177398790, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.39453125, "step": 8251, "time_per_iteration": 2.4118151664733887 }, { "auxiliary_loss_clip": 0.01057241, "auxiliary_loss_mlp": 0.01039791, "balance_loss_clip": 1.01574683, "balance_loss_mlp": 1.0191679, "epoch": 0.49613708101608295, "flos": 16908575362560.0, "grad_norm": 1.8160084496475115, "language_loss": 0.80856979, "learning_rate": 2.122020411748461e-06, "loss": 0.82954013, "num_input_tokens_seen": 177416515, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.38085938, "step": 8252, "time_per_iteration": 3.8295669555664062 }, { "auxiliary_loss_clip": 0.01057917, "auxiliary_loss_mlp": 0.01036673, "balance_loss_clip": 1.01193666, "balance_loss_mlp": 1.01865101, "epoch": 0.4961972042687509, "flos": 16617155310720.0, "grad_norm": 1.6782129764589253, "language_loss": 0.82181168, "learning_rate": 2.1216316732676363e-06, "loss": 0.84275758, "num_input_tokens_seen": 177434425, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.39257812, "step": 8253, "time_per_iteration": 2.3403515815734863 }, { "auxiliary_loss_clip": 0.01058111, "auxiliary_loss_mlp": 0.01036343, "balance_loss_clip": 1.01226223, "balance_loss_mlp": 1.01938605, "epoch": 0.49625732752141893, "flos": 28955286167040.0, "grad_norm": 1.4187591809414455, "language_loss": 0.68238151, "learning_rate": 2.1212429301745275e-06, "loss": 0.70332605, "num_input_tokens_seen": 177459675, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.38671875, "step": 8254, "time_per_iteration": 2.507251024246216 }, { "auxiliary_loss_clip": 0.01057738, "auxiliary_loss_mlp": 0.01039341, "balance_loss_clip": 1.01354384, "balance_loss_mlp": 1.01821995, "epoch": 0.4963174507740869, "flos": 23111244840960.0, "grad_norm": 5.980877644029546, "language_loss": 0.75330126, "learning_rate": 2.1208541824838743e-06, "loss": 0.77427208, "num_input_tokens_seen": 177478895, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.39453125, "step": 8255, "time_per_iteration": 2.377192497253418 }, { "auxiliary_loss_clip": 0.01056213, "auxiliary_loss_mlp": 0.01036399, "balance_loss_clip": 1.0124259, "balance_loss_mlp": 1.01862264, "epoch": 0.49637757402675486, "flos": 13917007537920.0, "grad_norm": 1.6599195084239557, "language_loss": 0.82801437, "learning_rate": 2.1204654302104183e-06, "loss": 0.84894049, "num_input_tokens_seen": 177494920, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.375, "step": 8256, "time_per_iteration": 2.3584859371185303 }, { "auxiliary_loss_clip": 0.01055936, "auxiliary_loss_mlp": 0.01036995, "balance_loss_clip": 1.01380849, "balance_loss_mlp": 1.01834297, "epoch": 0.49643769727942283, "flos": 22307928301440.0, "grad_norm": 1.39209644965911, "language_loss": 0.81800514, "learning_rate": 2.120076673368901e-06, "loss": 0.83893442, "num_input_tokens_seen": 177515455, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.375, "step": 8257, "time_per_iteration": 2.3938443660736084 }, { "auxiliary_loss_clip": 0.01059225, "auxiliary_loss_mlp": 0.01042984, "balance_loss_clip": 1.01474309, "balance_loss_mlp": 1.01797485, "epoch": 0.4964978205320908, "flos": 19499235511680.0, "grad_norm": 1.7076872086477162, "language_loss": 0.67265123, "learning_rate": 2.1196879119740647e-06, "loss": 0.69367331, "num_input_tokens_seen": 177534040, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41210938, "step": 8258, "time_per_iteration": 2.3799993991851807 }, { "auxiliary_loss_clip": 0.01055057, "auxiliary_loss_mlp": 0.01036248, "balance_loss_clip": 1.01349139, "balance_loss_mlp": 1.01776302, "epoch": 0.49655794378475876, "flos": 23435518348800.0, "grad_norm": 1.7292266309515187, "language_loss": 0.7824496, "learning_rate": 2.1192991460406502e-06, "loss": 0.80336273, "num_input_tokens_seen": 177554510, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.37304688, "step": 8259, "time_per_iteration": 2.40377140045166 }, { "auxiliary_loss_clip": 0.01055037, "auxiliary_loss_mlp": 0.01043587, "balance_loss_clip": 1.01814818, "balance_loss_mlp": 1.01723886, "epoch": 0.4966180670374267, "flos": 26829983796480.0, "grad_norm": 1.49680573693927, "language_loss": 0.79356432, "learning_rate": 2.1189103755834e-06, "loss": 0.81455064, "num_input_tokens_seen": 177575780, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37890625, "step": 8260, "time_per_iteration": 2.430732488632202 }, { "auxiliary_loss_clip": 0.01056898, "auxiliary_loss_mlp": 0.01037104, "balance_loss_clip": 1.011724, "balance_loss_mlp": 1.01765943, "epoch": 0.4966781902900947, "flos": 22008478636800.0, "grad_norm": 2.976926334183822, "language_loss": 0.772017, "learning_rate": 2.1185216006170573e-06, "loss": 0.79295695, "num_input_tokens_seen": 177588965, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.39257812, "step": 8261, "time_per_iteration": 2.350372314453125 }, { "auxiliary_loss_clip": 0.01055006, "auxiliary_loss_mlp": 0.01039435, "balance_loss_clip": 1.0144366, "balance_loss_mlp": 1.01736712, "epoch": 0.49673831354276266, "flos": 26212160643840.0, "grad_norm": 2.0532473177015107, "language_loss": 0.9041543, "learning_rate": 2.1181328211563627e-06, "loss": 0.92509872, "num_input_tokens_seen": 177608425, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37695312, "step": 8262, "time_per_iteration": 2.4180643558502197 }, { "auxiliary_loss_clip": 0.01055473, "auxiliary_loss_mlp": 0.01037725, "balance_loss_clip": 1.01456237, "balance_loss_mlp": 1.01785517, "epoch": 0.4967984367954306, "flos": 23181245850240.0, "grad_norm": 1.4658211795261253, "language_loss": 0.74655414, "learning_rate": 2.11774403721606e-06, "loss": 0.7674861, "num_input_tokens_seen": 177628240, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.375, "step": 8263, "time_per_iteration": 2.390300750732422 }, { "auxiliary_loss_clip": 0.01059138, "auxiliary_loss_mlp": 0.010407, "balance_loss_clip": 1.01445055, "balance_loss_mlp": 1.01882911, "epoch": 0.4968585600480986, "flos": 19280434821120.0, "grad_norm": 2.6430504878197625, "language_loss": 0.71569145, "learning_rate": 2.1173552488108923e-06, "loss": 0.73668987, "num_input_tokens_seen": 177645920, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40234375, "step": 8264, "time_per_iteration": 2.34977650642395 }, { "auxiliary_loss_clip": 0.01058082, "auxiliary_loss_mlp": 0.01042899, "balance_loss_clip": 1.01586223, "balance_loss_mlp": 1.01724446, "epoch": 0.49691868330076655, "flos": 22527601776000.0, "grad_norm": 1.3203638450665252, "language_loss": 0.65867084, "learning_rate": 2.1169664559556007e-06, "loss": 0.67968059, "num_input_tokens_seen": 177667185, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40820312, "step": 8265, "time_per_iteration": 2.40763521194458 }, { "auxiliary_loss_clip": 0.01011844, "auxiliary_loss_mlp": 0.01010098, "balance_loss_clip": 1.00772548, "balance_loss_mlp": 1.00407755, "epoch": 0.4969788065534345, "flos": 66573500567040.0, "grad_norm": 0.8778604960501811, "language_loss": 0.53602797, "learning_rate": 2.1165776586649304e-06, "loss": 0.55624735, "num_input_tokens_seen": 177733020, "router_z_loss_clip": 0.02368164, "router_z_loss_mlp": 0.07763672, "step": 8266, "time_per_iteration": 3.0847082138061523 }, { "auxiliary_loss_clip": 0.01054434, "auxiliary_loss_mlp": 0.0104209, "balance_loss_clip": 1.01787806, "balance_loss_mlp": 1.01648557, "epoch": 0.49703892980610254, "flos": 24058403648640.0, "grad_norm": 1.5487698452913634, "language_loss": 0.80149132, "learning_rate": 2.1161888569536223e-06, "loss": 0.8224566, "num_input_tokens_seen": 177753370, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37890625, "step": 8267, "time_per_iteration": 2.3897550106048584 }, { "auxiliary_loss_clip": 0.01057648, "auxiliary_loss_mlp": 0.01040043, "balance_loss_clip": 1.01413858, "balance_loss_mlp": 1.01827085, "epoch": 0.4970990530587705, "flos": 29125069441920.0, "grad_norm": 3.2226164881860795, "language_loss": 0.76289195, "learning_rate": 2.1158000508364223e-06, "loss": 0.78386891, "num_input_tokens_seen": 177771530, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.39453125, "step": 8268, "time_per_iteration": 2.443751573562622 }, { "auxiliary_loss_clip": 0.0105687, "auxiliary_loss_mlp": 0.01050046, "balance_loss_clip": 1.02221107, "balance_loss_mlp": 1.01789629, "epoch": 0.49715917631143847, "flos": 46024393294080.0, "grad_norm": 2.359235837795071, "language_loss": 0.68424904, "learning_rate": 2.115411240328073e-06, "loss": 0.70531821, "num_input_tokens_seen": 177796355, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.390625, "step": 8269, "time_per_iteration": 2.5788111686706543 }, { "auxiliary_loss_clip": 0.01055177, "auxiliary_loss_mlp": 0.01042727, "balance_loss_clip": 1.01614332, "balance_loss_mlp": 1.01810014, "epoch": 0.49721929956410643, "flos": 20190306430080.0, "grad_norm": 1.5999172189156035, "language_loss": 0.86620581, "learning_rate": 2.1150224254433167e-06, "loss": 0.88718486, "num_input_tokens_seen": 177814300, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37109375, "step": 8270, "time_per_iteration": 2.3895368576049805 }, { "auxiliary_loss_clip": 0.01057732, "auxiliary_loss_mlp": 0.0103847, "balance_loss_clip": 1.01428199, "balance_loss_mlp": 1.01794052, "epoch": 0.4972794228167744, "flos": 21652468836480.0, "grad_norm": 1.77996847744912, "language_loss": 0.71331424, "learning_rate": 2.114633606196899e-06, "loss": 0.73427618, "num_input_tokens_seen": 177833615, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3984375, "step": 8271, "time_per_iteration": 2.3924336433410645 }, { "auxiliary_loss_clip": 0.01058817, "auxiliary_loss_mlp": 0.01049094, "balance_loss_clip": 1.02270114, "balance_loss_mlp": 1.01929998, "epoch": 0.49733954606944236, "flos": 24278600793600.0, "grad_norm": 1.3922667737380563, "language_loss": 0.79138064, "learning_rate": 2.1142447826035635e-06, "loss": 0.81245983, "num_input_tokens_seen": 177855315, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.39453125, "step": 8272, "time_per_iteration": 2.4235055446624756 }, { "auxiliary_loss_clip": 0.01060565, "auxiliary_loss_mlp": 0.01049389, "balance_loss_clip": 1.02389014, "balance_loss_mlp": 1.02015841, "epoch": 0.4973996693221103, "flos": 37851051323520.0, "grad_norm": 2.6155117073783316, "language_loss": 0.67831993, "learning_rate": 2.1138559546780544e-06, "loss": 0.69941944, "num_input_tokens_seen": 177875590, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40429688, "step": 8273, "time_per_iteration": 2.5213491916656494 }, { "auxiliary_loss_clip": 0.01056263, "auxiliary_loss_mlp": 0.01047398, "balance_loss_clip": 1.02191091, "balance_loss_mlp": 1.01767182, "epoch": 0.4974597925747783, "flos": 21360350557440.0, "grad_norm": 2.405273395234122, "language_loss": 0.78847778, "learning_rate": 2.1134671224351163e-06, "loss": 0.80951434, "num_input_tokens_seen": 177894175, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38671875, "step": 8274, "time_per_iteration": 2.3747353553771973 }, { "auxiliary_loss_clip": 0.01060881, "auxiliary_loss_mlp": 0.01043925, "balance_loss_clip": 1.01669741, "balance_loss_mlp": 1.01912725, "epoch": 0.49751991582744626, "flos": 30736799579520.0, "grad_norm": 1.676841084617851, "language_loss": 0.77064574, "learning_rate": 2.113078285889493e-06, "loss": 0.79169381, "num_input_tokens_seen": 177913920, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.41796875, "step": 8275, "time_per_iteration": 2.4686007499694824 }, { "auxiliary_loss_clip": 0.01061168, "auxiliary_loss_mlp": 0.01050159, "balance_loss_clip": 1.02059484, "balance_loss_mlp": 1.01823354, "epoch": 0.4975800390801142, "flos": 14099673104640.0, "grad_norm": 2.0105341458290553, "language_loss": 0.85548866, "learning_rate": 2.1126894450559303e-06, "loss": 0.87660193, "num_input_tokens_seen": 177930425, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.4296875, "step": 8276, "time_per_iteration": 2.345021963119507 }, { "auxiliary_loss_clip": 0.01056519, "auxiliary_loss_mlp": 0.010399, "balance_loss_clip": 1.01662993, "balance_loss_mlp": 1.01854205, "epoch": 0.4976401623327822, "flos": 24206121077760.0, "grad_norm": 1.3618038905344974, "language_loss": 0.70892239, "learning_rate": 2.112300599949172e-06, "loss": 0.72988653, "num_input_tokens_seen": 177949885, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37890625, "step": 8277, "time_per_iteration": 2.406276226043701 }, { "auxiliary_loss_clip": 0.01057578, "auxiliary_loss_mlp": 0.01035349, "balance_loss_clip": 1.01089907, "balance_loss_mlp": 1.01853156, "epoch": 0.49770028558545015, "flos": 21135859315200.0, "grad_norm": 1.8975617125947426, "language_loss": 0.832546, "learning_rate": 2.111911750583964e-06, "loss": 0.85347533, "num_input_tokens_seen": 177965720, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.390625, "step": 8278, "time_per_iteration": 2.3692667484283447 }, { "auxiliary_loss_clip": 0.01059245, "auxiliary_loss_mlp": 0.01039304, "balance_loss_clip": 1.01287568, "balance_loss_mlp": 1.01870084, "epoch": 0.4977604088381181, "flos": 16762987526400.0, "grad_norm": 2.0557425726269143, "language_loss": 0.6866653, "learning_rate": 2.111522896975052e-06, "loss": 0.70765078, "num_input_tokens_seen": 177983190, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40625, "step": 8279, "time_per_iteration": 2.3636441230773926 }, { "auxiliary_loss_clip": 0.01059165, "auxiliary_loss_mlp": 0.01045068, "balance_loss_clip": 1.01567125, "balance_loss_mlp": 1.01899564, "epoch": 0.49782053209078614, "flos": 15702675402240.0, "grad_norm": 2.0791328991282074, "language_loss": 0.71145296, "learning_rate": 2.1111340391371794e-06, "loss": 0.73249531, "num_input_tokens_seen": 178000155, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.40234375, "step": 8280, "time_per_iteration": 2.3368418216705322 }, { "auxiliary_loss_clip": 0.01057111, "auxiliary_loss_mlp": 0.01042968, "balance_loss_clip": 1.01709986, "balance_loss_mlp": 1.01731038, "epoch": 0.4978806553434541, "flos": 24752546766720.0, "grad_norm": 1.636481987756457, "language_loss": 0.65314722, "learning_rate": 2.1107451770850936e-06, "loss": 0.67414796, "num_input_tokens_seen": 178021060, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3984375, "step": 8281, "time_per_iteration": 2.422330856323242 }, { "auxiliary_loss_clip": 0.01061754, "auxiliary_loss_mlp": 0.01045442, "balance_loss_clip": 1.0150671, "balance_loss_mlp": 1.01937366, "epoch": 0.49794077859612207, "flos": 13114877984640.0, "grad_norm": 2.0180811690045632, "language_loss": 0.74816364, "learning_rate": 2.1103563108335387e-06, "loss": 0.76923561, "num_input_tokens_seen": 178038180, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.42382812, "step": 8282, "time_per_iteration": 2.340963125228882 }, { "auxiliary_loss_clip": 0.01055711, "auxiliary_loss_mlp": 0.01040721, "balance_loss_clip": 1.01587796, "balance_loss_mlp": 1.01791096, "epoch": 0.49800090184879003, "flos": 27523952357760.0, "grad_norm": 1.8078166414828842, "language_loss": 0.74251735, "learning_rate": 2.109967440397263e-06, "loss": 0.76348174, "num_input_tokens_seen": 178057565, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37890625, "step": 8283, "time_per_iteration": 3.7654354572296143 }, { "auxiliary_loss_clip": 0.01057328, "auxiliary_loss_mlp": 0.01039746, "balance_loss_clip": 1.01332927, "balance_loss_mlp": 1.01874554, "epoch": 0.498061025101458, "flos": 19791458524800.0, "grad_norm": 1.5508979820456046, "language_loss": 0.8000958, "learning_rate": 2.1095785657910095e-06, "loss": 0.8210665, "num_input_tokens_seen": 178076965, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.38671875, "step": 8284, "time_per_iteration": 3.8061962127685547 }, { "auxiliary_loss_clip": 0.01062162, "auxiliary_loss_mlp": 0.010444, "balance_loss_clip": 1.01491976, "balance_loss_mlp": 1.01960957, "epoch": 0.49812114835412596, "flos": 29892739616640.0, "grad_norm": 1.6808502334980133, "language_loss": 0.74209595, "learning_rate": 2.109189687029526e-06, "loss": 0.76316166, "num_input_tokens_seen": 178095105, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.42578125, "step": 8285, "time_per_iteration": 2.429410934448242 }, { "auxiliary_loss_clip": 0.01059392, "auxiliary_loss_mlp": 0.01039622, "balance_loss_clip": 1.01347899, "balance_loss_mlp": 1.01956034, "epoch": 0.49818127160679393, "flos": 23145983510400.0, "grad_norm": 1.8468629873369473, "language_loss": 0.75310767, "learning_rate": 2.1088008041275598e-06, "loss": 0.7740978, "num_input_tokens_seen": 178114505, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3984375, "step": 8286, "time_per_iteration": 3.7117197513580322 }, { "auxiliary_loss_clip": 0.01060767, "auxiliary_loss_mlp": 0.0104992, "balance_loss_clip": 1.02030826, "balance_loss_mlp": 1.01998591, "epoch": 0.4982413948594619, "flos": 21651735697920.0, "grad_norm": 2.2733321888744515, "language_loss": 0.8598187, "learning_rate": 2.1084119170998545e-06, "loss": 0.8809256, "num_input_tokens_seen": 178131595, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.40820312, "step": 8287, "time_per_iteration": 2.407097101211548 }, { "auxiliary_loss_clip": 0.0105733, "auxiliary_loss_mlp": 0.01038679, "balance_loss_clip": 1.01054549, "balance_loss_mlp": 1.01719713, "epoch": 0.49830151811212986, "flos": 32485669004160.0, "grad_norm": 1.7787187692651987, "language_loss": 0.73426986, "learning_rate": 2.108023025961159e-06, "loss": 0.75522995, "num_input_tokens_seen": 178152055, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40039062, "step": 8288, "time_per_iteration": 2.4545786380767822 }, { "auxiliary_loss_clip": 0.01061814, "auxiliary_loss_mlp": 0.01043221, "balance_loss_clip": 1.01486135, "balance_loss_mlp": 1.0194757, "epoch": 0.4983616413647978, "flos": 18141603315840.0, "grad_norm": 2.6615338640999258, "language_loss": 0.82219923, "learning_rate": 2.10763413072622e-06, "loss": 0.84324962, "num_input_tokens_seen": 178168150, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.42382812, "step": 8289, "time_per_iteration": 2.338757276535034 }, { "auxiliary_loss_clip": 0.01057142, "auxiliary_loss_mlp": 0.01042593, "balance_loss_clip": 1.01715326, "balance_loss_mlp": 1.01791239, "epoch": 0.4984217646174658, "flos": 19717826734080.0, "grad_norm": 2.1296818544376226, "language_loss": 0.74670351, "learning_rate": 2.107245231409784e-06, "loss": 0.76770091, "num_input_tokens_seen": 178186150, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.39257812, "step": 8290, "time_per_iteration": 2.38388991355896 }, { "auxiliary_loss_clip": 0.01060101, "auxiliary_loss_mlp": 0.01047502, "balance_loss_clip": 1.01920128, "balance_loss_mlp": 1.01945734, "epoch": 0.49848188787013376, "flos": 24935386890240.0, "grad_norm": 1.5512956769544037, "language_loss": 0.85108519, "learning_rate": 2.106856328026598e-06, "loss": 0.87216115, "num_input_tokens_seen": 178207665, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.40625, "step": 8291, "time_per_iteration": 2.402104377746582 }, { "auxiliary_loss_clip": 0.0106208, "auxiliary_loss_mlp": 0.01045066, "balance_loss_clip": 1.01464415, "balance_loss_mlp": 1.02038193, "epoch": 0.4985420111228017, "flos": 22381350624000.0, "grad_norm": 1.6096658856160297, "language_loss": 0.68583584, "learning_rate": 2.106467420591409e-06, "loss": 0.70690733, "num_input_tokens_seen": 178226325, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.41796875, "step": 8292, "time_per_iteration": 3.84275221824646 }, { "auxiliary_loss_clip": 0.01057617, "auxiliary_loss_mlp": 0.0104121, "balance_loss_clip": 1.01366138, "balance_loss_mlp": 1.01820469, "epoch": 0.4986021343754697, "flos": 16215549408000.0, "grad_norm": 1.628685254795149, "language_loss": 0.68466246, "learning_rate": 2.106078509118965e-06, "loss": 0.70565069, "num_input_tokens_seen": 178244960, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.39453125, "step": 8293, "time_per_iteration": 2.3629579544067383 }, { "auxiliary_loss_clip": 0.01059958, "auxiliary_loss_mlp": 0.01040977, "balance_loss_clip": 1.01522756, "balance_loss_mlp": 1.0196631, "epoch": 0.4986622576281377, "flos": 23402490336000.0, "grad_norm": 1.9401691016118043, "language_loss": 0.83782065, "learning_rate": 2.1056895936240133e-06, "loss": 0.85882998, "num_input_tokens_seen": 178265400, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40234375, "step": 8294, "time_per_iteration": 2.3775217533111572 }, { "auxiliary_loss_clip": 0.01059656, "auxiliary_loss_mlp": 0.01041116, "balance_loss_clip": 1.01475835, "balance_loss_mlp": 1.01902437, "epoch": 0.49872238088080567, "flos": 19973530598400.0, "grad_norm": 2.0177060711633743, "language_loss": 0.7408365, "learning_rate": 2.1053006741213016e-06, "loss": 0.76184422, "num_input_tokens_seen": 178284535, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40625, "step": 8295, "time_per_iteration": 2.4274377822875977 }, { "auxiliary_loss_clip": 0.01059483, "auxiliary_loss_mlp": 0.01042315, "balance_loss_clip": 1.0163269, "balance_loss_mlp": 1.01978278, "epoch": 0.49878250413347364, "flos": 22891920480000.0, "grad_norm": 2.0068468908620396, "language_loss": 0.69029319, "learning_rate": 2.1049117506255775e-06, "loss": 0.7113111, "num_input_tokens_seen": 178302425, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39648438, "step": 8296, "time_per_iteration": 2.369084596633911 }, { "auxiliary_loss_clip": 0.01060554, "auxiliary_loss_mlp": 0.01047776, "balance_loss_clip": 1.01928484, "balance_loss_mlp": 1.01910615, "epoch": 0.4988426273861416, "flos": 32597076752640.0, "grad_norm": 2.5595771277196526, "language_loss": 0.6590457, "learning_rate": 2.1045228231515895e-06, "loss": 0.68012893, "num_input_tokens_seen": 178323065, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4140625, "step": 8297, "time_per_iteration": 2.510134220123291 }, { "auxiliary_loss_clip": 0.01058063, "auxiliary_loss_mlp": 0.01042746, "balance_loss_clip": 1.01878452, "balance_loss_mlp": 1.01926196, "epoch": 0.49890275063880957, "flos": 20922539708160.0, "grad_norm": 1.7035805446901031, "language_loss": 0.708915, "learning_rate": 2.1041338917140857e-06, "loss": 0.72992313, "num_input_tokens_seen": 178343985, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.38867188, "step": 8298, "time_per_iteration": 2.379854440689087 }, { "auxiliary_loss_clip": 0.01055244, "auxiliary_loss_mlp": 0.01040722, "balance_loss_clip": 1.01554489, "balance_loss_mlp": 1.01769328, "epoch": 0.49896287389147753, "flos": 18623474167680.0, "grad_norm": 1.8161767758811025, "language_loss": 0.85744816, "learning_rate": 2.103744956327814e-06, "loss": 0.87840784, "num_input_tokens_seen": 178362345, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.375, "step": 8299, "time_per_iteration": 2.386828660964966 }, { "auxiliary_loss_clip": 0.0106013, "auxiliary_loss_mlp": 0.01048981, "balance_loss_clip": 1.02147913, "balance_loss_mlp": 1.01874757, "epoch": 0.4990229971441455, "flos": 24825410507520.0, "grad_norm": 2.046608705948758, "language_loss": 0.70276558, "learning_rate": 2.1033560170075234e-06, "loss": 0.72385669, "num_input_tokens_seen": 178383190, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.4140625, "step": 8300, "time_per_iteration": 2.4137816429138184 }, { "auxiliary_loss_clip": 0.01013102, "auxiliary_loss_mlp": 0.01003211, "balance_loss_clip": 1.00083876, "balance_loss_mlp": 1.0052166, "epoch": 0.49908312039681346, "flos": 71381006271360.0, "grad_norm": 0.7630893134157045, "language_loss": 0.51182795, "learning_rate": 2.1029670737679623e-06, "loss": 0.53199112, "num_input_tokens_seen": 178444250, "router_z_loss_clip": 0.02368164, "router_z_loss_mlp": 0.07910156, "step": 8301, "time_per_iteration": 3.1143600940704346 }, { "auxiliary_loss_clip": 0.01056158, "auxiliary_loss_mlp": 0.01042166, "balance_loss_clip": 1.01733494, "balance_loss_mlp": 1.01793706, "epoch": 0.4991432436494814, "flos": 19827628560000.0, "grad_norm": 1.7354825675275027, "language_loss": 0.85556626, "learning_rate": 2.102578126623879e-06, "loss": 0.87654948, "num_input_tokens_seen": 178463250, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3828125, "step": 8302, "time_per_iteration": 2.367746591567993 }, { "auxiliary_loss_clip": 0.01057333, "auxiliary_loss_mlp": 0.01039426, "balance_loss_clip": 1.01447558, "balance_loss_mlp": 1.0193429, "epoch": 0.4992033669021494, "flos": 15121022284800.0, "grad_norm": 1.8064005309037934, "language_loss": 0.70976365, "learning_rate": 2.102189175590024e-06, "loss": 0.73073119, "num_input_tokens_seen": 178481340, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38085938, "step": 8303, "time_per_iteration": 2.372868537902832 }, { "auxiliary_loss_clip": 0.01057974, "auxiliary_loss_mlp": 0.0104467, "balance_loss_clip": 1.017717, "balance_loss_mlp": 1.01764441, "epoch": 0.49926349015481736, "flos": 31206730746240.0, "grad_norm": 1.6156969889188246, "language_loss": 0.73176634, "learning_rate": 2.101800220681144e-06, "loss": 0.75279284, "num_input_tokens_seen": 178501545, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40234375, "step": 8304, "time_per_iteration": 2.466909646987915 }, { "auxiliary_loss_clip": 0.01058342, "auxiliary_loss_mlp": 0.01047008, "balance_loss_clip": 1.02276146, "balance_loss_mlp": 1.01928937, "epoch": 0.4993236134074853, "flos": 24899042298240.0, "grad_norm": 1.9084815674525235, "language_loss": 0.8269977, "learning_rate": 2.10141126191199e-06, "loss": 0.84805119, "num_input_tokens_seen": 178519700, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.390625, "step": 8305, "time_per_iteration": 2.426361083984375 }, { "auxiliary_loss_clip": 0.01012891, "auxiliary_loss_mlp": 0.01006323, "balance_loss_clip": 1.00386715, "balance_loss_mlp": 1.00482821, "epoch": 0.4993837366601533, "flos": 70417508947200.0, "grad_norm": 0.7143745862809179, "language_loss": 0.56988788, "learning_rate": 2.1010222992973107e-06, "loss": 0.59008002, "num_input_tokens_seen": 178576740, "router_z_loss_clip": 0.02453613, "router_z_loss_mlp": 0.08056641, "step": 8306, "time_per_iteration": 3.1281819343566895 }, { "auxiliary_loss_clip": 0.01058708, "auxiliary_loss_mlp": 0.01046001, "balance_loss_clip": 1.01868999, "balance_loss_mlp": 1.01986003, "epoch": 0.4994438599128213, "flos": 15960299391360.0, "grad_norm": 1.8696654965247725, "language_loss": 0.84119439, "learning_rate": 2.1006333328518556e-06, "loss": 0.86224151, "num_input_tokens_seen": 178594745, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.38671875, "step": 8307, "time_per_iteration": 2.4603028297424316 }, { "auxiliary_loss_clip": 0.01057134, "auxiliary_loss_mlp": 0.01043364, "balance_loss_clip": 1.01811576, "balance_loss_mlp": 1.01838112, "epoch": 0.4995039831654893, "flos": 27927059448960.0, "grad_norm": 1.6743250367160234, "language_loss": 0.6223284, "learning_rate": 2.1002443625903748e-06, "loss": 0.64333338, "num_input_tokens_seen": 178614110, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38867188, "step": 8308, "time_per_iteration": 2.4142115116119385 }, { "auxiliary_loss_clip": 0.01055367, "auxiliary_loss_mlp": 0.01041608, "balance_loss_clip": 1.01707518, "balance_loss_mlp": 1.0168364, "epoch": 0.49956410641815724, "flos": 24203712193920.0, "grad_norm": 1.503919186996274, "language_loss": 0.75408769, "learning_rate": 2.0998553885276168e-06, "loss": 0.77505749, "num_input_tokens_seen": 178634170, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.38671875, "step": 8309, "time_per_iteration": 2.410120725631714 }, { "auxiliary_loss_clip": 0.01056565, "auxiliary_loss_mlp": 0.01041111, "balance_loss_clip": 1.01678014, "balance_loss_mlp": 1.01794457, "epoch": 0.4996242296708252, "flos": 16179204816000.0, "grad_norm": 2.2571429493541, "language_loss": 0.80262285, "learning_rate": 2.0994664106783335e-06, "loss": 0.82359964, "num_input_tokens_seen": 178651775, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.38671875, "step": 8310, "time_per_iteration": 2.342151403427124 }, { "auxiliary_loss_clip": 0.01058953, "auxiliary_loss_mlp": 0.01041744, "balance_loss_clip": 1.015136, "balance_loss_mlp": 1.01867652, "epoch": 0.49968435292349317, "flos": 16872579884160.0, "grad_norm": 1.6727870335259727, "language_loss": 0.71905828, "learning_rate": 2.0990774290572735e-06, "loss": 0.74006522, "num_input_tokens_seen": 178669720, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40234375, "step": 8311, "time_per_iteration": 2.372880220413208 }, { "auxiliary_loss_clip": 0.01058838, "auxiliary_loss_mlp": 0.01045261, "balance_loss_clip": 1.01945174, "balance_loss_mlp": 1.01904786, "epoch": 0.49974447617616113, "flos": 14938636008960.0, "grad_norm": 1.8744749645010346, "language_loss": 0.78390902, "learning_rate": 2.098688443679187e-06, "loss": 0.80495006, "num_input_tokens_seen": 178686765, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3984375, "step": 8312, "time_per_iteration": 2.3408703804016113 }, { "auxiliary_loss_clip": 0.0105807, "auxiliary_loss_mlp": 0.01043646, "balance_loss_clip": 1.0182786, "balance_loss_mlp": 1.01885509, "epoch": 0.4998045994288291, "flos": 26650320606720.0, "grad_norm": 1.8132530784581995, "language_loss": 0.8597635, "learning_rate": 2.0982994545588256e-06, "loss": 0.88078064, "num_input_tokens_seen": 178705845, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.39257812, "step": 8313, "time_per_iteration": 2.4247639179229736 }, { "auxiliary_loss_clip": 0.01058558, "auxiliary_loss_mlp": 0.01037616, "balance_loss_clip": 1.01185489, "balance_loss_mlp": 1.01843429, "epoch": 0.49986472268149706, "flos": 20952879546240.0, "grad_norm": 2.215714604499613, "language_loss": 0.81617808, "learning_rate": 2.097910461710939e-06, "loss": 0.83713984, "num_input_tokens_seen": 178723410, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40234375, "step": 8314, "time_per_iteration": 2.3642492294311523 }, { "auxiliary_loss_clip": 0.01061656, "auxiliary_loss_mlp": 0.01050949, "balance_loss_clip": 1.02100408, "balance_loss_mlp": 1.0206244, "epoch": 0.49992484593416503, "flos": 22782781969920.0, "grad_norm": 1.7104089272185572, "language_loss": 0.80318224, "learning_rate": 2.0975214651502773e-06, "loss": 0.82430828, "num_input_tokens_seen": 178743560, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.41015625, "step": 8315, "time_per_iteration": 2.4030568599700928 }, { "auxiliary_loss_clip": 0.01058223, "auxiliary_loss_mlp": 0.01040048, "balance_loss_clip": 1.01566935, "balance_loss_mlp": 1.0191282, "epoch": 0.499984969186833, "flos": 46785325576320.0, "grad_norm": 1.6494547227148133, "language_loss": 0.75483966, "learning_rate": 2.0971324648915926e-06, "loss": 0.7758224, "num_input_tokens_seen": 178767225, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.390625, "step": 8316, "time_per_iteration": 2.592008590698242 }, { "auxiliary_loss_clip": 0.01054861, "auxiliary_loss_mlp": 0.01038754, "balance_loss_clip": 1.01572323, "balance_loss_mlp": 1.01730025, "epoch": 0.500045092439501, "flos": 25555793483520.0, "grad_norm": 5.8520332049472765, "language_loss": 0.82199979, "learning_rate": 2.0967434609496343e-06, "loss": 0.84293598, "num_input_tokens_seen": 178786810, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.375, "step": 8317, "time_per_iteration": 2.4913976192474365 }, { "auxiliary_loss_clip": 0.01056348, "auxiliary_loss_mlp": 0.01047007, "balance_loss_clip": 1.01777697, "balance_loss_mlp": 1.01683021, "epoch": 0.5001052156921689, "flos": 20703704106240.0, "grad_norm": 1.564512702709844, "language_loss": 0.84882379, "learning_rate": 2.0963544533391548e-06, "loss": 0.86985737, "num_input_tokens_seen": 178805660, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.39453125, "step": 8318, "time_per_iteration": 2.3603978157043457 }, { "auxiliary_loss_clip": 0.01057952, "auxiliary_loss_mlp": 0.01037506, "balance_loss_clip": 1.01306772, "balance_loss_mlp": 1.01829314, "epoch": 0.500165338944837, "flos": 21250059972480.0, "grad_norm": 1.825312822917236, "language_loss": 0.83369553, "learning_rate": 2.0959654420749045e-06, "loss": 0.85465014, "num_input_tokens_seen": 178824780, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.39648438, "step": 8319, "time_per_iteration": 2.3962485790252686 }, { "auxiliary_loss_clip": 0.01059142, "auxiliary_loss_mlp": 0.0104072, "balance_loss_clip": 1.01530492, "balance_loss_mlp": 1.01930583, "epoch": 0.5002254621975049, "flos": 27853183278720.0, "grad_norm": 1.5535049232676816, "language_loss": 0.72548461, "learning_rate": 2.095576427171635e-06, "loss": 0.74648321, "num_input_tokens_seen": 178845640, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3984375, "step": 8320, "time_per_iteration": 2.4695091247558594 }, { "auxiliary_loss_clip": 0.0106417, "auxiliary_loss_mlp": 0.01054026, "balance_loss_clip": 1.02174401, "balance_loss_mlp": 1.01908886, "epoch": 0.5002855854501729, "flos": 15551257368960.0, "grad_norm": 3.0040529036486987, "language_loss": 0.79514688, "learning_rate": 2.0951874086440978e-06, "loss": 0.81632888, "num_input_tokens_seen": 178862290, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.45117188, "step": 8321, "time_per_iteration": 2.345817804336548 }, { "auxiliary_loss_clip": 0.01056974, "auxiliary_loss_mlp": 0.01044607, "balance_loss_clip": 1.01901293, "balance_loss_mlp": 1.0175606, "epoch": 0.5003457087028408, "flos": 16106480720640.0, "grad_norm": 1.7143199501543434, "language_loss": 0.83997178, "learning_rate": 2.0947983865070455e-06, "loss": 0.8609876, "num_input_tokens_seen": 178879805, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.39453125, "step": 8322, "time_per_iteration": 2.3666889667510986 }, { "auxiliary_loss_clip": 0.01059361, "auxiliary_loss_mlp": 0.01044505, "balance_loss_clip": 1.01639509, "balance_loss_mlp": 1.01817095, "epoch": 0.5004058319555088, "flos": 22709429470080.0, "grad_norm": 3.0828031672055465, "language_loss": 0.75366122, "learning_rate": 2.094409360775228e-06, "loss": 0.77469981, "num_input_tokens_seen": 178896985, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41015625, "step": 8323, "time_per_iteration": 5.1190526485443115 }, { "auxiliary_loss_clip": 0.01059062, "auxiliary_loss_mlp": 0.01044876, "balance_loss_clip": 1.01694536, "balance_loss_mlp": 1.01861858, "epoch": 0.5004659552081767, "flos": 30116637365760.0, "grad_norm": 1.516911688379006, "language_loss": 0.70131838, "learning_rate": 2.0940203314633977e-06, "loss": 0.72235775, "num_input_tokens_seen": 178920605, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40429688, "step": 8324, "time_per_iteration": 2.4382009506225586 }, { "auxiliary_loss_clip": 0.01058243, "auxiliary_loss_mlp": 0.01047747, "balance_loss_clip": 1.01968527, "balance_loss_mlp": 1.01796758, "epoch": 0.5005260784608447, "flos": 18623718547200.0, "grad_norm": 1.965891776856318, "language_loss": 0.73139483, "learning_rate": 2.0936312985863077e-06, "loss": 0.7524547, "num_input_tokens_seen": 178937760, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40234375, "step": 8325, "time_per_iteration": 3.7004287242889404 }, { "auxiliary_loss_clip": 0.01059873, "auxiliary_loss_mlp": 0.01049675, "balance_loss_clip": 1.02031422, "balance_loss_mlp": 1.018713, "epoch": 0.5005862017135126, "flos": 24858927279360.0, "grad_norm": 1.5915228718943166, "language_loss": 0.74018496, "learning_rate": 2.093242262158709e-06, "loss": 0.76128042, "num_input_tokens_seen": 178957985, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.41015625, "step": 8326, "time_per_iteration": 2.4063503742218018 }, { "auxiliary_loss_clip": 0.01056159, "auxiliary_loss_mlp": 0.01040757, "balance_loss_clip": 1.01566303, "balance_loss_mlp": 1.01721287, "epoch": 0.5006463249661807, "flos": 18733380727680.0, "grad_norm": 1.5118904681271286, "language_loss": 0.79180264, "learning_rate": 2.0928532221953544e-06, "loss": 0.81277174, "num_input_tokens_seen": 178977070, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38867188, "step": 8327, "time_per_iteration": 2.3408899307250977 }, { "auxiliary_loss_clip": 0.01061449, "auxiliary_loss_mlp": 0.01050027, "balance_loss_clip": 1.02238226, "balance_loss_mlp": 1.02046132, "epoch": 0.5007064482188487, "flos": 13041316016640.0, "grad_norm": 2.039895958901919, "language_loss": 0.89281046, "learning_rate": 2.092464178710997e-06, "loss": 0.91392523, "num_input_tokens_seen": 178994175, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41015625, "step": 8328, "time_per_iteration": 2.363429307937622 }, { "auxiliary_loss_clip": 0.0106232, "auxiliary_loss_mlp": 0.01044152, "balance_loss_clip": 1.01659083, "balance_loss_mlp": 1.01952207, "epoch": 0.5007665714715166, "flos": 21287591550720.0, "grad_norm": 2.0488432667312155, "language_loss": 0.75689113, "learning_rate": 2.092075131720388e-06, "loss": 0.77795577, "num_input_tokens_seen": 179013710, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.42773438, "step": 8329, "time_per_iteration": 2.3692123889923096 }, { "auxiliary_loss_clip": 0.01057404, "auxiliary_loss_mlp": 0.01044558, "balance_loss_clip": 1.01828396, "balance_loss_mlp": 1.01790237, "epoch": 0.5008266947241846, "flos": 29753226357120.0, "grad_norm": 2.0891404674055742, "language_loss": 0.80400091, "learning_rate": 2.091686081238281e-06, "loss": 0.82502049, "num_input_tokens_seen": 179035255, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.39453125, "step": 8330, "time_per_iteration": 2.446190595626831 }, { "auxiliary_loss_clip": 0.01009756, "auxiliary_loss_mlp": 0.01002166, "balance_loss_clip": 0.99990082, "balance_loss_mlp": 1.00204563, "epoch": 0.5008868179768525, "flos": 63555050995200.0, "grad_norm": 0.7457314419483813, "language_loss": 0.56204665, "learning_rate": 2.0912970272794282e-06, "loss": 0.5821659, "num_input_tokens_seen": 179090915, "router_z_loss_clip": 0.02270508, "router_z_loss_mlp": 0.07714844, "step": 8331, "time_per_iteration": 2.8145554065704346 }, { "auxiliary_loss_clip": 0.01056974, "auxiliary_loss_mlp": 0.01045296, "balance_loss_clip": 1.01742458, "balance_loss_mlp": 1.01809692, "epoch": 0.5009469412295205, "flos": 27374559183360.0, "grad_norm": 1.89039169134968, "language_loss": 0.66315216, "learning_rate": 2.0909079698585833e-06, "loss": 0.6841749, "num_input_tokens_seen": 179109160, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.38867188, "step": 8332, "time_per_iteration": 3.8353333473205566 }, { "auxiliary_loss_clip": 0.01056755, "auxiliary_loss_mlp": 0.01038275, "balance_loss_clip": 1.01163137, "balance_loss_mlp": 1.01810098, "epoch": 0.5010070644821885, "flos": 27377666294400.0, "grad_norm": 1.5861220654784587, "language_loss": 0.75925732, "learning_rate": 2.0905189089904993e-06, "loss": 0.78020763, "num_input_tokens_seen": 179130610, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.38671875, "step": 8333, "time_per_iteration": 2.407594919204712 }, { "auxiliary_loss_clip": 0.01059019, "auxiliary_loss_mlp": 0.010412, "balance_loss_clip": 1.01381743, "balance_loss_mlp": 1.01839817, "epoch": 0.5010671877348565, "flos": 20661843519360.0, "grad_norm": 1.836211192379402, "language_loss": 0.81145251, "learning_rate": 2.090129844689929e-06, "loss": 0.83245468, "num_input_tokens_seen": 179147860, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40625, "step": 8334, "time_per_iteration": 2.37439227104187 }, { "auxiliary_loss_clip": 0.01009249, "auxiliary_loss_mlp": 0.01003346, "balance_loss_clip": 1.00092566, "balance_loss_mlp": 1.00144279, "epoch": 0.5011273109875244, "flos": 59125374691200.0, "grad_norm": 0.8938750769283911, "language_loss": 0.62799299, "learning_rate": 2.089740776971626e-06, "loss": 0.64811891, "num_input_tokens_seen": 179210490, "router_z_loss_clip": 0.02416992, "router_z_loss_mlp": 0.078125, "step": 8335, "time_per_iteration": 2.988260269165039 }, { "auxiliary_loss_clip": 0.01055495, "auxiliary_loss_mlp": 0.01034897, "balance_loss_clip": 1.01039982, "balance_loss_mlp": 1.01721871, "epoch": 0.5011874342401924, "flos": 25335212313600.0, "grad_norm": 1.660277987684863, "language_loss": 0.80427837, "learning_rate": 2.0893517058503435e-06, "loss": 0.82518232, "num_input_tokens_seen": 179231360, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.3828125, "step": 8336, "time_per_iteration": 2.4334375858306885 }, { "auxiliary_loss_clip": 0.01058454, "auxiliary_loss_mlp": 0.0104253, "balance_loss_clip": 1.01427758, "balance_loss_mlp": 1.01811206, "epoch": 0.5012475574928603, "flos": 20228920260480.0, "grad_norm": 1.5756370082941724, "language_loss": 0.81785685, "learning_rate": 2.088962631340836e-06, "loss": 0.83886671, "num_input_tokens_seen": 179250625, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.40234375, "step": 8337, "time_per_iteration": 2.3608627319335938 }, { "auxiliary_loss_clip": 0.01059565, "auxiliary_loss_mlp": 0.0104473, "balance_loss_clip": 1.01775336, "balance_loss_mlp": 1.01765037, "epoch": 0.5013076807455283, "flos": 22709045445120.0, "grad_norm": 2.004025651779985, "language_loss": 0.81050605, "learning_rate": 2.0885735534578555e-06, "loss": 0.83154899, "num_input_tokens_seen": 179267360, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.41796875, "step": 8338, "time_per_iteration": 2.367231607437134 }, { "auxiliary_loss_clip": 0.01057682, "auxiliary_loss_mlp": 0.01040213, "balance_loss_clip": 1.0142374, "balance_loss_mlp": 1.01852334, "epoch": 0.5013678039981962, "flos": 24243966858240.0, "grad_norm": 1.5628037633182916, "language_loss": 0.85919631, "learning_rate": 2.0881844722161583e-06, "loss": 0.88017523, "num_input_tokens_seen": 179289810, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.390625, "step": 8339, "time_per_iteration": 2.4084701538085938 }, { "auxiliary_loss_clip": 0.01057513, "auxiliary_loss_mlp": 0.01044932, "balance_loss_clip": 1.01983821, "balance_loss_mlp": 1.01808703, "epoch": 0.5014279272508643, "flos": 26175501849600.0, "grad_norm": 1.522081880523542, "language_loss": 0.7207998, "learning_rate": 2.0877953876304962e-06, "loss": 0.74182421, "num_input_tokens_seen": 179310620, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.39453125, "step": 8340, "time_per_iteration": 2.416010856628418 }, { "auxiliary_loss_clip": 0.01060846, "auxiliary_loss_mlp": 0.01046922, "balance_loss_clip": 1.01896763, "balance_loss_mlp": 1.01873374, "epoch": 0.5014880505035323, "flos": 21429478782720.0, "grad_norm": 1.91958384658196, "language_loss": 0.79822785, "learning_rate": 2.0874062997156245e-06, "loss": 0.81930548, "num_input_tokens_seen": 179329005, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.421875, "step": 8341, "time_per_iteration": 2.372584104537964 }, { "auxiliary_loss_clip": 0.01061525, "auxiliary_loss_mlp": 0.01044649, "balance_loss_clip": 1.01528788, "balance_loss_mlp": 1.01909089, "epoch": 0.5015481737562002, "flos": 15770058059520.0, "grad_norm": 3.334853715037388, "language_loss": 0.90546811, "learning_rate": 2.0870172084862975e-06, "loss": 0.92652988, "num_input_tokens_seen": 179343785, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.42578125, "step": 8342, "time_per_iteration": 2.3573524951934814 }, { "auxiliary_loss_clip": 0.01056393, "auxiliary_loss_mlp": 0.01037951, "balance_loss_clip": 1.01295233, "balance_loss_mlp": 1.01781225, "epoch": 0.5016082970088682, "flos": 26829669594240.0, "grad_norm": 1.8474312477823314, "language_loss": 0.77369654, "learning_rate": 2.0866281139572682e-06, "loss": 0.79463995, "num_input_tokens_seen": 179364070, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38671875, "step": 8343, "time_per_iteration": 2.421483039855957 }, { "auxiliary_loss_clip": 0.01057602, "auxiliary_loss_mlp": 0.01036769, "balance_loss_clip": 1.01210451, "balance_loss_mlp": 1.01920676, "epoch": 0.5016684202615361, "flos": 21469523978880.0, "grad_norm": 1.8677277571357696, "language_loss": 0.68772542, "learning_rate": 2.086239016143293e-06, "loss": 0.70866913, "num_input_tokens_seen": 179384225, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3828125, "step": 8344, "time_per_iteration": 2.4263033866882324 }, { "auxiliary_loss_clip": 0.01060386, "auxiliary_loss_mlp": 0.01042268, "balance_loss_clip": 1.01672125, "balance_loss_mlp": 1.01979852, "epoch": 0.5017285435142042, "flos": 26245712327040.0, "grad_norm": 1.7495446699205126, "language_loss": 0.76734805, "learning_rate": 2.0858499150591258e-06, "loss": 0.78837466, "num_input_tokens_seen": 179402595, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.40625, "step": 8345, "time_per_iteration": 2.396933078765869 }, { "auxiliary_loss_clip": 0.01061492, "auxiliary_loss_mlp": 0.01047467, "balance_loss_clip": 1.01946437, "balance_loss_mlp": 1.02068686, "epoch": 0.5017886667668721, "flos": 20776498024320.0, "grad_norm": 1.8474247265429458, "language_loss": 0.79583383, "learning_rate": 2.0854608107195203e-06, "loss": 0.81692338, "num_input_tokens_seen": 179419635, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40820312, "step": 8346, "time_per_iteration": 2.4160497188568115 }, { "auxiliary_loss_clip": 0.0105806, "auxiliary_loss_mlp": 0.01044454, "balance_loss_clip": 1.01863313, "balance_loss_mlp": 1.01824558, "epoch": 0.5018487900195401, "flos": 20155393203840.0, "grad_norm": 1.6752918358441549, "language_loss": 0.7039696, "learning_rate": 2.0850717031392333e-06, "loss": 0.72499472, "num_input_tokens_seen": 179438770, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3984375, "step": 8347, "time_per_iteration": 2.3659303188323975 }, { "auxiliary_loss_clip": 0.01059074, "auxiliary_loss_mlp": 0.01050803, "balance_loss_clip": 1.02138233, "balance_loss_mlp": 1.0182507, "epoch": 0.501908913272208, "flos": 18149702751360.0, "grad_norm": 1.7947506721838362, "language_loss": 0.72791791, "learning_rate": 2.0846825923330174e-06, "loss": 0.74901664, "num_input_tokens_seen": 179457475, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.40820312, "step": 8348, "time_per_iteration": 2.3891489505767822 }, { "auxiliary_loss_clip": 0.01056123, "auxiliary_loss_mlp": 0.0104524, "balance_loss_clip": 1.01989639, "balance_loss_mlp": 1.01734567, "epoch": 0.501969036524876, "flos": 23111175018240.0, "grad_norm": 1.6815490749823887, "language_loss": 0.75678086, "learning_rate": 2.0842934783156303e-06, "loss": 0.77779448, "num_input_tokens_seen": 179478140, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38671875, "step": 8349, "time_per_iteration": 2.3844335079193115 }, { "auxiliary_loss_clip": 0.01058349, "auxiliary_loss_mlp": 0.01043996, "balance_loss_clip": 1.01635194, "balance_loss_mlp": 1.01764238, "epoch": 0.5020291597775439, "flos": 11362447601280.0, "grad_norm": 2.121833215493566, "language_loss": 0.65722823, "learning_rate": 2.0839043611018266e-06, "loss": 0.67825162, "num_input_tokens_seen": 179494325, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40625, "step": 8350, "time_per_iteration": 2.368807315826416 }, { "auxiliary_loss_clip": 0.01011409, "auxiliary_loss_mlp": 0.01003125, "balance_loss_clip": 1.00043142, "balance_loss_mlp": 1.00318348, "epoch": 0.5020892830302119, "flos": 64007873729280.0, "grad_norm": 0.7838515260069818, "language_loss": 0.59882939, "learning_rate": 2.0835152407063597e-06, "loss": 0.61897469, "num_input_tokens_seen": 179553545, "router_z_loss_clip": 0.02697754, "router_z_loss_mlp": 0.08203125, "step": 8351, "time_per_iteration": 3.1352503299713135 }, { "auxiliary_loss_clip": 0.01059364, "auxiliary_loss_mlp": 0.01042251, "balance_loss_clip": 1.0154295, "balance_loss_mlp": 1.01793694, "epoch": 0.5021494062828799, "flos": 23731721256960.0, "grad_norm": 1.7318389393488236, "language_loss": 0.76700127, "learning_rate": 2.0831261171439873e-06, "loss": 0.78801745, "num_input_tokens_seen": 179573645, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.4140625, "step": 8352, "time_per_iteration": 2.3997390270233154 }, { "auxiliary_loss_clip": 0.01058201, "auxiliary_loss_mlp": 0.01046791, "balance_loss_clip": 1.01850224, "balance_loss_mlp": 1.01858771, "epoch": 0.5022095295355479, "flos": 21575764846080.0, "grad_norm": 1.643710814342995, "language_loss": 0.73015594, "learning_rate": 2.082736990429464e-06, "loss": 0.75120592, "num_input_tokens_seen": 179591435, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.39648438, "step": 8353, "time_per_iteration": 2.362715482711792 }, { "auxiliary_loss_clip": 0.01059223, "auxiliary_loss_mlp": 0.01049908, "balance_loss_clip": 1.02033234, "balance_loss_mlp": 1.01918805, "epoch": 0.5022696527882159, "flos": 21396171479040.0, "grad_norm": 1.9580536518072933, "language_loss": 0.75480407, "learning_rate": 2.0823478605775455e-06, "loss": 0.77589536, "num_input_tokens_seen": 179609955, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.40039062, "step": 8354, "time_per_iteration": 2.3942837715148926 }, { "auxiliary_loss_clip": 0.0105831, "auxiliary_loss_mlp": 0.01044193, "balance_loss_clip": 1.0171082, "balance_loss_mlp": 1.01924944, "epoch": 0.5023297760408838, "flos": 27159528919680.0, "grad_norm": 1.5011179956805172, "language_loss": 0.73537952, "learning_rate": 2.0819587276029884e-06, "loss": 0.75640452, "num_input_tokens_seen": 179630875, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.390625, "step": 8355, "time_per_iteration": 2.4179279804229736 }, { "auxiliary_loss_clip": 0.01059941, "auxiliary_loss_mlp": 0.01037616, "balance_loss_clip": 1.00885129, "balance_loss_mlp": 1.01883245, "epoch": 0.5023898992935518, "flos": 26212614491520.0, "grad_norm": 1.6912783185109805, "language_loss": 0.82353151, "learning_rate": 2.081569591520548e-06, "loss": 0.8445071, "num_input_tokens_seen": 179649835, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41015625, "step": 8356, "time_per_iteration": 2.429704427719116 }, { "auxiliary_loss_clip": 0.01060105, "auxiliary_loss_mlp": 0.01045515, "balance_loss_clip": 1.01611829, "balance_loss_mlp": 1.01715183, "epoch": 0.5024500225462197, "flos": 13439570428800.0, "grad_norm": 2.4822634842450566, "language_loss": 0.78118765, "learning_rate": 2.0811804523449803e-06, "loss": 0.80224383, "num_input_tokens_seen": 179667605, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4296875, "step": 8357, "time_per_iteration": 2.337212562561035 }, { "auxiliary_loss_clip": 0.01058977, "auxiliary_loss_mlp": 0.01045986, "balance_loss_clip": 1.01786399, "balance_loss_mlp": 1.01871681, "epoch": 0.5025101457988878, "flos": 21578522843520.0, "grad_norm": 1.629949876756042, "language_loss": 0.78045279, "learning_rate": 2.0807913100910417e-06, "loss": 0.80150235, "num_input_tokens_seen": 179686910, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40234375, "step": 8358, "time_per_iteration": 2.4060282707214355 }, { "auxiliary_loss_clip": 0.01059873, "auxiliary_loss_mlp": 0.01047189, "balance_loss_clip": 1.01882887, "balance_loss_mlp": 1.01900589, "epoch": 0.5025702690515557, "flos": 24643966838400.0, "grad_norm": 2.3395452063173487, "language_loss": 0.74075127, "learning_rate": 2.0804021647734887e-06, "loss": 0.76182187, "num_input_tokens_seen": 179706395, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40820312, "step": 8359, "time_per_iteration": 2.394029378890991 }, { "auxiliary_loss_clip": 0.01058968, "auxiliary_loss_mlp": 0.01049766, "balance_loss_clip": 1.02367115, "balance_loss_mlp": 1.01910973, "epoch": 0.5026303923042237, "flos": 22089092699520.0, "grad_norm": 1.7875931782393355, "language_loss": 0.78354347, "learning_rate": 2.080013016407077e-06, "loss": 0.80463088, "num_input_tokens_seen": 179725735, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3984375, "step": 8360, "time_per_iteration": 2.4092788696289062 }, { "auxiliary_loss_clip": 0.01060168, "auxiliary_loss_mlp": 0.01045298, "balance_loss_clip": 1.01923871, "balance_loss_mlp": 1.01949835, "epoch": 0.5026905155568916, "flos": 23696039980800.0, "grad_norm": 1.6178176276556109, "language_loss": 0.77926683, "learning_rate": 2.0796238650065645e-06, "loss": 0.80032152, "num_input_tokens_seen": 179746150, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.40625, "step": 8361, "time_per_iteration": 2.4023187160491943 }, { "auxiliary_loss_clip": 0.01060137, "auxiliary_loss_mlp": 0.01043775, "balance_loss_clip": 1.0142585, "balance_loss_mlp": 1.01854289, "epoch": 0.5027506388095596, "flos": 25811218056960.0, "grad_norm": 1.6658925363156425, "language_loss": 0.86697662, "learning_rate": 2.0792347105867065e-06, "loss": 0.88801575, "num_input_tokens_seen": 179767550, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.41601562, "step": 8362, "time_per_iteration": 3.6526896953582764 }, { "auxiliary_loss_clip": 0.01059102, "auxiliary_loss_mlp": 0.01040339, "balance_loss_clip": 1.01362479, "balance_loss_mlp": 1.01853156, "epoch": 0.5028107620622275, "flos": 27525383723520.0, "grad_norm": 1.6449063650976044, "language_loss": 0.79787695, "learning_rate": 2.0788455531622605e-06, "loss": 0.81887138, "num_input_tokens_seen": 179790075, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.40625, "step": 8363, "time_per_iteration": 3.8421523571014404 }, { "auxiliary_loss_clip": 0.010562, "auxiliary_loss_mlp": 0.01035865, "balance_loss_clip": 1.00921011, "balance_loss_mlp": 1.01803613, "epoch": 0.5028708853148955, "flos": 24533152583040.0, "grad_norm": 2.0369175491394453, "language_loss": 0.7747606, "learning_rate": 2.0784563927479838e-06, "loss": 0.79568124, "num_input_tokens_seen": 179806515, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.3828125, "step": 8364, "time_per_iteration": 3.778409719467163 }, { "auxiliary_loss_clip": 0.01055881, "auxiliary_loss_mlp": 0.01043115, "balance_loss_clip": 1.01824784, "balance_loss_mlp": 1.01725686, "epoch": 0.5029310085675635, "flos": 20812563325440.0, "grad_norm": 1.7215615123326828, "language_loss": 0.71149212, "learning_rate": 2.0780672293586317e-06, "loss": 0.73248208, "num_input_tokens_seen": 179826450, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38671875, "step": 8365, "time_per_iteration": 2.377464532852173 }, { "auxiliary_loss_clip": 0.01062623, "auxiliary_loss_mlp": 0.01049907, "balance_loss_clip": 1.02012837, "balance_loss_mlp": 1.01963592, "epoch": 0.5029911318202315, "flos": 22341479984640.0, "grad_norm": 1.6036553173866273, "language_loss": 0.74149007, "learning_rate": 2.0776780630089635e-06, "loss": 0.76261538, "num_input_tokens_seen": 179846770, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4296875, "step": 8366, "time_per_iteration": 2.3989102840423584 }, { "auxiliary_loss_clip": 0.01059058, "auxiliary_loss_mlp": 0.01042145, "balance_loss_clip": 1.01541841, "balance_loss_mlp": 1.01929653, "epoch": 0.5030512550728995, "flos": 24351569268480.0, "grad_norm": 1.4087794650710723, "language_loss": 0.78995395, "learning_rate": 2.077288893713735e-06, "loss": 0.81096596, "num_input_tokens_seen": 179866585, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.3984375, "step": 8367, "time_per_iteration": 2.552541494369507 }, { "auxiliary_loss_clip": 0.01057158, "auxiliary_loss_mlp": 0.01038511, "balance_loss_clip": 1.01398921, "balance_loss_mlp": 1.01765752, "epoch": 0.5031113783255674, "flos": 18258945995520.0, "grad_norm": 2.074677557633191, "language_loss": 0.71073854, "learning_rate": 2.0768997214877035e-06, "loss": 0.73169518, "num_input_tokens_seen": 179885575, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.39453125, "step": 8368, "time_per_iteration": 2.337125062942505 }, { "auxiliary_loss_clip": 0.0101151, "auxiliary_loss_mlp": 0.01001976, "balance_loss_clip": 0.99980652, "balance_loss_mlp": 1.00351465, "epoch": 0.5031715015782354, "flos": 57250364924160.0, "grad_norm": 1.3685317063773643, "language_loss": 0.63425505, "learning_rate": 2.0765105463456274e-06, "loss": 0.65438992, "num_input_tokens_seen": 179939650, "router_z_loss_clip": 0.02172852, "router_z_loss_mlp": 0.08007812, "step": 8369, "time_per_iteration": 2.9658255577087402 }, { "auxiliary_loss_clip": 0.01057935, "auxiliary_loss_mlp": 0.01041348, "balance_loss_clip": 1.01583695, "balance_loss_mlp": 1.01918674, "epoch": 0.5032316248309033, "flos": 27526116862080.0, "grad_norm": 2.035264789272788, "language_loss": 0.6151377, "learning_rate": 2.076121368302263e-06, "loss": 0.63613051, "num_input_tokens_seen": 179961765, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38671875, "step": 8370, "time_per_iteration": 2.420901298522949 }, { "auxiliary_loss_clip": 0.01058672, "auxiliary_loss_mlp": 0.01043064, "balance_loss_clip": 1.01567018, "balance_loss_mlp": 1.01819146, "epoch": 0.5032917480835714, "flos": 34494396744960.0, "grad_norm": 1.7335215572404412, "language_loss": 0.69754529, "learning_rate": 2.0757321873723695e-06, "loss": 0.71856266, "num_input_tokens_seen": 179983015, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40429688, "step": 8371, "time_per_iteration": 2.4743804931640625 }, { "auxiliary_loss_clip": 0.010599, "auxiliary_loss_mlp": 0.01043912, "balance_loss_clip": 1.01559997, "balance_loss_mlp": 1.01797211, "epoch": 0.5033518713362393, "flos": 33655364017920.0, "grad_norm": 1.598993339088028, "language_loss": 0.69056016, "learning_rate": 2.0753430035707042e-06, "loss": 0.71159828, "num_input_tokens_seen": 180003210, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.41796875, "step": 8372, "time_per_iteration": 3.9135406017303467 }, { "auxiliary_loss_clip": 0.01059169, "auxiliary_loss_mlp": 0.01052198, "balance_loss_clip": 1.02144241, "balance_loss_mlp": 1.01862657, "epoch": 0.5034119945889073, "flos": 28184194679040.0, "grad_norm": 1.6261061395539511, "language_loss": 0.67846233, "learning_rate": 2.0749538169120235e-06, "loss": 0.69957602, "num_input_tokens_seen": 180025530, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.40625, "step": 8373, "time_per_iteration": 2.44113826751709 }, { "auxiliary_loss_clip": 0.01056588, "auxiliary_loss_mlp": 0.01046102, "balance_loss_clip": 1.02029371, "balance_loss_mlp": 1.01748586, "epoch": 0.5034721178415752, "flos": 21357697294080.0, "grad_norm": 1.5261485116538334, "language_loss": 0.75453019, "learning_rate": 2.0745646274110872e-06, "loss": 0.77555704, "num_input_tokens_seen": 180043180, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.390625, "step": 8374, "time_per_iteration": 2.3769383430480957 }, { "auxiliary_loss_clip": 0.01057313, "auxiliary_loss_mlp": 0.01045287, "balance_loss_clip": 1.01773763, "balance_loss_mlp": 1.01740766, "epoch": 0.5035322410942432, "flos": 22673713282560.0, "grad_norm": 1.972120101952305, "language_loss": 0.69357574, "learning_rate": 2.0741754350826525e-06, "loss": 0.71460176, "num_input_tokens_seen": 180062905, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.3984375, "step": 8375, "time_per_iteration": 2.3592889308929443 }, { "auxiliary_loss_clip": 0.01061443, "auxiliary_loss_mlp": 0.01046016, "balance_loss_clip": 1.01428199, "balance_loss_mlp": 1.01882601, "epoch": 0.5035923643469111, "flos": 19827698382720.0, "grad_norm": 1.6917493748688763, "language_loss": 0.80497748, "learning_rate": 2.0737862399414777e-06, "loss": 0.82605207, "num_input_tokens_seen": 180082000, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.42578125, "step": 8376, "time_per_iteration": 2.4184179306030273 }, { "auxiliary_loss_clip": 0.01058785, "auxiliary_loss_mlp": 0.01045576, "balance_loss_clip": 1.01710892, "balance_loss_mlp": 1.01727986, "epoch": 0.5036524875995791, "flos": 30513425500800.0, "grad_norm": 1.988916482082769, "language_loss": 0.60563326, "learning_rate": 2.0733970420023213e-06, "loss": 0.6266768, "num_input_tokens_seen": 180101340, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41601562, "step": 8377, "time_per_iteration": 2.440805435180664 }, { "auxiliary_loss_clip": 0.01058113, "auxiliary_loss_mlp": 0.0104607, "balance_loss_clip": 1.01682758, "balance_loss_mlp": 1.01744318, "epoch": 0.5037126108522471, "flos": 14719695672960.0, "grad_norm": 1.8912633971644486, "language_loss": 0.77317178, "learning_rate": 2.0730078412799425e-06, "loss": 0.79421353, "num_input_tokens_seen": 180119160, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.40625, "step": 8378, "time_per_iteration": 2.3569552898406982 }, { "auxiliary_loss_clip": 0.01058657, "auxiliary_loss_mlp": 0.01036772, "balance_loss_clip": 1.00970006, "balance_loss_mlp": 1.01832664, "epoch": 0.5037727341049151, "flos": 25296633394560.0, "grad_norm": 1.5830997320019373, "language_loss": 0.76142758, "learning_rate": 2.0726186377890985e-06, "loss": 0.78238189, "num_input_tokens_seen": 180138730, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40234375, "step": 8379, "time_per_iteration": 2.4272427558898926 }, { "auxiliary_loss_clip": 0.01057157, "auxiliary_loss_mlp": 0.01040792, "balance_loss_clip": 1.01592481, "balance_loss_mlp": 1.0179534, "epoch": 0.5038328573575831, "flos": 28540693238400.0, "grad_norm": 2.537341728091919, "language_loss": 0.68237805, "learning_rate": 2.072229431544548e-06, "loss": 0.70335758, "num_input_tokens_seen": 180158810, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.39257812, "step": 8380, "time_per_iteration": 2.43739914894104 }, { "auxiliary_loss_clip": 0.01058015, "auxiliary_loss_mlp": 0.0103973, "balance_loss_clip": 1.01340866, "balance_loss_mlp": 1.01837206, "epoch": 0.503892980610251, "flos": 31648521490560.0, "grad_norm": 1.9732145070357676, "language_loss": 0.64108217, "learning_rate": 2.071840222561051e-06, "loss": 0.66205955, "num_input_tokens_seen": 180179700, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.39648438, "step": 8381, "time_per_iteration": 2.465404510498047 }, { "auxiliary_loss_clip": 0.01057207, "auxiliary_loss_mlp": 0.01040941, "balance_loss_clip": 1.01471472, "balance_loss_mlp": 1.01819444, "epoch": 0.503953103862919, "flos": 27088131456000.0, "grad_norm": 1.434289581007736, "language_loss": 0.68437755, "learning_rate": 2.071451010853365e-06, "loss": 0.70535898, "num_input_tokens_seen": 180199890, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.390625, "step": 8382, "time_per_iteration": 2.4047036170959473 }, { "auxiliary_loss_clip": 0.0106262, "auxiliary_loss_mlp": 0.01054434, "balance_loss_clip": 1.02477479, "balance_loss_mlp": 1.01962352, "epoch": 0.5040132271155869, "flos": 15632045988480.0, "grad_norm": 3.007779295854294, "language_loss": 0.63624251, "learning_rate": 2.0710617964362506e-06, "loss": 0.65741301, "num_input_tokens_seen": 180217840, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4296875, "step": 8383, "time_per_iteration": 2.3579912185668945 }, { "auxiliary_loss_clip": 0.01058191, "auxiliary_loss_mlp": 0.01040268, "balance_loss_clip": 1.01454258, "balance_loss_mlp": 1.01898491, "epoch": 0.504073350368255, "flos": 13589242894080.0, "grad_norm": 2.2549980618577643, "language_loss": 0.68952703, "learning_rate": 2.070672579324465e-06, "loss": 0.71051162, "num_input_tokens_seen": 180236465, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.39257812, "step": 8384, "time_per_iteration": 2.3433139324188232 }, { "auxiliary_loss_clip": 0.0105844, "auxiliary_loss_mlp": 0.01045959, "balance_loss_clip": 1.0190053, "balance_loss_mlp": 1.01800227, "epoch": 0.5041334736209229, "flos": 29056918734720.0, "grad_norm": 5.553742068911801, "language_loss": 0.72495198, "learning_rate": 2.0702833595327674e-06, "loss": 0.74599594, "num_input_tokens_seen": 180258025, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40429688, "step": 8385, "time_per_iteration": 2.4376590251922607 }, { "auxiliary_loss_clip": 0.01055575, "auxiliary_loss_mlp": 0.01041313, "balance_loss_clip": 1.01700616, "balance_loss_mlp": 1.01756358, "epoch": 0.5041935968735909, "flos": 24607203310080.0, "grad_norm": 1.9207196368151191, "language_loss": 0.84418559, "learning_rate": 2.069894137075919e-06, "loss": 0.86515445, "num_input_tokens_seen": 180277825, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.38085938, "step": 8386, "time_per_iteration": 2.382720947265625 }, { "auxiliary_loss_clip": 0.0106049, "auxiliary_loss_mlp": 0.01045834, "balance_loss_clip": 1.01689005, "balance_loss_mlp": 1.01910901, "epoch": 0.5042537201262588, "flos": 26285722611840.0, "grad_norm": 1.4358167674112996, "language_loss": 0.67824709, "learning_rate": 2.0695049119686766e-06, "loss": 0.6993103, "num_input_tokens_seen": 180300465, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4140625, "step": 8387, "time_per_iteration": 2.4458038806915283 }, { "auxiliary_loss_clip": 0.0105869, "auxiliary_loss_mlp": 0.0104741, "balance_loss_clip": 1.02286494, "balance_loss_mlp": 1.01895022, "epoch": 0.5043138433789268, "flos": 22016298781440.0, "grad_norm": 1.4493819455118706, "language_loss": 0.81017447, "learning_rate": 2.0691156842258016e-06, "loss": 0.83123553, "num_input_tokens_seen": 180321050, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.3984375, "step": 8388, "time_per_iteration": 2.3896632194519043 }, { "auxiliary_loss_clip": 0.01057638, "auxiliary_loss_mlp": 0.01042263, "balance_loss_clip": 1.01466644, "balance_loss_mlp": 1.01786041, "epoch": 0.5043739666315947, "flos": 28765847796480.0, "grad_norm": 2.250255956982847, "language_loss": 0.70520318, "learning_rate": 2.0687264538620537e-06, "loss": 0.72620225, "num_input_tokens_seen": 180338870, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.39648438, "step": 8389, "time_per_iteration": 2.4465320110321045 }, { "auxiliary_loss_clip": 0.01060462, "auxiliary_loss_mlp": 0.01040276, "balance_loss_clip": 1.0144906, "balance_loss_mlp": 1.01872516, "epoch": 0.5044340898842627, "flos": 27598037996160.0, "grad_norm": 2.047799919377753, "language_loss": 0.70631689, "learning_rate": 2.068337220892191e-06, "loss": 0.72732425, "num_input_tokens_seen": 180361285, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.41796875, "step": 8390, "time_per_iteration": 2.4149348735809326 }, { "auxiliary_loss_clip": 0.01010674, "auxiliary_loss_mlp": 0.01002161, "balance_loss_clip": 0.99978912, "balance_loss_mlp": 1.00264597, "epoch": 0.5044942131369307, "flos": 67455268531200.0, "grad_norm": 0.8383855917650871, "language_loss": 0.53014278, "learning_rate": 2.067947985330974e-06, "loss": 0.55027115, "num_input_tokens_seen": 180415170, "router_z_loss_clip": 0.02368164, "router_z_loss_mlp": 0.08007812, "step": 8391, "time_per_iteration": 2.9352505207061768 }, { "auxiliary_loss_clip": 0.01010541, "auxiliary_loss_mlp": 0.01003591, "balance_loss_clip": 1.00142145, "balance_loss_mlp": 1.00248814, "epoch": 0.5045543363895987, "flos": 58628247575040.0, "grad_norm": 0.8754830972818822, "language_loss": 0.60830021, "learning_rate": 2.0675587471931628e-06, "loss": 0.62844157, "num_input_tokens_seen": 180468060, "router_z_loss_clip": 0.02172852, "router_z_loss_mlp": 0.08007812, "step": 8392, "time_per_iteration": 2.8591761589050293 }, { "auxiliary_loss_clip": 0.01056774, "auxiliary_loss_mlp": 0.01038062, "balance_loss_clip": 1.01172829, "balance_loss_mlp": 1.01848221, "epoch": 0.5046144596422667, "flos": 22525576917120.0, "grad_norm": 2.6382121526421565, "language_loss": 0.85658652, "learning_rate": 2.067169506493517e-06, "loss": 0.87753487, "num_input_tokens_seen": 180486610, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.3828125, "step": 8393, "time_per_iteration": 2.408466339111328 }, { "auxiliary_loss_clip": 0.01059441, "auxiliary_loss_mlp": 0.0104054, "balance_loss_clip": 1.01450515, "balance_loss_mlp": 1.01985455, "epoch": 0.5046745828949346, "flos": 27453008741760.0, "grad_norm": 1.896565364267999, "language_loss": 0.5280993, "learning_rate": 2.0667802632467974e-06, "loss": 0.54909909, "num_input_tokens_seen": 180508135, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39453125, "step": 8394, "time_per_iteration": 2.4049265384674072 }, { "auxiliary_loss_clip": 0.01058014, "auxiliary_loss_mlp": 0.01047741, "balance_loss_clip": 1.01791465, "balance_loss_mlp": 1.01792049, "epoch": 0.5047347061476026, "flos": 17273592293760.0, "grad_norm": 3.364388372301817, "language_loss": 0.7630344, "learning_rate": 2.0663910174677627e-06, "loss": 0.78409195, "num_input_tokens_seen": 180527000, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.40039062, "step": 8395, "time_per_iteration": 2.401951313018799 }, { "auxiliary_loss_clip": 0.01060513, "auxiliary_loss_mlp": 0.01041044, "balance_loss_clip": 1.0142101, "balance_loss_mlp": 1.01983619, "epoch": 0.5047948294002705, "flos": 16648717046400.0, "grad_norm": 3.115389506048331, "language_loss": 0.69333977, "learning_rate": 2.0660017691711737e-06, "loss": 0.71435535, "num_input_tokens_seen": 180544715, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40625, "step": 8396, "time_per_iteration": 2.3416330814361572 }, { "auxiliary_loss_clip": 0.01059804, "auxiliary_loss_mlp": 0.01040646, "balance_loss_clip": 1.01530254, "balance_loss_mlp": 1.02028346, "epoch": 0.5048549526529386, "flos": 26864617731840.0, "grad_norm": 1.6992748131085293, "language_loss": 0.80077219, "learning_rate": 2.065612518371792e-06, "loss": 0.82177669, "num_input_tokens_seen": 180565365, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.39453125, "step": 8397, "time_per_iteration": 2.457522392272949 }, { "auxiliary_loss_clip": 0.01057477, "auxiliary_loss_mlp": 0.01040606, "balance_loss_clip": 1.01486826, "balance_loss_mlp": 1.01761079, "epoch": 0.5049150759056065, "flos": 21832900076160.0, "grad_norm": 1.5306463405434139, "language_loss": 0.66758406, "learning_rate": 2.065223265084376e-06, "loss": 0.6885649, "num_input_tokens_seen": 180586670, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3984375, "step": 8398, "time_per_iteration": 2.385115385055542 }, { "auxiliary_loss_clip": 0.01058601, "auxiliary_loss_mlp": 0.01044714, "balance_loss_clip": 1.01760554, "balance_loss_mlp": 1.01857507, "epoch": 0.5049751991582745, "flos": 21684833533440.0, "grad_norm": 1.6146264500306389, "language_loss": 0.73117077, "learning_rate": 2.064834009323688e-06, "loss": 0.75220394, "num_input_tokens_seen": 180605085, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40039062, "step": 8399, "time_per_iteration": 2.390725612640381 }, { "auxiliary_loss_clip": 0.01060565, "auxiliary_loss_mlp": 0.0104491, "balance_loss_clip": 1.01680088, "balance_loss_mlp": 1.02026176, "epoch": 0.5050353224109424, "flos": 21358360609920.0, "grad_norm": 1.8786033433842735, "language_loss": 0.82951784, "learning_rate": 2.0644447511044878e-06, "loss": 0.85057265, "num_input_tokens_seen": 180624370, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40234375, "step": 8400, "time_per_iteration": 2.3603641986846924 }, { "auxiliary_loss_clip": 0.01058301, "auxiliary_loss_mlp": 0.01043368, "balance_loss_clip": 1.0171895, "balance_loss_mlp": 1.01853502, "epoch": 0.5050954456636104, "flos": 22818986916480.0, "grad_norm": 1.8926505079523883, "language_loss": 0.79944402, "learning_rate": 2.0640554904415362e-06, "loss": 0.82046074, "num_input_tokens_seen": 180642450, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3984375, "step": 8401, "time_per_iteration": 3.697805881500244 }, { "auxiliary_loss_clip": 0.01059801, "auxiliary_loss_mlp": 0.01036924, "balance_loss_clip": 1.00943446, "balance_loss_mlp": 1.01861703, "epoch": 0.5051555689162783, "flos": 30446845804800.0, "grad_norm": 1.6369783480841937, "language_loss": 0.71131104, "learning_rate": 2.063666227349593e-06, "loss": 0.73227835, "num_input_tokens_seen": 180665250, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41210938, "step": 8402, "time_per_iteration": 2.4881255626678467 }, { "auxiliary_loss_clip": 0.01059256, "auxiliary_loss_mlp": 0.01040911, "balance_loss_clip": 1.01429176, "balance_loss_mlp": 1.01845741, "epoch": 0.5052156921689464, "flos": 21286893323520.0, "grad_norm": 1.618637783982206, "language_loss": 0.70703787, "learning_rate": 2.063276961843422e-06, "loss": 0.7280395, "num_input_tokens_seen": 180687425, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40820312, "step": 8403, "time_per_iteration": 3.836982488632202 }, { "auxiliary_loss_clip": 0.01057719, "auxiliary_loss_mlp": 0.01039895, "balance_loss_clip": 1.01370478, "balance_loss_mlp": 1.01962328, "epoch": 0.5052758154216143, "flos": 25080171765120.0, "grad_norm": 1.4165721963452325, "language_loss": 0.86887801, "learning_rate": 2.062887693937781e-06, "loss": 0.88985419, "num_input_tokens_seen": 180708725, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38085938, "step": 8404, "time_per_iteration": 3.8394412994384766 }, { "auxiliary_loss_clip": 0.01059337, "auxiliary_loss_mlp": 0.0104985, "balance_loss_clip": 1.02425551, "balance_loss_mlp": 1.01914728, "epoch": 0.5053359386742823, "flos": 20884484459520.0, "grad_norm": 1.5852179310111671, "language_loss": 0.76223224, "learning_rate": 2.0624984236474322e-06, "loss": 0.78332406, "num_input_tokens_seen": 180727990, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.40234375, "step": 8405, "time_per_iteration": 2.3918955326080322 }, { "auxiliary_loss_clip": 0.0106019, "auxiliary_loss_mlp": 0.01044511, "balance_loss_clip": 1.01529288, "balance_loss_mlp": 1.01870584, "epoch": 0.5053960619269503, "flos": 37741808079360.0, "grad_norm": 2.2931961470698545, "language_loss": 0.74241686, "learning_rate": 2.0621091509871378e-06, "loss": 0.76346385, "num_input_tokens_seen": 180749765, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4140625, "step": 8406, "time_per_iteration": 2.506713628768921 }, { "auxiliary_loss_clip": 0.01056945, "auxiliary_loss_mlp": 0.01037928, "balance_loss_clip": 1.01295412, "balance_loss_mlp": 1.01866412, "epoch": 0.5054561851796182, "flos": 23512711098240.0, "grad_norm": 2.8187455463443274, "language_loss": 0.7782886, "learning_rate": 2.0617198759716568e-06, "loss": 0.79923737, "num_input_tokens_seen": 180769580, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3828125, "step": 8407, "time_per_iteration": 2.415842294692993 }, { "auxiliary_loss_clip": 0.01058043, "auxiliary_loss_mlp": 0.01040743, "balance_loss_clip": 1.01539958, "balance_loss_mlp": 1.01760864, "epoch": 0.5055163084322862, "flos": 30408895290240.0, "grad_norm": 1.954245414137095, "language_loss": 0.64672351, "learning_rate": 2.0613305986157535e-06, "loss": 0.66771138, "num_input_tokens_seen": 180790295, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40429688, "step": 8408, "time_per_iteration": 2.451498031616211 }, { "auxiliary_loss_clip": 0.01058592, "auxiliary_loss_mlp": 0.01042112, "balance_loss_clip": 1.01402628, "balance_loss_mlp": 1.01844406, "epoch": 0.5055764316849541, "flos": 20258806250880.0, "grad_norm": 2.3468632071753084, "language_loss": 0.65098208, "learning_rate": 2.0609413189341865e-06, "loss": 0.67198914, "num_input_tokens_seen": 180807875, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40234375, "step": 8409, "time_per_iteration": 2.37862491607666 }, { "auxiliary_loss_clip": 0.0105786, "auxiliary_loss_mlp": 0.01039873, "balance_loss_clip": 1.01542306, "balance_loss_mlp": 1.01833034, "epoch": 0.5056365549376222, "flos": 26069610096000.0, "grad_norm": 1.285142897054033, "language_loss": 0.7179395, "learning_rate": 2.0605520369417193e-06, "loss": 0.73891681, "num_input_tokens_seen": 180831300, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.39648438, "step": 8410, "time_per_iteration": 2.438717842102051 }, { "auxiliary_loss_clip": 0.01059381, "auxiliary_loss_mlp": 0.01053495, "balance_loss_clip": 1.02582633, "balance_loss_mlp": 1.01933265, "epoch": 0.5056966781902901, "flos": 19278130671360.0, "grad_norm": 1.5948358737379504, "language_loss": 0.80014116, "learning_rate": 2.060162752653113e-06, "loss": 0.82126987, "num_input_tokens_seen": 180849055, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40039062, "step": 8411, "time_per_iteration": 3.851804733276367 }, { "auxiliary_loss_clip": 0.01060291, "auxiliary_loss_mlp": 0.01050521, "balance_loss_clip": 1.02098107, "balance_loss_mlp": 1.01989615, "epoch": 0.5057568014429581, "flos": 21322295308800.0, "grad_norm": 1.825580662684488, "language_loss": 0.82583284, "learning_rate": 2.0597734660831285e-06, "loss": 0.84694093, "num_input_tokens_seen": 180867395, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.40429688, "step": 8412, "time_per_iteration": 2.3599846363067627 }, { "auxiliary_loss_clip": 0.01059418, "auxiliary_loss_mlp": 0.01051967, "balance_loss_clip": 1.02582419, "balance_loss_mlp": 1.01884818, "epoch": 0.505816924695626, "flos": 17492637363840.0, "grad_norm": 1.780961354408906, "language_loss": 0.80882448, "learning_rate": 2.0593841772465283e-06, "loss": 0.82993829, "num_input_tokens_seen": 180886670, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40625, "step": 8413, "time_per_iteration": 2.401076316833496 }, { "auxiliary_loss_clip": 0.0106104, "auxiliary_loss_mlp": 0.01048025, "balance_loss_clip": 1.0198437, "balance_loss_mlp": 1.01851118, "epoch": 0.505877047948294, "flos": 21141026196480.0, "grad_norm": 2.0133369355972817, "language_loss": 0.82562053, "learning_rate": 2.0589948861580737e-06, "loss": 0.84671116, "num_input_tokens_seen": 180904645, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.42578125, "step": 8414, "time_per_iteration": 2.362041473388672 }, { "auxiliary_loss_clip": 0.01057764, "auxiliary_loss_mlp": 0.01039555, "balance_loss_clip": 1.01230407, "balance_loss_mlp": 1.016909, "epoch": 0.5059371712009619, "flos": 36348738986880.0, "grad_norm": 2.1815117343372763, "language_loss": 0.64325547, "learning_rate": 2.058605592832528e-06, "loss": 0.66422862, "num_input_tokens_seen": 180922340, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40820312, "step": 8415, "time_per_iteration": 2.496384382247925 }, { "auxiliary_loss_clip": 0.01058126, "auxiliary_loss_mlp": 0.01046242, "balance_loss_clip": 1.01651168, "balance_loss_mlp": 1.01746094, "epoch": 0.50599729445363, "flos": 22672316828160.0, "grad_norm": 1.5500389344848098, "language_loss": 0.83542609, "learning_rate": 2.0582162972846515e-06, "loss": 0.85646981, "num_input_tokens_seen": 180941350, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.40625, "step": 8416, "time_per_iteration": 2.3660833835601807 }, { "auxiliary_loss_clip": 0.01056948, "auxiliary_loss_mlp": 0.01045416, "balance_loss_clip": 1.01966655, "balance_loss_mlp": 1.01829112, "epoch": 0.5060574177062979, "flos": 22746751580160.0, "grad_norm": 1.5303912773577506, "language_loss": 0.80296171, "learning_rate": 2.0578269995292078e-06, "loss": 0.82398534, "num_input_tokens_seen": 180960720, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38671875, "step": 8417, "time_per_iteration": 2.4141528606414795 }, { "auxiliary_loss_clip": 0.01056452, "auxiliary_loss_mlp": 0.01051028, "balance_loss_clip": 1.02384818, "balance_loss_mlp": 1.01809287, "epoch": 0.5061175409589659, "flos": 21652119722880.0, "grad_norm": 1.8637302153441762, "language_loss": 0.64286637, "learning_rate": 2.0574376995809588e-06, "loss": 0.66394114, "num_input_tokens_seen": 180979725, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.3828125, "step": 8418, "time_per_iteration": 2.3920953273773193 }, { "auxiliary_loss_clip": 0.01061362, "auxiliary_loss_mlp": 0.01039654, "balance_loss_clip": 1.01160407, "balance_loss_mlp": 1.0189693, "epoch": 0.5061776642116339, "flos": 21615181637760.0, "grad_norm": 2.0517525684761124, "language_loss": 0.79033387, "learning_rate": 2.0570483974546653e-06, "loss": 0.81134403, "num_input_tokens_seen": 180998980, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.42382812, "step": 8419, "time_per_iteration": 2.3916120529174805 }, { "auxiliary_loss_clip": 0.01060534, "auxiliary_loss_mlp": 0.01045496, "balance_loss_clip": 1.01752925, "balance_loss_mlp": 1.01912773, "epoch": 0.5062377874643018, "flos": 24425131236480.0, "grad_norm": 1.8822221211828354, "language_loss": 0.79060131, "learning_rate": 2.0566590931650917e-06, "loss": 0.8116616, "num_input_tokens_seen": 181019165, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.4140625, "step": 8420, "time_per_iteration": 2.389162063598633 }, { "auxiliary_loss_clip": 0.01059517, "auxiliary_loss_mlp": 0.01045685, "balance_loss_clip": 1.01683581, "balance_loss_mlp": 1.01936269, "epoch": 0.5062979107169698, "flos": 22523447324160.0, "grad_norm": 3.1900420878541493, "language_loss": 0.78376848, "learning_rate": 2.056269786726999e-06, "loss": 0.80482048, "num_input_tokens_seen": 181037110, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.40234375, "step": 8421, "time_per_iteration": 2.424870729446411 }, { "auxiliary_loss_clip": 0.01059675, "auxiliary_loss_mlp": 0.01040049, "balance_loss_clip": 1.01245189, "balance_loss_mlp": 1.0187856, "epoch": 0.5063580339696377, "flos": 24570823806720.0, "grad_norm": 1.443387530589, "language_loss": 0.68272752, "learning_rate": 2.0558804781551512e-06, "loss": 0.70372474, "num_input_tokens_seen": 181057775, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40820312, "step": 8422, "time_per_iteration": 2.4087188243865967 }, { "auxiliary_loss_clip": 0.01058224, "auxiliary_loss_mlp": 0.01045909, "balance_loss_clip": 1.01740587, "balance_loss_mlp": 1.01832426, "epoch": 0.5064181572223058, "flos": 22595193901440.0, "grad_norm": 3.50432983043544, "language_loss": 0.82680225, "learning_rate": 2.05549116746431e-06, "loss": 0.84784359, "num_input_tokens_seen": 181078260, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.3984375, "step": 8423, "time_per_iteration": 2.4642202854156494 }, { "auxiliary_loss_clip": 0.0106174, "auxiliary_loss_mlp": 0.01044068, "balance_loss_clip": 1.0150528, "balance_loss_mlp": 1.01997089, "epoch": 0.5064782804749737, "flos": 25993743978240.0, "grad_norm": 1.873029817935076, "language_loss": 0.76655161, "learning_rate": 2.055101854669237e-06, "loss": 0.7876097, "num_input_tokens_seen": 181098755, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41796875, "step": 8424, "time_per_iteration": 2.410820245742798 }, { "auxiliary_loss_clip": 0.01058411, "auxiliary_loss_mlp": 0.01047091, "balance_loss_clip": 1.01874256, "balance_loss_mlp": 1.0185678, "epoch": 0.5065384037276417, "flos": 28551655405440.0, "grad_norm": 1.416509653630379, "language_loss": 0.71898234, "learning_rate": 2.0547125397846975e-06, "loss": 0.74003732, "num_input_tokens_seen": 181121570, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.3984375, "step": 8425, "time_per_iteration": 2.4835565090179443 }, { "auxiliary_loss_clip": 0.01059462, "auxiliary_loss_mlp": 0.01044631, "balance_loss_clip": 1.01758242, "balance_loss_mlp": 1.01891637, "epoch": 0.5065985269803096, "flos": 22964923866240.0, "grad_norm": 1.970271908867102, "language_loss": 0.79876053, "learning_rate": 2.0543232228254524e-06, "loss": 0.81980145, "num_input_tokens_seen": 181140240, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40625, "step": 8426, "time_per_iteration": 2.381798505783081 }, { "auxiliary_loss_clip": 0.01061995, "auxiliary_loss_mlp": 0.01042711, "balance_loss_clip": 1.01361251, "balance_loss_mlp": 1.02007687, "epoch": 0.5066586502329776, "flos": 21607710606720.0, "grad_norm": 2.2705617730664382, "language_loss": 0.79427338, "learning_rate": 2.053933903806265e-06, "loss": 0.81532043, "num_input_tokens_seen": 181158630, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41992188, "step": 8427, "time_per_iteration": 2.401426076889038 }, { "auxiliary_loss_clip": 0.01056571, "auxiliary_loss_mlp": 0.0104073, "balance_loss_clip": 1.0135262, "balance_loss_mlp": 1.01772189, "epoch": 0.5067187734856455, "flos": 20338861731840.0, "grad_norm": 1.6959665201401304, "language_loss": 0.72757256, "learning_rate": 2.0535445827418997e-06, "loss": 0.74854565, "num_input_tokens_seen": 181176405, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.38867188, "step": 8428, "time_per_iteration": 2.388887643814087 }, { "auxiliary_loss_clip": 0.01057507, "auxiliary_loss_mlp": 0.01042383, "balance_loss_clip": 1.01646709, "balance_loss_mlp": 1.01780117, "epoch": 0.5067788967383136, "flos": 28839793789440.0, "grad_norm": 1.584197666321564, "language_loss": 0.84232414, "learning_rate": 2.0531552596471168e-06, "loss": 0.86332309, "num_input_tokens_seen": 181197595, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39648438, "step": 8429, "time_per_iteration": 2.42390775680542 }, { "auxiliary_loss_clip": 0.01062794, "auxiliary_loss_mlp": 0.01049705, "balance_loss_clip": 1.01992631, "balance_loss_mlp": 1.02008057, "epoch": 0.5068390199909815, "flos": 32448870564480.0, "grad_norm": 1.8701501045478275, "language_loss": 0.7325362, "learning_rate": 2.052765934536682e-06, "loss": 0.75366116, "num_input_tokens_seen": 181218560, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.42773438, "step": 8430, "time_per_iteration": 2.492708444595337 }, { "auxiliary_loss_clip": 0.01060579, "auxiliary_loss_mlp": 0.01044291, "balance_loss_clip": 1.01709974, "balance_loss_mlp": 1.01896358, "epoch": 0.5068991432436495, "flos": 23145529662720.0, "grad_norm": 1.6053442330517715, "language_loss": 0.77801883, "learning_rate": 2.0523766074253575e-06, "loss": 0.7990675, "num_input_tokens_seen": 181237095, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.41601562, "step": 8431, "time_per_iteration": 2.377077579498291 }, { "auxiliary_loss_clip": 0.01059553, "auxiliary_loss_mlp": 0.01046219, "balance_loss_clip": 1.01924241, "balance_loss_mlp": 1.01975226, "epoch": 0.5069592664963174, "flos": 19935126236160.0, "grad_norm": 2.6860181299786445, "language_loss": 0.73223424, "learning_rate": 2.0519872783279074e-06, "loss": 0.75329196, "num_input_tokens_seen": 181255940, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.3984375, "step": 8432, "time_per_iteration": 2.3855605125427246 }, { "auxiliary_loss_clip": 0.01012398, "auxiliary_loss_mlp": 0.01002687, "balance_loss_clip": 1.00056541, "balance_loss_mlp": 1.00438821, "epoch": 0.5070193897489854, "flos": 65790643950720.0, "grad_norm": 0.7591033104976589, "language_loss": 0.63798451, "learning_rate": 2.0515979472590945e-06, "loss": 0.65813529, "num_input_tokens_seen": 181316945, "router_z_loss_clip": 0.02124023, "router_z_loss_mlp": 0.08007812, "step": 8433, "time_per_iteration": 3.0705459117889404 }, { "auxiliary_loss_clip": 0.01059378, "auxiliary_loss_mlp": 0.01046967, "balance_loss_clip": 1.01903629, "balance_loss_mlp": 1.01870489, "epoch": 0.5070795130016534, "flos": 17274360343680.0, "grad_norm": 1.6593183180098319, "language_loss": 0.78574914, "learning_rate": 2.051208614233681e-06, "loss": 0.80681252, "num_input_tokens_seen": 181335555, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40625, "step": 8434, "time_per_iteration": 2.3654568195343018 }, { "auxiliary_loss_clip": 0.01059216, "auxiliary_loss_mlp": 0.0104638, "balance_loss_clip": 1.0186044, "balance_loss_mlp": 1.01781309, "epoch": 0.5071396362543213, "flos": 21068860682880.0, "grad_norm": 1.5739680904640445, "language_loss": 0.72307813, "learning_rate": 2.0508192792664326e-06, "loss": 0.74413407, "num_input_tokens_seen": 181354580, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.4140625, "step": 8435, "time_per_iteration": 2.3703866004943848 }, { "auxiliary_loss_clip": 0.01059993, "auxiliary_loss_mlp": 0.01045739, "balance_loss_clip": 1.01783228, "balance_loss_mlp": 1.0187571, "epoch": 0.5071997595069894, "flos": 23143819006080.0, "grad_norm": 2.683407358818919, "language_loss": 0.73840463, "learning_rate": 2.050429942372112e-06, "loss": 0.75946194, "num_input_tokens_seen": 181374320, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41210938, "step": 8436, "time_per_iteration": 2.4049243927001953 }, { "auxiliary_loss_clip": 0.01059501, "auxiliary_loss_mlp": 0.01044003, "balance_loss_clip": 1.01546407, "balance_loss_mlp": 1.01849604, "epoch": 0.5072598827596573, "flos": 22746088264320.0, "grad_norm": 1.5475834419501024, "language_loss": 0.84954441, "learning_rate": 2.050040603565483e-06, "loss": 0.87057942, "num_input_tokens_seen": 181392190, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41015625, "step": 8437, "time_per_iteration": 2.375600814819336 }, { "auxiliary_loss_clip": 0.01055993, "auxiliary_loss_mlp": 0.01038751, "balance_loss_clip": 1.01251292, "balance_loss_mlp": 1.01750422, "epoch": 0.5073200060123253, "flos": 22565168265600.0, "grad_norm": 1.4230738292864307, "language_loss": 0.81930906, "learning_rate": 2.049651262861309e-06, "loss": 0.84025651, "num_input_tokens_seen": 181413890, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38476562, "step": 8438, "time_per_iteration": 2.4282643795013428 }, { "auxiliary_loss_clip": 0.01058333, "auxiliary_loss_mlp": 0.01045596, "balance_loss_clip": 1.01641345, "balance_loss_mlp": 1.01676142, "epoch": 0.5073801292649932, "flos": 25805318037120.0, "grad_norm": 2.1132935610971484, "language_loss": 0.80684358, "learning_rate": 2.0492619202743543e-06, "loss": 0.82788277, "num_input_tokens_seen": 181433240, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.41601562, "step": 8439, "time_per_iteration": 2.4377732276916504 }, { "auxiliary_loss_clip": 0.01057582, "auxiliary_loss_mlp": 0.01038082, "balance_loss_clip": 1.01198745, "balance_loss_mlp": 1.01762724, "epoch": 0.5074402525176612, "flos": 25372778803200.0, "grad_norm": 1.637344177504828, "language_loss": 0.71943247, "learning_rate": 2.048872575819383e-06, "loss": 0.74038911, "num_input_tokens_seen": 181453535, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3984375, "step": 8440, "time_per_iteration": 2.42033314704895 }, { "auxiliary_loss_clip": 0.01059602, "auxiliary_loss_mlp": 0.01040381, "balance_loss_clip": 1.01430976, "balance_loss_mlp": 1.01821733, "epoch": 0.5075003757703291, "flos": 26063326051200.0, "grad_norm": 1.9568797808305602, "language_loss": 0.72523022, "learning_rate": 2.048483229511158e-06, "loss": 0.74623007, "num_input_tokens_seen": 181474195, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.4140625, "step": 8441, "time_per_iteration": 3.756898880004883 }, { "auxiliary_loss_clip": 0.01061101, "auxiliary_loss_mlp": 0.01048514, "balance_loss_clip": 1.01898539, "balance_loss_mlp": 1.01823747, "epoch": 0.5075604990229972, "flos": 21834366353280.0, "grad_norm": 1.7994065567877748, "language_loss": 0.65611571, "learning_rate": 2.0480938813644445e-06, "loss": 0.67721188, "num_input_tokens_seen": 181494000, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.4296875, "step": 8442, "time_per_iteration": 3.7983431816101074 }, { "auxiliary_loss_clip": 0.01057611, "auxiliary_loss_mlp": 0.01036302, "balance_loss_clip": 1.01083851, "balance_loss_mlp": 1.01883388, "epoch": 0.5076206222756651, "flos": 31977333475200.0, "grad_norm": 1.570117451899115, "language_loss": 0.71624738, "learning_rate": 2.047704531394006e-06, "loss": 0.73718655, "num_input_tokens_seen": 181515955, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38671875, "step": 8443, "time_per_iteration": 3.893843412399292 }, { "auxiliary_loss_clip": 0.01061157, "auxiliary_loss_mlp": 0.01050171, "balance_loss_clip": 1.02156067, "balance_loss_mlp": 1.01834249, "epoch": 0.5076807455283331, "flos": 36902530972800.0, "grad_norm": 1.24265769294492, "language_loss": 0.62688887, "learning_rate": 2.047315179614607e-06, "loss": 0.64800215, "num_input_tokens_seen": 181540225, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4296875, "step": 8444, "time_per_iteration": 2.5245463848114014 }, { "auxiliary_loss_clip": 0.01058011, "auxiliary_loss_mlp": 0.01041008, "balance_loss_clip": 1.01573515, "balance_loss_mlp": 1.01799595, "epoch": 0.507740868781001, "flos": 29861108058240.0, "grad_norm": 1.6084283402280708, "language_loss": 0.6486457, "learning_rate": 2.046925826041012e-06, "loss": 0.66963589, "num_input_tokens_seen": 181560125, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40039062, "step": 8445, "time_per_iteration": 2.42889142036438 }, { "auxiliary_loss_clip": 0.01010636, "auxiliary_loss_mlp": 0.0101144, "balance_loss_clip": 1.00923479, "balance_loss_mlp": 1.00266433, "epoch": 0.507800992033669, "flos": 61916157953280.0, "grad_norm": 0.8420181665240871, "language_loss": 0.62067723, "learning_rate": 2.0465364706879845e-06, "loss": 0.64089799, "num_input_tokens_seen": 181618830, "router_z_loss_clip": 0.02209473, "router_z_loss_mlp": 0.08007812, "step": 8446, "time_per_iteration": 3.02414608001709 }, { "auxiliary_loss_clip": 0.0105771, "auxiliary_loss_mlp": 0.01040279, "balance_loss_clip": 1.01447058, "balance_loss_mlp": 1.01820731, "epoch": 0.507861115286337, "flos": 20699549654400.0, "grad_norm": 1.535013310124043, "language_loss": 0.81865871, "learning_rate": 2.04614711357029e-06, "loss": 0.83963859, "num_input_tokens_seen": 181637120, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.39453125, "step": 8447, "time_per_iteration": 2.3655736446380615 }, { "auxiliary_loss_clip": 0.01060043, "auxiliary_loss_mlp": 0.01043503, "balance_loss_clip": 1.01827824, "balance_loss_mlp": 1.02035141, "epoch": 0.507921238539005, "flos": 30845728621440.0, "grad_norm": 1.3689634974061746, "language_loss": 0.71522647, "learning_rate": 2.0457577547026916e-06, "loss": 0.73626196, "num_input_tokens_seen": 181659965, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3984375, "step": 8448, "time_per_iteration": 2.464146375656128 }, { "auxiliary_loss_clip": 0.01059175, "auxiliary_loss_mlp": 0.01042424, "balance_loss_clip": 1.01779509, "balance_loss_mlp": 1.01984167, "epoch": 0.507981361791673, "flos": 35698725694080.0, "grad_norm": 1.399631487467386, "language_loss": 0.72520828, "learning_rate": 2.045368394099955e-06, "loss": 0.74622434, "num_input_tokens_seen": 181685290, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.39257812, "step": 8449, "time_per_iteration": 2.5183145999908447 }, { "auxiliary_loss_clip": 0.01058134, "auxiliary_loss_mlp": 0.01044954, "balance_loss_clip": 1.02007532, "balance_loss_mlp": 1.01855624, "epoch": 0.5080414850443409, "flos": 27160262058240.0, "grad_norm": 1.5999901356977657, "language_loss": 0.74203265, "learning_rate": 2.044979031776844e-06, "loss": 0.76306349, "num_input_tokens_seen": 181706080, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.39648438, "step": 8450, "time_per_iteration": 3.8643808364868164 }, { "auxiliary_loss_clip": 0.01061909, "auxiliary_loss_mlp": 0.01046667, "balance_loss_clip": 1.01883161, "balance_loss_mlp": 1.02014458, "epoch": 0.5081016082970089, "flos": 27084081738240.0, "grad_norm": 1.6344379982743111, "language_loss": 0.77664065, "learning_rate": 2.0445896677481234e-06, "loss": 0.79772633, "num_input_tokens_seen": 181724805, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41796875, "step": 8451, "time_per_iteration": 2.4061996936798096 }, { "auxiliary_loss_clip": 0.0105902, "auxiliary_loss_mlp": 0.01042867, "balance_loss_clip": 1.01680732, "balance_loss_mlp": 1.01862383, "epoch": 0.5081617315496768, "flos": 22855436242560.0, "grad_norm": 1.767493414730549, "language_loss": 0.86563838, "learning_rate": 2.044200302028559e-06, "loss": 0.88665724, "num_input_tokens_seen": 181743725, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40429688, "step": 8452, "time_per_iteration": 2.381490468978882 }, { "auxiliary_loss_clip": 0.01063912, "auxiliary_loss_mlp": 0.01049015, "balance_loss_clip": 1.0199635, "balance_loss_mlp": 1.02048802, "epoch": 0.5082218548023448, "flos": 16281186497280.0, "grad_norm": 2.5835770687561976, "language_loss": 0.79761779, "learning_rate": 2.0438109346329143e-06, "loss": 0.81874704, "num_input_tokens_seen": 181757720, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.43359375, "step": 8453, "time_per_iteration": 2.3139281272888184 }, { "auxiliary_loss_clip": 0.01058732, "auxiliary_loss_mlp": 0.01042395, "balance_loss_clip": 1.01886272, "balance_loss_mlp": 1.01872623, "epoch": 0.5082819780550127, "flos": 24459660437760.0, "grad_norm": 1.613067809936125, "language_loss": 0.77920878, "learning_rate": 2.0434215655759544e-06, "loss": 0.80022001, "num_input_tokens_seen": 181778545, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.40039062, "step": 8454, "time_per_iteration": 2.4301228523254395 }, { "auxiliary_loss_clip": 0.01060469, "auxiliary_loss_mlp": 0.01043311, "balance_loss_clip": 1.01534498, "balance_loss_mlp": 1.01992035, "epoch": 0.5083421013076808, "flos": 23402176133760.0, "grad_norm": 1.9065991176185264, "language_loss": 0.90625519, "learning_rate": 2.0430321948724446e-06, "loss": 0.92729294, "num_input_tokens_seen": 181799495, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40625, "step": 8455, "time_per_iteration": 2.387866258621216 }, { "auxiliary_loss_clip": 0.01062036, "auxiliary_loss_mlp": 0.01049414, "balance_loss_clip": 1.0170846, "balance_loss_mlp": 1.01970124, "epoch": 0.5084022245603487, "flos": 23871723275520.0, "grad_norm": 1.7268764699752623, "language_loss": 0.63175762, "learning_rate": 2.042642822537149e-06, "loss": 0.6528722, "num_input_tokens_seen": 181818400, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.42382812, "step": 8456, "time_per_iteration": 2.3955698013305664 }, { "auxiliary_loss_clip": 0.01011951, "auxiliary_loss_mlp": 0.01007527, "balance_loss_clip": 1.00530994, "balance_loss_mlp": 1.00404358, "epoch": 0.5084623478130167, "flos": 62870333944320.0, "grad_norm": 0.8382573550432458, "language_loss": 0.62534261, "learning_rate": 2.0422534485848343e-06, "loss": 0.64553738, "num_input_tokens_seen": 181875975, "router_z_loss_clip": 0.0222168, "router_z_loss_mlp": 0.07910156, "step": 8457, "time_per_iteration": 2.886145830154419 }, { "auxiliary_loss_clip": 0.0106108, "auxiliary_loss_mlp": 0.01054767, "balance_loss_clip": 1.0258708, "balance_loss_mlp": 1.01929855, "epoch": 0.5085224710656846, "flos": 22345040943360.0, "grad_norm": 1.70646377723114, "language_loss": 0.68850744, "learning_rate": 2.0418640730302644e-06, "loss": 0.70966589, "num_input_tokens_seen": 181896450, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41796875, "step": 8458, "time_per_iteration": 2.4077377319335938 }, { "auxiliary_loss_clip": 0.01059225, "auxiliary_loss_mlp": 0.01045053, "balance_loss_clip": 1.01687205, "balance_loss_mlp": 1.01771533, "epoch": 0.5085825943183526, "flos": 26065106530560.0, "grad_norm": 1.6775500414284321, "language_loss": 0.78374052, "learning_rate": 2.0414746958882043e-06, "loss": 0.80478323, "num_input_tokens_seen": 181916770, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.4140625, "step": 8459, "time_per_iteration": 2.406562566757202 }, { "auxiliary_loss_clip": 0.01065758, "auxiliary_loss_mlp": 0.01044659, "balance_loss_clip": 1.01491594, "balance_loss_mlp": 1.02094018, "epoch": 0.5086427175710206, "flos": 17419773623040.0, "grad_norm": 2.0588581516784967, "language_loss": 0.81104445, "learning_rate": 2.0410853171734196e-06, "loss": 0.83214867, "num_input_tokens_seen": 181932710, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.44921875, "step": 8460, "time_per_iteration": 2.3602967262268066 }, { "auxiliary_loss_clip": 0.01060884, "auxiliary_loss_mlp": 0.01046862, "balance_loss_clip": 1.01852596, "balance_loss_mlp": 1.0195725, "epoch": 0.5087028408236886, "flos": 20630700720000.0, "grad_norm": 1.4998423325380499, "language_loss": 0.6989097, "learning_rate": 2.0406959369006754e-06, "loss": 0.71998715, "num_input_tokens_seen": 181950665, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.4140625, "step": 8461, "time_per_iteration": 2.353825569152832 }, { "auxiliary_loss_clip": 0.0105927, "auxiliary_loss_mlp": 0.01054555, "balance_loss_clip": 1.02606416, "balance_loss_mlp": 1.01849461, "epoch": 0.5087629640763566, "flos": 25592626834560.0, "grad_norm": 1.9715534416134695, "language_loss": 0.77255726, "learning_rate": 2.0403065550847375e-06, "loss": 0.79369557, "num_input_tokens_seen": 181971270, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40820312, "step": 8462, "time_per_iteration": 2.425936460494995 }, { "auxiliary_loss_clip": 0.01060125, "auxiliary_loss_mlp": 0.01044191, "balance_loss_clip": 1.01603341, "balance_loss_mlp": 1.01899862, "epoch": 0.5088230873290245, "flos": 13260780023040.0, "grad_norm": 2.3832349630852483, "language_loss": 0.82953119, "learning_rate": 2.0399171717403706e-06, "loss": 0.85057431, "num_input_tokens_seen": 181988410, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41210938, "step": 8463, "time_per_iteration": 2.3425872325897217 }, { "auxiliary_loss_clip": 0.0105964, "auxiliary_loss_mlp": 0.01048509, "balance_loss_clip": 1.0212338, "balance_loss_mlp": 1.01911569, "epoch": 0.5088832105816925, "flos": 20042554089600.0, "grad_norm": 1.6484195477476822, "language_loss": 0.77233231, "learning_rate": 2.039527786882341e-06, "loss": 0.79341376, "num_input_tokens_seen": 182006530, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40625, "step": 8464, "time_per_iteration": 2.3898494243621826 }, { "auxiliary_loss_clip": 0.01011459, "auxiliary_loss_mlp": 0.01009675, "balance_loss_clip": 1.00724339, "balance_loss_mlp": 1.00347078, "epoch": 0.5089433338343604, "flos": 67418363491200.0, "grad_norm": 0.694206941864317, "language_loss": 0.59482944, "learning_rate": 2.0391384005254133e-06, "loss": 0.61504078, "num_input_tokens_seen": 182074240, "router_z_loss_clip": 0.02429199, "router_z_loss_mlp": 0.08007812, "step": 8465, "time_per_iteration": 3.1264331340789795 }, { "auxiliary_loss_clip": 0.01056907, "auxiliary_loss_mlp": 0.01044356, "balance_loss_clip": 1.01691413, "balance_loss_mlp": 1.01749182, "epoch": 0.5090034570870284, "flos": 22709254913280.0, "grad_norm": 1.7616177920521146, "language_loss": 0.81471777, "learning_rate": 2.038749012684354e-06, "loss": 0.83573043, "num_input_tokens_seen": 182093360, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.39453125, "step": 8466, "time_per_iteration": 2.405600070953369 }, { "auxiliary_loss_clip": 0.01056475, "auxiliary_loss_mlp": 0.010461, "balance_loss_clip": 1.01830077, "balance_loss_mlp": 1.01689303, "epoch": 0.5090635803396963, "flos": 20444858219520.0, "grad_norm": 1.5334302282990855, "language_loss": 0.7872715, "learning_rate": 2.0383596233739286e-06, "loss": 0.80829728, "num_input_tokens_seen": 182110170, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.39453125, "step": 8467, "time_per_iteration": 2.364964723587036 }, { "auxiliary_loss_clip": 0.01057327, "auxiliary_loss_mlp": 0.01042637, "balance_loss_clip": 1.0178895, "balance_loss_mlp": 1.01806641, "epoch": 0.5091237035923644, "flos": 23767751646720.0, "grad_norm": 1.6490274474095241, "language_loss": 0.75406432, "learning_rate": 2.0379702326089013e-06, "loss": 0.77506399, "num_input_tokens_seen": 182129570, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.39257812, "step": 8468, "time_per_iteration": 2.421795129776001 }, { "auxiliary_loss_clip": 0.01058273, "auxiliary_loss_mlp": 0.01046941, "balance_loss_clip": 1.0206672, "balance_loss_mlp": 1.01886284, "epoch": 0.5091838268450323, "flos": 18327061791360.0, "grad_norm": 1.8848212811085376, "language_loss": 0.79052973, "learning_rate": 2.03758084040404e-06, "loss": 0.81158185, "num_input_tokens_seen": 182147565, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.39453125, "step": 8469, "time_per_iteration": 2.4092581272125244 }, { "auxiliary_loss_clip": 0.01060927, "auxiliary_loss_mlp": 0.01045255, "balance_loss_clip": 1.01732397, "balance_loss_mlp": 1.02089882, "epoch": 0.5092439500977003, "flos": 29056395064320.0, "grad_norm": 1.6848038935870577, "language_loss": 0.69951981, "learning_rate": 2.037191446774109e-06, "loss": 0.72058165, "num_input_tokens_seen": 182169695, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40039062, "step": 8470, "time_per_iteration": 2.453277826309204 }, { "auxiliary_loss_clip": 0.01061868, "auxiliary_loss_mlp": 0.01050706, "balance_loss_clip": 1.02325177, "balance_loss_mlp": 1.01988184, "epoch": 0.5093040733503682, "flos": 13553037947520.0, "grad_norm": 1.8593007499251606, "language_loss": 0.74907631, "learning_rate": 2.0368020517338745e-06, "loss": 0.77020204, "num_input_tokens_seen": 182186385, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41992188, "step": 8471, "time_per_iteration": 2.3704445362091064 }, { "auxiliary_loss_clip": 0.01011306, "auxiliary_loss_mlp": 0.01012123, "balance_loss_clip": 1.00950074, "balance_loss_mlp": 1.00329638, "epoch": 0.5093641966030362, "flos": 68903080502400.0, "grad_norm": 0.7537130859706597, "language_loss": 0.58227253, "learning_rate": 2.036412655298103e-06, "loss": 0.60250676, "num_input_tokens_seen": 182247095, "router_z_loss_clip": 0.02624512, "router_z_loss_mlp": 0.08007812, "step": 8472, "time_per_iteration": 3.016374349594116 }, { "auxiliary_loss_clip": 0.01057877, "auxiliary_loss_mlp": 0.01041344, "balance_loss_clip": 1.01727557, "balance_loss_mlp": 1.01789689, "epoch": 0.5094243198557042, "flos": 21579849475200.0, "grad_norm": 1.7610066679645031, "language_loss": 0.70315015, "learning_rate": 2.03602325748156e-06, "loss": 0.72414231, "num_input_tokens_seen": 182266380, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3984375, "step": 8473, "time_per_iteration": 2.375530481338501 }, { "auxiliary_loss_clip": 0.01057618, "auxiliary_loss_mlp": 0.0104361, "balance_loss_clip": 1.01838565, "balance_loss_mlp": 1.01825333, "epoch": 0.5094844431083722, "flos": 28839444675840.0, "grad_norm": 2.1591280606595733, "language_loss": 0.85956568, "learning_rate": 2.0356338582990105e-06, "loss": 0.88057792, "num_input_tokens_seen": 182284685, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.39453125, "step": 8474, "time_per_iteration": 2.432929515838623 }, { "auxiliary_loss_clip": 0.01058455, "auxiliary_loss_mlp": 0.01043776, "balance_loss_clip": 1.01962423, "balance_loss_mlp": 1.01874661, "epoch": 0.5095445663610402, "flos": 14975224980480.0, "grad_norm": 1.816493254569138, "language_loss": 0.65369725, "learning_rate": 2.035244457765222e-06, "loss": 0.67471951, "num_input_tokens_seen": 182301810, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3984375, "step": 8475, "time_per_iteration": 2.346534013748169 }, { "auxiliary_loss_clip": 0.01063368, "auxiliary_loss_mlp": 0.01048787, "balance_loss_clip": 1.02077329, "balance_loss_mlp": 1.02053189, "epoch": 0.5096046896137081, "flos": 20776044176640.0, "grad_norm": 2.556273002101492, "language_loss": 0.82963926, "learning_rate": 2.0348550558949605e-06, "loss": 0.85076082, "num_input_tokens_seen": 182320285, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.4296875, "step": 8476, "time_per_iteration": 2.408928632736206 }, { "auxiliary_loss_clip": 0.0106154, "auxiliary_loss_mlp": 0.0104661, "balance_loss_clip": 1.01667666, "balance_loss_mlp": 1.01975107, "epoch": 0.5096648128663761, "flos": 23183968936320.0, "grad_norm": 2.2272393654219704, "language_loss": 0.82817686, "learning_rate": 2.0344656527029917e-06, "loss": 0.8492583, "num_input_tokens_seen": 182339465, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.41796875, "step": 8477, "time_per_iteration": 2.378952980041504 }, { "auxiliary_loss_clip": 0.01063991, "auxiliary_loss_mlp": 0.01042736, "balance_loss_clip": 1.0150677, "balance_loss_mlp": 1.02228308, "epoch": 0.509724936119044, "flos": 22308347237760.0, "grad_norm": 17.59995275052481, "language_loss": 0.63340479, "learning_rate": 2.034076248204082e-06, "loss": 0.65447211, "num_input_tokens_seen": 182358375, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41796875, "step": 8478, "time_per_iteration": 2.3954272270202637 }, { "auxiliary_loss_clip": 0.01059507, "auxiliary_loss_mlp": 0.01049154, "balance_loss_clip": 1.02327335, "balance_loss_mlp": 1.02026474, "epoch": 0.509785059371712, "flos": 26285862257280.0, "grad_norm": 1.7890720293344577, "language_loss": 0.67571807, "learning_rate": 2.0336868424129968e-06, "loss": 0.6968047, "num_input_tokens_seen": 182377935, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.39257812, "step": 8479, "time_per_iteration": 2.416754961013794 }, { "auxiliary_loss_clip": 0.01060609, "auxiliary_loss_mlp": 0.01043257, "balance_loss_clip": 1.01860452, "balance_loss_mlp": 1.02118158, "epoch": 0.50984518262438, "flos": 22963527411840.0, "grad_norm": 1.5243540715815629, "language_loss": 0.7058692, "learning_rate": 2.0332974353445037e-06, "loss": 0.72690785, "num_input_tokens_seen": 182396440, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.39453125, "step": 8480, "time_per_iteration": 3.6331043243408203 }, { "auxiliary_loss_clip": 0.01060678, "auxiliary_loss_mlp": 0.01041187, "balance_loss_clip": 1.01294613, "balance_loss_mlp": 1.01873326, "epoch": 0.509905305877048, "flos": 26212195555200.0, "grad_norm": 1.748521003081691, "language_loss": 0.80091715, "learning_rate": 2.0329080270133688e-06, "loss": 0.82193589, "num_input_tokens_seen": 182415890, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.41796875, "step": 8481, "time_per_iteration": 2.4144697189331055 }, { "auxiliary_loss_clip": 0.01057467, "auxiliary_loss_mlp": 0.010499, "balance_loss_clip": 1.02528358, "balance_loss_mlp": 1.01885128, "epoch": 0.5099654291297159, "flos": 20339001377280.0, "grad_norm": 1.5321760857809004, "language_loss": 0.84126312, "learning_rate": 2.0325186174343578e-06, "loss": 0.86233681, "num_input_tokens_seen": 182434235, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.38671875, "step": 8482, "time_per_iteration": 3.8280582427978516 }, { "auxiliary_loss_clip": 0.01062976, "auxiliary_loss_mlp": 0.01044233, "balance_loss_clip": 1.01730359, "balance_loss_mlp": 1.01997113, "epoch": 0.5100255523823839, "flos": 29053671978240.0, "grad_norm": 1.5992475498017331, "language_loss": 0.86747789, "learning_rate": 2.032129206622238e-06, "loss": 0.88854992, "num_input_tokens_seen": 182454360, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.4296875, "step": 8483, "time_per_iteration": 3.7988643646240234 }, { "auxiliary_loss_clip": 0.01059816, "auxiliary_loss_mlp": 0.0104429, "balance_loss_clip": 1.01806378, "balance_loss_mlp": 1.01899791, "epoch": 0.5100856756350518, "flos": 22454807857920.0, "grad_norm": 1.7898355895582827, "language_loss": 0.84272349, "learning_rate": 2.031739794591775e-06, "loss": 0.86376464, "num_input_tokens_seen": 182471940, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40820312, "step": 8484, "time_per_iteration": 2.385152578353882 }, { "auxiliary_loss_clip": 0.01060972, "auxiliary_loss_mlp": 0.01045525, "balance_loss_clip": 1.01714146, "balance_loss_mlp": 1.01933277, "epoch": 0.5101457988877198, "flos": 19170074413440.0, "grad_norm": 1.8987637941413014, "language_loss": 0.82798123, "learning_rate": 2.031350381357736e-06, "loss": 0.84904623, "num_input_tokens_seen": 182490685, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.41796875, "step": 8485, "time_per_iteration": 2.3719170093536377 }, { "auxiliary_loss_clip": 0.01057456, "auxiliary_loss_mlp": 0.01044798, "balance_loss_clip": 1.02002668, "balance_loss_mlp": 1.01869547, "epoch": 0.5102059221403878, "flos": 14865492977280.0, "grad_norm": 2.1843209048065253, "language_loss": 0.74839187, "learning_rate": 2.0309609669348874e-06, "loss": 0.76941442, "num_input_tokens_seen": 182508325, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38671875, "step": 8486, "time_per_iteration": 2.3604555130004883 }, { "auxiliary_loss_clip": 0.01060394, "auxiliary_loss_mlp": 0.01041241, "balance_loss_clip": 1.01514649, "balance_loss_mlp": 1.01943445, "epoch": 0.5102660453930558, "flos": 22960141009920.0, "grad_norm": 1.5276396309678737, "language_loss": 0.71432161, "learning_rate": 2.0305715513379953e-06, "loss": 0.73533797, "num_input_tokens_seen": 182527020, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.41015625, "step": 8487, "time_per_iteration": 2.4232871532440186 }, { "auxiliary_loss_clip": 0.01060502, "auxiliary_loss_mlp": 0.01043244, "balance_loss_clip": 1.01598072, "balance_loss_mlp": 1.02015007, "epoch": 0.5103261686457238, "flos": 23148182926080.0, "grad_norm": 2.089991253011674, "language_loss": 0.73613989, "learning_rate": 2.030182134581827e-06, "loss": 0.75717735, "num_input_tokens_seen": 182543505, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40429688, "step": 8488, "time_per_iteration": 2.381349802017212 }, { "auxiliary_loss_clip": 0.01059971, "auxiliary_loss_mlp": 0.01054579, "balance_loss_clip": 1.02772164, "balance_loss_mlp": 1.01851821, "epoch": 0.5103862918983917, "flos": 14318369061120.0, "grad_norm": 2.0826718493094716, "language_loss": 0.71753764, "learning_rate": 2.0297927166811503e-06, "loss": 0.73868316, "num_input_tokens_seen": 182562250, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.4140625, "step": 8489, "time_per_iteration": 2.3843705654144287 }, { "auxiliary_loss_clip": 0.01058268, "auxiliary_loss_mlp": 0.01044527, "balance_loss_clip": 1.01876593, "balance_loss_mlp": 1.0176003, "epoch": 0.5104464151510597, "flos": 25847353180800.0, "grad_norm": 2.216725040649054, "language_loss": 0.73679549, "learning_rate": 2.0294032976507297e-06, "loss": 0.75782347, "num_input_tokens_seen": 182581910, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40625, "step": 8490, "time_per_iteration": 3.8420891761779785 }, { "auxiliary_loss_clip": 0.01056948, "auxiliary_loss_mlp": 0.01039809, "balance_loss_clip": 1.01577675, "balance_loss_mlp": 1.01737738, "epoch": 0.5105065384037276, "flos": 21651840432000.0, "grad_norm": 1.5838017644755435, "language_loss": 0.81245548, "learning_rate": 2.0290138775053337e-06, "loss": 0.83342302, "num_input_tokens_seen": 182601350, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.39453125, "step": 8491, "time_per_iteration": 2.3823461532592773 }, { "auxiliary_loss_clip": 0.01055452, "auxiliary_loss_mlp": 0.0104241, "balance_loss_clip": 1.0183413, "balance_loss_mlp": 1.01721275, "epoch": 0.5105666616563956, "flos": 22490489134080.0, "grad_norm": 3.869944346241888, "language_loss": 0.80442715, "learning_rate": 2.028624456259728e-06, "loss": 0.82540572, "num_input_tokens_seen": 182619660, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3828125, "step": 8492, "time_per_iteration": 2.407033920288086 }, { "auxiliary_loss_clip": 0.01061291, "auxiliary_loss_mlp": 0.01049074, "balance_loss_clip": 1.02054679, "balance_loss_mlp": 1.02007675, "epoch": 0.5106267849090635, "flos": 22454668212480.0, "grad_norm": 2.334088030719718, "language_loss": 0.79114056, "learning_rate": 2.0282350339286804e-06, "loss": 0.81224424, "num_input_tokens_seen": 182639815, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41210938, "step": 8493, "time_per_iteration": 2.380302906036377 }, { "auxiliary_loss_clip": 0.01058824, "auxiliary_loss_mlp": 0.01041907, "balance_loss_clip": 1.01478708, "balance_loss_mlp": 1.01898789, "epoch": 0.5106869081617316, "flos": 23546053313280.0, "grad_norm": 1.682354192991298, "language_loss": 0.84550291, "learning_rate": 2.0278456105269574e-06, "loss": 0.86651027, "num_input_tokens_seen": 182659655, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.3984375, "step": 8494, "time_per_iteration": 2.4185791015625 }, { "auxiliary_loss_clip": 0.01061125, "auxiliary_loss_mlp": 0.01047419, "balance_loss_clip": 1.02203953, "balance_loss_mlp": 1.01969254, "epoch": 0.5107470314143995, "flos": 26791893636480.0, "grad_norm": 2.064172699552442, "language_loss": 0.8042196, "learning_rate": 2.027456186069326e-06, "loss": 0.8253051, "num_input_tokens_seen": 182677075, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.4140625, "step": 8495, "time_per_iteration": 2.4141385555267334 }, { "auxiliary_loss_clip": 0.01059475, "auxiliary_loss_mlp": 0.01043149, "balance_loss_clip": 1.01762617, "balance_loss_mlp": 1.01934302, "epoch": 0.5108071546670675, "flos": 25738493961600.0, "grad_norm": 1.5548976243058288, "language_loss": 0.79839039, "learning_rate": 2.0270667605705535e-06, "loss": 0.81941664, "num_input_tokens_seen": 182699625, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.40039062, "step": 8496, "time_per_iteration": 2.4466471672058105 }, { "auxiliary_loss_clip": 0.01056311, "auxiliary_loss_mlp": 0.01041605, "balance_loss_clip": 1.01649904, "balance_loss_mlp": 1.01703012, "epoch": 0.5108672779197354, "flos": 18696547376640.0, "grad_norm": 2.7798772706910135, "language_loss": 0.80509442, "learning_rate": 2.0266773340454066e-06, "loss": 0.82607359, "num_input_tokens_seen": 182717020, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.39257812, "step": 8497, "time_per_iteration": 2.352236032485962 }, { "auxiliary_loss_clip": 0.01057798, "auxiliary_loss_mlp": 0.01045407, "balance_loss_clip": 1.01896644, "balance_loss_mlp": 1.01850057, "epoch": 0.5109274011724034, "flos": 26686944489600.0, "grad_norm": 1.667747516552524, "language_loss": 0.83095706, "learning_rate": 2.0262879065086525e-06, "loss": 0.85198909, "num_input_tokens_seen": 182736955, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39257812, "step": 8498, "time_per_iteration": 2.4277453422546387 }, { "auxiliary_loss_clip": 0.01058752, "auxiliary_loss_mlp": 0.01037059, "balance_loss_clip": 1.01128626, "balance_loss_mlp": 1.01893854, "epoch": 0.5109875244250714, "flos": 22782921615360.0, "grad_norm": 2.514960657591234, "language_loss": 0.72058791, "learning_rate": 2.0258984779750584e-06, "loss": 0.74154603, "num_input_tokens_seen": 182757620, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3984375, "step": 8499, "time_per_iteration": 2.399409055709839 }, { "auxiliary_loss_clip": 0.01058069, "auxiliary_loss_mlp": 0.01043451, "balance_loss_clip": 1.0177381, "balance_loss_mlp": 1.01813507, "epoch": 0.5110476476777394, "flos": 35587108477440.0, "grad_norm": 1.476868399800931, "language_loss": 0.73334008, "learning_rate": 2.0255090484593914e-06, "loss": 0.75435525, "num_input_tokens_seen": 182780195, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40039062, "step": 8500, "time_per_iteration": 2.4770984649658203 }, { "auxiliary_loss_clip": 0.01062264, "auxiliary_loss_mlp": 0.01048439, "balance_loss_clip": 1.0178858, "balance_loss_mlp": 1.01851737, "epoch": 0.5111077709304074, "flos": 19279806416640.0, "grad_norm": 3.014503508323248, "language_loss": 0.65995121, "learning_rate": 2.0251196179764183e-06, "loss": 0.68105829, "num_input_tokens_seen": 182795765, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.4375, "step": 8501, "time_per_iteration": 2.3783631324768066 }, { "auxiliary_loss_clip": 0.01060002, "auxiliary_loss_mlp": 0.01047443, "balance_loss_clip": 1.02015555, "balance_loss_mlp": 1.01900148, "epoch": 0.5111678941830753, "flos": 20667150046080.0, "grad_norm": 1.6954590496319137, "language_loss": 0.89045727, "learning_rate": 2.024730186540907e-06, "loss": 0.91153169, "num_input_tokens_seen": 182813120, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41015625, "step": 8502, "time_per_iteration": 2.3655779361724854 }, { "auxiliary_loss_clip": 0.01056431, "auxiliary_loss_mlp": 0.01040127, "balance_loss_clip": 1.01503325, "balance_loss_mlp": 1.01724267, "epoch": 0.5112280174357433, "flos": 26286665218560.0, "grad_norm": 1.3775320038525007, "language_loss": 0.83224136, "learning_rate": 2.0243407541676253e-06, "loss": 0.85320693, "num_input_tokens_seen": 182835745, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.390625, "step": 8503, "time_per_iteration": 2.444091796875 }, { "auxiliary_loss_clip": 0.01014819, "auxiliary_loss_mlp": 0.01003948, "balance_loss_clip": 1.00173104, "balance_loss_mlp": 1.0068028, "epoch": 0.5112881406884112, "flos": 59471504576640.0, "grad_norm": 0.8576582947972992, "language_loss": 0.63844234, "learning_rate": 2.023951320871339e-06, "loss": 0.65863007, "num_input_tokens_seen": 182892540, "router_z_loss_clip": 0.0222168, "router_z_loss_mlp": 0.08007812, "step": 8504, "time_per_iteration": 3.0700438022613525 }, { "auxiliary_loss_clip": 0.01058297, "auxiliary_loss_mlp": 0.01039619, "balance_loss_clip": 1.01341724, "balance_loss_mlp": 1.0190208, "epoch": 0.5113482639410792, "flos": 26467655040000.0, "grad_norm": 2.053833425561365, "language_loss": 0.85326159, "learning_rate": 2.023561886666816e-06, "loss": 0.87424076, "num_input_tokens_seen": 182911515, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.39257812, "step": 8505, "time_per_iteration": 2.4076192378997803 }, { "auxiliary_loss_clip": 0.01058524, "auxiliary_loss_mlp": 0.01036983, "balance_loss_clip": 1.01120996, "balance_loss_mlp": 1.01938999, "epoch": 0.5114083871937471, "flos": 29894624830080.0, "grad_norm": 2.0328849133639477, "language_loss": 0.75993967, "learning_rate": 2.0231724515688246e-06, "loss": 0.78089476, "num_input_tokens_seen": 182930860, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.390625, "step": 8506, "time_per_iteration": 2.4578373432159424 }, { "auxiliary_loss_clip": 0.0106049, "auxiliary_loss_mlp": 0.01041443, "balance_loss_clip": 1.01475239, "balance_loss_mlp": 1.0194155, "epoch": 0.5114685104464152, "flos": 24313479108480.0, "grad_norm": 1.760485764810966, "language_loss": 0.59581649, "learning_rate": 2.022783015592131e-06, "loss": 0.61683589, "num_input_tokens_seen": 182949960, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.41015625, "step": 8507, "time_per_iteration": 2.416443347930908 }, { "auxiliary_loss_clip": 0.01059894, "auxiliary_loss_mlp": 0.01049123, "balance_loss_clip": 1.02147877, "balance_loss_mlp": 1.01947165, "epoch": 0.5115286336990831, "flos": 17018342277120.0, "grad_norm": 2.336910049019614, "language_loss": 0.86441511, "learning_rate": 2.022393578751503e-06, "loss": 0.88550526, "num_input_tokens_seen": 182968085, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40234375, "step": 8508, "time_per_iteration": 2.3512864112854004 }, { "auxiliary_loss_clip": 0.01060449, "auxiliary_loss_mlp": 0.01043689, "balance_loss_clip": 1.0150665, "balance_loss_mlp": 1.01868606, "epoch": 0.5115887569517511, "flos": 23658264023040.0, "grad_norm": 1.6904228890094024, "language_loss": 0.73971701, "learning_rate": 2.022004141061709e-06, "loss": 0.7607584, "num_input_tokens_seen": 182987275, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.41796875, "step": 8509, "time_per_iteration": 2.4185376167297363 }, { "auxiliary_loss_clip": 0.01057824, "auxiliary_loss_mlp": 0.01037891, "balance_loss_clip": 1.01319075, "balance_loss_mlp": 1.01842356, "epoch": 0.511648880204419, "flos": 16106271252480.0, "grad_norm": 1.6631376835743572, "language_loss": 0.77526665, "learning_rate": 2.0216147025375153e-06, "loss": 0.79622382, "num_input_tokens_seen": 183004700, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.39453125, "step": 8510, "time_per_iteration": 2.345179796218872 }, { "auxiliary_loss_clip": 0.01058569, "auxiliary_loss_mlp": 0.01041064, "balance_loss_clip": 1.01629245, "balance_loss_mlp": 1.01950943, "epoch": 0.511709003457087, "flos": 32633595901440.0, "grad_norm": 1.699063497637329, "language_loss": 0.71924716, "learning_rate": 2.0212252631936907e-06, "loss": 0.74024349, "num_input_tokens_seen": 183025830, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.390625, "step": 8511, "time_per_iteration": 2.50089430809021 }, { "auxiliary_loss_clip": 0.01058123, "auxiliary_loss_mlp": 0.01042847, "balance_loss_clip": 1.01682329, "balance_loss_mlp": 1.01980734, "epoch": 0.511769126709755, "flos": 21761013853440.0, "grad_norm": 1.8578425715498674, "language_loss": 0.68383574, "learning_rate": 2.020835823045001e-06, "loss": 0.70484543, "num_input_tokens_seen": 183045140, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3828125, "step": 8512, "time_per_iteration": 2.374147653579712 }, { "auxiliary_loss_clip": 0.01059672, "auxiliary_loss_mlp": 0.01044445, "balance_loss_clip": 1.01619208, "balance_loss_mlp": 1.01849604, "epoch": 0.511829249962423, "flos": 23914212266880.0, "grad_norm": 1.6643768919269346, "language_loss": 0.67855716, "learning_rate": 2.0204463821062146e-06, "loss": 0.69959831, "num_input_tokens_seen": 183063935, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41210938, "step": 8513, "time_per_iteration": 2.4140875339508057 }, { "auxiliary_loss_clip": 0.01057349, "auxiliary_loss_mlp": 0.01042241, "balance_loss_clip": 1.01545501, "balance_loss_mlp": 1.01858091, "epoch": 0.511889373215091, "flos": 23726030705280.0, "grad_norm": 1.864333754801022, "language_loss": 0.69601905, "learning_rate": 2.0200569403921e-06, "loss": 0.71701491, "num_input_tokens_seen": 183084135, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.38671875, "step": 8514, "time_per_iteration": 2.4115965366363525 }, { "auxiliary_loss_clip": 0.01056221, "auxiliary_loss_mlp": 0.01044109, "balance_loss_clip": 1.01858675, "balance_loss_mlp": 1.01722324, "epoch": 0.5119494964677589, "flos": 28110248686080.0, "grad_norm": 1.5249478632116205, "language_loss": 0.66780084, "learning_rate": 2.019667497917424e-06, "loss": 0.68880415, "num_input_tokens_seen": 183104570, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.390625, "step": 8515, "time_per_iteration": 2.4685540199279785 }, { "auxiliary_loss_clip": 0.0105529, "auxiliary_loss_mlp": 0.01039153, "balance_loss_clip": 1.0141902, "balance_loss_mlp": 1.01648593, "epoch": 0.5120096197204269, "flos": 24972045684480.0, "grad_norm": 3.556507713972009, "language_loss": 0.76318705, "learning_rate": 2.019278054696955e-06, "loss": 0.78413141, "num_input_tokens_seen": 183123850, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38671875, "step": 8516, "time_per_iteration": 2.4189131259918213 }, { "auxiliary_loss_clip": 0.01060893, "auxiliary_loss_mlp": 0.01045879, "balance_loss_clip": 1.01811516, "balance_loss_mlp": 1.01941478, "epoch": 0.5120697429730948, "flos": 17967037184640.0, "grad_norm": 1.8483468226846396, "language_loss": 0.79111099, "learning_rate": 2.0188886107454595e-06, "loss": 0.81217873, "num_input_tokens_seen": 183141725, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.4140625, "step": 8517, "time_per_iteration": 2.3737893104553223 }, { "auxiliary_loss_clip": 0.01059808, "auxiliary_loss_mlp": 0.01041267, "balance_loss_clip": 1.01319313, "balance_loss_mlp": 1.01838803, "epoch": 0.5121298662257628, "flos": 23291292055680.0, "grad_norm": 1.8013719480212882, "language_loss": 0.74703163, "learning_rate": 2.0184991660777063e-06, "loss": 0.76804233, "num_input_tokens_seen": 183161300, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.4140625, "step": 8518, "time_per_iteration": 2.3857932090759277 }, { "auxiliary_loss_clip": 0.01057995, "auxiliary_loss_mlp": 0.01045823, "balance_loss_clip": 1.01911998, "balance_loss_mlp": 1.01823592, "epoch": 0.5121899894784308, "flos": 17310111442560.0, "grad_norm": 1.7248289437768196, "language_loss": 0.80002332, "learning_rate": 2.0181097207084625e-06, "loss": 0.82106149, "num_input_tokens_seen": 183180495, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39648438, "step": 8519, "time_per_iteration": 3.699230909347534 }, { "auxiliary_loss_clip": 0.01058744, "auxiliary_loss_mlp": 0.01044742, "balance_loss_clip": 1.01741982, "balance_loss_mlp": 1.01894784, "epoch": 0.5122501127310988, "flos": 24929102845440.0, "grad_norm": 1.477083357572077, "language_loss": 0.80575848, "learning_rate": 2.017720274652497e-06, "loss": 0.82679331, "num_input_tokens_seen": 183200330, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.3984375, "step": 8520, "time_per_iteration": 2.456456422805786 }, { "auxiliary_loss_clip": 0.01062348, "auxiliary_loss_mlp": 0.01047017, "balance_loss_clip": 1.01721489, "balance_loss_mlp": 1.0191437, "epoch": 0.5123102359837667, "flos": 18441855941760.0, "grad_norm": 1.874234480973075, "language_loss": 0.83050859, "learning_rate": 2.0173308279245765e-06, "loss": 0.8516022, "num_input_tokens_seen": 183218230, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.43359375, "step": 8521, "time_per_iteration": 3.871715784072876 }, { "auxiliary_loss_clip": 0.01056885, "auxiliary_loss_mlp": 0.01044704, "balance_loss_clip": 1.01654708, "balance_loss_mlp": 1.01741266, "epoch": 0.5123703592364347, "flos": 26683732644480.0, "grad_norm": 1.7442068212686215, "language_loss": 0.68872917, "learning_rate": 2.0169413805394692e-06, "loss": 0.70974499, "num_input_tokens_seen": 183236735, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.39453125, "step": 8522, "time_per_iteration": 2.4884464740753174 }, { "auxiliary_loss_clip": 0.01063591, "auxiliary_loss_mlp": 0.01053523, "balance_loss_clip": 1.02177787, "balance_loss_mlp": 1.01963902, "epoch": 0.5124304824891026, "flos": 28802681147520.0, "grad_norm": 1.6992335897159507, "language_loss": 0.63180339, "learning_rate": 2.0165519325119433e-06, "loss": 0.65297455, "num_input_tokens_seen": 183257550, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.43945312, "step": 8523, "time_per_iteration": 3.864572048187256 }, { "auxiliary_loss_clip": 0.01061057, "auxiliary_loss_mlp": 0.01042228, "balance_loss_clip": 1.01858926, "balance_loss_mlp": 1.01980662, "epoch": 0.5124906057417706, "flos": 21760769473920.0, "grad_norm": 1.9323552634249836, "language_loss": 0.78889459, "learning_rate": 2.0161624838567656e-06, "loss": 0.80992746, "num_input_tokens_seen": 183275515, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.41210938, "step": 8524, "time_per_iteration": 2.394404411315918 }, { "auxiliary_loss_clip": 0.01059274, "auxiliary_loss_mlp": 0.01042931, "balance_loss_clip": 1.0172174, "balance_loss_mlp": 1.01962483, "epoch": 0.5125507289944387, "flos": 18879527145600.0, "grad_norm": 2.321488004421083, "language_loss": 0.75879574, "learning_rate": 2.015773034588706e-06, "loss": 0.77981782, "num_input_tokens_seen": 183293880, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.39648438, "step": 8525, "time_per_iteration": 2.375128746032715 }, { "auxiliary_loss_clip": 0.01059437, "auxiliary_loss_mlp": 0.01046505, "balance_loss_clip": 1.01815677, "balance_loss_mlp": 1.01899898, "epoch": 0.5126108522471066, "flos": 35626350712320.0, "grad_norm": 1.796641649184217, "language_loss": 0.76034749, "learning_rate": 2.015383584722531e-06, "loss": 0.78140688, "num_input_tokens_seen": 183315860, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.40429688, "step": 8526, "time_per_iteration": 2.5429975986480713 }, { "auxiliary_loss_clip": 0.01059844, "auxiliary_loss_mlp": 0.01042218, "balance_loss_clip": 1.01478779, "balance_loss_mlp": 1.0197283, "epoch": 0.5126709754997746, "flos": 20189957316480.0, "grad_norm": 1.581574957714515, "language_loss": 0.66615528, "learning_rate": 2.0149941342730088e-06, "loss": 0.68717593, "num_input_tokens_seen": 183335480, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40234375, "step": 8527, "time_per_iteration": 2.4000167846679688 }, { "auxiliary_loss_clip": 0.01057617, "auxiliary_loss_mlp": 0.01042144, "balance_loss_clip": 1.01813579, "balance_loss_mlp": 1.01966739, "epoch": 0.5127310987524425, "flos": 18587548512000.0, "grad_norm": 1.6287403683324753, "language_loss": 0.74970263, "learning_rate": 2.014604683254908e-06, "loss": 0.77070028, "num_input_tokens_seen": 183354395, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37890625, "step": 8528, "time_per_iteration": 3.8061838150024414 }, { "auxiliary_loss_clip": 0.01056732, "auxiliary_loss_mlp": 0.01039818, "balance_loss_clip": 1.014081, "balance_loss_mlp": 1.01746714, "epoch": 0.5127912220051105, "flos": 22453620871680.0, "grad_norm": 2.509196433939289, "language_loss": 0.84685808, "learning_rate": 2.014215231682995e-06, "loss": 0.8678236, "num_input_tokens_seen": 183372980, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.39257812, "step": 8529, "time_per_iteration": 2.409266233444214 }, { "auxiliary_loss_clip": 0.01056588, "auxiliary_loss_mlp": 0.01037106, "balance_loss_clip": 1.01246595, "balance_loss_mlp": 1.01928341, "epoch": 0.5128513452577784, "flos": 19092846752640.0, "grad_norm": 2.0000810773624718, "language_loss": 0.75220764, "learning_rate": 2.01382577957204e-06, "loss": 0.7731446, "num_input_tokens_seen": 183390160, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37304688, "step": 8530, "time_per_iteration": 2.359086275100708 }, { "auxiliary_loss_clip": 0.01012736, "auxiliary_loss_mlp": 0.01005158, "balance_loss_clip": 1.00270212, "balance_loss_mlp": 1.00495255, "epoch": 0.5129114685104464, "flos": 67888573948800.0, "grad_norm": 0.7594127512124409, "language_loss": 0.60880721, "learning_rate": 2.0134363269368095e-06, "loss": 0.62898612, "num_input_tokens_seen": 183455280, "router_z_loss_clip": 0.02453613, "router_z_loss_mlp": 0.078125, "step": 8531, "time_per_iteration": 3.1339187622070312 }, { "auxiliary_loss_clip": 0.01060237, "auxiliary_loss_mlp": 0.01040416, "balance_loss_clip": 1.01408291, "balance_loss_mlp": 1.02011704, "epoch": 0.5129715917631144, "flos": 20448104976000.0, "grad_norm": 1.7031259136727874, "language_loss": 0.78163862, "learning_rate": 2.0130468737920725e-06, "loss": 0.80264515, "num_input_tokens_seen": 183473955, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40039062, "step": 8532, "time_per_iteration": 2.370124578475952 }, { "auxiliary_loss_clip": 0.01058608, "auxiliary_loss_mlp": 0.01041531, "balance_loss_clip": 1.01494718, "balance_loss_mlp": 1.02009034, "epoch": 0.5130317150157824, "flos": 35114698604160.0, "grad_norm": 1.9622905596979503, "language_loss": 0.68789613, "learning_rate": 2.012657420152597e-06, "loss": 0.70889747, "num_input_tokens_seen": 183497195, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.38671875, "step": 8533, "time_per_iteration": 2.5079092979431152 }, { "auxiliary_loss_clip": 0.01061739, "auxiliary_loss_mlp": 0.01043778, "balance_loss_clip": 1.01625228, "balance_loss_mlp": 1.01979756, "epoch": 0.5130918382684503, "flos": 19790620652160.0, "grad_norm": 2.1797980166786894, "language_loss": 0.82926643, "learning_rate": 2.01226796603315e-06, "loss": 0.85032159, "num_input_tokens_seen": 183513675, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41796875, "step": 8534, "time_per_iteration": 2.3801212310791016 }, { "auxiliary_loss_clip": 0.0106127, "auxiliary_loss_mlp": 0.0104886, "balance_loss_clip": 1.020679, "balance_loss_mlp": 1.01872575, "epoch": 0.5131519615211183, "flos": 26321892647040.0, "grad_norm": 1.5122338337526506, "language_loss": 0.64694142, "learning_rate": 2.0118785114485017e-06, "loss": 0.66804266, "num_input_tokens_seen": 183535165, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.42578125, "step": 8535, "time_per_iteration": 2.457638740539551 }, { "auxiliary_loss_clip": 0.01058907, "auxiliary_loss_mlp": 0.01041633, "balance_loss_clip": 1.01539493, "balance_loss_mlp": 1.01911974, "epoch": 0.5132120847737862, "flos": 19170912286080.0, "grad_norm": 1.5659440175933066, "language_loss": 0.70472276, "learning_rate": 2.011489056413418e-06, "loss": 0.72572815, "num_input_tokens_seen": 183553780, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3984375, "step": 8536, "time_per_iteration": 2.391397476196289 }, { "auxiliary_loss_clip": 0.01057714, "auxiliary_loss_mlp": 0.01048324, "balance_loss_clip": 1.02029729, "balance_loss_mlp": 1.01747215, "epoch": 0.5132722080264542, "flos": 20229374108160.0, "grad_norm": 2.5257121268655816, "language_loss": 0.73083436, "learning_rate": 2.011099600942669e-06, "loss": 0.75189471, "num_input_tokens_seen": 183572285, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40234375, "step": 8537, "time_per_iteration": 2.40474009513855 }, { "auxiliary_loss_clip": 0.01060192, "auxiliary_loss_mlp": 0.0104289, "balance_loss_clip": 1.01717663, "balance_loss_mlp": 1.01930642, "epoch": 0.5133323312791223, "flos": 16468600008960.0, "grad_norm": 1.8775366628472996, "language_loss": 0.81298035, "learning_rate": 2.0107101450510214e-06, "loss": 0.8340112, "num_input_tokens_seen": 183589330, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.41015625, "step": 8538, "time_per_iteration": 2.335294723510742 }, { "auxiliary_loss_clip": 0.01055263, "auxiliary_loss_mlp": 0.01044859, "balance_loss_clip": 1.02087438, "balance_loss_mlp": 1.0164206, "epoch": 0.5133924545317902, "flos": 26066887009920.0, "grad_norm": 1.9134185357710018, "language_loss": 0.79800117, "learning_rate": 2.0103206887532437e-06, "loss": 0.81900239, "num_input_tokens_seen": 183609205, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.38867188, "step": 8539, "time_per_iteration": 2.4937570095062256 }, { "auxiliary_loss_clip": 0.01057287, "auxiliary_loss_mlp": 0.01042902, "balance_loss_clip": 1.01618707, "balance_loss_mlp": 1.01655531, "epoch": 0.5134525777844582, "flos": 29129782475520.0, "grad_norm": 1.607513036844469, "language_loss": 0.77288437, "learning_rate": 2.009931232064105e-06, "loss": 0.79388624, "num_input_tokens_seen": 183629985, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.40820312, "step": 8540, "time_per_iteration": 2.4277966022491455 }, { "auxiliary_loss_clip": 0.01059583, "auxiliary_loss_mlp": 0.01042349, "balance_loss_clip": 1.01280928, "balance_loss_mlp": 1.01816845, "epoch": 0.5135127010371261, "flos": 17453883888000.0, "grad_norm": 1.5935132697257863, "language_loss": 0.76410198, "learning_rate": 2.0095417749983724e-06, "loss": 0.78512126, "num_input_tokens_seen": 183648220, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.4140625, "step": 8541, "time_per_iteration": 2.3637542724609375 }, { "auxiliary_loss_clip": 0.01056626, "auxiliary_loss_mlp": 0.01042095, "balance_loss_clip": 1.01572585, "balance_loss_mlp": 1.01730943, "epoch": 0.5135728242897941, "flos": 21943888888320.0, "grad_norm": 1.607914710467613, "language_loss": 0.7132597, "learning_rate": 2.0091523175708162e-06, "loss": 0.73424691, "num_input_tokens_seen": 183668230, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.39257812, "step": 8542, "time_per_iteration": 2.369668483734131 }, { "auxiliary_loss_clip": 0.01057592, "auxiliary_loss_mlp": 0.01041186, "balance_loss_clip": 1.01574659, "balance_loss_mlp": 1.01750219, "epoch": 0.513632947542462, "flos": 22673748193920.0, "grad_norm": 2.1970596189589746, "language_loss": 0.80488658, "learning_rate": 2.0087628597962023e-06, "loss": 0.82587433, "num_input_tokens_seen": 183687800, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40039062, "step": 8543, "time_per_iteration": 2.4077601432800293 }, { "auxiliary_loss_clip": 0.01058955, "auxiliary_loss_mlp": 0.01043055, "balance_loss_clip": 1.01693606, "balance_loss_mlp": 1.0184989, "epoch": 0.51369307079513, "flos": 29455976108160.0, "grad_norm": 1.7246015241783519, "language_loss": 0.68473911, "learning_rate": 2.008373401689299e-06, "loss": 0.70575923, "num_input_tokens_seen": 183709025, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40429688, "step": 8544, "time_per_iteration": 2.4298627376556396 }, { "auxiliary_loss_clip": 0.01058953, "auxiliary_loss_mlp": 0.01044869, "balance_loss_clip": 1.01920354, "balance_loss_mlp": 1.01738095, "epoch": 0.513753194047798, "flos": 18988351453440.0, "grad_norm": 2.325405412244088, "language_loss": 0.73436403, "learning_rate": 2.0079839432648765e-06, "loss": 0.75540221, "num_input_tokens_seen": 183725740, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.41601562, "step": 8545, "time_per_iteration": 2.3820533752441406 }, { "auxiliary_loss_clip": 0.01060412, "auxiliary_loss_mlp": 0.01047416, "balance_loss_clip": 1.02063, "balance_loss_mlp": 1.01878595, "epoch": 0.513813317300466, "flos": 17820890766720.0, "grad_norm": 1.9552740963922353, "language_loss": 0.83562577, "learning_rate": 2.0075944845377016e-06, "loss": 0.85670406, "num_input_tokens_seen": 183743995, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.41601562, "step": 8546, "time_per_iteration": 2.3604159355163574 }, { "auxiliary_loss_clip": 0.01061178, "auxiliary_loss_mlp": 0.01047925, "balance_loss_clip": 1.02025652, "balance_loss_mlp": 1.01956832, "epoch": 0.5138734405531339, "flos": 24060044482560.0, "grad_norm": 1.647217616233915, "language_loss": 0.74095666, "learning_rate": 2.007205025522544e-06, "loss": 0.76204765, "num_input_tokens_seen": 183764150, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41601562, "step": 8547, "time_per_iteration": 2.409750461578369 }, { "auxiliary_loss_clip": 0.01058657, "auxiliary_loss_mlp": 0.01049935, "balance_loss_clip": 1.02223051, "balance_loss_mlp": 1.01816273, "epoch": 0.5139335638058019, "flos": 26096249329920.0, "grad_norm": 1.5985372163596159, "language_loss": 0.7434026, "learning_rate": 2.0068155662341702e-06, "loss": 0.76448858, "num_input_tokens_seen": 183783280, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40429688, "step": 8548, "time_per_iteration": 2.4135282039642334 }, { "auxiliary_loss_clip": 0.0105931, "auxiliary_loss_mlp": 0.01049431, "balance_loss_clip": 1.02324033, "balance_loss_mlp": 1.01806366, "epoch": 0.5139936870584698, "flos": 18916081205760.0, "grad_norm": 1.6837182871620853, "language_loss": 0.82636821, "learning_rate": 2.0064261066873495e-06, "loss": 0.84745562, "num_input_tokens_seen": 183800725, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.41210938, "step": 8549, "time_per_iteration": 2.3858635425567627 }, { "auxiliary_loss_clip": 0.0105719, "auxiliary_loss_mlp": 0.01037812, "balance_loss_clip": 1.01313519, "balance_loss_mlp": 1.0186168, "epoch": 0.5140538103111378, "flos": 16143069692160.0, "grad_norm": 2.242911491877206, "language_loss": 0.73030663, "learning_rate": 2.0060366468968504e-06, "loss": 0.75125659, "num_input_tokens_seen": 183818735, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.38671875, "step": 8550, "time_per_iteration": 2.363816976547241 }, { "auxiliary_loss_clip": 0.01061718, "auxiliary_loss_mlp": 0.01042742, "balance_loss_clip": 1.01578939, "balance_loss_mlp": 1.01983464, "epoch": 0.5141139335638057, "flos": 22419021847680.0, "grad_norm": 1.4737718457205646, "language_loss": 0.76223755, "learning_rate": 2.0056471868774408e-06, "loss": 0.78328216, "num_input_tokens_seen": 183840015, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.41796875, "step": 8551, "time_per_iteration": 2.3895111083984375 }, { "auxiliary_loss_clip": 0.01056834, "auxiliary_loss_mlp": 0.01038992, "balance_loss_clip": 1.01419604, "balance_loss_mlp": 1.01801467, "epoch": 0.5141740568164738, "flos": 27088410746880.0, "grad_norm": 1.5897108466378804, "language_loss": 0.69814861, "learning_rate": 2.0052577266438897e-06, "loss": 0.71910679, "num_input_tokens_seen": 183860145, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38671875, "step": 8552, "time_per_iteration": 2.4332399368286133 }, { "auxiliary_loss_clip": 0.01061004, "auxiliary_loss_mlp": 0.01043483, "balance_loss_clip": 1.01571941, "balance_loss_mlp": 1.0193789, "epoch": 0.5142341800691418, "flos": 24972080595840.0, "grad_norm": 1.7512039116179945, "language_loss": 0.76116419, "learning_rate": 2.004868266210965e-06, "loss": 0.78220904, "num_input_tokens_seen": 183880540, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41796875, "step": 8553, "time_per_iteration": 2.4096620082855225 }, { "auxiliary_loss_clip": 0.01059734, "auxiliary_loss_mlp": 0.01045661, "balance_loss_clip": 1.01864815, "balance_loss_mlp": 1.01879573, "epoch": 0.5142943033218097, "flos": 20703459726720.0, "grad_norm": 6.749646297211825, "language_loss": 0.69132477, "learning_rate": 2.004478805593435e-06, "loss": 0.71237868, "num_input_tokens_seen": 183900895, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.41015625, "step": 8554, "time_per_iteration": 2.3876938819885254 }, { "auxiliary_loss_clip": 0.01063667, "auxiliary_loss_mlp": 0.01048126, "balance_loss_clip": 1.01610601, "balance_loss_mlp": 1.01991987, "epoch": 0.5143544265744777, "flos": 22924494645120.0, "grad_norm": 1.9151407693552793, "language_loss": 0.74876618, "learning_rate": 2.004089344806068e-06, "loss": 0.76988411, "num_input_tokens_seen": 183920335, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.4375, "step": 8555, "time_per_iteration": 2.410733699798584 }, { "auxiliary_loss_clip": 0.01060476, "auxiliary_loss_mlp": 0.01045435, "balance_loss_clip": 1.01790953, "balance_loss_mlp": 1.01979208, "epoch": 0.5144145498271456, "flos": 15920568397440.0, "grad_norm": 2.350433633296237, "language_loss": 0.76730627, "learning_rate": 2.003699883863633e-06, "loss": 0.78836536, "num_input_tokens_seen": 183936220, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.40625, "step": 8556, "time_per_iteration": 2.3820340633392334 }, { "auxiliary_loss_clip": 0.01058616, "auxiliary_loss_mlp": 0.01043484, "balance_loss_clip": 1.01703119, "balance_loss_mlp": 1.01863348, "epoch": 0.5144746730798136, "flos": 19680260244480.0, "grad_norm": 1.7634303935278353, "language_loss": 0.87140256, "learning_rate": 2.003310422780898e-06, "loss": 0.89242351, "num_input_tokens_seen": 183953250, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40039062, "step": 8557, "time_per_iteration": 2.357302188873291 }, { "auxiliary_loss_clip": 0.01056751, "auxiliary_loss_mlp": 0.01044283, "balance_loss_clip": 1.01985669, "balance_loss_mlp": 1.01747596, "epoch": 0.5145347963324816, "flos": 23913583862400.0, "grad_norm": 1.5110000307812326, "language_loss": 0.90199411, "learning_rate": 2.0029209615726307e-06, "loss": 0.92300445, "num_input_tokens_seen": 183973865, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.39257812, "step": 8558, "time_per_iteration": 2.4706225395202637 }, { "auxiliary_loss_clip": 0.01057935, "auxiliary_loss_mlp": 0.01038225, "balance_loss_clip": 1.011343, "balance_loss_mlp": 1.01849222, "epoch": 0.5145949195851496, "flos": 18259015818240.0, "grad_norm": 1.8009705831426672, "language_loss": 0.66473305, "learning_rate": 2.002531500253602e-06, "loss": 0.68569469, "num_input_tokens_seen": 183992555, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.39453125, "step": 8559, "time_per_iteration": 3.6178627014160156 }, { "auxiliary_loss_clip": 0.01059488, "auxiliary_loss_mlp": 0.01046446, "balance_loss_clip": 1.02025557, "balance_loss_mlp": 1.01933765, "epoch": 0.5146550428378175, "flos": 26212230466560.0, "grad_norm": 1.6620697012226853, "language_loss": 0.63709444, "learning_rate": 2.002142038838577e-06, "loss": 0.65815377, "num_input_tokens_seen": 184010825, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40234375, "step": 8560, "time_per_iteration": 3.7992730140686035 }, { "auxiliary_loss_clip": 0.01058364, "auxiliary_loss_mlp": 0.01043388, "balance_loss_clip": 1.01703095, "balance_loss_mlp": 1.0180068, "epoch": 0.5147151660904855, "flos": 22673084878080.0, "grad_norm": 1.5875965150638116, "language_loss": 0.71861267, "learning_rate": 2.0017525773423265e-06, "loss": 0.73963022, "num_input_tokens_seen": 184030155, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40429688, "step": 8561, "time_per_iteration": 2.391887664794922 }, { "auxiliary_loss_clip": 0.01060806, "auxiliary_loss_mlp": 0.01042644, "balance_loss_clip": 1.0161562, "balance_loss_mlp": 1.01846671, "epoch": 0.5147752893431534, "flos": 24971242723200.0, "grad_norm": 1.7470901945565382, "language_loss": 0.67721188, "learning_rate": 2.0013631157796177e-06, "loss": 0.69824636, "num_input_tokens_seen": 184051440, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.42382812, "step": 8562, "time_per_iteration": 3.8853988647460938 }, { "auxiliary_loss_clip": 0.01062283, "auxiliary_loss_mlp": 0.01043419, "balance_loss_clip": 1.01424885, "balance_loss_mlp": 1.01963782, "epoch": 0.5148354125958214, "flos": 22743644469120.0, "grad_norm": 1.8427452313219466, "language_loss": 0.7853775, "learning_rate": 2.0009736541652188e-06, "loss": 0.80643463, "num_input_tokens_seen": 184070205, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.42578125, "step": 8563, "time_per_iteration": 2.368241786956787 }, { "auxiliary_loss_clip": 0.01062118, "auxiliary_loss_mlp": 0.01049793, "balance_loss_clip": 1.01846468, "balance_loss_mlp": 1.01773167, "epoch": 0.5148955358484893, "flos": 23067848154240.0, "grad_norm": 6.524064763970968, "language_loss": 0.83945417, "learning_rate": 2.0005841925139e-06, "loss": 0.86057329, "num_input_tokens_seen": 184087345, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.4453125, "step": 8564, "time_per_iteration": 2.3760836124420166 }, { "auxiliary_loss_clip": 0.01061297, "auxiliary_loss_mlp": 0.01052346, "balance_loss_clip": 1.02296066, "balance_loss_mlp": 1.01785111, "epoch": 0.5149556591011574, "flos": 20339071200000.0, "grad_norm": 1.8315308913860093, "language_loss": 0.7454375, "learning_rate": 2.0001947308404283e-06, "loss": 0.76657391, "num_input_tokens_seen": 184107110, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.43359375, "step": 8565, "time_per_iteration": 2.3717117309570312 }, { "auxiliary_loss_clip": 0.01063844, "auxiliary_loss_mlp": 0.01049344, "balance_loss_clip": 1.01663268, "balance_loss_mlp": 1.01961493, "epoch": 0.5150157823538254, "flos": 22637124311040.0, "grad_norm": 2.1602763130196347, "language_loss": 0.6954385, "learning_rate": 1.9998052691595715e-06, "loss": 0.71657032, "num_input_tokens_seen": 184127105, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 0.44140625, "step": 8566, "time_per_iteration": 2.3834266662597656 }, { "auxiliary_loss_clip": 0.01059045, "auxiliary_loss_mlp": 0.01043458, "balance_loss_clip": 1.01449037, "balance_loss_mlp": 1.01641607, "epoch": 0.5150759056064933, "flos": 26066433162240.0, "grad_norm": 1.662051601037073, "language_loss": 0.79545146, "learning_rate": 1.9994158074861005e-06, "loss": 0.81647646, "num_input_tokens_seen": 184148060, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.42578125, "step": 8567, "time_per_iteration": 2.413003921508789 }, { "auxiliary_loss_clip": 0.01061679, "auxiliary_loss_mlp": 0.01043528, "balance_loss_clip": 1.01390433, "balance_loss_mlp": 1.01863122, "epoch": 0.5151360288591613, "flos": 25951604100480.0, "grad_norm": 1.9148150324777176, "language_loss": 0.80531043, "learning_rate": 1.9990263458347806e-06, "loss": 0.82636249, "num_input_tokens_seen": 184166175, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4296875, "step": 8568, "time_per_iteration": 2.393540382385254 }, { "auxiliary_loss_clip": 0.01057539, "auxiliary_loss_mlp": 0.01044566, "balance_loss_clip": 1.01755309, "balance_loss_mlp": 1.01650798, "epoch": 0.5151961521118292, "flos": 18506480601600.0, "grad_norm": 2.1178754305934997, "language_loss": 0.91916156, "learning_rate": 1.9986368842203825e-06, "loss": 0.94018257, "num_input_tokens_seen": 184182600, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.41015625, "step": 8569, "time_per_iteration": 3.8019347190856934 }, { "auxiliary_loss_clip": 0.01059762, "auxiliary_loss_mlp": 0.01050604, "balance_loss_clip": 1.02144575, "balance_loss_mlp": 1.01834488, "epoch": 0.5152562753644973, "flos": 22232690588160.0, "grad_norm": 1.5478096321504302, "language_loss": 0.77427965, "learning_rate": 1.998247422657674e-06, "loss": 0.79538333, "num_input_tokens_seen": 184202020, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.4140625, "step": 8570, "time_per_iteration": 2.498631477355957 }, { "auxiliary_loss_clip": 0.01058393, "auxiliary_loss_mlp": 0.01046704, "balance_loss_clip": 1.0141958, "balance_loss_mlp": 1.01655126, "epoch": 0.5153163986171652, "flos": 38435008590720.0, "grad_norm": 1.5287740385288529, "language_loss": 0.75072974, "learning_rate": 1.9978579611614227e-06, "loss": 0.77178073, "num_input_tokens_seen": 184224850, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.41796875, "step": 8571, "time_per_iteration": 2.516692638397217 }, { "auxiliary_loss_clip": 0.01010196, "auxiliary_loss_mlp": 0.0100228, "balance_loss_clip": 0.99984848, "balance_loss_mlp": 1.00197172, "epoch": 0.5153765218698332, "flos": 66381164553600.0, "grad_norm": 0.8123311864854141, "language_loss": 0.52918267, "learning_rate": 1.9974684997463984e-06, "loss": 0.54930747, "num_input_tokens_seen": 184288520, "router_z_loss_clip": 0.02429199, "router_z_loss_mlp": 0.08203125, "step": 8572, "time_per_iteration": 3.129420042037964 }, { "auxiliary_loss_clip": 0.01058841, "auxiliary_loss_mlp": 0.01050864, "balance_loss_clip": 1.02344584, "balance_loss_mlp": 1.01880717, "epoch": 0.5154366451225011, "flos": 24023525333760.0, "grad_norm": 2.1386004569655697, "language_loss": 0.78012699, "learning_rate": 1.9970790384273687e-06, "loss": 0.80122411, "num_input_tokens_seen": 184308565, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40039062, "step": 8573, "time_per_iteration": 2.411212205886841 }, { "auxiliary_loss_clip": 0.01058262, "auxiliary_loss_mlp": 0.01042096, "balance_loss_clip": 1.0133909, "balance_loss_mlp": 1.01800108, "epoch": 0.5154967683751691, "flos": 23467952868480.0, "grad_norm": 1.7008904820878583, "language_loss": 0.78410989, "learning_rate": 1.996689577219102e-06, "loss": 0.80511343, "num_input_tokens_seen": 184326795, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.40234375, "step": 8574, "time_per_iteration": 2.3866829872131348 }, { "auxiliary_loss_clip": 0.0106036, "auxiliary_loss_mlp": 0.01049741, "balance_loss_clip": 1.02026057, "balance_loss_mlp": 1.01810384, "epoch": 0.515556891627837, "flos": 23804515175040.0, "grad_norm": 1.931526776000156, "language_loss": 0.86812443, "learning_rate": 1.996300116136367e-06, "loss": 0.88922548, "num_input_tokens_seen": 184345990, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.42382812, "step": 8575, "time_per_iteration": 2.387206792831421 }, { "auxiliary_loss_clip": 0.01060631, "auxiliary_loss_mlp": 0.01045358, "balance_loss_clip": 1.01553154, "balance_loss_mlp": 1.01793671, "epoch": 0.515617014880505, "flos": 19827523825920.0, "grad_norm": 1.5673130672295255, "language_loss": 0.78034395, "learning_rate": 1.995910655193932e-06, "loss": 0.80140388, "num_input_tokens_seen": 184366300, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.42578125, "step": 8576, "time_per_iteration": 2.4093997478485107 }, { "auxiliary_loss_clip": 0.01062992, "auxiliary_loss_mlp": 0.010441, "balance_loss_clip": 1.01285493, "balance_loss_mlp": 1.0194211, "epoch": 0.515677138133173, "flos": 14245051472640.0, "grad_norm": 2.302916897952558, "language_loss": 0.77984273, "learning_rate": 1.9955211944065654e-06, "loss": 0.80091369, "num_input_tokens_seen": 184383030, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.43554688, "step": 8577, "time_per_iteration": 2.337428331375122 }, { "auxiliary_loss_clip": 0.01064019, "auxiliary_loss_mlp": 0.01051004, "balance_loss_clip": 1.01876998, "balance_loss_mlp": 1.01950645, "epoch": 0.515737261385841, "flos": 28288550332800.0, "grad_norm": 1.933229087051479, "language_loss": 0.81537855, "learning_rate": 1.9951317337890353e-06, "loss": 0.83652878, "num_input_tokens_seen": 184403410, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.4453125, "step": 8578, "time_per_iteration": 2.4498679637908936 }, { "auxiliary_loss_clip": 0.01058587, "auxiliary_loss_mlp": 0.01046288, "balance_loss_clip": 1.01696229, "balance_loss_mlp": 1.01748025, "epoch": 0.515797384638509, "flos": 27890679945600.0, "grad_norm": 1.8834053740500836, "language_loss": 0.77837729, "learning_rate": 1.9947422733561105e-06, "loss": 0.79942608, "num_input_tokens_seen": 184423830, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.41015625, "step": 8579, "time_per_iteration": 2.436156988143921 }, { "auxiliary_loss_clip": 0.01062693, "auxiliary_loss_mlp": 0.01046409, "balance_loss_clip": 1.0167495, "balance_loss_mlp": 1.01931286, "epoch": 0.5158575078911769, "flos": 23038939681920.0, "grad_norm": 1.6255394186297516, "language_loss": 0.7984978, "learning_rate": 1.994352813122559e-06, "loss": 0.8195889, "num_input_tokens_seen": 184445050, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.43359375, "step": 8580, "time_per_iteration": 2.4161672592163086 }, { "auxiliary_loss_clip": 0.01062796, "auxiliary_loss_mlp": 0.01055921, "balance_loss_clip": 1.0217557, "balance_loss_mlp": 1.01877809, "epoch": 0.5159176311438449, "flos": 12640513075200.0, "grad_norm": 2.2127985690468877, "language_loss": 0.73839819, "learning_rate": 1.99396335310315e-06, "loss": 0.75958526, "num_input_tokens_seen": 184460775, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 0.44140625, "step": 8581, "time_per_iteration": 2.32454252243042 }, { "auxiliary_loss_clip": 0.01062304, "auxiliary_loss_mlp": 0.0104291, "balance_loss_clip": 1.01384664, "balance_loss_mlp": 1.01973271, "epoch": 0.5159777543965128, "flos": 15557297034240.0, "grad_norm": 2.0793928200909724, "language_loss": 0.75807214, "learning_rate": 1.9935738933126508e-06, "loss": 0.77912426, "num_input_tokens_seen": 184477365, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.42578125, "step": 8582, "time_per_iteration": 2.361762285232544 }, { "auxiliary_loss_clip": 0.01059982, "auxiliary_loss_mlp": 0.01044297, "balance_loss_clip": 1.01492345, "balance_loss_mlp": 1.01814723, "epoch": 0.5160378776491809, "flos": 23220557907840.0, "grad_norm": 2.098882416396348, "language_loss": 0.67370284, "learning_rate": 1.99318443376583e-06, "loss": 0.6947456, "num_input_tokens_seen": 184497045, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.41796875, "step": 8583, "time_per_iteration": 2.372183322906494 }, { "auxiliary_loss_clip": 0.01060391, "auxiliary_loss_mlp": 0.01046568, "balance_loss_clip": 1.01734972, "balance_loss_mlp": 1.01735163, "epoch": 0.5160980009018488, "flos": 21943539774720.0, "grad_norm": 1.4590965056469798, "language_loss": 0.76671237, "learning_rate": 1.9927949744774568e-06, "loss": 0.78778189, "num_input_tokens_seen": 184517675, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4296875, "step": 8584, "time_per_iteration": 2.382758140563965 }, { "auxiliary_loss_clip": 0.01061992, "auxiliary_loss_mlp": 0.01051749, "balance_loss_clip": 1.02089715, "balance_loss_mlp": 1.01842034, "epoch": 0.5161581241545168, "flos": 22782956526720.0, "grad_norm": 1.9578348203755775, "language_loss": 0.80701542, "learning_rate": 1.9924055154622983e-06, "loss": 0.82815284, "num_input_tokens_seen": 184537745, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.43554688, "step": 8585, "time_per_iteration": 2.369828224182129 }, { "auxiliary_loss_clip": 0.0105635, "auxiliary_loss_mlp": 0.01046214, "balance_loss_clip": 1.01916552, "balance_loss_mlp": 1.01625085, "epoch": 0.5162182474071847, "flos": 19674569692800.0, "grad_norm": 2.2724647409532857, "language_loss": 0.82233346, "learning_rate": 1.9920160567351238e-06, "loss": 0.84335911, "num_input_tokens_seen": 184553630, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40039062, "step": 8586, "time_per_iteration": 2.360952854156494 }, { "auxiliary_loss_clip": 0.01061711, "auxiliary_loss_mlp": 0.01048473, "balance_loss_clip": 1.01762116, "balance_loss_mlp": 1.01852536, "epoch": 0.5162783706598527, "flos": 20045207352960.0, "grad_norm": 1.6808443091618486, "language_loss": 0.73047125, "learning_rate": 1.991626598310701e-06, "loss": 0.75157309, "num_input_tokens_seen": 184573530, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.43359375, "step": 8587, "time_per_iteration": 2.375197410583496 }, { "auxiliary_loss_clip": 0.01010938, "auxiliary_loss_mlp": 0.01014878, "balance_loss_clip": 1.01272047, "balance_loss_mlp": 1.00272107, "epoch": 0.5163384939125206, "flos": 69956131063680.0, "grad_norm": 0.7351708166100714, "language_loss": 0.57927597, "learning_rate": 1.9912371402037984e-06, "loss": 0.59953403, "num_input_tokens_seen": 184637875, "router_z_loss_clip": 0.02160645, "router_z_loss_mlp": 0.08203125, "step": 8588, "time_per_iteration": 3.007497787475586 }, { "auxiliary_loss_clip": 0.01061926, "auxiliary_loss_mlp": 0.01053173, "balance_loss_clip": 1.01965189, "balance_loss_mlp": 1.01955438, "epoch": 0.5163986171651886, "flos": 17416177752960.0, "grad_norm": 1.6402533081369923, "language_loss": 0.76984012, "learning_rate": 1.990847682429185e-06, "loss": 0.79099119, "num_input_tokens_seen": 184656125, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 0.42382812, "step": 8589, "time_per_iteration": 2.348696231842041 }, { "auxiliary_loss_clip": 0.01061118, "auxiliary_loss_mlp": 0.01048701, "balance_loss_clip": 1.01976871, "balance_loss_mlp": 1.01760769, "epoch": 0.5164587404178566, "flos": 21321666904320.0, "grad_norm": 1.648656135254223, "language_loss": 0.68291318, "learning_rate": 1.990458225001627e-06, "loss": 0.70401132, "num_input_tokens_seen": 184675920, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.43554688, "step": 8590, "time_per_iteration": 2.381047487258911 }, { "auxiliary_loss_clip": 0.01010516, "auxiliary_loss_mlp": 0.01005311, "balance_loss_clip": 1.00286746, "balance_loss_mlp": 1.00218463, "epoch": 0.5165188636705246, "flos": 68053923480960.0, "grad_norm": 0.7859134665215488, "language_loss": 0.55969584, "learning_rate": 1.990068767935895e-06, "loss": 0.57985413, "num_input_tokens_seen": 184730520, "router_z_loss_clip": 0.02441406, "router_z_loss_mlp": 0.08300781, "step": 8591, "time_per_iteration": 2.9582712650299072 }, { "auxiliary_loss_clip": 0.01054743, "auxiliary_loss_mlp": 0.01035918, "balance_loss_clip": 1.00897634, "balance_loss_mlp": 1.01641023, "epoch": 0.5165789869231926, "flos": 19384790474880.0, "grad_norm": 1.4683031394764285, "language_loss": 0.82357979, "learning_rate": 1.9896793112467566e-06, "loss": 0.84448642, "num_input_tokens_seen": 184748340, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.3828125, "step": 8592, "time_per_iteration": 2.382112503051758 }, { "auxiliary_loss_clip": 0.01059314, "auxiliary_loss_mlp": 0.01046296, "balance_loss_clip": 1.01794863, "balance_loss_mlp": 1.01877415, "epoch": 0.5166391101758605, "flos": 20959128679680.0, "grad_norm": 1.980675360231778, "language_loss": 0.84381473, "learning_rate": 1.989289854948979e-06, "loss": 0.86487079, "num_input_tokens_seen": 184766615, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.40429688, "step": 8593, "time_per_iteration": 2.360713005065918 }, { "auxiliary_loss_clip": 0.01061229, "auxiliary_loss_mlp": 0.01045943, "balance_loss_clip": 1.01695108, "balance_loss_mlp": 1.01892781, "epoch": 0.5166992334285285, "flos": 29461073166720.0, "grad_norm": 1.5395400526224643, "language_loss": 0.70806122, "learning_rate": 1.9889003990573314e-06, "loss": 0.72913301, "num_input_tokens_seen": 184788075, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.421875, "step": 8594, "time_per_iteration": 2.4555654525756836 }, { "auxiliary_loss_clip": 0.01060685, "auxiliary_loss_mlp": 0.01039343, "balance_loss_clip": 1.0114001, "balance_loss_mlp": 1.01834357, "epoch": 0.5167593566811964, "flos": 20303285189760.0, "grad_norm": 1.5562643547494142, "language_loss": 0.78764117, "learning_rate": 1.988510943586582e-06, "loss": 0.80864143, "num_input_tokens_seen": 184808710, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.42382812, "step": 8595, "time_per_iteration": 2.3777198791503906 }, { "auxiliary_loss_clip": 0.01059762, "auxiliary_loss_mlp": 0.01043915, "balance_loss_clip": 1.01666403, "balance_loss_mlp": 1.01815987, "epoch": 0.5168194799338645, "flos": 14610487340160.0, "grad_norm": 1.5404259727031036, "language_loss": 0.66485023, "learning_rate": 1.9881214885514986e-06, "loss": 0.68588698, "num_input_tokens_seen": 184826475, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.41601562, "step": 8596, "time_per_iteration": 2.374009370803833 }, { "auxiliary_loss_clip": 0.01061141, "auxiliary_loss_mlp": 0.01044321, "balance_loss_clip": 1.01208687, "balance_loss_mlp": 1.0198307, "epoch": 0.5168796031865324, "flos": 25006155949440.0, "grad_norm": 1.5338561177284706, "language_loss": 0.76517105, "learning_rate": 1.9877320339668492e-06, "loss": 0.78622562, "num_input_tokens_seen": 184845245, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.4140625, "step": 8597, "time_per_iteration": 2.415248394012451 }, { "auxiliary_loss_clip": 0.01058816, "auxiliary_loss_mlp": 0.01040037, "balance_loss_clip": 1.01142693, "balance_loss_mlp": 1.01705825, "epoch": 0.5169397264392004, "flos": 26938843015680.0, "grad_norm": 1.6693355946433313, "language_loss": 0.82123262, "learning_rate": 1.987342579847403e-06, "loss": 0.84222114, "num_input_tokens_seen": 184866605, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41796875, "step": 8598, "time_per_iteration": 2.4188878536224365 }, { "auxiliary_loss_clip": 0.01060183, "auxiliary_loss_mlp": 0.01040839, "balance_loss_clip": 1.01257467, "balance_loss_mlp": 1.0181303, "epoch": 0.5169998496918683, "flos": 25406714511360.0, "grad_norm": 1.5680566398432116, "language_loss": 0.76565331, "learning_rate": 1.9869531262079273e-06, "loss": 0.78666353, "num_input_tokens_seen": 184886945, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.421875, "step": 8599, "time_per_iteration": 3.73745059967041 }, { "auxiliary_loss_clip": 0.01058784, "auxiliary_loss_mlp": 0.01044282, "balance_loss_clip": 1.01719785, "balance_loss_mlp": 1.0190618, "epoch": 0.5170599729445363, "flos": 24679648114560.0, "grad_norm": 2.146859537403998, "language_loss": 0.73982966, "learning_rate": 1.9865636730631904e-06, "loss": 0.76086032, "num_input_tokens_seen": 184905590, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.39648438, "step": 8600, "time_per_iteration": 3.8174214363098145 }, { "auxiliary_loss_clip": 0.01059659, "auxiliary_loss_mlp": 0.01039269, "balance_loss_clip": 1.01044393, "balance_loss_mlp": 1.01945448, "epoch": 0.5171200961972042, "flos": 20993448412800.0, "grad_norm": 1.4877499752672034, "language_loss": 0.75800622, "learning_rate": 1.9861742204279602e-06, "loss": 0.77899545, "num_input_tokens_seen": 184925555, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.40234375, "step": 8601, "time_per_iteration": 2.3712313175201416 }, { "auxiliary_loss_clip": 0.01061157, "auxiliary_loss_mlp": 0.01046135, "balance_loss_clip": 1.01589131, "balance_loss_mlp": 1.01881099, "epoch": 0.5171802194498722, "flos": 22744587075840.0, "grad_norm": 2.0114687556491577, "language_loss": 0.85934794, "learning_rate": 1.9857847683170045e-06, "loss": 0.8804208, "num_input_tokens_seen": 184944490, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.42382812, "step": 8602, "time_per_iteration": 3.7635622024536133 }, { "auxiliary_loss_clip": 0.01060312, "auxiliary_loss_mlp": 0.01045269, "balance_loss_clip": 1.017398, "balance_loss_mlp": 1.01904058, "epoch": 0.5172403427025402, "flos": 28175676307200.0, "grad_norm": 1.7114024685601863, "language_loss": 0.75494182, "learning_rate": 1.9853953167450926e-06, "loss": 0.77599764, "num_input_tokens_seen": 184963190, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41210938, "step": 8603, "time_per_iteration": 2.4482667446136475 }, { "auxiliary_loss_clip": 0.01064174, "auxiliary_loss_mlp": 0.01043887, "balance_loss_clip": 1.0166831, "balance_loss_mlp": 1.0213474, "epoch": 0.5173004659552082, "flos": 20336836872960.0, "grad_norm": 2.0725355079841377, "language_loss": 0.74246591, "learning_rate": 1.9850058657269915e-06, "loss": 0.76354653, "num_input_tokens_seen": 184981220, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.4296875, "step": 8604, "time_per_iteration": 2.3572020530700684 }, { "auxiliary_loss_clip": 0.01064344, "auxiliary_loss_mlp": 0.01047395, "balance_loss_clip": 1.01733029, "balance_loss_mlp": 1.01919973, "epoch": 0.5173605892078762, "flos": 19062297446400.0, "grad_norm": 1.9033405662604428, "language_loss": 0.86225927, "learning_rate": 1.984616415277469e-06, "loss": 0.8833766, "num_input_tokens_seen": 184998810, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.45117188, "step": 8605, "time_per_iteration": 2.3699514865875244 }, { "auxiliary_loss_clip": 0.01060555, "auxiliary_loss_mlp": 0.01042096, "balance_loss_clip": 1.01498771, "balance_loss_mlp": 1.01950324, "epoch": 0.5174207124605441, "flos": 27994092992640.0, "grad_norm": 2.8597008826402166, "language_loss": 0.66001898, "learning_rate": 1.984226965411294e-06, "loss": 0.68104547, "num_input_tokens_seen": 185021185, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.41015625, "step": 8606, "time_per_iteration": 2.438121795654297 }, { "auxiliary_loss_clip": 0.0105966, "auxiliary_loss_mlp": 0.01043378, "balance_loss_clip": 1.01590037, "balance_loss_mlp": 1.02001345, "epoch": 0.5174808357132121, "flos": 19495744375680.0, "grad_norm": 1.4697328839526682, "language_loss": 0.78558415, "learning_rate": 1.983837516143234e-06, "loss": 0.80661452, "num_input_tokens_seen": 185038465, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.39648438, "step": 8607, "time_per_iteration": 2.3781893253326416 }, { "auxiliary_loss_clip": 0.0106147, "auxiliary_loss_mlp": 0.01049888, "balance_loss_clip": 1.02089584, "balance_loss_mlp": 1.01915407, "epoch": 0.51754095896588, "flos": 22783061260800.0, "grad_norm": 1.6809778746127417, "language_loss": 0.73249513, "learning_rate": 1.983448067488057e-06, "loss": 0.7536087, "num_input_tokens_seen": 185057340, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.421875, "step": 8608, "time_per_iteration": 2.3808720111846924 }, { "auxiliary_loss_clip": 0.01062599, "auxiliary_loss_mlp": 0.01048427, "balance_loss_clip": 1.01888728, "balance_loss_mlp": 1.01896667, "epoch": 0.5176010822185481, "flos": 22668302021760.0, "grad_norm": 1.8911582526905595, "language_loss": 0.87418115, "learning_rate": 1.983058619460531e-06, "loss": 0.89529145, "num_input_tokens_seen": 185074935, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.43554688, "step": 8609, "time_per_iteration": 3.8297431468963623 }, { "auxiliary_loss_clip": 0.01058297, "auxiliary_loss_mlp": 0.01042808, "balance_loss_clip": 1.01709425, "balance_loss_mlp": 1.01814342, "epoch": 0.517661205471216, "flos": 23950068099840.0, "grad_norm": 1.5009472415083918, "language_loss": 0.75016081, "learning_rate": 1.9826691720754237e-06, "loss": 0.77117187, "num_input_tokens_seen": 185095050, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40039062, "step": 8610, "time_per_iteration": 2.392219305038452 }, { "auxiliary_loss_clip": 0.01061235, "auxiliary_loss_mlp": 0.01047785, "balance_loss_clip": 1.01801848, "balance_loss_mlp": 1.01901984, "epoch": 0.517721328723884, "flos": 15595177726080.0, "grad_norm": 2.478073767645993, "language_loss": 0.69364512, "learning_rate": 1.9822797253475034e-06, "loss": 0.71473527, "num_input_tokens_seen": 185112275, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.421875, "step": 8611, "time_per_iteration": 2.3727009296417236 }, { "auxiliary_loss_clip": 0.01058041, "auxiliary_loss_mlp": 0.01041174, "balance_loss_clip": 1.01543725, "balance_loss_mlp": 1.01791, "epoch": 0.5177814519765519, "flos": 20959128679680.0, "grad_norm": 9.703014121955546, "language_loss": 0.78096437, "learning_rate": 1.9818902792915373e-06, "loss": 0.80195653, "num_input_tokens_seen": 185132165, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40039062, "step": 8612, "time_per_iteration": 2.391763210296631 }, { "auxiliary_loss_clip": 0.01058977, "auxiliary_loss_mlp": 0.0105484, "balance_loss_clip": 1.02842307, "balance_loss_mlp": 1.018942, "epoch": 0.5178415752292199, "flos": 17966862627840.0, "grad_norm": 2.021707187755749, "language_loss": 0.83566517, "learning_rate": 1.981500833922294e-06, "loss": 0.8568033, "num_input_tokens_seen": 185151025, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40039062, "step": 8613, "time_per_iteration": 2.376495838165283 }, { "auxiliary_loss_clip": 0.01058631, "auxiliary_loss_mlp": 0.01044126, "balance_loss_clip": 1.01537275, "balance_loss_mlp": 1.01854587, "epoch": 0.5179016984818878, "flos": 17820541653120.0, "grad_norm": 2.311871852045684, "language_loss": 0.68980074, "learning_rate": 1.981111389254541e-06, "loss": 0.7108283, "num_input_tokens_seen": 185168455, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.40234375, "step": 8614, "time_per_iteration": 2.3408281803131104 }, { "auxiliary_loss_clip": 0.01058661, "auxiliary_loss_mlp": 0.0104118, "balance_loss_clip": 1.01404786, "balance_loss_mlp": 1.0180366, "epoch": 0.5179618217345558, "flos": 17819529223680.0, "grad_norm": 2.0292789222500316, "language_loss": 0.87569779, "learning_rate": 1.9807219453030453e-06, "loss": 0.89669621, "num_input_tokens_seen": 185184415, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40625, "step": 8615, "time_per_iteration": 2.3525636196136475 }, { "auxiliary_loss_clip": 0.01059354, "auxiliary_loss_mlp": 0.01048393, "balance_loss_clip": 1.02188098, "balance_loss_mlp": 1.01920331, "epoch": 0.5180219449872238, "flos": 22521212997120.0, "grad_norm": 1.7204634791384206, "language_loss": 0.81922895, "learning_rate": 1.9803325020825763e-06, "loss": 0.8403064, "num_input_tokens_seen": 185202910, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40234375, "step": 8616, "time_per_iteration": 2.380617380142212 }, { "auxiliary_loss_clip": 0.01063663, "auxiliary_loss_mlp": 0.01058206, "balance_loss_clip": 1.02785563, "balance_loss_mlp": 1.02088475, "epoch": 0.5180820682398918, "flos": 23914317000960.0, "grad_norm": 1.6937763438070625, "language_loss": 0.75727648, "learning_rate": 1.9799430596079e-06, "loss": 0.77849519, "num_input_tokens_seen": 185223085, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.42773438, "step": 8617, "time_per_iteration": 2.4198520183563232 }, { "auxiliary_loss_clip": 0.01060585, "auxiliary_loss_mlp": 0.01053713, "balance_loss_clip": 1.02381563, "balance_loss_mlp": 1.0191133, "epoch": 0.5181421914925598, "flos": 16979065130880.0, "grad_norm": 1.7920525594260173, "language_loss": 0.7125017, "learning_rate": 1.979553617893785e-06, "loss": 0.73364472, "num_input_tokens_seen": 185241295, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.4140625, "step": 8618, "time_per_iteration": 2.3416073322296143 }, { "auxiliary_loss_clip": 0.01010763, "auxiliary_loss_mlp": 0.01002552, "balance_loss_clip": 1.00019133, "balance_loss_mlp": 1.00283694, "epoch": 0.5182023147452277, "flos": 66056437198080.0, "grad_norm": 0.9470492992878874, "language_loss": 0.67377687, "learning_rate": 1.979164176954999e-06, "loss": 0.69391, "num_input_tokens_seen": 185298295, "router_z_loss_clip": 0.02355957, "router_z_loss_mlp": 0.07910156, "step": 8619, "time_per_iteration": 2.997868776321411 }, { "auxiliary_loss_clip": 0.01057541, "auxiliary_loss_mlp": 0.01046083, "balance_loss_clip": 1.01995277, "balance_loss_mlp": 1.01794243, "epoch": 0.5182624379978957, "flos": 18186745570560.0, "grad_norm": 3.7902197319946205, "language_loss": 0.8117466, "learning_rate": 1.97877473680631e-06, "loss": 0.8327828, "num_input_tokens_seen": 185317000, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.39453125, "step": 8620, "time_per_iteration": 2.3587841987609863 }, { "auxiliary_loss_clip": 0.01057144, "auxiliary_loss_mlp": 0.01044607, "balance_loss_clip": 1.01953733, "balance_loss_mlp": 1.0183351, "epoch": 0.5183225612505636, "flos": 14025866757120.0, "grad_norm": 2.246061975055125, "language_loss": 0.82992184, "learning_rate": 1.9783852974624846e-06, "loss": 0.85093933, "num_input_tokens_seen": 185331185, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38671875, "step": 8621, "time_per_iteration": 2.4415416717529297 }, { "auxiliary_loss_clip": 0.01057971, "auxiliary_loss_mlp": 0.01043983, "balance_loss_clip": 1.01943731, "balance_loss_mlp": 1.01894379, "epoch": 0.5183826845032317, "flos": 23658648048000.0, "grad_norm": 1.8930811762836135, "language_loss": 0.66553462, "learning_rate": 1.9779958589382905e-06, "loss": 0.68655419, "num_input_tokens_seen": 185348955, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.390625, "step": 8622, "time_per_iteration": 2.3828036785125732 }, { "auxiliary_loss_clip": 0.01059927, "auxiliary_loss_mlp": 0.01048519, "balance_loss_clip": 1.02095795, "balance_loss_mlp": 1.01846552, "epoch": 0.5184428077558996, "flos": 15887680030080.0, "grad_norm": 2.3024508892326434, "language_loss": 0.62506688, "learning_rate": 1.977606421248497e-06, "loss": 0.6461513, "num_input_tokens_seen": 185367330, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.4140625, "step": 8623, "time_per_iteration": 2.360011100769043 }, { "auxiliary_loss_clip": 0.0105789, "auxiliary_loss_mlp": 0.01040368, "balance_loss_clip": 1.01444018, "balance_loss_mlp": 1.01759219, "epoch": 0.5185029310085676, "flos": 21029827916160.0, "grad_norm": 1.7550176946988412, "language_loss": 0.77462006, "learning_rate": 1.9772169844078685e-06, "loss": 0.79560256, "num_input_tokens_seen": 185385060, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40234375, "step": 8624, "time_per_iteration": 2.3688676357269287 }, { "auxiliary_loss_clip": 0.01056723, "auxiliary_loss_mlp": 0.01042082, "balance_loss_clip": 1.0172987, "balance_loss_mlp": 1.01771331, "epoch": 0.5185630542612355, "flos": 26541461387520.0, "grad_norm": 1.8416599404667129, "language_loss": 0.72529542, "learning_rate": 1.9768275484311756e-06, "loss": 0.74628341, "num_input_tokens_seen": 185403745, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.390625, "step": 8625, "time_per_iteration": 2.437434673309326 }, { "auxiliary_loss_clip": 0.01057649, "auxiliary_loss_mlp": 0.01039848, "balance_loss_clip": 1.01426601, "balance_loss_mlp": 1.01779628, "epoch": 0.5186231775139035, "flos": 20667359514240.0, "grad_norm": 1.7430011445362872, "language_loss": 0.69202864, "learning_rate": 1.976438113333184e-06, "loss": 0.71300358, "num_input_tokens_seen": 185422620, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.3984375, "step": 8626, "time_per_iteration": 2.356048345565796 }, { "auxiliary_loss_clip": 0.0105703, "auxiliary_loss_mlp": 0.01038156, "balance_loss_clip": 1.01274085, "balance_loss_mlp": 1.01831663, "epoch": 0.5186833007665714, "flos": 20884484459520.0, "grad_norm": 2.208856285894059, "language_loss": 0.71613133, "learning_rate": 1.9760486791286612e-06, "loss": 0.7370832, "num_input_tokens_seen": 185439380, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38671875, "step": 8627, "time_per_iteration": 2.36867356300354 }, { "auxiliary_loss_clip": 0.01062084, "auxiliary_loss_mlp": 0.0104921, "balance_loss_clip": 1.02073085, "balance_loss_mlp": 1.01970041, "epoch": 0.5187434240192395, "flos": 20885846002560.0, "grad_norm": 1.7450518068090621, "language_loss": 0.74209583, "learning_rate": 1.9756592458323753e-06, "loss": 0.76320881, "num_input_tokens_seen": 185458830, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.42382812, "step": 8628, "time_per_iteration": 2.368021249771118 }, { "auxiliary_loss_clip": 0.01058328, "auxiliary_loss_mlp": 0.01042162, "balance_loss_clip": 1.01801038, "balance_loss_mlp": 1.01934719, "epoch": 0.5188035472719074, "flos": 19859050650240.0, "grad_norm": 1.6456687569439328, "language_loss": 0.78179717, "learning_rate": 1.9752698134590927e-06, "loss": 0.80280209, "num_input_tokens_seen": 185477270, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.390625, "step": 8629, "time_per_iteration": 2.371837854385376 }, { "auxiliary_loss_clip": 0.01059167, "auxiliary_loss_mlp": 0.01041889, "balance_loss_clip": 1.01687872, "balance_loss_mlp": 1.01887751, "epoch": 0.5188636705245754, "flos": 21137360503680.0, "grad_norm": 2.0199238800565364, "language_loss": 0.75623369, "learning_rate": 1.9748803820235815e-06, "loss": 0.77724421, "num_input_tokens_seen": 185495795, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.40234375, "step": 8630, "time_per_iteration": 2.379490852355957 }, { "auxiliary_loss_clip": 0.01058046, "auxiliary_loss_mlp": 0.01038044, "balance_loss_clip": 1.0116148, "balance_loss_mlp": 1.01736426, "epoch": 0.5189237937772434, "flos": 22418672734080.0, "grad_norm": 2.1980064583612475, "language_loss": 0.81801069, "learning_rate": 1.9744909515406093e-06, "loss": 0.8389715, "num_input_tokens_seen": 185514885, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40625, "step": 8631, "time_per_iteration": 2.384612560272217 }, { "auxiliary_loss_clip": 0.01059566, "auxiliary_loss_mlp": 0.010432, "balance_loss_clip": 1.01787949, "balance_loss_mlp": 1.01920044, "epoch": 0.5189839170299113, "flos": 25445537809920.0, "grad_norm": 1.4743549493245562, "language_loss": 0.76430452, "learning_rate": 1.974101522024942e-06, "loss": 0.78533214, "num_input_tokens_seen": 185537155, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40429688, "step": 8632, "time_per_iteration": 2.4333744049072266 }, { "auxiliary_loss_clip": 0.01054669, "auxiliary_loss_mlp": 0.01033981, "balance_loss_clip": 1.0105679, "balance_loss_mlp": 1.01720583, "epoch": 0.5190440402825793, "flos": 18586745550720.0, "grad_norm": 1.8694246518284945, "language_loss": 0.79923856, "learning_rate": 1.9737120934913477e-06, "loss": 0.8201251, "num_input_tokens_seen": 185555520, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.375, "step": 8633, "time_per_iteration": 2.3420188426971436 }, { "auxiliary_loss_clip": 0.01057577, "auxiliary_loss_mlp": 0.01037935, "balance_loss_clip": 1.01235223, "balance_loss_mlp": 1.01840198, "epoch": 0.5191041635352472, "flos": 21907544296320.0, "grad_norm": 1.720547105511141, "language_loss": 0.81460118, "learning_rate": 1.9733226659545936e-06, "loss": 0.83555627, "num_input_tokens_seen": 185573855, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.39257812, "step": 8634, "time_per_iteration": 2.3956730365753174 }, { "auxiliary_loss_clip": 0.01057606, "auxiliary_loss_mlp": 0.01035104, "balance_loss_clip": 1.01089275, "balance_loss_mlp": 1.01864803, "epoch": 0.5191642867879153, "flos": 27526710355200.0, "grad_norm": 1.456809717712187, "language_loss": 0.7001794, "learning_rate": 1.9729332394294467e-06, "loss": 0.72110647, "num_input_tokens_seen": 185595145, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.390625, "step": 8635, "time_per_iteration": 2.432401418685913 }, { "auxiliary_loss_clip": 0.01060392, "auxiliary_loss_mlp": 0.0104091, "balance_loss_clip": 1.01574516, "balance_loss_mlp": 1.0193032, "epoch": 0.5192244100405832, "flos": 15705084286080.0, "grad_norm": 1.513113110721494, "language_loss": 0.78763294, "learning_rate": 1.9725438139306742e-06, "loss": 0.80864596, "num_input_tokens_seen": 185613320, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.41015625, "step": 8636, "time_per_iteration": 2.3771419525146484 }, { "auxiliary_loss_clip": 0.01059269, "auxiliary_loss_mlp": 0.01044053, "balance_loss_clip": 1.0178746, "balance_loss_mlp": 1.01825464, "epoch": 0.5192845332932512, "flos": 12056276517120.0, "grad_norm": 2.033632827937796, "language_loss": 0.73011541, "learning_rate": 1.9721543894730425e-06, "loss": 0.75114858, "num_input_tokens_seen": 185630730, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.41015625, "step": 8637, "time_per_iteration": 2.337094783782959 }, { "auxiliary_loss_clip": 0.01056853, "auxiliary_loss_mlp": 0.01038776, "balance_loss_clip": 1.01377773, "balance_loss_mlp": 1.01784885, "epoch": 0.5193446565459191, "flos": 18952181418240.0, "grad_norm": 1.7543237595746102, "language_loss": 0.76828229, "learning_rate": 1.9717649660713194e-06, "loss": 0.78923863, "num_input_tokens_seen": 185648515, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.390625, "step": 8638, "time_per_iteration": 3.646576404571533 }, { "auxiliary_loss_clip": 0.01056219, "auxiliary_loss_mlp": 0.01036945, "balance_loss_clip": 1.01174474, "balance_loss_mlp": 1.01822472, "epoch": 0.5194047797985871, "flos": 20373949514880.0, "grad_norm": 2.143636135548527, "language_loss": 0.75782728, "learning_rate": 1.971375543740272e-06, "loss": 0.77875894, "num_input_tokens_seen": 185665220, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38085938, "step": 8639, "time_per_iteration": 2.3743813037872314 }, { "auxiliary_loss_clip": 0.01056388, "auxiliary_loss_mlp": 0.01041553, "balance_loss_clip": 1.01659024, "balance_loss_mlp": 1.01738381, "epoch": 0.519464903051255, "flos": 24351848559360.0, "grad_norm": 1.6953667481796126, "language_loss": 0.78977895, "learning_rate": 1.9709861224946665e-06, "loss": 0.81075835, "num_input_tokens_seen": 185683750, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.390625, "step": 8640, "time_per_iteration": 3.8895034790039062 }, { "auxiliary_loss_clip": 0.01056702, "auxiliary_loss_mlp": 0.01043245, "balance_loss_clip": 1.0177815, "balance_loss_mlp": 1.01745439, "epoch": 0.519525026303923, "flos": 14061024362880.0, "grad_norm": 2.044175130160542, "language_loss": 0.67116296, "learning_rate": 1.97059670234927e-06, "loss": 0.69216239, "num_input_tokens_seen": 185700625, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.39257812, "step": 8641, "time_per_iteration": 2.3712549209594727 }, { "auxiliary_loss_clip": 0.01056977, "auxiliary_loss_mlp": 0.01040859, "balance_loss_clip": 1.01557457, "balance_loss_mlp": 1.01782191, "epoch": 0.519585149556591, "flos": 28834731642240.0, "grad_norm": 1.9689268059163851, "language_loss": 0.76775724, "learning_rate": 1.97020728331885e-06, "loss": 0.78873557, "num_input_tokens_seen": 185721155, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.39257812, "step": 8642, "time_per_iteration": 3.7600438594818115 }, { "auxiliary_loss_clip": 0.01054425, "auxiliary_loss_mlp": 0.01038518, "balance_loss_clip": 1.01527262, "balance_loss_mlp": 1.01672995, "epoch": 0.519645272809259, "flos": 25371871107840.0, "grad_norm": 1.5518916478325302, "language_loss": 0.8395499, "learning_rate": 1.9698178654181726e-06, "loss": 0.86047935, "num_input_tokens_seen": 185740990, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37695312, "step": 8643, "time_per_iteration": 2.410273313522339 }, { "auxiliary_loss_clip": 0.01059906, "auxiliary_loss_mlp": 0.01047641, "balance_loss_clip": 1.02034211, "balance_loss_mlp": 1.01919997, "epoch": 0.519705396061927, "flos": 25371731462400.0, "grad_norm": 1.5030227893070505, "language_loss": 0.71464056, "learning_rate": 1.969428448662004e-06, "loss": 0.73571604, "num_input_tokens_seen": 185762235, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40820312, "step": 8644, "time_per_iteration": 2.424198627471924 }, { "auxiliary_loss_clip": 0.01057804, "auxiliary_loss_mlp": 0.01038006, "balance_loss_clip": 1.01354384, "balance_loss_mlp": 1.01777649, "epoch": 0.5197655193145949, "flos": 28474951415040.0, "grad_norm": 1.6026946721507562, "language_loss": 0.81300634, "learning_rate": 1.9690390330651133e-06, "loss": 0.83396447, "num_input_tokens_seen": 185783415, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.40039062, "step": 8645, "time_per_iteration": 2.4513957500457764 }, { "auxiliary_loss_clip": 0.01055286, "auxiliary_loss_mlp": 0.01040646, "balance_loss_clip": 1.01457453, "balance_loss_mlp": 1.0165, "epoch": 0.5198256425672629, "flos": 20008164533760.0, "grad_norm": 1.9861857122117847, "language_loss": 0.79330468, "learning_rate": 1.968649618642264e-06, "loss": 0.814264, "num_input_tokens_seen": 185801345, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.38671875, "step": 8646, "time_per_iteration": 2.3493642807006836 }, { "auxiliary_loss_clip": 0.01058101, "auxiliary_loss_mlp": 0.01040186, "balance_loss_clip": 1.01559329, "balance_loss_mlp": 1.01887846, "epoch": 0.5198857658199308, "flos": 19827838028160.0, "grad_norm": 2.182512432837295, "language_loss": 0.66838747, "learning_rate": 1.9682602054082252e-06, "loss": 0.68937033, "num_input_tokens_seen": 185820815, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.39257812, "step": 8647, "time_per_iteration": 2.368234157562256 }, { "auxiliary_loss_clip": 0.01057993, "auxiliary_loss_mlp": 0.01046289, "balance_loss_clip": 1.01829898, "balance_loss_mlp": 1.0175395, "epoch": 0.5199458890725989, "flos": 24460777601280.0, "grad_norm": 2.026021143728582, "language_loss": 0.72862148, "learning_rate": 1.967870793377763e-06, "loss": 0.74966431, "num_input_tokens_seen": 185841450, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40429688, "step": 8648, "time_per_iteration": 3.804499626159668 }, { "auxiliary_loss_clip": 0.01059365, "auxiliary_loss_mlp": 0.01041386, "balance_loss_clip": 1.0141232, "balance_loss_mlp": 1.01848817, "epoch": 0.5200060123252668, "flos": 23403642410880.0, "grad_norm": 1.655959108293007, "language_loss": 0.6578145, "learning_rate": 1.967481382565642e-06, "loss": 0.6788221, "num_input_tokens_seen": 185859935, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40820312, "step": 8649, "time_per_iteration": 2.3656485080718994 }, { "auxiliary_loss_clip": 0.010587, "auxiliary_loss_mlp": 0.01047145, "balance_loss_clip": 1.01947689, "balance_loss_mlp": 1.01749313, "epoch": 0.5200661355779348, "flos": 17200414350720.0, "grad_norm": 1.7855501267155942, "language_loss": 0.71916592, "learning_rate": 1.9670919729866315e-06, "loss": 0.74022436, "num_input_tokens_seen": 185876795, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41210938, "step": 8650, "time_per_iteration": 2.3648929595947266 }, { "auxiliary_loss_clip": 0.01054738, "auxiliary_loss_mlp": 0.01039525, "balance_loss_clip": 1.01539707, "balance_loss_mlp": 1.01687396, "epoch": 0.5201262588306027, "flos": 18514091278080.0, "grad_norm": 1.7067828181771945, "language_loss": 0.79375482, "learning_rate": 1.966702564655496e-06, "loss": 0.8146975, "num_input_tokens_seen": 185895570, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.37890625, "step": 8651, "time_per_iteration": 2.3546831607818604 }, { "auxiliary_loss_clip": 0.01058339, "auxiliary_loss_mlp": 0.0104246, "balance_loss_clip": 1.01618648, "balance_loss_mlp": 1.01894593, "epoch": 0.5201863820832707, "flos": 18618551665920.0, "grad_norm": 1.724081196901747, "language_loss": 0.80076015, "learning_rate": 1.966313157587003e-06, "loss": 0.82176816, "num_input_tokens_seen": 185913700, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.39453125, "step": 8652, "time_per_iteration": 2.392047166824341 }, { "auxiliary_loss_clip": 0.01058429, "auxiliary_loss_mlp": 0.01043678, "balance_loss_clip": 1.01658177, "balance_loss_mlp": 1.01853395, "epoch": 0.5202465053359386, "flos": 22856029735680.0, "grad_norm": 1.8830591023143755, "language_loss": 0.71400857, "learning_rate": 1.9659237517959187e-06, "loss": 0.73502964, "num_input_tokens_seen": 185932460, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.3984375, "step": 8653, "time_per_iteration": 2.36360764503479 }, { "auxiliary_loss_clip": 0.01058808, "auxiliary_loss_mlp": 0.01052096, "balance_loss_clip": 1.02497625, "balance_loss_mlp": 1.0180645, "epoch": 0.5203066285886067, "flos": 21980442948480.0, "grad_norm": 1.5492832562629386, "language_loss": 0.78948599, "learning_rate": 1.965534347297008e-06, "loss": 0.81059504, "num_input_tokens_seen": 185952030, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40820312, "step": 8654, "time_per_iteration": 2.399876117706299 }, { "auxiliary_loss_clip": 0.01059661, "auxiliary_loss_mlp": 0.01049271, "balance_loss_clip": 1.02175784, "balance_loss_mlp": 1.01783538, "epoch": 0.5203667518412746, "flos": 20232201928320.0, "grad_norm": 1.9402366763628847, "language_loss": 0.85397243, "learning_rate": 1.9651449441050393e-06, "loss": 0.87506175, "num_input_tokens_seen": 185973130, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41796875, "step": 8655, "time_per_iteration": 2.4194746017456055 }, { "auxiliary_loss_clip": 0.01054995, "auxiliary_loss_mlp": 0.01043564, "balance_loss_clip": 1.0176959, "balance_loss_mlp": 1.01719058, "epoch": 0.5204268750939426, "flos": 15704560615680.0, "grad_norm": 2.195797486812235, "language_loss": 0.67257971, "learning_rate": 1.9647555422347777e-06, "loss": 0.69356537, "num_input_tokens_seen": 185990200, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37890625, "step": 8656, "time_per_iteration": 2.361248731613159 }, { "auxiliary_loss_clip": 0.01058111, "auxiliary_loss_mlp": 0.01038416, "balance_loss_clip": 1.01223779, "balance_loss_mlp": 1.01851368, "epoch": 0.5204869983466105, "flos": 27448365530880.0, "grad_norm": 1.8466992860615747, "language_loss": 0.7442261, "learning_rate": 1.9643661417009893e-06, "loss": 0.76519132, "num_input_tokens_seen": 186009880, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.39648438, "step": 8657, "time_per_iteration": 2.4133663177490234 }, { "auxiliary_loss_clip": 0.01057473, "auxiliary_loss_mlp": 0.01038786, "balance_loss_clip": 1.01232123, "balance_loss_mlp": 1.01815796, "epoch": 0.5205471215992785, "flos": 20594391039360.0, "grad_norm": 1.8935632879023778, "language_loss": 0.72271514, "learning_rate": 1.9639767425184408e-06, "loss": 0.74367768, "num_input_tokens_seen": 186026680, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.39257812, "step": 8658, "time_per_iteration": 2.374812602996826 }, { "auxiliary_loss_clip": 0.01055064, "auxiliary_loss_mlp": 0.01041907, "balance_loss_clip": 1.01677799, "balance_loss_mlp": 1.01631439, "epoch": 0.5206072448519465, "flos": 22126798834560.0, "grad_norm": 1.6594102980886032, "language_loss": 0.85135216, "learning_rate": 1.963587344701897e-06, "loss": 0.87232184, "num_input_tokens_seen": 186046920, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38671875, "step": 8659, "time_per_iteration": 2.3967931270599365 }, { "auxiliary_loss_clip": 0.01061159, "auxiliary_loss_mlp": 0.0105116, "balance_loss_clip": 1.02319312, "balance_loss_mlp": 1.01910973, "epoch": 0.5206673681046144, "flos": 18329505586560.0, "grad_norm": 41.71607671674362, "language_loss": 0.76919484, "learning_rate": 1.9631979482661253e-06, "loss": 0.79031801, "num_input_tokens_seen": 186062090, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41992188, "step": 8660, "time_per_iteration": 2.3444137573242188 }, { "auxiliary_loss_clip": 0.01055599, "auxiliary_loss_mlp": 0.0104087, "balance_loss_clip": 1.01571703, "balance_loss_mlp": 1.01754546, "epoch": 0.5207274913572825, "flos": 20229199551360.0, "grad_norm": 1.7635166035786252, "language_loss": 0.78894216, "learning_rate": 1.9628085532258906e-06, "loss": 0.80990684, "num_input_tokens_seen": 186081135, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38085938, "step": 8661, "time_per_iteration": 2.376197576522827 }, { "auxiliary_loss_clip": 0.01058799, "auxiliary_loss_mlp": 0.01040285, "balance_loss_clip": 1.01569223, "balance_loss_mlp": 1.0185008, "epoch": 0.5207876146099504, "flos": 22125960961920.0, "grad_norm": 1.7858020366396843, "language_loss": 0.71590358, "learning_rate": 1.9624191595959603e-06, "loss": 0.73689443, "num_input_tokens_seen": 186099700, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.40234375, "step": 8662, "time_per_iteration": 2.3949086666107178 }, { "auxiliary_loss_clip": 0.01056098, "auxiliary_loss_mlp": 0.01037542, "balance_loss_clip": 1.01386666, "balance_loss_mlp": 1.01815522, "epoch": 0.5208477378626184, "flos": 23877762940800.0, "grad_norm": 1.8697759171122728, "language_loss": 0.70388722, "learning_rate": 1.962029767391098e-06, "loss": 0.72482365, "num_input_tokens_seen": 186119740, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37890625, "step": 8663, "time_per_iteration": 2.451535940170288 }, { "auxiliary_loss_clip": 0.01057545, "auxiliary_loss_mlp": 0.01041801, "balance_loss_clip": 1.01686275, "balance_loss_mlp": 1.01859665, "epoch": 0.5209078611152863, "flos": 20960420400000.0, "grad_norm": 2.80791360781465, "language_loss": 0.7781893, "learning_rate": 1.961640376626072e-06, "loss": 0.79918277, "num_input_tokens_seen": 186140645, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38867188, "step": 8664, "time_per_iteration": 2.392019033432007 }, { "auxiliary_loss_clip": 0.01056844, "auxiliary_loss_mlp": 0.01037037, "balance_loss_clip": 1.01278996, "balance_loss_mlp": 1.01801038, "epoch": 0.5209679843679543, "flos": 20666696198400.0, "grad_norm": 1.9135389256522037, "language_loss": 0.77326179, "learning_rate": 1.961250987315646e-06, "loss": 0.7942006, "num_input_tokens_seen": 186160130, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.38867188, "step": 8665, "time_per_iteration": 2.3781957626342773 }, { "auxiliary_loss_clip": 0.01056736, "auxiliary_loss_mlp": 0.01044221, "balance_loss_clip": 1.02023578, "balance_loss_mlp": 1.01810145, "epoch": 0.5210281076206222, "flos": 20226336819840.0, "grad_norm": 1.7688649962937042, "language_loss": 0.73275435, "learning_rate": 1.960861599474586e-06, "loss": 0.75376385, "num_input_tokens_seen": 186179485, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.38671875, "step": 8666, "time_per_iteration": 2.3667633533477783 }, { "auxiliary_loss_clip": 0.01059395, "auxiliary_loss_mlp": 0.0104481, "balance_loss_clip": 1.01648629, "balance_loss_mlp": 1.01760554, "epoch": 0.5210882308732903, "flos": 16069088787840.0, "grad_norm": 2.047104075556123, "language_loss": 0.70703954, "learning_rate": 1.9604722131176592e-06, "loss": 0.72808158, "num_input_tokens_seen": 186197140, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.41796875, "step": 8667, "time_per_iteration": 2.3887619972229004 }, { "auxiliary_loss_clip": 0.01054107, "auxiliary_loss_mlp": 0.01038595, "balance_loss_clip": 1.01567101, "balance_loss_mlp": 1.01699197, "epoch": 0.5211483541259582, "flos": 24824188609920.0, "grad_norm": 1.4301182139453286, "language_loss": 0.81824553, "learning_rate": 1.960082828259629e-06, "loss": 0.8391726, "num_input_tokens_seen": 186216800, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.37109375, "step": 8668, "time_per_iteration": 2.408695697784424 }, { "auxiliary_loss_clip": 0.01056026, "auxiliary_loss_mlp": 0.010365, "balance_loss_clip": 1.01253855, "balance_loss_mlp": 1.01752687, "epoch": 0.5212084773786262, "flos": 20369760151680.0, "grad_norm": 2.025802953213226, "language_loss": 0.65592206, "learning_rate": 1.9596934449152623e-06, "loss": 0.67684728, "num_input_tokens_seen": 186235320, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.38476562, "step": 8669, "time_per_iteration": 2.3873908519744873 }, { "auxiliary_loss_clip": 0.01056576, "auxiliary_loss_mlp": 0.01041058, "balance_loss_clip": 1.01638162, "balance_loss_mlp": 1.01889777, "epoch": 0.5212686006312941, "flos": 23144447410560.0, "grad_norm": 1.542324685123087, "language_loss": 0.67225313, "learning_rate": 1.959304063099325e-06, "loss": 0.69322944, "num_input_tokens_seen": 186254460, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37695312, "step": 8670, "time_per_iteration": 2.3840346336364746 }, { "auxiliary_loss_clip": 0.01052771, "auxiliary_loss_mlp": 0.0103597, "balance_loss_clip": 1.01298606, "balance_loss_mlp": 1.01663911, "epoch": 0.5213287238839621, "flos": 27773023063680.0, "grad_norm": 2.0609820593283157, "language_loss": 0.77284163, "learning_rate": 1.9589146828265806e-06, "loss": 0.79372901, "num_input_tokens_seen": 186269465, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36132812, "step": 8671, "time_per_iteration": 2.4253759384155273 }, { "auxiliary_loss_clip": 0.01060616, "auxiliary_loss_mlp": 0.01050826, "balance_loss_clip": 1.02465987, "balance_loss_mlp": 1.02070892, "epoch": 0.5213888471366301, "flos": 19936662336000.0, "grad_norm": 2.016256790974508, "language_loss": 0.80003715, "learning_rate": 1.958525304111796e-06, "loss": 0.82115155, "num_input_tokens_seen": 186288660, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3984375, "step": 8672, "time_per_iteration": 2.3904175758361816 }, { "auxiliary_loss_clip": 0.01052667, "auxiliary_loss_mlp": 0.01036453, "balance_loss_clip": 1.01463771, "balance_loss_mlp": 1.01618886, "epoch": 0.521448970389298, "flos": 16981788216960.0, "grad_norm": 2.1274131667437195, "language_loss": 0.73971999, "learning_rate": 1.958135926969736e-06, "loss": 0.76061118, "num_input_tokens_seen": 186305760, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.36523438, "step": 8673, "time_per_iteration": 2.3626186847686768 }, { "auxiliary_loss_clip": 0.01056605, "auxiliary_loss_mlp": 0.01039759, "balance_loss_clip": 1.0151062, "balance_loss_mlp": 1.01747024, "epoch": 0.5215090936419661, "flos": 18988700567040.0, "grad_norm": 1.5500978301247603, "language_loss": 0.76374114, "learning_rate": 1.957746551415166e-06, "loss": 0.7847048, "num_input_tokens_seen": 186324135, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.390625, "step": 8674, "time_per_iteration": 2.3947479724884033 }, { "auxiliary_loss_clip": 0.01055741, "auxiliary_loss_mlp": 0.010389, "balance_loss_clip": 1.01398504, "balance_loss_mlp": 1.01615405, "epoch": 0.521569216894634, "flos": 16142511110400.0, "grad_norm": 2.3486525877211966, "language_loss": 0.86855829, "learning_rate": 1.9573571774628506e-06, "loss": 0.88950467, "num_input_tokens_seen": 186340205, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.39648438, "step": 8675, "time_per_iteration": 2.369410276412964 }, { "auxiliary_loss_clip": 0.01010522, "auxiliary_loss_mlp": 0.01006657, "balance_loss_clip": 1.00415313, "balance_loss_mlp": 1.00278497, "epoch": 0.521629340147302, "flos": 57576733113600.0, "grad_norm": 0.880460868265128, "language_loss": 0.6326282, "learning_rate": 1.9569678051275556e-06, "loss": 0.65280002, "num_input_tokens_seen": 186396940, "router_z_loss_clip": 0.02502441, "router_z_loss_mlp": 0.07714844, "step": 8676, "time_per_iteration": 2.965484857559204 }, { "auxiliary_loss_clip": 0.01054425, "auxiliary_loss_mlp": 0.01040803, "balance_loss_clip": 1.01694942, "balance_loss_mlp": 1.01682806, "epoch": 0.5216894633999699, "flos": 26795698974720.0, "grad_norm": 1.4937920120220312, "language_loss": 0.7000879, "learning_rate": 1.956578434424046e-06, "loss": 0.72104019, "num_input_tokens_seen": 186418680, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37695312, "step": 8677, "time_per_iteration": 2.454317331314087 }, { "auxiliary_loss_clip": 0.01055104, "auxiliary_loss_mlp": 0.01039411, "balance_loss_clip": 1.01612973, "balance_loss_mlp": 1.01666474, "epoch": 0.5217495866526379, "flos": 26357539011840.0, "grad_norm": 1.568851285072069, "language_loss": 0.66670763, "learning_rate": 1.956189065367086e-06, "loss": 0.68765277, "num_input_tokens_seen": 186438265, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.3828125, "step": 8678, "time_per_iteration": 3.6005992889404297 }, { "auxiliary_loss_clip": 0.01056691, "auxiliary_loss_mlp": 0.01042246, "balance_loss_clip": 1.01568627, "balance_loss_mlp": 1.01740086, "epoch": 0.5218097099053058, "flos": 23582956487040.0, "grad_norm": 2.1999663297170997, "language_loss": 0.70739549, "learning_rate": 1.9557996979714414e-06, "loss": 0.72838485, "num_input_tokens_seen": 186456870, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39257812, "step": 8679, "time_per_iteration": 3.789944648742676 }, { "auxiliary_loss_clip": 0.01058405, "auxiliary_loss_mlp": 0.01046075, "balance_loss_clip": 1.02158999, "balance_loss_mlp": 1.01898336, "epoch": 0.5218698331579739, "flos": 18076420074240.0, "grad_norm": 1.7376325251658797, "language_loss": 0.67851698, "learning_rate": 1.9554103322518764e-06, "loss": 0.69956177, "num_input_tokens_seen": 186476425, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.39453125, "step": 8680, "time_per_iteration": 2.397831916809082 }, { "auxiliary_loss_clip": 0.01057584, "auxiliary_loss_mlp": 0.01041728, "balance_loss_clip": 1.01574016, "balance_loss_mlp": 1.01833081, "epoch": 0.5219299564106418, "flos": 19280120618880.0, "grad_norm": 1.872790665761548, "language_loss": 0.84427702, "learning_rate": 1.955020968223156e-06, "loss": 0.86527026, "num_input_tokens_seen": 186492555, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39257812, "step": 8681, "time_per_iteration": 3.745870351791382 }, { "auxiliary_loss_clip": 0.01056102, "auxiliary_loss_mlp": 0.01038927, "balance_loss_clip": 1.01400042, "balance_loss_mlp": 1.01753974, "epoch": 0.5219900796633098, "flos": 26650146049920.0, "grad_norm": 1.774921802068531, "language_loss": 0.78890586, "learning_rate": 1.9546316059000454e-06, "loss": 0.80985612, "num_input_tokens_seen": 186513190, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.38476562, "step": 8682, "time_per_iteration": 2.4392757415771484 }, { "auxiliary_loss_clip": 0.01058631, "auxiliary_loss_mlp": 0.01041081, "balance_loss_clip": 1.0170126, "balance_loss_mlp": 1.01936913, "epoch": 0.5220502029159777, "flos": 34311312241920.0, "grad_norm": 2.066926550116428, "language_loss": 0.70765734, "learning_rate": 1.9542422452973082e-06, "loss": 0.7286545, "num_input_tokens_seen": 186534830, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.39257812, "step": 8683, "time_per_iteration": 2.487595319747925 }, { "auxiliary_loss_clip": 0.01058148, "auxiliary_loss_mlp": 0.01045641, "balance_loss_clip": 1.01872349, "balance_loss_mlp": 1.01811981, "epoch": 0.5221103261686457, "flos": 22155602572800.0, "grad_norm": 1.5941627395400075, "language_loss": 0.77391988, "learning_rate": 1.9538528864297104e-06, "loss": 0.79495776, "num_input_tokens_seen": 186554390, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40039062, "step": 8684, "time_per_iteration": 2.4316630363464355 }, { "auxiliary_loss_clip": 0.01055622, "auxiliary_loss_mlp": 0.01039792, "balance_loss_clip": 1.0143404, "balance_loss_mlp": 1.01739573, "epoch": 0.5221704494213137, "flos": 19207396523520.0, "grad_norm": 1.6527356822890986, "language_loss": 0.77366984, "learning_rate": 1.9534635293120153e-06, "loss": 0.79462397, "num_input_tokens_seen": 186572360, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3828125, "step": 8685, "time_per_iteration": 2.3657829761505127 }, { "auxiliary_loss_clip": 0.01060325, "auxiliary_loss_mlp": 0.01046015, "balance_loss_clip": 1.01832247, "balance_loss_mlp": 1.01989663, "epoch": 0.5222305726739817, "flos": 19353054182400.0, "grad_norm": 1.6907191965042778, "language_loss": 0.81427276, "learning_rate": 1.9530741739589876e-06, "loss": 0.83533615, "num_input_tokens_seen": 186590655, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40429688, "step": 8686, "time_per_iteration": 2.3812577724456787 }, { "auxiliary_loss_clip": 0.01054245, "auxiliary_loss_mlp": 0.01036897, "balance_loss_clip": 1.01235187, "balance_loss_mlp": 1.01727962, "epoch": 0.5222906959266497, "flos": 27813661752960.0, "grad_norm": 1.5345942980041698, "language_loss": 0.70987964, "learning_rate": 1.9526848203853927e-06, "loss": 0.73079097, "num_input_tokens_seen": 186610345, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.36914062, "step": 8687, "time_per_iteration": 2.422388792037964 }, { "auxiliary_loss_clip": 0.01055815, "auxiliary_loss_mlp": 0.01040858, "balance_loss_clip": 1.01764786, "balance_loss_mlp": 1.01816583, "epoch": 0.5223508191793176, "flos": 12712189829760.0, "grad_norm": 2.114508202058387, "language_loss": 0.83506423, "learning_rate": 1.9522954686059936e-06, "loss": 0.85603094, "num_input_tokens_seen": 186624360, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.375, "step": 8688, "time_per_iteration": 3.7761032581329346 }, { "auxiliary_loss_clip": 0.01058306, "auxiliary_loss_mlp": 0.0103881, "balance_loss_clip": 1.01294196, "balance_loss_mlp": 1.01936483, "epoch": 0.5224109424319856, "flos": 15631347761280.0, "grad_norm": 2.329607447682385, "language_loss": 0.75480485, "learning_rate": 1.9519061186355558e-06, "loss": 0.77577603, "num_input_tokens_seen": 186638680, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.390625, "step": 8689, "time_per_iteration": 2.347965717315674 }, { "auxiliary_loss_clip": 0.01055641, "auxiliary_loss_mlp": 0.01038865, "balance_loss_clip": 1.01420069, "balance_loss_mlp": 1.01770663, "epoch": 0.5224710656846535, "flos": 15741324144000.0, "grad_norm": 1.8728909652525534, "language_loss": 0.83874685, "learning_rate": 1.9515167704888417e-06, "loss": 0.85969186, "num_input_tokens_seen": 186655840, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.37890625, "step": 8690, "time_per_iteration": 2.3429031372070312 }, { "auxiliary_loss_clip": 0.01058858, "auxiliary_loss_mlp": 0.01045466, "balance_loss_clip": 1.01740396, "balance_loss_mlp": 1.01884937, "epoch": 0.5225311889373215, "flos": 26029809279360.0, "grad_norm": 2.147571767602476, "language_loss": 0.79388702, "learning_rate": 1.9511274241806173e-06, "loss": 0.8149302, "num_input_tokens_seen": 186674150, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40039062, "step": 8691, "time_per_iteration": 2.4180705547332764 }, { "auxiliary_loss_clip": 0.01060014, "auxiliary_loss_mlp": 0.0104668, "balance_loss_clip": 1.0172236, "balance_loss_mlp": 1.01905251, "epoch": 0.5225913121899894, "flos": 18368293973760.0, "grad_norm": 2.031352265679352, "language_loss": 0.78017777, "learning_rate": 1.950738079725646e-06, "loss": 0.80124474, "num_input_tokens_seen": 186690675, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.41015625, "step": 8692, "time_per_iteration": 2.3377621173858643 }, { "auxiliary_loss_clip": 0.01054945, "auxiliary_loss_mlp": 0.01039888, "balance_loss_clip": 1.01649904, "balance_loss_mlp": 1.01729131, "epoch": 0.5226514354426575, "flos": 29272367934720.0, "grad_norm": 1.7154108708746885, "language_loss": 0.73036647, "learning_rate": 1.950348737138691e-06, "loss": 0.75131488, "num_input_tokens_seen": 186710380, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.375, "step": 8693, "time_per_iteration": 2.421260118484497 }, { "auxiliary_loss_clip": 0.01061455, "auxiliary_loss_mlp": 0.01047297, "balance_loss_clip": 1.01798308, "balance_loss_mlp": 1.0196054, "epoch": 0.5227115586953254, "flos": 22852294220160.0, "grad_norm": 2.181704162236289, "language_loss": 0.83275342, "learning_rate": 1.949959396434517e-06, "loss": 0.85384089, "num_input_tokens_seen": 186729135, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.41796875, "step": 8694, "time_per_iteration": 2.4089062213897705 }, { "auxiliary_loss_clip": 0.01012283, "auxiliary_loss_mlp": 0.0100378, "balance_loss_clip": 1.00120533, "balance_loss_mlp": 1.00424361, "epoch": 0.5227716819479934, "flos": 57471539587200.0, "grad_norm": 0.7572355846963392, "language_loss": 0.55743909, "learning_rate": 1.949570057627888e-06, "loss": 0.5775997, "num_input_tokens_seen": 186791115, "router_z_loss_clip": 0.02575684, "router_z_loss_mlp": 0.08007812, "step": 8695, "time_per_iteration": 3.06791615486145 }, { "auxiliary_loss_clip": 0.01056979, "auxiliary_loss_mlp": 0.01040619, "balance_loss_clip": 1.01379716, "balance_loss_mlp": 1.01793647, "epoch": 0.5228318052006613, "flos": 13807415180160.0, "grad_norm": 1.8468687083546864, "language_loss": 0.74761295, "learning_rate": 1.9491807207335672e-06, "loss": 0.76858902, "num_input_tokens_seen": 186808660, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.390625, "step": 8696, "time_per_iteration": 2.3486523628234863 }, { "auxiliary_loss_clip": 0.01057762, "auxiliary_loss_mlp": 0.01044317, "balance_loss_clip": 1.0193069, "balance_loss_mlp": 1.01829433, "epoch": 0.5228919284533293, "flos": 15595282460160.0, "grad_norm": 1.524417154723536, "language_loss": 0.72870672, "learning_rate": 1.948791385766319e-06, "loss": 0.74972749, "num_input_tokens_seen": 186825900, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.39453125, "step": 8697, "time_per_iteration": 2.359988212585449 }, { "auxiliary_loss_clip": 0.01054828, "auxiliary_loss_mlp": 0.01039477, "balance_loss_clip": 1.01559937, "balance_loss_mlp": 1.01767457, "epoch": 0.5229520517059973, "flos": 22490419311360.0, "grad_norm": 2.3251333518047406, "language_loss": 0.82130867, "learning_rate": 1.948402052740906e-06, "loss": 0.84225178, "num_input_tokens_seen": 186843735, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37109375, "step": 8698, "time_per_iteration": 2.4124057292938232 }, { "auxiliary_loss_clip": 0.01055742, "auxiliary_loss_mlp": 0.01041344, "balance_loss_clip": 1.01748991, "balance_loss_mlp": 1.01819336, "epoch": 0.5230121749586653, "flos": 22089790926720.0, "grad_norm": 1.6961092317347655, "language_loss": 0.7534464, "learning_rate": 1.948012721672093e-06, "loss": 0.77441722, "num_input_tokens_seen": 186862440, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.375, "step": 8699, "time_per_iteration": 2.377918004989624 }, { "auxiliary_loss_clip": 0.01058924, "auxiliary_loss_mlp": 0.0104254, "balance_loss_clip": 1.01528835, "balance_loss_mlp": 1.01754665, "epoch": 0.5230722982113333, "flos": 22126065696000.0, "grad_norm": 1.773525905427694, "language_loss": 0.74855703, "learning_rate": 1.947623392574642e-06, "loss": 0.76957172, "num_input_tokens_seen": 186880940, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.4140625, "step": 8700, "time_per_iteration": 2.4055604934692383 }, { "auxiliary_loss_clip": 0.01059298, "auxiliary_loss_mlp": 0.01048571, "balance_loss_clip": 1.02111685, "balance_loss_mlp": 1.01854074, "epoch": 0.5231324214640012, "flos": 25008110985600.0, "grad_norm": 1.6412549369581064, "language_loss": 0.69239759, "learning_rate": 1.947234065463318e-06, "loss": 0.7134763, "num_input_tokens_seen": 186900785, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.40625, "step": 8701, "time_per_iteration": 2.4089200496673584 }, { "auxiliary_loss_clip": 0.01054605, "auxiliary_loss_mlp": 0.01041757, "balance_loss_clip": 1.01653254, "balance_loss_mlp": 1.01683521, "epoch": 0.5231925447166692, "flos": 25739296922880.0, "grad_norm": 2.8840722567661725, "language_loss": 0.6749472, "learning_rate": 1.9468447403528826e-06, "loss": 0.69591081, "num_input_tokens_seen": 186920895, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37695312, "step": 8702, "time_per_iteration": 2.4184858798980713 }, { "auxiliary_loss_clip": 0.01055387, "auxiliary_loss_mlp": 0.01038258, "balance_loss_clip": 1.01316404, "balance_loss_mlp": 1.01784325, "epoch": 0.5232526679693371, "flos": 21432865184640.0, "grad_norm": 1.8347647522178292, "language_loss": 0.77607536, "learning_rate": 1.946455417258101e-06, "loss": 0.79701185, "num_input_tokens_seen": 186940605, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.375, "step": 8703, "time_per_iteration": 2.367572069168091 }, { "auxiliary_loss_clip": 0.01061003, "auxiliary_loss_mlp": 0.01048823, "balance_loss_clip": 1.01824594, "balance_loss_mlp": 1.01948404, "epoch": 0.5233127912220051, "flos": 35296945234560.0, "grad_norm": 2.120800386725718, "language_loss": 0.78304303, "learning_rate": 1.9460660961937348e-06, "loss": 0.80414128, "num_input_tokens_seen": 186960820, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.41601562, "step": 8704, "time_per_iteration": 2.5978171825408936 }, { "auxiliary_loss_clip": 0.01054666, "auxiliary_loss_mlp": 0.0103905, "balance_loss_clip": 1.01428986, "balance_loss_mlp": 1.01776814, "epoch": 0.523372914474673, "flos": 17050497505920.0, "grad_norm": 2.191921423827496, "language_loss": 0.78889239, "learning_rate": 1.9456767771745474e-06, "loss": 0.80982959, "num_input_tokens_seen": 186976240, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36914062, "step": 8705, "time_per_iteration": 2.348168134689331 }, { "auxiliary_loss_clip": 0.01057959, "auxiliary_loss_mlp": 0.01040463, "balance_loss_clip": 1.0144161, "balance_loss_mlp": 1.01822066, "epoch": 0.5234330377273411, "flos": 18405301881600.0, "grad_norm": 1.8954177601980944, "language_loss": 0.70698822, "learning_rate": 1.9452874602153027e-06, "loss": 0.72797239, "num_input_tokens_seen": 186992855, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3984375, "step": 8706, "time_per_iteration": 2.381861686706543 }, { "auxiliary_loss_clip": 0.01011697, "auxiliary_loss_mlp": 0.01007064, "balance_loss_clip": 1.004632, "balance_loss_mlp": 1.00390267, "epoch": 0.523493160980009, "flos": 65846608727040.0, "grad_norm": 0.6831141602233511, "language_loss": 0.52601075, "learning_rate": 1.9448981453307623e-06, "loss": 0.54619837, "num_input_tokens_seen": 187051205, "router_z_loss_clip": 0.02429199, "router_z_loss_mlp": 0.078125, "step": 8707, "time_per_iteration": 3.065253734588623 }, { "auxiliary_loss_clip": 0.01056141, "auxiliary_loss_mlp": 0.01042789, "balance_loss_clip": 1.01829123, "balance_loss_mlp": 1.01763892, "epoch": 0.523553284232677, "flos": 21870990236160.0, "grad_norm": 1.640904618811821, "language_loss": 0.76192904, "learning_rate": 1.9445088325356904e-06, "loss": 0.78291833, "num_input_tokens_seen": 187070540, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.38476562, "step": 8708, "time_per_iteration": 2.4026901721954346 }, { "auxiliary_loss_clip": 0.010567, "auxiliary_loss_mlp": 0.01035751, "balance_loss_clip": 1.01254058, "balance_loss_mlp": 1.01959705, "epoch": 0.5236134074853449, "flos": 20847197260800.0, "grad_norm": 1.627696789144538, "language_loss": 0.78402996, "learning_rate": 1.944119521844849e-06, "loss": 0.80495447, "num_input_tokens_seen": 187089975, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.37109375, "step": 8709, "time_per_iteration": 2.3709378242492676 }, { "auxiliary_loss_clip": 0.01058698, "auxiliary_loss_mlp": 0.01043133, "balance_loss_clip": 1.01572728, "balance_loss_mlp": 1.01763427, "epoch": 0.5236735307380129, "flos": 25519239423360.0, "grad_norm": 1.9144941932729982, "language_loss": 0.85177374, "learning_rate": 1.9437302132730003e-06, "loss": 0.87279207, "num_input_tokens_seen": 187108775, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41015625, "step": 8710, "time_per_iteration": 2.402303695678711 }, { "auxiliary_loss_clip": 0.01054936, "auxiliary_loss_mlp": 0.01034695, "balance_loss_clip": 1.01128197, "balance_loss_mlp": 1.01835024, "epoch": 0.523733653990681, "flos": 23582083703040.0, "grad_norm": 1.9349589243215275, "language_loss": 0.71063024, "learning_rate": 1.943340906834908e-06, "loss": 0.73152655, "num_input_tokens_seen": 187128830, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36523438, "step": 8711, "time_per_iteration": 2.3834891319274902 }, { "auxiliary_loss_clip": 0.01056826, "auxiliary_loss_mlp": 0.01038041, "balance_loss_clip": 1.01486659, "balance_loss_mlp": 1.01876092, "epoch": 0.5237937772433489, "flos": 21105170363520.0, "grad_norm": 1.6964714420456806, "language_loss": 0.84106719, "learning_rate": 1.9429516025453345e-06, "loss": 0.8620159, "num_input_tokens_seen": 187149570, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37890625, "step": 8712, "time_per_iteration": 2.3998725414276123 }, { "auxiliary_loss_clip": 0.01058687, "auxiliary_loss_mlp": 0.01045626, "balance_loss_clip": 1.01970983, "balance_loss_mlp": 1.01960504, "epoch": 0.5238539004960169, "flos": 19171854892800.0, "grad_norm": 3.5023494652383147, "language_loss": 0.7064721, "learning_rate": 1.9425623004190415e-06, "loss": 0.72751522, "num_input_tokens_seen": 187170575, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.390625, "step": 8713, "time_per_iteration": 2.388051748275757 }, { "auxiliary_loss_clip": 0.0105999, "auxiliary_loss_mlp": 0.01045837, "balance_loss_clip": 1.01843083, "balance_loss_mlp": 1.01939476, "epoch": 0.5239140237486848, "flos": 17887435551360.0, "grad_norm": 2.963735677675801, "language_loss": 0.79046351, "learning_rate": 1.9421730004707925e-06, "loss": 0.81152177, "num_input_tokens_seen": 187187190, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40625, "step": 8714, "time_per_iteration": 2.3725767135620117 }, { "auxiliary_loss_clip": 0.0105969, "auxiliary_loss_mlp": 0.01033738, "balance_loss_clip": 1.00872815, "balance_loss_mlp": 1.02109134, "epoch": 0.5239741470013528, "flos": 17929470695040.0, "grad_norm": 1.8576815591205822, "language_loss": 0.77284169, "learning_rate": 1.9417837027153483e-06, "loss": 0.79377598, "num_input_tokens_seen": 187204350, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38671875, "step": 8715, "time_per_iteration": 2.354696273803711 }, { "auxiliary_loss_clip": 0.01056281, "auxiliary_loss_mlp": 0.01036607, "balance_loss_clip": 1.01369488, "balance_loss_mlp": 1.01938128, "epoch": 0.5240342702540207, "flos": 30992049596160.0, "grad_norm": 1.5690556272342817, "language_loss": 0.71948993, "learning_rate": 1.9413944071674723e-06, "loss": 0.74041891, "num_input_tokens_seen": 187225605, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.36914062, "step": 8716, "time_per_iteration": 2.485448122024536 }, { "auxiliary_loss_clip": 0.01056286, "auxiliary_loss_mlp": 0.0103772, "balance_loss_clip": 1.01477277, "balance_loss_mlp": 1.01900887, "epoch": 0.5240943935066887, "flos": 25004026356480.0, "grad_norm": 1.7689617245951055, "language_loss": 0.87473756, "learning_rate": 1.941005113841926e-06, "loss": 0.89567763, "num_input_tokens_seen": 187241335, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.37304688, "step": 8717, "time_per_iteration": 3.622506618499756 }, { "auxiliary_loss_clip": 0.01056914, "auxiliary_loss_mlp": 0.01037624, "balance_loss_clip": 1.01481926, "balance_loss_mlp": 1.01910019, "epoch": 0.5241545167593566, "flos": 23657984732160.0, "grad_norm": 2.4078394416600593, "language_loss": 0.62718987, "learning_rate": 1.9406158227534723e-06, "loss": 0.64813524, "num_input_tokens_seen": 187259925, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.37890625, "step": 8718, "time_per_iteration": 2.4041476249694824 }, { "auxiliary_loss_clip": 0.01058592, "auxiliary_loss_mlp": 0.01048287, "balance_loss_clip": 1.0211792, "balance_loss_mlp": 1.01899672, "epoch": 0.5242146400120247, "flos": 23399383224960.0, "grad_norm": 1.8191480582373898, "language_loss": 0.73149961, "learning_rate": 1.940226533916872e-06, "loss": 0.75256848, "num_input_tokens_seen": 187279035, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.39648438, "step": 8719, "time_per_iteration": 3.829763174057007 }, { "auxiliary_loss_clip": 0.01053965, "auxiliary_loss_mlp": 0.01040217, "balance_loss_clip": 1.01992726, "balance_loss_mlp": 1.01805866, "epoch": 0.5242747632646926, "flos": 17748096848640.0, "grad_norm": 1.8177636261561498, "language_loss": 0.74276817, "learning_rate": 1.9398372473468877e-06, "loss": 0.7637099, "num_input_tokens_seen": 187297555, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.359375, "step": 8720, "time_per_iteration": 2.3425824642181396 }, { "auxiliary_loss_clip": 0.01056746, "auxiliary_loss_mlp": 0.01042746, "balance_loss_clip": 1.01826036, "balance_loss_mlp": 1.01898074, "epoch": 0.5243348865173606, "flos": 32596378525440.0, "grad_norm": 1.6075355060439904, "language_loss": 0.71082437, "learning_rate": 1.939447963058281e-06, "loss": 0.73181927, "num_input_tokens_seen": 187320265, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37890625, "step": 8721, "time_per_iteration": 3.890228271484375 }, { "auxiliary_loss_clip": 0.01054142, "auxiliary_loss_mlp": 0.01046017, "balance_loss_clip": 1.02323627, "balance_loss_mlp": 1.01734602, "epoch": 0.5243950097700285, "flos": 25482929742720.0, "grad_norm": 1.5899296682751258, "language_loss": 0.87446201, "learning_rate": 1.939058681065813e-06, "loss": 0.89546371, "num_input_tokens_seen": 187338045, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36914062, "step": 8722, "time_per_iteration": 2.3921303749084473 }, { "auxiliary_loss_clip": 0.01056507, "auxiliary_loss_mlp": 0.01036737, "balance_loss_clip": 1.01345551, "balance_loss_mlp": 1.01989698, "epoch": 0.5244551330226965, "flos": 15267482904960.0, "grad_norm": 1.8859182991268613, "language_loss": 0.80674177, "learning_rate": 1.938669401384247e-06, "loss": 0.82767421, "num_input_tokens_seen": 187356040, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.3671875, "step": 8723, "time_per_iteration": 2.4017889499664307 }, { "auxiliary_loss_clip": 0.0105832, "auxiliary_loss_mlp": 0.01052425, "balance_loss_clip": 1.02509058, "balance_loss_mlp": 1.01953804, "epoch": 0.5245152562753645, "flos": 22236007167360.0, "grad_norm": 1.7878959975678363, "language_loss": 0.76610428, "learning_rate": 1.9382801240283426e-06, "loss": 0.78721172, "num_input_tokens_seen": 187374185, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.38671875, "step": 8724, "time_per_iteration": 2.3943753242492676 }, { "auxiliary_loss_clip": 0.01057443, "auxiliary_loss_mlp": 0.01043502, "balance_loss_clip": 1.01769304, "balance_loss_mlp": 1.01763058, "epoch": 0.5245753795280325, "flos": 29425112599680.0, "grad_norm": 1.5237493746677904, "language_loss": 0.71331334, "learning_rate": 1.9378908490128625e-06, "loss": 0.73432279, "num_input_tokens_seen": 187396640, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3984375, "step": 8725, "time_per_iteration": 2.4625654220581055 }, { "auxiliary_loss_clip": 0.01012062, "auxiliary_loss_mlp": 0.01008948, "balance_loss_clip": 1.00565839, "balance_loss_mlp": 1.00400853, "epoch": 0.5246355027807005, "flos": 58831196641920.0, "grad_norm": 0.7689618128825894, "language_loss": 0.55715448, "learning_rate": 1.937501576352568e-06, "loss": 0.57736456, "num_input_tokens_seen": 187455945, "router_z_loss_clip": 0.03295898, "router_z_loss_mlp": 0.08056641, "step": 8726, "time_per_iteration": 3.042745590209961 }, { "auxiliary_loss_clip": 0.01010862, "auxiliary_loss_mlp": 0.01014743, "balance_loss_clip": 1.0117749, "balance_loss_mlp": 1.00303054, "epoch": 0.5246956260333684, "flos": 64523226441600.0, "grad_norm": 0.8081085402575653, "language_loss": 0.58449149, "learning_rate": 1.937112306062219e-06, "loss": 0.60474759, "num_input_tokens_seen": 187519975, "router_z_loss_clip": 0.02966309, "router_z_loss_mlp": 0.078125, "step": 8727, "time_per_iteration": 4.44676947593689 }, { "auxiliary_loss_clip": 0.01056891, "auxiliary_loss_mlp": 0.01041721, "balance_loss_clip": 1.0174737, "balance_loss_mlp": 1.01760566, "epoch": 0.5247557492860364, "flos": 24532524178560.0, "grad_norm": 1.264401217922183, "language_loss": 0.71490341, "learning_rate": 1.9367230381565786e-06, "loss": 0.73588955, "num_input_tokens_seen": 187541775, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.39257812, "step": 8728, "time_per_iteration": 2.425903558731079 }, { "auxiliary_loss_clip": 0.01054135, "auxiliary_loss_mlp": 0.01042027, "balance_loss_clip": 1.01898408, "balance_loss_mlp": 1.01787412, "epoch": 0.5248158725387043, "flos": 18805162216320.0, "grad_norm": 1.4406759006257879, "language_loss": 0.7011348, "learning_rate": 1.9363337726504062e-06, "loss": 0.72209644, "num_input_tokens_seen": 187560425, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36328125, "step": 8729, "time_per_iteration": 2.3614766597747803 }, { "auxiliary_loss_clip": 0.01057091, "auxiliary_loss_mlp": 0.01048603, "balance_loss_clip": 1.0237124, "balance_loss_mlp": 1.01896799, "epoch": 0.5248759957913723, "flos": 20954904405120.0, "grad_norm": 1.7599484946186146, "language_loss": 0.84838784, "learning_rate": 1.935944509558464e-06, "loss": 0.86944485, "num_input_tokens_seen": 187579930, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.38085938, "step": 8730, "time_per_iteration": 2.3882665634155273 }, { "auxiliary_loss_clip": 0.01058447, "auxiliary_loss_mlp": 0.01043617, "balance_loss_clip": 1.01908422, "balance_loss_mlp": 1.02000403, "epoch": 0.5249361190440403, "flos": 18659993316480.0, "grad_norm": 1.9941384161614533, "language_loss": 0.8063423, "learning_rate": 1.9355552488955125e-06, "loss": 0.8273629, "num_input_tokens_seen": 187595365, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.38671875, "step": 8731, "time_per_iteration": 2.360391855239868 }, { "auxiliary_loss_clip": 0.01053243, "auxiliary_loss_mlp": 0.01037033, "balance_loss_clip": 1.01608753, "balance_loss_mlp": 1.01767635, "epoch": 0.5249962422967083, "flos": 24862174035840.0, "grad_norm": 1.9409676628356936, "language_loss": 0.84366351, "learning_rate": 1.935165990676312e-06, "loss": 0.86456627, "num_input_tokens_seen": 187614715, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.35546875, "step": 8732, "time_per_iteration": 2.4193758964538574 }, { "auxiliary_loss_clip": 0.01058635, "auxiliary_loss_mlp": 0.01040968, "balance_loss_clip": 1.01841354, "balance_loss_mlp": 1.02113891, "epoch": 0.5250563655493762, "flos": 15261931998720.0, "grad_norm": 1.515051279518933, "language_loss": 0.78927875, "learning_rate": 1.9347767349156237e-06, "loss": 0.81027472, "num_input_tokens_seen": 187630745, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.375, "step": 8733, "time_per_iteration": 2.349703550338745 }, { "auxiliary_loss_clip": 0.01060225, "auxiliary_loss_mlp": 0.01039158, "balance_loss_clip": 1.01605535, "balance_loss_mlp": 1.02103329, "epoch": 0.5251164888020442, "flos": 18624172394880.0, "grad_norm": 1.9932740130071283, "language_loss": 0.82879496, "learning_rate": 1.934387481628208e-06, "loss": 0.84978878, "num_input_tokens_seen": 187648200, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.39257812, "step": 8734, "time_per_iteration": 2.378478527069092 }, { "auxiliary_loss_clip": 0.010596, "auxiliary_loss_mlp": 0.01040007, "balance_loss_clip": 1.01664221, "balance_loss_mlp": 1.02262902, "epoch": 0.5251766120547121, "flos": 29709620202240.0, "grad_norm": 1.3360558111626202, "language_loss": 0.77340525, "learning_rate": 1.933998230828826e-06, "loss": 0.79440135, "num_input_tokens_seen": 187669205, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36914062, "step": 8735, "time_per_iteration": 2.4504826068878174 }, { "auxiliary_loss_clip": 0.01060773, "auxiliary_loss_mlp": 0.01042695, "balance_loss_clip": 1.02056956, "balance_loss_mlp": 1.02180052, "epoch": 0.5252367353073801, "flos": 23439184041600.0, "grad_norm": 1.60824494173692, "language_loss": 0.81755507, "learning_rate": 1.9336089825322376e-06, "loss": 0.83858979, "num_input_tokens_seen": 187690890, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.390625, "step": 8736, "time_per_iteration": 2.4420132637023926 }, { "auxiliary_loss_clip": 0.01060529, "auxiliary_loss_mlp": 0.01044223, "balance_loss_clip": 1.02022648, "balance_loss_mlp": 1.02254438, "epoch": 0.5252968585600482, "flos": 30809384029440.0, "grad_norm": 2.2306287375429674, "language_loss": 0.71547711, "learning_rate": 1.9332197367532033e-06, "loss": 0.73652458, "num_input_tokens_seen": 187713045, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37890625, "step": 8737, "time_per_iteration": 2.45532488822937 }, { "auxiliary_loss_clip": 0.01060789, "auxiliary_loss_mlp": 0.01042946, "balance_loss_clip": 1.01971257, "balance_loss_mlp": 1.02290571, "epoch": 0.5253569818127161, "flos": 20627314318080.0, "grad_norm": 1.4964947883761246, "language_loss": 0.78298426, "learning_rate": 1.9328304935064833e-06, "loss": 0.8040216, "num_input_tokens_seen": 187733640, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37890625, "step": 8738, "time_per_iteration": 2.4596011638641357 }, { "auxiliary_loss_clip": 0.01020948, "auxiliary_loss_mlp": 0.01004385, "balance_loss_clip": 1.00170302, "balance_loss_mlp": 1.0132798, "epoch": 0.5254171050653841, "flos": 63425452561920.0, "grad_norm": 0.7369338861809099, "language_loss": 0.54468799, "learning_rate": 1.932441252806837e-06, "loss": 0.56494135, "num_input_tokens_seen": 187792930, "router_z_loss_clip": 0.02685547, "router_z_loss_mlp": 0.07666016, "step": 8739, "time_per_iteration": 2.995771884918213 }, { "auxiliary_loss_clip": 0.01057626, "auxiliary_loss_mlp": 0.01045663, "balance_loss_clip": 1.02393115, "balance_loss_mlp": 1.02084923, "epoch": 0.525477228318052, "flos": 34669556369280.0, "grad_norm": 1.6292068314972095, "language_loss": 0.8557179, "learning_rate": 1.9320520146690263e-06, "loss": 0.87675071, "num_input_tokens_seen": 187812495, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.3671875, "step": 8740, "time_per_iteration": 2.4990029335021973 }, { "auxiliary_loss_clip": 0.01059315, "auxiliary_loss_mlp": 0.0104741, "balance_loss_clip": 1.02365172, "balance_loss_mlp": 1.02172267, "epoch": 0.52553735157072, "flos": 17929889631360.0, "grad_norm": 2.2247570088756525, "language_loss": 0.70634121, "learning_rate": 1.9316627791078093e-06, "loss": 0.72740841, "num_input_tokens_seen": 187829685, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37695312, "step": 8741, "time_per_iteration": 2.371375322341919 }, { "auxiliary_loss_clip": 0.01061868, "auxiliary_loss_mlp": 0.01046415, "balance_loss_clip": 1.02222764, "balance_loss_mlp": 1.02226734, "epoch": 0.5255974748233879, "flos": 9940120922880.0, "grad_norm": 2.282515297047375, "language_loss": 0.67687726, "learning_rate": 1.931273546137947e-06, "loss": 0.69796002, "num_input_tokens_seen": 187846495, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.39648438, "step": 8742, "time_per_iteration": 2.393458366394043 }, { "auxiliary_loss_clip": 0.01060959, "auxiliary_loss_mlp": 0.01049807, "balance_loss_clip": 1.02387881, "balance_loss_mlp": 1.0211519, "epoch": 0.5256575980760559, "flos": 16867622471040.0, "grad_norm": 3.8463710977476433, "language_loss": 0.64764762, "learning_rate": 1.9308843157741983e-06, "loss": 0.66875529, "num_input_tokens_seen": 187862010, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3984375, "step": 8743, "time_per_iteration": 2.3419594764709473 }, { "auxiliary_loss_clip": 0.01016438, "auxiliary_loss_mlp": 0.01005366, "balance_loss_clip": 1.00317216, "balance_loss_mlp": 1.00875866, "epoch": 0.5257177213287239, "flos": 62382561206400.0, "grad_norm": 0.7874550288716761, "language_loss": 0.54152644, "learning_rate": 1.930495088031323e-06, "loss": 0.56174445, "num_input_tokens_seen": 187922730, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.07666016, "step": 8744, "time_per_iteration": 3.142137050628662 }, { "auxiliary_loss_clip": 0.01060769, "auxiliary_loss_mlp": 0.0105516, "balance_loss_clip": 1.02827835, "balance_loss_mlp": 1.02035952, "epoch": 0.5257778445813919, "flos": 20775869619840.0, "grad_norm": 2.0409896473815468, "language_loss": 0.77576262, "learning_rate": 1.9301058629240814e-06, "loss": 0.79692191, "num_input_tokens_seen": 187940160, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40429688, "step": 8745, "time_per_iteration": 2.378455638885498 }, { "auxiliary_loss_clip": 0.01057233, "auxiliary_loss_mlp": 0.01046769, "balance_loss_clip": 1.02348781, "balance_loss_mlp": 1.01977539, "epoch": 0.5258379678340598, "flos": 17017678961280.0, "grad_norm": 2.154127393503611, "language_loss": 0.83249569, "learning_rate": 1.9297166404672324e-06, "loss": 0.85353571, "num_input_tokens_seen": 187958625, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.375, "step": 8746, "time_per_iteration": 2.375123977661133 }, { "auxiliary_loss_clip": 0.01056128, "auxiliary_loss_mlp": 0.0104867, "balance_loss_clip": 1.02482855, "balance_loss_mlp": 1.01883245, "epoch": 0.5258980910867278, "flos": 21067708608000.0, "grad_norm": 1.7890727202434968, "language_loss": 0.76738036, "learning_rate": 1.9293274206755353e-06, "loss": 0.78842831, "num_input_tokens_seen": 187977575, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37304688, "step": 8747, "time_per_iteration": 2.384676694869995 }, { "auxiliary_loss_clip": 0.01051973, "auxiliary_loss_mlp": 0.01049705, "balance_loss_clip": 1.02455163, "balance_loss_mlp": 1.01661134, "epoch": 0.5259582143393957, "flos": 18003486510720.0, "grad_norm": 1.7655506844856546, "language_loss": 0.83793688, "learning_rate": 1.9289382035637505e-06, "loss": 0.8589536, "num_input_tokens_seen": 187996650, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.35351562, "step": 8748, "time_per_iteration": 2.3717947006225586 }, { "auxiliary_loss_clip": 0.0105523, "auxiliary_loss_mlp": 0.01049268, "balance_loss_clip": 1.02343512, "balance_loss_mlp": 1.017308, "epoch": 0.5260183375920637, "flos": 22782747058560.0, "grad_norm": 2.0777534307082606, "language_loss": 0.82299793, "learning_rate": 1.9285489891466345e-06, "loss": 0.84404296, "num_input_tokens_seen": 188013510, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37890625, "step": 8749, "time_per_iteration": 2.385683059692383 }, { "auxiliary_loss_clip": 0.01056037, "auxiliary_loss_mlp": 0.01053717, "balance_loss_clip": 1.03044772, "balance_loss_mlp": 1.01872849, "epoch": 0.5260784608447318, "flos": 27051193370880.0, "grad_norm": 2.0238846471825673, "language_loss": 0.74046791, "learning_rate": 1.9281597774389487e-06, "loss": 0.76156545, "num_input_tokens_seen": 188032085, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37304688, "step": 8750, "time_per_iteration": 2.45823073387146 }, { "auxiliary_loss_clip": 0.01055878, "auxiliary_loss_mlp": 0.01043941, "balance_loss_clip": 1.02052844, "balance_loss_mlp": 1.01765072, "epoch": 0.5261385840973997, "flos": 20661913342080.0, "grad_norm": 1.314274524467246, "language_loss": 0.7751323, "learning_rate": 1.9277705684554517e-06, "loss": 0.79613042, "num_input_tokens_seen": 188050590, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3828125, "step": 8751, "time_per_iteration": 2.38356614112854 }, { "auxiliary_loss_clip": 0.01054179, "auxiliary_loss_mlp": 0.01043881, "balance_loss_clip": 1.0217315, "balance_loss_mlp": 1.01815772, "epoch": 0.5261987073500677, "flos": 23621535406080.0, "grad_norm": 1.3845837319638385, "language_loss": 0.77462542, "learning_rate": 1.927381362210902e-06, "loss": 0.79560602, "num_input_tokens_seen": 188071620, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.359375, "step": 8752, "time_per_iteration": 2.432647466659546 }, { "auxiliary_loss_clip": 0.01058172, "auxiliary_loss_mlp": 0.01045501, "balance_loss_clip": 1.01976347, "balance_loss_mlp": 1.01915622, "epoch": 0.5262588306027356, "flos": 27635010992640.0, "grad_norm": 1.4733096638464633, "language_loss": 0.68351376, "learning_rate": 1.926992158720058e-06, "loss": 0.7045505, "num_input_tokens_seen": 188091740, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.390625, "step": 8753, "time_per_iteration": 2.4235689640045166 }, { "auxiliary_loss_clip": 0.01058512, "auxiliary_loss_mlp": 0.0104702, "balance_loss_clip": 1.02397704, "balance_loss_mlp": 1.02122831, "epoch": 0.5263189538554036, "flos": 21758709703680.0, "grad_norm": 1.4941498728070772, "language_loss": 0.84941691, "learning_rate": 1.9266029579976785e-06, "loss": 0.87047231, "num_input_tokens_seen": 188111165, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.37304688, "step": 8754, "time_per_iteration": 2.404838800430298 }, { "auxiliary_loss_clip": 0.01060328, "auxiliary_loss_mlp": 0.01043791, "balance_loss_clip": 1.01973426, "balance_loss_mlp": 1.0212115, "epoch": 0.5263790771080715, "flos": 14275670601600.0, "grad_norm": 1.8314818885106496, "language_loss": 0.88412648, "learning_rate": 1.926213760058522e-06, "loss": 0.90516764, "num_input_tokens_seen": 188127825, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.390625, "step": 8755, "time_per_iteration": 2.351964235305786 }, { "auxiliary_loss_clip": 0.01017915, "auxiliary_loss_mlp": 0.01007243, "balance_loss_clip": 1.00488257, "balance_loss_mlp": 1.01041031, "epoch": 0.5264392003607395, "flos": 65802932749440.0, "grad_norm": 1.0386365409873706, "language_loss": 0.58940661, "learning_rate": 1.9258245649173477e-06, "loss": 0.60965812, "num_input_tokens_seen": 188194050, "router_z_loss_clip": 0.02355957, "router_z_loss_mlp": 0.07519531, "step": 8756, "time_per_iteration": 3.092277765274048 }, { "auxiliary_loss_clip": 0.01062733, "auxiliary_loss_mlp": 0.01039129, "balance_loss_clip": 1.01444101, "balance_loss_mlp": 1.02263546, "epoch": 0.5264993236134075, "flos": 21031364016000.0, "grad_norm": 1.7845334963678385, "language_loss": 0.7136035, "learning_rate": 1.925435372588913e-06, "loss": 0.73462212, "num_input_tokens_seen": 188212565, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.40039062, "step": 8757, "time_per_iteration": 3.681122064590454 }, { "auxiliary_loss_clip": 0.01063399, "auxiliary_loss_mlp": 0.01045894, "balance_loss_clip": 1.01976359, "balance_loss_mlp": 1.02334428, "epoch": 0.5265594468660755, "flos": 16617260044800.0, "grad_norm": 1.5289496053062177, "language_loss": 0.88919771, "learning_rate": 1.9250461830879768e-06, "loss": 0.91029066, "num_input_tokens_seen": 188229505, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40039062, "step": 8758, "time_per_iteration": 2.380053758621216 }, { "auxiliary_loss_clip": 0.0106221, "auxiliary_loss_mlp": 0.01043696, "balance_loss_clip": 1.01650417, "balance_loss_mlp": 1.02185965, "epoch": 0.5266195701187434, "flos": 24132978046080.0, "grad_norm": 2.781284707831141, "language_loss": 0.77075654, "learning_rate": 1.9246569964292965e-06, "loss": 0.79181564, "num_input_tokens_seen": 188250395, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40234375, "step": 8759, "time_per_iteration": 3.9452333450317383 }, { "auxiliary_loss_clip": 0.01060951, "auxiliary_loss_mlp": 0.01040235, "balance_loss_clip": 1.01611876, "balance_loss_mlp": 1.02289939, "epoch": 0.5266796933714114, "flos": 15843410559360.0, "grad_norm": 2.7964519017675427, "language_loss": 0.72968918, "learning_rate": 1.9242678126276307e-06, "loss": 0.75070101, "num_input_tokens_seen": 188266785, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.38085938, "step": 8760, "time_per_iteration": 3.7207367420196533 }, { "auxiliary_loss_clip": 0.01065175, "auxiliary_loss_mlp": 0.01045219, "balance_loss_clip": 1.01930261, "balance_loss_mlp": 1.02468121, "epoch": 0.5267398166240793, "flos": 20950610307840.0, "grad_norm": 2.1157488582224486, "language_loss": 0.78108966, "learning_rate": 1.923878631697736e-06, "loss": 0.80219352, "num_input_tokens_seen": 188282525, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40625, "step": 8761, "time_per_iteration": 2.3809332847595215 }, { "auxiliary_loss_clip": 0.01064633, "auxiliary_loss_mlp": 0.01042472, "balance_loss_clip": 1.01916623, "balance_loss_mlp": 1.02669811, "epoch": 0.5267999398767473, "flos": 20995333626240.0, "grad_norm": 1.7358435903627882, "language_loss": 0.71560919, "learning_rate": 1.923489453654373e-06, "loss": 0.73668027, "num_input_tokens_seen": 188301395, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37890625, "step": 8762, "time_per_iteration": 2.4480535984039307 }, { "auxiliary_loss_clip": 0.01025891, "auxiliary_loss_mlp": 0.01024549, "balance_loss_clip": 1.0218668, "balance_loss_mlp": 1.01834786, "epoch": 0.5268600631294152, "flos": 66846312864000.0, "grad_norm": 0.9306984605231466, "language_loss": 0.65718877, "learning_rate": 1.9231002785122963e-06, "loss": 0.67769313, "num_input_tokens_seen": 188357665, "router_z_loss_clip": 0.02685547, "router_z_loss_mlp": 0.07519531, "step": 8763, "time_per_iteration": 2.926093339920044 }, { "auxiliary_loss_clip": 0.0106348, "auxiliary_loss_mlp": 0.01043581, "balance_loss_clip": 1.01758146, "balance_loss_mlp": 1.02470195, "epoch": 0.5269201863820833, "flos": 17164593429120.0, "grad_norm": 2.0015222711613245, "language_loss": 0.72594279, "learning_rate": 1.922711106286265e-06, "loss": 0.74701345, "num_input_tokens_seen": 188376935, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.38671875, "step": 8764, "time_per_iteration": 2.400773525238037 }, { "auxiliary_loss_clip": 0.01064299, "auxiliary_loss_mlp": 0.01043256, "balance_loss_clip": 1.01645756, "balance_loss_mlp": 1.0248183, "epoch": 0.5269803096347513, "flos": 20521527298560.0, "grad_norm": 1.8224268175574079, "language_loss": 0.7546376, "learning_rate": 1.9223219369910368e-06, "loss": 0.77571309, "num_input_tokens_seen": 188394995, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.39453125, "step": 8765, "time_per_iteration": 2.4014012813568115 }, { "auxiliary_loss_clip": 0.01063035, "auxiliary_loss_mlp": 0.01053533, "balance_loss_clip": 1.02570939, "balance_loss_mlp": 1.02259564, "epoch": 0.5270404328874192, "flos": 27229879042560.0, "grad_norm": 1.4477945558839704, "language_loss": 0.86199421, "learning_rate": 1.9219327706413677e-06, "loss": 0.88315988, "num_input_tokens_seen": 188415475, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40429688, "step": 8766, "time_per_iteration": 2.4741854667663574 }, { "auxiliary_loss_clip": 0.01062726, "auxiliary_loss_mlp": 0.01056814, "balance_loss_clip": 1.0302304, "balance_loss_mlp": 1.02284098, "epoch": 0.5271005561400872, "flos": 23109429450240.0, "grad_norm": 1.665646772679221, "language_loss": 0.80079776, "learning_rate": 1.921543607252017e-06, "loss": 0.82199311, "num_input_tokens_seen": 188435665, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.3984375, "step": 8767, "time_per_iteration": 3.841486930847168 }, { "auxiliary_loss_clip": 0.01065866, "auxiliary_loss_mlp": 0.01054246, "balance_loss_clip": 1.02775788, "balance_loss_mlp": 1.02527404, "epoch": 0.5271606793927551, "flos": 22563701988480.0, "grad_norm": 1.9369669914269416, "language_loss": 0.74536908, "learning_rate": 1.9211544468377394e-06, "loss": 0.76657015, "num_input_tokens_seen": 188455405, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40625, "step": 8768, "time_per_iteration": 2.41845703125 }, { "auxiliary_loss_clip": 0.01060383, "auxiliary_loss_mlp": 0.01047652, "balance_loss_clip": 1.02457309, "balance_loss_mlp": 1.02237427, "epoch": 0.5272208026454231, "flos": 18763441274880.0, "grad_norm": 1.933059860605097, "language_loss": 0.75451422, "learning_rate": 1.9207652894132933e-06, "loss": 0.77559453, "num_input_tokens_seen": 188472940, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.37890625, "step": 8769, "time_per_iteration": 2.366391658782959 }, { "auxiliary_loss_clip": 0.0105946, "auxiliary_loss_mlp": 0.01057508, "balance_loss_clip": 1.03345132, "balance_loss_mlp": 1.02111757, "epoch": 0.5272809258980911, "flos": 20411131979520.0, "grad_norm": 1.6141404870747005, "language_loss": 0.75011218, "learning_rate": 1.920376134993436e-06, "loss": 0.77128184, "num_input_tokens_seen": 188493035, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3828125, "step": 8770, "time_per_iteration": 2.4181125164031982 }, { "auxiliary_loss_clip": 0.01061141, "auxiliary_loss_mlp": 0.01059539, "balance_loss_clip": 1.03221619, "balance_loss_mlp": 1.02106392, "epoch": 0.5273410491507591, "flos": 28255487408640.0, "grad_norm": 1.6926271326134057, "language_loss": 0.70028627, "learning_rate": 1.9199869835929224e-06, "loss": 0.72149301, "num_input_tokens_seen": 188513860, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40039062, "step": 8771, "time_per_iteration": 2.4243364334106445 }, { "auxiliary_loss_clip": 0.01056843, "auxiliary_loss_mlp": 0.01058989, "balance_loss_clip": 1.03421688, "balance_loss_mlp": 1.01862288, "epoch": 0.527401172403427, "flos": 22454074719360.0, "grad_norm": 1.919680573082745, "language_loss": 0.77613837, "learning_rate": 1.9195978352265115e-06, "loss": 0.79729676, "num_input_tokens_seen": 188533345, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3828125, "step": 8772, "time_per_iteration": 2.4343321323394775 }, { "auxiliary_loss_clip": 0.01058977, "auxiliary_loss_mlp": 0.0106048, "balance_loss_clip": 1.03250158, "balance_loss_mlp": 1.01935434, "epoch": 0.527461295656095, "flos": 21030072295680.0, "grad_norm": 1.8530392064491314, "language_loss": 0.67408526, "learning_rate": 1.9192086899089585e-06, "loss": 0.69527978, "num_input_tokens_seen": 188551550, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.39648438, "step": 8773, "time_per_iteration": 2.398852825164795 }, { "auxiliary_loss_clip": 0.01059005, "auxiliary_loss_mlp": 0.01055863, "balance_loss_clip": 1.02879083, "balance_loss_mlp": 1.01836252, "epoch": 0.5275214189087629, "flos": 26320845306240.0, "grad_norm": 1.7256670890012678, "language_loss": 0.86780375, "learning_rate": 1.91881954765502e-06, "loss": 0.88895243, "num_input_tokens_seen": 188571615, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40625, "step": 8774, "time_per_iteration": 2.5173864364624023 }, { "auxiliary_loss_clip": 0.01058144, "auxiliary_loss_mlp": 0.01059481, "balance_loss_clip": 1.03369617, "balance_loss_mlp": 1.01859272, "epoch": 0.5275815421614309, "flos": 20046010314240.0, "grad_norm": 1.4886581558670002, "language_loss": 0.80844235, "learning_rate": 1.9184304084794523e-06, "loss": 0.82961857, "num_input_tokens_seen": 188591965, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.39453125, "step": 8775, "time_per_iteration": 2.415989398956299 }, { "auxiliary_loss_clip": 0.01056733, "auxiliary_loss_mlp": 0.01060291, "balance_loss_clip": 1.0339458, "balance_loss_mlp": 1.01850724, "epoch": 0.5276416654140988, "flos": 21431189439360.0, "grad_norm": 1.6657083598587137, "language_loss": 0.84585285, "learning_rate": 1.918041272397012e-06, "loss": 0.86702305, "num_input_tokens_seen": 188610675, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.3828125, "step": 8776, "time_per_iteration": 2.4803123474121094 }, { "auxiliary_loss_clip": 0.01059194, "auxiliary_loss_mlp": 0.01057592, "balance_loss_clip": 1.03149748, "balance_loss_mlp": 1.01932907, "epoch": 0.5277017886667669, "flos": 17164139581440.0, "grad_norm": 1.7829563817489542, "language_loss": 0.68662083, "learning_rate": 1.9176521394224547e-06, "loss": 0.70778871, "num_input_tokens_seen": 188628235, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3984375, "step": 8777, "time_per_iteration": 2.3876562118530273 }, { "auxiliary_loss_clip": 0.01059721, "auxiliary_loss_mlp": 0.01057475, "balance_loss_clip": 1.03155899, "balance_loss_mlp": 1.02093685, "epoch": 0.5277619119194349, "flos": 20447127457920.0, "grad_norm": 1.5425136144291716, "language_loss": 0.83212143, "learning_rate": 1.9172630095705358e-06, "loss": 0.85329342, "num_input_tokens_seen": 188648925, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.38867188, "step": 8778, "time_per_iteration": 2.5151069164276123 }, { "auxiliary_loss_clip": 0.01061193, "auxiliary_loss_mlp": 0.01055674, "balance_loss_clip": 1.02839887, "balance_loss_mlp": 1.02014101, "epoch": 0.5278220351721028, "flos": 24059939748480.0, "grad_norm": 2.008598959560973, "language_loss": 0.80970883, "learning_rate": 1.916873882856013e-06, "loss": 0.83087754, "num_input_tokens_seen": 188668125, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41015625, "step": 8779, "time_per_iteration": 2.415189027786255 }, { "auxiliary_loss_clip": 0.01059032, "auxiliary_loss_mlp": 0.01049125, "balance_loss_clip": 1.02509284, "balance_loss_mlp": 1.02140379, "epoch": 0.5278821584247708, "flos": 24641802334080.0, "grad_norm": 2.6453383344017025, "language_loss": 0.7830928, "learning_rate": 1.9164847592936406e-06, "loss": 0.8041743, "num_input_tokens_seen": 188684410, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.375, "step": 8780, "time_per_iteration": 2.4948935508728027 }, { "auxiliary_loss_clip": 0.01066832, "auxiliary_loss_mlp": 0.01045883, "balance_loss_clip": 1.0187037, "balance_loss_mlp": 1.02548337, "epoch": 0.5279422816774387, "flos": 35406781971840.0, "grad_norm": 2.484989147992843, "language_loss": 0.70239067, "learning_rate": 1.916095638898174e-06, "loss": 0.72351784, "num_input_tokens_seen": 188706130, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.4140625, "step": 8781, "time_per_iteration": 2.526517391204834 }, { "auxiliary_loss_clip": 0.01060478, "auxiliary_loss_mlp": 0.01047457, "balance_loss_clip": 1.02423549, "balance_loss_mlp": 1.0225687, "epoch": 0.5280024049301068, "flos": 22965901384320.0, "grad_norm": 1.5604329440408664, "language_loss": 0.73176873, "learning_rate": 1.9157065216843696e-06, "loss": 0.75284803, "num_input_tokens_seen": 188725030, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37890625, "step": 8782, "time_per_iteration": 2.4905261993408203 }, { "auxiliary_loss_clip": 0.01062411, "auxiliary_loss_mlp": 0.01039687, "balance_loss_clip": 1.01349688, "balance_loss_mlp": 1.02382493, "epoch": 0.5280625281827747, "flos": 21506531886720.0, "grad_norm": 1.7833002331495102, "language_loss": 0.69886035, "learning_rate": 1.915317407666982e-06, "loss": 0.71988136, "num_input_tokens_seen": 188744325, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38671875, "step": 8783, "time_per_iteration": 2.402932643890381 }, { "auxiliary_loss_clip": 0.01069091, "auxiliary_loss_mlp": 0.01048868, "balance_loss_clip": 1.0157752, "balance_loss_mlp": 1.02481556, "epoch": 0.5281226514354427, "flos": 31206940214400.0, "grad_norm": 2.110705560758405, "language_loss": 0.70799989, "learning_rate": 1.9149282968607674e-06, "loss": 0.7291795, "num_input_tokens_seen": 188765100, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.44140625, "step": 8784, "time_per_iteration": 2.5252246856689453 }, { "auxiliary_loss_clip": 0.01068306, "auxiliary_loss_mlp": 0.01048271, "balance_loss_clip": 1.01758683, "balance_loss_mlp": 1.02483833, "epoch": 0.5281827746881106, "flos": 25076785363200.0, "grad_norm": 2.324072504388123, "language_loss": 0.77062678, "learning_rate": 1.91453918928048e-06, "loss": 0.79179251, "num_input_tokens_seen": 188783995, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.43359375, "step": 8785, "time_per_iteration": 2.419907808303833 }, { "auxiliary_loss_clip": 0.01065551, "auxiliary_loss_mlp": 0.01042038, "balance_loss_clip": 1.01470351, "balance_loss_mlp": 1.02554309, "epoch": 0.5282428979407786, "flos": 20630211960960.0, "grad_norm": 3.2768154774097216, "language_loss": 0.84453517, "learning_rate": 1.9141500849408745e-06, "loss": 0.86561108, "num_input_tokens_seen": 188803120, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40039062, "step": 8786, "time_per_iteration": 2.442213296890259 }, { "auxiliary_loss_clip": 0.01064467, "auxiliary_loss_mlp": 0.01034722, "balance_loss_clip": 1.01308596, "balance_loss_mlp": 1.02704775, "epoch": 0.5283030211934465, "flos": 22418288709120.0, "grad_norm": 2.128652958646841, "language_loss": 0.83907872, "learning_rate": 1.9137609838567076e-06, "loss": 0.86007059, "num_input_tokens_seen": 188820960, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.375, "step": 8787, "time_per_iteration": 2.4233736991882324 }, { "auxiliary_loss_clip": 0.01065757, "auxiliary_loss_mlp": 0.01033397, "balance_loss_clip": 1.01061606, "balance_loss_mlp": 1.02687979, "epoch": 0.5283631444461145, "flos": 23614553134080.0, "grad_norm": 1.6641471404157684, "language_loss": 0.84220195, "learning_rate": 1.9133718860427316e-06, "loss": 0.86319351, "num_input_tokens_seen": 188837165, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.38867188, "step": 8788, "time_per_iteration": 2.42002010345459 }, { "auxiliary_loss_clip": 0.01067266, "auxiliary_loss_mlp": 0.01045869, "balance_loss_clip": 1.01980996, "balance_loss_mlp": 1.02822781, "epoch": 0.5284232676987825, "flos": 32670603809280.0, "grad_norm": 1.5966751274371047, "language_loss": 0.75749522, "learning_rate": 1.9129827915137027e-06, "loss": 0.77862656, "num_input_tokens_seen": 188858555, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.390625, "step": 8789, "time_per_iteration": 2.5190012454986572 }, { "auxiliary_loss_clip": 0.01067266, "auxiliary_loss_mlp": 0.01047504, "balance_loss_clip": 1.02337623, "balance_loss_mlp": 1.02636707, "epoch": 0.5284833909514505, "flos": 26759703496320.0, "grad_norm": 1.5060037095994725, "language_loss": 0.70959944, "learning_rate": 1.9125937002843754e-06, "loss": 0.73074716, "num_input_tokens_seen": 188879050, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.41015625, "step": 8790, "time_per_iteration": 2.484835624694824 }, { "auxiliary_loss_clip": 0.0106468, "auxiliary_loss_mlp": 0.01036521, "balance_loss_clip": 1.01433587, "balance_loss_mlp": 1.02648902, "epoch": 0.5285435142041185, "flos": 22089616369920.0, "grad_norm": 1.4734888707081581, "language_loss": 0.79399848, "learning_rate": 1.9122046123695036e-06, "loss": 0.81501043, "num_input_tokens_seen": 188898885, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.3828125, "step": 8791, "time_per_iteration": 2.407970666885376 }, { "auxiliary_loss_clip": 0.0106361, "auxiliary_loss_mlp": 0.01039459, "balance_loss_clip": 1.01660633, "balance_loss_mlp": 1.02611017, "epoch": 0.5286036374567864, "flos": 20374438273920.0, "grad_norm": 2.0906532939480527, "language_loss": 0.67280078, "learning_rate": 1.9118155277838423e-06, "loss": 0.69383144, "num_input_tokens_seen": 188917225, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.375, "step": 8792, "time_per_iteration": 2.42397403717041 }, { "auxiliary_loss_clip": 0.01060102, "auxiliary_loss_mlp": 0.01042982, "balance_loss_clip": 1.01862729, "balance_loss_mlp": 1.02271152, "epoch": 0.5286637607094544, "flos": 24351045598080.0, "grad_norm": 1.920259381284063, "language_loss": 0.80866086, "learning_rate": 1.9114264465421443e-06, "loss": 0.82969165, "num_input_tokens_seen": 188936120, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37304688, "step": 8793, "time_per_iteration": 2.4125218391418457 }, { "auxiliary_loss_clip": 0.01064488, "auxiliary_loss_mlp": 0.01045929, "balance_loss_clip": 1.02037024, "balance_loss_mlp": 1.02479768, "epoch": 0.5287238839621223, "flos": 17270310625920.0, "grad_norm": 2.0510743220721874, "language_loss": 0.85873467, "learning_rate": 1.9110373686591645e-06, "loss": 0.87983882, "num_input_tokens_seen": 188953405, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.39648438, "step": 8794, "time_per_iteration": 2.3777575492858887 }, { "auxiliary_loss_clip": 0.0106552, "auxiliary_loss_mlp": 0.01053675, "balance_loss_clip": 1.02631629, "balance_loss_mlp": 1.02322435, "epoch": 0.5287840072147904, "flos": 17565920040960.0, "grad_norm": 2.3136085647910094, "language_loss": 0.69693053, "learning_rate": 1.9106482941496564e-06, "loss": 0.71812248, "num_input_tokens_seen": 188971150, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.421875, "step": 8795, "time_per_iteration": 2.376643180847168 }, { "auxiliary_loss_clip": 0.01062919, "auxiliary_loss_mlp": 0.01049026, "balance_loss_clip": 1.02363467, "balance_loss_mlp": 1.02226162, "epoch": 0.5288441304674583, "flos": 18551099185920.0, "grad_norm": 1.9631779135278333, "language_loss": 0.81858176, "learning_rate": 1.910259223028374e-06, "loss": 0.83970118, "num_input_tokens_seen": 188989550, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40625, "step": 8796, "time_per_iteration": 2.364229917526245 }, { "auxiliary_loss_clip": 0.01063045, "auxiliary_loss_mlp": 0.01051225, "balance_loss_clip": 1.02541661, "balance_loss_mlp": 1.02232206, "epoch": 0.5289042537201263, "flos": 20813436109440.0, "grad_norm": 1.792741703003455, "language_loss": 0.70644498, "learning_rate": 1.909870155310071e-06, "loss": 0.7275877, "num_input_tokens_seen": 189008795, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40625, "step": 8797, "time_per_iteration": 3.6795458793640137 }, { "auxiliary_loss_clip": 0.01058874, "auxiliary_loss_mlp": 0.01041476, "balance_loss_clip": 1.01792026, "balance_loss_mlp": 1.02183175, "epoch": 0.5289643769727942, "flos": 15734551340160.0, "grad_norm": 1.480876460202256, "language_loss": 0.82950515, "learning_rate": 1.9094810910095005e-06, "loss": 0.85050869, "num_input_tokens_seen": 189025540, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.37109375, "step": 8798, "time_per_iteration": 3.802950382232666 }, { "auxiliary_loss_clip": 0.01062459, "auxiliary_loss_mlp": 0.01047074, "balance_loss_clip": 1.02051377, "balance_loss_mlp": 1.02078056, "epoch": 0.5290245002254622, "flos": 19536278330880.0, "grad_norm": 1.6816718788712157, "language_loss": 0.71895194, "learning_rate": 1.9090920301414166e-06, "loss": 0.74004734, "num_input_tokens_seen": 189044885, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.41601562, "step": 8799, "time_per_iteration": 2.371108055114746 }, { "auxiliary_loss_clip": 0.01057304, "auxiliary_loss_mlp": 0.01043979, "balance_loss_clip": 1.02066159, "balance_loss_mlp": 1.02023578, "epoch": 0.5290846234781301, "flos": 15814222796160.0, "grad_norm": 1.7925059576958258, "language_loss": 0.70702004, "learning_rate": 1.9087029727205716e-06, "loss": 0.72803289, "num_input_tokens_seen": 189061280, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.37109375, "step": 8800, "time_per_iteration": 3.880500555038452 }, { "auxiliary_loss_clip": 0.01015275, "auxiliary_loss_mlp": 0.01006245, "balance_loss_clip": 1.00408745, "balance_loss_mlp": 1.0080241, "epoch": 0.5291447467307981, "flos": 70054516874880.0, "grad_norm": 0.9608730887301785, "language_loss": 0.57017988, "learning_rate": 1.9083139187617193e-06, "loss": 0.59039509, "num_input_tokens_seen": 189114775, "router_z_loss_clip": 0.02160645, "router_z_loss_mlp": 0.07226562, "step": 8801, "time_per_iteration": 2.938629627227783 }, { "auxiliary_loss_clip": 0.0105947, "auxiliary_loss_mlp": 0.01055472, "balance_loss_clip": 1.02814889, "balance_loss_mlp": 1.02024174, "epoch": 0.529204869983466, "flos": 28362985084800.0, "grad_norm": 1.7299710673553594, "language_loss": 0.65102643, "learning_rate": 1.9079248682796123e-06, "loss": 0.67217582, "num_input_tokens_seen": 189134700, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.39257812, "step": 8802, "time_per_iteration": 2.440952777862549 }, { "auxiliary_loss_clip": 0.01056511, "auxiliary_loss_mlp": 0.0104442, "balance_loss_clip": 1.01933849, "balance_loss_mlp": 1.0180068, "epoch": 0.5292649932361341, "flos": 33757624990080.0, "grad_norm": 1.6432917605781632, "language_loss": 0.70227224, "learning_rate": 1.907535821289003e-06, "loss": 0.7232815, "num_input_tokens_seen": 189155365, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38476562, "step": 8803, "time_per_iteration": 2.556483268737793 }, { "auxiliary_loss_clip": 0.01056463, "auxiliary_loss_mlp": 0.01040621, "balance_loss_clip": 1.01506233, "balance_loss_mlp": 1.01854479, "epoch": 0.5293251164888021, "flos": 20446673610240.0, "grad_norm": 1.6514442618010206, "language_loss": 0.77543199, "learning_rate": 1.9071467778046458e-06, "loss": 0.79640281, "num_input_tokens_seen": 189173885, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37890625, "step": 8804, "time_per_iteration": 2.3862838745117188 }, { "auxiliary_loss_clip": 0.01012112, "auxiliary_loss_mlp": 0.01002135, "balance_loss_clip": 0.99945235, "balance_loss_mlp": 1.00482976, "epoch": 0.52938523974147, "flos": 66541554806400.0, "grad_norm": 0.7588570171223493, "language_loss": 0.53083634, "learning_rate": 1.906757737841291e-06, "loss": 0.55097878, "num_input_tokens_seen": 189236515, "router_z_loss_clip": 0.02685547, "router_z_loss_mlp": 0.07275391, "step": 8805, "time_per_iteration": 3.136951208114624 }, { "auxiliary_loss_clip": 0.01011555, "auxiliary_loss_mlp": 0.01003059, "balance_loss_clip": 1.00044847, "balance_loss_mlp": 1.00426817, "epoch": 0.529445362994138, "flos": 67148345969280.0, "grad_norm": 0.7431702372449289, "language_loss": 0.6392709, "learning_rate": 1.906368701413693e-06, "loss": 0.65941703, "num_input_tokens_seen": 189300500, "router_z_loss_clip": 0.02612305, "router_z_loss_mlp": 0.07275391, "step": 8806, "time_per_iteration": 3.064457416534424 }, { "auxiliary_loss_clip": 0.01060845, "auxiliary_loss_mlp": 0.01041614, "balance_loss_clip": 1.01584101, "balance_loss_mlp": 1.01929927, "epoch": 0.5295054862468059, "flos": 17748341228160.0, "grad_norm": 1.477192008012668, "language_loss": 0.73326373, "learning_rate": 1.9059796685366026e-06, "loss": 0.75428826, "num_input_tokens_seen": 189319745, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.41601562, "step": 8807, "time_per_iteration": 3.864272117614746 }, { "auxiliary_loss_clip": 0.01057113, "auxiliary_loss_mlp": 0.01045853, "balance_loss_clip": 1.02161813, "balance_loss_mlp": 1.01923561, "epoch": 0.529565609499474, "flos": 11396697511680.0, "grad_norm": 2.218606984838343, "language_loss": 0.71464831, "learning_rate": 1.9055906392247723e-06, "loss": 0.73567796, "num_input_tokens_seen": 189334550, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37890625, "step": 8808, "time_per_iteration": 2.394089698791504 }, { "auxiliary_loss_clip": 0.01057632, "auxiliary_loss_mlp": 0.01042632, "balance_loss_clip": 1.02007771, "balance_loss_mlp": 1.01961803, "epoch": 0.5296257327521419, "flos": 17195561671680.0, "grad_norm": 1.7140598009302366, "language_loss": 0.88453537, "learning_rate": 1.9052016134929554e-06, "loss": 0.90553808, "num_input_tokens_seen": 189351735, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.37890625, "step": 8809, "time_per_iteration": 2.4058609008789062 }, { "auxiliary_loss_clip": 0.01062807, "auxiliary_loss_mlp": 0.0105318, "balance_loss_clip": 1.02422369, "balance_loss_mlp": 1.02155709, "epoch": 0.5296858560048099, "flos": 39962633529600.0, "grad_norm": 1.6740658952566916, "language_loss": 0.64925444, "learning_rate": 1.9048125913559016e-06, "loss": 0.67041427, "num_input_tokens_seen": 189373105, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4140625, "step": 8810, "time_per_iteration": 2.5711207389831543 }, { "auxiliary_loss_clip": 0.01057191, "auxiliary_loss_mlp": 0.01042117, "balance_loss_clip": 1.01825166, "balance_loss_mlp": 1.02004576, "epoch": 0.5297459792574778, "flos": 20960315665920.0, "grad_norm": 1.5661132725050455, "language_loss": 0.68302178, "learning_rate": 1.9044235728283646e-06, "loss": 0.7040149, "num_input_tokens_seen": 189394615, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 8811, "time_per_iteration": 2.438230514526367 }, { "auxiliary_loss_clip": 0.01016896, "auxiliary_loss_mlp": 0.01009305, "balance_loss_clip": 1.00670671, "balance_loss_mlp": 1.00965142, "epoch": 0.5298061025101458, "flos": 66520468344960.0, "grad_norm": 0.6638883443666355, "language_loss": 0.53449357, "learning_rate": 1.9040345579250953e-06, "loss": 0.55475557, "num_input_tokens_seen": 189459750, "router_z_loss_clip": 0.02600098, "router_z_loss_mlp": 0.07226562, "step": 8812, "time_per_iteration": 3.1377737522125244 }, { "auxiliary_loss_clip": 0.01017433, "auxiliary_loss_mlp": 0.01017581, "balance_loss_clip": 1.01482737, "balance_loss_mlp": 1.01027989, "epoch": 0.5298662257628137, "flos": 67659579141120.0, "grad_norm": 0.735726061196352, "language_loss": 0.56359839, "learning_rate": 1.9036455466608453e-06, "loss": 0.58394861, "num_input_tokens_seen": 189527540, "router_z_loss_clip": 0.02758789, "router_z_loss_mlp": 0.07128906, "step": 8813, "time_per_iteration": 3.1136279106140137 }, { "auxiliary_loss_clip": 0.01058098, "auxiliary_loss_mlp": 0.01049, "balance_loss_clip": 1.02568269, "balance_loss_mlp": 1.0221976, "epoch": 0.5299263490154817, "flos": 19645381929600.0, "grad_norm": 1.5638958517568933, "language_loss": 0.82964516, "learning_rate": 1.9032565390503657e-06, "loss": 0.85071611, "num_input_tokens_seen": 189546900, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 8814, "time_per_iteration": 2.4017255306243896 }, { "auxiliary_loss_clip": 0.01064433, "auxiliary_loss_mlp": 0.01044311, "balance_loss_clip": 1.01913381, "balance_loss_mlp": 1.02474284, "epoch": 0.5299864722681497, "flos": 22053900182400.0, "grad_norm": 1.5213610480665616, "language_loss": 0.86071992, "learning_rate": 1.9028675351084076e-06, "loss": 0.88180739, "num_input_tokens_seen": 189566490, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.39648438, "step": 8815, "time_per_iteration": 2.407411575317383 }, { "auxiliary_loss_clip": 0.01060621, "auxiliary_loss_mlp": 0.0104284, "balance_loss_clip": 1.01918876, "balance_loss_mlp": 1.02317059, "epoch": 0.5300465955208177, "flos": 21762584864640.0, "grad_norm": 1.9172934324757265, "language_loss": 0.67519611, "learning_rate": 1.9024785348497225e-06, "loss": 0.69623065, "num_input_tokens_seen": 189585580, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.375, "step": 8816, "time_per_iteration": 2.450045108795166 }, { "auxiliary_loss_clip": 0.01065068, "auxiliary_loss_mlp": 0.01041361, "balance_loss_clip": 1.01661301, "balance_loss_mlp": 1.02574062, "epoch": 0.5301067187734857, "flos": 42994840043520.0, "grad_norm": 1.5416065044464464, "language_loss": 0.73338866, "learning_rate": 1.9020895382890611e-06, "loss": 0.75445294, "num_input_tokens_seen": 189608485, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.39453125, "step": 8817, "time_per_iteration": 2.583049774169922 }, { "auxiliary_loss_clip": 0.01063232, "auxiliary_loss_mlp": 0.01042677, "balance_loss_clip": 1.01764309, "balance_loss_mlp": 1.02422059, "epoch": 0.5301668420261536, "flos": 20553368325120.0, "grad_norm": 1.7391512623309993, "language_loss": 0.65849602, "learning_rate": 1.9017005454411743e-06, "loss": 0.67955518, "num_input_tokens_seen": 189627815, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.390625, "step": 8818, "time_per_iteration": 2.460122585296631 }, { "auxiliary_loss_clip": 0.01063157, "auxiliary_loss_mlp": 0.01038547, "balance_loss_clip": 1.01244056, "balance_loss_mlp": 1.02454138, "epoch": 0.5302269652788216, "flos": 17485899471360.0, "grad_norm": 1.9157406818381337, "language_loss": 0.75755465, "learning_rate": 1.9013115563208126e-06, "loss": 0.77857172, "num_input_tokens_seen": 189644850, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38671875, "step": 8819, "time_per_iteration": 2.3741085529327393 }, { "auxiliary_loss_clip": 0.01064575, "auxiliary_loss_mlp": 0.01049372, "balance_loss_clip": 1.02311039, "balance_loss_mlp": 1.02395272, "epoch": 0.5302870885314895, "flos": 14573339786880.0, "grad_norm": 1.9928780209130403, "language_loss": 0.83864629, "learning_rate": 1.9009225709427267e-06, "loss": 0.85978574, "num_input_tokens_seen": 189660945, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40625, "step": 8820, "time_per_iteration": 2.3901596069335938 }, { "auxiliary_loss_clip": 0.01060951, "auxiliary_loss_mlp": 0.01040176, "balance_loss_clip": 1.01663184, "balance_loss_mlp": 1.02257657, "epoch": 0.5303472117841576, "flos": 23436984625920.0, "grad_norm": 2.4774293917992427, "language_loss": 0.72966605, "learning_rate": 1.9005335893216667e-06, "loss": 0.75067735, "num_input_tokens_seen": 189680425, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.3828125, "step": 8821, "time_per_iteration": 2.4542064666748047 }, { "auxiliary_loss_clip": 0.01059494, "auxiliary_loss_mlp": 0.01032861, "balance_loss_clip": 1.01049733, "balance_loss_mlp": 1.02293777, "epoch": 0.5304073350368255, "flos": 22707963192960.0, "grad_norm": 1.4568779073272584, "language_loss": 0.75382578, "learning_rate": 1.9001446114723824e-06, "loss": 0.77474934, "num_input_tokens_seen": 189700375, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.36523438, "step": 8822, "time_per_iteration": 2.435206890106201 }, { "auxiliary_loss_clip": 0.01060444, "auxiliary_loss_mlp": 0.0104355, "balance_loss_clip": 1.01807475, "balance_loss_mlp": 1.02248263, "epoch": 0.5304674582894935, "flos": 27927303828480.0, "grad_norm": 1.5885946231415418, "language_loss": 0.68194342, "learning_rate": 1.8997556374096257e-06, "loss": 0.70298338, "num_input_tokens_seen": 189721225, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37890625, "step": 8823, "time_per_iteration": 2.4676249027252197 }, { "auxiliary_loss_clip": 0.01063256, "auxiliary_loss_mlp": 0.01040124, "balance_loss_clip": 1.01289701, "balance_loss_mlp": 1.02334809, "epoch": 0.5305275815421614, "flos": 21249606124800.0, "grad_norm": 1.6231188707075235, "language_loss": 0.70528579, "learning_rate": 1.8993666671481444e-06, "loss": 0.72631955, "num_input_tokens_seen": 189740170, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.3984375, "step": 8824, "time_per_iteration": 2.451826572418213 }, { "auxiliary_loss_clip": 0.01059364, "auxiliary_loss_mlp": 0.01044393, "balance_loss_clip": 1.02136147, "balance_loss_mlp": 1.02277458, "epoch": 0.5305877047948294, "flos": 17602124987520.0, "grad_norm": 3.7497419075933385, "language_loss": 0.77654326, "learning_rate": 1.898977700702689e-06, "loss": 0.79758084, "num_input_tokens_seen": 189757890, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.3671875, "step": 8825, "time_per_iteration": 2.3647689819335938 }, { "auxiliary_loss_clip": 0.0106036, "auxiliary_loss_mlp": 0.01047551, "balance_loss_clip": 1.02258897, "balance_loss_mlp": 1.02248597, "epoch": 0.5306478280474973, "flos": 15194584252800.0, "grad_norm": 1.7732730682069018, "language_loss": 0.86219126, "learning_rate": 1.8985887380880103e-06, "loss": 0.88327038, "num_input_tokens_seen": 189775390, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.37890625, "step": 8826, "time_per_iteration": 2.4161453247070312 }, { "auxiliary_loss_clip": 0.01057324, "auxiliary_loss_mlp": 0.0104152, "balance_loss_clip": 1.01996744, "balance_loss_mlp": 1.02096653, "epoch": 0.5307079513001653, "flos": 15340311734400.0, "grad_norm": 1.3985904453167153, "language_loss": 0.65555811, "learning_rate": 1.8981997793188558e-06, "loss": 0.67654657, "num_input_tokens_seen": 189793975, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.36328125, "step": 8827, "time_per_iteration": 2.3964104652404785 }, { "auxiliary_loss_clip": 0.01060131, "auxiliary_loss_mlp": 0.01048351, "balance_loss_clip": 1.02412796, "balance_loss_mlp": 1.02124083, "epoch": 0.5307680745528333, "flos": 43542766920960.0, "grad_norm": 1.547113097641632, "language_loss": 0.61446357, "learning_rate": 1.8978108244099762e-06, "loss": 0.63554835, "num_input_tokens_seen": 189817870, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.38867188, "step": 8828, "time_per_iteration": 2.6076481342315674 }, { "auxiliary_loss_clip": 0.01058252, "auxiliary_loss_mlp": 0.01050409, "balance_loss_clip": 1.02624512, "balance_loss_mlp": 1.01976371, "epoch": 0.5308281978055013, "flos": 20047860616320.0, "grad_norm": 1.5613820157540044, "language_loss": 0.8223874, "learning_rate": 1.8974218733761208e-06, "loss": 0.84347397, "num_input_tokens_seen": 189837905, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.38476562, "step": 8829, "time_per_iteration": 2.4185941219329834 }, { "auxiliary_loss_clip": 0.01054938, "auxiliary_loss_mlp": 0.01049376, "balance_loss_clip": 1.02677357, "balance_loss_mlp": 1.01920879, "epoch": 0.5308883210581693, "flos": 20702901144960.0, "grad_norm": 1.4946404527831225, "language_loss": 0.79079181, "learning_rate": 1.8970329262320375e-06, "loss": 0.81183493, "num_input_tokens_seen": 189856970, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35742188, "step": 8830, "time_per_iteration": 2.443601131439209 }, { "auxiliary_loss_clip": 0.01055691, "auxiliary_loss_mlp": 0.0104716, "balance_loss_clip": 1.02342534, "balance_loss_mlp": 1.0191803, "epoch": 0.5309484443108372, "flos": 14354643830400.0, "grad_norm": 1.9143193050059826, "language_loss": 0.81340587, "learning_rate": 1.8966439829924768e-06, "loss": 0.83443439, "num_input_tokens_seen": 189872830, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.36523438, "step": 8831, "time_per_iteration": 2.3900814056396484 }, { "auxiliary_loss_clip": 0.01055632, "auxiliary_loss_mlp": 0.01054774, "balance_loss_clip": 1.03000259, "balance_loss_mlp": 1.01835346, "epoch": 0.5310085675635052, "flos": 20009491165440.0, "grad_norm": 1.8916516066050082, "language_loss": 0.74651122, "learning_rate": 1.896255043672186e-06, "loss": 0.76761532, "num_input_tokens_seen": 189891635, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37109375, "step": 8832, "time_per_iteration": 2.4469454288482666 }, { "auxiliary_loss_clip": 0.01059842, "auxiliary_loss_mlp": 0.01058549, "balance_loss_clip": 1.03340805, "balance_loss_mlp": 1.02107048, "epoch": 0.5310686908161731, "flos": 22126205341440.0, "grad_norm": 2.094123761504719, "language_loss": 0.76921493, "learning_rate": 1.8958661082859143e-06, "loss": 0.79039884, "num_input_tokens_seen": 189909050, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38867188, "step": 8833, "time_per_iteration": 2.3871819972991943 }, { "auxiliary_loss_clip": 0.01059713, "auxiliary_loss_mlp": 0.01049213, "balance_loss_clip": 1.02380967, "balance_loss_mlp": 1.02042353, "epoch": 0.5311288140688412, "flos": 24716725845120.0, "grad_norm": 1.6433434598810113, "language_loss": 0.74611098, "learning_rate": 1.8954771768484103e-06, "loss": 0.76720023, "num_input_tokens_seen": 189927405, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.39257812, "step": 8834, "time_per_iteration": 2.4571337699890137 }, { "auxiliary_loss_clip": 0.01062626, "auxiliary_loss_mlp": 0.01049154, "balance_loss_clip": 1.02252269, "balance_loss_mlp": 1.02049017, "epoch": 0.5311889373215091, "flos": 24096563631360.0, "grad_norm": 1.7558438479762275, "language_loss": 0.79309857, "learning_rate": 1.8950882493744226e-06, "loss": 0.81421632, "num_input_tokens_seen": 189947740, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.421875, "step": 8835, "time_per_iteration": 2.4141416549682617 }, { "auxiliary_loss_clip": 0.01057825, "auxiliary_loss_mlp": 0.01043566, "balance_loss_clip": 1.01859128, "balance_loss_mlp": 1.01966023, "epoch": 0.5312490605741771, "flos": 22015949667840.0, "grad_norm": 1.6332604849994858, "language_loss": 0.73422229, "learning_rate": 1.8946993258786985e-06, "loss": 0.75523627, "num_input_tokens_seen": 189966495, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38085938, "step": 8836, "time_per_iteration": 3.7075467109680176 }, { "auxiliary_loss_clip": 0.0106039, "auxiliary_loss_mlp": 0.01045919, "balance_loss_clip": 1.01896548, "balance_loss_mlp": 1.02059531, "epoch": 0.531309183826845, "flos": 19389538419840.0, "grad_norm": 1.8177015382597919, "language_loss": 0.81933516, "learning_rate": 1.894310406375987e-06, "loss": 0.84039825, "num_input_tokens_seen": 189985325, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.3984375, "step": 8837, "time_per_iteration": 2.3757455348968506 }, { "auxiliary_loss_clip": 0.01058942, "auxiliary_loss_mlp": 0.01045184, "balance_loss_clip": 1.01824284, "balance_loss_mlp": 1.02108049, "epoch": 0.531369307079513, "flos": 20189119443840.0, "grad_norm": 2.0824916080175124, "language_loss": 0.8668617, "learning_rate": 1.893921490881035e-06, "loss": 0.88790298, "num_input_tokens_seen": 190003290, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.37890625, "step": 8838, "time_per_iteration": 3.782651662826538 }, { "auxiliary_loss_clip": 0.01059842, "auxiliary_loss_mlp": 0.01040452, "balance_loss_clip": 1.01656222, "balance_loss_mlp": 1.02237415, "epoch": 0.5314294303321809, "flos": 18879143120640.0, "grad_norm": 1.87373645089244, "language_loss": 0.7378245, "learning_rate": 1.8935325794085906e-06, "loss": 0.75882745, "num_input_tokens_seen": 190023260, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.375, "step": 8839, "time_per_iteration": 3.7373855113983154 }, { "auxiliary_loss_clip": 0.01061048, "auxiliary_loss_mlp": 0.0104345, "balance_loss_clip": 1.0188098, "balance_loss_mlp": 1.02147341, "epoch": 0.531489553584849, "flos": 23038904770560.0, "grad_norm": 1.5604271507126428, "language_loss": 0.77475286, "learning_rate": 1.8931436719734023e-06, "loss": 0.79579788, "num_input_tokens_seen": 190042035, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.39648438, "step": 8840, "time_per_iteration": 2.3892595767974854 }, { "auxiliary_loss_clip": 0.01062779, "auxiliary_loss_mlp": 0.01044289, "balance_loss_clip": 1.01759768, "balance_loss_mlp": 1.02286577, "epoch": 0.5315496768375169, "flos": 19789503488640.0, "grad_norm": 1.9139129823169478, "language_loss": 0.7857036, "learning_rate": 1.892754768590216e-06, "loss": 0.80677426, "num_input_tokens_seen": 190057545, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.3984375, "step": 8841, "time_per_iteration": 2.3851101398468018 }, { "auxiliary_loss_clip": 0.01026071, "auxiliary_loss_mlp": 0.01034645, "balance_loss_clip": 1.0322485, "balance_loss_mlp": 1.01837087, "epoch": 0.5316098000901849, "flos": 71019620121600.0, "grad_norm": 0.7128305122710047, "language_loss": 0.56920904, "learning_rate": 1.8923658692737793e-06, "loss": 0.58981621, "num_input_tokens_seen": 190123800, "router_z_loss_clip": 0.02392578, "router_z_loss_mlp": 0.07714844, "step": 8842, "time_per_iteration": 3.1683902740478516 }, { "auxiliary_loss_clip": 0.01065711, "auxiliary_loss_mlp": 0.01039709, "balance_loss_clip": 1.01336336, "balance_loss_mlp": 1.02540922, "epoch": 0.5316699233428529, "flos": 16434629389440.0, "grad_norm": 1.9438791093520686, "language_loss": 0.75480783, "learning_rate": 1.8919769740388407e-06, "loss": 0.77586198, "num_input_tokens_seen": 190141625, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40234375, "step": 8843, "time_per_iteration": 2.367952823638916 }, { "auxiliary_loss_clip": 0.01028046, "auxiliary_loss_mlp": 0.01012362, "balance_loss_clip": 1.0097754, "balance_loss_mlp": 1.02005267, "epoch": 0.5317300465955208, "flos": 67417036859520.0, "grad_norm": 0.8853029183172776, "language_loss": 0.61115479, "learning_rate": 1.891588082900145e-06, "loss": 0.6315589, "num_input_tokens_seen": 190198110, "router_z_loss_clip": 0.02587891, "router_z_loss_mlp": 0.08007812, "step": 8844, "time_per_iteration": 3.1023476123809814 }, { "auxiliary_loss_clip": 0.01028287, "auxiliary_loss_mlp": 0.01005563, "balance_loss_clip": 1.00328577, "balance_loss_mlp": 1.02052307, "epoch": 0.5317901698481888, "flos": 59505405373440.0, "grad_norm": 0.8491527716064996, "language_loss": 0.62294441, "learning_rate": 1.8911991958724411e-06, "loss": 0.64328289, "num_input_tokens_seen": 190259950, "router_z_loss_clip": 0.02282715, "router_z_loss_mlp": 0.07763672, "step": 8845, "time_per_iteration": 3.0294365882873535 }, { "auxiliary_loss_clip": 0.01064986, "auxiliary_loss_mlp": 0.01050081, "balance_loss_clip": 1.02374732, "balance_loss_mlp": 1.02453005, "epoch": 0.5318502931008567, "flos": 19128388383360.0, "grad_norm": 1.8837555701047806, "language_loss": 0.7772972, "learning_rate": 1.890810312970474e-06, "loss": 0.79844785, "num_input_tokens_seen": 190278265, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40429688, "step": 8846, "time_per_iteration": 3.8905065059661865 }, { "auxiliary_loss_clip": 0.01063652, "auxiliary_loss_mlp": 0.01056518, "balance_loss_clip": 1.03148437, "balance_loss_mlp": 1.02415895, "epoch": 0.5319104163535248, "flos": 24679892494080.0, "grad_norm": 1.6153446697775842, "language_loss": 0.76336938, "learning_rate": 1.8904214342089903e-06, "loss": 0.78457105, "num_input_tokens_seen": 190298400, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.39453125, "step": 8847, "time_per_iteration": 2.451061725616455 }, { "auxiliary_loss_clip": 0.01063547, "auxiliary_loss_mlp": 0.01057418, "balance_loss_clip": 1.03375459, "balance_loss_mlp": 1.02407646, "epoch": 0.5319705396061927, "flos": 19384650829440.0, "grad_norm": 1.6000816393709836, "language_loss": 0.88560414, "learning_rate": 1.8900325596027378e-06, "loss": 0.9068138, "num_input_tokens_seen": 190316235, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.39453125, "step": 8848, "time_per_iteration": 2.377861499786377 }, { "auxiliary_loss_clip": 0.01064002, "auxiliary_loss_mlp": 0.01065185, "balance_loss_clip": 1.0368613, "balance_loss_mlp": 1.02381003, "epoch": 0.5320306628588607, "flos": 18258352502400.0, "grad_norm": 2.4054855453451673, "language_loss": 0.75952625, "learning_rate": 1.8896436891664609e-06, "loss": 0.7808181, "num_input_tokens_seen": 190335060, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.40234375, "step": 8849, "time_per_iteration": 2.367537498474121 }, { "auxiliary_loss_clip": 0.01065327, "auxiliary_loss_mlp": 0.01072916, "balance_loss_clip": 1.04558146, "balance_loss_mlp": 1.02288961, "epoch": 0.5320907861115286, "flos": 23731197586560.0, "grad_norm": 1.8491787911045516, "language_loss": 0.81118202, "learning_rate": 1.8892548229149066e-06, "loss": 0.83256447, "num_input_tokens_seen": 190353265, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.42382812, "step": 8850, "time_per_iteration": 2.45816707611084 }, { "auxiliary_loss_clip": 0.01061957, "auxiliary_loss_mlp": 0.0108418, "balance_loss_clip": 1.05982566, "balance_loss_mlp": 1.02199626, "epoch": 0.5321509093641966, "flos": 34493838163200.0, "grad_norm": 1.413812688483363, "language_loss": 0.55373549, "learning_rate": 1.888865960862821e-06, "loss": 0.57519686, "num_input_tokens_seen": 190376575, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.40039062, "step": 8851, "time_per_iteration": 2.52815580368042 }, { "auxiliary_loss_clip": 0.01063191, "auxiliary_loss_mlp": 0.01071621, "balance_loss_clip": 1.04625344, "balance_loss_mlp": 1.02245498, "epoch": 0.5322110326168645, "flos": 20009910101760.0, "grad_norm": 1.6223615300649814, "language_loss": 0.69457793, "learning_rate": 1.8884771030249484e-06, "loss": 0.71592605, "num_input_tokens_seen": 190395185, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40625, "step": 8852, "time_per_iteration": 2.4178266525268555 }, { "auxiliary_loss_clip": 0.01018948, "auxiliary_loss_mlp": 0.0105745, "balance_loss_clip": 1.05455351, "balance_loss_mlp": 1.01103234, "epoch": 0.5322711558695326, "flos": 64627931208960.0, "grad_norm": 0.8501394306061668, "language_loss": 0.63128453, "learning_rate": 1.8880882494160357e-06, "loss": 0.65204853, "num_input_tokens_seen": 190452595, "router_z_loss_clip": 0.02893066, "router_z_loss_mlp": 0.07910156, "step": 8853, "time_per_iteration": 2.980318546295166 }, { "auxiliary_loss_clip": 0.01061991, "auxiliary_loss_mlp": 0.01070419, "balance_loss_clip": 1.04420471, "balance_loss_mlp": 1.02035654, "epoch": 0.5323312791222005, "flos": 14938461452160.0, "grad_norm": 2.124012533662979, "language_loss": 0.80760837, "learning_rate": 1.8876994000508278e-06, "loss": 0.82893252, "num_input_tokens_seen": 190469140, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.41601562, "step": 8854, "time_per_iteration": 2.3557727336883545 }, { "auxiliary_loss_clip": 0.01056003, "auxiliary_loss_mlp": 0.01056646, "balance_loss_clip": 1.03293478, "balance_loss_mlp": 1.01897204, "epoch": 0.5323914023748685, "flos": 23439707712000.0, "grad_norm": 1.8799614699362066, "language_loss": 0.75067246, "learning_rate": 1.8873105549440698e-06, "loss": 0.77179885, "num_input_tokens_seen": 190489015, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.37109375, "step": 8855, "time_per_iteration": 2.4229915142059326 }, { "auxiliary_loss_clip": 0.01058597, "auxiliary_loss_mlp": 0.01054917, "balance_loss_clip": 1.03065777, "balance_loss_mlp": 1.01924467, "epoch": 0.5324515256275365, "flos": 26284989473280.0, "grad_norm": 1.9966086426987635, "language_loss": 0.66545069, "learning_rate": 1.886921714110507e-06, "loss": 0.68658584, "num_input_tokens_seen": 190508065, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.39453125, "step": 8856, "time_per_iteration": 2.469897508621216 }, { "auxiliary_loss_clip": 0.0106229, "auxiliary_loss_mlp": 0.01060219, "balance_loss_clip": 1.03112006, "balance_loss_mlp": 1.02074814, "epoch": 0.5325116488802044, "flos": 26869679879040.0, "grad_norm": 2.953929306947869, "language_loss": 0.78457052, "learning_rate": 1.8865328775648842e-06, "loss": 0.80579561, "num_input_tokens_seen": 190527045, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.41601562, "step": 8857, "time_per_iteration": 2.4262306690216064 }, { "auxiliary_loss_clip": 0.01061951, "auxiliary_loss_mlp": 0.01049219, "balance_loss_clip": 1.02344561, "balance_loss_mlp": 1.02198148, "epoch": 0.5325717721328724, "flos": 25883558127360.0, "grad_norm": 2.006948592875613, "language_loss": 0.71515667, "learning_rate": 1.8861440453219456e-06, "loss": 0.7362684, "num_input_tokens_seen": 190544075, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40039062, "step": 8858, "time_per_iteration": 2.452889919281006 }, { "auxiliary_loss_clip": 0.01062825, "auxiliary_loss_mlp": 0.01051456, "balance_loss_clip": 1.02281034, "balance_loss_mlp": 1.02274477, "epoch": 0.5326318953855403, "flos": 21798231229440.0, "grad_norm": 1.589773861662681, "language_loss": 0.70252377, "learning_rate": 1.8857552173964367e-06, "loss": 0.72366655, "num_input_tokens_seen": 190566030, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.40039062, "step": 8859, "time_per_iteration": 2.4315295219421387 }, { "auxiliary_loss_clip": 0.01060615, "auxiliary_loss_mlp": 0.0103982, "balance_loss_clip": 1.01590705, "balance_loss_mlp": 1.02306974, "epoch": 0.5326920186382084, "flos": 20921876392320.0, "grad_norm": 1.4240415589334594, "language_loss": 0.70712042, "learning_rate": 1.8853663938031013e-06, "loss": 0.7281248, "num_input_tokens_seen": 190585605, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.375, "step": 8860, "time_per_iteration": 2.4265458583831787 }, { "auxiliary_loss_clip": 0.01065429, "auxiliary_loss_mlp": 0.01044272, "balance_loss_clip": 1.01795053, "balance_loss_mlp": 1.02629876, "epoch": 0.5327521418908763, "flos": 21432376425600.0, "grad_norm": 1.9121229789274412, "language_loss": 0.78604925, "learning_rate": 1.884977574556683e-06, "loss": 0.80714625, "num_input_tokens_seen": 190604625, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.390625, "step": 8861, "time_per_iteration": 2.4094181060791016 }, { "auxiliary_loss_clip": 0.01065091, "auxiliary_loss_mlp": 0.0105033, "balance_loss_clip": 1.02241087, "balance_loss_mlp": 1.02487087, "epoch": 0.5328122651435443, "flos": 21759233374080.0, "grad_norm": 1.4817956821496452, "language_loss": 0.86503243, "learning_rate": 1.8845887596719279e-06, "loss": 0.8861866, "num_input_tokens_seen": 190625060, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40234375, "step": 8862, "time_per_iteration": 2.4504189491271973 }, { "auxiliary_loss_clip": 0.01067723, "auxiliary_loss_mlp": 0.01047497, "balance_loss_clip": 1.01829052, "balance_loss_mlp": 1.02553236, "epoch": 0.5328723883962122, "flos": 18295500055680.0, "grad_norm": 2.1632944180812705, "language_loss": 0.63500017, "learning_rate": 1.8841999491635778e-06, "loss": 0.65615237, "num_input_tokens_seen": 190643150, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.421875, "step": 8863, "time_per_iteration": 2.3840513229370117 }, { "auxiliary_loss_clip": 0.01070303, "auxiliary_loss_mlp": 0.01042047, "balance_loss_clip": 1.01616716, "balance_loss_mlp": 1.03008246, "epoch": 0.5329325116488802, "flos": 25373721409920.0, "grad_norm": 1.7982023549850212, "language_loss": 0.74842638, "learning_rate": 1.883811143046377e-06, "loss": 0.76954991, "num_input_tokens_seen": 190662725, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40234375, "step": 8864, "time_per_iteration": 2.4624276161193848 }, { "auxiliary_loss_clip": 0.01068525, "auxiliary_loss_mlp": 0.01043115, "balance_loss_clip": 1.01681733, "balance_loss_mlp": 1.02813613, "epoch": 0.5329926349015481, "flos": 25590951089280.0, "grad_norm": 2.2968095804498603, "language_loss": 0.65510702, "learning_rate": 1.8834223413350702e-06, "loss": 0.6762234, "num_input_tokens_seen": 190683680, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40429688, "step": 8865, "time_per_iteration": 2.448385715484619 }, { "auxiliary_loss_clip": 0.01066121, "auxiliary_loss_mlp": 0.01037706, "balance_loss_clip": 1.01233876, "balance_loss_mlp": 1.0261364, "epoch": 0.5330527581542162, "flos": 22888603900800.0, "grad_norm": 1.9387653326293621, "language_loss": 0.79912567, "learning_rate": 1.8830335440443989e-06, "loss": 0.82016397, "num_input_tokens_seen": 190703350, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3984375, "step": 8866, "time_per_iteration": 2.428471803665161 }, { "auxiliary_loss_clip": 0.0107026, "auxiliary_loss_mlp": 0.0104091, "balance_loss_clip": 1.0146122, "balance_loss_mlp": 1.02997971, "epoch": 0.5331128814068841, "flos": 16026041214720.0, "grad_norm": 2.022874823435687, "language_loss": 0.75490224, "learning_rate": 1.882644751189108e-06, "loss": 0.77601397, "num_input_tokens_seen": 190721170, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40234375, "step": 8867, "time_per_iteration": 2.3903987407684326 }, { "auxiliary_loss_clip": 0.01072026, "auxiliary_loss_mlp": 0.01041443, "balance_loss_clip": 1.01276159, "balance_loss_mlp": 1.02989876, "epoch": 0.5331730046595521, "flos": 39343239365760.0, "grad_norm": 1.5402991963139867, "language_loss": 0.72724938, "learning_rate": 1.88225596278394e-06, "loss": 0.74838406, "num_input_tokens_seen": 190743795, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.421875, "step": 8868, "time_per_iteration": 2.559777021408081 }, { "auxiliary_loss_clip": 0.01071582, "auxiliary_loss_mlp": 0.01037048, "balance_loss_clip": 1.01219249, "balance_loss_mlp": 1.03151023, "epoch": 0.5332331279122201, "flos": 24023246042880.0, "grad_norm": 1.803047915036845, "language_loss": 0.79577821, "learning_rate": 1.881867178843637e-06, "loss": 0.81686449, "num_input_tokens_seen": 190761560, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.40234375, "step": 8869, "time_per_iteration": 2.4375386238098145 }, { "auxiliary_loss_clip": 0.01071599, "auxiliary_loss_mlp": 0.01049441, "balance_loss_clip": 1.02108145, "balance_loss_mlp": 1.0286721, "epoch": 0.533293251164888, "flos": 17128353571200.0, "grad_norm": 1.7476242435888947, "language_loss": 0.77265304, "learning_rate": 1.8814783993829434e-06, "loss": 0.79386342, "num_input_tokens_seen": 190778875, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.4296875, "step": 8870, "time_per_iteration": 2.428163528442383 }, { "auxiliary_loss_clip": 0.01071555, "auxiliary_loss_mlp": 0.01052794, "balance_loss_clip": 1.02243185, "balance_loss_mlp": 1.02826035, "epoch": 0.533353374417556, "flos": 22125297646080.0, "grad_norm": 1.8266985496272372, "language_loss": 0.76380825, "learning_rate": 1.8810896244165997e-06, "loss": 0.78505176, "num_input_tokens_seen": 190799830, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.43359375, "step": 8871, "time_per_iteration": 2.4485087394714355 }, { "auxiliary_loss_clip": 0.01071414, "auxiliary_loss_mlp": 0.01044057, "balance_loss_clip": 1.01744938, "balance_loss_mlp": 1.02932751, "epoch": 0.533413497670224, "flos": 15010242940800.0, "grad_norm": 2.308582605268064, "language_loss": 0.73747301, "learning_rate": 1.8807008539593498e-06, "loss": 0.75862771, "num_input_tokens_seen": 190817155, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.421875, "step": 8872, "time_per_iteration": 2.4122579097747803 }, { "auxiliary_loss_clip": 0.01069784, "auxiliary_loss_mlp": 0.01047787, "balance_loss_clip": 1.02046418, "balance_loss_mlp": 1.02992606, "epoch": 0.533473620922892, "flos": 19608932603520.0, "grad_norm": 1.5827071001111013, "language_loss": 0.65873134, "learning_rate": 1.880312088025936e-06, "loss": 0.67990708, "num_input_tokens_seen": 190835240, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.3984375, "step": 8873, "time_per_iteration": 2.3884077072143555 }, { "auxiliary_loss_clip": 0.01069114, "auxiliary_loss_mlp": 0.01046392, "balance_loss_clip": 1.0191524, "balance_loss_mlp": 1.02778769, "epoch": 0.5335337441755599, "flos": 14281780089600.0, "grad_norm": 3.0232932357567015, "language_loss": 0.81635195, "learning_rate": 1.879923326631099e-06, "loss": 0.83750701, "num_input_tokens_seen": 190851620, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.4140625, "step": 8874, "time_per_iteration": 2.426406145095825 }, { "auxiliary_loss_clip": 0.01066643, "auxiliary_loss_mlp": 0.01046965, "balance_loss_clip": 1.01816392, "balance_loss_mlp": 1.02570438, "epoch": 0.5335938674282279, "flos": 20813750311680.0, "grad_norm": 1.8159115886153605, "language_loss": 0.71033031, "learning_rate": 1.879534569789582e-06, "loss": 0.73146635, "num_input_tokens_seen": 190870545, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.41015625, "step": 8875, "time_per_iteration": 2.3906476497650146 }, { "auxiliary_loss_clip": 0.01029383, "auxiliary_loss_mlp": 0.01002804, "balance_loss_clip": 1.00019372, "balance_loss_mlp": 1.0216701, "epoch": 0.5336539906808958, "flos": 71392596842880.0, "grad_norm": 0.7239415715877608, "language_loss": 0.5973134, "learning_rate": 1.879145817516126e-06, "loss": 0.61763531, "num_input_tokens_seen": 190931995, "router_z_loss_clip": 0.02612305, "router_z_loss_mlp": 0.07714844, "step": 8876, "time_per_iteration": 4.360065698623657 }, { "auxiliary_loss_clip": 0.01063285, "auxiliary_loss_mlp": 0.01048926, "balance_loss_clip": 1.02215147, "balance_loss_mlp": 1.02249122, "epoch": 0.5337141139335638, "flos": 20152076624640.0, "grad_norm": 5.097698883494553, "language_loss": 0.7586025, "learning_rate": 1.8787570698254727e-06, "loss": 0.7797246, "num_input_tokens_seen": 190949890, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.40625, "step": 8877, "time_per_iteration": 2.431522846221924 }, { "auxiliary_loss_clip": 0.0102167, "auxiliary_loss_mlp": 0.01006494, "balance_loss_clip": 1.00355005, "balance_loss_mlp": 1.0139122, "epoch": 0.5337742371862317, "flos": 67725181319040.0, "grad_norm": 0.7660830315031517, "language_loss": 0.57327259, "learning_rate": 1.8783683267323629e-06, "loss": 0.59355426, "num_input_tokens_seen": 191008480, "router_z_loss_clip": 0.02941895, "router_z_loss_mlp": 0.07714844, "step": 8878, "time_per_iteration": 4.378432989120483 }, { "auxiliary_loss_clip": 0.01064334, "auxiliary_loss_mlp": 0.01074067, "balance_loss_clip": 1.04300117, "balance_loss_mlp": 1.02121174, "epoch": 0.5338343604388998, "flos": 25007761872000.0, "grad_norm": 1.6162311876035236, "language_loss": 0.73438466, "learning_rate": 1.8779795882515395e-06, "loss": 0.75576866, "num_input_tokens_seen": 191028995, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.43164062, "step": 8879, "time_per_iteration": 3.745462417602539 }, { "auxiliary_loss_clip": 0.0106212, "auxiliary_loss_mlp": 0.0107164, "balance_loss_clip": 1.04164672, "balance_loss_mlp": 1.02051866, "epoch": 0.5338944836915677, "flos": 17600344508160.0, "grad_norm": 2.2149738614853476, "language_loss": 0.84270668, "learning_rate": 1.8775908543977416e-06, "loss": 0.86404431, "num_input_tokens_seen": 191045285, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.41601562, "step": 8880, "time_per_iteration": 2.388364553451538 }, { "auxiliary_loss_clip": 0.0105886, "auxiliary_loss_mlp": 0.01061822, "balance_loss_clip": 1.0339036, "balance_loss_mlp": 1.0191474, "epoch": 0.5339546069442357, "flos": 21723098250240.0, "grad_norm": 1.4240444263016985, "language_loss": 0.80492198, "learning_rate": 1.8772021251857107e-06, "loss": 0.82612884, "num_input_tokens_seen": 191066105, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.3984375, "step": 8881, "time_per_iteration": 2.3901116847991943 }, { "auxiliary_loss_clip": 0.01010673, "auxiliary_loss_mlp": 0.01038322, "balance_loss_clip": 1.03555655, "balance_loss_mlp": 1.00304937, "epoch": 0.5340147301969036, "flos": 69720642743040.0, "grad_norm": 0.8228580020017534, "language_loss": 0.59327441, "learning_rate": 1.8768134006301882e-06, "loss": 0.61376441, "num_input_tokens_seen": 191126315, "router_z_loss_clip": 0.02770996, "router_z_loss_mlp": 0.07617188, "step": 8882, "time_per_iteration": 2.9947543144226074 }, { "auxiliary_loss_clip": 0.01011024, "auxiliary_loss_mlp": 0.01041214, "balance_loss_clip": 1.03872287, "balance_loss_mlp": 1.00315666, "epoch": 0.5340748534495716, "flos": 63878067694080.0, "grad_norm": 0.9773490547480955, "language_loss": 0.63756943, "learning_rate": 1.876424680745913e-06, "loss": 0.65809184, "num_input_tokens_seen": 191174240, "router_z_loss_clip": 0.02490234, "router_z_loss_mlp": 0.07861328, "step": 8883, "time_per_iteration": 2.825155735015869 }, { "auxiliary_loss_clip": 0.01063402, "auxiliary_loss_mlp": 0.0105799, "balance_loss_clip": 1.02824736, "balance_loss_mlp": 1.02087343, "epoch": 0.5341349767022396, "flos": 28693053878400.0, "grad_norm": 2.202805627840597, "language_loss": 0.83480918, "learning_rate": 1.8760359655476272e-06, "loss": 0.85602313, "num_input_tokens_seen": 191193335, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.42578125, "step": 8884, "time_per_iteration": 2.4382941722869873 }, { "auxiliary_loss_clip": 0.01061153, "auxiliary_loss_mlp": 0.01054613, "balance_loss_clip": 1.02602673, "balance_loss_mlp": 1.02173042, "epoch": 0.5341950999549075, "flos": 16288762262400.0, "grad_norm": 1.7152142221216333, "language_loss": 0.72637749, "learning_rate": 1.8756472550500695e-06, "loss": 0.74753517, "num_input_tokens_seen": 191210900, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.39453125, "step": 8885, "time_per_iteration": 2.373394727706909 }, { "auxiliary_loss_clip": 0.01067503, "auxiliary_loss_mlp": 0.01048063, "balance_loss_clip": 1.01842749, "balance_loss_mlp": 1.02426291, "epoch": 0.5342552232075756, "flos": 14354783475840.0, "grad_norm": 1.914030344860083, "language_loss": 0.79961038, "learning_rate": 1.87525854926798e-06, "loss": 0.82076609, "num_input_tokens_seen": 191226730, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.43164062, "step": 8886, "time_per_iteration": 3.8442862033843994 }, { "auxiliary_loss_clip": 0.01066852, "auxiliary_loss_mlp": 0.01053243, "balance_loss_clip": 1.0221293, "balance_loss_mlp": 1.02473044, "epoch": 0.5343153464602435, "flos": 30296719491840.0, "grad_norm": 1.593547363668411, "language_loss": 0.75513285, "learning_rate": 1.8748698482160996e-06, "loss": 0.77633381, "num_input_tokens_seen": 191250435, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.421875, "step": 8887, "time_per_iteration": 2.513382911682129 }, { "auxiliary_loss_clip": 0.01065697, "auxiliary_loss_mlp": 0.01043913, "balance_loss_clip": 1.01595831, "balance_loss_mlp": 1.02530527, "epoch": 0.5343754697129115, "flos": 15595387194240.0, "grad_norm": 2.2184434876933463, "language_loss": 0.71537983, "learning_rate": 1.8744811519091663e-06, "loss": 0.73647594, "num_input_tokens_seen": 191268315, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40429688, "step": 8888, "time_per_iteration": 2.385096549987793 }, { "auxiliary_loss_clip": 0.01075961, "auxiliary_loss_mlp": 0.01054517, "balance_loss_clip": 1.02256858, "balance_loss_mlp": 1.02951789, "epoch": 0.5344355929655794, "flos": 16908680096640.0, "grad_norm": 2.011258751882969, "language_loss": 0.78616232, "learning_rate": 1.8740924603619208e-06, "loss": 0.8074671, "num_input_tokens_seen": 191287000, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 0.46484375, "step": 8889, "time_per_iteration": 2.4146435260772705 }, { "auxiliary_loss_clip": 0.0107231, "auxiliary_loss_mlp": 0.01053564, "balance_loss_clip": 1.02183008, "balance_loss_mlp": 1.02991343, "epoch": 0.5344957162182474, "flos": 16797307259520.0, "grad_norm": 1.9295596384091895, "language_loss": 0.70762539, "learning_rate": 1.873703773589102e-06, "loss": 0.7288841, "num_input_tokens_seen": 191304565, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.42382812, "step": 8890, "time_per_iteration": 2.384955883026123 }, { "auxiliary_loss_clip": 0.01077029, "auxiliary_loss_mlp": 0.0104806, "balance_loss_clip": 1.01658857, "balance_loss_mlp": 1.03237569, "epoch": 0.5345558394709153, "flos": 12704998089600.0, "grad_norm": 2.382859091124453, "language_loss": 0.79682755, "learning_rate": 1.8733150916054483e-06, "loss": 0.81807852, "num_input_tokens_seen": 191318300, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.4453125, "step": 8891, "time_per_iteration": 2.4077796936035156 }, { "auxiliary_loss_clip": 0.01069439, "auxiliary_loss_mlp": 0.0104428, "balance_loss_clip": 1.01788688, "balance_loss_mlp": 1.0292294, "epoch": 0.5346159627235834, "flos": 22453969985280.0, "grad_norm": 1.7376531924742231, "language_loss": 0.76005864, "learning_rate": 1.872926414425699e-06, "loss": 0.78119588, "num_input_tokens_seen": 191337925, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40234375, "step": 8892, "time_per_iteration": 2.4401135444641113 }, { "auxiliary_loss_clip": 0.01075915, "auxiliary_loss_mlp": 0.0105406, "balance_loss_clip": 1.02654696, "balance_loss_mlp": 1.03265631, "epoch": 0.5346760859762513, "flos": 22414762661760.0, "grad_norm": 1.7297153268963914, "language_loss": 0.89013106, "learning_rate": 1.8725377420645932e-06, "loss": 0.91143084, "num_input_tokens_seen": 191357120, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.43164062, "step": 8893, "time_per_iteration": 2.4406864643096924 }, { "auxiliary_loss_clip": 0.01073125, "auxiliary_loss_mlp": 0.01049624, "balance_loss_clip": 1.02203941, "balance_loss_mlp": 1.03128338, "epoch": 0.5347362092289193, "flos": 22815146666880.0, "grad_norm": 1.74171596092802, "language_loss": 0.73870063, "learning_rate": 1.872149074536869e-06, "loss": 0.75992811, "num_input_tokens_seen": 191375395, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41796875, "step": 8894, "time_per_iteration": 2.4509146213531494 }, { "auxiliary_loss_clip": 0.01076504, "auxiliary_loss_mlp": 0.01058843, "balance_loss_clip": 1.02915967, "balance_loss_mlp": 1.03374577, "epoch": 0.5347963324815872, "flos": 23218428314880.0, "grad_norm": 1.5000022437689657, "language_loss": 0.75850934, "learning_rate": 1.8717604118572648e-06, "loss": 0.77986282, "num_input_tokens_seen": 191395595, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.42773438, "step": 8895, "time_per_iteration": 2.423671007156372 }, { "auxiliary_loss_clip": 0.01073373, "auxiliary_loss_mlp": 0.01046, "balance_loss_clip": 1.01759219, "balance_loss_mlp": 1.03143585, "epoch": 0.5348564557342552, "flos": 22600256048640.0, "grad_norm": 1.5996643824935284, "language_loss": 0.78036553, "learning_rate": 1.8713717540405178e-06, "loss": 0.80155921, "num_input_tokens_seen": 191413730, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41992188, "step": 8896, "time_per_iteration": 2.4725427627563477 }, { "auxiliary_loss_clip": 0.01073017, "auxiliary_loss_mlp": 0.01049554, "balance_loss_clip": 1.02113438, "balance_loss_mlp": 1.03266931, "epoch": 0.5349165789869232, "flos": 18001461651840.0, "grad_norm": 1.6579895812082601, "language_loss": 0.79489326, "learning_rate": 1.8709831011013676e-06, "loss": 0.81611896, "num_input_tokens_seen": 191432400, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40429688, "step": 8897, "time_per_iteration": 2.3870222568511963 }, { "auxiliary_loss_clip": 0.01072928, "auxiliary_loss_mlp": 0.01054936, "balance_loss_clip": 1.02508652, "balance_loss_mlp": 1.03006744, "epoch": 0.5349767022395912, "flos": 17158972700160.0, "grad_norm": 1.853986619594393, "language_loss": 0.77072966, "learning_rate": 1.8705944530545509e-06, "loss": 0.79200828, "num_input_tokens_seen": 191448855, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.4296875, "step": 8898, "time_per_iteration": 2.392573356628418 }, { "auxiliary_loss_clip": 0.0104644, "auxiliary_loss_mlp": 0.01047698, "balance_loss_clip": 1.04523051, "balance_loss_mlp": 1.03651476, "epoch": 0.5350368254922592, "flos": 70988302765440.0, "grad_norm": 0.8914213708600844, "language_loss": 0.58073032, "learning_rate": 1.8702058099148052e-06, "loss": 0.60167176, "num_input_tokens_seen": 191519690, "router_z_loss_clip": 0.0246582, "router_z_loss_mlp": 0.09960938, "step": 8899, "time_per_iteration": 3.2076053619384766 }, { "auxiliary_loss_clip": 0.01067996, "auxiliary_loss_mlp": 0.01056608, "balance_loss_clip": 1.02878428, "balance_loss_mlp": 1.02760029, "epoch": 0.5350969487449271, "flos": 27416594327040.0, "grad_norm": 1.5508776547425214, "language_loss": 0.71142709, "learning_rate": 1.869817171696868e-06, "loss": 0.73267317, "num_input_tokens_seen": 191539380, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40429688, "step": 8900, "time_per_iteration": 2.4627418518066406 }, { "auxiliary_loss_clip": 0.01069403, "auxiliary_loss_mlp": 0.01042836, "balance_loss_clip": 1.0142498, "balance_loss_mlp": 1.0265485, "epoch": 0.5351570719975951, "flos": 19315173490560.0, "grad_norm": 1.7897480741791756, "language_loss": 0.72815812, "learning_rate": 1.8694285384154777e-06, "loss": 0.74928057, "num_input_tokens_seen": 191557400, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4296875, "step": 8901, "time_per_iteration": 2.4339797496795654 }, { "auxiliary_loss_clip": 0.01067898, "auxiliary_loss_mlp": 0.01048836, "balance_loss_clip": 1.01965368, "balance_loss_mlp": 1.02506447, "epoch": 0.535217195250263, "flos": 19827558737280.0, "grad_norm": 1.9213024303147992, "language_loss": 0.78756297, "learning_rate": 1.8690399100853699e-06, "loss": 0.80873024, "num_input_tokens_seen": 191575860, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4296875, "step": 8902, "time_per_iteration": 2.3829216957092285 }, { "auxiliary_loss_clip": 0.01063051, "auxiliary_loss_mlp": 0.01047291, "balance_loss_clip": 1.02174425, "balance_loss_mlp": 1.02379465, "epoch": 0.535277318502931, "flos": 22126763923200.0, "grad_norm": 1.4526084444728036, "language_loss": 0.70623171, "learning_rate": 1.868651286721281e-06, "loss": 0.7273351, "num_input_tokens_seen": 191595775, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.39257812, "step": 8903, "time_per_iteration": 2.4510228633880615 }, { "auxiliary_loss_clip": 0.01063821, "auxiliary_loss_mlp": 0.01051648, "balance_loss_clip": 1.02049828, "balance_loss_mlp": 1.02136683, "epoch": 0.5353374417555989, "flos": 25044734868480.0, "grad_norm": 1.5721375966744437, "language_loss": 0.73643661, "learning_rate": 1.86826266833795e-06, "loss": 0.75759137, "num_input_tokens_seen": 191617785, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.42578125, "step": 8904, "time_per_iteration": 2.4856839179992676 }, { "auxiliary_loss_clip": 0.01063687, "auxiliary_loss_mlp": 0.01047063, "balance_loss_clip": 1.01869118, "balance_loss_mlp": 1.02245307, "epoch": 0.535397565008267, "flos": 19387757940480.0, "grad_norm": 2.00494889914011, "language_loss": 0.74273026, "learning_rate": 1.8678740549501103e-06, "loss": 0.76383775, "num_input_tokens_seen": 191636900, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.41210938, "step": 8905, "time_per_iteration": 2.3885810375213623 }, { "auxiliary_loss_clip": 0.01056087, "auxiliary_loss_mlp": 0.01040577, "balance_loss_clip": 1.01464868, "balance_loss_mlp": 1.01894772, "epoch": 0.5354576882609349, "flos": 21470117472000.0, "grad_norm": 1.5119191310032039, "language_loss": 0.84734917, "learning_rate": 1.8674854465725005e-06, "loss": 0.86831582, "num_input_tokens_seen": 191656720, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37109375, "step": 8906, "time_per_iteration": 2.4262685775756836 }, { "auxiliary_loss_clip": 0.01061227, "auxiliary_loss_mlp": 0.01050648, "balance_loss_clip": 1.01734066, "balance_loss_mlp": 1.01943147, "epoch": 0.5355178115136029, "flos": 20776463112960.0, "grad_norm": 1.988213604635427, "language_loss": 0.74982154, "learning_rate": 1.8670968432198563e-06, "loss": 0.7709403, "num_input_tokens_seen": 191674445, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 0.41796875, "step": 8907, "time_per_iteration": 2.3803696632385254 }, { "auxiliary_loss_clip": 0.01059859, "auxiliary_loss_mlp": 0.01046911, "balance_loss_clip": 1.01930213, "balance_loss_mlp": 1.01949835, "epoch": 0.5355779347662708, "flos": 23512885655040.0, "grad_norm": 1.901327394138041, "language_loss": 0.77715266, "learning_rate": 1.866708244906912e-06, "loss": 0.79822034, "num_input_tokens_seen": 191695000, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.40234375, "step": 8908, "time_per_iteration": 2.4347689151763916 }, { "auxiliary_loss_clip": 0.01062941, "auxiliary_loss_mlp": 0.01049264, "balance_loss_clip": 1.02014148, "balance_loss_mlp": 1.0206486, "epoch": 0.5356380580189388, "flos": 20302168026240.0, "grad_norm": 2.035663742641289, "language_loss": 0.75531268, "learning_rate": 1.8663196516484055e-06, "loss": 0.77643466, "num_input_tokens_seen": 191713295, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.421875, "step": 8909, "time_per_iteration": 2.370511293411255 }, { "auxiliary_loss_clip": 0.01059988, "auxiliary_loss_mlp": 0.01050983, "balance_loss_clip": 1.02401829, "balance_loss_mlp": 1.01988292, "epoch": 0.5356981812716068, "flos": 21360560025600.0, "grad_norm": 2.002884545803835, "language_loss": 0.85591042, "learning_rate": 1.8659310634590702e-06, "loss": 0.87702012, "num_input_tokens_seen": 191732725, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40039062, "step": 8910, "time_per_iteration": 2.4434726238250732 }, { "auxiliary_loss_clip": 0.01060415, "auxiliary_loss_mlp": 0.01041084, "balance_loss_clip": 1.01209211, "balance_loss_mlp": 1.01856661, "epoch": 0.5357583045242748, "flos": 23110162588800.0, "grad_norm": 1.4727261373466636, "language_loss": 0.82746887, "learning_rate": 1.8655424803536427e-06, "loss": 0.84848392, "num_input_tokens_seen": 191753765, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41796875, "step": 8911, "time_per_iteration": 2.4135525226593018 }, { "auxiliary_loss_clip": 0.01060349, "auxiliary_loss_mlp": 0.01040149, "balance_loss_clip": 1.01333928, "balance_loss_mlp": 1.01986504, "epoch": 0.5358184277769428, "flos": 21140711994240.0, "grad_norm": 2.7258566904273196, "language_loss": 0.70417905, "learning_rate": 1.8651539023468585e-06, "loss": 0.72518408, "num_input_tokens_seen": 191773560, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40625, "step": 8912, "time_per_iteration": 2.4221699237823486 }, { "auxiliary_loss_clip": 0.01061747, "auxiliary_loss_mlp": 0.01048281, "balance_loss_clip": 1.02188802, "balance_loss_mlp": 1.02104568, "epoch": 0.5358785510296107, "flos": 16281675256320.0, "grad_norm": 1.932447615197762, "language_loss": 0.72819209, "learning_rate": 1.8647653294534509e-06, "loss": 0.74929237, "num_input_tokens_seen": 191791255, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40625, "step": 8913, "time_per_iteration": 2.3562655448913574 }, { "auxiliary_loss_clip": 0.01062682, "auxiliary_loss_mlp": 0.01041091, "balance_loss_clip": 1.01319635, "balance_loss_mlp": 1.02065992, "epoch": 0.5359386742822787, "flos": 16976097665280.0, "grad_norm": 1.5969317224802104, "language_loss": 0.7265929, "learning_rate": 1.864376761688156e-06, "loss": 0.7476306, "num_input_tokens_seen": 191809325, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41992188, "step": 8914, "time_per_iteration": 2.4059157371520996 }, { "auxiliary_loss_clip": 0.01063633, "auxiliary_loss_mlp": 0.01046583, "balance_loss_clip": 1.01613724, "balance_loss_mlp": 1.02039218, "epoch": 0.5359987975349466, "flos": 20811900009600.0, "grad_norm": 1.9155088400901905, "language_loss": 0.71881568, "learning_rate": 1.8639881990657079e-06, "loss": 0.73991787, "num_input_tokens_seen": 191829795, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.43164062, "step": 8915, "time_per_iteration": 3.643850803375244 }, { "auxiliary_loss_clip": 0.01061087, "auxiliary_loss_mlp": 0.01045207, "balance_loss_clip": 1.0162394, "balance_loss_mlp": 1.02081382, "epoch": 0.5360589207876146, "flos": 22198859614080.0, "grad_norm": 1.732350776190522, "language_loss": 0.769216, "learning_rate": 1.8635996416008408e-06, "loss": 0.79027897, "num_input_tokens_seen": 191850840, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.40234375, "step": 8916, "time_per_iteration": 2.396199941635132 }, { "auxiliary_loss_clip": 0.01063134, "auxiliary_loss_mlp": 0.01044788, "balance_loss_clip": 1.01642776, "balance_loss_mlp": 1.02132297, "epoch": 0.5361190440402825, "flos": 31393027094400.0, "grad_norm": 8.614904049543634, "language_loss": 0.73389709, "learning_rate": 1.863211089308289e-06, "loss": 0.75497633, "num_input_tokens_seen": 191869520, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.41796875, "step": 8917, "time_per_iteration": 3.8461360931396484 }, { "auxiliary_loss_clip": 0.01062627, "auxiliary_loss_mlp": 0.01049861, "balance_loss_clip": 1.0229435, "balance_loss_mlp": 1.02211475, "epoch": 0.5361791672929506, "flos": 16068984053760.0, "grad_norm": 2.159824558487007, "language_loss": 0.72848344, "learning_rate": 1.8628225422027865e-06, "loss": 0.74960834, "num_input_tokens_seen": 191887240, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40625, "step": 8918, "time_per_iteration": 3.8624536991119385 }, { "auxiliary_loss_clip": 0.01064461, "auxiliary_loss_mlp": 0.01043696, "balance_loss_clip": 1.01679039, "balance_loss_mlp": 1.02293301, "epoch": 0.5362392905456185, "flos": 20739874141440.0, "grad_norm": 1.4154404216487477, "language_loss": 0.75639176, "learning_rate": 1.862434000299067e-06, "loss": 0.77747333, "num_input_tokens_seen": 191905690, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.4140625, "step": 8919, "time_per_iteration": 2.3792266845703125 }, { "auxiliary_loss_clip": 0.01062174, "auxiliary_loss_mlp": 0.01047321, "balance_loss_clip": 1.02142859, "balance_loss_mlp": 1.02101207, "epoch": 0.5362994137982865, "flos": 17339334117120.0, "grad_norm": 1.81102550749132, "language_loss": 0.73100811, "learning_rate": 1.862045463611864e-06, "loss": 0.75210309, "num_input_tokens_seen": 191920725, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.41210938, "step": 8920, "time_per_iteration": 2.380683422088623 }, { "auxiliary_loss_clip": 0.01061143, "auxiliary_loss_mlp": 0.01044706, "balance_loss_clip": 1.01651287, "balance_loss_mlp": 1.02094889, "epoch": 0.5363595370509544, "flos": 42812314122240.0, "grad_norm": 1.698400698304111, "language_loss": 0.69555867, "learning_rate": 1.8616569321559105e-06, "loss": 0.71661723, "num_input_tokens_seen": 191944645, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40234375, "step": 8921, "time_per_iteration": 2.5705759525299072 }, { "auxiliary_loss_clip": 0.01065236, "auxiliary_loss_mlp": 0.01042895, "balance_loss_clip": 1.01427317, "balance_loss_mlp": 1.02408421, "epoch": 0.5364196603036224, "flos": 19170947197440.0, "grad_norm": 1.79979168271998, "language_loss": 0.82978195, "learning_rate": 1.86126840594594e-06, "loss": 0.85086322, "num_input_tokens_seen": 191962265, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.41210938, "step": 8922, "time_per_iteration": 2.4011123180389404 }, { "auxiliary_loss_clip": 0.0106221, "auxiliary_loss_mlp": 0.01041407, "balance_loss_clip": 1.0146687, "balance_loss_mlp": 1.02110839, "epoch": 0.5364797835562904, "flos": 17930099099520.0, "grad_norm": 2.281972116780324, "language_loss": 0.77675593, "learning_rate": 1.860879884996686e-06, "loss": 0.79779208, "num_input_tokens_seen": 191978850, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.41015625, "step": 8923, "time_per_iteration": 2.4456005096435547 }, { "auxiliary_loss_clip": 0.01065447, "auxiliary_loss_mlp": 0.01047425, "balance_loss_clip": 1.0193634, "balance_loss_mlp": 1.02280951, "epoch": 0.5365399068089584, "flos": 30226718482560.0, "grad_norm": 1.5464700502781064, "language_loss": 0.71232474, "learning_rate": 1.8604913693228804e-06, "loss": 0.73345351, "num_input_tokens_seen": 192002000, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.42578125, "step": 8924, "time_per_iteration": 2.482163190841675 }, { "auxiliary_loss_clip": 0.01065051, "auxiliary_loss_mlp": 0.01048715, "balance_loss_clip": 1.01760125, "balance_loss_mlp": 1.02306151, "epoch": 0.5366000300616264, "flos": 24890768305920.0, "grad_norm": 1.8043655231516553, "language_loss": 0.88353157, "learning_rate": 1.8601028589392558e-06, "loss": 0.90466917, "num_input_tokens_seen": 192019100, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.41992188, "step": 8925, "time_per_iteration": 2.422456741333008 }, { "auxiliary_loss_clip": 0.01063248, "auxiliary_loss_mlp": 0.01041822, "balance_loss_clip": 1.0154767, "balance_loss_mlp": 1.02147818, "epoch": 0.5366601533142943, "flos": 29825322048000.0, "grad_norm": 1.5371652857121307, "language_loss": 0.78513134, "learning_rate": 1.8597143538605455e-06, "loss": 0.80618203, "num_input_tokens_seen": 192041660, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.41796875, "step": 8926, "time_per_iteration": 4.001922130584717 }, { "auxiliary_loss_clip": 0.01060921, "auxiliary_loss_mlp": 0.01039203, "balance_loss_clip": 1.01407409, "balance_loss_mlp": 1.02237046, "epoch": 0.5367202765669623, "flos": 27198107838720.0, "grad_norm": 1.3834995510839738, "language_loss": 0.67799723, "learning_rate": 1.85932585410148e-06, "loss": 0.69899845, "num_input_tokens_seen": 192063540, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38476562, "step": 8927, "time_per_iteration": 2.4455482959747314 }, { "auxiliary_loss_clip": 0.01060971, "auxiliary_loss_mlp": 0.0103898, "balance_loss_clip": 1.01331377, "balance_loss_mlp": 1.02010584, "epoch": 0.5367803998196302, "flos": 20228920260480.0, "grad_norm": 1.7501057741621895, "language_loss": 0.75113106, "learning_rate": 1.8589373596767929e-06, "loss": 0.77213061, "num_input_tokens_seen": 192081760, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.40820312, "step": 8928, "time_per_iteration": 2.424966335296631 }, { "auxiliary_loss_clip": 0.01060328, "auxiliary_loss_mlp": 0.01035022, "balance_loss_clip": 1.01158547, "balance_loss_mlp": 1.02106953, "epoch": 0.5368405230722982, "flos": 32153435706240.0, "grad_norm": 2.03069349144544, "language_loss": 0.64169168, "learning_rate": 1.8585488706012154e-06, "loss": 0.66264516, "num_input_tokens_seen": 192101620, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.39257812, "step": 8929, "time_per_iteration": 2.471461772918701 }, { "auxiliary_loss_clip": 0.01060655, "auxiliary_loss_mlp": 0.01044608, "balance_loss_clip": 1.01983595, "balance_loss_mlp": 1.0205375, "epoch": 0.5369006463249661, "flos": 26246794579200.0, "grad_norm": 1.6783940610683814, "language_loss": 0.6731981, "learning_rate": 1.8581603868894781e-06, "loss": 0.69425082, "num_input_tokens_seen": 192121805, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.40039062, "step": 8930, "time_per_iteration": 2.453794002532959 }, { "auxiliary_loss_clip": 0.01060031, "auxiliary_loss_mlp": 0.01038848, "balance_loss_clip": 1.01163232, "balance_loss_mlp": 1.01999247, "epoch": 0.5369607695776342, "flos": 26210170696320.0, "grad_norm": 1.519035641934808, "language_loss": 0.68409687, "learning_rate": 1.8577719085563136e-06, "loss": 0.70508569, "num_input_tokens_seen": 192141765, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40039062, "step": 8931, "time_per_iteration": 2.416675090789795 }, { "auxiliary_loss_clip": 0.0106145, "auxiliary_loss_mlp": 0.01042631, "balance_loss_clip": 1.01747727, "balance_loss_mlp": 1.02184939, "epoch": 0.5370208928303021, "flos": 25007866606080.0, "grad_norm": 1.614254314826076, "language_loss": 0.76886606, "learning_rate": 1.8573834356164525e-06, "loss": 0.78990686, "num_input_tokens_seen": 192161560, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.39648438, "step": 8932, "time_per_iteration": 2.4670121669769287 }, { "auxiliary_loss_clip": 0.01059796, "auxiliary_loss_mlp": 0.01037622, "balance_loss_clip": 1.01225448, "balance_loss_mlp": 1.01968837, "epoch": 0.5370810160829701, "flos": 31790897481600.0, "grad_norm": 2.015454669579529, "language_loss": 0.67043734, "learning_rate": 1.8569949680846261e-06, "loss": 0.6914115, "num_input_tokens_seen": 192180190, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40234375, "step": 8933, "time_per_iteration": 2.48069429397583 }, { "auxiliary_loss_clip": 0.01057378, "auxiliary_loss_mlp": 0.0104004, "balance_loss_clip": 1.01481557, "balance_loss_mlp": 1.01921618, "epoch": 0.537141139335638, "flos": 23841453260160.0, "grad_norm": 1.644395959033918, "language_loss": 0.83793044, "learning_rate": 1.856606505975565e-06, "loss": 0.8589046, "num_input_tokens_seen": 192198855, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38085938, "step": 8934, "time_per_iteration": 2.4439468383789062 }, { "auxiliary_loss_clip": 0.01056457, "auxiliary_loss_mlp": 0.01044267, "balance_loss_clip": 1.01789832, "balance_loss_mlp": 1.01879072, "epoch": 0.537201262588306, "flos": 18508016701440.0, "grad_norm": 1.9821992931733459, "language_loss": 0.80805588, "learning_rate": 1.856218049303999e-06, "loss": 0.82906306, "num_input_tokens_seen": 192216555, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.37695312, "step": 8935, "time_per_iteration": 2.3380212783813477 }, { "auxiliary_loss_clip": 0.010578, "auxiliary_loss_mlp": 0.01043129, "balance_loss_clip": 1.01723635, "balance_loss_mlp": 1.01845884, "epoch": 0.537261385840974, "flos": 25661859793920.0, "grad_norm": 10.030755595023885, "language_loss": 0.8467598, "learning_rate": 1.855829598084659e-06, "loss": 0.86776906, "num_input_tokens_seen": 192236910, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39453125, "step": 8936, "time_per_iteration": 2.4430556297302246 }, { "auxiliary_loss_clip": 0.01055912, "auxiliary_loss_mlp": 0.01039837, "balance_loss_clip": 1.01547027, "balance_loss_mlp": 1.01791883, "epoch": 0.537321509093642, "flos": 40733410815360.0, "grad_norm": 1.2860961306905667, "language_loss": 0.73651946, "learning_rate": 1.8554411523322754e-06, "loss": 0.75747693, "num_input_tokens_seen": 192260790, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.37890625, "step": 8937, "time_per_iteration": 2.5403835773468018 }, { "auxiliary_loss_clip": 0.01057225, "auxiliary_loss_mlp": 0.01038425, "balance_loss_clip": 1.01350975, "balance_loss_mlp": 1.01719332, "epoch": 0.53738163234631, "flos": 17237526992640.0, "grad_norm": 2.3331083130226755, "language_loss": 0.82505441, "learning_rate": 1.8550527120615778e-06, "loss": 0.84601092, "num_input_tokens_seen": 192277230, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.40039062, "step": 8938, "time_per_iteration": 2.3777172565460205 }, { "auxiliary_loss_clip": 0.01061157, "auxiliary_loss_mlp": 0.01045818, "balance_loss_clip": 1.01911533, "balance_loss_mlp": 1.02012837, "epoch": 0.5374417555989779, "flos": 12821188694400.0, "grad_norm": 2.3610462377042185, "language_loss": 0.81864643, "learning_rate": 1.8546642772872957e-06, "loss": 0.8397162, "num_input_tokens_seen": 192292840, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.41015625, "step": 8939, "time_per_iteration": 2.3595237731933594 }, { "auxiliary_loss_clip": 0.01010822, "auxiliary_loss_mlp": 0.01002732, "balance_loss_clip": 1.000229, "balance_loss_mlp": 1.00365186, "epoch": 0.5375018788516459, "flos": 67252771445760.0, "grad_norm": 0.7078105291319816, "language_loss": 0.52529275, "learning_rate": 1.8542758480241589e-06, "loss": 0.54542834, "num_input_tokens_seen": 192358240, "router_z_loss_clip": 0.02502441, "router_z_loss_mlp": 0.07128906, "step": 8940, "time_per_iteration": 3.0362002849578857 }, { "auxiliary_loss_clip": 0.01055438, "auxiliary_loss_mlp": 0.01036697, "balance_loss_clip": 1.01290321, "balance_loss_mlp": 1.01785064, "epoch": 0.5375620021043138, "flos": 18113183602560.0, "grad_norm": 1.6973258593163194, "language_loss": 0.73459923, "learning_rate": 1.8538874242868965e-06, "loss": 0.75552058, "num_input_tokens_seen": 192377370, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.375, "step": 8941, "time_per_iteration": 2.35707688331604 }, { "auxiliary_loss_clip": 0.01054146, "auxiliary_loss_mlp": 0.01038855, "balance_loss_clip": 1.0161221, "balance_loss_mlp": 1.01740837, "epoch": 0.5376221253569818, "flos": 23148252748800.0, "grad_norm": 1.6732213765550117, "language_loss": 0.80721354, "learning_rate": 1.853499006090237e-06, "loss": 0.8281436, "num_input_tokens_seen": 192396450, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.3671875, "step": 8942, "time_per_iteration": 2.432715654373169 }, { "auxiliary_loss_clip": 0.01058069, "auxiliary_loss_mlp": 0.01040456, "balance_loss_clip": 1.0140388, "balance_loss_mlp": 1.018085, "epoch": 0.5376822486096497, "flos": 29970979706880.0, "grad_norm": 1.9204127927109742, "language_loss": 0.71803719, "learning_rate": 1.853110593448911e-06, "loss": 0.73902243, "num_input_tokens_seen": 192417390, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.3984375, "step": 8943, "time_per_iteration": 2.436572313308716 }, { "auxiliary_loss_clip": 0.01009241, "auxiliary_loss_mlp": 0.01002972, "balance_loss_clip": 1.00082672, "balance_loss_mlp": 1.00205779, "epoch": 0.5377423718623178, "flos": 54165752726400.0, "grad_norm": 0.8367067700047615, "language_loss": 0.59753829, "learning_rate": 1.852722186377645e-06, "loss": 0.61766046, "num_input_tokens_seen": 192478060, "router_z_loss_clip": 0.02148438, "router_z_loss_mlp": 0.07177734, "step": 8944, "time_per_iteration": 3.037797451019287 }, { "auxiliary_loss_clip": 0.01061648, "auxiliary_loss_mlp": 0.01045077, "balance_loss_clip": 1.01649046, "balance_loss_mlp": 1.01888394, "epoch": 0.5378024951149857, "flos": 23255994804480.0, "grad_norm": 3.0749352379848665, "language_loss": 0.78804541, "learning_rate": 1.852333784891169e-06, "loss": 0.80911267, "num_input_tokens_seen": 192495985, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.42773438, "step": 8945, "time_per_iteration": 2.3829212188720703 }, { "auxiliary_loss_clip": 0.01056655, "auxiliary_loss_mlp": 0.01036239, "balance_loss_clip": 1.01201558, "balance_loss_mlp": 1.01705623, "epoch": 0.5378626183676537, "flos": 24022966752000.0, "grad_norm": 1.7580223898635348, "language_loss": 0.69726533, "learning_rate": 1.8519453890042112e-06, "loss": 0.71819425, "num_input_tokens_seen": 192515445, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.39648438, "step": 8946, "time_per_iteration": 2.442201614379883 }, { "auxiliary_loss_clip": 0.01055776, "auxiliary_loss_mlp": 0.01043655, "balance_loss_clip": 1.02043331, "balance_loss_mlp": 1.01800525, "epoch": 0.5379227416203216, "flos": 27160576260480.0, "grad_norm": 1.510240069492998, "language_loss": 0.78098416, "learning_rate": 1.851556998731498e-06, "loss": 0.80197847, "num_input_tokens_seen": 192536530, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37890625, "step": 8947, "time_per_iteration": 2.4441444873809814 }, { "auxiliary_loss_clip": 0.0105712, "auxiliary_loss_mlp": 0.01038878, "balance_loss_clip": 1.01393962, "balance_loss_mlp": 1.0179261, "epoch": 0.5379828648729896, "flos": 24680451075840.0, "grad_norm": 1.7497469169330293, "language_loss": 0.6138885, "learning_rate": 1.8511686140877592e-06, "loss": 0.63484848, "num_input_tokens_seen": 192556075, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.390625, "step": 8948, "time_per_iteration": 2.4484026432037354 }, { "auxiliary_loss_clip": 0.0105775, "auxiliary_loss_mlp": 0.01040686, "balance_loss_clip": 1.01834655, "balance_loss_mlp": 1.01910567, "epoch": 0.5380429881256577, "flos": 22522330160640.0, "grad_norm": 1.815220120212949, "language_loss": 0.80242872, "learning_rate": 1.8507802350877205e-06, "loss": 0.82341307, "num_input_tokens_seen": 192575535, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.38671875, "step": 8949, "time_per_iteration": 2.390437126159668 }, { "auxiliary_loss_clip": 0.01054392, "auxiliary_loss_mlp": 0.01039984, "balance_loss_clip": 1.0167737, "balance_loss_mlp": 1.01757836, "epoch": 0.5381031113783256, "flos": 26978329630080.0, "grad_norm": 1.6309538217010142, "language_loss": 0.79235423, "learning_rate": 1.850391861746111e-06, "loss": 0.81329799, "num_input_tokens_seen": 192594490, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36914062, "step": 8950, "time_per_iteration": 2.483891248703003 }, { "auxiliary_loss_clip": 0.01055683, "auxiliary_loss_mlp": 0.01038733, "balance_loss_clip": 1.01493907, "balance_loss_mlp": 1.01814222, "epoch": 0.5381632346309936, "flos": 24752930791680.0, "grad_norm": 1.5781124079692752, "language_loss": 0.73622531, "learning_rate": 1.8500034940776573e-06, "loss": 0.75716949, "num_input_tokens_seen": 192615650, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.375, "step": 8951, "time_per_iteration": 2.4104650020599365 }, { "auxiliary_loss_clip": 0.01056404, "auxiliary_loss_mlp": 0.01037683, "balance_loss_clip": 1.01247025, "balance_loss_mlp": 1.01691258, "epoch": 0.5382233578836615, "flos": 15559147336320.0, "grad_norm": 2.2400974920651584, "language_loss": 0.764476, "learning_rate": 1.849615132097085e-06, "loss": 0.7854169, "num_input_tokens_seen": 192633840, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.39453125, "step": 8952, "time_per_iteration": 2.409898519515991 }, { "auxiliary_loss_clip": 0.01058143, "auxiliary_loss_mlp": 0.01037982, "balance_loss_clip": 1.01094496, "balance_loss_mlp": 1.01887238, "epoch": 0.5382834811363295, "flos": 25083278876160.0, "grad_norm": 1.472357334311386, "language_loss": 0.80165809, "learning_rate": 1.8492267758191228e-06, "loss": 0.82261932, "num_input_tokens_seen": 192655890, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.39257812, "step": 8953, "time_per_iteration": 2.422800064086914 }, { "auxiliary_loss_clip": 0.01055782, "auxiliary_loss_mlp": 0.01038036, "balance_loss_clip": 1.0137893, "balance_loss_mlp": 1.01808465, "epoch": 0.5383436043889974, "flos": 13297054792320.0, "grad_norm": 1.8543790006106247, "language_loss": 0.82048732, "learning_rate": 1.8488384252584964e-06, "loss": 0.84142548, "num_input_tokens_seen": 192673025, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37695312, "step": 8954, "time_per_iteration": 2.3814477920532227 }, { "auxiliary_loss_clip": 0.01057086, "auxiliary_loss_mlp": 0.01039816, "balance_loss_clip": 1.01475799, "balance_loss_mlp": 1.01823997, "epoch": 0.5384037276416654, "flos": 23038276366080.0, "grad_norm": 1.857722653483034, "language_loss": 0.78084671, "learning_rate": 1.8484500804299318e-06, "loss": 0.80181575, "num_input_tokens_seen": 192692190, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38867188, "step": 8955, "time_per_iteration": 3.631215810775757 }, { "auxiliary_loss_clip": 0.01056768, "auxiliary_loss_mlp": 0.01041021, "balance_loss_clip": 1.01500928, "balance_loss_mlp": 1.01889789, "epoch": 0.5384638508943334, "flos": 20630107226880.0, "grad_norm": 1.5954811077793964, "language_loss": 0.79965377, "learning_rate": 1.8480617413481557e-06, "loss": 0.82063162, "num_input_tokens_seen": 192710380, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37890625, "step": 8956, "time_per_iteration": 3.8077375888824463 }, { "auxiliary_loss_clip": 0.01011196, "auxiliary_loss_mlp": 0.01007055, "balance_loss_clip": 1.00439668, "balance_loss_mlp": 1.00440657, "epoch": 0.5385239741470014, "flos": 66734660736000.0, "grad_norm": 0.8608436064406197, "language_loss": 0.63602746, "learning_rate": 1.8476734080278932e-06, "loss": 0.65620995, "num_input_tokens_seen": 192768995, "router_z_loss_clip": 0.02661133, "router_z_loss_mlp": 0.06787109, "step": 8957, "time_per_iteration": 2.9715232849121094 }, { "auxiliary_loss_clip": 0.01010713, "auxiliary_loss_mlp": 0.0100368, "balance_loss_clip": 1.00136757, "balance_loss_mlp": 1.00385141, "epoch": 0.5385840973996693, "flos": 64712490255360.0, "grad_norm": 0.7096124123491183, "language_loss": 0.51777202, "learning_rate": 1.8472850804838705e-06, "loss": 0.537916, "num_input_tokens_seen": 192825585, "router_z_loss_clip": 0.02307129, "router_z_loss_mlp": 0.06835938, "step": 8958, "time_per_iteration": 4.472727298736572 }, { "auxiliary_loss_clip": 0.01058434, "auxiliary_loss_mlp": 0.01039583, "balance_loss_clip": 1.01422727, "balance_loss_mlp": 1.01836693, "epoch": 0.5386442206523373, "flos": 26140553712000.0, "grad_norm": 1.5589692663596597, "language_loss": 0.78063565, "learning_rate": 1.8468967587308128e-06, "loss": 0.80161583, "num_input_tokens_seen": 192847335, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40234375, "step": 8959, "time_per_iteration": 2.430861473083496 }, { "auxiliary_loss_clip": 0.01058341, "auxiliary_loss_mlp": 0.01038607, "balance_loss_clip": 1.01409721, "balance_loss_mlp": 1.01832962, "epoch": 0.5387043439050052, "flos": 18251090939520.0, "grad_norm": 2.310057288415181, "language_loss": 0.84586954, "learning_rate": 1.8465084427834455e-06, "loss": 0.86683899, "num_input_tokens_seen": 192862205, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.40039062, "step": 8960, "time_per_iteration": 2.3853800296783447 }, { "auxiliary_loss_clip": 0.01059604, "auxiliary_loss_mlp": 0.01038375, "balance_loss_clip": 1.01330543, "balance_loss_mlp": 1.01970553, "epoch": 0.5387644671576732, "flos": 29787022419840.0, "grad_norm": 1.4471658114563881, "language_loss": 0.79901946, "learning_rate": 1.8461201326564933e-06, "loss": 0.81999922, "num_input_tokens_seen": 192883695, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3984375, "step": 8961, "time_per_iteration": 2.435152769088745 }, { "auxiliary_loss_clip": 0.01059181, "auxiliary_loss_mlp": 0.01048647, "balance_loss_clip": 1.02301741, "balance_loss_mlp": 1.01931548, "epoch": 0.5388245904103413, "flos": 22373600302080.0, "grad_norm": 1.7904018516199225, "language_loss": 0.85113066, "learning_rate": 1.845731828364681e-06, "loss": 0.87220895, "num_input_tokens_seen": 192900190, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.3984375, "step": 8962, "time_per_iteration": 2.4178013801574707 }, { "auxiliary_loss_clip": 0.01013907, "auxiliary_loss_mlp": 0.01011693, "balance_loss_clip": 1.00952375, "balance_loss_mlp": 1.00663519, "epoch": 0.5388847136630092, "flos": 69804538473600.0, "grad_norm": 0.7358616053125006, "language_loss": 0.54220402, "learning_rate": 1.8453435299227333e-06, "loss": 0.56246006, "num_input_tokens_seen": 192958675, "router_z_loss_clip": 0.02172852, "router_z_loss_mlp": 0.07275391, "step": 8963, "time_per_iteration": 2.953850746154785 }, { "auxiliary_loss_clip": 0.01011479, "auxiliary_loss_mlp": 0.01008706, "balance_loss_clip": 1.00636983, "balance_loss_mlp": 1.00427961, "epoch": 0.5389448369156772, "flos": 69818642663040.0, "grad_norm": 0.8405623080527205, "language_loss": 0.63612169, "learning_rate": 1.8449552373453744e-06, "loss": 0.65632361, "num_input_tokens_seen": 193033135, "router_z_loss_clip": 0.02331543, "router_z_loss_mlp": 0.07177734, "step": 8964, "time_per_iteration": 3.1220006942749023 }, { "auxiliary_loss_clip": 0.01059031, "auxiliary_loss_mlp": 0.01048386, "balance_loss_clip": 1.02021682, "balance_loss_mlp": 1.01766276, "epoch": 0.5390049601683451, "flos": 31721105940480.0, "grad_norm": 1.4758028928091504, "language_loss": 0.7089926, "learning_rate": 1.8445669506473287e-06, "loss": 0.73006678, "num_input_tokens_seen": 193055570, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.4140625, "step": 8965, "time_per_iteration": 2.4459476470947266 }, { "auxiliary_loss_clip": 0.01059777, "auxiliary_loss_mlp": 0.01043161, "balance_loss_clip": 1.01539755, "balance_loss_mlp": 1.01841962, "epoch": 0.5390650834210131, "flos": 18112520286720.0, "grad_norm": 2.076461512921895, "language_loss": 0.83626616, "learning_rate": 1.8441786698433192e-06, "loss": 0.85729551, "num_input_tokens_seen": 193073120, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.4140625, "step": 8966, "time_per_iteration": 3.8342537879943848 }, { "auxiliary_loss_clip": 0.01057914, "auxiliary_loss_mlp": 0.01045609, "balance_loss_clip": 1.01946676, "balance_loss_mlp": 1.01850271, "epoch": 0.539125206673681, "flos": 17415863550720.0, "grad_norm": 2.3315091395767267, "language_loss": 0.7388761, "learning_rate": 1.8437903949480706e-06, "loss": 0.7599113, "num_input_tokens_seen": 193090105, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.39453125, "step": 8967, "time_per_iteration": 2.3663673400878906 }, { "auxiliary_loss_clip": 0.01058863, "auxiliary_loss_mlp": 0.01042347, "balance_loss_clip": 1.01708674, "balance_loss_mlp": 1.01834643, "epoch": 0.539185329926349, "flos": 22197882096000.0, "grad_norm": 1.6398824528864735, "language_loss": 0.83116883, "learning_rate": 1.8434021259763065e-06, "loss": 0.8521809, "num_input_tokens_seen": 193109325, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40625, "step": 8968, "time_per_iteration": 2.4037864208221436 }, { "auxiliary_loss_clip": 0.01058233, "auxiliary_loss_mlp": 0.01043357, "balance_loss_clip": 1.01481867, "balance_loss_mlp": 1.01843548, "epoch": 0.539245453179017, "flos": 21433319032320.0, "grad_norm": 1.6166746259951301, "language_loss": 0.74946511, "learning_rate": 1.8430138629427484e-06, "loss": 0.77048099, "num_input_tokens_seen": 193130595, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.3984375, "step": 8969, "time_per_iteration": 2.403905153274536 }, { "auxiliary_loss_clip": 0.01061107, "auxiliary_loss_mlp": 0.01044488, "balance_loss_clip": 1.01594973, "balance_loss_mlp": 1.01872611, "epoch": 0.539305576431685, "flos": 20734113767040.0, "grad_norm": 1.9428928612055358, "language_loss": 0.83089077, "learning_rate": 1.8426256058621205e-06, "loss": 0.85194671, "num_input_tokens_seen": 193148930, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.421875, "step": 8970, "time_per_iteration": 2.401843309402466 }, { "auxiliary_loss_clip": 0.01057607, "auxiliary_loss_mlp": 0.01044983, "balance_loss_clip": 1.02035439, "balance_loss_mlp": 1.01804173, "epoch": 0.5393656996843529, "flos": 30919116032640.0, "grad_norm": 1.3666022681546044, "language_loss": 0.76155531, "learning_rate": 1.842237354749146e-06, "loss": 0.78258121, "num_input_tokens_seen": 193170140, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.39453125, "step": 8971, "time_per_iteration": 2.4846408367156982 }, { "auxiliary_loss_clip": 0.01010692, "auxiliary_loss_mlp": 0.01024174, "balance_loss_clip": 1.02190936, "balance_loss_mlp": 1.00369453, "epoch": 0.5394258229370209, "flos": 50315252699520.0, "grad_norm": 0.8917501742508555, "language_loss": 0.60393536, "learning_rate": 1.8418491096185465e-06, "loss": 0.62428403, "num_input_tokens_seen": 193227235, "router_z_loss_clip": 0.02270508, "router_z_loss_mlp": 0.0703125, "step": 8972, "time_per_iteration": 3.0506210327148438 }, { "auxiliary_loss_clip": 0.01059305, "auxiliary_loss_mlp": 0.01043468, "balance_loss_clip": 1.01787424, "balance_loss_mlp": 1.01899874, "epoch": 0.5394859461896888, "flos": 25410729317760.0, "grad_norm": 1.4545777137239582, "language_loss": 0.79535711, "learning_rate": 1.841460870485045e-06, "loss": 0.81638491, "num_input_tokens_seen": 193248435, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.40234375, "step": 8973, "time_per_iteration": 2.463809013366699 }, { "auxiliary_loss_clip": 0.01063968, "auxiliary_loss_mlp": 0.01049678, "balance_loss_clip": 1.01815927, "balance_loss_mlp": 1.01955462, "epoch": 0.5395460694423568, "flos": 25477448659200.0, "grad_norm": 2.5273185583683877, "language_loss": 0.7508719, "learning_rate": 1.8410726373633623e-06, "loss": 0.7720083, "num_input_tokens_seen": 193267490, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 0.4453125, "step": 8974, "time_per_iteration": 2.4357120990753174 }, { "auxiliary_loss_clip": 0.01014775, "auxiliary_loss_mlp": 0.01010143, "balance_loss_clip": 1.00778222, "balance_loss_mlp": 1.00765443, "epoch": 0.5396061926950249, "flos": 53246524872960.0, "grad_norm": 0.734340002377585, "language_loss": 0.51125991, "learning_rate": 1.8406844102682215e-06, "loss": 0.53150904, "num_input_tokens_seen": 193326050, "router_z_loss_clip": 0.02355957, "router_z_loss_mlp": 0.07128906, "step": 8975, "time_per_iteration": 3.0572211742401123 }, { "auxiliary_loss_clip": 0.01059818, "auxiliary_loss_mlp": 0.01043163, "balance_loss_clip": 1.01642489, "balance_loss_mlp": 1.02067208, "epoch": 0.5396663159476928, "flos": 26723847663360.0, "grad_norm": 1.5587239840296387, "language_loss": 0.73087239, "learning_rate": 1.840296189214344e-06, "loss": 0.75190222, "num_input_tokens_seen": 193348785, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.390625, "step": 8976, "time_per_iteration": 2.448598623275757 }, { "auxiliary_loss_clip": 0.01061, "auxiliary_loss_mlp": 0.01042661, "balance_loss_clip": 1.01599395, "balance_loss_mlp": 1.02124524, "epoch": 0.5397264392003608, "flos": 23252398934400.0, "grad_norm": 1.7167900138141599, "language_loss": 0.7147845, "learning_rate": 1.8399079742164509e-06, "loss": 0.73582113, "num_input_tokens_seen": 193367080, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.39648438, "step": 8977, "time_per_iteration": 2.4112281799316406 }, { "auxiliary_loss_clip": 0.01061746, "auxiliary_loss_mlp": 0.01038508, "balance_loss_clip": 1.01149499, "balance_loss_mlp": 1.02172613, "epoch": 0.5397865624530287, "flos": 18293265728640.0, "grad_norm": 1.6486295666862683, "language_loss": 0.73721743, "learning_rate": 1.8395197652892636e-06, "loss": 0.75821996, "num_input_tokens_seen": 193383715, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40039062, "step": 8978, "time_per_iteration": 2.3499584197998047 }, { "auxiliary_loss_clip": 0.01065171, "auxiliary_loss_mlp": 0.01044442, "balance_loss_clip": 1.01356697, "balance_loss_mlp": 1.02182221, "epoch": 0.5398466857056967, "flos": 15296810313600.0, "grad_norm": 1.9610702643816238, "language_loss": 0.755826, "learning_rate": 1.8391315624475028e-06, "loss": 0.77692211, "num_input_tokens_seen": 193400560, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.43359375, "step": 8979, "time_per_iteration": 2.380565881729126 }, { "auxiliary_loss_clip": 0.01066956, "auxiliary_loss_mlp": 0.01053353, "balance_loss_clip": 1.02374196, "balance_loss_mlp": 1.02393889, "epoch": 0.5399068089583646, "flos": 17820786032640.0, "grad_norm": 1.9055526557088598, "language_loss": 0.77763009, "learning_rate": 1.8387433657058892e-06, "loss": 0.79883313, "num_input_tokens_seen": 193418680, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4296875, "step": 8980, "time_per_iteration": 2.37343430519104 }, { "auxiliary_loss_clip": 0.01062933, "auxiliary_loss_mlp": 0.01039365, "balance_loss_clip": 1.01379442, "balance_loss_mlp": 1.02177286, "epoch": 0.5399669322110326, "flos": 27380389380480.0, "grad_norm": 2.4123427923894485, "language_loss": 0.83400095, "learning_rate": 1.8383551750791431e-06, "loss": 0.85502386, "num_input_tokens_seen": 193439310, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.41210938, "step": 8981, "time_per_iteration": 2.4467689990997314 }, { "auxiliary_loss_clip": 0.01064948, "auxiliary_loss_mlp": 0.01043723, "balance_loss_clip": 1.01597071, "balance_loss_mlp": 1.02251744, "epoch": 0.5400270554637006, "flos": 20448070064640.0, "grad_norm": 1.9947635355718314, "language_loss": 0.67855889, "learning_rate": 1.8379669905819857e-06, "loss": 0.69964564, "num_input_tokens_seen": 193458115, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.42382812, "step": 8982, "time_per_iteration": 2.3779404163360596 }, { "auxiliary_loss_clip": 0.0106243, "auxiliary_loss_mlp": 0.01049405, "balance_loss_clip": 1.02234459, "balance_loss_mlp": 1.02275705, "epoch": 0.5400871787163686, "flos": 21688499226240.0, "grad_norm": 1.4867207218214564, "language_loss": 0.84106934, "learning_rate": 1.8375788122291358e-06, "loss": 0.86218768, "num_input_tokens_seen": 193477365, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.39648438, "step": 8983, "time_per_iteration": 2.393679618835449 }, { "auxiliary_loss_clip": 0.01062137, "auxiliary_loss_mlp": 0.01045951, "balance_loss_clip": 1.01642323, "balance_loss_mlp": 1.02227402, "epoch": 0.5401473019690365, "flos": 19203835564800.0, "grad_norm": 1.861331151480305, "language_loss": 0.71988642, "learning_rate": 1.8371906400353138e-06, "loss": 0.74096733, "num_input_tokens_seen": 193495595, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.3984375, "step": 8984, "time_per_iteration": 2.396200656890869 }, { "auxiliary_loss_clip": 0.01064089, "auxiliary_loss_mlp": 0.01048806, "balance_loss_clip": 1.02078032, "balance_loss_mlp": 1.02291405, "epoch": 0.5402074252217045, "flos": 20626441534080.0, "grad_norm": 1.935681432914346, "language_loss": 0.81596434, "learning_rate": 1.8368024740152386e-06, "loss": 0.83709329, "num_input_tokens_seen": 193514035, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41210938, "step": 8985, "time_per_iteration": 2.395359516143799 }, { "auxiliary_loss_clip": 0.0105837, "auxiliary_loss_mlp": 0.01040079, "balance_loss_clip": 1.0146513, "balance_loss_mlp": 1.02084017, "epoch": 0.5402675484743724, "flos": 24972290064000.0, "grad_norm": 1.4581307226063513, "language_loss": 0.79929209, "learning_rate": 1.83641431418363e-06, "loss": 0.82027662, "num_input_tokens_seen": 193535445, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.375, "step": 8986, "time_per_iteration": 2.4424142837524414 }, { "auxiliary_loss_clip": 0.01060759, "auxiliary_loss_mlp": 0.01047344, "balance_loss_clip": 1.02178597, "balance_loss_mlp": 1.02067447, "epoch": 0.5403276717270404, "flos": 19458142974720.0, "grad_norm": 1.5989405020568415, "language_loss": 0.78253478, "learning_rate": 1.8360261605552075e-06, "loss": 0.80361587, "num_input_tokens_seen": 193554780, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.40234375, "step": 8987, "time_per_iteration": 2.4046552181243896 }, { "auxiliary_loss_clip": 0.01059715, "auxiliary_loss_mlp": 0.0105068, "balance_loss_clip": 1.02473998, "balance_loss_mlp": 1.02029669, "epoch": 0.5403877949797083, "flos": 18441157714560.0, "grad_norm": 1.7717174374703635, "language_loss": 0.72350985, "learning_rate": 1.8356380131446887e-06, "loss": 0.74461377, "num_input_tokens_seen": 193573580, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39453125, "step": 8988, "time_per_iteration": 2.3936660289764404 }, { "auxiliary_loss_clip": 0.01059741, "auxiliary_loss_mlp": 0.01046483, "balance_loss_clip": 1.01848066, "balance_loss_mlp": 1.01922727, "epoch": 0.5404479182323764, "flos": 28291622532480.0, "grad_norm": 2.31919775424533, "language_loss": 0.6898632, "learning_rate": 1.8352498719667934e-06, "loss": 0.7109254, "num_input_tokens_seen": 193590490, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40429688, "step": 8989, "time_per_iteration": 2.4300780296325684 }, { "auxiliary_loss_clip": 0.0106002, "auxiliary_loss_mlp": 0.01051168, "balance_loss_clip": 1.02255774, "balance_loss_mlp": 1.01850891, "epoch": 0.5405080414850444, "flos": 23366215566720.0, "grad_norm": 1.4090341533778252, "language_loss": 0.78394675, "learning_rate": 1.8348617370362399e-06, "loss": 0.80505866, "num_input_tokens_seen": 193609900, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4140625, "step": 8990, "time_per_iteration": 2.426922082901001 }, { "auxiliary_loss_clip": 0.01058001, "auxiliary_loss_mlp": 0.01047174, "balance_loss_clip": 1.02207994, "balance_loss_mlp": 1.01815677, "epoch": 0.5405681647377123, "flos": 21105344920320.0, "grad_norm": 1.753818868406025, "language_loss": 0.70264602, "learning_rate": 1.834473608367745e-06, "loss": 0.72369778, "num_input_tokens_seen": 193629775, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3984375, "step": 8991, "time_per_iteration": 2.372962713241577 }, { "auxiliary_loss_clip": 0.01058298, "auxiliary_loss_mlp": 0.01058244, "balance_loss_clip": 1.03059959, "balance_loss_mlp": 1.01753485, "epoch": 0.5406282879903803, "flos": 20448139887360.0, "grad_norm": 1.7247015679145334, "language_loss": 0.7786051, "learning_rate": 1.8340854859760277e-06, "loss": 0.79977047, "num_input_tokens_seen": 193648070, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40625, "step": 8992, "time_per_iteration": 2.3975396156311035 }, { "auxiliary_loss_clip": 0.01058848, "auxiliary_loss_mlp": 0.01051674, "balance_loss_clip": 1.02456617, "balance_loss_mlp": 1.01787519, "epoch": 0.5406884112430482, "flos": 14208637057920.0, "grad_norm": 2.37075758450615, "language_loss": 0.78151548, "learning_rate": 1.8336973698758056e-06, "loss": 0.80262077, "num_input_tokens_seen": 193665060, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.41015625, "step": 8993, "time_per_iteration": 2.3424320220947266 }, { "auxiliary_loss_clip": 0.01055387, "auxiliary_loss_mlp": 0.01046805, "balance_loss_clip": 1.02091265, "balance_loss_mlp": 1.01706314, "epoch": 0.5407485344957162, "flos": 23874516184320.0, "grad_norm": 1.6973041111487555, "language_loss": 0.70863628, "learning_rate": 1.8333092600817959e-06, "loss": 0.72965825, "num_input_tokens_seen": 193683620, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3828125, "step": 8994, "time_per_iteration": 3.70424747467041 }, { "auxiliary_loss_clip": 0.01059238, "auxiliary_loss_mlp": 0.0105201, "balance_loss_clip": 1.0233283, "balance_loss_mlp": 1.01786327, "epoch": 0.5408086577483842, "flos": 23147379964800.0, "grad_norm": 2.491419053312409, "language_loss": 0.76449114, "learning_rate": 1.8329211566087157e-06, "loss": 0.78560364, "num_input_tokens_seen": 193702990, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.4140625, "step": 8995, "time_per_iteration": 3.7523932456970215 }, { "auxiliary_loss_clip": 0.01056811, "auxiliary_loss_mlp": 0.01042638, "balance_loss_clip": 1.01572061, "balance_loss_mlp": 1.01795602, "epoch": 0.5408687810010522, "flos": 18770039521920.0, "grad_norm": 1.9015684050157444, "language_loss": 0.74601519, "learning_rate": 1.832533059471282e-06, "loss": 0.76700962, "num_input_tokens_seen": 193721785, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.38867188, "step": 8996, "time_per_iteration": 2.3674206733703613 }, { "auxiliary_loss_clip": 0.01056012, "auxiliary_loss_mlp": 0.01046712, "balance_loss_clip": 1.02129626, "balance_loss_mlp": 1.0178709, "epoch": 0.5409289042537201, "flos": 13880697857280.0, "grad_norm": 1.8456103430696942, "language_loss": 0.74739599, "learning_rate": 1.8321449686842115e-06, "loss": 0.7684232, "num_input_tokens_seen": 193740315, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3828125, "step": 8997, "time_per_iteration": 2.375185489654541 }, { "auxiliary_loss_clip": 0.0105908, "auxiliary_loss_mlp": 0.01045857, "balance_loss_clip": 1.01876044, "balance_loss_mlp": 1.01901495, "epoch": 0.5409890275063881, "flos": 14464480567680.0, "grad_norm": 2.2164161532583826, "language_loss": 0.73034811, "learning_rate": 1.8317568842622207e-06, "loss": 0.75139749, "num_input_tokens_seen": 193757580, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40039062, "step": 8998, "time_per_iteration": 3.674541711807251 }, { "auxiliary_loss_clip": 0.01057384, "auxiliary_loss_mlp": 0.01039049, "balance_loss_clip": 1.01389563, "balance_loss_mlp": 1.01794624, "epoch": 0.541049150759056, "flos": 48975706454400.0, "grad_norm": 1.4637379598060303, "language_loss": 0.71076822, "learning_rate": 1.8313688062200256e-06, "loss": 0.73173249, "num_input_tokens_seen": 193780965, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.39453125, "step": 8999, "time_per_iteration": 2.637082576751709 }, { "auxiliary_loss_clip": 0.01056786, "auxiliary_loss_mlp": 0.01048848, "balance_loss_clip": 1.02212119, "balance_loss_mlp": 1.01861525, "epoch": 0.541109274011724, "flos": 18146700374400.0, "grad_norm": 2.258126134273791, "language_loss": 0.82385063, "learning_rate": 1.8309807345723422e-06, "loss": 0.84490699, "num_input_tokens_seen": 193797855, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.3828125, "step": 9000, "time_per_iteration": 2.3275983333587646 }, { "auxiliary_loss_clip": 0.01057739, "auxiliary_loss_mlp": 0.01045158, "balance_loss_clip": 1.01696467, "balance_loss_mlp": 1.0198704, "epoch": 0.541169397264392, "flos": 20521492387200.0, "grad_norm": 2.0428991569937223, "language_loss": 0.74478143, "learning_rate": 1.8305926693338863e-06, "loss": 0.76581037, "num_input_tokens_seen": 193817375, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.37890625, "step": 9001, "time_per_iteration": 2.3980133533477783 }, { "auxiliary_loss_clip": 0.01060872, "auxiliary_loss_mlp": 0.01044522, "balance_loss_clip": 1.0159955, "balance_loss_mlp": 1.01910973, "epoch": 0.54122952051706, "flos": 20043112671360.0, "grad_norm": 2.374825741441473, "language_loss": 0.87238884, "learning_rate": 1.8302046105193734e-06, "loss": 0.89344275, "num_input_tokens_seen": 193832205, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41796875, "step": 9002, "time_per_iteration": 2.3451480865478516 }, { "auxiliary_loss_clip": 0.01058118, "auxiliary_loss_mlp": 0.01037394, "balance_loss_clip": 1.01422012, "balance_loss_mlp": 1.01970601, "epoch": 0.541289643769728, "flos": 19061250105600.0, "grad_norm": 1.9461522351018676, "language_loss": 0.79572153, "learning_rate": 1.8298165581435183e-06, "loss": 0.81667662, "num_input_tokens_seen": 193849830, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.38476562, "step": 9003, "time_per_iteration": 2.377026081085205 }, { "auxiliary_loss_clip": 0.01059758, "auxiliary_loss_mlp": 0.01042404, "balance_loss_clip": 1.01512861, "balance_loss_mlp": 1.02079535, "epoch": 0.5413497670223959, "flos": 22381210978560.0, "grad_norm": 2.0004025544164254, "language_loss": 0.71071774, "learning_rate": 1.8294285122210372e-06, "loss": 0.73173934, "num_input_tokens_seen": 193869945, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.390625, "step": 9004, "time_per_iteration": 2.389606237411499 }, { "auxiliary_loss_clip": 0.01012111, "auxiliary_loss_mlp": 0.010256, "balance_loss_clip": 1.02302504, "balance_loss_mlp": 1.00480962, "epoch": 0.5414098902750639, "flos": 70028331488640.0, "grad_norm": 0.9729350548688701, "language_loss": 0.59331584, "learning_rate": 1.8290404727666434e-06, "loss": 0.613693, "num_input_tokens_seen": 193930860, "router_z_loss_clip": 0.02575684, "router_z_loss_mlp": 0.07324219, "step": 9005, "time_per_iteration": 4.5205957889556885 }, { "auxiliary_loss_clip": 0.01061705, "auxiliary_loss_mlp": 0.01045404, "balance_loss_clip": 1.0195955, "balance_loss_mlp": 1.02079582, "epoch": 0.5414700135277318, "flos": 21797882115840.0, "grad_norm": 1.8178538627835596, "language_loss": 0.80296308, "learning_rate": 1.8286524397950517e-06, "loss": 0.82403409, "num_input_tokens_seen": 193949075, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40820312, "step": 9006, "time_per_iteration": 2.40203857421875 }, { "auxiliary_loss_clip": 0.01057371, "auxiliary_loss_mlp": 0.01038006, "balance_loss_clip": 1.0153321, "balance_loss_mlp": 1.01975799, "epoch": 0.5415301367803999, "flos": 16907039262720.0, "grad_norm": 2.249568031199077, "language_loss": 0.84355617, "learning_rate": 1.8282644133209777e-06, "loss": 0.86450994, "num_input_tokens_seen": 193967630, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.375, "step": 9007, "time_per_iteration": 2.3698244094848633 }, { "auxiliary_loss_clip": 0.01061981, "auxiliary_loss_mlp": 0.01041259, "balance_loss_clip": 1.01416326, "balance_loss_mlp": 1.02195454, "epoch": 0.5415902600330678, "flos": 25702952330880.0, "grad_norm": 1.950225815954539, "language_loss": 0.67885053, "learning_rate": 1.8278763933591334e-06, "loss": 0.69988298, "num_input_tokens_seen": 193988730, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40039062, "step": 9008, "time_per_iteration": 2.4727377891540527 }, { "auxiliary_loss_clip": 0.01064084, "auxiliary_loss_mlp": 0.01046062, "balance_loss_clip": 1.01754737, "balance_loss_mlp": 1.02193022, "epoch": 0.5416503832857358, "flos": 19207152144000.0, "grad_norm": 2.509320249086475, "language_loss": 0.75636637, "learning_rate": 1.827488379924234e-06, "loss": 0.77746785, "num_input_tokens_seen": 194005160, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.421875, "step": 9009, "time_per_iteration": 2.3743577003479004 }, { "auxiliary_loss_clip": 0.0106322, "auxiliary_loss_mlp": 0.01048498, "balance_loss_clip": 1.01933932, "balance_loss_mlp": 1.02208197, "epoch": 0.5417105065384037, "flos": 12712888056960.0, "grad_norm": 1.935199508487144, "language_loss": 0.8962605, "learning_rate": 1.8271003730309923e-06, "loss": 0.91737771, "num_input_tokens_seen": 194021700, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.41210938, "step": 9010, "time_per_iteration": 2.3980228900909424 }, { "auxiliary_loss_clip": 0.01059963, "auxiliary_loss_mlp": 0.01043531, "balance_loss_clip": 1.01856911, "balance_loss_mlp": 1.02083111, "epoch": 0.5417706297910717, "flos": 30334635095040.0, "grad_norm": 1.7943799969821448, "language_loss": 0.66993344, "learning_rate": 1.826712372694122e-06, "loss": 0.69096839, "num_input_tokens_seen": 194042620, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.390625, "step": 9011, "time_per_iteration": 2.447270393371582 }, { "auxiliary_loss_clip": 0.01061599, "auxiliary_loss_mlp": 0.01041005, "balance_loss_clip": 1.01603103, "balance_loss_mlp": 1.02191806, "epoch": 0.5418307530437396, "flos": 29019771181440.0, "grad_norm": 2.0449270734460376, "language_loss": 0.80795693, "learning_rate": 1.8263243789283362e-06, "loss": 0.82898295, "num_input_tokens_seen": 194061800, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.39648438, "step": 9012, "time_per_iteration": 2.4754998683929443 }, { "auxiliary_loss_clip": 0.01060309, "auxiliary_loss_mlp": 0.01045201, "balance_loss_clip": 1.02090609, "balance_loss_mlp": 1.02084887, "epoch": 0.5418908762964076, "flos": 16872510061440.0, "grad_norm": 1.8757561546603014, "language_loss": 0.76193905, "learning_rate": 1.8259363917483466e-06, "loss": 0.78299415, "num_input_tokens_seen": 194079890, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.39453125, "step": 9013, "time_per_iteration": 2.3597066402435303 }, { "auxiliary_loss_clip": 0.01062558, "auxiliary_loss_mlp": 0.01050776, "balance_loss_clip": 1.02317941, "balance_loss_mlp": 1.02081013, "epoch": 0.5419509995490756, "flos": 18948795016320.0, "grad_norm": 1.974864113811349, "language_loss": 0.73032314, "learning_rate": 1.8255484111688667e-06, "loss": 0.75145644, "num_input_tokens_seen": 194097625, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41796875, "step": 9014, "time_per_iteration": 2.3981499671936035 }, { "auxiliary_loss_clip": 0.01061494, "auxiliary_loss_mlp": 0.0105499, "balance_loss_clip": 1.0285852, "balance_loss_mlp": 1.02150881, "epoch": 0.5420111228017436, "flos": 18076734276480.0, "grad_norm": 1.4777247001602036, "language_loss": 0.81576651, "learning_rate": 1.8251604372046085e-06, "loss": 0.83693141, "num_input_tokens_seen": 194116055, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40039062, "step": 9015, "time_per_iteration": 2.3688113689422607 }, { "auxiliary_loss_clip": 0.01061985, "auxiliary_loss_mlp": 0.01054927, "balance_loss_clip": 1.02592325, "balance_loss_mlp": 1.0204612, "epoch": 0.5420712460544116, "flos": 19060796257920.0, "grad_norm": 2.245958185716074, "language_loss": 0.82187343, "learning_rate": 1.8247724698702843e-06, "loss": 0.84304249, "num_input_tokens_seen": 194130365, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4140625, "step": 9016, "time_per_iteration": 2.409641742706299 }, { "auxiliary_loss_clip": 0.01059075, "auxiliary_loss_mlp": 0.01044416, "balance_loss_clip": 1.01900089, "balance_loss_mlp": 1.0196172, "epoch": 0.5421313693070795, "flos": 18186117166080.0, "grad_norm": 1.6622537856888557, "language_loss": 0.82373416, "learning_rate": 1.8243845091806053e-06, "loss": 0.84476912, "num_input_tokens_seen": 194148975, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.39453125, "step": 9017, "time_per_iteration": 2.396249771118164 }, { "auxiliary_loss_clip": 0.01056014, "auxiliary_loss_mlp": 0.01044802, "balance_loss_clip": 1.01998281, "balance_loss_mlp": 1.01834893, "epoch": 0.5421914925597475, "flos": 13005111070080.0, "grad_norm": 1.9589094103525435, "language_loss": 0.79151911, "learning_rate": 1.8239965551502837e-06, "loss": 0.8125273, "num_input_tokens_seen": 194167185, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37695312, "step": 9018, "time_per_iteration": 2.3783037662506104 }, { "auxiliary_loss_clip": 0.01059489, "auxiliary_loss_mlp": 0.01048949, "balance_loss_clip": 1.02057695, "balance_loss_mlp": 1.01819682, "epoch": 0.5422516158124154, "flos": 46756591660800.0, "grad_norm": 1.6955387125928798, "language_loss": 0.67519015, "learning_rate": 1.8236086077940303e-06, "loss": 0.69627452, "num_input_tokens_seen": 194192840, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4140625, "step": 9019, "time_per_iteration": 2.6251652240753174 }, { "auxiliary_loss_clip": 0.01055669, "auxiliary_loss_mlp": 0.01041498, "balance_loss_clip": 1.01894343, "balance_loss_mlp": 1.01807308, "epoch": 0.5423117390650835, "flos": 31757310887040.0, "grad_norm": 1.811198562467502, "language_loss": 0.70897895, "learning_rate": 1.8232206671265555e-06, "loss": 0.72995055, "num_input_tokens_seen": 194213150, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.375, "step": 9020, "time_per_iteration": 2.48136305809021 }, { "auxiliary_loss_clip": 0.01055018, "auxiliary_loss_mlp": 0.01043244, "balance_loss_clip": 1.0191642, "balance_loss_mlp": 1.01767564, "epoch": 0.5423718623177514, "flos": 27200656368000.0, "grad_norm": 1.7940169446069285, "language_loss": 0.80727732, "learning_rate": 1.8228327331625717e-06, "loss": 0.82825994, "num_input_tokens_seen": 194234665, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37304688, "step": 9021, "time_per_iteration": 2.4461467266082764 }, { "auxiliary_loss_clip": 0.01058422, "auxiliary_loss_mlp": 0.01045885, "balance_loss_clip": 1.02068448, "balance_loss_mlp": 1.01955914, "epoch": 0.5424319855704194, "flos": 23545424908800.0, "grad_norm": 2.029745206896073, "language_loss": 0.79678822, "learning_rate": 1.822444805916788e-06, "loss": 0.81783134, "num_input_tokens_seen": 194253790, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38867188, "step": 9022, "time_per_iteration": 2.4300336837768555 }, { "auxiliary_loss_clip": 0.01056649, "auxiliary_loss_mlp": 0.01042405, "balance_loss_clip": 1.01723957, "balance_loss_mlp": 1.0175575, "epoch": 0.5424921088230873, "flos": 26614394951040.0, "grad_norm": 1.672323156821775, "language_loss": 0.83438879, "learning_rate": 1.822056885403915e-06, "loss": 0.85537934, "num_input_tokens_seen": 194274950, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.390625, "step": 9023, "time_per_iteration": 2.40997576713562 }, { "auxiliary_loss_clip": 0.01056093, "auxiliary_loss_mlp": 0.01039319, "balance_loss_clip": 1.01485729, "balance_loss_mlp": 1.01730251, "epoch": 0.5425522320757553, "flos": 23585679573120.0, "grad_norm": 1.6788339390128948, "language_loss": 0.72745001, "learning_rate": 1.8216689716386627e-06, "loss": 0.74840409, "num_input_tokens_seen": 194296155, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.38671875, "step": 9024, "time_per_iteration": 2.433309316635132 }, { "auxiliary_loss_clip": 0.01057034, "auxiliary_loss_mlp": 0.0104201, "balance_loss_clip": 1.01614118, "balance_loss_mlp": 1.01740849, "epoch": 0.5426123553284232, "flos": 30590932452480.0, "grad_norm": 1.5624531973643847, "language_loss": 0.66219771, "learning_rate": 1.8212810646357405e-06, "loss": 0.68318808, "num_input_tokens_seen": 194318025, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.39648438, "step": 9025, "time_per_iteration": 2.4732935428619385 }, { "auxiliary_loss_clip": 0.01059305, "auxiliary_loss_mlp": 0.01044837, "balance_loss_clip": 1.01960087, "balance_loss_mlp": 1.01925826, "epoch": 0.5426724785810912, "flos": 12494296834560.0, "grad_norm": 1.9017165047952214, "language_loss": 0.74200058, "learning_rate": 1.8208931644098591e-06, "loss": 0.76304197, "num_input_tokens_seen": 194336150, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.40039062, "step": 9026, "time_per_iteration": 2.3399176597595215 }, { "auxiliary_loss_clip": 0.01058637, "auxiliary_loss_mlp": 0.01042556, "balance_loss_clip": 1.0134809, "balance_loss_mlp": 1.01818252, "epoch": 0.5427326018337592, "flos": 26063500608000.0, "grad_norm": 1.8259223815879184, "language_loss": 0.79725444, "learning_rate": 1.8205052709757265e-06, "loss": 0.81826639, "num_input_tokens_seen": 194355980, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.40429688, "step": 9027, "time_per_iteration": 2.443751573562622 }, { "auxiliary_loss_clip": 0.01011653, "auxiliary_loss_mlp": 0.01002156, "balance_loss_clip": 0.99973613, "balance_loss_mlp": 1.00441337, "epoch": 0.5427927250864272, "flos": 65981374041600.0, "grad_norm": 0.7442778816826011, "language_loss": 0.56616509, "learning_rate": 1.8201173843480515e-06, "loss": 0.58630323, "num_input_tokens_seen": 194422660, "router_z_loss_clip": 0.02416992, "router_z_loss_mlp": 0.07226562, "step": 9028, "time_per_iteration": 3.05761456489563 }, { "auxiliary_loss_clip": 0.01059637, "auxiliary_loss_mlp": 0.01044432, "balance_loss_clip": 1.01527393, "balance_loss_mlp": 1.01893997, "epoch": 0.5428528483390952, "flos": 19974333559680.0, "grad_norm": 2.143365225552012, "language_loss": 0.79777998, "learning_rate": 1.8197295045415442e-06, "loss": 0.81882066, "num_input_tokens_seen": 194438545, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.40625, "step": 9029, "time_per_iteration": 2.372952938079834 }, { "auxiliary_loss_clip": 0.01055827, "auxiliary_loss_mlp": 0.01035428, "balance_loss_clip": 1.00933278, "balance_loss_mlp": 1.01771736, "epoch": 0.5429129715917631, "flos": 21831329064960.0, "grad_norm": 1.4213641814921112, "language_loss": 0.83825862, "learning_rate": 1.8193416315709112e-06, "loss": 0.85917109, "num_input_tokens_seen": 194458060, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38085938, "step": 9030, "time_per_iteration": 2.4428200721740723 }, { "auxiliary_loss_clip": 0.01056462, "auxiliary_loss_mlp": 0.01041428, "balance_loss_clip": 1.01617944, "balance_loss_mlp": 1.01809311, "epoch": 0.5429730948444311, "flos": 27781436701440.0, "grad_norm": 1.7163038713351189, "language_loss": 0.75879824, "learning_rate": 1.8189537654508623e-06, "loss": 0.77977717, "num_input_tokens_seen": 194477405, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38476562, "step": 9031, "time_per_iteration": 2.419968605041504 }, { "auxiliary_loss_clip": 0.01054955, "auxiliary_loss_mlp": 0.01037526, "balance_loss_clip": 1.01585364, "balance_loss_mlp": 1.01806211, "epoch": 0.543033218097099, "flos": 26759249648640.0, "grad_norm": 2.1053979154442377, "language_loss": 0.86058158, "learning_rate": 1.8185659061961045e-06, "loss": 0.88150644, "num_input_tokens_seen": 194497085, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.36914062, "step": 9032, "time_per_iteration": 2.4241983890533447 }, { "auxiliary_loss_clip": 0.01060876, "auxiliary_loss_mlp": 0.01046955, "balance_loss_clip": 1.0174861, "balance_loss_mlp": 1.01911294, "epoch": 0.5430933413497671, "flos": 22674132218880.0, "grad_norm": 1.8537280352805028, "language_loss": 0.7478106, "learning_rate": 1.8181780538213457e-06, "loss": 0.76888895, "num_input_tokens_seen": 194516785, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.41796875, "step": 9033, "time_per_iteration": 3.7659451961517334 }, { "auxiliary_loss_clip": 0.01055965, "auxiliary_loss_mlp": 0.01042192, "balance_loss_clip": 1.01680052, "balance_loss_mlp": 1.01665318, "epoch": 0.543153464602435, "flos": 24606365437440.0, "grad_norm": 1.6516160195770817, "language_loss": 0.77413392, "learning_rate": 1.8177902083412935e-06, "loss": 0.79511547, "num_input_tokens_seen": 194536475, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.39257812, "step": 9034, "time_per_iteration": 2.4380745887756348 }, { "auxiliary_loss_clip": 0.01056679, "auxiliary_loss_mlp": 0.01046697, "balance_loss_clip": 1.0220331, "balance_loss_mlp": 1.01847863, "epoch": 0.543213587855103, "flos": 19024730956800.0, "grad_norm": 1.7384672178748555, "language_loss": 0.853266, "learning_rate": 1.817402369770655e-06, "loss": 0.87429976, "num_input_tokens_seen": 194554495, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.3828125, "step": 9035, "time_per_iteration": 3.786783456802368 }, { "auxiliary_loss_clip": 0.01014882, "auxiliary_loss_mlp": 0.01003294, "balance_loss_clip": 1.00087392, "balance_loss_mlp": 1.00749063, "epoch": 0.5432737111077709, "flos": 65683251008640.0, "grad_norm": 0.7305709731943748, "language_loss": 0.56051302, "learning_rate": 1.8170145381241364e-06, "loss": 0.58069479, "num_input_tokens_seen": 194617620, "router_z_loss_clip": 0.02416992, "router_z_loss_mlp": 0.07373047, "step": 9036, "time_per_iteration": 2.9957268238067627 }, { "auxiliary_loss_clip": 0.01059588, "auxiliary_loss_mlp": 0.0104411, "balance_loss_clip": 1.01657248, "balance_loss_mlp": 1.01951742, "epoch": 0.5433338343604389, "flos": 22090558976640.0, "grad_norm": 1.6428754783239325, "language_loss": 0.76165104, "learning_rate": 1.8166267134164451e-06, "loss": 0.78268802, "num_input_tokens_seen": 194637690, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.40039062, "step": 9037, "time_per_iteration": 3.8074212074279785 }, { "auxiliary_loss_clip": 0.01058785, "auxiliary_loss_mlp": 0.01052549, "balance_loss_clip": 1.02489209, "balance_loss_mlp": 1.01887214, "epoch": 0.5433939576131068, "flos": 34671371760000.0, "grad_norm": 1.7527397128064792, "language_loss": 0.67730588, "learning_rate": 1.8162388956622875e-06, "loss": 0.69841921, "num_input_tokens_seen": 194659520, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.3984375, "step": 9038, "time_per_iteration": 2.517861843109131 }, { "auxiliary_loss_clip": 0.01055571, "auxiliary_loss_mlp": 0.01039709, "balance_loss_clip": 1.01552153, "balance_loss_mlp": 1.01778579, "epoch": 0.5434540808657748, "flos": 20302307671680.0, "grad_norm": 2.425787863563316, "language_loss": 0.79687029, "learning_rate": 1.8158510848763692e-06, "loss": 0.81782311, "num_input_tokens_seen": 194677645, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37695312, "step": 9039, "time_per_iteration": 2.39618182182312 }, { "auxiliary_loss_clip": 0.0105772, "auxiliary_loss_mlp": 0.01043938, "balance_loss_clip": 1.01917827, "balance_loss_mlp": 1.01872134, "epoch": 0.5435142041184428, "flos": 23111663777280.0, "grad_norm": 1.936345146366669, "language_loss": 0.77743638, "learning_rate": 1.8154632810733962e-06, "loss": 0.79845297, "num_input_tokens_seen": 194697400, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.390625, "step": 9040, "time_per_iteration": 2.3906710147857666 }, { "auxiliary_loss_clip": 0.01013408, "auxiliary_loss_mlp": 0.01002224, "balance_loss_clip": 1.00011444, "balance_loss_mlp": 1.00613451, "epoch": 0.5435743273711108, "flos": 64009479651840.0, "grad_norm": 0.6630439675029843, "language_loss": 0.52525878, "learning_rate": 1.815075484268074e-06, "loss": 0.54541516, "num_input_tokens_seen": 194761205, "router_z_loss_clip": 0.02111816, "router_z_loss_mlp": 0.07275391, "step": 9041, "time_per_iteration": 3.0061399936676025 }, { "auxiliary_loss_clip": 0.01059877, "auxiliary_loss_mlp": 0.0104653, "balance_loss_clip": 1.0195055, "balance_loss_mlp": 1.02010858, "epoch": 0.5436344506237788, "flos": 25117738254720.0, "grad_norm": 1.7887217616826259, "language_loss": 0.7783438, "learning_rate": 1.8146876944751078e-06, "loss": 0.79940784, "num_input_tokens_seen": 194782445, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.3984375, "step": 9042, "time_per_iteration": 2.411620616912842 }, { "auxiliary_loss_clip": 0.01056717, "auxiliary_loss_mlp": 0.01037223, "balance_loss_clip": 1.01270223, "balance_loss_mlp": 1.01762748, "epoch": 0.5436945738764467, "flos": 19571959607040.0, "grad_norm": 1.6724308970173312, "language_loss": 0.68378264, "learning_rate": 1.8142999117092033e-06, "loss": 0.70472205, "num_input_tokens_seen": 194800325, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.390625, "step": 9043, "time_per_iteration": 2.385835647583008 }, { "auxiliary_loss_clip": 0.0105515, "auxiliary_loss_mlp": 0.01039795, "balance_loss_clip": 1.01646543, "balance_loss_mlp": 1.01798427, "epoch": 0.5437546971291147, "flos": 21141445132800.0, "grad_norm": 1.6768725016350807, "language_loss": 0.85946238, "learning_rate": 1.8139121359850644e-06, "loss": 0.88041186, "num_input_tokens_seen": 194818675, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37109375, "step": 9044, "time_per_iteration": 3.8827810287475586 }, { "auxiliary_loss_clip": 0.01059681, "auxiliary_loss_mlp": 0.01047643, "balance_loss_clip": 1.01893795, "balance_loss_mlp": 1.01818919, "epoch": 0.5438148203817826, "flos": 25117528786560.0, "grad_norm": 2.5803185596156504, "language_loss": 0.6302526, "learning_rate": 1.8135243673173956e-06, "loss": 0.65132588, "num_input_tokens_seen": 194836595, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.4140625, "step": 9045, "time_per_iteration": 2.3982245922088623 }, { "auxiliary_loss_clip": 0.01058662, "auxiliary_loss_mlp": 0.01043754, "balance_loss_clip": 1.01838672, "balance_loss_mlp": 1.01891708, "epoch": 0.5438749436344507, "flos": 23001827040000.0, "grad_norm": 1.4712084884167334, "language_loss": 0.71204925, "learning_rate": 1.8131366057209023e-06, "loss": 0.73307335, "num_input_tokens_seen": 194857520, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3984375, "step": 9046, "time_per_iteration": 2.445315361022949 }, { "auxiliary_loss_clip": 0.01056287, "auxiliary_loss_mlp": 0.01040987, "balance_loss_clip": 1.01396275, "balance_loss_mlp": 1.01809633, "epoch": 0.5439350668871186, "flos": 15486109038720.0, "grad_norm": 1.5465169320588217, "language_loss": 0.78237331, "learning_rate": 1.8127488512102868e-06, "loss": 0.80334604, "num_input_tokens_seen": 194876020, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.3828125, "step": 9047, "time_per_iteration": 2.3545403480529785 }, { "auxiliary_loss_clip": 0.01056563, "auxiliary_loss_mlp": 0.01044949, "balance_loss_clip": 1.01779294, "balance_loss_mlp": 1.01801372, "epoch": 0.5439951901397866, "flos": 17237457169920.0, "grad_norm": 1.7731542647759895, "language_loss": 0.74016345, "learning_rate": 1.8123611038002547e-06, "loss": 0.76117849, "num_input_tokens_seen": 194894650, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.38476562, "step": 9048, "time_per_iteration": 2.3727214336395264 }, { "auxiliary_loss_clip": 0.01058725, "auxiliary_loss_mlp": 0.01044026, "balance_loss_clip": 1.01580942, "balance_loss_mlp": 1.01965606, "epoch": 0.5440553133924545, "flos": 18660028227840.0, "grad_norm": 2.1578440972650963, "language_loss": 0.94451684, "learning_rate": 1.8119733635055076e-06, "loss": 0.96554434, "num_input_tokens_seen": 194911935, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.390625, "step": 9049, "time_per_iteration": 2.3596088886260986 }, { "auxiliary_loss_clip": 0.01055475, "auxiliary_loss_mlp": 0.01042662, "balance_loss_clip": 1.01731825, "balance_loss_mlp": 1.01648462, "epoch": 0.5441154366451225, "flos": 27121787873280.0, "grad_norm": 1.7969916840490479, "language_loss": 0.74946129, "learning_rate": 1.8115856303407492e-06, "loss": 0.77044261, "num_input_tokens_seen": 194931620, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.390625, "step": 9050, "time_per_iteration": 2.453023672103882 }, { "auxiliary_loss_clip": 0.01058283, "auxiliary_loss_mlp": 0.01041243, "balance_loss_clip": 1.01430154, "balance_loss_mlp": 1.01848006, "epoch": 0.5441755598977904, "flos": 25992696637440.0, "grad_norm": 1.756650253366105, "language_loss": 0.69444597, "learning_rate": 1.8111979043206832e-06, "loss": 0.71544123, "num_input_tokens_seen": 194952560, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.3984375, "step": 9051, "time_per_iteration": 2.418930768966675 }, { "auxiliary_loss_clip": 0.01056819, "auxiliary_loss_mlp": 0.010422, "balance_loss_clip": 1.016904, "balance_loss_mlp": 1.01815462, "epoch": 0.5442356831504584, "flos": 32378660087040.0, "grad_norm": 1.6585199862090179, "language_loss": 0.6877389, "learning_rate": 1.810810185460011e-06, "loss": 0.70872915, "num_input_tokens_seen": 194973915, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38671875, "step": 9052, "time_per_iteration": 2.485384464263916 }, { "auxiliary_loss_clip": 0.01059764, "auxiliary_loss_mlp": 0.01042077, "balance_loss_clip": 1.01525521, "balance_loss_mlp": 1.01955175, "epoch": 0.5442958064031264, "flos": 24163317884160.0, "grad_norm": 1.7106228395563101, "language_loss": 0.93766999, "learning_rate": 1.810422473773436e-06, "loss": 0.95868838, "num_input_tokens_seen": 194990170, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40234375, "step": 9053, "time_per_iteration": 2.4144763946533203 }, { "auxiliary_loss_clip": 0.01058876, "auxiliary_loss_mlp": 0.0104566, "balance_loss_clip": 1.01796818, "balance_loss_mlp": 1.0188812, "epoch": 0.5443559296557944, "flos": 18763860211200.0, "grad_norm": 1.849519385746669, "language_loss": 0.84817815, "learning_rate": 1.8100347692756595e-06, "loss": 0.86922354, "num_input_tokens_seen": 195006395, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40039062, "step": 9054, "time_per_iteration": 2.359365940093994 }, { "auxiliary_loss_clip": 0.01058843, "auxiliary_loss_mlp": 0.01047316, "balance_loss_clip": 1.02000558, "balance_loss_mlp": 1.01943111, "epoch": 0.5444160529084624, "flos": 22631608316160.0, "grad_norm": 1.9640408192433894, "language_loss": 0.70381314, "learning_rate": 1.8096470719813836e-06, "loss": 0.72487473, "num_input_tokens_seen": 195025080, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.39453125, "step": 9055, "time_per_iteration": 2.4353179931640625 }, { "auxiliary_loss_clip": 0.0101025, "auxiliary_loss_mlp": 0.01011342, "balance_loss_clip": 1.008636, "balance_loss_mlp": 1.00335169, "epoch": 0.5444761761611303, "flos": 69668376704640.0, "grad_norm": 0.7338477128881574, "language_loss": 0.57765657, "learning_rate": 1.80925938190531e-06, "loss": 0.5978725, "num_input_tokens_seen": 195085725, "router_z_loss_clip": 0.02709961, "router_z_loss_mlp": 0.06884766, "step": 9056, "time_per_iteration": 3.027512550354004 }, { "auxiliary_loss_clip": 0.0105844, "auxiliary_loss_mlp": 0.01042283, "balance_loss_clip": 1.01450729, "balance_loss_mlp": 1.01849353, "epoch": 0.5445362994137983, "flos": 14277695460480.0, "grad_norm": 3.297214325210018, "language_loss": 0.71262908, "learning_rate": 1.8088716990621395e-06, "loss": 0.73363632, "num_input_tokens_seen": 195102585, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.3984375, "step": 9057, "time_per_iteration": 2.3674979209899902 }, { "auxiliary_loss_clip": 0.01056728, "auxiliary_loss_mlp": 0.01041818, "balance_loss_clip": 1.017344, "balance_loss_mlp": 1.01950347, "epoch": 0.5445964226664662, "flos": 28984927777920.0, "grad_norm": 1.9426110025911156, "language_loss": 0.76045471, "learning_rate": 1.8084840234665738e-06, "loss": 0.7814402, "num_input_tokens_seen": 195120055, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.37109375, "step": 9058, "time_per_iteration": 2.467259407043457 }, { "auxiliary_loss_clip": 0.01011074, "auxiliary_loss_mlp": 0.01008864, "balance_loss_clip": 1.00656343, "balance_loss_mlp": 1.00409472, "epoch": 0.5446565459191343, "flos": 68616548040960.0, "grad_norm": 0.808543220785197, "language_loss": 0.62751162, "learning_rate": 1.808096355133312e-06, "loss": 0.64771092, "num_input_tokens_seen": 195181045, "router_z_loss_clip": 0.02294922, "router_z_loss_mlp": 0.06982422, "step": 9059, "time_per_iteration": 3.1290409564971924 }, { "auxiliary_loss_clip": 0.01055083, "auxiliary_loss_mlp": 0.01045065, "balance_loss_clip": 1.02055573, "balance_loss_mlp": 1.01742005, "epoch": 0.5447166691718022, "flos": 16215549408000.0, "grad_norm": 1.7115498003963618, "language_loss": 0.8074283, "learning_rate": 1.8077086940770572e-06, "loss": 0.82842982, "num_input_tokens_seen": 195198840, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.375, "step": 9060, "time_per_iteration": 2.390475034713745 }, { "auxiliary_loss_clip": 0.01057722, "auxiliary_loss_mlp": 0.01044311, "balance_loss_clip": 1.01900315, "balance_loss_mlp": 1.01949477, "epoch": 0.5447767924244702, "flos": 25847841939840.0, "grad_norm": 2.1661163492544406, "language_loss": 0.80990827, "learning_rate": 1.8073210403125072e-06, "loss": 0.83092862, "num_input_tokens_seen": 195218720, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3828125, "step": 9061, "time_per_iteration": 2.4083383083343506 }, { "auxiliary_loss_clip": 0.01057697, "auxiliary_loss_mlp": 0.0104155, "balance_loss_clip": 1.01617002, "balance_loss_mlp": 1.01922059, "epoch": 0.5448369156771381, "flos": 19676838931200.0, "grad_norm": 1.7657601749236296, "language_loss": 0.87870806, "learning_rate": 1.8069333938543627e-06, "loss": 0.89970052, "num_input_tokens_seen": 195235770, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38476562, "step": 9062, "time_per_iteration": 2.3853800296783447 }, { "auxiliary_loss_clip": 0.01061386, "auxiliary_loss_mlp": 0.01049293, "balance_loss_clip": 1.02161312, "balance_loss_mlp": 1.01974452, "epoch": 0.5448970389298061, "flos": 19280783934720.0, "grad_norm": 2.0243415416616455, "language_loss": 0.83890337, "learning_rate": 1.8065457547173233e-06, "loss": 0.86001015, "num_input_tokens_seen": 195254870, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41601562, "step": 9063, "time_per_iteration": 2.3655283451080322 }, { "auxiliary_loss_clip": 0.0105773, "auxiliary_loss_mlp": 0.01048092, "balance_loss_clip": 1.02241397, "balance_loss_mlp": 1.01749837, "epoch": 0.544957162182474, "flos": 20990760238080.0, "grad_norm": 1.6991257408756613, "language_loss": 0.64819884, "learning_rate": 1.8061581229160878e-06, "loss": 0.66925704, "num_input_tokens_seen": 195273390, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.40234375, "step": 9064, "time_per_iteration": 2.406205177307129 }, { "auxiliary_loss_clip": 0.01058422, "auxiliary_loss_mlp": 0.01050351, "balance_loss_clip": 1.02224183, "balance_loss_mlp": 1.01843405, "epoch": 0.545017285435142, "flos": 25373407207680.0, "grad_norm": 1.6492213959280924, "language_loss": 0.80796456, "learning_rate": 1.8057704984653566e-06, "loss": 0.82905233, "num_input_tokens_seen": 195295635, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40039062, "step": 9065, "time_per_iteration": 2.414774179458618 }, { "auxiliary_loss_clip": 0.01055712, "auxiliary_loss_mlp": 0.01044683, "balance_loss_clip": 1.02055502, "balance_loss_mlp": 1.01817989, "epoch": 0.54507740868781, "flos": 19133764732800.0, "grad_norm": 1.8932295716576684, "language_loss": 0.79279208, "learning_rate": 1.805382881379827e-06, "loss": 0.81379604, "num_input_tokens_seen": 195312545, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.37695312, "step": 9066, "time_per_iteration": 2.3795690536499023 }, { "auxiliary_loss_clip": 0.01059763, "auxiliary_loss_mlp": 0.01051599, "balance_loss_clip": 1.02415705, "balance_loss_mlp": 1.01855993, "epoch": 0.545137531940478, "flos": 26248609969920.0, "grad_norm": 1.6187287960385142, "language_loss": 0.76905054, "learning_rate": 1.8049952716741975e-06, "loss": 0.79016417, "num_input_tokens_seen": 195332955, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41210938, "step": 9067, "time_per_iteration": 2.415585517883301 }, { "auxiliary_loss_clip": 0.01062184, "auxiliary_loss_mlp": 0.01054935, "balance_loss_clip": 1.02380896, "balance_loss_mlp": 1.01986396, "epoch": 0.545197655193146, "flos": 37554254922240.0, "grad_norm": 2.009421348764547, "language_loss": 0.64262295, "learning_rate": 1.8046076693631682e-06, "loss": 0.6637941, "num_input_tokens_seen": 195355930, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.42382812, "step": 9068, "time_per_iteration": 2.528505802154541 }, { "auxiliary_loss_clip": 0.01057167, "auxiliary_loss_mlp": 0.01057328, "balance_loss_clip": 1.03130496, "balance_loss_mlp": 1.0185945, "epoch": 0.5452577784458139, "flos": 26030053658880.0, "grad_norm": 1.5305675024301644, "language_loss": 0.73225141, "learning_rate": 1.8042200744614343e-06, "loss": 0.75339639, "num_input_tokens_seen": 195376445, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38671875, "step": 9069, "time_per_iteration": 2.435340404510498 }, { "auxiliary_loss_clip": 0.01055682, "auxiliary_loss_mlp": 0.01043664, "balance_loss_clip": 1.0192616, "balance_loss_mlp": 1.0190897, "epoch": 0.5453179016984819, "flos": 17638085554560.0, "grad_norm": 1.6532645044201513, "language_loss": 0.75292534, "learning_rate": 1.8038324869836957e-06, "loss": 0.77391875, "num_input_tokens_seen": 195393725, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36523438, "step": 9070, "time_per_iteration": 2.3401498794555664 }, { "auxiliary_loss_clip": 0.01057634, "auxiliary_loss_mlp": 0.0105093, "balance_loss_clip": 1.02637339, "balance_loss_mlp": 1.01812756, "epoch": 0.5453780249511498, "flos": 23215705228800.0, "grad_norm": 2.50302580000591, "language_loss": 0.62152565, "learning_rate": 1.8034449069446489e-06, "loss": 0.64261127, "num_input_tokens_seen": 195411380, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.39453125, "step": 9071, "time_per_iteration": 2.4144999980926514 }, { "auxiliary_loss_clip": 0.01009875, "auxiliary_loss_mlp": 0.01004611, "balance_loss_clip": 1.00244141, "balance_loss_mlp": 1.00274849, "epoch": 0.5454381482038179, "flos": 68692728360960.0, "grad_norm": 0.7003940584138308, "language_loss": 0.57189238, "learning_rate": 1.80305733435899e-06, "loss": 0.5920372, "num_input_tokens_seen": 195482015, "router_z_loss_clip": 0.02172852, "router_z_loss_mlp": 0.07128906, "step": 9072, "time_per_iteration": 3.1287894248962402 }, { "auxiliary_loss_clip": 0.01054369, "auxiliary_loss_mlp": 0.01042876, "balance_loss_clip": 1.01827073, "balance_loss_mlp": 1.01694596, "epoch": 0.5454982714564858, "flos": 13259802504960.0, "grad_norm": 1.7849556806745304, "language_loss": 0.71198654, "learning_rate": 1.8026697692414174e-06, "loss": 0.73295903, "num_input_tokens_seen": 195500440, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.375, "step": 9073, "time_per_iteration": 3.548638343811035 }, { "auxiliary_loss_clip": 0.01055031, "auxiliary_loss_mlp": 0.01045571, "balance_loss_clip": 1.02060819, "balance_loss_mlp": 1.01800632, "epoch": 0.5455583947091538, "flos": 21834785289600.0, "grad_norm": 1.769839850691107, "language_loss": 0.72245842, "learning_rate": 1.802282211606627e-06, "loss": 0.74346447, "num_input_tokens_seen": 195520860, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37109375, "step": 9074, "time_per_iteration": 3.9618887901306152 }, { "auxiliary_loss_clip": 0.01054941, "auxiliary_loss_mlp": 0.01046497, "balance_loss_clip": 1.0215348, "balance_loss_mlp": 1.01682413, "epoch": 0.5456185179618217, "flos": 17816596669440.0, "grad_norm": 2.0372779810032076, "language_loss": 0.70494938, "learning_rate": 1.8018946614693148e-06, "loss": 0.72596377, "num_input_tokens_seen": 195538615, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38085938, "step": 9075, "time_per_iteration": 2.4025380611419678 }, { "auxiliary_loss_clip": 0.01055341, "auxiliary_loss_mlp": 0.01043966, "balance_loss_clip": 1.02083969, "balance_loss_mlp": 1.0186727, "epoch": 0.5456786412144897, "flos": 21068337012480.0, "grad_norm": 1.630491771771455, "language_loss": 0.82153249, "learning_rate": 1.8015071188441768e-06, "loss": 0.84252554, "num_input_tokens_seen": 195557460, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.3671875, "step": 9076, "time_per_iteration": 2.3757872581481934 }, { "auxiliary_loss_clip": 0.0105496, "auxiliary_loss_mlp": 0.01040224, "balance_loss_clip": 1.01563132, "balance_loss_mlp": 1.01661229, "epoch": 0.5457387644671576, "flos": 23293840584960.0, "grad_norm": 1.7440947756358194, "language_loss": 0.815027, "learning_rate": 1.8011195837459089e-06, "loss": 0.83597887, "num_input_tokens_seen": 195577985, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3828125, "step": 9077, "time_per_iteration": 3.861351251602173 }, { "auxiliary_loss_clip": 0.01056762, "auxiliary_loss_mlp": 0.01040948, "balance_loss_clip": 1.01645088, "balance_loss_mlp": 1.0181489, "epoch": 0.5457988877198257, "flos": 21615949687680.0, "grad_norm": 1.9450405511709188, "language_loss": 0.69382048, "learning_rate": 1.8007320561892064e-06, "loss": 0.71479756, "num_input_tokens_seen": 195597620, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.38671875, "step": 9078, "time_per_iteration": 2.3997509479522705 }, { "auxiliary_loss_clip": 0.01057581, "auxiliary_loss_mlp": 0.01047052, "balance_loss_clip": 1.02051544, "balance_loss_mlp": 1.01772714, "epoch": 0.5458590109724936, "flos": 23761537424640.0, "grad_norm": 1.7305129958128598, "language_loss": 0.81537807, "learning_rate": 1.800344536188764e-06, "loss": 0.83642447, "num_input_tokens_seen": 195615910, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.3984375, "step": 9079, "time_per_iteration": 2.4489526748657227 }, { "auxiliary_loss_clip": 0.01058803, "auxiliary_loss_mlp": 0.01045182, "balance_loss_clip": 1.01748967, "balance_loss_mlp": 1.01814556, "epoch": 0.5459191342251616, "flos": 24423176200320.0, "grad_norm": 1.8586634212553703, "language_loss": 0.77055687, "learning_rate": 1.799957023759277e-06, "loss": 0.79159677, "num_input_tokens_seen": 195635620, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40625, "step": 9080, "time_per_iteration": 2.4146924018859863 }, { "auxiliary_loss_clip": 0.01058776, "auxiliary_loss_mlp": 0.01048129, "balance_loss_clip": 1.02032924, "balance_loss_mlp": 1.01926124, "epoch": 0.5459792574778296, "flos": 23621884519680.0, "grad_norm": 2.1254029987814937, "language_loss": 0.85092694, "learning_rate": 1.7995695189154392e-06, "loss": 0.87199599, "num_input_tokens_seen": 195652495, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.39648438, "step": 9081, "time_per_iteration": 2.453568935394287 }, { "auxiliary_loss_clip": 0.01059474, "auxiliary_loss_mlp": 0.01046572, "balance_loss_clip": 1.01924896, "balance_loss_mlp": 1.01894546, "epoch": 0.5460393807304975, "flos": 19134532782720.0, "grad_norm": 1.504669813873107, "language_loss": 0.71161693, "learning_rate": 1.7991820216719461e-06, "loss": 0.73267734, "num_input_tokens_seen": 195671965, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40625, "step": 9082, "time_per_iteration": 2.3735578060150146 }, { "auxiliary_loss_clip": 0.01054918, "auxiliary_loss_mlp": 0.01035352, "balance_loss_clip": 1.01037776, "balance_loss_mlp": 1.01720166, "epoch": 0.5460995039831655, "flos": 35917072536960.0, "grad_norm": 1.5452739085026146, "language_loss": 0.67266226, "learning_rate": 1.7987945320434906e-06, "loss": 0.69356495, "num_input_tokens_seen": 195694725, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37695312, "step": 9083, "time_per_iteration": 2.564061403274536 }, { "auxiliary_loss_clip": 0.0105598, "auxiliary_loss_mlp": 0.01040735, "balance_loss_clip": 1.01597559, "balance_loss_mlp": 1.01849961, "epoch": 0.5461596272358334, "flos": 26758062662400.0, "grad_norm": 2.370125658442472, "language_loss": 0.80986738, "learning_rate": 1.798407050044766e-06, "loss": 0.83083451, "num_input_tokens_seen": 195714090, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.375, "step": 9084, "time_per_iteration": 3.876662492752075 }, { "auxiliary_loss_clip": 0.01059754, "auxiliary_loss_mlp": 0.01043495, "balance_loss_clip": 1.01714993, "balance_loss_mlp": 1.01994848, "epoch": 0.5462197504885015, "flos": 20885531800320.0, "grad_norm": 1.7604703283909713, "language_loss": 0.76402938, "learning_rate": 1.7980195756904675e-06, "loss": 0.78506184, "num_input_tokens_seen": 195733585, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.3984375, "step": 9085, "time_per_iteration": 2.4194583892822266 }, { "auxiliary_loss_clip": 0.01058594, "auxiliary_loss_mlp": 0.01039572, "balance_loss_clip": 1.01474071, "balance_loss_mlp": 1.02012062, "epoch": 0.5462798737411694, "flos": 25803991405440.0, "grad_norm": 1.840647445605235, "language_loss": 0.75356686, "learning_rate": 1.7976321089952857e-06, "loss": 0.77454853, "num_input_tokens_seen": 195752820, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.38476562, "step": 9086, "time_per_iteration": 2.5095789432525635 }, { "auxiliary_loss_clip": 0.01056973, "auxiliary_loss_mlp": 0.01040662, "balance_loss_clip": 1.01680779, "balance_loss_mlp": 1.01875019, "epoch": 0.5463399969938374, "flos": 25773861035520.0, "grad_norm": 1.7431648957505663, "language_loss": 0.78161323, "learning_rate": 1.7972446499739155e-06, "loss": 0.80258954, "num_input_tokens_seen": 195773740, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3828125, "step": 9087, "time_per_iteration": 2.4573400020599365 }, { "auxiliary_loss_clip": 0.01059844, "auxiliary_loss_mlp": 0.01042974, "balance_loss_clip": 1.01575887, "balance_loss_mlp": 1.02024388, "epoch": 0.5464001202465053, "flos": 18842309769600.0, "grad_norm": 1.8553745117706277, "language_loss": 0.79202592, "learning_rate": 1.7968571986410484e-06, "loss": 0.8130542, "num_input_tokens_seen": 195792125, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.39453125, "step": 9088, "time_per_iteration": 2.361616373062134 }, { "auxiliary_loss_clip": 0.01014764, "auxiliary_loss_mlp": 0.01004959, "balance_loss_clip": 1.00274217, "balance_loss_mlp": 1.00765014, "epoch": 0.5464602434991733, "flos": 69046084897920.0, "grad_norm": 0.8686643638952096, "language_loss": 0.57721162, "learning_rate": 1.7964697550113758e-06, "loss": 0.59740883, "num_input_tokens_seen": 195854935, "router_z_loss_clip": 0.0222168, "router_z_loss_mlp": 0.07128906, "step": 9089, "time_per_iteration": 3.085942268371582 }, { "auxiliary_loss_clip": 0.01060177, "auxiliary_loss_mlp": 0.01036191, "balance_loss_clip": 1.01139545, "balance_loss_mlp": 1.02088404, "epoch": 0.5465203667518412, "flos": 27558900495360.0, "grad_norm": 1.7527443894645216, "language_loss": 0.78339219, "learning_rate": 1.7960823190995918e-06, "loss": 0.80435592, "num_input_tokens_seen": 195874715, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.39257812, "step": 9090, "time_per_iteration": 2.4447054862976074 }, { "auxiliary_loss_clip": 0.01062489, "auxiliary_loss_mlp": 0.01045298, "balance_loss_clip": 1.01640189, "balance_loss_mlp": 1.02019429, "epoch": 0.5465804900045093, "flos": 21209281637760.0, "grad_norm": 1.8353209137931406, "language_loss": 0.74716306, "learning_rate": 1.7956948909203855e-06, "loss": 0.76824093, "num_input_tokens_seen": 195892610, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.421875, "step": 9091, "time_per_iteration": 2.4215595722198486 }, { "auxiliary_loss_clip": 0.0105973, "auxiliary_loss_mlp": 0.01044555, "balance_loss_clip": 1.01820946, "balance_loss_mlp": 1.01971078, "epoch": 0.5466406132571772, "flos": 22487940604800.0, "grad_norm": 1.7411741473210525, "language_loss": 0.7912339, "learning_rate": 1.7953074704884498e-06, "loss": 0.81227678, "num_input_tokens_seen": 195911085, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40039062, "step": 9092, "time_per_iteration": 2.4153802394866943 }, { "auxiliary_loss_clip": 0.0106211, "auxiliary_loss_mlp": 0.01047296, "balance_loss_clip": 1.01934135, "balance_loss_mlp": 1.02169061, "epoch": 0.5467007365098452, "flos": 17674883994240.0, "grad_norm": 2.5320627347006983, "language_loss": 0.7656368, "learning_rate": 1.794920057818476e-06, "loss": 0.78673083, "num_input_tokens_seen": 195929845, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40429688, "step": 9093, "time_per_iteration": 2.3647782802581787 }, { "auxiliary_loss_clip": 0.01059182, "auxiliary_loss_mlp": 0.01046233, "balance_loss_clip": 1.01819468, "balance_loss_mlp": 1.01908255, "epoch": 0.5467608597625132, "flos": 15698136925440.0, "grad_norm": 2.1316505229713507, "language_loss": 0.7074858, "learning_rate": 1.7945326529251533e-06, "loss": 0.72853994, "num_input_tokens_seen": 195946350, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40039062, "step": 9094, "time_per_iteration": 2.4637675285339355 }, { "auxiliary_loss_clip": 0.01059276, "auxiliary_loss_mlp": 0.01048971, "balance_loss_clip": 1.02317381, "balance_loss_mlp": 1.02088583, "epoch": 0.5468209830151811, "flos": 24311768451840.0, "grad_norm": 3.2070443733822, "language_loss": 0.69727045, "learning_rate": 1.7941452558231731e-06, "loss": 0.71835291, "num_input_tokens_seen": 195959840, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38476562, "step": 9095, "time_per_iteration": 2.4257540702819824 }, { "auxiliary_loss_clip": 0.01058623, "auxiliary_loss_mlp": 0.0104012, "balance_loss_clip": 1.0158608, "balance_loss_mlp": 1.02027047, "epoch": 0.5468811062678491, "flos": 29165114638080.0, "grad_norm": 1.4886080606432184, "language_loss": 0.67555988, "learning_rate": 1.7937578665272256e-06, "loss": 0.69654733, "num_input_tokens_seen": 195981125, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3828125, "step": 9096, "time_per_iteration": 2.4329347610473633 }, { "auxiliary_loss_clip": 0.01011873, "auxiliary_loss_mlp": 0.01002845, "balance_loss_clip": 1.00001979, "balance_loss_mlp": 1.00490308, "epoch": 0.546941229520517, "flos": 67864031262720.0, "grad_norm": 0.7469479413703247, "language_loss": 0.57640612, "learning_rate": 1.7933704850520007e-06, "loss": 0.59655321, "num_input_tokens_seen": 196038880, "router_z_loss_clip": 0.02819824, "router_z_loss_mlp": 0.06982422, "step": 9097, "time_per_iteration": 3.1705825328826904 }, { "auxiliary_loss_clip": 0.01010673, "auxiliary_loss_mlp": 0.0100359, "balance_loss_clip": 1.0009675, "balance_loss_mlp": 1.00363612, "epoch": 0.5470013527731851, "flos": 58267594563840.0, "grad_norm": 0.9149350828114573, "language_loss": 0.64953506, "learning_rate": 1.7929831114121868e-06, "loss": 0.66967767, "num_input_tokens_seen": 196099215, "router_z_loss_clip": 0.02624512, "router_z_loss_mlp": 0.0703125, "step": 9098, "time_per_iteration": 2.979968547821045 }, { "auxiliary_loss_clip": 0.01057928, "auxiliary_loss_mlp": 0.01046652, "balance_loss_clip": 1.02067637, "balance_loss_mlp": 1.01852584, "epoch": 0.547061476025853, "flos": 22964819132160.0, "grad_norm": 1.5894528429345922, "language_loss": 0.74389493, "learning_rate": 1.7925957456224753e-06, "loss": 0.76494068, "num_input_tokens_seen": 196120370, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39453125, "step": 9099, "time_per_iteration": 2.4378929138183594 }, { "auxiliary_loss_clip": 0.01055169, "auxiliary_loss_mlp": 0.01037864, "balance_loss_clip": 1.01429605, "balance_loss_mlp": 1.01673758, "epoch": 0.547121599278521, "flos": 29967034723200.0, "grad_norm": 1.9386559693545249, "language_loss": 0.74160415, "learning_rate": 1.7922083876975537e-06, "loss": 0.7625345, "num_input_tokens_seen": 196139075, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.38476562, "step": 9100, "time_per_iteration": 2.513603687286377 }, { "auxiliary_loss_clip": 0.01055692, "auxiliary_loss_mlp": 0.01039856, "balance_loss_clip": 1.01351047, "balance_loss_mlp": 1.01794147, "epoch": 0.5471817225311889, "flos": 36534057816960.0, "grad_norm": 1.675797508750771, "language_loss": 0.68811095, "learning_rate": 1.7918210376521102e-06, "loss": 0.70906639, "num_input_tokens_seen": 196159990, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.37695312, "step": 9101, "time_per_iteration": 2.5238590240478516 }, { "auxiliary_loss_clip": 0.0105625, "auxiliary_loss_mlp": 0.0103972, "balance_loss_clip": 1.01494861, "balance_loss_mlp": 1.01781082, "epoch": 0.5472418457838569, "flos": 25774070503680.0, "grad_norm": 1.8344269243276754, "language_loss": 0.79140317, "learning_rate": 1.7914336955008343e-06, "loss": 0.81236291, "num_input_tokens_seen": 196180570, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38476562, "step": 9102, "time_per_iteration": 2.442070245742798 }, { "auxiliary_loss_clip": 0.01056554, "auxiliary_loss_mlp": 0.01049568, "balance_loss_clip": 1.02520192, "balance_loss_mlp": 1.01861191, "epoch": 0.5473019690365248, "flos": 27886560405120.0, "grad_norm": 2.4063789105401265, "language_loss": 0.73228234, "learning_rate": 1.791046361258413e-06, "loss": 0.75334352, "num_input_tokens_seen": 196200300, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37890625, "step": 9103, "time_per_iteration": 2.4447858333587646 }, { "auxiliary_loss_clip": 0.01056168, "auxiliary_loss_mlp": 0.0104084, "balance_loss_clip": 1.01649714, "balance_loss_mlp": 1.01859212, "epoch": 0.5473620922891929, "flos": 57629313354240.0, "grad_norm": 1.3878463714227298, "language_loss": 0.65671688, "learning_rate": 1.7906590349395356e-06, "loss": 0.67768699, "num_input_tokens_seen": 196228525, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.375, "step": 9104, "time_per_iteration": 2.7730796337127686 }, { "auxiliary_loss_clip": 0.01061809, "auxiliary_loss_mlp": 0.01041501, "balance_loss_clip": 1.01327181, "balance_loss_mlp": 1.02094364, "epoch": 0.5474222155418608, "flos": 19353054182400.0, "grad_norm": 1.8794357176388636, "language_loss": 0.83077937, "learning_rate": 1.790271716558888e-06, "loss": 0.85181242, "num_input_tokens_seen": 196247690, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40820312, "step": 9105, "time_per_iteration": 2.390254259109497 }, { "auxiliary_loss_clip": 0.01056568, "auxiliary_loss_mlp": 0.01039438, "balance_loss_clip": 1.01629972, "balance_loss_mlp": 1.01821017, "epoch": 0.5474823387945288, "flos": 25119239443200.0, "grad_norm": 3.204498211599707, "language_loss": 0.81005132, "learning_rate": 1.7898844061311575e-06, "loss": 0.83101141, "num_input_tokens_seen": 196268555, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.3828125, "step": 9106, "time_per_iteration": 2.4481019973754883 }, { "auxiliary_loss_clip": 0.01056917, "auxiliary_loss_mlp": 0.01044581, "balance_loss_clip": 1.02140689, "balance_loss_mlp": 1.01932883, "epoch": 0.5475424620471967, "flos": 18003207219840.0, "grad_norm": 1.705114293923401, "language_loss": 0.70554245, "learning_rate": 1.7894971036710322e-06, "loss": 0.72655737, "num_input_tokens_seen": 196285585, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.375, "step": 9107, "time_per_iteration": 2.386042833328247 }, { "auxiliary_loss_clip": 0.01057644, "auxiliary_loss_mlp": 0.01037954, "balance_loss_clip": 1.01370645, "balance_loss_mlp": 1.01868069, "epoch": 0.5476025852998647, "flos": 22308242503680.0, "grad_norm": 1.5664655245451466, "language_loss": 0.64725584, "learning_rate": 1.789109809193197e-06, "loss": 0.66821182, "num_input_tokens_seen": 196305085, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.390625, "step": 9108, "time_per_iteration": 2.4163575172424316 }, { "auxiliary_loss_clip": 0.01057324, "auxiliary_loss_mlp": 0.01039284, "balance_loss_clip": 1.01492906, "balance_loss_mlp": 1.01880455, "epoch": 0.5476627085525327, "flos": 20119467548160.0, "grad_norm": 1.680457114895876, "language_loss": 0.75850153, "learning_rate": 1.7887225227123396e-06, "loss": 0.77946758, "num_input_tokens_seen": 196323945, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.38476562, "step": 9109, "time_per_iteration": 2.405853509902954 }, { "auxiliary_loss_clip": 0.01058008, "auxiliary_loss_mlp": 0.0104009, "balance_loss_clip": 1.01543808, "balance_loss_mlp": 1.02104759, "epoch": 0.5477228318052006, "flos": 17711612611200.0, "grad_norm": 1.8772983344192442, "language_loss": 0.78879553, "learning_rate": 1.7883352442431457e-06, "loss": 0.80977654, "num_input_tokens_seen": 196342200, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36914062, "step": 9110, "time_per_iteration": 2.396664619445801 }, { "auxiliary_loss_clip": 0.01058381, "auxiliary_loss_mlp": 0.01035839, "balance_loss_clip": 1.01290345, "balance_loss_mlp": 1.02123523, "epoch": 0.5477829550578687, "flos": 25847702294400.0, "grad_norm": 1.4757584353713669, "language_loss": 0.72065884, "learning_rate": 1.7879479738002993e-06, "loss": 0.74160099, "num_input_tokens_seen": 196362940, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.37109375, "step": 9111, "time_per_iteration": 2.4412314891815186 }, { "auxiliary_loss_clip": 0.01059541, "auxiliary_loss_mlp": 0.01041657, "balance_loss_clip": 1.01580083, "balance_loss_mlp": 1.02045894, "epoch": 0.5478430783105366, "flos": 23038555656960.0, "grad_norm": 1.5422860223946497, "language_loss": 0.71650338, "learning_rate": 1.7875607113984876e-06, "loss": 0.73751539, "num_input_tokens_seen": 196383070, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.390625, "step": 9112, "time_per_iteration": 2.4126458168029785 }, { "auxiliary_loss_clip": 0.01061836, "auxiliary_loss_mlp": 0.01042224, "balance_loss_clip": 1.01608098, "balance_loss_mlp": 1.02076626, "epoch": 0.5479032015632046, "flos": 16070275774080.0, "grad_norm": 1.8573697805886338, "language_loss": 0.89720356, "learning_rate": 1.7871734570523953e-06, "loss": 0.91824412, "num_input_tokens_seen": 196398485, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.41210938, "step": 9113, "time_per_iteration": 3.609755039215088 }, { "auxiliary_loss_clip": 0.01060921, "auxiliary_loss_mlp": 0.01043192, "balance_loss_clip": 1.01696599, "balance_loss_mlp": 1.02116871, "epoch": 0.5479633248158725, "flos": 24277588364160.0, "grad_norm": 1.5948466480736014, "language_loss": 0.74089706, "learning_rate": 1.7867862107767067e-06, "loss": 0.76193821, "num_input_tokens_seen": 196417725, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3984375, "step": 9114, "time_per_iteration": 3.9397969245910645 }, { "auxiliary_loss_clip": 0.01057103, "auxiliary_loss_mlp": 0.010373, "balance_loss_clip": 1.01348174, "balance_loss_mlp": 1.01996183, "epoch": 0.5480234480685405, "flos": 26357050252800.0, "grad_norm": 1.4588372593868142, "language_loss": 0.72803497, "learning_rate": 1.7863989725861066e-06, "loss": 0.74897897, "num_input_tokens_seen": 196437840, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 9115, "time_per_iteration": 2.4961321353912354 }, { "auxiliary_loss_clip": 0.01058967, "auxiliary_loss_mlp": 0.01044229, "balance_loss_clip": 1.01743054, "balance_loss_mlp": 1.01976871, "epoch": 0.5480835713212084, "flos": 22053970005120.0, "grad_norm": 1.757304810601309, "language_loss": 0.73384494, "learning_rate": 1.7860117424952781e-06, "loss": 0.75487691, "num_input_tokens_seen": 196457300, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.390625, "step": 9116, "time_per_iteration": 3.723086357116699 }, { "auxiliary_loss_clip": 0.01059432, "auxiliary_loss_mlp": 0.01046944, "balance_loss_clip": 1.02065873, "balance_loss_mlp": 1.02144825, "epoch": 0.5481436945738765, "flos": 25299880151040.0, "grad_norm": 1.8292617755087848, "language_loss": 0.78270292, "learning_rate": 1.7856245205189063e-06, "loss": 0.80376673, "num_input_tokens_seen": 196476720, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37890625, "step": 9117, "time_per_iteration": 2.4519572257995605 }, { "auxiliary_loss_clip": 0.01056168, "auxiliary_loss_mlp": 0.01038093, "balance_loss_clip": 1.01497889, "balance_loss_mlp": 1.01943874, "epoch": 0.5482038178265444, "flos": 33579532811520.0, "grad_norm": 1.5642591156223873, "language_loss": 0.63661695, "learning_rate": 1.785237306671674e-06, "loss": 0.65755963, "num_input_tokens_seen": 196496765, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.3671875, "step": 9118, "time_per_iteration": 2.479496717453003 }, { "auxiliary_loss_clip": 0.01059246, "auxiliary_loss_mlp": 0.0104145, "balance_loss_clip": 1.01346004, "balance_loss_mlp": 1.0200057, "epoch": 0.5482639410792124, "flos": 19025184804480.0, "grad_norm": 2.437196049113347, "language_loss": 0.79900599, "learning_rate": 1.7848501009682646e-06, "loss": 0.82001299, "num_input_tokens_seen": 196516220, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.39257812, "step": 9119, "time_per_iteration": 2.4355697631835938 }, { "auxiliary_loss_clip": 0.01057214, "auxiliary_loss_mlp": 0.0103578, "balance_loss_clip": 1.01316595, "balance_loss_mlp": 1.02003741, "epoch": 0.5483240643318803, "flos": 25409158306560.0, "grad_norm": 1.5927542802251113, "language_loss": 0.82988691, "learning_rate": 1.7844629034233604e-06, "loss": 0.85081685, "num_input_tokens_seen": 196533860, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.37109375, "step": 9120, "time_per_iteration": 2.4370572566986084 }, { "auxiliary_loss_clip": 0.0106128, "auxiliary_loss_mlp": 0.01042177, "balance_loss_clip": 1.01632047, "balance_loss_mlp": 1.0203383, "epoch": 0.5483841875845483, "flos": 21465928108800.0, "grad_norm": 1.6575122046839157, "language_loss": 0.8156057, "learning_rate": 1.7840757140516455e-06, "loss": 0.83664024, "num_input_tokens_seen": 196551305, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40820312, "step": 9121, "time_per_iteration": 2.4428751468658447 }, { "auxiliary_loss_clip": 0.01058995, "auxiliary_loss_mlp": 0.01046764, "balance_loss_clip": 1.01826072, "balance_loss_mlp": 1.01870871, "epoch": 0.5484443108372163, "flos": 24746297633280.0, "grad_norm": 3.0112534090175087, "language_loss": 0.62547362, "learning_rate": 1.7836885328678008e-06, "loss": 0.64653122, "num_input_tokens_seen": 196569420, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40234375, "step": 9122, "time_per_iteration": 2.3998727798461914 }, { "auxiliary_loss_clip": 0.01057774, "auxiliary_loss_mlp": 0.01041566, "balance_loss_clip": 1.01858246, "balance_loss_mlp": 1.01998746, "epoch": 0.5485044340898843, "flos": 25374175257600.0, "grad_norm": 1.6011034039766179, "language_loss": 0.72441429, "learning_rate": 1.7833013598865084e-06, "loss": 0.7454077, "num_input_tokens_seen": 196590610, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.37695312, "step": 9123, "time_per_iteration": 3.8539044857025146 }, { "auxiliary_loss_clip": 0.01057338, "auxiliary_loss_mlp": 0.01040612, "balance_loss_clip": 1.01630533, "balance_loss_mlp": 1.01843381, "epoch": 0.5485645573425523, "flos": 12640338518400.0, "grad_norm": 1.83592829153088, "language_loss": 0.85070956, "learning_rate": 1.7829141951224505e-06, "loss": 0.87168908, "num_input_tokens_seen": 196606495, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.38867188, "step": 9124, "time_per_iteration": 2.3644046783447266 }, { "auxiliary_loss_clip": 0.0105814, "auxiliary_loss_mlp": 0.01041352, "balance_loss_clip": 1.01629412, "balance_loss_mlp": 1.01996672, "epoch": 0.5486246805952202, "flos": 28328176592640.0, "grad_norm": 2.0022509465836213, "language_loss": 0.81611747, "learning_rate": 1.7825270385903075e-06, "loss": 0.83711237, "num_input_tokens_seen": 196626365, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3828125, "step": 9125, "time_per_iteration": 2.4593863487243652 }, { "auxiliary_loss_clip": 0.01058825, "auxiliary_loss_mlp": 0.01044182, "balance_loss_clip": 1.01813483, "balance_loss_mlp": 1.01937294, "epoch": 0.5486848038478882, "flos": 16799087738880.0, "grad_norm": 2.677761396135111, "language_loss": 0.75152779, "learning_rate": 1.7821398903047617e-06, "loss": 0.77255785, "num_input_tokens_seen": 196644465, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39453125, "step": 9126, "time_per_iteration": 2.3763856887817383 }, { "auxiliary_loss_clip": 0.01060757, "auxiliary_loss_mlp": 0.01042554, "balance_loss_clip": 1.01321626, "balance_loss_mlp": 1.01959276, "epoch": 0.5487449271005561, "flos": 17235327576960.0, "grad_norm": 2.7724145172170562, "language_loss": 0.69432455, "learning_rate": 1.7817527502804928e-06, "loss": 0.71535766, "num_input_tokens_seen": 196659160, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.41210938, "step": 9127, "time_per_iteration": 2.4257378578186035 }, { "auxiliary_loss_clip": 0.01057798, "auxiliary_loss_mlp": 0.01045756, "balance_loss_clip": 1.01744425, "balance_loss_mlp": 1.01910233, "epoch": 0.5488050503532241, "flos": 17340102167040.0, "grad_norm": 1.813821031854537, "language_loss": 0.84516877, "learning_rate": 1.781365618532181e-06, "loss": 0.86620432, "num_input_tokens_seen": 196677410, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.38671875, "step": 9128, "time_per_iteration": 2.3799655437469482 }, { "auxiliary_loss_clip": 0.01059307, "auxiliary_loss_mlp": 0.01047852, "balance_loss_clip": 1.02130389, "balance_loss_mlp": 1.01984167, "epoch": 0.548865173605892, "flos": 17238190308480.0, "grad_norm": 1.8378892621440883, "language_loss": 0.75626445, "learning_rate": 1.7809784950745078e-06, "loss": 0.777336, "num_input_tokens_seen": 196696765, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39453125, "step": 9129, "time_per_iteration": 2.42330002784729 }, { "auxiliary_loss_clip": 0.01061106, "auxiliary_loss_mlp": 0.01044205, "balance_loss_clip": 1.0147723, "balance_loss_mlp": 1.02044392, "epoch": 0.5489252968585601, "flos": 17455769101440.0, "grad_norm": 2.5365159810898232, "language_loss": 0.6503489, "learning_rate": 1.7805913799221511e-06, "loss": 0.67140198, "num_input_tokens_seen": 196714895, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.40625, "step": 9130, "time_per_iteration": 2.36358380317688 }, { "auxiliary_loss_clip": 0.01059274, "auxiliary_loss_mlp": 0.01045499, "balance_loss_clip": 1.01930881, "balance_loss_mlp": 1.0192281, "epoch": 0.548985420111228, "flos": 26322171937920.0, "grad_norm": 1.7023299433605819, "language_loss": 0.64812708, "learning_rate": 1.7802042730897915e-06, "loss": 0.66917479, "num_input_tokens_seen": 196735510, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40039062, "step": 9131, "time_per_iteration": 2.482997179031372 }, { "auxiliary_loss_clip": 0.01058823, "auxiliary_loss_mlp": 0.01042355, "balance_loss_clip": 1.01460278, "balance_loss_mlp": 1.01849937, "epoch": 0.549045543363896, "flos": 18692846772480.0, "grad_norm": 1.8080689508585666, "language_loss": 0.75938559, "learning_rate": 1.7798171745921084e-06, "loss": 0.78039742, "num_input_tokens_seen": 196752855, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40234375, "step": 9132, "time_per_iteration": 2.348004102706909 }, { "auxiliary_loss_clip": 0.01056829, "auxiliary_loss_mlp": 0.01040741, "balance_loss_clip": 1.0157783, "balance_loss_mlp": 1.0173707, "epoch": 0.5491056666165639, "flos": 24716237086080.0, "grad_norm": 1.5645339539347949, "language_loss": 0.82409668, "learning_rate": 1.7794300844437795e-06, "loss": 0.84507239, "num_input_tokens_seen": 196772230, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.39453125, "step": 9133, "time_per_iteration": 2.434020757675171 }, { "auxiliary_loss_clip": 0.01055777, "auxiliary_loss_mlp": 0.01051665, "balance_loss_clip": 1.02601123, "balance_loss_mlp": 1.01753724, "epoch": 0.5491657898692319, "flos": 21575939402880.0, "grad_norm": 1.8929803192733115, "language_loss": 0.71067679, "learning_rate": 1.7790430026594841e-06, "loss": 0.7317512, "num_input_tokens_seen": 196790405, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.3828125, "step": 9134, "time_per_iteration": 2.3680477142333984 }, { "auxiliary_loss_clip": 0.010598, "auxiliary_loss_mlp": 0.01044581, "balance_loss_clip": 1.01835537, "balance_loss_mlp": 1.01895714, "epoch": 0.5492259131219, "flos": 50474562566400.0, "grad_norm": 1.7410897079528607, "language_loss": 0.62225509, "learning_rate": 1.7786559292539004e-06, "loss": 0.64329892, "num_input_tokens_seen": 196813785, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40820312, "step": 9135, "time_per_iteration": 2.660022497177124 }, { "auxiliary_loss_clip": 0.01060233, "auxiliary_loss_mlp": 0.01047796, "balance_loss_clip": 1.0178988, "balance_loss_mlp": 1.01868749, "epoch": 0.5492860363745679, "flos": 25118087368320.0, "grad_norm": 1.7998948514213355, "language_loss": 0.7357502, "learning_rate": 1.7782688642417058e-06, "loss": 0.75683051, "num_input_tokens_seen": 196834390, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.41601562, "step": 9136, "time_per_iteration": 2.3954274654388428 }, { "auxiliary_loss_clip": 0.01061349, "auxiliary_loss_mlp": 0.01045267, "balance_loss_clip": 1.0164659, "balance_loss_mlp": 1.01851749, "epoch": 0.5493461596272359, "flos": 22632795302400.0, "grad_norm": 2.4501265862613857, "language_loss": 0.69885302, "learning_rate": 1.7778818076375781e-06, "loss": 0.71991915, "num_input_tokens_seen": 196853290, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.4296875, "step": 9137, "time_per_iteration": 2.4319875240325928 }, { "auxiliary_loss_clip": 0.01010287, "auxiliary_loss_mlp": 0.01006758, "balance_loss_clip": 1.00445724, "balance_loss_mlp": 1.00325394, "epoch": 0.5494062828799038, "flos": 66148853166720.0, "grad_norm": 0.7614728242226136, "language_loss": 0.65465063, "learning_rate": 1.7774947594561947e-06, "loss": 0.67482108, "num_input_tokens_seen": 196913120, "router_z_loss_clip": 0.02294922, "router_z_loss_mlp": 0.0703125, "step": 9138, "time_per_iteration": 3.0608794689178467 }, { "auxiliary_loss_clip": 0.01058883, "auxiliary_loss_mlp": 0.01040453, "balance_loss_clip": 1.01390481, "balance_loss_mlp": 1.01878309, "epoch": 0.5494664061325718, "flos": 21104891072640.0, "grad_norm": 1.9903706062729905, "language_loss": 0.76111007, "learning_rate": 1.7771077197122321e-06, "loss": 0.78210342, "num_input_tokens_seen": 196931530, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40234375, "step": 9139, "time_per_iteration": 2.4276340007781982 }, { "auxiliary_loss_clip": 0.01057916, "auxiliary_loss_mlp": 0.01041622, "balance_loss_clip": 1.01618242, "balance_loss_mlp": 1.018224, "epoch": 0.5495265293852397, "flos": 14391686649600.0, "grad_norm": 1.9922199902026902, "language_loss": 0.73040128, "learning_rate": 1.7767206884203672e-06, "loss": 0.75139672, "num_input_tokens_seen": 196949430, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.39648438, "step": 9140, "time_per_iteration": 2.3565568923950195 }, { "auxiliary_loss_clip": 0.01056359, "auxiliary_loss_mlp": 0.01039471, "balance_loss_clip": 1.01297116, "balance_loss_mlp": 1.01720381, "epoch": 0.5495866526379077, "flos": 25548182807040.0, "grad_norm": 1.9294143340894114, "language_loss": 0.77730656, "learning_rate": 1.7763336655952762e-06, "loss": 0.7982648, "num_input_tokens_seen": 196968265, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.390625, "step": 9141, "time_per_iteration": 2.46065616607666 }, { "auxiliary_loss_clip": 0.01057258, "auxiliary_loss_mlp": 0.01040371, "balance_loss_clip": 1.01641035, "balance_loss_mlp": 1.01941705, "epoch": 0.5496467758905756, "flos": 21316395288960.0, "grad_norm": 2.012828809143257, "language_loss": 0.76231593, "learning_rate": 1.7759466512516346e-06, "loss": 0.78329229, "num_input_tokens_seen": 196984930, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37890625, "step": 9142, "time_per_iteration": 2.3724138736724854 }, { "auxiliary_loss_clip": 0.0106103, "auxiliary_loss_mlp": 0.01042842, "balance_loss_clip": 1.01498246, "balance_loss_mlp": 1.02071309, "epoch": 0.5497068991432437, "flos": 22232097095040.0, "grad_norm": 1.8764464890788948, "language_loss": 0.78219438, "learning_rate": 1.7755596454041192e-06, "loss": 0.80323309, "num_input_tokens_seen": 197002320, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40234375, "step": 9143, "time_per_iteration": 2.4085166454315186 }, { "auxiliary_loss_clip": 0.01056793, "auxiliary_loss_mlp": 0.01041002, "balance_loss_clip": 1.0144068, "balance_loss_mlp": 1.01811051, "epoch": 0.5497670223959116, "flos": 18478095799680.0, "grad_norm": 2.359969146439659, "language_loss": 0.8166585, "learning_rate": 1.7751726480674044e-06, "loss": 0.83763641, "num_input_tokens_seen": 197020825, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.38671875, "step": 9144, "time_per_iteration": 2.3414220809936523 }, { "auxiliary_loss_clip": 0.01060302, "auxiliary_loss_mlp": 0.01043671, "balance_loss_clip": 1.0175401, "balance_loss_mlp": 1.02054191, "epoch": 0.5498271456485796, "flos": 29203833202560.0, "grad_norm": 1.79061867070704, "language_loss": 0.72018629, "learning_rate": 1.7747856592561645e-06, "loss": 0.74122602, "num_input_tokens_seen": 197040450, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3984375, "step": 9145, "time_per_iteration": 2.4653677940368652 }, { "auxiliary_loss_clip": 0.01057707, "auxiliary_loss_mlp": 0.01044429, "balance_loss_clip": 1.01858437, "balance_loss_mlp": 1.01884604, "epoch": 0.5498872689012475, "flos": 34822929438720.0, "grad_norm": 1.6445743225522298, "language_loss": 0.71706307, "learning_rate": 1.774398678985076e-06, "loss": 0.73808444, "num_input_tokens_seen": 197063930, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38867188, "step": 9146, "time_per_iteration": 2.478318691253662 }, { "auxiliary_loss_clip": 0.01055288, "auxiliary_loss_mlp": 0.01035708, "balance_loss_clip": 1.01337981, "balance_loss_mlp": 1.01894283, "epoch": 0.5499473921539155, "flos": 25920740592000.0, "grad_norm": 1.818373377820676, "language_loss": 0.65076184, "learning_rate": 1.7740117072688113e-06, "loss": 0.67167181, "num_input_tokens_seen": 197082660, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.36328125, "step": 9147, "time_per_iteration": 2.442758798599243 }, { "auxiliary_loss_clip": 0.01057737, "auxiliary_loss_mlp": 0.01043291, "balance_loss_clip": 1.01696908, "balance_loss_mlp": 1.0186944, "epoch": 0.5500075154065835, "flos": 22272596138880.0, "grad_norm": 1.9667471021979908, "language_loss": 0.82788002, "learning_rate": 1.7736247441220458e-06, "loss": 0.8488903, "num_input_tokens_seen": 197100675, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.390625, "step": 9148, "time_per_iteration": 2.3792545795440674 }, { "auxiliary_loss_clip": 0.01057844, "auxiliary_loss_mlp": 0.01044581, "balance_loss_clip": 1.01833153, "balance_loss_mlp": 1.01813698, "epoch": 0.5500676386592515, "flos": 28036267781760.0, "grad_norm": 1.651629966173748, "language_loss": 0.80273116, "learning_rate": 1.773237789559453e-06, "loss": 0.82375538, "num_input_tokens_seen": 197121320, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.39648438, "step": 9149, "time_per_iteration": 2.489269971847534 }, { "auxiliary_loss_clip": 0.01058571, "auxiliary_loss_mlp": 0.01037144, "balance_loss_clip": 1.01268196, "balance_loss_mlp": 1.01903105, "epoch": 0.5501277619119195, "flos": 23913688596480.0, "grad_norm": 1.8980725538064207, "language_loss": 0.7345587, "learning_rate": 1.7728508435957052e-06, "loss": 0.75551587, "num_input_tokens_seen": 197138965, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.39453125, "step": 9150, "time_per_iteration": 2.3829362392425537 }, { "auxiliary_loss_clip": 0.01059936, "auxiliary_loss_mlp": 0.01043876, "balance_loss_clip": 1.01476526, "balance_loss_mlp": 1.01850092, "epoch": 0.5501878851645874, "flos": 20922714264960.0, "grad_norm": 1.943021795179864, "language_loss": 0.76154155, "learning_rate": 1.772463906245477e-06, "loss": 0.78257966, "num_input_tokens_seen": 197156460, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.4140625, "step": 9151, "time_per_iteration": 2.4296443462371826 }, { "auxiliary_loss_clip": 0.01058362, "auxiliary_loss_mlp": 0.01041265, "balance_loss_clip": 1.01549184, "balance_loss_mlp": 1.01897454, "epoch": 0.5502480084172554, "flos": 20664322225920.0, "grad_norm": 2.2818107367432394, "language_loss": 0.76577061, "learning_rate": 1.7720769775234394e-06, "loss": 0.78676689, "num_input_tokens_seen": 197175140, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.39453125, "step": 9152, "time_per_iteration": 3.7227983474731445 }, { "auxiliary_loss_clip": 0.01057719, "auxiliary_loss_mlp": 0.0103777, "balance_loss_clip": 1.0139401, "balance_loss_mlp": 1.01881444, "epoch": 0.5503081316699233, "flos": 26431345359360.0, "grad_norm": 1.8805064295393548, "language_loss": 0.83817899, "learning_rate": 1.7716900574442662e-06, "loss": 0.85913396, "num_input_tokens_seen": 197194345, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.38867188, "step": 9153, "time_per_iteration": 2.4264204502105713 }, { "auxiliary_loss_clip": 0.01057937, "auxiliary_loss_mlp": 0.01043549, "balance_loss_clip": 1.01628625, "balance_loss_mlp": 1.01885462, "epoch": 0.5503682549225913, "flos": 30627800714880.0, "grad_norm": 1.8086797563121977, "language_loss": 0.75392109, "learning_rate": 1.7713031460226294e-06, "loss": 0.77493596, "num_input_tokens_seen": 197215535, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.390625, "step": 9154, "time_per_iteration": 3.8839645385742188 }, { "auxiliary_loss_clip": 0.01061868, "auxiliary_loss_mlp": 0.01044657, "balance_loss_clip": 1.01759696, "balance_loss_mlp": 1.02007222, "epoch": 0.5504283781752592, "flos": 22564330392960.0, "grad_norm": 1.8243345205920185, "language_loss": 0.73644543, "learning_rate": 1.770916243273199e-06, "loss": 0.75751066, "num_input_tokens_seen": 197234945, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.41796875, "step": 9155, "time_per_iteration": 2.3925323486328125 }, { "auxiliary_loss_clip": 0.01010598, "auxiliary_loss_mlp": 0.01003507, "balance_loss_clip": 1.00118196, "balance_loss_mlp": 1.00365508, "epoch": 0.5504885014279273, "flos": 67898071704960.0, "grad_norm": 0.7483967329612776, "language_loss": 0.55426359, "learning_rate": 1.7705293492106483e-06, "loss": 0.5744046, "num_input_tokens_seen": 197302285, "router_z_loss_clip": 0.02319336, "router_z_loss_mlp": 0.06933594, "step": 9156, "time_per_iteration": 4.5518903732299805 }, { "auxiliary_loss_clip": 0.01056535, "auxiliary_loss_mlp": 0.01037358, "balance_loss_clip": 1.01349211, "balance_loss_mlp": 1.01811743, "epoch": 0.5505486246805952, "flos": 22449117306240.0, "grad_norm": 1.6590720663489695, "language_loss": 0.83128595, "learning_rate": 1.7701424638496475e-06, "loss": 0.85222483, "num_input_tokens_seen": 197321575, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.38476562, "step": 9157, "time_per_iteration": 2.4331817626953125 }, { "auxiliary_loss_clip": 0.01060699, "auxiliary_loss_mlp": 0.01044961, "balance_loss_clip": 1.01497984, "balance_loss_mlp": 1.0191648, "epoch": 0.5506087479332632, "flos": 26905675357440.0, "grad_norm": 2.1230303098158467, "language_loss": 0.77217239, "learning_rate": 1.7697555872048677e-06, "loss": 0.79322898, "num_input_tokens_seen": 197340255, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.4140625, "step": 9158, "time_per_iteration": 2.4065189361572266 }, { "auxiliary_loss_clip": 0.01056571, "auxiliary_loss_mlp": 0.01040119, "balance_loss_clip": 1.01658714, "balance_loss_mlp": 1.01893508, "epoch": 0.5506688711859311, "flos": 22929137856000.0, "grad_norm": 1.603470520352, "language_loss": 0.71154547, "learning_rate": 1.769368719290979e-06, "loss": 0.73251241, "num_input_tokens_seen": 197360360, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.375, "step": 9159, "time_per_iteration": 2.4332242012023926 }, { "auxiliary_loss_clip": 0.01058774, "auxiliary_loss_mlp": 0.0104438, "balance_loss_clip": 1.01721215, "balance_loss_mlp": 1.01881242, "epoch": 0.5507289944385991, "flos": 29605124903040.0, "grad_norm": 1.6310180404094303, "language_loss": 0.69669431, "learning_rate": 1.7689818601226516e-06, "loss": 0.71772587, "num_input_tokens_seen": 197381905, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.3984375, "step": 9160, "time_per_iteration": 2.4329280853271484 }, { "auxiliary_loss_clip": 0.01055675, "auxiliary_loss_mlp": 0.01039526, "balance_loss_clip": 1.01589859, "balance_loss_mlp": 1.01898551, "epoch": 0.5507891176912671, "flos": 15333713487360.0, "grad_norm": 1.911008898251302, "language_loss": 0.73328185, "learning_rate": 1.7685950097145552e-06, "loss": 0.75423384, "num_input_tokens_seen": 197398555, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3671875, "step": 9161, "time_per_iteration": 2.358996629714966 }, { "auxiliary_loss_clip": 0.01058389, "auxiliary_loss_mlp": 0.01044513, "balance_loss_clip": 1.01671302, "balance_loss_mlp": 1.01892507, "epoch": 0.5508492409439351, "flos": 26577107752320.0, "grad_norm": 1.5852653899863, "language_loss": 0.70630443, "learning_rate": 1.768208168081359e-06, "loss": 0.72733349, "num_input_tokens_seen": 197419630, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.39453125, "step": 9162, "time_per_iteration": 2.454695701599121 }, { "auxiliary_loss_clip": 0.01056287, "auxiliary_loss_mlp": 0.01044026, "balance_loss_clip": 1.01770473, "balance_loss_mlp": 1.01784992, "epoch": 0.5509093641966031, "flos": 25442360876160.0, "grad_norm": 1.8403649382133367, "language_loss": 0.86780512, "learning_rate": 1.767821335237733e-06, "loss": 0.88880825, "num_input_tokens_seen": 197438480, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38476562, "step": 9163, "time_per_iteration": 3.812758684158325 }, { "auxiliary_loss_clip": 0.01058324, "auxiliary_loss_mlp": 0.01041836, "balance_loss_clip": 1.01652741, "balance_loss_mlp": 1.01998055, "epoch": 0.550969487449271, "flos": 18697524894720.0, "grad_norm": 1.5936049004617316, "language_loss": 0.81575346, "learning_rate": 1.7674345111983441e-06, "loss": 0.83675498, "num_input_tokens_seen": 197456755, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3828125, "step": 9164, "time_per_iteration": 2.3507559299468994 }, { "auxiliary_loss_clip": 0.01059465, "auxiliary_loss_mlp": 0.01040372, "balance_loss_clip": 1.01304948, "balance_loss_mlp": 1.01955223, "epoch": 0.551029610701939, "flos": 22707683902080.0, "grad_norm": 2.0132349047060423, "language_loss": 0.75174403, "learning_rate": 1.767047695977863e-06, "loss": 0.77274239, "num_input_tokens_seen": 197475530, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.3984375, "step": 9165, "time_per_iteration": 2.413749933242798 }, { "auxiliary_loss_clip": 0.01055703, "auxiliary_loss_mlp": 0.01042199, "balance_loss_clip": 1.01654553, "balance_loss_mlp": 1.01743317, "epoch": 0.5510897339546069, "flos": 12419722437120.0, "grad_norm": 1.9307878726343064, "language_loss": 0.8033185, "learning_rate": 1.7666608895909563e-06, "loss": 0.82429749, "num_input_tokens_seen": 197490835, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.3828125, "step": 9166, "time_per_iteration": 2.323681592941284 }, { "auxiliary_loss_clip": 0.01058961, "auxiliary_loss_mlp": 0.01040035, "balance_loss_clip": 1.01298642, "balance_loss_mlp": 1.0184536, "epoch": 0.5511498572072749, "flos": 18769585674240.0, "grad_norm": 2.0362108526293228, "language_loss": 0.76960921, "learning_rate": 1.7662740920522913e-06, "loss": 0.79059917, "num_input_tokens_seen": 197508770, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40429688, "step": 9167, "time_per_iteration": 2.389540910720825 }, { "auxiliary_loss_clip": 0.01055936, "auxiliary_loss_mlp": 0.01044626, "balance_loss_clip": 1.01993799, "balance_loss_mlp": 1.01783371, "epoch": 0.5512099804599428, "flos": 19572308720640.0, "grad_norm": 2.9724391212911816, "language_loss": 0.81968546, "learning_rate": 1.7658873033765374e-06, "loss": 0.84069109, "num_input_tokens_seen": 197527340, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.38085938, "step": 9168, "time_per_iteration": 2.347132444381714 }, { "auxiliary_loss_clip": 0.01059637, "auxiliary_loss_mlp": 0.01050424, "balance_loss_clip": 1.02266049, "balance_loss_mlp": 1.01880789, "epoch": 0.5512701037126109, "flos": 26244525340800.0, "grad_norm": 1.6442007315658245, "language_loss": 0.70274997, "learning_rate": 1.7655005235783591e-06, "loss": 0.72385061, "num_input_tokens_seen": 197547280, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40820312, "step": 9169, "time_per_iteration": 2.4525113105773926 }, { "auxiliary_loss_clip": 0.01054048, "auxiliary_loss_mlp": 0.01040807, "balance_loss_clip": 1.01717997, "balance_loss_mlp": 1.01662254, "epoch": 0.5513302269652788, "flos": 21944307824640.0, "grad_norm": 2.064096716722073, "language_loss": 0.86044455, "learning_rate": 1.7651137526724251e-06, "loss": 0.88139307, "num_input_tokens_seen": 197565045, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37304688, "step": 9170, "time_per_iteration": 2.364689350128174 }, { "auxiliary_loss_clip": 0.01009583, "auxiliary_loss_mlp": 0.01008562, "balance_loss_clip": 1.00638044, "balance_loss_mlp": 1.00234532, "epoch": 0.5513903502179468, "flos": 68232818620800.0, "grad_norm": 0.784304240043542, "language_loss": 0.60005444, "learning_rate": 1.7647269906734017e-06, "loss": 0.62023592, "num_input_tokens_seen": 197625005, "router_z_loss_clip": 0.02185059, "router_z_loss_mlp": 0.07226562, "step": 9171, "time_per_iteration": 3.057135581970215 }, { "auxiliary_loss_clip": 0.01056048, "auxiliary_loss_mlp": 0.01048219, "balance_loss_clip": 1.02291048, "balance_loss_mlp": 1.01789653, "epoch": 0.5514504734706147, "flos": 18733241082240.0, "grad_norm": 1.6370679319809294, "language_loss": 0.71507764, "learning_rate": 1.7643402375959533e-06, "loss": 0.73612034, "num_input_tokens_seen": 197645050, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3828125, "step": 9172, "time_per_iteration": 2.3793253898620605 }, { "auxiliary_loss_clip": 0.01056245, "auxiliary_loss_mlp": 0.01043037, "balance_loss_clip": 1.01818156, "balance_loss_mlp": 1.01813924, "epoch": 0.5515105967232827, "flos": 22269942875520.0, "grad_norm": 1.9080651096520829, "language_loss": 0.77429104, "learning_rate": 1.7639534934547474e-06, "loss": 0.79528385, "num_input_tokens_seen": 197663910, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38085938, "step": 9173, "time_per_iteration": 2.412353277206421 }, { "auxiliary_loss_clip": 0.0105692, "auxiliary_loss_mlp": 0.01043656, "balance_loss_clip": 1.01752555, "balance_loss_mlp": 1.01841056, "epoch": 0.5515707199759508, "flos": 22556789539200.0, "grad_norm": 1.6313557595557857, "language_loss": 0.76699287, "learning_rate": 1.7635667582644484e-06, "loss": 0.78799868, "num_input_tokens_seen": 197681580, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38476562, "step": 9174, "time_per_iteration": 2.4581298828125 }, { "auxiliary_loss_clip": 0.01059179, "auxiliary_loss_mlp": 0.01040547, "balance_loss_clip": 1.0146904, "balance_loss_mlp": 1.01957667, "epoch": 0.5516308432286187, "flos": 28289876964480.0, "grad_norm": 1.6873688739908637, "language_loss": 0.73654068, "learning_rate": 1.7631800320397217e-06, "loss": 0.75753796, "num_input_tokens_seen": 197702095, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.39453125, "step": 9175, "time_per_iteration": 2.5249581336975098 }, { "auxiliary_loss_clip": 0.01058421, "auxiliary_loss_mlp": 0.010439, "balance_loss_clip": 1.01688695, "balance_loss_mlp": 1.01816273, "epoch": 0.5516909664812867, "flos": 18763650743040.0, "grad_norm": 1.9715915898190761, "language_loss": 0.70634639, "learning_rate": 1.7627933147952318e-06, "loss": 0.72736961, "num_input_tokens_seen": 197720720, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40234375, "step": 9176, "time_per_iteration": 2.369164228439331 }, { "auxiliary_loss_clip": 0.01056797, "auxiliary_loss_mlp": 0.01039555, "balance_loss_clip": 1.01319742, "balance_loss_mlp": 1.01835287, "epoch": 0.5517510897339546, "flos": 27739261912320.0, "grad_norm": 1.7803438814773822, "language_loss": 0.72027975, "learning_rate": 1.7624066065456435e-06, "loss": 0.74124324, "num_input_tokens_seen": 197741820, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.38476562, "step": 9177, "time_per_iteration": 2.485382318496704 }, { "auxiliary_loss_clip": 0.01059516, "auxiliary_loss_mlp": 0.01041565, "balance_loss_clip": 1.01570892, "balance_loss_mlp": 1.01976287, "epoch": 0.5518112129866226, "flos": 18403521402240.0, "grad_norm": 1.6251339359601547, "language_loss": 0.81377089, "learning_rate": 1.7620199073056204e-06, "loss": 0.83478171, "num_input_tokens_seen": 197759160, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3984375, "step": 9178, "time_per_iteration": 2.359722852706909 }, { "auxiliary_loss_clip": 0.01059886, "auxiliary_loss_mlp": 0.01047798, "balance_loss_clip": 1.02011776, "balance_loss_mlp": 1.01927853, "epoch": 0.5518713362392905, "flos": 25081498396800.0, "grad_norm": 1.5878298282377064, "language_loss": 0.75393605, "learning_rate": 1.761633217089826e-06, "loss": 0.77501297, "num_input_tokens_seen": 197779760, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40625, "step": 9179, "time_per_iteration": 2.4454519748687744 }, { "auxiliary_loss_clip": 0.01058863, "auxiliary_loss_mlp": 0.01037536, "balance_loss_clip": 1.01253748, "balance_loss_mlp": 1.01919413, "epoch": 0.5519314594919585, "flos": 36537514041600.0, "grad_norm": 1.6244368030687935, "language_loss": 0.7070663, "learning_rate": 1.761246535912924e-06, "loss": 0.72803032, "num_input_tokens_seen": 197801545, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.39648438, "step": 9180, "time_per_iteration": 2.5000698566436768 }, { "auxiliary_loss_clip": 0.01060585, "auxiliary_loss_mlp": 0.01046234, "balance_loss_clip": 1.01713526, "balance_loss_mlp": 1.02002239, "epoch": 0.5519915827446265, "flos": 20447581305600.0, "grad_norm": 2.1207412318155554, "language_loss": 0.68842822, "learning_rate": 1.7608598637895776e-06, "loss": 0.70949638, "num_input_tokens_seen": 197820760, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.40429688, "step": 9181, "time_per_iteration": 2.4087836742401123 }, { "auxiliary_loss_clip": 0.01060608, "auxiliary_loss_mlp": 0.01045185, "balance_loss_clip": 1.01588368, "balance_loss_mlp": 1.01898932, "epoch": 0.5520517059972945, "flos": 23766948685440.0, "grad_norm": 1.913604945843073, "language_loss": 0.80197525, "learning_rate": 1.7604732007344486e-06, "loss": 0.82303315, "num_input_tokens_seen": 197840195, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.41601562, "step": 9182, "time_per_iteration": 2.3929319381713867 }, { "auxiliary_loss_clip": 0.01058953, "auxiliary_loss_mlp": 0.01040612, "balance_loss_clip": 1.01328969, "balance_loss_mlp": 1.01886892, "epoch": 0.5521118292499624, "flos": 22195473212160.0, "grad_norm": 2.3564041284992383, "language_loss": 0.84101868, "learning_rate": 1.7600865467622003e-06, "loss": 0.86201435, "num_input_tokens_seen": 197859475, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40039062, "step": 9183, "time_per_iteration": 2.4321517944335938 }, { "auxiliary_loss_clip": 0.01057446, "auxiliary_loss_mlp": 0.01040794, "balance_loss_clip": 1.01419806, "balance_loss_mlp": 1.01842761, "epoch": 0.5521719525026304, "flos": 23582258259840.0, "grad_norm": 1.3356188840544858, "language_loss": 0.6805346, "learning_rate": 1.7596999018874936e-06, "loss": 0.70151699, "num_input_tokens_seen": 197879395, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.390625, "step": 9184, "time_per_iteration": 2.400406837463379 }, { "auxiliary_loss_clip": 0.01059987, "auxiliary_loss_mlp": 0.01039879, "balance_loss_clip": 1.01117325, "balance_loss_mlp": 1.02014494, "epoch": 0.5522320757552983, "flos": 26136503994240.0, "grad_norm": 1.5977206569476512, "language_loss": 0.77196604, "learning_rate": 1.7593132661249917e-06, "loss": 0.7929647, "num_input_tokens_seen": 197900815, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.3984375, "step": 9185, "time_per_iteration": 2.429492235183716 }, { "auxiliary_loss_clip": 0.01059312, "auxiliary_loss_mlp": 0.01050472, "balance_loss_clip": 1.02195764, "balance_loss_mlp": 1.01875532, "epoch": 0.5522921990079663, "flos": 24675144549120.0, "grad_norm": 1.7531684120012776, "language_loss": 0.75334585, "learning_rate": 1.7589266394893536e-06, "loss": 0.77444363, "num_input_tokens_seen": 197918985, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40625, "step": 9186, "time_per_iteration": 2.390329599380493 }, { "auxiliary_loss_clip": 0.01061007, "auxiliary_loss_mlp": 0.01048144, "balance_loss_clip": 1.0211674, "balance_loss_mlp": 1.02012503, "epoch": 0.5523523222606344, "flos": 22747030871040.0, "grad_norm": 1.8701277765106823, "language_loss": 0.67645961, "learning_rate": 1.7585400219952421e-06, "loss": 0.69755113, "num_input_tokens_seen": 197937725, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40820312, "step": 9187, "time_per_iteration": 2.4065563678741455 }, { "auxiliary_loss_clip": 0.01060101, "auxiliary_loss_mlp": 0.01045413, "balance_loss_clip": 1.01916349, "balance_loss_mlp": 1.01991844, "epoch": 0.5524124455133023, "flos": 19754799730560.0, "grad_norm": 1.8499425442188828, "language_loss": 0.78257847, "learning_rate": 1.758153413657318e-06, "loss": 0.80363363, "num_input_tokens_seen": 197955635, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40234375, "step": 9188, "time_per_iteration": 2.377748489379883 }, { "auxiliary_loss_clip": 0.01058234, "auxiliary_loss_mlp": 0.01043227, "balance_loss_clip": 1.01621377, "balance_loss_mlp": 1.01853347, "epoch": 0.5524725687659703, "flos": 23293700939520.0, "grad_norm": 1.8213033852410643, "language_loss": 0.83582628, "learning_rate": 1.7577668144902394e-06, "loss": 0.85684085, "num_input_tokens_seen": 197974490, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.39648438, "step": 9189, "time_per_iteration": 2.378924608230591 }, { "auxiliary_loss_clip": 0.01058503, "auxiliary_loss_mlp": 0.01044242, "balance_loss_clip": 1.01767063, "balance_loss_mlp": 1.01921511, "epoch": 0.5525326920186382, "flos": 24861056872320.0, "grad_norm": 1.6622967414901515, "language_loss": 0.77373821, "learning_rate": 1.7573802245086684e-06, "loss": 0.79476559, "num_input_tokens_seen": 197995735, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39257812, "step": 9190, "time_per_iteration": 2.4461700916290283 }, { "auxiliary_loss_clip": 0.01061698, "auxiliary_loss_mlp": 0.01052716, "balance_loss_clip": 1.02119708, "balance_loss_mlp": 1.0188098, "epoch": 0.5525928152713062, "flos": 13734725996160.0, "grad_norm": 2.560013088717474, "language_loss": 0.80507934, "learning_rate": 1.7569936437272627e-06, "loss": 0.82622343, "num_input_tokens_seen": 198009685, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 0.4296875, "step": 9191, "time_per_iteration": 2.340862274169922 }, { "auxiliary_loss_clip": 0.0105698, "auxiliary_loss_mlp": 0.01041044, "balance_loss_clip": 1.01512825, "balance_loss_mlp": 1.01812005, "epoch": 0.5526529385239741, "flos": 13070957627520.0, "grad_norm": 1.767515491681345, "language_loss": 0.69881183, "learning_rate": 1.7566070721606829e-06, "loss": 0.71979213, "num_input_tokens_seen": 198026845, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.38867188, "step": 9192, "time_per_iteration": 3.5769171714782715 }, { "auxiliary_loss_clip": 0.01056235, "auxiliary_loss_mlp": 0.01037714, "balance_loss_clip": 1.01328766, "balance_loss_mlp": 1.0183475, "epoch": 0.5527130617766421, "flos": 23147275230720.0, "grad_norm": 1.4675193631851224, "language_loss": 0.78197569, "learning_rate": 1.756220509823588e-06, "loss": 0.80291522, "num_input_tokens_seen": 198045275, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37890625, "step": 9193, "time_per_iteration": 3.776726722717285 }, { "auxiliary_loss_clip": 0.01058172, "auxiliary_loss_mlp": 0.01047287, "balance_loss_clip": 1.02171636, "balance_loss_mlp": 1.01765156, "epoch": 0.55277318502931, "flos": 21284554262400.0, "grad_norm": 1.5600468795275042, "language_loss": 0.79391778, "learning_rate": 1.7558339567306344e-06, "loss": 0.8149724, "num_input_tokens_seen": 198065760, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.40625, "step": 9194, "time_per_iteration": 2.3703150749206543 }, { "auxiliary_loss_clip": 0.0106053, "auxiliary_loss_mlp": 0.01041485, "balance_loss_clip": 1.01525903, "balance_loss_mlp": 1.01778853, "epoch": 0.5528333082819781, "flos": 38323077171840.0, "grad_norm": 2.041674738626649, "language_loss": 0.70133263, "learning_rate": 1.7554474128964825e-06, "loss": 0.72235274, "num_input_tokens_seen": 198087595, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.42773438, "step": 9195, "time_per_iteration": 3.9303488731384277 }, { "auxiliary_loss_clip": 0.0106237, "auxiliary_loss_mlp": 0.01047074, "balance_loss_clip": 1.01774824, "balance_loss_mlp": 1.01920605, "epoch": 0.552893431534646, "flos": 13552758656640.0, "grad_norm": 1.9671246255640147, "language_loss": 0.75125611, "learning_rate": 1.7550608783357887e-06, "loss": 0.77235055, "num_input_tokens_seen": 198104620, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.43164062, "step": 9196, "time_per_iteration": 2.403614044189453 }, { "auxiliary_loss_clip": 0.01057416, "auxiliary_loss_mlp": 0.01038935, "balance_loss_clip": 1.01399636, "balance_loss_mlp": 1.01893687, "epoch": 0.552953554787314, "flos": 21938477627520.0, "grad_norm": 1.519266390381783, "language_loss": 0.77587616, "learning_rate": 1.7546743530632115e-06, "loss": 0.79683965, "num_input_tokens_seen": 198123565, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.38476562, "step": 9197, "time_per_iteration": 2.384580612182617 }, { "auxiliary_loss_clip": 0.01057892, "auxiliary_loss_mlp": 0.01036408, "balance_loss_clip": 1.01206553, "balance_loss_mlp": 1.01864028, "epoch": 0.5530136780399819, "flos": 43656199528320.0, "grad_norm": 2.729385044324578, "language_loss": 0.7685467, "learning_rate": 1.754287837093407e-06, "loss": 0.78948969, "num_input_tokens_seen": 198148270, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.39257812, "step": 9198, "time_per_iteration": 2.6204497814178467 }, { "auxiliary_loss_clip": 0.0105585, "auxiliary_loss_mlp": 0.01039715, "balance_loss_clip": 1.01482391, "balance_loss_mlp": 1.01713586, "epoch": 0.5530738012926499, "flos": 25044350843520.0, "grad_norm": 1.5509690090048953, "language_loss": 0.79867828, "learning_rate": 1.7539013304410327e-06, "loss": 0.81963396, "num_input_tokens_seen": 198168810, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.38671875, "step": 9199, "time_per_iteration": 2.437286376953125 }, { "auxiliary_loss_clip": 0.0105681, "auxiliary_loss_mlp": 0.01037249, "balance_loss_clip": 1.01164293, "balance_loss_mlp": 1.0172224, "epoch": 0.553133924545318, "flos": 16471148538240.0, "grad_norm": 1.6753277825684778, "language_loss": 0.64931262, "learning_rate": 1.7535148331207443e-06, "loss": 0.67025316, "num_input_tokens_seen": 198186200, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.39648438, "step": 9200, "time_per_iteration": 2.369597911834717 }, { "auxiliary_loss_clip": 0.01059909, "auxiliary_loss_mlp": 0.01042238, "balance_loss_clip": 1.01150608, "balance_loss_mlp": 1.01835704, "epoch": 0.5531940477979859, "flos": 24605108628480.0, "grad_norm": 1.5441726668124012, "language_loss": 0.66981852, "learning_rate": 1.7531283451471978e-06, "loss": 0.69084001, "num_input_tokens_seen": 198207050, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.41601562, "step": 9201, "time_per_iteration": 2.4304957389831543 }, { "auxiliary_loss_clip": 0.01060132, "auxiliary_loss_mlp": 0.01048467, "balance_loss_clip": 1.0194633, "balance_loss_mlp": 1.02012205, "epoch": 0.5532541710506539, "flos": 22158604949760.0, "grad_norm": 1.945114592552578, "language_loss": 0.62201154, "learning_rate": 1.7527418665350502e-06, "loss": 0.64309752, "num_input_tokens_seen": 198224565, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.40039062, "step": 9202, "time_per_iteration": 2.42339825630188 }, { "auxiliary_loss_clip": 0.01057778, "auxiliary_loss_mlp": 0.01040281, "balance_loss_clip": 1.01389968, "balance_loss_mlp": 1.01951957, "epoch": 0.5533142943033218, "flos": 21396206390400.0, "grad_norm": 2.078790064874674, "language_loss": 0.65846097, "learning_rate": 1.7523553972989548e-06, "loss": 0.67944157, "num_input_tokens_seen": 198244790, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.3828125, "step": 9203, "time_per_iteration": 3.7850546836853027 }, { "auxiliary_loss_clip": 0.01057199, "auxiliary_loss_mlp": 0.0103847, "balance_loss_clip": 1.01190984, "balance_loss_mlp": 1.01755726, "epoch": 0.5533744175559898, "flos": 23549404803840.0, "grad_norm": 1.5353117249854624, "language_loss": 0.6431849, "learning_rate": 1.7519689374535683e-06, "loss": 0.66414154, "num_input_tokens_seen": 198264375, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39453125, "step": 9204, "time_per_iteration": 2.429094076156616 }, { "auxiliary_loss_clip": 0.01056566, "auxiliary_loss_mlp": 0.01045506, "balance_loss_clip": 1.02077007, "balance_loss_mlp": 1.01784348, "epoch": 0.5534345408086577, "flos": 24060358684800.0, "grad_norm": 1.5521957660321217, "language_loss": 0.78408074, "learning_rate": 1.7515824870135445e-06, "loss": 0.80510151, "num_input_tokens_seen": 198283895, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38671875, "step": 9205, "time_per_iteration": 2.4150044918060303 }, { "auxiliary_loss_clip": 0.01058247, "auxiliary_loss_mlp": 0.01042597, "balance_loss_clip": 1.01757479, "balance_loss_mlp": 1.0199337, "epoch": 0.5534946640613257, "flos": 33770262902400.0, "grad_norm": 1.383561578313534, "language_loss": 0.73353618, "learning_rate": 1.751196045993537e-06, "loss": 0.75454462, "num_input_tokens_seen": 198310035, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38476562, "step": 9206, "time_per_iteration": 2.529114246368408 }, { "auxiliary_loss_clip": 0.01058336, "auxiliary_loss_mlp": 0.0104665, "balance_loss_clip": 1.01943469, "balance_loss_mlp": 1.01896524, "epoch": 0.5535547873139937, "flos": 15158309483520.0, "grad_norm": 2.0684553373031576, "language_loss": 0.76330268, "learning_rate": 1.7508096144082012e-06, "loss": 0.78435254, "num_input_tokens_seen": 198327810, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.39453125, "step": 9207, "time_per_iteration": 2.3636984825134277 }, { "auxiliary_loss_clip": 0.01061024, "auxiliary_loss_mlp": 0.01048886, "balance_loss_clip": 1.01944113, "balance_loss_mlp": 1.0195483, "epoch": 0.5536149105666617, "flos": 16979972826240.0, "grad_norm": 3.3978010434470387, "language_loss": 0.64710319, "learning_rate": 1.750423192272189e-06, "loss": 0.66820228, "num_input_tokens_seen": 198343150, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4140625, "step": 9208, "time_per_iteration": 2.363457441329956 }, { "auxiliary_loss_clip": 0.01058564, "auxiliary_loss_mlp": 0.01040014, "balance_loss_clip": 1.01352549, "balance_loss_mlp": 1.01869249, "epoch": 0.5536750338193296, "flos": 18148969612800.0, "grad_norm": 2.0488900148248157, "language_loss": 0.66506851, "learning_rate": 1.7500367796001547e-06, "loss": 0.68605435, "num_input_tokens_seen": 198360925, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.3984375, "step": 9209, "time_per_iteration": 2.363802433013916 }, { "auxiliary_loss_clip": 0.01058367, "auxiliary_loss_mlp": 0.01050775, "balance_loss_clip": 1.02283204, "balance_loss_mlp": 1.01834893, "epoch": 0.5537351570719976, "flos": 22746681757440.0, "grad_norm": 2.045828516512412, "language_loss": 0.83813059, "learning_rate": 1.7496503764067513e-06, "loss": 0.85922205, "num_input_tokens_seen": 198379265, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40039062, "step": 9210, "time_per_iteration": 2.4034178256988525 }, { "auxiliary_loss_clip": 0.01055989, "auxiliary_loss_mlp": 0.01043739, "balance_loss_clip": 1.01908612, "balance_loss_mlp": 1.01767623, "epoch": 0.5537952803246655, "flos": 26354920659840.0, "grad_norm": 4.461585705198792, "language_loss": 0.73940337, "learning_rate": 1.74926398270663e-06, "loss": 0.76040059, "num_input_tokens_seen": 198399490, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3828125, "step": 9211, "time_per_iteration": 2.4348394870758057 }, { "auxiliary_loss_clip": 0.01060295, "auxiliary_loss_mlp": 0.01047412, "balance_loss_clip": 1.01738286, "balance_loss_mlp": 1.01858091, "epoch": 0.5538554035773335, "flos": 18036549434880.0, "grad_norm": 2.3025685649430505, "language_loss": 0.67700422, "learning_rate": 1.7488775985144437e-06, "loss": 0.69808125, "num_input_tokens_seen": 198419110, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.41796875, "step": 9212, "time_per_iteration": 2.385986566543579 }, { "auxiliary_loss_clip": 0.01059664, "auxiliary_loss_mlp": 0.0104467, "balance_loss_clip": 1.01449835, "balance_loss_mlp": 1.01704848, "epoch": 0.5539155268300014, "flos": 31684900993920.0, "grad_norm": 1.4806686956497581, "language_loss": 0.53044724, "learning_rate": 1.7484912238448443e-06, "loss": 0.55149055, "num_input_tokens_seen": 198441360, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.42578125, "step": 9213, "time_per_iteration": 2.466909646987915 }, { "auxiliary_loss_clip": 0.01060093, "auxiliary_loss_mlp": 0.01046559, "balance_loss_clip": 1.02014208, "balance_loss_mlp": 1.01943111, "epoch": 0.5539756500826695, "flos": 15192908507520.0, "grad_norm": 2.666401618199012, "language_loss": 0.87095213, "learning_rate": 1.7481048587124827e-06, "loss": 0.89201862, "num_input_tokens_seen": 198459835, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40625, "step": 9214, "time_per_iteration": 2.366302967071533 }, { "auxiliary_loss_clip": 0.01058483, "auxiliary_loss_mlp": 0.01041606, "balance_loss_clip": 1.01510561, "balance_loss_mlp": 1.01983917, "epoch": 0.5540357733353375, "flos": 26352092839680.0, "grad_norm": 1.6833256450571068, "language_loss": 0.71177828, "learning_rate": 1.7477185031320108e-06, "loss": 0.7327792, "num_input_tokens_seen": 198478955, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.38671875, "step": 9215, "time_per_iteration": 2.428729772567749 }, { "auxiliary_loss_clip": 0.01060357, "auxiliary_loss_mlp": 0.01040615, "balance_loss_clip": 1.01350701, "balance_loss_mlp": 1.01950169, "epoch": 0.5540958965880054, "flos": 21322644422400.0, "grad_norm": 3.5470472541215035, "language_loss": 0.74340463, "learning_rate": 1.7473321571180773e-06, "loss": 0.76441431, "num_input_tokens_seen": 198499030, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40820312, "step": 9216, "time_per_iteration": 2.415776491165161 }, { "auxiliary_loss_clip": 0.01055119, "auxiliary_loss_mlp": 0.01037641, "balance_loss_clip": 1.01369214, "balance_loss_mlp": 1.01777899, "epoch": 0.5541560198406734, "flos": 25665630220800.0, "grad_norm": 2.1169619820523584, "language_loss": 0.73433208, "learning_rate": 1.7469458206853345e-06, "loss": 0.75525969, "num_input_tokens_seen": 198520265, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37304688, "step": 9217, "time_per_iteration": 2.429340124130249 }, { "auxiliary_loss_clip": 0.01055415, "auxiliary_loss_mlp": 0.01037641, "balance_loss_clip": 1.01384676, "balance_loss_mlp": 1.01749587, "epoch": 0.5542161430933413, "flos": 21938687095680.0, "grad_norm": 1.7233229313224785, "language_loss": 0.79160762, "learning_rate": 1.7465594938484315e-06, "loss": 0.81253815, "num_input_tokens_seen": 198539645, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37890625, "step": 9218, "time_per_iteration": 2.3886399269104004 }, { "auxiliary_loss_clip": 0.01059, "auxiliary_loss_mlp": 0.01046397, "balance_loss_clip": 1.01573682, "balance_loss_mlp": 1.01734102, "epoch": 0.5542762663460093, "flos": 19570493329920.0, "grad_norm": 1.6695513465954157, "language_loss": 0.73087752, "learning_rate": 1.7461731766220176e-06, "loss": 0.75193149, "num_input_tokens_seen": 198558710, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.41601562, "step": 9219, "time_per_iteration": 2.38936448097229 }, { "auxiliary_loss_clip": 0.01060027, "auxiliary_loss_mlp": 0.01042346, "balance_loss_clip": 1.01533306, "balance_loss_mlp": 1.01891708, "epoch": 0.5543363895986773, "flos": 19498083436800.0, "grad_norm": 1.6751041061945051, "language_loss": 0.72195196, "learning_rate": 1.7457868690207426e-06, "loss": 0.74297571, "num_input_tokens_seen": 198577050, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.41015625, "step": 9220, "time_per_iteration": 2.3764688968658447 }, { "auxiliary_loss_clip": 0.01056131, "auxiliary_loss_mlp": 0.01037375, "balance_loss_clip": 1.01240098, "balance_loss_mlp": 1.01766109, "epoch": 0.5543965128513453, "flos": 22634575781760.0, "grad_norm": 1.5590197795281522, "language_loss": 0.80327368, "learning_rate": 1.7454005710592547e-06, "loss": 0.82420874, "num_input_tokens_seen": 198595290, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.38476562, "step": 9221, "time_per_iteration": 2.391430377960205 }, { "auxiliary_loss_clip": 0.01057491, "auxiliary_loss_mlp": 0.01043244, "balance_loss_clip": 1.01555204, "balance_loss_mlp": 1.01797605, "epoch": 0.5544566361040132, "flos": 25988891299200.0, "grad_norm": 3.3599689568082294, "language_loss": 0.84447789, "learning_rate": 1.7450142827522027e-06, "loss": 0.86548531, "num_input_tokens_seen": 198614110, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.39453125, "step": 9222, "time_per_iteration": 2.4187347888946533 }, { "auxiliary_loss_clip": 0.0106106, "auxiliary_loss_mlp": 0.01050578, "balance_loss_clip": 1.02112174, "balance_loss_mlp": 1.01922345, "epoch": 0.5545167593566812, "flos": 28256290369920.0, "grad_norm": 1.655745439943986, "language_loss": 0.76688802, "learning_rate": 1.7446280041142344e-06, "loss": 0.7880044, "num_input_tokens_seen": 198633880, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.41796875, "step": 9223, "time_per_iteration": 2.443096160888672 }, { "auxiliary_loss_clip": 0.01056318, "auxiliary_loss_mlp": 0.01043665, "balance_loss_clip": 1.01865458, "balance_loss_mlp": 1.0175581, "epoch": 0.5545768826093491, "flos": 28475265617280.0, "grad_norm": 1.7141676486825865, "language_loss": 0.83399391, "learning_rate": 1.7442417351599986e-06, "loss": 0.8549937, "num_input_tokens_seen": 198653505, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38671875, "step": 9224, "time_per_iteration": 2.451061487197876 }, { "auxiliary_loss_clip": 0.01058041, "auxiliary_loss_mlp": 0.01051763, "balance_loss_clip": 1.0261209, "balance_loss_mlp": 1.01814878, "epoch": 0.5546370058620171, "flos": 18477083370240.0, "grad_norm": 2.3772323927073393, "language_loss": 0.59058785, "learning_rate": 1.743855475904141e-06, "loss": 0.61168587, "num_input_tokens_seen": 198671890, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.3984375, "step": 9225, "time_per_iteration": 2.3830294609069824 }, { "auxiliary_loss_clip": 0.01057728, "auxiliary_loss_mlp": 0.01040043, "balance_loss_clip": 1.01447284, "balance_loss_mlp": 1.01727414, "epoch": 0.554697129114685, "flos": 22929382235520.0, "grad_norm": 1.9745354379826432, "language_loss": 0.68642306, "learning_rate": 1.7434692263613098e-06, "loss": 0.7074008, "num_input_tokens_seen": 198691995, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.40429688, "step": 9226, "time_per_iteration": 2.4468395709991455 }, { "auxiliary_loss_clip": 0.01057513, "auxiliary_loss_mlp": 0.0104388, "balance_loss_clip": 1.01804686, "balance_loss_mlp": 1.01761198, "epoch": 0.5547572523673531, "flos": 21796136547840.0, "grad_norm": 1.5383973367518706, "language_loss": 0.75837314, "learning_rate": 1.7430829865461518e-06, "loss": 0.77938712, "num_input_tokens_seen": 198712440, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3984375, "step": 9227, "time_per_iteration": 2.41269588470459 }, { "auxiliary_loss_clip": 0.01060073, "auxiliary_loss_mlp": 0.01044583, "balance_loss_clip": 1.01611614, "balance_loss_mlp": 1.01992846, "epoch": 0.5548173756200211, "flos": 22341829098240.0, "grad_norm": 1.7260189792259566, "language_loss": 0.74539554, "learning_rate": 1.7426967564733118e-06, "loss": 0.76644206, "num_input_tokens_seen": 198731515, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40234375, "step": 9228, "time_per_iteration": 2.369199275970459 }, { "auxiliary_loss_clip": 0.01059161, "auxiliary_loss_mlp": 0.01039086, "balance_loss_clip": 1.01380181, "balance_loss_mlp": 1.01848316, "epoch": 0.554877498872689, "flos": 17857759029120.0, "grad_norm": 1.6917172683992476, "language_loss": 0.773045, "learning_rate": 1.7423105361574373e-06, "loss": 0.79402751, "num_input_tokens_seen": 198749750, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40625, "step": 9229, "time_per_iteration": 2.4117865562438965 }, { "auxiliary_loss_clip": 0.01057646, "auxiliary_loss_mlp": 0.01046938, "balance_loss_clip": 1.0191505, "balance_loss_mlp": 1.01800489, "epoch": 0.554937622125357, "flos": 17237387347200.0, "grad_norm": 1.7401500435094004, "language_loss": 0.70279574, "learning_rate": 1.741924325613172e-06, "loss": 0.72384155, "num_input_tokens_seen": 198768320, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.39648438, "step": 9230, "time_per_iteration": 2.3469128608703613 }, { "auxiliary_loss_clip": 0.01059352, "auxiliary_loss_mlp": 0.01040902, "balance_loss_clip": 1.01284027, "balance_loss_mlp": 1.01856756, "epoch": 0.5549977453780249, "flos": 25367088251520.0, "grad_norm": 3.112531765614294, "language_loss": 0.70716316, "learning_rate": 1.741538124855163e-06, "loss": 0.72816569, "num_input_tokens_seen": 198787230, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40625, "step": 9231, "time_per_iteration": 3.6563682556152344 }, { "auxiliary_loss_clip": 0.01060253, "auxiliary_loss_mlp": 0.01045875, "balance_loss_clip": 1.01683545, "balance_loss_mlp": 1.01902509, "epoch": 0.555057868630693, "flos": 25078042172160.0, "grad_norm": 1.633228671135284, "language_loss": 0.79433107, "learning_rate": 1.7411519338980548e-06, "loss": 0.81539232, "num_input_tokens_seen": 198806720, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.41210938, "step": 9232, "time_per_iteration": 2.4073214530944824 }, { "auxiliary_loss_clip": 0.01056008, "auxiliary_loss_mlp": 0.0104421, "balance_loss_clip": 1.01847291, "balance_loss_mlp": 1.01747739, "epoch": 0.5551179918833609, "flos": 26103022133760.0, "grad_norm": 1.619224569939363, "language_loss": 0.84006721, "learning_rate": 1.7407657527564898e-06, "loss": 0.86106944, "num_input_tokens_seen": 198826235, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38671875, "step": 9233, "time_per_iteration": 3.844449758529663 }, { "auxiliary_loss_clip": 0.01060104, "auxiliary_loss_mlp": 0.01044107, "balance_loss_clip": 1.01394713, "balance_loss_mlp": 1.01825488, "epoch": 0.5551781151360289, "flos": 19383917690880.0, "grad_norm": 2.610989924249469, "language_loss": 0.77043319, "learning_rate": 1.7403795814451142e-06, "loss": 0.7914753, "num_input_tokens_seen": 198842655, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.41796875, "step": 9234, "time_per_iteration": 2.3553028106689453 }, { "auxiliary_loss_clip": 0.01056578, "auxiliary_loss_mlp": 0.01037845, "balance_loss_clip": 1.01279926, "balance_loss_mlp": 1.01779366, "epoch": 0.5552382383886968, "flos": 21724878729600.0, "grad_norm": 1.91421092057046, "language_loss": 0.67222404, "learning_rate": 1.7399934199785706e-06, "loss": 0.69316828, "num_input_tokens_seen": 198861210, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38867188, "step": 9235, "time_per_iteration": 3.729624032974243 }, { "auxiliary_loss_clip": 0.01058749, "auxiliary_loss_mlp": 0.0104731, "balance_loss_clip": 1.01825917, "balance_loss_mlp": 1.01713562, "epoch": 0.5552983616413648, "flos": 14355307146240.0, "grad_norm": 1.9562642390149214, "language_loss": 0.69683111, "learning_rate": 1.7396072683715029e-06, "loss": 0.71789175, "num_input_tokens_seen": 198880045, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.41601562, "step": 9236, "time_per_iteration": 2.334315061569214 }, { "auxiliary_loss_clip": 0.01054895, "auxiliary_loss_mlp": 0.01037874, "balance_loss_clip": 1.01179123, "balance_loss_mlp": 1.01688552, "epoch": 0.5553584848940327, "flos": 25477518481920.0, "grad_norm": 1.6407874131645486, "language_loss": 0.86509657, "learning_rate": 1.7392211266385536e-06, "loss": 0.88602424, "num_input_tokens_seen": 198900210, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38085938, "step": 9237, "time_per_iteration": -0.22570204734802246 }, { "auxiliary_loss_clip": 0.01055987, "auxiliary_loss_mlp": 0.01045905, "balance_loss_clip": 1.01969123, "balance_loss_mlp": 1.01730657, "epoch": 0.5554186081467007, "flos": 22162759401600.0, "grad_norm": 1.8260989042868407, "language_loss": 0.75146019, "learning_rate": 1.7388349947943652e-06, "loss": 0.77247918, "num_input_tokens_seen": 198919055, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38671875, "step": 9238, "time_per_iteration": 2.3745672702789307 }, { "auxiliary_loss_clip": 0.01058365, "auxiliary_loss_mlp": 0.01042123, "balance_loss_clip": 1.01576567, "balance_loss_mlp": 1.01737332, "epoch": 0.5554787313993687, "flos": 49744807994880.0, "grad_norm": 1.6443770670761424, "language_loss": 0.79886454, "learning_rate": 1.73844887285358e-06, "loss": 0.8198694, "num_input_tokens_seen": 198943505, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.41015625, "step": 9239, "time_per_iteration": 2.655271291732788 }, { "auxiliary_loss_clip": 0.01058926, "auxiliary_loss_mlp": 0.0103813, "balance_loss_clip": 1.01327538, "balance_loss_mlp": 1.01922548, "epoch": 0.5555388546520367, "flos": 22126275164160.0, "grad_norm": 1.4830358866742888, "language_loss": 0.8080231, "learning_rate": 1.7380627608308393e-06, "loss": 0.82899362, "num_input_tokens_seen": 198963590, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.3984375, "step": 9240, "time_per_iteration": 2.3737666606903076 }, { "auxiliary_loss_clip": 0.01058145, "auxiliary_loss_mlp": 0.01043661, "balance_loss_clip": 1.01693439, "balance_loss_mlp": 1.01781738, "epoch": 0.5555989779047047, "flos": 24680939834880.0, "grad_norm": 2.2400575629296204, "language_loss": 0.66702247, "learning_rate": 1.737676658740786e-06, "loss": 0.68804061, "num_input_tokens_seen": 198982680, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.40234375, "step": 9241, "time_per_iteration": 2.389888048171997 }, { "auxiliary_loss_clip": 0.01059436, "auxiliary_loss_mlp": 0.01046466, "balance_loss_clip": 1.01793909, "balance_loss_mlp": 1.01867723, "epoch": 0.5556591011573726, "flos": 16105607936640.0, "grad_norm": 1.9946343640563353, "language_loss": 0.73946714, "learning_rate": 1.7372905665980594e-06, "loss": 0.76052618, "num_input_tokens_seen": 199000185, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40820312, "step": 9242, "time_per_iteration": 2.364753246307373 }, { "auxiliary_loss_clip": 0.01058297, "auxiliary_loss_mlp": 0.01043511, "balance_loss_clip": 1.01529455, "balance_loss_mlp": 1.01790464, "epoch": 0.5557192244100406, "flos": 12932840822400.0, "grad_norm": 2.1696820197609226, "language_loss": 0.6529848, "learning_rate": 1.7369044844173012e-06, "loss": 0.67400289, "num_input_tokens_seen": 199018380, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40429688, "step": 9243, "time_per_iteration": 3.894040107727051 }, { "auxiliary_loss_clip": 0.01059827, "auxiliary_loss_mlp": 0.01044207, "balance_loss_clip": 1.01711059, "balance_loss_mlp": 1.01971555, "epoch": 0.5557793476627085, "flos": 23110616436480.0, "grad_norm": 1.957881261016502, "language_loss": 0.76107538, "learning_rate": 1.7365184122131509e-06, "loss": 0.78211576, "num_input_tokens_seen": 199037115, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40234375, "step": 9244, "time_per_iteration": 2.362924098968506 }, { "auxiliary_loss_clip": 0.01055836, "auxiliary_loss_mlp": 0.01043492, "balance_loss_clip": 1.01945949, "balance_loss_mlp": 1.01808524, "epoch": 0.5558394709153766, "flos": 21427139721600.0, "grad_norm": 2.5477495621374278, "language_loss": 0.75771207, "learning_rate": 1.7361323500002486e-06, "loss": 0.77870536, "num_input_tokens_seen": 199053375, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37695312, "step": 9245, "time_per_iteration": 2.38907790184021 }, { "auxiliary_loss_clip": 0.01060548, "auxiliary_loss_mlp": 0.01047634, "balance_loss_clip": 1.01876187, "balance_loss_mlp": 1.01852441, "epoch": 0.5558995941680445, "flos": 25077274122240.0, "grad_norm": 2.380829236453101, "language_loss": 0.80332035, "learning_rate": 1.7357462977932348e-06, "loss": 0.82440215, "num_input_tokens_seen": 199070930, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.421875, "step": 9246, "time_per_iteration": 2.4106640815734863 }, { "auxiliary_loss_clip": 0.01059011, "auxiliary_loss_mlp": 0.01043375, "balance_loss_clip": 1.0172441, "balance_loss_mlp": 1.01882577, "epoch": 0.5559597174207125, "flos": 20010119569920.0, "grad_norm": 1.85023833811184, "language_loss": 0.74488884, "learning_rate": 1.7353602556067471e-06, "loss": 0.76591277, "num_input_tokens_seen": 199088675, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40234375, "step": 9247, "time_per_iteration": 2.4263551235198975 }, { "auxiliary_loss_clip": 0.01058955, "auxiliary_loss_mlp": 0.01050615, "balance_loss_clip": 1.02086043, "balance_loss_mlp": 1.01860452, "epoch": 0.5560198406733804, "flos": 16834769015040.0, "grad_norm": 3.220478147195277, "language_loss": 0.77109385, "learning_rate": 1.7349742234554254e-06, "loss": 0.7921896, "num_input_tokens_seen": 199103075, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.40234375, "step": 9248, "time_per_iteration": 2.3174362182617188 }, { "auxiliary_loss_clip": 0.01009949, "auxiliary_loss_mlp": 0.01003662, "balance_loss_clip": 1.00111091, "balance_loss_mlp": 1.00220883, "epoch": 0.5560799639260484, "flos": 70693391577600.0, "grad_norm": 0.8534170620176549, "language_loss": 0.59518552, "learning_rate": 1.7345882013539081e-06, "loss": 0.61532164, "num_input_tokens_seen": 199160325, "router_z_loss_clip": 0.0255127, "router_z_loss_mlp": 0.07763672, "step": 9249, "time_per_iteration": 3.1202709674835205 }, { "auxiliary_loss_clip": 0.01055985, "auxiliary_loss_mlp": 0.01041154, "balance_loss_clip": 1.01439142, "balance_loss_mlp": 1.01699352, "epoch": 0.5561400871787163, "flos": 23147484698880.0, "grad_norm": 1.864325578461777, "language_loss": 0.80538189, "learning_rate": 1.734202189316832e-06, "loss": 0.82635331, "num_input_tokens_seen": 199179760, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.390625, "step": 9250, "time_per_iteration": 2.3554506301879883 }, { "auxiliary_loss_clip": 0.01057965, "auxiliary_loss_mlp": 0.01047451, "balance_loss_clip": 1.01823294, "balance_loss_mlp": 1.01717389, "epoch": 0.5562002104313843, "flos": 17565466193280.0, "grad_norm": 1.9466571000124746, "language_loss": 0.70088965, "learning_rate": 1.733816187358836e-06, "loss": 0.72194374, "num_input_tokens_seen": 199196695, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.40820312, "step": 9251, "time_per_iteration": 2.3438754081726074 }, { "auxiliary_loss_clip": 0.01057819, "auxiliary_loss_mlp": 0.01044301, "balance_loss_clip": 1.01660895, "balance_loss_mlp": 1.01796377, "epoch": 0.5562603336840523, "flos": 25044281020800.0, "grad_norm": 1.6409065352839491, "language_loss": 0.76406777, "learning_rate": 1.7334301954945569e-06, "loss": 0.7850889, "num_input_tokens_seen": 199217845, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.3984375, "step": 9252, "time_per_iteration": 2.411550283432007 }, { "auxiliary_loss_clip": 0.01059568, "auxiliary_loss_mlp": 0.01047551, "balance_loss_clip": 1.01891661, "balance_loss_mlp": 1.01853609, "epoch": 0.5563204569367203, "flos": 29057756607360.0, "grad_norm": 1.4856609571445794, "language_loss": 0.74015641, "learning_rate": 1.7330442137386313e-06, "loss": 0.76122761, "num_input_tokens_seen": 199239250, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41015625, "step": 9253, "time_per_iteration": 2.44506573677063 }, { "auxiliary_loss_clip": 0.01058628, "auxiliary_loss_mlp": 0.0104255, "balance_loss_clip": 1.01719451, "balance_loss_mlp": 1.01898479, "epoch": 0.5563805801893883, "flos": 22089371990400.0, "grad_norm": 2.0022683284969665, "language_loss": 0.83640879, "learning_rate": 1.7326582421056965e-06, "loss": 0.85742056, "num_input_tokens_seen": 199258320, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.39648438, "step": 9254, "time_per_iteration": 2.3668556213378906 }, { "auxiliary_loss_clip": 0.01008298, "auxiliary_loss_mlp": 0.01008935, "balance_loss_clip": 1.00626516, "balance_loss_mlp": 1.00119257, "epoch": 0.5564407034420562, "flos": 58633379544960.0, "grad_norm": 0.873450334206818, "language_loss": 0.64886624, "learning_rate": 1.732272280610387e-06, "loss": 0.66903859, "num_input_tokens_seen": 199314840, "router_z_loss_clip": 0.0267334, "router_z_loss_mlp": 0.07128906, "step": 9255, "time_per_iteration": 2.882638931274414 }, { "auxiliary_loss_clip": 0.01056786, "auxiliary_loss_mlp": 0.01041714, "balance_loss_clip": 1.01663232, "balance_loss_mlp": 1.01857901, "epoch": 0.5565008266947242, "flos": 23111209929600.0, "grad_norm": 1.691654122854711, "language_loss": 0.70080554, "learning_rate": 1.7318863292673399e-06, "loss": 0.72179055, "num_input_tokens_seen": 199335405, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3828125, "step": 9256, "time_per_iteration": 2.367661237716675 }, { "auxiliary_loss_clip": 0.01056445, "auxiliary_loss_mlp": 0.01039342, "balance_loss_clip": 1.01405787, "balance_loss_mlp": 1.01791883, "epoch": 0.5565609499473921, "flos": 21577370768640.0, "grad_norm": 1.6353600774456003, "language_loss": 0.76965111, "learning_rate": 1.73150038809119e-06, "loss": 0.790609, "num_input_tokens_seen": 199354345, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38476562, "step": 9257, "time_per_iteration": 2.400399923324585 }, { "auxiliary_loss_clip": 0.01057296, "auxiliary_loss_mlp": 0.01045754, "balance_loss_clip": 1.01898003, "balance_loss_mlp": 1.01761377, "epoch": 0.5566210732000602, "flos": 18368643087360.0, "grad_norm": 2.6484574031828356, "language_loss": 0.62938869, "learning_rate": 1.7311144570965724e-06, "loss": 0.65041924, "num_input_tokens_seen": 199372250, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.39648438, "step": 9258, "time_per_iteration": 2.334697723388672 }, { "auxiliary_loss_clip": 0.01058961, "auxiliary_loss_mlp": 0.01046223, "balance_loss_clip": 1.01683784, "balance_loss_mlp": 1.01863742, "epoch": 0.5566811964527281, "flos": 25702149369600.0, "grad_norm": 1.6445113211658102, "language_loss": 0.80115783, "learning_rate": 1.7307285362981215e-06, "loss": 0.82220972, "num_input_tokens_seen": 199392815, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.40234375, "step": 9259, "time_per_iteration": 2.427361011505127 }, { "auxiliary_loss_clip": 0.01058482, "auxiliary_loss_mlp": 0.01042647, "balance_loss_clip": 1.01549125, "balance_loss_mlp": 1.01782179, "epoch": 0.5567413197053961, "flos": 26942753088000.0, "grad_norm": 1.774477557924321, "language_loss": 0.83204716, "learning_rate": 1.7303426257104712e-06, "loss": 0.85305846, "num_input_tokens_seen": 199412375, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40625, "step": 9260, "time_per_iteration": 2.421825408935547 }, { "auxiliary_loss_clip": 0.0105825, "auxiliary_loss_mlp": 0.01047793, "balance_loss_clip": 1.01934958, "balance_loss_mlp": 1.01871789, "epoch": 0.556801442958064, "flos": 20849536321920.0, "grad_norm": 1.807488036676987, "language_loss": 0.70392787, "learning_rate": 1.729956725348256e-06, "loss": 0.72498828, "num_input_tokens_seen": 199431490, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.39453125, "step": 9261, "time_per_iteration": 2.3937456607818604 }, { "auxiliary_loss_clip": 0.01009073, "auxiliary_loss_mlp": 0.010024, "balance_loss_clip": 0.99988443, "balance_loss_mlp": 1.00188375, "epoch": 0.556861566210732, "flos": 70495015898880.0, "grad_norm": 0.7364127035065363, "language_loss": 0.61186457, "learning_rate": 1.729570835226108e-06, "loss": 0.63197929, "num_input_tokens_seen": 199495855, "router_z_loss_clip": 0.02514648, "router_z_loss_mlp": 0.07226562, "step": 9262, "time_per_iteration": 2.9974472522735596 }, { "auxiliary_loss_clip": 0.01058992, "auxiliary_loss_mlp": 0.01043562, "balance_loss_clip": 1.01648927, "balance_loss_mlp": 1.01893771, "epoch": 0.5569216894633999, "flos": 25336120008960.0, "grad_norm": 1.5933336652843424, "language_loss": 0.65901494, "learning_rate": 1.7291849553586622e-06, "loss": 0.68004048, "num_input_tokens_seen": 199515870, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40039062, "step": 9263, "time_per_iteration": 2.423570394515991 }, { "auxiliary_loss_clip": 0.01056786, "auxiliary_loss_mlp": 0.01041353, "balance_loss_clip": 1.01502001, "balance_loss_mlp": 1.01771235, "epoch": 0.556981812716068, "flos": 22637613070080.0, "grad_norm": 1.733592292883532, "language_loss": 0.73715144, "learning_rate": 1.7287990857605497e-06, "loss": 0.75813282, "num_input_tokens_seen": 199535745, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.390625, "step": 9264, "time_per_iteration": 2.3729918003082275 }, { "auxiliary_loss_clip": 0.01060203, "auxiliary_loss_mlp": 0.01042873, "balance_loss_clip": 1.01752925, "balance_loss_mlp": 1.02015877, "epoch": 0.5570419359687359, "flos": 11035066982400.0, "grad_norm": 2.0601590034954866, "language_loss": 0.77938616, "learning_rate": 1.7284132264464022e-06, "loss": 0.80041695, "num_input_tokens_seen": 199554035, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40039062, "step": 9265, "time_per_iteration": 2.37692928314209 }, { "auxiliary_loss_clip": 0.01056342, "auxiliary_loss_mlp": 0.01044055, "balance_loss_clip": 1.0204519, "balance_loss_mlp": 1.01858389, "epoch": 0.5571020592214039, "flos": 22821954382080.0, "grad_norm": 1.372458321936311, "language_loss": 0.71835935, "learning_rate": 1.7280273774308536e-06, "loss": 0.73936331, "num_input_tokens_seen": 199576120, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37890625, "step": 9266, "time_per_iteration": 2.4074583053588867 }, { "auxiliary_loss_clip": 0.01057157, "auxiliary_loss_mlp": 0.01042163, "balance_loss_clip": 1.01703334, "balance_loss_mlp": 1.01839113, "epoch": 0.5571621824740719, "flos": 22926728972160.0, "grad_norm": 2.0116384929176268, "language_loss": 0.69389522, "learning_rate": 1.727641538728533e-06, "loss": 0.71488839, "num_input_tokens_seen": 199593780, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38671875, "step": 9267, "time_per_iteration": 2.3831422328948975 }, { "auxiliary_loss_clip": 0.01055142, "auxiliary_loss_mlp": 0.01042562, "balance_loss_clip": 1.01858878, "balance_loss_mlp": 1.0178802, "epoch": 0.5572223057267398, "flos": 22965587182080.0, "grad_norm": 2.119624605734058, "language_loss": 0.76320827, "learning_rate": 1.7272557103540736e-06, "loss": 0.78418535, "num_input_tokens_seen": 199613220, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37304688, "step": 9268, "time_per_iteration": 2.3947315216064453 }, { "auxiliary_loss_clip": 0.0105755, "auxiliary_loss_mlp": 0.0103942, "balance_loss_clip": 1.01623368, "balance_loss_mlp": 1.01906157, "epoch": 0.5572824289794078, "flos": 20958989034240.0, "grad_norm": 2.326644502080678, "language_loss": 0.75527292, "learning_rate": 1.726869892322104e-06, "loss": 0.77624261, "num_input_tokens_seen": 199632085, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.38476562, "step": 9269, "time_per_iteration": 2.3992502689361572 }, { "auxiliary_loss_clip": 0.01057004, "auxiliary_loss_mlp": 0.01041963, "balance_loss_clip": 1.01784682, "balance_loss_mlp": 1.01859093, "epoch": 0.5573425522320757, "flos": 25041348466560.0, "grad_norm": 2.017765674936808, "language_loss": 0.84115529, "learning_rate": 1.726484084647256e-06, "loss": 0.86214489, "num_input_tokens_seen": 199649295, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.38476562, "step": 9270, "time_per_iteration": 2.3983871936798096 }, { "auxiliary_loss_clip": 0.01057682, "auxiliary_loss_mlp": 0.0104271, "balance_loss_clip": 1.01759231, "balance_loss_mlp": 1.01880455, "epoch": 0.5574026754847438, "flos": 23658508402560.0, "grad_norm": 1.8755766583253284, "language_loss": 0.81144142, "learning_rate": 1.7260982873441591e-06, "loss": 0.83244526, "num_input_tokens_seen": 199668870, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38867188, "step": 9271, "time_per_iteration": 2.3901779651641846 }, { "auxiliary_loss_clip": 0.01057754, "auxiliary_loss_mlp": 0.01038147, "balance_loss_clip": 1.01276731, "balance_loss_mlp": 1.0184772, "epoch": 0.5574627987374117, "flos": 24781315593600.0, "grad_norm": 1.7334005736882865, "language_loss": 0.90879023, "learning_rate": 1.725712500427442e-06, "loss": 0.92974925, "num_input_tokens_seen": 199684870, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.39257812, "step": 9272, "time_per_iteration": 3.6055901050567627 }, { "auxiliary_loss_clip": 0.01055494, "auxiliary_loss_mlp": 0.01040186, "balance_loss_clip": 1.01480603, "balance_loss_mlp": 1.01814127, "epoch": 0.5575229219900797, "flos": 21833877594240.0, "grad_norm": 1.9560915059089719, "language_loss": 0.85906267, "learning_rate": 1.7253267239117347e-06, "loss": 0.88001955, "num_input_tokens_seen": 199701975, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37304688, "step": 9273, "time_per_iteration": 3.7747347354888916 }, { "auxiliary_loss_clip": 0.01058388, "auxiliary_loss_mlp": 0.01043417, "balance_loss_clip": 1.01750064, "balance_loss_mlp": 1.01864326, "epoch": 0.5575830452427476, "flos": 27814010866560.0, "grad_norm": 2.22567633067447, "language_loss": 0.7533164, "learning_rate": 1.7249409578116655e-06, "loss": 0.77433449, "num_input_tokens_seen": 199721865, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3984375, "step": 9274, "time_per_iteration": 2.3998196125030518 }, { "auxiliary_loss_clip": 0.01062628, "auxiliary_loss_mlp": 0.01049105, "balance_loss_clip": 1.01770544, "balance_loss_mlp": 1.01949644, "epoch": 0.5576431684954156, "flos": 17812093104000.0, "grad_norm": 2.7991036975604766, "language_loss": 0.79678273, "learning_rate": 1.7245552021418629e-06, "loss": 0.81790006, "num_input_tokens_seen": 199736455, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.43164062, "step": 9275, "time_per_iteration": 3.7139954566955566 }, { "auxiliary_loss_clip": 0.01057553, "auxiliary_loss_mlp": 0.01039209, "balance_loss_clip": 1.01472342, "balance_loss_mlp": 1.01880944, "epoch": 0.5577032917480835, "flos": 15485969393280.0, "grad_norm": 1.7858325593050102, "language_loss": 0.76206326, "learning_rate": 1.7241694569169546e-06, "loss": 0.78303093, "num_input_tokens_seen": 199753125, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.38671875, "step": 9276, "time_per_iteration": 2.3844759464263916 }, { "auxiliary_loss_clip": 0.01056084, "auxiliary_loss_mlp": 0.01041751, "balance_loss_clip": 1.01600218, "balance_loss_mlp": 1.01704359, "epoch": 0.5577634150007516, "flos": 21578697400320.0, "grad_norm": 1.7536603982558525, "language_loss": 0.76480746, "learning_rate": 1.7237837221515678e-06, "loss": 0.78578579, "num_input_tokens_seen": 199771365, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.390625, "step": 9277, "time_per_iteration": 2.352510452270508 }, { "auxiliary_loss_clip": 0.01053602, "auxiliary_loss_mlp": 0.01041295, "balance_loss_clip": 1.01751256, "balance_loss_mlp": 1.01572025, "epoch": 0.5578235382534195, "flos": 21138756958080.0, "grad_norm": 1.6486112247985807, "language_loss": 0.72085673, "learning_rate": 1.7233979978603304e-06, "loss": 0.74180567, "num_input_tokens_seen": 199790035, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37890625, "step": 9278, "time_per_iteration": 2.3810791969299316 }, { "auxiliary_loss_clip": 0.01058954, "auxiliary_loss_mlp": 0.01042817, "balance_loss_clip": 1.01660275, "balance_loss_mlp": 1.01884747, "epoch": 0.5578836615060875, "flos": 26503999632000.0, "grad_norm": 1.4906734843405118, "language_loss": 0.76550686, "learning_rate": 1.723012284057868e-06, "loss": 0.78652453, "num_input_tokens_seen": 199811125, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40234375, "step": 9279, "time_per_iteration": 2.4189023971557617 }, { "auxiliary_loss_clip": 0.01056025, "auxiliary_loss_mlp": 0.01040557, "balance_loss_clip": 1.01419997, "balance_loss_mlp": 1.01711226, "epoch": 0.5579437847587555, "flos": 20152844674560.0, "grad_norm": 1.5430762763666508, "language_loss": 0.68836212, "learning_rate": 1.7226265807588082e-06, "loss": 0.70932794, "num_input_tokens_seen": 199829915, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.38867188, "step": 9280, "time_per_iteration": 2.3903071880340576 }, { "auxiliary_loss_clip": 0.01057436, "auxiliary_loss_mlp": 0.01047114, "balance_loss_clip": 1.02142489, "balance_loss_mlp": 1.0175308, "epoch": 0.5580039080114234, "flos": 26101136920320.0, "grad_norm": 1.8633502549112315, "language_loss": 0.74431616, "learning_rate": 1.7222408879777763e-06, "loss": 0.76536167, "num_input_tokens_seen": 199850670, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3984375, "step": 9281, "time_per_iteration": 2.4069015979766846 }, { "auxiliary_loss_clip": 0.01054975, "auxiliary_loss_mlp": 0.01042729, "balance_loss_clip": 1.01775491, "balance_loss_mlp": 1.01736808, "epoch": 0.5580640312640914, "flos": 13770826208640.0, "grad_norm": 3.618091544205112, "language_loss": 0.76178563, "learning_rate": 1.7218552057293974e-06, "loss": 0.78276265, "num_input_tokens_seen": 199867645, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.375, "step": 9282, "time_per_iteration": 2.370868444442749 }, { "auxiliary_loss_clip": 0.010554, "auxiliary_loss_mlp": 0.01037952, "balance_loss_clip": 1.01408601, "balance_loss_mlp": 1.0175879, "epoch": 0.5581241545167593, "flos": 17675023639680.0, "grad_norm": 1.9628580380355938, "language_loss": 0.68100697, "learning_rate": 1.721469534028297e-06, "loss": 0.70194048, "num_input_tokens_seen": 199886320, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37695312, "step": 9283, "time_per_iteration": 3.7976675033569336 }, { "auxiliary_loss_clip": 0.01057099, "auxiliary_loss_mlp": 0.01039366, "balance_loss_clip": 1.01470196, "balance_loss_mlp": 1.01857007, "epoch": 0.5581842777694274, "flos": 19568259002880.0, "grad_norm": 1.7396582685026583, "language_loss": 0.84303057, "learning_rate": 1.7210838728890994e-06, "loss": 0.86399519, "num_input_tokens_seen": 199904895, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.38476562, "step": 9284, "time_per_iteration": 2.377251148223877 }, { "auxiliary_loss_clip": 0.0105701, "auxiliary_loss_mlp": 0.01036855, "balance_loss_clip": 1.01135612, "balance_loss_mlp": 1.01700473, "epoch": 0.5582444010220953, "flos": 20594111748480.0, "grad_norm": 2.2067019018410736, "language_loss": 0.86766535, "learning_rate": 1.7206982223264304e-06, "loss": 0.88860404, "num_input_tokens_seen": 199921090, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3984375, "step": 9285, "time_per_iteration": 2.3620309829711914 }, { "auxiliary_loss_clip": 0.01058475, "auxiliary_loss_mlp": 0.01042217, "balance_loss_clip": 1.01625347, "balance_loss_mlp": 1.01772678, "epoch": 0.5583045242747633, "flos": 19134497871360.0, "grad_norm": 3.1992466386470277, "language_loss": 0.75718784, "learning_rate": 1.720312582354912e-06, "loss": 0.77819479, "num_input_tokens_seen": 199939925, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.40820312, "step": 9286, "time_per_iteration": 2.3977746963500977 }, { "auxiliary_loss_clip": 0.01055835, "auxiliary_loss_mlp": 0.01046607, "balance_loss_clip": 1.01896262, "balance_loss_mlp": 1.01689529, "epoch": 0.5583646475274312, "flos": 27453322944000.0, "grad_norm": 1.5624405617002075, "language_loss": 0.75754577, "learning_rate": 1.7199269529891684e-06, "loss": 0.77857018, "num_input_tokens_seen": 199960015, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.38867188, "step": 9287, "time_per_iteration": 2.4230213165283203 }, { "auxiliary_loss_clip": 0.01058374, "auxiliary_loss_mlp": 0.01044066, "balance_loss_clip": 1.01669598, "balance_loss_mlp": 1.01841116, "epoch": 0.5584247707800992, "flos": 23652817850880.0, "grad_norm": 5.7214271044715925, "language_loss": 0.75936341, "learning_rate": 1.7195413342438233e-06, "loss": 0.78038776, "num_input_tokens_seen": 199980505, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40039062, "step": 9288, "time_per_iteration": 2.4312572479248047 }, { "auxiliary_loss_clip": 0.01059666, "auxiliary_loss_mlp": 0.01046473, "balance_loss_clip": 1.01990128, "balance_loss_mlp": 1.0200907, "epoch": 0.5584848940327671, "flos": 13698032290560.0, "grad_norm": 1.9896155142706071, "language_loss": 0.7994194, "learning_rate": 1.7191557261334984e-06, "loss": 0.82048082, "num_input_tokens_seen": 199999020, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39648438, "step": 9289, "time_per_iteration": 2.352869749069214 }, { "auxiliary_loss_clip": 0.01060904, "auxiliary_loss_mlp": 0.01041531, "balance_loss_clip": 1.01398158, "balance_loss_mlp": 1.01868773, "epoch": 0.5585450172854352, "flos": 27014988424320.0, "grad_norm": 1.786566677015049, "language_loss": 0.62665999, "learning_rate": 1.718770128672817e-06, "loss": 0.64768434, "num_input_tokens_seen": 200019020, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.421875, "step": 9290, "time_per_iteration": 2.433828353881836 }, { "auxiliary_loss_clip": 0.01058678, "auxiliary_loss_mlp": 0.01042847, "balance_loss_clip": 1.01544118, "balance_loss_mlp": 1.01860046, "epoch": 0.5586051405381031, "flos": 23184527518080.0, "grad_norm": 1.9153586350461669, "language_loss": 0.68997192, "learning_rate": 1.7183845418764e-06, "loss": 0.71098721, "num_input_tokens_seen": 200038110, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40039062, "step": 9291, "time_per_iteration": 2.366030693054199 }, { "auxiliary_loss_clip": 0.0105727, "auxiliary_loss_mlp": 0.01051894, "balance_loss_clip": 1.02623999, "balance_loss_mlp": 1.01736116, "epoch": 0.5586652637907711, "flos": 20774542988160.0, "grad_norm": 1.76520953942206, "language_loss": 0.85088307, "learning_rate": 1.7179989657588698e-06, "loss": 0.87197471, "num_input_tokens_seen": 200056210, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.3984375, "step": 9292, "time_per_iteration": 2.3915297985076904 }, { "auxiliary_loss_clip": 0.01055791, "auxiliary_loss_mlp": 0.01043012, "balance_loss_clip": 1.01901567, "balance_loss_mlp": 1.01832891, "epoch": 0.5587253870434391, "flos": 28218654057600.0, "grad_norm": 1.866151563332103, "language_loss": 0.75336683, "learning_rate": 1.7176134003348476e-06, "loss": 0.77435488, "num_input_tokens_seen": 200075620, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.375, "step": 9293, "time_per_iteration": 2.4078938961029053 }, { "auxiliary_loss_clip": 0.0105495, "auxiliary_loss_mlp": 0.01040617, "balance_loss_clip": 1.01734757, "balance_loss_mlp": 1.01691532, "epoch": 0.558785510296107, "flos": 26614499685120.0, "grad_norm": 2.5493642181278244, "language_loss": 0.73037696, "learning_rate": 1.7172278456189523e-06, "loss": 0.75133264, "num_input_tokens_seen": 200095945, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.38085938, "step": 9294, "time_per_iteration": 2.417523145675659 }, { "auxiliary_loss_clip": 0.01057156, "auxiliary_loss_mlp": 0.01043417, "balance_loss_clip": 1.0176084, "balance_loss_mlp": 1.01801634, "epoch": 0.558845633548775, "flos": 20155742317440.0, "grad_norm": 3.150280677610416, "language_loss": 0.69511712, "learning_rate": 1.716842301625806e-06, "loss": 0.71612287, "num_input_tokens_seen": 200114185, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.390625, "step": 9295, "time_per_iteration": 2.348029375076294 }, { "auxiliary_loss_clip": 0.01056169, "auxiliary_loss_mlp": 0.01042911, "balance_loss_clip": 1.01712596, "balance_loss_mlp": 1.01835454, "epoch": 0.5589057568014429, "flos": 24349684055040.0, "grad_norm": 1.5364659888082042, "language_loss": 0.82060313, "learning_rate": 1.7164567683700281e-06, "loss": 0.84159398, "num_input_tokens_seen": 200135030, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37890625, "step": 9296, "time_per_iteration": 2.469758987426758 }, { "auxiliary_loss_clip": 0.010563, "auxiliary_loss_mlp": 0.01035779, "balance_loss_clip": 1.01212811, "balance_loss_mlp": 1.01798522, "epoch": 0.558965880054111, "flos": 21104123022720.0, "grad_norm": 1.6407005841635993, "language_loss": 0.66844463, "learning_rate": 1.7160712458662379e-06, "loss": 0.68936545, "num_input_tokens_seen": 200154290, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3828125, "step": 9297, "time_per_iteration": 2.395679235458374 }, { "auxiliary_loss_clip": 0.01059797, "auxiliary_loss_mlp": 0.01045021, "balance_loss_clip": 1.01635134, "balance_loss_mlp": 1.01900268, "epoch": 0.5590260033067789, "flos": 18435257694720.0, "grad_norm": 1.7113642474169888, "language_loss": 0.76317871, "learning_rate": 1.7156857341290544e-06, "loss": 0.78422689, "num_input_tokens_seen": 200171555, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.40820312, "step": 9298, "time_per_iteration": 2.340498447418213 }, { "auxiliary_loss_clip": 0.01009064, "auxiliary_loss_mlp": 0.01002672, "balance_loss_clip": 1.00028825, "balance_loss_mlp": 1.00208163, "epoch": 0.5590861265594469, "flos": 70574058950400.0, "grad_norm": 0.683027179276209, "language_loss": 0.52488124, "learning_rate": 1.7153002331730967e-06, "loss": 0.54499865, "num_input_tokens_seen": 200237010, "router_z_loss_clip": 0.02380371, "router_z_loss_mlp": 0.06933594, "step": 9299, "time_per_iteration": 3.0739896297454834 }, { "auxiliary_loss_clip": 0.01056106, "auxiliary_loss_mlp": 0.01039567, "balance_loss_clip": 1.01586831, "balance_loss_mlp": 1.01798081, "epoch": 0.5591462498121148, "flos": 30663097966080.0, "grad_norm": 1.795973136252282, "language_loss": 0.69891268, "learning_rate": 1.7149147430129824e-06, "loss": 0.71986943, "num_input_tokens_seen": 200260820, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3828125, "step": 9300, "time_per_iteration": 2.488100051879883 }, { "auxiliary_loss_clip": 0.0105805, "auxiliary_loss_mlp": 0.01051968, "balance_loss_clip": 1.02491927, "balance_loss_mlp": 1.01834822, "epoch": 0.5592063730647828, "flos": 18149458371840.0, "grad_norm": 1.9611668217491016, "language_loss": 0.8278262, "learning_rate": 1.7145292636633293e-06, "loss": 0.84892637, "num_input_tokens_seen": 200278035, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.39648438, "step": 9301, "time_per_iteration": 2.3338632583618164 }, { "auxiliary_loss_clip": 0.01055233, "auxiliary_loss_mlp": 0.01040783, "balance_loss_clip": 1.01585603, "balance_loss_mlp": 1.01636755, "epoch": 0.5592664963174507, "flos": 24059276432640.0, "grad_norm": 2.4500762402515983, "language_loss": 0.68925315, "learning_rate": 1.714143795138756e-06, "loss": 0.7102133, "num_input_tokens_seen": 200297255, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.38671875, "step": 9302, "time_per_iteration": 2.412785768508911 }, { "auxiliary_loss_clip": 0.01059388, "auxiliary_loss_mlp": 0.01044058, "balance_loss_clip": 1.01716399, "balance_loss_mlp": 1.01843596, "epoch": 0.5593266195701188, "flos": 19826895421440.0, "grad_norm": 1.5370312460046234, "language_loss": 0.71311402, "learning_rate": 1.713758337453878e-06, "loss": 0.7341485, "num_input_tokens_seen": 200317505, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.41015625, "step": 9303, "time_per_iteration": 2.377955913543701 }, { "auxiliary_loss_clip": 0.01055196, "auxiliary_loss_mlp": 0.01040448, "balance_loss_clip": 1.01736891, "balance_loss_mlp": 1.01846015, "epoch": 0.5593867428227867, "flos": 25299600860160.0, "grad_norm": 1.560473808683883, "language_loss": 0.73405856, "learning_rate": 1.7133728906233124e-06, "loss": 0.75501496, "num_input_tokens_seen": 200338350, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.3671875, "step": 9304, "time_per_iteration": 2.4263484477996826 }, { "auxiliary_loss_clip": 0.0105593, "auxiliary_loss_mlp": 0.01039325, "balance_loss_clip": 1.01367116, "balance_loss_mlp": 1.01681542, "epoch": 0.5594468660754547, "flos": 12932177506560.0, "grad_norm": 1.9243818026940929, "language_loss": 0.79250586, "learning_rate": 1.7129874546616763e-06, "loss": 0.81345844, "num_input_tokens_seen": 200353965, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.390625, "step": 9305, "time_per_iteration": 2.35794734954834 }, { "auxiliary_loss_clip": 0.01053449, "auxiliary_loss_mlp": 0.01042186, "balance_loss_clip": 1.01842713, "balance_loss_mlp": 1.0169909, "epoch": 0.5595069893281227, "flos": 19061703953280.0, "grad_norm": 1.6968760729345957, "language_loss": 0.70550996, "learning_rate": 1.7126020295835836e-06, "loss": 0.7264663, "num_input_tokens_seen": 200373595, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.36328125, "step": 9306, "time_per_iteration": 2.363891363143921 }, { "auxiliary_loss_clip": 0.01008884, "auxiliary_loss_mlp": 0.01002691, "balance_loss_clip": 1.00028253, "balance_loss_mlp": 1.00168788, "epoch": 0.5595671125807906, "flos": 70270350099840.0, "grad_norm": 1.185923492722234, "language_loss": 0.60368311, "learning_rate": 1.7122166154036518e-06, "loss": 0.62379885, "num_input_tokens_seen": 200429155, "router_z_loss_clip": 0.02404785, "router_z_loss_mlp": 0.07226562, "step": 9307, "time_per_iteration": 3.1439149379730225 }, { "auxiliary_loss_clip": 0.01055172, "auxiliary_loss_mlp": 0.01049569, "balance_loss_clip": 1.02593017, "balance_loss_mlp": 1.01708221, "epoch": 0.5596272358334586, "flos": 20664531694080.0, "grad_norm": 1.7467367712463384, "language_loss": 0.74987692, "learning_rate": 1.7118312121364943e-06, "loss": 0.77092433, "num_input_tokens_seen": 200448290, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37890625, "step": 9308, "time_per_iteration": 2.39603853225708 }, { "auxiliary_loss_clip": 0.01057359, "auxiliary_loss_mlp": 0.01048404, "balance_loss_clip": 1.02156961, "balance_loss_mlp": 1.01723135, "epoch": 0.5596873590861265, "flos": 25039986923520.0, "grad_norm": 1.805438555756017, "language_loss": 0.70827657, "learning_rate": 1.7114458197967257e-06, "loss": 0.72933418, "num_input_tokens_seen": 200466555, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.40234375, "step": 9309, "time_per_iteration": 2.407064914703369 }, { "auxiliary_loss_clip": 0.01056548, "auxiliary_loss_mlp": 0.01047191, "balance_loss_clip": 1.01804423, "balance_loss_mlp": 1.01777279, "epoch": 0.5597474823387946, "flos": 25957189918080.0, "grad_norm": 5.202721970385582, "language_loss": 0.76524115, "learning_rate": 1.7110604383989613e-06, "loss": 0.78627849, "num_input_tokens_seen": 200485980, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.38867188, "step": 9310, "time_per_iteration": 2.458327054977417 }, { "auxiliary_loss_clip": 0.01058147, "auxiliary_loss_mlp": 0.01042706, "balance_loss_clip": 1.01601458, "balance_loss_mlp": 1.01849103, "epoch": 0.5598076055914625, "flos": 26176234988160.0, "grad_norm": 2.517132350771058, "language_loss": 0.71590853, "learning_rate": 1.7106750679578133e-06, "loss": 0.73691702, "num_input_tokens_seen": 200504555, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.39648438, "step": 9311, "time_per_iteration": 3.8497228622436523 }, { "auxiliary_loss_clip": 0.01054282, "auxiliary_loss_mlp": 0.0104304, "balance_loss_clip": 1.01807725, "balance_loss_mlp": 1.01622272, "epoch": 0.5598677288441305, "flos": 11654984816640.0, "grad_norm": 1.7499607172699307, "language_loss": 0.73350573, "learning_rate": 1.7102897084878962e-06, "loss": 0.75447893, "num_input_tokens_seen": 200522700, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38085938, "step": 9312, "time_per_iteration": 3.8601391315460205 }, { "auxiliary_loss_clip": 0.01055706, "auxiliary_loss_mlp": 0.01039948, "balance_loss_clip": 1.01649928, "balance_loss_mlp": 1.01794147, "epoch": 0.5599278520967984, "flos": 22965482448000.0, "grad_norm": 2.3436504583278803, "language_loss": 0.9106487, "learning_rate": 1.709904360003822e-06, "loss": 0.93160522, "num_input_tokens_seen": 200541910, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.37890625, "step": 9313, "time_per_iteration": 2.4069085121154785 }, { "auxiliary_loss_clip": 0.01056999, "auxiliary_loss_mlp": 0.01041079, "balance_loss_clip": 1.01565218, "balance_loss_mlp": 1.01870525, "epoch": 0.5599879753494664, "flos": 21214483430400.0, "grad_norm": 1.4516446399905245, "language_loss": 0.78088903, "learning_rate": 1.709519022520204e-06, "loss": 0.80186981, "num_input_tokens_seen": 200562600, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3828125, "step": 9314, "time_per_iteration": 2.390681028366089 }, { "auxiliary_loss_clip": 0.01056635, "auxiliary_loss_mlp": 0.0103935, "balance_loss_clip": 1.01484084, "balance_loss_mlp": 1.01832342, "epoch": 0.5600480986021343, "flos": 31901921205120.0, "grad_norm": 1.8375582845437513, "language_loss": 0.7126416, "learning_rate": 1.7091336960516537e-06, "loss": 0.73360145, "num_input_tokens_seen": 200584795, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.3828125, "step": 9315, "time_per_iteration": 3.8582558631896973 }, { "auxiliary_loss_clip": 0.01059262, "auxiliary_loss_mlp": 0.01052988, "balance_loss_clip": 1.0258199, "balance_loss_mlp": 1.01809514, "epoch": 0.5601082218548024, "flos": 28474776858240.0, "grad_norm": 1.8160278998461676, "language_loss": 0.68117023, "learning_rate": 1.7087483806127824e-06, "loss": 0.70229274, "num_input_tokens_seen": 200606945, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.41015625, "step": 9316, "time_per_iteration": 2.4142086505889893 }, { "auxiliary_loss_clip": 0.01055488, "auxiliary_loss_mlp": 0.01037653, "balance_loss_clip": 1.01329815, "balance_loss_mlp": 1.01785421, "epoch": 0.5601683451074703, "flos": 24096039960960.0, "grad_norm": 1.8602752808413583, "language_loss": 0.87719476, "learning_rate": 1.7083630762182022e-06, "loss": 0.89812618, "num_input_tokens_seen": 200626340, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.37695312, "step": 9317, "time_per_iteration": 2.388385772705078 }, { "auxiliary_loss_clip": 0.01058518, "auxiliary_loss_mlp": 0.01047691, "balance_loss_clip": 1.02001095, "balance_loss_mlp": 1.01770496, "epoch": 0.5602284683601383, "flos": 26355095216640.0, "grad_norm": 1.6815065082654364, "language_loss": 0.78544563, "learning_rate": 1.7079777828825233e-06, "loss": 0.80650777, "num_input_tokens_seen": 200644520, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40820312, "step": 9318, "time_per_iteration": 2.393500328063965 }, { "auxiliary_loss_clip": 0.01054998, "auxiliary_loss_mlp": 0.01046602, "balance_loss_clip": 1.02332079, "balance_loss_mlp": 1.01719093, "epoch": 0.5602885916128063, "flos": 24495306802560.0, "grad_norm": 1.463101183830637, "language_loss": 0.76929039, "learning_rate": 1.7075925006203558e-06, "loss": 0.79030639, "num_input_tokens_seen": 200664845, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37890625, "step": 9319, "time_per_iteration": 2.417733669281006 }, { "auxiliary_loss_clip": 0.01055992, "auxiliary_loss_mlp": 0.01035193, "balance_loss_clip": 1.01137495, "balance_loss_mlp": 1.01801276, "epoch": 0.5603487148654742, "flos": 27343765497600.0, "grad_norm": 2.1718038108803546, "language_loss": 0.86273456, "learning_rate": 1.7072072294463101e-06, "loss": 0.88364643, "num_input_tokens_seen": 200686535, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37890625, "step": 9320, "time_per_iteration": 2.4052815437316895 }, { "auxiliary_loss_clip": 0.01010659, "auxiliary_loss_mlp": 0.01012668, "balance_loss_clip": 1.01018846, "balance_loss_mlp": 1.00343299, "epoch": 0.5604088381181422, "flos": 54084789550080.0, "grad_norm": 0.7465787145657868, "language_loss": 0.52628881, "learning_rate": 1.706821969374996e-06, "loss": 0.54652202, "num_input_tokens_seen": 200736965, "router_z_loss_clip": 0.02478027, "router_z_loss_mlp": 0.07226562, "step": 9321, "time_per_iteration": 2.8485891819000244 }, { "auxiliary_loss_clip": 0.01055973, "auxiliary_loss_mlp": 0.01036999, "balance_loss_clip": 1.01341987, "balance_loss_mlp": 1.01812601, "epoch": 0.5604689613708101, "flos": 22235308940160.0, "grad_norm": 1.6780034507753476, "language_loss": 0.75418532, "learning_rate": 1.7064367204210216e-06, "loss": 0.77511501, "num_input_tokens_seen": 200757420, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37890625, "step": 9322, "time_per_iteration": 3.9475460052490234 }, { "auxiliary_loss_clip": 0.01057453, "auxiliary_loss_mlp": 0.0104032, "balance_loss_clip": 1.0147022, "balance_loss_mlp": 1.01880503, "epoch": 0.5605290846234782, "flos": 35296351741440.0, "grad_norm": 1.6584998785037635, "language_loss": 0.74390411, "learning_rate": 1.7060514825989963e-06, "loss": 0.76488185, "num_input_tokens_seen": 200779520, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38671875, "step": 9323, "time_per_iteration": 2.506676435470581 }, { "auxiliary_loss_clip": 0.01058525, "auxiliary_loss_mlp": 0.01039985, "balance_loss_clip": 1.01572645, "balance_loss_mlp": 1.01903987, "epoch": 0.5605892078761461, "flos": 20262367209600.0, "grad_norm": 1.7020353855666461, "language_loss": 0.62398374, "learning_rate": 1.7056662559235286e-06, "loss": 0.64496887, "num_input_tokens_seen": 200799485, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.39453125, "step": 9324, "time_per_iteration": 2.378683567047119 }, { "auxiliary_loss_clip": 0.01055905, "auxiliary_loss_mlp": 0.01039712, "balance_loss_clip": 1.01389134, "balance_loss_mlp": 1.01714802, "epoch": 0.5606493311288141, "flos": 17307458179200.0, "grad_norm": 2.229199851811408, "language_loss": 0.88773632, "learning_rate": 1.705281040409226e-06, "loss": 0.90869248, "num_input_tokens_seen": 200817540, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38671875, "step": 9325, "time_per_iteration": 2.33491849899292 }, { "auxiliary_loss_clip": 0.01057582, "auxiliary_loss_mlp": 0.01038634, "balance_loss_clip": 1.01137114, "balance_loss_mlp": 1.01745009, "epoch": 0.560709454381482, "flos": 21651910254720.0, "grad_norm": 1.4707490675697381, "language_loss": 0.74897802, "learning_rate": 1.7048958360706952e-06, "loss": 0.76994014, "num_input_tokens_seen": 200838380, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40234375, "step": 9326, "time_per_iteration": 2.4150948524475098 }, { "auxiliary_loss_clip": 0.01059812, "auxiliary_loss_mlp": 0.01042757, "balance_loss_clip": 1.01265645, "balance_loss_mlp": 1.01909375, "epoch": 0.56076957763415, "flos": 20302307671680.0, "grad_norm": 1.7916213853018836, "language_loss": 0.79537249, "learning_rate": 1.7045106429225447e-06, "loss": 0.81639814, "num_input_tokens_seen": 200855640, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.40820312, "step": 9327, "time_per_iteration": 2.4020192623138428 }, { "auxiliary_loss_clip": 0.01058733, "auxiliary_loss_mlp": 0.01037653, "balance_loss_clip": 1.0116775, "balance_loss_mlp": 1.01952553, "epoch": 0.5608297008868179, "flos": 25044734868480.0, "grad_norm": 2.3282532959035285, "language_loss": 0.79096985, "learning_rate": 1.7041254609793795e-06, "loss": 0.81193376, "num_input_tokens_seen": 200876585, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.390625, "step": 9328, "time_per_iteration": 2.4061052799224854 }, { "auxiliary_loss_clip": 0.01056474, "auxiliary_loss_mlp": 0.01042356, "balance_loss_clip": 1.01639175, "balance_loss_mlp": 1.01773012, "epoch": 0.560889824139486, "flos": 19865753631360.0, "grad_norm": 1.5467392697309061, "language_loss": 0.74498534, "learning_rate": 1.7037402902558066e-06, "loss": 0.76597357, "num_input_tokens_seen": 200898175, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.38671875, "step": 9329, "time_per_iteration": 2.402282953262329 }, { "auxiliary_loss_clip": 0.0105963, "auxiliary_loss_mlp": 0.0104044, "balance_loss_clip": 1.01234245, "balance_loss_mlp": 1.01902533, "epoch": 0.5609499473921539, "flos": 22928299983360.0, "grad_norm": 1.5377900967784905, "language_loss": 0.84446639, "learning_rate": 1.7033551307664324e-06, "loss": 0.86546707, "num_input_tokens_seen": 200917515, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40625, "step": 9330, "time_per_iteration": 2.3976733684539795 }, { "auxiliary_loss_clip": 0.01010313, "auxiliary_loss_mlp": 0.01004717, "balance_loss_clip": 1.0020467, "balance_loss_mlp": 1.00300694, "epoch": 0.5610100706448219, "flos": 53032716506880.0, "grad_norm": 0.7216140745381557, "language_loss": 0.5793041, "learning_rate": 1.7029699825258603e-06, "loss": 0.5994544, "num_input_tokens_seen": 200978615, "router_z_loss_clip": 0.0267334, "router_z_loss_mlp": 0.07324219, "step": 9331, "time_per_iteration": 3.0341784954071045 }, { "auxiliary_loss_clip": 0.01058006, "auxiliary_loss_mlp": 0.01039312, "balance_loss_clip": 1.01330066, "balance_loss_mlp": 1.01903582, "epoch": 0.5610701938974898, "flos": 21833877594240.0, "grad_norm": 1.8208126047874622, "language_loss": 0.82594037, "learning_rate": 1.7025848455486971e-06, "loss": 0.84691358, "num_input_tokens_seen": 200997745, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.390625, "step": 9332, "time_per_iteration": 2.3856351375579834 }, { "auxiliary_loss_clip": 0.01059455, "auxiliary_loss_mlp": 0.01040812, "balance_loss_clip": 1.01332271, "balance_loss_mlp": 1.01825738, "epoch": 0.5611303171501578, "flos": 17456222949120.0, "grad_norm": 4.6049167720506885, "language_loss": 0.83076477, "learning_rate": 1.7021997198495454e-06, "loss": 0.85176742, "num_input_tokens_seen": 201016370, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41210938, "step": 9333, "time_per_iteration": 2.340019702911377 }, { "auxiliary_loss_clip": 0.01054922, "auxiliary_loss_mlp": 0.01039279, "balance_loss_clip": 1.01505518, "balance_loss_mlp": 1.01676178, "epoch": 0.5611904404028258, "flos": 22636705374720.0, "grad_norm": 1.8016442154367938, "language_loss": 0.73936832, "learning_rate": 1.7018146054430108e-06, "loss": 0.76031029, "num_input_tokens_seen": 201034310, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.38085938, "step": 9334, "time_per_iteration": 2.3622076511383057 }, { "auxiliary_loss_clip": 0.01056352, "auxiliary_loss_mlp": 0.01043028, "balance_loss_clip": 1.01975834, "balance_loss_mlp": 1.01804113, "epoch": 0.5612505636554938, "flos": 14315541240960.0, "grad_norm": 3.1314566229992287, "language_loss": 0.7279228, "learning_rate": 1.7014295023436961e-06, "loss": 0.74891663, "num_input_tokens_seen": 201052030, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3828125, "step": 9335, "time_per_iteration": 2.35103440284729 }, { "auxiliary_loss_clip": 0.01056454, "auxiliary_loss_mlp": 0.010383, "balance_loss_clip": 1.01320601, "balance_loss_mlp": 1.01823854, "epoch": 0.5613106869081618, "flos": 16507353484800.0, "grad_norm": 1.868815445267964, "language_loss": 0.78093797, "learning_rate": 1.701044410566205e-06, "loss": 0.80188549, "num_input_tokens_seen": 201068445, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3828125, "step": 9336, "time_per_iteration": 2.335078239440918 }, { "auxiliary_loss_clip": 0.01054874, "auxiliary_loss_mlp": 0.01040531, "balance_loss_clip": 1.01611674, "balance_loss_mlp": 1.01737046, "epoch": 0.5613708101608297, "flos": 24057495953280.0, "grad_norm": 2.430784907509139, "language_loss": 0.65667808, "learning_rate": 1.7006593301251393e-06, "loss": 0.67763209, "num_input_tokens_seen": 201082140, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.375, "step": 9337, "time_per_iteration": 2.4050772190093994 }, { "auxiliary_loss_clip": 0.01008807, "auxiliary_loss_mlp": 0.01006276, "balance_loss_clip": 1.00361753, "balance_loss_mlp": 1.00173378, "epoch": 0.5614309334134977, "flos": 64902977832960.0, "grad_norm": 0.874617988977845, "language_loss": 0.62668276, "learning_rate": 1.700274261035102e-06, "loss": 0.6468336, "num_input_tokens_seen": 201137245, "router_z_loss_clip": 0.02661133, "router_z_loss_mlp": 0.07080078, "step": 9338, "time_per_iteration": 2.9741971492767334 }, { "auxiliary_loss_clip": 0.01057736, "auxiliary_loss_mlp": 0.01047911, "balance_loss_clip": 1.02094603, "balance_loss_mlp": 1.01813722, "epoch": 0.5614910566661656, "flos": 32918662085760.0, "grad_norm": 1.8273105964181269, "language_loss": 0.67368859, "learning_rate": 1.6998892033106946e-06, "loss": 0.69474506, "num_input_tokens_seen": 201157270, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.39648438, "step": 9339, "time_per_iteration": 2.490867853164673 }, { "auxiliary_loss_clip": 0.01055849, "auxiliary_loss_mlp": 0.01043894, "balance_loss_clip": 1.01766813, "balance_loss_mlp": 1.01847756, "epoch": 0.5615511799188336, "flos": 18587862714240.0, "grad_norm": 1.821941202289845, "language_loss": 0.71072841, "learning_rate": 1.6995041569665184e-06, "loss": 0.73172581, "num_input_tokens_seen": 201174530, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37304688, "step": 9340, "time_per_iteration": 2.334085464477539 }, { "auxiliary_loss_clip": 0.01054826, "auxiliary_loss_mlp": 0.01037034, "balance_loss_clip": 1.01413345, "balance_loss_mlp": 1.01826, "epoch": 0.5616113031715015, "flos": 22818917093760.0, "grad_norm": 2.090479714698616, "language_loss": 0.7888875, "learning_rate": 1.6991191220171756e-06, "loss": 0.80980611, "num_input_tokens_seen": 201194905, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.3671875, "step": 9341, "time_per_iteration": 2.3955166339874268 }, { "auxiliary_loss_clip": 0.01056191, "auxiliary_loss_mlp": 0.0103856, "balance_loss_clip": 1.01298952, "balance_loss_mlp": 1.01733911, "epoch": 0.5616714264241696, "flos": 22344622007040.0, "grad_norm": 1.6905855695476195, "language_loss": 0.80394554, "learning_rate": 1.6987340984772653e-06, "loss": 0.82489312, "num_input_tokens_seen": 201213715, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38867188, "step": 9342, "time_per_iteration": 2.362762212753296 }, { "auxiliary_loss_clip": 0.010578, "auxiliary_loss_mlp": 0.01045391, "balance_loss_clip": 1.01902175, "balance_loss_mlp": 1.0179441, "epoch": 0.5617315496768375, "flos": 18806768138880.0, "grad_norm": 1.84055991315106, "language_loss": 0.77753919, "learning_rate": 1.6983490863613882e-06, "loss": 0.79857111, "num_input_tokens_seen": 201231415, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.3984375, "step": 9343, "time_per_iteration": 2.3555991649627686 }, { "auxiliary_loss_clip": 0.01056052, "auxiliary_loss_mlp": 0.0104636, "balance_loss_clip": 1.02101612, "balance_loss_mlp": 1.01828003, "epoch": 0.5617916729295055, "flos": 18368328885120.0, "grad_norm": 3.230588659609028, "language_loss": 0.70748818, "learning_rate": 1.6979640856841442e-06, "loss": 0.72851241, "num_input_tokens_seen": 201249625, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37890625, "step": 9344, "time_per_iteration": 2.3543155193328857 }, { "auxiliary_loss_clip": 0.01057873, "auxiliary_loss_mlp": 0.01041594, "balance_loss_clip": 1.01520097, "balance_loss_mlp": 1.01901484, "epoch": 0.5618517961821734, "flos": 28178818329600.0, "grad_norm": 2.0370183942746767, "language_loss": 0.683855, "learning_rate": 1.6975790964601318e-06, "loss": 0.70484966, "num_input_tokens_seen": 201271205, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.38867188, "step": 9345, "time_per_iteration": 2.4349169731140137 }, { "auxiliary_loss_clip": 0.01057461, "auxiliary_loss_mlp": 0.01045343, "balance_loss_clip": 1.02076197, "balance_loss_mlp": 1.01837873, "epoch": 0.5619119194348414, "flos": 15485969393280.0, "grad_norm": 2.1191707063301015, "language_loss": 0.88249314, "learning_rate": 1.6971941187039512e-06, "loss": 0.90352124, "num_input_tokens_seen": 201287700, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.390625, "step": 9346, "time_per_iteration": 2.319436550140381 }, { "auxiliary_loss_clip": 0.01057317, "auxiliary_loss_mlp": 0.01046314, "balance_loss_clip": 1.01921797, "balance_loss_mlp": 1.0183996, "epoch": 0.5619720426875094, "flos": 29127478325760.0, "grad_norm": 4.338464176892236, "language_loss": 0.60727662, "learning_rate": 1.6968091524301993e-06, "loss": 0.62831295, "num_input_tokens_seen": 201307530, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.390625, "step": 9347, "time_per_iteration": 2.454298734664917 }, { "auxiliary_loss_clip": 0.01059143, "auxiliary_loss_mlp": 0.01047489, "balance_loss_clip": 1.02051175, "balance_loss_mlp": 1.01926911, "epoch": 0.5620321659401774, "flos": 18002788283520.0, "grad_norm": 3.023624401468501, "language_loss": 0.7154842, "learning_rate": 1.6964241976534745e-06, "loss": 0.73655051, "num_input_tokens_seen": 201326210, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.3984375, "step": 9348, "time_per_iteration": 2.346240997314453 }, { "auxiliary_loss_clip": 0.01060459, "auxiliary_loss_mlp": 0.01042448, "balance_loss_clip": 1.01438642, "balance_loss_mlp": 1.01889467, "epoch": 0.5620922891928454, "flos": 20593483344000.0, "grad_norm": 1.8982188024228155, "language_loss": 0.80409181, "learning_rate": 1.6960392543883754e-06, "loss": 0.82512093, "num_input_tokens_seen": 201346120, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.4140625, "step": 9349, "time_per_iteration": 2.399677276611328 }, { "auxiliary_loss_clip": 0.01058182, "auxiliary_loss_mlp": 0.0103821, "balance_loss_clip": 1.01084018, "balance_loss_mlp": 1.01890278, "epoch": 0.5621524124455133, "flos": 26285792434560.0, "grad_norm": 2.3622569765184855, "language_loss": 0.68631899, "learning_rate": 1.6956543226494975e-06, "loss": 0.70728302, "num_input_tokens_seen": 201365700, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.39257812, "step": 9350, "time_per_iteration": 2.3992104530334473 }, { "auxiliary_loss_clip": 0.01059685, "auxiliary_loss_mlp": 0.01042724, "balance_loss_clip": 1.01633108, "balance_loss_mlp": 1.01945043, "epoch": 0.5622125356981813, "flos": 12749477028480.0, "grad_norm": 1.893234625538157, "language_loss": 0.79780602, "learning_rate": 1.6952694024514381e-06, "loss": 0.81883013, "num_input_tokens_seen": 201382795, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40234375, "step": 9351, "time_per_iteration": 3.709948778152466 }, { "auxiliary_loss_clip": 0.0106258, "auxiliary_loss_mlp": 0.01043211, "balance_loss_clip": 1.01501799, "balance_loss_mlp": 1.0213511, "epoch": 0.5622726589508492, "flos": 23804200972800.0, "grad_norm": 1.5998315848123077, "language_loss": 0.59906721, "learning_rate": 1.6948844938087945e-06, "loss": 0.62012517, "num_input_tokens_seen": 201402780, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41210938, "step": 9352, "time_per_iteration": 3.7839183807373047 }, { "auxiliary_loss_clip": 0.0105466, "auxiliary_loss_mlp": 0.01038253, "balance_loss_clip": 1.01559103, "balance_loss_mlp": 1.01772523, "epoch": 0.5623327822035172, "flos": 24717040047360.0, "grad_norm": 1.3834391344306762, "language_loss": 0.72785926, "learning_rate": 1.6944995967361604e-06, "loss": 0.74878842, "num_input_tokens_seen": 201424140, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36914062, "step": 9353, "time_per_iteration": 2.441418409347534 }, { "auxiliary_loss_clip": 0.01058121, "auxiliary_loss_mlp": 0.01040703, "balance_loss_clip": 1.01433396, "balance_loss_mlp": 1.01817644, "epoch": 0.5623929054561851, "flos": 14018640105600.0, "grad_norm": 2.490437652393347, "language_loss": 0.77509654, "learning_rate": 1.6941147112481327e-06, "loss": 0.79608482, "num_input_tokens_seen": 201439645, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40039062, "step": 9354, "time_per_iteration": 3.610450029373169 }, { "auxiliary_loss_clip": 0.0105936, "auxiliary_loss_mlp": 0.01038122, "balance_loss_clip": 1.01194382, "balance_loss_mlp": 1.01882005, "epoch": 0.5624530287088532, "flos": 20703354992640.0, "grad_norm": 1.7266201035999869, "language_loss": 0.74004686, "learning_rate": 1.6937298373593056e-06, "loss": 0.76102173, "num_input_tokens_seen": 201459970, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40625, "step": 9355, "time_per_iteration": 2.3969128131866455 }, { "auxiliary_loss_clip": 0.01057322, "auxiliary_loss_mlp": 0.01038373, "balance_loss_clip": 1.0124445, "balance_loss_mlp": 1.01769531, "epoch": 0.5625131519615211, "flos": 21469838181120.0, "grad_norm": 1.5605605743948345, "language_loss": 0.74220723, "learning_rate": 1.693344975084274e-06, "loss": 0.76316416, "num_input_tokens_seen": 201480055, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39648438, "step": 9356, "time_per_iteration": 2.3933475017547607 }, { "auxiliary_loss_clip": 0.01055913, "auxiliary_loss_mlp": 0.01038177, "balance_loss_clip": 1.01377523, "balance_loss_mlp": 1.01794136, "epoch": 0.5625732752141891, "flos": 18697001224320.0, "grad_norm": 2.188850257185513, "language_loss": 0.85069621, "learning_rate": 1.6929601244376318e-06, "loss": 0.87163717, "num_input_tokens_seen": 201497645, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37890625, "step": 9357, "time_per_iteration": 2.365926504135132 }, { "auxiliary_loss_clip": 0.01055637, "auxiliary_loss_mlp": 0.01037695, "balance_loss_clip": 1.0122323, "balance_loss_mlp": 1.01715004, "epoch": 0.562633398466857, "flos": 16215968344320.0, "grad_norm": 2.273626814182386, "language_loss": 0.73516345, "learning_rate": 1.6925752854339722e-06, "loss": 0.75609684, "num_input_tokens_seen": 201515455, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38476562, "step": 9358, "time_per_iteration": 2.362377405166626 }, { "auxiliary_loss_clip": 0.01056159, "auxiliary_loss_mlp": 0.01043798, "balance_loss_clip": 1.01853788, "balance_loss_mlp": 1.01786876, "epoch": 0.562693521719525, "flos": 22490838247680.0, "grad_norm": 1.7550871187634007, "language_loss": 0.78603393, "learning_rate": 1.6921904580878885e-06, "loss": 0.80703342, "num_input_tokens_seen": 201534500, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3828125, "step": 9359, "time_per_iteration": 2.4174253940582275 }, { "auxiliary_loss_clip": 0.01054486, "auxiliary_loss_mlp": 0.01042498, "balance_loss_clip": 1.01761913, "balance_loss_mlp": 1.01645041, "epoch": 0.562753644972193, "flos": 25330185077760.0, "grad_norm": 1.712079122039542, "language_loss": 0.72180551, "learning_rate": 1.6918056424139736e-06, "loss": 0.74277538, "num_input_tokens_seen": 201553280, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.38085938, "step": 9360, "time_per_iteration": 2.40791916847229 }, { "auxiliary_loss_clip": 0.01011239, "auxiliary_loss_mlp": 0.01002477, "balance_loss_clip": 1.00010514, "balance_loss_mlp": 1.00399148, "epoch": 0.562813768224861, "flos": 67389631441920.0, "grad_norm": 0.7833824320668576, "language_loss": 0.55675793, "learning_rate": 1.6914208384268197e-06, "loss": 0.57689512, "num_input_tokens_seen": 201610030, "router_z_loss_clip": 0.02368164, "router_z_loss_mlp": 0.07226562, "step": 9361, "time_per_iteration": 4.348718166351318 }, { "auxiliary_loss_clip": 0.01055647, "auxiliary_loss_mlp": 0.01040796, "balance_loss_clip": 1.01875424, "balance_loss_mlp": 1.0184691, "epoch": 0.562873891477529, "flos": 23330045531520.0, "grad_norm": 1.437425642134479, "language_loss": 0.82417035, "learning_rate": 1.691036046141018e-06, "loss": 0.84513479, "num_input_tokens_seen": 201628370, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.37109375, "step": 9362, "time_per_iteration": 2.3725786209106445 }, { "auxiliary_loss_clip": 0.01055384, "auxiliary_loss_mlp": 0.01043646, "balance_loss_clip": 1.01782513, "balance_loss_mlp": 1.01734865, "epoch": 0.5629340147301969, "flos": 38471283360000.0, "grad_norm": 1.8534468767850678, "language_loss": 0.75059605, "learning_rate": 1.6906512655711614e-06, "loss": 0.7715863, "num_input_tokens_seen": 201649790, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37890625, "step": 9363, "time_per_iteration": 2.584226608276367 }, { "auxiliary_loss_clip": 0.01057141, "auxiliary_loss_mlp": 0.01049316, "balance_loss_clip": 1.02341151, "balance_loss_mlp": 1.01736164, "epoch": 0.5629941379828649, "flos": 29240736376320.0, "grad_norm": 1.8895204875648448, "language_loss": 0.84109938, "learning_rate": 1.690266496731839e-06, "loss": 0.8621639, "num_input_tokens_seen": 201669175, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3984375, "step": 9364, "time_per_iteration": 2.421346426010132 }, { "auxiliary_loss_clip": 0.01055378, "auxiliary_loss_mlp": 0.01041069, "balance_loss_clip": 1.01672602, "balance_loss_mlp": 1.01792908, "epoch": 0.5630542612355328, "flos": 19420052814720.0, "grad_norm": 2.272359094482832, "language_loss": 0.66664791, "learning_rate": 1.689881739637642e-06, "loss": 0.6876123, "num_input_tokens_seen": 201687000, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.375, "step": 9365, "time_per_iteration": 2.416754722595215 }, { "auxiliary_loss_clip": 0.01058656, "auxiliary_loss_mlp": 0.01050581, "balance_loss_clip": 1.02141058, "balance_loss_mlp": 1.01710081, "epoch": 0.5631143844882008, "flos": 22265404398720.0, "grad_norm": 2.6387864575251996, "language_loss": 0.83952981, "learning_rate": 1.6894969943031611e-06, "loss": 0.86062217, "num_input_tokens_seen": 201703335, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.41601562, "step": 9366, "time_per_iteration": 2.348599910736084 }, { "auxiliary_loss_clip": 0.01056335, "auxiliary_loss_mlp": 0.0103447, "balance_loss_clip": 1.01131904, "balance_loss_mlp": 1.01879096, "epoch": 0.5631745077408687, "flos": 22964225639040.0, "grad_norm": 1.554844901162832, "language_loss": 0.74185359, "learning_rate": 1.6891122607429845e-06, "loss": 0.76276159, "num_input_tokens_seen": 201723495, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.375, "step": 9367, "time_per_iteration": 2.4300789833068848 }, { "auxiliary_loss_clip": 0.01008908, "auxiliary_loss_mlp": 0.01007917, "balance_loss_clip": 1.00543761, "balance_loss_mlp": 1.00169575, "epoch": 0.5632346309935368, "flos": 65076948604800.0, "grad_norm": 0.6481489262827694, "language_loss": 0.53508335, "learning_rate": 1.6887275389717028e-06, "loss": 0.55525166, "num_input_tokens_seen": 201792615, "router_z_loss_clip": 0.02478027, "router_z_loss_mlp": 0.07226562, "step": 9368, "time_per_iteration": 3.105098247528076 }, { "auxiliary_loss_clip": 0.01056551, "auxiliary_loss_mlp": 0.01044165, "balance_loss_clip": 1.01787949, "balance_loss_mlp": 1.01862633, "epoch": 0.5632947542462047, "flos": 23001792128640.0, "grad_norm": 1.5685351932083478, "language_loss": 0.69633293, "learning_rate": 1.6883428290039046e-06, "loss": 0.71734011, "num_input_tokens_seen": 201812520, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37890625, "step": 9369, "time_per_iteration": 2.4433796405792236 }, { "auxiliary_loss_clip": 0.01054321, "auxiliary_loss_mlp": 0.01040209, "balance_loss_clip": 1.01487684, "balance_loss_mlp": 1.0165236, "epoch": 0.5633548774988727, "flos": 30481270272000.0, "grad_norm": 1.9673707397368405, "language_loss": 0.76886958, "learning_rate": 1.6879581308541763e-06, "loss": 0.78981483, "num_input_tokens_seen": 201834185, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37695312, "step": 9370, "time_per_iteration": 2.428602695465088 }, { "auxiliary_loss_clip": 0.01057475, "auxiliary_loss_mlp": 0.01043162, "balance_loss_clip": 1.01540983, "balance_loss_mlp": 1.01690209, "epoch": 0.5634150007515406, "flos": 18514056366720.0, "grad_norm": 2.2314743215713193, "language_loss": 0.76945245, "learning_rate": 1.687573444537108e-06, "loss": 0.79045886, "num_input_tokens_seen": 201851305, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40625, "step": 9371, "time_per_iteration": 2.4036200046539307 }, { "auxiliary_loss_clip": 0.01055208, "auxiliary_loss_mlp": 0.0104852, "balance_loss_clip": 1.02390337, "balance_loss_mlp": 1.01719487, "epoch": 0.5634751240042086, "flos": 19243671292800.0, "grad_norm": 1.7926796863919519, "language_loss": 0.77414119, "learning_rate": 1.687188770067285e-06, "loss": 0.79517841, "num_input_tokens_seen": 201870350, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37890625, "step": 9372, "time_per_iteration": 2.3473870754241943 }, { "auxiliary_loss_clip": 0.01056143, "auxiliary_loss_mlp": 0.01041413, "balance_loss_clip": 1.01631939, "balance_loss_mlp": 1.01904154, "epoch": 0.5635352472568766, "flos": 12019827191040.0, "grad_norm": 2.040820058003114, "language_loss": 0.73364133, "learning_rate": 1.6868041074592956e-06, "loss": 0.75461692, "num_input_tokens_seen": 201886800, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37109375, "step": 9373, "time_per_iteration": 2.4173049926757812 }, { "auxiliary_loss_clip": 0.01060081, "auxiliary_loss_mlp": 0.01042575, "balance_loss_clip": 1.01447701, "balance_loss_mlp": 1.01971531, "epoch": 0.5635953705095446, "flos": 21870571299840.0, "grad_norm": 1.9942537200869856, "language_loss": 0.84247488, "learning_rate": 1.6864194567277264e-06, "loss": 0.86350143, "num_input_tokens_seen": 201904730, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40429688, "step": 9374, "time_per_iteration": 2.362138509750366 }, { "auxiliary_loss_clip": 0.01055016, "auxiliary_loss_mlp": 0.01034872, "balance_loss_clip": 1.01075602, "balance_loss_mlp": 1.01724374, "epoch": 0.5636554937622126, "flos": 27124929895680.0, "grad_norm": 1.8477550926639839, "language_loss": 0.67851031, "learning_rate": 1.6860348178871618e-06, "loss": 0.69940919, "num_input_tokens_seen": 201924850, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37890625, "step": 9375, "time_per_iteration": 2.457329034805298 }, { "auxiliary_loss_clip": 0.01057509, "auxiliary_loss_mlp": 0.01042523, "balance_loss_clip": 1.01539063, "balance_loss_mlp": 1.01788831, "epoch": 0.5637156170148805, "flos": 12925753816320.0, "grad_norm": 2.6765731588899127, "language_loss": 0.8142826, "learning_rate": 1.6856501909521889e-06, "loss": 0.83528286, "num_input_tokens_seen": 201939500, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.39648438, "step": 9376, "time_per_iteration": 2.319622755050659 }, { "auxiliary_loss_clip": 0.01059233, "auxiliary_loss_mlp": 0.01043637, "balance_loss_clip": 1.01598024, "balance_loss_mlp": 1.01814759, "epoch": 0.5637757402675485, "flos": 45549295246080.0, "grad_norm": 1.4587973807656855, "language_loss": 0.70266855, "learning_rate": 1.6852655759373925e-06, "loss": 0.72369719, "num_input_tokens_seen": 201963000, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41015625, "step": 9377, "time_per_iteration": 2.639840841293335 }, { "auxiliary_loss_clip": 0.0105568, "auxiliary_loss_mlp": 0.01038605, "balance_loss_clip": 1.01377416, "balance_loss_mlp": 1.01979971, "epoch": 0.5638358635202164, "flos": 20885008129920.0, "grad_norm": 1.4353540271301388, "language_loss": 0.75042665, "learning_rate": 1.6848809728573565e-06, "loss": 0.77136946, "num_input_tokens_seen": 201983145, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.359375, "step": 9378, "time_per_iteration": 2.433823585510254 }, { "auxiliary_loss_clip": 0.01060336, "auxiliary_loss_mlp": 0.01044456, "balance_loss_clip": 1.01623917, "balance_loss_mlp": 1.01853633, "epoch": 0.5638959867728844, "flos": 18805581152640.0, "grad_norm": 2.2720914082981167, "language_loss": 0.83660364, "learning_rate": 1.6844963817266656e-06, "loss": 0.85765159, "num_input_tokens_seen": 202000335, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41796875, "step": 9379, "time_per_iteration": 2.385291576385498 }, { "auxiliary_loss_clip": 0.01057094, "auxiliary_loss_mlp": 0.01041549, "balance_loss_clip": 1.01489377, "balance_loss_mlp": 1.01777685, "epoch": 0.5639561100255523, "flos": 27489108954240.0, "grad_norm": 3.033956504849508, "language_loss": 0.73501676, "learning_rate": 1.6841118025599042e-06, "loss": 0.75600314, "num_input_tokens_seen": 202018275, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.39453125, "step": 9380, "time_per_iteration": 2.409416675567627 }, { "auxiliary_loss_clip": 0.01057954, "auxiliary_loss_mlp": 0.01047815, "balance_loss_clip": 1.01925242, "balance_loss_mlp": 1.01822114, "epoch": 0.5640162332782204, "flos": 18075617112960.0, "grad_norm": 2.4917643960033122, "language_loss": 0.75122398, "learning_rate": 1.6837272353716542e-06, "loss": 0.77228165, "num_input_tokens_seen": 202034330, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.39648438, "step": 9381, "time_per_iteration": 2.3851940631866455 }, { "auxiliary_loss_clip": 0.01059449, "auxiliary_loss_mlp": 0.01043219, "balance_loss_clip": 1.01714814, "balance_loss_mlp": 1.01997876, "epoch": 0.5640763565308883, "flos": 20883856055040.0, "grad_norm": 2.0987239660044428, "language_loss": 0.74058092, "learning_rate": 1.683342680176499e-06, "loss": 0.76160765, "num_input_tokens_seen": 202053100, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39453125, "step": 9382, "time_per_iteration": 2.3891780376434326 }, { "auxiliary_loss_clip": 0.01014248, "auxiliary_loss_mlp": 0.01006783, "balance_loss_clip": 1.00389779, "balance_loss_mlp": 1.00697041, "epoch": 0.5641364797835563, "flos": 64444707060480.0, "grad_norm": 0.7359058435459532, "language_loss": 0.54467261, "learning_rate": 1.682958136989022e-06, "loss": 0.56488299, "num_input_tokens_seen": 202120125, "router_z_loss_clip": 0.02880859, "router_z_loss_mlp": 0.07275391, "step": 9383, "time_per_iteration": 3.1711373329162598 }, { "auxiliary_loss_clip": 0.01060109, "auxiliary_loss_mlp": 0.01046684, "balance_loss_clip": 1.01779938, "balance_loss_mlp": 1.01951301, "epoch": 0.5641966030362242, "flos": 18659958405120.0, "grad_norm": 2.1990885625899264, "language_loss": 0.7146951, "learning_rate": 1.6825736058238033e-06, "loss": 0.73576307, "num_input_tokens_seen": 202138030, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.40625, "step": 9384, "time_per_iteration": 2.344680070877075 }, { "auxiliary_loss_clip": 0.01058882, "auxiliary_loss_mlp": 0.01044614, "balance_loss_clip": 1.01834059, "balance_loss_mlp": 1.01840925, "epoch": 0.5642567262888922, "flos": 22491222272640.0, "grad_norm": 2.32887950802258, "language_loss": 0.77026761, "learning_rate": 1.6821890866954263e-06, "loss": 0.79130256, "num_input_tokens_seen": 202155580, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40429688, "step": 9385, "time_per_iteration": 2.4221150875091553 }, { "auxiliary_loss_clip": 0.01055828, "auxiliary_loss_mlp": 0.0104774, "balance_loss_clip": 1.02007115, "balance_loss_mlp": 1.01739907, "epoch": 0.5643168495415603, "flos": 13003190945280.0, "grad_norm": 2.140020689054966, "language_loss": 0.84097928, "learning_rate": 1.6818045796184703e-06, "loss": 0.86201501, "num_input_tokens_seen": 202170365, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.3828125, "step": 9386, "time_per_iteration": 2.3416872024536133 }, { "auxiliary_loss_clip": 0.01062058, "auxiliary_loss_mlp": 0.01055321, "balance_loss_clip": 1.02696061, "balance_loss_mlp": 1.01980495, "epoch": 0.5643769727942282, "flos": 18587304132480.0, "grad_norm": 2.001553493932271, "language_loss": 0.72132051, "learning_rate": 1.681420084607516e-06, "loss": 0.74249434, "num_input_tokens_seen": 202189095, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.421875, "step": 9387, "time_per_iteration": 2.366070032119751 }, { "auxiliary_loss_clip": 0.01060034, "auxiliary_loss_mlp": 0.0105093, "balance_loss_clip": 1.0237745, "balance_loss_mlp": 1.01955342, "epoch": 0.5644370960468962, "flos": 33804757192320.0, "grad_norm": 1.5774820526984588, "language_loss": 0.75411272, "learning_rate": 1.6810356016771452e-06, "loss": 0.77522236, "num_input_tokens_seen": 202213500, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40429688, "step": 9388, "time_per_iteration": 2.513049364089966 }, { "auxiliary_loss_clip": 0.01055408, "auxiliary_loss_mlp": 0.01041245, "balance_loss_clip": 1.01685429, "balance_loss_mlp": 1.01782513, "epoch": 0.5644972192995641, "flos": 21213855025920.0, "grad_norm": 2.1761693664065382, "language_loss": 0.83304507, "learning_rate": 1.6806511308419353e-06, "loss": 0.8540116, "num_input_tokens_seen": 202231920, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.375, "step": 9389, "time_per_iteration": 2.3622193336486816 }, { "auxiliary_loss_clip": 0.01059924, "auxiliary_loss_mlp": 0.01045277, "balance_loss_clip": 1.01764441, "balance_loss_mlp": 1.01883388, "epoch": 0.5645573425522321, "flos": 18586745550720.0, "grad_norm": 2.7737168378033545, "language_loss": 0.66203213, "learning_rate": 1.680266672116467e-06, "loss": 0.68308407, "num_input_tokens_seen": 202247600, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41015625, "step": 9390, "time_per_iteration": 3.586611270904541 }, { "auxiliary_loss_clip": 0.01056938, "auxiliary_loss_mlp": 0.01037306, "balance_loss_clip": 1.01363134, "balance_loss_mlp": 1.0187912, "epoch": 0.5646174658049, "flos": 18112834488960.0, "grad_norm": 1.61830463331607, "language_loss": 0.92971283, "learning_rate": 1.6798822255153192e-06, "loss": 0.95065528, "num_input_tokens_seen": 202265350, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.38085938, "step": 9391, "time_per_iteration": 2.377021551132202 }, { "auxiliary_loss_clip": 0.01061841, "auxiliary_loss_mlp": 0.01046765, "balance_loss_clip": 1.01578212, "balance_loss_mlp": 1.01856613, "epoch": 0.564677589057568, "flos": 28328700263040.0, "grad_norm": 2.389408318807455, "language_loss": 0.61967111, "learning_rate": 1.6794977910530684e-06, "loss": 0.64075714, "num_input_tokens_seen": 202284285, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.43359375, "step": 9392, "time_per_iteration": 3.938218355178833 }, { "auxiliary_loss_clip": 0.01055888, "auxiliary_loss_mlp": 0.01042841, "balance_loss_clip": 1.01449287, "balance_loss_mlp": 1.01689339, "epoch": 0.564737712310236, "flos": 22162654667520.0, "grad_norm": 2.4685339141529186, "language_loss": 0.83153039, "learning_rate": 1.6791133687442937e-06, "loss": 0.85251766, "num_input_tokens_seen": 202303450, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.390625, "step": 9393, "time_per_iteration": 2.397209882736206 }, { "auxiliary_loss_clip": 0.01056906, "auxiliary_loss_mlp": 0.01050471, "balance_loss_clip": 1.02497208, "balance_loss_mlp": 1.01839495, "epoch": 0.564797835562904, "flos": 20957976604800.0, "grad_norm": 2.360077096582207, "language_loss": 0.88241589, "learning_rate": 1.6787289586035725e-06, "loss": 0.90348965, "num_input_tokens_seen": 202322315, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38671875, "step": 9394, "time_per_iteration": 3.867898941040039 }, { "auxiliary_loss_clip": 0.01057601, "auxiliary_loss_mlp": 0.01042728, "balance_loss_clip": 1.01707458, "balance_loss_mlp": 1.01876783, "epoch": 0.5648579588155719, "flos": 17419354686720.0, "grad_norm": 1.892315894350741, "language_loss": 0.85473049, "learning_rate": 1.6783445606454814e-06, "loss": 0.87573385, "num_input_tokens_seen": 202339905, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38867188, "step": 9395, "time_per_iteration": 2.3330233097076416 }, { "auxiliary_loss_clip": 0.01010274, "auxiliary_loss_mlp": 0.01006383, "balance_loss_clip": 1.00323594, "balance_loss_mlp": 1.00276637, "epoch": 0.5649180820682399, "flos": 69925965782400.0, "grad_norm": 0.8047315058692487, "language_loss": 0.58497566, "learning_rate": 1.677960174884597e-06, "loss": 0.60514224, "num_input_tokens_seen": 202397320, "router_z_loss_clip": 0.03149414, "router_z_loss_mlp": 0.07519531, "step": 9396, "time_per_iteration": 3.0166380405426025 }, { "auxiliary_loss_clip": 0.01059107, "auxiliary_loss_mlp": 0.01041811, "balance_loss_clip": 1.01488137, "balance_loss_mlp": 1.01817274, "epoch": 0.5649782053209078, "flos": 24971906039040.0, "grad_norm": 2.3103382236432015, "language_loss": 0.72458661, "learning_rate": 1.6775758013354943e-06, "loss": 0.74559575, "num_input_tokens_seen": 202416865, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.41015625, "step": 9397, "time_per_iteration": 2.3875210285186768 }, { "auxiliary_loss_clip": 0.01059023, "auxiliary_loss_mlp": 0.01041591, "balance_loss_clip": 1.01324344, "balance_loss_mlp": 1.01807177, "epoch": 0.5650383285735758, "flos": 21725507134080.0, "grad_norm": 1.7456950414473367, "language_loss": 0.67838115, "learning_rate": 1.67719144001275e-06, "loss": 0.69938731, "num_input_tokens_seen": 202436210, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.41015625, "step": 9398, "time_per_iteration": 2.403930425643921 }, { "auxiliary_loss_clip": 0.01010616, "auxiliary_loss_mlp": 0.01008183, "balance_loss_clip": 1.00470173, "balance_loss_mlp": 1.0031116, "epoch": 0.5650984518262439, "flos": 65901318560640.0, "grad_norm": 0.776288550895725, "language_loss": 0.58228523, "learning_rate": 1.6768070909309386e-06, "loss": 0.6024732, "num_input_tokens_seen": 202492925, "router_z_loss_clip": 0.03491211, "router_z_loss_mlp": 0.07519531, "step": 9399, "time_per_iteration": 2.9562344551086426 }, { "auxiliary_loss_clip": 0.01059288, "auxiliary_loss_mlp": 0.01043908, "balance_loss_clip": 1.01527405, "balance_loss_mlp": 1.01887989, "epoch": 0.5651585750789118, "flos": 21031538572800.0, "grad_norm": 1.9093042594447873, "language_loss": 0.74590206, "learning_rate": 1.6764227541046347e-06, "loss": 0.76693404, "num_input_tokens_seen": 202511905, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40429688, "step": 9400, "time_per_iteration": 3.8087432384490967 }, { "auxiliary_loss_clip": 0.01059615, "auxiliary_loss_mlp": 0.01047243, "balance_loss_clip": 1.01789403, "balance_loss_mlp": 1.01868439, "epoch": 0.5652186983315798, "flos": 18550924629120.0, "grad_norm": 2.169843438709384, "language_loss": 0.61704648, "learning_rate": 1.676038429548412e-06, "loss": 0.63811511, "num_input_tokens_seen": 202529815, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.41015625, "step": 9401, "time_per_iteration": 2.3585643768310547 }, { "auxiliary_loss_clip": 0.01054549, "auxiliary_loss_mlp": 0.01040075, "balance_loss_clip": 1.01467109, "balance_loss_mlp": 1.01697123, "epoch": 0.5652788215842477, "flos": 18477676863360.0, "grad_norm": 1.9691448556945297, "language_loss": 0.82516575, "learning_rate": 1.6756541172768453e-06, "loss": 0.84611201, "num_input_tokens_seen": 202547710, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.375, "step": 9402, "time_per_iteration": 2.3356659412384033 }, { "auxiliary_loss_clip": 0.01056453, "auxiliary_loss_mlp": 0.01034573, "balance_loss_clip": 1.01139867, "balance_loss_mlp": 1.01856196, "epoch": 0.5653389448369157, "flos": 30042761195520.0, "grad_norm": 1.509488389437512, "language_loss": 0.78701055, "learning_rate": 1.6752698173045068e-06, "loss": 0.80792087, "num_input_tokens_seen": 202568835, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.37890625, "step": 9403, "time_per_iteration": 2.4841253757476807 }, { "auxiliary_loss_clip": 0.0105799, "auxiliary_loss_mlp": 0.01043895, "balance_loss_clip": 1.01785922, "balance_loss_mlp": 1.01855779, "epoch": 0.5653990680895836, "flos": 16726608023040.0, "grad_norm": 1.5641480245386388, "language_loss": 0.69610381, "learning_rate": 1.6748855296459685e-06, "loss": 0.71712261, "num_input_tokens_seen": 202587385, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.39453125, "step": 9404, "time_per_iteration": 2.3679237365722656 }, { "auxiliary_loss_clip": 0.01055059, "auxiliary_loss_mlp": 0.01038789, "balance_loss_clip": 1.0157932, "balance_loss_mlp": 1.01783776, "epoch": 0.5654591913422516, "flos": 14537379219840.0, "grad_norm": 1.793891642667489, "language_loss": 0.68657511, "learning_rate": 1.6745012543158045e-06, "loss": 0.70751363, "num_input_tokens_seen": 202604815, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.37109375, "step": 9405, "time_per_iteration": 2.341883659362793 }, { "auxiliary_loss_clip": 0.01056363, "auxiliary_loss_mlp": 0.01040235, "balance_loss_clip": 1.01695347, "balance_loss_mlp": 1.0196743, "epoch": 0.5655193145949196, "flos": 26208809153280.0, "grad_norm": 3.208832411442583, "language_loss": 0.74926889, "learning_rate": 1.6741169913285852e-06, "loss": 0.77023482, "num_input_tokens_seen": 202623775, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.3671875, "step": 9406, "time_per_iteration": 2.4525160789489746 }, { "auxiliary_loss_clip": 0.01060249, "auxiliary_loss_mlp": 0.01043293, "balance_loss_clip": 1.01729381, "balance_loss_mlp": 1.01904523, "epoch": 0.5655794378475876, "flos": 25045398184320.0, "grad_norm": 1.7598757386560056, "language_loss": 0.80870628, "learning_rate": 1.673732740698882e-06, "loss": 0.82974172, "num_input_tokens_seen": 202643375, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.41210938, "step": 9407, "time_per_iteration": 2.4073002338409424 }, { "auxiliary_loss_clip": 0.01056825, "auxiliary_loss_mlp": 0.01041268, "balance_loss_clip": 1.01642513, "balance_loss_mlp": 1.01972699, "epoch": 0.5656395611002555, "flos": 31031431476480.0, "grad_norm": 1.4182586048414427, "language_loss": 0.72465122, "learning_rate": 1.6733485024412666e-06, "loss": 0.74563217, "num_input_tokens_seen": 202668400, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37109375, "step": 9408, "time_per_iteration": 2.5082805156707764 }, { "auxiliary_loss_clip": 0.01056635, "auxiliary_loss_mlp": 0.01040925, "balance_loss_clip": 1.01602209, "balance_loss_mlp": 1.01855147, "epoch": 0.5656996843529235, "flos": 20228501324160.0, "grad_norm": 1.9555609566229977, "language_loss": 0.82303578, "learning_rate": 1.672964276570308e-06, "loss": 0.84401143, "num_input_tokens_seen": 202685125, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.38085938, "step": 9409, "time_per_iteration": 2.3717715740203857 }, { "auxiliary_loss_clip": 0.01056563, "auxiliary_loss_mlp": 0.01039131, "balance_loss_clip": 1.01358438, "balance_loss_mlp": 1.01750541, "epoch": 0.5657598076055914, "flos": 20995193980800.0, "grad_norm": 1.7111634511952234, "language_loss": 0.79590911, "learning_rate": 1.6725800631005776e-06, "loss": 0.81686604, "num_input_tokens_seen": 202703830, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.390625, "step": 9410, "time_per_iteration": 2.3791637420654297 }, { "auxiliary_loss_clip": 0.01059537, "auxiliary_loss_mlp": 0.01042933, "balance_loss_clip": 1.01598001, "balance_loss_mlp": 1.02002931, "epoch": 0.5658199308582594, "flos": 11545217902080.0, "grad_norm": 2.1813964605172247, "language_loss": 0.84656888, "learning_rate": 1.6721958620466432e-06, "loss": 0.86759365, "num_input_tokens_seen": 202719835, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.39453125, "step": 9411, "time_per_iteration": 2.3606479167938232 }, { "auxiliary_loss_clip": 0.01061101, "auxiliary_loss_mlp": 0.01040576, "balance_loss_clip": 1.01427865, "balance_loss_mlp": 1.02061427, "epoch": 0.5658800541109275, "flos": 14171314947840.0, "grad_norm": 2.462035329990107, "language_loss": 0.69344926, "learning_rate": 1.6718116734230749e-06, "loss": 0.71446604, "num_input_tokens_seen": 202736795, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40625, "step": 9412, "time_per_iteration": 2.3773415088653564 }, { "auxiliary_loss_clip": 0.01053736, "auxiliary_loss_mlp": 0.01037664, "balance_loss_clip": 1.01600337, "balance_loss_mlp": 1.01746595, "epoch": 0.5659401773635954, "flos": 27303929769600.0, "grad_norm": 2.19064894568191, "language_loss": 0.59615433, "learning_rate": 1.6714274972444413e-06, "loss": 0.61706829, "num_input_tokens_seen": 202756900, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.36328125, "step": 9413, "time_per_iteration": 2.420997381210327 }, { "auxiliary_loss_clip": 0.01054057, "auxiliary_loss_mlp": 0.01038118, "balance_loss_clip": 1.0142045, "balance_loss_mlp": 1.01702178, "epoch": 0.5660003006162634, "flos": 16727236427520.0, "grad_norm": 1.832825870082147, "language_loss": 0.70071828, "learning_rate": 1.6710433335253092e-06, "loss": 0.72164005, "num_input_tokens_seen": 202775145, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36914062, "step": 9414, "time_per_iteration": 2.376802682876587 }, { "auxiliary_loss_clip": 0.01054142, "auxiliary_loss_mlp": 0.01037965, "balance_loss_clip": 1.01549363, "balance_loss_mlp": 1.01731658, "epoch": 0.5660604238689313, "flos": 21652364102400.0, "grad_norm": 1.6425773839279085, "language_loss": 0.79047763, "learning_rate": 1.670659182280247e-06, "loss": 0.81139874, "num_input_tokens_seen": 202794505, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3671875, "step": 9415, "time_per_iteration": 2.39070725440979 }, { "auxiliary_loss_clip": 0.01012248, "auxiliary_loss_mlp": 0.01011515, "balance_loss_clip": 1.00878489, "balance_loss_mlp": 1.00513875, "epoch": 0.5661205471215993, "flos": 68820755783040.0, "grad_norm": 0.8041251921894411, "language_loss": 0.4917706, "learning_rate": 1.670275043523822e-06, "loss": 0.51200819, "num_input_tokens_seen": 202858580, "router_z_loss_clip": 0.02734375, "router_z_loss_mlp": 0.07128906, "step": 9416, "time_per_iteration": 3.135566234588623 }, { "auxiliary_loss_clip": 0.01056526, "auxiliary_loss_mlp": 0.01043733, "balance_loss_clip": 1.01925969, "balance_loss_mlp": 1.01829219, "epoch": 0.5661806703742672, "flos": 28620504339840.0, "grad_norm": 2.5670646660297884, "language_loss": 0.64985567, "learning_rate": 1.6698909172706e-06, "loss": 0.67085826, "num_input_tokens_seen": 202878565, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.3828125, "step": 9417, "time_per_iteration": 2.4306368827819824 }, { "auxiliary_loss_clip": 0.01057156, "auxiliary_loss_mlp": 0.01042894, "balance_loss_clip": 1.01777625, "balance_loss_mlp": 1.01813793, "epoch": 0.5662407936269352, "flos": 21396869706240.0, "grad_norm": 2.0548390862183163, "language_loss": 0.70527893, "learning_rate": 1.6695068035351479e-06, "loss": 0.72627938, "num_input_tokens_seen": 202897350, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.390625, "step": 9418, "time_per_iteration": 2.3985869884490967 }, { "auxiliary_loss_clip": 0.01055809, "auxiliary_loss_mlp": 0.0104542, "balance_loss_clip": 1.01804948, "balance_loss_mlp": 1.01731563, "epoch": 0.5663009168796032, "flos": 25658997062400.0, "grad_norm": 1.8558630048884273, "language_loss": 0.66156423, "learning_rate": 1.6691227023320304e-06, "loss": 0.68257648, "num_input_tokens_seen": 202916745, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.38476562, "step": 9419, "time_per_iteration": 2.4074909687042236 }, { "auxiliary_loss_clip": 0.01011377, "auxiliary_loss_mlp": 0.0100358, "balance_loss_clip": 1.00121975, "balance_loss_mlp": 1.0043509, "epoch": 0.5663610401322712, "flos": 67926699020160.0, "grad_norm": 0.7246517064192552, "language_loss": 0.59786212, "learning_rate": 1.6687386136758135e-06, "loss": 0.61801171, "num_input_tokens_seen": 202982375, "router_z_loss_clip": 0.02355957, "router_z_loss_mlp": 0.0703125, "step": 9420, "time_per_iteration": 3.0855016708374023 }, { "auxiliary_loss_clip": 0.01053929, "auxiliary_loss_mlp": 0.01044841, "balance_loss_clip": 1.02037954, "balance_loss_mlp": 1.016922, "epoch": 0.5664211633849391, "flos": 24608180828160.0, "grad_norm": 4.534910332992511, "language_loss": 0.75317121, "learning_rate": 1.6683545375810618e-06, "loss": 0.7741589, "num_input_tokens_seen": 203002430, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37109375, "step": 9421, "time_per_iteration": 2.3974058628082275 }, { "auxiliary_loss_clip": 0.01057036, "auxiliary_loss_mlp": 0.01041709, "balance_loss_clip": 1.01630604, "balance_loss_mlp": 1.01817453, "epoch": 0.5664812866376071, "flos": 11648212012800.0, "grad_norm": 1.9005915958894368, "language_loss": 0.74140751, "learning_rate": 1.6679704740623389e-06, "loss": 0.76239491, "num_input_tokens_seen": 203019425, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38867188, "step": 9422, "time_per_iteration": 2.3696045875549316 }, { "auxiliary_loss_clip": 0.01053853, "auxiliary_loss_mlp": 0.01041614, "balance_loss_clip": 1.01976252, "balance_loss_mlp": 1.01749861, "epoch": 0.566541409890275, "flos": 24642849674880.0, "grad_norm": 1.6270598756509607, "language_loss": 0.82178295, "learning_rate": 1.6675864231342085e-06, "loss": 0.84273762, "num_input_tokens_seen": 203039035, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.36328125, "step": 9423, "time_per_iteration": 2.3969924449920654 }, { "auxiliary_loss_clip": 0.01052854, "auxiliary_loss_mlp": 0.01052975, "balance_loss_clip": 1.02592659, "balance_loss_mlp": 1.01569724, "epoch": 0.566601533142943, "flos": 22269558850560.0, "grad_norm": 1.6739056217873451, "language_loss": 0.81886715, "learning_rate": 1.6672023848112353e-06, "loss": 0.83992541, "num_input_tokens_seen": 203059320, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.37109375, "step": 9424, "time_per_iteration": 2.396890878677368 }, { "auxiliary_loss_clip": 0.01058866, "auxiliary_loss_mlp": 0.01053955, "balance_loss_clip": 1.02589333, "balance_loss_mlp": 1.01804852, "epoch": 0.5666616563956111, "flos": 29970351302400.0, "grad_norm": 2.650324527112347, "language_loss": 0.79871309, "learning_rate": 1.6668183591079805e-06, "loss": 0.81984138, "num_input_tokens_seen": 203078490, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40820312, "step": 9425, "time_per_iteration": 2.4261481761932373 }, { "auxiliary_loss_clip": 0.01057049, "auxiliary_loss_mlp": 0.01046354, "balance_loss_clip": 1.02273822, "balance_loss_mlp": 1.01878667, "epoch": 0.566721779648279, "flos": 17780601191040.0, "grad_norm": 4.393953977013748, "language_loss": 0.59929949, "learning_rate": 1.6664343460390064e-06, "loss": 0.62033355, "num_input_tokens_seen": 203096065, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3828125, "step": 9426, "time_per_iteration": 2.396416187286377 }, { "auxiliary_loss_clip": 0.01058708, "auxiliary_loss_mlp": 0.01041116, "balance_loss_clip": 1.01533079, "balance_loss_mlp": 1.01774168, "epoch": 0.566781902900947, "flos": 21032411356800.0, "grad_norm": 1.837477443292821, "language_loss": 0.8268019, "learning_rate": 1.6660503456188764e-06, "loss": 0.84780014, "num_input_tokens_seen": 203115270, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.41015625, "step": 9427, "time_per_iteration": 2.399181842803955 }, { "auxiliary_loss_clip": 0.01055136, "auxiliary_loss_mlp": 0.0104172, "balance_loss_clip": 1.01877189, "balance_loss_mlp": 1.01791191, "epoch": 0.5668420261536149, "flos": 23147484698880.0, "grad_norm": 1.9989461165912032, "language_loss": 0.86719805, "learning_rate": 1.6656663578621498e-06, "loss": 0.88816655, "num_input_tokens_seen": 203134290, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.37304688, "step": 9428, "time_per_iteration": 2.44124436378479 }, { "auxiliary_loss_clip": 0.01060285, "auxiliary_loss_mlp": 0.01045124, "balance_loss_clip": 1.01986384, "balance_loss_mlp": 1.02057052, "epoch": 0.5669021494062829, "flos": 22600500428160.0, "grad_norm": 2.454240613713165, "language_loss": 0.7437256, "learning_rate": 1.6652823827833886e-06, "loss": 0.76477975, "num_input_tokens_seen": 203152935, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3984375, "step": 9429, "time_per_iteration": 3.666529655456543 }, { "auxiliary_loss_clip": 0.01056898, "auxiliary_loss_mlp": 0.010433, "balance_loss_clip": 1.01720476, "balance_loss_mlp": 1.01699686, "epoch": 0.5669622726589508, "flos": 17380356831360.0, "grad_norm": 1.8081619030812368, "language_loss": 0.76713371, "learning_rate": 1.6648984203971538e-06, "loss": 0.78813565, "num_input_tokens_seen": 203170110, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3984375, "step": 9430, "time_per_iteration": 2.3478167057037354 }, { "auxiliary_loss_clip": 0.01057194, "auxiliary_loss_mlp": 0.01042169, "balance_loss_clip": 1.01696837, "balance_loss_mlp": 1.0176785, "epoch": 0.5670223959116188, "flos": 18762463756800.0, "grad_norm": 1.8762251426933314, "language_loss": 0.73629367, "learning_rate": 1.6645144707180032e-06, "loss": 0.75728726, "num_input_tokens_seen": 203188825, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.39453125, "step": 9431, "time_per_iteration": 3.847378969192505 }, { "auxiliary_loss_clip": 0.01052172, "auxiliary_loss_mlp": 0.01039764, "balance_loss_clip": 1.01871181, "balance_loss_mlp": 1.01828647, "epoch": 0.5670825191642868, "flos": 13552479365760.0, "grad_norm": 1.6070463436402618, "language_loss": 0.73840511, "learning_rate": 1.6641305337604984e-06, "loss": 0.75932443, "num_input_tokens_seen": 203206860, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.33984375, "step": 9432, "time_per_iteration": 2.3581840991973877 }, { "auxiliary_loss_clip": 0.01057576, "auxiliary_loss_mlp": 0.01043227, "balance_loss_clip": 1.01816881, "balance_loss_mlp": 1.01874304, "epoch": 0.5671426424169548, "flos": 22052957575680.0, "grad_norm": 2.1405731594787847, "language_loss": 0.78779054, "learning_rate": 1.663746609539197e-06, "loss": 0.80879855, "num_input_tokens_seen": 203225625, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38867188, "step": 9433, "time_per_iteration": 3.6906118392944336 }, { "auxiliary_loss_clip": 0.01059175, "auxiliary_loss_mlp": 0.01042458, "balance_loss_clip": 1.01297724, "balance_loss_mlp": 1.01850629, "epoch": 0.5672027656696227, "flos": 21322923713280.0, "grad_norm": 2.653955555850362, "language_loss": 0.65214205, "learning_rate": 1.6633626980686582e-06, "loss": 0.67315841, "num_input_tokens_seen": 203242920, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.40625, "step": 9434, "time_per_iteration": 2.3835320472717285 }, { "auxiliary_loss_clip": 0.01056426, "auxiliary_loss_mlp": 0.0104048, "balance_loss_clip": 1.01495779, "balance_loss_mlp": 1.01882911, "epoch": 0.5672628889222907, "flos": 23512920566400.0, "grad_norm": 1.6421212521881932, "language_loss": 0.67755008, "learning_rate": 1.6629787993634399e-06, "loss": 0.69851911, "num_input_tokens_seen": 203261995, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37695312, "step": 9435, "time_per_iteration": 2.3970589637756348 }, { "auxiliary_loss_clip": 0.01055247, "auxiliary_loss_mlp": 0.01037305, "balance_loss_clip": 1.01444101, "balance_loss_mlp": 1.01849532, "epoch": 0.5673230121749586, "flos": 27120810355200.0, "grad_norm": 1.5751048145444824, "language_loss": 0.72466922, "learning_rate": 1.6625949134380984e-06, "loss": 0.7455948, "num_input_tokens_seen": 203280670, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3671875, "step": 9436, "time_per_iteration": 2.3960695266723633 }, { "auxiliary_loss_clip": 0.01057485, "auxiliary_loss_mlp": 0.01039606, "balance_loss_clip": 1.01359487, "balance_loss_mlp": 1.01809764, "epoch": 0.5673831354276266, "flos": 31140569986560.0, "grad_norm": 1.4939608596607286, "language_loss": 0.74899954, "learning_rate": 1.6622110403071921e-06, "loss": 0.76997042, "num_input_tokens_seen": 203304800, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39453125, "step": 9437, "time_per_iteration": 2.4957878589630127 }, { "auxiliary_loss_clip": 0.01061432, "auxiliary_loss_mlp": 0.010445, "balance_loss_clip": 1.01723671, "balance_loss_mlp": 1.02192807, "epoch": 0.5674432586802945, "flos": 27671949077760.0, "grad_norm": 1.7619277462146563, "language_loss": 0.62179649, "learning_rate": 1.661827179985277e-06, "loss": 0.64285582, "num_input_tokens_seen": 203324060, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.39453125, "step": 9438, "time_per_iteration": 2.420778274536133 }, { "auxiliary_loss_clip": 0.01058295, "auxiliary_loss_mlp": 0.01041088, "balance_loss_clip": 1.01480246, "balance_loss_mlp": 1.0195843, "epoch": 0.5675033819329626, "flos": 26613941103360.0, "grad_norm": 1.447575633280863, "language_loss": 0.75890762, "learning_rate": 1.661443332486909e-06, "loss": 0.7799015, "num_input_tokens_seen": 203344360, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38671875, "step": 9439, "time_per_iteration": 3.8625569343566895 }, { "auxiliary_loss_clip": 0.01059876, "auxiliary_loss_mlp": 0.01043183, "balance_loss_clip": 1.01490653, "balance_loss_mlp": 1.02171004, "epoch": 0.5675635051856306, "flos": 19097385229440.0, "grad_norm": 1.8142686041357212, "language_loss": 0.84285086, "learning_rate": 1.6610594978266438e-06, "loss": 0.86388147, "num_input_tokens_seen": 203362115, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.3828125, "step": 9440, "time_per_iteration": 2.381423234939575 }, { "auxiliary_loss_clip": 0.01062558, "auxiliary_loss_mlp": 0.01050625, "balance_loss_clip": 1.02294481, "balance_loss_mlp": 1.02086508, "epoch": 0.5676236284382985, "flos": 17565361459200.0, "grad_norm": 2.0648410527645886, "language_loss": 0.77664793, "learning_rate": 1.6606756760190365e-06, "loss": 0.7977798, "num_input_tokens_seen": 203380550, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41601562, "step": 9441, "time_per_iteration": 2.3416824340820312 }, { "auxiliary_loss_clip": 0.01059085, "auxiliary_loss_mlp": 0.01041662, "balance_loss_clip": 1.01631832, "balance_loss_mlp": 1.02015376, "epoch": 0.5676837516909665, "flos": 15953352030720.0, "grad_norm": 2.0047640238209965, "language_loss": 0.8429684, "learning_rate": 1.6602918670786413e-06, "loss": 0.86397588, "num_input_tokens_seen": 203396590, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38867188, "step": 9442, "time_per_iteration": 2.385270118713379 }, { "auxiliary_loss_clip": 0.01056322, "auxiliary_loss_mlp": 0.01034776, "balance_loss_clip": 1.01266313, "balance_loss_mlp": 1.0212245, "epoch": 0.5677438749436344, "flos": 18294941473920.0, "grad_norm": 1.9635026421659756, "language_loss": 0.75874031, "learning_rate": 1.6599080710200126e-06, "loss": 0.77965128, "num_input_tokens_seen": 203414280, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3515625, "step": 9443, "time_per_iteration": 2.3514723777770996 }, { "auxiliary_loss_clip": 0.01058485, "auxiliary_loss_mlp": 0.01044186, "balance_loss_clip": 1.0198555, "balance_loss_mlp": 1.01941633, "epoch": 0.5678039981963025, "flos": 17930343479040.0, "grad_norm": 2.1419919263038665, "language_loss": 0.78862095, "learning_rate": 1.6595242878577046e-06, "loss": 0.80964768, "num_input_tokens_seen": 203433280, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.390625, "step": 9444, "time_per_iteration": 2.3738808631896973 }, { "auxiliary_loss_clip": 0.0106039, "auxiliary_loss_mlp": 0.01048381, "balance_loss_clip": 1.02213073, "balance_loss_mlp": 1.02116013, "epoch": 0.5678641214489704, "flos": 19315382958720.0, "grad_norm": 1.716331537955263, "language_loss": 0.8210746, "learning_rate": 1.6591405176062687e-06, "loss": 0.84216231, "num_input_tokens_seen": 203449935, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.39257812, "step": 9445, "time_per_iteration": 2.356755256652832 }, { "auxiliary_loss_clip": 0.01056842, "auxiliary_loss_mlp": 0.01034955, "balance_loss_clip": 1.01113653, "balance_loss_mlp": 1.01849246, "epoch": 0.5679242447016384, "flos": 27749700408960.0, "grad_norm": 1.6160276298515326, "language_loss": 0.7160629, "learning_rate": 1.658756760280259e-06, "loss": 0.73698092, "num_input_tokens_seen": 203473025, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3828125, "step": 9446, "time_per_iteration": 2.4642629623413086 }, { "auxiliary_loss_clip": 0.01059544, "auxiliary_loss_mlp": 0.01037913, "balance_loss_clip": 1.01180613, "balance_loss_mlp": 1.01952851, "epoch": 0.5679843679543063, "flos": 23767961114880.0, "grad_norm": 2.0819691741700717, "language_loss": 0.74552643, "learning_rate": 1.6583730158942276e-06, "loss": 0.76650101, "num_input_tokens_seen": 203492895, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40039062, "step": 9447, "time_per_iteration": 2.391909599304199 }, { "auxiliary_loss_clip": 0.01059726, "auxiliary_loss_mlp": 0.01039827, "balance_loss_clip": 1.01394653, "balance_loss_mlp": 1.01958382, "epoch": 0.5680444912069743, "flos": 25590741621120.0, "grad_norm": 1.9863335488184843, "language_loss": 0.76689839, "learning_rate": 1.657989284462725e-06, "loss": 0.78789389, "num_input_tokens_seen": 203513710, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40234375, "step": 9448, "time_per_iteration": 2.431476354598999 }, { "auxiliary_loss_clip": 0.01059509, "auxiliary_loss_mlp": 0.01039114, "balance_loss_clip": 1.01362681, "balance_loss_mlp": 1.0201683, "epoch": 0.5681046144596422, "flos": 23694678437760.0, "grad_norm": 2.122287393100708, "language_loss": 0.78145564, "learning_rate": 1.6576055660003038e-06, "loss": 0.80244184, "num_input_tokens_seen": 203531630, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.39257812, "step": 9449, "time_per_iteration": 2.4132816791534424 }, { "auxiliary_loss_clip": 0.01056529, "auxiliary_loss_mlp": 0.01041936, "balance_loss_clip": 1.01612735, "balance_loss_mlp": 1.01776838, "epoch": 0.5681647377123102, "flos": 27999539164800.0, "grad_norm": 1.771368793366794, "language_loss": 0.75902414, "learning_rate": 1.6572218605215128e-06, "loss": 0.78000879, "num_input_tokens_seen": 203551885, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38671875, "step": 9450, "time_per_iteration": 2.4368133544921875 }, { "auxiliary_loss_clip": 0.01061581, "auxiliary_loss_mlp": 0.01041775, "balance_loss_clip": 1.01646709, "balance_loss_mlp": 1.02082753, "epoch": 0.5682248609649782, "flos": 22746646846080.0, "grad_norm": 3.262495508048384, "language_loss": 0.6785019, "learning_rate": 1.6568381680409038e-06, "loss": 0.69953549, "num_input_tokens_seen": 203572250, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40820312, "step": 9451, "time_per_iteration": 2.4191763401031494 }, { "auxiliary_loss_clip": 0.01061772, "auxiliary_loss_mlp": 0.01045199, "balance_loss_clip": 1.01562333, "balance_loss_mlp": 1.01940846, "epoch": 0.5682849842176462, "flos": 21287521728000.0, "grad_norm": 2.2194648615807617, "language_loss": 0.73976332, "learning_rate": 1.656454488573026e-06, "loss": 0.76083302, "num_input_tokens_seen": 203590605, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.42382812, "step": 9452, "time_per_iteration": 2.3792662620544434 }, { "auxiliary_loss_clip": 0.01054967, "auxiliary_loss_mlp": 0.01038254, "balance_loss_clip": 1.01397121, "balance_loss_mlp": 1.01766038, "epoch": 0.5683451074703142, "flos": 21140642171520.0, "grad_norm": 1.5869523399215737, "language_loss": 0.71361184, "learning_rate": 1.656070822132428e-06, "loss": 0.73454404, "num_input_tokens_seen": 203610080, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.37304688, "step": 9453, "time_per_iteration": 2.415027141571045 }, { "auxiliary_loss_clip": 0.01056191, "auxiliary_loss_mlp": 0.01040598, "balance_loss_clip": 1.01586235, "balance_loss_mlp": 1.01799524, "epoch": 0.5684052307229821, "flos": 22343435020800.0, "grad_norm": 1.6019621741376417, "language_loss": 0.71509349, "learning_rate": 1.6556871687336592e-06, "loss": 0.73606133, "num_input_tokens_seen": 203630060, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3828125, "step": 9454, "time_per_iteration": 2.396164894104004 }, { "auxiliary_loss_clip": 0.01053772, "auxiliary_loss_mlp": 0.01041082, "balance_loss_clip": 1.01868248, "balance_loss_mlp": 1.01671076, "epoch": 0.5684653539756501, "flos": 21797567913600.0, "grad_norm": 2.056862292015668, "language_loss": 0.62264562, "learning_rate": 1.6553035283912671e-06, "loss": 0.64359415, "num_input_tokens_seen": 203649065, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.37109375, "step": 9455, "time_per_iteration": 2.3931291103363037 }, { "auxiliary_loss_clip": 0.01059738, "auxiliary_loss_mlp": 0.01052282, "balance_loss_clip": 1.02414906, "balance_loss_mlp": 1.01894653, "epoch": 0.568525477228318, "flos": 22998615194880.0, "grad_norm": 1.7634350807731827, "language_loss": 0.7369386, "learning_rate": 1.6549199011198e-06, "loss": 0.75805879, "num_input_tokens_seen": 203667545, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40820312, "step": 9456, "time_per_iteration": 2.377664804458618 }, { "auxiliary_loss_clip": 0.01055256, "auxiliary_loss_mlp": 0.01045655, "balance_loss_clip": 1.02162218, "balance_loss_mlp": 1.01709509, "epoch": 0.568585600480986, "flos": 21391563179520.0, "grad_norm": 2.7593444229887325, "language_loss": 0.77261573, "learning_rate": 1.6545362869338048e-06, "loss": 0.79362482, "num_input_tokens_seen": 203686025, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3828125, "step": 9457, "time_per_iteration": 2.399627685546875 }, { "auxiliary_loss_clip": 0.01056935, "auxiliary_loss_mlp": 0.01049768, "balance_loss_clip": 1.02397132, "balance_loss_mlp": 1.01751208, "epoch": 0.568645723733654, "flos": 30006067489920.0, "grad_norm": 1.8220797971127245, "language_loss": 0.67212141, "learning_rate": 1.6541526858478285e-06, "loss": 0.69318843, "num_input_tokens_seen": 203705540, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.39453125, "step": 9458, "time_per_iteration": 2.439864158630371 }, { "auxiliary_loss_clip": 0.01058031, "auxiliary_loss_mlp": 0.01040731, "balance_loss_clip": 1.01536322, "balance_loss_mlp": 1.01829338, "epoch": 0.568705846986322, "flos": 20411620738560.0, "grad_norm": 2.31795203439138, "language_loss": 0.70246398, "learning_rate": 1.6537690978764167e-06, "loss": 0.72345161, "num_input_tokens_seen": 203723670, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.39648438, "step": 9459, "time_per_iteration": 2.3814280033111572 }, { "auxiliary_loss_clip": 0.01060259, "auxiliary_loss_mlp": 0.01047423, "balance_loss_clip": 1.02106643, "balance_loss_mlp": 1.01965725, "epoch": 0.5687659702389899, "flos": 17455804012800.0, "grad_norm": 2.210188125881042, "language_loss": 0.78124559, "learning_rate": 1.6533855230341155e-06, "loss": 0.80232245, "num_input_tokens_seen": 203739705, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40625, "step": 9460, "time_per_iteration": 2.335477828979492 }, { "auxiliary_loss_clip": 0.01058284, "auxiliary_loss_mlp": 0.01048674, "balance_loss_clip": 1.0213517, "balance_loss_mlp": 1.0181129, "epoch": 0.5688260934916579, "flos": 25405038766080.0, "grad_norm": 1.6311125344805677, "language_loss": 0.73046041, "learning_rate": 1.65300196133547e-06, "loss": 0.75153005, "num_input_tokens_seen": 203759000, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40234375, "step": 9461, "time_per_iteration": 2.41902232170105 }, { "auxiliary_loss_clip": 0.01056097, "auxiliary_loss_mlp": 0.01046315, "balance_loss_clip": 1.02005363, "balance_loss_mlp": 1.01719475, "epoch": 0.5688862167443258, "flos": 21607186936320.0, "grad_norm": 1.9515234225402238, "language_loss": 0.73925102, "learning_rate": 1.6526184127950249e-06, "loss": 0.76027513, "num_input_tokens_seen": 203774295, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38867188, "step": 9462, "time_per_iteration": 2.3553099632263184 }, { "auxiliary_loss_clip": 0.01053253, "auxiliary_loss_mlp": 0.01042043, "balance_loss_clip": 1.0197506, "balance_loss_mlp": 1.0164938, "epoch": 0.5689463399969938, "flos": 22417904684160.0, "grad_norm": 2.295071393595814, "language_loss": 0.75005972, "learning_rate": 1.6522348774273246e-06, "loss": 0.77101266, "num_input_tokens_seen": 203792710, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3671875, "step": 9463, "time_per_iteration": 2.3887345790863037 }, { "auxiliary_loss_clip": 0.01056709, "auxiliary_loss_mlp": 0.01040504, "balance_loss_clip": 1.01740086, "balance_loss_mlp": 1.01801944, "epoch": 0.5690064632496618, "flos": 18295814257920.0, "grad_norm": 1.9872401227122782, "language_loss": 0.75659609, "learning_rate": 1.6518513552469123e-06, "loss": 0.77756822, "num_input_tokens_seen": 203811645, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.38671875, "step": 9464, "time_per_iteration": 2.3748157024383545 }, { "auxiliary_loss_clip": 0.01057539, "auxiliary_loss_mlp": 0.01047525, "balance_loss_clip": 1.0224669, "balance_loss_mlp": 1.01780462, "epoch": 0.5690665865023298, "flos": 21578208641280.0, "grad_norm": 1.6689193831193228, "language_loss": 0.84629148, "learning_rate": 1.6514678462683312e-06, "loss": 0.86734211, "num_input_tokens_seen": 203830040, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.39648438, "step": 9465, "time_per_iteration": 2.40130352973938 }, { "auxiliary_loss_clip": 0.01054331, "auxiliary_loss_mlp": 0.01037324, "balance_loss_clip": 1.01368475, "balance_loss_mlp": 1.01741147, "epoch": 0.5691267097549978, "flos": 24420418202880.0, "grad_norm": 1.5128674928003591, "language_loss": 0.73335814, "learning_rate": 1.651084350506125e-06, "loss": 0.75427461, "num_input_tokens_seen": 203851245, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36914062, "step": 9466, "time_per_iteration": 2.395613431930542 }, { "auxiliary_loss_clip": 0.01013885, "auxiliary_loss_mlp": 0.01005002, "balance_loss_clip": 1.00265348, "balance_loss_mlp": 1.00667953, "epoch": 0.5691868330076657, "flos": 61654238000640.0, "grad_norm": 0.7177133049195442, "language_loss": 0.55458283, "learning_rate": 1.6507008679748343e-06, "loss": 0.57477176, "num_input_tokens_seen": 203916400, "router_z_loss_clip": 0.0234375, "router_z_loss_mlp": 0.07226562, "step": 9467, "time_per_iteration": 3.0899758338928223 }, { "auxiliary_loss_clip": 0.01058604, "auxiliary_loss_mlp": 0.01039988, "balance_loss_clip": 1.01262999, "balance_loss_mlp": 1.01761281, "epoch": 0.5692469562603337, "flos": 21324110699520.0, "grad_norm": 2.0364122937605273, "language_loss": 0.64847547, "learning_rate": 1.6503173986890023e-06, "loss": 0.66946137, "num_input_tokens_seen": 203935870, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41015625, "step": 9468, "time_per_iteration": 2.3684208393096924 }, { "auxiliary_loss_clip": 0.01055798, "auxiliary_loss_mlp": 0.01039976, "balance_loss_clip": 1.01549101, "balance_loss_mlp": 1.0173862, "epoch": 0.5693070795130016, "flos": 23366774148480.0, "grad_norm": 1.7666740836438684, "language_loss": 0.80662835, "learning_rate": 1.64993394266317e-06, "loss": 0.82758605, "num_input_tokens_seen": 203954950, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.3828125, "step": 9469, "time_per_iteration": 3.639204978942871 }, { "auxiliary_loss_clip": 0.01061264, "auxiliary_loss_mlp": 0.01047305, "balance_loss_clip": 1.01830149, "balance_loss_mlp": 1.01935279, "epoch": 0.5693672027656697, "flos": 18696268085760.0, "grad_norm": 2.0420897259085504, "language_loss": 0.7110917, "learning_rate": 1.6495504999118769e-06, "loss": 0.73217738, "num_input_tokens_seen": 203972715, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.41992188, "step": 9470, "time_per_iteration": 2.354099750518799 }, { "auxiliary_loss_clip": 0.01056507, "auxiliary_loss_mlp": 0.01035809, "balance_loss_clip": 1.01257539, "balance_loss_mlp": 1.01812518, "epoch": 0.5694273260183376, "flos": 20448139887360.0, "grad_norm": 1.7076465569960073, "language_loss": 0.75448298, "learning_rate": 1.6491670704496644e-06, "loss": 0.77540612, "num_input_tokens_seen": 203990775, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.38476562, "step": 9471, "time_per_iteration": 3.779123067855835 }, { "auxiliary_loss_clip": 0.01057169, "auxiliary_loss_mlp": 0.0104089, "balance_loss_clip": 1.01554608, "balance_loss_mlp": 1.01911521, "epoch": 0.5694874492710056, "flos": 17602229721600.0, "grad_norm": 1.6135908380893915, "language_loss": 0.58758497, "learning_rate": 1.6487836542910716e-06, "loss": 0.60856557, "num_input_tokens_seen": 204008845, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38085938, "step": 9472, "time_per_iteration": 3.7279274463653564 }, { "auxiliary_loss_clip": 0.01054897, "auxiliary_loss_mlp": 0.01039118, "balance_loss_clip": 1.01473975, "balance_loss_mlp": 1.01849079, "epoch": 0.5695475725236735, "flos": 13369988355840.0, "grad_norm": 1.8961406024202287, "language_loss": 0.75419343, "learning_rate": 1.648400251450638e-06, "loss": 0.77513361, "num_input_tokens_seen": 204023755, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36523438, "step": 9473, "time_per_iteration": 2.337942361831665 }, { "auxiliary_loss_clip": 0.01015467, "auxiliary_loss_mlp": 0.01011427, "balance_loss_clip": 1.00900733, "balance_loss_mlp": 1.00771809, "epoch": 0.5696076957763415, "flos": 68170951958400.0, "grad_norm": 0.6840943689829038, "language_loss": 0.5768314, "learning_rate": 1.6480168619429023e-06, "loss": 0.59710032, "num_input_tokens_seen": 204091255, "router_z_loss_clip": 0.02416992, "router_z_loss_mlp": 0.07714844, "step": 9474, "time_per_iteration": 3.043489456176758 }, { "auxiliary_loss_clip": 0.01056583, "auxiliary_loss_mlp": 0.01041031, "balance_loss_clip": 1.01416135, "balance_loss_mlp": 1.01797414, "epoch": 0.5696678190290094, "flos": 33836912421120.0, "grad_norm": 1.8063648204545353, "language_loss": 0.55522847, "learning_rate": 1.6476334857824017e-06, "loss": 0.57620466, "num_input_tokens_seen": 204113285, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.38671875, "step": 9475, "time_per_iteration": 2.499204635620117 }, { "auxiliary_loss_clip": 0.01057611, "auxiliary_loss_mlp": 0.010457, "balance_loss_clip": 1.01886594, "balance_loss_mlp": 1.0176791, "epoch": 0.5697279422816774, "flos": 26355479241600.0, "grad_norm": 1.503299354446975, "language_loss": 0.80268013, "learning_rate": 1.647250122983675e-06, "loss": 0.82371324, "num_input_tokens_seen": 204133045, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.3984375, "step": 9476, "time_per_iteration": 2.4094772338867188 }, { "auxiliary_loss_clip": 0.01058804, "auxiliary_loss_mlp": 0.01045558, "balance_loss_clip": 1.01843786, "balance_loss_mlp": 1.01831365, "epoch": 0.5697880655343454, "flos": 22929382235520.0, "grad_norm": 2.2394583667237318, "language_loss": 0.6794852, "learning_rate": 1.6468667735612592e-06, "loss": 0.7005288, "num_input_tokens_seen": 204152590, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40429688, "step": 9477, "time_per_iteration": 2.3922054767608643 }, { "auxiliary_loss_clip": 0.01057883, "auxiliary_loss_mlp": 0.01040683, "balance_loss_clip": 1.01313365, "balance_loss_mlp": 1.01775002, "epoch": 0.5698481887870134, "flos": 26760087521280.0, "grad_norm": 1.924560840008803, "language_loss": 0.71284944, "learning_rate": 1.6464834375296906e-06, "loss": 0.73383504, "num_input_tokens_seen": 204171815, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.40234375, "step": 9478, "time_per_iteration": 2.458282709121704 }, { "auxiliary_loss_clip": 0.0105349, "auxiliary_loss_mlp": 0.01036725, "balance_loss_clip": 1.01339531, "balance_loss_mlp": 1.01735449, "epoch": 0.5699083120396814, "flos": 15741359055360.0, "grad_norm": 1.7893645376752374, "language_loss": 0.70481324, "learning_rate": 1.6461001149035055e-06, "loss": 0.72571534, "num_input_tokens_seen": 204188535, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36132812, "step": 9479, "time_per_iteration": 3.8688933849334717 }, { "auxiliary_loss_clip": 0.01054945, "auxiliary_loss_mlp": 0.01038789, "balance_loss_clip": 1.01467299, "balance_loss_mlp": 1.0175724, "epoch": 0.5699684352923493, "flos": 19536243419520.0, "grad_norm": 1.82323568223002, "language_loss": 0.72467065, "learning_rate": 1.6457168056972392e-06, "loss": 0.74560797, "num_input_tokens_seen": 204208365, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.375, "step": 9480, "time_per_iteration": 2.378706216812134 }, { "auxiliary_loss_clip": 0.01056184, "auxiliary_loss_mlp": 0.01039256, "balance_loss_clip": 1.01484203, "balance_loss_mlp": 1.01731718, "epoch": 0.5700285585450173, "flos": 16252417670400.0, "grad_norm": 2.0750684695884005, "language_loss": 0.73376185, "learning_rate": 1.6453335099254276e-06, "loss": 0.75471628, "num_input_tokens_seen": 204226560, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.38671875, "step": 9481, "time_per_iteration": 2.369520902633667 }, { "auxiliary_loss_clip": 0.01057109, "auxiliary_loss_mlp": 0.01048063, "balance_loss_clip": 1.02079964, "balance_loss_mlp": 1.01831651, "epoch": 0.5700886817976852, "flos": 19863973152000.0, "grad_norm": 1.628531207077819, "language_loss": 0.79280233, "learning_rate": 1.6449502276026041e-06, "loss": 0.81385398, "num_input_tokens_seen": 204245410, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.38867188, "step": 9482, "time_per_iteration": 2.372015953063965 }, { "auxiliary_loss_clip": 0.01056512, "auxiliary_loss_mlp": 0.01039654, "balance_loss_clip": 1.0140475, "balance_loss_mlp": 1.01798725, "epoch": 0.5701488050503533, "flos": 23840580476160.0, "grad_norm": 1.5346923315985341, "language_loss": 0.78712308, "learning_rate": 1.6445669587433043e-06, "loss": 0.80808479, "num_input_tokens_seen": 204264840, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38476562, "step": 9483, "time_per_iteration": 2.407238245010376 }, { "auxiliary_loss_clip": 0.01057018, "auxiliary_loss_mlp": 0.01044938, "balance_loss_clip": 1.01841438, "balance_loss_mlp": 1.01772571, "epoch": 0.5702089283030212, "flos": 23658543313920.0, "grad_norm": 1.6239014004759877, "language_loss": 0.82265639, "learning_rate": 1.6441837033620612e-06, "loss": 0.84367597, "num_input_tokens_seen": 204284335, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39257812, "step": 9484, "time_per_iteration": 2.3969709873199463 }, { "auxiliary_loss_clip": 0.0105825, "auxiliary_loss_mlp": 0.0104159, "balance_loss_clip": 1.01412463, "balance_loss_mlp": 1.01772046, "epoch": 0.5702690515556892, "flos": 27889946807040.0, "grad_norm": 2.116695717742763, "language_loss": 0.6185658, "learning_rate": 1.6438004614734073e-06, "loss": 0.63956416, "num_input_tokens_seen": 204302590, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.40625, "step": 9485, "time_per_iteration": 2.4600908756256104 }, { "auxiliary_loss_clip": 0.01055621, "auxiliary_loss_mlp": 0.01048169, "balance_loss_clip": 1.02182412, "balance_loss_mlp": 1.01637602, "epoch": 0.5703291748083571, "flos": 24022827106560.0, "grad_norm": 1.7851248679162126, "language_loss": 0.66980362, "learning_rate": 1.6434172330918757e-06, "loss": 0.6908415, "num_input_tokens_seen": 204323055, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.39257812, "step": 9486, "time_per_iteration": 2.380516529083252 }, { "auxiliary_loss_clip": 0.01008927, "auxiliary_loss_mlp": 0.01009179, "balance_loss_clip": 1.00649667, "balance_loss_mlp": 1.00155234, "epoch": 0.5703892980610251, "flos": 57019867061760.0, "grad_norm": 0.6717152268103516, "language_loss": 0.480317, "learning_rate": 1.6430340182319978e-06, "loss": 0.50049806, "num_input_tokens_seen": 204386160, "router_z_loss_clip": 0.02685547, "router_z_loss_mlp": 0.07373047, "step": 9487, "time_per_iteration": 3.072537660598755 }, { "auxiliary_loss_clip": 0.0105596, "auxiliary_loss_mlp": 0.010373, "balance_loss_clip": 1.01174176, "balance_loss_mlp": 1.017802, "epoch": 0.570449421313693, "flos": 24349928434560.0, "grad_norm": 1.6140677204483456, "language_loss": 0.87307906, "learning_rate": 1.6426508169083067e-06, "loss": 0.89401168, "num_input_tokens_seen": 204406315, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38085938, "step": 9488, "time_per_iteration": 2.3926823139190674 }, { "auxiliary_loss_clip": 0.01057253, "auxiliary_loss_mlp": 0.01038441, "balance_loss_clip": 1.01101041, "balance_loss_mlp": 1.01665568, "epoch": 0.570509544566361, "flos": 24827365543680.0, "grad_norm": 1.5987781746329246, "language_loss": 0.7958535, "learning_rate": 1.6422676291353314e-06, "loss": 0.81681043, "num_input_tokens_seen": 204427645, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40625, "step": 9489, "time_per_iteration": 2.418394088745117 }, { "auxiliary_loss_clip": 0.0105578, "auxiliary_loss_mlp": 0.01046164, "balance_loss_clip": 1.02219093, "balance_loss_mlp": 1.01681876, "epoch": 0.570569667819029, "flos": 21396241301760.0, "grad_norm": 1.8871972374427497, "language_loss": 0.70452535, "learning_rate": 1.641884454927604e-06, "loss": 0.72554475, "num_input_tokens_seen": 204445910, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.390625, "step": 9490, "time_per_iteration": 2.3594772815704346 }, { "auxiliary_loss_clip": 0.01056534, "auxiliary_loss_mlp": 0.01040207, "balance_loss_clip": 1.01565039, "balance_loss_mlp": 1.01782596, "epoch": 0.570629791071697, "flos": 23215775051520.0, "grad_norm": 1.5999651571646583, "language_loss": 0.76874959, "learning_rate": 1.6415012942996548e-06, "loss": 0.78971696, "num_input_tokens_seen": 204464680, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.38671875, "step": 9491, "time_per_iteration": 2.4088194370269775 }, { "auxiliary_loss_clip": 0.01010915, "auxiliary_loss_mlp": 0.01033559, "balance_loss_clip": 1.03097236, "balance_loss_mlp": 1.00323987, "epoch": 0.570689914324365, "flos": 65281505460480.0, "grad_norm": 0.8247426639473324, "language_loss": 0.57479417, "learning_rate": 1.641118147266011e-06, "loss": 0.59523892, "num_input_tokens_seen": 204525580, "router_z_loss_clip": 0.02587891, "router_z_loss_mlp": 0.07666016, "step": 9492, "time_per_iteration": 3.0613343715667725 }, { "auxiliary_loss_clip": 0.01056168, "auxiliary_loss_mlp": 0.0104244, "balance_loss_clip": 1.01689351, "balance_loss_mlp": 1.0182693, "epoch": 0.5707500375770329, "flos": 21140851639680.0, "grad_norm": 2.0961596157082347, "language_loss": 0.73081791, "learning_rate": 1.6407350138412035e-06, "loss": 0.75180399, "num_input_tokens_seen": 204541320, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37890625, "step": 9493, "time_per_iteration": 2.391953706741333 }, { "auxiliary_loss_clip": 0.01058531, "auxiliary_loss_mlp": 0.01040916, "balance_loss_clip": 1.01410604, "balance_loss_mlp": 1.01808178, "epoch": 0.5708101608297009, "flos": 20811725452800.0, "grad_norm": 1.588551633508979, "language_loss": 0.78733397, "learning_rate": 1.6403518940397606e-06, "loss": 0.80832845, "num_input_tokens_seen": 204560275, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.40429688, "step": 9494, "time_per_iteration": 2.3706045150756836 }, { "auxiliary_loss_clip": 0.01059861, "auxiliary_loss_mlp": 0.01043975, "balance_loss_clip": 1.01413691, "balance_loss_mlp": 1.01871896, "epoch": 0.5708702840823688, "flos": 25811148234240.0, "grad_norm": 2.1364965949322197, "language_loss": 0.82278192, "learning_rate": 1.6399687878762096e-06, "loss": 0.84382027, "num_input_tokens_seen": 204579430, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.41015625, "step": 9495, "time_per_iteration": 2.420576572418213 }, { "auxiliary_loss_clip": 0.01063141, "auxiliary_loss_mlp": 0.01049456, "balance_loss_clip": 1.01716232, "balance_loss_mlp": 1.01968515, "epoch": 0.5709304073350369, "flos": 23651002460160.0, "grad_norm": 2.2610602275447262, "language_loss": 0.6733402, "learning_rate": 1.6395856953650784e-06, "loss": 0.69446617, "num_input_tokens_seen": 204597710, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 0.43554688, "step": 9496, "time_per_iteration": 2.3802120685577393 }, { "auxiliary_loss_clip": 0.01060997, "auxiliary_loss_mlp": 0.01048772, "balance_loss_clip": 1.01948261, "balance_loss_mlp": 1.01932383, "epoch": 0.5709905305877048, "flos": 16106620366080.0, "grad_norm": 3.3706200131415702, "language_loss": 0.70314479, "learning_rate": 1.6392026165208938e-06, "loss": 0.72424257, "num_input_tokens_seen": 204616140, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.41601562, "step": 9497, "time_per_iteration": 2.478105068206787 }, { "auxiliary_loss_clip": 0.01060597, "auxiliary_loss_mlp": 0.01044525, "balance_loss_clip": 1.01536632, "balance_loss_mlp": 1.02046728, "epoch": 0.5710506538403728, "flos": 24749753857920.0, "grad_norm": 1.8996737020512495, "language_loss": 0.82497096, "learning_rate": 1.638819551358182e-06, "loss": 0.84602219, "num_input_tokens_seen": 204636470, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.40234375, "step": 9498, "time_per_iteration": 2.5104923248291016 }, { "auxiliary_loss_clip": 0.01059581, "auxiliary_loss_mlp": 0.01042209, "balance_loss_clip": 1.01483893, "balance_loss_mlp": 1.019629, "epoch": 0.5711107770930407, "flos": 21981141175680.0, "grad_norm": 1.8808834172717348, "language_loss": 0.67554456, "learning_rate": 1.638436499891469e-06, "loss": 0.69656247, "num_input_tokens_seen": 204656640, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.3984375, "step": 9499, "time_per_iteration": 2.4099626541137695 }, { "auxiliary_loss_clip": 0.01058559, "auxiliary_loss_mlp": 0.01043166, "balance_loss_clip": 1.01540232, "balance_loss_mlp": 1.02005672, "epoch": 0.5711709003457087, "flos": 19572972036480.0, "grad_norm": 1.6707626323624971, "language_loss": 0.72868401, "learning_rate": 1.6380534621352805e-06, "loss": 0.74970126, "num_input_tokens_seen": 204675475, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.38476562, "step": 9500, "time_per_iteration": 2.364004135131836 }, { "auxiliary_loss_clip": 0.01059936, "auxiliary_loss_mlp": 0.01042727, "balance_loss_clip": 1.0158329, "balance_loss_mlp": 1.01942039, "epoch": 0.5712310235983766, "flos": 24241557974400.0, "grad_norm": 1.9588612500759415, "language_loss": 0.77825814, "learning_rate": 1.6376704381041407e-06, "loss": 0.7992847, "num_input_tokens_seen": 204695385, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40625, "step": 9501, "time_per_iteration": 2.4257142543792725 }, { "auxiliary_loss_clip": 0.01059119, "auxiliary_loss_mlp": 0.01040579, "balance_loss_clip": 1.01571202, "balance_loss_mlp": 1.01927018, "epoch": 0.5712911468510447, "flos": 20995089246720.0, "grad_norm": 1.5701910873817941, "language_loss": 0.75599229, "learning_rate": 1.6372874278125742e-06, "loss": 0.77698922, "num_input_tokens_seen": 204714730, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3984375, "step": 9502, "time_per_iteration": 2.366684913635254 }, { "auxiliary_loss_clip": 0.01056732, "auxiliary_loss_mlp": 0.01039625, "balance_loss_clip": 1.01599777, "balance_loss_mlp": 1.01927543, "epoch": 0.5713512701037126, "flos": 18915976471680.0, "grad_norm": 1.5411826006344016, "language_loss": 0.83148485, "learning_rate": 1.636904431275105e-06, "loss": 0.8524484, "num_input_tokens_seen": 204735025, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.375, "step": 9503, "time_per_iteration": 2.4386911392211914 }, { "auxiliary_loss_clip": 0.01057823, "auxiliary_loss_mlp": 0.01041224, "balance_loss_clip": 1.01578474, "balance_loss_mlp": 1.0204879, "epoch": 0.5714113933563806, "flos": 17412686616960.0, "grad_norm": 2.108042959499498, "language_loss": 0.87428325, "learning_rate": 1.6365214485062553e-06, "loss": 0.89527369, "num_input_tokens_seen": 204751365, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37304688, "step": 9504, "time_per_iteration": 2.3479714393615723 }, { "auxiliary_loss_clip": 0.01056442, "auxiliary_loss_mlp": 0.01039399, "balance_loss_clip": 1.01435328, "balance_loss_mlp": 1.01867616, "epoch": 0.5714715166090486, "flos": 20192331288960.0, "grad_norm": 1.693219939484887, "language_loss": 0.76204503, "learning_rate": 1.6361384795205496e-06, "loss": 0.78300345, "num_input_tokens_seen": 204768980, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37890625, "step": 9505, "time_per_iteration": 2.362290859222412 }, { "auxiliary_loss_clip": 0.01055754, "auxiliary_loss_mlp": 0.01036306, "balance_loss_clip": 1.01275039, "balance_loss_mlp": 1.01777506, "epoch": 0.5715316398617165, "flos": 18550680249600.0, "grad_norm": 1.4928605390946246, "language_loss": 0.8296628, "learning_rate": 1.635755524332509e-06, "loss": 0.85058331, "num_input_tokens_seen": 204788110, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37890625, "step": 9506, "time_per_iteration": 2.3837249279022217 }, { "auxiliary_loss_clip": 0.01056486, "auxiliary_loss_mlp": 0.010354, "balance_loss_clip": 1.01171279, "balance_loss_mlp": 1.01831138, "epoch": 0.5715917631143845, "flos": 18477223015680.0, "grad_norm": 2.3766490509489486, "language_loss": 0.7835598, "learning_rate": 1.6353725829566552e-06, "loss": 0.80447865, "num_input_tokens_seen": 204807240, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.38085938, "step": 9507, "time_per_iteration": 2.3627235889434814 }, { "auxiliary_loss_clip": 0.01058824, "auxiliary_loss_mlp": 0.01048981, "balance_loss_clip": 1.0206089, "balance_loss_mlp": 1.01902151, "epoch": 0.5716518863670524, "flos": 24019021768320.0, "grad_norm": 1.701103427389121, "language_loss": 0.69872522, "learning_rate": 1.63498965540751e-06, "loss": 0.71980327, "num_input_tokens_seen": 204826415, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.3984375, "step": 9508, "time_per_iteration": 3.7153103351593018 }, { "auxiliary_loss_clip": 0.01056738, "auxiliary_loss_mlp": 0.01037366, "balance_loss_clip": 1.01257086, "balance_loss_mlp": 1.01773107, "epoch": 0.5717120096197205, "flos": 17818586616960.0, "grad_norm": 1.9625765442518446, "language_loss": 0.81152916, "learning_rate": 1.634606741699593e-06, "loss": 0.83247024, "num_input_tokens_seen": 204844305, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.390625, "step": 9509, "time_per_iteration": 2.3681812286376953 }, { "auxiliary_loss_clip": 0.01055449, "auxiliary_loss_mlp": 0.01042452, "balance_loss_clip": 1.01763272, "balance_loss_mlp": 1.01719654, "epoch": 0.5717721328723884, "flos": 21865125127680.0, "grad_norm": 1.9993386053172415, "language_loss": 0.7425251, "learning_rate": 1.6342238418474255e-06, "loss": 0.76350415, "num_input_tokens_seen": 204861765, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3828125, "step": 9510, "time_per_iteration": 2.356755256652832 }, { "auxiliary_loss_clip": 0.01057216, "auxiliary_loss_mlp": 0.01041037, "balance_loss_clip": 1.01690948, "balance_loss_mlp": 1.01827407, "epoch": 0.5718322561250564, "flos": 28436407407360.0, "grad_norm": 1.492955321518968, "language_loss": 0.7017312, "learning_rate": 1.6338409558655264e-06, "loss": 0.72271383, "num_input_tokens_seen": 204882505, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.390625, "step": 9511, "time_per_iteration": 3.879596471786499 }, { "auxiliary_loss_clip": 0.01056255, "auxiliary_loss_mlp": 0.01038426, "balance_loss_clip": 1.01440561, "balance_loss_mlp": 1.01791513, "epoch": 0.5718923793777243, "flos": 13551013088640.0, "grad_norm": 6.805787727722015, "language_loss": 0.63699365, "learning_rate": 1.6334580837684152e-06, "loss": 0.65794045, "num_input_tokens_seen": 204899830, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3828125, "step": 9512, "time_per_iteration": 3.805906057357788 }, { "auxiliary_loss_clip": 0.01054895, "auxiliary_loss_mlp": 0.01042068, "balance_loss_clip": 1.01839328, "balance_loss_mlp": 1.01691353, "epoch": 0.5719525026303923, "flos": 17821065323520.0, "grad_norm": 2.8229518966912623, "language_loss": 0.76595831, "learning_rate": 1.6330752255706104e-06, "loss": 0.78692794, "num_input_tokens_seen": 204918100, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.38085938, "step": 9513, "time_per_iteration": 2.3460934162139893 }, { "auxiliary_loss_clip": 0.01012936, "auxiliary_loss_mlp": 0.01011846, "balance_loss_clip": 1.00882959, "balance_loss_mlp": 1.00511169, "epoch": 0.5720126258830602, "flos": 61295262600960.0, "grad_norm": 0.8952054791590704, "language_loss": 0.66925633, "learning_rate": 1.6326923812866288e-06, "loss": 0.68950415, "num_input_tokens_seen": 204972925, "router_z_loss_clip": 0.03015137, "router_z_loss_mlp": 0.078125, "step": 9514, "time_per_iteration": 3.021477222442627 }, { "auxiliary_loss_clip": 0.01059387, "auxiliary_loss_mlp": 0.01044944, "balance_loss_clip": 1.01802695, "balance_loss_mlp": 1.01852405, "epoch": 0.5720727491357283, "flos": 23986901450880.0, "grad_norm": 1.8262650899168478, "language_loss": 0.82732415, "learning_rate": 1.63230955093099e-06, "loss": 0.84836745, "num_input_tokens_seen": 204990910, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.41015625, "step": 9515, "time_per_iteration": 2.3834378719329834 }, { "auxiliary_loss_clip": 0.0105251, "auxiliary_loss_mlp": 0.01044216, "balance_loss_clip": 1.01924133, "balance_loss_mlp": 1.01613271, "epoch": 0.5721328723883962, "flos": 23404270815360.0, "grad_norm": 1.5716611387768071, "language_loss": 0.86725569, "learning_rate": 1.6319267345182092e-06, "loss": 0.88822293, "num_input_tokens_seen": 205010500, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36328125, "step": 9516, "time_per_iteration": 2.4139959812164307 }, { "auxiliary_loss_clip": 0.01054065, "auxiliary_loss_mlp": 0.01043584, "balance_loss_clip": 1.01893115, "balance_loss_mlp": 1.01662636, "epoch": 0.5721929956410642, "flos": 18803975230080.0, "grad_norm": 1.6856519116282316, "language_loss": 0.8856144, "learning_rate": 1.6315439320628038e-06, "loss": 0.90659088, "num_input_tokens_seen": 205028560, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.375, "step": 9517, "time_per_iteration": 2.3733081817626953 }, { "auxiliary_loss_clip": 0.01055652, "auxiliary_loss_mlp": 0.01048178, "balance_loss_clip": 1.02263129, "balance_loss_mlp": 1.01839757, "epoch": 0.5722531188937322, "flos": 27195489486720.0, "grad_norm": 1.818854795603614, "language_loss": 0.8590256, "learning_rate": 1.6311611435792893e-06, "loss": 0.88006389, "num_input_tokens_seen": 205048650, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37304688, "step": 9518, "time_per_iteration": 2.4328629970550537 }, { "auxiliary_loss_clip": 0.0105394, "auxiliary_loss_mlp": 0.01037669, "balance_loss_clip": 1.01448298, "balance_loss_mlp": 1.01744103, "epoch": 0.5723132421464001, "flos": 15194758809600.0, "grad_norm": 1.8079125390897117, "language_loss": 0.79965115, "learning_rate": 1.6307783690821812e-06, "loss": 0.82056725, "num_input_tokens_seen": 205066480, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36523438, "step": 9519, "time_per_iteration": 3.889305591583252 }, { "auxiliary_loss_clip": 0.01056241, "auxiliary_loss_mlp": 0.01038723, "balance_loss_clip": 1.0157032, "balance_loss_mlp": 1.01849544, "epoch": 0.5723733653990681, "flos": 27598212552960.0, "grad_norm": 1.4024176519535674, "language_loss": 0.83497787, "learning_rate": 1.6303956085859944e-06, "loss": 0.85592747, "num_input_tokens_seen": 205087475, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.37890625, "step": 9520, "time_per_iteration": 2.4327268600463867 }, { "auxiliary_loss_clip": 0.01057276, "auxiliary_loss_mlp": 0.0104164, "balance_loss_clip": 1.01536608, "balance_loss_mlp": 1.01936615, "epoch": 0.572433488651736, "flos": 18221903176320.0, "grad_norm": 1.9220522304433219, "language_loss": 0.74948633, "learning_rate": 1.630012862105243e-06, "loss": 0.77047551, "num_input_tokens_seen": 205106495, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.37890625, "step": 9521, "time_per_iteration": 2.3500499725341797 }, { "auxiliary_loss_clip": 0.01056051, "auxiliary_loss_mlp": 0.01043779, "balance_loss_clip": 1.01934075, "balance_loss_mlp": 1.01744485, "epoch": 0.5724936119044041, "flos": 31247753460480.0, "grad_norm": 2.0142121741845957, "language_loss": 0.79991895, "learning_rate": 1.6296301296544415e-06, "loss": 0.82091725, "num_input_tokens_seen": 205128285, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.38671875, "step": 9522, "time_per_iteration": 2.4569313526153564 }, { "auxiliary_loss_clip": 0.01054261, "auxiliary_loss_mlp": 0.01039782, "balance_loss_clip": 1.01707304, "balance_loss_mlp": 1.01765156, "epoch": 0.572553735157072, "flos": 19201356858240.0, "grad_norm": 1.6144533012989668, "language_loss": 0.72739983, "learning_rate": 1.629247411248102e-06, "loss": 0.74834025, "num_input_tokens_seen": 205146595, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3671875, "step": 9523, "time_per_iteration": 2.3570334911346436 }, { "auxiliary_loss_clip": 0.01053823, "auxiliary_loss_mlp": 0.01037407, "balance_loss_clip": 1.01519811, "balance_loss_mlp": 1.01759934, "epoch": 0.57261385840974, "flos": 21213855025920.0, "grad_norm": 2.201312971851895, "language_loss": 0.71506977, "learning_rate": 1.628864706900738e-06, "loss": 0.73598212, "num_input_tokens_seen": 205164295, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.36328125, "step": 9524, "time_per_iteration": 2.392056941986084 }, { "auxiliary_loss_clip": 0.01056607, "auxiliary_loss_mlp": 0.01036714, "balance_loss_clip": 1.01258552, "balance_loss_mlp": 1.01898026, "epoch": 0.5726739816624079, "flos": 33983128661760.0, "grad_norm": 1.6363937201212755, "language_loss": 0.66453052, "learning_rate": 1.6284820166268615e-06, "loss": 0.68546373, "num_input_tokens_seen": 205185380, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37695312, "step": 9525, "time_per_iteration": 2.465132474899292 }, { "auxiliary_loss_clip": 0.01054315, "auxiliary_loss_mlp": 0.01036478, "balance_loss_clip": 1.01342297, "balance_loss_mlp": 1.01714325, "epoch": 0.5727341049150759, "flos": 24274935100800.0, "grad_norm": 1.772454948715186, "language_loss": 0.73324937, "learning_rate": 1.628099340440984e-06, "loss": 0.7541573, "num_input_tokens_seen": 205204895, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.37109375, "step": 9526, "time_per_iteration": 2.418215274810791 }, { "auxiliary_loss_clip": 0.01054861, "auxiliary_loss_mlp": 0.0103931, "balance_loss_clip": 1.01562321, "balance_loss_mlp": 1.01872015, "epoch": 0.5727942281677438, "flos": 28399364588160.0, "grad_norm": 1.6067947480641316, "language_loss": 0.81519699, "learning_rate": 1.6277166783576176e-06, "loss": 0.83613873, "num_input_tokens_seen": 205223440, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36132812, "step": 9527, "time_per_iteration": 2.435356855392456 }, { "auxiliary_loss_clip": 0.01055246, "auxiliary_loss_mlp": 0.01039339, "balance_loss_clip": 1.0146389, "balance_loss_mlp": 1.0177784, "epoch": 0.5728543514204119, "flos": 19535754660480.0, "grad_norm": 1.631690567122575, "language_loss": 0.73218423, "learning_rate": 1.6273340303912713e-06, "loss": 0.75313008, "num_input_tokens_seen": 205242800, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.375, "step": 9528, "time_per_iteration": 2.3709001541137695 }, { "auxiliary_loss_clip": 0.01057479, "auxiliary_loss_mlp": 0.01042835, "balance_loss_clip": 1.01783657, "balance_loss_mlp": 1.01927447, "epoch": 0.5729144746730798, "flos": 21505694014080.0, "grad_norm": 1.9557791997165668, "language_loss": 0.87040937, "learning_rate": 1.6269513965564557e-06, "loss": 0.8914125, "num_input_tokens_seen": 205259465, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3828125, "step": 9529, "time_per_iteration": 2.364185333251953 }, { "auxiliary_loss_clip": 0.01011718, "auxiliary_loss_mlp": 0.01004513, "balance_loss_clip": 1.00197423, "balance_loss_mlp": 1.00431764, "epoch": 0.5729745979257478, "flos": 58678626533760.0, "grad_norm": 0.7642069643283972, "language_loss": 0.56201917, "learning_rate": 1.6265687768676813e-06, "loss": 0.58218151, "num_input_tokens_seen": 205314100, "router_z_loss_clip": 0.02539062, "router_z_loss_mlp": 0.07421875, "step": 9530, "time_per_iteration": 2.889549732208252 }, { "auxiliary_loss_clip": 0.01059109, "auxiliary_loss_mlp": 0.01038586, "balance_loss_clip": 1.01439846, "balance_loss_mlp": 1.02073693, "epoch": 0.5730347211784158, "flos": 18551099185920.0, "grad_norm": 1.7934784479854953, "language_loss": 0.68321764, "learning_rate": 1.6261861713394553e-06, "loss": 0.70419455, "num_input_tokens_seen": 205333420, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3828125, "step": 9531, "time_per_iteration": 2.3667116165161133 }, { "auxiliary_loss_clip": 0.01057494, "auxiliary_loss_mlp": 0.01042403, "balance_loss_clip": 1.01683283, "balance_loss_mlp": 1.01890993, "epoch": 0.5730948444310837, "flos": 38030051197440.0, "grad_norm": 1.8639937626570713, "language_loss": 0.76890218, "learning_rate": 1.6258035799862876e-06, "loss": 0.78990114, "num_input_tokens_seen": 205350995, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38671875, "step": 9532, "time_per_iteration": 2.490511655807495 }, { "auxiliary_loss_clip": 0.01056267, "auxiliary_loss_mlp": 0.01042297, "balance_loss_clip": 1.01871789, "balance_loss_mlp": 1.01831198, "epoch": 0.5731549676837517, "flos": 25225934158080.0, "grad_norm": 1.3439861470678454, "language_loss": 0.7978667, "learning_rate": 1.625421002822686e-06, "loss": 0.81885237, "num_input_tokens_seen": 205372675, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37890625, "step": 9533, "time_per_iteration": 2.510737419128418 }, { "auxiliary_loss_clip": 0.01056153, "auxiliary_loss_mlp": 0.01038047, "balance_loss_clip": 1.01638699, "balance_loss_mlp": 1.01925421, "epoch": 0.5732150909364196, "flos": 23367088350720.0, "grad_norm": 1.561395890016325, "language_loss": 0.86078918, "learning_rate": 1.6250384398631574e-06, "loss": 0.88173115, "num_input_tokens_seen": 205392590, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.36914062, "step": 9534, "time_per_iteration": 2.3958029747009277 }, { "auxiliary_loss_clip": 0.01057189, "auxiliary_loss_mlp": 0.01054574, "balance_loss_clip": 1.02750206, "balance_loss_mlp": 1.01830578, "epoch": 0.5732752141890877, "flos": 23078147005440.0, "grad_norm": 1.7602103403020786, "language_loss": 0.76200438, "learning_rate": 1.6246558911222085e-06, "loss": 0.78312206, "num_input_tokens_seen": 205414885, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.38867188, "step": 9535, "time_per_iteration": 2.446237087249756 }, { "auxiliary_loss_clip": 0.01059407, "auxiliary_loss_mlp": 0.01045488, "balance_loss_clip": 1.02075195, "balance_loss_mlp": 1.01936531, "epoch": 0.5733353374417556, "flos": 24351150332160.0, "grad_norm": 1.4825531042901026, "language_loss": 0.71928382, "learning_rate": 1.624273356614346e-06, "loss": 0.74033284, "num_input_tokens_seen": 205434440, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.40039062, "step": 9536, "time_per_iteration": 2.4280905723571777 }, { "auxiliary_loss_clip": 0.01055826, "auxiliary_loss_mlp": 0.01044634, "balance_loss_clip": 1.02026796, "balance_loss_mlp": 1.0179615, "epoch": 0.5733954606944236, "flos": 27197619079680.0, "grad_norm": 1.6806462526849715, "language_loss": 0.70715308, "learning_rate": 1.6238908363540755e-06, "loss": 0.7281577, "num_input_tokens_seen": 205454225, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37890625, "step": 9537, "time_per_iteration": 2.427006483078003 }, { "auxiliary_loss_clip": 0.01055895, "auxiliary_loss_mlp": 0.01044823, "balance_loss_clip": 1.01894236, "balance_loss_mlp": 1.01734495, "epoch": 0.5734555839470915, "flos": 28763927671680.0, "grad_norm": 1.7944879743907458, "language_loss": 0.64101231, "learning_rate": 1.623508330355902e-06, "loss": 0.66201949, "num_input_tokens_seen": 205474750, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38476562, "step": 9538, "time_per_iteration": 2.4524643421173096 }, { "auxiliary_loss_clip": 0.0105472, "auxiliary_loss_mlp": 0.0104072, "balance_loss_clip": 1.0155195, "balance_loss_mlp": 1.01741087, "epoch": 0.5735157071997595, "flos": 22965691916160.0, "grad_norm": 2.0682741367384656, "language_loss": 0.84145463, "learning_rate": 1.6231258386343306e-06, "loss": 0.86240906, "num_input_tokens_seen": 205495495, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37304688, "step": 9539, "time_per_iteration": 2.409797191619873 }, { "auxiliary_loss_clip": 0.01056875, "auxiliary_loss_mlp": 0.01043167, "balance_loss_clip": 1.01835966, "balance_loss_mlp": 1.0178144, "epoch": 0.5735758304524274, "flos": 18988456187520.0, "grad_norm": 2.129863662189192, "language_loss": 0.74585205, "learning_rate": 1.6227433612038647e-06, "loss": 0.7668525, "num_input_tokens_seen": 205510070, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.390625, "step": 9540, "time_per_iteration": 2.382070779800415 }, { "auxiliary_loss_clip": 0.01054394, "auxiliary_loss_mlp": 0.01043062, "balance_loss_clip": 1.02009046, "balance_loss_mlp": 1.01670384, "epoch": 0.5736359537050955, "flos": 28396816058880.0, "grad_norm": 1.746768083030801, "language_loss": 0.81313366, "learning_rate": 1.6223608980790089e-06, "loss": 0.83410823, "num_input_tokens_seen": 205530190, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.37695312, "step": 9541, "time_per_iteration": 2.4585421085357666 }, { "auxiliary_loss_clip": 0.01058591, "auxiliary_loss_mlp": 0.01040574, "balance_loss_clip": 1.0141809, "balance_loss_mlp": 1.01847827, "epoch": 0.5736960769577634, "flos": 15626460170880.0, "grad_norm": 2.6077008280685914, "language_loss": 0.6650753, "learning_rate": 1.6219784492742654e-06, "loss": 0.68606693, "num_input_tokens_seen": 205547380, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40234375, "step": 9542, "time_per_iteration": 2.399834632873535 }, { "auxiliary_loss_clip": 0.01055622, "auxiliary_loss_mlp": 0.01037131, "balance_loss_clip": 1.0138849, "balance_loss_mlp": 1.01652551, "epoch": 0.5737562002104314, "flos": 18003032663040.0, "grad_norm": 1.9131403423004232, "language_loss": 0.84907281, "learning_rate": 1.6215960148041365e-06, "loss": 0.87000036, "num_input_tokens_seen": 205566540, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.390625, "step": 9543, "time_per_iteration": 2.3519742488861084 }, { "auxiliary_loss_clip": 0.01058311, "auxiliary_loss_mlp": 0.01045639, "balance_loss_clip": 1.01875734, "balance_loss_mlp": 1.01785409, "epoch": 0.5738163234630994, "flos": 20697315327360.0, "grad_norm": 1.8362391959046571, "language_loss": 0.74657309, "learning_rate": 1.6212135946831257e-06, "loss": 0.76761252, "num_input_tokens_seen": 205584200, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40625, "step": 9544, "time_per_iteration": 2.3877358436584473 }, { "auxiliary_loss_clip": 0.01056761, "auxiliary_loss_mlp": 0.01042732, "balance_loss_clip": 1.01726878, "balance_loss_mlp": 1.01714706, "epoch": 0.5738764467157673, "flos": 23148182926080.0, "grad_norm": 1.8905485670736228, "language_loss": 0.77390069, "learning_rate": 1.620831188925733e-06, "loss": 0.79489565, "num_input_tokens_seen": 205604675, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.39648438, "step": 9545, "time_per_iteration": 2.3817641735076904 }, { "auxiliary_loss_clip": 0.01055785, "auxiliary_loss_mlp": 0.01045441, "balance_loss_clip": 1.02039492, "balance_loss_mlp": 1.01711965, "epoch": 0.5739365699684353, "flos": 29491762118400.0, "grad_norm": 2.8624954256366597, "language_loss": 0.57990003, "learning_rate": 1.620448797546459e-06, "loss": 0.60091233, "num_input_tokens_seen": 205624680, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38671875, "step": 9546, "time_per_iteration": 2.4600863456726074 }, { "auxiliary_loss_clip": 0.01055828, "auxiliary_loss_mlp": 0.01039367, "balance_loss_clip": 1.01486945, "balance_loss_mlp": 1.01726961, "epoch": 0.5739966932211032, "flos": 14026390427520.0, "grad_norm": 2.233097543508225, "language_loss": 0.77461058, "learning_rate": 1.6200664205598055e-06, "loss": 0.79556251, "num_input_tokens_seen": 205641950, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.38476562, "step": 9547, "time_per_iteration": 2.344376802444458 }, { "auxiliary_loss_clip": 0.01056066, "auxiliary_loss_mlp": 0.01041373, "balance_loss_clip": 1.01667285, "balance_loss_mlp": 1.01750493, "epoch": 0.5740568164737713, "flos": 19061040637440.0, "grad_norm": 1.974300833717856, "language_loss": 0.76437676, "learning_rate": 1.6196840579802704e-06, "loss": 0.78535116, "num_input_tokens_seen": 205660130, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38476562, "step": 9548, "time_per_iteration": 3.6503734588623047 }, { "auxiliary_loss_clip": 0.01055971, "auxiliary_loss_mlp": 0.01043589, "balance_loss_clip": 1.01838863, "balance_loss_mlp": 1.01619649, "epoch": 0.5741169397264392, "flos": 22126729011840.0, "grad_norm": 2.1838803591765314, "language_loss": 0.71336406, "learning_rate": 1.619301709822355e-06, "loss": 0.73435968, "num_input_tokens_seen": 205678895, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.39648438, "step": 9549, "time_per_iteration": 2.381854295730591 }, { "auxiliary_loss_clip": 0.01057214, "auxiliary_loss_mlp": 0.01041288, "balance_loss_clip": 1.01758885, "balance_loss_mlp": 1.01906312, "epoch": 0.5741770629791072, "flos": 24935666181120.0, "grad_norm": 1.8544123467706213, "language_loss": 0.80050874, "learning_rate": 1.6189193761005564e-06, "loss": 0.82149374, "num_input_tokens_seen": 205698450, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3828125, "step": 9550, "time_per_iteration": 2.423672914505005 }, { "auxiliary_loss_clip": 0.01055947, "auxiliary_loss_mlp": 0.01038587, "balance_loss_clip": 1.01159835, "balance_loss_mlp": 1.01665938, "epoch": 0.5742371862317751, "flos": 18800623739520.0, "grad_norm": 1.900863217902449, "language_loss": 0.6847226, "learning_rate": 1.6185370568293727e-06, "loss": 0.70566797, "num_input_tokens_seen": 205714870, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.39257812, "step": 9551, "time_per_iteration": 3.792492628097534 }, { "auxiliary_loss_clip": 0.01058259, "auxiliary_loss_mlp": 0.01041109, "balance_loss_clip": 1.01620662, "balance_loss_mlp": 1.01881564, "epoch": 0.5742973094844431, "flos": 24459555703680.0, "grad_norm": 2.385278242115737, "language_loss": 0.73599768, "learning_rate": 1.6181547520233031e-06, "loss": 0.75699139, "num_input_tokens_seen": 205736045, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.39453125, "step": 9552, "time_per_iteration": 3.800985813140869 }, { "auxiliary_loss_clip": 0.01056547, "auxiliary_loss_mlp": 0.01044372, "balance_loss_clip": 1.0186224, "balance_loss_mlp": 1.01777923, "epoch": 0.574357432737111, "flos": 21651700786560.0, "grad_norm": 2.076525830401587, "language_loss": 0.81585115, "learning_rate": 1.617772461696843e-06, "loss": 0.8368603, "num_input_tokens_seen": 205754445, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38671875, "step": 9553, "time_per_iteration": 2.4008424282073975 }, { "auxiliary_loss_clip": 0.01058238, "auxiliary_loss_mlp": 0.01038068, "balance_loss_clip": 1.01131737, "balance_loss_mlp": 1.01804817, "epoch": 0.5744175559897791, "flos": 16543802810880.0, "grad_norm": 2.0506797752593813, "language_loss": 0.84328806, "learning_rate": 1.6173901858644895e-06, "loss": 0.86425114, "num_input_tokens_seen": 205770595, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.40234375, "step": 9554, "time_per_iteration": 2.3592336177825928 }, { "auxiliary_loss_clip": 0.01059461, "auxiliary_loss_mlp": 0.01042304, "balance_loss_clip": 1.01592314, "balance_loss_mlp": 1.0188024, "epoch": 0.574477679242447, "flos": 24206435280000.0, "grad_norm": 1.5666021644075383, "language_loss": 0.72358561, "learning_rate": 1.6170079245407385e-06, "loss": 0.74460334, "num_input_tokens_seen": 205791935, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40625, "step": 9555, "time_per_iteration": 2.4518346786499023 }, { "auxiliary_loss_clip": 0.01055863, "auxiliary_loss_mlp": 0.01042305, "balance_loss_clip": 1.01567411, "balance_loss_mlp": 1.0176785, "epoch": 0.574537802495115, "flos": 14902116860160.0, "grad_norm": 4.120499054832592, "language_loss": 0.74109733, "learning_rate": 1.6166256777400853e-06, "loss": 0.762079, "num_input_tokens_seen": 205807260, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.3828125, "step": 9556, "time_per_iteration": 2.3397746086120605 }, { "auxiliary_loss_clip": 0.01056754, "auxiliary_loss_mlp": 0.01043192, "balance_loss_clip": 1.01632237, "balance_loss_mlp": 1.01822662, "epoch": 0.5745979257477829, "flos": 24933850790400.0, "grad_norm": 1.5302490802948858, "language_loss": 0.74935424, "learning_rate": 1.6162434454770248e-06, "loss": 0.77035373, "num_input_tokens_seen": 205826885, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.38671875, "step": 9557, "time_per_iteration": 2.424532651901245 }, { "auxiliary_loss_clip": 0.01056519, "auxiliary_loss_mlp": 0.01044479, "balance_loss_clip": 1.01822901, "balance_loss_mlp": 1.01757216, "epoch": 0.5746580490004509, "flos": 17234873729280.0, "grad_norm": 2.064412403423656, "language_loss": 0.68950069, "learning_rate": 1.6158612277660514e-06, "loss": 0.71051061, "num_input_tokens_seen": 205844630, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38867188, "step": 9558, "time_per_iteration": 2.3553335666656494 }, { "auxiliary_loss_clip": 0.01062057, "auxiliary_loss_mlp": 0.01048792, "balance_loss_clip": 1.01975238, "balance_loss_mlp": 1.01969409, "epoch": 0.5747181722531189, "flos": 13187043498240.0, "grad_norm": 1.9105734592067514, "language_loss": 0.71823871, "learning_rate": 1.615479024621659e-06, "loss": 0.73934722, "num_input_tokens_seen": 205860960, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.42382812, "step": 9559, "time_per_iteration": 2.3596718311309814 }, { "auxiliary_loss_clip": 0.01057058, "auxiliary_loss_mlp": 0.01034756, "balance_loss_clip": 1.01146233, "balance_loss_mlp": 1.01893568, "epoch": 0.5747782955057869, "flos": 22961991312000.0, "grad_norm": 1.6117850597342274, "language_loss": 0.80160224, "learning_rate": 1.6150968360583398e-06, "loss": 0.82252038, "num_input_tokens_seen": 205880675, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.3828125, "step": 9560, "time_per_iteration": 3.8129396438598633 }, { "auxiliary_loss_clip": 0.01058193, "auxiliary_loss_mlp": 0.01040991, "balance_loss_clip": 1.01557541, "balance_loss_mlp": 1.0187273, "epoch": 0.5748384187584549, "flos": 23402141222400.0, "grad_norm": 1.6703156139946893, "language_loss": 0.65132344, "learning_rate": 1.614714662090588e-06, "loss": 0.67231524, "num_input_tokens_seen": 205900050, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.39453125, "step": 9561, "time_per_iteration": 2.3952810764312744 }, { "auxiliary_loss_clip": 0.01062905, "auxiliary_loss_mlp": 0.01047404, "balance_loss_clip": 1.019068, "balance_loss_mlp": 1.02075672, "epoch": 0.5748985420111228, "flos": 17784546174720.0, "grad_norm": 2.2816899113471503, "language_loss": 0.72708106, "learning_rate": 1.6143325027328945e-06, "loss": 0.7481842, "num_input_tokens_seen": 205918855, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.421875, "step": 9562, "time_per_iteration": 2.360301971435547 }, { "auxiliary_loss_clip": 0.01057477, "auxiliary_loss_mlp": 0.01044444, "balance_loss_clip": 1.02041101, "balance_loss_mlp": 1.01859164, "epoch": 0.5749586652637908, "flos": 19865195049600.0, "grad_norm": 1.954112149862691, "language_loss": 0.84965956, "learning_rate": 1.613950357999751e-06, "loss": 0.87067872, "num_input_tokens_seen": 205936970, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.38867188, "step": 9563, "time_per_iteration": 2.3926923274993896 }, { "auxiliary_loss_clip": 0.01060864, "auxiliary_loss_mlp": 0.01050624, "balance_loss_clip": 1.02083373, "balance_loss_mlp": 1.01930857, "epoch": 0.5750187885164587, "flos": 21286195096320.0, "grad_norm": 1.944516805137725, "language_loss": 0.58618474, "learning_rate": 1.6135682279056488e-06, "loss": 0.60729963, "num_input_tokens_seen": 205954630, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.41601562, "step": 9564, "time_per_iteration": 2.396207571029663 }, { "auxiliary_loss_clip": 0.01054916, "auxiliary_loss_mlp": 0.01042414, "balance_loss_clip": 1.0178206, "balance_loss_mlp": 1.01720595, "epoch": 0.5750789117691267, "flos": 18803730850560.0, "grad_norm": 1.7168238394188347, "language_loss": 0.77022219, "learning_rate": 1.613186112465078e-06, "loss": 0.79119551, "num_input_tokens_seen": 205971510, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37695312, "step": 9565, "time_per_iteration": 2.3691141605377197 }, { "auxiliary_loss_clip": 0.01014201, "auxiliary_loss_mlp": 0.01009131, "balance_loss_clip": 1.00662792, "balance_loss_mlp": 1.00661302, "epoch": 0.5751390350217946, "flos": 70659490780800.0, "grad_norm": 0.9570041109482518, "language_loss": 0.60824341, "learning_rate": 1.6128040116925287e-06, "loss": 0.6284768, "num_input_tokens_seen": 206035125, "router_z_loss_clip": 0.02502441, "router_z_loss_mlp": 0.07617188, "step": 9566, "time_per_iteration": 3.092327356338501 }, { "auxiliary_loss_clip": 0.01057559, "auxiliary_loss_mlp": 0.01041118, "balance_loss_clip": 1.01369965, "balance_loss_mlp": 1.01891398, "epoch": 0.5751991582744627, "flos": 14245470408960.0, "grad_norm": 3.266615226074235, "language_loss": 0.76611084, "learning_rate": 1.6124219256024901e-06, "loss": 0.78709763, "num_input_tokens_seen": 206052075, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.38671875, "step": 9567, "time_per_iteration": 2.3595707416534424 }, { "auxiliary_loss_clip": 0.01057732, "auxiliary_loss_mlp": 0.01043996, "balance_loss_clip": 1.01874709, "balance_loss_mlp": 1.01788974, "epoch": 0.5752592815271306, "flos": 18327306170880.0, "grad_norm": 1.4741556993108602, "language_loss": 0.7576738, "learning_rate": 1.6120398542094504e-06, "loss": 0.77869117, "num_input_tokens_seen": 206069970, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3984375, "step": 9568, "time_per_iteration": 2.346219539642334 }, { "auxiliary_loss_clip": 0.01058347, "auxiliary_loss_mlp": 0.01038451, "balance_loss_clip": 1.0123446, "balance_loss_mlp": 1.01827359, "epoch": 0.5753194047797986, "flos": 20921701835520.0, "grad_norm": 1.6513363306708648, "language_loss": 0.72996294, "learning_rate": 1.6116577975278994e-06, "loss": 0.75093091, "num_input_tokens_seen": 206088950, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40039062, "step": 9569, "time_per_iteration": 2.3752152919769287 }, { "auxiliary_loss_clip": 0.01056967, "auxiliary_loss_mlp": 0.01045362, "balance_loss_clip": 1.01694226, "balance_loss_mlp": 1.01703906, "epoch": 0.5753795280324665, "flos": 19280783934720.0, "grad_norm": 2.8361671419822088, "language_loss": 0.57394445, "learning_rate": 1.6112757555723223e-06, "loss": 0.59496772, "num_input_tokens_seen": 206107780, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.3984375, "step": 9570, "time_per_iteration": 2.3558568954467773 }, { "auxiliary_loss_clip": 0.01054539, "auxiliary_loss_mlp": 0.01040472, "balance_loss_clip": 1.01524758, "balance_loss_mlp": 1.0163238, "epoch": 0.5754396512851345, "flos": 21651805520640.0, "grad_norm": 1.5655896432295775, "language_loss": 0.65644765, "learning_rate": 1.6108937283572082e-06, "loss": 0.67739779, "num_input_tokens_seen": 206127445, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38085938, "step": 9571, "time_per_iteration": 2.393368721008301 }, { "auxiliary_loss_clip": 0.01057363, "auxiliary_loss_mlp": 0.01046192, "balance_loss_clip": 1.01715279, "balance_loss_mlp": 1.01711607, "epoch": 0.5754997745378025, "flos": 51019871091840.0, "grad_norm": 2.958459336154477, "language_loss": 0.67842793, "learning_rate": 1.6105117158970434e-06, "loss": 0.69946349, "num_input_tokens_seen": 206152005, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.40234375, "step": 9572, "time_per_iteration": 2.6232669353485107 }, { "auxiliary_loss_clip": 0.01057571, "auxiliary_loss_mlp": 0.01046893, "balance_loss_clip": 1.01979721, "balance_loss_mlp": 1.01767421, "epoch": 0.5755598977904705, "flos": 22855785356160.0, "grad_norm": 1.8165436994359578, "language_loss": 0.74191487, "learning_rate": 1.6101297182063123e-06, "loss": 0.76295954, "num_input_tokens_seen": 206169875, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.3984375, "step": 9573, "time_per_iteration": 2.380631446838379 }, { "auxiliary_loss_clip": 0.01054121, "auxiliary_loss_mlp": 0.01042041, "balance_loss_clip": 1.01829445, "balance_loss_mlp": 1.01786995, "epoch": 0.5756200210431385, "flos": 38471283360000.0, "grad_norm": 1.7414687037198726, "language_loss": 0.76890129, "learning_rate": 1.6097477352995022e-06, "loss": 0.78986287, "num_input_tokens_seen": 206192635, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.36328125, "step": 9574, "time_per_iteration": 2.515338659286499 }, { "auxiliary_loss_clip": 0.01059482, "auxiliary_loss_mlp": 0.01049562, "balance_loss_clip": 1.02051044, "balance_loss_mlp": 1.01736033, "epoch": 0.5756801442958064, "flos": 23909010474240.0, "grad_norm": 2.814625144854651, "language_loss": 0.68373936, "learning_rate": 1.6093657671910968e-06, "loss": 0.70482981, "num_input_tokens_seen": 206211485, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.421875, "step": 9575, "time_per_iteration": 2.403566837310791 }, { "auxiliary_loss_clip": 0.01056071, "auxiliary_loss_mlp": 0.01041467, "balance_loss_clip": 1.01718402, "balance_loss_mlp": 1.01827312, "epoch": 0.5757402675484744, "flos": 21104227756800.0, "grad_norm": 1.4500763305336692, "language_loss": 0.80810601, "learning_rate": 1.6089838138955804e-06, "loss": 0.82908142, "num_input_tokens_seen": 206231740, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37890625, "step": 9576, "time_per_iteration": 2.3726954460144043 }, { "auxiliary_loss_clip": 0.0105461, "auxiliary_loss_mlp": 0.01044322, "balance_loss_clip": 1.0192647, "balance_loss_mlp": 1.01634288, "epoch": 0.5758003908011423, "flos": 20558046447360.0, "grad_norm": 1.8200115837941528, "language_loss": 0.70378321, "learning_rate": 1.6086018754274372e-06, "loss": 0.72477257, "num_input_tokens_seen": 206250975, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3828125, "step": 9577, "time_per_iteration": 2.410407781600952 }, { "auxiliary_loss_clip": 0.01059209, "auxiliary_loss_mlp": 0.01042232, "balance_loss_clip": 1.01443231, "balance_loss_mlp": 1.01808548, "epoch": 0.5758605140538103, "flos": 16472056233600.0, "grad_norm": 1.7218234156834145, "language_loss": 0.67362154, "learning_rate": 1.6082199518011504e-06, "loss": 0.69463599, "num_input_tokens_seen": 206268800, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41015625, "step": 9578, "time_per_iteration": 2.35194730758667 }, { "auxiliary_loss_clip": 0.0105694, "auxiliary_loss_mlp": 0.01040715, "balance_loss_clip": 1.01545501, "balance_loss_mlp": 1.01827669, "epoch": 0.5759206373064782, "flos": 21286509298560.0, "grad_norm": 1.5980151605609199, "language_loss": 0.73586762, "learning_rate": 1.6078380430312016e-06, "loss": 0.75684416, "num_input_tokens_seen": 206287190, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38671875, "step": 9579, "time_per_iteration": 2.387449264526367 }, { "auxiliary_loss_clip": 0.01059945, "auxiliary_loss_mlp": 0.0104071, "balance_loss_clip": 1.01109886, "balance_loss_mlp": 1.01806045, "epoch": 0.5759807605591463, "flos": 26066677541760.0, "grad_norm": 3.1641898025400654, "language_loss": 0.66418159, "learning_rate": 1.6074561491320742e-06, "loss": 0.68518817, "num_input_tokens_seen": 206307020, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.41796875, "step": 9580, "time_per_iteration": 2.404876232147217 }, { "auxiliary_loss_clip": 0.01057571, "auxiliary_loss_mlp": 0.01042797, "balance_loss_clip": 1.01589179, "balance_loss_mlp": 1.0178144, "epoch": 0.5760408838118142, "flos": 18872265582720.0, "grad_norm": 1.8120562681990553, "language_loss": 0.85973215, "learning_rate": 1.6070742701182486e-06, "loss": 0.88073581, "num_input_tokens_seen": 206324095, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.3984375, "step": 9581, "time_per_iteration": 2.3992364406585693 }, { "auxiliary_loss_clip": 0.01061385, "auxiliary_loss_mlp": 0.01047533, "balance_loss_clip": 1.01914883, "balance_loss_mlp": 1.01978445, "epoch": 0.5761010070644822, "flos": 15377214908160.0, "grad_norm": 3.234614820958906, "language_loss": 0.69307148, "learning_rate": 1.6066924060042057e-06, "loss": 0.71416068, "num_input_tokens_seen": 206343210, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.41601562, "step": 9582, "time_per_iteration": 2.404843807220459 }, { "auxiliary_loss_clip": 0.01011151, "auxiliary_loss_mlp": 0.0100305, "balance_loss_clip": 1.0006057, "balance_loss_mlp": 1.00341892, "epoch": 0.5761611303171501, "flos": 71468009112960.0, "grad_norm": 0.6564265343705085, "language_loss": 0.572083, "learning_rate": 1.6063105568044271e-06, "loss": 0.59222507, "num_input_tokens_seen": 206415935, "router_z_loss_clip": 0.02441406, "router_z_loss_mlp": 0.07714844, "step": 9583, "time_per_iteration": 3.177868127822876 }, { "auxiliary_loss_clip": 0.01059181, "auxiliary_loss_mlp": 0.01041749, "balance_loss_clip": 1.0145936, "balance_loss_mlp": 1.01838732, "epoch": 0.5762212535698181, "flos": 16245435398400.0, "grad_norm": 1.8088932579830939, "language_loss": 0.83131158, "learning_rate": 1.6059287225333912e-06, "loss": 0.85232091, "num_input_tokens_seen": 206431900, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40820312, "step": 9584, "time_per_iteration": 2.3496522903442383 }, { "auxiliary_loss_clip": 0.01011165, "auxiliary_loss_mlp": 0.0100462, "balance_loss_clip": 1.00191438, "balance_loss_mlp": 1.0033356, "epoch": 0.5762813768224861, "flos": 70181879114880.0, "grad_norm": 0.627365348316528, "language_loss": 0.49564311, "learning_rate": 1.6055469032055773e-06, "loss": 0.51580095, "num_input_tokens_seen": 206501200, "router_z_loss_clip": 0.02709961, "router_z_loss_mlp": 0.078125, "step": 9585, "time_per_iteration": 3.1105260848999023 }, { "auxiliary_loss_clip": 0.01054824, "auxiliary_loss_mlp": 0.01037615, "balance_loss_clip": 1.01151979, "balance_loss_mlp": 1.0167383, "epoch": 0.5763415000751541, "flos": 20517093555840.0, "grad_norm": 1.5626981059409593, "language_loss": 0.85430479, "learning_rate": 1.605165098835465e-06, "loss": 0.87522918, "num_input_tokens_seen": 206520575, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38085938, "step": 9586, "time_per_iteration": 2.356426239013672 }, { "auxiliary_loss_clip": 0.01057157, "auxiliary_loss_mlp": 0.0104313, "balance_loss_clip": 1.01637983, "balance_loss_mlp": 1.01770544, "epoch": 0.5764016233278221, "flos": 15814606821120.0, "grad_norm": 1.7913494493481767, "language_loss": 0.8077755, "learning_rate": 1.6047833094375308e-06, "loss": 0.82877839, "num_input_tokens_seen": 206538060, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.39453125, "step": 9587, "time_per_iteration": 2.3580310344696045 }, { "auxiliary_loss_clip": 0.01057491, "auxiliary_loss_mlp": 0.01035967, "balance_loss_clip": 1.00952649, "balance_loss_mlp": 1.01784897, "epoch": 0.57646174658049, "flos": 20771400965760.0, "grad_norm": 1.7425088920472596, "language_loss": 0.66716671, "learning_rate": 1.6044015350262542e-06, "loss": 0.68810129, "num_input_tokens_seen": 206557320, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39648438, "step": 9588, "time_per_iteration": 3.746570587158203 }, { "auxiliary_loss_clip": 0.01058546, "auxiliary_loss_mlp": 0.01047528, "balance_loss_clip": 1.01778495, "balance_loss_mlp": 1.01817942, "epoch": 0.576521869833158, "flos": 23548811310720.0, "grad_norm": 1.8302220205137567, "language_loss": 0.7964797, "learning_rate": 1.6040197756161104e-06, "loss": 0.81754047, "num_input_tokens_seen": 206575780, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.40234375, "step": 9589, "time_per_iteration": 2.398541212081909 }, { "auxiliary_loss_clip": 0.01054745, "auxiliary_loss_mlp": 0.01034577, "balance_loss_clip": 1.0107348, "balance_loss_mlp": 1.01670337, "epoch": 0.5765819930858259, "flos": 20265544143360.0, "grad_norm": 2.1033425790581357, "language_loss": 0.81439692, "learning_rate": 1.6036380312215762e-06, "loss": 0.83529013, "num_input_tokens_seen": 206594100, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.38085938, "step": 9590, "time_per_iteration": 2.40799880027771 }, { "auxiliary_loss_clip": 0.01057947, "auxiliary_loss_mlp": 0.01032221, "balance_loss_clip": 1.00741363, "balance_loss_mlp": 1.01925242, "epoch": 0.5766421163384939, "flos": 23147659255680.0, "grad_norm": 2.0425778459712287, "language_loss": 0.64280772, "learning_rate": 1.6032563018571283e-06, "loss": 0.6637094, "num_input_tokens_seen": 206613325, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38671875, "step": 9591, "time_per_iteration": 3.851085901260376 }, { "auxiliary_loss_clip": 0.01058155, "auxiliary_loss_mlp": 0.01042655, "balance_loss_clip": 1.0163933, "balance_loss_mlp": 1.01847732, "epoch": 0.5767022395911618, "flos": 25847702294400.0, "grad_norm": 1.5670008313729025, "language_loss": 0.78456253, "learning_rate": 1.6028745875372406e-06, "loss": 0.8055706, "num_input_tokens_seen": 206634265, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.39648438, "step": 9592, "time_per_iteration": 3.853929281234741 }, { "auxiliary_loss_clip": 0.01009183, "auxiliary_loss_mlp": 0.01002715, "balance_loss_clip": 1.00000882, "balance_loss_mlp": 1.0014801, "epoch": 0.5767623628438299, "flos": 68289586358400.0, "grad_norm": 0.816191985922381, "language_loss": 0.59858215, "learning_rate": 1.6024928882763885e-06, "loss": 0.61870116, "num_input_tokens_seen": 206696990, "router_z_loss_clip": 0.02709961, "router_z_loss_mlp": 0.07714844, "step": 9593, "time_per_iteration": 3.1290857791900635 }, { "auxiliary_loss_clip": 0.01059364, "auxiliary_loss_mlp": 0.01045414, "balance_loss_clip": 1.01670814, "balance_loss_mlp": 1.0180198, "epoch": 0.5768224860964978, "flos": 30187196956800.0, "grad_norm": 1.9346675176770052, "language_loss": 0.71688581, "learning_rate": 1.6021112040890463e-06, "loss": 0.73793364, "num_input_tokens_seen": 206717815, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.4140625, "step": 9594, "time_per_iteration": 2.432359457015991 }, { "auxiliary_loss_clip": 0.01057161, "auxiliary_loss_mlp": 0.01038094, "balance_loss_clip": 1.01396596, "balance_loss_mlp": 1.01786542, "epoch": 0.5768826093491658, "flos": 17894068709760.0, "grad_norm": 1.7811884400853808, "language_loss": 0.71808088, "learning_rate": 1.6017295349896863e-06, "loss": 0.73903346, "num_input_tokens_seen": 206735985, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.39257812, "step": 9595, "time_per_iteration": 2.396301507949829 }, { "auxiliary_loss_clip": 0.01057875, "auxiliary_loss_mlp": 0.01048008, "balance_loss_clip": 1.02043521, "balance_loss_mlp": 1.01795936, "epoch": 0.5769427326018337, "flos": 17456222949120.0, "grad_norm": 1.8648511378428372, "language_loss": 0.70623684, "learning_rate": 1.6013478809927828e-06, "loss": 0.72729564, "num_input_tokens_seen": 206753370, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.3984375, "step": 9596, "time_per_iteration": 2.3457531929016113 }, { "auxiliary_loss_clip": 0.01061332, "auxiliary_loss_mlp": 0.0105015, "balance_loss_clip": 1.01982343, "balance_loss_mlp": 1.01861262, "epoch": 0.5770028558545017, "flos": 39420152824320.0, "grad_norm": 2.637289367304534, "language_loss": 0.69003308, "learning_rate": 1.6009662421128074e-06, "loss": 0.71114784, "num_input_tokens_seen": 206777645, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.42773438, "step": 9597, "time_per_iteration": 2.541459560394287 }, { "auxiliary_loss_clip": 0.0105572, "auxiliary_loss_mlp": 0.01044336, "balance_loss_clip": 1.01839578, "balance_loss_mlp": 1.01678944, "epoch": 0.5770629791071697, "flos": 21535510181760.0, "grad_norm": 1.974355369407956, "language_loss": 0.82335824, "learning_rate": 1.6005846183642323e-06, "loss": 0.8443588, "num_input_tokens_seen": 206794865, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.390625, "step": 9598, "time_per_iteration": 2.369436025619507 }, { "auxiliary_loss_clip": 0.01058018, "auxiliary_loss_mlp": 0.01044532, "balance_loss_clip": 1.01673257, "balance_loss_mlp": 1.01822758, "epoch": 0.5771231023598377, "flos": 20885741268480.0, "grad_norm": 1.5788211809089572, "language_loss": 0.73893368, "learning_rate": 1.6002030097615277e-06, "loss": 0.75995922, "num_input_tokens_seen": 206814095, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.3984375, "step": 9599, "time_per_iteration": 3.7819316387176514 }, { "auxiliary_loss_clip": 0.0105387, "auxiliary_loss_mlp": 0.01034547, "balance_loss_clip": 1.01090801, "balance_loss_mlp": 1.01689243, "epoch": 0.5771832256125057, "flos": 18076245517440.0, "grad_norm": 1.9339555296442994, "language_loss": 0.79680347, "learning_rate": 1.5998214163191663e-06, "loss": 0.81768763, "num_input_tokens_seen": 206832245, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36914062, "step": 9600, "time_per_iteration": 2.3551025390625 }, { "auxiliary_loss_clip": 0.01059152, "auxiliary_loss_mlp": 0.0105, "balance_loss_clip": 1.0229156, "balance_loss_mlp": 1.01925206, "epoch": 0.5772433488651736, "flos": 26357888125440.0, "grad_norm": 1.7871054422036199, "language_loss": 0.73623025, "learning_rate": 1.5994398380516163e-06, "loss": 0.75732183, "num_input_tokens_seen": 206851535, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.3984375, "step": 9601, "time_per_iteration": 2.4281699657440186 }, { "auxiliary_loss_clip": 0.0105662, "auxiliary_loss_mlp": 0.01048949, "balance_loss_clip": 1.0219121, "balance_loss_mlp": 1.01818264, "epoch": 0.5773034721178416, "flos": 19680015864960.0, "grad_norm": 2.2747137442379834, "language_loss": 0.69216019, "learning_rate": 1.599058274973348e-06, "loss": 0.71321589, "num_input_tokens_seen": 206870595, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.3828125, "step": 9602, "time_per_iteration": 2.3615126609802246 }, { "auxiliary_loss_clip": 0.0105416, "auxiliary_loss_mlp": 0.01041156, "balance_loss_clip": 1.01701665, "balance_loss_mlp": 1.01766896, "epoch": 0.5773635953705095, "flos": 25081707864960.0, "grad_norm": 1.4490217209872032, "language_loss": 0.74543029, "learning_rate": 1.5986767270988297e-06, "loss": 0.76638341, "num_input_tokens_seen": 206892320, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36523438, "step": 9603, "time_per_iteration": 2.4447476863861084 }, { "auxiliary_loss_clip": 0.01055694, "auxiliary_loss_mlp": 0.01038302, "balance_loss_clip": 1.01185, "balance_loss_mlp": 1.01760828, "epoch": 0.5774237186231775, "flos": 21031922597760.0, "grad_norm": 1.6077537678299514, "language_loss": 0.77694672, "learning_rate": 1.5982951944425298e-06, "loss": 0.79788667, "num_input_tokens_seen": 206912485, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.3828125, "step": 9604, "time_per_iteration": 2.3969345092773438 }, { "auxiliary_loss_clip": 0.01058381, "auxiliary_loss_mlp": 0.01046333, "balance_loss_clip": 1.01720977, "balance_loss_mlp": 1.0179863, "epoch": 0.5774838418758454, "flos": 15230824110720.0, "grad_norm": 1.6975033733354346, "language_loss": 0.84335047, "learning_rate": 1.5979136770189174e-06, "loss": 0.86439764, "num_input_tokens_seen": 206929100, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.40234375, "step": 9605, "time_per_iteration": 2.3570096492767334 }, { "auxiliary_loss_clip": 0.01062191, "auxiliary_loss_mlp": 0.01052514, "balance_loss_clip": 1.01911211, "balance_loss_mlp": 1.0191865, "epoch": 0.5775439651285135, "flos": 23581594944000.0, "grad_norm": 2.440394033219565, "language_loss": 0.79023451, "learning_rate": 1.5975321748424581e-06, "loss": 0.81138158, "num_input_tokens_seen": 206947020, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 0.4296875, "step": 9606, "time_per_iteration": 2.3742012977600098 }, { "auxiliary_loss_clip": 0.01056119, "auxiliary_loss_mlp": 0.01041625, "balance_loss_clip": 1.0156132, "balance_loss_mlp": 1.01791525, "epoch": 0.5776040883811814, "flos": 18039551811840.0, "grad_norm": 1.711896172128816, "language_loss": 0.74448699, "learning_rate": 1.597150687927619e-06, "loss": 0.76546443, "num_input_tokens_seen": 206964065, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3828125, "step": 9607, "time_per_iteration": 2.3592264652252197 }, { "auxiliary_loss_clip": 0.010593, "auxiliary_loss_mlp": 0.0104273, "balance_loss_clip": 1.01614594, "balance_loss_mlp": 1.01881349, "epoch": 0.5776642116338494, "flos": 18623648724480.0, "grad_norm": 2.115830723275719, "language_loss": 0.70205438, "learning_rate": 1.5967692162888664e-06, "loss": 0.72307467, "num_input_tokens_seen": 206981940, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40429688, "step": 9608, "time_per_iteration": 2.340737819671631 }, { "auxiliary_loss_clip": 0.01057757, "auxiliary_loss_mlp": 0.01046951, "balance_loss_clip": 1.01972318, "balance_loss_mlp": 1.01787519, "epoch": 0.5777243348865173, "flos": 28401284712960.0, "grad_norm": 1.7356385676757342, "language_loss": 0.78224707, "learning_rate": 1.596387759940665e-06, "loss": 0.80329406, "num_input_tokens_seen": 207002365, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.3984375, "step": 9609, "time_per_iteration": 2.434516668319702 }, { "auxiliary_loss_clip": 0.01059426, "auxiliary_loss_mlp": 0.01042787, "balance_loss_clip": 1.01381922, "balance_loss_mlp": 1.01833725, "epoch": 0.5777844581391853, "flos": 24023560245120.0, "grad_norm": 1.71507321756123, "language_loss": 0.78088254, "learning_rate": 1.5960063188974808e-06, "loss": 0.80190468, "num_input_tokens_seen": 207021195, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.41015625, "step": 9610, "time_per_iteration": 2.383596420288086 }, { "auxiliary_loss_clip": 0.0105715, "auxiliary_loss_mlp": 0.0104629, "balance_loss_clip": 1.01793039, "balance_loss_mlp": 1.01758265, "epoch": 0.5778445813918534, "flos": 17776621296000.0, "grad_norm": 2.9001799806263597, "language_loss": 0.70885479, "learning_rate": 1.5956248931737777e-06, "loss": 0.72988915, "num_input_tokens_seen": 207037465, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.39453125, "step": 9611, "time_per_iteration": 2.339589834213257 }, { "auxiliary_loss_clip": 0.01056367, "auxiliary_loss_mlp": 0.01043517, "balance_loss_clip": 1.01502562, "balance_loss_mlp": 1.0171864, "epoch": 0.5779047046445213, "flos": 22232201829120.0, "grad_norm": 1.9865063766774689, "language_loss": 0.84334707, "learning_rate": 1.5952434827840185e-06, "loss": 0.86434591, "num_input_tokens_seen": 207054230, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.39257812, "step": 9612, "time_per_iteration": 2.378924608230591 }, { "auxiliary_loss_clip": 0.01058032, "auxiliary_loss_mlp": 0.0104321, "balance_loss_clip": 1.01650715, "balance_loss_mlp": 1.01787198, "epoch": 0.5779648278971893, "flos": 21433284120960.0, "grad_norm": 1.6743146183862951, "language_loss": 0.80675328, "learning_rate": 1.594862087742667e-06, "loss": 0.8277657, "num_input_tokens_seen": 207073150, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.40234375, "step": 9613, "time_per_iteration": 2.369762420654297 }, { "auxiliary_loss_clip": 0.01055825, "auxiliary_loss_mlp": 0.01037356, "balance_loss_clip": 1.01321566, "balance_loss_mlp": 1.01740193, "epoch": 0.5780249511498572, "flos": 19025114981760.0, "grad_norm": 1.8325557246204063, "language_loss": 0.77387351, "learning_rate": 1.5944807080641863e-06, "loss": 0.79480535, "num_input_tokens_seen": 207090375, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.38476562, "step": 9614, "time_per_iteration": 2.36098575592041 }, { "auxiliary_loss_clip": 0.01060395, "auxiliary_loss_mlp": 0.01048587, "balance_loss_clip": 1.02058518, "balance_loss_mlp": 1.01845717, "epoch": 0.5780850744025252, "flos": 12124008288000.0, "grad_norm": 2.298379494956112, "language_loss": 0.81960964, "learning_rate": 1.5940993437630375e-06, "loss": 0.84069943, "num_input_tokens_seen": 207106030, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41992188, "step": 9615, "time_per_iteration": 2.3248143196105957 }, { "auxiliary_loss_clip": 0.01058362, "auxiliary_loss_mlp": 0.01043124, "balance_loss_clip": 1.01614642, "balance_loss_mlp": 1.01829767, "epoch": 0.5781451976551931, "flos": 25043303502720.0, "grad_norm": 1.4736801224528853, "language_loss": 0.68005568, "learning_rate": 1.5937179948536825e-06, "loss": 0.70107055, "num_input_tokens_seen": 207125435, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40039062, "step": 9616, "time_per_iteration": 2.4304521083831787 }, { "auxiliary_loss_clip": 0.01056951, "auxiliary_loss_mlp": 0.0104033, "balance_loss_clip": 1.01450908, "balance_loss_mlp": 1.01841211, "epoch": 0.5782053209078611, "flos": 19244578988160.0, "grad_norm": 1.6462956688568904, "language_loss": 0.78965247, "learning_rate": 1.5933366613505812e-06, "loss": 0.81062531, "num_input_tokens_seen": 207145095, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38671875, "step": 9617, "time_per_iteration": 2.3750863075256348 }, { "auxiliary_loss_clip": 0.01056287, "auxiliary_loss_mlp": 0.01045009, "balance_loss_clip": 1.01770997, "balance_loss_mlp": 1.01757121, "epoch": 0.578265444160529, "flos": 25992661726080.0, "grad_norm": 1.478044688320972, "language_loss": 0.7624132, "learning_rate": 1.5929553432681947e-06, "loss": 0.78342617, "num_input_tokens_seen": 207166045, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.38671875, "step": 9618, "time_per_iteration": 2.4222350120544434 }, { "auxiliary_loss_clip": 0.01056493, "auxiliary_loss_mlp": 0.01038418, "balance_loss_clip": 1.01367044, "balance_loss_mlp": 1.01815546, "epoch": 0.5783255674131971, "flos": 21797533002240.0, "grad_norm": 1.6284973878103197, "language_loss": 0.82992291, "learning_rate": 1.5925740406209826e-06, "loss": 0.85087204, "num_input_tokens_seen": 207185290, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3828125, "step": 9619, "time_per_iteration": 2.3860421180725098 }, { "auxiliary_loss_clip": 0.01056488, "auxiliary_loss_mlp": 0.01050187, "balance_loss_clip": 1.02225614, "balance_loss_mlp": 1.01820874, "epoch": 0.578385690665865, "flos": 24788612067840.0, "grad_norm": 2.0482177651231095, "language_loss": 0.73814917, "learning_rate": 1.5921927534234039e-06, "loss": 0.75921595, "num_input_tokens_seen": 207205505, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.3828125, "step": 9620, "time_per_iteration": 2.4099926948547363 }, { "auxiliary_loss_clip": 0.01056304, "auxiliary_loss_mlp": 0.01042383, "balance_loss_clip": 1.01584744, "balance_loss_mlp": 1.01732731, "epoch": 0.578445813918533, "flos": 21211865078400.0, "grad_norm": 1.5859147209737507, "language_loss": 0.78496319, "learning_rate": 1.591811481689916e-06, "loss": 0.80595005, "num_input_tokens_seen": 207225315, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.390625, "step": 9621, "time_per_iteration": 2.3856754302978516 }, { "auxiliary_loss_clip": 0.01058138, "auxiliary_loss_mlp": 0.01043036, "balance_loss_clip": 1.01617813, "balance_loss_mlp": 1.0179894, "epoch": 0.5785059371712009, "flos": 25045607652480.0, "grad_norm": 1.7824326290333916, "language_loss": 0.71849024, "learning_rate": 1.5914302254349787e-06, "loss": 0.73950195, "num_input_tokens_seen": 207247690, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.40234375, "step": 9622, "time_per_iteration": 2.438464879989624 }, { "auxiliary_loss_clip": 0.01012319, "auxiliary_loss_mlp": 0.0101865, "balance_loss_clip": 1.01609921, "balance_loss_mlp": 1.00467443, "epoch": 0.5785660604238689, "flos": 70839503084160.0, "grad_norm": 0.7814281771486934, "language_loss": 0.56029576, "learning_rate": 1.5910489846730476e-06, "loss": 0.58060545, "num_input_tokens_seen": 207301735, "router_z_loss_clip": 0.0255127, "router_z_loss_mlp": 0.07617188, "step": 9623, "time_per_iteration": 3.037371873855591 }, { "auxiliary_loss_clip": 0.0106014, "auxiliary_loss_mlp": 0.01046546, "balance_loss_clip": 1.01669562, "balance_loss_mlp": 1.01856542, "epoch": 0.578626183676537, "flos": 31648626224640.0, "grad_norm": 1.7556502206280105, "language_loss": 0.72541893, "learning_rate": 1.5906677594185799e-06, "loss": 0.74648583, "num_input_tokens_seen": 207321240, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.41601562, "step": 9624, "time_per_iteration": 2.4599552154541016 }, { "auxiliary_loss_clip": 0.01058987, "auxiliary_loss_mlp": 0.01049237, "balance_loss_clip": 1.0214138, "balance_loss_mlp": 1.01916838, "epoch": 0.5786863069292049, "flos": 21864287255040.0, "grad_norm": 2.047362446522189, "language_loss": 0.83611226, "learning_rate": 1.5902865496860322e-06, "loss": 0.85719442, "num_input_tokens_seen": 207339540, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.3984375, "step": 9625, "time_per_iteration": 2.3695106506347656 }, { "auxiliary_loss_clip": 0.01056054, "auxiliary_loss_mlp": 0.01046173, "balance_loss_clip": 1.01815844, "balance_loss_mlp": 1.01745319, "epoch": 0.5787464301818729, "flos": 23363911416960.0, "grad_norm": 1.4648004450804337, "language_loss": 0.70975208, "learning_rate": 1.5899053554898591e-06, "loss": 0.73077434, "num_input_tokens_seen": 207360470, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.38671875, "step": 9626, "time_per_iteration": 2.408174514770508 }, { "auxiliary_loss_clip": 0.01057288, "auxiliary_loss_mlp": 0.01043697, "balance_loss_clip": 1.01615953, "balance_loss_mlp": 1.01829147, "epoch": 0.5788065534345408, "flos": 30002820733440.0, "grad_norm": 1.461207446328878, "language_loss": 0.72577387, "learning_rate": 1.5895241768445166e-06, "loss": 0.74678373, "num_input_tokens_seen": 207383080, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.390625, "step": 9627, "time_per_iteration": 3.8399624824523926 }, { "auxiliary_loss_clip": 0.01055939, "auxiliary_loss_mlp": 0.01042476, "balance_loss_clip": 1.01634502, "balance_loss_mlp": 1.01730394, "epoch": 0.5788666766872088, "flos": 24526903449600.0, "grad_norm": 1.5555220597265118, "language_loss": 0.85159552, "learning_rate": 1.589143013764458e-06, "loss": 0.87257969, "num_input_tokens_seen": 207401000, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38671875, "step": 9628, "time_per_iteration": 2.395768880844116 }, { "auxiliary_loss_clip": 0.01057629, "auxiliary_loss_mlp": 0.01045902, "balance_loss_clip": 1.01815009, "balance_loss_mlp": 1.01833034, "epoch": 0.5789267999398767, "flos": 23731686345600.0, "grad_norm": 1.5549609405216833, "language_loss": 0.73841393, "learning_rate": 1.5887618662641376e-06, "loss": 0.75944924, "num_input_tokens_seen": 207419230, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.39257812, "step": 9629, "time_per_iteration": 2.402574300765991 }, { "auxiliary_loss_clip": 0.01058163, "auxiliary_loss_mlp": 0.01045159, "balance_loss_clip": 1.01744223, "balance_loss_mlp": 1.01936352, "epoch": 0.5789869231925447, "flos": 21134183569920.0, "grad_norm": 10.201285133251142, "language_loss": 0.76392716, "learning_rate": 1.5883807343580087e-06, "loss": 0.78496039, "num_input_tokens_seen": 207437615, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.38867188, "step": 9630, "time_per_iteration": 2.358173370361328 }, { "auxiliary_loss_clip": 0.01054795, "auxiliary_loss_mlp": 0.01044586, "balance_loss_clip": 1.01981437, "balance_loss_mlp": 1.01704037, "epoch": 0.5790470464452127, "flos": 21208723056000.0, "grad_norm": 1.657666302905593, "language_loss": 0.79782975, "learning_rate": 1.587999618060523e-06, "loss": 0.81882352, "num_input_tokens_seen": 207457270, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37695312, "step": 9631, "time_per_iteration": 3.843238353729248 }, { "auxiliary_loss_clip": 0.0105614, "auxiliary_loss_mlp": 0.01045877, "balance_loss_clip": 1.01976979, "balance_loss_mlp": 1.01753652, "epoch": 0.5791071696978807, "flos": 23403258385920.0, "grad_norm": 1.781086778691973, "language_loss": 0.76595688, "learning_rate": 1.5876185173861333e-06, "loss": 0.78697705, "num_input_tokens_seen": 207477890, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38671875, "step": 9632, "time_per_iteration": 3.8624846935272217 }, { "auxiliary_loss_clip": 0.0105651, "auxiliary_loss_mlp": 0.01044173, "balance_loss_clip": 1.01720786, "balance_loss_mlp": 1.01726532, "epoch": 0.5791672929505486, "flos": 24205387939200.0, "grad_norm": 1.685535399947355, "language_loss": 0.8030079, "learning_rate": 1.5872374323492915e-06, "loss": 0.82401478, "num_input_tokens_seen": 207497670, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.39257812, "step": 9633, "time_per_iteration": 2.3842391967773438 }, { "auxiliary_loss_clip": 0.01062234, "auxiliary_loss_mlp": 0.01054559, "balance_loss_clip": 1.02213442, "balance_loss_mlp": 1.01924312, "epoch": 0.5792274162032166, "flos": 24347833752960.0, "grad_norm": 1.6804724402989795, "language_loss": 0.79375452, "learning_rate": 1.5868563629644464e-06, "loss": 0.81492245, "num_input_tokens_seen": 207516105, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.4296875, "step": 9634, "time_per_iteration": 2.4450557231903076 }, { "auxiliary_loss_clip": 0.0105994, "auxiliary_loss_mlp": 0.01053008, "balance_loss_clip": 1.02505326, "balance_loss_mlp": 1.01928973, "epoch": 0.5792875394558845, "flos": 20448349355520.0, "grad_norm": 2.2766818334440386, "language_loss": 0.64118737, "learning_rate": 1.5864753092460502e-06, "loss": 0.6623168, "num_input_tokens_seen": 207533685, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40625, "step": 9635, "time_per_iteration": 2.3672003746032715 }, { "auxiliary_loss_clip": 0.01055429, "auxiliary_loss_mlp": 0.01041164, "balance_loss_clip": 1.01770353, "balance_loss_mlp": 1.01782441, "epoch": 0.5793476627085525, "flos": 24059206609920.0, "grad_norm": 1.605461942062675, "language_loss": 0.78430903, "learning_rate": 1.5860942712085516e-06, "loss": 0.80527496, "num_input_tokens_seen": 207552840, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.375, "step": 9636, "time_per_iteration": 2.4172332286834717 }, { "auxiliary_loss_clip": 0.01053981, "auxiliary_loss_mlp": 0.01037027, "balance_loss_clip": 1.01251769, "balance_loss_mlp": 1.01693773, "epoch": 0.5794077859612206, "flos": 22053201955200.0, "grad_norm": 1.7593519316617432, "language_loss": 0.70011115, "learning_rate": 1.5857132488663998e-06, "loss": 0.72102123, "num_input_tokens_seen": 207572095, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.37109375, "step": 9637, "time_per_iteration": 2.3615307807922363 }, { "auxiliary_loss_clip": 0.01056605, "auxiliary_loss_mlp": 0.01045364, "balance_loss_clip": 1.01811254, "balance_loss_mlp": 1.01710665, "epoch": 0.5794679092138885, "flos": 11434124355840.0, "grad_norm": 2.8916856193908593, "language_loss": 0.74326253, "learning_rate": 1.585332242234043e-06, "loss": 0.76428223, "num_input_tokens_seen": 207587495, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.39453125, "step": 9638, "time_per_iteration": 2.312392473220825 }, { "auxiliary_loss_clip": 0.0105689, "auxiliary_loss_mlp": 0.01042969, "balance_loss_clip": 1.01747012, "balance_loss_mlp": 1.01868081, "epoch": 0.5795280324665565, "flos": 18879212943360.0, "grad_norm": 1.6453715448636221, "language_loss": 0.73439491, "learning_rate": 1.5849512513259291e-06, "loss": 0.75539356, "num_input_tokens_seen": 207606795, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.3828125, "step": 9639, "time_per_iteration": 3.7946226596832275 }, { "auxiliary_loss_clip": 0.0105739, "auxiliary_loss_mlp": 0.01044136, "balance_loss_clip": 1.01761198, "balance_loss_mlp": 1.01800632, "epoch": 0.5795881557192244, "flos": 13005111070080.0, "grad_norm": 1.7753387642090828, "language_loss": 0.70351684, "learning_rate": 1.5845702761565054e-06, "loss": 0.72453207, "num_input_tokens_seen": 207623620, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39453125, "step": 9640, "time_per_iteration": 2.3464341163635254 }, { "auxiliary_loss_clip": 0.01062364, "auxiliary_loss_mlp": 0.01052029, "balance_loss_clip": 1.02251232, "balance_loss_mlp": 1.01960611, "epoch": 0.5796482789718924, "flos": 19931530366080.0, "grad_norm": 2.452272375985523, "language_loss": 0.78904641, "learning_rate": 1.5841893167402183e-06, "loss": 0.81019038, "num_input_tokens_seen": 207639380, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.42773438, "step": 9641, "time_per_iteration": 2.336226224899292 }, { "auxiliary_loss_clip": 0.01057222, "auxiliary_loss_mlp": 0.01038123, "balance_loss_clip": 1.01313651, "balance_loss_mlp": 1.01856506, "epoch": 0.5797084022245603, "flos": 21649780661760.0, "grad_norm": 1.9002487461427917, "language_loss": 0.75600505, "learning_rate": 1.5838083730915143e-06, "loss": 0.77695847, "num_input_tokens_seen": 207657915, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38671875, "step": 9642, "time_per_iteration": 2.3731956481933594 }, { "auxiliary_loss_clip": 0.01057753, "auxiliary_loss_mlp": 0.01049485, "balance_loss_clip": 1.02217436, "balance_loss_mlp": 1.01757026, "epoch": 0.5797685254772283, "flos": 26030367861120.0, "grad_norm": 1.5965365376771556, "language_loss": 0.74851978, "learning_rate": 1.5834274452248378e-06, "loss": 0.76959217, "num_input_tokens_seen": 207678620, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40234375, "step": 9643, "time_per_iteration": 2.4040513038635254 }, { "auxiliary_loss_clip": 0.01058808, "auxiliary_loss_mlp": 0.01040962, "balance_loss_clip": 1.01374662, "balance_loss_mlp": 1.01948464, "epoch": 0.5798286487298963, "flos": 22704227677440.0, "grad_norm": 1.8556618209861921, "language_loss": 0.68342769, "learning_rate": 1.5830465331546352e-06, "loss": 0.70442533, "num_input_tokens_seen": 207696980, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.39453125, "step": 9644, "time_per_iteration": 2.3822808265686035 }, { "auxiliary_loss_clip": 0.01061954, "auxiliary_loss_mlp": 0.01044557, "balance_loss_clip": 1.01668584, "balance_loss_mlp": 1.01964831, "epoch": 0.5798887719825643, "flos": 23147868723840.0, "grad_norm": 2.2459748846739607, "language_loss": 0.86554164, "learning_rate": 1.5826656368953496e-06, "loss": 0.88660675, "num_input_tokens_seen": 207714065, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.42382812, "step": 9645, "time_per_iteration": 2.3785078525543213 }, { "auxiliary_loss_clip": 0.01059207, "auxiliary_loss_mlp": 0.0103908, "balance_loss_clip": 1.0115428, "balance_loss_mlp": 1.01924109, "epoch": 0.5799488952352322, "flos": 24424886856960.0, "grad_norm": 1.7384744022672902, "language_loss": 0.76165122, "learning_rate": 1.5822847564614244e-06, "loss": 0.78263414, "num_input_tokens_seen": 207734720, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.40039062, "step": 9646, "time_per_iteration": 2.451570749282837 }, { "auxiliary_loss_clip": 0.01061042, "auxiliary_loss_mlp": 0.01042786, "balance_loss_clip": 1.01377082, "balance_loss_mlp": 1.02032638, "epoch": 0.5800090184879002, "flos": 38394474635520.0, "grad_norm": 1.717468288902804, "language_loss": 0.60406452, "learning_rate": 1.5819038918673038e-06, "loss": 0.62510276, "num_input_tokens_seen": 207755435, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.40820312, "step": 9647, "time_per_iteration": 2.5303382873535156 }, { "auxiliary_loss_clip": 0.0106033, "auxiliary_loss_mlp": 0.01044303, "balance_loss_clip": 1.0134275, "balance_loss_mlp": 1.01976359, "epoch": 0.5800691417405681, "flos": 19784022405120.0, "grad_norm": 1.8945762046988777, "language_loss": 0.85357642, "learning_rate": 1.5815230431274288e-06, "loss": 0.87462282, "num_input_tokens_seen": 207773570, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.40625, "step": 9648, "time_per_iteration": 2.377918243408203 }, { "auxiliary_loss_clip": 0.01010252, "auxiliary_loss_mlp": 0.01011856, "balance_loss_clip": 1.00937653, "balance_loss_mlp": 1.00232971, "epoch": 0.5801292649932361, "flos": 70311407725440.0, "grad_norm": 0.8387995919109723, "language_loss": 0.63089406, "learning_rate": 1.581142210256242e-06, "loss": 0.65111512, "num_input_tokens_seen": 207830095, "router_z_loss_clip": 0.02478027, "router_z_loss_mlp": 0.07910156, "step": 9649, "time_per_iteration": 3.0349202156066895 }, { "auxiliary_loss_clip": 0.01056001, "auxiliary_loss_mlp": 0.01040521, "balance_loss_clip": 1.01444983, "balance_loss_mlp": 1.01841903, "epoch": 0.5801893882459042, "flos": 18733799664000.0, "grad_norm": 1.6194712018341269, "language_loss": 0.82837903, "learning_rate": 1.5807613932681857e-06, "loss": 0.84934425, "num_input_tokens_seen": 207848555, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37695312, "step": 9650, "time_per_iteration": 2.3652777671813965 }, { "auxiliary_loss_clip": 0.01060189, "auxiliary_loss_mlp": 0.01042306, "balance_loss_clip": 1.01531744, "balance_loss_mlp": 1.01840568, "epoch": 0.5802495114985721, "flos": 15595596662400.0, "grad_norm": 3.0833308115100992, "language_loss": 0.78731692, "learning_rate": 1.580380592177698e-06, "loss": 0.80834186, "num_input_tokens_seen": 207867060, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.41796875, "step": 9651, "time_per_iteration": 2.3354527950286865 }, { "auxiliary_loss_clip": 0.01059981, "auxiliary_loss_mlp": 0.01047263, "balance_loss_clip": 1.02008367, "balance_loss_mlp": 1.01960063, "epoch": 0.5803096347512401, "flos": 18254547164160.0, "grad_norm": 2.0570475186334556, "language_loss": 0.75334066, "learning_rate": 1.5799998069992213e-06, "loss": 0.77441311, "num_input_tokens_seen": 207884520, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40429688, "step": 9652, "time_per_iteration": 2.3579297065734863 }, { "auxiliary_loss_clip": 0.01058261, "auxiliary_loss_mlp": 0.01048974, "balance_loss_clip": 1.02041161, "balance_loss_mlp": 1.0181632, "epoch": 0.580369758003908, "flos": 22892060125440.0, "grad_norm": 1.993832193804271, "language_loss": 0.78145969, "learning_rate": 1.579619037747193e-06, "loss": 0.80253208, "num_input_tokens_seen": 207905370, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40039062, "step": 9653, "time_per_iteration": 2.394782543182373 }, { "auxiliary_loss_clip": 0.01056679, "auxiliary_loss_mlp": 0.01049672, "balance_loss_clip": 1.02209878, "balance_loss_mlp": 1.01730001, "epoch": 0.580429881256576, "flos": 18696687022080.0, "grad_norm": 2.5784024424664143, "language_loss": 0.76053554, "learning_rate": 1.5792382844360534e-06, "loss": 0.78159904, "num_input_tokens_seen": 207923790, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.39453125, "step": 9654, "time_per_iteration": 2.355658531188965 }, { "auxiliary_loss_clip": 0.01054821, "auxiliary_loss_mlp": 0.01041025, "balance_loss_clip": 1.01680124, "balance_loss_mlp": 1.01804352, "epoch": 0.5804900045092439, "flos": 24680800189440.0, "grad_norm": 1.7159839409931645, "language_loss": 0.71127498, "learning_rate": 1.5788575470802408e-06, "loss": 0.73223346, "num_input_tokens_seen": 207942335, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3671875, "step": 9655, "time_per_iteration": 2.3872992992401123 }, { "auxiliary_loss_clip": 0.01059301, "auxiliary_loss_mlp": 0.01046896, "balance_loss_clip": 1.01821387, "balance_loss_mlp": 1.01769447, "epoch": 0.580550127761912, "flos": 23111663777280.0, "grad_norm": 2.043749720728159, "language_loss": 0.71484268, "learning_rate": 1.5784768256941915e-06, "loss": 0.73590463, "num_input_tokens_seen": 207961975, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.41601562, "step": 9656, "time_per_iteration": 2.36397385597229 }, { "auxiliary_loss_clip": 0.01056719, "auxiliary_loss_mlp": 0.01044263, "balance_loss_clip": 1.02099323, "balance_loss_mlp": 1.01820326, "epoch": 0.5806102510145799, "flos": 18474779220480.0, "grad_norm": 1.5036982804066188, "language_loss": 0.730088, "learning_rate": 1.5780961202923433e-06, "loss": 0.7510978, "num_input_tokens_seen": 207979520, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.38476562, "step": 9657, "time_per_iteration": 2.364748954772949 }, { "auxiliary_loss_clip": 0.01059642, "auxiliary_loss_mlp": 0.01052427, "balance_loss_clip": 1.02217197, "balance_loss_mlp": 1.01821184, "epoch": 0.5806703742672479, "flos": 23914491557760.0, "grad_norm": 2.1874765983282685, "language_loss": 0.72076333, "learning_rate": 1.5777154308891328e-06, "loss": 0.74188399, "num_input_tokens_seen": 207998375, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.4140625, "step": 9658, "time_per_iteration": 2.3941538333892822 }, { "auxiliary_loss_clip": 0.01009736, "auxiliary_loss_mlp": 0.01004509, "balance_loss_clip": 1.00162399, "balance_loss_mlp": 1.00201821, "epoch": 0.5807304975199158, "flos": 66308649926400.0, "grad_norm": 0.6537858521165671, "language_loss": 0.53615606, "learning_rate": 1.5773347574989953e-06, "loss": 0.55629855, "num_input_tokens_seen": 208060605, "router_z_loss_clip": 0.02880859, "router_z_loss_mlp": 0.07714844, "step": 9659, "time_per_iteration": 3.0252506732940674 }, { "auxiliary_loss_clip": 0.01057738, "auxiliary_loss_mlp": 0.01057052, "balance_loss_clip": 1.03063512, "balance_loss_mlp": 1.01781356, "epoch": 0.5807906207725838, "flos": 31721105940480.0, "grad_norm": 1.9023620005876074, "language_loss": 0.63603634, "learning_rate": 1.576954100136366e-06, "loss": 0.65718424, "num_input_tokens_seen": 208080320, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.3984375, "step": 9660, "time_per_iteration": 2.4636917114257812 }, { "auxiliary_loss_clip": 0.01056859, "auxiliary_loss_mlp": 0.01049564, "balance_loss_clip": 1.02220607, "balance_loss_mlp": 1.01635814, "epoch": 0.5808507440252517, "flos": 23800151255040.0, "grad_norm": 1.5501747012657303, "language_loss": 0.66718972, "learning_rate": 1.5765734588156797e-06, "loss": 0.68825388, "num_input_tokens_seen": 208099305, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40429688, "step": 9661, "time_per_iteration": 2.3849096298217773 }, { "auxiliary_loss_clip": 0.01053464, "auxiliary_loss_mlp": 0.01040312, "balance_loss_clip": 1.01569557, "balance_loss_mlp": 1.01748252, "epoch": 0.5809108672779197, "flos": 13697613354240.0, "grad_norm": 1.5939189637942786, "language_loss": 0.75306082, "learning_rate": 1.5761928335513704e-06, "loss": 0.7739985, "num_input_tokens_seen": 208116960, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.359375, "step": 9662, "time_per_iteration": 2.3410990238189697 }, { "auxiliary_loss_clip": 0.01009842, "auxiliary_loss_mlp": 0.01003917, "balance_loss_clip": 1.00123489, "balance_loss_mlp": 1.00175309, "epoch": 0.5809709905305876, "flos": 69131062880640.0, "grad_norm": 0.8762795288784939, "language_loss": 0.58442986, "learning_rate": 1.5758122243578709e-06, "loss": 0.60456741, "num_input_tokens_seen": 208182190, "router_z_loss_clip": 0.02685547, "router_z_loss_mlp": 0.08105469, "step": 9663, "time_per_iteration": 3.1316299438476562 }, { "auxiliary_loss_clip": 0.01056506, "auxiliary_loss_mlp": 0.01047012, "balance_loss_clip": 1.02117932, "balance_loss_mlp": 1.01748478, "epoch": 0.5810311137832557, "flos": 19826546307840.0, "grad_norm": 2.21730367635318, "language_loss": 0.82590163, "learning_rate": 1.5754316312496152e-06, "loss": 0.84693682, "num_input_tokens_seen": 208197015, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.390625, "step": 9664, "time_per_iteration": 2.360711097717285 }, { "auxiliary_loss_clip": 0.0105755, "auxiliary_loss_mlp": 0.01044814, "balance_loss_clip": 1.01643062, "balance_loss_mlp": 1.01728249, "epoch": 0.5810912370359237, "flos": 29237315063040.0, "grad_norm": 1.6989256823917467, "language_loss": 0.82457912, "learning_rate": 1.5750510542410337e-06, "loss": 0.84560275, "num_input_tokens_seen": 208215795, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.40234375, "step": 9665, "time_per_iteration": 2.4231061935424805 }, { "auxiliary_loss_clip": 0.01060954, "auxiliary_loss_mlp": 0.01049165, "balance_loss_clip": 1.01888609, "balance_loss_mlp": 1.01912427, "epoch": 0.5811513602885916, "flos": 22784422803840.0, "grad_norm": 1.5773210903481527, "language_loss": 0.81668073, "learning_rate": 1.5746704933465599e-06, "loss": 0.83778191, "num_input_tokens_seen": 208234655, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.41796875, "step": 9666, "time_per_iteration": 2.412925958633423 }, { "auxiliary_loss_clip": 0.01054423, "auxiliary_loss_mlp": 0.01042332, "balance_loss_clip": 1.0177747, "balance_loss_mlp": 1.01671541, "epoch": 0.5812114835412596, "flos": 18733345816320.0, "grad_norm": 1.7820007140893226, "language_loss": 0.81189734, "learning_rate": 1.5742899485806227e-06, "loss": 0.83286488, "num_input_tokens_seen": 208251300, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37695312, "step": 9667, "time_per_iteration": 3.643038511276245 }, { "auxiliary_loss_clip": 0.01059991, "auxiliary_loss_mlp": 0.01048293, "balance_loss_clip": 1.01802564, "balance_loss_mlp": 1.01790941, "epoch": 0.5812716067939275, "flos": 26430123461760.0, "grad_norm": 1.633648598993552, "language_loss": 0.79702008, "learning_rate": 1.573909419957653e-06, "loss": 0.81810296, "num_input_tokens_seen": 208272685, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 0.41992188, "step": 9668, "time_per_iteration": 2.4294373989105225 }, { "auxiliary_loss_clip": 0.01058964, "auxiliary_loss_mlp": 0.0104169, "balance_loss_clip": 1.01554716, "balance_loss_mlp": 1.0183692, "epoch": 0.5813317300465956, "flos": 43396201566720.0, "grad_norm": 2.1467203833137907, "language_loss": 0.65852821, "learning_rate": 1.5735289074920819e-06, "loss": 0.67953479, "num_input_tokens_seen": 208294315, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40625, "step": 9669, "time_per_iteration": 2.58272647857666 }, { "auxiliary_loss_clip": 0.01058776, "auxiliary_loss_mlp": 0.01049895, "balance_loss_clip": 1.02207136, "balance_loss_mlp": 1.01817513, "epoch": 0.5813918532992635, "flos": 24784457616000.0, "grad_norm": 1.5629143302453479, "language_loss": 0.74283135, "learning_rate": 1.5731484111983363e-06, "loss": 0.76391804, "num_input_tokens_seen": 208315610, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40625, "step": 9670, "time_per_iteration": 2.450007677078247 }, { "auxiliary_loss_clip": 0.0105883, "auxiliary_loss_mlp": 0.01046524, "balance_loss_clip": 1.01865315, "balance_loss_mlp": 1.01834154, "epoch": 0.5814519765519315, "flos": 22856239203840.0, "grad_norm": 2.0174871921656563, "language_loss": 0.79804516, "learning_rate": 1.5727679310908464e-06, "loss": 0.81909877, "num_input_tokens_seen": 208334725, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40429688, "step": 9671, "time_per_iteration": 5.123297452926636 }, { "auxiliary_loss_clip": 0.01060535, "auxiliary_loss_mlp": 0.0104723, "balance_loss_clip": 1.01727223, "balance_loss_mlp": 1.01894712, "epoch": 0.5815120998045994, "flos": 24059695368960.0, "grad_norm": 2.4467214405452684, "language_loss": 0.62598181, "learning_rate": 1.5723874671840399e-06, "loss": 0.64705944, "num_input_tokens_seen": 208353825, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.41601562, "step": 9672, "time_per_iteration": 2.3861207962036133 }, { "auxiliary_loss_clip": 0.01056388, "auxiliary_loss_mlp": 0.01039403, "balance_loss_clip": 1.0134151, "balance_loss_mlp": 1.01863444, "epoch": 0.5815722230572674, "flos": 24278356414080.0, "grad_norm": 1.615518545086455, "language_loss": 0.82627928, "learning_rate": 1.572007019492342e-06, "loss": 0.84723711, "num_input_tokens_seen": 208374160, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37695312, "step": 9673, "time_per_iteration": 2.3996458053588867 }, { "auxiliary_loss_clip": 0.01059879, "auxiliary_loss_mlp": 0.01044628, "balance_loss_clip": 1.01601815, "balance_loss_mlp": 1.01881409, "epoch": 0.5816323463099353, "flos": 22199278550400.0, "grad_norm": 1.7602483374194644, "language_loss": 0.8915112, "learning_rate": 1.5716265880301817e-06, "loss": 0.91255623, "num_input_tokens_seen": 208392105, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.41015625, "step": 9674, "time_per_iteration": 2.3507673740386963 }, { "auxiliary_loss_clip": 0.01056269, "auxiliary_loss_mlp": 0.010414, "balance_loss_clip": 1.01473272, "balance_loss_mlp": 1.01694942, "epoch": 0.5816924695626033, "flos": 24133292248320.0, "grad_norm": 1.8301467380684988, "language_loss": 0.79698992, "learning_rate": 1.571246172811984e-06, "loss": 0.81796658, "num_input_tokens_seen": 208411755, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.39453125, "step": 9675, "time_per_iteration": 2.410081624984741 }, { "auxiliary_loss_clip": 0.01057467, "auxiliary_loss_mlp": 0.01038763, "balance_loss_clip": 1.01232243, "balance_loss_mlp": 1.01850343, "epoch": 0.5817525928152713, "flos": 21323168092800.0, "grad_norm": 1.9080999491844914, "language_loss": 0.70921588, "learning_rate": 1.5708657738521748e-06, "loss": 0.73017812, "num_input_tokens_seen": 208429995, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.390625, "step": 9676, "time_per_iteration": 2.3587865829467773 }, { "auxiliary_loss_clip": 0.01058132, "auxiliary_loss_mlp": 0.01047664, "balance_loss_clip": 1.02102089, "balance_loss_mlp": 1.01793861, "epoch": 0.5818127160679393, "flos": 26933536488960.0, "grad_norm": 2.1599011095058867, "language_loss": 0.64784658, "learning_rate": 1.5704853911651779e-06, "loss": 0.66890454, "num_input_tokens_seen": 208443655, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40234375, "step": 9677, "time_per_iteration": 2.3892252445220947 }, { "auxiliary_loss_clip": 0.01013769, "auxiliary_loss_mlp": 0.01013424, "balance_loss_clip": 1.01072943, "balance_loss_mlp": 1.00586987, "epoch": 0.5818728393206073, "flos": 63914934090240.0, "grad_norm": 0.8174540337431729, "language_loss": 0.54335356, "learning_rate": 1.5701050247654182e-06, "loss": 0.56362545, "num_input_tokens_seen": 208498405, "router_z_loss_clip": 0.02697754, "router_z_loss_mlp": 0.07910156, "step": 9678, "time_per_iteration": 3.0696749687194824 }, { "auxiliary_loss_clip": 0.0101545, "auxiliary_loss_mlp": 0.01016007, "balance_loss_clip": 1.01305091, "balance_loss_mlp": 1.00731444, "epoch": 0.5819329625732752, "flos": 64951017730560.0, "grad_norm": 0.7463029450869442, "language_loss": 0.56326097, "learning_rate": 1.569724674667319e-06, "loss": 0.58357555, "num_input_tokens_seen": 208559075, "router_z_loss_clip": 0.02954102, "router_z_loss_mlp": 0.08105469, "step": 9679, "time_per_iteration": 4.336357355117798 }, { "auxiliary_loss_clip": 0.01057994, "auxiliary_loss_mlp": 0.01039487, "balance_loss_clip": 1.01439369, "balance_loss_mlp": 1.01918554, "epoch": 0.5819930858259432, "flos": 21214204139520.0, "grad_norm": 1.5970847905496717, "language_loss": 0.66561031, "learning_rate": 1.5693443408853032e-06, "loss": 0.68658507, "num_input_tokens_seen": 208577770, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38867188, "step": 9680, "time_per_iteration": 2.402846097946167 }, { "auxiliary_loss_clip": 0.01056474, "auxiliary_loss_mlp": 0.01042126, "balance_loss_clip": 1.01625776, "balance_loss_mlp": 1.01774049, "epoch": 0.5820532090786111, "flos": 19457654215680.0, "grad_norm": 1.9379370282766968, "language_loss": 0.84198004, "learning_rate": 1.5689640234337933e-06, "loss": 0.86296606, "num_input_tokens_seen": 208595110, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38671875, "step": 9681, "time_per_iteration": 2.3989641666412354 }, { "auxiliary_loss_clip": 0.01058124, "auxiliary_loss_mlp": 0.0104456, "balance_loss_clip": 1.01751101, "balance_loss_mlp": 1.01852179, "epoch": 0.5821133323312792, "flos": 17711647522560.0, "grad_norm": 1.699104077057305, "language_loss": 0.76698035, "learning_rate": 1.5685837223272109e-06, "loss": 0.7880072, "num_input_tokens_seen": 208612080, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.39648438, "step": 9682, "time_per_iteration": 2.380551815032959 }, { "auxiliary_loss_clip": 0.01058855, "auxiliary_loss_mlp": 0.01043067, "balance_loss_clip": 1.01704383, "balance_loss_mlp": 1.01883817, "epoch": 0.5821734555839471, "flos": 24570649249920.0, "grad_norm": 2.0272570640066134, "language_loss": 0.76638001, "learning_rate": 1.568203437579977e-06, "loss": 0.78739917, "num_input_tokens_seen": 208630235, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40039062, "step": 9683, "time_per_iteration": 2.4015114307403564 }, { "auxiliary_loss_clip": 0.01059942, "auxiliary_loss_mlp": 0.01049352, "balance_loss_clip": 1.0211947, "balance_loss_mlp": 1.01877391, "epoch": 0.5822335788366151, "flos": 22381176067200.0, "grad_norm": 1.718989330574959, "language_loss": 0.74696392, "learning_rate": 1.5678231692065116e-06, "loss": 0.76805687, "num_input_tokens_seen": 208647925, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41210938, "step": 9684, "time_per_iteration": 2.399181842803955 }, { "auxiliary_loss_clip": 0.01057353, "auxiliary_loss_mlp": 0.01046158, "balance_loss_clip": 1.01933563, "balance_loss_mlp": 1.01798964, "epoch": 0.582293702089283, "flos": 26721334045440.0, "grad_norm": 2.0943156685813067, "language_loss": 0.78831232, "learning_rate": 1.5674429172212348e-06, "loss": 0.80934739, "num_input_tokens_seen": 208666180, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.39453125, "step": 9685, "time_per_iteration": 2.422429323196411 }, { "auxiliary_loss_clip": 0.01057904, "auxiliary_loss_mlp": 0.0104652, "balance_loss_clip": 1.02050889, "balance_loss_mlp": 1.0181694, "epoch": 0.582353825341951, "flos": 17347677932160.0, "grad_norm": 2.0533128560100287, "language_loss": 0.76515245, "learning_rate": 1.5670626816385667e-06, "loss": 0.78619671, "num_input_tokens_seen": 208684240, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39648438, "step": 9686, "time_per_iteration": 2.3625597953796387 }, { "auxiliary_loss_clip": 0.01014836, "auxiliary_loss_mlp": 0.0100427, "balance_loss_clip": 1.00114667, "balance_loss_mlp": 1.00668001, "epoch": 0.5824139485946189, "flos": 55470282877440.0, "grad_norm": 0.8189624474652363, "language_loss": 0.57509542, "learning_rate": 1.5666824624729244e-06, "loss": 0.59528649, "num_input_tokens_seen": 208736090, "router_z_loss_clip": 0.03125, "router_z_loss_mlp": 0.08154297, "step": 9687, "time_per_iteration": 2.8404717445373535 }, { "auxiliary_loss_clip": 0.01057914, "auxiliary_loss_mlp": 0.01046718, "balance_loss_clip": 1.02043211, "balance_loss_mlp": 1.01852131, "epoch": 0.582474071847287, "flos": 20301993469440.0, "grad_norm": 2.0031903530661275, "language_loss": 0.7080102, "learning_rate": 1.566302259738727e-06, "loss": 0.72905654, "num_input_tokens_seen": 208754600, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.39453125, "step": 9688, "time_per_iteration": 2.38323974609375 }, { "auxiliary_loss_clip": 0.01057227, "auxiliary_loss_mlp": 0.01049323, "balance_loss_clip": 1.02445602, "balance_loss_mlp": 1.01778328, "epoch": 0.5825341950999549, "flos": 23876890156800.0, "grad_norm": 2.590336256695031, "language_loss": 0.67444384, "learning_rate": 1.5659220734503918e-06, "loss": 0.69550931, "num_input_tokens_seen": 208773140, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.39453125, "step": 9689, "time_per_iteration": 2.3901262283325195 }, { "auxiliary_loss_clip": 0.01057084, "auxiliary_loss_mlp": 0.0104626, "balance_loss_clip": 1.02045131, "balance_loss_mlp": 1.01942611, "epoch": 0.5825943183526229, "flos": 23111908156800.0, "grad_norm": 1.6312637743234726, "language_loss": 0.74426997, "learning_rate": 1.5655419036223341e-06, "loss": 0.76530343, "num_input_tokens_seen": 208793410, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.375, "step": 9690, "time_per_iteration": 2.4091169834136963 }, { "auxiliary_loss_clip": 0.01058455, "auxiliary_loss_mlp": 0.01057462, "balance_loss_clip": 1.02907848, "balance_loss_mlp": 1.01861525, "epoch": 0.5826544416052909, "flos": 22856309026560.0, "grad_norm": 1.639701397527284, "language_loss": 0.7661159, "learning_rate": 1.5651617502689717e-06, "loss": 0.78727502, "num_input_tokens_seen": 208811920, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.3984375, "step": 9691, "time_per_iteration": 2.3949649333953857 }, { "auxiliary_loss_clip": 0.01058294, "auxiliary_loss_mlp": 0.01059686, "balance_loss_clip": 1.03410339, "balance_loss_mlp": 1.01796746, "epoch": 0.5827145648579588, "flos": 31500559681920.0, "grad_norm": 1.7420255440375607, "language_loss": 0.81438446, "learning_rate": 1.5647816134047184e-06, "loss": 0.83556426, "num_input_tokens_seen": 208834720, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.40234375, "step": 9692, "time_per_iteration": 2.4539248943328857 }, { "auxiliary_loss_clip": 0.01013284, "auxiliary_loss_mlp": 0.01002508, "balance_loss_clip": 0.99971884, "balance_loss_mlp": 1.00566137, "epoch": 0.5827746881106268, "flos": 69808448545920.0, "grad_norm": 0.7628410029723887, "language_loss": 0.56947231, "learning_rate": 1.5644014930439907e-06, "loss": 0.58963025, "num_input_tokens_seen": 208898415, "router_z_loss_clip": 0.0279541, "router_z_loss_mlp": 0.07617188, "step": 9693, "time_per_iteration": 3.0034701824188232 }, { "auxiliary_loss_clip": 0.01055995, "auxiliary_loss_mlp": 0.01051393, "balance_loss_clip": 1.0275631, "balance_loss_mlp": 1.01717126, "epoch": 0.5828348113632947, "flos": 23111279752320.0, "grad_norm": 1.9420154820857942, "language_loss": 0.80321366, "learning_rate": 1.5640213892012025e-06, "loss": 0.82428753, "num_input_tokens_seen": 208919045, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.38867188, "step": 9694, "time_per_iteration": 2.415757179260254 }, { "auxiliary_loss_clip": 0.0105499, "auxiliary_loss_mlp": 0.01051943, "balance_loss_clip": 1.02820849, "balance_loss_mlp": 1.01785028, "epoch": 0.5828949346159628, "flos": 21871967754240.0, "grad_norm": 1.5198913888807701, "language_loss": 0.77122825, "learning_rate": 1.5636413018907656e-06, "loss": 0.7922976, "num_input_tokens_seen": 208939375, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.37109375, "step": 9695, "time_per_iteration": 2.3731281757354736 }, { "auxiliary_loss_clip": 0.01010941, "auxiliary_loss_mlp": 0.0101456, "balance_loss_clip": 1.01169848, "balance_loss_mlp": 1.00327277, "epoch": 0.5829550578686307, "flos": 65958784525440.0, "grad_norm": 0.7916564620503238, "language_loss": 0.55058151, "learning_rate": 1.563261231127095e-06, "loss": 0.57083654, "num_input_tokens_seen": 209004760, "router_z_loss_clip": 0.02856445, "router_z_loss_mlp": 0.07666016, "step": 9696, "time_per_iteration": 3.108586072921753 }, { "auxiliary_loss_clip": 0.01058706, "auxiliary_loss_mlp": 0.01045678, "balance_loss_clip": 1.02183616, "balance_loss_mlp": 1.01922393, "epoch": 0.5830151811212987, "flos": 16288866996480.0, "grad_norm": 1.9844589090317828, "language_loss": 0.77703589, "learning_rate": 1.5628811769246021e-06, "loss": 0.79807973, "num_input_tokens_seen": 209022930, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.39453125, "step": 9697, "time_per_iteration": 2.3509905338287354 }, { "auxiliary_loss_clip": 0.01059733, "auxiliary_loss_mlp": 0.01052893, "balance_loss_clip": 1.0236392, "balance_loss_mlp": 1.01857936, "epoch": 0.5830753043739666, "flos": 24167751626880.0, "grad_norm": 1.6301584383224557, "language_loss": 0.78980643, "learning_rate": 1.5625011392976991e-06, "loss": 0.81093276, "num_input_tokens_seen": 209043740, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.41015625, "step": 9698, "time_per_iteration": 2.4063656330108643 }, { "auxiliary_loss_clip": 0.01059658, "auxiliary_loss_mlp": 0.01045729, "balance_loss_clip": 1.01957417, "balance_loss_mlp": 1.02055097, "epoch": 0.5831354276266346, "flos": 27057651972480.0, "grad_norm": 1.8997547595693893, "language_loss": 0.84697342, "learning_rate": 1.5621211182607966e-06, "loss": 0.86802727, "num_input_tokens_seen": 209068885, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.390625, "step": 9699, "time_per_iteration": 2.4957683086395264 }, { "auxiliary_loss_clip": 0.01060169, "auxiliary_loss_mlp": 0.01047299, "balance_loss_clip": 1.02075088, "balance_loss_mlp": 1.02053189, "epoch": 0.5831955508793025, "flos": 23622338367360.0, "grad_norm": 2.164695343857819, "language_loss": 0.67658496, "learning_rate": 1.561741113828305e-06, "loss": 0.69765967, "num_input_tokens_seen": 209087340, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39648438, "step": 9700, "time_per_iteration": 2.422590494155884 }, { "auxiliary_loss_clip": 0.01060138, "auxiliary_loss_mlp": 0.01039624, "balance_loss_clip": 1.01312399, "balance_loss_mlp": 1.01990676, "epoch": 0.5832556741319705, "flos": 24972080595840.0, "grad_norm": 1.6617463492113318, "language_loss": 0.71865761, "learning_rate": 1.5613611260146344e-06, "loss": 0.7396552, "num_input_tokens_seen": 209108840, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40234375, "step": 9701, "time_per_iteration": 2.403135299682617 }, { "auxiliary_loss_clip": 0.01060354, "auxiliary_loss_mlp": 0.01042401, "balance_loss_clip": 1.01778436, "balance_loss_mlp": 1.02189898, "epoch": 0.5833157973846385, "flos": 23220453173760.0, "grad_norm": 1.790134100145381, "language_loss": 0.86233765, "learning_rate": 1.5609811548341936e-06, "loss": 0.88336521, "num_input_tokens_seen": 209127985, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.38476562, "step": 9702, "time_per_iteration": 2.4219655990600586 }, { "auxiliary_loss_clip": 0.01059277, "auxiliary_loss_mlp": 0.0104261, "balance_loss_clip": 1.01854134, "balance_loss_mlp": 1.02166212, "epoch": 0.5833759206373065, "flos": 21976951812480.0, "grad_norm": 1.5488403762154956, "language_loss": 0.77953529, "learning_rate": 1.560601200301392e-06, "loss": 0.80055416, "num_input_tokens_seen": 209146885, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.375, "step": 9703, "time_per_iteration": 2.386042356491089 }, { "auxiliary_loss_clip": 0.01064443, "auxiliary_loss_mlp": 0.01041012, "balance_loss_clip": 1.01420212, "balance_loss_mlp": 1.02393389, "epoch": 0.5834360438899745, "flos": 21761328055680.0, "grad_norm": 1.7322268286976417, "language_loss": 0.72366005, "learning_rate": 1.5602212624306366e-06, "loss": 0.74471462, "num_input_tokens_seen": 209166130, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.40625, "step": 9704, "time_per_iteration": 2.4090538024902344 }, { "auxiliary_loss_clip": 0.01062611, "auxiliary_loss_mlp": 0.01045866, "balance_loss_clip": 1.0213213, "balance_loss_mlp": 1.02297032, "epoch": 0.5834961671426424, "flos": 15991791304320.0, "grad_norm": 1.7494760691715012, "language_loss": 0.82554686, "learning_rate": 1.559841341236335e-06, "loss": 0.84663159, "num_input_tokens_seen": 209183350, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.39648438, "step": 9705, "time_per_iteration": 2.368354558944702 }, { "auxiliary_loss_clip": 0.01061741, "auxiliary_loss_mlp": 0.01038845, "balance_loss_clip": 1.01461017, "balance_loss_mlp": 1.02317071, "epoch": 0.5835562903953104, "flos": 22817276259840.0, "grad_norm": 1.7628637324878906, "language_loss": 0.82132697, "learning_rate": 1.5594614367328937e-06, "loss": 0.84233284, "num_input_tokens_seen": 209203945, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.38671875, "step": 9706, "time_per_iteration": 3.8437769412994385 }, { "auxiliary_loss_clip": 0.01060551, "auxiliary_loss_mlp": 0.01042041, "balance_loss_clip": 1.01494431, "balance_loss_mlp": 1.02231216, "epoch": 0.5836164136479783, "flos": 48466288673280.0, "grad_norm": 2.3097329051112094, "language_loss": 0.75893152, "learning_rate": 1.5590815489347187e-06, "loss": 0.77995741, "num_input_tokens_seen": 209227080, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.3828125, "step": 9707, "time_per_iteration": 2.6114885807037354 }, { "auxiliary_loss_clip": 0.01061802, "auxiliary_loss_mlp": 0.01040712, "balance_loss_clip": 1.01591671, "balance_loss_mlp": 1.02398622, "epoch": 0.5836765369006464, "flos": 26904802573440.0, "grad_norm": 1.7277212846637298, "language_loss": 0.82326746, "learning_rate": 1.5587016778562163e-06, "loss": 0.84429264, "num_input_tokens_seen": 209248170, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37890625, "step": 9708, "time_per_iteration": 2.4620330333709717 }, { "auxiliary_loss_clip": 0.01062515, "auxiliary_loss_mlp": 0.01042492, "balance_loss_clip": 1.01629007, "balance_loss_mlp": 1.02433288, "epoch": 0.5837366601533143, "flos": 20083018222080.0, "grad_norm": 1.4846566879879044, "language_loss": 0.79136592, "learning_rate": 1.5583218235117896e-06, "loss": 0.81241596, "num_input_tokens_seen": 209267730, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3828125, "step": 9709, "time_per_iteration": 2.3955883979797363 }, { "auxiliary_loss_clip": 0.01016346, "auxiliary_loss_mlp": 0.01015552, "balance_loss_clip": 1.01296556, "balance_loss_mlp": 1.00884533, "epoch": 0.5837967834059823, "flos": 65360548512000.0, "grad_norm": 0.7736790168775292, "language_loss": 0.56673497, "learning_rate": 1.557941985915844e-06, "loss": 0.58705395, "num_input_tokens_seen": 209332510, "router_z_loss_clip": 0.02587891, "router_z_loss_mlp": 0.07519531, "step": 9710, "time_per_iteration": 4.419286489486694 }, { "auxiliary_loss_clip": 0.01058999, "auxiliary_loss_mlp": 0.01043962, "balance_loss_clip": 1.01935709, "balance_loss_mlp": 1.02112389, "epoch": 0.5838569066586502, "flos": 25337446640640.0, "grad_norm": 1.57584384020284, "language_loss": 0.67071807, "learning_rate": 1.5575621650827833e-06, "loss": 0.69174767, "num_input_tokens_seen": 209353355, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37890625, "step": 9711, "time_per_iteration": 3.9427759647369385 }, { "auxiliary_loss_clip": 0.01064906, "auxiliary_loss_mlp": 0.01047698, "balance_loss_clip": 1.01828945, "balance_loss_mlp": 1.0220778, "epoch": 0.5839170299113182, "flos": 22228361579520.0, "grad_norm": 2.4206433842761763, "language_loss": 0.80277276, "learning_rate": 1.5571823610270085e-06, "loss": 0.82389879, "num_input_tokens_seen": 209370960, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.4296875, "step": 9712, "time_per_iteration": 2.3955094814300537 }, { "auxiliary_loss_clip": 0.01059619, "auxiliary_loss_mlp": 0.01048816, "balance_loss_clip": 1.02328146, "balance_loss_mlp": 1.02072453, "epoch": 0.5839771531639861, "flos": 22198929436800.0, "grad_norm": 1.808255036455255, "language_loss": 0.74307442, "learning_rate": 1.5568025737629234e-06, "loss": 0.76415884, "num_input_tokens_seen": 209390955, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.390625, "step": 9713, "time_per_iteration": 2.425021171569824 }, { "auxiliary_loss_clip": 0.01063334, "auxiliary_loss_mlp": 0.01048402, "balance_loss_clip": 1.01752663, "balance_loss_mlp": 1.02143598, "epoch": 0.5840372764166541, "flos": 22418253797760.0, "grad_norm": 1.8131056496329903, "language_loss": 0.70846361, "learning_rate": 1.5564228033049292e-06, "loss": 0.729581, "num_input_tokens_seen": 209410260, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 0.41992188, "step": 9714, "time_per_iteration": 2.398369073867798 }, { "auxiliary_loss_clip": 0.01062268, "auxiliary_loss_mlp": 0.01051991, "balance_loss_clip": 1.02363062, "balance_loss_mlp": 1.02103829, "epoch": 0.5840973996693221, "flos": 19827244535040.0, "grad_norm": 1.6732015427109435, "language_loss": 0.8113693, "learning_rate": 1.5560430496674268e-06, "loss": 0.8325119, "num_input_tokens_seen": 209429920, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.41210938, "step": 9715, "time_per_iteration": 2.4085938930511475 }, { "auxiliary_loss_clip": 0.01061213, "auxiliary_loss_mlp": 0.01051681, "balance_loss_clip": 1.02471626, "balance_loss_mlp": 1.02165782, "epoch": 0.5841575229219901, "flos": 21141898980480.0, "grad_norm": 1.9361805212644985, "language_loss": 0.74431419, "learning_rate": 1.5556633128648167e-06, "loss": 0.76544309, "num_input_tokens_seen": 209449470, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.39648438, "step": 9716, "time_per_iteration": 2.396636486053467 }, { "auxiliary_loss_clip": 0.0105888, "auxiliary_loss_mlp": 0.0104387, "balance_loss_clip": 1.0190866, "balance_loss_mlp": 1.02116513, "epoch": 0.5842176461746581, "flos": 24639288716160.0, "grad_norm": 1.7342186036136324, "language_loss": 0.76303291, "learning_rate": 1.5552835929114976e-06, "loss": 0.78406042, "num_input_tokens_seen": 209467695, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37695312, "step": 9717, "time_per_iteration": 2.4350569248199463 }, { "auxiliary_loss_clip": 0.01059699, "auxiliary_loss_mlp": 0.01053342, "balance_loss_clip": 1.02673495, "balance_loss_mlp": 1.02004921, "epoch": 0.584277769427326, "flos": 19130273596800.0, "grad_norm": 2.406781584490626, "language_loss": 0.81289351, "learning_rate": 1.5549038898218697e-06, "loss": 0.83402395, "num_input_tokens_seen": 209484250, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39648438, "step": 9718, "time_per_iteration": 2.340877056121826 }, { "auxiliary_loss_clip": 0.01058515, "auxiliary_loss_mlp": 0.01054472, "balance_loss_clip": 1.02822232, "balance_loss_mlp": 1.02019143, "epoch": 0.584337892679994, "flos": 22673992573440.0, "grad_norm": 1.6621732173893033, "language_loss": 0.6823613, "learning_rate": 1.5545242036103306e-06, "loss": 0.70349121, "num_input_tokens_seen": 209502830, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3828125, "step": 9719, "time_per_iteration": 3.83272385597229 }, { "auxiliary_loss_clip": 0.01060028, "auxiliary_loss_mlp": 0.01055958, "balance_loss_clip": 1.02915967, "balance_loss_mlp": 1.02017331, "epoch": 0.5843980159326619, "flos": 31282771420800.0, "grad_norm": 2.641470012829243, "language_loss": 0.76400101, "learning_rate": 1.5541445342912786e-06, "loss": 0.7851609, "num_input_tokens_seen": 209525995, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.3984375, "step": 9720, "time_per_iteration": 2.463491201400757 }, { "auxiliary_loss_clip": 0.01057975, "auxiliary_loss_mlp": 0.01057064, "balance_loss_clip": 1.03115952, "balance_loss_mlp": 1.01869512, "epoch": 0.58445813918533, "flos": 22746995959680.0, "grad_norm": 1.5220823301755952, "language_loss": 0.83621228, "learning_rate": 1.5537648818791105e-06, "loss": 0.85736269, "num_input_tokens_seen": 209545895, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39257812, "step": 9721, "time_per_iteration": 2.3918910026550293 }, { "auxiliary_loss_clip": 0.01012137, "auxiliary_loss_mlp": 0.01006757, "balance_loss_clip": 1.00431275, "balance_loss_mlp": 1.00478566, "epoch": 0.5845182624379979, "flos": 60683548936320.0, "grad_norm": 0.9369790555966969, "language_loss": 0.71478814, "learning_rate": 1.5533852463882226e-06, "loss": 0.73497707, "num_input_tokens_seen": 209602315, "router_z_loss_clip": 0.02441406, "router_z_loss_mlp": 0.07324219, "step": 9722, "time_per_iteration": 3.0392560958862305 }, { "auxiliary_loss_clip": 0.01056576, "auxiliary_loss_mlp": 0.01050538, "balance_loss_clip": 1.02362061, "balance_loss_mlp": 1.01796544, "epoch": 0.5845783856906659, "flos": 16361521269120.0, "grad_norm": 2.741151566696127, "language_loss": 0.9109118, "learning_rate": 1.5530056278330113e-06, "loss": 0.93198299, "num_input_tokens_seen": 209617615, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.38671875, "step": 9723, "time_per_iteration": 2.3523313999176025 }, { "auxiliary_loss_clip": 0.01057029, "auxiliary_loss_mlp": 0.01048714, "balance_loss_clip": 1.02230906, "balance_loss_mlp": 1.01857877, "epoch": 0.5846385089433338, "flos": 20082389817600.0, "grad_norm": 1.7480765550085273, "language_loss": 0.69075775, "learning_rate": 1.5526260262278709e-06, "loss": 0.71181512, "num_input_tokens_seen": 209637005, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.38476562, "step": 9724, "time_per_iteration": 2.4231467247009277 }, { "auxiliary_loss_clip": 0.01061828, "auxiliary_loss_mlp": 0.01061403, "balance_loss_clip": 1.03195846, "balance_loss_mlp": 1.02076185, "epoch": 0.5846986321960018, "flos": 17310111442560.0, "grad_norm": 2.4149565492202454, "language_loss": 0.87805009, "learning_rate": 1.552246441587197e-06, "loss": 0.89928234, "num_input_tokens_seen": 209653170, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.41015625, "step": 9725, "time_per_iteration": 2.3524668216705322 }, { "auxiliary_loss_clip": 0.01061921, "auxiliary_loss_mlp": 0.01052979, "balance_loss_clip": 1.02432072, "balance_loss_mlp": 1.02010047, "epoch": 0.5847587554486697, "flos": 17197062860160.0, "grad_norm": 1.5831958510496844, "language_loss": 0.840478, "learning_rate": 1.5518668739253821e-06, "loss": 0.86162698, "num_input_tokens_seen": 209671275, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.41796875, "step": 9726, "time_per_iteration": 2.400054693222046 }, { "auxiliary_loss_clip": 0.01059084, "auxiliary_loss_mlp": 0.01049149, "balance_loss_clip": 1.02307844, "balance_loss_mlp": 1.01917517, "epoch": 0.5848188787013378, "flos": 24528125347200.0, "grad_norm": 1.9164751397416517, "language_loss": 0.67587441, "learning_rate": 1.5514873232568206e-06, "loss": 0.69695675, "num_input_tokens_seen": 209690380, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3984375, "step": 9727, "time_per_iteration": 2.424271583557129 }, { "auxiliary_loss_clip": 0.01062589, "auxiliary_loss_mlp": 0.0105633, "balance_loss_clip": 1.02704024, "balance_loss_mlp": 1.02173638, "epoch": 0.5848790019540057, "flos": 20627419052160.0, "grad_norm": 1.6944164624528744, "language_loss": 0.82469261, "learning_rate": 1.5511077895959055e-06, "loss": 0.84588182, "num_input_tokens_seen": 209708845, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.40820312, "step": 9728, "time_per_iteration": 2.4002597332000732 }, { "auxiliary_loss_clip": 0.01058267, "auxiliary_loss_mlp": 0.01050975, "balance_loss_clip": 1.0247016, "balance_loss_mlp": 1.01957393, "epoch": 0.5849391252066737, "flos": 22417765038720.0, "grad_norm": 1.70536624127662, "language_loss": 0.78270817, "learning_rate": 1.550728272957027e-06, "loss": 0.80380058, "num_input_tokens_seen": 209729000, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38671875, "step": 9729, "time_per_iteration": 2.412889242172241 }, { "auxiliary_loss_clip": 0.01060481, "auxiliary_loss_mlp": 0.01046862, "balance_loss_clip": 1.01850247, "balance_loss_mlp": 1.01961231, "epoch": 0.5849992484593417, "flos": 25409751799680.0, "grad_norm": 1.9475581020909587, "language_loss": 0.71908486, "learning_rate": 1.5503487733545782e-06, "loss": 0.74015832, "num_input_tokens_seen": 209747435, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.40820312, "step": 9730, "time_per_iteration": 2.417783498764038 }, { "auxiliary_loss_clip": 0.01061004, "auxiliary_loss_mlp": 0.01050643, "balance_loss_clip": 1.0205071, "balance_loss_mlp": 1.01900578, "epoch": 0.5850593717120096, "flos": 21064217472000.0, "grad_norm": 1.6425034824779405, "language_loss": 0.7969386, "learning_rate": 1.5499692908029482e-06, "loss": 0.81805503, "num_input_tokens_seen": 209764910, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.41992188, "step": 9731, "time_per_iteration": 2.419891357421875 }, { "auxiliary_loss_clip": 0.01059146, "auxiliary_loss_mlp": 0.01050478, "balance_loss_clip": 1.02146268, "balance_loss_mlp": 1.01908207, "epoch": 0.5851194949646776, "flos": 25300368910080.0, "grad_norm": 1.911264245744276, "language_loss": 0.71980989, "learning_rate": 1.549589825316528e-06, "loss": 0.74090612, "num_input_tokens_seen": 209786115, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.40039062, "step": 9732, "time_per_iteration": 2.451434850692749 }, { "auxiliary_loss_clip": 0.01063151, "auxiliary_loss_mlp": 0.0104865, "balance_loss_clip": 1.0175842, "balance_loss_mlp": 1.02119899, "epoch": 0.5851796182173455, "flos": 23586098509440.0, "grad_norm": 2.7559214044598, "language_loss": 0.54748499, "learning_rate": 1.5492103769097075e-06, "loss": 0.56860298, "num_input_tokens_seen": 209806095, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 0.41992188, "step": 9733, "time_per_iteration": 2.4483511447906494 }, { "auxiliary_loss_clip": 0.01061617, "auxiliary_loss_mlp": 0.01048979, "balance_loss_clip": 1.02011812, "balance_loss_mlp": 1.02085495, "epoch": 0.5852397414700136, "flos": 24821674992000.0, "grad_norm": 2.9352899712635034, "language_loss": 0.89963114, "learning_rate": 1.5488309455968739e-06, "loss": 0.92073709, "num_input_tokens_seen": 209823650, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.40820312, "step": 9734, "time_per_iteration": 2.39829683303833 }, { "auxiliary_loss_clip": 0.01058067, "auxiliary_loss_mlp": 0.01045235, "balance_loss_clip": 1.01855564, "balance_loss_mlp": 1.02112269, "epoch": 0.5852998647226815, "flos": 19936767070080.0, "grad_norm": 1.5407475061721285, "language_loss": 0.73723245, "learning_rate": 1.5484515313924163e-06, "loss": 0.7582655, "num_input_tokens_seen": 209843220, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.36914062, "step": 9735, "time_per_iteration": 2.3957083225250244 }, { "auxiliary_loss_clip": 0.01063355, "auxiliary_loss_mlp": 0.01052199, "balance_loss_clip": 1.02268267, "balance_loss_mlp": 1.02133012, "epoch": 0.5853599879753495, "flos": 16719625751040.0, "grad_norm": 2.907834911069106, "language_loss": 0.75574118, "learning_rate": 1.5480721343107217e-06, "loss": 0.77689672, "num_input_tokens_seen": 209854880, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.41992188, "step": 9736, "time_per_iteration": 2.3294007778167725 }, { "auxiliary_loss_clip": 0.01060304, "auxiliary_loss_mlp": 0.01045268, "balance_loss_clip": 1.01808858, "balance_loss_mlp": 1.02131295, "epoch": 0.5854201112280174, "flos": 44454872856960.0, "grad_norm": 1.5357707854391474, "language_loss": 0.71728212, "learning_rate": 1.5476927543661772e-06, "loss": 0.73833781, "num_input_tokens_seen": 209877870, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.390625, "step": 9737, "time_per_iteration": 2.5804479122161865 }, { "auxiliary_loss_clip": 0.01060165, "auxiliary_loss_mlp": 0.01046879, "balance_loss_clip": 1.02080846, "balance_loss_mlp": 1.02153492, "epoch": 0.5854802344806854, "flos": 20338163504640.0, "grad_norm": 1.634433807636634, "language_loss": 0.83826458, "learning_rate": 1.547313391573169e-06, "loss": 0.85933501, "num_input_tokens_seen": 209896690, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.38671875, "step": 9738, "time_per_iteration": 2.401454210281372 }, { "auxiliary_loss_clip": 0.01066013, "auxiliary_loss_mlp": 0.0104641, "balance_loss_clip": 1.01735926, "balance_loss_mlp": 1.02406943, "epoch": 0.5855403577333533, "flos": 20920061001600.0, "grad_norm": 2.317941877396921, "language_loss": 0.69406289, "learning_rate": 1.546934045946082e-06, "loss": 0.71518707, "num_input_tokens_seen": 209914640, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.41992188, "step": 9739, "time_per_iteration": 2.4022514820098877 }, { "auxiliary_loss_clip": 0.01063427, "auxiliary_loss_mlp": 0.01037195, "balance_loss_clip": 1.00980091, "balance_loss_mlp": 1.02155352, "epoch": 0.5856004809860214, "flos": 20447616216960.0, "grad_norm": 3.381840185859494, "language_loss": 0.59718215, "learning_rate": 1.5465547174993017e-06, "loss": 0.61818838, "num_input_tokens_seen": 209933375, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.41796875, "step": 9740, "time_per_iteration": 2.3687727451324463 }, { "auxiliary_loss_clip": 0.01059513, "auxiliary_loss_mlp": 0.01042137, "balance_loss_clip": 1.01532698, "balance_loss_mlp": 1.02046061, "epoch": 0.5856606042386893, "flos": 19639900846080.0, "grad_norm": 1.973944953396449, "language_loss": 0.75795615, "learning_rate": 1.5461754062472113e-06, "loss": 0.77897263, "num_input_tokens_seen": 209952055, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.390625, "step": 9741, "time_per_iteration": 2.4126803874969482 }, { "auxiliary_loss_clip": 0.01063716, "auxiliary_loss_mlp": 0.01049484, "balance_loss_clip": 1.02118373, "balance_loss_mlp": 1.02358317, "epoch": 0.5857207274913573, "flos": 21685182647040.0, "grad_norm": 1.683553134901866, "language_loss": 0.76890856, "learning_rate": 1.5457961122041959e-06, "loss": 0.79004055, "num_input_tokens_seen": 209971190, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.40039062, "step": 9742, "time_per_iteration": 2.396118402481079 }, { "auxiliary_loss_clip": 0.01061308, "auxiliary_loss_mlp": 0.01043399, "balance_loss_clip": 1.01697016, "balance_loss_mlp": 1.02206182, "epoch": 0.5857808507440253, "flos": 23181664786560.0, "grad_norm": 1.6629997325539856, "language_loss": 0.76161021, "learning_rate": 1.5454168353846369e-06, "loss": 0.78265727, "num_input_tokens_seen": 209990695, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.39257812, "step": 9743, "time_per_iteration": 2.4101898670196533 }, { "auxiliary_loss_clip": 0.0106037, "auxiliary_loss_mlp": 0.01041179, "balance_loss_clip": 1.0157752, "balance_loss_mlp": 1.02184474, "epoch": 0.5858409739966932, "flos": 27234068405760.0, "grad_norm": 1.91162794777979, "language_loss": 0.8237381, "learning_rate": 1.5450375758029172e-06, "loss": 0.84475356, "num_input_tokens_seen": 210010210, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38476562, "step": 9744, "time_per_iteration": 2.4350175857543945 }, { "auxiliary_loss_clip": 0.01063383, "auxiliary_loss_mlp": 0.01040377, "balance_loss_clip": 1.01348352, "balance_loss_mlp": 1.02220893, "epoch": 0.5859010972493612, "flos": 27854265530880.0, "grad_norm": 1.7755557672198368, "language_loss": 0.72233105, "learning_rate": 1.5446583334734183e-06, "loss": 0.74336863, "num_input_tokens_seen": 210030030, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.41210938, "step": 9745, "time_per_iteration": 3.7323148250579834 }, { "auxiliary_loss_clip": 0.01022966, "auxiliary_loss_mlp": 0.01017372, "balance_loss_clip": 1.01453495, "balance_loss_mlp": 1.01521647, "epoch": 0.5859612205020291, "flos": 70003333088640.0, "grad_norm": 0.7370768540706987, "language_loss": 0.53314221, "learning_rate": 1.5442791084105204e-06, "loss": 0.55354559, "num_input_tokens_seen": 210094840, "router_z_loss_clip": 0.02832031, "router_z_loss_mlp": 0.07763672, "step": 9746, "time_per_iteration": 3.1080479621887207 }, { "auxiliary_loss_clip": 0.01061932, "auxiliary_loss_mlp": 0.01048794, "balance_loss_clip": 1.02058959, "balance_loss_mlp": 1.02080011, "epoch": 0.5860213437546972, "flos": 24055017246720.0, "grad_norm": 1.9249684550651125, "language_loss": 0.74429631, "learning_rate": 1.5438999006286054e-06, "loss": 0.76540351, "num_input_tokens_seen": 210114660, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41015625, "step": 9747, "time_per_iteration": 2.400510311126709 }, { "auxiliary_loss_clip": 0.01062437, "auxiliary_loss_mlp": 0.0104897, "balance_loss_clip": 1.02106309, "balance_loss_mlp": 1.0218823, "epoch": 0.5860814670073651, "flos": 18946735246080.0, "grad_norm": 2.2526233750786444, "language_loss": 0.82245249, "learning_rate": 1.543520710142051e-06, "loss": 0.84356654, "num_input_tokens_seen": 210132770, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40625, "step": 9748, "time_per_iteration": 2.3947677612304688 }, { "auxiliary_loss_clip": 0.01060535, "auxiliary_loss_mlp": 0.01047625, "balance_loss_clip": 1.01856208, "balance_loss_mlp": 1.02019918, "epoch": 0.5861415902600331, "flos": 22560839256960.0, "grad_norm": 1.7823963644150866, "language_loss": 0.73263371, "learning_rate": 1.5431415369652375e-06, "loss": 0.75371528, "num_input_tokens_seen": 210151895, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.40234375, "step": 9749, "time_per_iteration": 2.388312816619873 }, { "auxiliary_loss_clip": 0.01059115, "auxiliary_loss_mlp": 0.01050986, "balance_loss_clip": 1.02397346, "balance_loss_mlp": 1.02013493, "epoch": 0.586201713512701, "flos": 14391162979200.0, "grad_norm": 2.5057649052766076, "language_loss": 0.76080716, "learning_rate": 1.5427623811125428e-06, "loss": 0.78190815, "num_input_tokens_seen": 210168040, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.390625, "step": 9750, "time_per_iteration": 3.818565607070923 }, { "auxiliary_loss_clip": 0.0105871, "auxiliary_loss_mlp": 0.01050268, "balance_loss_clip": 1.02093065, "balance_loss_mlp": 1.01973081, "epoch": 0.586261836765369, "flos": 19497594677760.0, "grad_norm": 1.9471842730553588, "language_loss": 0.71956974, "learning_rate": 1.542383242598344e-06, "loss": 0.74065953, "num_input_tokens_seen": 210187720, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.390625, "step": 9751, "time_per_iteration": 3.810014009475708 }, { "auxiliary_loss_clip": 0.01059518, "auxiliary_loss_mlp": 0.01053057, "balance_loss_clip": 1.02404165, "balance_loss_mlp": 1.01909554, "epoch": 0.5863219600180369, "flos": 20700841374720.0, "grad_norm": 1.743053692931917, "language_loss": 0.75879192, "learning_rate": 1.5420041214370184e-06, "loss": 0.77991766, "num_input_tokens_seen": 210206080, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.40429688, "step": 9752, "time_per_iteration": 2.391598701477051 }, { "auxiliary_loss_clip": 0.01056448, "auxiliary_loss_mlp": 0.01051816, "balance_loss_clip": 1.02423072, "balance_loss_mlp": 1.01819229, "epoch": 0.586382083270705, "flos": 19791109411200.0, "grad_norm": 1.827747827252866, "language_loss": 0.79210883, "learning_rate": 1.541625017642943e-06, "loss": 0.81319147, "num_input_tokens_seen": 210225660, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.3828125, "step": 9753, "time_per_iteration": 2.392061471939087 }, { "auxiliary_loss_clip": 0.01057221, "auxiliary_loss_mlp": 0.0104409, "balance_loss_clip": 1.01954508, "balance_loss_mlp": 1.02006292, "epoch": 0.5864422065233729, "flos": 16499847542400.0, "grad_norm": 4.031536510987028, "language_loss": 0.72227943, "learning_rate": 1.5412459312304927e-06, "loss": 0.74329263, "num_input_tokens_seen": 210242725, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37109375, "step": 9754, "time_per_iteration": 2.36039137840271 }, { "auxiliary_loss_clip": 0.01056517, "auxiliary_loss_mlp": 0.01051991, "balance_loss_clip": 1.02468038, "balance_loss_mlp": 1.0173434, "epoch": 0.5865023297760409, "flos": 20412214231680.0, "grad_norm": 1.5888446302874326, "language_loss": 0.73173845, "learning_rate": 1.540866862214043e-06, "loss": 0.75282359, "num_input_tokens_seen": 210263225, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.39257812, "step": 9755, "time_per_iteration": 2.4182701110839844 }, { "auxiliary_loss_clip": 0.01013033, "auxiliary_loss_mlp": 0.01006134, "balance_loss_clip": 1.00347519, "balance_loss_mlp": 1.00555003, "epoch": 0.5865624530287089, "flos": 63347666319360.0, "grad_norm": 0.7413834788762707, "language_loss": 0.56997204, "learning_rate": 1.540487810607967e-06, "loss": 0.59016371, "num_input_tokens_seen": 210322310, "router_z_loss_clip": 0.02661133, "router_z_loss_mlp": 0.07470703, "step": 9756, "time_per_iteration": 3.0084025859832764 }, { "auxiliary_loss_clip": 0.01055327, "auxiliary_loss_mlp": 0.01047551, "balance_loss_clip": 1.02332783, "balance_loss_mlp": 1.01786399, "epoch": 0.5866225762813768, "flos": 27015058247040.0, "grad_norm": 1.6775201664448427, "language_loss": 0.77615452, "learning_rate": 1.5401087764266396e-06, "loss": 0.79718328, "num_input_tokens_seen": 210340845, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.375, "step": 9757, "time_per_iteration": 2.4750819206237793 }, { "auxiliary_loss_clip": 0.01012298, "auxiliary_loss_mlp": 0.0102221, "balance_loss_clip": 1.01956403, "balance_loss_mlp": 1.00502825, "epoch": 0.5866826995340448, "flos": 72983554721280.0, "grad_norm": 0.8564145177413587, "language_loss": 0.60649371, "learning_rate": 1.5397297596844337e-06, "loss": 0.6268388, "num_input_tokens_seen": 210397815, "router_z_loss_clip": 0.02648926, "router_z_loss_mlp": 0.07275391, "step": 9758, "time_per_iteration": 4.476247787475586 }, { "auxiliary_loss_clip": 0.0105902, "auxiliary_loss_mlp": 0.01055727, "balance_loss_clip": 1.02704573, "balance_loss_mlp": 1.01871252, "epoch": 0.5867428227867127, "flos": 21284728819200.0, "grad_norm": 2.0605510637049256, "language_loss": 0.73864412, "learning_rate": 1.5393507603957212e-06, "loss": 0.75979161, "num_input_tokens_seen": 210413900, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.40234375, "step": 9759, "time_per_iteration": 2.415647029876709 }, { "auxiliary_loss_clip": 0.01057867, "auxiliary_loss_mlp": 0.01057549, "balance_loss_clip": 1.03154922, "balance_loss_mlp": 1.01845789, "epoch": 0.5868029460393808, "flos": 33467601392640.0, "grad_norm": 1.9039061845484746, "language_loss": 0.73792493, "learning_rate": 1.5389717785748742e-06, "loss": 0.75907904, "num_input_tokens_seen": 210434110, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39453125, "step": 9760, "time_per_iteration": 2.501605272293091 }, { "auxiliary_loss_clip": 0.01057031, "auxiliary_loss_mlp": 0.01048093, "balance_loss_clip": 1.02184296, "balance_loss_mlp": 1.01796579, "epoch": 0.5868630692920487, "flos": 17888657448960.0, "grad_norm": 3.0528149530363113, "language_loss": 0.74023652, "learning_rate": 1.5385928142362637e-06, "loss": 0.76128781, "num_input_tokens_seen": 210451685, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.390625, "step": 9761, "time_per_iteration": 2.3685927391052246 }, { "auxiliary_loss_clip": 0.01061634, "auxiliary_loss_mlp": 0.01056957, "balance_loss_clip": 1.02701116, "balance_loss_mlp": 1.0205617, "epoch": 0.5869231925447167, "flos": 21033912545280.0, "grad_norm": 1.8360524145174864, "language_loss": 0.75692928, "learning_rate": 1.5382138673942597e-06, "loss": 0.77811515, "num_input_tokens_seen": 210470825, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.41015625, "step": 9762, "time_per_iteration": 2.3865840435028076 }, { "auxiliary_loss_clip": 0.0105966, "auxiliary_loss_mlp": 0.01053581, "balance_loss_clip": 1.0259366, "balance_loss_mlp": 1.02157152, "epoch": 0.5869833157973846, "flos": 74735707680000.0, "grad_norm": 1.2714532985385563, "language_loss": 0.73446083, "learning_rate": 1.5378349380632317e-06, "loss": 0.7555933, "num_input_tokens_seen": 210500075, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.37890625, "step": 9763, "time_per_iteration": 2.822741985321045 }, { "auxiliary_loss_clip": 0.0105903, "auxiliary_loss_mlp": 0.01043066, "balance_loss_clip": 1.01827013, "balance_loss_mlp": 1.02111769, "epoch": 0.5870434390500526, "flos": 17638050643200.0, "grad_norm": 1.5031529366500294, "language_loss": 0.81271434, "learning_rate": 1.53745602625755e-06, "loss": 0.83373535, "num_input_tokens_seen": 210518150, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37890625, "step": 9764, "time_per_iteration": 2.428218126296997 }, { "auxiliary_loss_clip": 0.01062841, "auxiliary_loss_mlp": 0.01051003, "balance_loss_clip": 1.02490783, "balance_loss_mlp": 1.0227741, "epoch": 0.5871035623027205, "flos": 21505100520960.0, "grad_norm": 1.6664280941894007, "language_loss": 0.79999435, "learning_rate": 1.5370771319915819e-06, "loss": 0.82113284, "num_input_tokens_seen": 210537760, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40039062, "step": 9765, "time_per_iteration": 2.4260759353637695 }, { "auxiliary_loss_clip": 0.01059844, "auxiliary_loss_mlp": 0.01051337, "balance_loss_clip": 1.02393103, "balance_loss_mlp": 1.02197659, "epoch": 0.5871636855553886, "flos": 13551048000000.0, "grad_norm": 1.8576583243589238, "language_loss": 0.85036755, "learning_rate": 1.5366982552796947e-06, "loss": 0.87147939, "num_input_tokens_seen": 210555515, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.37890625, "step": 9766, "time_per_iteration": 2.377051591873169 }, { "auxiliary_loss_clip": 0.01065724, "auxiliary_loss_mlp": 0.01050612, "balance_loss_clip": 1.02357566, "balance_loss_mlp": 1.02399993, "epoch": 0.5872238088080565, "flos": 26211741707520.0, "grad_norm": 1.515951626664863, "language_loss": 0.70344818, "learning_rate": 1.536319396136257e-06, "loss": 0.72461152, "num_input_tokens_seen": 210575000, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.41796875, "step": 9767, "time_per_iteration": 2.4644381999969482 }, { "auxiliary_loss_clip": 0.01063813, "auxiliary_loss_mlp": 0.01039561, "balance_loss_clip": 1.01424122, "balance_loss_mlp": 1.02376509, "epoch": 0.5872839320607245, "flos": 30663866016000.0, "grad_norm": 1.9142038107917099, "language_loss": 0.65528989, "learning_rate": 1.5359405545756336e-06, "loss": 0.67632365, "num_input_tokens_seen": 210595185, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40039062, "step": 9768, "time_per_iteration": 2.4804739952087402 }, { "auxiliary_loss_clip": 0.01022791, "auxiliary_loss_mlp": 0.01026798, "balance_loss_clip": 1.02427053, "balance_loss_mlp": 1.0152812, "epoch": 0.5873440553133924, "flos": 60300062029440.0, "grad_norm": 0.7398461795525371, "language_loss": 0.54055202, "learning_rate": 1.5355617306121914e-06, "loss": 0.56104791, "num_input_tokens_seen": 210653210, "router_z_loss_clip": 0.02526855, "router_z_loss_mlp": 0.07519531, "step": 9769, "time_per_iteration": 3.053971290588379 }, { "auxiliary_loss_clip": 0.0106314, "auxiliary_loss_mlp": 0.01039379, "balance_loss_clip": 1.01515591, "balance_loss_mlp": 1.02482939, "epoch": 0.5874041785660604, "flos": 21538338001920.0, "grad_norm": 1.434541581601141, "language_loss": 0.71239769, "learning_rate": 1.5351829242602945e-06, "loss": 0.73342288, "num_input_tokens_seen": 210673750, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3828125, "step": 9770, "time_per_iteration": 2.4311909675598145 }, { "auxiliary_loss_clip": 0.01061415, "auxiliary_loss_mlp": 0.01044015, "balance_loss_clip": 1.01628721, "balance_loss_mlp": 1.0235796, "epoch": 0.5874643018187284, "flos": 24387809126400.0, "grad_norm": 1.8488738731751522, "language_loss": 0.68552989, "learning_rate": 1.5348041355343077e-06, "loss": 0.70658422, "num_input_tokens_seen": 210692960, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.37890625, "step": 9771, "time_per_iteration": 2.43574857711792 }, { "auxiliary_loss_clip": 0.01064902, "auxiliary_loss_mlp": 0.01041826, "balance_loss_clip": 1.01617229, "balance_loss_mlp": 1.02476048, "epoch": 0.5875244250713964, "flos": 28146453632640.0, "grad_norm": 2.0559316889919272, "language_loss": 0.67251682, "learning_rate": 1.5344253644485954e-06, "loss": 0.69358408, "num_input_tokens_seen": 210714040, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40234375, "step": 9772, "time_per_iteration": 2.479489803314209 }, { "auxiliary_loss_clip": 0.01065879, "auxiliary_loss_mlp": 0.01051966, "balance_loss_clip": 1.02402306, "balance_loss_mlp": 1.02483547, "epoch": 0.5875845483240644, "flos": 25811218056960.0, "grad_norm": 2.041369653311522, "language_loss": 0.75990045, "learning_rate": 1.534046611017519e-06, "loss": 0.78107888, "num_input_tokens_seen": 210733710, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41015625, "step": 9773, "time_per_iteration": 2.4428458213806152 }, { "auxiliary_loss_clip": 0.01063985, "auxiliary_loss_mlp": 0.01051156, "balance_loss_clip": 1.02388096, "balance_loss_mlp": 1.02379918, "epoch": 0.5876446715767323, "flos": 26905361155200.0, "grad_norm": 2.4598936169141683, "language_loss": 0.54777348, "learning_rate": 1.5336678752554421e-06, "loss": 0.56892484, "num_input_tokens_seen": 210753580, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40234375, "step": 9774, "time_per_iteration": 2.4470460414886475 }, { "auxiliary_loss_clip": 0.0106375, "auxiliary_loss_mlp": 0.01050041, "balance_loss_clip": 1.02293265, "balance_loss_mlp": 1.02386355, "epoch": 0.5877047948294003, "flos": 36683346257280.0, "grad_norm": 2.4216129762090306, "language_loss": 0.66738808, "learning_rate": 1.5332891571767264e-06, "loss": 0.68852603, "num_input_tokens_seen": 210773495, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.3984375, "step": 9775, "time_per_iteration": 2.5285141468048096 }, { "auxiliary_loss_clip": 0.01063399, "auxiliary_loss_mlp": 0.0105179, "balance_loss_clip": 1.02526605, "balance_loss_mlp": 1.02381968, "epoch": 0.5877649180820682, "flos": 26723498549760.0, "grad_norm": 1.826914951963819, "language_loss": 0.7476725, "learning_rate": 1.5329104567957326e-06, "loss": 0.76882434, "num_input_tokens_seen": 210793645, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39453125, "step": 9776, "time_per_iteration": 2.456399917602539 }, { "auxiliary_loss_clip": 0.01063431, "auxiliary_loss_mlp": 0.01055286, "balance_loss_clip": 1.02872646, "balance_loss_mlp": 1.02354527, "epoch": 0.5878250413347362, "flos": 21031154547840.0, "grad_norm": 1.902457289599324, "language_loss": 0.75696641, "learning_rate": 1.532531774126821e-06, "loss": 0.77815354, "num_input_tokens_seen": 210813415, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.3984375, "step": 9777, "time_per_iteration": 2.4330735206604004 }, { "auxiliary_loss_clip": 0.01060399, "auxiliary_loss_mlp": 0.01045045, "balance_loss_clip": 1.02083373, "balance_loss_mlp": 1.02396441, "epoch": 0.5878851645874041, "flos": 25483069388160.0, "grad_norm": 1.6636980117061506, "language_loss": 0.75383389, "learning_rate": 1.5321531091843512e-06, "loss": 0.77488828, "num_input_tokens_seen": 210833850, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36523438, "step": 9778, "time_per_iteration": 2.4269490242004395 }, { "auxiliary_loss_clip": 0.0106018, "auxiliary_loss_mlp": 0.01053131, "balance_loss_clip": 1.02866948, "balance_loss_mlp": 1.02257562, "epoch": 0.5879452878400722, "flos": 23767996026240.0, "grad_norm": 2.0318245475368504, "language_loss": 0.70881462, "learning_rate": 1.5317744619826824e-06, "loss": 0.72994775, "num_input_tokens_seen": 210853115, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.375, "step": 9779, "time_per_iteration": 2.4315452575683594 }, { "auxiliary_loss_clip": 0.01061388, "auxiliary_loss_mlp": 0.01056449, "balance_loss_clip": 1.03116441, "balance_loss_mlp": 1.02175117, "epoch": 0.5880054110927401, "flos": 17823474207360.0, "grad_norm": 1.9399220722520079, "language_loss": 0.68221283, "learning_rate": 1.5313958325361727e-06, "loss": 0.70339119, "num_input_tokens_seen": 210872090, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.39648438, "step": 9780, "time_per_iteration": 2.402477979660034 }, { "auxiliary_loss_clip": 0.01062199, "auxiliary_loss_mlp": 0.01055693, "balance_loss_clip": 1.02775002, "balance_loss_mlp": 1.02363515, "epoch": 0.5880655343454081, "flos": 19462402160640.0, "grad_norm": 1.9263018884879528, "language_loss": 0.73937577, "learning_rate": 1.5310172208591807e-06, "loss": 0.76055467, "num_input_tokens_seen": 210888490, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.38671875, "step": 9781, "time_per_iteration": 2.3893721103668213 }, { "auxiliary_loss_clip": 0.01059597, "auxiliary_loss_mlp": 0.0105769, "balance_loss_clip": 1.03294241, "balance_loss_mlp": 1.0208981, "epoch": 0.588125657598076, "flos": 21396520592640.0, "grad_norm": 1.6476443585366127, "language_loss": 0.70904386, "learning_rate": 1.5306386269660622e-06, "loss": 0.73021674, "num_input_tokens_seen": 210908220, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38671875, "step": 9782, "time_per_iteration": 2.411966323852539 }, { "auxiliary_loss_clip": 0.01062036, "auxiliary_loss_mlp": 0.01055708, "balance_loss_clip": 1.02824259, "balance_loss_mlp": 1.02131593, "epoch": 0.588185780850744, "flos": 16033721713920.0, "grad_norm": 2.0895369699825412, "language_loss": 0.7221247, "learning_rate": 1.5302600508711741e-06, "loss": 0.74330211, "num_input_tokens_seen": 210923945, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.40625, "step": 9783, "time_per_iteration": 2.4257309436798096 }, { "auxiliary_loss_clip": 0.01060742, "auxiliary_loss_mlp": 0.01049014, "balance_loss_clip": 1.02064204, "balance_loss_mlp": 1.02058792, "epoch": 0.588245904103412, "flos": 23727217691520.0, "grad_norm": 2.1301783880466316, "language_loss": 0.70159727, "learning_rate": 1.5298814925888719e-06, "loss": 0.72269487, "num_input_tokens_seen": 210941955, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40234375, "step": 9784, "time_per_iteration": 2.3934402465820312 }, { "auxiliary_loss_clip": 0.01059562, "auxiliary_loss_mlp": 0.01048355, "balance_loss_clip": 1.02212882, "balance_loss_mlp": 1.01947534, "epoch": 0.58830602735608, "flos": 33801126410880.0, "grad_norm": 1.9242367773335916, "language_loss": 0.69917047, "learning_rate": 1.5295029521335102e-06, "loss": 0.72024965, "num_input_tokens_seen": 210963105, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40039062, "step": 9785, "time_per_iteration": 3.8367767333984375 }, { "auxiliary_loss_clip": 0.01057126, "auxiliary_loss_mlp": 0.01048175, "balance_loss_clip": 1.02340281, "balance_loss_mlp": 1.01947689, "epoch": 0.588366150608748, "flos": 17089809563520.0, "grad_norm": 2.248249477992592, "language_loss": 0.79706579, "learning_rate": 1.5291244295194448e-06, "loss": 0.81811881, "num_input_tokens_seen": 210978720, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37695312, "step": 9786, "time_per_iteration": 2.3927433490753174 }, { "auxiliary_loss_clip": 0.01057991, "auxiliary_loss_mlp": 0.01058908, "balance_loss_clip": 1.03166854, "balance_loss_mlp": 1.01885033, "epoch": 0.5884262738614159, "flos": 22126100607360.0, "grad_norm": 16.03189773149027, "language_loss": 0.80568814, "learning_rate": 1.5287459247610276e-06, "loss": 0.82685715, "num_input_tokens_seen": 210998750, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.39257812, "step": 9787, "time_per_iteration": 2.404615640640259 }, { "auxiliary_loss_clip": 0.01056261, "auxiliary_loss_mlp": 0.01047426, "balance_loss_clip": 1.02286935, "balance_loss_mlp": 1.01739287, "epoch": 0.5884863971140839, "flos": 21030805434240.0, "grad_norm": 1.5489621279929908, "language_loss": 0.67235696, "learning_rate": 1.5283674378726116e-06, "loss": 0.69339383, "num_input_tokens_seen": 211017550, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.38867188, "step": 9788, "time_per_iteration": 2.4064552783966064 }, { "auxiliary_loss_clip": 0.01055916, "auxiliary_loss_mlp": 0.0103978, "balance_loss_clip": 1.01531875, "balance_loss_mlp": 1.01893878, "epoch": 0.5885465203667518, "flos": 23803991504640.0, "grad_norm": 2.0758238338132093, "language_loss": 0.81393886, "learning_rate": 1.5279889688685506e-06, "loss": 0.83489579, "num_input_tokens_seen": 211034135, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36914062, "step": 9789, "time_per_iteration": 2.3997650146484375 }, { "auxiliary_loss_clip": 0.01055438, "auxiliary_loss_mlp": 0.01041426, "balance_loss_clip": 1.0167855, "balance_loss_mlp": 1.01817048, "epoch": 0.5886066436194198, "flos": 18879562056960.0, "grad_norm": 1.4910676215258065, "language_loss": 0.7038579, "learning_rate": 1.5276105177631944e-06, "loss": 0.72482657, "num_input_tokens_seen": 211053850, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37304688, "step": 9790, "time_per_iteration": 5.266185760498047 }, { "auxiliary_loss_clip": 0.01057387, "auxiliary_loss_mlp": 0.01043256, "balance_loss_clip": 1.01750648, "balance_loss_mlp": 1.01934624, "epoch": 0.5886667668720877, "flos": 24788996092800.0, "grad_norm": 1.7933219988503695, "language_loss": 0.84635949, "learning_rate": 1.527232084570895e-06, "loss": 0.8673659, "num_input_tokens_seen": 211072165, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37890625, "step": 9791, "time_per_iteration": 2.4299306869506836 }, { "auxiliary_loss_clip": 0.01057898, "auxiliary_loss_mlp": 0.01044127, "balance_loss_clip": 1.01787722, "balance_loss_mlp": 1.01936936, "epoch": 0.5887268901247558, "flos": 21613366247040.0, "grad_norm": 1.5734312241864312, "language_loss": 0.77241886, "learning_rate": 1.5268536693060026e-06, "loss": 0.79343909, "num_input_tokens_seen": 211089630, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38671875, "step": 9792, "time_per_iteration": 2.3913958072662354 }, { "auxiliary_loss_clip": 0.01058387, "auxiliary_loss_mlp": 0.01044752, "balance_loss_clip": 1.01885986, "balance_loss_mlp": 1.01896536, "epoch": 0.5887870133774237, "flos": 20480783875200.0, "grad_norm": 2.4198945028240546, "language_loss": 0.70017147, "learning_rate": 1.5264752719828662e-06, "loss": 0.72120297, "num_input_tokens_seen": 211106120, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.39453125, "step": 9793, "time_per_iteration": 2.4286458492279053 }, { "auxiliary_loss_clip": 0.010565, "auxiliary_loss_mlp": 0.0103889, "balance_loss_clip": 1.01439214, "balance_loss_mlp": 1.01916432, "epoch": 0.5888471366300917, "flos": 19205336753280.0, "grad_norm": 1.8287091972430205, "language_loss": 0.61531603, "learning_rate": 1.5260968926158353e-06, "loss": 0.63626993, "num_input_tokens_seen": 211122450, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37304688, "step": 9794, "time_per_iteration": 2.351283311843872 }, { "auxiliary_loss_clip": 0.01060029, "auxiliary_loss_mlp": 0.01043572, "balance_loss_clip": 1.01728678, "balance_loss_mlp": 1.02080977, "epoch": 0.5889072598827596, "flos": 19971924675840.0, "grad_norm": 1.6953057625030912, "language_loss": 0.66341114, "learning_rate": 1.525718531219257e-06, "loss": 0.68444717, "num_input_tokens_seen": 211141765, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.39257812, "step": 9795, "time_per_iteration": 2.408621311187744 }, { "auxiliary_loss_clip": 0.0105764, "auxiliary_loss_mlp": 0.01039172, "balance_loss_clip": 1.01523447, "balance_loss_mlp": 1.02005124, "epoch": 0.5889673831354276, "flos": 20740188343680.0, "grad_norm": 1.5282127373746623, "language_loss": 0.74914491, "learning_rate": 1.5253401878074801e-06, "loss": 0.77011299, "num_input_tokens_seen": 211160475, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.375, "step": 9796, "time_per_iteration": 2.399714708328247 }, { "auxiliary_loss_clip": 0.01058704, "auxiliary_loss_mlp": 0.01041834, "balance_loss_clip": 1.01826644, "balance_loss_mlp": 1.01985061, "epoch": 0.5890275063880956, "flos": 25299775416960.0, "grad_norm": 1.557435602958131, "language_loss": 0.83843076, "learning_rate": 1.5249618623948507e-06, "loss": 0.85943615, "num_input_tokens_seen": 211180480, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.38867188, "step": 9797, "time_per_iteration": 2.426107883453369 }, { "auxiliary_loss_clip": 0.01056258, "auxiliary_loss_mlp": 0.01040449, "balance_loss_clip": 1.01546288, "balance_loss_mlp": 1.01915383, "epoch": 0.5890876296407636, "flos": 11764577174400.0, "grad_norm": 2.095080868489773, "language_loss": 0.79941326, "learning_rate": 1.5245835549957152e-06, "loss": 0.82038033, "num_input_tokens_seen": 211198000, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37109375, "step": 9798, "time_per_iteration": 3.7924790382385254 }, { "auxiliary_loss_clip": 0.01057082, "auxiliary_loss_mlp": 0.01037167, "balance_loss_clip": 1.01387334, "balance_loss_mlp": 1.01947284, "epoch": 0.5891477528934316, "flos": 13588614489600.0, "grad_norm": 5.07606119058867, "language_loss": 0.76382101, "learning_rate": 1.5242052656244186e-06, "loss": 0.78476351, "num_input_tokens_seen": 211214765, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37695312, "step": 9799, "time_per_iteration": 2.4188294410705566 }, { "auxiliary_loss_clip": 0.01060951, "auxiliary_loss_mlp": 0.01043005, "balance_loss_clip": 1.01572967, "balance_loss_mlp": 1.02046514, "epoch": 0.5892078761460995, "flos": 15048298189440.0, "grad_norm": 2.0324980849070373, "language_loss": 0.77970183, "learning_rate": 1.5238269942953064e-06, "loss": 0.80074131, "num_input_tokens_seen": 211232335, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40625, "step": 9800, "time_per_iteration": 2.3576500415802 }, { "auxiliary_loss_clip": 0.01060581, "auxiliary_loss_mlp": 0.01047019, "balance_loss_clip": 1.02190161, "balance_loss_mlp": 1.02092004, "epoch": 0.5892679993987675, "flos": 15777319622400.0, "grad_norm": 1.8803605095584501, "language_loss": 0.8060469, "learning_rate": 1.523448741022722e-06, "loss": 0.82712293, "num_input_tokens_seen": 211249985, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.39648438, "step": 9801, "time_per_iteration": 2.3911850452423096 }, { "auxiliary_loss_clip": 0.01061109, "auxiliary_loss_mlp": 0.01035249, "balance_loss_clip": 1.00904655, "balance_loss_mlp": 1.02143645, "epoch": 0.5893281226514354, "flos": 25264024318080.0, "grad_norm": 1.6901665474660512, "language_loss": 0.67164814, "learning_rate": 1.5230705058210088e-06, "loss": 0.69261169, "num_input_tokens_seen": 211268425, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3984375, "step": 9802, "time_per_iteration": 2.419430732727051 }, { "auxiliary_loss_clip": 0.01058657, "auxiliary_loss_mlp": 0.01039433, "balance_loss_clip": 1.01482868, "balance_loss_mlp": 1.01986456, "epoch": 0.5893882459041034, "flos": 19457374924800.0, "grad_norm": 1.5644278859641219, "language_loss": 0.7867099, "learning_rate": 1.5226922887045108e-06, "loss": 0.80769086, "num_input_tokens_seen": 211286680, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.38671875, "step": 9803, "time_per_iteration": 2.402287244796753 }, { "auxiliary_loss_clip": 0.01060116, "auxiliary_loss_mlp": 0.0104221, "balance_loss_clip": 1.0169611, "balance_loss_mlp": 1.01996279, "epoch": 0.5894483691567713, "flos": 20632935047040.0, "grad_norm": 1.7366428014516204, "language_loss": 0.73642284, "learning_rate": 1.5223140896875686e-06, "loss": 0.75744605, "num_input_tokens_seen": 211307700, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.40039062, "step": 9804, "time_per_iteration": 2.4120917320251465 }, { "auxiliary_loss_clip": 0.01058858, "auxiliary_loss_mlp": 0.01041032, "balance_loss_clip": 1.01565182, "balance_loss_mlp": 1.02099419, "epoch": 0.5895084924094394, "flos": 17777459168640.0, "grad_norm": 1.5008006217082384, "language_loss": 0.76376498, "learning_rate": 1.5219359087845234e-06, "loss": 0.78476387, "num_input_tokens_seen": 211324835, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37890625, "step": 9805, "time_per_iteration": 2.3930094242095947 }, { "auxiliary_loss_clip": 0.0106411, "auxiliary_loss_mlp": 0.0104457, "balance_loss_clip": 1.0173192, "balance_loss_mlp": 1.0221014, "epoch": 0.5895686156621073, "flos": 20120026129920.0, "grad_norm": 1.874442911382488, "language_loss": 0.79020405, "learning_rate": 1.5215577460097174e-06, "loss": 0.81129086, "num_input_tokens_seen": 211344130, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.41992188, "step": 9806, "time_per_iteration": 2.4049558639526367 }, { "auxiliary_loss_clip": 0.01058028, "auxiliary_loss_mlp": 0.01042248, "balance_loss_clip": 1.01488936, "balance_loss_mlp": 1.01937485, "epoch": 0.5896287389147753, "flos": 20849012651520.0, "grad_norm": 1.8709451041246972, "language_loss": 0.78523362, "learning_rate": 1.5211796013774887e-06, "loss": 0.80623639, "num_input_tokens_seen": 211362915, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.38671875, "step": 9807, "time_per_iteration": 2.4193782806396484 }, { "auxiliary_loss_clip": 0.01062218, "auxiliary_loss_mlp": 0.01040733, "balance_loss_clip": 1.01332688, "balance_loss_mlp": 1.02217817, "epoch": 0.5896888621674432, "flos": 14537030106240.0, "grad_norm": 1.8416547567645436, "language_loss": 0.76081586, "learning_rate": 1.5208014749021786e-06, "loss": 0.78184533, "num_input_tokens_seen": 211380700, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.40039062, "step": 9808, "time_per_iteration": 2.3565709590911865 }, { "auxiliary_loss_clip": 0.01062188, "auxiliary_loss_mlp": 0.01043033, "balance_loss_clip": 1.01500654, "balance_loss_mlp": 1.02157044, "epoch": 0.5897489854201112, "flos": 20885706357120.0, "grad_norm": 1.9715277990572166, "language_loss": 0.73674321, "learning_rate": 1.5204233665981236e-06, "loss": 0.75779545, "num_input_tokens_seen": 211400095, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40625, "step": 9809, "time_per_iteration": 2.4090802669525146 }, { "auxiliary_loss_clip": 0.01061602, "auxiliary_loss_mlp": 0.01045877, "balance_loss_clip": 1.01836324, "balance_loss_mlp": 1.02087998, "epoch": 0.5898091086727792, "flos": 20010119569920.0, "grad_norm": 2.0932944629257344, "language_loss": 0.83882666, "learning_rate": 1.5200452764796627e-06, "loss": 0.85990149, "num_input_tokens_seen": 211417810, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.40625, "step": 9810, "time_per_iteration": 2.3816840648651123 }, { "auxiliary_loss_clip": 0.01057954, "auxiliary_loss_mlp": 0.01043143, "balance_loss_clip": 1.01771617, "balance_loss_mlp": 1.01990831, "epoch": 0.5898692319254472, "flos": 16252312936320.0, "grad_norm": 1.5535813052226084, "language_loss": 0.82372475, "learning_rate": 1.5196672045611336e-06, "loss": 0.84473568, "num_input_tokens_seen": 211436020, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38085938, "step": 9811, "time_per_iteration": 2.3717074394226074 }, { "auxiliary_loss_clip": 0.01061246, "auxiliary_loss_mlp": 0.01049205, "balance_loss_clip": 1.02176261, "balance_loss_mlp": 1.0219698, "epoch": 0.5899293551781152, "flos": 20447511482880.0, "grad_norm": 1.676837051849893, "language_loss": 0.77675062, "learning_rate": 1.5192891508568715e-06, "loss": 0.79785514, "num_input_tokens_seen": 211454335, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.39257812, "step": 9812, "time_per_iteration": 2.4155492782592773 }, { "auxiliary_loss_clip": 0.01059694, "auxiliary_loss_mlp": 0.0104258, "balance_loss_clip": 1.01994228, "balance_loss_mlp": 1.02118349, "epoch": 0.5899894784307831, "flos": 13880837502720.0, "grad_norm": 1.6774681844377857, "language_loss": 0.71719193, "learning_rate": 1.5189111153812133e-06, "loss": 0.73821473, "num_input_tokens_seen": 211472775, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.38476562, "step": 9813, "time_per_iteration": 2.3653905391693115 }, { "auxiliary_loss_clip": 0.01056898, "auxiliary_loss_mlp": 0.01044057, "balance_loss_clip": 1.01827228, "balance_loss_mlp": 1.01851416, "epoch": 0.5900496016834511, "flos": 20082773842560.0, "grad_norm": 1.613512684382054, "language_loss": 0.73272288, "learning_rate": 1.518533098148494e-06, "loss": 0.75373244, "num_input_tokens_seen": 211492195, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3828125, "step": 9814, "time_per_iteration": 2.414436101913452 }, { "auxiliary_loss_clip": 0.01058862, "auxiliary_loss_mlp": 0.01041828, "balance_loss_clip": 1.0164727, "balance_loss_mlp": 1.02067578, "epoch": 0.590109724936119, "flos": 20258317491840.0, "grad_norm": 1.91464007456179, "language_loss": 0.79493254, "learning_rate": 1.5181550991730476e-06, "loss": 0.81593937, "num_input_tokens_seen": 211510220, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38085938, "step": 9815, "time_per_iteration": 2.4081943035125732 }, { "auxiliary_loss_clip": 0.01060801, "auxiliary_loss_mlp": 0.01047553, "balance_loss_clip": 1.0195508, "balance_loss_mlp": 1.01989233, "epoch": 0.590169848188787, "flos": 24234156766080.0, "grad_norm": 2.453378709548626, "language_loss": 0.77481234, "learning_rate": 1.5177771184692083e-06, "loss": 0.79589581, "num_input_tokens_seen": 211526260, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41015625, "step": 9816, "time_per_iteration": 2.4371511936187744 }, { "auxiliary_loss_clip": 0.01057114, "auxiliary_loss_mlp": 0.01039676, "balance_loss_clip": 1.01510656, "balance_loss_mlp": 1.01916194, "epoch": 0.590229971441455, "flos": 17783778124800.0, "grad_norm": 1.7607873733410155, "language_loss": 0.81979162, "learning_rate": 1.517399156051309e-06, "loss": 0.84075952, "num_input_tokens_seen": 211542890, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37890625, "step": 9817, "time_per_iteration": 2.3585567474365234 }, { "auxiliary_loss_clip": 0.01058057, "auxiliary_loss_mlp": 0.01049189, "balance_loss_clip": 1.02321291, "balance_loss_mlp": 1.01883864, "epoch": 0.590290094694123, "flos": 22235797699200.0, "grad_norm": 1.9731097852977253, "language_loss": 0.77723837, "learning_rate": 1.517021211933682e-06, "loss": 0.79831088, "num_input_tokens_seen": 211562685, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.390625, "step": 9818, "time_per_iteration": 2.3959803581237793 }, { "auxiliary_loss_clip": 0.01056844, "auxiliary_loss_mlp": 0.01043764, "balance_loss_clip": 1.01870644, "balance_loss_mlp": 1.01869321, "epoch": 0.5903502179467909, "flos": 19097629608960.0, "grad_norm": 1.823609893029903, "language_loss": 0.68174821, "learning_rate": 1.5166432861306592e-06, "loss": 0.70275432, "num_input_tokens_seen": 211579960, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38085938, "step": 9819, "time_per_iteration": 2.3611340522766113 }, { "auxiliary_loss_clip": 0.01057376, "auxiliary_loss_mlp": 0.0104147, "balance_loss_clip": 1.0159831, "balance_loss_mlp": 1.01903284, "epoch": 0.5904103411994589, "flos": 24234575702400.0, "grad_norm": 1.825395038309527, "language_loss": 0.78959, "learning_rate": 1.5162653786565714e-06, "loss": 0.81057847, "num_input_tokens_seen": 211599310, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3828125, "step": 9820, "time_per_iteration": 2.424339532852173 }, { "auxiliary_loss_clip": 0.0100955, "auxiliary_loss_mlp": 0.01017824, "balance_loss_clip": 1.01507068, "balance_loss_mlp": 1.00209427, "epoch": 0.5904704644521268, "flos": 64873650424320.0, "grad_norm": 0.9245256291211371, "language_loss": 0.65184909, "learning_rate": 1.5158874895257487e-06, "loss": 0.67212278, "num_input_tokens_seen": 211658790, "router_z_loss_clip": 0.02758789, "router_z_loss_mlp": 0.07421875, "step": 9821, "time_per_iteration": 2.996143102645874 }, { "auxiliary_loss_clip": 0.01056067, "auxiliary_loss_mlp": 0.01043887, "balance_loss_clip": 1.01919866, "balance_loss_mlp": 1.0181222, "epoch": 0.5905305877047948, "flos": 19608967514880.0, "grad_norm": 2.043262325818316, "language_loss": 0.62228703, "learning_rate": 1.515509618752521e-06, "loss": 0.64328659, "num_input_tokens_seen": 211677240, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.37890625, "step": 9822, "time_per_iteration": 2.3726558685302734 }, { "auxiliary_loss_clip": 0.01058401, "auxiliary_loss_mlp": 0.01045573, "balance_loss_clip": 1.01972818, "balance_loss_mlp": 1.01880682, "epoch": 0.5905907109574628, "flos": 18988630744320.0, "grad_norm": 1.950797215480475, "language_loss": 0.84151828, "learning_rate": 1.5151317663512173e-06, "loss": 0.86255801, "num_input_tokens_seen": 211695485, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.39453125, "step": 9823, "time_per_iteration": 2.380657196044922 }, { "auxiliary_loss_clip": 0.0105583, "auxiliary_loss_mlp": 0.01039171, "balance_loss_clip": 1.0135293, "balance_loss_mlp": 1.0183301, "epoch": 0.5906508342101308, "flos": 22199313461760.0, "grad_norm": 2.160961605694398, "language_loss": 0.74832028, "learning_rate": 1.514753932336165e-06, "loss": 0.7692703, "num_input_tokens_seen": 211713090, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.375, "step": 9824, "time_per_iteration": 2.388810634613037 }, { "auxiliary_loss_clip": 0.01063196, "auxiliary_loss_mlp": 0.01043948, "balance_loss_clip": 1.01433635, "balance_loss_mlp": 1.02013254, "epoch": 0.5907109574627988, "flos": 20885636534400.0, "grad_norm": 2.0251671970825527, "language_loss": 0.84598768, "learning_rate": 1.514376116721693e-06, "loss": 0.86705911, "num_input_tokens_seen": 211732510, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4296875, "step": 9825, "time_per_iteration": 3.591508150100708 }, { "auxiliary_loss_clip": 0.01054315, "auxiliary_loss_mlp": 0.01038642, "balance_loss_clip": 1.01594436, "balance_loss_mlp": 1.01808512, "epoch": 0.5907710807154667, "flos": 21505589280000.0, "grad_norm": 1.8172376098103686, "language_loss": 0.76905835, "learning_rate": 1.5139983195221272e-06, "loss": 0.78998792, "num_input_tokens_seen": 211748695, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36328125, "step": 9826, "time_per_iteration": 2.374995470046997 }, { "auxiliary_loss_clip": 0.01054362, "auxiliary_loss_mlp": 0.01036558, "balance_loss_clip": 1.01094031, "balance_loss_mlp": 1.01706445, "epoch": 0.5908312039681347, "flos": 22017276299520.0, "grad_norm": 1.6017280068908066, "language_loss": 0.73321772, "learning_rate": 1.513620540751793e-06, "loss": 0.75412691, "num_input_tokens_seen": 211768545, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37304688, "step": 9827, "time_per_iteration": 2.397533655166626 }, { "auxiliary_loss_clip": 0.0105623, "auxiliary_loss_mlp": 0.01038475, "balance_loss_clip": 1.01385856, "balance_loss_mlp": 1.01774395, "epoch": 0.5908913272208026, "flos": 18478514736000.0, "grad_norm": 1.7070527771604371, "language_loss": 0.80268192, "learning_rate": 1.5132427804250178e-06, "loss": 0.82362902, "num_input_tokens_seen": 211786665, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.38476562, "step": 9828, "time_per_iteration": 2.3873817920684814 }, { "auxiliary_loss_clip": 0.0105924, "auxiliary_loss_mlp": 0.01044094, "balance_loss_clip": 1.01680732, "balance_loss_mlp": 1.0194962, "epoch": 0.5909514504734706, "flos": 12311386888320.0, "grad_norm": 3.309359692456124, "language_loss": 0.89635217, "learning_rate": 1.5128650385561241e-06, "loss": 0.91738546, "num_input_tokens_seen": 211801215, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.3984375, "step": 9829, "time_per_iteration": 5.3355231285095215 }, { "auxiliary_loss_clip": 0.0101241, "auxiliary_loss_mlp": 0.01004755, "balance_loss_clip": 1.00231135, "balance_loss_mlp": 1.00474179, "epoch": 0.5910115737261386, "flos": 70209879891840.0, "grad_norm": 0.7588367477791231, "language_loss": 0.57929075, "learning_rate": 1.5124873151594376e-06, "loss": 0.59946239, "num_input_tokens_seen": 211857005, "router_z_loss_clip": 0.02441406, "router_z_loss_mlp": 0.07666016, "step": 9830, "time_per_iteration": 2.9433066844940186 }, { "auxiliary_loss_clip": 0.01061439, "auxiliary_loss_mlp": 0.01046706, "balance_loss_clip": 1.01707029, "balance_loss_mlp": 1.01885688, "epoch": 0.5910716969788066, "flos": 22016682806400.0, "grad_norm": 4.580588646340233, "language_loss": 0.78057647, "learning_rate": 1.5121096102492812e-06, "loss": 0.80165792, "num_input_tokens_seen": 211876675, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.42578125, "step": 9831, "time_per_iteration": 2.38224720954895 }, { "auxiliary_loss_clip": 0.01055348, "auxiliary_loss_mlp": 0.01038375, "balance_loss_clip": 1.01378214, "balance_loss_mlp": 1.01824069, "epoch": 0.5911318202314745, "flos": 21250583642880.0, "grad_norm": 1.9566153667282091, "language_loss": 0.7826122, "learning_rate": 1.5117319238399767e-06, "loss": 0.80354947, "num_input_tokens_seen": 211895725, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37109375, "step": 9832, "time_per_iteration": 2.4239137172698975 }, { "auxiliary_loss_clip": 0.01056781, "auxiliary_loss_mlp": 0.01034402, "balance_loss_clip": 1.01039279, "balance_loss_mlp": 1.01832116, "epoch": 0.5911919434841425, "flos": 17820646387200.0, "grad_norm": 1.937199388278729, "language_loss": 0.84898812, "learning_rate": 1.511354255945847e-06, "loss": 0.86989999, "num_input_tokens_seen": 211913860, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.38476562, "step": 9833, "time_per_iteration": 2.404972791671753 }, { "auxiliary_loss_clip": 0.01056287, "auxiliary_loss_mlp": 0.01038423, "balance_loss_clip": 1.01181543, "balance_loss_mlp": 1.01803768, "epoch": 0.5912520667368104, "flos": 20373774958080.0, "grad_norm": 1.8074089091538157, "language_loss": 0.75033998, "learning_rate": 1.5109766065812123e-06, "loss": 0.77128708, "num_input_tokens_seen": 211932880, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.3828125, "step": 9834, "time_per_iteration": 2.3910415172576904 }, { "auxiliary_loss_clip": 0.01055881, "auxiliary_loss_mlp": 0.01039455, "balance_loss_clip": 1.01408744, "balance_loss_mlp": 1.0174408, "epoch": 0.5913121899894784, "flos": 17929610340480.0, "grad_norm": 2.2768533605267436, "language_loss": 0.79449487, "learning_rate": 1.5105989757603942e-06, "loss": 0.81544822, "num_input_tokens_seen": 211948625, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38476562, "step": 9835, "time_per_iteration": 2.343096971511841 }, { "auxiliary_loss_clip": 0.01057262, "auxiliary_loss_mlp": 0.01038805, "balance_loss_clip": 1.01332974, "balance_loss_mlp": 1.01889133, "epoch": 0.5913723132421465, "flos": 22125856227840.0, "grad_norm": 2.053847170654326, "language_loss": 0.74529678, "learning_rate": 1.5102213634977117e-06, "loss": 0.76625741, "num_input_tokens_seen": 211965355, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3828125, "step": 9836, "time_per_iteration": 2.4411516189575195 }, { "auxiliary_loss_clip": 0.01056961, "auxiliary_loss_mlp": 0.01037927, "balance_loss_clip": 1.01173639, "balance_loss_mlp": 1.01809418, "epoch": 0.5914324364948144, "flos": 15697229230080.0, "grad_norm": 1.9953831664624084, "language_loss": 0.84092164, "learning_rate": 1.5098437698074841e-06, "loss": 0.86187053, "num_input_tokens_seen": 211982245, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38867188, "step": 9837, "time_per_iteration": 3.890681266784668 }, { "auxiliary_loss_clip": 0.01057029, "auxiliary_loss_mlp": 0.01041279, "balance_loss_clip": 1.01523137, "balance_loss_mlp": 1.01858115, "epoch": 0.5914925597474824, "flos": 22746227909760.0, "grad_norm": 1.6790512099861714, "language_loss": 0.80551982, "learning_rate": 1.5094661947040304e-06, "loss": 0.82650292, "num_input_tokens_seen": 212000250, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.38476562, "step": 9838, "time_per_iteration": 2.4116079807281494 }, { "auxiliary_loss_clip": 0.01057049, "auxiliary_loss_mlp": 0.01040898, "balance_loss_clip": 1.01399231, "balance_loss_mlp": 1.01827657, "epoch": 0.5915526830001503, "flos": 18291904185600.0, "grad_norm": 1.8720173676136334, "language_loss": 0.7120378, "learning_rate": 1.5090886382016673e-06, "loss": 0.73301721, "num_input_tokens_seen": 212017505, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.38671875, "step": 9839, "time_per_iteration": 2.354358673095703 }, { "auxiliary_loss_clip": 0.01058309, "auxiliary_loss_mlp": 0.01042925, "balance_loss_clip": 1.01655614, "balance_loss_mlp": 1.01803267, "epoch": 0.5916128062528183, "flos": 17018132808960.0, "grad_norm": 2.727759830983989, "language_loss": 0.66573399, "learning_rate": 1.5087111003147124e-06, "loss": 0.6867463, "num_input_tokens_seen": 212034595, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40234375, "step": 9840, "time_per_iteration": 2.375784158706665 }, { "auxiliary_loss_clip": 0.01057151, "auxiliary_loss_mlp": 0.01036089, "balance_loss_clip": 1.01082826, "balance_loss_mlp": 1.01785111, "epoch": 0.5916729295054862, "flos": 24753070437120.0, "grad_norm": 1.8564403581734852, "language_loss": 0.82463914, "learning_rate": 1.5083335810574813e-06, "loss": 0.84557152, "num_input_tokens_seen": 212055775, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.39257812, "step": 9841, "time_per_iteration": 2.4149649143218994 }, { "auxiliary_loss_clip": 0.0105366, "auxiliary_loss_mlp": 0.01039009, "balance_loss_clip": 1.01498878, "balance_loss_mlp": 1.01666641, "epoch": 0.5917330527581542, "flos": 15957366837120.0, "grad_norm": 2.4381471451219485, "language_loss": 0.70548391, "learning_rate": 1.507956080444291e-06, "loss": 0.72641063, "num_input_tokens_seen": 212074000, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37109375, "step": 9842, "time_per_iteration": 2.3622469902038574 }, { "auxiliary_loss_clip": 0.01058411, "auxiliary_loss_mlp": 0.010432, "balance_loss_clip": 1.01700926, "balance_loss_mlp": 1.01889896, "epoch": 0.5917931760108222, "flos": 23799732318720.0, "grad_norm": 1.9267965696881344, "language_loss": 0.83556116, "learning_rate": 1.5075785984894549e-06, "loss": 0.85657728, "num_input_tokens_seen": 212091415, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.39453125, "step": 9843, "time_per_iteration": 2.403280258178711 }, { "auxiliary_loss_clip": 0.01056112, "auxiliary_loss_mlp": 0.01040922, "balance_loss_clip": 1.01511312, "balance_loss_mlp": 1.01756716, "epoch": 0.5918532992634902, "flos": 23248733241600.0, "grad_norm": 2.482368880578817, "language_loss": 0.83418965, "learning_rate": 1.5072011352072875e-06, "loss": 0.85516, "num_input_tokens_seen": 212105255, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38671875, "step": 9844, "time_per_iteration": 2.3888766765594482 }, { "auxiliary_loss_clip": 0.01058288, "auxiliary_loss_mlp": 0.01037434, "balance_loss_clip": 1.01318645, "balance_loss_mlp": 1.01833081, "epoch": 0.5919134225161581, "flos": 19498851486720.0, "grad_norm": 2.0166191137273946, "language_loss": 0.75658673, "learning_rate": 1.5068236906121032e-06, "loss": 0.77754396, "num_input_tokens_seen": 212122765, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.40039062, "step": 9845, "time_per_iteration": 2.370279312133789 }, { "auxiliary_loss_clip": 0.01057539, "auxiliary_loss_mlp": 0.01043906, "balance_loss_clip": 1.0160588, "balance_loss_mlp": 1.01825953, "epoch": 0.5919735457688261, "flos": 38799397117440.0, "grad_norm": 1.9341905431498867, "language_loss": 0.6548453, "learning_rate": 1.506446264718213e-06, "loss": 0.67585969, "num_input_tokens_seen": 212143960, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.39257812, "step": 9846, "time_per_iteration": 2.552384376525879 }, { "auxiliary_loss_clip": 0.01053124, "auxiliary_loss_mlp": 0.0103321, "balance_loss_clip": 1.01106048, "balance_loss_mlp": 1.01749229, "epoch": 0.592033669021494, "flos": 22162899047040.0, "grad_norm": 1.8267633889259995, "language_loss": 0.77746105, "learning_rate": 1.506068857539931e-06, "loss": 0.79832441, "num_input_tokens_seen": 212162005, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35546875, "step": 9847, "time_per_iteration": 2.382852077484131 }, { "auxiliary_loss_clip": 0.0105669, "auxiliary_loss_mlp": 0.01041442, "balance_loss_clip": 1.01572812, "balance_loss_mlp": 1.01765275, "epoch": 0.592093792274162, "flos": 22709883317760.0, "grad_norm": 2.548019539662544, "language_loss": 0.64375609, "learning_rate": 1.5056914690915667e-06, "loss": 0.66473746, "num_input_tokens_seen": 212181635, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.390625, "step": 9848, "time_per_iteration": 2.4100582599639893 }, { "auxiliary_loss_clip": 0.01057545, "auxiliary_loss_mlp": 0.01040261, "balance_loss_clip": 1.01495337, "balance_loss_mlp": 1.01806593, "epoch": 0.59215391552683, "flos": 22527846155520.0, "grad_norm": 1.802175035767617, "language_loss": 0.77724826, "learning_rate": 1.5053140993874312e-06, "loss": 0.79822636, "num_input_tokens_seen": 212201615, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.39453125, "step": 9849, "time_per_iteration": 2.395768880844116 }, { "auxiliary_loss_clip": 0.0105708, "auxiliary_loss_mlp": 0.01045654, "balance_loss_clip": 1.01740086, "balance_loss_mlp": 1.01740742, "epoch": 0.592214038779498, "flos": 24497855331840.0, "grad_norm": 1.8359711267540675, "language_loss": 0.7651121, "learning_rate": 1.5049367484418353e-06, "loss": 0.78613937, "num_input_tokens_seen": 212219355, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.39648438, "step": 9850, "time_per_iteration": 2.4219248294830322 }, { "auxiliary_loss_clip": 0.01055289, "auxiliary_loss_mlp": 0.01045972, "balance_loss_clip": 1.02072382, "balance_loss_mlp": 1.01694489, "epoch": 0.592274162032166, "flos": 21830386458240.0, "grad_norm": 3.196406052552999, "language_loss": 0.76592416, "learning_rate": 1.5045594162690868e-06, "loss": 0.78693676, "num_input_tokens_seen": 212236710, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3828125, "step": 9851, "time_per_iteration": 2.3682548999786377 }, { "auxiliary_loss_clip": 0.01056986, "auxiliary_loss_mlp": 0.01038899, "balance_loss_clip": 1.01483095, "balance_loss_mlp": 1.01754022, "epoch": 0.5923342852848339, "flos": 24606993841920.0, "grad_norm": 2.2216923732476874, "language_loss": 0.71832216, "learning_rate": 1.5041821028834954e-06, "loss": 0.739281, "num_input_tokens_seen": 212256195, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.39453125, "step": 9852, "time_per_iteration": 2.4349143505096436 }, { "auxiliary_loss_clip": 0.01060008, "auxiliary_loss_mlp": 0.01050138, "balance_loss_clip": 1.02174234, "balance_loss_mlp": 1.01838684, "epoch": 0.5923944085375019, "flos": 19937116183680.0, "grad_norm": 1.6194195799452875, "language_loss": 0.81216586, "learning_rate": 1.5038048082993685e-06, "loss": 0.83326733, "num_input_tokens_seen": 212274085, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.41601562, "step": 9853, "time_per_iteration": 2.4016530513763428 }, { "auxiliary_loss_clip": 0.01054389, "auxiliary_loss_mlp": 0.01037338, "balance_loss_clip": 1.01340091, "balance_loss_mlp": 1.01733184, "epoch": 0.5924545317901698, "flos": 28657232956800.0, "grad_norm": 1.6575426671886002, "language_loss": 0.6867671, "learning_rate": 1.5034275325310124e-06, "loss": 0.7076844, "num_input_tokens_seen": 212295530, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37109375, "step": 9854, "time_per_iteration": 2.4455151557922363 }, { "auxiliary_loss_clip": 0.01054825, "auxiliary_loss_mlp": 0.01036416, "balance_loss_clip": 1.01297963, "balance_loss_mlp": 1.01723671, "epoch": 0.5925146550428378, "flos": 19863868417920.0, "grad_norm": 1.8661664873016393, "language_loss": 0.89946586, "learning_rate": 1.5030502755927344e-06, "loss": 0.92037833, "num_input_tokens_seen": 212313770, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.375, "step": 9855, "time_per_iteration": 2.3918418884277344 }, { "auxiliary_loss_clip": 0.01055549, "auxiliary_loss_mlp": 0.01045431, "balance_loss_clip": 1.02100527, "balance_loss_mlp": 1.01771665, "epoch": 0.5925747782955058, "flos": 15122069625600.0, "grad_norm": 1.6583025908691962, "language_loss": 0.87809771, "learning_rate": 1.5026730374988397e-06, "loss": 0.89910746, "num_input_tokens_seen": 212331525, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37890625, "step": 9856, "time_per_iteration": 2.3781585693359375 }, { "auxiliary_loss_clip": 0.01056994, "auxiliary_loss_mlp": 0.01041954, "balance_loss_clip": 1.01727772, "balance_loss_mlp": 1.01789248, "epoch": 0.5926349015481738, "flos": 18404464008960.0, "grad_norm": 1.8783595993335969, "language_loss": 0.78435898, "learning_rate": 1.5022958182636332e-06, "loss": 0.80534846, "num_input_tokens_seen": 212347295, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.390625, "step": 9857, "time_per_iteration": 2.361701488494873 }, { "auxiliary_loss_clip": 0.01057232, "auxiliary_loss_mlp": 0.01045683, "balance_loss_clip": 1.02162671, "balance_loss_mlp": 1.0182209, "epoch": 0.5926950248008417, "flos": 23110057854720.0, "grad_norm": 2.0581972726452884, "language_loss": 0.66536951, "learning_rate": 1.501918617901419e-06, "loss": 0.68639863, "num_input_tokens_seen": 212365750, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.390625, "step": 9858, "time_per_iteration": 2.377786636352539 }, { "auxiliary_loss_clip": 0.01055086, "auxiliary_loss_mlp": 0.01039319, "balance_loss_clip": 1.01615644, "balance_loss_mlp": 1.01647568, "epoch": 0.5927551480535097, "flos": 28032776645760.0, "grad_norm": 1.7312770721302726, "language_loss": 0.78262144, "learning_rate": 1.501541436426501e-06, "loss": 0.8035655, "num_input_tokens_seen": 212385300, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.38671875, "step": 9859, "time_per_iteration": 2.4458224773406982 }, { "auxiliary_loss_clip": 0.01057866, "auxiliary_loss_mlp": 0.01039719, "balance_loss_clip": 1.01256275, "balance_loss_mlp": 1.01764834, "epoch": 0.5928152713061776, "flos": 21797602824960.0, "grad_norm": 2.40942840287197, "language_loss": 0.77119768, "learning_rate": 1.5011642738531818e-06, "loss": 0.7921735, "num_input_tokens_seen": 212402140, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40234375, "step": 9860, "time_per_iteration": 2.375856399536133 }, { "auxiliary_loss_clip": 0.01055434, "auxiliary_loss_mlp": 0.01041873, "balance_loss_clip": 1.01735139, "balance_loss_mlp": 1.01670718, "epoch": 0.5928753945588456, "flos": 24315678524160.0, "grad_norm": 1.6332966781191838, "language_loss": 0.77835977, "learning_rate": 1.500787130195763e-06, "loss": 0.7993328, "num_input_tokens_seen": 212421790, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.38671875, "step": 9861, "time_per_iteration": 2.416691303253174 }, { "auxiliary_loss_clip": 0.01054315, "auxiliary_loss_mlp": 0.01036027, "balance_loss_clip": 1.01191044, "balance_loss_mlp": 1.01673079, "epoch": 0.5929355178115137, "flos": 26463535499520.0, "grad_norm": 1.6428174741266346, "language_loss": 0.71606749, "learning_rate": 1.5004100054685465e-06, "loss": 0.7369709, "num_input_tokens_seen": 212442115, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.375, "step": 9862, "time_per_iteration": 2.4039266109466553 }, { "auxiliary_loss_clip": 0.0105719, "auxiliary_loss_mlp": 0.01037525, "balance_loss_clip": 1.0125035, "balance_loss_mlp": 1.01870704, "epoch": 0.5929956410641816, "flos": 24965028501120.0, "grad_norm": 1.8238357687828608, "language_loss": 0.7888006, "learning_rate": 1.500032899685832e-06, "loss": 0.80974776, "num_input_tokens_seen": 212459535, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38476562, "step": 9863, "time_per_iteration": 3.783052921295166 }, { "auxiliary_loss_clip": 0.01055435, "auxiliary_loss_mlp": 0.01038534, "balance_loss_clip": 1.01317787, "balance_loss_mlp": 1.01732397, "epoch": 0.5930557643168496, "flos": 26207273053440.0, "grad_norm": 1.6561127682033918, "language_loss": 0.71510959, "learning_rate": 1.499655812861921e-06, "loss": 0.73604923, "num_input_tokens_seen": 212479385, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38085938, "step": 9864, "time_per_iteration": 2.4254157543182373 }, { "auxiliary_loss_clip": 0.0105672, "auxiliary_loss_mlp": 0.0103568, "balance_loss_clip": 1.01108682, "balance_loss_mlp": 1.01750231, "epoch": 0.5931158875695175, "flos": 27853706949120.0, "grad_norm": 1.5979681763400244, "language_loss": 0.68778747, "learning_rate": 1.4992787450111112e-06, "loss": 0.7087115, "num_input_tokens_seen": 212500060, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.39257812, "step": 9865, "time_per_iteration": 2.4131698608398438 }, { "auxiliary_loss_clip": 0.01058285, "auxiliary_loss_mlp": 0.01045682, "balance_loss_clip": 1.01753664, "balance_loss_mlp": 1.01779819, "epoch": 0.5931760108221855, "flos": 15412756538880.0, "grad_norm": 2.20776128885764, "language_loss": 0.8028183, "learning_rate": 1.4989016961477015e-06, "loss": 0.82385802, "num_input_tokens_seen": 212518590, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40429688, "step": 9866, "time_per_iteration": 2.34771466255188 }, { "auxiliary_loss_clip": 0.01055316, "auxiliary_loss_mlp": 0.01035523, "balance_loss_clip": 1.01113236, "balance_loss_mlp": 1.0172112, "epoch": 0.5932361340748534, "flos": 30187266779520.0, "grad_norm": 1.9552084251447066, "language_loss": 0.73629677, "learning_rate": 1.4985246662859903e-06, "loss": 0.75720525, "num_input_tokens_seen": 212538190, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.38085938, "step": 9867, "time_per_iteration": 2.442326545715332 }, { "auxiliary_loss_clip": 0.01055979, "auxiliary_loss_mlp": 0.01041129, "balance_loss_clip": 1.01567745, "balance_loss_mlp": 1.01819146, "epoch": 0.5932962573275214, "flos": 20156510367360.0, "grad_norm": 1.558930669342135, "language_loss": 0.68402773, "learning_rate": 1.4981476554402732e-06, "loss": 0.70499879, "num_input_tokens_seen": 212557820, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37890625, "step": 9868, "time_per_iteration": 2.3611552715301514 }, { "auxiliary_loss_clip": 0.01058047, "auxiliary_loss_mlp": 0.0103946, "balance_loss_clip": 1.01235127, "balance_loss_mlp": 1.01802313, "epoch": 0.5933563805801894, "flos": 25444769760000.0, "grad_norm": 1.5142922592889356, "language_loss": 0.76241916, "learning_rate": 1.4977706636248478e-06, "loss": 0.78339422, "num_input_tokens_seen": 212577645, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40039062, "step": 9869, "time_per_iteration": 5.0830559730529785 }, { "auxiliary_loss_clip": 0.01057822, "auxiliary_loss_mlp": 0.01042124, "balance_loss_clip": 1.01673269, "balance_loss_mlp": 1.01866794, "epoch": 0.5934165038328574, "flos": 59993701781760.0, "grad_norm": 1.6271390468488758, "language_loss": 0.74956501, "learning_rate": 1.4973936908540091e-06, "loss": 0.77056444, "num_input_tokens_seen": 212603430, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.390625, "step": 9870, "time_per_iteration": 2.7256340980529785 }, { "auxiliary_loss_clip": 0.01057905, "auxiliary_loss_mlp": 0.01037478, "balance_loss_clip": 1.01189566, "balance_loss_mlp": 1.01789045, "epoch": 0.5934766270855253, "flos": 24419545418880.0, "grad_norm": 2.4236132085305786, "language_loss": 0.73340839, "learning_rate": 1.4970167371420517e-06, "loss": 0.75436223, "num_input_tokens_seen": 212620730, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.40039062, "step": 9871, "time_per_iteration": 2.394937515258789 }, { "auxiliary_loss_clip": 0.01058893, "auxiliary_loss_mlp": 0.01038572, "balance_loss_clip": 1.01147604, "balance_loss_mlp": 1.018466, "epoch": 0.5935367503381933, "flos": 23512047782400.0, "grad_norm": 1.8965951428462107, "language_loss": 0.76538217, "learning_rate": 1.496639802503271e-06, "loss": 0.78635681, "num_input_tokens_seen": 212639745, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40625, "step": 9872, "time_per_iteration": 2.411900758743286 }, { "auxiliary_loss_clip": 0.01057412, "auxiliary_loss_mlp": 0.01041111, "balance_loss_clip": 1.01446772, "balance_loss_mlp": 1.01738369, "epoch": 0.5935968735908612, "flos": 18947468384640.0, "grad_norm": 1.9642003065547438, "language_loss": 0.80026293, "learning_rate": 1.4962628869519583e-06, "loss": 0.82124817, "num_input_tokens_seen": 212655915, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40039062, "step": 9873, "time_per_iteration": 2.3314599990844727 }, { "auxiliary_loss_clip": 0.01056732, "auxiliary_loss_mlp": 0.01041796, "balance_loss_clip": 1.01628566, "balance_loss_mlp": 1.01739824, "epoch": 0.5936569968435292, "flos": 25482266426880.0, "grad_norm": 2.1135977215408452, "language_loss": 0.85698926, "learning_rate": 1.4958859905024078e-06, "loss": 0.87797451, "num_input_tokens_seen": 212676115, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.39453125, "step": 9874, "time_per_iteration": 2.4232192039489746 }, { "auxiliary_loss_clip": 0.01009775, "auxiliary_loss_mlp": 0.01002146, "balance_loss_clip": 0.99966669, "balance_loss_mlp": 1.00243056, "epoch": 0.5937171200961973, "flos": 66375194711040.0, "grad_norm": 0.7102851665113262, "language_loss": 0.60183477, "learning_rate": 1.4955091131689115e-06, "loss": 0.62195396, "num_input_tokens_seen": 212737560, "router_z_loss_clip": 0.02478027, "router_z_loss_mlp": 0.07324219, "step": 9875, "time_per_iteration": 3.1292707920074463 }, { "auxiliary_loss_clip": 0.01058601, "auxiliary_loss_mlp": 0.01040521, "balance_loss_clip": 1.01232839, "balance_loss_mlp": 1.01740289, "epoch": 0.5937772433488652, "flos": 14902570707840.0, "grad_norm": 1.9212546015040861, "language_loss": 0.78712887, "learning_rate": 1.4951322549657594e-06, "loss": 0.80812013, "num_input_tokens_seen": 212755365, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.41210938, "step": 9876, "time_per_iteration": 2.355015516281128 }, { "auxiliary_loss_clip": 0.01051439, "auxiliary_loss_mlp": 0.01036234, "balance_loss_clip": 1.01304793, "balance_loss_mlp": 1.01543403, "epoch": 0.5938373666015332, "flos": 22560490143360.0, "grad_norm": 2.7260805860399056, "language_loss": 0.7635507, "learning_rate": 1.494755415907243e-06, "loss": 0.7844274, "num_input_tokens_seen": 212773875, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.359375, "step": 9877, "time_per_iteration": 3.811195135116577 }, { "auxiliary_loss_clip": 0.01056243, "auxiliary_loss_mlp": 0.01039461, "balance_loss_clip": 1.0138073, "balance_loss_mlp": 1.01738334, "epoch": 0.5938974898542011, "flos": 18439935816960.0, "grad_norm": 2.2181731239797817, "language_loss": 0.82335013, "learning_rate": 1.4943785960076522e-06, "loss": 0.84430718, "num_input_tokens_seen": 212790590, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38867188, "step": 9878, "time_per_iteration": 2.3364570140838623 }, { "auxiliary_loss_clip": 0.01055395, "auxiliary_loss_mlp": 0.01040638, "balance_loss_clip": 1.01453161, "balance_loss_mlp": 1.01746964, "epoch": 0.5939576131068691, "flos": 45585011433600.0, "grad_norm": 1.9624140787621422, "language_loss": 0.72286201, "learning_rate": 1.4940017952812754e-06, "loss": 0.7438224, "num_input_tokens_seen": 212812265, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37890625, "step": 9879, "time_per_iteration": 2.585948944091797 }, { "auxiliary_loss_clip": 0.01055601, "auxiliary_loss_mlp": 0.01040928, "balance_loss_clip": 1.01768196, "balance_loss_mlp": 1.01762104, "epoch": 0.594017736359537, "flos": 23586552357120.0, "grad_norm": 1.5320826756895167, "language_loss": 0.58841634, "learning_rate": 1.493625013742401e-06, "loss": 0.60938168, "num_input_tokens_seen": 212831915, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37890625, "step": 9880, "time_per_iteration": 2.3939645290374756 }, { "auxiliary_loss_clip": 0.01055188, "auxiliary_loss_mlp": 0.01042283, "balance_loss_clip": 1.01584208, "balance_loss_mlp": 1.01658571, "epoch": 0.594077859612205, "flos": 29456045930880.0, "grad_norm": 1.795227670567752, "language_loss": 0.78589994, "learning_rate": 1.4932482514053177e-06, "loss": 0.80687463, "num_input_tokens_seen": 212851350, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.38476562, "step": 9881, "time_per_iteration": 2.4415647983551025 }, { "auxiliary_loss_clip": 0.01055471, "auxiliary_loss_mlp": 0.01040933, "balance_loss_clip": 1.01361036, "balance_loss_mlp": 1.01706493, "epoch": 0.594137982864873, "flos": 16799157561600.0, "grad_norm": 2.0988005771417098, "language_loss": 0.84078759, "learning_rate": 1.4928715082843112e-06, "loss": 0.86175162, "num_input_tokens_seen": 212867995, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.3828125, "step": 9882, "time_per_iteration": 2.339254856109619 }, { "auxiliary_loss_clip": 0.01054639, "auxiliary_loss_mlp": 0.01036023, "balance_loss_clip": 1.01218104, "balance_loss_mlp": 1.01738071, "epoch": 0.594198106117541, "flos": 12749442117120.0, "grad_norm": 3.0819409682543766, "language_loss": 0.81676972, "learning_rate": 1.492494784393667e-06, "loss": 0.83767635, "num_input_tokens_seen": 212885220, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 9883, "time_per_iteration": 2.347867012023926 }, { "auxiliary_loss_clip": 0.01059509, "auxiliary_loss_mlp": 0.01045472, "balance_loss_clip": 1.01817298, "balance_loss_mlp": 1.01916552, "epoch": 0.5942582293702089, "flos": 20995473271680.0, "grad_norm": 1.831010830545076, "language_loss": 0.75645328, "learning_rate": 1.4921180797476725e-06, "loss": 0.77750301, "num_input_tokens_seen": 212903195, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40429688, "step": 9884, "time_per_iteration": 2.427304744720459 }, { "auxiliary_loss_clip": 0.01057077, "auxiliary_loss_mlp": 0.01038717, "balance_loss_clip": 1.01253867, "balance_loss_mlp": 1.01793611, "epoch": 0.5943183526228769, "flos": 28290226078080.0, "grad_norm": 2.4006219401471984, "language_loss": 0.68194604, "learning_rate": 1.4917413943606106e-06, "loss": 0.70290399, "num_input_tokens_seen": 212923340, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.390625, "step": 9885, "time_per_iteration": 2.464165449142456 }, { "auxiliary_loss_clip": 0.01055731, "auxiliary_loss_mlp": 0.01041391, "balance_loss_clip": 1.01546299, "balance_loss_mlp": 1.01775503, "epoch": 0.5943784758755448, "flos": 26613417432960.0, "grad_norm": 2.368896367707395, "language_loss": 0.7816304, "learning_rate": 1.4913647282467667e-06, "loss": 0.80260158, "num_input_tokens_seen": 212942755, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37890625, "step": 9886, "time_per_iteration": 2.3972744941711426 }, { "auxiliary_loss_clip": 0.0100962, "auxiliary_loss_mlp": 0.01011648, "balance_loss_clip": 1.00893044, "balance_loss_mlp": 1.00189066, "epoch": 0.5944385991282128, "flos": 64187781298560.0, "grad_norm": 0.8672627612741726, "language_loss": 0.64732945, "learning_rate": 1.490988081420423e-06, "loss": 0.6675421, "num_input_tokens_seen": 212999355, "router_z_loss_clip": 0.02722168, "router_z_loss_mlp": 0.07714844, "step": 9887, "time_per_iteration": 2.9264235496520996 }, { "auxiliary_loss_clip": 0.01053627, "auxiliary_loss_mlp": 0.01040138, "balance_loss_clip": 1.01586711, "balance_loss_mlp": 1.0166229, "epoch": 0.5944987223808808, "flos": 19571017000320.0, "grad_norm": 3.43726645402833, "language_loss": 0.6994127, "learning_rate": 1.4906114538958615e-06, "loss": 0.72035033, "num_input_tokens_seen": 213018570, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.37109375, "step": 9888, "time_per_iteration": 2.352416515350342 }, { "auxiliary_loss_clip": 0.01056373, "auxiliary_loss_mlp": 0.01038711, "balance_loss_clip": 1.01377201, "balance_loss_mlp": 1.01816702, "epoch": 0.5945588456335488, "flos": 26176374633600.0, "grad_norm": 1.65752255212212, "language_loss": 0.80502188, "learning_rate": 1.490234845687366e-06, "loss": 0.82597268, "num_input_tokens_seen": 213037735, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.3828125, "step": 9889, "time_per_iteration": 2.451280355453491 }, { "auxiliary_loss_clip": 0.01055055, "auxiliary_loss_mlp": 0.0104079, "balance_loss_clip": 1.0172348, "balance_loss_mlp": 1.01717949, "epoch": 0.5946189688862168, "flos": 20445521535360.0, "grad_norm": 1.6224895588702797, "language_loss": 0.7203306, "learning_rate": 1.4898582568092154e-06, "loss": 0.74128902, "num_input_tokens_seen": 213057160, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.37890625, "step": 9890, "time_per_iteration": 2.3797669410705566 }, { "auxiliary_loss_clip": 0.01056843, "auxiliary_loss_mlp": 0.01041325, "balance_loss_clip": 1.01718545, "balance_loss_mlp": 1.01816809, "epoch": 0.5946790921388847, "flos": 13436847342720.0, "grad_norm": 2.1329797560472903, "language_loss": 0.69724262, "learning_rate": 1.489481687275691e-06, "loss": 0.71822441, "num_input_tokens_seen": 213073630, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.38671875, "step": 9891, "time_per_iteration": 2.372577428817749 }, { "auxiliary_loss_clip": 0.01053171, "auxiliary_loss_mlp": 0.01037502, "balance_loss_clip": 1.01422012, "balance_loss_mlp": 1.01726472, "epoch": 0.5947392153915527, "flos": 20411236713600.0, "grad_norm": 2.1399311138857446, "language_loss": 0.55106956, "learning_rate": 1.4891051371010726e-06, "loss": 0.5719763, "num_input_tokens_seen": 213092450, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 9892, "time_per_iteration": 2.3510234355926514 }, { "auxiliary_loss_clip": 0.01008537, "auxiliary_loss_mlp": 0.01003397, "balance_loss_clip": 1.00054777, "balance_loss_mlp": 1.00095344, "epoch": 0.5947993386442206, "flos": 65615798528640.0, "grad_norm": 0.6867370855182499, "language_loss": 0.5467301, "learning_rate": 1.4887286062996375e-06, "loss": 0.56684941, "num_input_tokens_seen": 213155465, "router_z_loss_clip": 0.02844238, "router_z_loss_mlp": 0.07568359, "step": 9893, "time_per_iteration": 3.104739189147949 }, { "auxiliary_loss_clip": 0.01055364, "auxiliary_loss_mlp": 0.01040416, "balance_loss_clip": 1.0175637, "balance_loss_mlp": 1.01860166, "epoch": 0.5948594618968887, "flos": 23182048811520.0, "grad_norm": 1.650462342265301, "language_loss": 0.75747645, "learning_rate": 1.4883520948856658e-06, "loss": 0.77843416, "num_input_tokens_seen": 213174875, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3671875, "step": 9894, "time_per_iteration": 2.3839035034179688 }, { "auxiliary_loss_clip": 0.01055906, "auxiliary_loss_mlp": 0.01039858, "balance_loss_clip": 1.01605177, "balance_loss_mlp": 1.01788211, "epoch": 0.5949195851495566, "flos": 13625901688320.0, "grad_norm": 1.7935500194522842, "language_loss": 0.79213643, "learning_rate": 1.487975602873434e-06, "loss": 0.81309408, "num_input_tokens_seen": 213192695, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.38085938, "step": 9895, "time_per_iteration": 2.3822174072265625 }, { "auxiliary_loss_clip": 0.01056791, "auxiliary_loss_mlp": 0.01041874, "balance_loss_clip": 1.01830637, "balance_loss_mlp": 1.01847553, "epoch": 0.5949797084022246, "flos": 19750121608320.0, "grad_norm": 3.2978235670272786, "language_loss": 0.80176431, "learning_rate": 1.4875991302772182e-06, "loss": 0.82275099, "num_input_tokens_seen": 213211195, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3828125, "step": 9896, "time_per_iteration": 2.3826372623443604 }, { "auxiliary_loss_clip": 0.01055581, "auxiliary_loss_mlp": 0.01042413, "balance_loss_clip": 1.01751029, "balance_loss_mlp": 1.01780522, "epoch": 0.5950398316548925, "flos": 25772743872000.0, "grad_norm": 1.5108685772478063, "language_loss": 0.84167981, "learning_rate": 1.4872226771112954e-06, "loss": 0.86265969, "num_input_tokens_seen": 213231975, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.37695312, "step": 9897, "time_per_iteration": 2.4422972202301025 }, { "auxiliary_loss_clip": 0.01056247, "auxiliary_loss_mlp": 0.01040034, "balance_loss_clip": 1.01623988, "balance_loss_mlp": 1.01792884, "epoch": 0.5950999549075605, "flos": 23037927252480.0, "grad_norm": 2.9502264577751407, "language_loss": 0.721506, "learning_rate": 1.486846243389939e-06, "loss": 0.74246883, "num_input_tokens_seen": 213249760, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3828125, "step": 9898, "time_per_iteration": 2.368438959121704 }, { "auxiliary_loss_clip": 0.01058999, "auxiliary_loss_mlp": 0.0103989, "balance_loss_clip": 1.01291275, "balance_loss_mlp": 1.01813173, "epoch": 0.5951600781602284, "flos": 32445169960320.0, "grad_norm": 2.2667441289113417, "language_loss": 0.65112615, "learning_rate": 1.4864698291274251e-06, "loss": 0.67211503, "num_input_tokens_seen": 213269890, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40820312, "step": 9899, "time_per_iteration": 2.479994773864746 }, { "auxiliary_loss_clip": 0.01055436, "auxiliary_loss_mlp": 0.01037167, "balance_loss_clip": 1.01498222, "balance_loss_mlp": 1.01812291, "epoch": 0.5952202014128964, "flos": 23799871964160.0, "grad_norm": 2.015337672202236, "language_loss": 0.73150074, "learning_rate": 1.4860934343380267e-06, "loss": 0.7524268, "num_input_tokens_seen": 213289400, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.37109375, "step": 9900, "time_per_iteration": 2.3990750312805176 }, { "auxiliary_loss_clip": 0.01055003, "auxiliary_loss_mlp": 0.01036991, "balance_loss_clip": 1.01199329, "balance_loss_mlp": 1.01874018, "epoch": 0.5952803246655644, "flos": 22491082627200.0, "grad_norm": 1.7052892578310308, "language_loss": 0.85480225, "learning_rate": 1.4857170590360169e-06, "loss": 0.87572217, "num_input_tokens_seen": 213308040, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36328125, "step": 9901, "time_per_iteration": 2.4071247577667236 }, { "auxiliary_loss_clip": 0.01008991, "auxiliary_loss_mlp": 0.0100763, "balance_loss_clip": 1.00491166, "balance_loss_mlp": 1.00149095, "epoch": 0.5953404479182324, "flos": 51232001846400.0, "grad_norm": 0.7943805889503383, "language_loss": 0.58186322, "learning_rate": 1.4853407032356674e-06, "loss": 0.60202944, "num_input_tokens_seen": 213358585, "router_z_loss_clip": 0.02722168, "router_z_loss_mlp": 0.07519531, "step": 9902, "time_per_iteration": 2.8641674518585205 }, { "auxiliary_loss_clip": 0.01055501, "auxiliary_loss_mlp": 0.01042236, "balance_loss_clip": 1.01840627, "balance_loss_mlp": 1.01691866, "epoch": 0.5954005711709004, "flos": 23111559043200.0, "grad_norm": 1.6951676230507706, "language_loss": 0.78530157, "learning_rate": 1.4849643669512503e-06, "loss": 0.80627894, "num_input_tokens_seen": 213379585, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.38671875, "step": 9903, "time_per_iteration": 3.7552719116210938 }, { "auxiliary_loss_clip": 0.01056424, "auxiliary_loss_mlp": 0.01041995, "balance_loss_clip": 1.01770008, "balance_loss_mlp": 1.01800632, "epoch": 0.5954606944235683, "flos": 35953277483520.0, "grad_norm": 1.749271141857159, "language_loss": 0.79100209, "learning_rate": 1.4845880501970362e-06, "loss": 0.81198633, "num_input_tokens_seen": 213401465, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.38476562, "step": 9904, "time_per_iteration": 2.502774715423584 }, { "auxiliary_loss_clip": 0.01057561, "auxiliary_loss_mlp": 0.01042081, "balance_loss_clip": 1.01698756, "balance_loss_mlp": 1.01839435, "epoch": 0.5955208176762363, "flos": 30442412062080.0, "grad_norm": 2.3521941388620142, "language_loss": 0.73856032, "learning_rate": 1.4842117529872942e-06, "loss": 0.75955677, "num_input_tokens_seen": 213422720, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.39257812, "step": 9905, "time_per_iteration": 2.45149564743042 }, { "auxiliary_loss_clip": 0.01055277, "auxiliary_loss_mlp": 0.0104086, "balance_loss_clip": 1.01524186, "balance_loss_mlp": 1.01699579, "epoch": 0.5955809409289042, "flos": 17639132895360.0, "grad_norm": 1.696122289872541, "language_loss": 0.71036243, "learning_rate": 1.483835475336295e-06, "loss": 0.73132384, "num_input_tokens_seen": 213439480, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3828125, "step": 9906, "time_per_iteration": 2.334688425064087 }, { "auxiliary_loss_clip": 0.01056157, "auxiliary_loss_mlp": 0.01045375, "balance_loss_clip": 1.02171159, "balance_loss_mlp": 1.01705253, "epoch": 0.5956410641815723, "flos": 24278740439040.0, "grad_norm": 2.1188421897124194, "language_loss": 0.75998938, "learning_rate": 1.4834592172583057e-06, "loss": 0.78100473, "num_input_tokens_seen": 213458895, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.390625, "step": 9907, "time_per_iteration": 2.3913822174072266 }, { "auxiliary_loss_clip": 0.01055887, "auxiliary_loss_mlp": 0.01045772, "balance_loss_clip": 1.02097619, "balance_loss_mlp": 1.01688683, "epoch": 0.5957011874342402, "flos": 35732870870400.0, "grad_norm": 1.5681992688545845, "language_loss": 0.68389702, "learning_rate": 1.483082978767595e-06, "loss": 0.70491356, "num_input_tokens_seen": 213481730, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.390625, "step": 9908, "time_per_iteration": 3.9403486251831055 }, { "auxiliary_loss_clip": 0.01056566, "auxiliary_loss_mlp": 0.01037585, "balance_loss_clip": 1.01335001, "balance_loss_mlp": 1.01848197, "epoch": 0.5957613106869082, "flos": 21244125041280.0, "grad_norm": 2.1618850614092304, "language_loss": 0.7746954, "learning_rate": 1.4827067598784298e-06, "loss": 0.79563701, "num_input_tokens_seen": 213497225, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.38085938, "step": 9909, "time_per_iteration": 3.7156882286071777 }, { "auxiliary_loss_clip": 0.01010295, "auxiliary_loss_mlp": 0.01004772, "balance_loss_clip": 1.00251937, "balance_loss_mlp": 1.00297236, "epoch": 0.5958214339395761, "flos": 65937802798080.0, "grad_norm": 0.9351411240151967, "language_loss": 0.73531342, "learning_rate": 1.4823305606050753e-06, "loss": 0.75546408, "num_input_tokens_seen": 213556890, "router_z_loss_clip": 0.02258301, "router_z_loss_mlp": 0.07324219, "step": 9910, "time_per_iteration": 3.06516170501709 }, { "auxiliary_loss_clip": 0.01056067, "auxiliary_loss_mlp": 0.01044429, "balance_loss_clip": 1.01845372, "balance_loss_mlp": 1.01727247, "epoch": 0.5958815571922441, "flos": 23217660264960.0, "grad_norm": 1.635279359389376, "language_loss": 0.70587265, "learning_rate": 1.481954380961799e-06, "loss": 0.72687757, "num_input_tokens_seen": 213575800, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.38671875, "step": 9911, "time_per_iteration": 2.400561571121216 }, { "auxiliary_loss_clip": 0.01061073, "auxiliary_loss_mlp": 0.01046035, "balance_loss_clip": 1.01750779, "balance_loss_mlp": 1.01962876, "epoch": 0.595941680444912, "flos": 16537867879680.0, "grad_norm": 2.051306269517258, "language_loss": 0.67658317, "learning_rate": 1.4815782209628631e-06, "loss": 0.69765425, "num_input_tokens_seen": 213592740, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4140625, "step": 9912, "time_per_iteration": 2.3497612476348877 }, { "auxiliary_loss_clip": 0.01056845, "auxiliary_loss_mlp": 0.01043247, "balance_loss_clip": 1.01720011, "balance_loss_mlp": 1.01779556, "epoch": 0.59600180369758, "flos": 27817641648000.0, "grad_norm": 2.202082479614937, "language_loss": 0.74147433, "learning_rate": 1.4812020806225337e-06, "loss": 0.76247525, "num_input_tokens_seen": 213611970, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.390625, "step": 9913, "time_per_iteration": 2.428071975708008 }, { "auxiliary_loss_clip": 0.01057991, "auxiliary_loss_mlp": 0.01038616, "balance_loss_clip": 1.01306891, "balance_loss_mlp": 1.0172509, "epoch": 0.596061926950248, "flos": 29490435486720.0, "grad_norm": 1.9372761455362892, "language_loss": 0.81776726, "learning_rate": 1.4808259599550738e-06, "loss": 0.83873332, "num_input_tokens_seen": 213632230, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.40625, "step": 9914, "time_per_iteration": 2.4447903633117676 }, { "auxiliary_loss_clip": 0.01055215, "auxiliary_loss_mlp": 0.01037712, "balance_loss_clip": 1.01227319, "balance_loss_mlp": 1.01799452, "epoch": 0.596122050202916, "flos": 16835851267200.0, "grad_norm": 1.9013282223746726, "language_loss": 0.69112027, "learning_rate": 1.4804498589747448e-06, "loss": 0.7120496, "num_input_tokens_seen": 213649645, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37304688, "step": 9915, "time_per_iteration": 2.3612301349639893 }, { "auxiliary_loss_clip": 0.01056751, "auxiliary_loss_mlp": 0.01038292, "balance_loss_clip": 1.01335323, "balance_loss_mlp": 1.01750326, "epoch": 0.596182173455584, "flos": 20995578005760.0, "grad_norm": 1.6262143831614215, "language_loss": 0.79884487, "learning_rate": 1.4800737776958095e-06, "loss": 0.81979531, "num_input_tokens_seen": 213668850, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.39257812, "step": 9916, "time_per_iteration": 2.3785958290100098 }, { "auxiliary_loss_clip": 0.01056722, "auxiliary_loss_mlp": 0.01041018, "balance_loss_clip": 1.01712847, "balance_loss_mlp": 1.01834774, "epoch": 0.5962422967082519, "flos": 16064899424640.0, "grad_norm": 1.9198378051777651, "language_loss": 0.85161823, "learning_rate": 1.4796977161325286e-06, "loss": 0.87259561, "num_input_tokens_seen": 213685695, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.38476562, "step": 9917, "time_per_iteration": 3.809262752532959 }, { "auxiliary_loss_clip": 0.01055045, "auxiliary_loss_mlp": 0.01036628, "balance_loss_clip": 1.0138824, "balance_loss_mlp": 1.01746321, "epoch": 0.5963024199609199, "flos": 12166148165760.0, "grad_norm": 1.7948635965720272, "language_loss": 0.77894306, "learning_rate": 1.4793216742991625e-06, "loss": 0.79985976, "num_input_tokens_seen": 213703515, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.375, "step": 9918, "time_per_iteration": 2.3467795848846436 }, { "auxiliary_loss_clip": 0.01057092, "auxiliary_loss_mlp": 0.01040553, "balance_loss_clip": 1.01603222, "balance_loss_mlp": 1.01854801, "epoch": 0.5963625432135878, "flos": 28073031310080.0, "grad_norm": 1.4212061824327238, "language_loss": 0.79327917, "learning_rate": 1.4789456522099707e-06, "loss": 0.81425565, "num_input_tokens_seen": 213724170, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.38476562, "step": 9919, "time_per_iteration": 2.4675228595733643 }, { "auxiliary_loss_clip": 0.01054684, "auxiliary_loss_mlp": 0.01034876, "balance_loss_clip": 1.00994968, "balance_loss_mlp": 1.0175916, "epoch": 0.5964226664662559, "flos": 19859434675200.0, "grad_norm": 2.056656753931069, "language_loss": 0.79270834, "learning_rate": 1.4785696498792122e-06, "loss": 0.81360394, "num_input_tokens_seen": 213740620, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37109375, "step": 9920, "time_per_iteration": 2.3454315662384033 }, { "auxiliary_loss_clip": 0.01058311, "auxiliary_loss_mlp": 0.01042083, "balance_loss_clip": 1.0181818, "balance_loss_mlp": 1.01927614, "epoch": 0.5964827897189238, "flos": 12931793481600.0, "grad_norm": 2.1718086821826863, "language_loss": 0.84334207, "learning_rate": 1.4781936673211446e-06, "loss": 0.86434597, "num_input_tokens_seen": 213755390, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.390625, "step": 9921, "time_per_iteration": 2.3803868293762207 }, { "auxiliary_loss_clip": 0.01055088, "auxiliary_loss_mlp": 0.01036589, "balance_loss_clip": 1.01191318, "balance_loss_mlp": 1.01743519, "epoch": 0.5965429129715918, "flos": 18149807485440.0, "grad_norm": 2.031014583400477, "language_loss": 0.82404685, "learning_rate": 1.4778177045500252e-06, "loss": 0.84496367, "num_input_tokens_seen": 213773225, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.37695312, "step": 9922, "time_per_iteration": 2.3407955169677734 }, { "auxiliary_loss_clip": 0.01054653, "auxiliary_loss_mlp": 0.01035109, "balance_loss_clip": 1.01107657, "balance_loss_mlp": 1.01686072, "epoch": 0.5966030362242597, "flos": 21762131016960.0, "grad_norm": 1.700717390348759, "language_loss": 0.78359008, "learning_rate": 1.477441761580111e-06, "loss": 0.80448771, "num_input_tokens_seen": 213791860, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37695312, "step": 9923, "time_per_iteration": 2.4004054069519043 }, { "auxiliary_loss_clip": 0.01058583, "auxiliary_loss_mlp": 0.01045536, "balance_loss_clip": 1.01848722, "balance_loss_mlp": 1.01824808, "epoch": 0.5966631594769277, "flos": 18806209557120.0, "grad_norm": 1.9852961445349215, "language_loss": 0.77100259, "learning_rate": 1.4770658384256573e-06, "loss": 0.79204375, "num_input_tokens_seen": 213809455, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40234375, "step": 9924, "time_per_iteration": 2.3756442070007324 }, { "auxiliary_loss_clip": 0.01053916, "auxiliary_loss_mlp": 0.01038108, "balance_loss_clip": 1.01413524, "balance_loss_mlp": 1.01732218, "epoch": 0.5967232827295956, "flos": 14063293601280.0, "grad_norm": 1.7644070749144674, "language_loss": 0.67087138, "learning_rate": 1.4766899351009204e-06, "loss": 0.69179165, "num_input_tokens_seen": 213826615, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 9925, "time_per_iteration": 2.3415942192077637 }, { "auxiliary_loss_clip": 0.0105466, "auxiliary_loss_mlp": 0.01032539, "balance_loss_clip": 1.0090549, "balance_loss_mlp": 1.01838708, "epoch": 0.5967834059822636, "flos": 17237282613120.0, "grad_norm": 2.05458727826708, "language_loss": 0.72999245, "learning_rate": 1.4763140516201528e-06, "loss": 0.75086445, "num_input_tokens_seen": 213844495, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 9926, "time_per_iteration": 2.3691956996917725 }, { "auxiliary_loss_clip": 0.01057669, "auxiliary_loss_mlp": 0.01042016, "balance_loss_clip": 1.01513422, "balance_loss_mlp": 1.01772356, "epoch": 0.5968435292349316, "flos": 42518659743360.0, "grad_norm": 1.751084712656822, "language_loss": 0.71684861, "learning_rate": 1.4759381879976088e-06, "loss": 0.73784548, "num_input_tokens_seen": 213869125, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.3984375, "step": 9927, "time_per_iteration": 2.5719106197357178 }, { "auxiliary_loss_clip": 0.01057901, "auxiliary_loss_mlp": 0.01043016, "balance_loss_clip": 1.01525187, "balance_loss_mlp": 1.01753831, "epoch": 0.5969036524875996, "flos": 37629457724160.0, "grad_norm": 1.6003024549540563, "language_loss": 0.64646018, "learning_rate": 1.4755623442475415e-06, "loss": 0.66746926, "num_input_tokens_seen": 213891115, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40234375, "step": 9928, "time_per_iteration": 2.511996269226074 }, { "auxiliary_loss_clip": 0.01052553, "auxiliary_loss_mlp": 0.01045488, "balance_loss_clip": 1.02209949, "balance_loss_mlp": 1.01576388, "epoch": 0.5969637757402676, "flos": 23147275230720.0, "grad_norm": 1.6866114616404946, "language_loss": 0.69966412, "learning_rate": 1.4751865203842022e-06, "loss": 0.72064459, "num_input_tokens_seen": 213911925, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3671875, "step": 9929, "time_per_iteration": 2.3840153217315674 }, { "auxiliary_loss_clip": 0.01054163, "auxiliary_loss_mlp": 0.01037314, "balance_loss_clip": 1.01319814, "balance_loss_mlp": 1.01749647, "epoch": 0.5970238989929355, "flos": 24019894552320.0, "grad_norm": 2.177440534955975, "language_loss": 0.78718126, "learning_rate": 1.4748107164218431e-06, "loss": 0.80809605, "num_input_tokens_seen": 213930715, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.3671875, "step": 9930, "time_per_iteration": 2.3896005153656006 }, { "auxiliary_loss_clip": 0.01057542, "auxiliary_loss_mlp": 0.01042701, "balance_loss_clip": 1.01843047, "balance_loss_mlp": 1.01850128, "epoch": 0.5970840222456035, "flos": 19425883011840.0, "grad_norm": 1.8992300168237586, "language_loss": 0.70998168, "learning_rate": 1.4744349323747146e-06, "loss": 0.73098415, "num_input_tokens_seen": 213950015, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.390625, "step": 9931, "time_per_iteration": 2.3762331008911133 }, { "auxiliary_loss_clip": 0.01011174, "auxiliary_loss_mlp": 0.0100337, "balance_loss_clip": 1.00083113, "balance_loss_mlp": 1.00399065, "epoch": 0.5971441454982714, "flos": 62973781902720.0, "grad_norm": 0.8601237645092096, "language_loss": 0.64256954, "learning_rate": 1.474059168257065e-06, "loss": 0.66271508, "num_input_tokens_seen": 214003330, "router_z_loss_clip": 0.02539062, "router_z_loss_mlp": 0.07177734, "step": 9932, "time_per_iteration": 2.9297099113464355 }, { "auxiliary_loss_clip": 0.01055588, "auxiliary_loss_mlp": 0.01043209, "balance_loss_clip": 1.01817501, "balance_loss_mlp": 1.0174942, "epoch": 0.5972042687509395, "flos": 20265195029760.0, "grad_norm": 1.8033175774765589, "language_loss": 0.75206226, "learning_rate": 1.4736834240831454e-06, "loss": 0.77305019, "num_input_tokens_seen": 214021680, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38085938, "step": 9933, "time_per_iteration": 2.379478693008423 }, { "auxiliary_loss_clip": 0.01010376, "auxiliary_loss_mlp": 0.01003762, "balance_loss_clip": 1.00106823, "balance_loss_mlp": 1.00329423, "epoch": 0.5972643920036074, "flos": 71648510175360.0, "grad_norm": 0.6660497446821488, "language_loss": 0.52077883, "learning_rate": 1.473307699867203e-06, "loss": 0.54092026, "num_input_tokens_seen": 214090265, "router_z_loss_clip": 0.02697754, "router_z_loss_mlp": 0.0703125, "step": 9934, "time_per_iteration": 3.1557915210723877 }, { "auxiliary_loss_clip": 0.01010248, "auxiliary_loss_mlp": 0.01002996, "balance_loss_clip": 1.00044477, "balance_loss_mlp": 1.00300455, "epoch": 0.5973245152562754, "flos": 56889781735680.0, "grad_norm": 0.8325285759992868, "language_loss": 0.54316813, "learning_rate": 1.4729319956234849e-06, "loss": 0.56330055, "num_input_tokens_seen": 214146375, "router_z_loss_clip": 0.0255127, "router_z_loss_mlp": 0.07226562, "step": 9935, "time_per_iteration": 3.08651065826416 }, { "auxiliary_loss_clip": 0.01056062, "auxiliary_loss_mlp": 0.01042169, "balance_loss_clip": 1.01733732, "balance_loss_mlp": 1.01803327, "epoch": 0.5973846385089433, "flos": 24163387706880.0, "grad_norm": 1.7365500033417096, "language_loss": 0.66849494, "learning_rate": 1.4725563113662394e-06, "loss": 0.68947721, "num_input_tokens_seen": 214165340, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38085938, "step": 9936, "time_per_iteration": 2.398763656616211 }, { "auxiliary_loss_clip": 0.01055984, "auxiliary_loss_mlp": 0.01042412, "balance_loss_clip": 1.01812899, "balance_loss_mlp": 1.01756144, "epoch": 0.5974447617616113, "flos": 17669786935680.0, "grad_norm": 2.0818594543520956, "language_loss": 0.68737364, "learning_rate": 1.4721806471097103e-06, "loss": 0.70835757, "num_input_tokens_seen": 214181360, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.38476562, "step": 9937, "time_per_iteration": 2.383715867996216 }, { "auxiliary_loss_clip": 0.01055687, "auxiliary_loss_mlp": 0.01038516, "balance_loss_clip": 1.01318407, "balance_loss_mlp": 1.01654959, "epoch": 0.5975048850142792, "flos": 22891431720960.0, "grad_norm": 2.0628814772001447, "language_loss": 0.78649455, "learning_rate": 1.4718050028681442e-06, "loss": 0.80743659, "num_input_tokens_seen": 214198525, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.390625, "step": 9938, "time_per_iteration": 2.378601551055908 }, { "auxiliary_loss_clip": 0.01054708, "auxiliary_loss_mlp": 0.01045055, "balance_loss_clip": 1.02064061, "balance_loss_mlp": 1.01711214, "epoch": 0.5975650082669473, "flos": 24351953293440.0, "grad_norm": 1.4402495932248698, "language_loss": 0.7680434, "learning_rate": 1.4714293786557855e-06, "loss": 0.78904098, "num_input_tokens_seen": 214218710, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.375, "step": 9939, "time_per_iteration": 2.4574036598205566 }, { "auxiliary_loss_clip": 0.01057991, "auxiliary_loss_mlp": 0.01040984, "balance_loss_clip": 1.01213503, "balance_loss_mlp": 1.01744592, "epoch": 0.5976251315196152, "flos": 20922295328640.0, "grad_norm": 2.4299259578614567, "language_loss": 0.70922345, "learning_rate": 1.471053774486878e-06, "loss": 0.73021317, "num_input_tokens_seen": 214237800, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.40625, "step": 9940, "time_per_iteration": 2.3637375831604004 }, { "auxiliary_loss_clip": 0.01053536, "auxiliary_loss_mlp": 0.01038683, "balance_loss_clip": 1.01649833, "balance_loss_mlp": 1.0168736, "epoch": 0.5976852547722832, "flos": 35843161455360.0, "grad_norm": 1.3841775875988074, "language_loss": 0.70814443, "learning_rate": 1.470678190375664e-06, "loss": 0.72906661, "num_input_tokens_seen": 214260355, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.3671875, "step": 9941, "time_per_iteration": 2.528798818588257 }, { "auxiliary_loss_clip": 0.01054665, "auxiliary_loss_mlp": 0.01040417, "balance_loss_clip": 1.01724303, "balance_loss_mlp": 1.01662493, "epoch": 0.5977453780249512, "flos": 12855229136640.0, "grad_norm": 1.7899245678780817, "language_loss": 0.78945941, "learning_rate": 1.470302626336386e-06, "loss": 0.81041026, "num_input_tokens_seen": 214277120, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.38085938, "step": 9942, "time_per_iteration": 2.3339056968688965 }, { "auxiliary_loss_clip": 0.01055599, "auxiliary_loss_mlp": 0.01046306, "balance_loss_clip": 1.02143931, "balance_loss_mlp": 1.0170697, "epoch": 0.5978055012776191, "flos": 20958116250240.0, "grad_norm": 1.8791643231170954, "language_loss": 0.76706159, "learning_rate": 1.4699270823832857e-06, "loss": 0.78808063, "num_input_tokens_seen": 214295300, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38476562, "step": 9943, "time_per_iteration": 3.6250364780426025 }, { "auxiliary_loss_clip": 0.01055285, "auxiliary_loss_mlp": 0.01041011, "balance_loss_clip": 1.01707339, "balance_loss_mlp": 1.01821935, "epoch": 0.5978656245302871, "flos": 34056585895680.0, "grad_norm": 2.129793730277272, "language_loss": 0.63407278, "learning_rate": 1.4695515585306032e-06, "loss": 0.65503573, "num_input_tokens_seen": 214317050, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37109375, "step": 9944, "time_per_iteration": 2.487746238708496 }, { "auxiliary_loss_clip": 0.01056656, "auxiliary_loss_mlp": 0.01044188, "balance_loss_clip": 1.01773524, "balance_loss_mlp": 1.01834917, "epoch": 0.597925747782955, "flos": 37371903557760.0, "grad_norm": 1.6528238211017168, "language_loss": 0.73335743, "learning_rate": 1.4691760547925795e-06, "loss": 0.7543658, "num_input_tokens_seen": 214337470, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.3828125, "step": 9945, "time_per_iteration": 2.538015842437744 }, { "auxiliary_loss_clip": 0.01055395, "auxiliary_loss_mlp": 0.01037606, "balance_loss_clip": 1.01283383, "balance_loss_mlp": 1.0168376, "epoch": 0.5979858710356231, "flos": 25373616675840.0, "grad_norm": 3.105106744641933, "language_loss": 0.68694031, "learning_rate": 1.4688005711834522e-06, "loss": 0.7078703, "num_input_tokens_seen": 214357975, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38476562, "step": 9946, "time_per_iteration": 2.3913629055023193 }, { "auxiliary_loss_clip": 0.01057742, "auxiliary_loss_mlp": 0.01040765, "balance_loss_clip": 1.01486063, "balance_loss_mlp": 1.01851964, "epoch": 0.598045994288291, "flos": 13697578442880.0, "grad_norm": 2.0147583691020117, "language_loss": 0.89563942, "learning_rate": 1.468425107717461e-06, "loss": 0.91662455, "num_input_tokens_seen": 214374125, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.39257812, "step": 9947, "time_per_iteration": 2.3525450229644775 }, { "auxiliary_loss_clip": 0.01053644, "auxiliary_loss_mlp": 0.01043037, "balance_loss_clip": 1.01912367, "balance_loss_mlp": 1.01766515, "epoch": 0.598106117540959, "flos": 21980268391680.0, "grad_norm": 1.8359819156631376, "language_loss": 0.73363698, "learning_rate": 1.4680496644088432e-06, "loss": 0.7546038, "num_input_tokens_seen": 214393395, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.359375, "step": 9948, "time_per_iteration": 3.868403196334839 }, { "auxiliary_loss_clip": 0.01058565, "auxiliary_loss_mlp": 0.01043901, "balance_loss_clip": 1.0176034, "balance_loss_mlp": 1.01957297, "epoch": 0.5981662407936269, "flos": 20558290826880.0, "grad_norm": 1.9085672715621058, "language_loss": 0.90419406, "learning_rate": 1.4676742412718347e-06, "loss": 0.9252187, "num_input_tokens_seen": 214411550, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.390625, "step": 9949, "time_per_iteration": 2.373253345489502 }, { "auxiliary_loss_clip": 0.0105723, "auxiliary_loss_mlp": 0.01039624, "balance_loss_clip": 1.01698613, "balance_loss_mlp": 1.01994979, "epoch": 0.5982263640462949, "flos": 14062979399040.0, "grad_norm": 1.6913773671150722, "language_loss": 0.71487403, "learning_rate": 1.467298838320673e-06, "loss": 0.73584259, "num_input_tokens_seen": 214429780, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.37304688, "step": 9950, "time_per_iteration": 2.3334810733795166 }, { "auxiliary_loss_clip": 0.01056438, "auxiliary_loss_mlp": 0.01039609, "balance_loss_clip": 1.01514721, "balance_loss_mlp": 1.01852107, "epoch": 0.5982864872989628, "flos": 17706410818560.0, "grad_norm": 1.529870542961733, "language_loss": 0.78716111, "learning_rate": 1.4669234555695921e-06, "loss": 0.8081215, "num_input_tokens_seen": 214447775, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37890625, "step": 9951, "time_per_iteration": 2.3631749153137207 }, { "auxiliary_loss_clip": 0.0105764, "auxiliary_loss_mlp": 0.01045297, "balance_loss_clip": 1.01713943, "balance_loss_mlp": 1.01881647, "epoch": 0.5983466105516309, "flos": 16763825399040.0, "grad_norm": 1.8742363290683295, "language_loss": 0.74962986, "learning_rate": 1.4665480930328275e-06, "loss": 0.77065921, "num_input_tokens_seen": 214467245, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.38671875, "step": 9952, "time_per_iteration": 2.360698938369751 }, { "auxiliary_loss_clip": 0.01059592, "auxiliary_loss_mlp": 0.01040939, "balance_loss_clip": 1.01313925, "balance_loss_mlp": 1.0196569, "epoch": 0.5984067338042988, "flos": 20041820951040.0, "grad_norm": 2.077515582325337, "language_loss": 0.79660285, "learning_rate": 1.466172750724613e-06, "loss": 0.81760818, "num_input_tokens_seen": 214484385, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.3984375, "step": 9953, "time_per_iteration": 2.3742663860321045 }, { "auxiliary_loss_clip": 0.0105649, "auxiliary_loss_mlp": 0.0104079, "balance_loss_clip": 1.0161736, "balance_loss_mlp": 1.0185622, "epoch": 0.5984668570569668, "flos": 26318785536000.0, "grad_norm": 1.5317461866514381, "language_loss": 0.70584619, "learning_rate": 1.4657974286591807e-06, "loss": 0.72681904, "num_input_tokens_seen": 214503465, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37890625, "step": 9954, "time_per_iteration": 2.4102349281311035 }, { "auxiliary_loss_clip": 0.01058621, "auxiliary_loss_mlp": 0.01037562, "balance_loss_clip": 1.01292157, "balance_loss_mlp": 1.01981902, "epoch": 0.5985269803096348, "flos": 20592715294080.0, "grad_norm": 1.747905215288434, "language_loss": 0.74190533, "learning_rate": 1.4654221268507637e-06, "loss": 0.76286721, "num_input_tokens_seen": 214520725, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.38867188, "step": 9955, "time_per_iteration": 2.3617281913757324 }, { "auxiliary_loss_clip": 0.01056393, "auxiliary_loss_mlp": 0.01036301, "balance_loss_clip": 1.01273346, "balance_loss_mlp": 1.01818943, "epoch": 0.5985871035623027, "flos": 26864303529600.0, "grad_norm": 1.5313390534872875, "language_loss": 0.69448864, "learning_rate": 1.4650468453135934e-06, "loss": 0.7154156, "num_input_tokens_seen": 214540675, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.3828125, "step": 9956, "time_per_iteration": 3.9031474590301514 }, { "auxiliary_loss_clip": 0.01057381, "auxiliary_loss_mlp": 0.01041673, "balance_loss_clip": 1.01680624, "balance_loss_mlp": 1.01848388, "epoch": 0.5986472268149707, "flos": 19608688224000.0, "grad_norm": 2.144083995236414, "language_loss": 0.74774683, "learning_rate": 1.4646715840618999e-06, "loss": 0.76873744, "num_input_tokens_seen": 214559910, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.38867188, "step": 9957, "time_per_iteration": 2.359659194946289 }, { "auxiliary_loss_clip": 0.01053547, "auxiliary_loss_mlp": 0.01039763, "balance_loss_clip": 1.01652944, "balance_loss_mlp": 1.01787305, "epoch": 0.5987073500676386, "flos": 21793657841280.0, "grad_norm": 2.7272328792314013, "language_loss": 0.86016589, "learning_rate": 1.4642963431099138e-06, "loss": 0.88109899, "num_input_tokens_seen": 214575960, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35546875, "step": 9958, "time_per_iteration": 2.374685049057007 }, { "auxiliary_loss_clip": 0.01058445, "auxiliary_loss_mlp": 0.0104366, "balance_loss_clip": 1.01746976, "balance_loss_mlp": 1.01889539, "epoch": 0.5987674733203067, "flos": 24313269640320.0, "grad_norm": 1.8990965587152735, "language_loss": 0.67715812, "learning_rate": 1.463921122471864e-06, "loss": 0.69817913, "num_input_tokens_seen": 214594230, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.39453125, "step": 9959, "time_per_iteration": 2.388361692428589 }, { "auxiliary_loss_clip": 0.01058156, "auxiliary_loss_mlp": 0.01040439, "balance_loss_clip": 1.01458216, "balance_loss_mlp": 1.0193783, "epoch": 0.5988275965729746, "flos": 21319258020480.0, "grad_norm": 1.7429088556643662, "language_loss": 0.84876347, "learning_rate": 1.4635459221619796e-06, "loss": 0.86974943, "num_input_tokens_seen": 214613130, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38671875, "step": 9960, "time_per_iteration": 2.396484136581421 }, { "auxiliary_loss_clip": 0.01055824, "auxiliary_loss_mlp": 0.01035823, "balance_loss_clip": 1.01239824, "balance_loss_mlp": 1.01775956, "epoch": 0.5988877198256426, "flos": 25116900382080.0, "grad_norm": 1.521707695732219, "language_loss": 0.80874467, "learning_rate": 1.4631707421944868e-06, "loss": 0.82966113, "num_input_tokens_seen": 214634470, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.38085938, "step": 9961, "time_per_iteration": 2.4081528186798096 }, { "auxiliary_loss_clip": 0.01057454, "auxiliary_loss_mlp": 0.0104115, "balance_loss_clip": 1.01640236, "balance_loss_mlp": 1.0193603, "epoch": 0.5989478430783105, "flos": 26427993868800.0, "grad_norm": 2.041265034522868, "language_loss": 0.67894268, "learning_rate": 1.4627955825836136e-06, "loss": 0.69992876, "num_input_tokens_seen": 214654030, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38085938, "step": 9962, "time_per_iteration": 2.3977408409118652 }, { "auxiliary_loss_clip": 0.01056124, "auxiliary_loss_mlp": 0.01041693, "balance_loss_clip": 1.01811385, "balance_loss_mlp": 1.01806092, "epoch": 0.5990079663309785, "flos": 25777177614720.0, "grad_norm": 1.4157428472610571, "language_loss": 0.74773693, "learning_rate": 1.4624204433435857e-06, "loss": 0.76871514, "num_input_tokens_seen": 214676985, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.38085938, "step": 9963, "time_per_iteration": 2.4470884799957275 }, { "auxiliary_loss_clip": 0.01055007, "auxiliary_loss_mlp": 0.01039121, "balance_loss_clip": 1.01420593, "balance_loss_mlp": 1.0174706, "epoch": 0.5990680895836464, "flos": 36830260725120.0, "grad_norm": 1.6925047739996948, "language_loss": 0.68722701, "learning_rate": 1.4620453244886281e-06, "loss": 0.70816827, "num_input_tokens_seen": 214700105, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.375, "step": 9964, "time_per_iteration": 2.5061001777648926 }, { "auxiliary_loss_clip": 0.010539, "auxiliary_loss_mlp": 0.01034555, "balance_loss_clip": 1.01021242, "balance_loss_mlp": 1.01701176, "epoch": 0.5991282128363145, "flos": 24132419464320.0, "grad_norm": 2.264245204125111, "language_loss": 0.77349401, "learning_rate": 1.4616702260329662e-06, "loss": 0.79437852, "num_input_tokens_seen": 214717885, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36914062, "step": 9965, "time_per_iteration": 2.4127469062805176 }, { "auxiliary_loss_clip": 0.01054206, "auxiliary_loss_mlp": 0.01034652, "balance_loss_clip": 1.01013029, "balance_loss_mlp": 1.01629996, "epoch": 0.5991883360889824, "flos": 10303357374720.0, "grad_norm": 1.7109538725366742, "language_loss": 0.78370655, "learning_rate": 1.4612951479908229e-06, "loss": 0.80459511, "num_input_tokens_seen": 214733680, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37890625, "step": 9966, "time_per_iteration": 2.331923723220825 }, { "auxiliary_loss_clip": 0.01056231, "auxiliary_loss_mlp": 0.01036907, "balance_loss_clip": 1.01381636, "balance_loss_mlp": 1.01877761, "epoch": 0.5992484593416504, "flos": 23950068099840.0, "grad_norm": 1.5567797919069344, "language_loss": 0.74803042, "learning_rate": 1.460920090376422e-06, "loss": 0.76896179, "num_input_tokens_seen": 214753285, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.375, "step": 9967, "time_per_iteration": 2.416025400161743 }, { "auxiliary_loss_clip": 0.01058218, "auxiliary_loss_mlp": 0.01047597, "balance_loss_clip": 1.0205009, "balance_loss_mlp": 1.01729393, "epoch": 0.5993085825943184, "flos": 11943402491520.0, "grad_norm": 2.714080626937971, "language_loss": 0.68751591, "learning_rate": 1.4605450532039847e-06, "loss": 0.70857406, "num_input_tokens_seen": 214767810, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.41015625, "step": 9968, "time_per_iteration": 2.345857620239258 }, { "auxiliary_loss_clip": 0.01056321, "auxiliary_loss_mlp": 0.01043237, "balance_loss_clip": 1.01699924, "balance_loss_mlp": 1.01764548, "epoch": 0.5993687058469863, "flos": 19025813208960.0, "grad_norm": 1.5801331112902455, "language_loss": 0.80215585, "learning_rate": 1.4601700364877334e-06, "loss": 0.82315141, "num_input_tokens_seen": 214786040, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38671875, "step": 9969, "time_per_iteration": 2.3613781929016113 }, { "auxiliary_loss_clip": 0.01056281, "auxiliary_loss_mlp": 0.01040515, "balance_loss_clip": 1.01493299, "balance_loss_mlp": 1.01684535, "epoch": 0.5994288290996543, "flos": 14282094291840.0, "grad_norm": 1.7574182580157922, "language_loss": 0.82637519, "learning_rate": 1.4597950402418889e-06, "loss": 0.84734315, "num_input_tokens_seen": 214803110, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.39257812, "step": 9970, "time_per_iteration": 2.329341173171997 }, { "auxiliary_loss_clip": 0.0105807, "auxiliary_loss_mlp": 0.01040799, "balance_loss_clip": 1.01158118, "balance_loss_mlp": 1.01724946, "epoch": 0.5994889523523222, "flos": 19205685866880.0, "grad_norm": 1.9217755984884692, "language_loss": 0.63575971, "learning_rate": 1.4594200644806697e-06, "loss": 0.65674841, "num_input_tokens_seen": 214819945, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.40820312, "step": 9971, "time_per_iteration": 2.347970962524414 }, { "auxiliary_loss_clip": 0.01053542, "auxiliary_loss_mlp": 0.01036325, "balance_loss_clip": 1.01327038, "balance_loss_mlp": 1.01745903, "epoch": 0.5995490756049903, "flos": 28035813934080.0, "grad_norm": 1.8486144461196083, "language_loss": 0.79985332, "learning_rate": 1.4590451092182962e-06, "loss": 0.82075208, "num_input_tokens_seen": 214838810, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36132812, "step": 9972, "time_per_iteration": 2.3997459411621094 }, { "auxiliary_loss_clip": 0.01060481, "auxiliary_loss_mlp": 0.01047309, "balance_loss_clip": 1.0180198, "balance_loss_mlp": 1.01787972, "epoch": 0.5996091988576582, "flos": 29051856587520.0, "grad_norm": 3.723828446511799, "language_loss": 0.76723301, "learning_rate": 1.4586701744689864e-06, "loss": 0.78831089, "num_input_tokens_seen": 214857040, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.42578125, "step": 9973, "time_per_iteration": 2.4451162815093994 }, { "auxiliary_loss_clip": 0.01054285, "auxiliary_loss_mlp": 0.01041731, "balance_loss_clip": 1.01639891, "balance_loss_mlp": 1.01605177, "epoch": 0.5996693221103262, "flos": 20812912439040.0, "grad_norm": 2.0709075017681853, "language_loss": 0.67050302, "learning_rate": 1.4582952602469578e-06, "loss": 0.69146311, "num_input_tokens_seen": 214873375, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3828125, "step": 9974, "time_per_iteration": 2.352391004562378 }, { "auxiliary_loss_clip": 0.0105634, "auxiliary_loss_mlp": 0.01046836, "balance_loss_clip": 1.01880956, "balance_loss_mlp": 1.01752388, "epoch": 0.5997294453629941, "flos": 23767786558080.0, "grad_norm": 1.597075682178523, "language_loss": 0.75655019, "learning_rate": 1.457920366566428e-06, "loss": 0.77758193, "num_input_tokens_seen": 214893900, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.38867188, "step": 9975, "time_per_iteration": 2.411155939102173 }, { "auxiliary_loss_clip": 0.01056382, "auxiliary_loss_mlp": 0.01045277, "balance_loss_clip": 1.01938438, "balance_loss_mlp": 1.01772535, "epoch": 0.5997895686156621, "flos": 20958954122880.0, "grad_norm": 1.8252498726278144, "language_loss": 0.78694606, "learning_rate": 1.457545493441611e-06, "loss": 0.80796266, "num_input_tokens_seen": 214912110, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.38671875, "step": 9976, "time_per_iteration": 2.353914260864258 }, { "auxiliary_loss_clip": 0.01055287, "auxiliary_loss_mlp": 0.01041001, "balance_loss_clip": 1.01361823, "balance_loss_mlp": 1.01642394, "epoch": 0.59984969186833, "flos": 28364206982400.0, "grad_norm": 2.7139976296865886, "language_loss": 0.76729989, "learning_rate": 1.4571706408867237e-06, "loss": 0.78826278, "num_input_tokens_seen": 214930140, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.38867188, "step": 9977, "time_per_iteration": 2.432612895965576 }, { "auxiliary_loss_clip": 0.01055615, "auxiliary_loss_mlp": 0.01042101, "balance_loss_clip": 1.01611316, "balance_loss_mlp": 1.01627135, "epoch": 0.5999098151209981, "flos": 22564784240640.0, "grad_norm": 1.5372321596116194, "language_loss": 0.69984853, "learning_rate": 1.4567958089159802e-06, "loss": 0.72082567, "num_input_tokens_seen": 214949200, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.39257812, "step": 9978, "time_per_iteration": 2.3688621520996094 }, { "auxiliary_loss_clip": 0.01060556, "auxiliary_loss_mlp": 0.01038968, "balance_loss_clip": 1.01251519, "balance_loss_mlp": 1.01985979, "epoch": 0.599969938373666, "flos": 18767770283520.0, "grad_norm": 2.468577529800031, "language_loss": 0.82661968, "learning_rate": 1.456420997543594e-06, "loss": 0.84761494, "num_input_tokens_seen": 214965775, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40820312, "step": 9979, "time_per_iteration": 2.361079692840576 }, { "auxiliary_loss_clip": 0.01054221, "auxiliary_loss_mlp": 0.0104174, "balance_loss_clip": 1.01665807, "balance_loss_mlp": 1.01705003, "epoch": 0.600030061626334, "flos": 11326452122880.0, "grad_norm": 2.1125025964226527, "language_loss": 0.72336268, "learning_rate": 1.4560462067837782e-06, "loss": 0.7443223, "num_input_tokens_seen": 214982480, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37109375, "step": 9980, "time_per_iteration": 2.330484390258789 }, { "auxiliary_loss_clip": 0.01058806, "auxiliary_loss_mlp": 0.0104373, "balance_loss_clip": 1.01507223, "balance_loss_mlp": 1.01834583, "epoch": 0.600090184879002, "flos": 16577808341760.0, "grad_norm": 2.691356307603904, "language_loss": 0.71007049, "learning_rate": 1.4556714366507445e-06, "loss": 0.73109591, "num_input_tokens_seen": 214998110, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.40429688, "step": 9981, "time_per_iteration": 2.3744113445281982 }, { "auxiliary_loss_clip": 0.01054324, "auxiliary_loss_mlp": 0.01043941, "balance_loss_clip": 1.02074337, "balance_loss_mlp": 1.01676023, "epoch": 0.6001503081316699, "flos": 23617625333760.0, "grad_norm": 1.8151439109385232, "language_loss": 0.7930423, "learning_rate": 1.4552966871587048e-06, "loss": 0.81402498, "num_input_tokens_seen": 215017995, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.375, "step": 9982, "time_per_iteration": 3.713190793991089 }, { "auxiliary_loss_clip": 0.01056258, "auxiliary_loss_mlp": 0.01045932, "balance_loss_clip": 1.01872849, "balance_loss_mlp": 1.01786613, "epoch": 0.6002104313843379, "flos": 20666626375680.0, "grad_norm": 1.6960455449591838, "language_loss": 0.73771095, "learning_rate": 1.4549219583218686e-06, "loss": 0.75873286, "num_input_tokens_seen": 215038285, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.38476562, "step": 9983, "time_per_iteration": 2.3946287631988525 }, { "auxiliary_loss_clip": 0.01056429, "auxiliary_loss_mlp": 0.01045113, "balance_loss_clip": 1.01868439, "balance_loss_mlp": 1.017259, "epoch": 0.6002705546370058, "flos": 22454144542080.0, "grad_norm": 1.9451310040349825, "language_loss": 0.79418409, "learning_rate": 1.454547250154447e-06, "loss": 0.81519949, "num_input_tokens_seen": 215057825, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.390625, "step": 9984, "time_per_iteration": 2.363572359085083 }, { "auxiliary_loss_clip": 0.01055958, "auxiliary_loss_mlp": 0.01041402, "balance_loss_clip": 1.01701164, "balance_loss_mlp": 1.01757669, "epoch": 0.6003306778896739, "flos": 25190811463680.0, "grad_norm": 1.8448951584694433, "language_loss": 0.83990037, "learning_rate": 1.4541725626706485e-06, "loss": 0.860874, "num_input_tokens_seen": 215077790, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.3828125, "step": 9985, "time_per_iteration": 2.4531543254852295 }, { "auxiliary_loss_clip": 0.01055934, "auxiliary_loss_mlp": 0.01041715, "balance_loss_clip": 1.01652622, "balance_loss_mlp": 1.01802087, "epoch": 0.6003908011423418, "flos": 26686525553280.0, "grad_norm": 1.746543116683562, "language_loss": 0.73202038, "learning_rate": 1.4537978958846809e-06, "loss": 0.75299686, "num_input_tokens_seen": 215097650, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37890625, "step": 9986, "time_per_iteration": 2.4075582027435303 }, { "auxiliary_loss_clip": 0.01059873, "auxiliary_loss_mlp": 0.01038153, "balance_loss_clip": 1.01282144, "balance_loss_mlp": 1.02011991, "epoch": 0.6004509243950098, "flos": 22563981279360.0, "grad_norm": 1.5043397889060346, "language_loss": 0.72951221, "learning_rate": 1.4534232498107514e-06, "loss": 0.75049245, "num_input_tokens_seen": 215118235, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3984375, "step": 9987, "time_per_iteration": 5.2034924030303955 }, { "auxiliary_loss_clip": 0.01054176, "auxiliary_loss_mlp": 0.01038287, "balance_loss_clip": 1.01363516, "balance_loss_mlp": 1.01721001, "epoch": 0.6005110476476777, "flos": 19718280581760.0, "grad_norm": 1.754951353683002, "language_loss": 0.8568939, "learning_rate": 1.4530486244630673e-06, "loss": 0.87781852, "num_input_tokens_seen": 215136755, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37109375, "step": 9988, "time_per_iteration": 2.369492292404175 }, { "auxiliary_loss_clip": 0.01055153, "auxiliary_loss_mlp": 0.01040768, "balance_loss_clip": 1.0162586, "balance_loss_mlp": 1.0177424, "epoch": 0.6005711709003457, "flos": 17711577699840.0, "grad_norm": 1.8724557476127222, "language_loss": 0.66228807, "learning_rate": 1.4526740198558346e-06, "loss": 0.68324733, "num_input_tokens_seen": 215155225, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.37304688, "step": 9989, "time_per_iteration": 2.3601319789886475 }, { "auxiliary_loss_clip": 0.0105641, "auxiliary_loss_mlp": 0.01036666, "balance_loss_clip": 1.01349139, "balance_loss_mlp": 1.01835728, "epoch": 0.6006312941530136, "flos": 18513497784960.0, "grad_norm": 1.4911187259181997, "language_loss": 0.81869841, "learning_rate": 1.452299436003257e-06, "loss": 0.83962923, "num_input_tokens_seen": 215174815, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.38085938, "step": 9990, "time_per_iteration": 2.389498233795166 }, { "auxiliary_loss_clip": 0.0105646, "auxiliary_loss_mlp": 0.01038929, "balance_loss_clip": 1.01334703, "balance_loss_mlp": 1.01792336, "epoch": 0.6006914174056817, "flos": 21389957256960.0, "grad_norm": 2.0059310199922757, "language_loss": 0.84070396, "learning_rate": 1.4519248729195403e-06, "loss": 0.86165786, "num_input_tokens_seen": 215192045, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38476562, "step": 9991, "time_per_iteration": 2.404491662979126 }, { "auxiliary_loss_clip": 0.01055017, "auxiliary_loss_mlp": 0.01034383, "balance_loss_clip": 1.01061249, "balance_loss_mlp": 1.01803172, "epoch": 0.6007515406583496, "flos": 12749686496640.0, "grad_norm": 2.692204334681764, "language_loss": 0.84414041, "learning_rate": 1.4515503306188878e-06, "loss": 0.86503434, "num_input_tokens_seen": 215209885, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 9992, "time_per_iteration": 2.3645944595336914 }, { "auxiliary_loss_clip": 0.01055299, "auxiliary_loss_mlp": 0.01044516, "balance_loss_clip": 1.01969624, "balance_loss_mlp": 1.01739645, "epoch": 0.6008116639110176, "flos": 19205930246400.0, "grad_norm": 2.429295269533105, "language_loss": 0.67316431, "learning_rate": 1.4511758091155008e-06, "loss": 0.69416249, "num_input_tokens_seen": 215228150, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37890625, "step": 9993, "time_per_iteration": 2.402932643890381 }, { "auxiliary_loss_clip": 0.01053501, "auxiliary_loss_mlp": 0.01038786, "balance_loss_clip": 1.01395476, "balance_loss_mlp": 1.0168674, "epoch": 0.6008717871636855, "flos": 17054407578240.0, "grad_norm": 2.489186767266464, "language_loss": 0.82356226, "learning_rate": 1.4508013084235826e-06, "loss": 0.84448516, "num_input_tokens_seen": 215243755, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36523438, "step": 9994, "time_per_iteration": 2.361377239227295 }, { "auxiliary_loss_clip": 0.01052578, "auxiliary_loss_mlp": 0.01032341, "balance_loss_clip": 1.01183712, "balance_loss_mlp": 1.01771355, "epoch": 0.6009319104163535, "flos": 20297769194880.0, "grad_norm": 1.9537563075947522, "language_loss": 0.73049939, "learning_rate": 1.4504268285573337e-06, "loss": 0.75134861, "num_input_tokens_seen": 215262130, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.34960938, "step": 9995, "time_per_iteration": 3.8425729274749756 }, { "auxiliary_loss_clip": 0.01055342, "auxiliary_loss_mlp": 0.01042997, "balance_loss_clip": 1.01603198, "balance_loss_mlp": 1.01613283, "epoch": 0.6009920336690215, "flos": 21835658073600.0, "grad_norm": 1.6212383282764546, "language_loss": 0.82098114, "learning_rate": 1.4500523695309546e-06, "loss": 0.84196448, "num_input_tokens_seen": 215281785, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.39257812, "step": 9996, "time_per_iteration": 2.43393611907959 }, { "auxiliary_loss_clip": 0.01056322, "auxiliary_loss_mlp": 0.01045815, "balance_loss_clip": 1.02138865, "balance_loss_mlp": 1.0188911, "epoch": 0.6010521569216895, "flos": 22595158990080.0, "grad_norm": 1.7207174660075102, "language_loss": 0.79705715, "learning_rate": 1.4496779313586447e-06, "loss": 0.81807852, "num_input_tokens_seen": 215297550, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.375, "step": 9997, "time_per_iteration": 2.364619493484497 }, { "auxiliary_loss_clip": 0.0105659, "auxiliary_loss_mlp": 0.01043088, "balance_loss_clip": 1.01710069, "balance_loss_mlp": 1.01768064, "epoch": 0.6011122801743575, "flos": 19170702817920.0, "grad_norm": 2.2689863991073267, "language_loss": 0.73958039, "learning_rate": 1.4493035140546028e-06, "loss": 0.76057714, "num_input_tokens_seen": 215316360, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.390625, "step": 9998, "time_per_iteration": 2.3612189292907715 }, { "auxiliary_loss_clip": 0.01053494, "auxiliary_loss_mlp": 0.01040395, "balance_loss_clip": 1.01728046, "balance_loss_mlp": 1.01641142, "epoch": 0.6011724034270254, "flos": 25008844124160.0, "grad_norm": 1.6077152778076338, "language_loss": 0.73018277, "learning_rate": 1.448929117633027e-06, "loss": 0.75112164, "num_input_tokens_seen": 215336405, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.37109375, "step": 9999, "time_per_iteration": 2.386514902114868 }, { "auxiliary_loss_clip": 0.01058043, "auxiliary_loss_mlp": 0.01046285, "balance_loss_clip": 1.01994014, "balance_loss_mlp": 1.01736701, "epoch": 0.6012325266796934, "flos": 21796625306880.0, "grad_norm": 1.5769567983087593, "language_loss": 0.79357123, "learning_rate": 1.4485547421081142e-06, "loss": 0.81461453, "num_input_tokens_seen": 215356590, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40820312, "step": 10000, "time_per_iteration": 2.4028587341308594 }, { "auxiliary_loss_clip": 0.01059081, "auxiliary_loss_mlp": 0.01042817, "balance_loss_clip": 1.01587546, "balance_loss_mlp": 1.01847351, "epoch": 0.6012926499323613, "flos": 19571994518400.0, "grad_norm": 1.9518899461351125, "language_loss": 0.7891736, "learning_rate": 1.4481803874940608e-06, "loss": 0.81019258, "num_input_tokens_seen": 215374295, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40625, "step": 10001, "time_per_iteration": 2.3408384323120117 }, { "auxiliary_loss_clip": 0.01056929, "auxiliary_loss_mlp": 0.010418, "balance_loss_clip": 1.0135591, "balance_loss_mlp": 1.0167706, "epoch": 0.6013527731850293, "flos": 34859343853440.0, "grad_norm": 2.106335718015344, "language_loss": 0.59159625, "learning_rate": 1.4478060538050624e-06, "loss": 0.61258352, "num_input_tokens_seen": 215394535, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.40234375, "step": 10002, "time_per_iteration": 2.500988245010376 }, { "auxiliary_loss_clip": 0.01059886, "auxiliary_loss_mlp": 0.01050315, "balance_loss_clip": 1.02023816, "balance_loss_mlp": 1.01917601, "epoch": 0.6014128964376972, "flos": 23290908030720.0, "grad_norm": 1.5730375891371469, "language_loss": 0.79155594, "learning_rate": 1.447431741055314e-06, "loss": 0.81265795, "num_input_tokens_seen": 215414355, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.40625, "step": 10003, "time_per_iteration": 2.3700995445251465 }, { "auxiliary_loss_clip": 0.01057572, "auxiliary_loss_mlp": 0.01042489, "balance_loss_clip": 1.01513052, "balance_loss_mlp": 1.01725578, "epoch": 0.6014730196903653, "flos": 24819929424000.0, "grad_norm": 2.104655564803409, "language_loss": 0.78388047, "learning_rate": 1.4470574492590091e-06, "loss": 0.8048811, "num_input_tokens_seen": 215428280, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40234375, "step": 10004, "time_per_iteration": 2.395785331726074 }, { "auxiliary_loss_clip": 0.01055433, "auxiliary_loss_mlp": 0.01041014, "balance_loss_clip": 1.01545525, "balance_loss_mlp": 1.01712239, "epoch": 0.6015331429430332, "flos": 23111244840960.0, "grad_norm": 1.5539206305330344, "language_loss": 0.73296744, "learning_rate": 1.4466831784303408e-06, "loss": 0.75393188, "num_input_tokens_seen": 215448970, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.3828125, "step": 10005, "time_per_iteration": 2.400291681289673 }, { "auxiliary_loss_clip": 0.01055305, "auxiliary_loss_mlp": 0.01037304, "balance_loss_clip": 1.01392722, "balance_loss_mlp": 1.01806271, "epoch": 0.6015932661957012, "flos": 19200553896960.0, "grad_norm": 3.55078194571101, "language_loss": 0.75541055, "learning_rate": 1.4463089285835026e-06, "loss": 0.77633667, "num_input_tokens_seen": 215465260, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.37304688, "step": 10006, "time_per_iteration": 2.3416922092437744 }, { "auxiliary_loss_clip": 0.0105439, "auxiliary_loss_mlp": 0.01040009, "balance_loss_clip": 1.01381862, "balance_loss_mlp": 1.01624405, "epoch": 0.6016533894483691, "flos": 18112659932160.0, "grad_norm": 1.7889122695374988, "language_loss": 0.75690806, "learning_rate": 1.445934699732685e-06, "loss": 0.77785206, "num_input_tokens_seen": 215482725, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3828125, "step": 10007, "time_per_iteration": 2.428571939468384 }, { "auxiliary_loss_clip": 0.01055588, "auxiliary_loss_mlp": 0.01037417, "balance_loss_clip": 1.01238275, "balance_loss_mlp": 1.01716971, "epoch": 0.6017135127010371, "flos": 16215968344320.0, "grad_norm": 1.8043965835336002, "language_loss": 0.7120136, "learning_rate": 1.4455604918920785e-06, "loss": 0.73294365, "num_input_tokens_seen": 215500420, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38476562, "step": 10008, "time_per_iteration": 2.369271993637085 }, { "auxiliary_loss_clip": 0.010542, "auxiliary_loss_mlp": 0.01034625, "balance_loss_clip": 1.01087904, "balance_loss_mlp": 1.01692271, "epoch": 0.6017736359537051, "flos": 23443024291200.0, "grad_norm": 2.3145706592441133, "language_loss": 0.77223259, "learning_rate": 1.4451863050758748e-06, "loss": 0.79312086, "num_input_tokens_seen": 215522260, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 10009, "time_per_iteration": 2.4240803718566895 }, { "auxiliary_loss_clip": 0.0105679, "auxiliary_loss_mlp": 0.01037407, "balance_loss_clip": 1.01342261, "balance_loss_mlp": 1.01682019, "epoch": 0.601833759206373, "flos": 23512920566400.0, "grad_norm": 2.2627455316789464, "language_loss": 0.75834471, "learning_rate": 1.4448121392982608e-06, "loss": 0.77928662, "num_input_tokens_seen": 215541715, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3984375, "step": 10010, "time_per_iteration": 2.3826427459716797 }, { "auxiliary_loss_clip": 0.01009479, "auxiliary_loss_mlp": 0.01006877, "balance_loss_clip": 1.00438547, "balance_loss_mlp": 1.00219572, "epoch": 0.6018938824590411, "flos": 63987972387840.0, "grad_norm": 0.8173790918715339, "language_loss": 0.55142069, "learning_rate": 1.4444379945734268e-06, "loss": 0.57158434, "num_input_tokens_seen": 215603020, "router_z_loss_clip": 0.02490234, "router_z_loss_mlp": 0.07275391, "step": 10011, "time_per_iteration": 3.0754191875457764 }, { "auxiliary_loss_clip": 0.01057946, "auxiliary_loss_mlp": 0.01044657, "balance_loss_clip": 1.0193367, "balance_loss_mlp": 1.01919627, "epoch": 0.601954005711709, "flos": 34638623038080.0, "grad_norm": 1.4194221558653177, "language_loss": 0.63043129, "learning_rate": 1.44406387091556e-06, "loss": 0.65145737, "num_input_tokens_seen": 215625115, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38671875, "step": 10012, "time_per_iteration": 2.505100965499878 }, { "auxiliary_loss_clip": 0.01055718, "auxiliary_loss_mlp": 0.01033743, "balance_loss_clip": 1.01031852, "balance_loss_mlp": 1.01814413, "epoch": 0.602014128964377, "flos": 19426057568640.0, "grad_norm": 1.7833400399957942, "language_loss": 0.75456846, "learning_rate": 1.4436897683388462e-06, "loss": 0.77546304, "num_input_tokens_seen": 215643730, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.375, "step": 10013, "time_per_iteration": 2.3759562969207764 }, { "auxiliary_loss_clip": 0.01052294, "auxiliary_loss_mlp": 0.01033745, "balance_loss_clip": 1.01197708, "balance_loss_mlp": 1.01705289, "epoch": 0.6020742522170449, "flos": 28328141681280.0, "grad_norm": 1.7244470837625046, "language_loss": 0.82183379, "learning_rate": 1.4433156868574732e-06, "loss": 0.84269416, "num_input_tokens_seen": 215664425, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.35351562, "step": 10014, "time_per_iteration": 2.4222571849823 }, { "auxiliary_loss_clip": 0.01052041, "auxiliary_loss_mlp": 0.01032326, "balance_loss_clip": 1.00931859, "balance_loss_mlp": 1.0168612, "epoch": 0.6021343754697129, "flos": 22745948618880.0, "grad_norm": 1.3590615099265648, "language_loss": 0.73751742, "learning_rate": 1.442941626485624e-06, "loss": 0.7583611, "num_input_tokens_seen": 215684280, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.3515625, "step": 10015, "time_per_iteration": 2.413330554962158 }, { "auxiliary_loss_clip": 0.01009296, "auxiliary_loss_mlp": 0.01004067, "balance_loss_clip": 1.00173008, "balance_loss_mlp": 1.00185907, "epoch": 0.6021944987223808, "flos": 65749027743360.0, "grad_norm": 0.8290510624287949, "language_loss": 0.54894686, "learning_rate": 1.4425675872374848e-06, "loss": 0.56908047, "num_input_tokens_seen": 215739780, "router_z_loss_clip": 0.02331543, "router_z_loss_mlp": 0.07421875, "step": 10016, "time_per_iteration": 2.9392285346984863 }, { "auxiliary_loss_clip": 0.01055958, "auxiliary_loss_mlp": 0.01036318, "balance_loss_clip": 1.01205862, "balance_loss_mlp": 1.01884198, "epoch": 0.6022546219750489, "flos": 16104316216320.0, "grad_norm": 1.5236522114454711, "language_loss": 0.84080267, "learning_rate": 1.4421935691272381e-06, "loss": 0.86172545, "num_input_tokens_seen": 215757885, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37109375, "step": 10017, "time_per_iteration": 2.3530538082122803 }, { "auxiliary_loss_clip": 0.01055195, "auxiliary_loss_mlp": 0.01037903, "balance_loss_clip": 1.01438272, "balance_loss_mlp": 1.01849008, "epoch": 0.6023147452277168, "flos": 25511593835520.0, "grad_norm": 2.4719039476460534, "language_loss": 0.84156585, "learning_rate": 1.4418195721690677e-06, "loss": 0.86249685, "num_input_tokens_seen": 215776415, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.3671875, "step": 10018, "time_per_iteration": 2.411412239074707 }, { "auxiliary_loss_clip": 0.01058921, "auxiliary_loss_mlp": 0.01045368, "balance_loss_clip": 1.01952338, "balance_loss_mlp": 1.01913595, "epoch": 0.6023748684803848, "flos": 22635029629440.0, "grad_norm": 1.7112613379056687, "language_loss": 0.79485518, "learning_rate": 1.4414455963771549e-06, "loss": 0.81589806, "num_input_tokens_seen": 215794865, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3984375, "step": 10019, "time_per_iteration": 2.396043062210083 }, { "auxiliary_loss_clip": 0.0105294, "auxiliary_loss_mlp": 0.01038589, "balance_loss_clip": 1.01522386, "balance_loss_mlp": 1.01525283, "epoch": 0.6024349917330527, "flos": 26209332823680.0, "grad_norm": 1.5118120171970497, "language_loss": 0.74529499, "learning_rate": 1.441071641765681e-06, "loss": 0.7662102, "num_input_tokens_seen": 215816840, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.37695312, "step": 10020, "time_per_iteration": 2.434704303741455 }, { "auxiliary_loss_clip": 0.01055262, "auxiliary_loss_mlp": 0.01044961, "balance_loss_clip": 1.01791286, "balance_loss_mlp": 1.01770687, "epoch": 0.6024951149857207, "flos": 21250688376960.0, "grad_norm": 1.449336209591949, "language_loss": 0.64630264, "learning_rate": 1.4406977083488264e-06, "loss": 0.66730487, "num_input_tokens_seen": 215836100, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.375, "step": 10021, "time_per_iteration": 3.710780620574951 }, { "auxiliary_loss_clip": 0.01054676, "auxiliary_loss_mlp": 0.01036367, "balance_loss_clip": 1.01173806, "balance_loss_mlp": 1.01748908, "epoch": 0.6025552382383887, "flos": 26942229417600.0, "grad_norm": 5.805897605448316, "language_loss": 0.8182494, "learning_rate": 1.4403237961407704e-06, "loss": 0.83915985, "num_input_tokens_seen": 215858480, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37109375, "step": 10022, "time_per_iteration": 2.431185722351074 }, { "auxiliary_loss_clip": 0.01056872, "auxiliary_loss_mlp": 0.01035616, "balance_loss_clip": 1.01113081, "balance_loss_mlp": 1.0181613, "epoch": 0.6026153614910567, "flos": 31683085603200.0, "grad_norm": 1.685334479549561, "language_loss": 0.67582685, "learning_rate": 1.439949905155693e-06, "loss": 0.69675171, "num_input_tokens_seen": 215879950, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.38671875, "step": 10023, "time_per_iteration": 2.4425570964813232 }, { "auxiliary_loss_clip": 0.01057958, "auxiliary_loss_mlp": 0.01042096, "balance_loss_clip": 1.01790857, "balance_loss_mlp": 1.01904917, "epoch": 0.6026754847437247, "flos": 29311505435520.0, "grad_norm": 1.933775113647562, "language_loss": 0.75696617, "learning_rate": 1.4395760354077707e-06, "loss": 0.77796674, "num_input_tokens_seen": 215899830, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.38867188, "step": 10024, "time_per_iteration": 2.4413654804229736 }, { "auxiliary_loss_clip": 0.01054933, "auxiliary_loss_mlp": 0.01037363, "balance_loss_clip": 1.01341343, "balance_loss_mlp": 1.01769423, "epoch": 0.6027356079963926, "flos": 23585644661760.0, "grad_norm": 1.7873045696077303, "language_loss": 0.73554862, "learning_rate": 1.4392021869111815e-06, "loss": 0.75647157, "num_input_tokens_seen": 215920440, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37109375, "step": 10025, "time_per_iteration": 2.4041390419006348 }, { "auxiliary_loss_clip": 0.01059276, "auxiliary_loss_mlp": 0.01037079, "balance_loss_clip": 1.01067436, "balance_loss_mlp": 1.0190767, "epoch": 0.6027957312490606, "flos": 20812702970880.0, "grad_norm": 2.2684067959744336, "language_loss": 0.69361383, "learning_rate": 1.4388283596801016e-06, "loss": 0.71457738, "num_input_tokens_seen": 215940535, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40234375, "step": 10026, "time_per_iteration": 2.4444937705993652 }, { "auxiliary_loss_clip": 0.01053028, "auxiliary_loss_mlp": 0.01037876, "balance_loss_clip": 1.01538134, "balance_loss_mlp": 1.01715994, "epoch": 0.6028558545017285, "flos": 19934812033920.0, "grad_norm": 1.7470013012371683, "language_loss": 0.81298667, "learning_rate": 1.4384545537287061e-06, "loss": 0.83389568, "num_input_tokens_seen": 215958045, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.359375, "step": 10027, "time_per_iteration": 5.252922058105469 }, { "auxiliary_loss_clip": 0.01056721, "auxiliary_loss_mlp": 0.01045863, "balance_loss_clip": 1.02140164, "balance_loss_mlp": 1.01780713, "epoch": 0.6029159777543965, "flos": 22819720055040.0, "grad_norm": 2.0225026042525127, "language_loss": 0.72393072, "learning_rate": 1.438080769071171e-06, "loss": 0.74495661, "num_input_tokens_seen": 215977330, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.38867188, "step": 10028, "time_per_iteration": 2.384355068206787 }, { "auxiliary_loss_clip": 0.01057726, "auxiliary_loss_mlp": 0.01046766, "balance_loss_clip": 1.02064705, "balance_loss_mlp": 1.01820076, "epoch": 0.6029761010070644, "flos": 23586098509440.0, "grad_norm": 1.719159671660472, "language_loss": 0.84940588, "learning_rate": 1.437707005721669e-06, "loss": 0.87045085, "num_input_tokens_seen": 215997865, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.39648438, "step": 10029, "time_per_iteration": 2.402247667312622 }, { "auxiliary_loss_clip": 0.0105408, "auxiliary_loss_mlp": 0.0103772, "balance_loss_clip": 1.01441467, "balance_loss_mlp": 1.01681924, "epoch": 0.6030362242597325, "flos": 13661582964480.0, "grad_norm": 1.8934965916828035, "language_loss": 0.81761378, "learning_rate": 1.437333263694373e-06, "loss": 0.83853173, "num_input_tokens_seen": 216016230, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37304688, "step": 10030, "time_per_iteration": 2.3609673976898193 }, { "auxiliary_loss_clip": 0.01057984, "auxiliary_loss_mlp": 0.01038847, "balance_loss_clip": 1.01332474, "balance_loss_mlp": 1.01878834, "epoch": 0.6030963475124004, "flos": 24421814657280.0, "grad_norm": 1.5272563858611599, "language_loss": 0.72435725, "learning_rate": 1.4369595430034572e-06, "loss": 0.74532551, "num_input_tokens_seen": 216035785, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.390625, "step": 10031, "time_per_iteration": 2.4335687160491943 }, { "auxiliary_loss_clip": 0.01057108, "auxiliary_loss_mlp": 0.01043215, "balance_loss_clip": 1.01532018, "balance_loss_mlp": 1.01725149, "epoch": 0.6031564707650684, "flos": 29642726304000.0, "grad_norm": 1.5445109829831585, "language_loss": 0.74033219, "learning_rate": 1.4365858436630912e-06, "loss": 0.76133537, "num_input_tokens_seen": 216059555, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.3984375, "step": 10032, "time_per_iteration": 2.4263806343078613 }, { "auxiliary_loss_clip": 0.01057151, "auxiliary_loss_mlp": 0.01042515, "balance_loss_clip": 1.01644433, "balance_loss_mlp": 1.01807332, "epoch": 0.6032165940177363, "flos": 16617818626560.0, "grad_norm": 2.06502912268565, "language_loss": 0.69918638, "learning_rate": 1.4362121656874465e-06, "loss": 0.72018301, "num_input_tokens_seen": 216077235, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.390625, "step": 10033, "time_per_iteration": 2.3496081829071045 }, { "auxiliary_loss_clip": 0.01055948, "auxiliary_loss_mlp": 0.01039347, "balance_loss_clip": 1.01403928, "balance_loss_mlp": 1.01826239, "epoch": 0.6032767172704043, "flos": 17487365748480.0, "grad_norm": 1.8489189181336203, "language_loss": 0.76930106, "learning_rate": 1.4358385090906934e-06, "loss": 0.790254, "num_input_tokens_seen": 216094985, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37695312, "step": 10034, "time_per_iteration": 2.3508565425872803 }, { "auxiliary_loss_clip": 0.01057262, "auxiliary_loss_mlp": 0.01041623, "balance_loss_clip": 1.0153966, "balance_loss_mlp": 1.01828384, "epoch": 0.6033368405230723, "flos": 26831764275840.0, "grad_norm": 1.7045397186395337, "language_loss": 0.75598866, "learning_rate": 1.4354648738870004e-06, "loss": 0.77697754, "num_input_tokens_seen": 216115905, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.390625, "step": 10035, "time_per_iteration": 3.836616277694702 }, { "auxiliary_loss_clip": 0.0105434, "auxiliary_loss_mlp": 0.01035393, "balance_loss_clip": 1.01211119, "balance_loss_mlp": 1.0175097, "epoch": 0.6033969637757403, "flos": 16908959387520.0, "grad_norm": 1.565425281351307, "language_loss": 0.86986268, "learning_rate": 1.435091260090536e-06, "loss": 0.89075994, "num_input_tokens_seen": 216132420, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36914062, "step": 10036, "time_per_iteration": 2.328681230545044 }, { "auxiliary_loss_clip": 0.01055956, "auxiliary_loss_mlp": 0.01041783, "balance_loss_clip": 1.01527119, "balance_loss_mlp": 1.01645041, "epoch": 0.6034570870284083, "flos": 22928963299200.0, "grad_norm": 1.8789243297714495, "language_loss": 0.71435505, "learning_rate": 1.4347176677154676e-06, "loss": 0.73533249, "num_input_tokens_seen": 216149800, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39648438, "step": 10037, "time_per_iteration": 2.4216697216033936 }, { "auxiliary_loss_clip": 0.01054903, "auxiliary_loss_mlp": 0.01041094, "balance_loss_clip": 1.01774096, "balance_loss_mlp": 1.01776338, "epoch": 0.6035172102810762, "flos": 23365238048640.0, "grad_norm": 1.9236525893463865, "language_loss": 0.86398172, "learning_rate": 1.4343440967759616e-06, "loss": 0.88494164, "num_input_tokens_seen": 216168200, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.37109375, "step": 10038, "time_per_iteration": 2.381261110305786 }, { "auxiliary_loss_clip": 0.01056943, "auxiliary_loss_mlp": 0.0104128, "balance_loss_clip": 1.01596022, "balance_loss_mlp": 1.01764607, "epoch": 0.6035773335337442, "flos": 20886020559360.0, "grad_norm": 1.9464623804513126, "language_loss": 0.7773329, "learning_rate": 1.4339705472861846e-06, "loss": 0.79831517, "num_input_tokens_seen": 216187105, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.39257812, "step": 10039, "time_per_iteration": 2.3926968574523926 }, { "auxiliary_loss_clip": 0.01054725, "auxiliary_loss_mlp": 0.01037392, "balance_loss_clip": 1.01285887, "balance_loss_mlp": 1.01697338, "epoch": 0.6036374567864121, "flos": 24935142510720.0, "grad_norm": 1.7579240797709554, "language_loss": 0.72440612, "learning_rate": 1.433597019260301e-06, "loss": 0.74532729, "num_input_tokens_seen": 216205440, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.37695312, "step": 10040, "time_per_iteration": 2.3875391483306885 }, { "auxiliary_loss_clip": 0.01057963, "auxiliary_loss_mlp": 0.01042884, "balance_loss_clip": 1.01448798, "balance_loss_mlp": 1.01755738, "epoch": 0.6036975800390801, "flos": 23147170496640.0, "grad_norm": 1.940968858127802, "language_loss": 0.79477692, "learning_rate": 1.433223512712475e-06, "loss": 0.81578535, "num_input_tokens_seen": 216223130, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40429688, "step": 10041, "time_per_iteration": 2.3988823890686035 }, { "auxiliary_loss_clip": 0.01055852, "auxiliary_loss_mlp": 0.01039357, "balance_loss_clip": 1.0143708, "balance_loss_mlp": 1.01788008, "epoch": 0.603757703291748, "flos": 18659748936960.0, "grad_norm": 1.7909329400534297, "language_loss": 0.7650072, "learning_rate": 1.4328500276568704e-06, "loss": 0.78595924, "num_input_tokens_seen": 216240260, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37890625, "step": 10042, "time_per_iteration": 2.338707447052002 }, { "auxiliary_loss_clip": 0.01054471, "auxiliary_loss_mlp": 0.01039789, "balance_loss_clip": 1.01587558, "balance_loss_mlp": 1.01672673, "epoch": 0.6038178265444161, "flos": 19681586876160.0, "grad_norm": 2.667931501560209, "language_loss": 0.85688448, "learning_rate": 1.4324765641076498e-06, "loss": 0.87782705, "num_input_tokens_seen": 216258510, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37695312, "step": 10043, "time_per_iteration": 2.410823106765747 }, { "auxiliary_loss_clip": 0.01057471, "auxiliary_loss_mlp": 0.01040412, "balance_loss_clip": 1.01502049, "balance_loss_mlp": 1.01749182, "epoch": 0.603877949797084, "flos": 22637124311040.0, "grad_norm": 2.1318722219908826, "language_loss": 0.70641315, "learning_rate": 1.432103122078974e-06, "loss": 0.72739196, "num_input_tokens_seen": 216277550, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.40039062, "step": 10044, "time_per_iteration": 2.356569290161133 }, { "auxiliary_loss_clip": 0.01057583, "auxiliary_loss_mlp": 0.01047286, "balance_loss_clip": 1.01891375, "balance_loss_mlp": 1.01788473, "epoch": 0.603938073049752, "flos": 25446689884800.0, "grad_norm": 1.6320284887579466, "language_loss": 0.78429461, "learning_rate": 1.4317297015850057e-06, "loss": 0.80534333, "num_input_tokens_seen": 216296690, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.39648438, "step": 10045, "time_per_iteration": 2.416003942489624 }, { "auxiliary_loss_clip": 0.0105459, "auxiliary_loss_mlp": 0.01039365, "balance_loss_clip": 1.01613116, "balance_loss_mlp": 1.01711988, "epoch": 0.6039981963024199, "flos": 22339210746240.0, "grad_norm": 1.775896432647438, "language_loss": 0.77880311, "learning_rate": 1.4313563026399036e-06, "loss": 0.7997427, "num_input_tokens_seen": 216316110, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.375, "step": 10046, "time_per_iteration": 2.4374520778656006 }, { "auxiliary_loss_clip": 0.01054852, "auxiliary_loss_mlp": 0.010375, "balance_loss_clip": 1.01388431, "balance_loss_mlp": 1.01696754, "epoch": 0.6040583195550879, "flos": 20702133095040.0, "grad_norm": 1.7282788967952203, "language_loss": 0.87887394, "learning_rate": 1.430982925257827e-06, "loss": 0.89979744, "num_input_tokens_seen": 216333855, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37890625, "step": 10047, "time_per_iteration": 2.3855438232421875 }, { "auxiliary_loss_clip": 0.01054551, "auxiliary_loss_mlp": 0.01040132, "balance_loss_clip": 1.01749444, "balance_loss_mlp": 1.01750207, "epoch": 0.604118442807756, "flos": 27161867980800.0, "grad_norm": 1.5712018331529578, "language_loss": 0.7649647, "learning_rate": 1.4306095694529358e-06, "loss": 0.78591156, "num_input_tokens_seen": 216354890, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.37109375, "step": 10048, "time_per_iteration": 2.4322800636291504 }, { "auxiliary_loss_clip": 0.01060273, "auxiliary_loss_mlp": 0.01056746, "balance_loss_clip": 1.02441597, "balance_loss_mlp": 1.01855052, "epoch": 0.6041785660604239, "flos": 30880257822720.0, "grad_norm": 2.3635233156213507, "language_loss": 0.67641109, "learning_rate": 1.430236235239386e-06, "loss": 0.69758129, "num_input_tokens_seen": 216376055, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 0.41796875, "step": 10049, "time_per_iteration": 2.466089963912964 }, { "auxiliary_loss_clip": 0.01055097, "auxiliary_loss_mlp": 0.01041334, "balance_loss_clip": 1.01677692, "balance_loss_mlp": 1.01747668, "epoch": 0.6042386893130919, "flos": 19937186006400.0, "grad_norm": 1.5201463575166956, "language_loss": 0.677145, "learning_rate": 1.429862922631336e-06, "loss": 0.69810927, "num_input_tokens_seen": 216396295, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.375, "step": 10050, "time_per_iteration": 2.3757522106170654 }, { "auxiliary_loss_clip": 0.01057579, "auxiliary_loss_mlp": 0.01041316, "balance_loss_clip": 1.01554346, "balance_loss_mlp": 1.01858163, "epoch": 0.6042988125657598, "flos": 32414550831360.0, "grad_norm": 1.9245496196008511, "language_loss": 0.71052706, "learning_rate": 1.4294896316429408e-06, "loss": 0.731516, "num_input_tokens_seen": 216416605, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.390625, "step": 10051, "time_per_iteration": 2.4590303897857666 }, { "auxiliary_loss_clip": 0.01053915, "auxiliary_loss_mlp": 0.01036413, "balance_loss_clip": 1.01177287, "balance_loss_mlp": 1.015944, "epoch": 0.6043589358184278, "flos": 17419843445760.0, "grad_norm": 1.9028008264227345, "language_loss": 0.66022241, "learning_rate": 1.4291163622883553e-06, "loss": 0.68112564, "num_input_tokens_seen": 216435130, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37890625, "step": 10052, "time_per_iteration": 2.38143253326416 }, { "auxiliary_loss_clip": 0.01055313, "auxiliary_loss_mlp": 0.01044948, "balance_loss_clip": 1.01691055, "balance_loss_mlp": 1.01698875, "epoch": 0.6044190590710957, "flos": 27671599964160.0, "grad_norm": 1.5570425749807124, "language_loss": 0.69829649, "learning_rate": 1.4287431145817358e-06, "loss": 0.71929908, "num_input_tokens_seen": 216455640, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.3828125, "step": 10053, "time_per_iteration": 2.4447693824768066 }, { "auxiliary_loss_clip": 0.01010278, "auxiliary_loss_mlp": 0.01004769, "balance_loss_clip": 1.00246859, "balance_loss_mlp": 1.00270414, "epoch": 0.6044791823237637, "flos": 65313241752960.0, "grad_norm": 0.7230929446614806, "language_loss": 0.60504025, "learning_rate": 1.4283698885372336e-06, "loss": 0.62519073, "num_input_tokens_seen": 216518130, "router_z_loss_clip": 0.02294922, "router_z_loss_mlp": 0.07568359, "step": 10054, "time_per_iteration": 3.1382691860198975 }, { "auxiliary_loss_clip": 0.01054525, "auxiliary_loss_mlp": 0.01038479, "balance_loss_clip": 1.01219368, "balance_loss_mlp": 1.01694822, "epoch": 0.6045393055764317, "flos": 24491396730240.0, "grad_norm": 1.5748647512038563, "language_loss": 0.86454016, "learning_rate": 1.4279966841690027e-06, "loss": 0.88547021, "num_input_tokens_seen": 216536845, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.375, "step": 10055, "time_per_iteration": 2.3962295055389404 }, { "auxiliary_loss_clip": 0.01060795, "auxiliary_loss_mlp": 0.01045945, "balance_loss_clip": 1.01950479, "balance_loss_mlp": 1.02026033, "epoch": 0.6045994288290997, "flos": 19053569606400.0, "grad_norm": 2.4912222679583254, "language_loss": 0.7495544, "learning_rate": 1.4276235014911952e-06, "loss": 0.77062184, "num_input_tokens_seen": 216551860, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40625, "step": 10056, "time_per_iteration": 2.3341498374938965 }, { "auxiliary_loss_clip": 0.01055512, "auxiliary_loss_mlp": 0.01039223, "balance_loss_clip": 1.01389146, "balance_loss_mlp": 1.01713741, "epoch": 0.6046595520817676, "flos": 26575536741120.0, "grad_norm": 1.8932602138371952, "language_loss": 0.81159711, "learning_rate": 1.4272503405179616e-06, "loss": 0.83254445, "num_input_tokens_seen": 216574775, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38476562, "step": 10057, "time_per_iteration": 2.444180488586426 }, { "auxiliary_loss_clip": 0.01054886, "auxiliary_loss_mlp": 0.01035303, "balance_loss_clip": 1.01000714, "balance_loss_mlp": 1.01658833, "epoch": 0.6047196753344356, "flos": 13581632217600.0, "grad_norm": 2.5832678790579986, "language_loss": 0.76563412, "learning_rate": 1.4268772012634527e-06, "loss": 0.78653604, "num_input_tokens_seen": 216590100, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3828125, "step": 10058, "time_per_iteration": 2.372636556625366 }, { "auxiliary_loss_clip": 0.01054687, "auxiliary_loss_mlp": 0.01034909, "balance_loss_clip": 1.00920796, "balance_loss_mlp": 1.01693988, "epoch": 0.6047797985871035, "flos": 25519274334720.0, "grad_norm": 1.8355110180155307, "language_loss": 0.7154628, "learning_rate": 1.4265040837418176e-06, "loss": 0.73635876, "num_input_tokens_seen": 216610145, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37695312, "step": 10059, "time_per_iteration": 2.4261057376861572 }, { "auxiliary_loss_clip": 0.01055896, "auxiliary_loss_mlp": 0.01040279, "balance_loss_clip": 1.01307499, "balance_loss_mlp": 1.01699531, "epoch": 0.6048399218397715, "flos": 20519153326080.0, "grad_norm": 1.5278120639064248, "language_loss": 0.77328753, "learning_rate": 1.4261309879672054e-06, "loss": 0.79424936, "num_input_tokens_seen": 216630625, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.390625, "step": 10060, "time_per_iteration": 2.417599678039551 }, { "auxiliary_loss_clip": 0.01055454, "auxiliary_loss_mlp": 0.01042322, "balance_loss_clip": 1.017169, "balance_loss_mlp": 1.01740122, "epoch": 0.6049000450924396, "flos": 20407850311680.0, "grad_norm": 2.848556310749835, "language_loss": 0.74938083, "learning_rate": 1.4257579139537628e-06, "loss": 0.77035856, "num_input_tokens_seen": 216649255, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37890625, "step": 10061, "time_per_iteration": 3.7193291187286377 }, { "auxiliary_loss_clip": 0.01056309, "auxiliary_loss_mlp": 0.01044526, "balance_loss_clip": 1.01757324, "balance_loss_mlp": 1.01675189, "epoch": 0.6049601683451075, "flos": 20740293077760.0, "grad_norm": 1.701312392823698, "language_loss": 0.67951524, "learning_rate": 1.425384861715639e-06, "loss": 0.70052356, "num_input_tokens_seen": 216668100, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.39648438, "step": 10062, "time_per_iteration": 2.381218910217285 }, { "auxiliary_loss_clip": 0.01054421, "auxiliary_loss_mlp": 0.01044493, "balance_loss_clip": 1.01917291, "balance_loss_mlp": 1.01605022, "epoch": 0.6050202915977755, "flos": 20082110526720.0, "grad_norm": 2.0681578088813826, "language_loss": 0.73166335, "learning_rate": 1.425011831266978e-06, "loss": 0.75265253, "num_input_tokens_seen": 216686125, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3828125, "step": 10063, "time_per_iteration": 2.3881235122680664 }, { "auxiliary_loss_clip": 0.01054826, "auxiliary_loss_mlp": 0.01036505, "balance_loss_clip": 1.01281857, "balance_loss_mlp": 1.01689732, "epoch": 0.6050804148504434, "flos": 15959915366400.0, "grad_norm": 1.580038064418922, "language_loss": 0.85104436, "learning_rate": 1.424638822621926e-06, "loss": 0.87195766, "num_input_tokens_seen": 216704265, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37890625, "step": 10064, "time_per_iteration": 2.3497180938720703 }, { "auxiliary_loss_clip": 0.0105528, "auxiliary_loss_mlp": 0.0103826, "balance_loss_clip": 1.01369107, "balance_loss_mlp": 1.01752234, "epoch": 0.6051405381031114, "flos": 17455699278720.0, "grad_norm": 2.622611413800532, "language_loss": 0.81740904, "learning_rate": 1.4242658357946278e-06, "loss": 0.83834445, "num_input_tokens_seen": 216721765, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37890625, "step": 10065, "time_per_iteration": 2.3603100776672363 }, { "auxiliary_loss_clip": 0.0105929, "auxiliary_loss_mlp": 0.01042654, "balance_loss_clip": 1.01387715, "balance_loss_mlp": 1.01826715, "epoch": 0.6052006613557793, "flos": 11399350775040.0, "grad_norm": 2.0601715135281453, "language_loss": 0.7943424, "learning_rate": 1.423892870799226e-06, "loss": 0.8153618, "num_input_tokens_seen": 216738295, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.41015625, "step": 10066, "time_per_iteration": 3.765894889831543 }, { "auxiliary_loss_clip": 0.01056523, "auxiliary_loss_mlp": 0.01040267, "balance_loss_clip": 1.01431513, "balance_loss_mlp": 1.01682794, "epoch": 0.6052607846084473, "flos": 24749928414720.0, "grad_norm": 1.506716426556112, "language_loss": 0.74434, "learning_rate": 1.4235199276498655e-06, "loss": 0.7653079, "num_input_tokens_seen": 216759875, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39648438, "step": 10067, "time_per_iteration": 3.7590901851654053 }, { "auxiliary_loss_clip": 0.01054371, "auxiliary_loss_mlp": 0.01040362, "balance_loss_clip": 1.01476824, "balance_loss_mlp": 1.01654458, "epoch": 0.6053209078611153, "flos": 20740083609600.0, "grad_norm": 1.4982910934938019, "language_loss": 0.69326591, "learning_rate": 1.4231470063606863e-06, "loss": 0.71421325, "num_input_tokens_seen": 216780705, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37890625, "step": 10068, "time_per_iteration": 2.4085068702697754 }, { "auxiliary_loss_clip": 0.01055416, "auxiliary_loss_mlp": 0.01038844, "balance_loss_clip": 1.01274872, "balance_loss_mlp": 1.01605046, "epoch": 0.6053810311137833, "flos": 18952146506880.0, "grad_norm": 1.9950215390282955, "language_loss": 0.88109636, "learning_rate": 1.4227741069458303e-06, "loss": 0.90203893, "num_input_tokens_seen": 216797625, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39453125, "step": 10069, "time_per_iteration": 2.3394198417663574 }, { "auxiliary_loss_clip": 0.01054371, "auxiliary_loss_mlp": 0.01036496, "balance_loss_clip": 1.01282048, "balance_loss_mlp": 1.01621854, "epoch": 0.6054411543664512, "flos": 23949998277120.0, "grad_norm": 1.4749230058685392, "language_loss": 0.84257948, "learning_rate": 1.4224012294194387e-06, "loss": 0.86348808, "num_input_tokens_seen": 216817610, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.3828125, "step": 10070, "time_per_iteration": 2.4191813468933105 }, { "auxiliary_loss_clip": 0.01056672, "auxiliary_loss_mlp": 0.01041095, "balance_loss_clip": 1.01418936, "balance_loss_mlp": 1.01650548, "epoch": 0.6055012776191192, "flos": 20592959673600.0, "grad_norm": 1.601699119502509, "language_loss": 0.86926472, "learning_rate": 1.4220283737956496e-06, "loss": 0.89024234, "num_input_tokens_seen": 216836835, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40234375, "step": 10071, "time_per_iteration": 2.3607540130615234 }, { "auxiliary_loss_clip": 0.01058041, "auxiliary_loss_mlp": 0.01045623, "balance_loss_clip": 1.01769185, "balance_loss_mlp": 1.01771057, "epoch": 0.6055614008717871, "flos": 30296928960000.0, "grad_norm": 1.894987789549741, "language_loss": 0.78396165, "learning_rate": 1.421655540088603e-06, "loss": 0.80499828, "num_input_tokens_seen": 216856760, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40234375, "step": 10072, "time_per_iteration": 2.4543874263763428 }, { "auxiliary_loss_clip": 0.01055083, "auxiliary_loss_mlp": 0.01041588, "balance_loss_clip": 1.01456332, "balance_loss_mlp": 1.01606882, "epoch": 0.6056215241244551, "flos": 27123812732160.0, "grad_norm": 1.907186574017839, "language_loss": 0.75374997, "learning_rate": 1.4212827283124367e-06, "loss": 0.77471662, "num_input_tokens_seen": 216878795, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.390625, "step": 10073, "time_per_iteration": 2.4208884239196777 }, { "auxiliary_loss_clip": 0.01009151, "auxiliary_loss_mlp": 0.01020763, "balance_loss_clip": 1.01850975, "balance_loss_mlp": 1.00195098, "epoch": 0.6056816473771232, "flos": 56004699058560.0, "grad_norm": 0.7797854709302074, "language_loss": 0.55255222, "learning_rate": 1.4209099384812863e-06, "loss": 0.57285142, "num_input_tokens_seen": 216937800, "router_z_loss_clip": 0.02258301, "router_z_loss_mlp": 0.07226562, "step": 10074, "time_per_iteration": 4.456384658813477 }, { "auxiliary_loss_clip": 0.01055443, "auxiliary_loss_mlp": 0.01034806, "balance_loss_clip": 1.01051092, "balance_loss_mlp": 1.01842856, "epoch": 0.6057417706297911, "flos": 23548392374400.0, "grad_norm": 1.7159582571694112, "language_loss": 0.82389599, "learning_rate": 1.4205371706092894e-06, "loss": 0.84479851, "num_input_tokens_seen": 216955280, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.37109375, "step": 10075, "time_per_iteration": 2.3855326175689697 }, { "auxiliary_loss_clip": 0.01055121, "auxiliary_loss_mlp": 0.01034934, "balance_loss_clip": 1.00937581, "balance_loss_mlp": 1.01667547, "epoch": 0.6058018938824591, "flos": 27743102161920.0, "grad_norm": 1.7011406953373038, "language_loss": 0.80101311, "learning_rate": 1.4201644247105813e-06, "loss": 0.82191372, "num_input_tokens_seen": 216976950, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38476562, "step": 10076, "time_per_iteration": 2.4487080574035645 }, { "auxiliary_loss_clip": 0.01056373, "auxiliary_loss_mlp": 0.0104111, "balance_loss_clip": 1.0152061, "balance_loss_mlp": 1.01709342, "epoch": 0.605862017135127, "flos": 22782293210880.0, "grad_norm": 1.8341809904738935, "language_loss": 0.74470919, "learning_rate": 1.4197917007992964e-06, "loss": 0.76568401, "num_input_tokens_seen": 216996945, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39257812, "step": 10077, "time_per_iteration": 2.3789799213409424 }, { "auxiliary_loss_clip": 0.0105576, "auxiliary_loss_mlp": 0.01042829, "balance_loss_clip": 1.01895118, "balance_loss_mlp": 1.0172596, "epoch": 0.605922140387795, "flos": 21213959760000.0, "grad_norm": 1.6255654281743943, "language_loss": 0.56921166, "learning_rate": 1.4194189988895682e-06, "loss": 0.59019756, "num_input_tokens_seen": 217016580, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.38476562, "step": 10078, "time_per_iteration": 2.388460874557495 }, { "auxiliary_loss_clip": 0.01058821, "auxiliary_loss_mlp": 0.0103994, "balance_loss_clip": 1.01378584, "balance_loss_mlp": 1.01912868, "epoch": 0.6059822636404629, "flos": 27267236064000.0, "grad_norm": 1.7395304027303617, "language_loss": 0.71811163, "learning_rate": 1.4190463189955297e-06, "loss": 0.73909926, "num_input_tokens_seen": 217037300, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.39648438, "step": 10079, "time_per_iteration": 2.5044214725494385 }, { "auxiliary_loss_clip": 0.01054597, "auxiliary_loss_mlp": 0.01045694, "balance_loss_clip": 1.02132797, "balance_loss_mlp": 1.01703238, "epoch": 0.606042386893131, "flos": 20630281783680.0, "grad_norm": 1.992765764897413, "language_loss": 0.63633847, "learning_rate": 1.4186736611313131e-06, "loss": 0.6573413, "num_input_tokens_seen": 217055805, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.375, "step": 10080, "time_per_iteration": 2.3873131275177 }, { "auxiliary_loss_clip": 0.01056763, "auxiliary_loss_mlp": 0.01038115, "balance_loss_clip": 1.01221049, "balance_loss_mlp": 1.0177567, "epoch": 0.6061025101457989, "flos": 23001198635520.0, "grad_norm": 2.0270619902429567, "language_loss": 0.72244698, "learning_rate": 1.4183010253110492e-06, "loss": 0.74339569, "num_input_tokens_seen": 217074175, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.390625, "step": 10081, "time_per_iteration": 2.377899169921875 }, { "auxiliary_loss_clip": 0.01054777, "auxiliary_loss_mlp": 0.01035034, "balance_loss_clip": 1.01174045, "balance_loss_mlp": 1.0177865, "epoch": 0.6061626333984669, "flos": 29897627207040.0, "grad_norm": 1.6710469538541883, "language_loss": 0.70533222, "learning_rate": 1.4179284115488691e-06, "loss": 0.72623026, "num_input_tokens_seen": 217095695, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.37109375, "step": 10082, "time_per_iteration": 2.4608869552612305 }, { "auxiliary_loss_clip": 0.01056959, "auxiliary_loss_mlp": 0.0104062, "balance_loss_clip": 1.01772022, "balance_loss_mlp": 1.01929641, "epoch": 0.6062227566511348, "flos": 25008041162880.0, "grad_norm": 1.6822961843600346, "language_loss": 0.67198169, "learning_rate": 1.4175558198589015e-06, "loss": 0.69295752, "num_input_tokens_seen": 217116260, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.375, "step": 10083, "time_per_iteration": 2.396024227142334 }, { "auxiliary_loss_clip": 0.01056149, "auxiliary_loss_mlp": 0.01040752, "balance_loss_clip": 1.01645708, "balance_loss_mlp": 1.01827884, "epoch": 0.6062828799038028, "flos": 19462925831040.0, "grad_norm": 2.5143542788375886, "language_loss": 0.75408173, "learning_rate": 1.4171832502552764e-06, "loss": 0.77505076, "num_input_tokens_seen": 217134465, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.37890625, "step": 10084, "time_per_iteration": 2.3692259788513184 }, { "auxiliary_loss_clip": 0.01056187, "auxiliary_loss_mlp": 0.01043974, "balance_loss_clip": 1.01689017, "balance_loss_mlp": 1.01702046, "epoch": 0.6063430031564707, "flos": 13588719223680.0, "grad_norm": 2.55211160879116, "language_loss": 0.73665428, "learning_rate": 1.4168107027521204e-06, "loss": 0.75765586, "num_input_tokens_seen": 217149920, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.39257812, "step": 10085, "time_per_iteration": 2.3311548233032227 }, { "auxiliary_loss_clip": 0.01054212, "auxiliary_loss_mlp": 0.01035572, "balance_loss_clip": 1.01109791, "balance_loss_mlp": 1.01668596, "epoch": 0.6064031264091387, "flos": 23254458704640.0, "grad_norm": 2.087533667759812, "language_loss": 0.77719009, "learning_rate": 1.4164381773635605e-06, "loss": 0.79808784, "num_input_tokens_seen": 217168165, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.375, "step": 10086, "time_per_iteration": 2.4106812477111816 }, { "auxiliary_loss_clip": 0.01054169, "auxiliary_loss_mlp": 0.01034118, "balance_loss_clip": 1.01056278, "balance_loss_mlp": 1.01735806, "epoch": 0.6064632496618068, "flos": 22457286564480.0, "grad_norm": 1.3770110018893293, "language_loss": 0.73893034, "learning_rate": 1.4160656741037246e-06, "loss": 0.75981319, "num_input_tokens_seen": 217190070, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.3671875, "step": 10087, "time_per_iteration": 2.3975656032562256 }, { "auxiliary_loss_clip": 0.01053203, "auxiliary_loss_mlp": 0.01036746, "balance_loss_clip": 1.01409674, "balance_loss_mlp": 1.01719618, "epoch": 0.6065233729144747, "flos": 25117493875200.0, "grad_norm": 1.7153978872844409, "language_loss": 0.84079677, "learning_rate": 1.4156931929867355e-06, "loss": 0.86169624, "num_input_tokens_seen": 217209370, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.359375, "step": 10088, "time_per_iteration": 2.4119889736175537 }, { "auxiliary_loss_clip": 0.01054636, "auxiliary_loss_mlp": 0.01036185, "balance_loss_clip": 1.01168704, "balance_loss_mlp": 1.01660979, "epoch": 0.6065834961671427, "flos": 23476226860800.0, "grad_norm": 2.0189233974031464, "language_loss": 0.72458029, "learning_rate": 1.4153207340267201e-06, "loss": 0.74548852, "num_input_tokens_seen": 217226990, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.38085938, "step": 10089, "time_per_iteration": 2.3739147186279297 }, { "auxiliary_loss_clip": 0.01056275, "auxiliary_loss_mlp": 0.01039434, "balance_loss_clip": 1.01608086, "balance_loss_mlp": 1.01762629, "epoch": 0.6066436194198106, "flos": 17018447011200.0, "grad_norm": 2.7693877014255857, "language_loss": 0.84235466, "learning_rate": 1.4149482972378009e-06, "loss": 0.86331177, "num_input_tokens_seen": 217244585, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.38671875, "step": 10090, "time_per_iteration": 2.345466375350952 }, { "auxiliary_loss_clip": 0.01059503, "auxiliary_loss_mlp": 0.01047961, "balance_loss_clip": 1.01873064, "balance_loss_mlp": 1.01839459, "epoch": 0.6067037426724786, "flos": 18513777075840.0, "grad_norm": 2.424577423063455, "language_loss": 0.76737815, "learning_rate": 1.4145758826341e-06, "loss": 0.78845274, "num_input_tokens_seen": 217263435, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.41015625, "step": 10091, "time_per_iteration": 2.355814218521118 }, { "auxiliary_loss_clip": 0.01052879, "auxiliary_loss_mlp": 0.01040049, "balance_loss_clip": 1.01486039, "balance_loss_mlp": 1.01593173, "epoch": 0.6067638659251465, "flos": 22344901297920.0, "grad_norm": 1.6518140662506577, "language_loss": 0.80776978, "learning_rate": 1.4142034902297415e-06, "loss": 0.82869905, "num_input_tokens_seen": 217283725, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36914062, "step": 10092, "time_per_iteration": 2.442143440246582 }, { "auxiliary_loss_clip": 0.01055538, "auxiliary_loss_mlp": 0.01040932, "balance_loss_clip": 1.01517093, "balance_loss_mlp": 1.01653039, "epoch": 0.6068239891778145, "flos": 12450411388800.0, "grad_norm": 1.9774632579834142, "language_loss": 0.77380615, "learning_rate": 1.4138311200388444e-06, "loss": 0.7947709, "num_input_tokens_seen": 217301120, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.390625, "step": 10093, "time_per_iteration": 2.351202964782715 }, { "auxiliary_loss_clip": 0.01053842, "auxiliary_loss_mlp": 0.01039493, "balance_loss_clip": 1.01557958, "balance_loss_mlp": 1.01815844, "epoch": 0.6068841124304825, "flos": 23184736986240.0, "grad_norm": 2.1601069835477875, "language_loss": 0.88294041, "learning_rate": 1.4134587720755304e-06, "loss": 0.9038738, "num_input_tokens_seen": 217319585, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.35742188, "step": 10094, "time_per_iteration": 2.3961546421051025 }, { "auxiliary_loss_clip": 0.01055206, "auxiliary_loss_mlp": 0.01036528, "balance_loss_clip": 1.01291299, "balance_loss_mlp": 1.01780248, "epoch": 0.6069442356831505, "flos": 18586920107520.0, "grad_norm": 2.0603463091263143, "language_loss": 0.73000485, "learning_rate": 1.413086446353919e-06, "loss": 0.7509222, "num_input_tokens_seen": 217338880, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.375, "step": 10095, "time_per_iteration": 2.40669584274292 }, { "auxiliary_loss_clip": 0.0105607, "auxiliary_loss_mlp": 0.01038609, "balance_loss_clip": 1.01554251, "balance_loss_mlp": 1.01700127, "epoch": 0.6070043589358184, "flos": 20959268325120.0, "grad_norm": 1.61878679684336, "language_loss": 0.78026825, "learning_rate": 1.4127141428881273e-06, "loss": 0.80121505, "num_input_tokens_seen": 217357480, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.390625, "step": 10096, "time_per_iteration": 2.3878166675567627 }, { "auxiliary_loss_clip": 0.01055844, "auxiliary_loss_mlp": 0.01037885, "balance_loss_clip": 1.01480627, "balance_loss_mlp": 1.0173347, "epoch": 0.6070644821884864, "flos": 11691643610880.0, "grad_norm": 1.6758034508983441, "language_loss": 0.8097719, "learning_rate": 1.4123418616922749e-06, "loss": 0.83070916, "num_input_tokens_seen": 217374575, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.38476562, "step": 10097, "time_per_iteration": 2.4160642623901367 }, { "auxiliary_loss_clip": 0.01052273, "auxiliary_loss_mlp": 0.01037382, "balance_loss_clip": 1.01338482, "balance_loss_mlp": 1.01633072, "epoch": 0.6071246054411543, "flos": 19309762229760.0, "grad_norm": 2.005187811436659, "language_loss": 0.68526053, "learning_rate": 1.411969602780478e-06, "loss": 0.70615709, "num_input_tokens_seen": 217392950, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.359375, "step": 10098, "time_per_iteration": 2.362675428390503 }, { "auxiliary_loss_clip": 0.01055171, "auxiliary_loss_mlp": 0.01040537, "balance_loss_clip": 1.01679087, "balance_loss_mlp": 1.01741374, "epoch": 0.6071847286938223, "flos": 17748061937280.0, "grad_norm": 1.856235238382897, "language_loss": 0.81502193, "learning_rate": 1.4115973661668523e-06, "loss": 0.83597904, "num_input_tokens_seen": 217412145, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.37695312, "step": 10099, "time_per_iteration": 2.3933286666870117 }, { "auxiliary_loss_clip": 0.01057484, "auxiliary_loss_mlp": 0.0103959, "balance_loss_clip": 1.01451981, "balance_loss_mlp": 1.01740289, "epoch": 0.6072448519464904, "flos": 22636426083840.0, "grad_norm": 2.288897385660846, "language_loss": 0.7204082, "learning_rate": 1.4112251518655133e-06, "loss": 0.74137902, "num_input_tokens_seen": 217432080, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.40234375, "step": 10100, "time_per_iteration": 2.3931686878204346 }, { "auxiliary_loss_clip": 0.01058658, "auxiliary_loss_mlp": 0.01047478, "balance_loss_clip": 1.02084661, "balance_loss_mlp": 1.01983559, "epoch": 0.6073049751991583, "flos": 19536278330880.0, "grad_norm": 1.6201068969416446, "language_loss": 0.71823186, "learning_rate": 1.4108529598905764e-06, "loss": 0.73929322, "num_input_tokens_seen": 217450945, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.38867188, "step": 10101, "time_per_iteration": 3.707383632659912 }, { "auxiliary_loss_clip": 0.01053618, "auxiliary_loss_mlp": 0.01040952, "balance_loss_clip": 1.01719391, "balance_loss_mlp": 1.01651239, "epoch": 0.6073650984518263, "flos": 28292949164160.0, "grad_norm": 1.9703308230510506, "language_loss": 0.6996755, "learning_rate": 1.410480790256154e-06, "loss": 0.72062123, "num_input_tokens_seen": 217473105, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 10102, "time_per_iteration": 2.462928056716919 }, { "auxiliary_loss_clip": 0.01056563, "auxiliary_loss_mlp": 0.0104715, "balance_loss_clip": 1.02241421, "balance_loss_mlp": 1.01835012, "epoch": 0.6074252217044942, "flos": 25663291159680.0, "grad_norm": 1.8427691617554462, "language_loss": 0.7433483, "learning_rate": 1.4101086429763589e-06, "loss": 0.76438546, "num_input_tokens_seen": 217491780, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.3828125, "step": 10103, "time_per_iteration": 2.383058547973633 }, { "auxiliary_loss_clip": 0.01058114, "auxiliary_loss_mlp": 0.01040014, "balance_loss_clip": 1.01413381, "balance_loss_mlp": 1.01850438, "epoch": 0.6074853449571622, "flos": 22855994824320.0, "grad_norm": 2.5120703533469575, "language_loss": 0.77557468, "learning_rate": 1.4097365180653032e-06, "loss": 0.79655594, "num_input_tokens_seen": 217510605, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39648438, "step": 10104, "time_per_iteration": 2.4433412551879883 }, { "auxiliary_loss_clip": 0.01008282, "auxiliary_loss_mlp": 0.01003094, "balance_loss_clip": 1.00057912, "balance_loss_mlp": 1.00146842, "epoch": 0.6075454682098301, "flos": 67108126216320.0, "grad_norm": 0.7110531802209868, "language_loss": 0.56128699, "learning_rate": 1.4093644155370977e-06, "loss": 0.58140075, "num_input_tokens_seen": 217574815, "router_z_loss_clip": 0.02514648, "router_z_loss_mlp": 0.06835938, "step": 10105, "time_per_iteration": 3.0163962841033936 }, { "auxiliary_loss_clip": 0.01008878, "auxiliary_loss_mlp": 0.01004866, "balance_loss_clip": 1.00235057, "balance_loss_mlp": 1.00187433, "epoch": 0.6076055914624982, "flos": 70708963910400.0, "grad_norm": 0.7600719995737946, "language_loss": 0.56845045, "learning_rate": 1.4089923354058533e-06, "loss": 0.58858788, "num_input_tokens_seen": 217632375, "router_z_loss_clip": 0.02514648, "router_z_loss_mlp": 0.0703125, "step": 10106, "time_per_iteration": 5.7897655963897705 }, { "auxiliary_loss_clip": 0.01052679, "auxiliary_loss_mlp": 0.0103981, "balance_loss_clip": 1.01601529, "balance_loss_mlp": 1.01635015, "epoch": 0.6076657147151661, "flos": 28363334198400.0, "grad_norm": 1.613839047428806, "language_loss": 0.69394398, "learning_rate": 1.4086202776856784e-06, "loss": 0.7148689, "num_input_tokens_seen": 217653055, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36328125, "step": 10107, "time_per_iteration": 2.453108072280884 }, { "auxiliary_loss_clip": 0.01057271, "auxiliary_loss_mlp": 0.01038146, "balance_loss_clip": 1.01448345, "balance_loss_mlp": 1.01807451, "epoch": 0.6077258379678341, "flos": 15048856771200.0, "grad_norm": 2.116723110189438, "language_loss": 0.81562561, "learning_rate": 1.4082482423906815e-06, "loss": 0.8365798, "num_input_tokens_seen": 217671520, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.390625, "step": 10108, "time_per_iteration": 2.3345375061035156 }, { "auxiliary_loss_clip": 0.01058664, "auxiliary_loss_mlp": 0.01042803, "balance_loss_clip": 1.01581407, "balance_loss_mlp": 1.01834416, "epoch": 0.607785961220502, "flos": 36165968686080.0, "grad_norm": 1.8961280078830969, "language_loss": 0.72577816, "learning_rate": 1.4078762295349714e-06, "loss": 0.74679279, "num_input_tokens_seen": 217691880, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40234375, "step": 10109, "time_per_iteration": 2.4980127811431885 }, { "auxiliary_loss_clip": 0.01051448, "auxiliary_loss_mlp": 0.01036491, "balance_loss_clip": 1.01329267, "balance_loss_mlp": 1.01575446, "epoch": 0.60784608447317, "flos": 22523272767360.0, "grad_norm": 1.6241161522472802, "language_loss": 0.80653965, "learning_rate": 1.407504239132653e-06, "loss": 0.82741904, "num_input_tokens_seen": 217710530, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35546875, "step": 10110, "time_per_iteration": 2.3622288703918457 }, { "auxiliary_loss_clip": 0.01055939, "auxiliary_loss_mlp": 0.01037201, "balance_loss_clip": 1.01239347, "balance_loss_mlp": 1.01682174, "epoch": 0.6079062077258379, "flos": 23840056805760.0, "grad_norm": 2.3711157006717696, "language_loss": 0.71512127, "learning_rate": 1.4071322711978338e-06, "loss": 0.73605275, "num_input_tokens_seen": 217728650, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.390625, "step": 10111, "time_per_iteration": 2.4092485904693604 }, { "auxiliary_loss_clip": 0.01057262, "auxiliary_loss_mlp": 0.01041627, "balance_loss_clip": 1.01552057, "balance_loss_mlp": 1.0183593, "epoch": 0.6079663309785059, "flos": 23365936275840.0, "grad_norm": 2.667199785939236, "language_loss": 0.66453522, "learning_rate": 1.4067603257446186e-06, "loss": 0.68552411, "num_input_tokens_seen": 217747135, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38867188, "step": 10112, "time_per_iteration": 2.3841166496276855 }, { "auxiliary_loss_clip": 0.01008851, "auxiliary_loss_mlp": 0.01002767, "balance_loss_clip": 1.00032282, "balance_loss_mlp": 1.00211012, "epoch": 0.6080264542311739, "flos": 71379400348800.0, "grad_norm": 0.6281910934367857, "language_loss": 0.49674919, "learning_rate": 1.4063884027871105e-06, "loss": 0.51686537, "num_input_tokens_seen": 217811860, "router_z_loss_clip": 0.02441406, "router_z_loss_mlp": 0.06738281, "step": 10113, "time_per_iteration": 4.4834418296813965 }, { "auxiliary_loss_clip": 0.01008987, "auxiliary_loss_mlp": 0.01004038, "balance_loss_clip": 1.00158238, "balance_loss_mlp": 1.00224495, "epoch": 0.6080865774838419, "flos": 66526508010240.0, "grad_norm": 0.8433108303729301, "language_loss": 0.57013965, "learning_rate": 1.4060165023394147e-06, "loss": 0.59026992, "num_input_tokens_seen": 217866510, "router_z_loss_clip": 0.02453613, "router_z_loss_mlp": 0.06738281, "step": 10114, "time_per_iteration": 2.9610743522644043 }, { "auxiliary_loss_clip": 0.01057116, "auxiliary_loss_mlp": 0.01036779, "balance_loss_clip": 1.00939691, "balance_loss_mlp": 1.01800776, "epoch": 0.6081467007365099, "flos": 19206942675840.0, "grad_norm": 1.8804016506162635, "language_loss": 0.72220814, "learning_rate": 1.4056446244156317e-06, "loss": 0.74314713, "num_input_tokens_seen": 217885650, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.390625, "step": 10115, "time_per_iteration": 2.3535733222961426 }, { "auxiliary_loss_clip": 0.01055967, "auxiliary_loss_mlp": 0.01040007, "balance_loss_clip": 1.01467538, "balance_loss_mlp": 1.01777315, "epoch": 0.6082068239891778, "flos": 24166669374720.0, "grad_norm": 1.7578055187957127, "language_loss": 0.73360687, "learning_rate": 1.4052727690298642e-06, "loss": 0.75456661, "num_input_tokens_seen": 217905300, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38085938, "step": 10116, "time_per_iteration": 2.437307357788086 }, { "auxiliary_loss_clip": 0.0105756, "auxiliary_loss_mlp": 0.01046106, "balance_loss_clip": 1.01834249, "balance_loss_mlp": 1.0175817, "epoch": 0.6082669472418458, "flos": 37411844019840.0, "grad_norm": 4.247967243604657, "language_loss": 0.54834235, "learning_rate": 1.4049009361962138e-06, "loss": 0.56937903, "num_input_tokens_seen": 217927845, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40039062, "step": 10117, "time_per_iteration": 2.5046637058258057 }, { "auxiliary_loss_clip": 0.01056063, "auxiliary_loss_mlp": 0.01040273, "balance_loss_clip": 1.01443982, "balance_loss_mlp": 1.01726079, "epoch": 0.6083270704945137, "flos": 15084642781440.0, "grad_norm": 1.7947523458000971, "language_loss": 0.71800816, "learning_rate": 1.4045291259287786e-06, "loss": 0.73897153, "num_input_tokens_seen": 217946145, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38671875, "step": 10118, "time_per_iteration": 2.384993076324463 }, { "auxiliary_loss_clip": 0.01054958, "auxiliary_loss_mlp": 0.0103875, "balance_loss_clip": 1.01465774, "balance_loss_mlp": 1.01675773, "epoch": 0.6083871937471818, "flos": 20667394425600.0, "grad_norm": 1.5490050955863737, "language_loss": 0.75703186, "learning_rate": 1.4041573382416588e-06, "loss": 0.77796888, "num_input_tokens_seen": 217965190, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3828125, "step": 10119, "time_per_iteration": 2.3810994625091553 }, { "auxiliary_loss_clip": 0.01055809, "auxiliary_loss_mlp": 0.01037924, "balance_loss_clip": 1.01303279, "balance_loss_mlp": 1.01757467, "epoch": 0.6084473169998497, "flos": 21505833659520.0, "grad_norm": 1.7074518740409932, "language_loss": 0.68140876, "learning_rate": 1.4037855731489525e-06, "loss": 0.70234609, "num_input_tokens_seen": 217983625, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.3828125, "step": 10120, "time_per_iteration": 2.419356346130371 }, { "auxiliary_loss_clip": 0.01057815, "auxiliary_loss_mlp": 0.01047515, "balance_loss_clip": 1.01953697, "balance_loss_mlp": 1.01755023, "epoch": 0.6085074402525177, "flos": 26868842006400.0, "grad_norm": 1.7306925516631244, "language_loss": 0.75380182, "learning_rate": 1.4034138306647571e-06, "loss": 0.77485514, "num_input_tokens_seen": 218006005, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40234375, "step": 10121, "time_per_iteration": 2.4245526790618896 }, { "auxiliary_loss_clip": 0.01056456, "auxiliary_loss_mlp": 0.01037722, "balance_loss_clip": 1.01466691, "balance_loss_mlp": 1.01795244, "epoch": 0.6085675635051856, "flos": 10889060209920.0, "grad_norm": 2.5088655894724035, "language_loss": 0.81576145, "learning_rate": 1.4030421108031685e-06, "loss": 0.83670324, "num_input_tokens_seen": 218024195, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.38476562, "step": 10122, "time_per_iteration": 2.367236375808716 }, { "auxiliary_loss_clip": 0.01054861, "auxiliary_loss_mlp": 0.01044094, "balance_loss_clip": 1.01937008, "balance_loss_mlp": 1.01707411, "epoch": 0.6086276867578536, "flos": 34860705396480.0, "grad_norm": 1.6657428658776678, "language_loss": 0.57003474, "learning_rate": 1.402670413578284e-06, "loss": 0.59102428, "num_input_tokens_seen": 218047190, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.37890625, "step": 10123, "time_per_iteration": 2.487635850906372 }, { "auxiliary_loss_clip": 0.01056657, "auxiliary_loss_mlp": 0.01041974, "balance_loss_clip": 1.01782215, "balance_loss_mlp": 1.01813877, "epoch": 0.6086878100105215, "flos": 20046673630080.0, "grad_norm": 1.965663591587554, "language_loss": 0.75476539, "learning_rate": 1.4022987390041965e-06, "loss": 0.77575171, "num_input_tokens_seen": 218065945, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.38476562, "step": 10124, "time_per_iteration": 2.3803932666778564 }, { "auxiliary_loss_clip": 0.01056311, "auxiliary_loss_mlp": 0.01040838, "balance_loss_clip": 1.01544666, "balance_loss_mlp": 1.01782489, "epoch": 0.6087479332631895, "flos": 18331495534080.0, "grad_norm": 1.8947008136279415, "language_loss": 0.66711384, "learning_rate": 1.4019270870950006e-06, "loss": 0.68808532, "num_input_tokens_seen": 218085285, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38476562, "step": 10125, "time_per_iteration": 2.340538740158081 }, { "auxiliary_loss_clip": 0.0105453, "auxiliary_loss_mlp": 0.01035149, "balance_loss_clip": 1.01114011, "balance_loss_mlp": 1.01773667, "epoch": 0.6088080565158575, "flos": 24492409159680.0, "grad_norm": 1.7709671151275594, "language_loss": 0.77409565, "learning_rate": 1.40155545786479e-06, "loss": 0.79499245, "num_input_tokens_seen": 218104735, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 10126, "time_per_iteration": 2.424031972885132 }, { "auxiliary_loss_clip": 0.01057237, "auxiliary_loss_mlp": 0.01036604, "balance_loss_clip": 1.01025915, "balance_loss_mlp": 1.01737905, "epoch": 0.6088681797685255, "flos": 10268269591680.0, "grad_norm": 2.496299378804706, "language_loss": 0.74478281, "learning_rate": 1.4011838513276558e-06, "loss": 0.7657212, "num_input_tokens_seen": 218121855, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.3984375, "step": 10127, "time_per_iteration": 2.3721301555633545 }, { "auxiliary_loss_clip": 0.01056421, "auxiliary_loss_mlp": 0.01042539, "balance_loss_clip": 1.01602721, "balance_loss_mlp": 1.01731229, "epoch": 0.6089283030211935, "flos": 21972832272000.0, "grad_norm": 2.570220221718418, "language_loss": 0.73415744, "learning_rate": 1.400812267497691e-06, "loss": 0.75514698, "num_input_tokens_seen": 218137325, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.390625, "step": 10128, "time_per_iteration": 2.4035398960113525 }, { "auxiliary_loss_clip": 0.0105358, "auxiliary_loss_mlp": 0.01039233, "balance_loss_clip": 1.01524794, "balance_loss_mlp": 1.01639342, "epoch": 0.6089884262738614, "flos": 17784231972480.0, "grad_norm": 1.963888248489555, "language_loss": 0.74838346, "learning_rate": 1.4004407063889842e-06, "loss": 0.76931161, "num_input_tokens_seen": 218155530, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37109375, "step": 10129, "time_per_iteration": 2.3678672313690186 }, { "auxiliary_loss_clip": 0.01053522, "auxiliary_loss_mlp": 0.01038071, "balance_loss_clip": 1.01338315, "balance_loss_mlp": 1.01534474, "epoch": 0.6090485495265294, "flos": 36908745194880.0, "grad_norm": 1.5534316782780562, "language_loss": 0.66597307, "learning_rate": 1.400069168015626e-06, "loss": 0.68688899, "num_input_tokens_seen": 218182535, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38085938, "step": 10130, "time_per_iteration": 2.5567409992218018 }, { "auxiliary_loss_clip": 0.01052551, "auxiliary_loss_mlp": 0.01032877, "balance_loss_clip": 1.00985789, "balance_loss_mlp": 1.01585579, "epoch": 0.6091086727791973, "flos": 19898083416960.0, "grad_norm": 1.5718001089771045, "language_loss": 0.7835381, "learning_rate": 1.3996976523917054e-06, "loss": 0.80439246, "num_input_tokens_seen": 218201740, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.3671875, "step": 10131, "time_per_iteration": 2.3770458698272705 }, { "auxiliary_loss_clip": 0.0105548, "auxiliary_loss_mlp": 0.01042989, "balance_loss_clip": 1.01797938, "balance_loss_mlp": 1.01770759, "epoch": 0.6091687960318654, "flos": 22162549933440.0, "grad_norm": 1.8199195292763626, "language_loss": 0.78012693, "learning_rate": 1.3993261595313093e-06, "loss": 0.80111158, "num_input_tokens_seen": 218219800, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37695312, "step": 10132, "time_per_iteration": 2.364088296890259 }, { "auxiliary_loss_clip": 0.01052153, "auxiliary_loss_mlp": 0.01038408, "balance_loss_clip": 1.01568651, "balance_loss_mlp": 1.01659656, "epoch": 0.6092289192845333, "flos": 21464357097600.0, "grad_norm": 1.6218005773946338, "language_loss": 0.76167333, "learning_rate": 1.3989546894485261e-06, "loss": 0.78257895, "num_input_tokens_seen": 218237585, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35546875, "step": 10133, "time_per_iteration": 2.3851053714752197 }, { "auxiliary_loss_clip": 0.01053623, "auxiliary_loss_mlp": 0.01041701, "balance_loss_clip": 1.01599956, "balance_loss_mlp": 1.01611757, "epoch": 0.6092890425372013, "flos": 28693647371520.0, "grad_norm": 1.9082199846094894, "language_loss": 0.65023088, "learning_rate": 1.3985832421574414e-06, "loss": 0.67118418, "num_input_tokens_seen": 218258700, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.375, "step": 10134, "time_per_iteration": 2.4249978065490723 }, { "auxiliary_loss_clip": 0.01052435, "auxiliary_loss_mlp": 0.01033861, "balance_loss_clip": 1.01011419, "balance_loss_mlp": 1.01582146, "epoch": 0.6093491657898692, "flos": 20812144389120.0, "grad_norm": 1.835014484177221, "language_loss": 0.80065668, "learning_rate": 1.3982118176721397e-06, "loss": 0.82151961, "num_input_tokens_seen": 218275655, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36523438, "step": 10135, "time_per_iteration": 2.371340036392212 }, { "auxiliary_loss_clip": 0.01055371, "auxiliary_loss_mlp": 0.01039586, "balance_loss_clip": 1.01603043, "balance_loss_mlp": 1.01705718, "epoch": 0.6094092890425372, "flos": 25445817100800.0, "grad_norm": 1.7831916235547955, "language_loss": 0.72851038, "learning_rate": 1.3978404160067069e-06, "loss": 0.74945998, "num_input_tokens_seen": 218295720, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.3828125, "step": 10136, "time_per_iteration": 2.3884825706481934 }, { "auxiliary_loss_clip": 0.01055116, "auxiliary_loss_mlp": 0.0104056, "balance_loss_clip": 1.01458442, "balance_loss_mlp": 1.0169723, "epoch": 0.6094694122952051, "flos": 35619961933440.0, "grad_norm": 1.8277198989264358, "language_loss": 0.75970495, "learning_rate": 1.3974690371752253e-06, "loss": 0.78066164, "num_input_tokens_seen": 218316745, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3828125, "step": 10137, "time_per_iteration": 2.504307508468628 }, { "auxiliary_loss_clip": 0.01057126, "auxiliary_loss_mlp": 0.01042614, "balance_loss_clip": 1.01623273, "balance_loss_mlp": 1.01786017, "epoch": 0.6095295355478731, "flos": 24455959833600.0, "grad_norm": 1.6649886771836895, "language_loss": 0.80864334, "learning_rate": 1.3970976811917785e-06, "loss": 0.82964075, "num_input_tokens_seen": 218335385, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.39257812, "step": 10138, "time_per_iteration": 2.3972814083099365 }, { "auxiliary_loss_clip": 0.01052349, "auxiliary_loss_mlp": 0.01039876, "balance_loss_clip": 1.01531875, "balance_loss_mlp": 1.01709402, "epoch": 0.6095896588005411, "flos": 15632290368000.0, "grad_norm": 4.4079750506959225, "language_loss": 0.81825644, "learning_rate": 1.3967263480704481e-06, "loss": 0.83917868, "num_input_tokens_seen": 218353320, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.35351562, "step": 10139, "time_per_iteration": 2.354684352874756 }, { "auxiliary_loss_clip": 0.01057743, "auxiliary_loss_mlp": 0.01041667, "balance_loss_clip": 1.01466644, "balance_loss_mlp": 1.01807809, "epoch": 0.6096497820532091, "flos": 15549930737280.0, "grad_norm": 2.3605001165086374, "language_loss": 0.84766901, "learning_rate": 1.396355037825315e-06, "loss": 0.86866313, "num_input_tokens_seen": 218365620, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.39648438, "step": 10140, "time_per_iteration": 2.321728229522705 }, { "auxiliary_loss_clip": 0.01054915, "auxiliary_loss_mlp": 0.01043356, "balance_loss_clip": 1.01859593, "balance_loss_mlp": 1.01595902, "epoch": 0.6097099053058771, "flos": 24203397991680.0, "grad_norm": 2.8669061180079267, "language_loss": 0.76539171, "learning_rate": 1.3959837504704592e-06, "loss": 0.78637445, "num_input_tokens_seen": 218383785, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.390625, "step": 10141, "time_per_iteration": 3.65199613571167 }, { "auxiliary_loss_clip": 0.01054371, "auxiliary_loss_mlp": 0.01035045, "balance_loss_clip": 1.01069069, "balance_loss_mlp": 1.0160346, "epoch": 0.609770028558545, "flos": 19569306343680.0, "grad_norm": 1.8116825012556448, "language_loss": 0.7709986, "learning_rate": 1.3956124860199603e-06, "loss": 0.79189277, "num_input_tokens_seen": 218399055, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.3828125, "step": 10142, "time_per_iteration": 2.3339269161224365 }, { "auxiliary_loss_clip": 0.0105498, "auxiliary_loss_mlp": 0.01042448, "balance_loss_clip": 1.01644874, "balance_loss_mlp": 1.01692009, "epoch": 0.609830151811213, "flos": 23948113063680.0, "grad_norm": 1.6484057234219356, "language_loss": 0.78430855, "learning_rate": 1.3952412444878964e-06, "loss": 0.80528283, "num_input_tokens_seen": 218419120, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.38085938, "step": 10143, "time_per_iteration": 2.404829740524292 }, { "auxiliary_loss_clip": 0.01054236, "auxiliary_loss_mlp": 0.01039588, "balance_loss_clip": 1.01385069, "balance_loss_mlp": 1.01596713, "epoch": 0.6098902750638809, "flos": 16178820791040.0, "grad_norm": 1.8533256669931164, "language_loss": 0.76428032, "learning_rate": 1.3948700258883448e-06, "loss": 0.78521854, "num_input_tokens_seen": 218435290, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3828125, "step": 10144, "time_per_iteration": 2.3285562992095947 }, { "auxiliary_loss_clip": 0.01056201, "auxiliary_loss_mlp": 0.0104352, "balance_loss_clip": 1.01797342, "balance_loss_mlp": 1.01676166, "epoch": 0.609950398316549, "flos": 44524769132160.0, "grad_norm": 1.7557018421791746, "language_loss": 0.74702156, "learning_rate": 1.394498830235383e-06, "loss": 0.76801878, "num_input_tokens_seen": 218457880, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.39453125, "step": 10145, "time_per_iteration": 3.9736719131469727 }, { "auxiliary_loss_clip": 0.01053712, "auxiliary_loss_mlp": 0.0103954, "balance_loss_clip": 1.01542425, "balance_loss_mlp": 1.01723123, "epoch": 0.6100105215692169, "flos": 23220627730560.0, "grad_norm": 2.0027674695441697, "language_loss": 0.7036798, "learning_rate": 1.3941276575430862e-06, "loss": 0.7246123, "num_input_tokens_seen": 218475930, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36523438, "step": 10146, "time_per_iteration": 3.8053507804870605 }, { "auxiliary_loss_clip": 0.01052525, "auxiliary_loss_mlp": 0.01033696, "balance_loss_clip": 1.01123703, "balance_loss_mlp": 1.01660931, "epoch": 0.6100706448218849, "flos": 15011674306560.0, "grad_norm": 1.6418260738230064, "language_loss": 0.78046823, "learning_rate": 1.3937565078255289e-06, "loss": 0.80133045, "num_input_tokens_seen": 218493675, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.359375, "step": 10147, "time_per_iteration": 2.3386895656585693 }, { "auxiliary_loss_clip": 0.01054295, "auxiliary_loss_mlp": 0.01035747, "balance_loss_clip": 1.01220298, "balance_loss_mlp": 1.01617718, "epoch": 0.6101307680745528, "flos": 19639132796160.0, "grad_norm": 1.9619752613431187, "language_loss": 0.79608566, "learning_rate": 1.393385381096786e-06, "loss": 0.81698608, "num_input_tokens_seen": 218511780, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.38085938, "step": 10148, "time_per_iteration": 2.357222557067871 }, { "auxiliary_loss_clip": 0.01058404, "auxiliary_loss_mlp": 0.01044315, "balance_loss_clip": 1.0158596, "balance_loss_mlp": 1.01703727, "epoch": 0.6101908913272208, "flos": 29934251089920.0, "grad_norm": 2.0238669635046516, "language_loss": 0.551687, "learning_rate": 1.39301427737093e-06, "loss": 0.57271421, "num_input_tokens_seen": 218531850, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.4140625, "step": 10149, "time_per_iteration": 2.4173364639282227 }, { "auxiliary_loss_clip": 0.01053336, "auxiliary_loss_mlp": 0.01042939, "balance_loss_clip": 1.01784575, "balance_loss_mlp": 1.01710916, "epoch": 0.6102510145798887, "flos": 21797567913600.0, "grad_norm": 3.352150278897715, "language_loss": 0.8120327, "learning_rate": 1.3926431966620333e-06, "loss": 0.83299541, "num_input_tokens_seen": 218551245, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36328125, "step": 10150, "time_per_iteration": 2.3728349208831787 }, { "auxiliary_loss_clip": 0.01057943, "auxiliary_loss_mlp": 0.01048915, "balance_loss_clip": 1.02292728, "balance_loss_mlp": 1.0188365, "epoch": 0.6103111378325567, "flos": 20705030737920.0, "grad_norm": 1.6817292173346392, "language_loss": 0.69949913, "learning_rate": 1.3922721389841684e-06, "loss": 0.72056764, "num_input_tokens_seen": 218571365, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.390625, "step": 10151, "time_per_iteration": 2.376608371734619 }, { "auxiliary_loss_clip": 0.01054216, "auxiliary_loss_mlp": 0.01034158, "balance_loss_clip": 1.01117432, "balance_loss_mlp": 1.01674056, "epoch": 0.6103712610852247, "flos": 29380528926720.0, "grad_norm": 1.683685675848623, "language_loss": 0.72168148, "learning_rate": 1.3919011043514036e-06, "loss": 0.74256516, "num_input_tokens_seen": 218588315, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.375, "step": 10152, "time_per_iteration": 2.4395201206207275 }, { "auxiliary_loss_clip": 0.01056303, "auxiliary_loss_mlp": 0.01041522, "balance_loss_clip": 1.01585603, "balance_loss_mlp": 1.01718307, "epoch": 0.6104313843378927, "flos": 20812004743680.0, "grad_norm": 1.7474882844734831, "language_loss": 0.79200125, "learning_rate": 1.391530092777811e-06, "loss": 0.81297946, "num_input_tokens_seen": 218605940, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.390625, "step": 10153, "time_per_iteration": 3.7833759784698486 }, { "auxiliary_loss_clip": 0.01055352, "auxiliary_loss_mlp": 0.01039139, "balance_loss_clip": 1.01412892, "balance_loss_mlp": 1.01696491, "epoch": 0.6104915075905607, "flos": 26577247397760.0, "grad_norm": 1.7556249128454096, "language_loss": 0.80533963, "learning_rate": 1.3911591042774573e-06, "loss": 0.82628453, "num_input_tokens_seen": 218626100, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38476562, "step": 10154, "time_per_iteration": 2.3988234996795654 }, { "auxiliary_loss_clip": 0.01054412, "auxiliary_loss_mlp": 0.01037795, "balance_loss_clip": 1.0140599, "balance_loss_mlp": 1.01753998, "epoch": 0.6105516308432286, "flos": 23914631203200.0, "grad_norm": 1.561216030409758, "language_loss": 0.71023667, "learning_rate": 1.3907881388644116e-06, "loss": 0.73115873, "num_input_tokens_seen": 218645060, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36914062, "step": 10155, "time_per_iteration": 2.384814739227295 }, { "auxiliary_loss_clip": 0.01056681, "auxiliary_loss_mlp": 0.01048257, "balance_loss_clip": 1.02018285, "balance_loss_mlp": 1.0186193, "epoch": 0.6106117540958966, "flos": 31576006863360.0, "grad_norm": 1.6748906020840442, "language_loss": 0.72298396, "learning_rate": 1.3904171965527413e-06, "loss": 0.7440334, "num_input_tokens_seen": 218667690, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.38085938, "step": 10156, "time_per_iteration": 2.473940849304199 }, { "auxiliary_loss_clip": 0.01053198, "auxiliary_loss_mlp": 0.01045781, "balance_loss_clip": 1.02097368, "balance_loss_mlp": 1.01704359, "epoch": 0.6106718773485645, "flos": 19607187035520.0, "grad_norm": 6.09325101433792, "language_loss": 0.67993784, "learning_rate": 1.3900462773565114e-06, "loss": 0.70092762, "num_input_tokens_seen": 218687505, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36132812, "step": 10157, "time_per_iteration": 2.3599741458892822 }, { "auxiliary_loss_clip": 0.0105382, "auxiliary_loss_mlp": 0.01038015, "balance_loss_clip": 1.01311183, "balance_loss_mlp": 1.01578784, "epoch": 0.6107320006012326, "flos": 17123081955840.0, "grad_norm": 2.0104429449004515, "language_loss": 0.7380724, "learning_rate": 1.3896753812897877e-06, "loss": 0.75899076, "num_input_tokens_seen": 218705315, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38085938, "step": 10158, "time_per_iteration": 2.3594093322753906 }, { "auxiliary_loss_clip": 0.01056364, "auxiliary_loss_mlp": 0.01039556, "balance_loss_clip": 1.01416445, "balance_loss_mlp": 1.01749945, "epoch": 0.6107921238539005, "flos": 30147081937920.0, "grad_norm": 1.5542448216428797, "language_loss": 0.70227909, "learning_rate": 1.389304508366635e-06, "loss": 0.72323835, "num_input_tokens_seen": 218725735, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38867188, "step": 10159, "time_per_iteration": 2.432460308074951 }, { "auxiliary_loss_clip": 0.01055312, "auxiliary_loss_mlp": 0.0103515, "balance_loss_clip": 1.01002026, "balance_loss_mlp": 1.01678276, "epoch": 0.6108522471065685, "flos": 18439342323840.0, "grad_norm": 1.7240088646101073, "language_loss": 0.79647923, "learning_rate": 1.3889336586011167e-06, "loss": 0.81738383, "num_input_tokens_seen": 218743215, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38476562, "step": 10160, "time_per_iteration": 2.38443922996521 }, { "auxiliary_loss_clip": 0.01008971, "auxiliary_loss_mlp": 0.01009802, "balance_loss_clip": 1.00744152, "balance_loss_mlp": 1.00168073, "epoch": 0.6109123703592364, "flos": 64131814656000.0, "grad_norm": 0.8302795922540579, "language_loss": 0.61560953, "learning_rate": 1.388562832007295e-06, "loss": 0.63579726, "num_input_tokens_seen": 218806440, "router_z_loss_clip": 0.02355957, "router_z_loss_mlp": 0.07324219, "step": 10161, "time_per_iteration": 3.174032688140869 }, { "auxiliary_loss_clip": 0.01056635, "auxiliary_loss_mlp": 0.01042801, "balance_loss_clip": 1.01724243, "balance_loss_mlp": 1.01769304, "epoch": 0.6109724936119044, "flos": 20666800932480.0, "grad_norm": 1.5863965532564583, "language_loss": 0.77472609, "learning_rate": 1.3881920285992324e-06, "loss": 0.79572046, "num_input_tokens_seen": 218825720, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.390625, "step": 10162, "time_per_iteration": 2.374626398086548 }, { "auxiliary_loss_clip": 0.0105515, "auxiliary_loss_mlp": 0.01041392, "balance_loss_clip": 1.01574993, "balance_loss_mlp": 1.01734328, "epoch": 0.6110326168645723, "flos": 31350712659840.0, "grad_norm": 2.500014481168638, "language_loss": 0.72777152, "learning_rate": 1.3878212483909888e-06, "loss": 0.74873698, "num_input_tokens_seen": 218847735, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37695312, "step": 10163, "time_per_iteration": 2.471022605895996 }, { "auxiliary_loss_clip": 0.01052608, "auxiliary_loss_mlp": 0.01035626, "balance_loss_clip": 1.01316738, "balance_loss_mlp": 1.01606166, "epoch": 0.6110927401172404, "flos": 25002385522560.0, "grad_norm": 1.8317847307123063, "language_loss": 0.60648441, "learning_rate": 1.387450491396625e-06, "loss": 0.62736678, "num_input_tokens_seen": 218866585, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.36523438, "step": 10164, "time_per_iteration": 2.3980040550231934 }, { "auxiliary_loss_clip": 0.010538, "auxiliary_loss_mlp": 0.01043405, "balance_loss_clip": 1.01949179, "balance_loss_mlp": 1.0163908, "epoch": 0.6111528633699083, "flos": 26246934224640.0, "grad_norm": 1.6860364501075091, "language_loss": 0.76352775, "learning_rate": 1.3870797576302003e-06, "loss": 0.78449982, "num_input_tokens_seen": 218885560, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37304688, "step": 10165, "time_per_iteration": 2.3831772804260254 }, { "auxiliary_loss_clip": 0.01054119, "auxiliary_loss_mlp": 0.01039705, "balance_loss_clip": 1.015136, "balance_loss_mlp": 1.01777327, "epoch": 0.6112129866225763, "flos": 22381385535360.0, "grad_norm": 1.534629945281974, "language_loss": 0.80208755, "learning_rate": 1.3867090471057719e-06, "loss": 0.82302582, "num_input_tokens_seen": 218905055, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36328125, "step": 10166, "time_per_iteration": 2.3842854499816895 }, { "auxiliary_loss_clip": 0.01055495, "auxiliary_loss_mlp": 0.01037825, "balance_loss_clip": 1.01199269, "balance_loss_mlp": 1.01745558, "epoch": 0.6112731098752443, "flos": 25226737119360.0, "grad_norm": 1.8347021853031908, "language_loss": 0.69073033, "learning_rate": 1.3863383598373987e-06, "loss": 0.71166354, "num_input_tokens_seen": 218924030, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38085938, "step": 10167, "time_per_iteration": 2.3885860443115234 }, { "auxiliary_loss_clip": 0.01054151, "auxiliary_loss_mlp": 0.01036008, "balance_loss_clip": 1.01427603, "balance_loss_mlp": 1.01762319, "epoch": 0.6113332331279122, "flos": 22892060125440.0, "grad_norm": 1.7883391119799896, "language_loss": 0.80245984, "learning_rate": 1.3859676958391364e-06, "loss": 0.82336146, "num_input_tokens_seen": 218943750, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.36523438, "step": 10168, "time_per_iteration": 2.3915746212005615 }, { "auxiliary_loss_clip": 0.01058703, "auxiliary_loss_mlp": 0.01049086, "balance_loss_clip": 1.02134562, "balance_loss_mlp": 1.01760066, "epoch": 0.6113933563805802, "flos": 18619459361280.0, "grad_norm": 2.751229441709021, "language_loss": 0.87331653, "learning_rate": 1.3855970551250398e-06, "loss": 0.8943944, "num_input_tokens_seen": 218957585, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.41015625, "step": 10169, "time_per_iteration": 2.3326714038848877 }, { "auxiliary_loss_clip": 0.01053535, "auxiliary_loss_mlp": 0.01033475, "balance_loss_clip": 1.01040816, "balance_loss_mlp": 1.01629817, "epoch": 0.6114534796332481, "flos": 41864631644160.0, "grad_norm": 1.717906418074867, "language_loss": 0.79408246, "learning_rate": 1.3852264377091652e-06, "loss": 0.81495261, "num_input_tokens_seen": 218980025, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.37109375, "step": 10170, "time_per_iteration": 2.5349748134613037 }, { "auxiliary_loss_clip": 0.01057348, "auxiliary_loss_mlp": 0.01048985, "balance_loss_clip": 1.02035153, "balance_loss_mlp": 1.01666176, "epoch": 0.6115136028859162, "flos": 21907369739520.0, "grad_norm": 1.825873760285473, "language_loss": 0.6976009, "learning_rate": 1.3848558436055651e-06, "loss": 0.71866417, "num_input_tokens_seen": 218998200, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40625, "step": 10171, "time_per_iteration": 2.3608834743499756 }, { "auxiliary_loss_clip": 0.01057816, "auxiliary_loss_mlp": 0.01043924, "balance_loss_clip": 1.01457417, "balance_loss_mlp": 1.01757491, "epoch": 0.6115737261385841, "flos": 28803553931520.0, "grad_norm": 1.6821312382291538, "language_loss": 0.80177891, "learning_rate": 1.3844852728282934e-06, "loss": 0.82279629, "num_input_tokens_seen": 219017910, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.40234375, "step": 10172, "time_per_iteration": 2.424043655395508 }, { "auxiliary_loss_clip": 0.01058011, "auxiliary_loss_mlp": 0.01039302, "balance_loss_clip": 1.01299238, "balance_loss_mlp": 1.01785517, "epoch": 0.6116338493912521, "flos": 21250409086080.0, "grad_norm": 3.9431915422615016, "language_loss": 0.68149912, "learning_rate": 1.3841147253914022e-06, "loss": 0.70247221, "num_input_tokens_seen": 219037730, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.40234375, "step": 10173, "time_per_iteration": 2.36535382270813 }, { "auxiliary_loss_clip": 0.01056654, "auxiliary_loss_mlp": 0.01038442, "balance_loss_clip": 1.01320553, "balance_loss_mlp": 1.01741242, "epoch": 0.61169397264392, "flos": 17529226335360.0, "grad_norm": 1.7880653718311157, "language_loss": 0.56936991, "learning_rate": 1.3837442013089416e-06, "loss": 0.59032089, "num_input_tokens_seen": 219056755, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.39257812, "step": 10174, "time_per_iteration": 2.3660430908203125 }, { "auxiliary_loss_clip": 0.0105774, "auxiliary_loss_mlp": 0.01041725, "balance_loss_clip": 1.0145098, "balance_loss_mlp": 1.01729321, "epoch": 0.611754095896588, "flos": 23950417213440.0, "grad_norm": 1.896923775863193, "language_loss": 0.67183858, "learning_rate": 1.3833737005949628e-06, "loss": 0.69283324, "num_input_tokens_seen": 219076985, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40429688, "step": 10175, "time_per_iteration": 2.3844029903411865 }, { "auxiliary_loss_clip": 0.01054156, "auxiliary_loss_mlp": 0.01040073, "balance_loss_clip": 1.01537299, "balance_loss_mlp": 1.01597261, "epoch": 0.6118142191492559, "flos": 25993674155520.0, "grad_norm": 1.908534130005781, "language_loss": 0.84326243, "learning_rate": 1.3830032232635154e-06, "loss": 0.8642047, "num_input_tokens_seen": 219096050, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.38085938, "step": 10176, "time_per_iteration": 2.427708387374878 }, { "auxiliary_loss_clip": 0.01055338, "auxiliary_loss_mlp": 0.01043651, "balance_loss_clip": 1.0165664, "balance_loss_mlp": 1.01626813, "epoch": 0.611874342401924, "flos": 24602176074240.0, "grad_norm": 1.8423979073802392, "language_loss": 0.78243244, "learning_rate": 1.3826327693286474e-06, "loss": 0.80342239, "num_input_tokens_seen": 219112665, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.390625, "step": 10177, "time_per_iteration": 2.376904249191284 }, { "auxiliary_loss_clip": 0.01055135, "auxiliary_loss_mlp": 0.01040226, "balance_loss_clip": 1.01603818, "balance_loss_mlp": 1.01671124, "epoch": 0.6119344656545919, "flos": 15886248664320.0, "grad_norm": 1.9243683209452622, "language_loss": 0.76282728, "learning_rate": 1.3822623388044065e-06, "loss": 0.78378093, "num_input_tokens_seen": 219129120, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.38476562, "step": 10178, "time_per_iteration": 2.354708671569824 }, { "auxiliary_loss_clip": 0.01056221, "auxiliary_loss_mlp": 0.0104121, "balance_loss_clip": 1.01583087, "balance_loss_mlp": 1.01747489, "epoch": 0.6119945889072599, "flos": 21651805520640.0, "grad_norm": 1.7692669478788137, "language_loss": 0.68690848, "learning_rate": 1.3818919317048402e-06, "loss": 0.70788276, "num_input_tokens_seen": 219148950, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38671875, "step": 10179, "time_per_iteration": 2.359222888946533 }, { "auxiliary_loss_clip": 0.01057435, "auxiliary_loss_mlp": 0.01045283, "balance_loss_clip": 1.02052319, "balance_loss_mlp": 1.01838076, "epoch": 0.6120547121599279, "flos": 13771664081280.0, "grad_norm": 1.738499797352307, "language_loss": 0.84995198, "learning_rate": 1.3815215480439933e-06, "loss": 0.87097919, "num_input_tokens_seen": 219165585, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.390625, "step": 10180, "time_per_iteration": 3.562486171722412 }, { "auxiliary_loss_clip": 0.0105449, "auxiliary_loss_mlp": 0.01038721, "balance_loss_clip": 1.01366353, "balance_loss_mlp": 1.01689053, "epoch": 0.6121148354125958, "flos": 20078270277120.0, "grad_norm": 1.6952667504803487, "language_loss": 0.78417945, "learning_rate": 1.3811511878359113e-06, "loss": 0.80511159, "num_input_tokens_seen": 219183280, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37695312, "step": 10181, "time_per_iteration": 2.344153642654419 }, { "auxiliary_loss_clip": 0.01056061, "auxiliary_loss_mlp": 0.01040135, "balance_loss_clip": 1.01481485, "balance_loss_mlp": 1.01739192, "epoch": 0.6121749586652638, "flos": 13470713228160.0, "grad_norm": 2.1728932977114934, "language_loss": 0.81810594, "learning_rate": 1.3807808510946384e-06, "loss": 0.83906788, "num_input_tokens_seen": 219197200, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38671875, "step": 10182, "time_per_iteration": 2.322718858718872 }, { "auxiliary_loss_clip": 0.01052974, "auxiliary_loss_mlp": 0.0103266, "balance_loss_clip": 1.01076102, "balance_loss_mlp": 1.01730478, "epoch": 0.6122350819179317, "flos": 20119502459520.0, "grad_norm": 2.0149375199125696, "language_loss": 0.83989525, "learning_rate": 1.3804105378342177e-06, "loss": 0.86075157, "num_input_tokens_seen": 219216825, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.35546875, "step": 10183, "time_per_iteration": 2.356590747833252 }, { "auxiliary_loss_clip": 0.01008265, "auxiliary_loss_mlp": 0.01003071, "balance_loss_clip": 1.00050795, "balance_loss_mlp": 1.0013411, "epoch": 0.6122952051705998, "flos": 65426115778560.0, "grad_norm": 0.7051638764271655, "language_loss": 0.62855649, "learning_rate": 1.3800402480686914e-06, "loss": 0.64866984, "num_input_tokens_seen": 219283795, "router_z_loss_clip": 0.02563477, "router_z_loss_mlp": 0.06933594, "step": 10184, "time_per_iteration": 3.1742677688598633 }, { "auxiliary_loss_clip": 0.01056308, "auxiliary_loss_mlp": 0.01039122, "balance_loss_clip": 1.01563776, "balance_loss_mlp": 1.01947689, "epoch": 0.6123553284232677, "flos": 20375206323840.0, "grad_norm": 1.8000322856006856, "language_loss": 0.83147037, "learning_rate": 1.379669981812101e-06, "loss": 0.85242462, "num_input_tokens_seen": 219302385, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36914062, "step": 10185, "time_per_iteration": 3.80475115776062 }, { "auxiliary_loss_clip": 0.01055704, "auxiliary_loss_mlp": 0.01040952, "balance_loss_clip": 1.01433229, "balance_loss_mlp": 1.01691246, "epoch": 0.6124154516759357, "flos": 23986517425920.0, "grad_norm": 1.9191027119604105, "language_loss": 0.75684536, "learning_rate": 1.3792997390784868e-06, "loss": 0.77781188, "num_input_tokens_seen": 219319765, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.38671875, "step": 10186, "time_per_iteration": 3.7379934787750244 }, { "auxiliary_loss_clip": 0.01053502, "auxiliary_loss_mlp": 0.0104161, "balance_loss_clip": 1.01804256, "balance_loss_mlp": 1.01620746, "epoch": 0.6124755749286036, "flos": 21467778410880.0, "grad_norm": 1.6109793806709114, "language_loss": 0.79559231, "learning_rate": 1.3789295198818895e-06, "loss": 0.81654346, "num_input_tokens_seen": 219337440, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.37304688, "step": 10187, "time_per_iteration": 2.4205245971679688 }, { "auxiliary_loss_clip": 0.01053315, "auxiliary_loss_mlp": 0.01037927, "balance_loss_clip": 1.01507473, "balance_loss_mlp": 1.01641059, "epoch": 0.6125356981812716, "flos": 23878042231680.0, "grad_norm": 1.7474831335627108, "language_loss": 0.84508407, "learning_rate": 1.3785593242363462e-06, "loss": 0.86599654, "num_input_tokens_seen": 219357525, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36914062, "step": 10188, "time_per_iteration": 2.3839361667633057 }, { "auxiliary_loss_clip": 0.01053581, "auxiliary_loss_mlp": 0.01037246, "balance_loss_clip": 1.01317739, "balance_loss_mlp": 1.01585829, "epoch": 0.6125958214339395, "flos": 14424819396480.0, "grad_norm": 1.77500660677378, "language_loss": 0.7667526, "learning_rate": 1.378189152155896e-06, "loss": 0.78766084, "num_input_tokens_seen": 219374855, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37695312, "step": 10189, "time_per_iteration": 2.3534929752349854 }, { "auxiliary_loss_clip": 0.01053215, "auxiliary_loss_mlp": 0.01036844, "balance_loss_clip": 1.01361036, "balance_loss_mlp": 1.0156424, "epoch": 0.6126559446866076, "flos": 23257949840640.0, "grad_norm": 1.5653663069103256, "language_loss": 0.75092119, "learning_rate": 1.3778190036545758e-06, "loss": 0.77182186, "num_input_tokens_seen": 219394740, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37695312, "step": 10190, "time_per_iteration": 2.3563454151153564 }, { "auxiliary_loss_clip": 0.01055206, "auxiliary_loss_mlp": 0.01039704, "balance_loss_clip": 1.01536143, "balance_loss_mlp": 1.017079, "epoch": 0.6127160679392755, "flos": 26863744947840.0, "grad_norm": 1.5867625152382085, "language_loss": 0.68825495, "learning_rate": 1.3774488787464207e-06, "loss": 0.70920408, "num_input_tokens_seen": 219413755, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.38085938, "step": 10191, "time_per_iteration": 2.410029411315918 }, { "auxiliary_loss_clip": 0.01053726, "auxiliary_loss_mlp": 0.01044987, "balance_loss_clip": 1.02035809, "balance_loss_mlp": 1.01559806, "epoch": 0.6127761911919435, "flos": 26395210235520.0, "grad_norm": 2.1274911362553914, "language_loss": 0.75510478, "learning_rate": 1.377078777445467e-06, "loss": 0.77609193, "num_input_tokens_seen": 219433560, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.38085938, "step": 10192, "time_per_iteration": 2.39237904548645 }, { "auxiliary_loss_clip": 0.01053261, "auxiliary_loss_mlp": 0.01038158, "balance_loss_clip": 1.01450706, "balance_loss_mlp": 1.01705575, "epoch": 0.6128363144446115, "flos": 22633737909120.0, "grad_norm": 1.7675420024990942, "language_loss": 0.85267615, "learning_rate": 1.3767086997657478e-06, "loss": 0.87359035, "num_input_tokens_seen": 219452640, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36328125, "step": 10193, "time_per_iteration": 3.9120144844055176 }, { "auxiliary_loss_clip": 0.01053981, "auxiliary_loss_mlp": 0.01036977, "balance_loss_clip": 1.01170492, "balance_loss_mlp": 1.01621807, "epoch": 0.6128964376972794, "flos": 26757888105600.0, "grad_norm": 2.240736804769411, "language_loss": 0.70975566, "learning_rate": 1.3763386457212979e-06, "loss": 0.73066521, "num_input_tokens_seen": 219468585, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37695312, "step": 10194, "time_per_iteration": 2.401890754699707 }, { "auxiliary_loss_clip": 0.01008405, "auxiliary_loss_mlp": 0.01008209, "balance_loss_clip": 1.00581288, "balance_loss_mlp": 1.00138462, "epoch": 0.6129565609499474, "flos": 65565000633600.0, "grad_norm": 0.8289476287768276, "language_loss": 0.58703166, "learning_rate": 1.375968615326149e-06, "loss": 0.60719776, "num_input_tokens_seen": 219523015, "router_z_loss_clip": 0.02392578, "router_z_loss_mlp": 0.0703125, "step": 10195, "time_per_iteration": 2.780883550643921 }, { "auxiliary_loss_clip": 0.01056731, "auxiliary_loss_mlp": 0.01041458, "balance_loss_clip": 1.01446915, "balance_loss_mlp": 1.01794362, "epoch": 0.6130166842026153, "flos": 16361172155520.0, "grad_norm": 2.242929368284764, "language_loss": 0.70608711, "learning_rate": 1.3755986085943324e-06, "loss": 0.7270689, "num_input_tokens_seen": 219539980, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.38671875, "step": 10196, "time_per_iteration": 2.347914695739746 }, { "auxiliary_loss_clip": 0.01054594, "auxiliary_loss_mlp": 0.0103992, "balance_loss_clip": 1.01619744, "balance_loss_mlp": 1.01733232, "epoch": 0.6130768074552834, "flos": 23651526130560.0, "grad_norm": 1.8225504378453137, "language_loss": 0.71979177, "learning_rate": 1.3752286255398788e-06, "loss": 0.74073696, "num_input_tokens_seen": 219556980, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.37304688, "step": 10197, "time_per_iteration": 2.3711631298065186 }, { "auxiliary_loss_clip": 0.01055412, "auxiliary_loss_mlp": 0.01045171, "balance_loss_clip": 1.0205543, "balance_loss_mlp": 1.01716948, "epoch": 0.6131369307079513, "flos": 20046429250560.0, "grad_norm": 1.8377312522588325, "language_loss": 0.79929185, "learning_rate": 1.3748586661768191e-06, "loss": 0.82029772, "num_input_tokens_seen": 219576410, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3828125, "step": 10198, "time_per_iteration": 2.358750104904175 }, { "auxiliary_loss_clip": 0.01055631, "auxiliary_loss_mlp": 0.0104117, "balance_loss_clip": 1.0163151, "balance_loss_mlp": 1.016909, "epoch": 0.6131970539606193, "flos": 22671129841920.0, "grad_norm": 1.6489714895117142, "language_loss": 0.75058103, "learning_rate": 1.374488730519181e-06, "loss": 0.77154899, "num_input_tokens_seen": 219597180, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38671875, "step": 10199, "time_per_iteration": 2.397869110107422 }, { "auxiliary_loss_clip": 0.01055961, "auxiliary_loss_mlp": 0.01042451, "balance_loss_clip": 1.01480651, "balance_loss_mlp": 1.01601136, "epoch": 0.6132571772132872, "flos": 26869679879040.0, "grad_norm": 1.6034883609832455, "language_loss": 0.6379528, "learning_rate": 1.374118818580993e-06, "loss": 0.65893686, "num_input_tokens_seen": 219617630, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.40039062, "step": 10200, "time_per_iteration": 2.404062032699585 }, { "auxiliary_loss_clip": 0.01052475, "auxiliary_loss_mlp": 0.01043518, "balance_loss_clip": 1.02052224, "balance_loss_mlp": 1.01610017, "epoch": 0.6133173004659552, "flos": 22891571366400.0, "grad_norm": 1.8212952594075678, "language_loss": 0.69763029, "learning_rate": 1.3737489303762822e-06, "loss": 0.71859014, "num_input_tokens_seen": 219637025, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.36328125, "step": 10201, "time_per_iteration": 2.3810291290283203 }, { "auxiliary_loss_clip": 0.01052275, "auxiliary_loss_mlp": 0.01037043, "balance_loss_clip": 1.01212859, "balance_loss_mlp": 1.01483464, "epoch": 0.6133774237186231, "flos": 20484065543040.0, "grad_norm": 1.8776983272142918, "language_loss": 0.85426438, "learning_rate": 1.3733790659190746e-06, "loss": 0.87515759, "num_input_tokens_seen": 219656625, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.375, "step": 10202, "time_per_iteration": 2.3629202842712402 }, { "auxiliary_loss_clip": 0.01008047, "auxiliary_loss_mlp": 0.01004299, "balance_loss_clip": 1.00198603, "balance_loss_mlp": 1.00095081, "epoch": 0.6134375469712912, "flos": 69409635552000.0, "grad_norm": 0.869363193653156, "language_loss": 0.67108035, "learning_rate": 1.3730092252233953e-06, "loss": 0.69120371, "num_input_tokens_seen": 219718090, "router_z_loss_clip": 0.02307129, "router_z_loss_mlp": 0.07128906, "step": 10203, "time_per_iteration": 3.034778356552124 }, { "auxiliary_loss_clip": 0.01053416, "auxiliary_loss_mlp": 0.01038983, "balance_loss_clip": 1.01500964, "balance_loss_mlp": 1.01595151, "epoch": 0.6134976702239591, "flos": 41279941238400.0, "grad_norm": 1.748467560663141, "language_loss": 0.62548184, "learning_rate": 1.37263940830327e-06, "loss": 0.64640582, "num_input_tokens_seen": 219740100, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.375, "step": 10204, "time_per_iteration": 2.545292377471924 }, { "auxiliary_loss_clip": 0.0105159, "auxiliary_loss_mlp": 0.01037802, "balance_loss_clip": 1.01443672, "balance_loss_mlp": 1.01575184, "epoch": 0.6135577934766271, "flos": 22345494791040.0, "grad_norm": 1.8412972155130083, "language_loss": 0.73780286, "learning_rate": 1.3722696151727204e-06, "loss": 0.75869679, "num_input_tokens_seen": 219761225, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 10205, "time_per_iteration": 2.3859293460845947 }, { "auxiliary_loss_clip": 0.01052947, "auxiliary_loss_mlp": 0.0103492, "balance_loss_clip": 1.01017237, "balance_loss_mlp": 1.01639128, "epoch": 0.6136179167292951, "flos": 23727147868800.0, "grad_norm": 2.065818482946654, "language_loss": 0.76636755, "learning_rate": 1.3718998458457701e-06, "loss": 0.78724623, "num_input_tokens_seen": 219780085, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3671875, "step": 10206, "time_per_iteration": 2.37298583984375 }, { "auxiliary_loss_clip": 0.01054342, "auxiliary_loss_mlp": 0.01037806, "balance_loss_clip": 1.01178229, "balance_loss_mlp": 1.01594996, "epoch": 0.613678039981963, "flos": 26023664880000.0, "grad_norm": 2.1593557529822713, "language_loss": 0.76716059, "learning_rate": 1.3715301003364407e-06, "loss": 0.78808212, "num_input_tokens_seen": 219797895, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3828125, "step": 10207, "time_per_iteration": 2.4210500717163086 }, { "auxiliary_loss_clip": 0.01054399, "auxiliary_loss_mlp": 0.01036776, "balance_loss_clip": 1.0130415, "balance_loss_mlp": 1.01706111, "epoch": 0.613738163234631, "flos": 9859437037440.0, "grad_norm": 2.2259107365022976, "language_loss": 0.83611965, "learning_rate": 1.3711603786587525e-06, "loss": 0.85703135, "num_input_tokens_seen": 219811295, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.37304688, "step": 10208, "time_per_iteration": 2.3206613063812256 }, { "auxiliary_loss_clip": 0.01056267, "auxiliary_loss_mlp": 0.01039063, "balance_loss_clip": 1.01374316, "balance_loss_mlp": 1.01740086, "epoch": 0.613798286487299, "flos": 33180684906240.0, "grad_norm": 1.7877839453468847, "language_loss": 0.74147761, "learning_rate": 1.3707906808267265e-06, "loss": 0.76243091, "num_input_tokens_seen": 219832735, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38867188, "step": 10209, "time_per_iteration": 2.470982074737549 }, { "auxiliary_loss_clip": 0.0105376, "auxiliary_loss_mlp": 0.01042862, "balance_loss_clip": 1.0190078, "balance_loss_mlp": 1.01730633, "epoch": 0.613858409739967, "flos": 25626772010880.0, "grad_norm": 1.7012824297373796, "language_loss": 0.75243425, "learning_rate": 1.37042100685438e-06, "loss": 0.77340043, "num_input_tokens_seen": 219852755, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36523438, "step": 10210, "time_per_iteration": 2.3884389400482178 }, { "auxiliary_loss_clip": 0.01008892, "auxiliary_loss_mlp": 0.01005425, "balance_loss_clip": 1.00276709, "balance_loss_mlp": 1.00178123, "epoch": 0.6139185329926349, "flos": 67188810101760.0, "grad_norm": 0.869753624934555, "language_loss": 0.65221488, "learning_rate": 1.3700513567557325e-06, "loss": 0.67235804, "num_input_tokens_seen": 219922785, "router_z_loss_clip": 0.02661133, "router_z_loss_mlp": 0.07128906, "step": 10211, "time_per_iteration": 3.1526570320129395 }, { "auxiliary_loss_clip": 0.01054701, "auxiliary_loss_mlp": 0.0104009, "balance_loss_clip": 1.01602137, "balance_loss_mlp": 1.01760817, "epoch": 0.6139786562453029, "flos": 21542562276480.0, "grad_norm": 1.7289763263905074, "language_loss": 0.7687909, "learning_rate": 1.369681730544801e-06, "loss": 0.78973877, "num_input_tokens_seen": 219942215, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37109375, "step": 10212, "time_per_iteration": 2.399231433868408 }, { "auxiliary_loss_clip": 0.01055044, "auxiliary_loss_mlp": 0.0104016, "balance_loss_clip": 1.01602054, "balance_loss_mlp": 1.01703811, "epoch": 0.6140387794979708, "flos": 26067271034880.0, "grad_norm": 1.6016841733824052, "language_loss": 0.7557143, "learning_rate": 1.3693121282356009e-06, "loss": 0.7766664, "num_input_tokens_seen": 219963830, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.38085938, "step": 10213, "time_per_iteration": 2.414647340774536 }, { "auxiliary_loss_clip": 0.01056253, "auxiliary_loss_mlp": 0.01040469, "balance_loss_clip": 1.0151248, "balance_loss_mlp": 1.01714766, "epoch": 0.6140989027506388, "flos": 23693526362880.0, "grad_norm": 1.5425392749307867, "language_loss": 0.74246567, "learning_rate": 1.3689425498421483e-06, "loss": 0.76343286, "num_input_tokens_seen": 219983815, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.390625, "step": 10214, "time_per_iteration": 2.3918840885162354 }, { "auxiliary_loss_clip": 0.01054482, "auxiliary_loss_mlp": 0.01038777, "balance_loss_clip": 1.0130868, "balance_loss_mlp": 1.01636612, "epoch": 0.6141590260033067, "flos": 22230770463360.0, "grad_norm": 1.6837767344832155, "language_loss": 0.75449544, "learning_rate": 1.3685729953784572e-06, "loss": 0.775428, "num_input_tokens_seen": 220003165, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3828125, "step": 10215, "time_per_iteration": 2.3961873054504395 }, { "auxiliary_loss_clip": 0.01053219, "auxiliary_loss_mlp": 0.0103921, "balance_loss_clip": 1.01521349, "balance_loss_mlp": 1.01715159, "epoch": 0.6142191492559748, "flos": 23870710846080.0, "grad_norm": 1.6665008478300087, "language_loss": 0.79640222, "learning_rate": 1.368203464858542e-06, "loss": 0.81732649, "num_input_tokens_seen": 220021015, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36132812, "step": 10216, "time_per_iteration": 2.3681938648223877 }, { "auxiliary_loss_clip": 0.01053691, "auxiliary_loss_mlp": 0.01038531, "balance_loss_clip": 1.01279366, "balance_loss_mlp": 1.01687467, "epoch": 0.6142792725086427, "flos": 15041804676480.0, "grad_norm": 1.9849839900093875, "language_loss": 0.80889416, "learning_rate": 1.3678339582964147e-06, "loss": 0.8298164, "num_input_tokens_seen": 220035780, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3671875, "step": 10217, "time_per_iteration": 2.327357053756714 }, { "auxiliary_loss_clip": 0.01055229, "auxiliary_loss_mlp": 0.01040568, "balance_loss_clip": 1.01465154, "balance_loss_mlp": 1.01712871, "epoch": 0.6143393957613107, "flos": 23329836063360.0, "grad_norm": 2.2239238124845904, "language_loss": 0.79952163, "learning_rate": 1.3674644757060865e-06, "loss": 0.82047963, "num_input_tokens_seen": 220054280, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37890625, "step": 10218, "time_per_iteration": 2.410734176635742 }, { "auxiliary_loss_clip": 0.01053551, "auxiliary_loss_mlp": 0.01038894, "balance_loss_clip": 1.01262021, "balance_loss_mlp": 1.01672411, "epoch": 0.6143995190139786, "flos": 20116150968960.0, "grad_norm": 1.5550103081969515, "language_loss": 0.82754916, "learning_rate": 1.367095017101569e-06, "loss": 0.84847367, "num_input_tokens_seen": 220074120, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.3671875, "step": 10219, "time_per_iteration": 2.369307279586792 }, { "auxiliary_loss_clip": 0.01055387, "auxiliary_loss_mlp": 0.01039185, "balance_loss_clip": 1.01311409, "balance_loss_mlp": 1.01677132, "epoch": 0.6144596422666466, "flos": 42301918823040.0, "grad_norm": 2.352525811538234, "language_loss": 0.68480939, "learning_rate": 1.3667255824968717e-06, "loss": 0.70575511, "num_input_tokens_seen": 220096320, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38671875, "step": 10220, "time_per_iteration": 3.775808811187744 }, { "auxiliary_loss_clip": 0.01052776, "auxiliary_loss_mlp": 0.01032485, "balance_loss_clip": 1.00900102, "balance_loss_mlp": 1.01596713, "epoch": 0.6145197655193146, "flos": 21572727557760.0, "grad_norm": 1.8490856269044695, "language_loss": 0.73503464, "learning_rate": 1.3663561719060041e-06, "loss": 0.75588727, "num_input_tokens_seen": 220114850, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3671875, "step": 10221, "time_per_iteration": 2.3571949005126953 }, { "auxiliary_loss_clip": 0.010526, "auxiliary_loss_mlp": 0.01035976, "balance_loss_clip": 1.0147922, "balance_loss_mlp": 1.01622057, "epoch": 0.6145798887719826, "flos": 21470012737920.0, "grad_norm": 1.7346930662986375, "language_loss": 0.80517596, "learning_rate": 1.3659867853429735e-06, "loss": 0.82606173, "num_input_tokens_seen": 220133395, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.36328125, "step": 10222, "time_per_iteration": 2.389357089996338 }, { "auxiliary_loss_clip": 0.01055023, "auxiliary_loss_mlp": 0.01036943, "balance_loss_clip": 1.01338708, "balance_loss_mlp": 1.01711321, "epoch": 0.6146400120246506, "flos": 20775974353920.0, "grad_norm": 1.7454310919919303, "language_loss": 0.77426469, "learning_rate": 1.365617422821788e-06, "loss": 0.79518437, "num_input_tokens_seen": 220152790, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.37890625, "step": 10223, "time_per_iteration": 2.3504133224487305 }, { "auxiliary_loss_clip": 0.01053038, "auxiliary_loss_mlp": 0.0103977, "balance_loss_clip": 1.01667953, "balance_loss_mlp": 1.01713014, "epoch": 0.6147001352773185, "flos": 13880453477760.0, "grad_norm": 2.0809952392060835, "language_loss": 0.79173291, "learning_rate": 1.3652480843564535e-06, "loss": 0.81266105, "num_input_tokens_seen": 220169535, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.359375, "step": 10224, "time_per_iteration": 2.3573110103607178 }, { "auxiliary_loss_clip": 0.01051518, "auxiliary_loss_mlp": 0.01036326, "balance_loss_clip": 1.01433206, "balance_loss_mlp": 1.01594579, "epoch": 0.6147602585299865, "flos": 56639770289280.0, "grad_norm": 1.2197501730766158, "language_loss": 0.66887689, "learning_rate": 1.3648787699609746e-06, "loss": 0.68975532, "num_input_tokens_seen": 220195305, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35546875, "step": 10225, "time_per_iteration": 5.372615098953247 }, { "auxiliary_loss_clip": 0.01054586, "auxiliary_loss_mlp": 0.01037408, "balance_loss_clip": 1.01230264, "balance_loss_mlp": 1.01596236, "epoch": 0.6148203817826544, "flos": 32815842531840.0, "grad_norm": 2.2101406814378706, "language_loss": 0.64934772, "learning_rate": 1.364509479649357e-06, "loss": 0.6702677, "num_input_tokens_seen": 220215040, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38671875, "step": 10226, "time_per_iteration": 2.4486351013183594 }, { "auxiliary_loss_clip": 0.01055686, "auxiliary_loss_mlp": 0.01044571, "balance_loss_clip": 1.01890564, "balance_loss_mlp": 1.01733434, "epoch": 0.6148805050353224, "flos": 18331076597760.0, "grad_norm": 1.87647394166594, "language_loss": 0.76754045, "learning_rate": 1.3641402134356037e-06, "loss": 0.78854311, "num_input_tokens_seen": 220234205, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3828125, "step": 10227, "time_per_iteration": 2.4433724880218506 }, { "auxiliary_loss_clip": 0.01055443, "auxiliary_loss_mlp": 0.01041793, "balance_loss_clip": 1.01587689, "balance_loss_mlp": 1.01700306, "epoch": 0.6149406282879903, "flos": 14063119044480.0, "grad_norm": 1.9352413361730647, "language_loss": 0.63466549, "learning_rate": 1.3637709713337164e-06, "loss": 0.65563786, "num_input_tokens_seen": 220252730, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.38476562, "step": 10228, "time_per_iteration": 2.3503243923187256 }, { "auxiliary_loss_clip": 0.01052764, "auxiliary_loss_mlp": 0.01037236, "balance_loss_clip": 1.01356089, "balance_loss_mlp": 1.01667464, "epoch": 0.6150007515406584, "flos": 25189065895680.0, "grad_norm": 1.3449134419844209, "language_loss": 0.752653, "learning_rate": 1.3634017533576985e-06, "loss": 0.77355295, "num_input_tokens_seen": 220273345, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.359375, "step": 10229, "time_per_iteration": 2.3935534954071045 }, { "auxiliary_loss_clip": 0.01055503, "auxiliary_loss_mlp": 0.01040365, "balance_loss_clip": 1.01478243, "balance_loss_mlp": 1.01802897, "epoch": 0.6150608747933263, "flos": 21944168179200.0, "grad_norm": 1.86963620302402, "language_loss": 0.78970277, "learning_rate": 1.3630325595215493e-06, "loss": 0.81066138, "num_input_tokens_seen": 220293845, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.375, "step": 10230, "time_per_iteration": 2.3655385971069336 }, { "auxiliary_loss_clip": 0.01054341, "auxiliary_loss_mlp": 0.01042145, "balance_loss_clip": 1.01646745, "balance_loss_mlp": 1.01662731, "epoch": 0.6151209980459943, "flos": 30116148606720.0, "grad_norm": 1.561731816668647, "language_loss": 0.7349093, "learning_rate": 1.36266338983927e-06, "loss": 0.7558741, "num_input_tokens_seen": 220316070, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37695312, "step": 10231, "time_per_iteration": 2.434199571609497 }, { "auxiliary_loss_clip": 0.01055955, "auxiliary_loss_mlp": 0.01040519, "balance_loss_clip": 1.01605725, "balance_loss_mlp": 1.01745749, "epoch": 0.6151811212986622, "flos": 30007045008000.0, "grad_norm": 1.5908896989075247, "language_loss": 0.71306002, "learning_rate": 1.362294244324858e-06, "loss": 0.73402476, "num_input_tokens_seen": 220335695, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.38671875, "step": 10232, "time_per_iteration": 2.442121982574463 }, { "auxiliary_loss_clip": 0.01052635, "auxiliary_loss_mlp": 0.01034301, "balance_loss_clip": 1.01115084, "balance_loss_mlp": 1.01648402, "epoch": 0.6152412445513302, "flos": 18872091025920.0, "grad_norm": 2.0172774399317412, "language_loss": 0.93094569, "learning_rate": 1.3619251229923126e-06, "loss": 0.95181507, "num_input_tokens_seen": 220353720, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.359375, "step": 10233, "time_per_iteration": 3.7522194385528564 }, { "auxiliary_loss_clip": 0.010521, "auxiliary_loss_mlp": 0.01041607, "balance_loss_clip": 1.01888597, "balance_loss_mlp": 1.01619315, "epoch": 0.6153013678039982, "flos": 25702393749120.0, "grad_norm": 1.701100955987421, "language_loss": 0.71901029, "learning_rate": 1.3615560258556306e-06, "loss": 0.73994738, "num_input_tokens_seen": 220372515, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.359375, "step": 10234, "time_per_iteration": 2.4451911449432373 }, { "auxiliary_loss_clip": 0.01054427, "auxiliary_loss_mlp": 0.01041525, "balance_loss_clip": 1.01461971, "balance_loss_mlp": 1.0161525, "epoch": 0.6153614910566662, "flos": 28509061680000.0, "grad_norm": 2.4922907446166183, "language_loss": 0.67438912, "learning_rate": 1.3611869529288077e-06, "loss": 0.69534868, "num_input_tokens_seen": 220393490, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.3828125, "step": 10235, "time_per_iteration": 2.4527862071990967 }, { "auxiliary_loss_clip": 0.01056044, "auxiliary_loss_mlp": 0.0103961, "balance_loss_clip": 1.01501679, "balance_loss_mlp": 1.01713443, "epoch": 0.6154216143093342, "flos": 23548671665280.0, "grad_norm": 1.9439115783916816, "language_loss": 0.82019401, "learning_rate": 1.3608179042258398e-06, "loss": 0.84115052, "num_input_tokens_seen": 220412855, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.38867188, "step": 10236, "time_per_iteration": 2.384989023208618 }, { "auxiliary_loss_clip": 0.01054426, "auxiliary_loss_mlp": 0.01038278, "balance_loss_clip": 1.01341081, "balance_loss_mlp": 1.01611996, "epoch": 0.6154817375620021, "flos": 22746961048320.0, "grad_norm": 1.3718782843607344, "language_loss": 0.81386501, "learning_rate": 1.360448879760721e-06, "loss": 0.83479202, "num_input_tokens_seen": 220433440, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.3828125, "step": 10237, "time_per_iteration": 2.3981287479400635 }, { "auxiliary_loss_clip": 0.01055235, "auxiliary_loss_mlp": 0.01041564, "balance_loss_clip": 1.01694679, "balance_loss_mlp": 1.0171144, "epoch": 0.6155418608146701, "flos": 27161728335360.0, "grad_norm": 1.6604622607031265, "language_loss": 0.77474344, "learning_rate": 1.3600798795474449e-06, "loss": 0.7957114, "num_input_tokens_seen": 220453445, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.38085938, "step": 10238, "time_per_iteration": 2.4053943157196045 }, { "auxiliary_loss_clip": 0.01008751, "auxiliary_loss_mlp": 0.0100629, "balance_loss_clip": 1.00360799, "balance_loss_mlp": 1.00174809, "epoch": 0.615601984067338, "flos": 68808257516160.0, "grad_norm": 0.7576818578528131, "language_loss": 0.57717693, "learning_rate": 1.3597109036000036e-06, "loss": 0.59732729, "num_input_tokens_seen": 220509730, "router_z_loss_clip": 0.02685547, "router_z_loss_mlp": 0.0703125, "step": 10239, "time_per_iteration": 2.9996843338012695 }, { "auxiliary_loss_clip": 0.01054205, "auxiliary_loss_mlp": 0.01039689, "balance_loss_clip": 1.01348722, "balance_loss_mlp": 1.01677489, "epoch": 0.615662107320006, "flos": 15516413965440.0, "grad_norm": 1.8405235236782518, "language_loss": 0.79239941, "learning_rate": 1.3593419519323892e-06, "loss": 0.8133384, "num_input_tokens_seen": 220527295, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.375, "step": 10240, "time_per_iteration": 2.3343698978424072 }, { "auxiliary_loss_clip": 0.01057665, "auxiliary_loss_mlp": 0.01043812, "balance_loss_clip": 1.01857579, "balance_loss_mlp": 1.01912189, "epoch": 0.615722230572674, "flos": 21062786106240.0, "grad_norm": 3.456049119748707, "language_loss": 0.73933661, "learning_rate": 1.3589730245585922e-06, "loss": 0.76035136, "num_input_tokens_seen": 220542730, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38671875, "step": 10241, "time_per_iteration": 2.3832666873931885 }, { "auxiliary_loss_clip": 0.01054275, "auxiliary_loss_mlp": 0.01037035, "balance_loss_clip": 1.01340795, "balance_loss_mlp": 1.0171051, "epoch": 0.615782353825342, "flos": 23255715513600.0, "grad_norm": 1.6603409239949884, "language_loss": 0.72514284, "learning_rate": 1.3586041214926018e-06, "loss": 0.74605596, "num_input_tokens_seen": 220562995, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37109375, "step": 10242, "time_per_iteration": 2.3843417167663574 }, { "auxiliary_loss_clip": 0.01054158, "auxiliary_loss_mlp": 0.01037232, "balance_loss_clip": 1.01356947, "balance_loss_mlp": 1.01710916, "epoch": 0.6158424770780099, "flos": 21102901125120.0, "grad_norm": 2.1122451554037815, "language_loss": 0.7303412, "learning_rate": 1.3582352427484086e-06, "loss": 0.75125515, "num_input_tokens_seen": 220581775, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37109375, "step": 10243, "time_per_iteration": 2.371317148208618 }, { "auxiliary_loss_clip": 0.01008589, "auxiliary_loss_mlp": 0.01003141, "balance_loss_clip": 1.00053072, "balance_loss_mlp": 1.00142574, "epoch": 0.6159026003306779, "flos": 70329596544000.0, "grad_norm": 0.7597256756502746, "language_loss": 0.56890279, "learning_rate": 1.3578663883399984e-06, "loss": 0.58902007, "num_input_tokens_seen": 220646395, "router_z_loss_clip": 0.02612305, "router_z_loss_mlp": 0.07128906, "step": 10244, "time_per_iteration": 3.06811785697937 }, { "auxiliary_loss_clip": 0.01053841, "auxiliary_loss_mlp": 0.01039357, "balance_loss_clip": 1.01279724, "balance_loss_mlp": 1.01576865, "epoch": 0.6159627235833458, "flos": 33872977722240.0, "grad_norm": 2.0228981835662245, "language_loss": 0.64571655, "learning_rate": 1.3574975582813593e-06, "loss": 0.66664851, "num_input_tokens_seen": 220668335, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.38085938, "step": 10245, "time_per_iteration": 2.494758367538452 }, { "auxiliary_loss_clip": 0.01052912, "auxiliary_loss_mlp": 0.01037887, "balance_loss_clip": 1.01297283, "balance_loss_mlp": 1.01661897, "epoch": 0.6160228468360138, "flos": 26574314843520.0, "grad_norm": 2.440600416961405, "language_loss": 0.80311692, "learning_rate": 1.3571287525864771e-06, "loss": 0.82402492, "num_input_tokens_seen": 220688915, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36328125, "step": 10246, "time_per_iteration": 2.3971683979034424 }, { "auxiliary_loss_clip": 0.01057691, "auxiliary_loss_mlp": 0.01046415, "balance_loss_clip": 1.01842451, "balance_loss_mlp": 1.01804972, "epoch": 0.6160829700886818, "flos": 17192559294720.0, "grad_norm": 2.613669713426707, "language_loss": 0.88897514, "learning_rate": 1.3567599712693368e-06, "loss": 0.91001618, "num_input_tokens_seen": 220703465, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.39648438, "step": 10247, "time_per_iteration": 2.3688583374023438 }, { "auxiliary_loss_clip": 0.01056096, "auxiliary_loss_mlp": 0.01039306, "balance_loss_clip": 1.01455772, "balance_loss_mlp": 1.01767635, "epoch": 0.6161430933413498, "flos": 23622408190080.0, "grad_norm": 1.6465646638280664, "language_loss": 0.81161904, "learning_rate": 1.3563912143439235e-06, "loss": 0.83257306, "num_input_tokens_seen": 220722090, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.38476562, "step": 10248, "time_per_iteration": 2.381340980529785 }, { "auxiliary_loss_clip": 0.01053012, "auxiliary_loss_mlp": 0.01035045, "balance_loss_clip": 1.01216912, "balance_loss_mlp": 1.01630378, "epoch": 0.6162032165940178, "flos": 23001338280960.0, "grad_norm": 1.9644989263730592, "language_loss": 0.88374931, "learning_rate": 1.3560224818242191e-06, "loss": 0.90462983, "num_input_tokens_seen": 220741075, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3671875, "step": 10249, "time_per_iteration": 2.4324100017547607 }, { "auxiliary_loss_clip": 0.01055624, "auxiliary_loss_mlp": 0.01039488, "balance_loss_clip": 1.01279712, "balance_loss_mlp": 1.01790881, "epoch": 0.6162633398466857, "flos": 39420397203840.0, "grad_norm": 2.3715895951350783, "language_loss": 0.70475543, "learning_rate": 1.3556537737242072e-06, "loss": 0.72570646, "num_input_tokens_seen": 220763395, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.37695312, "step": 10250, "time_per_iteration": 2.512131690979004 }, { "auxiliary_loss_clip": 0.01050222, "auxiliary_loss_mlp": 0.01030432, "balance_loss_clip": 1.00799632, "balance_loss_mlp": 1.01568758, "epoch": 0.6163234630993537, "flos": 19243671292800.0, "grad_norm": 1.9641841445918256, "language_loss": 0.74353087, "learning_rate": 1.3552850900578692e-06, "loss": 0.76433742, "num_input_tokens_seen": 220780640, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34570312, "step": 10251, "time_per_iteration": 2.35385799407959 }, { "auxiliary_loss_clip": 0.01053859, "auxiliary_loss_mlp": 0.01038072, "balance_loss_clip": 1.01347899, "balance_loss_mlp": 1.0167402, "epoch": 0.6163835863520216, "flos": 15960857973120.0, "grad_norm": 2.2412589181805953, "language_loss": 0.69304579, "learning_rate": 1.3549164308391844e-06, "loss": 0.71396506, "num_input_tokens_seen": 220797960, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37109375, "step": 10252, "time_per_iteration": 2.3457908630371094 }, { "auxiliary_loss_clip": 0.01009027, "auxiliary_loss_mlp": 0.0100345, "balance_loss_clip": 1.00110197, "balance_loss_mlp": 1.00162506, "epoch": 0.6164437096046896, "flos": 68100322544640.0, "grad_norm": 0.8870067258278813, "language_loss": 0.58072412, "learning_rate": 1.3545477960821333e-06, "loss": 0.60084891, "num_input_tokens_seen": 220856930, "router_z_loss_clip": 0.0234375, "router_z_loss_mlp": 0.07421875, "step": 10253, "time_per_iteration": 3.041614294052124 }, { "auxiliary_loss_clip": 0.01054579, "auxiliary_loss_mlp": 0.01038814, "balance_loss_clip": 1.01275516, "balance_loss_mlp": 1.01621079, "epoch": 0.6165038328573575, "flos": 21360141089280.0, "grad_norm": 1.6602442248727862, "language_loss": 0.80623412, "learning_rate": 1.3541791858006946e-06, "loss": 0.82716799, "num_input_tokens_seen": 220877595, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3828125, "step": 10254, "time_per_iteration": 2.37368106842041 }, { "auxiliary_loss_clip": 0.01056998, "auxiliary_loss_mlp": 0.01040019, "balance_loss_clip": 1.01435339, "balance_loss_mlp": 1.01669383, "epoch": 0.6165639561100256, "flos": 21101015911680.0, "grad_norm": 1.831894490075482, "language_loss": 0.81658947, "learning_rate": 1.353810600008846e-06, "loss": 0.83755958, "num_input_tokens_seen": 220896880, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.40234375, "step": 10255, "time_per_iteration": 2.4080846309661865 }, { "auxiliary_loss_clip": 0.0105654, "auxiliary_loss_mlp": 0.01040129, "balance_loss_clip": 1.01153016, "balance_loss_mlp": 1.01723945, "epoch": 0.6166240793626935, "flos": 25337341906560.0, "grad_norm": 1.936676156350165, "language_loss": 0.67137593, "learning_rate": 1.3534420387205646e-06, "loss": 0.69234264, "num_input_tokens_seen": 220916425, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.39257812, "step": 10256, "time_per_iteration": 2.399456739425659 }, { "auxiliary_loss_clip": 0.01053975, "auxiliary_loss_mlp": 0.01037344, "balance_loss_clip": 1.0124892, "balance_loss_mlp": 1.01733899, "epoch": 0.6166842026153615, "flos": 19681621787520.0, "grad_norm": 2.0517921745849272, "language_loss": 0.74059153, "learning_rate": 1.353073501949825e-06, "loss": 0.76150471, "num_input_tokens_seen": 220935050, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.36523438, "step": 10257, "time_per_iteration": 2.389092206954956 }, { "auxiliary_loss_clip": 0.01056466, "auxiliary_loss_mlp": 0.0103577, "balance_loss_clip": 1.01079535, "balance_loss_mlp": 1.01806748, "epoch": 0.6167443258680294, "flos": 19317337994880.0, "grad_norm": 2.172536155293353, "language_loss": 0.73687088, "learning_rate": 1.3527049897106034e-06, "loss": 0.75779325, "num_input_tokens_seen": 220953085, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3828125, "step": 10258, "time_per_iteration": 2.3538520336151123 }, { "auxiliary_loss_clip": 0.01054878, "auxiliary_loss_mlp": 0.01035444, "balance_loss_clip": 1.01027846, "balance_loss_mlp": 1.01679492, "epoch": 0.6168044491206974, "flos": 25264059229440.0, "grad_norm": 10.557538578325099, "language_loss": 0.67084986, "learning_rate": 1.3523365020168735e-06, "loss": 0.69175309, "num_input_tokens_seen": 220969050, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38085938, "step": 10259, "time_per_iteration": 3.726186990737915 }, { "auxiliary_loss_clip": 0.01054227, "auxiliary_loss_mlp": 0.01042188, "balance_loss_clip": 1.01640296, "balance_loss_mlp": 1.0171504, "epoch": 0.6168645723733654, "flos": 13219198727040.0, "grad_norm": 2.2075743277811752, "language_loss": 0.72696292, "learning_rate": 1.3519680388826084e-06, "loss": 0.74792713, "num_input_tokens_seen": 220985825, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37109375, "step": 10260, "time_per_iteration": 2.3431553840637207 }, { "auxiliary_loss_clip": 0.01056651, "auxiliary_loss_mlp": 0.01045371, "balance_loss_clip": 1.01605713, "balance_loss_mlp": 1.01725161, "epoch": 0.6169246956260334, "flos": 26650809365760.0, "grad_norm": 1.947618265433298, "language_loss": 0.69551229, "learning_rate": 1.3515996003217803e-06, "loss": 0.71653259, "num_input_tokens_seen": 221004465, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.39453125, "step": 10261, "time_per_iteration": 2.4185078144073486 }, { "auxiliary_loss_clip": 0.01053805, "auxiliary_loss_mlp": 0.01040657, "balance_loss_clip": 1.01557577, "balance_loss_mlp": 1.01586092, "epoch": 0.6169848188787014, "flos": 23147310142080.0, "grad_norm": 1.756720759065761, "language_loss": 0.72147393, "learning_rate": 1.3512311863483602e-06, "loss": 0.74241859, "num_input_tokens_seen": 221023260, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37890625, "step": 10262, "time_per_iteration": 2.3805980682373047 }, { "auxiliary_loss_clip": 0.01054027, "auxiliary_loss_mlp": 0.01040275, "balance_loss_clip": 1.01376319, "balance_loss_mlp": 1.01614368, "epoch": 0.6170449421313693, "flos": 23330778670080.0, "grad_norm": 1.7260363041589926, "language_loss": 0.71075237, "learning_rate": 1.3508627969763188e-06, "loss": 0.73169541, "num_input_tokens_seen": 221043090, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37890625, "step": 10263, "time_per_iteration": 2.407578706741333 }, { "auxiliary_loss_clip": 0.01055583, "auxiliary_loss_mlp": 0.01035875, "balance_loss_clip": 1.01075768, "balance_loss_mlp": 1.01717091, "epoch": 0.6171050653840373, "flos": 15850707033600.0, "grad_norm": 2.685011693283178, "language_loss": 0.77635193, "learning_rate": 1.3504944322196244e-06, "loss": 0.79726648, "num_input_tokens_seen": 221061435, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38476562, "step": 10264, "time_per_iteration": 3.785489320755005 }, { "auxiliary_loss_clip": 0.01053922, "auxiliary_loss_mlp": 0.01037023, "balance_loss_clip": 1.01134551, "balance_loss_mlp": 1.01663923, "epoch": 0.6171651886367052, "flos": 20044544037120.0, "grad_norm": 2.1079325188530573, "language_loss": 0.85828745, "learning_rate": 1.350126092092247e-06, "loss": 0.87919688, "num_input_tokens_seen": 221078705, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37304688, "step": 10265, "time_per_iteration": 3.7775566577911377 }, { "auxiliary_loss_clip": 0.01054652, "auxiliary_loss_mlp": 0.01037338, "balance_loss_clip": 1.01150584, "balance_loss_mlp": 1.01765394, "epoch": 0.6172253118893732, "flos": 26431485004800.0, "grad_norm": 1.7396464251862396, "language_loss": 0.65309322, "learning_rate": 1.349757776608153e-06, "loss": 0.67401308, "num_input_tokens_seen": 221099245, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37109375, "step": 10266, "time_per_iteration": 2.4222776889801025 }, { "auxiliary_loss_clip": 0.01053917, "auxiliary_loss_mlp": 0.01037338, "balance_loss_clip": 1.01222086, "balance_loss_mlp": 1.01579821, "epoch": 0.6172854351420412, "flos": 22631922518400.0, "grad_norm": 1.6489298152757297, "language_loss": 0.7649281, "learning_rate": 1.3493894857813094e-06, "loss": 0.78584063, "num_input_tokens_seen": 221116930, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38085938, "step": 10267, "time_per_iteration": 2.362180471420288 }, { "auxiliary_loss_clip": 0.01054582, "auxiliary_loss_mlp": 0.01041557, "balance_loss_clip": 1.01369798, "balance_loss_mlp": 1.01560068, "epoch": 0.6173455583947092, "flos": 21211934901120.0, "grad_norm": 1.8582415292749932, "language_loss": 0.76160079, "learning_rate": 1.3490212196256818e-06, "loss": 0.78256214, "num_input_tokens_seen": 221137660, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.390625, "step": 10268, "time_per_iteration": 2.3989670276641846 }, { "auxiliary_loss_clip": 0.01056129, "auxiliary_loss_mlp": 0.0104042, "balance_loss_clip": 1.01425314, "balance_loss_mlp": 1.01649904, "epoch": 0.6174056816473771, "flos": 19499270423040.0, "grad_norm": 1.7775074809634057, "language_loss": 0.76622427, "learning_rate": 1.3486529781552342e-06, "loss": 0.78718972, "num_input_tokens_seen": 221156225, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.39453125, "step": 10269, "time_per_iteration": 2.3619232177734375 }, { "auxiliary_loss_clip": 0.01052151, "auxiliary_loss_mlp": 0.01036707, "balance_loss_clip": 1.01100588, "balance_loss_mlp": 1.01525557, "epoch": 0.6174658049000451, "flos": 15996434515200.0, "grad_norm": 2.0086018334237483, "language_loss": 0.77202344, "learning_rate": 1.3482847613839318e-06, "loss": 0.79291201, "num_input_tokens_seen": 221173820, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.36914062, "step": 10270, "time_per_iteration": 2.363550901412964 }, { "auxiliary_loss_clip": 0.01055738, "auxiliary_loss_mlp": 0.01040358, "balance_loss_clip": 1.0147872, "balance_loss_mlp": 1.0177145, "epoch": 0.617525928152713, "flos": 21902935996800.0, "grad_norm": 2.926390188583105, "language_loss": 0.83611596, "learning_rate": 1.347916569325736e-06, "loss": 0.85707688, "num_input_tokens_seen": 221191815, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38085938, "step": 10271, "time_per_iteration": 2.3760952949523926 }, { "auxiliary_loss_clip": 0.0105335, "auxiliary_loss_mlp": 0.01039906, "balance_loss_clip": 1.01553941, "balance_loss_mlp": 1.01665282, "epoch": 0.617586051405381, "flos": 21104891072640.0, "grad_norm": 1.7458202842304997, "language_loss": 0.77960277, "learning_rate": 1.3475484019946093e-06, "loss": 0.80053532, "num_input_tokens_seen": 221211205, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36523438, "step": 10272, "time_per_iteration": 2.3724489212036133 }, { "auxiliary_loss_clip": 0.01008822, "auxiliary_loss_mlp": 0.01004991, "balance_loss_clip": 1.00239205, "balance_loss_mlp": 1.00173926, "epoch": 0.617646174658049, "flos": 58607717829120.0, "grad_norm": 0.8920638960036812, "language_loss": 0.59255981, "learning_rate": 1.347180259404513e-06, "loss": 0.6126979, "num_input_tokens_seen": 221268430, "router_z_loss_clip": 0.02600098, "router_z_loss_mlp": 0.07128906, "step": 10273, "time_per_iteration": 4.353146076202393 }, { "auxiliary_loss_clip": 0.01052657, "auxiliary_loss_mlp": 0.01035906, "balance_loss_clip": 1.01006174, "balance_loss_mlp": 1.01503253, "epoch": 0.617706297910717, "flos": 13877904948480.0, "grad_norm": 2.7652999626952526, "language_loss": 0.73832226, "learning_rate": 1.3468121415694059e-06, "loss": 0.7592079, "num_input_tokens_seen": 221281930, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.375, "step": 10274, "time_per_iteration": 2.339550495147705 }, { "auxiliary_loss_clip": 0.01053843, "auxiliary_loss_mlp": 0.01036364, "balance_loss_clip": 1.01127064, "balance_loss_mlp": 1.01650143, "epoch": 0.617766421163385, "flos": 19207431434880.0, "grad_norm": 1.7187404709621221, "language_loss": 0.79134226, "learning_rate": 1.3464440485032484e-06, "loss": 0.81224436, "num_input_tokens_seen": 221301605, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37304688, "step": 10275, "time_per_iteration": 2.3611490726470947 }, { "auxiliary_loss_clip": 0.01054217, "auxiliary_loss_mlp": 0.01038685, "balance_loss_clip": 1.01438999, "balance_loss_mlp": 1.01645803, "epoch": 0.6178265444160529, "flos": 22564854063360.0, "grad_norm": 1.6553921392704254, "language_loss": 0.80571318, "learning_rate": 1.346075980219998e-06, "loss": 0.82664216, "num_input_tokens_seen": 221320105, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.37695312, "step": 10276, "time_per_iteration": 2.409409761428833 }, { "auxiliary_loss_clip": 0.01054935, "auxiliary_loss_mlp": 0.01045363, "balance_loss_clip": 1.01930356, "balance_loss_mlp": 1.01654732, "epoch": 0.6178866676687209, "flos": 11983482599040.0, "grad_norm": 2.8905722807712824, "language_loss": 0.82108289, "learning_rate": 1.345707936733612e-06, "loss": 0.8420859, "num_input_tokens_seen": 221335915, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3828125, "step": 10277, "time_per_iteration": 2.3472745418548584 }, { "auxiliary_loss_clip": 0.01056096, "auxiliary_loss_mlp": 0.01040172, "balance_loss_clip": 1.01441121, "balance_loss_mlp": 1.01728797, "epoch": 0.6179467909213888, "flos": 20990585681280.0, "grad_norm": 1.661407368833782, "language_loss": 0.82660329, "learning_rate": 1.3453399180580466e-06, "loss": 0.84756595, "num_input_tokens_seen": 221353965, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38867188, "step": 10278, "time_per_iteration": 2.366598606109619 }, { "auxiliary_loss_clip": 0.01054881, "auxiliary_loss_mlp": 0.0103834, "balance_loss_clip": 1.01285315, "balance_loss_mlp": 1.01668274, "epoch": 0.6180069141740568, "flos": 25336922970240.0, "grad_norm": 1.5563691517418181, "language_loss": 0.74772286, "learning_rate": 1.3449719242072567e-06, "loss": 0.76865506, "num_input_tokens_seen": 221374080, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3828125, "step": 10279, "time_per_iteration": 2.389824390411377 }, { "auxiliary_loss_clip": 0.01052638, "auxiliary_loss_mlp": 0.01036708, "balance_loss_clip": 1.01290154, "balance_loss_mlp": 1.01511788, "epoch": 0.6180670374267248, "flos": 19644718613760.0, "grad_norm": 1.5780966301453538, "language_loss": 0.72004664, "learning_rate": 1.3446039551951975e-06, "loss": 0.74094009, "num_input_tokens_seen": 221392910, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.375, "step": 10280, "time_per_iteration": 2.364516258239746 }, { "auxiliary_loss_clip": 0.01054357, "auxiliary_loss_mlp": 0.01041095, "balance_loss_clip": 1.01700258, "balance_loss_mlp": 1.01661611, "epoch": 0.6181271606793928, "flos": 19463833526400.0, "grad_norm": 1.4657528659903063, "language_loss": 0.73122168, "learning_rate": 1.3442360110358215e-06, "loss": 0.75217617, "num_input_tokens_seen": 221410990, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.37890625, "step": 10281, "time_per_iteration": 2.348449468612671 }, { "auxiliary_loss_clip": 0.0105226, "auxiliary_loss_mlp": 0.01035606, "balance_loss_clip": 1.01236033, "balance_loss_mlp": 1.01657701, "epoch": 0.6181872839320607, "flos": 25593080682240.0, "grad_norm": 1.4634231713041268, "language_loss": 0.77753198, "learning_rate": 1.3438680917430827e-06, "loss": 0.79841071, "num_input_tokens_seen": 221431020, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35742188, "step": 10282, "time_per_iteration": 2.4308903217315674 }, { "auxiliary_loss_clip": 0.01056012, "auxiliary_loss_mlp": 0.01043046, "balance_loss_clip": 1.01357746, "balance_loss_mlp": 1.01625896, "epoch": 0.6182474071847287, "flos": 25550766247680.0, "grad_norm": 1.7046759284202693, "language_loss": 0.69830477, "learning_rate": 1.343500197330931e-06, "loss": 0.71929526, "num_input_tokens_seen": 221453235, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.39648438, "step": 10283, "time_per_iteration": 2.4231510162353516 }, { "auxiliary_loss_clip": 0.01059471, "auxiliary_loss_mlp": 0.01041052, "balance_loss_clip": 1.01339555, "balance_loss_mlp": 1.01716161, "epoch": 0.6183075304373966, "flos": 22122749116800.0, "grad_norm": 1.8273736937816063, "language_loss": 0.75947732, "learning_rate": 1.3431323278133176e-06, "loss": 0.78048253, "num_input_tokens_seen": 221472560, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.42382812, "step": 10284, "time_per_iteration": 2.411320686340332 }, { "auxiliary_loss_clip": 0.01051461, "auxiliary_loss_mlp": 0.01035558, "balance_loss_clip": 1.0108341, "balance_loss_mlp": 1.01670432, "epoch": 0.6183676536900646, "flos": 22454493655680.0, "grad_norm": 1.4991984395311004, "language_loss": 0.76419777, "learning_rate": 1.3427644832041922e-06, "loss": 0.78506792, "num_input_tokens_seen": 221492835, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.34765625, "step": 10285, "time_per_iteration": 2.455461263656616 }, { "auxiliary_loss_clip": 0.010549, "auxiliary_loss_mlp": 0.01039125, "balance_loss_clip": 1.01326847, "balance_loss_mlp": 1.01640105, "epoch": 0.6184277769427327, "flos": 23363108455680.0, "grad_norm": 1.5301748845805834, "language_loss": 0.73832828, "learning_rate": 1.342396663517503e-06, "loss": 0.75926852, "num_input_tokens_seen": 221511870, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38671875, "step": 10286, "time_per_iteration": 2.4174907207489014 }, { "auxiliary_loss_clip": 0.01052373, "auxiliary_loss_mlp": 0.01039604, "balance_loss_clip": 1.0158577, "balance_loss_mlp": 1.01582766, "epoch": 0.6184879001954006, "flos": 22709953140480.0, "grad_norm": 1.6846940569030369, "language_loss": 0.76935333, "learning_rate": 1.342028868767199e-06, "loss": 0.79027307, "num_input_tokens_seen": 221529915, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.36523438, "step": 10287, "time_per_iteration": 2.364710807800293 }, { "auxiliary_loss_clip": 0.01054395, "auxiliary_loss_mlp": 0.01041322, "balance_loss_clip": 1.01662207, "balance_loss_mlp": 1.0169965, "epoch": 0.6185480234480686, "flos": 23840789944320.0, "grad_norm": 1.7112884699958915, "language_loss": 0.736817, "learning_rate": 1.3416610989672262e-06, "loss": 0.75777423, "num_input_tokens_seen": 221549745, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37304688, "step": 10288, "time_per_iteration": 2.4099583625793457 }, { "auxiliary_loss_clip": 0.0105082, "auxiliary_loss_mlp": 0.01034497, "balance_loss_clip": 1.0124197, "balance_loss_mlp": 1.01588869, "epoch": 0.6186081467007365, "flos": 45475872923520.0, "grad_norm": 1.8519807596649471, "language_loss": 0.73922062, "learning_rate": 1.3412933541315296e-06, "loss": 0.76007378, "num_input_tokens_seen": 221572455, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34960938, "step": 10289, "time_per_iteration": 2.58060884475708 }, { "auxiliary_loss_clip": 0.01056723, "auxiliary_loss_mlp": 0.01044848, "balance_loss_clip": 1.01881289, "balance_loss_mlp": 1.01646495, "epoch": 0.6186682699534045, "flos": 23549719006080.0, "grad_norm": 1.469722787756303, "language_loss": 0.79939497, "learning_rate": 1.340925634274056e-06, "loss": 0.82041073, "num_input_tokens_seen": 221591325, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.40234375, "step": 10290, "time_per_iteration": 2.4029996395111084 }, { "auxiliary_loss_clip": 0.01055565, "auxiliary_loss_mlp": 0.01038899, "balance_loss_clip": 1.01323342, "balance_loss_mlp": 1.01727176, "epoch": 0.6187283932060724, "flos": 25773058074240.0, "grad_norm": 1.5213891059345583, "language_loss": 0.82381678, "learning_rate": 1.3405579394087475e-06, "loss": 0.84476143, "num_input_tokens_seen": 221611640, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3828125, "step": 10291, "time_per_iteration": 2.404261827468872 }, { "auxiliary_loss_clip": 0.0105314, "auxiliary_loss_mlp": 0.01046934, "balance_loss_clip": 1.02149487, "balance_loss_mlp": 1.01630592, "epoch": 0.6187885164587404, "flos": 25264024318080.0, "grad_norm": 1.6358674137150262, "language_loss": 0.78600669, "learning_rate": 1.3401902695495487e-06, "loss": 0.80700743, "num_input_tokens_seen": 221631225, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36914062, "step": 10292, "time_per_iteration": 2.4843029975891113 }, { "auxiliary_loss_clip": 0.01059354, "auxiliary_loss_mlp": 0.01050049, "balance_loss_clip": 1.02003193, "balance_loss_mlp": 1.01810622, "epoch": 0.6188486397114084, "flos": 26249552576640.0, "grad_norm": 2.2784556043987965, "language_loss": 0.74439883, "learning_rate": 1.339822624710401e-06, "loss": 0.7654928, "num_input_tokens_seen": 221651035, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.41210938, "step": 10293, "time_per_iteration": 2.4044783115386963 }, { "auxiliary_loss_clip": 0.01054438, "auxiliary_loss_mlp": 0.01044451, "balance_loss_clip": 1.01961994, "balance_loss_mlp": 1.01728702, "epoch": 0.6189087629640764, "flos": 20922330240000.0, "grad_norm": 1.571774766573395, "language_loss": 0.84459788, "learning_rate": 1.3394550049052454e-06, "loss": 0.86558676, "num_input_tokens_seen": 221671300, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37109375, "step": 10294, "time_per_iteration": 2.379075288772583 }, { "auxiliary_loss_clip": 0.01055582, "auxiliary_loss_mlp": 0.0103693, "balance_loss_clip": 1.01165724, "balance_loss_mlp": 1.01729977, "epoch": 0.6189688862167443, "flos": 14828938917120.0, "grad_norm": 2.6394107789561745, "language_loss": 0.72642815, "learning_rate": 1.3390874101480225e-06, "loss": 0.74735332, "num_input_tokens_seen": 221687320, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3828125, "step": 10295, "time_per_iteration": 2.3259530067443848 }, { "auxiliary_loss_clip": 0.01053919, "auxiliary_loss_mlp": 0.01049258, "balance_loss_clip": 1.02273369, "balance_loss_mlp": 1.01670241, "epoch": 0.6190290094694123, "flos": 24283767674880.0, "grad_norm": 1.9515641311095626, "language_loss": 0.71184027, "learning_rate": 1.3387198404526705e-06, "loss": 0.73287201, "num_input_tokens_seen": 221710175, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37304688, "step": 10296, "time_per_iteration": 2.4911317825317383 }, { "auxiliary_loss_clip": 0.01056844, "auxiliary_loss_mlp": 0.01046619, "balance_loss_clip": 1.01970172, "balance_loss_mlp": 1.01795268, "epoch": 0.6190891327220802, "flos": 22528334914560.0, "grad_norm": 1.8461428611174853, "language_loss": 0.72231054, "learning_rate": 1.3383522958331287e-06, "loss": 0.7433452, "num_input_tokens_seen": 221728145, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.38867188, "step": 10297, "time_per_iteration": 2.3764867782592773 }, { "auxiliary_loss_clip": 0.01009424, "auxiliary_loss_mlp": 0.0100451, "balance_loss_clip": 1.00193524, "balance_loss_mlp": 1.00219917, "epoch": 0.6191492559747482, "flos": 67726123925760.0, "grad_norm": 0.8977100334170434, "language_loss": 0.64326662, "learning_rate": 1.3379847763033345e-06, "loss": 0.66340595, "num_input_tokens_seen": 221786100, "router_z_loss_clip": 0.02575684, "router_z_loss_mlp": 0.07226562, "step": 10298, "time_per_iteration": 2.916808605194092 }, { "auxiliary_loss_clip": 0.01055173, "auxiliary_loss_mlp": 0.01038615, "balance_loss_clip": 1.01360488, "balance_loss_mlp": 1.01676452, "epoch": 0.6192093792274163, "flos": 22345564613760.0, "grad_norm": 1.6317495176143209, "language_loss": 0.75302231, "learning_rate": 1.3376172818772236e-06, "loss": 0.77396011, "num_input_tokens_seen": 221806450, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38476562, "step": 10299, "time_per_iteration": 3.6500940322875977 }, { "auxiliary_loss_clip": 0.01056283, "auxiliary_loss_mlp": 0.01037676, "balance_loss_clip": 1.01204658, "balance_loss_mlp": 1.0166328, "epoch": 0.6192695024800842, "flos": 13553072858880.0, "grad_norm": 1.5907186791749184, "language_loss": 0.68937629, "learning_rate": 1.337249812568732e-06, "loss": 0.71031588, "num_input_tokens_seen": 221823330, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.39648438, "step": 10300, "time_per_iteration": 2.3524250984191895 }, { "auxiliary_loss_clip": 0.01057077, "auxiliary_loss_mlp": 0.01044745, "balance_loss_clip": 1.01985407, "balance_loss_mlp": 1.0183841, "epoch": 0.6193296257327522, "flos": 17414502007680.0, "grad_norm": 1.8268386698941323, "language_loss": 0.68212569, "learning_rate": 1.3368823683917939e-06, "loss": 0.70314389, "num_input_tokens_seen": 221839360, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.38671875, "step": 10301, "time_per_iteration": 2.3298444747924805 }, { "auxiliary_loss_clip": 0.01054863, "auxiliary_loss_mlp": 0.01038122, "balance_loss_clip": 1.01416111, "balance_loss_mlp": 1.01612198, "epoch": 0.6193897489854201, "flos": 31099826563200.0, "grad_norm": 1.5691273205467593, "language_loss": 0.74045044, "learning_rate": 1.3365149493603424e-06, "loss": 0.76138031, "num_input_tokens_seen": 221859465, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.38671875, "step": 10302, "time_per_iteration": 2.4525365829467773 }, { "auxiliary_loss_clip": 0.01053549, "auxiliary_loss_mlp": 0.01037607, "balance_loss_clip": 1.01272845, "balance_loss_mlp": 1.01633358, "epoch": 0.6194498722380881, "flos": 19133066505600.0, "grad_norm": 1.6929313180039014, "language_loss": 0.81947589, "learning_rate": 1.3361475554883107e-06, "loss": 0.84038746, "num_input_tokens_seen": 221878555, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.37109375, "step": 10303, "time_per_iteration": 2.35943603515625 }, { "auxiliary_loss_clip": 0.01055232, "auxiliary_loss_mlp": 0.01037763, "balance_loss_clip": 1.00998724, "balance_loss_mlp": 1.01640201, "epoch": 0.619509995490756, "flos": 21834017239680.0, "grad_norm": 1.7371303633106103, "language_loss": 0.77469724, "learning_rate": 1.3357801867896307e-06, "loss": 0.79562718, "num_input_tokens_seen": 221898790, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.38671875, "step": 10304, "time_per_iteration": 3.749577045440674 }, { "auxiliary_loss_clip": 0.01056535, "auxiliary_loss_mlp": 0.01045112, "balance_loss_clip": 1.0170145, "balance_loss_mlp": 1.01636052, "epoch": 0.619570118743424, "flos": 23805387959040.0, "grad_norm": 1.856185229947296, "language_loss": 0.78238231, "learning_rate": 1.3354128432782324e-06, "loss": 0.80339873, "num_input_tokens_seen": 221918875, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40234375, "step": 10305, "time_per_iteration": 3.7299325466156006 }, { "auxiliary_loss_clip": 0.0105857, "auxiliary_loss_mlp": 0.01043211, "balance_loss_clip": 1.01636517, "balance_loss_mlp": 1.01872826, "epoch": 0.619630241996092, "flos": 21100666798080.0, "grad_norm": 1.5879580408886338, "language_loss": 0.79563439, "learning_rate": 1.335045524968045e-06, "loss": 0.81665224, "num_input_tokens_seen": 221937895, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.3984375, "step": 10306, "time_per_iteration": 2.361147165298462 }, { "auxiliary_loss_clip": 0.01051246, "auxiliary_loss_mlp": 0.01034227, "balance_loss_clip": 1.01230478, "balance_loss_mlp": 1.015836, "epoch": 0.61969036524876, "flos": 27307036880640.0, "grad_norm": 1.633922511296341, "language_loss": 0.81282783, "learning_rate": 1.3346782318729988e-06, "loss": 0.83368266, "num_input_tokens_seen": 221955920, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35351562, "step": 10307, "time_per_iteration": 2.4607291221618652 }, { "auxiliary_loss_clip": 0.01009305, "auxiliary_loss_mlp": 0.01009695, "balance_loss_clip": 1.00752521, "balance_loss_mlp": 1.00210214, "epoch": 0.6197504885014279, "flos": 51645896547840.0, "grad_norm": 0.8016492260076375, "language_loss": 0.59420097, "learning_rate": 1.3343109640070203e-06, "loss": 0.61439097, "num_input_tokens_seen": 222011405, "router_z_loss_clip": 0.02172852, "router_z_loss_mlp": 0.07226562, "step": 10308, "time_per_iteration": 3.0528175830841064 }, { "auxiliary_loss_clip": 0.01051976, "auxiliary_loss_mlp": 0.01033816, "balance_loss_clip": 1.01133299, "balance_loss_mlp": 1.01611245, "epoch": 0.6198106117540959, "flos": 30555739935360.0, "grad_norm": 1.8832124686079856, "language_loss": 0.68878251, "learning_rate": 1.333943721384037e-06, "loss": 0.70964038, "num_input_tokens_seen": 222034545, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.359375, "step": 10309, "time_per_iteration": 2.462857246398926 }, { "auxiliary_loss_clip": 0.01052175, "auxiliary_loss_mlp": 0.01039474, "balance_loss_clip": 1.01650226, "balance_loss_mlp": 1.01623785, "epoch": 0.6198707350067638, "flos": 18908924376960.0, "grad_norm": 1.5806301970392462, "language_loss": 0.73132318, "learning_rate": 1.3335765040179746e-06, "loss": 0.75223964, "num_input_tokens_seen": 222052690, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.359375, "step": 10310, "time_per_iteration": 2.352304220199585 }, { "auxiliary_loss_clip": 0.01055858, "auxiliary_loss_mlp": 0.01042059, "balance_loss_clip": 1.01523697, "balance_loss_mlp": 1.01792216, "epoch": 0.6199308582594318, "flos": 21432795361920.0, "grad_norm": 3.7554705719012635, "language_loss": 0.79621375, "learning_rate": 1.3332093119227573e-06, "loss": 0.81719297, "num_input_tokens_seen": 222069095, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.37890625, "step": 10311, "time_per_iteration": 2.3850057125091553 }, { "auxiliary_loss_clip": 0.01053659, "auxiliary_loss_mlp": 0.01041289, "balance_loss_clip": 1.01564693, "balance_loss_mlp": 1.01604319, "epoch": 0.6199909815120999, "flos": 18406349222400.0, "grad_norm": 1.6188888866100697, "language_loss": 0.73873174, "learning_rate": 1.3328421451123105e-06, "loss": 0.75968134, "num_input_tokens_seen": 222087360, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.375, "step": 10312, "time_per_iteration": 3.8706939220428467 }, { "auxiliary_loss_clip": 0.01053712, "auxiliary_loss_mlp": 0.01039804, "balance_loss_clip": 1.01350629, "balance_loss_mlp": 1.01682067, "epoch": 0.6200511047647678, "flos": 21465893197440.0, "grad_norm": 1.9933267918420687, "language_loss": 0.73134398, "learning_rate": 1.3324750036005557e-06, "loss": 0.75227916, "num_input_tokens_seen": 222106130, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.36914062, "step": 10313, "time_per_iteration": 2.3704605102539062 }, { "auxiliary_loss_clip": 0.01055898, "auxiliary_loss_mlp": 0.01042838, "balance_loss_clip": 1.01719666, "balance_loss_mlp": 1.01715398, "epoch": 0.6201112280174358, "flos": 18215130372480.0, "grad_norm": 1.9568416383467089, "language_loss": 0.79571635, "learning_rate": 1.332107887401416e-06, "loss": 0.8167038, "num_input_tokens_seen": 222123125, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38671875, "step": 10314, "time_per_iteration": 2.3326687812805176 }, { "auxiliary_loss_clip": 0.01054547, "auxiliary_loss_mlp": 0.01039481, "balance_loss_clip": 1.01435208, "balance_loss_mlp": 1.01666141, "epoch": 0.6201713512701037, "flos": 20010154481280.0, "grad_norm": 1.7050344524696863, "language_loss": 0.79191619, "learning_rate": 1.331740796528812e-06, "loss": 0.81285644, "num_input_tokens_seen": 222140655, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37890625, "step": 10315, "time_per_iteration": 2.3799993991851807 }, { "auxiliary_loss_clip": 0.01057236, "auxiliary_loss_mlp": 0.01041795, "balance_loss_clip": 1.01686883, "balance_loss_mlp": 1.01810431, "epoch": 0.6202314745227717, "flos": 22486718707200.0, "grad_norm": 1.6794934166236695, "language_loss": 0.76978415, "learning_rate": 1.3313737309966641e-06, "loss": 0.79077446, "num_input_tokens_seen": 222160450, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.390625, "step": 10316, "time_per_iteration": 2.369307279586792 }, { "auxiliary_loss_clip": 0.01053031, "auxiliary_loss_mlp": 0.01038014, "balance_loss_clip": 1.01139414, "balance_loss_mlp": 1.01531613, "epoch": 0.6202915977754396, "flos": 26827609824000.0, "grad_norm": 1.8215213697045738, "language_loss": 0.78852355, "learning_rate": 1.3310066908188915e-06, "loss": 0.809434, "num_input_tokens_seen": 222179170, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37695312, "step": 10317, "time_per_iteration": 2.4071438312530518 }, { "auxiliary_loss_clip": 0.01009245, "auxiliary_loss_mlp": 0.01006998, "balance_loss_clip": 1.00441146, "balance_loss_mlp": 1.00192142, "epoch": 0.6203517210281076, "flos": 62739269233920.0, "grad_norm": 0.7199198521556569, "language_loss": 0.5912407, "learning_rate": 1.3306396760094122e-06, "loss": 0.61140305, "num_input_tokens_seen": 222242660, "router_z_loss_clip": 0.02587891, "router_z_loss_mlp": 0.07324219, "step": 10318, "time_per_iteration": 3.072370767593384 }, { "auxiliary_loss_clip": 0.01056019, "auxiliary_loss_mlp": 0.0104046, "balance_loss_clip": 1.01425791, "balance_loss_mlp": 1.01796484, "epoch": 0.6204118442807756, "flos": 23403153651840.0, "grad_norm": 1.5146564364093789, "language_loss": 0.79088676, "learning_rate": 1.330272686582143e-06, "loss": 0.8118515, "num_input_tokens_seen": 222262170, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38085938, "step": 10319, "time_per_iteration": 2.371276378631592 }, { "auxiliary_loss_clip": 0.01052956, "auxiliary_loss_mlp": 0.01036672, "balance_loss_clip": 1.01374817, "balance_loss_mlp": 1.01680577, "epoch": 0.6204719675334436, "flos": 20192610579840.0, "grad_norm": 2.168946592111342, "language_loss": 0.6761651, "learning_rate": 1.3299057225510013e-06, "loss": 0.69706142, "num_input_tokens_seen": 222280375, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.36132812, "step": 10320, "time_per_iteration": 2.357863426208496 }, { "auxiliary_loss_clip": 0.01053043, "auxiliary_loss_mlp": 0.01036715, "balance_loss_clip": 1.01386285, "balance_loss_mlp": 1.01631737, "epoch": 0.6205320907861115, "flos": 13187218055040.0, "grad_norm": 6.625752664763431, "language_loss": 0.77168214, "learning_rate": 1.3295387839299013e-06, "loss": 0.79257977, "num_input_tokens_seen": 222297325, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3671875, "step": 10321, "time_per_iteration": 2.3517634868621826 }, { "auxiliary_loss_clip": 0.01052825, "auxiliary_loss_mlp": 0.01033276, "balance_loss_clip": 1.01081717, "balance_loss_mlp": 1.01656437, "epoch": 0.6205922140387795, "flos": 20667324602880.0, "grad_norm": 1.7985303966577608, "language_loss": 0.74595988, "learning_rate": 1.329171870732758e-06, "loss": 0.76682085, "num_input_tokens_seen": 222317095, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.36328125, "step": 10322, "time_per_iteration": 2.355745792388916 }, { "auxiliary_loss_clip": 0.01053605, "auxiliary_loss_mlp": 0.01035392, "balance_loss_clip": 1.01276565, "balance_loss_mlp": 1.01661158, "epoch": 0.6206523372914474, "flos": 23876715600000.0, "grad_norm": 1.6204904579828854, "language_loss": 0.74622321, "learning_rate": 1.3288049829734845e-06, "loss": 0.76711315, "num_input_tokens_seen": 222337055, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.37109375, "step": 10323, "time_per_iteration": 2.4112660884857178 }, { "auxiliary_loss_clip": 0.0105865, "auxiliary_loss_mlp": 0.01043336, "balance_loss_clip": 1.01764631, "balance_loss_mlp": 1.01857066, "epoch": 0.6207124605441154, "flos": 13405774366080.0, "grad_norm": 2.3984670192435447, "language_loss": 0.5996629, "learning_rate": 1.3284381206659933e-06, "loss": 0.62068284, "num_input_tokens_seen": 222354515, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.40234375, "step": 10324, "time_per_iteration": 2.3247218132019043 }, { "auxiliary_loss_clip": 0.01055329, "auxiliary_loss_mlp": 0.0103971, "balance_loss_clip": 1.01525986, "balance_loss_mlp": 1.01818037, "epoch": 0.6207725837967835, "flos": 18915348067200.0, "grad_norm": 1.8196881155683957, "language_loss": 0.77910483, "learning_rate": 1.3280712838241956e-06, "loss": 0.80005527, "num_input_tokens_seen": 222372755, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37109375, "step": 10325, "time_per_iteration": 2.3593223094940186 }, { "auxiliary_loss_clip": 0.01055917, "auxiliary_loss_mlp": 0.01037689, "balance_loss_clip": 1.01104593, "balance_loss_mlp": 1.01690257, "epoch": 0.6208327070494514, "flos": 23979290774400.0, "grad_norm": 1.7863978289153462, "language_loss": 0.7331813, "learning_rate": 1.327704472462003e-06, "loss": 0.75411743, "num_input_tokens_seen": 222391380, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.38867188, "step": 10326, "time_per_iteration": 2.381061315536499 }, { "auxiliary_loss_clip": 0.01056266, "auxiliary_loss_mlp": 0.01048716, "balance_loss_clip": 1.02219224, "balance_loss_mlp": 1.01760566, "epoch": 0.6208928303021194, "flos": 22819301118720.0, "grad_norm": 2.8565994117141877, "language_loss": 0.75227427, "learning_rate": 1.3273376865933234e-06, "loss": 0.77332401, "num_input_tokens_seen": 222411165, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.38671875, "step": 10327, "time_per_iteration": 2.380669116973877 }, { "auxiliary_loss_clip": 0.01056462, "auxiliary_loss_mlp": 0.01044756, "balance_loss_clip": 1.01794553, "balance_loss_mlp": 1.0170213, "epoch": 0.6209529535547873, "flos": 17563615891200.0, "grad_norm": 3.6940257385244974, "language_loss": 0.8211838, "learning_rate": 1.326970926232066e-06, "loss": 0.84219599, "num_input_tokens_seen": 222428110, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.39453125, "step": 10328, "time_per_iteration": 2.326030731201172 }, { "auxiliary_loss_clip": 0.01054472, "auxiliary_loss_mlp": 0.01044254, "balance_loss_clip": 1.01956606, "balance_loss_mlp": 1.01650882, "epoch": 0.6210130768074553, "flos": 22010992254720.0, "grad_norm": 2.6188011340152793, "language_loss": 0.78938097, "learning_rate": 1.3266041913921396e-06, "loss": 0.81036818, "num_input_tokens_seen": 222446385, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.37890625, "step": 10329, "time_per_iteration": 2.407322406768799 }, { "auxiliary_loss_clip": 0.01009209, "auxiliary_loss_mlp": 0.01005976, "balance_loss_clip": 1.00350809, "balance_loss_mlp": 1.00218725, "epoch": 0.6210732000601232, "flos": 63673825040640.0, "grad_norm": 0.834593320396637, "language_loss": 0.62317693, "learning_rate": 1.3262374820874484e-06, "loss": 0.64332879, "num_input_tokens_seen": 222502150, "router_z_loss_clip": 0.0246582, "router_z_loss_mlp": 0.0703125, "step": 10330, "time_per_iteration": 2.9504528045654297 }, { "auxiliary_loss_clip": 0.01057254, "auxiliary_loss_mlp": 0.01048206, "balance_loss_clip": 1.02184844, "balance_loss_mlp": 1.0185082, "epoch": 0.6211333233127913, "flos": 24242221290240.0, "grad_norm": 2.1092282525960924, "language_loss": 0.79457295, "learning_rate": 1.3258707983319002e-06, "loss": 0.81562757, "num_input_tokens_seen": 222519880, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.38867188, "step": 10331, "time_per_iteration": 2.401834487915039 }, { "auxiliary_loss_clip": 0.0105606, "auxiliary_loss_mlp": 0.0104453, "balance_loss_clip": 1.02067602, "balance_loss_mlp": 1.01758325, "epoch": 0.6211934465654592, "flos": 16942930007040.0, "grad_norm": 2.0442346171897285, "language_loss": 0.68850219, "learning_rate": 1.3255041401393992e-06, "loss": 0.70950806, "num_input_tokens_seen": 222538545, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.38476562, "step": 10332, "time_per_iteration": 2.363570213317871 }, { "auxiliary_loss_clip": 0.01052267, "auxiliary_loss_mlp": 0.01034971, "balance_loss_clip": 1.01127183, "balance_loss_mlp": 1.01592302, "epoch": 0.6212535698181272, "flos": 15266505386880.0, "grad_norm": 1.67780319072786, "language_loss": 0.77428395, "learning_rate": 1.3251375075238476e-06, "loss": 0.7951563, "num_input_tokens_seen": 222556935, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36328125, "step": 10333, "time_per_iteration": 2.374277114868164 }, { "auxiliary_loss_clip": 0.01052975, "auxiliary_loss_mlp": 0.01039939, "balance_loss_clip": 1.01609671, "balance_loss_mlp": 1.01691544, "epoch": 0.6213136930707951, "flos": 13443096476160.0, "grad_norm": 4.904472164224447, "language_loss": 0.71535999, "learning_rate": 1.3247709004991507e-06, "loss": 0.73628908, "num_input_tokens_seen": 222574035, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.359375, "step": 10334, "time_per_iteration": 2.3289544582366943 }, { "auxiliary_loss_clip": 0.01053894, "auxiliary_loss_mlp": 0.01035792, "balance_loss_clip": 1.01451302, "balance_loss_mlp": 1.01790524, "epoch": 0.6213738163234631, "flos": 18110320871040.0, "grad_norm": 1.7256149939076006, "language_loss": 0.71313226, "learning_rate": 1.3244043190792078e-06, "loss": 0.73402905, "num_input_tokens_seen": 222592290, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.359375, "step": 10335, "time_per_iteration": 2.3913121223449707 }, { "auxiliary_loss_clip": 0.01051566, "auxiliary_loss_mlp": 0.01037419, "balance_loss_clip": 1.01426828, "balance_loss_mlp": 1.0162555, "epoch": 0.621433939576131, "flos": 25336189831680.0, "grad_norm": 1.504678699249693, "language_loss": 0.81549585, "learning_rate": 1.3240377632779213e-06, "loss": 0.83638561, "num_input_tokens_seen": 222612805, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.35351562, "step": 10336, "time_per_iteration": 2.39898943901062 }, { "auxiliary_loss_clip": 0.01051485, "auxiliary_loss_mlp": 0.01036088, "balance_loss_clip": 1.01474905, "balance_loss_mlp": 1.01626754, "epoch": 0.621494062828799, "flos": 22564504949760.0, "grad_norm": 1.7804675422657101, "language_loss": 0.7455467, "learning_rate": 1.3236712331091907e-06, "loss": 0.76642239, "num_input_tokens_seen": 222632260, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.3515625, "step": 10337, "time_per_iteration": 2.444209337234497 }, { "auxiliary_loss_clip": 0.01054724, "auxiliary_loss_mlp": 0.01038298, "balance_loss_clip": 1.01238143, "balance_loss_mlp": 1.01650012, "epoch": 0.621554186081467, "flos": 27416733972480.0, "grad_norm": 2.2293843961455058, "language_loss": 0.64943683, "learning_rate": 1.3233047285869145e-06, "loss": 0.67036706, "num_input_tokens_seen": 222653570, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3828125, "step": 10338, "time_per_iteration": 2.4169082641601562 }, { "auxiliary_loss_clip": 0.01052826, "auxiliary_loss_mlp": 0.01034982, "balance_loss_clip": 1.01211762, "balance_loss_mlp": 1.01614809, "epoch": 0.621614309334135, "flos": 22345704259200.0, "grad_norm": 1.677295296419799, "language_loss": 0.72334045, "learning_rate": 1.322938249724991e-06, "loss": 0.74421847, "num_input_tokens_seen": 222672480, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3671875, "step": 10339, "time_per_iteration": 3.6075921058654785 }, { "auxiliary_loss_clip": 0.01052205, "auxiliary_loss_mlp": 0.01034701, "balance_loss_clip": 1.01201558, "balance_loss_mlp": 1.01659977, "epoch": 0.621674432586803, "flos": 19280225352960.0, "grad_norm": 1.5194873957947403, "language_loss": 0.70773685, "learning_rate": 1.3225717965373166e-06, "loss": 0.72860587, "num_input_tokens_seen": 222691200, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35546875, "step": 10340, "time_per_iteration": 2.3465890884399414 }, { "auxiliary_loss_clip": 0.01051156, "auxiliary_loss_mlp": 0.01037724, "balance_loss_clip": 1.01514614, "balance_loss_mlp": 1.01596808, "epoch": 0.6217345558394709, "flos": 21608653213440.0, "grad_norm": 1.8917461559509743, "language_loss": 0.7086519, "learning_rate": 1.322205369037788e-06, "loss": 0.72954071, "num_input_tokens_seen": 222709975, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.3515625, "step": 10341, "time_per_iteration": 2.3737545013427734 }, { "auxiliary_loss_clip": 0.01054803, "auxiliary_loss_mlp": 0.01039064, "balance_loss_clip": 1.01501954, "balance_loss_mlp": 1.01761925, "epoch": 0.6217946790921389, "flos": 18003137397120.0, "grad_norm": 5.733564295140985, "language_loss": 0.82298255, "learning_rate": 1.321838967240299e-06, "loss": 0.84392124, "num_input_tokens_seen": 222729005, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37109375, "step": 10342, "time_per_iteration": 2.3412342071533203 }, { "auxiliary_loss_clip": 0.01009257, "auxiliary_loss_mlp": 0.01002644, "balance_loss_clip": 0.99998587, "balance_loss_mlp": 1.00215602, "epoch": 0.6218548023448068, "flos": 61970307338880.0, "grad_norm": 0.775701546284952, "language_loss": 0.57408607, "learning_rate": 1.3214725911587452e-06, "loss": 0.59420502, "num_input_tokens_seen": 222786090, "router_z_loss_clip": 0.02661133, "router_z_loss_mlp": 0.07128906, "step": 10343, "time_per_iteration": 4.372875690460205 }, { "auxiliary_loss_clip": 0.01050229, "auxiliary_loss_mlp": 0.01040786, "balance_loss_clip": 1.02048445, "balance_loss_mlp": 1.01596463, "epoch": 0.6219149255974749, "flos": 25737970291200.0, "grad_norm": 1.99436827598977, "language_loss": 0.74039197, "learning_rate": 1.3211062408070184e-06, "loss": 0.76130211, "num_input_tokens_seen": 222806100, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.34179688, "step": 10344, "time_per_iteration": 3.8489503860473633 }, { "auxiliary_loss_clip": 0.01053768, "auxiliary_loss_mlp": 0.01043465, "balance_loss_clip": 1.02012444, "balance_loss_mlp": 1.01747143, "epoch": 0.6219750488501428, "flos": 25409891445120.0, "grad_norm": 1.8180348618964912, "language_loss": 0.61310446, "learning_rate": 1.3207399161990105e-06, "loss": 0.63407671, "num_input_tokens_seen": 222826575, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36328125, "step": 10345, "time_per_iteration": 2.3838841915130615 }, { "auxiliary_loss_clip": 0.01052855, "auxiliary_loss_mlp": 0.01040656, "balance_loss_clip": 1.01744604, "balance_loss_mlp": 1.01627374, "epoch": 0.6220351721028108, "flos": 20046359427840.0, "grad_norm": 1.7175506947687915, "language_loss": 0.79313666, "learning_rate": 1.320373617348614e-06, "loss": 0.81407171, "num_input_tokens_seen": 222845285, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36523438, "step": 10346, "time_per_iteration": 2.3566694259643555 }, { "auxiliary_loss_clip": 0.01054232, "auxiliary_loss_mlp": 0.01044333, "balance_loss_clip": 1.01751113, "balance_loss_mlp": 1.01632118, "epoch": 0.6220952953554787, "flos": 27487223740800.0, "grad_norm": 1.5753575271455187, "language_loss": 0.72491145, "learning_rate": 1.3200073442697171e-06, "loss": 0.74589717, "num_input_tokens_seen": 222864575, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.37890625, "step": 10347, "time_per_iteration": 2.406557321548462 }, { "auxiliary_loss_clip": 0.01052212, "auxiliary_loss_mlp": 0.0104059, "balance_loss_clip": 1.01776123, "balance_loss_mlp": 1.01574659, "epoch": 0.6221554186081467, "flos": 19206628473600.0, "grad_norm": 1.764215531408927, "language_loss": 0.72804976, "learning_rate": 1.3196410969762108e-06, "loss": 0.74897778, "num_input_tokens_seen": 222884420, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36523438, "step": 10348, "time_per_iteration": 2.3912715911865234 }, { "auxiliary_loss_clip": 0.01008222, "auxiliary_loss_mlp": 0.01009025, "balance_loss_clip": 1.00630677, "balance_loss_mlp": 1.00117147, "epoch": 0.6222155418608146, "flos": 62947805984640.0, "grad_norm": 0.813812655238156, "language_loss": 0.54242289, "learning_rate": 1.3192748754819815e-06, "loss": 0.56259537, "num_input_tokens_seen": 222944690, "router_z_loss_clip": 0.02722168, "router_z_loss_mlp": 0.0703125, "step": 10349, "time_per_iteration": 3.0444953441619873 }, { "auxiliary_loss_clip": 0.01053792, "auxiliary_loss_mlp": 0.01038, "balance_loss_clip": 1.01589835, "balance_loss_mlp": 1.01751971, "epoch": 0.6222756651134826, "flos": 22600011669120.0, "grad_norm": 2.144965586643581, "language_loss": 0.71577364, "learning_rate": 1.3189086798009173e-06, "loss": 0.73669153, "num_input_tokens_seen": 222962990, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.36328125, "step": 10350, "time_per_iteration": 2.408726930618286 }, { "auxiliary_loss_clip": 0.01054081, "auxiliary_loss_mlp": 0.01043162, "balance_loss_clip": 1.01901054, "balance_loss_mlp": 1.01642859, "epoch": 0.6223357883661506, "flos": 21141165841920.0, "grad_norm": 1.890829756460324, "language_loss": 0.58642673, "learning_rate": 1.3185425099469046e-06, "loss": 0.60739911, "num_input_tokens_seen": 222980715, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.375, "step": 10351, "time_per_iteration": 2.3930017948150635 }, { "auxiliary_loss_clip": 0.01007691, "auxiliary_loss_mlp": 0.01015959, "balance_loss_clip": 1.01339567, "balance_loss_mlp": 1.00079894, "epoch": 0.6223959116188186, "flos": 63761595932160.0, "grad_norm": 0.8090585668858306, "language_loss": 0.61277419, "learning_rate": 1.3181763659338276e-06, "loss": 0.63301075, "num_input_tokens_seen": 223040685, "router_z_loss_clip": 0.02563477, "router_z_loss_mlp": 0.06884766, "step": 10352, "time_per_iteration": 4.415473461151123 }, { "auxiliary_loss_clip": 0.01051297, "auxiliary_loss_mlp": 0.01040472, "balance_loss_clip": 1.01761985, "balance_loss_mlp": 1.01574922, "epoch": 0.6224560348714866, "flos": 22564609683840.0, "grad_norm": 1.9343754949635912, "language_loss": 0.83055502, "learning_rate": 1.3178102477755714e-06, "loss": 0.85147274, "num_input_tokens_seen": 223059000, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35546875, "step": 10353, "time_per_iteration": 2.4050681591033936 }, { "auxiliary_loss_clip": 0.01049434, "auxiliary_loss_mlp": 0.01037956, "balance_loss_clip": 1.01622462, "balance_loss_mlp": 1.01516294, "epoch": 0.6225161581241545, "flos": 24096598542720.0, "grad_norm": 1.478828853788124, "language_loss": 0.76349938, "learning_rate": 1.3174441554860195e-06, "loss": 0.78437328, "num_input_tokens_seen": 223079345, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34375, "step": 10354, "time_per_iteration": 2.45405650138855 }, { "auxiliary_loss_clip": 0.01052999, "auxiliary_loss_mlp": 0.01038444, "balance_loss_clip": 1.014328, "balance_loss_mlp": 1.01627827, "epoch": 0.6225762813768225, "flos": 20442623892480.0, "grad_norm": 1.512867933292012, "language_loss": 0.79636383, "learning_rate": 1.3170780890790528e-06, "loss": 0.81727827, "num_input_tokens_seen": 223097880, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 10355, "time_per_iteration": 2.4461629390716553 }, { "auxiliary_loss_clip": 0.01053338, "auxiliary_loss_mlp": 0.01036768, "balance_loss_clip": 1.01527429, "balance_loss_mlp": 1.01724982, "epoch": 0.6226364046294904, "flos": 27196920852480.0, "grad_norm": 4.750038477446377, "language_loss": 0.79113263, "learning_rate": 1.3167120485685538e-06, "loss": 0.81203365, "num_input_tokens_seen": 223118185, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.359375, "step": 10356, "time_per_iteration": 2.4590542316436768 }, { "auxiliary_loss_clip": 0.01058318, "auxiliary_loss_mlp": 0.01044884, "balance_loss_clip": 1.01785946, "balance_loss_mlp": 1.01910281, "epoch": 0.6226965278821585, "flos": 20444823308160.0, "grad_norm": 2.2580023699977616, "language_loss": 0.69446462, "learning_rate": 1.3163460339684024e-06, "loss": 0.71549666, "num_input_tokens_seen": 223137600, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.390625, "step": 10357, "time_per_iteration": 2.374389886856079 }, { "auxiliary_loss_clip": 0.01057059, "auxiliary_loss_mlp": 0.01042209, "balance_loss_clip": 1.01429057, "balance_loss_mlp": 1.01819611, "epoch": 0.6227566511348264, "flos": 22161677149440.0, "grad_norm": 2.635816565190352, "language_loss": 0.76905507, "learning_rate": 1.3159800452924778e-06, "loss": 0.79004776, "num_input_tokens_seen": 223154360, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.38867188, "step": 10358, "time_per_iteration": 2.4003114700317383 }, { "auxiliary_loss_clip": 0.01054556, "auxiliary_loss_mlp": 0.01041603, "balance_loss_clip": 1.01675975, "balance_loss_mlp": 1.01698041, "epoch": 0.6228167743874944, "flos": 18039900925440.0, "grad_norm": 2.165154962910996, "language_loss": 0.8364917, "learning_rate": 1.3156140825546588e-06, "loss": 0.85745335, "num_input_tokens_seen": 223172255, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.375, "step": 10359, "time_per_iteration": 2.3589015007019043 }, { "auxiliary_loss_clip": 0.01052814, "auxiliary_loss_mlp": 0.01047115, "balance_loss_clip": 1.02299857, "balance_loss_mlp": 1.01727796, "epoch": 0.6228768976401623, "flos": 17742057183360.0, "grad_norm": 2.2542880978542588, "language_loss": 0.74243176, "learning_rate": 1.315248145768822e-06, "loss": 0.76343107, "num_input_tokens_seen": 223186965, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.35546875, "step": 10360, "time_per_iteration": 2.337226629257202 }, { "auxiliary_loss_clip": 0.0105459, "auxiliary_loss_mlp": 0.01040027, "balance_loss_clip": 1.01623333, "balance_loss_mlp": 1.01714683, "epoch": 0.6229370208928303, "flos": 17893963975680.0, "grad_norm": 2.1044991348075848, "language_loss": 0.78679955, "learning_rate": 1.3148822349488442e-06, "loss": 0.80774581, "num_input_tokens_seen": 223206045, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.375, "step": 10361, "time_per_iteration": 2.3586506843566895 }, { "auxiliary_loss_clip": 0.01053074, "auxiliary_loss_mlp": 0.01038982, "balance_loss_clip": 1.015903, "balance_loss_mlp": 1.01704717, "epoch": 0.6229971441454982, "flos": 17346805148160.0, "grad_norm": 1.723405357730571, "language_loss": 0.68679726, "learning_rate": 1.3145163501086005e-06, "loss": 0.70771784, "num_input_tokens_seen": 223224820, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36132812, "step": 10362, "time_per_iteration": 2.3847625255584717 }, { "auxiliary_loss_clip": 0.01056658, "auxiliary_loss_mlp": 0.01038214, "balance_loss_clip": 1.01312065, "balance_loss_mlp": 1.0183481, "epoch": 0.6230572673981662, "flos": 29240107971840.0, "grad_norm": 2.201055614045916, "language_loss": 0.68717456, "learning_rate": 1.3141504912619658e-06, "loss": 0.70812327, "num_input_tokens_seen": 223243205, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3828125, "step": 10363, "time_per_iteration": 2.4558916091918945 }, { "auxiliary_loss_clip": 0.01056671, "auxiliary_loss_mlp": 0.01041594, "balance_loss_clip": 1.01430702, "balance_loss_mlp": 1.01827061, "epoch": 0.6231173906508342, "flos": 16325037031680.0, "grad_norm": 2.0454598406220574, "language_loss": 0.87868273, "learning_rate": 1.3137846584228127e-06, "loss": 0.89966536, "num_input_tokens_seen": 223261370, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.38476562, "step": 10364, "time_per_iteration": 2.3656580448150635 }, { "auxiliary_loss_clip": 0.01011992, "auxiliary_loss_mlp": 0.0100734, "balance_loss_clip": 1.00518203, "balance_loss_mlp": 1.00429797, "epoch": 0.6231775139035022, "flos": 68699119006080.0, "grad_norm": 0.8819379211903688, "language_loss": 0.60916281, "learning_rate": 1.313418851605015e-06, "loss": 0.62935615, "num_input_tokens_seen": 223315050, "router_z_loss_clip": 0.02160645, "router_z_loss_mlp": 0.07714844, "step": 10365, "time_per_iteration": 3.0280003547668457 }, { "auxiliary_loss_clip": 0.01057873, "auxiliary_loss_mlp": 0.0104876, "balance_loss_clip": 1.02012622, "balance_loss_mlp": 1.01846504, "epoch": 0.6232376371561702, "flos": 19820227351680.0, "grad_norm": 1.7282713085913322, "language_loss": 0.76847243, "learning_rate": 1.3130530708224427e-06, "loss": 0.7895388, "num_input_tokens_seen": 223332130, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.39453125, "step": 10366, "time_per_iteration": 2.3816006183624268 }, { "auxiliary_loss_clip": 0.01055338, "auxiliary_loss_mlp": 0.01042214, "balance_loss_clip": 1.01679897, "balance_loss_mlp": 1.01749349, "epoch": 0.6232977604088381, "flos": 23257146879360.0, "grad_norm": 1.8525592566391478, "language_loss": 0.78050572, "learning_rate": 1.3126873160889665e-06, "loss": 0.80148125, "num_input_tokens_seen": 223351605, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37890625, "step": 10367, "time_per_iteration": 2.45912766456604 }, { "auxiliary_loss_clip": 0.01053599, "auxiliary_loss_mlp": 0.0103661, "balance_loss_clip": 1.0124104, "balance_loss_mlp": 1.0175997, "epoch": 0.6233578836615061, "flos": 21105344920320.0, "grad_norm": 1.6354004855902522, "language_loss": 0.79289371, "learning_rate": 1.312321587418457e-06, "loss": 0.8137958, "num_input_tokens_seen": 223372090, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.359375, "step": 10368, "time_per_iteration": 2.399183750152588 }, { "auxiliary_loss_clip": 0.01055217, "auxiliary_loss_mlp": 0.01039407, "balance_loss_clip": 1.01361001, "balance_loss_mlp": 1.01776361, "epoch": 0.623418006914174, "flos": 23768275317120.0, "grad_norm": 1.8066895665749894, "language_loss": 0.70241654, "learning_rate": 1.3119558848247811e-06, "loss": 0.7233628, "num_input_tokens_seen": 223390110, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.375, "step": 10369, "time_per_iteration": 2.425150156021118 }, { "auxiliary_loss_clip": 0.01054611, "auxiliary_loss_mlp": 0.01042603, "balance_loss_clip": 1.01798606, "balance_loss_mlp": 1.01670289, "epoch": 0.6234781301668421, "flos": 17889634967040.0, "grad_norm": 2.0829942566115727, "language_loss": 0.89214563, "learning_rate": 1.3115902083218072e-06, "loss": 0.91311777, "num_input_tokens_seen": 223404205, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37890625, "step": 10370, "time_per_iteration": 2.3363070487976074 }, { "auxiliary_loss_clip": 0.010509, "auxiliary_loss_mlp": 0.01035387, "balance_loss_clip": 1.01165271, "balance_loss_mlp": 1.01570845, "epoch": 0.62353825341951, "flos": 26174349774720.0, "grad_norm": 1.415569294073299, "language_loss": 0.66857195, "learning_rate": 1.311224557923402e-06, "loss": 0.68943477, "num_input_tokens_seen": 223424855, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.35351562, "step": 10371, "time_per_iteration": 2.4231669902801514 }, { "auxiliary_loss_clip": 0.01047885, "auxiliary_loss_mlp": 0.01032474, "balance_loss_clip": 1.01371098, "balance_loss_mlp": 1.01482391, "epoch": 0.623598376672178, "flos": 31138545127680.0, "grad_norm": 1.675645161935773, "language_loss": 0.77849656, "learning_rate": 1.3108589336434298e-06, "loss": 0.79930013, "num_input_tokens_seen": 223447225, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.33007812, "step": 10372, "time_per_iteration": 2.47499418258667 }, { "auxiliary_loss_clip": 0.01053687, "auxiliary_loss_mlp": 0.01040837, "balance_loss_clip": 1.0170188, "balance_loss_mlp": 1.01653159, "epoch": 0.6236584999248459, "flos": 23729137816320.0, "grad_norm": 1.5715844806074482, "language_loss": 0.78089297, "learning_rate": 1.3104933354957568e-06, "loss": 0.80183816, "num_input_tokens_seen": 223467520, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 10373, "time_per_iteration": 2.402592420578003 }, { "auxiliary_loss_clip": 0.01050558, "auxiliary_loss_mlp": 0.01041836, "balance_loss_clip": 1.019449, "balance_loss_mlp": 1.01555681, "epoch": 0.6237186231775139, "flos": 21761677169280.0, "grad_norm": 1.4724963280863845, "language_loss": 0.70541281, "learning_rate": 1.3101277634942448e-06, "loss": 0.72633684, "num_input_tokens_seen": 223488130, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34960938, "step": 10374, "time_per_iteration": 2.4063589572906494 }, { "auxiliary_loss_clip": 0.01052249, "auxiliary_loss_mlp": 0.01040545, "balance_loss_clip": 1.01878881, "balance_loss_mlp": 1.01603293, "epoch": 0.6237787464301818, "flos": 14938601097600.0, "grad_norm": 1.6915383108618636, "language_loss": 0.78314036, "learning_rate": 1.3097622176527577e-06, "loss": 0.80406833, "num_input_tokens_seen": 223505105, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.36132812, "step": 10375, "time_per_iteration": 2.3391923904418945 }, { "auxiliary_loss_clip": 0.01052199, "auxiliary_loss_mlp": 0.01041211, "balance_loss_clip": 1.01941943, "balance_loss_mlp": 1.01702964, "epoch": 0.6238388696828499, "flos": 35588854045440.0, "grad_norm": 1.4071605051182923, "language_loss": 0.71686786, "learning_rate": 1.3093966979851566e-06, "loss": 0.73780191, "num_input_tokens_seen": 223528065, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.3515625, "step": 10376, "time_per_iteration": 2.543391466140747 }, { "auxiliary_loss_clip": 0.01053741, "auxiliary_loss_mlp": 0.01047616, "balance_loss_clip": 1.02173531, "balance_loss_mlp": 1.01623321, "epoch": 0.6238989929355178, "flos": 23622373278720.0, "grad_norm": 1.573291646142195, "language_loss": 0.78272569, "learning_rate": 1.309031204505301e-06, "loss": 0.80373931, "num_input_tokens_seen": 223547305, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.375, "step": 10377, "time_per_iteration": 2.4061696529388428 }, { "auxiliary_loss_clip": 0.01053483, "auxiliary_loss_mlp": 0.01035269, "balance_loss_clip": 1.01223803, "balance_loss_mlp": 1.01721096, "epoch": 0.6239591161881858, "flos": 22086474347520.0, "grad_norm": 1.7015181006235787, "language_loss": 0.69565392, "learning_rate": 1.308665737227052e-06, "loss": 0.71654141, "num_input_tokens_seen": 223567205, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36328125, "step": 10378, "time_per_iteration": 3.627535581588745 }, { "auxiliary_loss_clip": 0.01052879, "auxiliary_loss_mlp": 0.01038257, "balance_loss_clip": 1.01555967, "balance_loss_mlp": 1.01597142, "epoch": 0.6240192394408538, "flos": 24534758505600.0, "grad_norm": 1.7072616048456617, "language_loss": 0.77200609, "learning_rate": 1.3083002961642675e-06, "loss": 0.79291743, "num_input_tokens_seen": 223586560, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36914062, "step": 10379, "time_per_iteration": 2.4317829608917236 }, { "auxiliary_loss_clip": 0.01052552, "auxiliary_loss_mlp": 0.01038453, "balance_loss_clip": 1.01560068, "balance_loss_mlp": 1.01630712, "epoch": 0.6240793626935217, "flos": 27930585496320.0, "grad_norm": 1.4653428445869618, "language_loss": 0.79947734, "learning_rate": 1.3079348813308051e-06, "loss": 0.82038736, "num_input_tokens_seen": 223610595, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36328125, "step": 10380, "time_per_iteration": 2.4452779293060303 }, { "auxiliary_loss_clip": 0.01052739, "auxiliary_loss_mlp": 0.01037865, "balance_loss_clip": 1.01769471, "balance_loss_mlp": 1.01790142, "epoch": 0.6241394859461897, "flos": 22891431720960.0, "grad_norm": 1.532092472179868, "language_loss": 0.80638897, "learning_rate": 1.3075694927405207e-06, "loss": 0.82729495, "num_input_tokens_seen": 223630230, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.34765625, "step": 10381, "time_per_iteration": 2.424164056777954 }, { "auxiliary_loss_clip": 0.01054272, "auxiliary_loss_mlp": 0.01035424, "balance_loss_clip": 1.01180863, "balance_loss_mlp": 1.01737905, "epoch": 0.6241996091988576, "flos": 12749930876160.0, "grad_norm": 1.850215485497891, "language_loss": 0.75545204, "learning_rate": 1.3072041304072718e-06, "loss": 0.77634895, "num_input_tokens_seen": 223648360, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36914062, "step": 10382, "time_per_iteration": 2.3303074836730957 }, { "auxiliary_loss_clip": 0.01050913, "auxiliary_loss_mlp": 0.01031589, "balance_loss_clip": 1.00999999, "balance_loss_mlp": 1.01606536, "epoch": 0.6242597324515257, "flos": 25850041355520.0, "grad_norm": 1.401425278727459, "language_loss": 0.79464471, "learning_rate": 1.306838794344911e-06, "loss": 0.81546974, "num_input_tokens_seen": 223671255, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.34960938, "step": 10383, "time_per_iteration": 3.860297679901123 }, { "auxiliary_loss_clip": 0.01052503, "auxiliary_loss_mlp": 0.01034235, "balance_loss_clip": 1.01251531, "balance_loss_mlp": 1.01706171, "epoch": 0.6243198557041936, "flos": 19936697247360.0, "grad_norm": 1.9574102467733465, "language_loss": 0.76416814, "learning_rate": 1.3064734845672925e-06, "loss": 0.78503549, "num_input_tokens_seen": 223689860, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.35351562, "step": 10384, "time_per_iteration": 3.7070844173431396 }, { "auxiliary_loss_clip": 0.01053655, "auxiliary_loss_mlp": 0.01039269, "balance_loss_clip": 1.01526022, "balance_loss_mlp": 1.01678753, "epoch": 0.6243799789568616, "flos": 18405197147520.0, "grad_norm": 1.9423526662972488, "language_loss": 0.6768074, "learning_rate": 1.3061082010882694e-06, "loss": 0.69773668, "num_input_tokens_seen": 223707835, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 10385, "time_per_iteration": 2.350911855697632 }, { "auxiliary_loss_clip": 0.01009698, "auxiliary_loss_mlp": 0.01014362, "balance_loss_clip": 1.01169205, "balance_loss_mlp": 1.00266314, "epoch": 0.6244401022095295, "flos": 66024037589760.0, "grad_norm": 0.7613603335947061, "language_loss": 0.62089193, "learning_rate": 1.305742943921692e-06, "loss": 0.64113259, "num_input_tokens_seen": 223771875, "router_z_loss_clip": 0.0267334, "router_z_loss_mlp": 0.0703125, "step": 10386, "time_per_iteration": 3.0550146102905273 }, { "auxiliary_loss_clip": 0.01054037, "auxiliary_loss_mlp": 0.01038133, "balance_loss_clip": 1.01390982, "balance_loss_mlp": 1.0169754, "epoch": 0.6245002254621975, "flos": 24570125579520.0, "grad_norm": 2.314102804626421, "language_loss": 0.72964084, "learning_rate": 1.3053777130814128e-06, "loss": 0.75056255, "num_input_tokens_seen": 223788895, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37109375, "step": 10387, "time_per_iteration": 2.382514715194702 }, { "auxiliary_loss_clip": 0.01058721, "auxiliary_loss_mlp": 0.01043287, "balance_loss_clip": 1.01729965, "balance_loss_mlp": 1.01875198, "epoch": 0.6245603487148654, "flos": 29167558433280.0, "grad_norm": 2.065989784685282, "language_loss": 0.66475785, "learning_rate": 1.3050125085812798e-06, "loss": 0.6857779, "num_input_tokens_seen": 223810385, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.40039062, "step": 10388, "time_per_iteration": 2.445387125015259 }, { "auxiliary_loss_clip": 0.01053355, "auxiliary_loss_mlp": 0.0103251, "balance_loss_clip": 1.01131415, "balance_loss_mlp": 1.01735973, "epoch": 0.6246204719675335, "flos": 14789312657280.0, "grad_norm": 1.5590397146944934, "language_loss": 0.80171514, "learning_rate": 1.3046473304351417e-06, "loss": 0.82257378, "num_input_tokens_seen": 223826040, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.36132812, "step": 10389, "time_per_iteration": 2.3365225791931152 }, { "auxiliary_loss_clip": 0.01053177, "auxiliary_loss_mlp": 0.01038678, "balance_loss_clip": 1.01441884, "balance_loss_mlp": 1.01757646, "epoch": 0.6246805952202014, "flos": 12492760734720.0, "grad_norm": 1.676858400231038, "language_loss": 0.61217564, "learning_rate": 1.3042821786568475e-06, "loss": 0.63309419, "num_input_tokens_seen": 223842300, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.35546875, "step": 10390, "time_per_iteration": 2.378221273422241 }, { "auxiliary_loss_clip": 0.01054673, "auxiliary_loss_mlp": 0.01040947, "balance_loss_clip": 1.01853561, "balance_loss_mlp": 1.01768661, "epoch": 0.6247407184728694, "flos": 12785856531840.0, "grad_norm": 1.8529478692312638, "language_loss": 0.79009598, "learning_rate": 1.3039170532602416e-06, "loss": 0.8110522, "num_input_tokens_seen": 223858320, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.36914062, "step": 10391, "time_per_iteration": 2.3453776836395264 }, { "auxiliary_loss_clip": 0.01055609, "auxiliary_loss_mlp": 0.01040564, "balance_loss_clip": 1.01729476, "balance_loss_mlp": 1.0194447, "epoch": 0.6248008417255374, "flos": 40627484150400.0, "grad_norm": 1.769448494277114, "language_loss": 0.65832156, "learning_rate": 1.3035519542591718e-06, "loss": 0.67928326, "num_input_tokens_seen": 223883545, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36132812, "step": 10392, "time_per_iteration": 3.9396097660064697 }, { "auxiliary_loss_clip": 0.01055013, "auxiliary_loss_mlp": 0.01043146, "balance_loss_clip": 1.01821959, "balance_loss_mlp": 1.01799822, "epoch": 0.6248609649782053, "flos": 19900981059840.0, "grad_norm": 1.8247558843499425, "language_loss": 0.77717435, "learning_rate": 1.3031868816674819e-06, "loss": 0.7981559, "num_input_tokens_seen": 223901445, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.36914062, "step": 10393, "time_per_iteration": 2.352872848510742 }, { "auxiliary_loss_clip": 0.01055475, "auxiliary_loss_mlp": 0.01043752, "balance_loss_clip": 1.02024412, "balance_loss_mlp": 1.01880813, "epoch": 0.6249210882308733, "flos": 19681726521600.0, "grad_norm": 1.7577350271273602, "language_loss": 0.8382569, "learning_rate": 1.3028218354990142e-06, "loss": 0.85924917, "num_input_tokens_seen": 223920170, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.3671875, "step": 10394, "time_per_iteration": 2.416154623031616 }, { "auxiliary_loss_clip": 0.01055078, "auxiliary_loss_mlp": 0.01044674, "balance_loss_clip": 1.0208081, "balance_loss_mlp": 1.01788104, "epoch": 0.6249812114835412, "flos": 13989871278720.0, "grad_norm": 1.8502069453458474, "language_loss": 0.76604658, "learning_rate": 1.3024568157676128e-06, "loss": 0.78704411, "num_input_tokens_seen": 223936495, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 10395, "time_per_iteration": 2.3470935821533203 }, { "auxiliary_loss_clip": 0.0105578, "auxiliary_loss_mlp": 0.01043067, "balance_loss_clip": 1.01903486, "balance_loss_mlp": 1.0178858, "epoch": 0.6250413347362093, "flos": 14529384518400.0, "grad_norm": 2.418631137979976, "language_loss": 0.73625791, "learning_rate": 1.302091822487119e-06, "loss": 0.75724649, "num_input_tokens_seen": 223950070, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37890625, "step": 10396, "time_per_iteration": 2.3322646617889404 }, { "auxiliary_loss_clip": 0.01054565, "auxiliary_loss_mlp": 0.01039238, "balance_loss_clip": 1.01701748, "balance_loss_mlp": 1.01791549, "epoch": 0.6251014579888772, "flos": 22961991312000.0, "grad_norm": 1.9033091483329845, "language_loss": 0.77224505, "learning_rate": 1.3017268556713732e-06, "loss": 0.79318309, "num_input_tokens_seen": 223970065, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3671875, "step": 10397, "time_per_iteration": 2.373284339904785 }, { "auxiliary_loss_clip": 0.01053702, "auxiliary_loss_mlp": 0.01040004, "balance_loss_clip": 1.01656771, "balance_loss_mlp": 1.01788378, "epoch": 0.6251615812415452, "flos": 28109969395200.0, "grad_norm": 2.5835589994262986, "language_loss": 0.7682333, "learning_rate": 1.3013619153342154e-06, "loss": 0.78917038, "num_input_tokens_seen": 223990315, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35742188, "step": 10398, "time_per_iteration": 2.487374782562256 }, { "auxiliary_loss_clip": 0.01053618, "auxiliary_loss_mlp": 0.0104341, "balance_loss_clip": 1.01984239, "balance_loss_mlp": 1.01679373, "epoch": 0.6252217044942131, "flos": 26723254170240.0, "grad_norm": 1.738572524526894, "language_loss": 0.75255007, "learning_rate": 1.300997001489483e-06, "loss": 0.77352035, "num_input_tokens_seen": 224009960, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36914062, "step": 10399, "time_per_iteration": 2.4130094051361084 }, { "auxiliary_loss_clip": 0.01053161, "auxiliary_loss_mlp": 0.01044538, "balance_loss_clip": 1.02014732, "balance_loss_mlp": 1.01713037, "epoch": 0.6252818277468811, "flos": 20005860384000.0, "grad_norm": 1.9416818484992924, "language_loss": 0.75680745, "learning_rate": 1.3006321141510147e-06, "loss": 0.77778435, "num_input_tokens_seen": 224028870, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.359375, "step": 10400, "time_per_iteration": 2.385928153991699 }, { "auxiliary_loss_clip": 0.01008595, "auxiliary_loss_mlp": 0.01003157, "balance_loss_clip": 1.00096369, "balance_loss_mlp": 1.00167942, "epoch": 0.625341950999549, "flos": 59274907511040.0, "grad_norm": 0.806394618414399, "language_loss": 0.56494671, "learning_rate": 1.3002672533326465e-06, "loss": 0.58506423, "num_input_tokens_seen": 224094140, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.06933594, "step": 10401, "time_per_iteration": 3.077420473098755 }, { "auxiliary_loss_clip": 0.01055005, "auxiliary_loss_mlp": 0.01042464, "balance_loss_clip": 1.01913452, "balance_loss_mlp": 1.01790309, "epoch": 0.625402074252217, "flos": 20156056519680.0, "grad_norm": 2.202435230977698, "language_loss": 0.84251082, "learning_rate": 1.2999024190482146e-06, "loss": 0.86348546, "num_input_tokens_seen": 224113235, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37109375, "step": 10402, "time_per_iteration": 2.37788987159729 }, { "auxiliary_loss_clip": 0.0105205, "auxiliary_loss_mlp": 0.01041832, "balance_loss_clip": 1.01875329, "balance_loss_mlp": 1.01595449, "epoch": 0.625462197504885, "flos": 29131248752640.0, "grad_norm": 1.9496345276339333, "language_loss": 0.70224255, "learning_rate": 1.2995376113115527e-06, "loss": 0.72318137, "num_input_tokens_seen": 224134530, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36132812, "step": 10403, "time_per_iteration": 2.4419331550598145 }, { "auxiliary_loss_clip": 0.01054097, "auxiliary_loss_mlp": 0.01041003, "balance_loss_clip": 1.01657724, "balance_loss_mlp": 1.01648772, "epoch": 0.625522320757553, "flos": 26103231601920.0, "grad_norm": 1.5468519915695669, "language_loss": 0.73480642, "learning_rate": 1.2991728301364954e-06, "loss": 0.75575739, "num_input_tokens_seen": 224154170, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.375, "step": 10404, "time_per_iteration": 2.4396274089813232 }, { "auxiliary_loss_clip": 0.01054059, "auxiliary_loss_mlp": 0.01042421, "balance_loss_clip": 1.01844847, "balance_loss_mlp": 1.01727653, "epoch": 0.625582444010221, "flos": 20629932670080.0, "grad_norm": 1.7876897860227412, "language_loss": 0.7105279, "learning_rate": 1.2988080755368742e-06, "loss": 0.73149276, "num_input_tokens_seen": 224172730, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 10405, "time_per_iteration": 2.35520601272583 }, { "auxiliary_loss_clip": 0.01053794, "auxiliary_loss_mlp": 0.0104517, "balance_loss_clip": 1.02105427, "balance_loss_mlp": 1.01674199, "epoch": 0.6256425672628889, "flos": 20520479957760.0, "grad_norm": 1.572519998634553, "language_loss": 0.79665995, "learning_rate": 1.2984433475265207e-06, "loss": 0.81764966, "num_input_tokens_seen": 224192620, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.37109375, "step": 10406, "time_per_iteration": 2.39420747756958 }, { "auxiliary_loss_clip": 0.01054422, "auxiliary_loss_mlp": 0.01045614, "balance_loss_clip": 1.02190351, "balance_loss_mlp": 1.017838, "epoch": 0.6257026905155569, "flos": 29528036887680.0, "grad_norm": 1.6690621680314628, "language_loss": 0.7019639, "learning_rate": 1.2980786461192666e-06, "loss": 0.72296429, "num_input_tokens_seen": 224214660, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.3671875, "step": 10407, "time_per_iteration": 2.4289591312408447 }, { "auxiliary_loss_clip": 0.01051384, "auxiliary_loss_mlp": 0.01043783, "balance_loss_clip": 1.02201533, "balance_loss_mlp": 1.01644075, "epoch": 0.6257628137682248, "flos": 24023734801920.0, "grad_norm": 1.7165498652451934, "language_loss": 0.86782885, "learning_rate": 1.2977139713289398e-06, "loss": 0.88878053, "num_input_tokens_seen": 224234170, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34960938, "step": 10408, "time_per_iteration": 2.4138877391815186 }, { "auxiliary_loss_clip": 0.01052313, "auxiliary_loss_mlp": 0.01042705, "balance_loss_clip": 1.02036488, "balance_loss_mlp": 1.01633728, "epoch": 0.6258229370208929, "flos": 20849885435520.0, "grad_norm": 1.6627959219809607, "language_loss": 0.8040418, "learning_rate": 1.2973493231693699e-06, "loss": 0.82499206, "num_input_tokens_seen": 224253115, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.359375, "step": 10409, "time_per_iteration": 2.380603313446045 }, { "auxiliary_loss_clip": 0.01051628, "auxiliary_loss_mlp": 0.01039578, "balance_loss_clip": 1.01671338, "balance_loss_mlp": 1.01532733, "epoch": 0.6258830602735608, "flos": 22230595906560.0, "grad_norm": 2.8588586005566885, "language_loss": 0.71231639, "learning_rate": 1.2969847016543845e-06, "loss": 0.73322845, "num_input_tokens_seen": 224271375, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36328125, "step": 10410, "time_per_iteration": 2.3872859477996826 }, { "auxiliary_loss_clip": 0.01052816, "auxiliary_loss_mlp": 0.01040115, "balance_loss_clip": 1.01772738, "balance_loss_mlp": 1.01723766, "epoch": 0.6259431835262288, "flos": 25075877667840.0, "grad_norm": 1.761882910366046, "language_loss": 0.68642527, "learning_rate": 1.2966201067978086e-06, "loss": 0.70735455, "num_input_tokens_seen": 224290315, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35546875, "step": 10411, "time_per_iteration": 2.4011940956115723 }, { "auxiliary_loss_clip": 0.01053731, "auxiliary_loss_mlp": 0.01043735, "balance_loss_clip": 1.01953506, "balance_loss_mlp": 1.01679504, "epoch": 0.6260033067788967, "flos": 28251158400000.0, "grad_norm": 2.001412536550126, "language_loss": 0.71159488, "learning_rate": 1.2962555386134702e-06, "loss": 0.73256952, "num_input_tokens_seen": 224310545, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36914062, "step": 10412, "time_per_iteration": 2.476613759994507 }, { "auxiliary_loss_clip": 0.01052065, "auxiliary_loss_mlp": 0.01039512, "balance_loss_clip": 1.01745856, "balance_loss_mlp": 1.01644361, "epoch": 0.6260634300315647, "flos": 23366320300800.0, "grad_norm": 8.203350369684953, "language_loss": 0.70451885, "learning_rate": 1.2958909971151908e-06, "loss": 0.7254346, "num_input_tokens_seen": 224331115, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35742188, "step": 10413, "time_per_iteration": 2.3794362545013428 }, { "auxiliary_loss_clip": 0.01055928, "auxiliary_loss_mlp": 0.01045688, "balance_loss_clip": 1.01793635, "balance_loss_mlp": 1.01699185, "epoch": 0.6261235532842326, "flos": 18034489664640.0, "grad_norm": 2.8099916866485297, "language_loss": 0.8126049, "learning_rate": 1.295526482316796e-06, "loss": 0.83362103, "num_input_tokens_seen": 224347525, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.390625, "step": 10414, "time_per_iteration": 2.3492469787597656 }, { "auxiliary_loss_clip": 0.01056347, "auxiliary_loss_mlp": 0.0104819, "balance_loss_clip": 1.02282262, "balance_loss_mlp": 1.01862228, "epoch": 0.6261836765369007, "flos": 22010363850240.0, "grad_norm": 1.776530005457135, "language_loss": 0.75674725, "learning_rate": 1.2951619942321083e-06, "loss": 0.77779263, "num_input_tokens_seen": 224367045, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37695312, "step": 10415, "time_per_iteration": 2.4029667377471924 }, { "auxiliary_loss_clip": 0.01053897, "auxiliary_loss_mlp": 0.0103741, "balance_loss_clip": 1.01462889, "balance_loss_mlp": 1.01743698, "epoch": 0.6262437997895686, "flos": 24934863219840.0, "grad_norm": 1.7724508029424213, "language_loss": 0.75616497, "learning_rate": 1.2947975328749472e-06, "loss": 0.77707803, "num_input_tokens_seen": 224388860, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36523438, "step": 10416, "time_per_iteration": 2.4498000144958496 }, { "auxiliary_loss_clip": 0.01054554, "auxiliary_loss_mlp": 0.01034214, "balance_loss_clip": 1.01210022, "balance_loss_mlp": 1.01900053, "epoch": 0.6263039230422366, "flos": 31607219485440.0, "grad_norm": 1.5276670756965094, "language_loss": 0.85273445, "learning_rate": 1.2944330982591352e-06, "loss": 0.87362218, "num_input_tokens_seen": 224409645, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35546875, "step": 10417, "time_per_iteration": 2.4961016178131104 }, { "auxiliary_loss_clip": 0.01054565, "auxiliary_loss_mlp": 0.01038576, "balance_loss_clip": 1.01414967, "balance_loss_mlp": 1.01828229, "epoch": 0.6263640462949046, "flos": 17638504490880.0, "grad_norm": 2.208849435474345, "language_loss": 0.58673292, "learning_rate": 1.2940686903984904e-06, "loss": 0.60766435, "num_input_tokens_seen": 224428530, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36328125, "step": 10418, "time_per_iteration": 3.609483003616333 }, { "auxiliary_loss_clip": 0.0105849, "auxiliary_loss_mlp": 0.01041613, "balance_loss_clip": 1.01485062, "balance_loss_mlp": 1.01877141, "epoch": 0.6264241695475725, "flos": 19973914623360.0, "grad_norm": 1.817707450002884, "language_loss": 0.85063887, "learning_rate": 1.2937043093068316e-06, "loss": 0.87163985, "num_input_tokens_seen": 224447175, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.39648438, "step": 10419, "time_per_iteration": 2.356804132461548 }, { "auxiliary_loss_clip": 0.01054497, "auxiliary_loss_mlp": 0.01035682, "balance_loss_clip": 1.0125432, "balance_loss_mlp": 1.01770234, "epoch": 0.6264842928002405, "flos": 27343102181760.0, "grad_norm": 1.714348133702366, "language_loss": 0.65384924, "learning_rate": 1.2933399549979762e-06, "loss": 0.67475104, "num_input_tokens_seen": 224469445, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.3671875, "step": 10420, "time_per_iteration": 2.503810167312622 }, { "auxiliary_loss_clip": 0.01056972, "auxiliary_loss_mlp": 0.01041867, "balance_loss_clip": 1.01415038, "balance_loss_mlp": 1.01939845, "epoch": 0.6265444160529084, "flos": 22996311045120.0, "grad_norm": 1.7521513655525907, "language_loss": 0.872679, "learning_rate": 1.292975627485741e-06, "loss": 0.89366734, "num_input_tokens_seen": 224486590, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.375, "step": 10421, "time_per_iteration": 2.376208543777466 }, { "auxiliary_loss_clip": 0.01055824, "auxiliary_loss_mlp": 0.01035999, "balance_loss_clip": 1.01301527, "balance_loss_mlp": 1.01883101, "epoch": 0.6266045393055765, "flos": 19937290740480.0, "grad_norm": 2.349912281322267, "language_loss": 0.80629253, "learning_rate": 1.2926113267839403e-06, "loss": 0.82721072, "num_input_tokens_seen": 224502795, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36914062, "step": 10422, "time_per_iteration": 3.7676563262939453 }, { "auxiliary_loss_clip": 0.0105241, "auxiliary_loss_mlp": 0.01031225, "balance_loss_clip": 1.0080862, "balance_loss_mlp": 1.01687336, "epoch": 0.6266646625582444, "flos": 24387948771840.0, "grad_norm": 1.5751908297218915, "language_loss": 0.75787604, "learning_rate": 1.292247052906389e-06, "loss": 0.77871239, "num_input_tokens_seen": 224522300, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.35546875, "step": 10423, "time_per_iteration": 3.90198016166687 }, { "auxiliary_loss_clip": 0.01054665, "auxiliary_loss_mlp": 0.01033029, "balance_loss_clip": 1.01020026, "balance_loss_mlp": 1.01797962, "epoch": 0.6267247858109124, "flos": 14682932144640.0, "grad_norm": 1.7947600463761983, "language_loss": 0.78213811, "learning_rate": 1.2918828058669004e-06, "loss": 0.80301505, "num_input_tokens_seen": 224538260, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3671875, "step": 10424, "time_per_iteration": 2.3382835388183594 }, { "auxiliary_loss_clip": 0.01053798, "auxiliary_loss_mlp": 0.010426, "balance_loss_clip": 1.01543236, "balance_loss_mlp": 1.01783872, "epoch": 0.6267849090635803, "flos": 24928998111360.0, "grad_norm": 1.933031412551169, "language_loss": 0.7017858, "learning_rate": 1.2915185856792868e-06, "loss": 0.72274971, "num_input_tokens_seen": 224559155, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.359375, "step": 10425, "time_per_iteration": 2.42175555229187 }, { "auxiliary_loss_clip": 0.01051218, "auxiliary_loss_mlp": 0.01032659, "balance_loss_clip": 1.01176202, "balance_loss_mlp": 1.01681626, "epoch": 0.6268450323162483, "flos": 25336678590720.0, "grad_norm": 1.5233851794142177, "language_loss": 0.75649893, "learning_rate": 1.2911543923573598e-06, "loss": 0.77733773, "num_input_tokens_seen": 224578660, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34375, "step": 10426, "time_per_iteration": 2.403594732284546 }, { "auxiliary_loss_clip": 0.01055293, "auxiliary_loss_mlp": 0.01036904, "balance_loss_clip": 1.01296639, "balance_loss_mlp": 1.01839304, "epoch": 0.6269051555689162, "flos": 26176095342720.0, "grad_norm": 1.545790664009311, "language_loss": 0.81243455, "learning_rate": 1.290790225914929e-06, "loss": 0.83335644, "num_input_tokens_seen": 224599080, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36914062, "step": 10427, "time_per_iteration": 2.4388580322265625 }, { "auxiliary_loss_clip": 0.01057837, "auxiliary_loss_mlp": 0.01037951, "balance_loss_clip": 1.01329875, "balance_loss_mlp": 1.01932955, "epoch": 0.6269652788215843, "flos": 18255978529920.0, "grad_norm": 1.7991189765076165, "language_loss": 0.70047635, "learning_rate": 1.2904260863658034e-06, "loss": 0.72143424, "num_input_tokens_seen": 224614225, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.38476562, "step": 10428, "time_per_iteration": 2.325345277786255 }, { "auxiliary_loss_clip": 0.01053583, "auxiliary_loss_mlp": 0.01041867, "balance_loss_clip": 1.01652288, "balance_loss_mlp": 1.01727784, "epoch": 0.6270254020742522, "flos": 11764612085760.0, "grad_norm": 1.7053976134569133, "language_loss": 0.73133922, "learning_rate": 1.2900619737237928e-06, "loss": 0.75229371, "num_input_tokens_seen": 224632365, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36328125, "step": 10429, "time_per_iteration": 2.359295606613159 }, { "auxiliary_loss_clip": 0.01056685, "auxiliary_loss_mlp": 0.01043443, "balance_loss_clip": 1.0177536, "balance_loss_mlp": 1.01812685, "epoch": 0.6270855253269202, "flos": 23474551115520.0, "grad_norm": 1.753916059204552, "language_loss": 0.80374253, "learning_rate": 1.2896978880027023e-06, "loss": 0.82474387, "num_input_tokens_seen": 224651125, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38671875, "step": 10430, "time_per_iteration": 2.3883137702941895 }, { "auxiliary_loss_clip": 0.01011688, "auxiliary_loss_mlp": 0.01002794, "balance_loss_clip": 1.00012374, "balance_loss_mlp": 1.00432491, "epoch": 0.6271456485795882, "flos": 70061219856000.0, "grad_norm": 0.7664526577515368, "language_loss": 0.59165394, "learning_rate": 1.2893338292163393e-06, "loss": 0.61179876, "num_input_tokens_seen": 224716115, "router_z_loss_clip": 0.0267334, "router_z_loss_mlp": 0.07373047, "step": 10431, "time_per_iteration": 3.1403627395629883 }, { "auxiliary_loss_clip": 0.01011062, "auxiliary_loss_mlp": 0.01003666, "balance_loss_clip": 1.00101936, "balance_loss_mlp": 1.00403404, "epoch": 0.6272057718322561, "flos": 65153059102080.0, "grad_norm": 1.0806094753837518, "language_loss": 0.63857388, "learning_rate": 1.2889697973785095e-06, "loss": 0.65872115, "num_input_tokens_seen": 224782930, "router_z_loss_clip": 0.02648926, "router_z_loss_mlp": 0.0703125, "step": 10432, "time_per_iteration": 4.477612257003784 }, { "auxiliary_loss_clip": 0.01052355, "auxiliary_loss_mlp": 0.0103826, "balance_loss_clip": 1.01621819, "balance_loss_mlp": 1.01605701, "epoch": 0.6272658950849241, "flos": 24388193151360.0, "grad_norm": 1.8316199503847987, "language_loss": 0.65702039, "learning_rate": 1.2886057925030153e-06, "loss": 0.67792654, "num_input_tokens_seen": 224802010, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.36328125, "step": 10433, "time_per_iteration": 2.3769307136535645 }, { "auxiliary_loss_clip": 0.01056969, "auxiliary_loss_mlp": 0.01043559, "balance_loss_clip": 1.01603353, "balance_loss_mlp": 1.01807606, "epoch": 0.627326018337592, "flos": 17965082148480.0, "grad_norm": 1.9986950389610383, "language_loss": 0.62956542, "learning_rate": 1.2882418146036612e-06, "loss": 0.65057075, "num_input_tokens_seen": 224818875, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.38867188, "step": 10434, "time_per_iteration": 2.3534419536590576 }, { "auxiliary_loss_clip": 0.0105367, "auxiliary_loss_mlp": 0.01042917, "balance_loss_clip": 1.01909912, "balance_loss_mlp": 1.0161953, "epoch": 0.6273861415902601, "flos": 20229059905920.0, "grad_norm": 1.5515433091497373, "language_loss": 0.85719723, "learning_rate": 1.2878778636942484e-06, "loss": 0.8781631, "num_input_tokens_seen": 224837790, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.375, "step": 10435, "time_per_iteration": 2.3625059127807617 }, { "auxiliary_loss_clip": 0.01009773, "auxiliary_loss_mlp": 0.01006333, "balance_loss_clip": 1.00392473, "balance_loss_mlp": 1.00257277, "epoch": 0.627446264842928, "flos": 64950144946560.0, "grad_norm": 0.740617752959928, "language_loss": 0.61582357, "learning_rate": 1.2875139397885786e-06, "loss": 0.63598466, "num_input_tokens_seen": 224899685, "router_z_loss_clip": 0.02404785, "router_z_loss_mlp": 0.07226562, "step": 10436, "time_per_iteration": 3.043268918991089 }, { "auxiliary_loss_clip": 0.01053481, "auxiliary_loss_mlp": 0.01044707, "balance_loss_clip": 1.01927924, "balance_loss_mlp": 1.01628053, "epoch": 0.627506388095596, "flos": 23583200866560.0, "grad_norm": 1.5189686527278299, "language_loss": 0.78301316, "learning_rate": 1.2871500429004523e-06, "loss": 0.80399501, "num_input_tokens_seen": 224918650, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37109375, "step": 10437, "time_per_iteration": 2.415388822555542 }, { "auxiliary_loss_clip": 0.01008386, "auxiliary_loss_mlp": 0.01008459, "balance_loss_clip": 1.00574076, "balance_loss_mlp": 1.00155377, "epoch": 0.6275665113482639, "flos": 67580396444160.0, "grad_norm": 0.8074312465670727, "language_loss": 0.54463273, "learning_rate": 1.2867861730436667e-06, "loss": 0.56480122, "num_input_tokens_seen": 224981575, "router_z_loss_clip": 0.02722168, "router_z_loss_mlp": 0.06835938, "step": 10438, "time_per_iteration": 3.0252771377563477 }, { "auxiliary_loss_clip": 0.01052569, "auxiliary_loss_mlp": 0.01051344, "balance_loss_clip": 1.02759743, "balance_loss_mlp": 1.01583183, "epoch": 0.6276266346009319, "flos": 27635674308480.0, "grad_norm": 1.7261284999480881, "language_loss": 0.85100013, "learning_rate": 1.2864223302320214e-06, "loss": 0.87203932, "num_input_tokens_seen": 225000820, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.3671875, "step": 10439, "time_per_iteration": 2.444892644882202 }, { "auxiliary_loss_clip": 0.01055415, "auxiliary_loss_mlp": 0.01048412, "balance_loss_clip": 1.02272272, "balance_loss_mlp": 1.01671672, "epoch": 0.6276867578535998, "flos": 22745075834880.0, "grad_norm": 2.0096707658799473, "language_loss": 0.80846441, "learning_rate": 1.2860585144793128e-06, "loss": 0.8295027, "num_input_tokens_seen": 225017585, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38671875, "step": 10440, "time_per_iteration": 2.408198595046997 }, { "auxiliary_loss_clip": 0.01049708, "auxiliary_loss_mlp": 0.01040628, "balance_loss_clip": 1.01961112, "balance_loss_mlp": 1.01554275, "epoch": 0.6277468811062679, "flos": 24643059143040.0, "grad_norm": 1.3685712035651625, "language_loss": 0.75532925, "learning_rate": 1.285694725799337e-06, "loss": 0.77623266, "num_input_tokens_seen": 225039085, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.34179688, "step": 10441, "time_per_iteration": 2.417717456817627 }, { "auxiliary_loss_clip": 0.0105282, "auxiliary_loss_mlp": 0.01037397, "balance_loss_clip": 1.0141511, "balance_loss_mlp": 1.0166347, "epoch": 0.6278070043589358, "flos": 19678060828800.0, "grad_norm": 1.8209256091517885, "language_loss": 0.72795004, "learning_rate": 1.2853309642058884e-06, "loss": 0.74885219, "num_input_tokens_seen": 225058105, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36132812, "step": 10442, "time_per_iteration": 2.39794921875 }, { "auxiliary_loss_clip": 0.01053879, "auxiliary_loss_mlp": 0.01042313, "balance_loss_clip": 1.01959133, "balance_loss_mlp": 1.01737309, "epoch": 0.6278671276116038, "flos": 22120898814720.0, "grad_norm": 1.43718143971416, "language_loss": 0.72102112, "learning_rate": 1.284967229712762e-06, "loss": 0.74198306, "num_input_tokens_seen": 225077605, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36523438, "step": 10443, "time_per_iteration": 2.4080426692962646 }, { "auxiliary_loss_clip": 0.01054367, "auxiliary_loss_mlp": 0.01047048, "balance_loss_clip": 1.02305126, "balance_loss_mlp": 1.01752281, "epoch": 0.6279272508642717, "flos": 23037473404800.0, "grad_norm": 2.821861614455931, "language_loss": 0.74392879, "learning_rate": 1.2846035223337492e-06, "loss": 0.76494288, "num_input_tokens_seen": 225097775, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 10444, "time_per_iteration": 2.429173707962036 }, { "auxiliary_loss_clip": 0.01054691, "auxiliary_loss_mlp": 0.01040243, "balance_loss_clip": 1.01567388, "balance_loss_mlp": 1.01859331, "epoch": 0.6279873741169397, "flos": 19823194817280.0, "grad_norm": 2.4534369808144603, "language_loss": 0.73415929, "learning_rate": 1.2842398420826423e-06, "loss": 0.75510859, "num_input_tokens_seen": 225115585, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36132812, "step": 10445, "time_per_iteration": 2.364154100418091 }, { "auxiliary_loss_clip": 0.01055024, "auxiliary_loss_mlp": 0.01042667, "balance_loss_clip": 1.0174185, "balance_loss_mlp": 1.01835918, "epoch": 0.6280474973696077, "flos": 23914247178240.0, "grad_norm": 1.4741886975908554, "language_loss": 0.70280182, "learning_rate": 1.2838761889732331e-06, "loss": 0.72377872, "num_input_tokens_seen": 225135575, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3671875, "step": 10446, "time_per_iteration": 2.407290458679199 }, { "auxiliary_loss_clip": 0.01060123, "auxiliary_loss_mlp": 0.01043379, "balance_loss_clip": 1.01698589, "balance_loss_mlp": 1.01957321, "epoch": 0.6281076206222757, "flos": 17967002273280.0, "grad_norm": 1.7463897778696198, "language_loss": 0.75137603, "learning_rate": 1.2835125630193102e-06, "loss": 0.77241111, "num_input_tokens_seen": 225154230, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40625, "step": 10447, "time_per_iteration": 2.3524186611175537 }, { "auxiliary_loss_clip": 0.01016698, "auxiliary_loss_mlp": 0.01037593, "balance_loss_clip": 1.03497016, "balance_loss_mlp": 1.00966191, "epoch": 0.6281677438749437, "flos": 66775229602560.0, "grad_norm": 0.7054279770741533, "language_loss": 0.52502251, "learning_rate": 1.2831489642346626e-06, "loss": 0.54556543, "num_input_tokens_seen": 225213650, "router_z_loss_clip": 0.02624512, "router_z_loss_mlp": 0.0703125, "step": 10448, "time_per_iteration": 2.8806471824645996 }, { "auxiliary_loss_clip": 0.01059647, "auxiliary_loss_mlp": 0.01049936, "balance_loss_clip": 1.02275622, "balance_loss_mlp": 1.0213449, "epoch": 0.6282278671276116, "flos": 11655368841600.0, "grad_norm": 2.830898432129441, "language_loss": 0.92825508, "learning_rate": 1.282785392633079e-06, "loss": 0.94935089, "num_input_tokens_seen": 225230135, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.3828125, "step": 10449, "time_per_iteration": 2.3643429279327393 }, { "auxiliary_loss_clip": 0.01057464, "auxiliary_loss_mlp": 0.01041239, "balance_loss_clip": 1.01781464, "balance_loss_mlp": 1.01971829, "epoch": 0.6282879903802796, "flos": 42739939140480.0, "grad_norm": 1.5561178109140146, "language_loss": 0.61468542, "learning_rate": 1.2824218482283438e-06, "loss": 0.63567245, "num_input_tokens_seen": 225253520, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.37890625, "step": 10450, "time_per_iteration": 2.5989434719085693 }, { "auxiliary_loss_clip": 0.01055232, "auxiliary_loss_mlp": 0.01034725, "balance_loss_clip": 1.01080024, "balance_loss_mlp": 1.01957202, "epoch": 0.6283481136329475, "flos": 20008234356480.0, "grad_norm": 1.563206262617694, "language_loss": 0.77598721, "learning_rate": 1.2820583310342452e-06, "loss": 0.79688674, "num_input_tokens_seen": 225272460, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.35546875, "step": 10451, "time_per_iteration": 2.3993911743164062 }, { "auxiliary_loss_clip": 0.01056985, "auxiliary_loss_mlp": 0.01043561, "balance_loss_clip": 1.01617908, "balance_loss_mlp": 1.01855242, "epoch": 0.6284082368856155, "flos": 21903459667200.0, "grad_norm": 1.58192440240751, "language_loss": 0.78396046, "learning_rate": 1.281694841064566e-06, "loss": 0.80496585, "num_input_tokens_seen": 225291700, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.38476562, "step": 10452, "time_per_iteration": 2.394521713256836 }, { "auxiliary_loss_clip": 0.01058212, "auxiliary_loss_mlp": 0.01041539, "balance_loss_clip": 1.01618314, "balance_loss_mlp": 1.02112639, "epoch": 0.6284683601382834, "flos": 25482999565440.0, "grad_norm": 1.6904674355651164, "language_loss": 0.7409513, "learning_rate": 1.2813313783330904e-06, "loss": 0.76194876, "num_input_tokens_seen": 225311470, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37109375, "step": 10453, "time_per_iteration": 2.427011251449585 }, { "auxiliary_loss_clip": 0.01056691, "auxiliary_loss_mlp": 0.01040818, "balance_loss_clip": 1.01515222, "balance_loss_mlp": 1.01933801, "epoch": 0.6285284833909515, "flos": 16537937702400.0, "grad_norm": 1.786575101646722, "language_loss": 0.82026047, "learning_rate": 1.2809679428536013e-06, "loss": 0.84123552, "num_input_tokens_seen": 225328385, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37304688, "step": 10454, "time_per_iteration": 2.4013924598693848 }, { "auxiliary_loss_clip": 0.01058138, "auxiliary_loss_mlp": 0.01036584, "balance_loss_clip": 1.01300478, "balance_loss_mlp": 1.02113366, "epoch": 0.6285886066436194, "flos": 22819580409600.0, "grad_norm": 1.7827179244417677, "language_loss": 0.83663845, "learning_rate": 1.2806045346398792e-06, "loss": 0.85758567, "num_input_tokens_seen": 225348415, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36914062, "step": 10455, "time_per_iteration": 2.4187216758728027 }, { "auxiliary_loss_clip": 0.0105777, "auxiliary_loss_mlp": 0.01038245, "balance_loss_clip": 1.01327038, "balance_loss_mlp": 1.01970339, "epoch": 0.6286487298962874, "flos": 24714631163520.0, "grad_norm": 1.698544133404817, "language_loss": 0.8316716, "learning_rate": 1.280241153705706e-06, "loss": 0.85263175, "num_input_tokens_seen": 225367740, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38085938, "step": 10456, "time_per_iteration": 2.4711849689483643 }, { "auxiliary_loss_clip": 0.0105985, "auxiliary_loss_mlp": 0.01042978, "balance_loss_clip": 1.01970792, "balance_loss_mlp": 1.02109921, "epoch": 0.6287088531489553, "flos": 20739769407360.0, "grad_norm": 1.5139654517395458, "language_loss": 0.73406839, "learning_rate": 1.27987780006486e-06, "loss": 0.75509667, "num_input_tokens_seen": 225388405, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.38671875, "step": 10457, "time_per_iteration": 2.463101863861084 }, { "auxiliary_loss_clip": 0.01059125, "auxiliary_loss_mlp": 0.01053001, "balance_loss_clip": 1.02479625, "balance_loss_mlp": 1.01884246, "epoch": 0.6287689764016233, "flos": 23069663544960.0, "grad_norm": 1.648426880299542, "language_loss": 0.82034886, "learning_rate": 1.2795144737311202e-06, "loss": 0.84147012, "num_input_tokens_seen": 225408360, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40234375, "step": 10458, "time_per_iteration": 3.6649014949798584 }, { "auxiliary_loss_clip": 0.01058686, "auxiliary_loss_mlp": 0.01055018, "balance_loss_clip": 1.02739704, "balance_loss_mlp": 1.02063644, "epoch": 0.6288290996542913, "flos": 32232304200960.0, "grad_norm": 2.5718742965523185, "language_loss": 0.62158442, "learning_rate": 1.2791511747182635e-06, "loss": 0.64272147, "num_input_tokens_seen": 225431310, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.38085938, "step": 10459, "time_per_iteration": 2.4933125972747803 }, { "auxiliary_loss_clip": 0.01057026, "auxiliary_loss_mlp": 0.01048576, "balance_loss_clip": 1.02435231, "balance_loss_mlp": 1.01956487, "epoch": 0.6288892229069593, "flos": 24640266234240.0, "grad_norm": 1.6340825148321925, "language_loss": 0.79700089, "learning_rate": 1.2787879030400666e-06, "loss": 0.81805688, "num_input_tokens_seen": 225450385, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.375, "step": 10460, "time_per_iteration": 2.4513556957244873 }, { "auxiliary_loss_clip": 0.01055886, "auxiliary_loss_mlp": 0.01041298, "balance_loss_clip": 1.01700282, "balance_loss_mlp": 1.01850057, "epoch": 0.6289493461596273, "flos": 17857375004160.0, "grad_norm": 1.7633119819204586, "language_loss": 0.74899995, "learning_rate": 1.2784246587103047e-06, "loss": 0.76997173, "num_input_tokens_seen": 225467325, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.375, "step": 10461, "time_per_iteration": 2.3525476455688477 }, { "auxiliary_loss_clip": 0.01053891, "auxiliary_loss_mlp": 0.01046039, "balance_loss_clip": 1.02108824, "balance_loss_mlp": 1.01775479, "epoch": 0.6290094694122952, "flos": 22344307804800.0, "grad_norm": 2.344253162065434, "language_loss": 0.7150377, "learning_rate": 1.2780614417427523e-06, "loss": 0.73603696, "num_input_tokens_seen": 225487370, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36132812, "step": 10462, "time_per_iteration": 3.876101016998291 }, { "auxiliary_loss_clip": 0.01052281, "auxiliary_loss_mlp": 0.01039869, "balance_loss_clip": 1.01929379, "balance_loss_mlp": 1.01832938, "epoch": 0.6290695926649632, "flos": 28401179978880.0, "grad_norm": 2.795749666282322, "language_loss": 0.73279643, "learning_rate": 1.2776982521511821e-06, "loss": 0.7537179, "num_input_tokens_seen": 225506915, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.33984375, "step": 10463, "time_per_iteration": 3.9579579830169678 }, { "auxiliary_loss_clip": 0.01053142, "auxiliary_loss_mlp": 0.01048715, "balance_loss_clip": 1.02496898, "balance_loss_mlp": 1.0180912, "epoch": 0.6291297159176311, "flos": 21504437205120.0, "grad_norm": 1.6134864650126743, "language_loss": 0.73152447, "learning_rate": 1.2773350899493665e-06, "loss": 0.75254303, "num_input_tokens_seen": 225525670, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.34960938, "step": 10464, "time_per_iteration": 2.425267219543457 }, { "auxiliary_loss_clip": 0.01053794, "auxiliary_loss_mlp": 0.01048925, "balance_loss_clip": 1.02539313, "balance_loss_mlp": 1.01770878, "epoch": 0.6291898391702991, "flos": 12202492757760.0, "grad_norm": 1.6915103754745773, "language_loss": 0.69903266, "learning_rate": 1.2769719551510768e-06, "loss": 0.72005981, "num_input_tokens_seen": 225542235, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36132812, "step": 10465, "time_per_iteration": 2.4495770931243896 }, { "auxiliary_loss_clip": 0.01011358, "auxiliary_loss_mlp": 0.01008228, "balance_loss_clip": 1.0056051, "balance_loss_mlp": 1.00404191, "epoch": 0.629249962422967, "flos": 69296168033280.0, "grad_norm": 0.6986120146895876, "language_loss": 0.59844089, "learning_rate": 1.2766088477700832e-06, "loss": 0.61863685, "num_input_tokens_seen": 225607185, "router_z_loss_clip": 0.02624512, "router_z_loss_mlp": 0.07324219, "step": 10466, "time_per_iteration": 3.146167039871216 }, { "auxiliary_loss_clip": 0.01051672, "auxiliary_loss_mlp": 0.01046333, "balance_loss_clip": 1.02226484, "balance_loss_mlp": 1.01566672, "epoch": 0.6293100856756351, "flos": 40076310516480.0, "grad_norm": 2.394223377683595, "language_loss": 0.66434979, "learning_rate": 1.276245767820154e-06, "loss": 0.68532991, "num_input_tokens_seen": 225628785, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.359375, "step": 10467, "time_per_iteration": 2.5486841201782227 }, { "auxiliary_loss_clip": 0.01010641, "auxiliary_loss_mlp": 0.01013611, "balance_loss_clip": 1.01043987, "balance_loss_mlp": 1.00329638, "epoch": 0.629370208928303, "flos": 67498141547520.0, "grad_norm": 0.8048984627070126, "language_loss": 0.57073814, "learning_rate": 1.2758827153150586e-06, "loss": 0.59098065, "num_input_tokens_seen": 225678980, "router_z_loss_clip": 0.03173828, "router_z_loss_mlp": 0.07324219, "step": 10468, "time_per_iteration": 2.8294243812561035 }, { "auxiliary_loss_clip": 0.01009079, "auxiliary_loss_mlp": 0.01015092, "balance_loss_clip": 1.01246977, "balance_loss_mlp": 1.00193238, "epoch": 0.629430332180971, "flos": 60657154081920.0, "grad_norm": 0.739913754539508, "language_loss": 0.58109212, "learning_rate": 1.2755196902685626e-06, "loss": 0.60133386, "num_input_tokens_seen": 225740295, "router_z_loss_clip": 0.02624512, "router_z_loss_mlp": 0.07128906, "step": 10469, "time_per_iteration": 3.0360817909240723 }, { "auxiliary_loss_clip": 0.01009871, "auxiliary_loss_mlp": 0.01007228, "balance_loss_clip": 1.00451005, "balance_loss_mlp": 1.00264573, "epoch": 0.6294904554336389, "flos": 66866107605120.0, "grad_norm": 0.703491905310845, "language_loss": 0.52211142, "learning_rate": 1.2751566926944329e-06, "loss": 0.54228246, "num_input_tokens_seen": 225805615, "router_z_loss_clip": 0.02722168, "router_z_loss_mlp": 0.07226562, "step": 10470, "time_per_iteration": 3.0835890769958496 }, { "auxiliary_loss_clip": 0.01053895, "auxiliary_loss_mlp": 0.0104868, "balance_loss_clip": 1.02375364, "balance_loss_mlp": 1.01759386, "epoch": 0.6295505786863069, "flos": 42521138449920.0, "grad_norm": 1.6144861135913025, "language_loss": 0.75775194, "learning_rate": 1.2747937226064342e-06, "loss": 0.77877772, "num_input_tokens_seen": 225826585, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.36328125, "step": 10471, "time_per_iteration": 3.938607931137085 }, { "auxiliary_loss_clip": 0.01056017, "auxiliary_loss_mlp": 0.0104546, "balance_loss_clip": 1.02102208, "balance_loss_mlp": 1.01863527, "epoch": 0.629610701938975, "flos": 17383184651520.0, "grad_norm": 1.9395791442557382, "language_loss": 0.64412063, "learning_rate": 1.2744307800183297e-06, "loss": 0.66513538, "num_input_tokens_seen": 225844095, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.375, "step": 10472, "time_per_iteration": 2.390970468521118 }, { "auxiliary_loss_clip": 0.01057684, "auxiliary_loss_mlp": 0.01051234, "balance_loss_clip": 1.02546144, "balance_loss_mlp": 1.01915741, "epoch": 0.6296708251916429, "flos": 24241802353920.0, "grad_norm": 1.7524912168304718, "language_loss": 0.70282477, "learning_rate": 1.2740678649438828e-06, "loss": 0.72391397, "num_input_tokens_seen": 225864310, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38476562, "step": 10473, "time_per_iteration": 2.48785662651062 }, { "auxiliary_loss_clip": 0.01055035, "auxiliary_loss_mlp": 0.0104199, "balance_loss_clip": 1.0184586, "balance_loss_mlp": 1.01867068, "epoch": 0.6297309484443109, "flos": 19277607000960.0, "grad_norm": 1.821317644657953, "language_loss": 0.75403148, "learning_rate": 1.2737049773968554e-06, "loss": 0.77500176, "num_input_tokens_seen": 225883830, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36328125, "step": 10474, "time_per_iteration": 2.452282190322876 }, { "auxiliary_loss_clip": 0.01058243, "auxiliary_loss_mlp": 0.01046318, "balance_loss_clip": 1.02229714, "balance_loss_mlp": 1.02045536, "epoch": 0.6297910716969788, "flos": 30661422220800.0, "grad_norm": 1.5237340592522297, "language_loss": 0.67511237, "learning_rate": 1.2733421173910081e-06, "loss": 0.69615805, "num_input_tokens_seen": 225905755, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37695312, "step": 10475, "time_per_iteration": 2.4870405197143555 }, { "auxiliary_loss_clip": 0.01054688, "auxiliary_loss_mlp": 0.01039442, "balance_loss_clip": 1.01748407, "balance_loss_mlp": 1.01932383, "epoch": 0.6298511949496468, "flos": 14422305778560.0, "grad_norm": 1.9113048626642806, "language_loss": 0.91791385, "learning_rate": 1.272979284940101e-06, "loss": 0.93885517, "num_input_tokens_seen": 225922155, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35351562, "step": 10476, "time_per_iteration": 2.3698110580444336 }, { "auxiliary_loss_clip": 0.01057304, "auxiliary_loss_mlp": 0.01039764, "balance_loss_clip": 1.01746058, "balance_loss_mlp": 1.02109349, "epoch": 0.6299113182023147, "flos": 23513025300480.0, "grad_norm": 1.6251481137368493, "language_loss": 0.76254541, "learning_rate": 1.2726164800578913e-06, "loss": 0.78351605, "num_input_tokens_seen": 225941060, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.36328125, "step": 10477, "time_per_iteration": 2.4155051708221436 }, { "auxiliary_loss_clip": 0.01058476, "auxiliary_loss_mlp": 0.01044838, "balance_loss_clip": 1.01920772, "balance_loss_mlp": 1.02040648, "epoch": 0.6299714414549827, "flos": 22673399080320.0, "grad_norm": 1.6076283558823985, "language_loss": 0.71111661, "learning_rate": 1.272253702758138e-06, "loss": 0.73214972, "num_input_tokens_seen": 225960870, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38085938, "step": 10478, "time_per_iteration": 2.4416775703430176 }, { "auxiliary_loss_clip": 0.01063007, "auxiliary_loss_mlp": 0.01041865, "balance_loss_clip": 1.01561499, "balance_loss_mlp": 1.02336788, "epoch": 0.6300315647076506, "flos": 14500860071040.0, "grad_norm": 2.1915359402828005, "language_loss": 0.68881106, "learning_rate": 1.2718909530545974e-06, "loss": 0.70985973, "num_input_tokens_seen": 225977895, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.39648438, "step": 10479, "time_per_iteration": 2.368682861328125 }, { "auxiliary_loss_clip": 0.01060045, "auxiliary_loss_mlp": 0.01045256, "balance_loss_clip": 1.02086568, "balance_loss_mlp": 1.02331889, "epoch": 0.6300916879603187, "flos": 21870606211200.0, "grad_norm": 2.0207525384906595, "language_loss": 0.74573326, "learning_rate": 1.2715282309610245e-06, "loss": 0.76678622, "num_input_tokens_seen": 225997835, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.3671875, "step": 10480, "time_per_iteration": 2.4304986000061035 }, { "auxiliary_loss_clip": 0.01060621, "auxiliary_loss_mlp": 0.01044569, "balance_loss_clip": 1.01918983, "balance_loss_mlp": 1.02192116, "epoch": 0.6301518112129866, "flos": 21833004810240.0, "grad_norm": 2.2120851552590852, "language_loss": 0.79497635, "learning_rate": 1.2711655364911744e-06, "loss": 0.8160283, "num_input_tokens_seen": 226017620, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38671875, "step": 10481, "time_per_iteration": 2.4402544498443604 }, { "auxiliary_loss_clip": 0.01022973, "auxiliary_loss_mlp": 0.01015026, "balance_loss_clip": 1.01216543, "balance_loss_mlp": 1.01493406, "epoch": 0.6302119344656546, "flos": 44331872670720.0, "grad_norm": 0.9083362502747326, "language_loss": 0.61883426, "learning_rate": 1.2708028696588e-06, "loss": 0.63921422, "num_input_tokens_seen": 226068755, "router_z_loss_clip": 0.02856445, "router_z_loss_mlp": 0.08007812, "step": 10482, "time_per_iteration": 2.7955269813537598 }, { "auxiliary_loss_clip": 0.01064539, "auxiliary_loss_mlp": 0.01045605, "balance_loss_clip": 1.01773417, "balance_loss_mlp": 1.02337503, "epoch": 0.6302720577183225, "flos": 11217139056000.0, "grad_norm": 1.9172514035561095, "language_loss": 0.83510238, "learning_rate": 1.2704402304776541e-06, "loss": 0.85620379, "num_input_tokens_seen": 226084395, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.41210938, "step": 10483, "time_per_iteration": 2.365884780883789 }, { "auxiliary_loss_clip": 0.01058403, "auxiliary_loss_mlp": 0.01042242, "balance_loss_clip": 1.01899576, "balance_loss_mlp": 1.02251577, "epoch": 0.6303321809709905, "flos": 27963683331840.0, "grad_norm": 1.578495553820476, "language_loss": 0.73583865, "learning_rate": 1.270077618961487e-06, "loss": 0.75684506, "num_input_tokens_seen": 226105890, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.359375, "step": 10484, "time_per_iteration": 2.4635939598083496 }, { "auxiliary_loss_clip": 0.01059049, "auxiliary_loss_mlp": 0.01041296, "balance_loss_clip": 1.01743042, "balance_loss_mlp": 1.02204704, "epoch": 0.6303923042236586, "flos": 28219491930240.0, "grad_norm": 1.9394742949490609, "language_loss": 0.75932324, "learning_rate": 1.2697150351240506e-06, "loss": 0.78032672, "num_input_tokens_seen": 226126760, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36914062, "step": 10485, "time_per_iteration": 2.4925811290740967 }, { "auxiliary_loss_clip": 0.01060747, "auxiliary_loss_mlp": 0.01041085, "balance_loss_clip": 1.01590765, "balance_loss_mlp": 1.02097344, "epoch": 0.6304524274763265, "flos": 27629948845440.0, "grad_norm": 2.2885531716100216, "language_loss": 0.83050287, "learning_rate": 1.269352478979093e-06, "loss": 0.85152119, "num_input_tokens_seen": 226147315, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3984375, "step": 10486, "time_per_iteration": 2.469379425048828 }, { "auxiliary_loss_clip": 0.01060114, "auxiliary_loss_mlp": 0.01046826, "balance_loss_clip": 1.02225661, "balance_loss_mlp": 1.02235532, "epoch": 0.6305125507289945, "flos": 17310355822080.0, "grad_norm": 1.6577055337564814, "language_loss": 0.64971542, "learning_rate": 1.2689899505403628e-06, "loss": 0.67078483, "num_input_tokens_seen": 226165935, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37890625, "step": 10487, "time_per_iteration": 2.38136887550354 }, { "auxiliary_loss_clip": 0.01058696, "auxiliary_loss_mlp": 0.01049537, "balance_loss_clip": 1.02542114, "balance_loss_mlp": 1.02231264, "epoch": 0.6305726739816624, "flos": 25807203250560.0, "grad_norm": 1.5249336694278446, "language_loss": 0.68031275, "learning_rate": 1.2686274498216065e-06, "loss": 0.70139509, "num_input_tokens_seen": 226186890, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36328125, "step": 10488, "time_per_iteration": 2.484880208969116 }, { "auxiliary_loss_clip": 0.01060665, "auxiliary_loss_mlp": 0.01044148, "balance_loss_clip": 1.01823139, "balance_loss_mlp": 1.02157807, "epoch": 0.6306327972343304, "flos": 21796415838720.0, "grad_norm": 1.7640174873254941, "language_loss": 0.68793708, "learning_rate": 1.2682649768365706e-06, "loss": 0.70898521, "num_input_tokens_seen": 226206710, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.390625, "step": 10489, "time_per_iteration": 2.394228935241699 }, { "auxiliary_loss_clip": 0.01064499, "auxiliary_loss_mlp": 0.01049731, "balance_loss_clip": 1.02035785, "balance_loss_mlp": 1.02262259, "epoch": 0.6306929204869983, "flos": 20776323467520.0, "grad_norm": 1.6846791024082155, "language_loss": 0.70727706, "learning_rate": 1.2679025315990007e-06, "loss": 0.72841936, "num_input_tokens_seen": 226225565, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.41796875, "step": 10490, "time_per_iteration": 2.4332456588745117 }, { "auxiliary_loss_clip": 0.01060073, "auxiliary_loss_mlp": 0.01041951, "balance_loss_clip": 1.01685739, "balance_loss_mlp": 1.02151918, "epoch": 0.6307530437396663, "flos": 23653236787200.0, "grad_norm": 2.3280290715327485, "language_loss": 0.79564822, "learning_rate": 1.2675401141226393e-06, "loss": 0.81666839, "num_input_tokens_seen": 226243680, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38671875, "step": 10491, "time_per_iteration": 2.390270948410034 }, { "auxiliary_loss_clip": 0.01058649, "auxiliary_loss_mlp": 0.01045979, "balance_loss_clip": 1.02148187, "balance_loss_mlp": 1.02146268, "epoch": 0.6308131669923343, "flos": 24717808097280.0, "grad_norm": 1.9004198919657853, "language_loss": 0.5614301, "learning_rate": 1.2671777244212308e-06, "loss": 0.58247638, "num_input_tokens_seen": 226264345, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.37304688, "step": 10492, "time_per_iteration": 2.4452359676361084 }, { "auxiliary_loss_clip": 0.01058756, "auxiliary_loss_mlp": 0.01053787, "balance_loss_clip": 1.02782369, "balance_loss_mlp": 1.01990211, "epoch": 0.6308732902450023, "flos": 22564295481600.0, "grad_norm": 1.8457126176544847, "language_loss": 0.65630382, "learning_rate": 1.2668153625085168e-06, "loss": 0.67742926, "num_input_tokens_seen": 226283165, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.38867188, "step": 10493, "time_per_iteration": 2.409545660018921 }, { "auxiliary_loss_clip": 0.01055988, "auxiliary_loss_mlp": 0.01043887, "balance_loss_clip": 1.02021205, "balance_loss_mlp": 1.01877761, "epoch": 0.6309334134976702, "flos": 24643059143040.0, "grad_norm": 1.3559619580988254, "language_loss": 0.83363169, "learning_rate": 1.2664530283982367e-06, "loss": 0.85463047, "num_input_tokens_seen": 226304080, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37304688, "step": 10494, "time_per_iteration": 2.5296878814697266 }, { "auxiliary_loss_clip": 0.01057304, "auxiliary_loss_mlp": 0.01043621, "balance_loss_clip": 1.01884985, "balance_loss_mlp": 1.01968336, "epoch": 0.6309935367503382, "flos": 41426332035840.0, "grad_norm": 2.005156788594954, "language_loss": 0.80845928, "learning_rate": 1.2660907221041317e-06, "loss": 0.82946861, "num_input_tokens_seen": 226325925, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37695312, "step": 10495, "time_per_iteration": 2.5734477043151855 }, { "auxiliary_loss_clip": 0.01057239, "auxiliary_loss_mlp": 0.01046128, "balance_loss_clip": 1.02079606, "balance_loss_mlp": 1.01974177, "epoch": 0.6310536600030061, "flos": 15118124641920.0, "grad_norm": 2.108429437271186, "language_loss": 0.71154284, "learning_rate": 1.2657284436399403e-06, "loss": 0.73257649, "num_input_tokens_seen": 226344190, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.375, "step": 10496, "time_per_iteration": 2.4166693687438965 }, { "auxiliary_loss_clip": 0.01057818, "auxiliary_loss_mlp": 0.01043422, "balance_loss_clip": 1.01675463, "balance_loss_mlp": 1.01867187, "epoch": 0.6311137832556741, "flos": 15230719376640.0, "grad_norm": 2.7306489989212883, "language_loss": 0.81760329, "learning_rate": 1.2653661930193997e-06, "loss": 0.83861566, "num_input_tokens_seen": 226361520, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.390625, "step": 10497, "time_per_iteration": 2.3821651935577393 }, { "auxiliary_loss_clip": 0.01053585, "auxiliary_loss_mlp": 0.01045334, "balance_loss_clip": 1.02223158, "balance_loss_mlp": 1.01746309, "epoch": 0.6311739065083422, "flos": 22017555590400.0, "grad_norm": 2.474269916083689, "language_loss": 0.7512368, "learning_rate": 1.265003970256247e-06, "loss": 0.77222598, "num_input_tokens_seen": 226381920, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36132812, "step": 10498, "time_per_iteration": 3.779094696044922 }, { "auxiliary_loss_clip": 0.01057011, "auxiliary_loss_mlp": 0.01044267, "balance_loss_clip": 1.01905417, "balance_loss_mlp": 1.01861262, "epoch": 0.6312340297610101, "flos": 22709673849600.0, "grad_norm": 1.9519345462550528, "language_loss": 0.7090286, "learning_rate": 1.264641775364217e-06, "loss": 0.73004138, "num_input_tokens_seen": 226400035, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3828125, "step": 10499, "time_per_iteration": 2.393601179122925 }, { "auxiliary_loss_clip": 0.01054701, "auxiliary_loss_mlp": 0.01043183, "balance_loss_clip": 1.01810098, "balance_loss_mlp": 1.01833737, "epoch": 0.6312941530136781, "flos": 24278949907200.0, "grad_norm": 1.8352079553884406, "language_loss": 0.70537311, "learning_rate": 1.2642796083570448e-06, "loss": 0.72635198, "num_input_tokens_seen": 226418280, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36328125, "step": 10500, "time_per_iteration": 2.430647850036621 }, { "auxiliary_loss_clip": 0.01056102, "auxiliary_loss_mlp": 0.01046085, "balance_loss_clip": 1.02071714, "balance_loss_mlp": 1.01864147, "epoch": 0.631354276266346, "flos": 21724878729600.0, "grad_norm": 1.8208991759970066, "language_loss": 0.75078785, "learning_rate": 1.2639174692484634e-06, "loss": 0.77180976, "num_input_tokens_seen": 226436650, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.375, "step": 10501, "time_per_iteration": 2.4120659828186035 }, { "auxiliary_loss_clip": 0.01056364, "auxiliary_loss_mlp": 0.01047465, "balance_loss_clip": 1.01985645, "balance_loss_mlp": 1.01923478, "epoch": 0.631414399519014, "flos": 24023944270080.0, "grad_norm": 1.6894652518685347, "language_loss": 0.76563752, "learning_rate": 1.2635553580522053e-06, "loss": 0.78667581, "num_input_tokens_seen": 226456275, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.37109375, "step": 10502, "time_per_iteration": 5.311256170272827 }, { "auxiliary_loss_clip": 0.01059547, "auxiliary_loss_mlp": 0.01046687, "balance_loss_clip": 1.02037716, "balance_loss_mlp": 1.02025938, "epoch": 0.6314745227716819, "flos": 24314666094720.0, "grad_norm": 2.041255253660512, "language_loss": 0.85934424, "learning_rate": 1.2631932747820022e-06, "loss": 0.88040662, "num_input_tokens_seen": 226473610, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.39257812, "step": 10503, "time_per_iteration": 2.3994874954223633 }, { "auxiliary_loss_clip": 0.0105555, "auxiliary_loss_mlp": 0.01043646, "balance_loss_clip": 1.01807547, "balance_loss_mlp": 1.01736021, "epoch": 0.6315346460243499, "flos": 23365307871360.0, "grad_norm": 1.6106642728098897, "language_loss": 0.87061858, "learning_rate": 1.2628312194515838e-06, "loss": 0.89161056, "num_input_tokens_seen": 226493665, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.3828125, "step": 10504, "time_per_iteration": 2.38594651222229 }, { "auxiliary_loss_clip": 0.01058709, "auxiliary_loss_mlp": 0.01044686, "balance_loss_clip": 1.01557517, "balance_loss_mlp": 1.01834202, "epoch": 0.6315947692770179, "flos": 20259469566720.0, "grad_norm": 1.615880366153173, "language_loss": 0.78233671, "learning_rate": 1.2624691920746793e-06, "loss": 0.80337065, "num_input_tokens_seen": 226511625, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.40234375, "step": 10505, "time_per_iteration": 2.3879497051239014 }, { "auxiliary_loss_clip": 0.0105633, "auxiliary_loss_mlp": 0.01050335, "balance_loss_clip": 1.02280998, "balance_loss_mlp": 1.01741135, "epoch": 0.6316548925296859, "flos": 25264652722560.0, "grad_norm": 2.3236020822658956, "language_loss": 0.82993865, "learning_rate": 1.2621071926650166e-06, "loss": 0.85100526, "num_input_tokens_seen": 226530085, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.38867188, "step": 10506, "time_per_iteration": 2.409675359725952 }, { "auxiliary_loss_clip": 0.01056936, "auxiliary_loss_mlp": 0.01042438, "balance_loss_clip": 1.01635504, "balance_loss_mlp": 1.01864076, "epoch": 0.6317150157823538, "flos": 22929452058240.0, "grad_norm": 2.412397253348964, "language_loss": 0.7513355, "learning_rate": 1.2617452212363238e-06, "loss": 0.77232921, "num_input_tokens_seen": 226548115, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3828125, "step": 10507, "time_per_iteration": 2.4161272048950195 }, { "auxiliary_loss_clip": 0.01058172, "auxiliary_loss_mlp": 0.01042084, "balance_loss_clip": 1.01472557, "balance_loss_mlp": 1.01814461, "epoch": 0.6317751390350218, "flos": 22525995853440.0, "grad_norm": 1.8161682835056638, "language_loss": 0.69776565, "learning_rate": 1.2613832778023258e-06, "loss": 0.71876824, "num_input_tokens_seen": 226567955, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40039062, "step": 10508, "time_per_iteration": 2.3874475955963135 }, { "auxiliary_loss_clip": 0.01055787, "auxiliary_loss_mlp": 0.01048188, "balance_loss_clip": 1.02105618, "balance_loss_mlp": 1.01854467, "epoch": 0.6318352622876897, "flos": 23293631116800.0, "grad_norm": 1.5799018146157635, "language_loss": 0.7185716, "learning_rate": 1.2610213623767478e-06, "loss": 0.73961139, "num_input_tokens_seen": 226588205, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.37109375, "step": 10509, "time_per_iteration": 2.4333319664001465 }, { "auxiliary_loss_clip": 0.01054142, "auxiliary_loss_mlp": 0.0103871, "balance_loss_clip": 1.01490426, "balance_loss_mlp": 1.01754665, "epoch": 0.6318953855403577, "flos": 20703040790400.0, "grad_norm": 1.5790182368749008, "language_loss": 0.80601323, "learning_rate": 1.2606594749733143e-06, "loss": 0.82694173, "num_input_tokens_seen": 226606965, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3671875, "step": 10510, "time_per_iteration": 2.384685754776001 }, { "auxiliary_loss_clip": 0.01056788, "auxiliary_loss_mlp": 0.01037408, "balance_loss_clip": 1.01217175, "balance_loss_mlp": 1.01882052, "epoch": 0.6319555087930258, "flos": 22818952005120.0, "grad_norm": 1.5698929414431608, "language_loss": 0.71819878, "learning_rate": 1.2602976156057469e-06, "loss": 0.73914075, "num_input_tokens_seen": 226627845, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37890625, "step": 10511, "time_per_iteration": 2.422070264816284 }, { "auxiliary_loss_clip": 0.01053804, "auxiliary_loss_mlp": 0.01039673, "balance_loss_clip": 1.01742887, "balance_loss_mlp": 1.01742816, "epoch": 0.6320156320456937, "flos": 19970004551040.0, "grad_norm": 1.673445089512672, "language_loss": 0.81074166, "learning_rate": 1.2599357842877684e-06, "loss": 0.83167636, "num_input_tokens_seen": 226645855, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.36328125, "step": 10512, "time_per_iteration": 3.8560445308685303 }, { "auxiliary_loss_clip": 0.01057516, "auxiliary_loss_mlp": 0.01040413, "balance_loss_clip": 1.0146277, "balance_loss_mlp": 1.01923585, "epoch": 0.6320757552983617, "flos": 27012265338240.0, "grad_norm": 1.838095516761923, "language_loss": 0.71805704, "learning_rate": 1.2595739810330994e-06, "loss": 0.73903632, "num_input_tokens_seen": 226665375, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3828125, "step": 10513, "time_per_iteration": 2.4122586250305176 }, { "auxiliary_loss_clip": 0.01057121, "auxiliary_loss_mlp": 0.0104253, "balance_loss_clip": 1.0169121, "balance_loss_mlp": 1.01879454, "epoch": 0.6321358785510296, "flos": 23694818083200.0, "grad_norm": 1.8492465303466612, "language_loss": 0.67334378, "learning_rate": 1.259212205855459e-06, "loss": 0.69434023, "num_input_tokens_seen": 226685270, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.3828125, "step": 10514, "time_per_iteration": 2.4429259300231934 }, { "auxiliary_loss_clip": 0.01053115, "auxiliary_loss_mlp": 0.01037659, "balance_loss_clip": 1.01455641, "balance_loss_mlp": 1.01700401, "epoch": 0.6321960018036976, "flos": 25994023269120.0, "grad_norm": 1.8260708743162748, "language_loss": 0.75634342, "learning_rate": 1.2588504587685663e-06, "loss": 0.77725118, "num_input_tokens_seen": 226705325, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.36132812, "step": 10515, "time_per_iteration": 2.425358772277832 }, { "auxiliary_loss_clip": 0.01052703, "auxiliary_loss_mlp": 0.01034652, "balance_loss_clip": 1.01272964, "balance_loss_mlp": 1.01816416, "epoch": 0.6322561250563655, "flos": 22819894611840.0, "grad_norm": 1.699331933619142, "language_loss": 0.90944993, "learning_rate": 1.2584887397861379e-06, "loss": 0.93032342, "num_input_tokens_seen": 226723815, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34375, "step": 10516, "time_per_iteration": 2.4276981353759766 }, { "auxiliary_loss_clip": 0.01060821, "auxiliary_loss_mlp": 0.01044128, "balance_loss_clip": 1.01465952, "balance_loss_mlp": 1.02043033, "epoch": 0.6323162483090335, "flos": 18987443758080.0, "grad_norm": 1.6227092091999098, "language_loss": 0.82418001, "learning_rate": 1.2581270489218911e-06, "loss": 0.84522951, "num_input_tokens_seen": 226741550, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.40429688, "step": 10517, "time_per_iteration": 2.3748738765716553 }, { "auxiliary_loss_clip": 0.01055514, "auxiliary_loss_mlp": 0.0104442, "balance_loss_clip": 1.01929045, "balance_loss_mlp": 1.01836443, "epoch": 0.6323763715617015, "flos": 19864147708800.0, "grad_norm": 1.7659136192592444, "language_loss": 0.79007316, "learning_rate": 1.257765386189541e-06, "loss": 0.81107253, "num_input_tokens_seen": 226761115, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37109375, "step": 10518, "time_per_iteration": 2.3990297317504883 }, { "auxiliary_loss_clip": 0.01053913, "auxiliary_loss_mlp": 0.01038315, "balance_loss_clip": 1.01553392, "balance_loss_mlp": 1.01778698, "epoch": 0.6324364948143695, "flos": 22781629895040.0, "grad_norm": 1.4631114007329078, "language_loss": 0.86031818, "learning_rate": 1.2574037516028018e-06, "loss": 0.88124049, "num_input_tokens_seen": 226782225, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36132812, "step": 10519, "time_per_iteration": 2.419728994369507 }, { "auxiliary_loss_clip": 0.01053135, "auxiliary_loss_mlp": 0.01036412, "balance_loss_clip": 1.01419175, "balance_loss_mlp": 1.0177995, "epoch": 0.6324966180670374, "flos": 22234855092480.0, "grad_norm": 1.5424131527180835, "language_loss": 0.72885877, "learning_rate": 1.2570421451753867e-06, "loss": 0.74975419, "num_input_tokens_seen": 226802375, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35351562, "step": 10520, "time_per_iteration": 2.405036211013794 }, { "auxiliary_loss_clip": 0.01053524, "auxiliary_loss_mlp": 0.01036724, "balance_loss_clip": 1.01419342, "balance_loss_mlp": 1.01764333, "epoch": 0.6325567413197054, "flos": 21688115201280.0, "grad_norm": 2.3323249514853726, "language_loss": 0.73421365, "learning_rate": 1.2566805669210081e-06, "loss": 0.75511611, "num_input_tokens_seen": 226822165, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.359375, "step": 10521, "time_per_iteration": 2.5349183082580566 }, { "auxiliary_loss_clip": 0.01058426, "auxiliary_loss_mlp": 0.01045504, "balance_loss_clip": 1.01975441, "balance_loss_mlp": 1.01986086, "epoch": 0.6326168645723733, "flos": 19936138665600.0, "grad_norm": 1.762614424086139, "language_loss": 0.73205447, "learning_rate": 1.256319016853377e-06, "loss": 0.75309378, "num_input_tokens_seen": 226841645, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38476562, "step": 10522, "time_per_iteration": 2.481027603149414 }, { "auxiliary_loss_clip": 0.01056966, "auxiliary_loss_mlp": 0.0103859, "balance_loss_clip": 1.01461697, "balance_loss_mlp": 1.01907182, "epoch": 0.6326769878250413, "flos": 20229304285440.0, "grad_norm": 2.1729622907488473, "language_loss": 0.82683325, "learning_rate": 1.2559574949862023e-06, "loss": 0.84778881, "num_input_tokens_seen": 226860355, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37890625, "step": 10523, "time_per_iteration": 2.472660541534424 }, { "auxiliary_loss_clip": 0.01055055, "auxiliary_loss_mlp": 0.01037185, "balance_loss_clip": 1.01366544, "balance_loss_mlp": 1.01804137, "epoch": 0.6327371110777094, "flos": 20774752456320.0, "grad_norm": 2.0092956726918474, "language_loss": 0.74123406, "learning_rate": 1.255596001333195e-06, "loss": 0.76215649, "num_input_tokens_seen": 226878390, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.37109375, "step": 10524, "time_per_iteration": 2.3481218814849854 }, { "auxiliary_loss_clip": 0.01060062, "auxiliary_loss_mlp": 0.01043392, "balance_loss_clip": 1.01628327, "balance_loss_mlp": 1.01943243, "epoch": 0.6327972343303773, "flos": 30335228588160.0, "grad_norm": 3.872404379504013, "language_loss": 0.85970545, "learning_rate": 1.2552345359080615e-06, "loss": 0.88074005, "num_input_tokens_seen": 226898420, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.40625, "step": 10525, "time_per_iteration": 2.4433298110961914 }, { "auxiliary_loss_clip": 0.01054021, "auxiliary_loss_mlp": 0.01037042, "balance_loss_clip": 1.01324844, "balance_loss_mlp": 1.01735353, "epoch": 0.6328573575830453, "flos": 17091310752000.0, "grad_norm": 1.7460469272368473, "language_loss": 0.67551553, "learning_rate": 1.2548730987245093e-06, "loss": 0.69642621, "num_input_tokens_seen": 226916305, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.3671875, "step": 10526, "time_per_iteration": 2.3412821292877197 }, { "auxiliary_loss_clip": 0.01058969, "auxiliary_loss_mlp": 0.01042449, "balance_loss_clip": 1.01633048, "balance_loss_mlp": 1.02014089, "epoch": 0.6329174808357132, "flos": 25045956766080.0, "grad_norm": 1.472329205523203, "language_loss": 0.74434125, "learning_rate": 1.254511689796244e-06, "loss": 0.76535547, "num_input_tokens_seen": 226937705, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38867188, "step": 10527, "time_per_iteration": 2.4166500568389893 }, { "auxiliary_loss_clip": 0.01055915, "auxiliary_loss_mlp": 0.01034957, "balance_loss_clip": 1.01190233, "balance_loss_mlp": 1.01911139, "epoch": 0.6329776040883812, "flos": 16835886178560.0, "grad_norm": 2.5911715549930885, "language_loss": 0.73128641, "learning_rate": 1.2541503091369693e-06, "loss": 0.75219512, "num_input_tokens_seen": 226954880, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.3671875, "step": 10528, "time_per_iteration": 2.3505101203918457 }, { "auxiliary_loss_clip": 0.01054705, "auxiliary_loss_mlp": 0.01041479, "balance_loss_clip": 1.01649249, "balance_loss_mlp": 1.01801729, "epoch": 0.6330377273410491, "flos": 13515855482880.0, "grad_norm": 2.2639938485597595, "language_loss": 0.68142581, "learning_rate": 1.2537889567603905e-06, "loss": 0.70238769, "num_input_tokens_seen": 226972595, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3671875, "step": 10529, "time_per_iteration": 2.352111339569092 }, { "auxiliary_loss_clip": 0.01055881, "auxiliary_loss_mlp": 0.01038407, "balance_loss_clip": 1.01425481, "balance_loss_mlp": 1.01942825, "epoch": 0.6330978505937171, "flos": 21537884154240.0, "grad_norm": 1.8514395983552554, "language_loss": 0.76685131, "learning_rate": 1.2534276326802092e-06, "loss": 0.78779423, "num_input_tokens_seen": 226991910, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36523438, "step": 10530, "time_per_iteration": 2.375882625579834 }, { "auxiliary_loss_clip": 0.01056578, "auxiliary_loss_mlp": 0.01037147, "balance_loss_clip": 1.01429486, "balance_loss_mlp": 1.01985288, "epoch": 0.6331579738463851, "flos": 25008320453760.0, "grad_norm": 1.5418929883501036, "language_loss": 0.74405503, "learning_rate": 1.2530663369101259e-06, "loss": 0.76499224, "num_input_tokens_seen": 227010175, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3671875, "step": 10531, "time_per_iteration": 2.401207447052002 }, { "auxiliary_loss_clip": 0.01052821, "auxiliary_loss_mlp": 0.01034099, "balance_loss_clip": 1.01143754, "balance_loss_mlp": 1.01750541, "epoch": 0.6332180970990531, "flos": 14975120246400.0, "grad_norm": 2.171939464199027, "language_loss": 0.81234908, "learning_rate": 1.2527050694638432e-06, "loss": 0.83321834, "num_input_tokens_seen": 227025540, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35351562, "step": 10532, "time_per_iteration": 2.3500888347625732 }, { "auxiliary_loss_clip": 0.01053006, "auxiliary_loss_mlp": 0.01035407, "balance_loss_clip": 1.01456892, "balance_loss_mlp": 1.01743817, "epoch": 0.633278220351721, "flos": 22705973245440.0, "grad_norm": 1.5595244319799706, "language_loss": 0.7590155, "learning_rate": 1.2523438303550582e-06, "loss": 0.7798996, "num_input_tokens_seen": 227045520, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.35546875, "step": 10533, "time_per_iteration": 2.4066314697265625 }, { "auxiliary_loss_clip": 0.01056769, "auxiliary_loss_mlp": 0.01044771, "balance_loss_clip": 1.01717353, "balance_loss_mlp": 1.01757097, "epoch": 0.633338343604389, "flos": 12602143624320.0, "grad_norm": 2.585295148606703, "language_loss": 0.7909925, "learning_rate": 1.2519826195974706e-06, "loss": 0.8120079, "num_input_tokens_seen": 227059420, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.39257812, "step": 10534, "time_per_iteration": 2.322382926940918 }, { "auxiliary_loss_clip": 0.01053699, "auxiliary_loss_mlp": 0.01036457, "balance_loss_clip": 1.01347387, "balance_loss_mlp": 1.01742733, "epoch": 0.6333984668570569, "flos": 25958865663360.0, "grad_norm": 3.0806473951071687, "language_loss": 0.86417824, "learning_rate": 1.251621437204777e-06, "loss": 0.88507974, "num_input_tokens_seen": 227081310, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.36328125, "step": 10535, "time_per_iteration": 2.4390971660614014 }, { "auxiliary_loss_clip": 0.01054881, "auxiliary_loss_mlp": 0.01037793, "balance_loss_clip": 1.01284266, "balance_loss_mlp": 1.01773, "epoch": 0.6334585901097249, "flos": 23658124377600.0, "grad_norm": 1.6944717437555805, "language_loss": 0.77515024, "learning_rate": 1.2512602831906733e-06, "loss": 0.79607695, "num_input_tokens_seen": 227100365, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37109375, "step": 10536, "time_per_iteration": 2.372629404067993 }, { "auxiliary_loss_clip": 0.01054679, "auxiliary_loss_mlp": 0.01035097, "balance_loss_clip": 1.01281655, "balance_loss_mlp": 1.01864171, "epoch": 0.633518713362393, "flos": 28759424106240.0, "grad_norm": 1.5594888429898282, "language_loss": 0.60995543, "learning_rate": 1.250899157568855e-06, "loss": 0.63085318, "num_input_tokens_seen": 227119680, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.359375, "step": 10537, "time_per_iteration": 3.739100933074951 }, { "auxiliary_loss_clip": 0.01009515, "auxiliary_loss_mlp": 0.01009023, "balance_loss_clip": 1.00616169, "balance_loss_mlp": 1.00238681, "epoch": 0.6335788366150609, "flos": 70417264567680.0, "grad_norm": 0.7816503288013247, "language_loss": 0.52549839, "learning_rate": 1.2505380603530155e-06, "loss": 0.54568374, "num_input_tokens_seen": 227184465, "router_z_loss_clip": 0.02856445, "router_z_loss_mlp": 0.07128906, "step": 10538, "time_per_iteration": 3.1288058757781982 }, { "auxiliary_loss_clip": 0.01057219, "auxiliary_loss_mlp": 0.01039964, "balance_loss_clip": 1.01278448, "balance_loss_mlp": 1.01803088, "epoch": 0.6336389598677289, "flos": 23730953207040.0, "grad_norm": 1.7738693655818196, "language_loss": 0.8453328, "learning_rate": 1.250176991556848e-06, "loss": 0.86630464, "num_input_tokens_seen": 227202185, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.39257812, "step": 10539, "time_per_iteration": 2.3866984844207764 }, { "auxiliary_loss_clip": 0.01054823, "auxiliary_loss_mlp": 0.01037281, "balance_loss_clip": 1.01229489, "balance_loss_mlp": 1.01653445, "epoch": 0.6336990831203968, "flos": 29275440134400.0, "grad_norm": 1.8011083627853715, "language_loss": 0.87470877, "learning_rate": 1.2498159511940438e-06, "loss": 0.89562982, "num_input_tokens_seen": 227222020, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3828125, "step": 10540, "time_per_iteration": 3.7856457233428955 }, { "auxiliary_loss_clip": 0.01050724, "auxiliary_loss_mlp": 0.01034042, "balance_loss_clip": 1.0129776, "balance_loss_mlp": 1.01639295, "epoch": 0.6337592063730648, "flos": 29095532565120.0, "grad_norm": 1.6224240515001154, "language_loss": 0.73206109, "learning_rate": 1.2494549392782943e-06, "loss": 0.75290871, "num_input_tokens_seen": 227240885, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34375, "step": 10541, "time_per_iteration": 2.4198968410491943 }, { "auxiliary_loss_clip": 0.01056912, "auxiliary_loss_mlp": 0.0104252, "balance_loss_clip": 1.01642537, "balance_loss_mlp": 1.01822734, "epoch": 0.6338193296257327, "flos": 34705272556800.0, "grad_norm": 2.4875098565167955, "language_loss": 0.86519837, "learning_rate": 1.2490939558232887e-06, "loss": 0.88619268, "num_input_tokens_seen": 227257880, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38671875, "step": 10542, "time_per_iteration": 3.8619327545166016 }, { "auxiliary_loss_clip": 0.01053973, "auxiliary_loss_mlp": 0.01040241, "balance_loss_clip": 1.01461148, "balance_loss_mlp": 1.01779199, "epoch": 0.6338794528784008, "flos": 16686737383680.0, "grad_norm": 1.617053777984366, "language_loss": 0.78768694, "learning_rate": 1.2487330008427153e-06, "loss": 0.80862904, "num_input_tokens_seen": 227274840, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.36132812, "step": 10543, "time_per_iteration": 2.367506742477417 }, { "auxiliary_loss_clip": 0.01049853, "auxiliary_loss_mlp": 0.01038118, "balance_loss_clip": 1.01677966, "balance_loss_mlp": 1.01523864, "epoch": 0.6339395761310687, "flos": 22345494791040.0, "grad_norm": 1.7817373425375305, "language_loss": 0.74420047, "learning_rate": 1.2483720743502618e-06, "loss": 0.76508021, "num_input_tokens_seen": 227294835, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34570312, "step": 10544, "time_per_iteration": 2.381906747817993 }, { "auxiliary_loss_clip": 0.01057382, "auxiliary_loss_mlp": 0.01039631, "balance_loss_clip": 1.01556277, "balance_loss_mlp": 1.01897502, "epoch": 0.6339996993837367, "flos": 18550819895040.0, "grad_norm": 9.688946921100003, "language_loss": 0.70072532, "learning_rate": 1.2480111763596144e-06, "loss": 0.72169542, "num_input_tokens_seen": 227314935, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3828125, "step": 10545, "time_per_iteration": 2.403141736984253 }, { "auxiliary_loss_clip": 0.01053256, "auxiliary_loss_mlp": 0.01046845, "balance_loss_clip": 1.02150083, "balance_loss_mlp": 1.01660395, "epoch": 0.6340598226364046, "flos": 12968661744000.0, "grad_norm": 1.9416617481841005, "language_loss": 0.72370696, "learning_rate": 1.2476503068844592e-06, "loss": 0.744708, "num_input_tokens_seen": 227332905, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3671875, "step": 10546, "time_per_iteration": 2.3363473415374756 }, { "auxiliary_loss_clip": 0.01052793, "auxiliary_loss_mlp": 0.01038137, "balance_loss_clip": 1.01605928, "balance_loss_mlp": 1.01754987, "epoch": 0.6341199458890726, "flos": 26686769932800.0, "grad_norm": 1.266248590185462, "language_loss": 0.7871117, "learning_rate": 1.2472894659384792e-06, "loss": 0.80802101, "num_input_tokens_seen": 227354915, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3515625, "step": 10547, "time_per_iteration": 2.403912305831909 }, { "auxiliary_loss_clip": 0.01055132, "auxiliary_loss_mlp": 0.01040656, "balance_loss_clip": 1.01568174, "balance_loss_mlp": 1.01637781, "epoch": 0.6341800691417405, "flos": 18733275993600.0, "grad_norm": 1.8153097709876018, "language_loss": 0.6403628, "learning_rate": 1.2469286535353578e-06, "loss": 0.66132069, "num_input_tokens_seen": 227372990, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.38671875, "step": 10548, "time_per_iteration": 2.3602781295776367 }, { "auxiliary_loss_clip": 0.01052165, "auxiliary_loss_mlp": 0.01037673, "balance_loss_clip": 1.01503539, "balance_loss_mlp": 1.01649857, "epoch": 0.6342401923944085, "flos": 26248260856320.0, "grad_norm": 1.6812121579971906, "language_loss": 0.63377231, "learning_rate": 1.2465678696887785e-06, "loss": 0.65467072, "num_input_tokens_seen": 227393270, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35742188, "step": 10549, "time_per_iteration": 2.4006459712982178 }, { "auxiliary_loss_clip": 0.01054324, "auxiliary_loss_mlp": 0.0103797, "balance_loss_clip": 1.01490295, "balance_loss_mlp": 1.01632118, "epoch": 0.6343003156470765, "flos": 24679787760000.0, "grad_norm": 1.5354918677252463, "language_loss": 0.75391632, "learning_rate": 1.2462071144124197e-06, "loss": 0.77483928, "num_input_tokens_seen": 227413630, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.38085938, "step": 10550, "time_per_iteration": 2.4139156341552734 }, { "auxiliary_loss_clip": 0.0101031, "auxiliary_loss_mlp": 0.01013722, "balance_loss_clip": 1.01094401, "balance_loss_mlp": 1.00304782, "epoch": 0.6343604388997445, "flos": 69802164501120.0, "grad_norm": 0.7056183024957036, "language_loss": 0.57742912, "learning_rate": 1.2458463877199638e-06, "loss": 0.59766942, "num_input_tokens_seen": 227476630, "router_z_loss_clip": 0.02783203, "router_z_loss_mlp": 0.07275391, "step": 10551, "time_per_iteration": 4.424536943435669 }, { "auxiliary_loss_clip": 0.01051725, "auxiliary_loss_mlp": 0.01034172, "balance_loss_clip": 1.01283336, "balance_loss_mlp": 1.01615512, "epoch": 0.6344205621524125, "flos": 21981315732480.0, "grad_norm": 1.886352403430425, "language_loss": 0.6792829, "learning_rate": 1.2454856896250881e-06, "loss": 0.70014191, "num_input_tokens_seen": 227496060, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.35546875, "step": 10552, "time_per_iteration": 2.3620798587799072 }, { "auxiliary_loss_clip": 0.01055806, "auxiliary_loss_mlp": 0.01036999, "balance_loss_clip": 1.01093936, "balance_loss_mlp": 1.01705837, "epoch": 0.6344806854050804, "flos": 20447825685120.0, "grad_norm": 1.6843795925748004, "language_loss": 0.83854735, "learning_rate": 1.24512502014147e-06, "loss": 0.85947537, "num_input_tokens_seen": 227513440, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38671875, "step": 10553, "time_per_iteration": 2.4274709224700928 }, { "auxiliary_loss_clip": 0.0105595, "auxiliary_loss_mlp": 0.01041381, "balance_loss_clip": 1.01778984, "balance_loss_mlp": 1.01743221, "epoch": 0.6345408086577484, "flos": 40509163952640.0, "grad_norm": 2.0576263251712663, "language_loss": 0.56958491, "learning_rate": 1.2447643792827879e-06, "loss": 0.59055817, "num_input_tokens_seen": 227535395, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.38671875, "step": 10554, "time_per_iteration": 2.516509532928467 }, { "auxiliary_loss_clip": 0.01055926, "auxiliary_loss_mlp": 0.01032411, "balance_loss_clip": 1.00866508, "balance_loss_mlp": 1.01850128, "epoch": 0.6346009319104163, "flos": 21360245823360.0, "grad_norm": 2.3870921251722503, "language_loss": 0.71880019, "learning_rate": 1.2444037670627153e-06, "loss": 0.73968357, "num_input_tokens_seen": 227554545, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.375, "step": 10555, "time_per_iteration": 2.3956048488616943 }, { "auxiliary_loss_clip": 0.01012684, "auxiliary_loss_mlp": 0.01004095, "balance_loss_clip": 1.00152016, "balance_loss_mlp": 1.00535035, "epoch": 0.6346610551630844, "flos": 71362433427840.0, "grad_norm": 0.7784149961114011, "language_loss": 0.55477673, "learning_rate": 1.2440431834949276e-06, "loss": 0.5749445, "num_input_tokens_seen": 227608575, "router_z_loss_clip": 0.02575684, "router_z_loss_mlp": 0.07324219, "step": 10556, "time_per_iteration": 2.939636468887329 }, { "auxiliary_loss_clip": 0.01055445, "auxiliary_loss_mlp": 0.0103795, "balance_loss_clip": 1.0108304, "balance_loss_mlp": 1.01748538, "epoch": 0.6347211784157523, "flos": 25410310381440.0, "grad_norm": 1.69413589183333, "language_loss": 0.69599134, "learning_rate": 1.2436826285930985e-06, "loss": 0.71692526, "num_input_tokens_seen": 227628175, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.37890625, "step": 10557, "time_per_iteration": 2.502399206161499 }, { "auxiliary_loss_clip": 0.01053686, "auxiliary_loss_mlp": 0.0103757, "balance_loss_clip": 1.01362062, "balance_loss_mlp": 1.0177139, "epoch": 0.6347813016684203, "flos": 15741812903040.0, "grad_norm": 1.9706525508964654, "language_loss": 0.71255255, "learning_rate": 1.2433221023709002e-06, "loss": 0.73346508, "num_input_tokens_seen": 227645330, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.359375, "step": 10558, "time_per_iteration": 2.3516550064086914 }, { "auxiliary_loss_clip": 0.01054155, "auxiliary_loss_mlp": 0.01035075, "balance_loss_clip": 1.0096122, "balance_loss_mlp": 1.01747298, "epoch": 0.6348414249210882, "flos": 21463868338560.0, "grad_norm": 1.4658797116392277, "language_loss": 0.78971624, "learning_rate": 1.2429616048420031e-06, "loss": 0.81060851, "num_input_tokens_seen": 227665250, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3671875, "step": 10559, "time_per_iteration": 2.4370269775390625 }, { "auxiliary_loss_clip": 0.01056048, "auxiliary_loss_mlp": 0.0103811, "balance_loss_clip": 1.01329088, "balance_loss_mlp": 1.01773882, "epoch": 0.6349015481737562, "flos": 21651980077440.0, "grad_norm": 2.626856065089491, "language_loss": 0.69926584, "learning_rate": 1.242601136020078e-06, "loss": 0.72020745, "num_input_tokens_seen": 227685070, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3828125, "step": 10560, "time_per_iteration": 2.3701963424682617 }, { "auxiliary_loss_clip": 0.01054869, "auxiliary_loss_mlp": 0.01041107, "balance_loss_clip": 1.01746821, "balance_loss_mlp": 1.0175786, "epoch": 0.6349616714264241, "flos": 22194041846400.0, "grad_norm": 1.7074778445425454, "language_loss": 0.78159529, "learning_rate": 1.2422406959187939e-06, "loss": 0.80255508, "num_input_tokens_seen": 227704430, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37304688, "step": 10561, "time_per_iteration": 2.4509217739105225 }, { "auxiliary_loss_clip": 0.01056445, "auxiliary_loss_mlp": 0.01045014, "balance_loss_clip": 1.02005172, "balance_loss_mlp": 1.01805532, "epoch": 0.6350217946790921, "flos": 25409193217920.0, "grad_norm": 1.9893474569242409, "language_loss": 0.74144012, "learning_rate": 1.2418802845518178e-06, "loss": 0.76245475, "num_input_tokens_seen": 227724920, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38476562, "step": 10562, "time_per_iteration": 2.402327299118042 }, { "auxiliary_loss_clip": 0.01055385, "auxiliary_loss_mlp": 0.01041275, "balance_loss_clip": 1.01686072, "balance_loss_mlp": 1.01761675, "epoch": 0.63508191793176, "flos": 19717931468160.0, "grad_norm": 2.231562273283751, "language_loss": 0.81310129, "learning_rate": 1.2415199019328185e-06, "loss": 0.83406794, "num_input_tokens_seen": 227743400, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37890625, "step": 10563, "time_per_iteration": 2.399550676345825 }, { "auxiliary_loss_clip": 0.01056006, "auxiliary_loss_mlp": 0.01042473, "balance_loss_clip": 1.01619911, "balance_loss_mlp": 1.01852775, "epoch": 0.6351420411844281, "flos": 18185942609280.0, "grad_norm": 2.1024285440910986, "language_loss": 0.82359749, "learning_rate": 1.2411595480754597e-06, "loss": 0.84458232, "num_input_tokens_seen": 227759990, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.375, "step": 10564, "time_per_iteration": 2.369825601577759 }, { "auxiliary_loss_clip": 0.01054781, "auxiliary_loss_mlp": 0.01038117, "balance_loss_clip": 1.01433444, "balance_loss_mlp": 1.01801491, "epoch": 0.6352021644370961, "flos": 33725190470400.0, "grad_norm": 1.5309668660752853, "language_loss": 0.73571897, "learning_rate": 1.240799222993407e-06, "loss": 0.75664794, "num_input_tokens_seen": 227780835, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3671875, "step": 10565, "time_per_iteration": 2.519329786300659 }, { "auxiliary_loss_clip": 0.01056496, "auxiliary_loss_mlp": 0.01043882, "balance_loss_clip": 1.01742959, "balance_loss_mlp": 1.01784301, "epoch": 0.635262287689764, "flos": 20373774958080.0, "grad_norm": 1.9521500880999239, "language_loss": 0.70212841, "learning_rate": 1.240438926700324e-06, "loss": 0.72313219, "num_input_tokens_seen": 227798580, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.38671875, "step": 10566, "time_per_iteration": 2.3518264293670654 }, { "auxiliary_loss_clip": 0.01053668, "auxiliary_loss_mlp": 0.01039041, "balance_loss_clip": 1.01555657, "balance_loss_mlp": 1.01790631, "epoch": 0.635322410942432, "flos": 27524231648640.0, "grad_norm": 1.5007559089902194, "language_loss": 0.70460021, "learning_rate": 1.2400786592098725e-06, "loss": 0.72552735, "num_input_tokens_seen": 227819210, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35742188, "step": 10567, "time_per_iteration": 2.4497287273406982 }, { "auxiliary_loss_clip": 0.01052326, "auxiliary_loss_mlp": 0.01040545, "balance_loss_clip": 1.01791954, "balance_loss_mlp": 1.01757312, "epoch": 0.6353825341950999, "flos": 21542527365120.0, "grad_norm": 1.6417184777805978, "language_loss": 0.85837978, "learning_rate": 1.2397184205357154e-06, "loss": 0.87930852, "num_input_tokens_seen": 227838340, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34765625, "step": 10568, "time_per_iteration": 2.402979612350464 }, { "auxiliary_loss_clip": 0.01055846, "auxiliary_loss_mlp": 0.01039656, "balance_loss_clip": 1.01480126, "balance_loss_mlp": 1.01743841, "epoch": 0.635442657447768, "flos": 31758393139200.0, "grad_norm": 1.8956404866322525, "language_loss": 0.85324699, "learning_rate": 1.2393582106915113e-06, "loss": 0.87420201, "num_input_tokens_seen": 227859170, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38476562, "step": 10569, "time_per_iteration": 2.485971212387085 }, { "auxiliary_loss_clip": 0.01052976, "auxiliary_loss_mlp": 0.01040939, "balance_loss_clip": 1.01770484, "balance_loss_mlp": 1.01706934, "epoch": 0.6355027807004359, "flos": 19827803116800.0, "grad_norm": 1.6344510681161528, "language_loss": 0.70225686, "learning_rate": 1.2389980296909198e-06, "loss": 0.72319603, "num_input_tokens_seen": 227878545, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.359375, "step": 10570, "time_per_iteration": 2.3852434158325195 }, { "auxiliary_loss_clip": 0.0105606, "auxiliary_loss_mlp": 0.01044986, "balance_loss_clip": 1.01810431, "balance_loss_mlp": 1.01669359, "epoch": 0.6355629039531039, "flos": 30371084421120.0, "grad_norm": 2.106011072210145, "language_loss": 0.6788125, "learning_rate": 1.2386378775476e-06, "loss": 0.69982296, "num_input_tokens_seen": 227898875, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.39453125, "step": 10571, "time_per_iteration": 2.4619498252868652 }, { "auxiliary_loss_clip": 0.01056274, "auxiliary_loss_mlp": 0.01040444, "balance_loss_clip": 1.01593471, "balance_loss_mlp": 1.01814806, "epoch": 0.6356230272057718, "flos": 17931076617600.0, "grad_norm": 1.7978328884585786, "language_loss": 0.72235072, "learning_rate": 1.2382777542752074e-06, "loss": 0.74331784, "num_input_tokens_seen": 227917130, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.38085938, "step": 10572, "time_per_iteration": 2.3491668701171875 }, { "auxiliary_loss_clip": 0.01051926, "auxiliary_loss_mlp": 0.01039292, "balance_loss_clip": 1.01729822, "balance_loss_mlp": 1.01603866, "epoch": 0.6356831504584398, "flos": 25374629105280.0, "grad_norm": 1.4752368432872431, "language_loss": 0.82069641, "learning_rate": 1.2379176598873992e-06, "loss": 0.84160858, "num_input_tokens_seen": 227939550, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.359375, "step": 10573, "time_per_iteration": 2.427277088165283 }, { "auxiliary_loss_clip": 0.01054825, "auxiliary_loss_mlp": 0.01048049, "balance_loss_clip": 1.02191865, "balance_loss_mlp": 1.01622045, "epoch": 0.6357432737111077, "flos": 46498548735360.0, "grad_norm": 1.6301358510327695, "language_loss": 0.69979638, "learning_rate": 1.2375575943978303e-06, "loss": 0.72082508, "num_input_tokens_seen": 227962200, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38671875, "step": 10574, "time_per_iteration": 2.5732688903808594 }, { "auxiliary_loss_clip": 0.01053273, "auxiliary_loss_mlp": 0.0103777, "balance_loss_clip": 1.01333213, "balance_loss_mlp": 1.01630473, "epoch": 0.6358033969637757, "flos": 17273417736960.0, "grad_norm": 2.39940966853445, "language_loss": 0.88575089, "learning_rate": 1.2371975578201525e-06, "loss": 0.90666133, "num_input_tokens_seen": 227979270, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36914062, "step": 10575, "time_per_iteration": 2.333828926086426 }, { "auxiliary_loss_clip": 0.0105286, "auxiliary_loss_mlp": 0.01045721, "balance_loss_clip": 1.02236807, "balance_loss_mlp": 1.01688266, "epoch": 0.6358635202164437, "flos": 27124301491200.0, "grad_norm": 2.2832010559061007, "language_loss": 0.72966629, "learning_rate": 1.2368375501680204e-06, "loss": 0.75065213, "num_input_tokens_seen": 228000550, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 10576, "time_per_iteration": 3.6443793773651123 }, { "auxiliary_loss_clip": 0.01054978, "auxiliary_loss_mlp": 0.01043992, "balance_loss_clip": 1.01781392, "balance_loss_mlp": 1.0167551, "epoch": 0.6359236434691117, "flos": 27524022180480.0, "grad_norm": 1.8639850820859512, "language_loss": 0.70064229, "learning_rate": 1.236477571455085e-06, "loss": 0.721632, "num_input_tokens_seen": 228022005, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3828125, "step": 10577, "time_per_iteration": 2.4302561283111572 }, { "auxiliary_loss_clip": 0.01054436, "auxiliary_loss_mlp": 0.01037292, "balance_loss_clip": 1.01455891, "balance_loss_mlp": 1.01777601, "epoch": 0.6359837667217797, "flos": 39346730501760.0, "grad_norm": 1.6344513759150767, "language_loss": 0.73614562, "learning_rate": 1.2361176216949964e-06, "loss": 0.75706291, "num_input_tokens_seen": 228043770, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3671875, "step": 10578, "time_per_iteration": 2.5098342895507812 }, { "auxiliary_loss_clip": 0.01010246, "auxiliary_loss_mlp": 0.01008768, "balance_loss_clip": 1.00639558, "balance_loss_mlp": 1.00305974, "epoch": 0.6360438899744476, "flos": 56411017994880.0, "grad_norm": 0.7098777113153529, "language_loss": 0.54563677, "learning_rate": 1.2357577009014044e-06, "loss": 0.56582689, "num_input_tokens_seen": 228104985, "router_z_loss_clip": 0.02368164, "router_z_loss_mlp": 0.07177734, "step": 10579, "time_per_iteration": 3.1269760131835938 }, { "auxiliary_loss_clip": 0.01054342, "auxiliary_loss_mlp": 0.01039006, "balance_loss_clip": 1.0141747, "balance_loss_mlp": 1.01687992, "epoch": 0.6361040132271156, "flos": 24971940950400.0, "grad_norm": 1.5932020620655667, "language_loss": 0.78243554, "learning_rate": 1.2353978090879568e-06, "loss": 0.80336905, "num_input_tokens_seen": 228125620, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37304688, "step": 10580, "time_per_iteration": 3.902066707611084 }, { "auxiliary_loss_clip": 0.01053445, "auxiliary_loss_mlp": 0.01038031, "balance_loss_clip": 1.01424837, "balance_loss_mlp": 1.01643038, "epoch": 0.6361641364797835, "flos": 23258054574720.0, "grad_norm": 1.8558693793394638, "language_loss": 0.67887533, "learning_rate": 1.235037946268301e-06, "loss": 0.69979012, "num_input_tokens_seen": 228143495, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 10581, "time_per_iteration": 3.899576187133789 }, { "auxiliary_loss_clip": 0.01053544, "auxiliary_loss_mlp": 0.0103442, "balance_loss_clip": 1.01250923, "balance_loss_mlp": 1.01650929, "epoch": 0.6362242597324516, "flos": 25993325041920.0, "grad_norm": 1.4794772774774314, "language_loss": 0.69208682, "learning_rate": 1.2346781124560828e-06, "loss": 0.71296644, "num_input_tokens_seen": 228166500, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.37109375, "step": 10582, "time_per_iteration": 2.435685873031616 }, { "auxiliary_loss_clip": 0.01054172, "auxiliary_loss_mlp": 0.01041708, "balance_loss_clip": 1.01926064, "balance_loss_mlp": 1.01683831, "epoch": 0.6362843829851195, "flos": 25702044635520.0, "grad_norm": 1.7033765305013842, "language_loss": 0.85580683, "learning_rate": 1.2343183076649473e-06, "loss": 0.87676561, "num_input_tokens_seen": 228185325, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.37304688, "step": 10583, "time_per_iteration": 2.3920769691467285 }, { "auxiliary_loss_clip": 0.01053805, "auxiliary_loss_mlp": 0.01038187, "balance_loss_clip": 1.01378489, "balance_loss_mlp": 1.01770902, "epoch": 0.6363445062377875, "flos": 20521841500800.0, "grad_norm": 1.6304133558948084, "language_loss": 0.76501834, "learning_rate": 1.233958531908538e-06, "loss": 0.78593826, "num_input_tokens_seen": 228204050, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36132812, "step": 10584, "time_per_iteration": 2.4013376235961914 }, { "auxiliary_loss_clip": 0.01055925, "auxiliary_loss_mlp": 0.01045823, "balance_loss_clip": 1.01959646, "balance_loss_mlp": 1.01737285, "epoch": 0.6364046294904554, "flos": 19462786185600.0, "grad_norm": 2.0157955998808155, "language_loss": 0.74359572, "learning_rate": 1.2335987852004985e-06, "loss": 0.76461315, "num_input_tokens_seen": 228222430, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38671875, "step": 10585, "time_per_iteration": 2.348602294921875 }, { "auxiliary_loss_clip": 0.01053522, "auxiliary_loss_mlp": 0.01037148, "balance_loss_clip": 1.0143671, "balance_loss_mlp": 1.01682401, "epoch": 0.6364647527431234, "flos": 20994844867200.0, "grad_norm": 1.7627335178819241, "language_loss": 0.83564305, "learning_rate": 1.2332390675544697e-06, "loss": 0.85654974, "num_input_tokens_seen": 228241925, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3671875, "step": 10586, "time_per_iteration": 2.4007129669189453 }, { "auxiliary_loss_clip": 0.01053381, "auxiliary_loss_mlp": 0.01033568, "balance_loss_clip": 1.01085877, "balance_loss_mlp": 1.0177424, "epoch": 0.6365248759957913, "flos": 25769741495040.0, "grad_norm": 1.4538629842312503, "language_loss": 0.73724508, "learning_rate": 1.2328793789840918e-06, "loss": 0.75811452, "num_input_tokens_seen": 228262535, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35546875, "step": 10587, "time_per_iteration": 2.401477098464966 }, { "auxiliary_loss_clip": 0.01054192, "auxiliary_loss_mlp": 0.01037469, "balance_loss_clip": 1.01387727, "balance_loss_mlp": 1.0169481, "epoch": 0.6365849992484593, "flos": 22454493655680.0, "grad_norm": 2.021686373013303, "language_loss": 0.77966654, "learning_rate": 1.2325197195030058e-06, "loss": 0.80058318, "num_input_tokens_seen": 228281340, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37304688, "step": 10588, "time_per_iteration": 2.4241607189178467 }, { "auxiliary_loss_clip": 0.01053325, "auxiliary_loss_mlp": 0.01034712, "balance_loss_clip": 1.0099287, "balance_loss_mlp": 1.01797986, "epoch": 0.6366451225011273, "flos": 19024696045440.0, "grad_norm": 1.3792874206452939, "language_loss": 0.80564713, "learning_rate": 1.2321600891248478e-06, "loss": 0.82652754, "num_input_tokens_seen": 228300865, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.35351562, "step": 10589, "time_per_iteration": 2.371394157409668 }, { "auxiliary_loss_clip": 0.01053409, "auxiliary_loss_mlp": 0.01038626, "balance_loss_clip": 1.01456928, "balance_loss_mlp": 1.0171504, "epoch": 0.6367052457537953, "flos": 25227225878400.0, "grad_norm": 3.7049579184520467, "language_loss": 0.68489027, "learning_rate": 1.231800487863257e-06, "loss": 0.70581067, "num_input_tokens_seen": 228320815, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36328125, "step": 10590, "time_per_iteration": 3.834165573120117 }, { "auxiliary_loss_clip": 0.01058435, "auxiliary_loss_mlp": 0.01043515, "balance_loss_clip": 1.01688397, "balance_loss_mlp": 1.01860523, "epoch": 0.6367653690064633, "flos": 19207431434880.0, "grad_norm": 2.3495765803875326, "language_loss": 0.79849184, "learning_rate": 1.2314409157318685e-06, "loss": 0.81951135, "num_input_tokens_seen": 228339065, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.3984375, "step": 10591, "time_per_iteration": 2.3824236392974854 }, { "auxiliary_loss_clip": 0.01054569, "auxiliary_loss_mlp": 0.01030314, "balance_loss_clip": 1.00821209, "balance_loss_mlp": 1.01862824, "epoch": 0.6368254922591312, "flos": 23545774022400.0, "grad_norm": 1.4267203044658878, "language_loss": 0.89747477, "learning_rate": 1.231081372744317e-06, "loss": 0.91832358, "num_input_tokens_seen": 228359210, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.359375, "step": 10592, "time_per_iteration": 2.416938066482544 }, { "auxiliary_loss_clip": 0.01051486, "auxiliary_loss_mlp": 0.01036208, "balance_loss_clip": 1.01423752, "balance_loss_mlp": 1.01735365, "epoch": 0.6368856155117992, "flos": 26466153851520.0, "grad_norm": 1.3554144470817984, "language_loss": 0.69443345, "learning_rate": 1.2307218589142376e-06, "loss": 0.71531045, "num_input_tokens_seen": 228379630, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.33984375, "step": 10593, "time_per_iteration": 2.3927080631256104 }, { "auxiliary_loss_clip": 0.0105279, "auxiliary_loss_mlp": 0.01036367, "balance_loss_clip": 1.01357436, "balance_loss_mlp": 1.0173254, "epoch": 0.6369457387644671, "flos": 33691045294080.0, "grad_norm": 1.8689730736205863, "language_loss": 0.64861178, "learning_rate": 1.2303623742552618e-06, "loss": 0.66950333, "num_input_tokens_seen": 228401410, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35546875, "step": 10594, "time_per_iteration": 2.4740400314331055 }, { "auxiliary_loss_clip": 0.01010527, "auxiliary_loss_mlp": 0.01015672, "balance_loss_clip": 1.01306093, "balance_loss_mlp": 1.00321436, "epoch": 0.6370058620171352, "flos": 70905140173440.0, "grad_norm": 0.7723935863087253, "language_loss": 0.54724157, "learning_rate": 1.230002918781022e-06, "loss": 0.56750351, "num_input_tokens_seen": 228470335, "router_z_loss_clip": 0.02612305, "router_z_loss_mlp": 0.07324219, "step": 10595, "time_per_iteration": 3.1471354961395264 }, { "auxiliary_loss_clip": 0.01055949, "auxiliary_loss_mlp": 0.0104199, "balance_loss_clip": 1.01602578, "balance_loss_mlp": 1.01798737, "epoch": 0.6370659852698031, "flos": 21140886551040.0, "grad_norm": 1.6666198966593575, "language_loss": 0.68110383, "learning_rate": 1.2296434925051493e-06, "loss": 0.70208323, "num_input_tokens_seen": 228490765, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37890625, "step": 10596, "time_per_iteration": 2.384780168533325 }, { "auxiliary_loss_clip": 0.01054751, "auxiliary_loss_mlp": 0.01039787, "balance_loss_clip": 1.01618385, "balance_loss_mlp": 1.01734018, "epoch": 0.6371261085224711, "flos": 20192261466240.0, "grad_norm": 2.09522073756666, "language_loss": 0.8106634, "learning_rate": 1.2292840954412718e-06, "loss": 0.83160877, "num_input_tokens_seen": 228509700, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.375, "step": 10597, "time_per_iteration": 2.365744113922119 }, { "auxiliary_loss_clip": 0.01056301, "auxiliary_loss_mlp": 0.01037601, "balance_loss_clip": 1.0155468, "balance_loss_mlp": 1.01927042, "epoch": 0.637186231775139, "flos": 19682494571520.0, "grad_norm": 1.7837036168454625, "language_loss": 0.7522254, "learning_rate": 1.2289247276030189e-06, "loss": 0.77316439, "num_input_tokens_seen": 228529050, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.37109375, "step": 10598, "time_per_iteration": 2.3837478160858154 }, { "auxiliary_loss_clip": 0.01053949, "auxiliary_loss_mlp": 0.01036319, "balance_loss_clip": 1.0127635, "balance_loss_mlp": 1.01660025, "epoch": 0.637246355027807, "flos": 13070573602560.0, "grad_norm": 2.2546418351519475, "language_loss": 0.69410825, "learning_rate": 1.2285653890040176e-06, "loss": 0.71501094, "num_input_tokens_seen": 228544665, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.37304688, "step": 10599, "time_per_iteration": 2.324079751968384 }, { "auxiliary_loss_clip": 0.01055104, "auxiliary_loss_mlp": 0.01038304, "balance_loss_clip": 1.01303196, "balance_loss_mlp": 1.0176084, "epoch": 0.6373064782804749, "flos": 18221693708160.0, "grad_norm": 1.9253765385793242, "language_loss": 0.82289577, "learning_rate": 1.2282060796578942e-06, "loss": 0.84382987, "num_input_tokens_seen": 228562060, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.375, "step": 10600, "time_per_iteration": 2.3559093475341797 }, { "auxiliary_loss_clip": 0.01053734, "auxiliary_loss_mlp": 0.01043095, "balance_loss_clip": 1.01959836, "balance_loss_mlp": 1.01728725, "epoch": 0.637366601533143, "flos": 24497331661440.0, "grad_norm": 1.4630287045656956, "language_loss": 0.80345476, "learning_rate": 1.2278467995782732e-06, "loss": 0.82442307, "num_input_tokens_seen": 228582550, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 10601, "time_per_iteration": 2.400719404220581 }, { "auxiliary_loss_clip": 0.01056373, "auxiliary_loss_mlp": 0.01038458, "balance_loss_clip": 1.01521182, "balance_loss_mlp": 1.0189724, "epoch": 0.6374267247858109, "flos": 26357853214080.0, "grad_norm": 1.9223051188917244, "language_loss": 0.68614793, "learning_rate": 1.2274875487787797e-06, "loss": 0.70709622, "num_input_tokens_seen": 228604960, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.375, "step": 10602, "time_per_iteration": 2.4397361278533936 }, { "auxiliary_loss_clip": 0.01053274, "auxiliary_loss_mlp": 0.01035884, "balance_loss_clip": 1.01319885, "balance_loss_mlp": 1.01749349, "epoch": 0.6374868480384789, "flos": 20370807492480.0, "grad_norm": 1.697791208248418, "language_loss": 0.80390084, "learning_rate": 1.2271283272730354e-06, "loss": 0.82479239, "num_input_tokens_seen": 228622195, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.359375, "step": 10603, "time_per_iteration": 2.3665642738342285 }, { "auxiliary_loss_clip": 0.01054334, "auxiliary_loss_mlp": 0.01041314, "balance_loss_clip": 1.01551723, "balance_loss_mlp": 1.01737845, "epoch": 0.6375469712911469, "flos": 20995193980800.0, "grad_norm": 1.999363034791124, "language_loss": 0.78986752, "learning_rate": 1.2267691350746621e-06, "loss": 0.81082398, "num_input_tokens_seen": 228639735, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37109375, "step": 10604, "time_per_iteration": 2.38584566116333 }, { "auxiliary_loss_clip": 0.01055169, "auxiliary_loss_mlp": 0.01033883, "balance_loss_clip": 1.00942171, "balance_loss_mlp": 1.01718986, "epoch": 0.6376070945438148, "flos": 19714824357120.0, "grad_norm": 1.601346196488208, "language_loss": 0.78348655, "learning_rate": 1.226409972197281e-06, "loss": 0.80437708, "num_input_tokens_seen": 228658195, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37890625, "step": 10605, "time_per_iteration": 2.389240264892578 }, { "auxiliary_loss_clip": 0.01055318, "auxiliary_loss_mlp": 0.01041264, "balance_loss_clip": 1.0153718, "balance_loss_mlp": 1.01775038, "epoch": 0.6376672177964828, "flos": 21505694014080.0, "grad_norm": 1.6322736209049107, "language_loss": 0.66693819, "learning_rate": 1.2260508386545106e-06, "loss": 0.68790394, "num_input_tokens_seen": 228677415, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.375, "step": 10606, "time_per_iteration": 2.379565477371216 }, { "auxiliary_loss_clip": 0.01053013, "auxiliary_loss_mlp": 0.01038747, "balance_loss_clip": 1.01643062, "balance_loss_mlp": 1.01760197, "epoch": 0.6377273410491507, "flos": 18842868351360.0, "grad_norm": 1.5385206171560668, "language_loss": 0.76431644, "learning_rate": 1.225691734459971e-06, "loss": 0.78523403, "num_input_tokens_seen": 228696450, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35351562, "step": 10607, "time_per_iteration": 2.353355646133423 }, { "auxiliary_loss_clip": 0.01054772, "auxiliary_loss_mlp": 0.01039621, "balance_loss_clip": 1.01520693, "balance_loss_mlp": 1.01722693, "epoch": 0.6377874643018188, "flos": 53061138086400.0, "grad_norm": 1.8270057650195868, "language_loss": 0.66970658, "learning_rate": 1.225332659627278e-06, "loss": 0.69065046, "num_input_tokens_seen": 228721600, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.375, "step": 10608, "time_per_iteration": 2.683927536010742 }, { "auxiliary_loss_clip": 0.01009675, "auxiliary_loss_mlp": 0.01002365, "balance_loss_clip": 1.00018311, "balance_loss_mlp": 1.00241137, "epoch": 0.6378475875544867, "flos": 65131972640640.0, "grad_norm": 0.7134789297696591, "language_loss": 0.51892561, "learning_rate": 1.2249736141700475e-06, "loss": 0.53904599, "num_input_tokens_seen": 228784535, "router_z_loss_clip": 0.02185059, "router_z_loss_mlp": 0.07275391, "step": 10609, "time_per_iteration": 2.96268367767334 }, { "auxiliary_loss_clip": 0.01051812, "auxiliary_loss_mlp": 0.01035397, "balance_loss_clip": 1.01520276, "balance_loss_mlp": 1.01757264, "epoch": 0.6379077108071547, "flos": 23001652483200.0, "grad_norm": 1.5450428879717857, "language_loss": 0.75435358, "learning_rate": 1.2246145981018965e-06, "loss": 0.7752257, "num_input_tokens_seen": 228804110, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.34179688, "step": 10610, "time_per_iteration": 2.3828418254852295 }, { "auxiliary_loss_clip": 0.01009651, "auxiliary_loss_mlp": 0.01002363, "balance_loss_clip": 0.99997842, "balance_loss_mlp": 1.00231504, "epoch": 0.6379678340598226, "flos": 67598201237760.0, "grad_norm": 0.8385822727671279, "language_loss": 0.63262922, "learning_rate": 1.2242556114364364e-06, "loss": 0.65274936, "num_input_tokens_seen": 228867705, "router_z_loss_clip": 0.02380371, "router_z_loss_mlp": 0.07324219, "step": 10611, "time_per_iteration": 3.0573079586029053 }, { "auxiliary_loss_clip": 0.01054229, "auxiliary_loss_mlp": 0.01041885, "balance_loss_clip": 1.01793635, "balance_loss_mlp": 1.0167197, "epoch": 0.6380279573124906, "flos": 29678756693760.0, "grad_norm": 1.8553445063427902, "language_loss": 0.73624122, "learning_rate": 1.223896654187282e-06, "loss": 0.75720239, "num_input_tokens_seen": 228889215, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.375, "step": 10612, "time_per_iteration": 2.4269909858703613 }, { "auxiliary_loss_clip": 0.01009373, "auxiliary_loss_mlp": 0.01008336, "balance_loss_clip": 1.00594032, "balance_loss_mlp": 1.0021081, "epoch": 0.6380880805651585, "flos": 66480981730560.0, "grad_norm": 0.7204601007870353, "language_loss": 0.58019388, "learning_rate": 1.2235377263680446e-06, "loss": 0.600371, "num_input_tokens_seen": 228948465, "router_z_loss_clip": 0.02392578, "router_z_loss_mlp": 0.07275391, "step": 10613, "time_per_iteration": 2.9454524517059326 }, { "auxiliary_loss_clip": 0.01054966, "auxiliary_loss_mlp": 0.01039512, "balance_loss_clip": 1.01481223, "balance_loss_mlp": 1.01715517, "epoch": 0.6381482038178266, "flos": 23913863153280.0, "grad_norm": 1.8488710244293527, "language_loss": 0.77190042, "learning_rate": 1.2231788279923334e-06, "loss": 0.79284525, "num_input_tokens_seen": 228967955, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37890625, "step": 10614, "time_per_iteration": 2.3909289836883545 }, { "auxiliary_loss_clip": 0.01052852, "auxiliary_loss_mlp": 0.01040033, "balance_loss_clip": 1.01608372, "balance_loss_mlp": 1.01721048, "epoch": 0.6382083270704945, "flos": 24241907088000.0, "grad_norm": 1.9351499339242786, "language_loss": 0.80545157, "learning_rate": 1.2228199590737599e-06, "loss": 0.82638037, "num_input_tokens_seen": 228985495, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.35546875, "step": 10615, "time_per_iteration": 3.6266438961029053 }, { "auxiliary_loss_clip": 0.01009821, "auxiliary_loss_mlp": 0.01002153, "balance_loss_clip": 0.99990022, "balance_loss_mlp": 1.00257933, "epoch": 0.6382684503231625, "flos": 70771736401920.0, "grad_norm": 0.6581695666093357, "language_loss": 0.55737668, "learning_rate": 1.2224611196259305e-06, "loss": 0.57749641, "num_input_tokens_seen": 229052995, "router_z_loss_clip": 0.02258301, "router_z_loss_mlp": 0.07226562, "step": 10616, "time_per_iteration": 3.1206557750701904 }, { "auxiliary_loss_clip": 0.01053684, "auxiliary_loss_mlp": 0.01041923, "balance_loss_clip": 1.01963103, "balance_loss_mlp": 1.01715922, "epoch": 0.6383285735758305, "flos": 16543907544960.0, "grad_norm": 1.7650089907929127, "language_loss": 0.85297406, "learning_rate": 1.2221023096624538e-06, "loss": 0.8739301, "num_input_tokens_seen": 229071030, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.36523438, "step": 10617, "time_per_iteration": 2.404188871383667 }, { "auxiliary_loss_clip": 0.0105487, "auxiliary_loss_mlp": 0.01040667, "balance_loss_clip": 1.01564527, "balance_loss_mlp": 1.01723599, "epoch": 0.6383886968284984, "flos": 14426809344000.0, "grad_norm": 1.8911769906679654, "language_loss": 0.88774347, "learning_rate": 1.221743529196936e-06, "loss": 0.9086988, "num_input_tokens_seen": 229088275, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37695312, "step": 10618, "time_per_iteration": 2.344275951385498 }, { "auxiliary_loss_clip": 0.01054902, "auxiliary_loss_mlp": 0.01040899, "balance_loss_clip": 1.01851189, "balance_loss_mlp": 1.01845264, "epoch": 0.6384488200811664, "flos": 17928737556480.0, "grad_norm": 1.749997948713539, "language_loss": 0.74908483, "learning_rate": 1.2213847782429806e-06, "loss": 0.77004278, "num_input_tokens_seen": 229105190, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.36523438, "step": 10619, "time_per_iteration": 2.3621978759765625 }, { "auxiliary_loss_clip": 0.01058841, "auxiliary_loss_mlp": 0.01045404, "balance_loss_clip": 1.0174737, "balance_loss_mlp": 1.01840949, "epoch": 0.6385089433338343, "flos": 18514580037120.0, "grad_norm": 2.1160464467817217, "language_loss": 0.77392125, "learning_rate": 1.221026056814193e-06, "loss": 0.79496372, "num_input_tokens_seen": 229122290, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40429688, "step": 10620, "time_per_iteration": 3.6905622482299805 }, { "auxiliary_loss_clip": 0.01052905, "auxiliary_loss_mlp": 0.01036198, "balance_loss_clip": 1.0128088, "balance_loss_mlp": 1.01640725, "epoch": 0.6385690665865024, "flos": 24752476944000.0, "grad_norm": 2.9232748885131916, "language_loss": 0.72212791, "learning_rate": 1.2206673649241752e-06, "loss": 0.74301887, "num_input_tokens_seen": 229141620, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36523438, "step": 10621, "time_per_iteration": 3.794584035873413 }, { "auxiliary_loss_clip": 0.01049548, "auxiliary_loss_mlp": 0.010322, "balance_loss_clip": 1.01078987, "balance_loss_mlp": 1.01580131, "epoch": 0.6386291898391703, "flos": 20119537370880.0, "grad_norm": 1.6100612540441017, "language_loss": 0.79087341, "learning_rate": 1.220308702586529e-06, "loss": 0.81169093, "num_input_tokens_seen": 229161570, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.33789062, "step": 10622, "time_per_iteration": 2.381413459777832 }, { "auxiliary_loss_clip": 0.0105134, "auxiliary_loss_mlp": 0.01037389, "balance_loss_clip": 1.01608658, "balance_loss_mlp": 1.0164696, "epoch": 0.6386893130918383, "flos": 16866505307520.0, "grad_norm": 1.8028721696702565, "language_loss": 0.75806749, "learning_rate": 1.2199500698148546e-06, "loss": 0.7789548, "num_input_tokens_seen": 229178465, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34960938, "step": 10623, "time_per_iteration": 2.348745107650757 }, { "auxiliary_loss_clip": 0.01049956, "auxiliary_loss_mlp": 0.01029948, "balance_loss_clip": 1.00932503, "balance_loss_mlp": 1.01612282, "epoch": 0.6387494363445062, "flos": 22965168245760.0, "grad_norm": 1.4462931949638416, "language_loss": 0.77092171, "learning_rate": 1.2195914666227527e-06, "loss": 0.79172081, "num_input_tokens_seen": 229198975, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.33789062, "step": 10624, "time_per_iteration": 2.4097609519958496 }, { "auxiliary_loss_clip": 0.01053507, "auxiliary_loss_mlp": 0.01038881, "balance_loss_clip": 1.01611245, "balance_loss_mlp": 1.01716065, "epoch": 0.6388095595971742, "flos": 22856588317440.0, "grad_norm": 1.704357042494605, "language_loss": 0.8184998, "learning_rate": 1.21923289302382e-06, "loss": 0.83942372, "num_input_tokens_seen": 229218825, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.36328125, "step": 10625, "time_per_iteration": 2.3822128772735596 }, { "auxiliary_loss_clip": 0.01054648, "auxiliary_loss_mlp": 0.01040048, "balance_loss_clip": 1.015872, "balance_loss_mlp": 1.01845527, "epoch": 0.6388696828498421, "flos": 17310565290240.0, "grad_norm": 1.9290037254456298, "language_loss": 0.73599392, "learning_rate": 1.218874349031654e-06, "loss": 0.7569409, "num_input_tokens_seen": 229236060, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36328125, "step": 10626, "time_per_iteration": 2.3626840114593506 }, { "auxiliary_loss_clip": 0.01054354, "auxiliary_loss_mlp": 0.0104254, "balance_loss_clip": 1.01681459, "balance_loss_mlp": 1.01728415, "epoch": 0.6389298061025102, "flos": 17127690255360.0, "grad_norm": 1.7086403001885544, "language_loss": 0.7418834, "learning_rate": 1.2185158346598517e-06, "loss": 0.76285231, "num_input_tokens_seen": 229255160, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37109375, "step": 10627, "time_per_iteration": 2.4111955165863037 }, { "auxiliary_loss_clip": 0.01057413, "auxiliary_loss_mlp": 0.0103597, "balance_loss_clip": 1.01038718, "balance_loss_mlp": 1.01879215, "epoch": 0.6389899293551781, "flos": 27709690124160.0, "grad_norm": 2.063970073771208, "language_loss": 0.69109917, "learning_rate": 1.2181573499220064e-06, "loss": 0.71203303, "num_input_tokens_seen": 229278705, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38671875, "step": 10628, "time_per_iteration": 2.468275785446167 }, { "auxiliary_loss_clip": 0.01050788, "auxiliary_loss_mlp": 0.01031916, "balance_loss_clip": 1.01068497, "balance_loss_mlp": 1.01635194, "epoch": 0.6390500526078461, "flos": 21214623075840.0, "grad_norm": 1.9806997635833794, "language_loss": 0.69359159, "learning_rate": 1.2177988948317135e-06, "loss": 0.71441865, "num_input_tokens_seen": 229299990, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34375, "step": 10629, "time_per_iteration": 2.3919527530670166 }, { "auxiliary_loss_clip": 0.01057499, "auxiliary_loss_mlp": 0.01044842, "balance_loss_clip": 1.01694751, "balance_loss_mlp": 1.01861835, "epoch": 0.6391101758605141, "flos": 21579954209280.0, "grad_norm": 1.508144503193366, "language_loss": 0.76485586, "learning_rate": 1.2174404694025646e-06, "loss": 0.78587925, "num_input_tokens_seen": 229319230, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.38867188, "step": 10630, "time_per_iteration": 2.4242208003997803 }, { "auxiliary_loss_clip": 0.01053424, "auxiliary_loss_mlp": 0.0103353, "balance_loss_clip": 1.01266813, "balance_loss_mlp": 1.01806498, "epoch": 0.639170299113182, "flos": 19899479871360.0, "grad_norm": 1.4929711276250475, "language_loss": 0.70923376, "learning_rate": 1.2170820736481511e-06, "loss": 0.73010325, "num_input_tokens_seen": 229338600, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.35351562, "step": 10631, "time_per_iteration": 3.773270606994629 }, { "auxiliary_loss_clip": 0.01013836, "auxiliary_loss_mlp": 0.01004072, "balance_loss_clip": 1.00171161, "balance_loss_mlp": 1.0066359, "epoch": 0.63923042236585, "flos": 69874434748800.0, "grad_norm": 0.7701817058709713, "language_loss": 0.63134599, "learning_rate": 1.2167237075820646e-06, "loss": 0.65152502, "num_input_tokens_seen": 229402420, "router_z_loss_clip": 0.02355957, "router_z_loss_mlp": 0.07226562, "step": 10632, "time_per_iteration": 3.075253963470459 }, { "auxiliary_loss_clip": 0.01051536, "auxiliary_loss_mlp": 0.01038588, "balance_loss_clip": 1.01543725, "balance_loss_mlp": 1.01663542, "epoch": 0.639290545618518, "flos": 22673713282560.0, "grad_norm": 2.4893384341823404, "language_loss": 0.68404341, "learning_rate": 1.216365371217893e-06, "loss": 0.70494461, "num_input_tokens_seen": 229419185, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.34960938, "step": 10633, "time_per_iteration": 2.3674769401550293 }, { "auxiliary_loss_clip": 0.01053071, "auxiliary_loss_mlp": 0.01034851, "balance_loss_clip": 1.0125947, "balance_loss_mlp": 1.01757765, "epoch": 0.639350668871186, "flos": 19828152230400.0, "grad_norm": 1.926916928585002, "language_loss": 0.82910407, "learning_rate": 1.216007064569225e-06, "loss": 0.84998322, "num_input_tokens_seen": 229436735, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35546875, "step": 10634, "time_per_iteration": 2.3737244606018066 }, { "auxiliary_loss_clip": 0.01052563, "auxiliary_loss_mlp": 0.01039033, "balance_loss_clip": 1.0147028, "balance_loss_mlp": 1.01647067, "epoch": 0.6394107921238539, "flos": 20552425718400.0, "grad_norm": 1.4361987849612483, "language_loss": 0.75948489, "learning_rate": 1.2156487876496483e-06, "loss": 0.78040087, "num_input_tokens_seen": 229455595, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36132812, "step": 10635, "time_per_iteration": 2.3667232990264893 }, { "auxiliary_loss_clip": 0.01053787, "auxiliary_loss_mlp": 0.01040594, "balance_loss_clip": 1.01792085, "balance_loss_mlp": 1.01791143, "epoch": 0.6394709153765219, "flos": 25773826124160.0, "grad_norm": 1.6008064540546543, "language_loss": 0.7299993, "learning_rate": 1.2152905404727475e-06, "loss": 0.75094306, "num_input_tokens_seen": 229476230, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.359375, "step": 10636, "time_per_iteration": 2.4134092330932617 }, { "auxiliary_loss_clip": 0.01055584, "auxiliary_loss_mlp": 0.01039148, "balance_loss_clip": 1.01516318, "balance_loss_mlp": 1.01776516, "epoch": 0.6395310386291898, "flos": 17529191424000.0, "grad_norm": 1.8053433036710436, "language_loss": 0.74995255, "learning_rate": 1.2149323230521085e-06, "loss": 0.77089989, "num_input_tokens_seen": 229494300, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37890625, "step": 10637, "time_per_iteration": 2.363415002822876 }, { "auxiliary_loss_clip": 0.010541, "auxiliary_loss_mlp": 0.01046037, "balance_loss_clip": 1.02235031, "balance_loss_mlp": 1.0167141, "epoch": 0.6395911618818578, "flos": 18587234309760.0, "grad_norm": 2.5926863534588414, "language_loss": 0.79344386, "learning_rate": 1.2145741354013143e-06, "loss": 0.81444526, "num_input_tokens_seen": 229512985, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37304688, "step": 10638, "time_per_iteration": 2.420346736907959 }, { "auxiliary_loss_clip": 0.01052331, "auxiliary_loss_mlp": 0.0103891, "balance_loss_clip": 1.01447248, "balance_loss_mlp": 1.01652098, "epoch": 0.6396512851345257, "flos": 28365289234560.0, "grad_norm": 1.5124309398215467, "language_loss": 0.82738215, "learning_rate": 1.2142159775339478e-06, "loss": 0.84829462, "num_input_tokens_seen": 229534270, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.359375, "step": 10639, "time_per_iteration": 2.4191524982452393 }, { "auxiliary_loss_clip": 0.01011023, "auxiliary_loss_mlp": 0.01003252, "balance_loss_clip": 1.001261, "balance_loss_mlp": 1.00392365, "epoch": 0.6397114083871938, "flos": 70720903595520.0, "grad_norm": 0.8320940677187404, "language_loss": 0.59150916, "learning_rate": 1.21385784946359e-06, "loss": 0.6116519, "num_input_tokens_seen": 229596455, "router_z_loss_clip": 0.01989746, "router_z_loss_mlp": 0.07128906, "step": 10640, "time_per_iteration": 2.9737091064453125 }, { "auxiliary_loss_clip": 0.01050203, "auxiliary_loss_mlp": 0.01034658, "balance_loss_clip": 1.01454735, "balance_loss_mlp": 1.01619995, "epoch": 0.6397715316398617, "flos": 18141777872640.0, "grad_norm": 1.7307083880800425, "language_loss": 0.78962648, "learning_rate": 1.2134997512038215e-06, "loss": 0.81047511, "num_input_tokens_seen": 229612860, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.33984375, "step": 10641, "time_per_iteration": 2.3401100635528564 }, { "auxiliary_loss_clip": 0.01055904, "auxiliary_loss_mlp": 0.01043673, "balance_loss_clip": 1.01661253, "balance_loss_mlp": 1.01682508, "epoch": 0.6398316548925297, "flos": 25738319404800.0, "grad_norm": 1.7720164104084586, "language_loss": 0.65601408, "learning_rate": 1.2131416827682209e-06, "loss": 0.67700988, "num_input_tokens_seen": 229633960, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.390625, "step": 10642, "time_per_iteration": 2.4264872074127197 }, { "auxiliary_loss_clip": 0.01009963, "auxiliary_loss_mlp": 0.0100258, "balance_loss_clip": 1.00051725, "balance_loss_mlp": 1.00282574, "epoch": 0.6398917781451977, "flos": 71211399419520.0, "grad_norm": 1.0035766970766313, "language_loss": 0.56111306, "learning_rate": 1.2127836441703667e-06, "loss": 0.58123845, "num_input_tokens_seen": 229686730, "router_z_loss_clip": 0.02062988, "router_z_loss_mlp": 0.07128906, "step": 10643, "time_per_iteration": 2.9874422550201416 }, { "auxiliary_loss_clip": 0.01055457, "auxiliary_loss_mlp": 0.0104073, "balance_loss_clip": 1.01511145, "balance_loss_mlp": 1.01737249, "epoch": 0.6399519013978656, "flos": 20520794160000.0, "grad_norm": 1.8211860127869288, "language_loss": 0.77781874, "learning_rate": 1.2124256354238358e-06, "loss": 0.79878056, "num_input_tokens_seen": 229704800, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38085938, "step": 10644, "time_per_iteration": 2.429062604904175 }, { "auxiliary_loss_clip": 0.01052905, "auxiliary_loss_mlp": 0.01043132, "balance_loss_clip": 1.02005279, "balance_loss_mlp": 1.01715517, "epoch": 0.6400120246505336, "flos": 24459730260480.0, "grad_norm": 1.4375778847104521, "language_loss": 0.8322382, "learning_rate": 1.212067656542203e-06, "loss": 0.85319865, "num_input_tokens_seen": 229725265, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35742188, "step": 10645, "time_per_iteration": 2.3978002071380615 }, { "auxiliary_loss_clip": 0.01055956, "auxiliary_loss_mlp": 0.01042703, "balance_loss_clip": 1.01799119, "balance_loss_mlp": 1.01750565, "epoch": 0.6400721479032015, "flos": 28364835386880.0, "grad_norm": 1.8606091758658874, "language_loss": 0.75443137, "learning_rate": 1.2117097075390447e-06, "loss": 0.77541798, "num_input_tokens_seen": 229744840, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38476562, "step": 10646, "time_per_iteration": 2.4393293857574463 }, { "auxiliary_loss_clip": 0.01053159, "auxiliary_loss_mlp": 0.01046678, "balance_loss_clip": 1.02249014, "balance_loss_mlp": 1.01613033, "epoch": 0.6401322711558696, "flos": 17815723885440.0, "grad_norm": 2.0004776007126277, "language_loss": 0.813142, "learning_rate": 1.2113517884279327e-06, "loss": 0.83414042, "num_input_tokens_seen": 229759095, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37109375, "step": 10647, "time_per_iteration": 2.323615789413452 }, { "auxiliary_loss_clip": 0.01052843, "auxiliary_loss_mlp": 0.01039245, "balance_loss_clip": 1.01777518, "balance_loss_mlp": 1.01817369, "epoch": 0.6401923944085375, "flos": 26029669633920.0, "grad_norm": 1.7499279505412735, "language_loss": 0.76826632, "learning_rate": 1.2109938992224399e-06, "loss": 0.78918725, "num_input_tokens_seen": 229777750, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34765625, "step": 10648, "time_per_iteration": 2.4515225887298584 }, { "auxiliary_loss_clip": 0.01053441, "auxiliary_loss_mlp": 0.01042019, "balance_loss_clip": 1.01816487, "balance_loss_mlp": 1.01631796, "epoch": 0.6402525176612055, "flos": 23585330459520.0, "grad_norm": 1.8370955993018379, "language_loss": 0.79389489, "learning_rate": 1.210636039936138e-06, "loss": 0.8148495, "num_input_tokens_seen": 229796785, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37109375, "step": 10649, "time_per_iteration": 2.390174150466919 }, { "auxiliary_loss_clip": 0.01054136, "auxiliary_loss_mlp": 0.0104449, "balance_loss_clip": 1.02014709, "balance_loss_mlp": 1.01708424, "epoch": 0.6403126409138734, "flos": 18040424595840.0, "grad_norm": 1.7017725699080744, "language_loss": 0.77053905, "learning_rate": 1.2102782105825956e-06, "loss": 0.7915253, "num_input_tokens_seen": 229815425, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.37109375, "step": 10650, "time_per_iteration": 2.4473276138305664 }, { "auxiliary_loss_clip": 0.01053579, "auxiliary_loss_mlp": 0.010462, "balance_loss_clip": 1.02220309, "balance_loss_mlp": 1.01722991, "epoch": 0.6403727641665414, "flos": 21978453000960.0, "grad_norm": 1.5087404024015796, "language_loss": 0.712767, "learning_rate": 1.2099204111753833e-06, "loss": 0.73376483, "num_input_tokens_seen": 229834545, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36328125, "step": 10651, "time_per_iteration": 2.366281509399414 }, { "auxiliary_loss_clip": 0.01053701, "auxiliary_loss_mlp": 0.01037638, "balance_loss_clip": 1.01440394, "balance_loss_mlp": 1.01770568, "epoch": 0.6404328874192093, "flos": 24894503821440.0, "grad_norm": 2.555140152421492, "language_loss": 0.65466464, "learning_rate": 1.2095626417280684e-06, "loss": 0.675578, "num_input_tokens_seen": 229849175, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.359375, "step": 10652, "time_per_iteration": 2.5446150302886963 }, { "auxiliary_loss_clip": 0.01054734, "auxiliary_loss_mlp": 0.01037741, "balance_loss_clip": 1.01456666, "balance_loss_mlp": 1.01851678, "epoch": 0.6404930106718774, "flos": 17596399524480.0, "grad_norm": 1.8003078557787768, "language_loss": 0.80441046, "learning_rate": 1.2092049022542168e-06, "loss": 0.82533526, "num_input_tokens_seen": 229865400, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.36328125, "step": 10653, "time_per_iteration": 2.318232536315918 }, { "auxiliary_loss_clip": 0.01058634, "auxiliary_loss_mlp": 0.01043136, "balance_loss_clip": 1.01556253, "balance_loss_mlp": 1.01761961, "epoch": 0.6405531339245453, "flos": 20156824569600.0, "grad_norm": 2.1589398323501854, "language_loss": 0.73159534, "learning_rate": 1.2088471927673952e-06, "loss": 0.75261301, "num_input_tokens_seen": 229882945, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.41015625, "step": 10654, "time_per_iteration": 2.4215598106384277 }, { "auxiliary_loss_clip": 0.01055533, "auxiliary_loss_mlp": 0.01046545, "balance_loss_clip": 1.02089095, "balance_loss_mlp": 1.01787949, "epoch": 0.6406132571772133, "flos": 21941270536320.0, "grad_norm": 1.7208725452161113, "language_loss": 0.7370075, "learning_rate": 1.2084895132811666e-06, "loss": 0.75802827, "num_input_tokens_seen": 229901590, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37695312, "step": 10655, "time_per_iteration": 3.6894776821136475 }, { "auxiliary_loss_clip": 0.01054686, "auxiliary_loss_mlp": 0.01038759, "balance_loss_clip": 1.01250887, "balance_loss_mlp": 1.01685095, "epoch": 0.6406733804298813, "flos": 28766720580480.0, "grad_norm": 1.9432084490104047, "language_loss": 0.84308749, "learning_rate": 1.2081318638090952e-06, "loss": 0.86402196, "num_input_tokens_seen": 229922535, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37890625, "step": 10656, "time_per_iteration": 2.4234580993652344 }, { "auxiliary_loss_clip": 0.0105462, "auxiliary_loss_mlp": 0.01039012, "balance_loss_clip": 1.01433516, "balance_loss_mlp": 1.01701593, "epoch": 0.6407335036825492, "flos": 17456222949120.0, "grad_norm": 2.7073119739660623, "language_loss": 0.73978353, "learning_rate": 1.2077742443647433e-06, "loss": 0.7607199, "num_input_tokens_seen": 229939575, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.375, "step": 10657, "time_per_iteration": 2.3537533283233643 }, { "auxiliary_loss_clip": 0.01053939, "auxiliary_loss_mlp": 0.01044665, "balance_loss_clip": 1.02166939, "balance_loss_mlp": 1.01739037, "epoch": 0.6407936269352172, "flos": 22124250305280.0, "grad_norm": 1.9475887052450147, "language_loss": 0.78454375, "learning_rate": 1.2074166549616707e-06, "loss": 0.80552977, "num_input_tokens_seen": 229958840, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36523438, "step": 10658, "time_per_iteration": 2.4106826782226562 }, { "auxiliary_loss_clip": 0.01054628, "auxiliary_loss_mlp": 0.01039686, "balance_loss_clip": 1.01474714, "balance_loss_mlp": 1.01623309, "epoch": 0.6408537501878852, "flos": 23109569095680.0, "grad_norm": 1.6646400793514755, "language_loss": 0.77166903, "learning_rate": 1.2070590956134386e-06, "loss": 0.7926122, "num_input_tokens_seen": 229979680, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.3828125, "step": 10659, "time_per_iteration": 3.7069573402404785 }, { "auxiliary_loss_clip": 0.01055338, "auxiliary_loss_mlp": 0.01035541, "balance_loss_clip": 1.01168704, "balance_loss_mlp": 1.01826179, "epoch": 0.6409138734405532, "flos": 16471497651840.0, "grad_norm": 2.5744518849273654, "language_loss": 0.78555185, "learning_rate": 1.2067015663336046e-06, "loss": 0.80646062, "num_input_tokens_seen": 229996830, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 10660, "time_per_iteration": 3.84342098236084 }, { "auxiliary_loss_clip": 0.01058872, "auxiliary_loss_mlp": 0.01049936, "balance_loss_clip": 1.02132618, "balance_loss_mlp": 1.01926017, "epoch": 0.6409739966932211, "flos": 22776986684160.0, "grad_norm": 1.9754310464487264, "language_loss": 0.69699097, "learning_rate": 1.206344067135727e-06, "loss": 0.71807903, "num_input_tokens_seen": 230015115, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.39453125, "step": 10661, "time_per_iteration": 2.380420207977295 }, { "auxiliary_loss_clip": 0.0105255, "auxiliary_loss_mlp": 0.01036597, "balance_loss_clip": 1.01512742, "balance_loss_mlp": 1.01785946, "epoch": 0.6410341199458891, "flos": 25150975735680.0, "grad_norm": 1.566880225688396, "language_loss": 0.766415, "learning_rate": 1.205986598033362e-06, "loss": 0.78730643, "num_input_tokens_seen": 230035515, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34765625, "step": 10662, "time_per_iteration": 2.400042772293091 }, { "auxiliary_loss_clip": 0.01054208, "auxiliary_loss_mlp": 0.01038376, "balance_loss_clip": 1.01440287, "balance_loss_mlp": 1.01721311, "epoch": 0.641094243198557, "flos": 27045153705600.0, "grad_norm": 2.544351464232849, "language_loss": 0.71007144, "learning_rate": 1.2056291590400644e-06, "loss": 0.73099732, "num_input_tokens_seen": 230054355, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36914062, "step": 10663, "time_per_iteration": 2.4289908409118652 }, { "auxiliary_loss_clip": 0.01056272, "auxiliary_loss_mlp": 0.01042903, "balance_loss_clip": 1.01549721, "balance_loss_mlp": 1.01792169, "epoch": 0.641154366451225, "flos": 25373372296320.0, "grad_norm": 1.8729769079934668, "language_loss": 0.68965578, "learning_rate": 1.205271750169389e-06, "loss": 0.71064746, "num_input_tokens_seen": 230074605, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.3828125, "step": 10664, "time_per_iteration": 2.403374671936035 }, { "auxiliary_loss_clip": 0.01053273, "auxiliary_loss_mlp": 0.01038361, "balance_loss_clip": 1.01465058, "balance_loss_mlp": 1.01735592, "epoch": 0.6412144897038929, "flos": 25152232544640.0, "grad_norm": 1.7627688978215696, "language_loss": 0.67152077, "learning_rate": 1.2049143714348881e-06, "loss": 0.69243711, "num_input_tokens_seen": 230093820, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.359375, "step": 10665, "time_per_iteration": 2.4104065895080566 }, { "auxiliary_loss_clip": 0.01052678, "auxiliary_loss_mlp": 0.01035773, "balance_loss_clip": 1.01312363, "balance_loss_mlp": 1.01731217, "epoch": 0.641274612956561, "flos": 23439637889280.0, "grad_norm": 1.8658452993328458, "language_loss": 0.64999056, "learning_rate": 1.2045570228501145e-06, "loss": 0.67087507, "num_input_tokens_seen": 230114285, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35351562, "step": 10666, "time_per_iteration": 2.389007568359375 }, { "auxiliary_loss_clip": 0.01054399, "auxiliary_loss_mlp": 0.01043106, "balance_loss_clip": 1.01831055, "balance_loss_mlp": 1.01716471, "epoch": 0.6413347362092289, "flos": 19426476504960.0, "grad_norm": 1.646074272396001, "language_loss": 0.72778642, "learning_rate": 1.2041997044286176e-06, "loss": 0.74876148, "num_input_tokens_seen": 230132760, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37304688, "step": 10667, "time_per_iteration": 2.41025447845459 }, { "auxiliary_loss_clip": 0.01059378, "auxiliary_loss_mlp": 0.01047361, "balance_loss_clip": 1.01929951, "balance_loss_mlp": 1.01849008, "epoch": 0.6413948594618969, "flos": 17195771139840.0, "grad_norm": 2.797411574128994, "language_loss": 0.79323852, "learning_rate": 1.2038424161839484e-06, "loss": 0.8143059, "num_input_tokens_seen": 230149690, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40820312, "step": 10668, "time_per_iteration": 2.3776769638061523 }, { "auxiliary_loss_clip": 0.01056034, "auxiliary_loss_mlp": 0.01040755, "balance_loss_clip": 1.01718736, "balance_loss_mlp": 1.01885843, "epoch": 0.6414549827145648, "flos": 22268790800640.0, "grad_norm": 1.6522261830969793, "language_loss": 0.69097501, "learning_rate": 1.2034851581296544e-06, "loss": 0.71194291, "num_input_tokens_seen": 230166950, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.37109375, "step": 10669, "time_per_iteration": 2.4285190105438232 }, { "auxiliary_loss_clip": 0.01059013, "auxiliary_loss_mlp": 0.01047294, "balance_loss_clip": 1.01950669, "balance_loss_mlp": 1.01917839, "epoch": 0.6415151059672328, "flos": 19639342264320.0, "grad_norm": 1.7723180267559417, "language_loss": 0.79479444, "learning_rate": 1.2031279302792825e-06, "loss": 0.81585753, "num_input_tokens_seen": 230184785, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.3984375, "step": 10670, "time_per_iteration": 3.8522849082946777 }, { "auxiliary_loss_clip": 0.01056987, "auxiliary_loss_mlp": 0.01042497, "balance_loss_clip": 1.01507878, "balance_loss_mlp": 1.01845694, "epoch": 0.6415752292199008, "flos": 14864969306880.0, "grad_norm": 2.9585137338296885, "language_loss": 0.91009593, "learning_rate": 1.20277073264638e-06, "loss": 0.93109083, "num_input_tokens_seen": 230201385, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.38671875, "step": 10671, "time_per_iteration": 2.341731548309326 }, { "auxiliary_loss_clip": 0.01052726, "auxiliary_loss_mlp": 0.01034235, "balance_loss_clip": 1.01184702, "balance_loss_mlp": 1.01808846, "epoch": 0.6416353524725688, "flos": 13734725996160.0, "grad_norm": 4.996627810404741, "language_loss": 0.70782137, "learning_rate": 1.2024135652444907e-06, "loss": 0.72869098, "num_input_tokens_seen": 230220380, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34570312, "step": 10672, "time_per_iteration": 2.3723559379577637 }, { "auxiliary_loss_clip": 0.01057629, "auxiliary_loss_mlp": 0.01044643, "balance_loss_clip": 1.01621222, "balance_loss_mlp": 1.01736414, "epoch": 0.6416954757252368, "flos": 24533780987520.0, "grad_norm": 2.2783872948137547, "language_loss": 0.75390226, "learning_rate": 1.2020564280871593e-06, "loss": 0.77492499, "num_input_tokens_seen": 230239845, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.40234375, "step": 10673, "time_per_iteration": 2.390533208847046 }, { "auxiliary_loss_clip": 0.01054922, "auxiliary_loss_mlp": 0.01039802, "balance_loss_clip": 1.01393366, "balance_loss_mlp": 1.01728511, "epoch": 0.6417555989779047, "flos": 27708747517440.0, "grad_norm": 2.1857987204806304, "language_loss": 0.71395153, "learning_rate": 1.2016993211879283e-06, "loss": 0.73489869, "num_input_tokens_seen": 230262420, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37695312, "step": 10674, "time_per_iteration": 2.4211432933807373 }, { "auxiliary_loss_clip": 0.01057691, "auxiliary_loss_mlp": 0.01043023, "balance_loss_clip": 1.01694036, "balance_loss_mlp": 1.01853454, "epoch": 0.6418157222305727, "flos": 20555637563520.0, "grad_norm": 1.86127577495924, "language_loss": 0.68764055, "learning_rate": 1.201342244560338e-06, "loss": 0.70864767, "num_input_tokens_seen": 230279950, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.390625, "step": 10675, "time_per_iteration": 2.3760430812835693 }, { "auxiliary_loss_clip": 0.01054251, "auxiliary_loss_mlp": 0.01042324, "balance_loss_clip": 1.01756454, "balance_loss_mlp": 1.01711011, "epoch": 0.6418758454832406, "flos": 22600430605440.0, "grad_norm": 1.9630571367314473, "language_loss": 0.67968976, "learning_rate": 1.2009851982179307e-06, "loss": 0.70065546, "num_input_tokens_seen": 230299705, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37109375, "step": 10676, "time_per_iteration": 2.551560878753662 }, { "auxiliary_loss_clip": 0.01056227, "auxiliary_loss_mlp": 0.01040718, "balance_loss_clip": 1.015481, "balance_loss_mlp": 1.01858425, "epoch": 0.6419359687359086, "flos": 27374035512960.0, "grad_norm": 2.2638541269874692, "language_loss": 0.76676083, "learning_rate": 1.2006281821742446e-06, "loss": 0.78773028, "num_input_tokens_seen": 230320030, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.375, "step": 10677, "time_per_iteration": 2.437696933746338 }, { "auxiliary_loss_clip": 0.01009876, "auxiliary_loss_mlp": 0.0101811, "balance_loss_clip": 1.01574957, "balance_loss_mlp": 1.00221074, "epoch": 0.6419960919885765, "flos": 67248791550720.0, "grad_norm": 0.8556270219672066, "language_loss": 0.60833019, "learning_rate": 1.200271196442818e-06, "loss": 0.62861001, "num_input_tokens_seen": 230381495, "router_z_loss_clip": 0.02355957, "router_z_loss_mlp": 0.07666016, "step": 10678, "time_per_iteration": 3.1020407676696777 }, { "auxiliary_loss_clip": 0.01053049, "auxiliary_loss_mlp": 0.01044937, "balance_loss_clip": 1.02058268, "balance_loss_mlp": 1.01705718, "epoch": 0.6420562152412446, "flos": 19900841414400.0, "grad_norm": 1.750339292614293, "language_loss": 0.68475562, "learning_rate": 1.1999142410371875e-06, "loss": 0.7057355, "num_input_tokens_seen": 230401385, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.359375, "step": 10679, "time_per_iteration": 2.371885299682617 }, { "auxiliary_loss_clip": 0.01057117, "auxiliary_loss_mlp": 0.01042487, "balance_loss_clip": 1.01758385, "balance_loss_mlp": 1.01858175, "epoch": 0.6421163384939125, "flos": 24789031004160.0, "grad_norm": 1.699589616678148, "language_loss": 0.74394464, "learning_rate": 1.1995573159708897e-06, "loss": 0.76494062, "num_input_tokens_seen": 230421340, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.38476562, "step": 10680, "time_per_iteration": 2.4071402549743652 }, { "auxiliary_loss_clip": 0.01053144, "auxiliary_loss_mlp": 0.01036571, "balance_loss_clip": 1.0136714, "balance_loss_mlp": 1.01689231, "epoch": 0.6421764617465805, "flos": 25591649316480.0, "grad_norm": 2.0390462060793606, "language_loss": 0.69125843, "learning_rate": 1.1992004212574582e-06, "loss": 0.71215564, "num_input_tokens_seen": 230441270, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36328125, "step": 10681, "time_per_iteration": 2.4107799530029297 }, { "auxiliary_loss_clip": 0.01052854, "auxiliary_loss_mlp": 0.01038736, "balance_loss_clip": 1.01549006, "balance_loss_mlp": 1.01629424, "epoch": 0.6422365849992484, "flos": 14133923015040.0, "grad_norm": 1.6734649179945302, "language_loss": 0.75446945, "learning_rate": 1.198843556910427e-06, "loss": 0.77538538, "num_input_tokens_seen": 230457455, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36523438, "step": 10682, "time_per_iteration": 2.334526538848877 }, { "auxiliary_loss_clip": 0.01050962, "auxiliary_loss_mlp": 0.0103666, "balance_loss_clip": 1.01315141, "balance_loss_mlp": 1.0157485, "epoch": 0.6422967082519164, "flos": 22382781989760.0, "grad_norm": 1.55992840760253, "language_loss": 0.79874563, "learning_rate": 1.1984867229433287e-06, "loss": 0.81962186, "num_input_tokens_seen": 230478955, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.3515625, "step": 10683, "time_per_iteration": 2.400954484939575 }, { "auxiliary_loss_clip": 0.01053875, "auxiliary_loss_mlp": 0.01044048, "balance_loss_clip": 1.01833451, "balance_loss_mlp": 1.01613474, "epoch": 0.6423568315045844, "flos": 14647041400320.0, "grad_norm": 1.6955508467256435, "language_loss": 0.68951005, "learning_rate": 1.1981299193696941e-06, "loss": 0.71048927, "num_input_tokens_seen": 230496425, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37890625, "step": 10684, "time_per_iteration": 2.3624110221862793 }, { "auxiliary_loss_clip": 0.01052769, "auxiliary_loss_mlp": 0.01037879, "balance_loss_clip": 1.01396525, "balance_loss_mlp": 1.01598036, "epoch": 0.6424169547572524, "flos": 26832706882560.0, "grad_norm": 2.126061593480188, "language_loss": 0.73062062, "learning_rate": 1.1977731462030533e-06, "loss": 0.75152707, "num_input_tokens_seen": 230516245, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.3671875, "step": 10685, "time_per_iteration": 2.5211782455444336 }, { "auxiliary_loss_clip": 0.01052381, "auxiliary_loss_mlp": 0.01039957, "balance_loss_clip": 1.01641345, "balance_loss_mlp": 1.01642394, "epoch": 0.6424770780099204, "flos": 22706427093120.0, "grad_norm": 1.6148825060397416, "language_loss": 0.75902009, "learning_rate": 1.197416403456935e-06, "loss": 0.77994347, "num_input_tokens_seen": 230534745, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.359375, "step": 10686, "time_per_iteration": 2.3691868782043457 }, { "auxiliary_loss_clip": 0.01057265, "auxiliary_loss_mlp": 0.01051222, "balance_loss_clip": 1.02233708, "balance_loss_mlp": 1.01790047, "epoch": 0.6425372012625883, "flos": 28468422990720.0, "grad_norm": 2.179395057425765, "language_loss": 0.69784379, "learning_rate": 1.197059691144867e-06, "loss": 0.71892869, "num_input_tokens_seen": 230555895, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.39257812, "step": 10687, "time_per_iteration": 2.4287612438201904 }, { "auxiliary_loss_clip": 0.01055453, "auxiliary_loss_mlp": 0.01039165, "balance_loss_clip": 1.01503706, "balance_loss_mlp": 1.01735544, "epoch": 0.6425973245152563, "flos": 29350398556800.0, "grad_norm": 1.8218903750570488, "language_loss": 0.67328203, "learning_rate": 1.1967030092803767e-06, "loss": 0.69422823, "num_input_tokens_seen": 230577460, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.38085938, "step": 10688, "time_per_iteration": 2.4716384410858154 }, { "auxiliary_loss_clip": 0.01053591, "auxiliary_loss_mlp": 0.01042854, "balance_loss_clip": 1.01866603, "balance_loss_mlp": 1.01607394, "epoch": 0.6426574477679242, "flos": 16429602153600.0, "grad_norm": 1.7036518508961833, "language_loss": 0.74397719, "learning_rate": 1.1963463578769876e-06, "loss": 0.76494169, "num_input_tokens_seen": 230595030, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.375, "step": 10689, "time_per_iteration": 2.4085705280303955 }, { "auxiliary_loss_clip": 0.01051974, "auxiliary_loss_mlp": 0.01034991, "balance_loss_clip": 1.01225734, "balance_loss_mlp": 1.01583719, "epoch": 0.6427175710205922, "flos": 21834820200960.0, "grad_norm": 2.193113961251425, "language_loss": 0.73283064, "learning_rate": 1.195989736948226e-06, "loss": 0.75370026, "num_input_tokens_seen": 230615135, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.36132812, "step": 10690, "time_per_iteration": 2.395763635635376 }, { "auxiliary_loss_clip": 0.01051792, "auxiliary_loss_mlp": 0.01040189, "balance_loss_clip": 1.01540542, "balance_loss_mlp": 1.01596475, "epoch": 0.6427776942732601, "flos": 17785628426880.0, "grad_norm": 1.7830753943888145, "language_loss": 0.78103065, "learning_rate": 1.1956331465076143e-06, "loss": 0.80195045, "num_input_tokens_seen": 230631965, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.359375, "step": 10691, "time_per_iteration": 2.3645246028900146 }, { "auxiliary_loss_clip": 0.01055384, "auxiliary_loss_mlp": 0.01039385, "balance_loss_clip": 1.01164472, "balance_loss_mlp": 1.01674926, "epoch": 0.6428378175259282, "flos": 15084991895040.0, "grad_norm": 1.6204300017191564, "language_loss": 0.7551949, "learning_rate": 1.1952765865686738e-06, "loss": 0.77614248, "num_input_tokens_seen": 230649565, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.38671875, "step": 10692, "time_per_iteration": 2.354276180267334 }, { "auxiliary_loss_clip": 0.01053239, "auxiliary_loss_mlp": 0.01040137, "balance_loss_clip": 1.01617575, "balance_loss_mlp": 1.01689506, "epoch": 0.6428979407785961, "flos": 23840650298880.0, "grad_norm": 1.9530625566787807, "language_loss": 0.6293633, "learning_rate": 1.1949200571449263e-06, "loss": 0.65029705, "num_input_tokens_seen": 230669265, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36328125, "step": 10693, "time_per_iteration": 2.3854289054870605 }, { "auxiliary_loss_clip": 0.010556, "auxiliary_loss_mlp": 0.01040498, "balance_loss_clip": 1.01370001, "balance_loss_mlp": 1.01649117, "epoch": 0.6429580640312641, "flos": 32925469800960.0, "grad_norm": 1.6536205758790061, "language_loss": 0.61994201, "learning_rate": 1.1945635582498903e-06, "loss": 0.640903, "num_input_tokens_seen": 230690575, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.390625, "step": 10694, "time_per_iteration": 2.459805488586426 }, { "auxiliary_loss_clip": 0.01056276, "auxiliary_loss_mlp": 0.01041451, "balance_loss_clip": 1.01629817, "balance_loss_mlp": 1.01781714, "epoch": 0.643018187283932, "flos": 21067324583040.0, "grad_norm": 1.504407988310007, "language_loss": 0.80979681, "learning_rate": 1.1942070898970853e-06, "loss": 0.83077407, "num_input_tokens_seen": 230709420, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38476562, "step": 10695, "time_per_iteration": 3.580958366394043 }, { "auxiliary_loss_clip": 0.01054903, "auxiliary_loss_mlp": 0.01045051, "balance_loss_clip": 1.01746631, "balance_loss_mlp": 1.01708186, "epoch": 0.6430783105366, "flos": 26723428727040.0, "grad_norm": 1.7353823201177376, "language_loss": 0.74842203, "learning_rate": 1.1938506521000285e-06, "loss": 0.76942152, "num_input_tokens_seen": 230729350, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.37695312, "step": 10696, "time_per_iteration": 2.4017081260681152 }, { "auxiliary_loss_clip": 0.01052038, "auxiliary_loss_mlp": 0.01032639, "balance_loss_clip": 1.00971532, "balance_loss_mlp": 1.01667142, "epoch": 0.643138433789268, "flos": 23695690867200.0, "grad_norm": 1.6477296287006251, "language_loss": 0.76333517, "learning_rate": 1.1934942448722347e-06, "loss": 0.78418195, "num_input_tokens_seen": 230749220, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35351562, "step": 10697, "time_per_iteration": 2.4084744453430176 }, { "auxiliary_loss_clip": 0.01053377, "auxiliary_loss_mlp": 0.01039519, "balance_loss_clip": 1.01646376, "balance_loss_mlp": 1.01693583, "epoch": 0.643198557041936, "flos": 34200812188800.0, "grad_norm": 1.4906432920631827, "language_loss": 0.6720891, "learning_rate": 1.1931378682272208e-06, "loss": 0.69301808, "num_input_tokens_seen": 230770245, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36328125, "step": 10698, "time_per_iteration": 3.9717938899993896 }, { "auxiliary_loss_clip": 0.0100969, "auxiliary_loss_mlp": 0.01004272, "balance_loss_clip": 1.00209081, "balance_loss_mlp": 1.00244236, "epoch": 0.643258680294604, "flos": 67623059992320.0, "grad_norm": 0.8624979824899943, "language_loss": 0.6359002, "learning_rate": 1.1927815221784996e-06, "loss": 0.65603983, "num_input_tokens_seen": 230837030, "router_z_loss_clip": 0.02185059, "router_z_loss_mlp": 0.07226562, "step": 10699, "time_per_iteration": 2.9856224060058594 }, { "auxiliary_loss_clip": 0.01052825, "auxiliary_loss_mlp": 0.01033711, "balance_loss_clip": 1.01269412, "balance_loss_mlp": 1.01709771, "epoch": 0.6433188035472719, "flos": 25184981266560.0, "grad_norm": 1.7671468828838948, "language_loss": 0.70035625, "learning_rate": 1.1924252067395838e-06, "loss": 0.72122163, "num_input_tokens_seen": 230856845, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.35742188, "step": 10700, "time_per_iteration": 3.7813150882720947 }, { "auxiliary_loss_clip": 0.01055144, "auxiliary_loss_mlp": 0.01034504, "balance_loss_clip": 1.00848031, "balance_loss_mlp": 1.01742578, "epoch": 0.6433789267999399, "flos": 24972394798080.0, "grad_norm": 1.6634902475705713, "language_loss": 0.74155116, "learning_rate": 1.1920689219239855e-06, "loss": 0.7624476, "num_input_tokens_seen": 230878785, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37695312, "step": 10701, "time_per_iteration": 2.4258172512054443 }, { "auxiliary_loss_clip": 0.01055013, "auxiliary_loss_mlp": 0.01040611, "balance_loss_clip": 1.01384902, "balance_loss_mlp": 1.01651549, "epoch": 0.6434390500526078, "flos": 17565082168320.0, "grad_norm": 2.1375158074004377, "language_loss": 0.83228147, "learning_rate": 1.1917126677452144e-06, "loss": 0.85323769, "num_input_tokens_seen": 230895445, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.38476562, "step": 10702, "time_per_iteration": 2.3326172828674316 }, { "auxiliary_loss_clip": 0.01052458, "auxiliary_loss_mlp": 0.01037289, "balance_loss_clip": 1.01410317, "balance_loss_mlp": 1.01666069, "epoch": 0.6434991733052758, "flos": 20842728606720.0, "grad_norm": 2.0747672869938913, "language_loss": 0.76191789, "learning_rate": 1.1913564442167798e-06, "loss": 0.7828154, "num_input_tokens_seen": 230911375, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.35742188, "step": 10703, "time_per_iteration": 2.363732099533081 }, { "auxiliary_loss_clip": 0.01010596, "auxiliary_loss_mlp": 0.01001486, "balance_loss_clip": 0.99922103, "balance_loss_mlp": 1.00305915, "epoch": 0.6435592965579437, "flos": 66091210778880.0, "grad_norm": 0.6945155037601164, "language_loss": 0.54729277, "learning_rate": 1.1910002513521898e-06, "loss": 0.56741357, "num_input_tokens_seen": 230975990, "router_z_loss_clip": 0.02270508, "router_z_loss_mlp": 0.07519531, "step": 10704, "time_per_iteration": 3.0140440464019775 }, { "auxiliary_loss_clip": 0.01052889, "auxiliary_loss_mlp": 0.01033358, "balance_loss_clip": 1.01218653, "balance_loss_mlp": 1.01629162, "epoch": 0.6436194198106118, "flos": 23767716735360.0, "grad_norm": 1.6281130896651437, "language_loss": 0.78319055, "learning_rate": 1.1906440891649519e-06, "loss": 0.80405295, "num_input_tokens_seen": 230997110, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.36523438, "step": 10705, "time_per_iteration": 2.4040493965148926 }, { "auxiliary_loss_clip": 0.01052926, "auxiliary_loss_mlp": 0.01040967, "balance_loss_clip": 1.0180192, "balance_loss_mlp": 1.01659822, "epoch": 0.6436795430632797, "flos": 20229269374080.0, "grad_norm": 1.888773325305717, "language_loss": 0.8063305, "learning_rate": 1.1902879576685708e-06, "loss": 0.82726943, "num_input_tokens_seen": 231015590, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.36328125, "step": 10706, "time_per_iteration": 2.3540918827056885 }, { "auxiliary_loss_clip": 0.01052227, "auxiliary_loss_mlp": 0.01037163, "balance_loss_clip": 1.01396537, "balance_loss_mlp": 1.01605856, "epoch": 0.6437396663159477, "flos": 20300841394560.0, "grad_norm": 1.838388547099875, "language_loss": 0.81572294, "learning_rate": 1.1899318568765518e-06, "loss": 0.83661687, "num_input_tokens_seen": 231033800, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36132812, "step": 10707, "time_per_iteration": 2.3948333263397217 }, { "auxiliary_loss_clip": 0.01054303, "auxiliary_loss_mlp": 0.0103968, "balance_loss_clip": 1.01642239, "balance_loss_mlp": 1.01744175, "epoch": 0.6437997895686156, "flos": 23877448738560.0, "grad_norm": 1.8660772370602432, "language_loss": 0.86525905, "learning_rate": 1.1895757868023978e-06, "loss": 0.88619882, "num_input_tokens_seen": 231053160, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36914062, "step": 10708, "time_per_iteration": 2.4702959060668945 }, { "auxiliary_loss_clip": 0.01058243, "auxiliary_loss_mlp": 0.01044022, "balance_loss_clip": 1.01764095, "balance_loss_mlp": 1.01843274, "epoch": 0.6438599128212836, "flos": 18988281630720.0, "grad_norm": 3.2966283248168, "language_loss": 0.67608833, "learning_rate": 1.1892197474596106e-06, "loss": 0.69711095, "num_input_tokens_seen": 231069470, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.3984375, "step": 10709, "time_per_iteration": 3.814629077911377 }, { "auxiliary_loss_clip": 0.01052714, "auxiliary_loss_mlp": 0.01042989, "balance_loss_clip": 1.01861072, "balance_loss_mlp": 1.01615, "epoch": 0.6439200360739517, "flos": 24095236999680.0, "grad_norm": 1.6438641321722702, "language_loss": 0.81361133, "learning_rate": 1.1888637388616929e-06, "loss": 0.83456832, "num_input_tokens_seen": 231088205, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36523438, "step": 10710, "time_per_iteration": 2.395677328109741 }, { "auxiliary_loss_clip": 0.01052841, "auxiliary_loss_mlp": 0.01038785, "balance_loss_clip": 1.0136323, "balance_loss_mlp": 1.01610041, "epoch": 0.6439801593266196, "flos": 31900873864320.0, "grad_norm": 1.8083886125286188, "language_loss": 0.6709764, "learning_rate": 1.1885077610221425e-06, "loss": 0.69189262, "num_input_tokens_seen": 231107850, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3671875, "step": 10711, "time_per_iteration": 2.5016355514526367 }, { "auxiliary_loss_clip": 0.01054665, "auxiliary_loss_mlp": 0.01042162, "balance_loss_clip": 1.01914263, "balance_loss_mlp": 1.01777828, "epoch": 0.6440402825792876, "flos": 27124650604800.0, "grad_norm": 1.641758298556535, "language_loss": 0.79769158, "learning_rate": 1.1881518139544597e-06, "loss": 0.8186599, "num_input_tokens_seen": 231127200, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36914062, "step": 10712, "time_per_iteration": 2.4183201789855957 }, { "auxiliary_loss_clip": 0.01054815, "auxiliary_loss_mlp": 0.01036564, "balance_loss_clip": 1.01197124, "balance_loss_mlp": 1.01650321, "epoch": 0.6441004058319555, "flos": 20666661287040.0, "grad_norm": 1.7981348117655414, "language_loss": 0.83995491, "learning_rate": 1.1877958976721417e-06, "loss": 0.86086869, "num_input_tokens_seen": 231146360, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3828125, "step": 10713, "time_per_iteration": 2.3674724102020264 }, { "auxiliary_loss_clip": 0.01051097, "auxiliary_loss_mlp": 0.01039173, "balance_loss_clip": 1.01750064, "balance_loss_mlp": 1.01646471, "epoch": 0.6441605290846235, "flos": 26024956600320.0, "grad_norm": 1.557684890340087, "language_loss": 0.78950322, "learning_rate": 1.187440012188684e-06, "loss": 0.81040585, "num_input_tokens_seen": 231168350, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34570312, "step": 10714, "time_per_iteration": 2.4062259197235107 }, { "auxiliary_loss_clip": 0.01051863, "auxiliary_loss_mlp": 0.01036919, "balance_loss_clip": 1.01479375, "balance_loss_mlp": 1.01637661, "epoch": 0.6442206523372914, "flos": 24898344071040.0, "grad_norm": 1.5237721180283321, "language_loss": 0.8197763, "learning_rate": 1.187084157517583e-06, "loss": 0.84066415, "num_input_tokens_seen": 231188385, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35546875, "step": 10715, "time_per_iteration": 2.4270832538604736 }, { "auxiliary_loss_clip": 0.01053142, "auxiliary_loss_mlp": 0.01041288, "balance_loss_clip": 1.01730299, "balance_loss_mlp": 1.01516271, "epoch": 0.6442807755899594, "flos": 25155130187520.0, "grad_norm": 1.9108719552172138, "language_loss": 0.82817119, "learning_rate": 1.186728333672332e-06, "loss": 0.84911549, "num_input_tokens_seen": 231209880, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.38085938, "step": 10716, "time_per_iteration": 2.419827699661255 }, { "auxiliary_loss_clip": 0.01055216, "auxiliary_loss_mlp": 0.01044894, "balance_loss_clip": 1.01779795, "balance_loss_mlp": 1.01651645, "epoch": 0.6443408988426274, "flos": 27343276738560.0, "grad_norm": 1.8757148115109004, "language_loss": 0.79551828, "learning_rate": 1.186372540666424e-06, "loss": 0.81651938, "num_input_tokens_seen": 231230765, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.38671875, "step": 10717, "time_per_iteration": 2.415886163711548 }, { "auxiliary_loss_clip": 0.01052067, "auxiliary_loss_mlp": 0.01038174, "balance_loss_clip": 1.014642, "balance_loss_mlp": 1.01691508, "epoch": 0.6444010220952954, "flos": 27927094360320.0, "grad_norm": 2.9504539086100205, "language_loss": 0.68914402, "learning_rate": 1.1860167785133513e-06, "loss": 0.71004641, "num_input_tokens_seen": 231252350, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.3515625, "step": 10718, "time_per_iteration": 2.411648750305176 }, { "auxiliary_loss_clip": 0.01009957, "auxiliary_loss_mlp": 0.01010483, "balance_loss_clip": 1.00758588, "balance_loss_mlp": 1.00239456, "epoch": 0.6444611453479633, "flos": 71212167469440.0, "grad_norm": 0.7712524103471421, "language_loss": 0.49760127, "learning_rate": 1.185661047226603e-06, "loss": 0.5178057, "num_input_tokens_seen": 231313865, "router_z_loss_clip": 0.02893066, "router_z_loss_mlp": 0.07568359, "step": 10719, "time_per_iteration": 3.1737265586853027 }, { "auxiliary_loss_clip": 0.01055814, "auxiliary_loss_mlp": 0.01044797, "balance_loss_clip": 1.01983488, "balance_loss_mlp": 1.01812685, "epoch": 0.6445212686006313, "flos": 22704192766080.0, "grad_norm": 1.682722298092546, "language_loss": 0.78767395, "learning_rate": 1.18530534681967e-06, "loss": 0.80868006, "num_input_tokens_seen": 231331710, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37695312, "step": 10720, "time_per_iteration": 2.3661375045776367 }, { "auxiliary_loss_clip": 0.01054318, "auxiliary_loss_mlp": 0.01041843, "balance_loss_clip": 1.01585507, "balance_loss_mlp": 1.01670694, "epoch": 0.6445813918532992, "flos": 21177754813440.0, "grad_norm": 1.9505274983071488, "language_loss": 0.78128159, "learning_rate": 1.18494967730604e-06, "loss": 0.80224323, "num_input_tokens_seen": 231350705, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.375, "step": 10721, "time_per_iteration": 2.387746810913086 }, { "auxiliary_loss_clip": 0.01052756, "auxiliary_loss_mlp": 0.01041907, "balance_loss_clip": 1.01627731, "balance_loss_mlp": 1.01553679, "epoch": 0.6446415151059672, "flos": 25190741640960.0, "grad_norm": 2.0513060507288063, "language_loss": 0.73878652, "learning_rate": 1.1845940386991995e-06, "loss": 0.75973314, "num_input_tokens_seen": 231369550, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37109375, "step": 10722, "time_per_iteration": 2.388833999633789 }, { "auxiliary_loss_clip": 0.01053136, "auxiliary_loss_mlp": 0.01033795, "balance_loss_clip": 1.0102396, "balance_loss_mlp": 1.01675332, "epoch": 0.6447016383586353, "flos": 25301032225920.0, "grad_norm": 1.4792525898387812, "language_loss": 0.79362714, "learning_rate": 1.184238431012635e-06, "loss": 0.81449652, "num_input_tokens_seen": 231389285, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36328125, "step": 10723, "time_per_iteration": 2.414485454559326 }, { "auxiliary_loss_clip": 0.01057727, "auxiliary_loss_mlp": 0.01044432, "balance_loss_clip": 1.01952934, "balance_loss_mlp": 1.01840436, "epoch": 0.6447617616113032, "flos": 27702079447680.0, "grad_norm": 1.5103253222744428, "language_loss": 0.59427392, "learning_rate": 1.1838828542598312e-06, "loss": 0.61529553, "num_input_tokens_seen": 231408820, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.39453125, "step": 10724, "time_per_iteration": 2.4422607421875 }, { "auxiliary_loss_clip": 0.01054386, "auxiliary_loss_mlp": 0.01035522, "balance_loss_clip": 1.01350439, "balance_loss_mlp": 1.01806796, "epoch": 0.6448218848639712, "flos": 23037997075200.0, "grad_norm": 1.68644636235414, "language_loss": 0.84755027, "learning_rate": 1.183527308454271e-06, "loss": 0.86844927, "num_input_tokens_seen": 231428100, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.36328125, "step": 10725, "time_per_iteration": 2.391510486602783 }, { "auxiliary_loss_clip": 0.01055559, "auxiliary_loss_mlp": 0.01040151, "balance_loss_clip": 1.01499808, "balance_loss_mlp": 1.01725495, "epoch": 0.6448820081166391, "flos": 24495027511680.0, "grad_norm": 1.9892069472440201, "language_loss": 0.82797849, "learning_rate": 1.1831717936094368e-06, "loss": 0.8489356, "num_input_tokens_seen": 231445810, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3828125, "step": 10726, "time_per_iteration": 2.4205148220062256 }, { "auxiliary_loss_clip": 0.01056457, "auxiliary_loss_mlp": 0.01042232, "balance_loss_clip": 1.01704335, "balance_loss_mlp": 1.01771736, "epoch": 0.6449421313693071, "flos": 22418183975040.0, "grad_norm": 1.7069441959826102, "language_loss": 0.82297122, "learning_rate": 1.1828163097388108e-06, "loss": 0.84395808, "num_input_tokens_seen": 231463570, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38671875, "step": 10727, "time_per_iteration": 2.393566131591797 }, { "auxiliary_loss_clip": 0.01058732, "auxiliary_loss_mlp": 0.01038891, "balance_loss_clip": 1.01223564, "balance_loss_mlp": 1.01750565, "epoch": 0.645002254621975, "flos": 20224800720000.0, "grad_norm": 1.7841363261415206, "language_loss": 0.80189377, "learning_rate": 1.1824608568558717e-06, "loss": 0.82287002, "num_input_tokens_seen": 231482155, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.41210938, "step": 10728, "time_per_iteration": 2.3540499210357666 }, { "auxiliary_loss_clip": 0.01055426, "auxiliary_loss_mlp": 0.0104362, "balance_loss_clip": 1.01621377, "balance_loss_mlp": 1.01744628, "epoch": 0.645062377874643, "flos": 27854195708160.0, "grad_norm": 1.7961420948594051, "language_loss": 0.75625938, "learning_rate": 1.1821054349740988e-06, "loss": 0.77724981, "num_input_tokens_seen": 231502465, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.37890625, "step": 10729, "time_per_iteration": 2.432633638381958 }, { "auxiliary_loss_clip": 0.01054392, "auxiliary_loss_mlp": 0.01042242, "balance_loss_clip": 1.01731515, "balance_loss_mlp": 1.01686764, "epoch": 0.645122501127311, "flos": 25300333998720.0, "grad_norm": 1.6011409930579357, "language_loss": 0.67619675, "learning_rate": 1.1817500441069706e-06, "loss": 0.69716311, "num_input_tokens_seen": 231522740, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.375, "step": 10730, "time_per_iteration": 2.395230770111084 }, { "auxiliary_loss_clip": 0.01055907, "auxiliary_loss_mlp": 0.01040148, "balance_loss_clip": 1.0150423, "balance_loss_mlp": 1.01763082, "epoch": 0.645182624379979, "flos": 18806349202560.0, "grad_norm": 1.4671281571022448, "language_loss": 0.64338934, "learning_rate": 1.1813946842679614e-06, "loss": 0.66434985, "num_input_tokens_seen": 231542050, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3828125, "step": 10731, "time_per_iteration": 2.3703746795654297 }, { "auxiliary_loss_clip": 0.01053643, "auxiliary_loss_mlp": 0.01041254, "balance_loss_clip": 1.0159694, "balance_loss_mlp": 1.01715302, "epoch": 0.6452427476326469, "flos": 18331216243200.0, "grad_norm": 1.800895069199503, "language_loss": 0.69714987, "learning_rate": 1.1810393554705492e-06, "loss": 0.71809882, "num_input_tokens_seen": 231560380, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36523438, "step": 10732, "time_per_iteration": 2.3427488803863525 }, { "auxiliary_loss_clip": 0.01053508, "auxiliary_loss_mlp": 0.01040458, "balance_loss_clip": 1.01679516, "balance_loss_mlp": 1.01802886, "epoch": 0.6453028708853149, "flos": 22783619842560.0, "grad_norm": 1.5554288016469293, "language_loss": 0.76663816, "learning_rate": 1.1806840577282055e-06, "loss": 0.78757781, "num_input_tokens_seen": 231580810, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35546875, "step": 10733, "time_per_iteration": 2.4185116291046143 }, { "auxiliary_loss_clip": 0.01056726, "auxiliary_loss_mlp": 0.01041337, "balance_loss_clip": 1.01731586, "balance_loss_mlp": 1.01836801, "epoch": 0.6453629941379828, "flos": 23945005952640.0, "grad_norm": 2.6930892556651402, "language_loss": 0.68514895, "learning_rate": 1.1803287910544048e-06, "loss": 0.70612955, "num_input_tokens_seen": 231600585, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.38476562, "step": 10734, "time_per_iteration": 3.7996668815612793 }, { "auxiliary_loss_clip": 0.01053781, "auxiliary_loss_mlp": 0.01041663, "balance_loss_clip": 1.01778495, "balance_loss_mlp": 1.01878285, "epoch": 0.6454231173906508, "flos": 17675407664640.0, "grad_norm": 2.9105899314277166, "language_loss": 0.75481194, "learning_rate": 1.1799735554626191e-06, "loss": 0.77576637, "num_input_tokens_seen": 231618765, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.34960938, "step": 10735, "time_per_iteration": 2.3712570667266846 }, { "auxiliary_loss_clip": 0.01055001, "auxiliary_loss_mlp": 0.01038822, "balance_loss_clip": 1.01539695, "balance_loss_mlp": 1.01810312, "epoch": 0.6454832406433189, "flos": 23291710992000.0, "grad_norm": 1.7073602188211963, "language_loss": 0.75933772, "learning_rate": 1.1796183509663176e-06, "loss": 0.78027594, "num_input_tokens_seen": 231638525, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36914062, "step": 10736, "time_per_iteration": 2.392698049545288 }, { "auxiliary_loss_clip": 0.01058739, "auxiliary_loss_mlp": 0.01039719, "balance_loss_clip": 1.01578188, "balance_loss_mlp": 1.02004504, "epoch": 0.6455433638959868, "flos": 20156161253760.0, "grad_norm": 1.900999164393352, "language_loss": 0.72134382, "learning_rate": 1.1792631775789708e-06, "loss": 0.74232841, "num_input_tokens_seen": 231656785, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.38671875, "step": 10737, "time_per_iteration": 2.391960859298706 }, { "auxiliary_loss_clip": 0.01011817, "auxiliary_loss_mlp": 0.01002632, "balance_loss_clip": 1.00027215, "balance_loss_mlp": 1.00413668, "epoch": 0.6456034871486548, "flos": 66529510387200.0, "grad_norm": 0.795175811814308, "language_loss": 0.5848195, "learning_rate": 1.1789080353140464e-06, "loss": 0.60496402, "num_input_tokens_seen": 231719075, "router_z_loss_clip": 0.02355957, "router_z_loss_mlp": 0.07714844, "step": 10738, "time_per_iteration": 4.379191637039185 }, { "auxiliary_loss_clip": 0.01053243, "auxiliary_loss_mlp": 0.01036167, "balance_loss_clip": 1.012254, "balance_loss_mlp": 1.01744652, "epoch": 0.6456636104013227, "flos": 24204969002880.0, "grad_norm": 1.6325777095879264, "language_loss": 0.75423586, "learning_rate": 1.1785529241850118e-06, "loss": 0.77512997, "num_input_tokens_seen": 231737810, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.35742188, "step": 10739, "time_per_iteration": 3.8138628005981445 }, { "auxiliary_loss_clip": 0.01055794, "auxiliary_loss_mlp": 0.01040912, "balance_loss_clip": 1.01546085, "balance_loss_mlp": 1.01777565, "epoch": 0.6457237336539907, "flos": 23622931860480.0, "grad_norm": 2.150783004085481, "language_loss": 0.73029488, "learning_rate": 1.1781978442053324e-06, "loss": 0.75126195, "num_input_tokens_seen": 231756140, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38085938, "step": 10740, "time_per_iteration": 2.3869235515594482 }, { "auxiliary_loss_clip": 0.01012066, "auxiliary_loss_mlp": 0.01019412, "balance_loss_clip": 1.01665783, "balance_loss_mlp": 1.00446248, "epoch": 0.6457838569066586, "flos": 65842454275200.0, "grad_norm": 0.6717043207987834, "language_loss": 0.55340743, "learning_rate": 1.1778427953884733e-06, "loss": 0.57372224, "num_input_tokens_seen": 231823665, "router_z_loss_clip": 0.02758789, "router_z_loss_mlp": 0.07617188, "step": 10741, "time_per_iteration": 3.0664541721343994 }, { "auxiliary_loss_clip": 0.01052078, "auxiliary_loss_mlp": 0.01039348, "balance_loss_clip": 1.01781857, "balance_loss_mlp": 1.01730251, "epoch": 0.6458439801593266, "flos": 22380896776320.0, "grad_norm": 1.6226447133758133, "language_loss": 0.81991458, "learning_rate": 1.1774877777478977e-06, "loss": 0.84082884, "num_input_tokens_seen": 231844500, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34765625, "step": 10742, "time_per_iteration": 2.3851029872894287 }, { "auxiliary_loss_clip": 0.01051921, "auxiliary_loss_mlp": 0.01035699, "balance_loss_clip": 1.0123105, "balance_loss_mlp": 1.01692116, "epoch": 0.6459041034119946, "flos": 24788123308800.0, "grad_norm": 1.5506538038782571, "language_loss": 0.82652557, "learning_rate": 1.1771327912970678e-06, "loss": 0.8474018, "num_input_tokens_seen": 231864510, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.34960938, "step": 10743, "time_per_iteration": 2.4102864265441895 }, { "auxiliary_loss_clip": 0.01053756, "auxiliary_loss_mlp": 0.01040686, "balance_loss_clip": 1.01677227, "balance_loss_mlp": 1.01748621, "epoch": 0.6459642266646626, "flos": 18324583084800.0, "grad_norm": 1.7463949148069677, "language_loss": 0.72941077, "learning_rate": 1.1767778360494453e-06, "loss": 0.75035518, "num_input_tokens_seen": 231881555, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36132812, "step": 10744, "time_per_iteration": 2.3483335971832275 }, { "auxiliary_loss_clip": 0.01052256, "auxiliary_loss_mlp": 0.01039819, "balance_loss_clip": 1.01584637, "balance_loss_mlp": 1.01625538, "epoch": 0.6460243499173305, "flos": 43579670094720.0, "grad_norm": 1.6235906214319749, "language_loss": 0.68572462, "learning_rate": 1.1764229120184896e-06, "loss": 0.70664543, "num_input_tokens_seen": 231905945, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.359375, "step": 10745, "time_per_iteration": 2.5791451930999756 }, { "auxiliary_loss_clip": 0.01052343, "auxiliary_loss_mlp": 0.01038542, "balance_loss_clip": 1.01580834, "balance_loss_mlp": 1.01653504, "epoch": 0.6460844731699985, "flos": 19243042888320.0, "grad_norm": 4.0771560974304455, "language_loss": 0.75030899, "learning_rate": 1.1760680192176597e-06, "loss": 0.77121782, "num_input_tokens_seen": 231922535, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.359375, "step": 10746, "time_per_iteration": 2.3682284355163574 }, { "auxiliary_loss_clip": 0.01055678, "auxiliary_loss_mlp": 0.01039029, "balance_loss_clip": 1.01698756, "balance_loss_mlp": 1.01829195, "epoch": 0.6461445964226664, "flos": 27452135957760.0, "grad_norm": 1.407197156894725, "language_loss": 0.67873442, "learning_rate": 1.175713157660413e-06, "loss": 0.69968146, "num_input_tokens_seen": 231944800, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.37304688, "step": 10747, "time_per_iteration": 2.470689296722412 }, { "auxiliary_loss_clip": 0.01052606, "auxiliary_loss_mlp": 0.01041992, "balance_loss_clip": 1.02005768, "balance_loss_mlp": 1.01769567, "epoch": 0.6462047196753344, "flos": 20294662083840.0, "grad_norm": 1.5512036110985203, "language_loss": 0.68282259, "learning_rate": 1.1753583273602056e-06, "loss": 0.70376855, "num_input_tokens_seen": 231962970, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34960938, "step": 10748, "time_per_iteration": 2.3981056213378906 }, { "auxiliary_loss_clip": 0.01056987, "auxiliary_loss_mlp": 0.01043359, "balance_loss_clip": 1.01756263, "balance_loss_mlp": 1.01873374, "epoch": 0.6462648429280025, "flos": 22017241388160.0, "grad_norm": 1.8297628943502924, "language_loss": 0.77267909, "learning_rate": 1.1750035283304937e-06, "loss": 0.79368258, "num_input_tokens_seen": 231981195, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3828125, "step": 10749, "time_per_iteration": 3.8550045490264893 }, { "auxiliary_loss_clip": 0.01053691, "auxiliary_loss_mlp": 0.01041064, "balance_loss_clip": 1.01768696, "balance_loss_mlp": 1.01655889, "epoch": 0.6463249661806704, "flos": 27779935512960.0, "grad_norm": 1.717767441695848, "language_loss": 0.78332275, "learning_rate": 1.17464876058473e-06, "loss": 0.80427027, "num_input_tokens_seen": 232001735, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.37109375, "step": 10750, "time_per_iteration": 2.4255354404449463 }, { "auxiliary_loss_clip": 0.0105581, "auxiliary_loss_mlp": 0.01044097, "balance_loss_clip": 1.01974297, "balance_loss_mlp": 1.01764882, "epoch": 0.6463850894333384, "flos": 22049606085120.0, "grad_norm": 2.212409144374465, "language_loss": 0.70545793, "learning_rate": 1.1742940241363683e-06, "loss": 0.726457, "num_input_tokens_seen": 232019830, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.38085938, "step": 10751, "time_per_iteration": 2.4009346961975098 }, { "auxiliary_loss_clip": 0.01054199, "auxiliary_loss_mlp": 0.01047769, "balance_loss_clip": 1.02459455, "balance_loss_mlp": 1.01710939, "epoch": 0.6464452126860063, "flos": 21105170363520.0, "grad_norm": 2.114633290230461, "language_loss": 0.72967124, "learning_rate": 1.1739393189988604e-06, "loss": 0.75069094, "num_input_tokens_seen": 232039625, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.37109375, "step": 10752, "time_per_iteration": 2.370039463043213 }, { "auxiliary_loss_clip": 0.01054393, "auxiliary_loss_mlp": 0.01046104, "balance_loss_clip": 1.01930606, "balance_loss_mlp": 1.01659274, "epoch": 0.6465053359386743, "flos": 16027298023680.0, "grad_norm": 1.7612873787385885, "language_loss": 0.79361367, "learning_rate": 1.1735846451856554e-06, "loss": 0.81461859, "num_input_tokens_seen": 232055855, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.37890625, "step": 10753, "time_per_iteration": 2.367999315261841 }, { "auxiliary_loss_clip": 0.01052386, "auxiliary_loss_mlp": 0.01049735, "balance_loss_clip": 1.02625108, "balance_loss_mlp": 1.01682591, "epoch": 0.6465654591913422, "flos": 23397707479680.0, "grad_norm": 1.6362025251436505, "language_loss": 0.85716891, "learning_rate": 1.1732300027102041e-06, "loss": 0.8781901, "num_input_tokens_seen": 232073475, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.35546875, "step": 10754, "time_per_iteration": 2.4041528701782227 }, { "auxiliary_loss_clip": 0.01053089, "auxiliary_loss_mlp": 0.01045076, "balance_loss_clip": 1.02106738, "balance_loss_mlp": 1.01722264, "epoch": 0.6466255824440102, "flos": 15376377035520.0, "grad_norm": 3.1817582807084706, "language_loss": 0.60957265, "learning_rate": 1.1728753915859541e-06, "loss": 0.63055432, "num_input_tokens_seen": 232091090, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.359375, "step": 10755, "time_per_iteration": 2.346297264099121 }, { "auxiliary_loss_clip": 0.01052884, "auxiliary_loss_mlp": 0.01044544, "balance_loss_clip": 1.02036846, "balance_loss_mlp": 1.01653504, "epoch": 0.6466857056966782, "flos": 16251928911360.0, "grad_norm": 2.0630861501193425, "language_loss": 0.69222093, "learning_rate": 1.1725208118263518e-06, "loss": 0.7131952, "num_input_tokens_seen": 232107320, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36328125, "step": 10756, "time_per_iteration": 2.336540699005127 }, { "auxiliary_loss_clip": 0.01057236, "auxiliary_loss_mlp": 0.01047079, "balance_loss_clip": 1.02274823, "balance_loss_mlp": 1.01786733, "epoch": 0.6467458289493462, "flos": 21177196231680.0, "grad_norm": 3.0634835665401465, "language_loss": 0.7641927, "learning_rate": 1.172166263444844e-06, "loss": 0.78523582, "num_input_tokens_seen": 232123930, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.39453125, "step": 10757, "time_per_iteration": 2.356574296951294 }, { "auxiliary_loss_clip": 0.01052525, "auxiliary_loss_mlp": 0.01041802, "balance_loss_clip": 1.01823473, "balance_loss_mlp": 1.01738441, "epoch": 0.6468059522020141, "flos": 17967316475520.0, "grad_norm": 1.548720198723808, "language_loss": 0.75781214, "learning_rate": 1.1718117464548734e-06, "loss": 0.77875543, "num_input_tokens_seen": 232142905, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.3515625, "step": 10758, "time_per_iteration": 2.363762140274048 }, { "auxiliary_loss_clip": 0.0105371, "auxiliary_loss_mlp": 0.01051472, "balance_loss_clip": 1.02530515, "balance_loss_mlp": 1.0169245, "epoch": 0.6468660754546821, "flos": 17889320764800.0, "grad_norm": 1.8491373520255292, "language_loss": 0.68865347, "learning_rate": 1.1714572608698845e-06, "loss": 0.70970523, "num_input_tokens_seen": 232162230, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3671875, "step": 10759, "time_per_iteration": 2.3759570121765137 }, { "auxiliary_loss_clip": 0.01055926, "auxiliary_loss_mlp": 0.01040885, "balance_loss_clip": 1.01707864, "balance_loss_mlp": 1.01778293, "epoch": 0.64692619870735, "flos": 22599906935040.0, "grad_norm": 1.4806612172451579, "language_loss": 0.765571, "learning_rate": 1.1711028067033197e-06, "loss": 0.78653914, "num_input_tokens_seen": 232182700, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3828125, "step": 10760, "time_per_iteration": 2.4225521087646484 }, { "auxiliary_loss_clip": 0.01052047, "auxiliary_loss_mlp": 0.01036678, "balance_loss_clip": 1.01488686, "balance_loss_mlp": 1.01672626, "epoch": 0.646986321960018, "flos": 49598940867840.0, "grad_norm": 1.5112768083666424, "language_loss": 0.66126728, "learning_rate": 1.1707483839686194e-06, "loss": 0.68215448, "num_input_tokens_seen": 232208235, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.35351562, "step": 10761, "time_per_iteration": 2.613661527633667 }, { "auxiliary_loss_clip": 0.01054146, "auxiliary_loss_mlp": 0.01040465, "balance_loss_clip": 1.01581228, "balance_loss_mlp": 1.01722002, "epoch": 0.6470464452126861, "flos": 21907369739520.0, "grad_norm": 1.9644075105599186, "language_loss": 0.71024525, "learning_rate": 1.1703939926792235e-06, "loss": 0.7311914, "num_input_tokens_seen": 232228720, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.36914062, "step": 10762, "time_per_iteration": 2.409022808074951 }, { "auxiliary_loss_clip": 0.01056518, "auxiliary_loss_mlp": 0.01044128, "balance_loss_clip": 1.01988125, "balance_loss_mlp": 1.01837718, "epoch": 0.647106568465354, "flos": 18105363457920.0, "grad_norm": 1.8577301819439889, "language_loss": 0.83436453, "learning_rate": 1.1700396328485705e-06, "loss": 0.855371, "num_input_tokens_seen": 232244655, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.38085938, "step": 10763, "time_per_iteration": 2.352147102355957 }, { "auxiliary_loss_clip": 0.01009766, "auxiliary_loss_mlp": 0.01008341, "balance_loss_clip": 1.00595677, "balance_loss_mlp": 1.00233459, "epoch": 0.647166691718022, "flos": 69476773829760.0, "grad_norm": 0.7101647323556208, "language_loss": 0.57929057, "learning_rate": 1.1696853044900978e-06, "loss": 0.59947163, "num_input_tokens_seen": 232308685, "router_z_loss_clip": 0.02380371, "router_z_loss_mlp": 0.07421875, "step": 10764, "time_per_iteration": 3.2071871757507324 }, { "auxiliary_loss_clip": 0.01051894, "auxiliary_loss_mlp": 0.01036958, "balance_loss_clip": 1.01480865, "balance_loss_mlp": 1.01674795, "epoch": 0.6472268149706899, "flos": 34093733448960.0, "grad_norm": 1.8034147837497108, "language_loss": 0.61609912, "learning_rate": 1.1693310076172413e-06, "loss": 0.63698763, "num_input_tokens_seen": 232327520, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.3515625, "step": 10765, "time_per_iteration": 2.483252763748169 }, { "auxiliary_loss_clip": 0.01052273, "auxiliary_loss_mlp": 0.01033455, "balance_loss_clip": 1.01179504, "balance_loss_mlp": 1.01748705, "epoch": 0.6472869382233579, "flos": 28109969395200.0, "grad_norm": 1.780535568082008, "language_loss": 0.63800842, "learning_rate": 1.168976742243437e-06, "loss": 0.65886569, "num_input_tokens_seen": 232349025, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.34765625, "step": 10766, "time_per_iteration": 2.44814395904541 }, { "auxiliary_loss_clip": 0.01054837, "auxiliary_loss_mlp": 0.0103564, "balance_loss_clip": 1.01256084, "balance_loss_mlp": 1.01816034, "epoch": 0.6473470614760258, "flos": 22491047715840.0, "grad_norm": 1.6303592850671424, "language_loss": 0.77162993, "learning_rate": 1.1686225083821174e-06, "loss": 0.79253471, "num_input_tokens_seen": 232367835, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.3671875, "step": 10767, "time_per_iteration": 2.3859012126922607 }, { "auxiliary_loss_clip": 0.01052623, "auxiliary_loss_mlp": 0.0103483, "balance_loss_clip": 1.01195371, "balance_loss_mlp": 1.01787627, "epoch": 0.6474071847286939, "flos": 14538042535680.0, "grad_norm": 1.9618581802708077, "language_loss": 0.7883839, "learning_rate": 1.1682683060467153e-06, "loss": 0.80925846, "num_input_tokens_seen": 232385840, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34765625, "step": 10768, "time_per_iteration": 2.364060163497925 }, { "auxiliary_loss_clip": 0.01053795, "auxiliary_loss_mlp": 0.0103135, "balance_loss_clip": 1.01005888, "balance_loss_mlp": 1.01806331, "epoch": 0.6474673079813618, "flos": 24097052390400.0, "grad_norm": 2.0090715355000746, "language_loss": 0.721825, "learning_rate": 1.167914135250663e-06, "loss": 0.7426765, "num_input_tokens_seen": 232406205, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.35742188, "step": 10769, "time_per_iteration": 2.4069390296936035 }, { "auxiliary_loss_clip": 0.01053166, "auxiliary_loss_mlp": 0.01034666, "balance_loss_clip": 1.01266026, "balance_loss_mlp": 1.01855993, "epoch": 0.6475274312340298, "flos": 14975294803200.0, "grad_norm": 1.8178753569146218, "language_loss": 0.7480287, "learning_rate": 1.1675599960073895e-06, "loss": 0.76890707, "num_input_tokens_seen": 232424995, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34570312, "step": 10770, "time_per_iteration": 2.4079084396362305 }, { "auxiliary_loss_clip": 0.01056294, "auxiliary_loss_mlp": 0.01036837, "balance_loss_clip": 1.01104045, "balance_loss_mlp": 1.01817322, "epoch": 0.6475875544866977, "flos": 25044176286720.0, "grad_norm": 1.8954301642959925, "language_loss": 0.74890387, "learning_rate": 1.167205888330325e-06, "loss": 0.76983517, "num_input_tokens_seen": 232445870, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38085938, "step": 10771, "time_per_iteration": 2.425462007522583 }, { "auxiliary_loss_clip": 0.01055454, "auxiliary_loss_mlp": 0.01041378, "balance_loss_clip": 1.0175004, "balance_loss_mlp": 1.0194943, "epoch": 0.6476476777393657, "flos": 16471218360960.0, "grad_norm": 1.9796592032778333, "language_loss": 0.74976975, "learning_rate": 1.1668518122328958e-06, "loss": 0.77073807, "num_input_tokens_seen": 232464285, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.359375, "step": 10772, "time_per_iteration": 2.4447150230407715 }, { "auxiliary_loss_clip": 0.01052213, "auxiliary_loss_mlp": 0.01035023, "balance_loss_clip": 1.0155201, "balance_loss_mlp": 1.01731837, "epoch": 0.6477078009920336, "flos": 25811078411520.0, "grad_norm": 1.5387638023143058, "language_loss": 0.83863413, "learning_rate": 1.1664977677285305e-06, "loss": 0.85950649, "num_input_tokens_seen": 232485815, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.34960938, "step": 10773, "time_per_iteration": 2.4091789722442627 }, { "auxiliary_loss_clip": 0.01050884, "auxiliary_loss_mlp": 0.01030123, "balance_loss_clip": 1.00823641, "balance_loss_mlp": 1.0170126, "epoch": 0.6477679242447016, "flos": 17675163285120.0, "grad_norm": 1.6100838379248177, "language_loss": 0.79435909, "learning_rate": 1.1661437548306524e-06, "loss": 0.81516922, "num_input_tokens_seen": 232504875, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.33984375, "step": 10774, "time_per_iteration": 3.6323893070220947 }, { "auxiliary_loss_clip": 0.01053819, "auxiliary_loss_mlp": 0.01041056, "balance_loss_clip": 1.01808405, "balance_loss_mlp": 1.01786137, "epoch": 0.6478280474973696, "flos": 21031259281920.0, "grad_norm": 2.269975489431806, "language_loss": 0.70842749, "learning_rate": 1.1657897735526867e-06, "loss": 0.72937626, "num_input_tokens_seen": 232521945, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.359375, "step": 10775, "time_per_iteration": 2.401000738143921 }, { "auxiliary_loss_clip": 0.01055355, "auxiliary_loss_mlp": 0.01043381, "balance_loss_clip": 1.01856196, "balance_loss_mlp": 1.01849294, "epoch": 0.6478881707500376, "flos": 21615844953600.0, "grad_norm": 2.59157063435956, "language_loss": 0.67743123, "learning_rate": 1.1654358239080574e-06, "loss": 0.69841856, "num_input_tokens_seen": 232541500, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3671875, "step": 10776, "time_per_iteration": 2.372556447982788 }, { "auxiliary_loss_clip": 0.01054842, "auxiliary_loss_mlp": 0.01035516, "balance_loss_clip": 1.01134026, "balance_loss_mlp": 1.01717687, "epoch": 0.6479482940027056, "flos": 18441576650880.0, "grad_norm": 2.4722359899951556, "language_loss": 0.80916715, "learning_rate": 1.1650819059101839e-06, "loss": 0.83007073, "num_input_tokens_seen": 232559720, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.375, "step": 10777, "time_per_iteration": 2.3571970462799072 }, { "auxiliary_loss_clip": 0.01054595, "auxiliary_loss_mlp": 0.01039394, "balance_loss_clip": 1.01598144, "balance_loss_mlp": 1.01832831, "epoch": 0.6480084172553735, "flos": 22162968869760.0, "grad_norm": 1.8850138482697545, "language_loss": 0.74282831, "learning_rate": 1.1647280195724896e-06, "loss": 0.7637682, "num_input_tokens_seen": 232579370, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 10778, "time_per_iteration": 3.702528238296509 }, { "auxiliary_loss_clip": 0.01052423, "auxiliary_loss_mlp": 0.01037463, "balance_loss_clip": 1.0150038, "balance_loss_mlp": 1.01671982, "epoch": 0.6480685405080415, "flos": 24315085031040.0, "grad_norm": 1.4987539021289757, "language_loss": 0.7923708, "learning_rate": 1.1643741649083923e-06, "loss": 0.81326962, "num_input_tokens_seen": 232600495, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35742188, "step": 10779, "time_per_iteration": 3.7859139442443848 }, { "auxiliary_loss_clip": 0.01011707, "auxiliary_loss_mlp": 0.01002646, "balance_loss_clip": 1.00051248, "balance_loss_mlp": 1.00422502, "epoch": 0.6481286637607094, "flos": 59888017630080.0, "grad_norm": 0.7202543686298508, "language_loss": 0.59475619, "learning_rate": 1.1640203419313095e-06, "loss": 0.61489969, "num_input_tokens_seen": 232663165, "router_z_loss_clip": 0.0213623, "router_z_loss_mlp": 0.07470703, "step": 10780, "time_per_iteration": 3.0126631259918213 }, { "auxiliary_loss_clip": 0.01053554, "auxiliary_loss_mlp": 0.01045251, "balance_loss_clip": 1.02156377, "balance_loss_mlp": 1.01742435, "epoch": 0.6481887870133775, "flos": 25482999565440.0, "grad_norm": 1.9196869839371726, "language_loss": 0.80842417, "learning_rate": 1.1636665506546599e-06, "loss": 0.82941222, "num_input_tokens_seen": 232683385, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.36132812, "step": 10781, "time_per_iteration": 2.403019905090332 }, { "auxiliary_loss_clip": 0.01055093, "auxiliary_loss_mlp": 0.01040558, "balance_loss_clip": 1.01652539, "balance_loss_mlp": 1.01843333, "epoch": 0.6482489102660454, "flos": 19929400773120.0, "grad_norm": 3.285919599521553, "language_loss": 0.8038035, "learning_rate": 1.1633127910918578e-06, "loss": 0.82476008, "num_input_tokens_seen": 232699095, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 10782, "time_per_iteration": 2.346817970275879 }, { "auxiliary_loss_clip": 0.01053973, "auxiliary_loss_mlp": 0.01041108, "balance_loss_clip": 1.01738548, "balance_loss_mlp": 1.01712894, "epoch": 0.6483090335187134, "flos": 26978259807360.0, "grad_norm": 2.173204223796254, "language_loss": 0.66183722, "learning_rate": 1.1629590632563187e-06, "loss": 0.68278807, "num_input_tokens_seen": 232717920, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.36914062, "step": 10783, "time_per_iteration": 2.4247231483459473 }, { "auxiliary_loss_clip": 0.01055424, "auxiliary_loss_mlp": 0.01045485, "balance_loss_clip": 1.01965284, "balance_loss_mlp": 1.0176506, "epoch": 0.6483691567713813, "flos": 25076925008640.0, "grad_norm": 1.7895460703469552, "language_loss": 0.89709628, "learning_rate": 1.1626053671614561e-06, "loss": 0.91810524, "num_input_tokens_seen": 232737605, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37695312, "step": 10784, "time_per_iteration": 2.4067225456237793 }, { "auxiliary_loss_clip": 0.01053308, "auxiliary_loss_mlp": 0.01036069, "balance_loss_clip": 1.01389635, "balance_loss_mlp": 1.01759315, "epoch": 0.6484292800240493, "flos": 16105084266240.0, "grad_norm": 2.1743876122555927, "language_loss": 0.75526309, "learning_rate": 1.1622517028206815e-06, "loss": 0.77615684, "num_input_tokens_seen": 232755110, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.35742188, "step": 10785, "time_per_iteration": 2.3301734924316406 }, { "auxiliary_loss_clip": 0.01051925, "auxiliary_loss_mlp": 0.01039201, "balance_loss_clip": 1.016873, "balance_loss_mlp": 1.01665425, "epoch": 0.6484894032767172, "flos": 28839130473600.0, "grad_norm": 1.4619784339513133, "language_loss": 0.70022833, "learning_rate": 1.1618980702474071e-06, "loss": 0.72113961, "num_input_tokens_seen": 232779040, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35351562, "step": 10786, "time_per_iteration": 2.44413685798645 }, { "auxiliary_loss_clip": 0.01051906, "auxiliary_loss_mlp": 0.01043339, "balance_loss_clip": 1.02121329, "balance_loss_mlp": 1.01546884, "epoch": 0.6485495265293852, "flos": 30225740964480.0, "grad_norm": 2.0407075140378, "language_loss": 0.72299671, "learning_rate": 1.161544469455041e-06, "loss": 0.74394912, "num_input_tokens_seen": 232800515, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.36328125, "step": 10787, "time_per_iteration": 2.4500064849853516 }, { "auxiliary_loss_clip": 0.01055739, "auxiliary_loss_mlp": 0.01040046, "balance_loss_clip": 1.01476169, "balance_loss_mlp": 1.01707757, "epoch": 0.6486096497820532, "flos": 20081202831360.0, "grad_norm": 2.0061588118475417, "language_loss": 0.85293061, "learning_rate": 1.1611909004569934e-06, "loss": 0.87388843, "num_input_tokens_seen": 232818450, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38671875, "step": 10788, "time_per_iteration": 2.3468503952026367 }, { "auxiliary_loss_clip": 0.01053959, "auxiliary_loss_mlp": 0.01040898, "balance_loss_clip": 1.01609111, "balance_loss_mlp": 1.0170908, "epoch": 0.6486697730347212, "flos": 17128109191680.0, "grad_norm": 1.8566565403983104, "language_loss": 0.7852869, "learning_rate": 1.1608373632666708e-06, "loss": 0.80623543, "num_input_tokens_seen": 232834785, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36914062, "step": 10789, "time_per_iteration": 3.799407958984375 }, { "auxiliary_loss_clip": 0.01050838, "auxiliary_loss_mlp": 0.01038869, "balance_loss_clip": 1.01657724, "balance_loss_mlp": 1.01577413, "epoch": 0.6487298962873892, "flos": 38910351018240.0, "grad_norm": 1.6775657055038795, "language_loss": 0.76645803, "learning_rate": 1.160483857897479e-06, "loss": 0.78735507, "num_input_tokens_seen": 232856050, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34960938, "step": 10790, "time_per_iteration": 2.627246856689453 }, { "auxiliary_loss_clip": 0.01052741, "auxiliary_loss_mlp": 0.0103785, "balance_loss_clip": 1.01619005, "balance_loss_mlp": 1.01747978, "epoch": 0.6487900195400571, "flos": 11947033272960.0, "grad_norm": 2.1058852646791837, "language_loss": 0.60890162, "learning_rate": 1.160130384362823e-06, "loss": 0.62980747, "num_input_tokens_seen": 232873945, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.3515625, "step": 10791, "time_per_iteration": 2.4384970664978027 }, { "auxiliary_loss_clip": 0.01052028, "auxiliary_loss_mlp": 0.01043463, "balance_loss_clip": 1.02188623, "balance_loss_mlp": 1.01596367, "epoch": 0.6488501427927251, "flos": 22343400109440.0, "grad_norm": 1.8345734828988036, "language_loss": 0.86987901, "learning_rate": 1.1597769426761082e-06, "loss": 0.89083391, "num_input_tokens_seen": 232892160, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.36132812, "step": 10792, "time_per_iteration": 2.3898937702178955 }, { "auxiliary_loss_clip": 0.01055897, "auxiliary_loss_mlp": 0.01042675, "balance_loss_clip": 1.01894021, "balance_loss_mlp": 1.01770473, "epoch": 0.648910266045393, "flos": 22235204206080.0, "grad_norm": 1.8819855135312773, "language_loss": 0.79001176, "learning_rate": 1.159423532850735e-06, "loss": 0.81099743, "num_input_tokens_seen": 232911725, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3828125, "step": 10793, "time_per_iteration": 2.440857410430908 }, { "auxiliary_loss_clip": 0.01054212, "auxiliary_loss_mlp": 0.01043436, "balance_loss_clip": 1.01818788, "balance_loss_mlp": 1.01679039, "epoch": 0.6489703892980611, "flos": 25300089619200.0, "grad_norm": 2.3236880751140045, "language_loss": 0.76088244, "learning_rate": 1.1590701549001055e-06, "loss": 0.78185892, "num_input_tokens_seen": 232929085, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.375, "step": 10794, "time_per_iteration": 2.4491820335388184 }, { "auxiliary_loss_clip": 0.01052852, "auxiliary_loss_mlp": 0.0103902, "balance_loss_clip": 1.01581037, "balance_loss_mlp": 1.01597214, "epoch": 0.649030512550729, "flos": 24570753984000.0, "grad_norm": 1.63262525907455, "language_loss": 0.71187222, "learning_rate": 1.158716808837621e-06, "loss": 0.73279095, "num_input_tokens_seen": 232949455, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36914062, "step": 10795, "time_per_iteration": 2.549916982650757 }, { "auxiliary_loss_clip": 0.0105524, "auxiliary_loss_mlp": 0.01044661, "balance_loss_clip": 1.0176605, "balance_loss_mlp": 1.01783586, "epoch": 0.649090635803397, "flos": 26243652556800.0, "grad_norm": 1.7588386041057074, "language_loss": 0.54977721, "learning_rate": 1.158363494676679e-06, "loss": 0.57077622, "num_input_tokens_seen": 232969445, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.375, "step": 10796, "time_per_iteration": 2.4991137981414795 }, { "auxiliary_loss_clip": 0.01054273, "auxiliary_loss_mlp": 0.01039412, "balance_loss_clip": 1.01630974, "balance_loss_mlp": 1.01722825, "epoch": 0.6491507590560649, "flos": 24936189851520.0, "grad_norm": 2.56010857797697, "language_loss": 0.78931248, "learning_rate": 1.1580102124306775e-06, "loss": 0.81024933, "num_input_tokens_seen": 232988900, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.37109375, "step": 10797, "time_per_iteration": 2.6582343578338623 }, { "auxiliary_loss_clip": 0.01052453, "auxiliary_loss_mlp": 0.01040107, "balance_loss_clip": 1.01655138, "balance_loss_mlp": 1.01812005, "epoch": 0.6492108823087329, "flos": 19498781664000.0, "grad_norm": 1.986628273593499, "language_loss": 0.71138167, "learning_rate": 1.1576569621130134e-06, "loss": 0.73230726, "num_input_tokens_seen": 233005060, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.34375, "step": 10798, "time_per_iteration": 2.3892533779144287 }, { "auxiliary_loss_clip": 0.0105228, "auxiliary_loss_mlp": 0.01034162, "balance_loss_clip": 1.01171517, "balance_loss_mlp": 1.01654768, "epoch": 0.6492710055614008, "flos": 19718280581760.0, "grad_norm": 1.6933737037440477, "language_loss": 0.77758974, "learning_rate": 1.1573037437370811e-06, "loss": 0.79845417, "num_input_tokens_seen": 233023375, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35742188, "step": 10799, "time_per_iteration": 2.34598708152771 }, { "auxiliary_loss_clip": 0.01054183, "auxiliary_loss_mlp": 0.01042247, "balance_loss_clip": 1.01598549, "balance_loss_mlp": 1.01655781, "epoch": 0.6493311288140688, "flos": 24315853080960.0, "grad_norm": 1.8082726795302015, "language_loss": 0.72255343, "learning_rate": 1.1569505573162755e-06, "loss": 0.74351776, "num_input_tokens_seen": 233043130, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37695312, "step": 10800, "time_per_iteration": 2.4152514934539795 }, { "auxiliary_loss_clip": 0.01012254, "auxiliary_loss_mlp": 0.01009801, "balance_loss_clip": 1.00763106, "balance_loss_mlp": 1.004632, "epoch": 0.6493912520667368, "flos": 70931465205120.0, "grad_norm": 0.7659409592913372, "language_loss": 0.60339117, "learning_rate": 1.1565974028639897e-06, "loss": 0.62361169, "num_input_tokens_seen": 233110560, "router_z_loss_clip": 0.02172852, "router_z_loss_mlp": 0.07617188, "step": 10801, "time_per_iteration": 3.119224786758423 }, { "auxiliary_loss_clip": 0.01058505, "auxiliary_loss_mlp": 0.01040451, "balance_loss_clip": 1.01379609, "balance_loss_mlp": 1.01935887, "epoch": 0.6494513753194048, "flos": 25336608768000.0, "grad_norm": 1.5580560758472957, "language_loss": 0.7928952, "learning_rate": 1.156244280393614e-06, "loss": 0.81388474, "num_input_tokens_seen": 233130080, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.390625, "step": 10802, "time_per_iteration": 2.4118871688842773 }, { "auxiliary_loss_clip": 0.01054805, "auxiliary_loss_mlp": 0.01043636, "balance_loss_clip": 1.01767218, "balance_loss_mlp": 1.01696563, "epoch": 0.6495114985720728, "flos": 24680800189440.0, "grad_norm": 1.5252889481577423, "language_loss": 0.75296843, "learning_rate": 1.155891189918541e-06, "loss": 0.77395284, "num_input_tokens_seen": 233150235, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37890625, "step": 10803, "time_per_iteration": 2.392454147338867 }, { "auxiliary_loss_clip": 0.01054249, "auxiliary_loss_mlp": 0.01039566, "balance_loss_clip": 1.01424599, "balance_loss_mlp": 1.01690984, "epoch": 0.6495716218247407, "flos": 23650269321600.0, "grad_norm": 2.3120280784356235, "language_loss": 0.70802343, "learning_rate": 1.1555381314521578e-06, "loss": 0.72896159, "num_input_tokens_seen": 233166710, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37304688, "step": 10804, "time_per_iteration": 2.372480869293213 }, { "auxiliary_loss_clip": 0.0105427, "auxiliary_loss_mlp": 0.01038932, "balance_loss_clip": 1.01441038, "balance_loss_mlp": 1.01816607, "epoch": 0.6496317450774087, "flos": 22345075854720.0, "grad_norm": 1.7655794729103709, "language_loss": 0.7335664, "learning_rate": 1.1551851050078537e-06, "loss": 0.75449848, "num_input_tokens_seen": 233185445, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.36132812, "step": 10805, "time_per_iteration": 2.368354320526123 }, { "auxiliary_loss_clip": 0.01054707, "auxiliary_loss_mlp": 0.0104193, "balance_loss_clip": 1.01693177, "balance_loss_mlp": 1.01687789, "epoch": 0.6496918683300766, "flos": 30517335573120.0, "grad_norm": 2.0946999363031416, "language_loss": 0.67294276, "learning_rate": 1.1548321105990155e-06, "loss": 0.69390917, "num_input_tokens_seen": 233205805, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37695312, "step": 10806, "time_per_iteration": 2.458112955093384 }, { "auxiliary_loss_clip": 0.01056394, "auxiliary_loss_mlp": 0.0103795, "balance_loss_clip": 1.01186657, "balance_loss_mlp": 1.01743197, "epoch": 0.6497519915827447, "flos": 12458161710720.0, "grad_norm": 1.953209290468795, "language_loss": 0.80800068, "learning_rate": 1.1544791482390275e-06, "loss": 0.82894415, "num_input_tokens_seen": 233224215, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.390625, "step": 10807, "time_per_iteration": 2.345942735671997 }, { "auxiliary_loss_clip": 0.01012555, "auxiliary_loss_mlp": 0.01016534, "balance_loss_clip": 1.01367259, "balance_loss_mlp": 1.00509548, "epoch": 0.6498121148354126, "flos": 69090075077760.0, "grad_norm": 0.7943385853823598, "language_loss": 0.59015685, "learning_rate": 1.1541262179412745e-06, "loss": 0.61044776, "num_input_tokens_seen": 233294440, "router_z_loss_clip": 0.02856445, "router_z_loss_mlp": 0.07421875, "step": 10808, "time_per_iteration": 3.1716926097869873 }, { "auxiliary_loss_clip": 0.01053774, "auxiliary_loss_mlp": 0.01036028, "balance_loss_clip": 1.01162612, "balance_loss_mlp": 1.01777697, "epoch": 0.6498722380880806, "flos": 36895827991680.0, "grad_norm": 2.3949854073535537, "language_loss": 0.64320111, "learning_rate": 1.1537733197191415e-06, "loss": 0.6640991, "num_input_tokens_seen": 233316125, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.359375, "step": 10809, "time_per_iteration": 2.50059175491333 }, { "auxiliary_loss_clip": 0.01053574, "auxiliary_loss_mlp": 0.01040137, "balance_loss_clip": 1.01747513, "balance_loss_mlp": 1.0180676, "epoch": 0.6499323613407485, "flos": 29016629159040.0, "grad_norm": 1.4860361677129648, "language_loss": 0.82337165, "learning_rate": 1.153420453586008e-06, "loss": 0.84430873, "num_input_tokens_seen": 233336140, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35351562, "step": 10810, "time_per_iteration": 2.499319076538086 }, { "auxiliary_loss_clip": 0.01052879, "auxiliary_loss_mlp": 0.01037498, "balance_loss_clip": 1.01459801, "balance_loss_mlp": 1.01728308, "epoch": 0.6499924845934165, "flos": 20118245650560.0, "grad_norm": 1.5157854985562063, "language_loss": 0.72673565, "learning_rate": 1.1530676195552561e-06, "loss": 0.74763942, "num_input_tokens_seen": 233356095, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35546875, "step": 10811, "time_per_iteration": 2.3682758808135986 }, { "auxiliary_loss_clip": 0.01054061, "auxiliary_loss_mlp": 0.01033351, "balance_loss_clip": 1.0109992, "balance_loss_mlp": 1.01849842, "epoch": 0.6500526078460844, "flos": 24420313468800.0, "grad_norm": 1.4352515154730583, "language_loss": 0.78567165, "learning_rate": 1.1527148176402649e-06, "loss": 0.80654573, "num_input_tokens_seen": 233376830, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35546875, "step": 10812, "time_per_iteration": 2.429349184036255 }, { "auxiliary_loss_clip": 0.01054747, "auxiliary_loss_mlp": 0.01038271, "balance_loss_clip": 1.01329708, "balance_loss_mlp": 1.01695597, "epoch": 0.6501127310987524, "flos": 23329905886080.0, "grad_norm": 1.6975651038743764, "language_loss": 0.85679889, "learning_rate": 1.152362047854413e-06, "loss": 0.877729, "num_input_tokens_seen": 233395275, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37890625, "step": 10813, "time_per_iteration": 3.654006004333496 }, { "auxiliary_loss_clip": 0.01054623, "auxiliary_loss_mlp": 0.01037055, "balance_loss_clip": 1.01440537, "balance_loss_mlp": 1.01806223, "epoch": 0.6501728543514204, "flos": 18696826667520.0, "grad_norm": 1.7236413506071793, "language_loss": 0.81134665, "learning_rate": 1.1520093102110764e-06, "loss": 0.83226347, "num_input_tokens_seen": 233413345, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36523438, "step": 10814, "time_per_iteration": 2.3598430156707764 }, { "auxiliary_loss_clip": 0.01055487, "auxiliary_loss_mlp": 0.01041237, "balance_loss_clip": 1.01578593, "balance_loss_mlp": 1.01737833, "epoch": 0.6502329776040884, "flos": 44198191474560.0, "grad_norm": 1.6140048157093179, "language_loss": 0.66163272, "learning_rate": 1.1516566047236328e-06, "loss": 0.6825999, "num_input_tokens_seen": 233436105, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38085938, "step": 10815, "time_per_iteration": 2.5559487342834473 }, { "auxiliary_loss_clip": 0.01057345, "auxiliary_loss_mlp": 0.01042923, "balance_loss_clip": 1.0149684, "balance_loss_mlp": 1.01870847, "epoch": 0.6502931008567564, "flos": 14573863457280.0, "grad_norm": 1.943309925440172, "language_loss": 0.77384222, "learning_rate": 1.1513039314054546e-06, "loss": 0.79484493, "num_input_tokens_seen": 233452320, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.38671875, "step": 10816, "time_per_iteration": 2.361140727996826 }, { "auxiliary_loss_clip": 0.010544, "auxiliary_loss_mlp": 0.01041617, "balance_loss_clip": 1.01696491, "balance_loss_mlp": 1.01764655, "epoch": 0.6503532241094243, "flos": 21394006974720.0, "grad_norm": 1.6930775684579886, "language_loss": 0.73744267, "learning_rate": 1.1509512902699174e-06, "loss": 0.75840288, "num_input_tokens_seen": 233469920, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3671875, "step": 10817, "time_per_iteration": 3.825584650039673 }, { "auxiliary_loss_clip": 0.01054045, "auxiliary_loss_mlp": 0.0104115, "balance_loss_clip": 1.01540089, "balance_loss_mlp": 1.01673055, "epoch": 0.6504133473620923, "flos": 74738256209280.0, "grad_norm": 1.4441933597132044, "language_loss": 0.72427166, "learning_rate": 1.1505986813303916e-06, "loss": 0.74522364, "num_input_tokens_seen": 233499780, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37304688, "step": 10818, "time_per_iteration": 4.228999853134155 }, { "auxiliary_loss_clip": 0.01056469, "auxiliary_loss_mlp": 0.01042907, "balance_loss_clip": 1.01579881, "balance_loss_mlp": 1.01771092, "epoch": 0.6504734706147602, "flos": 19712415473280.0, "grad_norm": 2.0167462666421727, "language_loss": 0.66551143, "learning_rate": 1.150246104600249e-06, "loss": 0.6865052, "num_input_tokens_seen": 233518235, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.38671875, "step": 10819, "time_per_iteration": 2.3784759044647217 }, { "auxiliary_loss_clip": 0.01055548, "auxiliary_loss_mlp": 0.01044781, "balance_loss_clip": 1.01887727, "balance_loss_mlp": 1.01717901, "epoch": 0.6505335938674283, "flos": 25555688749440.0, "grad_norm": 1.8658747469555255, "language_loss": 0.84263122, "learning_rate": 1.14989356009286e-06, "loss": 0.86363459, "num_input_tokens_seen": 233535215, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3828125, "step": 10820, "time_per_iteration": 2.4128024578094482 }, { "auxiliary_loss_clip": 0.01057633, "auxiliary_loss_mlp": 0.01042478, "balance_loss_clip": 1.01566827, "balance_loss_mlp": 1.01774764, "epoch": 0.6505937171200962, "flos": 17820471830400.0, "grad_norm": 2.068466120624152, "language_loss": 0.79893208, "learning_rate": 1.1495410478215914e-06, "loss": 0.81993324, "num_input_tokens_seen": 233552775, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.3984375, "step": 10821, "time_per_iteration": 2.3682940006256104 }, { "auxiliary_loss_clip": 0.01051738, "auxiliary_loss_mlp": 0.01033554, "balance_loss_clip": 1.01067734, "balance_loss_mlp": 1.01651025, "epoch": 0.6506538403727642, "flos": 20667080223360.0, "grad_norm": 1.4371886423491413, "language_loss": 0.80632854, "learning_rate": 1.1491885677998126e-06, "loss": 0.82718146, "num_input_tokens_seen": 233572080, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3515625, "step": 10822, "time_per_iteration": 2.353161573410034 }, { "auxiliary_loss_clip": 0.01053419, "auxiliary_loss_mlp": 0.01041802, "balance_loss_clip": 1.01533771, "balance_loss_mlp": 1.01583409, "epoch": 0.6507139636254321, "flos": 11720831374080.0, "grad_norm": 1.8946028144907963, "language_loss": 0.87778509, "learning_rate": 1.1488361200408883e-06, "loss": 0.89873731, "num_input_tokens_seen": 233589155, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.375, "step": 10823, "time_per_iteration": 2.3475732803344727 }, { "auxiliary_loss_clip": 0.010543, "auxiliary_loss_mlp": 0.01046177, "balance_loss_clip": 1.01972461, "balance_loss_mlp": 1.01634932, "epoch": 0.6507740868781001, "flos": 26760506457600.0, "grad_norm": 2.8027049062445655, "language_loss": 0.67927575, "learning_rate": 1.148483704558183e-06, "loss": 0.70028055, "num_input_tokens_seen": 233608180, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37890625, "step": 10824, "time_per_iteration": 2.3966002464294434 }, { "auxiliary_loss_clip": 0.01055083, "auxiliary_loss_mlp": 0.01042646, "balance_loss_clip": 1.01794612, "balance_loss_mlp": 1.01626194, "epoch": 0.650834210130768, "flos": 16470799424640.0, "grad_norm": 7.717703101173519, "language_loss": 0.88988507, "learning_rate": 1.1481313213650607e-06, "loss": 0.91086233, "num_input_tokens_seen": 233625750, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.38671875, "step": 10825, "time_per_iteration": 2.3522849082946777 }, { "auxiliary_loss_clip": 0.010569, "auxiliary_loss_mlp": 0.01044292, "balance_loss_clip": 1.0141443, "balance_loss_mlp": 1.01737392, "epoch": 0.650894333383436, "flos": 17127725166720.0, "grad_norm": 2.386447377950689, "language_loss": 0.74998832, "learning_rate": 1.147778970474885e-06, "loss": 0.77100021, "num_input_tokens_seen": 233644235, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.39453125, "step": 10826, "time_per_iteration": 2.3244056701660156 }, { "auxiliary_loss_clip": 0.01052451, "auxiliary_loss_mlp": 0.01039759, "balance_loss_clip": 1.01493919, "balance_loss_mlp": 1.01666927, "epoch": 0.650954456636104, "flos": 18733241082240.0, "grad_norm": 2.0653776747881785, "language_loss": 0.71008837, "learning_rate": 1.1474266519010157e-06, "loss": 0.73101044, "num_input_tokens_seen": 233662845, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.35742188, "step": 10827, "time_per_iteration": 2.355087995529175 }, { "auxiliary_loss_clip": 0.01054807, "auxiliary_loss_mlp": 0.01043346, "balance_loss_clip": 1.01951623, "balance_loss_mlp": 1.01726246, "epoch": 0.651014579888772, "flos": 24527287474560.0, "grad_norm": 1.9168299355887195, "language_loss": 0.77912331, "learning_rate": 1.1470743656568136e-06, "loss": 0.80010486, "num_input_tokens_seen": 233681990, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.375, "step": 10828, "time_per_iteration": 3.916597366333008 }, { "auxiliary_loss_clip": 0.01053263, "auxiliary_loss_mlp": 0.01041275, "balance_loss_clip": 1.01674139, "balance_loss_mlp": 1.0165906, "epoch": 0.65107470314144, "flos": 24059939748480.0, "grad_norm": 2.298963801721368, "language_loss": 0.90059984, "learning_rate": 1.1467221117556362e-06, "loss": 0.92154527, "num_input_tokens_seen": 233698930, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3671875, "step": 10829, "time_per_iteration": 2.370250701904297 }, { "auxiliary_loss_clip": 0.01009248, "auxiliary_loss_mlp": 0.01003689, "balance_loss_clip": 1.00080407, "balance_loss_mlp": 1.0018847, "epoch": 0.6511348263941079, "flos": 72477139317120.0, "grad_norm": 0.6487071572043749, "language_loss": 0.55510783, "learning_rate": 1.1463698902108428e-06, "loss": 0.57523721, "num_input_tokens_seen": 233769825, "router_z_loss_clip": 0.02880859, "router_z_loss_mlp": 0.07373047, "step": 10830, "time_per_iteration": 3.1682636737823486 }, { "auxiliary_loss_clip": 0.01056096, "auxiliary_loss_mlp": 0.01043295, "balance_loss_clip": 1.01672339, "balance_loss_mlp": 1.01717949, "epoch": 0.6511949496467759, "flos": 23366564680320.0, "grad_norm": 1.8691742682468517, "language_loss": 0.75837529, "learning_rate": 1.1460177010357878e-06, "loss": 0.77936924, "num_input_tokens_seen": 233787095, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.38867188, "step": 10831, "time_per_iteration": 2.3737268447875977 }, { "auxiliary_loss_clip": 0.01009399, "auxiliary_loss_mlp": 0.01008161, "balance_loss_clip": 1.00544274, "balance_loss_mlp": 1.00206065, "epoch": 0.6512550728994438, "flos": 67329824549760.0, "grad_norm": 0.6468390732548982, "language_loss": 0.51084518, "learning_rate": 1.145665544243828e-06, "loss": 0.53102076, "num_input_tokens_seen": 233853050, "router_z_loss_clip": 0.02722168, "router_z_loss_mlp": 0.07324219, "step": 10832, "time_per_iteration": 3.1326940059661865 }, { "auxiliary_loss_clip": 0.01055941, "auxiliary_loss_mlp": 0.01044532, "balance_loss_clip": 1.01692355, "balance_loss_mlp": 1.01686358, "epoch": 0.6513151961521119, "flos": 21140642171520.0, "grad_norm": 2.7371950685804918, "language_loss": 0.85071003, "learning_rate": 1.145313419848316e-06, "loss": 0.87171471, "num_input_tokens_seen": 233871385, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.390625, "step": 10833, "time_per_iteration": 2.3562026023864746 }, { "auxiliary_loss_clip": 0.01055041, "auxiliary_loss_mlp": 0.01037942, "balance_loss_clip": 1.01232433, "balance_loss_mlp": 1.01713872, "epoch": 0.6513753194047798, "flos": 15157925458560.0, "grad_norm": 2.9030836898556043, "language_loss": 0.84866822, "learning_rate": 1.1449613278626049e-06, "loss": 0.86959809, "num_input_tokens_seen": 233888175, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37890625, "step": 10834, "time_per_iteration": 2.3514082431793213 }, { "auxiliary_loss_clip": 0.01057013, "auxiliary_loss_mlp": 0.01043798, "balance_loss_clip": 1.01884723, "balance_loss_mlp": 1.01775813, "epoch": 0.6514354426574478, "flos": 30225322028160.0, "grad_norm": 1.3972494887904792, "language_loss": 0.77849197, "learning_rate": 1.1446092683000455e-06, "loss": 0.79950011, "num_input_tokens_seen": 233911470, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.39257812, "step": 10835, "time_per_iteration": 2.428652286529541 }, { "auxiliary_loss_clip": 0.01056418, "auxiliary_loss_mlp": 0.01043919, "balance_loss_clip": 1.01726377, "balance_loss_mlp": 1.01848257, "epoch": 0.6514955659101157, "flos": 24204480243840.0, "grad_norm": 1.532486729260784, "language_loss": 0.78667426, "learning_rate": 1.1442572411739882e-06, "loss": 0.80767763, "num_input_tokens_seen": 233932135, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37890625, "step": 10836, "time_per_iteration": 2.4050674438476562 }, { "auxiliary_loss_clip": 0.01055808, "auxiliary_loss_mlp": 0.01038995, "balance_loss_clip": 1.01382971, "balance_loss_mlp": 1.01829147, "epoch": 0.6515556891627837, "flos": 12377163623040.0, "grad_norm": 1.895021463482281, "language_loss": 0.82713223, "learning_rate": 1.143905246497783e-06, "loss": 0.84808028, "num_input_tokens_seen": 233947880, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.375, "step": 10837, "time_per_iteration": 2.3338091373443604 }, { "auxiliary_loss_clip": 0.01053446, "auxiliary_loss_mlp": 0.01038304, "balance_loss_clip": 1.01210141, "balance_loss_mlp": 1.01712751, "epoch": 0.6516158124154516, "flos": 49599359804160.0, "grad_norm": 2.0284265369776344, "language_loss": 0.59868884, "learning_rate": 1.1435532842847758e-06, "loss": 0.61960632, "num_input_tokens_seen": 233971475, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.36328125, "step": 10838, "time_per_iteration": 2.642843008041382 }, { "auxiliary_loss_clip": 0.01008796, "auxiliary_loss_mlp": 0.01009198, "balance_loss_clip": 1.00656331, "balance_loss_mlp": 1.00158143, "epoch": 0.6516759356681197, "flos": 59699731334400.0, "grad_norm": 0.7565750832514271, "language_loss": 0.6106931, "learning_rate": 1.1432013545483147e-06, "loss": 0.63087308, "num_input_tokens_seen": 234030690, "router_z_loss_clip": 0.02636719, "router_z_loss_mlp": 0.07226562, "step": 10839, "time_per_iteration": 3.060102701187134 }, { "auxiliary_loss_clip": 0.01054037, "auxiliary_loss_mlp": 0.01032769, "balance_loss_clip": 1.00953531, "balance_loss_mlp": 1.01756358, "epoch": 0.6517360589207876, "flos": 37449305775360.0, "grad_norm": 1.533482435905043, "language_loss": 0.68293023, "learning_rate": 1.1428494573017439e-06, "loss": 0.70379823, "num_input_tokens_seen": 234052470, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36328125, "step": 10840, "time_per_iteration": 2.56447172164917 }, { "auxiliary_loss_clip": 0.01054962, "auxiliary_loss_mlp": 0.01038342, "balance_loss_clip": 1.01417816, "balance_loss_mlp": 1.01720476, "epoch": 0.6517961821734556, "flos": 25373721409920.0, "grad_norm": 1.9917287178557868, "language_loss": 0.75145662, "learning_rate": 1.1424975925584071e-06, "loss": 0.77238965, "num_input_tokens_seen": 234071495, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37890625, "step": 10841, "time_per_iteration": 2.4008235931396484 }, { "auxiliary_loss_clip": 0.01056452, "auxiliary_loss_mlp": 0.01036676, "balance_loss_clip": 1.01160622, "balance_loss_mlp": 1.01850867, "epoch": 0.6518563054261236, "flos": 28765743062400.0, "grad_norm": 1.4708041940498449, "language_loss": 0.63665831, "learning_rate": 1.142145760331648e-06, "loss": 0.65758967, "num_input_tokens_seen": 234092325, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37890625, "step": 10842, "time_per_iteration": 2.4356653690338135 }, { "auxiliary_loss_clip": 0.01010079, "auxiliary_loss_mlp": 0.0100809, "balance_loss_clip": 1.00576591, "balance_loss_mlp": 1.00279379, "epoch": 0.6519164286787915, "flos": 68921725034880.0, "grad_norm": 0.8262203834906235, "language_loss": 0.56176281, "learning_rate": 1.141793960634807e-06, "loss": 0.58194453, "num_input_tokens_seen": 234148005, "router_z_loss_clip": 0.02319336, "router_z_loss_mlp": 0.07324219, "step": 10843, "time_per_iteration": 2.8479809761047363 }, { "auxiliary_loss_clip": 0.0105766, "auxiliary_loss_mlp": 0.01045319, "balance_loss_clip": 1.01718593, "balance_loss_mlp": 1.0185113, "epoch": 0.6519765519314595, "flos": 20441087792640.0, "grad_norm": 3.0742250346746425, "language_loss": 0.83656204, "learning_rate": 1.1414421934812253e-06, "loss": 0.85759181, "num_input_tokens_seen": 234164280, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.390625, "step": 10844, "time_per_iteration": 2.3978476524353027 }, { "auxiliary_loss_clip": 0.01054919, "auxiliary_loss_mlp": 0.01037814, "balance_loss_clip": 1.01190925, "balance_loss_mlp": 1.01755881, "epoch": 0.6520366751841274, "flos": 28401703649280.0, "grad_norm": 1.8590370575435202, "language_loss": 0.61440438, "learning_rate": 1.1410904588842421e-06, "loss": 0.63533175, "num_input_tokens_seen": 234185090, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.375, "step": 10845, "time_per_iteration": 2.4270389080047607 }, { "auxiliary_loss_clip": 0.01053957, "auxiliary_loss_mlp": 0.01041641, "balance_loss_clip": 1.01510525, "balance_loss_mlp": 1.01772237, "epoch": 0.6520967984367955, "flos": 22272316848000.0, "grad_norm": 1.6485110342784552, "language_loss": 0.80947173, "learning_rate": 1.140738756857194e-06, "loss": 0.83042765, "num_input_tokens_seen": 234204050, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.36328125, "step": 10846, "time_per_iteration": 2.3721299171447754 }, { "auxiliary_loss_clip": 0.01010062, "auxiliary_loss_mlp": 0.01001878, "balance_loss_clip": 0.99932724, "balance_loss_mlp": 1.0026989, "epoch": 0.6521569216894634, "flos": 68913309530880.0, "grad_norm": 0.7061262379449746, "language_loss": 0.60234952, "learning_rate": 1.1403870874134192e-06, "loss": 0.62246889, "num_input_tokens_seen": 234269790, "router_z_loss_clip": 0.0255127, "router_z_loss_mlp": 0.07373047, "step": 10847, "time_per_iteration": 3.1255948543548584 }, { "auxiliary_loss_clip": 0.01056176, "auxiliary_loss_mlp": 0.01045294, "balance_loss_clip": 1.01925921, "balance_loss_mlp": 1.0177654, "epoch": 0.6522170449421314, "flos": 29129293716480.0, "grad_norm": 1.9688649551604427, "language_loss": 0.81930685, "learning_rate": 1.1400354505662514e-06, "loss": 0.8403216, "num_input_tokens_seen": 234290135, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3828125, "step": 10848, "time_per_iteration": 2.461585521697998 }, { "auxiliary_loss_clip": 0.01053698, "auxiliary_loss_mlp": 0.01046121, "balance_loss_clip": 1.02113485, "balance_loss_mlp": 1.01681352, "epoch": 0.6522771681947993, "flos": 26650704631680.0, "grad_norm": 2.2284534242375775, "language_loss": 0.76329631, "learning_rate": 1.1396838463290263e-06, "loss": 0.78429449, "num_input_tokens_seen": 234309535, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36914062, "step": 10849, "time_per_iteration": 2.421409845352173 }, { "auxiliary_loss_clip": 0.01053444, "auxiliary_loss_mlp": 0.01046804, "balance_loss_clip": 1.02322435, "balance_loss_mlp": 1.01757956, "epoch": 0.6523372914474673, "flos": 25738563784320.0, "grad_norm": 1.5799737942617464, "language_loss": 0.69317603, "learning_rate": 1.1393322747150752e-06, "loss": 0.71417856, "num_input_tokens_seen": 234328755, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.359375, "step": 10850, "time_per_iteration": 2.393721580505371 }, { "auxiliary_loss_clip": 0.01054864, "auxiliary_loss_mlp": 0.01043372, "balance_loss_clip": 1.0180397, "balance_loss_mlp": 1.01850986, "epoch": 0.6523974147001352, "flos": 24826178557440.0, "grad_norm": 1.5839390083016187, "language_loss": 0.67862773, "learning_rate": 1.1389807357377313e-06, "loss": 0.69961005, "num_input_tokens_seen": 234348655, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36328125, "step": 10851, "time_per_iteration": 2.3956663608551025 }, { "auxiliary_loss_clip": 0.01056415, "auxiliary_loss_mlp": 0.0105025, "balance_loss_clip": 1.02565742, "balance_loss_mlp": 1.01645088, "epoch": 0.6524575379528033, "flos": 26316586120320.0, "grad_norm": 2.219100611821956, "language_loss": 0.75660157, "learning_rate": 1.1386292294103235e-06, "loss": 0.77766824, "num_input_tokens_seen": 234367445, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.40039062, "step": 10852, "time_per_iteration": 2.406606912612915 }, { "auxiliary_loss_clip": 0.0105741, "auxiliary_loss_mlp": 0.0105085, "balance_loss_clip": 1.02319336, "balance_loss_mlp": 1.0175463, "epoch": 0.6525176612054712, "flos": 19493300580480.0, "grad_norm": 2.3591754115656514, "language_loss": 0.68156993, "learning_rate": 1.1382777557461812e-06, "loss": 0.70265257, "num_input_tokens_seen": 234384825, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.3984375, "step": 10853, "time_per_iteration": 3.6012465953826904 }, { "auxiliary_loss_clip": 0.0100973, "auxiliary_loss_mlp": 0.01010705, "balance_loss_clip": 1.00860679, "balance_loss_mlp": 1.00235808, "epoch": 0.6525777844581392, "flos": 71703534211200.0, "grad_norm": 0.7303666065794024, "language_loss": 0.63113761, "learning_rate": 1.137926314758634e-06, "loss": 0.65134203, "num_input_tokens_seen": 234450630, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.07373047, "step": 10854, "time_per_iteration": 3.10831880569458 }, { "auxiliary_loss_clip": 0.01055495, "auxiliary_loss_mlp": 0.01048009, "balance_loss_clip": 1.0218302, "balance_loss_mlp": 1.01704836, "epoch": 0.6526379077108072, "flos": 26651856706560.0, "grad_norm": 1.799005469093403, "language_loss": 0.78347981, "learning_rate": 1.1375749064610072e-06, "loss": 0.80451483, "num_input_tokens_seen": 234473505, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38476562, "step": 10855, "time_per_iteration": 2.4424846172332764 }, { "auxiliary_loss_clip": 0.01050339, "auxiliary_loss_mlp": 0.01041087, "balance_loss_clip": 1.01811528, "balance_loss_mlp": 1.01473308, "epoch": 0.6526980309634751, "flos": 22819266207360.0, "grad_norm": 2.078824012189562, "language_loss": 0.8019731, "learning_rate": 1.1372235308666256e-06, "loss": 0.8228873, "num_input_tokens_seen": 234492485, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35546875, "step": 10856, "time_per_iteration": 3.8704025745391846 }, { "auxiliary_loss_clip": 0.01053356, "auxiliary_loss_mlp": 0.01044398, "balance_loss_clip": 1.01897097, "balance_loss_mlp": 1.01560926, "epoch": 0.6527581542161431, "flos": 28363822957440.0, "grad_norm": 1.8947831298706803, "language_loss": 0.74254572, "learning_rate": 1.136872187988815e-06, "loss": 0.76352328, "num_input_tokens_seen": 234512645, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37695312, "step": 10857, "time_per_iteration": 3.8371284008026123 }, { "auxiliary_loss_clip": 0.01054927, "auxiliary_loss_mlp": 0.0105357, "balance_loss_clip": 1.02867913, "balance_loss_mlp": 1.01732707, "epoch": 0.652818277468811, "flos": 18368224151040.0, "grad_norm": 2.84322490143623, "language_loss": 0.64806604, "learning_rate": 1.1365208778408965e-06, "loss": 0.66915107, "num_input_tokens_seen": 234529310, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.37695312, "step": 10858, "time_per_iteration": 2.3932607173919678 }, { "auxiliary_loss_clip": 0.01052178, "auxiliary_loss_mlp": 0.01047495, "balance_loss_clip": 1.02385581, "balance_loss_mlp": 1.01667356, "epoch": 0.6528784007214791, "flos": 18035327537280.0, "grad_norm": 1.7028701505147006, "language_loss": 0.79627031, "learning_rate": 1.1361696004361939e-06, "loss": 0.81726706, "num_input_tokens_seen": 234546685, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35546875, "step": 10859, "time_per_iteration": 2.32818341255188 }, { "auxiliary_loss_clip": 0.01055147, "auxiliary_loss_mlp": 0.01046749, "balance_loss_clip": 1.02122688, "balance_loss_mlp": 1.01644051, "epoch": 0.652938523974147, "flos": 22380931687680.0, "grad_norm": 1.5592536190391166, "language_loss": 0.68833625, "learning_rate": 1.1358183557880256e-06, "loss": 0.70935524, "num_input_tokens_seen": 234566255, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38671875, "step": 10860, "time_per_iteration": 2.3756582736968994 }, { "auxiliary_loss_clip": 0.01055723, "auxiliary_loss_mlp": 0.01044534, "balance_loss_clip": 1.01888037, "balance_loss_mlp": 1.01694453, "epoch": 0.652998647226815, "flos": 16763092260480.0, "grad_norm": 1.8854254635591694, "language_loss": 0.68607676, "learning_rate": 1.135467143909712e-06, "loss": 0.70707935, "num_input_tokens_seen": 234585405, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38671875, "step": 10861, "time_per_iteration": 2.342540979385376 }, { "auxiliary_loss_clip": 0.01056078, "auxiliary_loss_mlp": 0.01042006, "balance_loss_clip": 1.0156846, "balance_loss_mlp": 1.01766777, "epoch": 0.6530587704794829, "flos": 35771065764480.0, "grad_norm": 1.6633586769561004, "language_loss": 0.66415519, "learning_rate": 1.135115964814572e-06, "loss": 0.68513596, "num_input_tokens_seen": 234608095, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.38476562, "step": 10862, "time_per_iteration": 2.4994595050811768 }, { "auxiliary_loss_clip": 0.01052439, "auxiliary_loss_mlp": 0.01044685, "balance_loss_clip": 1.02086675, "balance_loss_mlp": 1.01606095, "epoch": 0.6531188937321509, "flos": 19315173490560.0, "grad_norm": 3.8362502703751677, "language_loss": 0.7799868, "learning_rate": 1.13476481851592e-06, "loss": 0.80095804, "num_input_tokens_seen": 234627335, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36328125, "step": 10863, "time_per_iteration": 2.3574490547180176 }, { "auxiliary_loss_clip": 0.01053895, "auxiliary_loss_mlp": 0.0103683, "balance_loss_clip": 1.01327407, "balance_loss_mlp": 1.01698899, "epoch": 0.6531790169848188, "flos": 22892653618560.0, "grad_norm": 1.754641195639989, "language_loss": 0.75449103, "learning_rate": 1.1344137050270739e-06, "loss": 0.77539831, "num_input_tokens_seen": 234646540, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.37109375, "step": 10864, "time_per_iteration": 2.3847403526306152 }, { "auxiliary_loss_clip": 0.01053348, "auxiliary_loss_mlp": 0.010436, "balance_loss_clip": 1.02062833, "balance_loss_mlp": 1.01717329, "epoch": 0.6532391402374869, "flos": 29562426443520.0, "grad_norm": 1.6469026409168777, "language_loss": 0.87563455, "learning_rate": 1.1340626243613458e-06, "loss": 0.89660406, "num_input_tokens_seen": 234665470, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.36132812, "step": 10865, "time_per_iteration": 2.43021821975708 }, { "auxiliary_loss_clip": 0.01057038, "auxiliary_loss_mlp": 0.01041633, "balance_loss_clip": 1.01520467, "balance_loss_mlp": 1.01737428, "epoch": 0.6532992634901548, "flos": 23104541859840.0, "grad_norm": 1.6352807888459828, "language_loss": 0.82381558, "learning_rate": 1.133711576532051e-06, "loss": 0.84480226, "num_input_tokens_seen": 234683955, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.39648438, "step": 10866, "time_per_iteration": 2.3758327960968018 }, { "auxiliary_loss_clip": 0.01053299, "auxiliary_loss_mlp": 0.01039336, "balance_loss_clip": 1.01610184, "balance_loss_mlp": 1.01641703, "epoch": 0.6533593867428228, "flos": 26066153871360.0, "grad_norm": 1.4173117049431991, "language_loss": 0.8317157, "learning_rate": 1.1333605615524995e-06, "loss": 0.85264206, "num_input_tokens_seen": 234704595, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36914062, "step": 10867, "time_per_iteration": 3.9364686012268066 }, { "auxiliary_loss_clip": 0.01053763, "auxiliary_loss_mlp": 0.01035761, "balance_loss_clip": 1.01081097, "balance_loss_mlp": 1.01575303, "epoch": 0.6534195099954908, "flos": 21211481053440.0, "grad_norm": 1.671957066998913, "language_loss": 0.81966102, "learning_rate": 1.1330095794360016e-06, "loss": 0.84055626, "num_input_tokens_seen": 234724090, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38085938, "step": 10868, "time_per_iteration": 2.3885602951049805 }, { "auxiliary_loss_clip": 0.01055695, "auxiliary_loss_mlp": 0.01045563, "balance_loss_clip": 1.01932478, "balance_loss_mlp": 1.01749218, "epoch": 0.6534796332481587, "flos": 19645556486400.0, "grad_norm": 2.149068195606952, "language_loss": 0.81982821, "learning_rate": 1.1326586301958675e-06, "loss": 0.84084082, "num_input_tokens_seen": 234742560, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3828125, "step": 10869, "time_per_iteration": 2.3572838306427 }, { "auxiliary_loss_clip": 0.01056108, "auxiliary_loss_mlp": 0.01038677, "balance_loss_clip": 1.01463223, "balance_loss_mlp": 1.01791143, "epoch": 0.6535397565008267, "flos": 24021395740800.0, "grad_norm": 3.0202915512863564, "language_loss": 0.73001981, "learning_rate": 1.1323077138454063e-06, "loss": 0.75096762, "num_input_tokens_seen": 234762315, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3828125, "step": 10870, "time_per_iteration": 2.409684658050537 }, { "auxiliary_loss_clip": 0.01056414, "auxiliary_loss_mlp": 0.01048297, "balance_loss_clip": 1.02316785, "balance_loss_mlp": 1.0191102, "epoch": 0.6535998797534947, "flos": 24601757137920.0, "grad_norm": 2.4298286629920667, "language_loss": 0.7597698, "learning_rate": 1.1319568303979221e-06, "loss": 0.78081691, "num_input_tokens_seen": 234781300, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37304688, "step": 10871, "time_per_iteration": 2.3893327713012695 }, { "auxiliary_loss_clip": 0.01052118, "auxiliary_loss_mlp": 0.01034629, "balance_loss_clip": 1.01134765, "balance_loss_mlp": 1.0173496, "epoch": 0.6536600030061627, "flos": 23363143367040.0, "grad_norm": 1.6236961443726945, "language_loss": 0.5666132, "learning_rate": 1.1316059798667227e-06, "loss": 0.58748066, "num_input_tokens_seen": 234801040, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.34765625, "step": 10872, "time_per_iteration": 2.3857245445251465 }, { "auxiliary_loss_clip": 0.01053958, "auxiliary_loss_mlp": 0.01040965, "balance_loss_clip": 1.01663446, "balance_loss_mlp": 1.01742339, "epoch": 0.6537201262588306, "flos": 23877344004480.0, "grad_norm": 1.7761890507169438, "language_loss": 0.76036984, "learning_rate": 1.1312551622651112e-06, "loss": 0.78131902, "num_input_tokens_seen": 234821415, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36523438, "step": 10873, "time_per_iteration": 2.3857414722442627 }, { "auxiliary_loss_clip": 0.01055518, "auxiliary_loss_mlp": 0.01043382, "balance_loss_clip": 1.01866949, "balance_loss_mlp": 1.0184834, "epoch": 0.6537802495114986, "flos": 24353559216000.0, "grad_norm": 1.5016789792219662, "language_loss": 0.76069772, "learning_rate": 1.1309043776063917e-06, "loss": 0.78168672, "num_input_tokens_seen": 234843795, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.37109375, "step": 10874, "time_per_iteration": 2.4362268447875977 }, { "auxiliary_loss_clip": 0.01053684, "auxiliary_loss_mlp": 0.01037964, "balance_loss_clip": 1.01452708, "balance_loss_mlp": 1.01740479, "epoch": 0.6538403727641665, "flos": 27995768737920.0, "grad_norm": 1.4996858374638242, "language_loss": 0.8291434, "learning_rate": 1.1305536259038642e-06, "loss": 0.85005987, "num_input_tokens_seen": 234862350, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 10875, "time_per_iteration": 2.4080471992492676 }, { "auxiliary_loss_clip": 0.01053589, "auxiliary_loss_mlp": 0.01043725, "balance_loss_clip": 1.0189054, "balance_loss_mlp": 1.01658022, "epoch": 0.6539004960168345, "flos": 27562356720000.0, "grad_norm": 1.86530938985815, "language_loss": 0.7062211, "learning_rate": 1.1302029071708314e-06, "loss": 0.72719431, "num_input_tokens_seen": 234881790, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37109375, "step": 10876, "time_per_iteration": 2.4242231845855713 }, { "auxiliary_loss_clip": 0.01054263, "auxiliary_loss_mlp": 0.01042673, "balance_loss_clip": 1.01737714, "balance_loss_mlp": 1.01730323, "epoch": 0.6539606192695024, "flos": 14529419429760.0, "grad_norm": 1.9702038324060185, "language_loss": 0.80256522, "learning_rate": 1.1298522214205908e-06, "loss": 0.82353461, "num_input_tokens_seen": 234897775, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36914062, "step": 10877, "time_per_iteration": 2.32517409324646 }, { "auxiliary_loss_clip": 0.01054814, "auxiliary_loss_mlp": 0.01038583, "balance_loss_clip": 1.01445508, "balance_loss_mlp": 1.0169276, "epoch": 0.6540207425221705, "flos": 21615286371840.0, "grad_norm": 2.1518569150335534, "language_loss": 0.80450457, "learning_rate": 1.1295015686664408e-06, "loss": 0.82543856, "num_input_tokens_seen": 234918395, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37890625, "step": 10878, "time_per_iteration": 2.396343946456909 }, { "auxiliary_loss_clip": 0.01054054, "auxiliary_loss_mlp": 0.01038247, "balance_loss_clip": 1.01309419, "balance_loss_mlp": 1.01743579, "epoch": 0.6540808657748384, "flos": 17668215924480.0, "grad_norm": 1.953428167092928, "language_loss": 0.85477471, "learning_rate": 1.1291509489216797e-06, "loss": 0.87569773, "num_input_tokens_seen": 234936260, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36523438, "step": 10879, "time_per_iteration": 2.353224039077759 }, { "auxiliary_loss_clip": 0.01056609, "auxiliary_loss_mlp": 0.01036749, "balance_loss_clip": 1.01036823, "balance_loss_mlp": 1.01719689, "epoch": 0.6541409890275064, "flos": 14537414131200.0, "grad_norm": 3.2230934059481253, "language_loss": 0.74054599, "learning_rate": 1.128800362199601e-06, "loss": 0.76147962, "num_input_tokens_seen": 234952110, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.39453125, "step": 10880, "time_per_iteration": 2.3977901935577393 }, { "auxiliary_loss_clip": 0.01052563, "auxiliary_loss_mlp": 0.01034577, "balance_loss_clip": 1.01227307, "balance_loss_mlp": 1.01713121, "epoch": 0.6542011122801744, "flos": 17164349049600.0, "grad_norm": 1.858943996426421, "language_loss": 0.85518855, "learning_rate": 1.1284498085135005e-06, "loss": 0.87605989, "num_input_tokens_seen": 234970810, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35351562, "step": 10881, "time_per_iteration": 2.3558406829833984 }, { "auxiliary_loss_clip": 0.01057324, "auxiliary_loss_mlp": 0.0103856, "balance_loss_clip": 1.01300192, "balance_loss_mlp": 1.01859117, "epoch": 0.6542612355328423, "flos": 18185628407040.0, "grad_norm": 4.504819140392659, "language_loss": 0.7913698, "learning_rate": 1.1280992878766699e-06, "loss": 0.81232864, "num_input_tokens_seen": 234989565, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38671875, "step": 10882, "time_per_iteration": 2.335016965866089 }, { "auxiliary_loss_clip": 0.01056662, "auxiliary_loss_mlp": 0.01039361, "balance_loss_clip": 1.013767, "balance_loss_mlp": 1.01808083, "epoch": 0.6543213587855103, "flos": 19791423613440.0, "grad_norm": 2.1893033138058113, "language_loss": 0.83097523, "learning_rate": 1.1277488003024024e-06, "loss": 0.85193551, "num_input_tokens_seen": 235007955, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38671875, "step": 10883, "time_per_iteration": 2.3662497997283936 }, { "auxiliary_loss_clip": 0.01055696, "auxiliary_loss_mlp": 0.01046238, "balance_loss_clip": 1.02034557, "balance_loss_mlp": 1.01812446, "epoch": 0.6543814820381783, "flos": 21104053200000.0, "grad_norm": 3.2152067760635017, "language_loss": 0.86441487, "learning_rate": 1.127398345803988e-06, "loss": 0.88543421, "num_input_tokens_seen": 235024860, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.375, "step": 10884, "time_per_iteration": 2.361074447631836 }, { "auxiliary_loss_clip": 0.01056349, "auxiliary_loss_mlp": 0.01040753, "balance_loss_clip": 1.01481342, "balance_loss_mlp": 1.019014, "epoch": 0.6544416052908463, "flos": 20192994604800.0, "grad_norm": 3.629881508952694, "language_loss": 0.81829488, "learning_rate": 1.127047924394715e-06, "loss": 0.83926582, "num_input_tokens_seen": 235043815, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.375, "step": 10885, "time_per_iteration": 2.396135091781616 }, { "auxiliary_loss_clip": 0.01053308, "auxiliary_loss_mlp": 0.01039152, "balance_loss_clip": 1.01454687, "balance_loss_mlp": 1.01705599, "epoch": 0.6545017285435142, "flos": 23367123262080.0, "grad_norm": 1.672135285596585, "language_loss": 0.72619742, "learning_rate": 1.1266975360878722e-06, "loss": 0.74712199, "num_input_tokens_seen": 235062985, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36328125, "step": 10886, "time_per_iteration": 2.3702552318573 }, { "auxiliary_loss_clip": 0.01053905, "auxiliary_loss_mlp": 0.01036497, "balance_loss_clip": 1.01229739, "balance_loss_mlp": 1.017241, "epoch": 0.6545618517961822, "flos": 19133729821440.0, "grad_norm": 1.7211594343936427, "language_loss": 0.78939897, "learning_rate": 1.1263471808967468e-06, "loss": 0.81030303, "num_input_tokens_seen": 235081670, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3671875, "step": 10887, "time_per_iteration": 2.3636722564697266 }, { "auxiliary_loss_clip": 0.01053661, "auxiliary_loss_mlp": 0.01037418, "balance_loss_clip": 1.01293254, "balance_loss_mlp": 1.01715231, "epoch": 0.6546219750488501, "flos": 14937763224960.0, "grad_norm": 2.2734524463173442, "language_loss": 0.7960279, "learning_rate": 1.1259968588346234e-06, "loss": 0.8169387, "num_input_tokens_seen": 235098510, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36523438, "step": 10888, "time_per_iteration": 2.325827121734619 }, { "auxiliary_loss_clip": 0.01051919, "auxiliary_loss_mlp": 0.0103733, "balance_loss_clip": 1.01372635, "balance_loss_mlp": 1.01653302, "epoch": 0.6546820983015181, "flos": 36319027553280.0, "grad_norm": 2.1289576948138493, "language_loss": 0.67559969, "learning_rate": 1.1256465699147874e-06, "loss": 0.6964922, "num_input_tokens_seen": 235119990, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35351562, "step": 10889, "time_per_iteration": 2.498112440109253 }, { "auxiliary_loss_clip": 0.01053956, "auxiliary_loss_mlp": 0.01039389, "balance_loss_clip": 1.01513004, "balance_loss_mlp": 1.01654184, "epoch": 0.654742221554186, "flos": 20410433752320.0, "grad_norm": 1.4094993274914456, "language_loss": 0.80036259, "learning_rate": 1.1252963141505203e-06, "loss": 0.82129604, "num_input_tokens_seen": 235139255, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.375, "step": 10890, "time_per_iteration": 2.351438522338867 }, { "auxiliary_loss_clip": 0.01053781, "auxiliary_loss_mlp": 0.0104284, "balance_loss_clip": 1.01829529, "balance_loss_mlp": 1.01634085, "epoch": 0.6548023448068541, "flos": 24862488238080.0, "grad_norm": 2.1029633713808686, "language_loss": 0.66346121, "learning_rate": 1.1249460915551052e-06, "loss": 0.68442738, "num_input_tokens_seen": 235158455, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.375, "step": 10891, "time_per_iteration": 2.4404098987579346 }, { "auxiliary_loss_clip": 0.01053497, "auxiliary_loss_mlp": 0.01048485, "balance_loss_clip": 1.02424979, "balance_loss_mlp": 1.01628685, "epoch": 0.654862468059522, "flos": 21426685873920.0, "grad_norm": 1.8861935014433036, "language_loss": 0.80268615, "learning_rate": 1.1245959021418214e-06, "loss": 0.82370597, "num_input_tokens_seen": 235177350, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37109375, "step": 10892, "time_per_iteration": 3.5707638263702393 }, { "auxiliary_loss_clip": 0.01056025, "auxiliary_loss_mlp": 0.01045093, "balance_loss_clip": 1.0186162, "balance_loss_mlp": 1.01756763, "epoch": 0.65492259131219, "flos": 26576653904640.0, "grad_norm": 1.8254655954455277, "language_loss": 0.79119468, "learning_rate": 1.1242457459239497e-06, "loss": 0.81220579, "num_input_tokens_seen": 235196435, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.38476562, "step": 10893, "time_per_iteration": 2.4414963722229004 }, { "auxiliary_loss_clip": 0.01057282, "auxiliary_loss_mlp": 0.01045731, "balance_loss_clip": 1.0186466, "balance_loss_mlp": 1.01797593, "epoch": 0.6549827145648579, "flos": 21500422398720.0, "grad_norm": 1.5908157098753875, "language_loss": 0.7148149, "learning_rate": 1.123895622914766e-06, "loss": 0.73584509, "num_input_tokens_seen": 235215430, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.39257812, "step": 10894, "time_per_iteration": 2.36429500579834 }, { "auxiliary_loss_clip": 0.01055959, "auxiliary_loss_mlp": 0.01046278, "balance_loss_clip": 1.01995635, "balance_loss_mlp": 1.01716232, "epoch": 0.6550428378175259, "flos": 22593378510720.0, "grad_norm": 2.5921001848732086, "language_loss": 0.64239269, "learning_rate": 1.123545533127549e-06, "loss": 0.66341507, "num_input_tokens_seen": 235232015, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.38671875, "step": 10895, "time_per_iteration": 3.6776013374328613 }, { "auxiliary_loss_clip": 0.01052834, "auxiliary_loss_mlp": 0.01039984, "balance_loss_clip": 1.01752448, "balance_loss_mlp": 1.01600385, "epoch": 0.655102961070194, "flos": 12822969173760.0, "grad_norm": 1.7941781488503206, "language_loss": 0.80018985, "learning_rate": 1.1231954765755722e-06, "loss": 0.82111812, "num_input_tokens_seen": 235248115, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3671875, "step": 10896, "time_per_iteration": 2.3542048931121826 }, { "auxiliary_loss_clip": 0.0105345, "auxiliary_loss_mlp": 0.01041238, "balance_loss_clip": 1.01614475, "balance_loss_mlp": 1.01740742, "epoch": 0.6551630843228619, "flos": 24789903788160.0, "grad_norm": 1.43851306082348, "language_loss": 0.71638358, "learning_rate": 1.1228454532721111e-06, "loss": 0.7373305, "num_input_tokens_seen": 235270785, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.359375, "step": 10897, "time_per_iteration": 3.8449649810791016 }, { "auxiliary_loss_clip": 0.01055199, "auxiliary_loss_mlp": 0.0104031, "balance_loss_clip": 1.01426244, "balance_loss_mlp": 1.01681566, "epoch": 0.6552232075755299, "flos": 16723605646080.0, "grad_norm": 1.883190156750911, "language_loss": 0.76374972, "learning_rate": 1.1224954632304391e-06, "loss": 0.7847048, "num_input_tokens_seen": 235287905, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3828125, "step": 10898, "time_per_iteration": 2.3632802963256836 }, { "auxiliary_loss_clip": 0.01053863, "auxiliary_loss_mlp": 0.01040831, "balance_loss_clip": 1.01853895, "balance_loss_mlp": 1.01736808, "epoch": 0.6552833308281978, "flos": 22015425997440.0, "grad_norm": 2.702339383351021, "language_loss": 0.75170422, "learning_rate": 1.122145506463827e-06, "loss": 0.77265114, "num_input_tokens_seen": 235305525, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.36523438, "step": 10899, "time_per_iteration": 2.3497908115386963 }, { "auxiliary_loss_clip": 0.01053618, "auxiliary_loss_mlp": 0.01038825, "balance_loss_clip": 1.01617503, "balance_loss_mlp": 1.01682305, "epoch": 0.6553434540808658, "flos": 24862243858560.0, "grad_norm": 2.1313179106792144, "language_loss": 0.57240915, "learning_rate": 1.1217955829855443e-06, "loss": 0.59333348, "num_input_tokens_seen": 235324415, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36914062, "step": 10900, "time_per_iteration": 2.3976008892059326 }, { "auxiliary_loss_clip": 0.01054494, "auxiliary_loss_mlp": 0.01043719, "balance_loss_clip": 1.01666999, "balance_loss_mlp": 1.01764059, "epoch": 0.6554035773335337, "flos": 23219964414720.0, "grad_norm": 4.18235754323933, "language_loss": 0.78035593, "learning_rate": 1.1214456928088622e-06, "loss": 0.80133808, "num_input_tokens_seen": 235341595, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.36914062, "step": 10901, "time_per_iteration": 2.3664395809173584 }, { "auxiliary_loss_clip": 0.0105292, "auxiliary_loss_mlp": 0.01037076, "balance_loss_clip": 1.01305532, "balance_loss_mlp": 1.01668859, "epoch": 0.6554637005862017, "flos": 22782502679040.0, "grad_norm": 1.8648306804526138, "language_loss": 0.74666953, "learning_rate": 1.1210958359470463e-06, "loss": 0.76756942, "num_input_tokens_seen": 235361700, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36328125, "step": 10902, "time_per_iteration": 2.448197364807129 }, { "auxiliary_loss_clip": 0.01052551, "auxiliary_loss_mlp": 0.01036892, "balance_loss_clip": 1.01399136, "balance_loss_mlp": 1.01742744, "epoch": 0.6555238238388696, "flos": 21506147861760.0, "grad_norm": 1.7724073784425207, "language_loss": 0.68918109, "learning_rate": 1.1207460124133645e-06, "loss": 0.7100755, "num_input_tokens_seen": 235382065, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3515625, "step": 10903, "time_per_iteration": 2.362318754196167 }, { "auxiliary_loss_clip": 0.0105386, "auxiliary_loss_mlp": 0.01040547, "balance_loss_clip": 1.0157392, "balance_loss_mlp": 1.01672411, "epoch": 0.6555839470915377, "flos": 30518138534400.0, "grad_norm": 1.7697872251891265, "language_loss": 0.67786908, "learning_rate": 1.1203962222210832e-06, "loss": 0.6988132, "num_input_tokens_seen": 235402130, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37109375, "step": 10904, "time_per_iteration": 2.4458484649658203 }, { "auxiliary_loss_clip": 0.01053715, "auxiliary_loss_mlp": 0.01041699, "balance_loss_clip": 1.01574671, "balance_loss_mlp": 1.01647234, "epoch": 0.6556440703442056, "flos": 24641837245440.0, "grad_norm": 1.7964741925287844, "language_loss": 0.91474867, "learning_rate": 1.120046465383464e-06, "loss": 0.9357028, "num_input_tokens_seen": 235420435, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37304688, "step": 10905, "time_per_iteration": 2.3655476570129395 }, { "auxiliary_loss_clip": 0.0105067, "auxiliary_loss_mlp": 0.01035767, "balance_loss_clip": 1.01428556, "balance_loss_mlp": 1.01585603, "epoch": 0.6557041935968736, "flos": 23731337232000.0, "grad_norm": 1.7031375079309898, "language_loss": 0.77000517, "learning_rate": 1.1196967419137721e-06, "loss": 0.79086959, "num_input_tokens_seen": 235439960, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34765625, "step": 10906, "time_per_iteration": 2.379581928253174 }, { "auxiliary_loss_clip": 0.01055003, "auxiliary_loss_mlp": 0.01039294, "balance_loss_clip": 1.01600063, "balance_loss_mlp": 1.01758504, "epoch": 0.6557643168495415, "flos": 11102135437440.0, "grad_norm": 3.203707656998057, "language_loss": 0.76722348, "learning_rate": 1.119347051825267e-06, "loss": 0.78816646, "num_input_tokens_seen": 235457495, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.375, "step": 10907, "time_per_iteration": 3.7495715618133545 }, { "auxiliary_loss_clip": 0.0105414, "auxiliary_loss_mlp": 0.01037341, "balance_loss_clip": 1.01268888, "balance_loss_mlp": 1.01671576, "epoch": 0.6558244401022095, "flos": 30189710574720.0, "grad_norm": 1.3931173120940865, "language_loss": 0.72902644, "learning_rate": 1.118997395131211e-06, "loss": 0.74994123, "num_input_tokens_seen": 235479525, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.375, "step": 10908, "time_per_iteration": 2.435680866241455 }, { "auxiliary_loss_clip": 0.01053414, "auxiliary_loss_mlp": 0.01038025, "balance_loss_clip": 1.0149219, "balance_loss_mlp": 1.01724505, "epoch": 0.6558845633548775, "flos": 17930099099520.0, "grad_norm": 2.4092058942148538, "language_loss": 0.83502561, "learning_rate": 1.118647771844861e-06, "loss": 0.85593998, "num_input_tokens_seen": 235496305, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36132812, "step": 10909, "time_per_iteration": 2.35548734664917 }, { "auxiliary_loss_clip": 0.01053342, "auxiliary_loss_mlp": 0.01038239, "balance_loss_clip": 1.01306188, "balance_loss_mlp": 1.0161798, "epoch": 0.6559446866075455, "flos": 21903180376320.0, "grad_norm": 2.4595225433419077, "language_loss": 0.6644873, "learning_rate": 1.1182981819794767e-06, "loss": 0.68540311, "num_input_tokens_seen": 235512545, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37109375, "step": 10910, "time_per_iteration": 2.3611481189727783 }, { "auxiliary_loss_clip": 0.01057648, "auxiliary_loss_mlp": 0.01040517, "balance_loss_clip": 1.01166892, "balance_loss_mlp": 1.01799583, "epoch": 0.6560048098602135, "flos": 14127359679360.0, "grad_norm": 2.768977876119546, "language_loss": 0.77378666, "learning_rate": 1.117948625548313e-06, "loss": 0.79476833, "num_input_tokens_seen": 235526045, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.39648438, "step": 10911, "time_per_iteration": 2.306070566177368 }, { "auxiliary_loss_clip": 0.01049543, "auxiliary_loss_mlp": 0.0103178, "balance_loss_clip": 1.0116576, "balance_loss_mlp": 1.01526022, "epoch": 0.6560649331128814, "flos": 18806558670720.0, "grad_norm": 1.6868363606378773, "language_loss": 0.76185125, "learning_rate": 1.1175991025646265e-06, "loss": 0.78266442, "num_input_tokens_seen": 235545285, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.34179688, "step": 10912, "time_per_iteration": 2.3844969272613525 }, { "auxiliary_loss_clip": 0.01057482, "auxiliary_loss_mlp": 0.01045138, "balance_loss_clip": 1.01794648, "balance_loss_mlp": 1.01772666, "epoch": 0.6561250563655494, "flos": 17052731832960.0, "grad_norm": 1.842829019046689, "language_loss": 0.78754532, "learning_rate": 1.1172496130416697e-06, "loss": 0.80857158, "num_input_tokens_seen": 235563150, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.39648438, "step": 10913, "time_per_iteration": 2.330132484436035 }, { "auxiliary_loss_clip": 0.01049366, "auxiliary_loss_mlp": 0.01032466, "balance_loss_clip": 1.01140141, "balance_loss_mlp": 1.01583683, "epoch": 0.6561851796182173, "flos": 22636565729280.0, "grad_norm": 1.6942201478969052, "language_loss": 0.72311091, "learning_rate": 1.1169001569926961e-06, "loss": 0.74392927, "num_input_tokens_seen": 235582535, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.3359375, "step": 10914, "time_per_iteration": 2.387188196182251 }, { "auxiliary_loss_clip": 0.01052785, "auxiliary_loss_mlp": 0.01033446, "balance_loss_clip": 1.00910366, "balance_loss_mlp": 1.01666594, "epoch": 0.6562453028708853, "flos": 19238364766080.0, "grad_norm": 1.6568745222252, "language_loss": 0.75020367, "learning_rate": 1.116550734430958e-06, "loss": 0.77106601, "num_input_tokens_seen": 235601490, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36132812, "step": 10915, "time_per_iteration": 2.3345999717712402 }, { "auxiliary_loss_clip": 0.01051076, "auxiliary_loss_mlp": 0.01036583, "balance_loss_clip": 1.0134325, "balance_loss_mlp": 1.0164355, "epoch": 0.6563054261235532, "flos": 23800290900480.0, "grad_norm": 2.004224316972854, "language_loss": 0.80660868, "learning_rate": 1.1162013453697042e-06, "loss": 0.8274852, "num_input_tokens_seen": 235619165, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.34570312, "step": 10916, "time_per_iteration": 2.403728723526001 }, { "auxiliary_loss_clip": 0.01053267, "auxiliary_loss_mlp": 0.0103945, "balance_loss_clip": 1.01787305, "balance_loss_mlp": 1.0173831, "epoch": 0.6563655493762213, "flos": 19239167727360.0, "grad_norm": 1.7967780130513291, "language_loss": 0.77458829, "learning_rate": 1.1158519898221831e-06, "loss": 0.79551548, "num_input_tokens_seen": 235637115, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.359375, "step": 10917, "time_per_iteration": 2.3653388023376465 }, { "auxiliary_loss_clip": 0.0105015, "auxiliary_loss_mlp": 0.01031903, "balance_loss_clip": 1.00940859, "balance_loss_mlp": 1.01535964, "epoch": 0.6564256726288892, "flos": 25555269813120.0, "grad_norm": 1.8337976947019927, "language_loss": 0.72035462, "learning_rate": 1.1155026678016445e-06, "loss": 0.74117517, "num_input_tokens_seen": 235656330, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34765625, "step": 10918, "time_per_iteration": 2.4084508419036865 }, { "auxiliary_loss_clip": 0.01050156, "auxiliary_loss_mlp": 0.01033479, "balance_loss_clip": 1.01308179, "balance_loss_mlp": 1.01622021, "epoch": 0.6564857958815572, "flos": 22199522929920.0, "grad_norm": 1.5341689138688268, "language_loss": 0.76743597, "learning_rate": 1.115153379321332e-06, "loss": 0.78827226, "num_input_tokens_seen": 235674510, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.33984375, "step": 10919, "time_per_iteration": 2.3554093837738037 }, { "auxiliary_loss_clip": 0.01009078, "auxiliary_loss_mlp": 0.01001902, "balance_loss_clip": 0.999542, "balance_loss_mlp": 1.00179625, "epoch": 0.6565459191342251, "flos": 58120470627840.0, "grad_norm": 0.7225237045151639, "language_loss": 0.53100681, "learning_rate": 1.1148041243944931e-06, "loss": 0.55111659, "num_input_tokens_seen": 235735050, "router_z_loss_clip": 0.02355957, "router_z_loss_mlp": 0.07275391, "step": 10920, "time_per_iteration": 3.043001413345337 }, { "auxiliary_loss_clip": 0.01052192, "auxiliary_loss_mlp": 0.01037011, "balance_loss_clip": 1.01443291, "balance_loss_mlp": 1.01667356, "epoch": 0.6566060423868931, "flos": 30808336688640.0, "grad_norm": 1.3962137759036009, "language_loss": 0.66480821, "learning_rate": 1.1144549030343697e-06, "loss": 0.68570018, "num_input_tokens_seen": 235757545, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35546875, "step": 10921, "time_per_iteration": 2.45133376121521 }, { "auxiliary_loss_clip": 0.010507, "auxiliary_loss_mlp": 0.01036402, "balance_loss_clip": 1.01303685, "balance_loss_mlp": 1.01556206, "epoch": 0.6566661656395612, "flos": 23366320300800.0, "grad_norm": 1.8158656213809317, "language_loss": 0.82414806, "learning_rate": 1.114105715254205e-06, "loss": 0.8450191, "num_input_tokens_seen": 235777265, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.3515625, "step": 10922, "time_per_iteration": 2.401548385620117 }, { "auxiliary_loss_clip": 0.01053029, "auxiliary_loss_mlp": 0.01040472, "balance_loss_clip": 1.01589084, "balance_loss_mlp": 1.01666355, "epoch": 0.6567262888922291, "flos": 25734514066560.0, "grad_norm": 2.2849731962286604, "language_loss": 0.72738314, "learning_rate": 1.1137565610672414e-06, "loss": 0.74831814, "num_input_tokens_seen": 235796565, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36328125, "step": 10923, "time_per_iteration": 2.4093282222747803 }, { "auxiliary_loss_clip": 0.01055359, "auxiliary_loss_mlp": 0.01039858, "balance_loss_clip": 1.01452613, "balance_loss_mlp": 1.01827335, "epoch": 0.6567864121448971, "flos": 17122907399040.0, "grad_norm": 2.0504406935467143, "language_loss": 0.81592751, "learning_rate": 1.1134074404867169e-06, "loss": 0.83687967, "num_input_tokens_seen": 235814805, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37109375, "step": 10924, "time_per_iteration": 2.358916997909546 }, { "auxiliary_loss_clip": 0.01050552, "auxiliary_loss_mlp": 0.01037422, "balance_loss_clip": 1.01648891, "balance_loss_mlp": 1.01575649, "epoch": 0.656846535397565, "flos": 22418218886400.0, "grad_norm": 1.8437444048092415, "language_loss": 0.73723346, "learning_rate": 1.1130583535258717e-06, "loss": 0.75811315, "num_input_tokens_seen": 235833405, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34765625, "step": 10925, "time_per_iteration": 2.370487689971924 }, { "auxiliary_loss_clip": 0.01053315, "auxiliary_loss_mlp": 0.0103453, "balance_loss_clip": 1.01226211, "balance_loss_mlp": 1.01723742, "epoch": 0.656906658650233, "flos": 17703792466560.0, "grad_norm": 2.7257720635469727, "language_loss": 0.7370615, "learning_rate": 1.112709300197942e-06, "loss": 0.75793993, "num_input_tokens_seen": 235848530, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.359375, "step": 10926, "time_per_iteration": 2.3539621829986572 }, { "auxiliary_loss_clip": 0.01054367, "auxiliary_loss_mlp": 0.01040985, "balance_loss_clip": 1.01615345, "balance_loss_mlp": 1.01676393, "epoch": 0.6569667819029009, "flos": 21174193854720.0, "grad_norm": 1.724360005698026, "language_loss": 0.73916948, "learning_rate": 1.1123602805161656e-06, "loss": 0.76012301, "num_input_tokens_seen": 235867225, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37695312, "step": 10927, "time_per_iteration": 2.3523316383361816 }, { "auxiliary_loss_clip": 0.01008783, "auxiliary_loss_mlp": 0.01005012, "balance_loss_clip": 1.00286603, "balance_loss_mlp": 1.00157428, "epoch": 0.6570269051555689, "flos": 68758330538880.0, "grad_norm": 0.7299083684526992, "language_loss": 0.64463818, "learning_rate": 1.112011294493775e-06, "loss": 0.66477621, "num_input_tokens_seen": 235932925, "router_z_loss_clip": 0.02148438, "router_z_loss_mlp": 0.07226562, "step": 10928, "time_per_iteration": 3.041449546813965 }, { "auxiliary_loss_clip": 0.01052851, "auxiliary_loss_mlp": 0.010361, "balance_loss_clip": 1.01261544, "balance_loss_mlp": 1.01679599, "epoch": 0.6570870284082369, "flos": 26318192042880.0, "grad_norm": 1.5912710905572895, "language_loss": 0.78762591, "learning_rate": 1.1116623421440063e-06, "loss": 0.80851531, "num_input_tokens_seen": 235952680, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.359375, "step": 10929, "time_per_iteration": 2.418849468231201 }, { "auxiliary_loss_clip": 0.01052894, "auxiliary_loss_mlp": 0.01038343, "balance_loss_clip": 1.0130111, "balance_loss_mlp": 1.01713538, "epoch": 0.6571471516609049, "flos": 26173616636160.0, "grad_norm": 1.6396423609876543, "language_loss": 0.66619676, "learning_rate": 1.1113134234800895e-06, "loss": 0.68710911, "num_input_tokens_seen": 235972075, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.35742188, "step": 10930, "time_per_iteration": 2.4309372901916504 }, { "auxiliary_loss_clip": 0.01052698, "auxiliary_loss_mlp": 0.01036816, "balance_loss_clip": 1.0129981, "balance_loss_mlp": 1.01586652, "epoch": 0.6572072749135728, "flos": 20375206323840.0, "grad_norm": 1.7537098013232442, "language_loss": 0.71959436, "learning_rate": 1.110964538515258e-06, "loss": 0.74048948, "num_input_tokens_seen": 235990340, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36914062, "step": 10931, "time_per_iteration": 2.349529266357422 }, { "auxiliary_loss_clip": 0.01052146, "auxiliary_loss_mlp": 0.01035351, "balance_loss_clip": 1.01257038, "balance_loss_mlp": 1.01545143, "epoch": 0.6572673981662408, "flos": 17127794989440.0, "grad_norm": 2.1255689868781733, "language_loss": 0.70263046, "learning_rate": 1.1106156872627393e-06, "loss": 0.7235055, "num_input_tokens_seen": 236007470, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.3671875, "step": 10932, "time_per_iteration": 3.5594820976257324 }, { "auxiliary_loss_clip": 0.01051768, "auxiliary_loss_mlp": 0.01034109, "balance_loss_clip": 1.01322317, "balance_loss_mlp": 1.01618099, "epoch": 0.6573275214189087, "flos": 41273692104960.0, "grad_norm": 1.7189496006406522, "language_loss": 0.81332016, "learning_rate": 1.1102668697357626e-06, "loss": 0.83417886, "num_input_tokens_seen": 236029030, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.35546875, "step": 10933, "time_per_iteration": 2.5212414264678955 }, { "auxiliary_loss_clip": 0.01053441, "auxiliary_loss_mlp": 0.01041014, "balance_loss_clip": 1.01656377, "balance_loss_mlp": 1.01697063, "epoch": 0.6573876446715767, "flos": 22889127571200.0, "grad_norm": 1.8250006476761995, "language_loss": 0.75405931, "learning_rate": 1.1099180859475571e-06, "loss": 0.77500385, "num_input_tokens_seen": 236047160, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.36523438, "step": 10934, "time_per_iteration": 2.405470371246338 }, { "auxiliary_loss_clip": 0.01051166, "auxiliary_loss_mlp": 0.01037021, "balance_loss_clip": 1.01507425, "balance_loss_mlp": 1.01576376, "epoch": 0.6574477679242448, "flos": 44016468514560.0, "grad_norm": 1.64415481347668, "language_loss": 0.76994187, "learning_rate": 1.1095693359113454e-06, "loss": 0.7908237, "num_input_tokens_seen": 236069215, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35351562, "step": 10935, "time_per_iteration": 3.818411350250244 }, { "auxiliary_loss_clip": 0.01053235, "auxiliary_loss_mlp": 0.01042393, "balance_loss_clip": 1.01608324, "balance_loss_mlp": 1.01586652, "epoch": 0.6575078911769127, "flos": 24570369959040.0, "grad_norm": 1.683741949834991, "language_loss": 0.80072856, "learning_rate": 1.1092206196403538e-06, "loss": 0.82168478, "num_input_tokens_seen": 236088335, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37304688, "step": 10936, "time_per_iteration": 2.4135582447052 }, { "auxiliary_loss_clip": 0.01049466, "auxiliary_loss_mlp": 0.01035669, "balance_loss_clip": 1.01390159, "balance_loss_mlp": 1.01527667, "epoch": 0.6575680144295807, "flos": 20922958644480.0, "grad_norm": 1.901685194596564, "language_loss": 0.70595843, "learning_rate": 1.1088719371478056e-06, "loss": 0.7268098, "num_input_tokens_seen": 236108540, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.34179688, "step": 10937, "time_per_iteration": 3.7711799144744873 }, { "auxiliary_loss_clip": 0.010507, "auxiliary_loss_mlp": 0.01037381, "balance_loss_clip": 1.01388526, "balance_loss_mlp": 1.01532888, "epoch": 0.6576281376822486, "flos": 10924881131520.0, "grad_norm": 2.387956708532646, "language_loss": 0.70245135, "learning_rate": 1.1085232884469236e-06, "loss": 0.72333217, "num_input_tokens_seen": 236124495, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.35351562, "step": 10938, "time_per_iteration": 2.403691053390503 }, { "auxiliary_loss_clip": 0.01053009, "auxiliary_loss_mlp": 0.01037819, "balance_loss_clip": 1.01450157, "balance_loss_mlp": 1.01609755, "epoch": 0.6576882609349166, "flos": 19280539555200.0, "grad_norm": 1.9705309228256223, "language_loss": 0.72391266, "learning_rate": 1.108174673550927e-06, "loss": 0.74482095, "num_input_tokens_seen": 236142550, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36914062, "step": 10939, "time_per_iteration": 2.379563570022583 }, { "auxiliary_loss_clip": 0.0105295, "auxiliary_loss_mlp": 0.01038437, "balance_loss_clip": 1.01451135, "balance_loss_mlp": 1.01580095, "epoch": 0.6577483841875845, "flos": 20219773484160.0, "grad_norm": 3.773742666089784, "language_loss": 0.79613531, "learning_rate": 1.107826092473037e-06, "loss": 0.81704915, "num_input_tokens_seen": 236156620, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37109375, "step": 10940, "time_per_iteration": 2.3426568508148193 }, { "auxiliary_loss_clip": 0.01053958, "auxiliary_loss_mlp": 0.01038615, "balance_loss_clip": 1.01390302, "balance_loss_mlp": 1.01594949, "epoch": 0.6578085074402525, "flos": 34749646761600.0, "grad_norm": 2.6251113442083196, "language_loss": 0.70015579, "learning_rate": 1.107477545226471e-06, "loss": 0.7210815, "num_input_tokens_seen": 236177095, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.38085938, "step": 10941, "time_per_iteration": 2.5188167095184326 }, { "auxiliary_loss_clip": 0.01049687, "auxiliary_loss_mlp": 0.01033606, "balance_loss_clip": 1.01105118, "balance_loss_mlp": 1.01470423, "epoch": 0.6578686306929205, "flos": 23470047550080.0, "grad_norm": 1.7584490587122854, "language_loss": 0.70199746, "learning_rate": 1.1071290318244448e-06, "loss": 0.72283036, "num_input_tokens_seen": 236194695, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34960938, "step": 10942, "time_per_iteration": 2.414778709411621 }, { "auxiliary_loss_clip": 0.01056749, "auxiliary_loss_mlp": 0.01038065, "balance_loss_clip": 1.01291215, "balance_loss_mlp": 1.01759684, "epoch": 0.6579287539455885, "flos": 18076105872000.0, "grad_norm": 2.082827533394438, "language_loss": 0.72336876, "learning_rate": 1.1067805522801753e-06, "loss": 0.74431694, "num_input_tokens_seen": 236213885, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.390625, "step": 10943, "time_per_iteration": 2.3819007873535156 }, { "auxiliary_loss_clip": 0.010509, "auxiliary_loss_mlp": 0.01036073, "balance_loss_clip": 1.01369786, "balance_loss_mlp": 1.01527548, "epoch": 0.6579888771982564, "flos": 28660025865600.0, "grad_norm": 1.7508207743363633, "language_loss": 0.60904765, "learning_rate": 1.1064321066068778e-06, "loss": 0.62991738, "num_input_tokens_seen": 236237315, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35742188, "step": 10944, "time_per_iteration": 2.441410541534424 }, { "auxiliary_loss_clip": 0.01055082, "auxiliary_loss_mlp": 0.01045635, "balance_loss_clip": 1.01986217, "balance_loss_mlp": 1.01668715, "epoch": 0.6580490004509244, "flos": 25045363272960.0, "grad_norm": 1.4993276569153515, "language_loss": 0.73571789, "learning_rate": 1.1060836948177646e-06, "loss": 0.75672507, "num_input_tokens_seen": 236256345, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3828125, "step": 10945, "time_per_iteration": 2.3921141624450684 }, { "auxiliary_loss_clip": 0.01051686, "auxiliary_loss_mlp": 0.01037041, "balance_loss_clip": 1.01306832, "balance_loss_mlp": 1.01593733, "epoch": 0.6581091237035923, "flos": 43507085644800.0, "grad_norm": 1.5646281810448457, "language_loss": 0.70832735, "learning_rate": 1.105735316926046e-06, "loss": 0.72921467, "num_input_tokens_seen": 236281890, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.35742188, "step": 10946, "time_per_iteration": 2.5434823036193848 }, { "auxiliary_loss_clip": 0.01053255, "auxiliary_loss_mlp": 0.01034849, "balance_loss_clip": 1.01199675, "balance_loss_mlp": 1.01729488, "epoch": 0.6581692469562603, "flos": 22414413548160.0, "grad_norm": 1.8205855876643469, "language_loss": 0.83194196, "learning_rate": 1.105386972944934e-06, "loss": 0.85282302, "num_input_tokens_seen": 236298370, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.359375, "step": 10947, "time_per_iteration": 3.7467236518859863 }, { "auxiliary_loss_clip": 0.01053671, "auxiliary_loss_mlp": 0.0103532, "balance_loss_clip": 1.01284885, "balance_loss_mlp": 1.01650763, "epoch": 0.6582293702089284, "flos": 24858717811200.0, "grad_norm": 1.6085025705062304, "language_loss": 0.77908909, "learning_rate": 1.1050386628876385e-06, "loss": 0.79997897, "num_input_tokens_seen": 236317380, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.37109375, "step": 10948, "time_per_iteration": 2.3765974044799805 }, { "auxiliary_loss_clip": 0.01051944, "auxiliary_loss_mlp": 0.01035559, "balance_loss_clip": 1.01231301, "balance_loss_mlp": 1.01639676, "epoch": 0.6582894934615963, "flos": 23038555656960.0, "grad_norm": 1.9903554031842552, "language_loss": 0.80499327, "learning_rate": 1.1046903867673655e-06, "loss": 0.82586831, "num_input_tokens_seen": 236336210, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35546875, "step": 10949, "time_per_iteration": 2.360328197479248 }, { "auxiliary_loss_clip": 0.010091, "auxiliary_loss_mlp": 0.0100449, "balance_loss_clip": 1.00208199, "balance_loss_mlp": 1.00166845, "epoch": 0.6583496167142643, "flos": 72548432046720.0, "grad_norm": 0.7324140752680341, "language_loss": 0.61844182, "learning_rate": 1.104342144597323e-06, "loss": 0.6385777, "num_input_tokens_seen": 236403090, "router_z_loss_clip": 0.02404785, "router_z_loss_mlp": 0.07421875, "step": 10950, "time_per_iteration": 3.089111804962158 }, { "auxiliary_loss_clip": 0.01049647, "auxiliary_loss_mlp": 0.01035567, "balance_loss_clip": 1.01461005, "balance_loss_mlp": 1.01500785, "epoch": 0.6584097399669322, "flos": 13078009722240.0, "grad_norm": 2.341140602960788, "language_loss": 0.68222904, "learning_rate": 1.1039939363907178e-06, "loss": 0.70308119, "num_input_tokens_seen": 236420475, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.34765625, "step": 10951, "time_per_iteration": 2.3374369144439697 }, { "auxiliary_loss_clip": 0.01051236, "auxiliary_loss_mlp": 0.01036386, "balance_loss_clip": 1.01401067, "balance_loss_mlp": 1.01657748, "epoch": 0.6584698632196002, "flos": 28691936714880.0, "grad_norm": 1.322642626373734, "language_loss": 0.77318543, "learning_rate": 1.1036457621607504e-06, "loss": 0.79406166, "num_input_tokens_seen": 236441915, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34570312, "step": 10952, "time_per_iteration": 2.451864242553711 }, { "auxiliary_loss_clip": 0.0105196, "auxiliary_loss_mlp": 0.01035451, "balance_loss_clip": 1.01437473, "balance_loss_mlp": 1.01702571, "epoch": 0.6585299864722681, "flos": 14318403972480.0, "grad_norm": 1.7553921040810019, "language_loss": 0.74745941, "learning_rate": 1.1032976219206257e-06, "loss": 0.76833349, "num_input_tokens_seen": 236460340, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34960938, "step": 10953, "time_per_iteration": 2.3536643981933594 }, { "auxiliary_loss_clip": 0.01051288, "auxiliary_loss_mlp": 0.01039283, "balance_loss_clip": 1.01569104, "balance_loss_mlp": 1.01584816, "epoch": 0.6585901097249361, "flos": 26796676492800.0, "grad_norm": 1.8806439409830178, "language_loss": 0.79280996, "learning_rate": 1.102949515683546e-06, "loss": 0.8137157, "num_input_tokens_seen": 236478280, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35546875, "step": 10954, "time_per_iteration": 2.38666033744812 }, { "auxiliary_loss_clip": 0.01053919, "auxiliary_loss_mlp": 0.01038626, "balance_loss_clip": 1.01441455, "balance_loss_mlp": 1.01725554, "epoch": 0.658650232977604, "flos": 18732158830080.0, "grad_norm": 2.0434802285520814, "language_loss": 0.71030712, "learning_rate": 1.1026014434627096e-06, "loss": 0.73123252, "num_input_tokens_seen": 236493225, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.3671875, "step": 10955, "time_per_iteration": 2.3292224407196045 }, { "auxiliary_loss_clip": 0.01050368, "auxiliary_loss_mlp": 0.01037248, "balance_loss_clip": 1.01571918, "balance_loss_mlp": 1.01551068, "epoch": 0.6587103562302721, "flos": 24752302387200.0, "grad_norm": 1.866510187607943, "language_loss": 0.81850755, "learning_rate": 1.1022534052713172e-06, "loss": 0.83938372, "num_input_tokens_seen": 236514420, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34765625, "step": 10956, "time_per_iteration": 2.3850834369659424 }, { "auxiliary_loss_clip": 0.01053414, "auxiliary_loss_mlp": 0.01037418, "balance_loss_clip": 1.01480412, "balance_loss_mlp": 1.01755619, "epoch": 0.65877047948294, "flos": 22345040943360.0, "grad_norm": 2.295498705213328, "language_loss": 0.83506829, "learning_rate": 1.1019054011225648e-06, "loss": 0.85597658, "num_input_tokens_seen": 236532785, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.359375, "step": 10957, "time_per_iteration": 2.3747761249542236 }, { "auxiliary_loss_clip": 0.01052023, "auxiliary_loss_mlp": 0.01036376, "balance_loss_clip": 1.0157764, "balance_loss_mlp": 1.0167532, "epoch": 0.658830602735608, "flos": 45178971788160.0, "grad_norm": 1.5916049244810861, "language_loss": 0.77391624, "learning_rate": 1.1015574310296506e-06, "loss": 0.79480028, "num_input_tokens_seen": 236553330, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.3515625, "step": 10958, "time_per_iteration": 2.5576913356781006 }, { "auxiliary_loss_clip": 0.01051865, "auxiliary_loss_mlp": 0.01041848, "balance_loss_clip": 1.01870942, "balance_loss_mlp": 1.01665354, "epoch": 0.6588907259882759, "flos": 19900597034880.0, "grad_norm": 1.5332520199329662, "language_loss": 0.76263595, "learning_rate": 1.1012094950057678e-06, "loss": 0.78357315, "num_input_tokens_seen": 236572960, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.3515625, "step": 10959, "time_per_iteration": 2.372605800628662 }, { "auxiliary_loss_clip": 0.01053159, "auxiliary_loss_mlp": 0.01035017, "balance_loss_clip": 1.01214075, "balance_loss_mlp": 1.01674628, "epoch": 0.6589508492409439, "flos": 24132628932480.0, "grad_norm": 1.7638883857446737, "language_loss": 0.66060436, "learning_rate": 1.1008615930641107e-06, "loss": 0.68148613, "num_input_tokens_seen": 236594090, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36328125, "step": 10960, "time_per_iteration": 2.370884656906128 }, { "auxiliary_loss_clip": 0.01055611, "auxiliary_loss_mlp": 0.01042093, "balance_loss_clip": 1.01717842, "balance_loss_mlp": 1.01770711, "epoch": 0.659010972493612, "flos": 18221938087680.0, "grad_norm": 4.165001894199809, "language_loss": 0.83000278, "learning_rate": 1.1005137252178734e-06, "loss": 0.8509798, "num_input_tokens_seen": 236610190, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37890625, "step": 10961, "time_per_iteration": 2.352940320968628 }, { "auxiliary_loss_clip": 0.01052651, "auxiliary_loss_mlp": 0.01035062, "balance_loss_clip": 1.01341331, "balance_loss_mlp": 1.01716447, "epoch": 0.6590710957462799, "flos": 27598771134720.0, "grad_norm": 1.6740917085539997, "language_loss": 0.75781333, "learning_rate": 1.1001658914802453e-06, "loss": 0.7786904, "num_input_tokens_seen": 236631575, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.35546875, "step": 10962, "time_per_iteration": 2.4125282764434814 }, { "auxiliary_loss_clip": 0.01053771, "auxiliary_loss_mlp": 0.0103589, "balance_loss_clip": 1.01334715, "balance_loss_mlp": 1.0168066, "epoch": 0.6591312189989479, "flos": 20301923646720.0, "grad_norm": 1.8625958978669943, "language_loss": 0.81050873, "learning_rate": 1.0998180918644165e-06, "loss": 0.83140528, "num_input_tokens_seen": 236649815, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.37109375, "step": 10963, "time_per_iteration": 2.3857223987579346 }, { "auxiliary_loss_clip": 0.01049406, "auxiliary_loss_mlp": 0.01029989, "balance_loss_clip": 1.00779212, "balance_loss_mlp": 1.01440763, "epoch": 0.6591913422516158, "flos": 12312120026880.0, "grad_norm": 1.6316626684218902, "language_loss": 0.79739797, "learning_rate": 1.0994703263835754e-06, "loss": 0.81819201, "num_input_tokens_seen": 236668335, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34960938, "step": 10964, "time_per_iteration": 2.3310420513153076 }, { "auxiliary_loss_clip": 0.0105286, "auxiliary_loss_mlp": 0.01039753, "balance_loss_clip": 1.01629257, "balance_loss_mlp": 1.01609814, "epoch": 0.6592514655042838, "flos": 25883418481920.0, "grad_norm": 1.7041331202188874, "language_loss": 0.75081921, "learning_rate": 1.0991225950509106e-06, "loss": 0.77174532, "num_input_tokens_seen": 236688945, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3671875, "step": 10965, "time_per_iteration": 2.4712212085723877 }, { "auxiliary_loss_clip": 0.01055009, "auxiliary_loss_mlp": 0.01036665, "balance_loss_clip": 1.01139295, "balance_loss_mlp": 1.01725268, "epoch": 0.6593115887569517, "flos": 14062769930880.0, "grad_norm": 2.192735635203252, "language_loss": 0.75114125, "learning_rate": 1.0987748978796067e-06, "loss": 0.77205801, "num_input_tokens_seen": 236707055, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37695312, "step": 10966, "time_per_iteration": 2.367060422897339 }, { "auxiliary_loss_clip": 0.01051857, "auxiliary_loss_mlp": 0.01038747, "balance_loss_clip": 1.01452422, "balance_loss_mlp": 1.01619565, "epoch": 0.6593717120096197, "flos": 24716760756480.0, "grad_norm": 1.5462423954098592, "language_loss": 0.7806654, "learning_rate": 1.0984272348828487e-06, "loss": 0.80157137, "num_input_tokens_seen": 236725900, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.35742188, "step": 10967, "time_per_iteration": 2.4023942947387695 }, { "auxiliary_loss_clip": 0.01009121, "auxiliary_loss_mlp": 0.01002781, "balance_loss_clip": 1.00020659, "balance_loss_mlp": 1.00183713, "epoch": 0.6594318352622877, "flos": 55554772101120.0, "grad_norm": 0.6957618228155965, "language_loss": 0.48576456, "learning_rate": 1.0980796060738221e-06, "loss": 0.50588363, "num_input_tokens_seen": 236788415, "router_z_loss_clip": 0.02575684, "router_z_loss_mlp": 0.07275391, "step": 10968, "time_per_iteration": 2.9842116832733154 }, { "auxiliary_loss_clip": 0.01053429, "auxiliary_loss_mlp": 0.01037523, "balance_loss_clip": 1.01406288, "balance_loss_mlp": 1.01655889, "epoch": 0.6594919585149557, "flos": 17455978569600.0, "grad_norm": 2.0316048677157936, "language_loss": 0.80807054, "learning_rate": 1.0977320114657058e-06, "loss": 0.82898009, "num_input_tokens_seen": 236805155, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36914062, "step": 10969, "time_per_iteration": 2.353447437286377 }, { "auxiliary_loss_clip": 0.01051803, "auxiliary_loss_mlp": 0.01035746, "balance_loss_clip": 1.01351309, "balance_loss_mlp": 1.01593232, "epoch": 0.6595520817676236, "flos": 18222252289920.0, "grad_norm": 1.8692972874235732, "language_loss": 0.68200767, "learning_rate": 1.0973844510716817e-06, "loss": 0.70288312, "num_input_tokens_seen": 236824360, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.359375, "step": 10970, "time_per_iteration": 2.329214334487915 }, { "auxiliary_loss_clip": 0.01051819, "auxiliary_loss_mlp": 0.0103424, "balance_loss_clip": 1.01010013, "balance_loss_mlp": 1.01676023, "epoch": 0.6596122050202916, "flos": 22198685057280.0, "grad_norm": 1.5379416264348598, "language_loss": 0.78184575, "learning_rate": 1.0970369249049308e-06, "loss": 0.80270636, "num_input_tokens_seen": 236844640, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.34960938, "step": 10971, "time_per_iteration": 2.368194103240967 }, { "auxiliary_loss_clip": 0.01055264, "auxiliary_loss_mlp": 0.01037952, "balance_loss_clip": 1.01359773, "balance_loss_mlp": 1.01739848, "epoch": 0.6596723282729595, "flos": 14172955781760.0, "grad_norm": 2.5114955286695224, "language_loss": 0.72969651, "learning_rate": 1.096689432978629e-06, "loss": 0.75062865, "num_input_tokens_seen": 236861160, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37890625, "step": 10972, "time_per_iteration": 3.531177043914795 }, { "auxiliary_loss_clip": 0.01052442, "auxiliary_loss_mlp": 0.01033837, "balance_loss_clip": 1.00963712, "balance_loss_mlp": 1.01630497, "epoch": 0.6597324515256275, "flos": 30551934597120.0, "grad_norm": 3.483721229483422, "language_loss": 0.56522918, "learning_rate": 1.0963419753059556e-06, "loss": 0.586092, "num_input_tokens_seen": 236880465, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36132812, "step": 10973, "time_per_iteration": 2.4305319786071777 }, { "auxiliary_loss_clip": 0.01056006, "auxiliary_loss_mlp": 0.01041172, "balance_loss_clip": 1.01668632, "balance_loss_mlp": 1.01743579, "epoch": 0.6597925747782956, "flos": 17638888515840.0, "grad_norm": 1.9094726533963842, "language_loss": 0.79976726, "learning_rate": 1.0959945519000839e-06, "loss": 0.82073903, "num_input_tokens_seen": 236897730, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.38476562, "step": 10974, "time_per_iteration": 2.321925640106201 }, { "auxiliary_loss_clip": 0.01054466, "auxiliary_loss_mlp": 0.01038872, "balance_loss_clip": 1.01384974, "balance_loss_mlp": 1.01662982, "epoch": 0.6598526980309635, "flos": 22818044309760.0, "grad_norm": 2.2315298059667596, "language_loss": 0.70118642, "learning_rate": 1.0956471627741906e-06, "loss": 0.72211981, "num_input_tokens_seen": 236917300, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37890625, "step": 10975, "time_per_iteration": 5.159539461135864 }, { "auxiliary_loss_clip": 0.01052063, "auxiliary_loss_mlp": 0.01034642, "balance_loss_clip": 1.01209974, "balance_loss_mlp": 1.01568377, "epoch": 0.6599128212836315, "flos": 21067010380800.0, "grad_norm": 1.6984704357896858, "language_loss": 0.71947908, "learning_rate": 1.0952998079414464e-06, "loss": 0.74034613, "num_input_tokens_seen": 236935590, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.36328125, "step": 10976, "time_per_iteration": 2.362541437149048 }, { "auxiliary_loss_clip": 0.01050316, "auxiliary_loss_mlp": 0.01037291, "balance_loss_clip": 1.01482022, "balance_loss_mlp": 1.01553404, "epoch": 0.6599729445362994, "flos": 22162445199360.0, "grad_norm": 1.6513543481654722, "language_loss": 0.68433475, "learning_rate": 1.0949524874150243e-06, "loss": 0.7052108, "num_input_tokens_seen": 236952830, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34765625, "step": 10977, "time_per_iteration": 2.3722870349884033 }, { "auxiliary_loss_clip": 0.01054666, "auxiliary_loss_mlp": 0.01042395, "balance_loss_clip": 1.01672935, "balance_loss_mlp": 1.01674461, "epoch": 0.6600330677889674, "flos": 18149109258240.0, "grad_norm": 2.0388476031754723, "language_loss": 0.8262496, "learning_rate": 1.0946052012080952e-06, "loss": 0.84722012, "num_input_tokens_seen": 236971930, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37890625, "step": 10978, "time_per_iteration": 2.400414228439331 }, { "auxiliary_loss_clip": 0.0105538, "auxiliary_loss_mlp": 0.01040727, "balance_loss_clip": 1.01572943, "balance_loss_mlp": 1.01671243, "epoch": 0.6600931910416353, "flos": 18149144169600.0, "grad_norm": 2.1193792882188616, "language_loss": 0.68270773, "learning_rate": 1.0942579493338278e-06, "loss": 0.70366883, "num_input_tokens_seen": 236989920, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38671875, "step": 10979, "time_per_iteration": 2.3367092609405518 }, { "auxiliary_loss_clip": 0.01053341, "auxiliary_loss_mlp": 0.01041077, "balance_loss_clip": 1.01584053, "balance_loss_mlp": 1.01628292, "epoch": 0.6601533142943034, "flos": 17419773623040.0, "grad_norm": 2.2172033147694092, "language_loss": 0.74270737, "learning_rate": 1.0939107318053889e-06, "loss": 0.76365161, "num_input_tokens_seen": 237006570, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37109375, "step": 10980, "time_per_iteration": 2.3616421222686768 }, { "auxiliary_loss_clip": 0.01050735, "auxiliary_loss_mlp": 0.01036899, "balance_loss_clip": 1.01521456, "balance_loss_mlp": 1.01621246, "epoch": 0.6602134375469713, "flos": 28218339855360.0, "grad_norm": 1.5874318777085463, "language_loss": 0.74207604, "learning_rate": 1.0935635486359459e-06, "loss": 0.76295245, "num_input_tokens_seen": 237028415, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34375, "step": 10981, "time_per_iteration": 2.4377834796905518 }, { "auxiliary_loss_clip": 0.0105431, "auxiliary_loss_mlp": 0.01040372, "balance_loss_clip": 1.01527798, "balance_loss_mlp": 1.01675153, "epoch": 0.6602735607996393, "flos": 29416943341440.0, "grad_norm": 1.8519774688071646, "language_loss": 0.70182443, "learning_rate": 1.0932163998386647e-06, "loss": 0.72277123, "num_input_tokens_seen": 237046595, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37695312, "step": 10982, "time_per_iteration": 2.4352056980133057 }, { "auxiliary_loss_clip": 0.01052217, "auxiliary_loss_mlp": 0.01037372, "balance_loss_clip": 1.01517498, "balance_loss_mlp": 1.0164938, "epoch": 0.6603336840523072, "flos": 18587059752960.0, "grad_norm": 1.6188509616678113, "language_loss": 0.70595336, "learning_rate": 1.0928692854267075e-06, "loss": 0.7268492, "num_input_tokens_seen": 237066150, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.35742188, "step": 10983, "time_per_iteration": 2.354893922805786 }, { "auxiliary_loss_clip": 0.01052704, "auxiliary_loss_mlp": 0.01039415, "balance_loss_clip": 1.01417875, "balance_loss_mlp": 1.01612127, "epoch": 0.6603938073049752, "flos": 33253478824320.0, "grad_norm": 1.791988521132254, "language_loss": 0.71985877, "learning_rate": 1.092522205413239e-06, "loss": 0.74077994, "num_input_tokens_seen": 237087060, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36523438, "step": 10984, "time_per_iteration": 2.5645902156829834 }, { "auxiliary_loss_clip": 0.01052814, "auxiliary_loss_mlp": 0.01038567, "balance_loss_clip": 1.01387918, "balance_loss_mlp": 1.01677692, "epoch": 0.6604539305576431, "flos": 17383324296960.0, "grad_norm": 1.6499117730196402, "language_loss": 0.84855783, "learning_rate": 1.0921751598114193e-06, "loss": 0.86947161, "num_input_tokens_seen": 237103825, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.359375, "step": 10985, "time_per_iteration": 2.409233808517456 }, { "auxiliary_loss_clip": 0.01054168, "auxiliary_loss_mlp": 0.01039547, "balance_loss_clip": 1.01367831, "balance_loss_mlp": 1.01633811, "epoch": 0.6605140538103111, "flos": 21250094883840.0, "grad_norm": 2.347029977403867, "language_loss": 0.75266331, "learning_rate": 1.0918281486344077e-06, "loss": 0.77360046, "num_input_tokens_seen": 237121740, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37890625, "step": 10986, "time_per_iteration": 3.8403308391571045 }, { "auxiliary_loss_clip": 0.01050977, "auxiliary_loss_mlp": 0.01035394, "balance_loss_clip": 1.01272058, "balance_loss_mlp": 1.0159204, "epoch": 0.6605741770629792, "flos": 13880837502720.0, "grad_norm": 1.7389573446518238, "language_loss": 0.80381495, "learning_rate": 1.0914811718953636e-06, "loss": 0.82467866, "num_input_tokens_seen": 237139565, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34960938, "step": 10987, "time_per_iteration": 2.3704097270965576 }, { "auxiliary_loss_clip": 0.01008407, "auxiliary_loss_mlp": 0.01002825, "balance_loss_clip": 1.0002383, "balance_loss_mlp": 1.00096047, "epoch": 0.6606343003156471, "flos": 69312436727040.0, "grad_norm": 0.8056466248847116, "language_loss": 0.54119754, "learning_rate": 1.0911342296074454e-06, "loss": 0.56130993, "num_input_tokens_seen": 237201055, "router_z_loss_clip": 0.02587891, "router_z_loss_mlp": 0.07470703, "step": 10988, "time_per_iteration": 3.1228384971618652 }, { "auxiliary_loss_clip": 0.01051587, "auxiliary_loss_mlp": 0.01045615, "balance_loss_clip": 1.02340603, "balance_loss_mlp": 1.01635671, "epoch": 0.6606944235683151, "flos": 27271146136320.0, "grad_norm": 2.4243254624798167, "language_loss": 0.78070533, "learning_rate": 1.0907873217838077e-06, "loss": 0.80167729, "num_input_tokens_seen": 237221805, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35351562, "step": 10989, "time_per_iteration": 2.470628023147583 }, { "auxiliary_loss_clip": 0.01052795, "auxiliary_loss_mlp": 0.01039678, "balance_loss_clip": 1.01564574, "balance_loss_mlp": 1.01677346, "epoch": 0.660754546820983, "flos": 13771943372160.0, "grad_norm": 2.2924281566128655, "language_loss": 0.7832368, "learning_rate": 1.0904404484376064e-06, "loss": 0.80416155, "num_input_tokens_seen": 237238270, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.359375, "step": 10990, "time_per_iteration": 2.3420610427856445 }, { "auxiliary_loss_clip": 0.01054883, "auxiliary_loss_mlp": 0.01042355, "balance_loss_clip": 1.01676059, "balance_loss_mlp": 1.01700115, "epoch": 0.660814670073651, "flos": 15704316236160.0, "grad_norm": 1.9415244563694911, "language_loss": 0.61760986, "learning_rate": 1.0900936095819937e-06, "loss": 0.63858223, "num_input_tokens_seen": 237255400, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37890625, "step": 10991, "time_per_iteration": 2.3624494075775146 }, { "auxiliary_loss_clip": 0.01056415, "auxiliary_loss_mlp": 0.01045727, "balance_loss_clip": 1.01970387, "balance_loss_mlp": 1.01775122, "epoch": 0.6608747933263189, "flos": 20848977740160.0, "grad_norm": 2.76579974510713, "language_loss": 0.70288944, "learning_rate": 1.0897468052301234e-06, "loss": 0.72391087, "num_input_tokens_seen": 237273105, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38671875, "step": 10992, "time_per_iteration": 2.356637716293335 }, { "auxiliary_loss_clip": 0.01053224, "auxiliary_loss_mlp": 0.01041814, "balance_loss_clip": 1.01666057, "balance_loss_mlp": 1.01536238, "epoch": 0.660934916578987, "flos": 20631049833600.0, "grad_norm": 1.6692457210310343, "language_loss": 0.88315344, "learning_rate": 1.0894000353951444e-06, "loss": 0.90410376, "num_input_tokens_seen": 237292650, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37890625, "step": 10993, "time_per_iteration": 2.376451253890991 }, { "auxiliary_loss_clip": 0.01058203, "auxiliary_loss_mlp": 0.01043781, "balance_loss_clip": 1.01587415, "balance_loss_mlp": 1.01810646, "epoch": 0.6609950398316549, "flos": 25112571373440.0, "grad_norm": 1.704756258372959, "language_loss": 0.67169654, "learning_rate": 1.0890533000902078e-06, "loss": 0.69271636, "num_input_tokens_seen": 237312865, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.40234375, "step": 10994, "time_per_iteration": 2.3987278938293457 }, { "auxiliary_loss_clip": 0.01056889, "auxiliary_loss_mlp": 0.01042015, "balance_loss_clip": 1.01756525, "balance_loss_mlp": 1.01815343, "epoch": 0.6610551630843229, "flos": 18660202784640.0, "grad_norm": 1.6564119628171163, "language_loss": 0.78127825, "learning_rate": 1.0887065993284626e-06, "loss": 0.80226731, "num_input_tokens_seen": 237331210, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.38671875, "step": 10995, "time_per_iteration": 2.3313136100769043 }, { "auxiliary_loss_clip": 0.01054413, "auxiliary_loss_mlp": 0.01036728, "balance_loss_clip": 1.0137918, "balance_loss_mlp": 1.0169785, "epoch": 0.6611152863369908, "flos": 23257077056640.0, "grad_norm": 1.9644766004032226, "language_loss": 0.75808704, "learning_rate": 1.088359933123053e-06, "loss": 0.77899849, "num_input_tokens_seen": 237349455, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.375, "step": 10996, "time_per_iteration": 2.353569507598877 }, { "auxiliary_loss_clip": 0.01054851, "auxiliary_loss_mlp": 0.01046794, "balance_loss_clip": 1.02175951, "balance_loss_mlp": 1.01749444, "epoch": 0.6611754095896588, "flos": 22158744595200.0, "grad_norm": 1.9069974854158243, "language_loss": 0.69787759, "learning_rate": 1.088013301487126e-06, "loss": 0.71889406, "num_input_tokens_seen": 237367100, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37304688, "step": 10997, "time_per_iteration": 2.3434255123138428 }, { "auxiliary_loss_clip": 0.01056138, "auxiliary_loss_mlp": 0.01036846, "balance_loss_clip": 1.01166892, "balance_loss_mlp": 1.01812291, "epoch": 0.6612355328423267, "flos": 13990360037760.0, "grad_norm": 1.8801047148020151, "language_loss": 0.69401163, "learning_rate": 1.0876667044338269e-06, "loss": 0.7149415, "num_input_tokens_seen": 237384840, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38085938, "step": 10998, "time_per_iteration": 2.3429505825042725 }, { "auxiliary_loss_clip": 0.01008624, "auxiliary_loss_mlp": 0.01006127, "balance_loss_clip": 1.00349295, "balance_loss_mlp": 1.00120711, "epoch": 0.6612956560949947, "flos": 61450660529280.0, "grad_norm": 0.6572434108159338, "language_loss": 0.51242673, "learning_rate": 1.087320141976297e-06, "loss": 0.5325743, "num_input_tokens_seen": 237443355, "router_z_loss_clip": 0.02636719, "router_z_loss_mlp": 0.07421875, "step": 10999, "time_per_iteration": 2.9681572914123535 }, { "auxiliary_loss_clip": 0.01054965, "auxiliary_loss_mlp": 0.01045549, "balance_loss_clip": 1.0184406, "balance_loss_mlp": 1.01608419, "epoch": 0.6613557793476627, "flos": 21615565662720.0, "grad_norm": 2.263205087962403, "language_loss": 0.7114495, "learning_rate": 1.086973614127679e-06, "loss": 0.73245466, "num_input_tokens_seen": 237459205, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.38867188, "step": 11000, "time_per_iteration": 2.3870296478271484 }, { "auxiliary_loss_clip": 0.01050859, "auxiliary_loss_mlp": 0.01038822, "balance_loss_clip": 1.016029, "balance_loss_mlp": 1.01564121, "epoch": 0.6614159026003307, "flos": 34018740115200.0, "grad_norm": 1.8063003811644438, "language_loss": 0.66129017, "learning_rate": 1.0866271209011133e-06, "loss": 0.68218696, "num_input_tokens_seen": 237483580, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35351562, "step": 11001, "time_per_iteration": 2.4838876724243164 }, { "auxiliary_loss_clip": 0.0105201, "auxiliary_loss_mlp": 0.01043512, "balance_loss_clip": 1.01913428, "balance_loss_mlp": 1.01588488, "epoch": 0.6614760258529987, "flos": 24096144695040.0, "grad_norm": 1.8163447648323026, "language_loss": 0.73503041, "learning_rate": 1.086280662309739e-06, "loss": 0.75598562, "num_input_tokens_seen": 237502860, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36132812, "step": 11002, "time_per_iteration": 2.3817479610443115 }, { "auxiliary_loss_clip": 0.01052859, "auxiliary_loss_mlp": 0.01042931, "balance_loss_clip": 1.0167768, "balance_loss_mlp": 1.0160234, "epoch": 0.6615361491056666, "flos": 14902884910080.0, "grad_norm": 2.2757079346919453, "language_loss": 0.7978884, "learning_rate": 1.0859342383666928e-06, "loss": 0.81884629, "num_input_tokens_seen": 237521030, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3671875, "step": 11003, "time_per_iteration": 2.33282470703125 }, { "auxiliary_loss_clip": 0.01058194, "auxiliary_loss_mlp": 0.01047453, "balance_loss_clip": 1.01623237, "balance_loss_mlp": 1.01796997, "epoch": 0.6615962723583346, "flos": 15303967142400.0, "grad_norm": 2.236055847937065, "language_loss": 0.70581049, "learning_rate": 1.0855878490851119e-06, "loss": 0.72686696, "num_input_tokens_seen": 237539585, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 0.40234375, "step": 11004, "time_per_iteration": 2.371011972427368 }, { "auxiliary_loss_clip": 0.01055964, "auxiliary_loss_mlp": 0.01042609, "balance_loss_clip": 1.01564419, "balance_loss_mlp": 1.01709247, "epoch": 0.6616563956110025, "flos": 18731635159680.0, "grad_norm": 2.1013190084760893, "language_loss": 0.71520829, "learning_rate": 1.085241494478132e-06, "loss": 0.73619401, "num_input_tokens_seen": 237557655, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.38867188, "step": 11005, "time_per_iteration": 2.335397243499756 }, { "auxiliary_loss_clip": 0.01054271, "auxiliary_loss_mlp": 0.01039655, "balance_loss_clip": 1.01458502, "balance_loss_mlp": 1.0174129, "epoch": 0.6617165188636706, "flos": 24494015082240.0, "grad_norm": 1.9630766430702902, "language_loss": 0.78858519, "learning_rate": 1.0848951745588855e-06, "loss": 0.80952442, "num_input_tokens_seen": 237577000, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36914062, "step": 11006, "time_per_iteration": 2.4194066524505615 }, { "auxiliary_loss_clip": 0.01054434, "auxiliary_loss_mlp": 0.01043747, "balance_loss_clip": 1.01806891, "balance_loss_mlp": 1.01729155, "epoch": 0.6617766421163385, "flos": 22378662449280.0, "grad_norm": 1.7701009770813798, "language_loss": 0.77308661, "learning_rate": 1.0845488893405068e-06, "loss": 0.79406846, "num_input_tokens_seen": 237597960, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37109375, "step": 11007, "time_per_iteration": 2.3893160820007324 }, { "auxiliary_loss_clip": 0.01053751, "auxiliary_loss_mlp": 0.01039171, "balance_loss_clip": 1.01305246, "balance_loss_mlp": 1.01678944, "epoch": 0.6618367653690065, "flos": 20849361765120.0, "grad_norm": 1.6711976655336938, "language_loss": 0.78928673, "learning_rate": 1.0842026388361248e-06, "loss": 0.81021601, "num_input_tokens_seen": 237616385, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.36914062, "step": 11008, "time_per_iteration": 2.35719895362854 }, { "auxiliary_loss_clip": 0.01056952, "auxiliary_loss_mlp": 0.01045625, "balance_loss_clip": 1.01601338, "balance_loss_mlp": 1.01687372, "epoch": 0.6618968886216744, "flos": 17711368231680.0, "grad_norm": 2.005208322110249, "language_loss": 0.82779467, "learning_rate": 1.0838564230588715e-06, "loss": 0.84882045, "num_input_tokens_seen": 237634930, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.40039062, "step": 11009, "time_per_iteration": 2.32336688041687 }, { "auxiliary_loss_clip": 0.01009027, "auxiliary_loss_mlp": 0.0100297, "balance_loss_clip": 1.00025165, "balance_loss_mlp": 1.00156367, "epoch": 0.6619570118743424, "flos": 67032155364480.0, "grad_norm": 1.0249942177720932, "language_loss": 0.67454445, "learning_rate": 1.0835102420218735e-06, "loss": 0.69466448, "num_input_tokens_seen": 237693175, "router_z_loss_clip": 0.02722168, "router_z_loss_mlp": 0.07470703, "step": 11010, "time_per_iteration": 2.9407904148101807 }, { "auxiliary_loss_clip": 0.01053683, "auxiliary_loss_mlp": 0.01043821, "balance_loss_clip": 1.0167129, "balance_loss_mlp": 1.01554489, "epoch": 0.6620171351270103, "flos": 18659923493760.0, "grad_norm": 1.6766997569195867, "language_loss": 0.72627485, "learning_rate": 1.0831640957382593e-06, "loss": 0.7472499, "num_input_tokens_seen": 237713160, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.38085938, "step": 11011, "time_per_iteration": 3.610153913497925 }, { "auxiliary_loss_clip": 0.01054407, "auxiliary_loss_mlp": 0.0104015, "balance_loss_clip": 1.01586723, "balance_loss_mlp": 1.01767731, "epoch": 0.6620772583796783, "flos": 24169357549440.0, "grad_norm": 2.0170854952038093, "language_loss": 0.73461086, "learning_rate": 1.0828179842211557e-06, "loss": 0.75555646, "num_input_tokens_seen": 237733600, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3671875, "step": 11012, "time_per_iteration": 2.394791603088379 }, { "auxiliary_loss_clip": 0.01050571, "auxiliary_loss_mlp": 0.01032729, "balance_loss_clip": 1.01086545, "balance_loss_mlp": 1.01614761, "epoch": 0.6621373816323463, "flos": 23622408190080.0, "grad_norm": 1.5772244000080036, "language_loss": 0.80569404, "learning_rate": 1.0824719074836845e-06, "loss": 0.826527, "num_input_tokens_seen": 237752135, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34375, "step": 11013, "time_per_iteration": 2.382826328277588 }, { "auxiliary_loss_clip": 0.01054248, "auxiliary_loss_mlp": 0.01037565, "balance_loss_clip": 1.01322198, "balance_loss_mlp": 1.01725733, "epoch": 0.6621975048850143, "flos": 18441227537280.0, "grad_norm": 1.8677204766154925, "language_loss": 0.72084004, "learning_rate": 1.082125865538971e-06, "loss": 0.74175817, "num_input_tokens_seen": 237770735, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36914062, "step": 11014, "time_per_iteration": 3.7482316493988037 }, { "auxiliary_loss_clip": 0.01051237, "auxiliary_loss_mlp": 0.01037677, "balance_loss_clip": 1.01480055, "balance_loss_mlp": 1.01645398, "epoch": 0.6622576281376823, "flos": 14063014310400.0, "grad_norm": 1.915870147983513, "language_loss": 0.78297246, "learning_rate": 1.081779858400137e-06, "loss": 0.80386162, "num_input_tokens_seen": 237789005, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.34765625, "step": 11015, "time_per_iteration": 3.7315332889556885 }, { "auxiliary_loss_clip": 0.0105251, "auxiliary_loss_mlp": 0.01037042, "balance_loss_clip": 1.01303315, "balance_loss_mlp": 1.01607943, "epoch": 0.6623177513903502, "flos": 17018028074880.0, "grad_norm": 1.7712905860677401, "language_loss": 0.83531785, "learning_rate": 1.0814338860803021e-06, "loss": 0.85621339, "num_input_tokens_seen": 237807740, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36328125, "step": 11016, "time_per_iteration": 2.3255972862243652 }, { "auxiliary_loss_clip": 0.01053263, "auxiliary_loss_mlp": 0.01038098, "balance_loss_clip": 1.01340985, "balance_loss_mlp": 1.01613665, "epoch": 0.6623778746430182, "flos": 17270170980480.0, "grad_norm": 1.937628600669207, "language_loss": 0.7223087, "learning_rate": 1.0810879485925864e-06, "loss": 0.74322236, "num_input_tokens_seen": 237826340, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.37109375, "step": 11017, "time_per_iteration": 2.3364663124084473 }, { "auxiliary_loss_clip": 0.0105263, "auxiliary_loss_mlp": 0.01049474, "balance_loss_clip": 1.02316415, "balance_loss_mlp": 1.01601243, "epoch": 0.6624379978956861, "flos": 48791016028800.0, "grad_norm": 1.8590390368919154, "language_loss": 0.7810185, "learning_rate": 1.0807420459501084e-06, "loss": 0.8020395, "num_input_tokens_seen": 237848305, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.3671875, "step": 11018, "time_per_iteration": 2.597356081008911 }, { "auxiliary_loss_clip": 0.01054518, "auxiliary_loss_mlp": 0.010469, "balance_loss_clip": 1.01899314, "balance_loss_mlp": 1.01676583, "epoch": 0.6624981211483542, "flos": 18951448279680.0, "grad_norm": 2.269058083104104, "language_loss": 0.84312022, "learning_rate": 1.0803961781659841e-06, "loss": 0.86413437, "num_input_tokens_seen": 237867020, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.37695312, "step": 11019, "time_per_iteration": 2.3729348182678223 }, { "auxiliary_loss_clip": 0.01051971, "auxiliary_loss_mlp": 0.01035954, "balance_loss_clip": 1.01261258, "balance_loss_mlp": 1.01621604, "epoch": 0.6625582444010221, "flos": 23255506045440.0, "grad_norm": 1.734133460513638, "language_loss": 0.72696048, "learning_rate": 1.080050345253328e-06, "loss": 0.74783969, "num_input_tokens_seen": 237886710, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.35742188, "step": 11020, "time_per_iteration": 2.3715970516204834 }, { "auxiliary_loss_clip": 0.01056308, "auxiliary_loss_mlp": 0.01041426, "balance_loss_clip": 1.01443732, "balance_loss_mlp": 1.01686239, "epoch": 0.6626183676536901, "flos": 21393832417920.0, "grad_norm": 1.6973368451313842, "language_loss": 0.73517621, "learning_rate": 1.0797045472252554e-06, "loss": 0.75615358, "num_input_tokens_seen": 237904795, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.39453125, "step": 11021, "time_per_iteration": 2.3756909370422363 }, { "auxiliary_loss_clip": 0.01054954, "auxiliary_loss_mlp": 0.01045025, "balance_loss_clip": 1.01987159, "balance_loss_mlp": 1.01764214, "epoch": 0.662678490906358, "flos": 14570511966720.0, "grad_norm": 3.248210328346275, "language_loss": 0.84570658, "learning_rate": 1.0793587840948793e-06, "loss": 0.86670637, "num_input_tokens_seen": 237921320, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37304688, "step": 11022, "time_per_iteration": 2.318394660949707 }, { "auxiliary_loss_clip": 0.01058358, "auxiliary_loss_mlp": 0.01042046, "balance_loss_clip": 1.01236272, "balance_loss_mlp": 1.01758289, "epoch": 0.662738614159026, "flos": 15991581836160.0, "grad_norm": 2.245351747554217, "language_loss": 0.74408782, "learning_rate": 1.0790130558753099e-06, "loss": 0.76509184, "num_input_tokens_seen": 237933525, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.40820312, "step": 11023, "time_per_iteration": 2.389620065689087 }, { "auxiliary_loss_clip": 0.01052202, "auxiliary_loss_mlp": 0.01040863, "balance_loss_clip": 1.01636541, "balance_loss_mlp": 1.0159421, "epoch": 0.6627987374116939, "flos": 19535335724160.0, "grad_norm": 1.6669599124674837, "language_loss": 0.75515962, "learning_rate": 1.0786673625796574e-06, "loss": 0.77609026, "num_input_tokens_seen": 237953395, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36132812, "step": 11024, "time_per_iteration": 2.366286516189575 }, { "auxiliary_loss_clip": 0.01053902, "auxiliary_loss_mlp": 0.01041236, "balance_loss_clip": 1.0160116, "balance_loss_mlp": 1.01664138, "epoch": 0.662858860664362, "flos": 15702012086400.0, "grad_norm": 3.4564825527258787, "language_loss": 0.72003198, "learning_rate": 1.0783217042210306e-06, "loss": 0.74098337, "num_input_tokens_seen": 237971445, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37304688, "step": 11025, "time_per_iteration": 3.805116891860962 }, { "auxiliary_loss_clip": 0.01054636, "auxiliary_loss_mlp": 0.0104248, "balance_loss_clip": 1.01736295, "balance_loss_mlp": 1.01776481, "epoch": 0.6629189839170299, "flos": 20153333433600.0, "grad_norm": 1.5783810565648915, "language_loss": 0.80002558, "learning_rate": 1.0779760808125379e-06, "loss": 0.82099676, "num_input_tokens_seen": 237989965, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36914062, "step": 11026, "time_per_iteration": 2.3950438499450684 }, { "auxiliary_loss_clip": 0.01054084, "auxiliary_loss_mlp": 0.0103793, "balance_loss_clip": 1.01401663, "balance_loss_mlp": 1.01750684, "epoch": 0.6629791071696979, "flos": 20914579918080.0, "grad_norm": 1.699493919176788, "language_loss": 0.76714694, "learning_rate": 1.0776304923672842e-06, "loss": 0.78806698, "num_input_tokens_seen": 238006820, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36523438, "step": 11027, "time_per_iteration": 2.368635654449463 }, { "auxiliary_loss_clip": 0.01053796, "auxiliary_loss_mlp": 0.01038208, "balance_loss_clip": 1.01238751, "balance_loss_mlp": 1.01656163, "epoch": 0.6630392304223659, "flos": 20845940451840.0, "grad_norm": 2.3760086183588216, "language_loss": 0.71846992, "learning_rate": 1.0772849388983742e-06, "loss": 0.73939002, "num_input_tokens_seen": 238022560, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37304688, "step": 11028, "time_per_iteration": 2.3263754844665527 }, { "auxiliary_loss_clip": 0.01051837, "auxiliary_loss_mlp": 0.01038756, "balance_loss_clip": 1.01644015, "balance_loss_mlp": 1.01573384, "epoch": 0.6630993536750338, "flos": 20994775044480.0, "grad_norm": 1.8073190190692723, "language_loss": 0.80768323, "learning_rate": 1.0769394204189138e-06, "loss": 0.82858908, "num_input_tokens_seen": 238041895, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.36132812, "step": 11029, "time_per_iteration": 2.3579559326171875 }, { "auxiliary_loss_clip": 0.01054279, "auxiliary_loss_mlp": 0.0103953, "balance_loss_clip": 1.01306593, "balance_loss_mlp": 1.01634121, "epoch": 0.6631594769277018, "flos": 18258073211520.0, "grad_norm": 1.969291156411193, "language_loss": 0.78064871, "learning_rate": 1.0765939369420012e-06, "loss": 0.80158675, "num_input_tokens_seen": 238060445, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37890625, "step": 11030, "time_per_iteration": 2.3274123668670654 }, { "auxiliary_loss_clip": 0.01056142, "auxiliary_loss_mlp": 0.01040136, "balance_loss_clip": 1.01413655, "balance_loss_mlp": 1.01725447, "epoch": 0.6632196001803697, "flos": 17819564135040.0, "grad_norm": 3.505748101806825, "language_loss": 0.77175272, "learning_rate": 1.0762484884807391e-06, "loss": 0.79271555, "num_input_tokens_seen": 238077080, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.38867188, "step": 11031, "time_per_iteration": 2.3321754932403564 }, { "auxiliary_loss_clip": 0.01055615, "auxiliary_loss_mlp": 0.01038293, "balance_loss_clip": 1.01359248, "balance_loss_mlp": 1.01764846, "epoch": 0.6632797234330378, "flos": 12669561192960.0, "grad_norm": 2.8878250999856085, "language_loss": 0.77050829, "learning_rate": 1.075903075048228e-06, "loss": 0.7914474, "num_input_tokens_seen": 238091045, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.38085938, "step": 11032, "time_per_iteration": 2.3173999786376953 }, { "auxiliary_loss_clip": 0.01050869, "auxiliary_loss_mlp": 0.01038508, "balance_loss_clip": 1.01486838, "balance_loss_mlp": 1.0150857, "epoch": 0.6633398466857057, "flos": 23583654714240.0, "grad_norm": 1.9021182024802024, "language_loss": 0.81679559, "learning_rate": 1.0755576966575635e-06, "loss": 0.8376894, "num_input_tokens_seen": 238110220, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35742188, "step": 11033, "time_per_iteration": 2.3874671459198 }, { "auxiliary_loss_clip": 0.01053996, "auxiliary_loss_mlp": 0.01041356, "balance_loss_clip": 1.01514196, "balance_loss_mlp": 1.0157187, "epoch": 0.6633999699383737, "flos": 20630630897280.0, "grad_norm": 1.6312835047773901, "language_loss": 0.81573606, "learning_rate": 1.0752123533218451e-06, "loss": 0.83668959, "num_input_tokens_seen": 238130400, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3828125, "step": 11034, "time_per_iteration": 2.350402593612671 }, { "auxiliary_loss_clip": 0.01052898, "auxiliary_loss_mlp": 0.01035378, "balance_loss_clip": 1.0123229, "balance_loss_mlp": 1.01681638, "epoch": 0.6634600931910416, "flos": 21796066725120.0, "grad_norm": 1.562458206269915, "language_loss": 0.76819491, "learning_rate": 1.074867045054166e-06, "loss": 0.7890777, "num_input_tokens_seen": 238148165, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36132812, "step": 11035, "time_per_iteration": 2.3610806465148926 }, { "auxiliary_loss_clip": 0.01053953, "auxiliary_loss_mlp": 0.0103739, "balance_loss_clip": 1.01342869, "balance_loss_mlp": 1.01585829, "epoch": 0.6635202164437096, "flos": 18731914450560.0, "grad_norm": 1.8680490459639247, "language_loss": 0.83906519, "learning_rate": 1.074521771867622e-06, "loss": 0.85997856, "num_input_tokens_seen": 238166360, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.38085938, "step": 11036, "time_per_iteration": 2.3151466846466064 }, { "auxiliary_loss_clip": 0.01008777, "auxiliary_loss_mlp": 0.01002892, "balance_loss_clip": 1.00050831, "balance_loss_mlp": 1.00134587, "epoch": 0.6635803396963775, "flos": 60219482878080.0, "grad_norm": 0.7752356146725335, "language_loss": 0.52385253, "learning_rate": 1.0741765337753044e-06, "loss": 0.54396921, "num_input_tokens_seen": 238227630, "router_z_loss_clip": 0.02380371, "router_z_loss_mlp": 0.07421875, "step": 11037, "time_per_iteration": 2.9919838905334473 }, { "auxiliary_loss_clip": 0.01056221, "auxiliary_loss_mlp": 0.01038951, "balance_loss_clip": 1.01357162, "balance_loss_mlp": 1.01842523, "epoch": 0.6636404629490456, "flos": 29165812865280.0, "grad_norm": 1.7308428446831448, "language_loss": 0.80135077, "learning_rate": 1.0738313307903052e-06, "loss": 0.82230246, "num_input_tokens_seen": 238248435, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37695312, "step": 11038, "time_per_iteration": 2.419877290725708 }, { "auxiliary_loss_clip": 0.01054239, "auxiliary_loss_mlp": 0.01043882, "balance_loss_clip": 1.01685691, "balance_loss_mlp": 1.01738632, "epoch": 0.6637005862017135, "flos": 38906231477760.0, "grad_norm": 2.3483036259570005, "language_loss": 0.65533406, "learning_rate": 1.073486162925716e-06, "loss": 0.67631525, "num_input_tokens_seen": 238268755, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.36914062, "step": 11039, "time_per_iteration": 2.508627414703369 }, { "auxiliary_loss_clip": 0.01055323, "auxiliary_loss_mlp": 0.01036713, "balance_loss_clip": 1.01129746, "balance_loss_mlp": 1.01682949, "epoch": 0.6637607094543815, "flos": 22782258299520.0, "grad_norm": 1.634375009208824, "language_loss": 0.65410435, "learning_rate": 1.0731410301946237e-06, "loss": 0.67502475, "num_input_tokens_seen": 238290120, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38476562, "step": 11040, "time_per_iteration": 2.382423162460327 }, { "auxiliary_loss_clip": 0.01051056, "auxiliary_loss_mlp": 0.01041335, "balance_loss_clip": 1.01882839, "balance_loss_mlp": 1.01507556, "epoch": 0.6638208327070495, "flos": 18113113779840.0, "grad_norm": 2.752992661402511, "language_loss": 0.72869432, "learning_rate": 1.0727959326101161e-06, "loss": 0.74961823, "num_input_tokens_seen": 238309290, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.359375, "step": 11041, "time_per_iteration": 2.3562893867492676 }, { "auxiliary_loss_clip": 0.01052071, "auxiliary_loss_mlp": 0.01043923, "balance_loss_clip": 1.01875842, "balance_loss_mlp": 1.01555276, "epoch": 0.6638809559597174, "flos": 29423576499840.0, "grad_norm": 2.335265886249534, "language_loss": 0.63769978, "learning_rate": 1.0724508701852806e-06, "loss": 0.6586597, "num_input_tokens_seen": 238327280, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36523438, "step": 11042, "time_per_iteration": 2.4152917861938477 }, { "auxiliary_loss_clip": 0.0105608, "auxiliary_loss_mlp": 0.01039471, "balance_loss_clip": 1.01127768, "balance_loss_mlp": 1.01629186, "epoch": 0.6639410792123854, "flos": 28071495210240.0, "grad_norm": 1.9004996666444258, "language_loss": 0.69648266, "learning_rate": 1.0721058429331998e-06, "loss": 0.71743822, "num_input_tokens_seen": 238346330, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.3984375, "step": 11043, "time_per_iteration": 2.4085865020751953 }, { "auxiliary_loss_clip": 0.01050426, "auxiliary_loss_mlp": 0.01036771, "balance_loss_clip": 1.0144912, "balance_loss_mlp": 1.01609886, "epoch": 0.6640012024650533, "flos": 25555199990400.0, "grad_norm": 1.5380082793740195, "language_loss": 0.84339392, "learning_rate": 1.0717608508669587e-06, "loss": 0.86426592, "num_input_tokens_seen": 238364650, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34375, "step": 11044, "time_per_iteration": 2.381227970123291 }, { "auxiliary_loss_clip": 0.01052423, "auxiliary_loss_mlp": 0.01037692, "balance_loss_clip": 1.01296806, "balance_loss_mlp": 1.01571822, "epoch": 0.6640613257177214, "flos": 14866051559040.0, "grad_norm": 2.0578606047597385, "language_loss": 0.71234465, "learning_rate": 1.0714158939996392e-06, "loss": 0.73324585, "num_input_tokens_seen": 238381630, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.3671875, "step": 11045, "time_per_iteration": 2.3562262058258057 }, { "auxiliary_loss_clip": 0.01054499, "auxiliary_loss_mlp": 0.01039439, "balance_loss_clip": 1.01450086, "balance_loss_mlp": 1.01680017, "epoch": 0.6641214489703893, "flos": 23219999326080.0, "grad_norm": 1.520629315522978, "language_loss": 0.65145737, "learning_rate": 1.0710709723443235e-06, "loss": 0.67239678, "num_input_tokens_seen": 238402595, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.37695312, "step": 11046, "time_per_iteration": 2.3806560039520264 }, { "auxiliary_loss_clip": 0.01052834, "auxiliary_loss_mlp": 0.01038753, "balance_loss_clip": 1.01454115, "balance_loss_mlp": 1.01606131, "epoch": 0.6641815722230573, "flos": 37741109852160.0, "grad_norm": 2.4924034493364275, "language_loss": 0.72004569, "learning_rate": 1.070726085914088e-06, "loss": 0.74096155, "num_input_tokens_seen": 238426860, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3671875, "step": 11047, "time_per_iteration": 2.51143217086792 }, { "auxiliary_loss_clip": 0.01053681, "auxiliary_loss_mlp": 0.010378, "balance_loss_clip": 1.01208663, "balance_loss_mlp": 1.01685143, "epoch": 0.6642416954757252, "flos": 17930168922240.0, "grad_norm": 1.9681979442625266, "language_loss": 0.77844352, "learning_rate": 1.0703812347220126e-06, "loss": 0.79935831, "num_input_tokens_seen": 238443990, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3671875, "step": 11048, "time_per_iteration": 2.3412768840789795 }, { "auxiliary_loss_clip": 0.01008603, "auxiliary_loss_mlp": 0.01006602, "balance_loss_clip": 1.00434875, "balance_loss_mlp": 1.00136185, "epoch": 0.6643018187283932, "flos": 51992829394560.0, "grad_norm": 0.7512400024556176, "language_loss": 0.55066133, "learning_rate": 1.0700364187811745e-06, "loss": 0.57081336, "num_input_tokens_seen": 238503045, "router_z_loss_clip": 0.02258301, "router_z_loss_mlp": 0.07226562, "step": 11049, "time_per_iteration": 3.016592502593994 }, { "auxiliary_loss_clip": 0.01053016, "auxiliary_loss_mlp": 0.01035153, "balance_loss_clip": 1.01149035, "balance_loss_mlp": 1.01601672, "epoch": 0.6643619419810611, "flos": 30225356939520.0, "grad_norm": 1.5633864817236542, "language_loss": 0.65155321, "learning_rate": 1.069691638104648e-06, "loss": 0.67243487, "num_input_tokens_seen": 238527320, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37109375, "step": 11050, "time_per_iteration": 2.4380106925964355 }, { "auxiliary_loss_clip": 0.0105196, "auxiliary_loss_mlp": 0.01038529, "balance_loss_clip": 1.01503325, "balance_loss_mlp": 1.01641452, "epoch": 0.6644220652337292, "flos": 22965028600320.0, "grad_norm": 2.2382984307500586, "language_loss": 0.79854983, "learning_rate": 1.0693468927055085e-06, "loss": 0.81945479, "num_input_tokens_seen": 238546030, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35546875, "step": 11051, "time_per_iteration": 3.6065726280212402 }, { "auxiliary_loss_clip": 0.01053516, "auxiliary_loss_mlp": 0.01041823, "balance_loss_clip": 1.01794541, "balance_loss_mlp": 1.01738071, "epoch": 0.6644821884863971, "flos": 21141165841920.0, "grad_norm": 1.6669988972452632, "language_loss": 0.86320388, "learning_rate": 1.0690021825968276e-06, "loss": 0.88415724, "num_input_tokens_seen": 238564175, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36132812, "step": 11052, "time_per_iteration": 2.351734161376953 }, { "auxiliary_loss_clip": 0.01055672, "auxiliary_loss_mlp": 0.01044226, "balance_loss_clip": 1.01766634, "balance_loss_mlp": 1.01776612, "epoch": 0.6645423117390651, "flos": 20191807618560.0, "grad_norm": 2.16655708495554, "language_loss": 0.76051271, "learning_rate": 1.0686575077916776e-06, "loss": 0.78151178, "num_input_tokens_seen": 238581010, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37890625, "step": 11053, "time_per_iteration": 2.359241247177124 }, { "auxiliary_loss_clip": 0.01051789, "auxiliary_loss_mlp": 0.01036481, "balance_loss_clip": 1.01182806, "balance_loss_mlp": 1.0161196, "epoch": 0.6646024349917331, "flos": 24350836129920.0, "grad_norm": 1.615278128809129, "language_loss": 0.80435836, "learning_rate": 1.0683128683031278e-06, "loss": 0.82524109, "num_input_tokens_seen": 238601365, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.35742188, "step": 11054, "time_per_iteration": 3.7380881309509277 }, { "auxiliary_loss_clip": 0.01051638, "auxiliary_loss_mlp": 0.0103389, "balance_loss_clip": 1.01099026, "balance_loss_mlp": 1.01637602, "epoch": 0.664662558244401, "flos": 18805720798080.0, "grad_norm": 1.595842642680796, "language_loss": 0.74626446, "learning_rate": 1.0679682641442472e-06, "loss": 0.76711971, "num_input_tokens_seen": 238619850, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3515625, "step": 11055, "time_per_iteration": 3.7174551486968994 }, { "auxiliary_loss_clip": 0.01052847, "auxiliary_loss_mlp": 0.01040294, "balance_loss_clip": 1.01443779, "balance_loss_mlp": 1.01613605, "epoch": 0.664722681497069, "flos": 18951797393280.0, "grad_norm": 1.9269800948912357, "language_loss": 0.73933256, "learning_rate": 1.0676236953281042e-06, "loss": 0.76026398, "num_input_tokens_seen": 238637635, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3671875, "step": 11056, "time_per_iteration": 2.355121612548828 }, { "auxiliary_loss_clip": 0.0105236, "auxiliary_loss_mlp": 0.01039102, "balance_loss_clip": 1.01478302, "balance_loss_mlp": 1.01592112, "epoch": 0.6647828047497369, "flos": 19570318773120.0, "grad_norm": 1.978106573746429, "language_loss": 0.70822358, "learning_rate": 1.0672791618677641e-06, "loss": 0.72913826, "num_input_tokens_seen": 238656200, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36523438, "step": 11057, "time_per_iteration": 2.3549771308898926 }, { "auxiliary_loss_clip": 0.01054194, "auxiliary_loss_mlp": 0.01036416, "balance_loss_clip": 1.0126102, "balance_loss_mlp": 1.01716018, "epoch": 0.664842928002405, "flos": 23148322571520.0, "grad_norm": 1.7775560859634052, "language_loss": 0.81240928, "learning_rate": 1.066934663776291e-06, "loss": 0.83331537, "num_input_tokens_seen": 238675005, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 11058, "time_per_iteration": 2.3633978366851807 }, { "auxiliary_loss_clip": 0.01009222, "auxiliary_loss_mlp": 0.01002792, "balance_loss_clip": 1.00058699, "balance_loss_mlp": 1.00205207, "epoch": 0.6649030512550729, "flos": 65241844289280.0, "grad_norm": 0.7903196064692106, "language_loss": 0.62720037, "learning_rate": 1.0665902010667496e-06, "loss": 0.64732051, "num_input_tokens_seen": 238731425, "router_z_loss_clip": 0.02209473, "router_z_loss_mlp": 0.07177734, "step": 11059, "time_per_iteration": 2.8993260860443115 }, { "auxiliary_loss_clip": 0.01052696, "auxiliary_loss_mlp": 0.01040087, "balance_loss_clip": 1.01721048, "balance_loss_mlp": 1.01613545, "epoch": 0.6649631745077409, "flos": 20193727743360.0, "grad_norm": 1.4298423642113622, "language_loss": 0.79701805, "learning_rate": 1.0662457737522008e-06, "loss": 0.81794596, "num_input_tokens_seen": 238752020, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36523438, "step": 11060, "time_per_iteration": 2.389570713043213 }, { "auxiliary_loss_clip": 0.01052562, "auxiliary_loss_mlp": 0.01036178, "balance_loss_clip": 1.01264572, "balance_loss_mlp": 1.01631558, "epoch": 0.6650232977604088, "flos": 17237596815360.0, "grad_norm": 1.6681991528514222, "language_loss": 0.79207468, "learning_rate": 1.0659013818457055e-06, "loss": 0.81296217, "num_input_tokens_seen": 238769665, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36328125, "step": 11061, "time_per_iteration": 2.345914840698242 }, { "auxiliary_loss_clip": 0.01052293, "auxiliary_loss_mlp": 0.01035622, "balance_loss_clip": 1.01250744, "balance_loss_mlp": 1.01685119, "epoch": 0.6650834210130768, "flos": 10006316593920.0, "grad_norm": 2.269524265448418, "language_loss": 0.57289517, "learning_rate": 1.0655570253603243e-06, "loss": 0.59377438, "num_input_tokens_seen": 238782180, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35546875, "step": 11062, "time_per_iteration": 2.3277463912963867 }, { "auxiliary_loss_clip": 0.01054157, "auxiliary_loss_mlp": 0.01038952, "balance_loss_clip": 1.01023436, "balance_loss_mlp": 1.01516688, "epoch": 0.6651435442657447, "flos": 10451319183360.0, "grad_norm": 1.7834484340318153, "language_loss": 0.77057225, "learning_rate": 1.0652127043091144e-06, "loss": 0.79150331, "num_input_tokens_seen": 238800315, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.390625, "step": 11063, "time_per_iteration": 2.3249144554138184 }, { "auxiliary_loss_clip": 0.01052718, "auxiliary_loss_mlp": 0.01038562, "balance_loss_clip": 1.01542401, "balance_loss_mlp": 1.01702499, "epoch": 0.6652036675184128, "flos": 22343190641280.0, "grad_norm": 2.085207738086411, "language_loss": 0.71247017, "learning_rate": 1.0648684187051316e-06, "loss": 0.733383, "num_input_tokens_seen": 238822250, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.35546875, "step": 11064, "time_per_iteration": 2.4113166332244873 }, { "auxiliary_loss_clip": 0.01008169, "auxiliary_loss_mlp": 0.01004797, "balance_loss_clip": 1.00249636, "balance_loss_mlp": 1.00101471, "epoch": 0.6652637907710807, "flos": 52906995100800.0, "grad_norm": 0.8616491667351515, "language_loss": 0.63150954, "learning_rate": 1.0645241685614322e-06, "loss": 0.65163922, "num_input_tokens_seen": 238877190, "router_z_loss_clip": 0.02294922, "router_z_loss_mlp": 0.07128906, "step": 11065, "time_per_iteration": 4.334945201873779 }, { "auxiliary_loss_clip": 0.01052664, "auxiliary_loss_mlp": 0.0103551, "balance_loss_clip": 1.01207304, "balance_loss_mlp": 1.01692581, "epoch": 0.6653239140237487, "flos": 23103738898560.0, "grad_norm": 2.0781541315325645, "language_loss": 0.63631618, "learning_rate": 1.0641799538910708e-06, "loss": 0.65719789, "num_input_tokens_seen": 238896010, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35742188, "step": 11066, "time_per_iteration": 2.359642505645752 }, { "auxiliary_loss_clip": 0.01053912, "auxiliary_loss_mlp": 0.0103772, "balance_loss_clip": 1.0127697, "balance_loss_mlp": 1.01639295, "epoch": 0.6653840372764167, "flos": 25958167436160.0, "grad_norm": 1.5381060837561717, "language_loss": 0.70299721, "learning_rate": 1.0638357747070985e-06, "loss": 0.72391355, "num_input_tokens_seen": 238918990, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.375, "step": 11067, "time_per_iteration": 2.413130521774292 }, { "auxiliary_loss_clip": 0.01008448, "auxiliary_loss_mlp": 0.01003363, "balance_loss_clip": 1.00062096, "balance_loss_mlp": 1.00095749, "epoch": 0.6654441605290846, "flos": 66039051340800.0, "grad_norm": 0.9267257122259847, "language_loss": 0.72269142, "learning_rate": 1.0634916310225684e-06, "loss": 0.74280953, "num_input_tokens_seen": 238975735, "router_z_loss_clip": 0.02746582, "router_z_loss_mlp": 0.07519531, "step": 11068, "time_per_iteration": 2.9948508739471436 }, { "auxiliary_loss_clip": 0.01008748, "auxiliary_loss_mlp": 0.01010429, "balance_loss_clip": 1.00825977, "balance_loss_mlp": 1.00143588, "epoch": 0.6655042837817526, "flos": 65192371159680.0, "grad_norm": 0.7120506108063273, "language_loss": 0.57849514, "learning_rate": 1.0631475228505285e-06, "loss": 0.59868693, "num_input_tokens_seen": 239042360, "router_z_loss_clip": 0.02172852, "router_z_loss_mlp": 0.07324219, "step": 11069, "time_per_iteration": 3.1411874294281006 }, { "auxiliary_loss_clip": 0.01008244, "auxiliary_loss_mlp": 0.01002791, "balance_loss_clip": 1.0003351, "balance_loss_mlp": 1.00093198, "epoch": 0.6655644070344205, "flos": 69005411297280.0, "grad_norm": 0.7530776933455405, "language_loss": 0.6358887, "learning_rate": 1.062803450204029e-06, "loss": 0.65599906, "num_input_tokens_seen": 239109410, "router_z_loss_clip": 0.02453613, "router_z_loss_mlp": 0.07324219, "step": 11070, "time_per_iteration": 3.0696499347686768 }, { "auxiliary_loss_clip": 0.01051162, "auxiliary_loss_mlp": 0.01037178, "balance_loss_clip": 1.01495767, "balance_loss_mlp": 1.01521778, "epoch": 0.6656245302870886, "flos": 36314209785600.0, "grad_norm": 2.08031918678009, "language_loss": 0.5993371, "learning_rate": 1.062459413096116e-06, "loss": 0.62022048, "num_input_tokens_seen": 239135345, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.359375, "step": 11071, "time_per_iteration": 2.5099880695343018 }, { "auxiliary_loss_clip": 0.0105343, "auxiliary_loss_mlp": 0.01038722, "balance_loss_clip": 1.01551175, "balance_loss_mlp": 1.01711905, "epoch": 0.6656846535397565, "flos": 21793867309440.0, "grad_norm": 1.662710577623812, "language_loss": 0.73561239, "learning_rate": 1.0621154115398364e-06, "loss": 0.75653386, "num_input_tokens_seen": 239154340, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36328125, "step": 11072, "time_per_iteration": 2.36806321144104 }, { "auxiliary_loss_clip": 0.01053382, "auxiliary_loss_mlp": 0.01039058, "balance_loss_clip": 1.01420236, "balance_loss_mlp": 1.01663649, "epoch": 0.6657447767924245, "flos": 37486104215040.0, "grad_norm": 1.693904306827611, "language_loss": 0.71574211, "learning_rate": 1.0617714455482353e-06, "loss": 0.7366665, "num_input_tokens_seen": 239177815, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.3671875, "step": 11073, "time_per_iteration": 2.526862382888794 }, { "auxiliary_loss_clip": 0.01055249, "auxiliary_loss_mlp": 0.01039705, "balance_loss_clip": 1.01447999, "balance_loss_mlp": 1.01761627, "epoch": 0.6658049000450924, "flos": 16836724051200.0, "grad_norm": 2.4404081932689503, "language_loss": 0.56646264, "learning_rate": 1.061427515134354e-06, "loss": 0.58741218, "num_input_tokens_seen": 239195735, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.375, "step": 11074, "time_per_iteration": 2.3248791694641113 }, { "auxiliary_loss_clip": 0.01052271, "auxiliary_loss_mlp": 0.01037034, "balance_loss_clip": 1.01446795, "balance_loss_mlp": 1.0170902, "epoch": 0.6658650232977604, "flos": 33509566713600.0, "grad_norm": 1.4626357704944923, "language_loss": 0.7277298, "learning_rate": 1.061083620311235e-06, "loss": 0.74862289, "num_input_tokens_seen": 239217535, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.3515625, "step": 11075, "time_per_iteration": 2.4545867443084717 }, { "auxiliary_loss_clip": 0.01050715, "auxiliary_loss_mlp": 0.01037968, "balance_loss_clip": 1.01618838, "balance_loss_mlp": 1.01602411, "epoch": 0.6659251465504283, "flos": 37704800171520.0, "grad_norm": 1.7338755664049061, "language_loss": 0.66984951, "learning_rate": 1.0607397610919202e-06, "loss": 0.69073641, "num_input_tokens_seen": 239241975, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.34570312, "step": 11076, "time_per_iteration": 2.512529134750366 }, { "auxiliary_loss_clip": 0.01051812, "auxiliary_loss_mlp": 0.01035109, "balance_loss_clip": 1.01143384, "balance_loss_mlp": 1.0157268, "epoch": 0.6659852698030964, "flos": 24892444051200.0, "grad_norm": 1.719472356063621, "language_loss": 0.76245737, "learning_rate": 1.0603959374894468e-06, "loss": 0.78332663, "num_input_tokens_seen": 239262025, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36132812, "step": 11077, "time_per_iteration": 2.3991341590881348 }, { "auxiliary_loss_clip": 0.01053247, "auxiliary_loss_mlp": 0.0103945, "balance_loss_clip": 1.01467848, "balance_loss_mlp": 1.01680899, "epoch": 0.6660453930557643, "flos": 24351674002560.0, "grad_norm": 1.5606811678236454, "language_loss": 0.67498875, "learning_rate": 1.0600521495168538e-06, "loss": 0.6959157, "num_input_tokens_seen": 239282775, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36523438, "step": 11078, "time_per_iteration": 2.380706548690796 }, { "auxiliary_loss_clip": 0.01053631, "auxiliary_loss_mlp": 0.01040199, "balance_loss_clip": 1.01536798, "balance_loss_mlp": 1.01606512, "epoch": 0.6661055163084323, "flos": 10597046664960.0, "grad_norm": 2.1465229995504096, "language_loss": 0.71546638, "learning_rate": 1.0597083971871783e-06, "loss": 0.73640472, "num_input_tokens_seen": 239299775, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.375, "step": 11079, "time_per_iteration": 2.331461191177368 }, { "auxiliary_loss_clip": 0.01052769, "auxiliary_loss_mlp": 0.01035018, "balance_loss_clip": 1.01234448, "balance_loss_mlp": 1.01683378, "epoch": 0.6661656395611003, "flos": 24056448612480.0, "grad_norm": 1.534315362453059, "language_loss": 0.81357062, "learning_rate": 1.0593646805134544e-06, "loss": 0.83444846, "num_input_tokens_seen": 239319660, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.359375, "step": 11080, "time_per_iteration": 2.3926966190338135 }, { "auxiliary_loss_clip": 0.01050203, "auxiliary_loss_mlp": 0.01036203, "balance_loss_clip": 1.01491213, "balance_loss_mlp": 1.01525557, "epoch": 0.6662257628137682, "flos": 23035169255040.0, "grad_norm": 1.7969901882229995, "language_loss": 0.7913422, "learning_rate": 1.0590209995087157e-06, "loss": 0.81220627, "num_input_tokens_seen": 239339215, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34960938, "step": 11081, "time_per_iteration": 2.3693289756774902 }, { "auxiliary_loss_clip": 0.01054494, "auxiliary_loss_mlp": 0.01039417, "balance_loss_clip": 1.01369154, "balance_loss_mlp": 1.01721632, "epoch": 0.6662858860664362, "flos": 24753279905280.0, "grad_norm": 1.8425349681182936, "language_loss": 0.81548941, "learning_rate": 1.0586773541859946e-06, "loss": 0.8364284, "num_input_tokens_seen": 239358545, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37304688, "step": 11082, "time_per_iteration": 2.4081456661224365 }, { "auxiliary_loss_clip": 0.01050977, "auxiliary_loss_mlp": 0.01033666, "balance_loss_clip": 1.01141, "balance_loss_mlp": 1.01590967, "epoch": 0.6663460093191041, "flos": 20008094711040.0, "grad_norm": 1.5705430900448303, "language_loss": 0.84503484, "learning_rate": 1.0583337445583234e-06, "loss": 0.86588126, "num_input_tokens_seen": 239376665, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3515625, "step": 11083, "time_per_iteration": 2.360698938369751 }, { "auxiliary_loss_clip": 0.01055592, "auxiliary_loss_mlp": 0.01043428, "balance_loss_clip": 1.01795256, "balance_loss_mlp": 1.01684666, "epoch": 0.6664061325717722, "flos": 17820436919040.0, "grad_norm": 2.8497659526144754, "language_loss": 0.87059861, "learning_rate": 1.057990170638731e-06, "loss": 0.89158875, "num_input_tokens_seen": 239394345, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38671875, "step": 11084, "time_per_iteration": 2.3598005771636963 }, { "auxiliary_loss_clip": 0.01053075, "auxiliary_loss_mlp": 0.01037894, "balance_loss_clip": 1.01181138, "balance_loss_mlp": 1.01556349, "epoch": 0.6664662558244401, "flos": 18075931315200.0, "grad_norm": 2.383218340047172, "language_loss": 0.74446201, "learning_rate": 1.0576466324402452e-06, "loss": 0.76537168, "num_input_tokens_seen": 239410605, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.375, "step": 11085, "time_per_iteration": 2.3618545532226562 }, { "auxiliary_loss_clip": 0.01051472, "auxiliary_loss_mlp": 0.01034682, "balance_loss_clip": 1.01076818, "balance_loss_mlp": 1.01533949, "epoch": 0.6665263790771081, "flos": 21573286139520.0, "grad_norm": 1.7854271489477993, "language_loss": 0.81906641, "learning_rate": 1.057303129975894e-06, "loss": 0.83992791, "num_input_tokens_seen": 239427155, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36132812, "step": 11086, "time_per_iteration": 2.4040451049804688 }, { "auxiliary_loss_clip": 0.01053144, "auxiliary_loss_mlp": 0.01039279, "balance_loss_clip": 1.01571155, "balance_loss_mlp": 1.01575017, "epoch": 0.666586502329776, "flos": 24205492673280.0, "grad_norm": 1.8908288651514884, "language_loss": 0.76270235, "learning_rate": 1.056959663258702e-06, "loss": 0.78362656, "num_input_tokens_seen": 239445510, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.375, "step": 11087, "time_per_iteration": 2.3763089179992676 }, { "auxiliary_loss_clip": 0.01052797, "auxiliary_loss_mlp": 0.0103567, "balance_loss_clip": 1.0122689, "balance_loss_mlp": 1.01653194, "epoch": 0.666646625582444, "flos": 22199418195840.0, "grad_norm": 2.0160489869812253, "language_loss": 0.65974188, "learning_rate": 1.0566162323016939e-06, "loss": 0.68062651, "num_input_tokens_seen": 239464805, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 11088, "time_per_iteration": 2.382625102996826 }, { "auxiliary_loss_clip": 0.01054874, "auxiliary_loss_mlp": 0.01040649, "balance_loss_clip": 1.01574636, "balance_loss_mlp": 1.01738501, "epoch": 0.6667067488351119, "flos": 18258945995520.0, "grad_norm": 1.9049040515903986, "language_loss": 0.6587162, "learning_rate": 1.0562728371178928e-06, "loss": 0.67967141, "num_input_tokens_seen": 239483890, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.375, "step": 11089, "time_per_iteration": 2.3442752361297607 }, { "auxiliary_loss_clip": 0.01053207, "auxiliary_loss_mlp": 0.01035902, "balance_loss_clip": 1.01250124, "balance_loss_mlp": 1.01676106, "epoch": 0.66676687208778, "flos": 17235641779200.0, "grad_norm": 2.209393072286546, "language_loss": 0.82180685, "learning_rate": 1.0559294777203221e-06, "loss": 0.84269792, "num_input_tokens_seen": 239500080, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36523438, "step": 11090, "time_per_iteration": 3.643364906311035 }, { "auxiliary_loss_clip": 0.01053616, "auxiliary_loss_mlp": 0.01039538, "balance_loss_clip": 1.01450443, "balance_loss_mlp": 1.01620984, "epoch": 0.6668269953404479, "flos": 19751273683200.0, "grad_norm": 2.0236698465470977, "language_loss": 0.79260385, "learning_rate": 1.0555861541219984e-06, "loss": 0.81353545, "num_input_tokens_seen": 239517335, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.375, "step": 11091, "time_per_iteration": 2.3484153747558594 }, { "auxiliary_loss_clip": 0.01051364, "auxiliary_loss_mlp": 0.01036904, "balance_loss_clip": 1.01411128, "balance_loss_mlp": 1.01527798, "epoch": 0.6668871185931159, "flos": 20557383131520.0, "grad_norm": 1.7849590555252393, "language_loss": 0.80407488, "learning_rate": 1.0552428663359425e-06, "loss": 0.82495761, "num_input_tokens_seen": 239536240, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.359375, "step": 11092, "time_per_iteration": 2.3365137577056885 }, { "auxiliary_loss_clip": 0.0100965, "auxiliary_loss_mlp": 0.01010033, "balance_loss_clip": 1.00745785, "balance_loss_mlp": 1.00209618, "epoch": 0.6669472418457839, "flos": 58085452667520.0, "grad_norm": 0.756347656755789, "language_loss": 0.57846516, "learning_rate": 1.0548996143751724e-06, "loss": 0.59866196, "num_input_tokens_seen": 239598000, "router_z_loss_clip": 0.02575684, "router_z_loss_mlp": 0.07568359, "step": 11093, "time_per_iteration": 3.0215201377868652 }, { "auxiliary_loss_clip": 0.01051364, "auxiliary_loss_mlp": 0.01039866, "balance_loss_clip": 1.01687026, "balance_loss_mlp": 1.01574564, "epoch": 0.6670073650984518, "flos": 26063989367040.0, "grad_norm": 1.9316821396453496, "language_loss": 0.77387172, "learning_rate": 1.054556398252703e-06, "loss": 0.79478401, "num_input_tokens_seen": 239617650, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35546875, "step": 11094, "time_per_iteration": 3.704000473022461 }, { "auxiliary_loss_clip": 0.01053477, "auxiliary_loss_mlp": 0.01041711, "balance_loss_clip": 1.01715398, "balance_loss_mlp": 1.0164063, "epoch": 0.6670674883511198, "flos": 32415458526720.0, "grad_norm": 1.768657619877568, "language_loss": 0.74496132, "learning_rate": 1.05421321798155e-06, "loss": 0.76591313, "num_input_tokens_seen": 239639825, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37109375, "step": 11095, "time_per_iteration": 3.828704595565796 }, { "auxiliary_loss_clip": 0.01052712, "auxiliary_loss_mlp": 0.01037416, "balance_loss_clip": 1.0150404, "balance_loss_mlp": 1.01741552, "epoch": 0.6671276116037878, "flos": 18036898548480.0, "grad_norm": 2.0568076570125764, "language_loss": 0.74117565, "learning_rate": 1.053870073574727e-06, "loss": 0.76207685, "num_input_tokens_seen": 239656300, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.3515625, "step": 11096, "time_per_iteration": 2.3367269039154053 }, { "auxiliary_loss_clip": 0.01050022, "auxiliary_loss_mlp": 0.01039287, "balance_loss_clip": 1.0188787, "balance_loss_mlp": 1.01610792, "epoch": 0.6671877348564558, "flos": 23765971167360.0, "grad_norm": 3.4943703868276685, "language_loss": 0.65780115, "learning_rate": 1.0535269650452456e-06, "loss": 0.67869425, "num_input_tokens_seen": 239676655, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.33984375, "step": 11097, "time_per_iteration": 2.409106969833374 }, { "auxiliary_loss_clip": 0.01053461, "auxiliary_loss_mlp": 0.01042962, "balance_loss_clip": 1.01802301, "balance_loss_mlp": 1.01589191, "epoch": 0.6672478581091237, "flos": 20917442649600.0, "grad_norm": 1.7491957289690647, "language_loss": 0.77287662, "learning_rate": 1.0531838924061158e-06, "loss": 0.79384089, "num_input_tokens_seen": 239695430, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.375, "step": 11098, "time_per_iteration": 2.439530611038208 }, { "auxiliary_loss_clip": 0.01052613, "auxiliary_loss_mlp": 0.01041344, "balance_loss_clip": 1.01820588, "balance_loss_mlp": 1.01595688, "epoch": 0.6673079813617917, "flos": 27854544821760.0, "grad_norm": 1.5718982019815968, "language_loss": 0.75541848, "learning_rate": 1.0528408556703476e-06, "loss": 0.77635807, "num_input_tokens_seen": 239717070, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.3671875, "step": 11099, "time_per_iteration": 2.4369869232177734 }, { "auxiliary_loss_clip": 0.01051946, "auxiliary_loss_mlp": 0.01045399, "balance_loss_clip": 1.02046037, "balance_loss_mlp": 1.01616347, "epoch": 0.6673681046144596, "flos": 21615775130880.0, "grad_norm": 1.9049178209630346, "language_loss": 0.78646219, "learning_rate": 1.0524978548509502e-06, "loss": 0.80743563, "num_input_tokens_seen": 239737105, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.35742188, "step": 11100, "time_per_iteration": 2.3699584007263184 }, { "auxiliary_loss_clip": 0.01051927, "auxiliary_loss_mlp": 0.01045109, "balance_loss_clip": 1.0221374, "balance_loss_mlp": 1.0164187, "epoch": 0.6674282278671276, "flos": 20888743645440.0, "grad_norm": 1.7542286929410327, "language_loss": 0.60885721, "learning_rate": 1.0521548899609288e-06, "loss": 0.62982762, "num_input_tokens_seen": 239757835, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35546875, "step": 11101, "time_per_iteration": 2.4126718044281006 }, { "auxiliary_loss_clip": 0.01055771, "auxiliary_loss_mlp": 0.01040464, "balance_loss_clip": 1.01335621, "balance_loss_mlp": 1.01658773, "epoch": 0.6674883511197955, "flos": 23623036594560.0, "grad_norm": 1.632053212543779, "language_loss": 0.72539759, "learning_rate": 1.0518119610132884e-06, "loss": 0.74636, "num_input_tokens_seen": 239775425, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.390625, "step": 11102, "time_per_iteration": 2.383998394012451 }, { "auxiliary_loss_clip": 0.01053321, "auxiliary_loss_mlp": 0.01038096, "balance_loss_clip": 1.01369333, "balance_loss_mlp": 1.01627111, "epoch": 0.6675484743724636, "flos": 19608653312640.0, "grad_norm": 1.4899469736121012, "language_loss": 0.84968531, "learning_rate": 1.051469068021034e-06, "loss": 0.87059951, "num_input_tokens_seen": 239794605, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37109375, "step": 11103, "time_per_iteration": 2.3735787868499756 }, { "auxiliary_loss_clip": 0.01054011, "auxiliary_loss_mlp": 0.01035397, "balance_loss_clip": 1.01222265, "balance_loss_mlp": 1.01669574, "epoch": 0.6676085976251315, "flos": 14318578529280.0, "grad_norm": 1.9408855611533244, "language_loss": 0.79684138, "learning_rate": 1.0511262109971668e-06, "loss": 0.81773543, "num_input_tokens_seen": 239812135, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37304688, "step": 11104, "time_per_iteration": 3.774425506591797 }, { "auxiliary_loss_clip": 0.01055294, "auxiliary_loss_mlp": 0.01039316, "balance_loss_clip": 1.01570046, "balance_loss_mlp": 1.01751339, "epoch": 0.6676687208777995, "flos": 38103159317760.0, "grad_norm": 1.696436883369857, "language_loss": 0.58837366, "learning_rate": 1.0507833899546889e-06, "loss": 0.60931969, "num_input_tokens_seen": 239835845, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37695312, "step": 11105, "time_per_iteration": 2.517803907394409 }, { "auxiliary_loss_clip": 0.01056351, "auxiliary_loss_mlp": 0.01042784, "balance_loss_clip": 1.01685572, "balance_loss_mlp": 1.01689148, "epoch": 0.6677288441304675, "flos": 23980617406080.0, "grad_norm": 1.6624265309868647, "language_loss": 0.74219739, "learning_rate": 1.0504406049066e-06, "loss": 0.76318872, "num_input_tokens_seen": 239853820, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39453125, "step": 11106, "time_per_iteration": 2.403228282928467 }, { "auxiliary_loss_clip": 0.01052442, "auxiliary_loss_mlp": 0.01033632, "balance_loss_clip": 1.0097301, "balance_loss_mlp": 1.01672149, "epoch": 0.6677889673831354, "flos": 24169532106240.0, "grad_norm": 1.814311614309522, "language_loss": 0.78022659, "learning_rate": 1.0500978558659e-06, "loss": 0.80108726, "num_input_tokens_seen": 239873365, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35742188, "step": 11107, "time_per_iteration": 2.3622937202453613 }, { "auxiliary_loss_clip": 0.01050393, "auxiliary_loss_mlp": 0.0103551, "balance_loss_clip": 1.01321745, "balance_loss_mlp": 1.01549673, "epoch": 0.6678490906358034, "flos": 22308556705920.0, "grad_norm": 2.5874244894986638, "language_loss": 0.90971595, "learning_rate": 1.049755142845583e-06, "loss": 0.93057501, "num_input_tokens_seen": 239891215, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34960938, "step": 11108, "time_per_iteration": 2.3738913536071777 }, { "auxiliary_loss_clip": 0.01050834, "auxiliary_loss_mlp": 0.01031061, "balance_loss_clip": 1.00984216, "balance_loss_mlp": 1.01582289, "epoch": 0.6679092138884714, "flos": 36897399002880.0, "grad_norm": 1.3501642786322143, "language_loss": 0.83394641, "learning_rate": 1.049412465858646e-06, "loss": 0.85476536, "num_input_tokens_seen": 239913490, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.3515625, "step": 11109, "time_per_iteration": 2.4977805614471436 }, { "auxiliary_loss_clip": 0.01052047, "auxiliary_loss_mlp": 0.01039726, "balance_loss_clip": 1.01609921, "balance_loss_mlp": 1.01577854, "epoch": 0.6679693371411394, "flos": 18149318726400.0, "grad_norm": 1.9493853812743032, "language_loss": 0.71636319, "learning_rate": 1.0490698249180847e-06, "loss": 0.73728096, "num_input_tokens_seen": 239931565, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36328125, "step": 11110, "time_per_iteration": 2.3509953022003174 }, { "auxiliary_loss_clip": 0.01054436, "auxiliary_loss_mlp": 0.01039367, "balance_loss_clip": 1.01260495, "balance_loss_mlp": 1.0160675, "epoch": 0.6680294603938073, "flos": 27196955763840.0, "grad_norm": 1.89903469361097, "language_loss": 0.74880332, "learning_rate": 1.04872722003689e-06, "loss": 0.76974142, "num_input_tokens_seen": 239952395, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.3828125, "step": 11111, "time_per_iteration": 2.4025425910949707 }, { "auxiliary_loss_clip": 0.01051649, "auxiliary_loss_mlp": 0.01037858, "balance_loss_clip": 1.01330054, "balance_loss_mlp": 1.01598597, "epoch": 0.6680895836464753, "flos": 21724250325120.0, "grad_norm": 2.8022053310098465, "language_loss": 0.67001224, "learning_rate": 1.0483846512280553e-06, "loss": 0.6909073, "num_input_tokens_seen": 239968910, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.35742188, "step": 11112, "time_per_iteration": 2.366159677505493 }, { "auxiliary_loss_clip": 0.01053112, "auxiliary_loss_mlp": 0.01036593, "balance_loss_clip": 1.01271582, "balance_loss_mlp": 1.01683784, "epoch": 0.6681497068991432, "flos": 19645451752320.0, "grad_norm": 2.3877243277458455, "language_loss": 0.64102721, "learning_rate": 1.048042118504569e-06, "loss": 0.66192424, "num_input_tokens_seen": 239987680, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36328125, "step": 11113, "time_per_iteration": 2.3433260917663574 }, { "auxiliary_loss_clip": 0.01051613, "auxiliary_loss_mlp": 0.0103322, "balance_loss_clip": 1.01051044, "balance_loss_mlp": 1.01591158, "epoch": 0.6682098301518112, "flos": 17418237523200.0, "grad_norm": 1.9516651319348364, "language_loss": 0.66727704, "learning_rate": 1.047699621879422e-06, "loss": 0.68812537, "num_input_tokens_seen": 240005790, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35742188, "step": 11114, "time_per_iteration": 2.320551633834839 }, { "auxiliary_loss_clip": 0.01053731, "auxiliary_loss_mlp": 0.01038421, "balance_loss_clip": 1.01573551, "balance_loss_mlp": 1.0168035, "epoch": 0.6682699534044791, "flos": 22597986810240.0, "grad_norm": 1.577834355332142, "language_loss": 0.79573482, "learning_rate": 1.0473571613655998e-06, "loss": 0.81665635, "num_input_tokens_seen": 240025895, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36914062, "step": 11115, "time_per_iteration": 2.3728630542755127 }, { "auxiliary_loss_clip": 0.01052612, "auxiliary_loss_mlp": 0.01034023, "balance_loss_clip": 1.0095973, "balance_loss_mlp": 1.01555336, "epoch": 0.6683300766571472, "flos": 24862523149440.0, "grad_norm": 1.7020649569688744, "language_loss": 0.80740649, "learning_rate": 1.0470147369760896e-06, "loss": 0.82827282, "num_input_tokens_seen": 240044880, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37109375, "step": 11116, "time_per_iteration": 2.3787641525268555 }, { "auxiliary_loss_clip": 0.01053889, "auxiliary_loss_mlp": 0.01042227, "balance_loss_clip": 1.01563144, "balance_loss_mlp": 1.01682806, "epoch": 0.6683901999098151, "flos": 27125383743360.0, "grad_norm": 1.6017618841728283, "language_loss": 0.79713601, "learning_rate": 1.0466723487238768e-06, "loss": 0.81809711, "num_input_tokens_seen": 240065785, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37109375, "step": 11117, "time_per_iteration": 2.4083964824676514 }, { "auxiliary_loss_clip": 0.01053527, "auxiliary_loss_mlp": 0.01037365, "balance_loss_clip": 1.011127, "balance_loss_mlp": 1.0158962, "epoch": 0.6684503231624831, "flos": 20738023839360.0, "grad_norm": 1.5704368987309416, "language_loss": 0.66644192, "learning_rate": 1.0463299966219441e-06, "loss": 0.68735081, "num_input_tokens_seen": 240085130, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37695312, "step": 11118, "time_per_iteration": 2.3498356342315674 }, { "auxiliary_loss_clip": 0.01051541, "auxiliary_loss_mlp": 0.01033429, "balance_loss_clip": 1.01111281, "balance_loss_mlp": 1.01578665, "epoch": 0.668510446415151, "flos": 21761118587520.0, "grad_norm": 1.4449181094383172, "language_loss": 0.69689739, "learning_rate": 1.0459876806832727e-06, "loss": 0.71774709, "num_input_tokens_seen": 240105495, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35742188, "step": 11119, "time_per_iteration": 2.376066207885742 }, { "auxiliary_loss_clip": 0.01053698, "auxiliary_loss_mlp": 0.01035932, "balance_loss_clip": 1.01189923, "balance_loss_mlp": 1.01684213, "epoch": 0.668570569667819, "flos": 30189920042880.0, "grad_norm": 1.7133694913750732, "language_loss": 0.68963921, "learning_rate": 1.0456454009208448e-06, "loss": 0.71053553, "num_input_tokens_seen": 240125455, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36914062, "step": 11120, "time_per_iteration": 2.414970874786377 }, { "auxiliary_loss_clip": 0.01053265, "auxiliary_loss_mlp": 0.01038104, "balance_loss_clip": 1.01409578, "balance_loss_mlp": 1.01645792, "epoch": 0.668630692920487, "flos": 24169497194880.0, "grad_norm": 2.021859863721085, "language_loss": 0.73561335, "learning_rate": 1.045303157347638e-06, "loss": 0.75652707, "num_input_tokens_seen": 240143870, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 11121, "time_per_iteration": 2.387606620788574 }, { "auxiliary_loss_clip": 0.01055333, "auxiliary_loss_mlp": 0.01040709, "balance_loss_clip": 1.01443481, "balance_loss_mlp": 1.01676726, "epoch": 0.668690816173155, "flos": 17456188037760.0, "grad_norm": 2.617572073354759, "language_loss": 0.7240622, "learning_rate": 1.0449609499766316e-06, "loss": 0.74502265, "num_input_tokens_seen": 240161020, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38476562, "step": 11122, "time_per_iteration": 2.331324815750122 }, { "auxiliary_loss_clip": 0.01053602, "auxiliary_loss_mlp": 0.01034439, "balance_loss_clip": 1.00957143, "balance_loss_mlp": 1.01708257, "epoch": 0.668750939425823, "flos": 25004061267840.0, "grad_norm": 2.2109273225574593, "language_loss": 0.72394323, "learning_rate": 1.0446187788208015e-06, "loss": 0.74482363, "num_input_tokens_seen": 240179820, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36523438, "step": 11123, "time_per_iteration": 2.3914716243743896 }, { "auxiliary_loss_clip": 0.01055872, "auxiliary_loss_mlp": 0.0104363, "balance_loss_clip": 1.01751161, "balance_loss_mlp": 1.01683235, "epoch": 0.6688110626784909, "flos": 24095655936000.0, "grad_norm": 1.687601623462158, "language_loss": 0.80513746, "learning_rate": 1.0442766438931244e-06, "loss": 0.82613248, "num_input_tokens_seen": 240200130, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.390625, "step": 11124, "time_per_iteration": 2.3770549297332764 }, { "auxiliary_loss_clip": 0.01054241, "auxiliary_loss_mlp": 0.01038989, "balance_loss_clip": 1.0145632, "balance_loss_mlp": 1.01687896, "epoch": 0.6688711859311589, "flos": 21758535146880.0, "grad_norm": 2.0008967964980005, "language_loss": 0.74675715, "learning_rate": 1.0439345452065716e-06, "loss": 0.76768947, "num_input_tokens_seen": 240217945, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.375, "step": 11125, "time_per_iteration": 2.387712001800537 }, { "auxiliary_loss_clip": 0.01055385, "auxiliary_loss_mlp": 0.01039111, "balance_loss_clip": 1.01525736, "balance_loss_mlp": 1.01827061, "epoch": 0.6689313091838268, "flos": 22928544362880.0, "grad_norm": 2.779555637617207, "language_loss": 0.68178427, "learning_rate": 1.043592482774116e-06, "loss": 0.70272917, "num_input_tokens_seen": 240237220, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 11126, "time_per_iteration": 2.3853566646575928 }, { "auxiliary_loss_clip": 0.01052366, "auxiliary_loss_mlp": 0.0103593, "balance_loss_clip": 1.01262474, "balance_loss_mlp": 1.01483047, "epoch": 0.6689914324364948, "flos": 20885112864000.0, "grad_norm": 1.8247251928245596, "language_loss": 0.71883649, "learning_rate": 1.0432504566087305e-06, "loss": 0.73971945, "num_input_tokens_seen": 240256000, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.375, "step": 11127, "time_per_iteration": 2.395620346069336 }, { "auxiliary_loss_clip": 0.01055616, "auxiliary_loss_mlp": 0.01046033, "balance_loss_clip": 1.01788759, "balance_loss_mlp": 1.01560271, "epoch": 0.6690515556891627, "flos": 22747100693760.0, "grad_norm": 1.9505001727243227, "language_loss": 0.81481642, "learning_rate": 1.0429084667233827e-06, "loss": 0.83583283, "num_input_tokens_seen": 240275845, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40039062, "step": 11128, "time_per_iteration": 2.37058687210083 }, { "auxiliary_loss_clip": 0.01053569, "auxiliary_loss_mlp": 0.01039338, "balance_loss_clip": 1.01437581, "balance_loss_mlp": 1.01570153, "epoch": 0.6691116789418308, "flos": 23330324822400.0, "grad_norm": 1.7546993793741807, "language_loss": 0.81760806, "learning_rate": 1.0425665131310427e-06, "loss": 0.8385371, "num_input_tokens_seen": 240294095, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37890625, "step": 11129, "time_per_iteration": 3.6070003509521484 }, { "auxiliary_loss_clip": 0.01050959, "auxiliary_loss_mlp": 0.01037585, "balance_loss_clip": 1.01567459, "balance_loss_mlp": 1.0162003, "epoch": 0.6691718021944987, "flos": 32445798364800.0, "grad_norm": 1.7768678947667995, "language_loss": 0.7158798, "learning_rate": 1.0422245958446762e-06, "loss": 0.73676527, "num_input_tokens_seen": 240313460, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34765625, "step": 11130, "time_per_iteration": 2.447805404663086 }, { "auxiliary_loss_clip": 0.01050881, "auxiliary_loss_mlp": 0.01039461, "balance_loss_clip": 1.01608431, "balance_loss_mlp": 1.01603341, "epoch": 0.6692319254471667, "flos": 23730499359360.0, "grad_norm": 1.594577438066387, "language_loss": 0.70763457, "learning_rate": 1.0418827148772486e-06, "loss": 0.72853798, "num_input_tokens_seen": 240333540, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.34960938, "step": 11131, "time_per_iteration": 2.4214019775390625 }, { "auxiliary_loss_clip": 0.01052186, "auxiliary_loss_mlp": 0.01039759, "balance_loss_clip": 1.01366448, "balance_loss_mlp": 1.01563537, "epoch": 0.6692920486998346, "flos": 14427053723520.0, "grad_norm": 2.2815171772407488, "language_loss": 0.67980152, "learning_rate": 1.0415408702417243e-06, "loss": 0.70072091, "num_input_tokens_seen": 240350085, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.36523438, "step": 11132, "time_per_iteration": 2.31101393699646 }, { "auxiliary_loss_clip": 0.0105276, "auxiliary_loss_mlp": 0.0103884, "balance_loss_clip": 1.01329339, "balance_loss_mlp": 1.01578546, "epoch": 0.6693521719525026, "flos": 21506392241280.0, "grad_norm": 1.6375899266586758, "language_loss": 0.75883776, "learning_rate": 1.0411990619510661e-06, "loss": 0.7797538, "num_input_tokens_seen": 240370015, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.36914062, "step": 11133, "time_per_iteration": 3.6703567504882812 }, { "auxiliary_loss_clip": 0.0105752, "auxiliary_loss_mlp": 0.01049725, "balance_loss_clip": 1.02037537, "balance_loss_mlp": 1.01839495, "epoch": 0.6694122952051706, "flos": 25405876638720.0, "grad_norm": 2.0812098729123214, "language_loss": 0.68439066, "learning_rate": 1.0408572900182363e-06, "loss": 0.70546305, "num_input_tokens_seen": 240390770, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.390625, "step": 11134, "time_per_iteration": 3.829972267150879 }, { "auxiliary_loss_clip": 0.01057377, "auxiliary_loss_mlp": 0.01046325, "balance_loss_clip": 1.01620054, "balance_loss_mlp": 1.01802063, "epoch": 0.6694724184578386, "flos": 25660672807680.0, "grad_norm": 1.7955832102487468, "language_loss": 0.78872031, "learning_rate": 1.0405155544561943e-06, "loss": 0.80975729, "num_input_tokens_seen": 240409590, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 0.39453125, "step": 11135, "time_per_iteration": 2.3893089294433594 }, { "auxiliary_loss_clip": 0.01051146, "auxiliary_loss_mlp": 0.01039558, "balance_loss_clip": 1.0159905, "balance_loss_mlp": 1.01625419, "epoch": 0.6695325417105066, "flos": 17708435677440.0, "grad_norm": 2.130713397003565, "language_loss": 0.75296158, "learning_rate": 1.040173855277898e-06, "loss": 0.77386868, "num_input_tokens_seen": 240428180, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.34765625, "step": 11136, "time_per_iteration": 2.3752248287200928 }, { "auxiliary_loss_clip": 0.01056941, "auxiliary_loss_mlp": 0.01039993, "balance_loss_clip": 1.01394558, "balance_loss_mlp": 1.01807249, "epoch": 0.6695926649631745, "flos": 24458962210560.0, "grad_norm": 1.758038412284191, "language_loss": 0.63325661, "learning_rate": 1.0398321924963061e-06, "loss": 0.65422595, "num_input_tokens_seen": 240447815, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38867188, "step": 11137, "time_per_iteration": 2.3739590644836426 }, { "auxiliary_loss_clip": 0.01051563, "auxiliary_loss_mlp": 0.01039374, "balance_loss_clip": 1.01435232, "balance_loss_mlp": 1.01584017, "epoch": 0.6696527882158425, "flos": 24278984818560.0, "grad_norm": 1.9302855700072241, "language_loss": 0.67662716, "learning_rate": 1.0394905661243724e-06, "loss": 0.69753653, "num_input_tokens_seen": 240468635, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.35742188, "step": 11138, "time_per_iteration": 2.3812472820281982 }, { "auxiliary_loss_clip": 0.01050198, "auxiliary_loss_mlp": 0.01036691, "balance_loss_clip": 1.0145061, "balance_loss_mlp": 1.01510119, "epoch": 0.6697129114685104, "flos": 23001652483200.0, "grad_norm": 1.886655235935033, "language_loss": 0.74081814, "learning_rate": 1.039148976175053e-06, "loss": 0.76168704, "num_input_tokens_seen": 240488550, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.3515625, "step": 11139, "time_per_iteration": 2.369271993637085 }, { "auxiliary_loss_clip": 0.010496, "auxiliary_loss_mlp": 0.01036059, "balance_loss_clip": 1.01452971, "balance_loss_mlp": 1.01504755, "epoch": 0.6697730347211784, "flos": 22637019576960.0, "grad_norm": 1.883661104118875, "language_loss": 0.72291058, "learning_rate": 1.0388074226613016e-06, "loss": 0.7437672, "num_input_tokens_seen": 240508330, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34570312, "step": 11140, "time_per_iteration": 2.398655891418457 }, { "auxiliary_loss_clip": 0.01054222, "auxiliary_loss_mlp": 0.01036944, "balance_loss_clip": 1.01155257, "balance_loss_mlp": 1.0160116, "epoch": 0.6698331579738463, "flos": 28875963824640.0, "grad_norm": 2.0742197059580003, "language_loss": 0.76844776, "learning_rate": 1.0384659055960691e-06, "loss": 0.78935945, "num_input_tokens_seen": 240528470, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3828125, "step": 11141, "time_per_iteration": 2.4253556728363037 }, { "auxiliary_loss_clip": 0.01054258, "auxiliary_loss_mlp": 0.01042554, "balance_loss_clip": 1.01698351, "balance_loss_mlp": 1.01644993, "epoch": 0.6698932812265144, "flos": 24205946520960.0, "grad_norm": 6.33558595047457, "language_loss": 0.83553934, "learning_rate": 1.0381244249923052e-06, "loss": 0.85650742, "num_input_tokens_seen": 240547815, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37695312, "step": 11142, "time_per_iteration": 2.3848793506622314 }, { "auxiliary_loss_clip": 0.01052136, "auxiliary_loss_mlp": 0.01038742, "balance_loss_clip": 1.01501942, "balance_loss_mlp": 1.01593375, "epoch": 0.6699534044791823, "flos": 22089197433600.0, "grad_norm": 2.170220690116343, "language_loss": 0.70936733, "learning_rate": 1.037782980862959e-06, "loss": 0.73027617, "num_input_tokens_seen": 240567765, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.36132812, "step": 11143, "time_per_iteration": 2.386200189590454 }, { "auxiliary_loss_clip": 0.01050335, "auxiliary_loss_mlp": 0.01035547, "balance_loss_clip": 1.01405406, "balance_loss_mlp": 1.01531124, "epoch": 0.6700135277318503, "flos": 25191195488640.0, "grad_norm": 1.4403435064305836, "language_loss": 0.70899856, "learning_rate": 1.0374415732209796e-06, "loss": 0.72985744, "num_input_tokens_seen": 240590750, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.3515625, "step": 11144, "time_per_iteration": 3.803497791290283 }, { "auxiliary_loss_clip": 0.01052807, "auxiliary_loss_mlp": 0.01035718, "balance_loss_clip": 1.01163781, "balance_loss_mlp": 1.01634526, "epoch": 0.6700736509845182, "flos": 23439079307520.0, "grad_norm": 1.760328020760937, "language_loss": 0.75283015, "learning_rate": 1.0371002020793114e-06, "loss": 0.77371538, "num_input_tokens_seen": 240608875, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36523438, "step": 11145, "time_per_iteration": 2.3802082538604736 }, { "auxiliary_loss_clip": 0.01054294, "auxiliary_loss_mlp": 0.01043082, "balance_loss_clip": 1.01726127, "balance_loss_mlp": 1.01614809, "epoch": 0.6701337742371862, "flos": 24388786644480.0, "grad_norm": 1.611937137218705, "language_loss": 0.72061288, "learning_rate": 1.0367588674509008e-06, "loss": 0.74158669, "num_input_tokens_seen": 240628565, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38085938, "step": 11146, "time_per_iteration": 2.4337122440338135 }, { "auxiliary_loss_clip": 0.01049834, "auxiliary_loss_mlp": 0.01037902, "balance_loss_clip": 1.01596749, "balance_loss_mlp": 1.01490247, "epoch": 0.6701938974898543, "flos": 14792768881920.0, "grad_norm": 1.9523151288661005, "language_loss": 0.79713207, "learning_rate": 1.0364175693486905e-06, "loss": 0.8180095, "num_input_tokens_seen": 240646325, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34960938, "step": 11147, "time_per_iteration": 2.350590944290161 }, { "auxiliary_loss_clip": 0.01054404, "auxiliary_loss_mlp": 0.01035273, "balance_loss_clip": 1.01075125, "balance_loss_mlp": 1.01755905, "epoch": 0.6702540207425222, "flos": 20153054142720.0, "grad_norm": 1.7225347272687048, "language_loss": 0.71577656, "learning_rate": 1.0360763077856218e-06, "loss": 0.73667336, "num_input_tokens_seen": 240666145, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.3671875, "step": 11148, "time_per_iteration": 2.3787808418273926 }, { "auxiliary_loss_clip": 0.01052853, "auxiliary_loss_mlp": 0.01036759, "balance_loss_clip": 1.01251245, "balance_loss_mlp": 1.01578522, "epoch": 0.6703141439951902, "flos": 21213121887360.0, "grad_norm": 1.7937357302727512, "language_loss": 0.71856451, "learning_rate": 1.035735082774636e-06, "loss": 0.73946071, "num_input_tokens_seen": 240685570, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.37109375, "step": 11149, "time_per_iteration": 2.369661808013916 }, { "auxiliary_loss_clip": 0.01055138, "auxiliary_loss_mlp": 0.01035547, "balance_loss_clip": 1.0122416, "balance_loss_mlp": 1.01716614, "epoch": 0.6703742672478581, "flos": 23111419397760.0, "grad_norm": 1.8041468063171908, "language_loss": 0.74790531, "learning_rate": 1.0353938943286727e-06, "loss": 0.76881218, "num_input_tokens_seen": 240706945, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37890625, "step": 11150, "time_per_iteration": 2.3930537700653076 }, { "auxiliary_loss_clip": 0.01052779, "auxiliary_loss_mlp": 0.01034623, "balance_loss_clip": 1.01020885, "balance_loss_mlp": 1.01629984, "epoch": 0.6704343905005261, "flos": 22527811244160.0, "grad_norm": 2.118659427203162, "language_loss": 0.79482961, "learning_rate": 1.035052742460671e-06, "loss": 0.81570363, "num_input_tokens_seen": 240727990, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36523438, "step": 11151, "time_per_iteration": 2.3773388862609863 }, { "auxiliary_loss_clip": 0.01008369, "auxiliary_loss_mlp": 0.0100662, "balance_loss_clip": 1.00424826, "balance_loss_mlp": 1.00149465, "epoch": 0.670494513753194, "flos": 64789473536640.0, "grad_norm": 0.7936826860557125, "language_loss": 0.55503136, "learning_rate": 1.0347116271835643e-06, "loss": 0.57518119, "num_input_tokens_seen": 240790380, "router_z_loss_clip": 0.02368164, "router_z_loss_mlp": 0.06884766, "step": 11152, "time_per_iteration": 3.1202144622802734 }, { "auxiliary_loss_clip": 0.01054478, "auxiliary_loss_mlp": 0.01039778, "balance_loss_clip": 1.01430249, "balance_loss_mlp": 1.01637793, "epoch": 0.670554637005862, "flos": 23510441859840.0, "grad_norm": 1.5887262492988332, "language_loss": 0.81628597, "learning_rate": 1.0343705485102896e-06, "loss": 0.83722854, "num_input_tokens_seen": 240811545, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38085938, "step": 11153, "time_per_iteration": 2.419187068939209 }, { "auxiliary_loss_clip": 0.01053171, "auxiliary_loss_mlp": 0.01040194, "balance_loss_clip": 1.01593471, "balance_loss_mlp": 1.01618218, "epoch": 0.67061476025853, "flos": 19462402160640.0, "grad_norm": 1.6711944924875497, "language_loss": 0.77149135, "learning_rate": 1.0340295064537814e-06, "loss": 0.79242498, "num_input_tokens_seen": 240831380, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36914062, "step": 11154, "time_per_iteration": 2.3821256160736084 }, { "auxiliary_loss_clip": 0.01055409, "auxiliary_loss_mlp": 0.01040472, "balance_loss_clip": 1.01666629, "balance_loss_mlp": 1.01792169, "epoch": 0.670674883511198, "flos": 20518978769280.0, "grad_norm": 1.5347356468952325, "language_loss": 0.76946187, "learning_rate": 1.0336885010269702e-06, "loss": 0.79042071, "num_input_tokens_seen": 240851855, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.375, "step": 11155, "time_per_iteration": 2.3736956119537354 }, { "auxiliary_loss_clip": 0.01054957, "auxiliary_loss_mlp": 0.01039033, "balance_loss_clip": 1.01433253, "balance_loss_mlp": 1.01799202, "epoch": 0.6707350067638659, "flos": 25482790097280.0, "grad_norm": 1.7510763414102284, "language_loss": 0.8243506, "learning_rate": 1.0333475322427878e-06, "loss": 0.84529048, "num_input_tokens_seen": 240869980, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.36914062, "step": 11156, "time_per_iteration": 2.3882389068603516 }, { "auxiliary_loss_clip": 0.01052999, "auxiliary_loss_mlp": 0.01039583, "balance_loss_clip": 1.01600337, "balance_loss_mlp": 1.0165509, "epoch": 0.6707951300165339, "flos": 22272351759360.0, "grad_norm": 1.7991300997342061, "language_loss": 0.75378656, "learning_rate": 1.033006600114165e-06, "loss": 0.77471238, "num_input_tokens_seen": 240888680, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36523438, "step": 11157, "time_per_iteration": 2.3758904933929443 }, { "auxiliary_loss_clip": 0.01055462, "auxiliary_loss_mlp": 0.01039301, "balance_loss_clip": 1.01377833, "balance_loss_mlp": 1.01744223, "epoch": 0.6708552532692018, "flos": 23983549960320.0, "grad_norm": 1.5749311083830946, "language_loss": 0.75156802, "learning_rate": 1.0326657046540282e-06, "loss": 0.77251565, "num_input_tokens_seen": 240909050, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37890625, "step": 11158, "time_per_iteration": 2.4210853576660156 }, { "auxiliary_loss_clip": 0.01055314, "auxiliary_loss_mlp": 0.01042307, "balance_loss_clip": 1.01747561, "balance_loss_mlp": 1.01711822, "epoch": 0.6709153765218698, "flos": 24936329496960.0, "grad_norm": 1.4301384266384447, "language_loss": 0.8235774, "learning_rate": 1.0323248458753044e-06, "loss": 0.84455365, "num_input_tokens_seen": 240930035, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3828125, "step": 11159, "time_per_iteration": 2.4076154232025146 }, { "auxiliary_loss_clip": 0.0105368, "auxiliary_loss_mlp": 0.01038601, "balance_loss_clip": 1.01303077, "balance_loss_mlp": 1.01708126, "epoch": 0.6709754997745379, "flos": 17529261246720.0, "grad_norm": 1.5246328349125604, "language_loss": 0.77541286, "learning_rate": 1.0319840237909193e-06, "loss": 0.7963357, "num_input_tokens_seen": 240948895, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.36523438, "step": 11160, "time_per_iteration": 2.34653377532959 }, { "auxiliary_loss_clip": 0.01051607, "auxiliary_loss_mlp": 0.0103826, "balance_loss_clip": 1.0145371, "balance_loss_mlp": 1.01582122, "epoch": 0.6710356230272058, "flos": 22089790926720.0, "grad_norm": 2.2475762981924934, "language_loss": 0.74794281, "learning_rate": 1.0316432384137978e-06, "loss": 0.76884139, "num_input_tokens_seen": 240967770, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35742188, "step": 11161, "time_per_iteration": 2.371363401412964 }, { "auxiliary_loss_clip": 0.01056884, "auxiliary_loss_mlp": 0.01043895, "balance_loss_clip": 1.01700151, "balance_loss_mlp": 1.01771951, "epoch": 0.6710957462798738, "flos": 24205318116480.0, "grad_norm": 1.7566980269132053, "language_loss": 0.69537812, "learning_rate": 1.0313024897568618e-06, "loss": 0.71638596, "num_input_tokens_seen": 240988985, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.390625, "step": 11162, "time_per_iteration": 2.411726474761963 }, { "auxiliary_loss_clip": 0.01053505, "auxiliary_loss_mlp": 0.01040361, "balance_loss_clip": 1.01712751, "balance_loss_mlp": 1.01725471, "epoch": 0.6711558695325417, "flos": 19093091132160.0, "grad_norm": 1.6708994586892005, "language_loss": 0.70656508, "learning_rate": 1.030961777833032e-06, "loss": 0.72750378, "num_input_tokens_seen": 241005455, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36328125, "step": 11163, "time_per_iteration": 2.311664342880249 }, { "auxiliary_loss_clip": 0.01051433, "auxiliary_loss_mlp": 0.01040071, "balance_loss_clip": 1.01671791, "balance_loss_mlp": 1.01708364, "epoch": 0.6712159927852097, "flos": 25556666267520.0, "grad_norm": 1.6454346898293588, "language_loss": 0.76693583, "learning_rate": 1.0306211026552291e-06, "loss": 0.78785086, "num_input_tokens_seen": 241026175, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.34375, "step": 11164, "time_per_iteration": 2.429870128631592 }, { "auxiliary_loss_clip": 0.01052387, "auxiliary_loss_mlp": 0.01043147, "balance_loss_clip": 1.01955581, "balance_loss_mlp": 1.01588988, "epoch": 0.6712761160378776, "flos": 22227942643200.0, "grad_norm": 1.9043281867697093, "language_loss": 0.66386366, "learning_rate": 1.0302804642363704e-06, "loss": 0.68481898, "num_input_tokens_seen": 241044040, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36523438, "step": 11165, "time_per_iteration": 2.350022077560425 }, { "auxiliary_loss_clip": 0.0105075, "auxiliary_loss_mlp": 0.01041027, "balance_loss_clip": 1.01718545, "balance_loss_mlp": 1.01548791, "epoch": 0.6713362392905456, "flos": 22454423832960.0, "grad_norm": 1.951279135897549, "language_loss": 0.73526412, "learning_rate": 1.0299398625893738e-06, "loss": 0.75618196, "num_input_tokens_seen": 241063615, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35351562, "step": 11166, "time_per_iteration": 2.4447855949401855 }, { "auxiliary_loss_clip": 0.01051858, "auxiliary_loss_mlp": 0.01035653, "balance_loss_clip": 1.01255059, "balance_loss_mlp": 1.01640475, "epoch": 0.6713963625432136, "flos": 25629006337920.0, "grad_norm": 2.131204506991228, "language_loss": 0.7834419, "learning_rate": 1.0295992977271546e-06, "loss": 0.80431706, "num_input_tokens_seen": 241082520, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35546875, "step": 11167, "time_per_iteration": 2.4093410968780518 }, { "auxiliary_loss_clip": 0.01052535, "auxiliary_loss_mlp": 0.01040723, "balance_loss_clip": 1.01716757, "balance_loss_mlp": 1.01592374, "epoch": 0.6714564857958816, "flos": 35005036423680.0, "grad_norm": 1.8954778320817187, "language_loss": 0.69598007, "learning_rate": 1.029258769662629e-06, "loss": 0.71691263, "num_input_tokens_seen": 241103505, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.3671875, "step": 11168, "time_per_iteration": 2.5489211082458496 }, { "auxiliary_loss_clip": 0.01054949, "auxiliary_loss_mlp": 0.01048122, "balance_loss_clip": 1.02065599, "balance_loss_mlp": 1.01674628, "epoch": 0.6715166090485495, "flos": 26278914896640.0, "grad_norm": 1.8276990544672413, "language_loss": 0.74899304, "learning_rate": 1.0289182784087068e-06, "loss": 0.7700237, "num_input_tokens_seen": 241122885, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.3828125, "step": 11169, "time_per_iteration": 3.627204418182373 }, { "auxiliary_loss_clip": 0.01055665, "auxiliary_loss_mlp": 0.0104197, "balance_loss_clip": 1.01654315, "balance_loss_mlp": 1.0175432, "epoch": 0.6715767323012175, "flos": 15923256572160.0, "grad_norm": 2.3078466258054102, "language_loss": 0.76797116, "learning_rate": 1.0285778239783005e-06, "loss": 0.78894746, "num_input_tokens_seen": 241140865, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38085938, "step": 11170, "time_per_iteration": 2.4076972007751465 }, { "auxiliary_loss_clip": 0.01054488, "auxiliary_loss_mlp": 0.01039476, "balance_loss_clip": 1.01494229, "balance_loss_mlp": 1.01630521, "epoch": 0.6716368555538854, "flos": 17490542682240.0, "grad_norm": 4.208387216710968, "language_loss": 0.76288652, "learning_rate": 1.0282374063843212e-06, "loss": 0.78382611, "num_input_tokens_seen": 241158225, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.3828125, "step": 11171, "time_per_iteration": 2.3279902935028076 }, { "auxiliary_loss_clip": 0.01053865, "auxiliary_loss_mlp": 0.01044386, "balance_loss_clip": 1.02056789, "balance_loss_mlp": 1.01668251, "epoch": 0.6716969788065534, "flos": 16760648465280.0, "grad_norm": 1.4750846901602184, "language_loss": 0.87450647, "learning_rate": 1.0278970256396762e-06, "loss": 0.89548898, "num_input_tokens_seen": 241175215, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 11172, "time_per_iteration": 2.363266706466675 }, { "auxiliary_loss_clip": 0.01053085, "auxiliary_loss_mlp": 0.01041271, "balance_loss_clip": 1.01654696, "balance_loss_mlp": 1.01615357, "epoch": 0.6717571020592215, "flos": 22708731242880.0, "grad_norm": 1.655556468613651, "language_loss": 0.64295065, "learning_rate": 1.0275566817572733e-06, "loss": 0.66389418, "num_input_tokens_seen": 241195250, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.36914062, "step": 11173, "time_per_iteration": 3.8535830974578857 }, { "auxiliary_loss_clip": 0.01056073, "auxiliary_loss_mlp": 0.01044981, "balance_loss_clip": 1.01850498, "balance_loss_mlp": 1.0165751, "epoch": 0.6718172253118894, "flos": 18733101436800.0, "grad_norm": 3.906373021860777, "language_loss": 0.7291038, "learning_rate": 1.02721637475002e-06, "loss": 0.75011432, "num_input_tokens_seen": 241210720, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39648438, "step": 11174, "time_per_iteration": 3.7550208568573 }, { "auxiliary_loss_clip": 0.01051877, "auxiliary_loss_mlp": 0.0103568, "balance_loss_clip": 1.01274443, "balance_loss_mlp": 1.01610827, "epoch": 0.6718773485645574, "flos": 15631627052160.0, "grad_norm": 2.136773607954721, "language_loss": 0.70541888, "learning_rate": 1.0268761046308178e-06, "loss": 0.72629452, "num_input_tokens_seen": 241227395, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35742188, "step": 11175, "time_per_iteration": 2.394313335418701 }, { "auxiliary_loss_clip": 0.0105271, "auxiliary_loss_mlp": 0.01040084, "balance_loss_clip": 1.01601517, "balance_loss_mlp": 1.01679444, "epoch": 0.6719374718172253, "flos": 19353752409600.0, "grad_norm": 2.4315442964137053, "language_loss": 0.75025302, "learning_rate": 1.0265358714125714e-06, "loss": 0.77118099, "num_input_tokens_seen": 241246355, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.359375, "step": 11176, "time_per_iteration": 2.3464784622192383 }, { "auxiliary_loss_clip": 0.01053052, "auxiliary_loss_mlp": 0.01037155, "balance_loss_clip": 1.01234782, "balance_loss_mlp": 1.01590788, "epoch": 0.6719975950698933, "flos": 21980233480320.0, "grad_norm": 1.7800881199918708, "language_loss": 0.73990482, "learning_rate": 1.026195675108182e-06, "loss": 0.76080692, "num_input_tokens_seen": 241264180, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37109375, "step": 11177, "time_per_iteration": 2.401541233062744 }, { "auxiliary_loss_clip": 0.01053307, "auxiliary_loss_mlp": 0.01043656, "balance_loss_clip": 1.01500964, "balance_loss_mlp": 1.01640844, "epoch": 0.6720577183225612, "flos": 25226911676160.0, "grad_norm": 1.9318392640061994, "language_loss": 0.7752713, "learning_rate": 1.025855515730551e-06, "loss": 0.79624093, "num_input_tokens_seen": 241282245, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.36914062, "step": 11178, "time_per_iteration": 2.392406702041626 }, { "auxiliary_loss_clip": 0.01054293, "auxiliary_loss_mlp": 0.01042432, "balance_loss_clip": 1.01855397, "balance_loss_mlp": 1.0165633, "epoch": 0.6721178415752292, "flos": 16944954865920.0, "grad_norm": 1.757826512586554, "language_loss": 0.71767688, "learning_rate": 1.0255153932925766e-06, "loss": 0.73864412, "num_input_tokens_seen": 241300745, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37695312, "step": 11179, "time_per_iteration": 2.3909482955932617 }, { "auxiliary_loss_clip": 0.01053217, "auxiliary_loss_mlp": 0.01037296, "balance_loss_clip": 1.01376438, "balance_loss_mlp": 1.01691258, "epoch": 0.6721779648278972, "flos": 21540362860800.0, "grad_norm": 1.4887267934023418, "language_loss": 0.75062823, "learning_rate": 1.0251753078071557e-06, "loss": 0.77153337, "num_input_tokens_seen": 241319320, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36328125, "step": 11180, "time_per_iteration": 2.3668999671936035 }, { "auxiliary_loss_clip": 0.01053731, "auxiliary_loss_mlp": 0.01036369, "balance_loss_clip": 1.01213336, "balance_loss_mlp": 1.0171535, "epoch": 0.6722380880805652, "flos": 22604235943680.0, "grad_norm": 1.389827792963096, "language_loss": 0.75930154, "learning_rate": 1.0248352592871848e-06, "loss": 0.78020257, "num_input_tokens_seen": 241342225, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36523438, "step": 11181, "time_per_iteration": 2.4671754837036133 }, { "auxiliary_loss_clip": 0.0105414, "auxiliary_loss_mlp": 0.01035932, "balance_loss_clip": 1.01219714, "balance_loss_mlp": 1.01634991, "epoch": 0.6722982113332331, "flos": 15924338824320.0, "grad_norm": 2.256772585669221, "language_loss": 0.76061094, "learning_rate": 1.0244952477455585e-06, "loss": 0.78151166, "num_input_tokens_seen": 241358240, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.37890625, "step": 11182, "time_per_iteration": 2.3252110481262207 }, { "auxiliary_loss_clip": 0.01051224, "auxiliary_loss_mlp": 0.01038532, "balance_loss_clip": 1.01482153, "balance_loss_mlp": 1.01548982, "epoch": 0.6723583345859011, "flos": 20595089266560.0, "grad_norm": 1.780156157528948, "language_loss": 0.70582354, "learning_rate": 1.0241552731951699e-06, "loss": 0.72672117, "num_input_tokens_seen": 241378420, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.35742188, "step": 11183, "time_per_iteration": 3.786536931991577 }, { "auxiliary_loss_clip": 0.01052892, "auxiliary_loss_mlp": 0.01039553, "balance_loss_clip": 1.01423335, "balance_loss_mlp": 1.01614904, "epoch": 0.672418457838569, "flos": 21724773995520.0, "grad_norm": 2.4335847744494923, "language_loss": 0.79111218, "learning_rate": 1.0238153356489112e-06, "loss": 0.81203669, "num_input_tokens_seen": 241397185, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3671875, "step": 11184, "time_per_iteration": 2.3463525772094727 }, { "auxiliary_loss_clip": 0.01058395, "auxiliary_loss_mlp": 0.01039578, "balance_loss_clip": 1.0134716, "balance_loss_mlp": 1.01889586, "epoch": 0.672478581091237, "flos": 21469314510720.0, "grad_norm": 1.8171154627750272, "language_loss": 0.6786105, "learning_rate": 1.0234754351196743e-06, "loss": 0.69959021, "num_input_tokens_seen": 241415785, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.39453125, "step": 11185, "time_per_iteration": 2.425527811050415 }, { "auxiliary_loss_clip": 0.01054328, "auxiliary_loss_mlp": 0.0103919, "balance_loss_clip": 1.01412034, "balance_loss_mlp": 1.01714611, "epoch": 0.6725387043439051, "flos": 30845449330560.0, "grad_norm": 2.223599457505325, "language_loss": 0.81936312, "learning_rate": 1.023135571620345e-06, "loss": 0.84029835, "num_input_tokens_seen": 241437390, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37109375, "step": 11186, "time_per_iteration": 2.4141592979431152 }, { "auxiliary_loss_clip": 0.01052476, "auxiliary_loss_mlp": 0.01039113, "balance_loss_clip": 1.01567614, "balance_loss_mlp": 1.0172627, "epoch": 0.672598827596573, "flos": 24054947424000.0, "grad_norm": 1.648590536068683, "language_loss": 0.81096184, "learning_rate": 1.022795745163813e-06, "loss": 0.83187771, "num_input_tokens_seen": 241458085, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3515625, "step": 11187, "time_per_iteration": 2.444425106048584 }, { "auxiliary_loss_clip": 0.010566, "auxiliary_loss_mlp": 0.01042523, "balance_loss_clip": 1.01622593, "balance_loss_mlp": 1.0173372, "epoch": 0.672658950849241, "flos": 21870780768000.0, "grad_norm": 2.1881549012276404, "language_loss": 0.71703374, "learning_rate": 1.022455955762965e-06, "loss": 0.73802495, "num_input_tokens_seen": 241476880, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.39257812, "step": 11188, "time_per_iteration": 2.358220100402832 }, { "auxiliary_loss_clip": 0.01051154, "auxiliary_loss_mlp": 0.01038438, "balance_loss_clip": 1.01522827, "balance_loss_mlp": 1.01617527, "epoch": 0.6727190741019089, "flos": 23220976844160.0, "grad_norm": 1.8130962046668468, "language_loss": 0.77149785, "learning_rate": 1.0221162034306842e-06, "loss": 0.7923938, "num_input_tokens_seen": 241496535, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.34960938, "step": 11189, "time_per_iteration": 2.4324443340301514 }, { "auxiliary_loss_clip": 0.01056552, "auxiliary_loss_mlp": 0.01046042, "balance_loss_clip": 1.01652586, "balance_loss_mlp": 1.01683569, "epoch": 0.6727791973545769, "flos": 15777703647360.0, "grad_norm": 1.9808707417688924, "language_loss": 0.76791763, "learning_rate": 1.0217764881798562e-06, "loss": 0.78894353, "num_input_tokens_seen": 241513465, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.39648438, "step": 11190, "time_per_iteration": 2.3326523303985596 }, { "auxiliary_loss_clip": 0.01052976, "auxiliary_loss_mlp": 0.01042422, "balance_loss_clip": 1.0160172, "balance_loss_mlp": 1.01641476, "epoch": 0.6728393206072448, "flos": 21248838074880.0, "grad_norm": 1.5945699223644034, "language_loss": 0.7780419, "learning_rate": 1.0214368100233612e-06, "loss": 0.79899585, "num_input_tokens_seen": 241534125, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.3671875, "step": 11191, "time_per_iteration": 2.4357545375823975 }, { "auxiliary_loss_clip": 0.01051176, "auxiliary_loss_mlp": 0.01036234, "balance_loss_clip": 1.01224899, "balance_loss_mlp": 1.0164814, "epoch": 0.6728994438599128, "flos": 32121943793280.0, "grad_norm": 1.7506412938171345, "language_loss": 0.87019533, "learning_rate": 1.0210971689740802e-06, "loss": 0.89106953, "num_input_tokens_seen": 241556340, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.34765625, "step": 11192, "time_per_iteration": 2.4485106468200684 }, { "auxiliary_loss_clip": 0.01054747, "auxiliary_loss_mlp": 0.01042779, "balance_loss_clip": 1.01586175, "balance_loss_mlp": 1.01699913, "epoch": 0.6729595671125808, "flos": 23111244840960.0, "grad_norm": 1.8373273135529498, "language_loss": 0.76596963, "learning_rate": 1.0207575650448923e-06, "loss": 0.78694487, "num_input_tokens_seen": 241575185, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.37890625, "step": 11193, "time_per_iteration": 2.409313678741455 }, { "auxiliary_loss_clip": 0.01052978, "auxiliary_loss_mlp": 0.01040051, "balance_loss_clip": 1.01560068, "balance_loss_mlp": 1.01635408, "epoch": 0.6730196903652488, "flos": 14610522251520.0, "grad_norm": 1.9033375931183367, "language_loss": 0.79331046, "learning_rate": 1.0204179982486758e-06, "loss": 0.81424069, "num_input_tokens_seen": 241592970, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.3671875, "step": 11194, "time_per_iteration": 2.4071316719055176 }, { "auxiliary_loss_clip": 0.0105477, "auxiliary_loss_mlp": 0.01039071, "balance_loss_clip": 1.01519322, "balance_loss_mlp": 1.01688087, "epoch": 0.6730798136179167, "flos": 21104856161280.0, "grad_norm": 1.949109161589922, "language_loss": 0.91388071, "learning_rate": 1.0200784685983075e-06, "loss": 0.9348191, "num_input_tokens_seen": 241610245, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37890625, "step": 11195, "time_per_iteration": 2.4333207607269287 }, { "auxiliary_loss_clip": 0.01052383, "auxiliary_loss_mlp": 0.01038079, "balance_loss_clip": 1.01389146, "balance_loss_mlp": 1.01574063, "epoch": 0.6731399368705847, "flos": 28984997600640.0, "grad_norm": 1.7134904256146055, "language_loss": 0.73134601, "learning_rate": 1.019738976106662e-06, "loss": 0.75225061, "num_input_tokens_seen": 241630350, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3671875, "step": 11196, "time_per_iteration": 2.407958507537842 }, { "auxiliary_loss_clip": 0.01008505, "auxiliary_loss_mlp": 0.01003068, "balance_loss_clip": 1.00079107, "balance_loss_mlp": 1.00156784, "epoch": 0.6732000601232526, "flos": 64740386298240.0, "grad_norm": 0.7951740180139502, "language_loss": 0.56653446, "learning_rate": 1.0193995207866123e-06, "loss": 0.58665019, "num_input_tokens_seen": 241692380, "router_z_loss_clip": 0.02282715, "router_z_loss_mlp": 0.06933594, "step": 11197, "time_per_iteration": 2.929255247116089 }, { "auxiliary_loss_clip": 0.01050724, "auxiliary_loss_mlp": 0.01038087, "balance_loss_clip": 1.01586652, "balance_loss_mlp": 1.01575494, "epoch": 0.6732601833759206, "flos": 17200693641600.0, "grad_norm": 2.0893327541967097, "language_loss": 0.76616001, "learning_rate": 1.0190601026510312e-06, "loss": 0.78704816, "num_input_tokens_seen": 241710430, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34960938, "step": 11198, "time_per_iteration": 2.3472352027893066 }, { "auxiliary_loss_clip": 0.01052776, "auxiliary_loss_mlp": 0.01039254, "balance_loss_clip": 1.01435161, "balance_loss_mlp": 1.01544476, "epoch": 0.6733203066285887, "flos": 18657933546240.0, "grad_norm": 2.153742943219092, "language_loss": 0.83109432, "learning_rate": 1.0187207217127892e-06, "loss": 0.85201466, "num_input_tokens_seen": 241724775, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37304688, "step": 11199, "time_per_iteration": 2.3395397663116455 }, { "auxiliary_loss_clip": 0.01054604, "auxiliary_loss_mlp": 0.01037913, "balance_loss_clip": 1.01259303, "balance_loss_mlp": 1.01709652, "epoch": 0.6733804298812566, "flos": 35807864204160.0, "grad_norm": 1.7069933053319137, "language_loss": 0.72391117, "learning_rate": 1.0183813779847552e-06, "loss": 0.74483633, "num_input_tokens_seen": 241744440, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.375, "step": 11200, "time_per_iteration": 2.4895339012145996 }, { "auxiliary_loss_clip": 0.01054871, "auxiliary_loss_mlp": 0.01040325, "balance_loss_clip": 1.01530313, "balance_loss_mlp": 1.01791716, "epoch": 0.6734405531339246, "flos": 61636714364160.0, "grad_norm": 1.6422479235830079, "language_loss": 0.65071422, "learning_rate": 1.0180420714797987e-06, "loss": 0.67166615, "num_input_tokens_seen": 241771705, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36914062, "step": 11201, "time_per_iteration": 2.732311487197876 }, { "auxiliary_loss_clip": 0.01054782, "auxiliary_loss_mlp": 0.01037845, "balance_loss_clip": 1.01335979, "balance_loss_mlp": 1.01666737, "epoch": 0.6735006763865925, "flos": 20521282919040.0, "grad_norm": 1.8576787956246579, "language_loss": 0.6496762, "learning_rate": 1.0177028022107856e-06, "loss": 0.6706025, "num_input_tokens_seen": 241790830, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.38085938, "step": 11202, "time_per_iteration": 2.3703365325927734 }, { "auxiliary_loss_clip": 0.0105395, "auxiliary_loss_mlp": 0.01035433, "balance_loss_clip": 1.01335597, "balance_loss_mlp": 1.01760232, "epoch": 0.6735607996392605, "flos": 13917985056000.0, "grad_norm": 2.401803121906537, "language_loss": 0.75684547, "learning_rate": 1.0173635701905796e-06, "loss": 0.77773935, "num_input_tokens_seen": 241808165, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.36328125, "step": 11203, "time_per_iteration": 2.348654270172119 }, { "auxiliary_loss_clip": 0.01056084, "auxiliary_loss_mlp": 0.01040279, "balance_loss_clip": 1.01328993, "balance_loss_mlp": 1.01684558, "epoch": 0.6736209228919284, "flos": 18806244468480.0, "grad_norm": 1.7524300373947743, "language_loss": 0.68837124, "learning_rate": 1.0170243754320456e-06, "loss": 0.70933485, "num_input_tokens_seen": 241826925, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.390625, "step": 11204, "time_per_iteration": 2.33200740814209 }, { "auxiliary_loss_clip": 0.01055602, "auxiliary_loss_mlp": 0.01044696, "balance_loss_clip": 1.0196383, "balance_loss_mlp": 1.01784599, "epoch": 0.6736810461445965, "flos": 20372169035520.0, "grad_norm": 1.5906572749603662, "language_loss": 0.74530828, "learning_rate": 1.0166852179480465e-06, "loss": 0.76631129, "num_input_tokens_seen": 241845525, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37890625, "step": 11205, "time_per_iteration": 2.376124382019043 }, { "auxiliary_loss_clip": 0.01049553, "auxiliary_loss_mlp": 0.01035842, "balance_loss_clip": 1.01300144, "balance_loss_mlp": 1.01482725, "epoch": 0.6737411693972644, "flos": 30006242046720.0, "grad_norm": 1.5708399003439901, "language_loss": 0.73156106, "learning_rate": 1.0163460977514416e-06, "loss": 0.752415, "num_input_tokens_seen": 241866815, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34765625, "step": 11206, "time_per_iteration": 2.445042133331299 }, { "auxiliary_loss_clip": 0.01058824, "auxiliary_loss_mlp": 0.01043912, "balance_loss_clip": 1.01654124, "balance_loss_mlp": 1.01839113, "epoch": 0.6738012926499324, "flos": 25446166214400.0, "grad_norm": 1.9174645953156422, "language_loss": 0.68303955, "learning_rate": 1.016007014855092e-06, "loss": 0.70406687, "num_input_tokens_seen": 241887050, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.40429688, "step": 11207, "time_per_iteration": 2.450312376022339 }, { "auxiliary_loss_clip": 0.01052176, "auxiliary_loss_mlp": 0.01041617, "balance_loss_clip": 1.01868117, "balance_loss_mlp": 1.01691866, "epoch": 0.6738614159026003, "flos": 20775834708480.0, "grad_norm": 2.553395698014128, "language_loss": 0.7439664, "learning_rate": 1.0156679692718553e-06, "loss": 0.76490438, "num_input_tokens_seen": 241904280, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.3515625, "step": 11208, "time_per_iteration": 2.3964240550994873 }, { "auxiliary_loss_clip": 0.0105424, "auxiliary_loss_mlp": 0.01039009, "balance_loss_clip": 1.01412988, "balance_loss_mlp": 1.01719582, "epoch": 0.6739215391552683, "flos": 19566059587200.0, "grad_norm": 5.514860337447355, "language_loss": 0.76861751, "learning_rate": 1.0153289610145867e-06, "loss": 0.78955001, "num_input_tokens_seen": 241919190, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.37109375, "step": 11209, "time_per_iteration": 3.5512943267822266 }, { "auxiliary_loss_clip": 0.01049727, "auxiliary_loss_mlp": 0.01034526, "balance_loss_clip": 1.01145935, "balance_loss_mlp": 1.01496768, "epoch": 0.6739816624079362, "flos": 24387075987840.0, "grad_norm": 1.7109167790238478, "language_loss": 0.67575395, "learning_rate": 1.0149899900961428e-06, "loss": 0.6965965, "num_input_tokens_seen": 241940525, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.34570312, "step": 11210, "time_per_iteration": 2.400242567062378 }, { "auxiliary_loss_clip": 0.01049826, "auxiliary_loss_mlp": 0.01034131, "balance_loss_clip": 1.01256609, "balance_loss_mlp": 1.0153445, "epoch": 0.6740417856606042, "flos": 22527078105600.0, "grad_norm": 6.295556336115029, "language_loss": 0.80925012, "learning_rate": 1.014651056529377e-06, "loss": 0.83008969, "num_input_tokens_seen": 241959290, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.34375, "step": 11211, "time_per_iteration": 2.3902089595794678 }, { "auxiliary_loss_clip": 0.01050896, "auxiliary_loss_mlp": 0.01031113, "balance_loss_clip": 1.00891602, "balance_loss_mlp": 1.01610374, "epoch": 0.6741019089132723, "flos": 25774279971840.0, "grad_norm": 1.5331738405103945, "language_loss": 0.77070451, "learning_rate": 1.014312160327143e-06, "loss": 0.79152459, "num_input_tokens_seen": 241980715, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34765625, "step": 11212, "time_per_iteration": 3.783914566040039 }, { "auxiliary_loss_clip": 0.01052622, "auxiliary_loss_mlp": 0.01040075, "balance_loss_clip": 1.01617384, "balance_loss_mlp": 1.01603556, "epoch": 0.6741620321659402, "flos": 21104611781760.0, "grad_norm": 2.272392810555447, "language_loss": 0.79225612, "learning_rate": 1.0139733015022905e-06, "loss": 0.81318307, "num_input_tokens_seen": 241999985, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36523438, "step": 11213, "time_per_iteration": 3.7933425903320312 }, { "auxiliary_loss_clip": 0.01053912, "auxiliary_loss_mlp": 0.01039243, "balance_loss_clip": 1.01329112, "balance_loss_mlp": 1.01594782, "epoch": 0.6742221554186082, "flos": 20739385382400.0, "grad_norm": 1.7533477792829248, "language_loss": 0.69426835, "learning_rate": 1.0136344800676685e-06, "loss": 0.71519989, "num_input_tokens_seen": 242018990, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37890625, "step": 11214, "time_per_iteration": 2.3653347492218018 }, { "auxiliary_loss_clip": 0.01052808, "auxiliary_loss_mlp": 0.01040944, "balance_loss_clip": 1.01731682, "balance_loss_mlp": 1.01649487, "epoch": 0.6742822786712761, "flos": 37772776321920.0, "grad_norm": 2.514347004734488, "language_loss": 0.73163247, "learning_rate": 1.0132956960361263e-06, "loss": 0.75257003, "num_input_tokens_seen": 242039340, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36328125, "step": 11215, "time_per_iteration": 2.508117198944092 }, { "auxiliary_loss_clip": 0.01053148, "auxiliary_loss_mlp": 0.01037887, "balance_loss_clip": 1.01477218, "balance_loss_mlp": 1.01641965, "epoch": 0.6743424019239441, "flos": 37262520668160.0, "grad_norm": 2.483328537531991, "language_loss": 0.68133378, "learning_rate": 1.0129569494205096e-06, "loss": 0.70224416, "num_input_tokens_seen": 242062215, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.3671875, "step": 11216, "time_per_iteration": 2.4973838329315186 }, { "auxiliary_loss_clip": 0.01007918, "auxiliary_loss_mlp": 0.01007348, "balance_loss_clip": 1.00511837, "balance_loss_mlp": 1.00111365, "epoch": 0.674402525176612, "flos": 65994011953920.0, "grad_norm": 0.7531869504142047, "language_loss": 0.56323338, "learning_rate": 1.0126182402336646e-06, "loss": 0.58338606, "num_input_tokens_seen": 242131130, "router_z_loss_clip": 0.02233887, "router_z_loss_mlp": 0.06835938, "step": 11217, "time_per_iteration": 3.0888004302978516 }, { "auxiliary_loss_clip": 0.0105226, "auxiliary_loss_mlp": 0.01040481, "balance_loss_clip": 1.01543546, "balance_loss_mlp": 1.01634264, "epoch": 0.67446264842928, "flos": 26460218920320.0, "grad_norm": 1.8496034563233383, "language_loss": 0.75967765, "learning_rate": 1.0122795684884363e-06, "loss": 0.78060502, "num_input_tokens_seen": 242149720, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.359375, "step": 11218, "time_per_iteration": 2.4399452209472656 }, { "auxiliary_loss_clip": 0.01052507, "auxiliary_loss_mlp": 0.01043139, "balance_loss_clip": 1.0179497, "balance_loss_mlp": 1.01618588, "epoch": 0.674522771681948, "flos": 23731267409280.0, "grad_norm": 1.5400469968713306, "language_loss": 0.66781509, "learning_rate": 1.0119409341976639e-06, "loss": 0.68877155, "num_input_tokens_seen": 242168875, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36328125, "step": 11219, "time_per_iteration": 2.3657610416412354 }, { "auxiliary_loss_clip": 0.01055392, "auxiliary_loss_mlp": 0.01039176, "balance_loss_clip": 1.01454699, "balance_loss_mlp": 1.0168941, "epoch": 0.674582894934616, "flos": 24753175171200.0, "grad_norm": 5.100870799851118, "language_loss": 0.76080352, "learning_rate": 1.0116023373741904e-06, "loss": 0.78174919, "num_input_tokens_seen": 242188465, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.38476562, "step": 11220, "time_per_iteration": 2.423861265182495 }, { "auxiliary_loss_clip": 0.01054428, "auxiliary_loss_mlp": 0.01039989, "balance_loss_clip": 1.01484752, "balance_loss_mlp": 1.01705027, "epoch": 0.6746430181872839, "flos": 24825480330240.0, "grad_norm": 1.543893623072826, "language_loss": 0.71649611, "learning_rate": 1.0112637780308554e-06, "loss": 0.73744029, "num_input_tokens_seen": 242208675, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37304688, "step": 11221, "time_per_iteration": 2.3897268772125244 }, { "auxiliary_loss_clip": 0.01051359, "auxiliary_loss_mlp": 0.01040336, "balance_loss_clip": 1.01707792, "balance_loss_mlp": 1.01619387, "epoch": 0.6747031414399519, "flos": 16872544972800.0, "grad_norm": 2.281338816938866, "language_loss": 0.59331489, "learning_rate": 1.010925256180498e-06, "loss": 0.61423182, "num_input_tokens_seen": 242227440, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3515625, "step": 11222, "time_per_iteration": 2.3602776527404785 }, { "auxiliary_loss_clip": 0.01052646, "auxiliary_loss_mlp": 0.01040442, "balance_loss_clip": 1.01488304, "balance_loss_mlp": 1.01633692, "epoch": 0.6747632646926198, "flos": 22783794399360.0, "grad_norm": 1.617190723229836, "language_loss": 0.77581966, "learning_rate": 1.0105867718359528e-06, "loss": 0.79675055, "num_input_tokens_seen": 242245240, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.36328125, "step": 11223, "time_per_iteration": 3.811629056930542 }, { "auxiliary_loss_clip": 0.01054491, "auxiliary_loss_mlp": 0.01040725, "balance_loss_clip": 1.01515436, "balance_loss_mlp": 1.01690674, "epoch": 0.6748233879452878, "flos": 20045102618880.0, "grad_norm": 1.673007834055559, "language_loss": 0.75642747, "learning_rate": 1.0102483250100574e-06, "loss": 0.77737963, "num_input_tokens_seen": 242263435, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.375, "step": 11224, "time_per_iteration": 2.365281343460083 }, { "auxiliary_loss_clip": 0.01051037, "auxiliary_loss_mlp": 0.01034796, "balance_loss_clip": 1.01365995, "balance_loss_mlp": 1.01593423, "epoch": 0.6748835111979558, "flos": 23001722305920.0, "grad_norm": 1.6241218338514314, "language_loss": 0.63629115, "learning_rate": 1.0099099157156445e-06, "loss": 0.65714943, "num_input_tokens_seen": 242282765, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.3515625, "step": 11225, "time_per_iteration": 2.3551177978515625 }, { "auxiliary_loss_clip": 0.01050001, "auxiliary_loss_mlp": 0.01036246, "balance_loss_clip": 1.01464486, "balance_loss_mlp": 1.01591051, "epoch": 0.6749436344506238, "flos": 12196662560640.0, "grad_norm": 1.7124872275994794, "language_loss": 0.64428532, "learning_rate": 1.0095715439655462e-06, "loss": 0.66514784, "num_input_tokens_seen": 242298980, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34179688, "step": 11226, "time_per_iteration": 2.338573694229126 }, { "auxiliary_loss_clip": 0.01054104, "auxiliary_loss_mlp": 0.01039771, "balance_loss_clip": 1.01572669, "balance_loss_mlp": 1.01711082, "epoch": 0.6750037577032918, "flos": 11872947634560.0, "grad_norm": 2.1996949634505496, "language_loss": 0.73554635, "learning_rate": 1.0092332097725945e-06, "loss": 0.7564851, "num_input_tokens_seen": 242315420, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37109375, "step": 11227, "time_per_iteration": 2.3238253593444824 }, { "auxiliary_loss_clip": 0.01051943, "auxiliary_loss_mlp": 0.01034215, "balance_loss_clip": 1.01114774, "balance_loss_mlp": 1.01623905, "epoch": 0.6750638809559597, "flos": 17018656479360.0, "grad_norm": 2.0290034100689844, "language_loss": 0.72095686, "learning_rate": 1.0088949131496183e-06, "loss": 0.74181843, "num_input_tokens_seen": 242332805, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35742188, "step": 11228, "time_per_iteration": 2.4230082035064697 }, { "auxiliary_loss_clip": 0.01009732, "auxiliary_loss_mlp": 0.01002175, "balance_loss_clip": 0.9997192, "balance_loss_mlp": 1.00286996, "epoch": 0.6751240042086277, "flos": 70947384785280.0, "grad_norm": 0.7591488868751114, "language_loss": 0.5330714, "learning_rate": 1.0085566541094482e-06, "loss": 0.55319047, "num_input_tokens_seen": 242396160, "router_z_loss_clip": 0.02453613, "router_z_loss_mlp": 0.06835938, "step": 11229, "time_per_iteration": 3.048752546310425 }, { "auxiliary_loss_clip": 0.01052747, "auxiliary_loss_mlp": 0.01034285, "balance_loss_clip": 1.01099193, "balance_loss_mlp": 1.01641679, "epoch": 0.6751841274612956, "flos": 22674027484800.0, "grad_norm": 2.052975486513101, "language_loss": 0.81281394, "learning_rate": 1.0082184326649072e-06, "loss": 0.83368421, "num_input_tokens_seen": 242414660, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36328125, "step": 11230, "time_per_iteration": 2.4306225776672363 }, { "auxiliary_loss_clip": 0.01050639, "auxiliary_loss_mlp": 0.01036429, "balance_loss_clip": 1.01383841, "balance_loss_mlp": 1.01652777, "epoch": 0.6752442507139637, "flos": 21287556639360.0, "grad_norm": 1.5598429293008773, "language_loss": 0.67060828, "learning_rate": 1.0078802488288228e-06, "loss": 0.69147897, "num_input_tokens_seen": 242434225, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.33984375, "step": 11231, "time_per_iteration": 2.3528716564178467 }, { "auxiliary_loss_clip": 0.01055204, "auxiliary_loss_mlp": 0.01043799, "balance_loss_clip": 1.01647604, "balance_loss_mlp": 1.01662707, "epoch": 0.6753043739666316, "flos": 28255661965440.0, "grad_norm": 3.0749381550938386, "language_loss": 0.68208766, "learning_rate": 1.0075421026140198e-06, "loss": 0.70307767, "num_input_tokens_seen": 242454355, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.38476562, "step": 11232, "time_per_iteration": 2.427762746810913 }, { "auxiliary_loss_clip": 0.01051437, "auxiliary_loss_mlp": 0.01035354, "balance_loss_clip": 1.01266837, "balance_loss_mlp": 1.01607144, "epoch": 0.6753644972192996, "flos": 21359303216640.0, "grad_norm": 1.5474324197997924, "language_loss": 0.73380989, "learning_rate": 1.0072039940333188e-06, "loss": 0.75467777, "num_input_tokens_seen": 242474935, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35351562, "step": 11233, "time_per_iteration": 2.3637351989746094 }, { "auxiliary_loss_clip": 0.01053385, "auxiliary_loss_mlp": 0.0104464, "balance_loss_clip": 1.02027345, "balance_loss_mlp": 1.01660776, "epoch": 0.6754246204719675, "flos": 26540763160320.0, "grad_norm": 1.7190881463842758, "language_loss": 0.77959764, "learning_rate": 1.0068659230995418e-06, "loss": 0.80057788, "num_input_tokens_seen": 242495530, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36914062, "step": 11234, "time_per_iteration": 2.4112913608551025 }, { "auxiliary_loss_clip": 0.01052186, "auxiliary_loss_mlp": 0.01035445, "balance_loss_clip": 1.01274753, "balance_loss_mlp": 1.0160284, "epoch": 0.6754847437246355, "flos": 25555514192640.0, "grad_norm": 1.4980263651276857, "language_loss": 0.76174974, "learning_rate": 1.0065278898255101e-06, "loss": 0.78262603, "num_input_tokens_seen": 242514550, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36132812, "step": 11235, "time_per_iteration": 2.3982231616973877 }, { "auxiliary_loss_clip": 0.01009136, "auxiliary_loss_mlp": 0.01004784, "balance_loss_clip": 1.00220931, "balance_loss_mlp": 1.0022217, "epoch": 0.6755448669773034, "flos": 59510502432000.0, "grad_norm": 0.7978702145228411, "language_loss": 0.515154, "learning_rate": 1.0061898942240387e-06, "loss": 0.53529322, "num_input_tokens_seen": 242569200, "router_z_loss_clip": 0.02575684, "router_z_loss_mlp": 0.06933594, "step": 11236, "time_per_iteration": 2.9726338386535645 }, { "auxiliary_loss_clip": 0.01052986, "auxiliary_loss_mlp": 0.01038955, "balance_loss_clip": 1.01403999, "balance_loss_mlp": 1.0172559, "epoch": 0.6756049902299714, "flos": 23293421648640.0, "grad_norm": 2.251847424990654, "language_loss": 0.77061641, "learning_rate": 1.0058519363079464e-06, "loss": 0.79153585, "num_input_tokens_seen": 242586950, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.35742188, "step": 11237, "time_per_iteration": 2.3756790161132812 }, { "auxiliary_loss_clip": 0.01053566, "auxiliary_loss_mlp": 0.01044755, "balance_loss_clip": 1.01998305, "balance_loss_mlp": 1.01745725, "epoch": 0.6756651134826394, "flos": 31574121649920.0, "grad_norm": 1.9260401935067815, "language_loss": 0.78808886, "learning_rate": 1.0055140160900482e-06, "loss": 0.80907202, "num_input_tokens_seen": 242607380, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36132812, "step": 11238, "time_per_iteration": 2.4313414096832275 }, { "auxiliary_loss_clip": 0.0105598, "auxiliary_loss_mlp": 0.01042293, "balance_loss_clip": 1.01649618, "balance_loss_mlp": 1.01728463, "epoch": 0.6757252367353074, "flos": 27271041402240.0, "grad_norm": 1.701257224388742, "language_loss": 0.68104428, "learning_rate": 1.0051761335831587e-06, "loss": 0.70202702, "num_input_tokens_seen": 242628025, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38671875, "step": 11239, "time_per_iteration": 2.3921566009521484 }, { "auxiliary_loss_clip": 0.0105221, "auxiliary_loss_mlp": 0.01036175, "balance_loss_clip": 1.01348913, "balance_loss_mlp": 1.01679671, "epoch": 0.6757853599879754, "flos": 16830125804160.0, "grad_norm": 1.7479341775624346, "language_loss": 0.83706176, "learning_rate": 1.0048382888000898e-06, "loss": 0.85794556, "num_input_tokens_seen": 242643825, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35351562, "step": 11240, "time_per_iteration": 2.3573455810546875 }, { "auxiliary_loss_clip": 0.01057452, "auxiliary_loss_mlp": 0.01048263, "balance_loss_clip": 1.01933074, "balance_loss_mlp": 1.01769531, "epoch": 0.6758454832406433, "flos": 23218986896640.0, "grad_norm": 2.0989220104304906, "language_loss": 0.75970536, "learning_rate": 1.0045004817536525e-06, "loss": 0.78076249, "num_input_tokens_seen": 242661820, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.3984375, "step": 11241, "time_per_iteration": 2.3480899333953857 }, { "auxiliary_loss_clip": 0.01054355, "auxiliary_loss_mlp": 0.01040837, "balance_loss_clip": 1.01555252, "balance_loss_mlp": 1.01720202, "epoch": 0.6759056064933113, "flos": 16288622616960.0, "grad_norm": 2.2666344290488536, "language_loss": 0.81797755, "learning_rate": 1.0041627124566572e-06, "loss": 0.83892941, "num_input_tokens_seen": 242679890, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37109375, "step": 11242, "time_per_iteration": 2.3419365882873535 }, { "auxiliary_loss_clip": 0.0105158, "auxiliary_loss_mlp": 0.01038313, "balance_loss_clip": 1.01485217, "balance_loss_mlp": 1.01556087, "epoch": 0.6759657297459792, "flos": 25921089705600.0, "grad_norm": 1.965817171642329, "language_loss": 0.73769748, "learning_rate": 1.0038249809219109e-06, "loss": 0.75859642, "num_input_tokens_seen": 242699495, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36132812, "step": 11243, "time_per_iteration": 2.4012815952301025 }, { "auxiliary_loss_clip": 0.01053362, "auxiliary_loss_mlp": 0.01041304, "balance_loss_clip": 1.01741409, "balance_loss_mlp": 1.01702034, "epoch": 0.6760258529986473, "flos": 22999767269760.0, "grad_norm": 1.746431872682607, "language_loss": 0.73671883, "learning_rate": 1.003487287162221e-06, "loss": 0.75766551, "num_input_tokens_seen": 242719500, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36328125, "step": 11244, "time_per_iteration": 2.3787403106689453 }, { "auxiliary_loss_clip": 0.01053845, "auxiliary_loss_mlp": 0.01044859, "balance_loss_clip": 1.02049255, "balance_loss_mlp": 1.01675367, "epoch": 0.6760859762513152, "flos": 20958290807040.0, "grad_norm": 1.8975951159605597, "language_loss": 0.86797643, "learning_rate": 1.003149631190393e-06, "loss": 0.88896346, "num_input_tokens_seen": 242738325, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37109375, "step": 11245, "time_per_iteration": 2.3712878227233887 }, { "auxiliary_loss_clip": 0.01056182, "auxiliary_loss_mlp": 0.01040997, "balance_loss_clip": 1.01605821, "balance_loss_mlp": 1.01714551, "epoch": 0.6761460995039832, "flos": 23621814696960.0, "grad_norm": 1.7636016090495583, "language_loss": 0.75315988, "learning_rate": 1.0028120130192327e-06, "loss": 0.77413166, "num_input_tokens_seen": 242756620, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.390625, "step": 11246, "time_per_iteration": 2.3789920806884766 }, { "auxiliary_loss_clip": 0.01052206, "auxiliary_loss_mlp": 0.01035558, "balance_loss_clip": 1.01219273, "balance_loss_mlp": 1.0157702, "epoch": 0.6762062227566511, "flos": 20770004511360.0, "grad_norm": 1.7842532650098828, "language_loss": 0.88562119, "learning_rate": 1.002474432661539e-06, "loss": 0.90649885, "num_input_tokens_seen": 242774505, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 11247, "time_per_iteration": 2.3573360443115234 }, { "auxiliary_loss_clip": 0.01007944, "auxiliary_loss_mlp": 0.01004182, "balance_loss_clip": 1.00194085, "balance_loss_mlp": 1.00108695, "epoch": 0.6762663460093191, "flos": 52814963157120.0, "grad_norm": 0.8379559024183869, "language_loss": 0.54075325, "learning_rate": 1.002136890130115e-06, "loss": 0.56087452, "num_input_tokens_seen": 242828645, "router_z_loss_clip": 0.02246094, "router_z_loss_mlp": 0.06835938, "step": 11248, "time_per_iteration": 4.254245758056641 }, { "auxiliary_loss_clip": 0.01049191, "auxiliary_loss_mlp": 0.01038225, "balance_loss_clip": 1.01664805, "balance_loss_mlp": 1.01621783, "epoch": 0.676326469261987, "flos": 23695167196800.0, "grad_norm": 1.985554267292256, "language_loss": 0.74695665, "learning_rate": 1.001799385437761e-06, "loss": 0.76783079, "num_input_tokens_seen": 242850100, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.33007812, "step": 11249, "time_per_iteration": 2.422506809234619 }, { "auxiliary_loss_clip": 0.01053067, "auxiliary_loss_mlp": 0.01047079, "balance_loss_clip": 1.01981568, "balance_loss_mlp": 1.01568723, "epoch": 0.676386592514655, "flos": 14062874664960.0, "grad_norm": 2.3508218292613954, "language_loss": 0.75480735, "learning_rate": 1.0014619185972732e-06, "loss": 0.77580881, "num_input_tokens_seen": 242867775, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.37304688, "step": 11250, "time_per_iteration": 2.373190402984619 }, { "auxiliary_loss_clip": 0.01054302, "auxiliary_loss_mlp": 0.01043086, "balance_loss_clip": 1.01902938, "balance_loss_mlp": 1.01720679, "epoch": 0.676446715767323, "flos": 20411201802240.0, "grad_norm": 2.474140781676039, "language_loss": 0.77035689, "learning_rate": 1.0011244896214497e-06, "loss": 0.79133081, "num_input_tokens_seen": 242886865, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37109375, "step": 11251, "time_per_iteration": 2.3637707233428955 }, { "auxiliary_loss_clip": 0.01052734, "auxiliary_loss_mlp": 0.0103694, "balance_loss_clip": 1.01264477, "balance_loss_mlp": 1.01679325, "epoch": 0.676506839019991, "flos": 21287172614400.0, "grad_norm": 1.5594920691893785, "language_loss": 0.71216697, "learning_rate": 1.0007870985230873e-06, "loss": 0.7330637, "num_input_tokens_seen": 242906705, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.359375, "step": 11252, "time_per_iteration": 3.7509493827819824 }, { "auxiliary_loss_clip": 0.01054972, "auxiliary_loss_mlp": 0.01039842, "balance_loss_clip": 1.01634562, "balance_loss_mlp": 1.01746178, "epoch": 0.676566962272659, "flos": 29931248712960.0, "grad_norm": 1.7012152859068883, "language_loss": 0.68295085, "learning_rate": 1.0004497453149765e-06, "loss": 0.70389897, "num_input_tokens_seen": 242925215, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.375, "step": 11253, "time_per_iteration": 3.8877933025360107 }, { "auxiliary_loss_clip": 0.0105668, "auxiliary_loss_mlp": 0.01044338, "balance_loss_clip": 1.0169673, "balance_loss_mlp": 1.01819921, "epoch": 0.6766270855253269, "flos": 17930238744960.0, "grad_norm": 1.8017611255494455, "language_loss": 0.78054392, "learning_rate": 1.0001124300099115e-06, "loss": 0.80155408, "num_input_tokens_seen": 242944750, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.38476562, "step": 11254, "time_per_iteration": 2.352283239364624 }, { "auxiliary_loss_clip": 0.01054465, "auxiliary_loss_mlp": 0.01037803, "balance_loss_clip": 1.01386535, "balance_loss_mlp": 1.01736975, "epoch": 0.6766872087779949, "flos": 23103948366720.0, "grad_norm": 1.8887426335313948, "language_loss": 0.73382336, "learning_rate": 9.997751526206835e-07, "loss": 0.75474608, "num_input_tokens_seen": 242963860, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37109375, "step": 11255, "time_per_iteration": 2.4247100353240967 }, { "auxiliary_loss_clip": 0.01053473, "auxiliary_loss_mlp": 0.01045862, "balance_loss_clip": 1.01958871, "balance_loss_mlp": 1.01622665, "epoch": 0.6767473320306628, "flos": 26211951175680.0, "grad_norm": 1.9549020741031604, "language_loss": 0.7714802, "learning_rate": 9.994379131600828e-07, "loss": 0.7924735, "num_input_tokens_seen": 242983050, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.37304688, "step": 11256, "time_per_iteration": 2.4050979614257812 }, { "auxiliary_loss_clip": 0.01054935, "auxiliary_loss_mlp": 0.01037057, "balance_loss_clip": 1.01193953, "balance_loss_mlp": 1.01843023, "epoch": 0.6768074552833309, "flos": 18367770303360.0, "grad_norm": 2.0767070682364213, "language_loss": 0.66376221, "learning_rate": 9.991007116408965e-07, "loss": 0.68468213, "num_input_tokens_seen": 243001125, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36523438, "step": 11257, "time_per_iteration": 2.3848414421081543 }, { "auxiliary_loss_clip": 0.01050617, "auxiliary_loss_mlp": 0.01036536, "balance_loss_clip": 1.01387405, "balance_loss_mlp": 1.01533937, "epoch": 0.6768675785359988, "flos": 23038800036480.0, "grad_norm": 1.594611908202567, "language_loss": 0.7636444, "learning_rate": 9.987635480759109e-07, "loss": 0.78451592, "num_input_tokens_seen": 243021865, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 11258, "time_per_iteration": 2.360917806625366 }, { "auxiliary_loss_clip": 0.01051723, "auxiliary_loss_mlp": 0.01040643, "balance_loss_clip": 1.01588309, "balance_loss_mlp": 1.01611567, "epoch": 0.6769277017886668, "flos": 33035131981440.0, "grad_norm": 1.5451505589981567, "language_loss": 0.6802907, "learning_rate": 9.984264224779127e-07, "loss": 0.70121431, "num_input_tokens_seen": 243042970, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.35546875, "step": 11259, "time_per_iteration": 2.461118698120117 }, { "auxiliary_loss_clip": 0.01054307, "auxiliary_loss_mlp": 0.01038829, "balance_loss_clip": 1.0150708, "balance_loss_mlp": 1.01704264, "epoch": 0.6769878250413347, "flos": 20847406728960.0, "grad_norm": 2.014787359444734, "language_loss": 0.86561245, "learning_rate": 9.980893348596839e-07, "loss": 0.88654387, "num_input_tokens_seen": 243058470, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.37304688, "step": 11260, "time_per_iteration": 2.3591816425323486 }, { "auxiliary_loss_clip": 0.01054857, "auxiliary_loss_mlp": 0.01037874, "balance_loss_clip": 1.01181448, "balance_loss_mlp": 1.01662123, "epoch": 0.6770479482940027, "flos": 15595072992000.0, "grad_norm": 2.4520220412066824, "language_loss": 0.78611684, "learning_rate": 9.977522852340081e-07, "loss": 0.80704415, "num_input_tokens_seen": 243076630, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3828125, "step": 11261, "time_per_iteration": 2.3850655555725098 }, { "auxiliary_loss_clip": 0.01054355, "auxiliary_loss_mlp": 0.01048343, "balance_loss_clip": 1.01871967, "balance_loss_mlp": 1.01611567, "epoch": 0.6771080715466706, "flos": 18620122677120.0, "grad_norm": 1.8252450479222821, "language_loss": 0.88763559, "learning_rate": 9.97415273613666e-07, "loss": 0.90866256, "num_input_tokens_seen": 243092260, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.3828125, "step": 11262, "time_per_iteration": 2.346998691558838 }, { "auxiliary_loss_clip": 0.01055462, "auxiliary_loss_mlp": 0.01038132, "balance_loss_clip": 1.01258588, "balance_loss_mlp": 1.01787972, "epoch": 0.6771681947993387, "flos": 12494611036800.0, "grad_norm": 1.9814405248972815, "language_loss": 0.75111645, "learning_rate": 9.97078300011439e-07, "loss": 0.77205247, "num_input_tokens_seen": 243109405, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.375, "step": 11263, "time_per_iteration": 3.758420944213867 }, { "auxiliary_loss_clip": 0.01053714, "auxiliary_loss_mlp": 0.01047035, "balance_loss_clip": 1.02028441, "balance_loss_mlp": 1.01604259, "epoch": 0.6772283180520066, "flos": 22235867521920.0, "grad_norm": 2.1154180733141854, "language_loss": 0.69256043, "learning_rate": 9.967413644401016e-07, "loss": 0.71356797, "num_input_tokens_seen": 243128135, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.37695312, "step": 11264, "time_per_iteration": 2.369351387023926 }, { "auxiliary_loss_clip": 0.01053264, "auxiliary_loss_mlp": 0.01039012, "balance_loss_clip": 1.01458621, "balance_loss_mlp": 1.01742959, "epoch": 0.6772884413046746, "flos": 16142231819520.0, "grad_norm": 2.2737909251482455, "language_loss": 0.7466321, "learning_rate": 9.964044669124324e-07, "loss": 0.76755488, "num_input_tokens_seen": 243146785, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.359375, "step": 11265, "time_per_iteration": 2.3745946884155273 }, { "auxiliary_loss_clip": 0.0105203, "auxiliary_loss_mlp": 0.01036715, "balance_loss_clip": 1.01329088, "balance_loss_mlp": 1.01638627, "epoch": 0.6773485645573426, "flos": 19134742250880.0, "grad_norm": 2.765439884532397, "language_loss": 0.62763751, "learning_rate": 9.96067607441207e-07, "loss": 0.648525, "num_input_tokens_seen": 243165275, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35546875, "step": 11266, "time_per_iteration": 2.348555564880371 }, { "auxiliary_loss_clip": 0.01055355, "auxiliary_loss_mlp": 0.01041293, "balance_loss_clip": 1.0155201, "balance_loss_mlp": 1.01764941, "epoch": 0.6774086878100105, "flos": 14136052608000.0, "grad_norm": 2.059845012442261, "language_loss": 0.72085965, "learning_rate": 9.957307860391976e-07, "loss": 0.74182606, "num_input_tokens_seen": 243182845, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37695312, "step": 11267, "time_per_iteration": 2.352788209915161 }, { "auxiliary_loss_clip": 0.01053946, "auxiliary_loss_mlp": 0.01040905, "balance_loss_clip": 1.01550162, "balance_loss_mlp": 1.01749241, "epoch": 0.6774688110626785, "flos": 22196066705280.0, "grad_norm": 1.9447508611679605, "language_loss": 0.7184279, "learning_rate": 9.953940027191785e-07, "loss": 0.73937643, "num_input_tokens_seen": 243201475, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36328125, "step": 11268, "time_per_iteration": 2.3769571781158447 }, { "auxiliary_loss_clip": 0.01054605, "auxiliary_loss_mlp": 0.01040565, "balance_loss_clip": 1.01679468, "balance_loss_mlp": 1.01732993, "epoch": 0.6775289343153464, "flos": 23038834947840.0, "grad_norm": 1.4935839834510718, "language_loss": 0.77973628, "learning_rate": 9.950572574939194e-07, "loss": 0.80068803, "num_input_tokens_seen": 243221850, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37304688, "step": 11269, "time_per_iteration": 2.3872597217559814 }, { "auxiliary_loss_clip": 0.01054923, "auxiliary_loss_mlp": 0.01044737, "balance_loss_clip": 1.01774836, "balance_loss_mlp": 1.01649261, "epoch": 0.6775890575680145, "flos": 18292602412800.0, "grad_norm": 2.699809028304398, "language_loss": 0.75388038, "learning_rate": 9.94720550376189e-07, "loss": 0.77487695, "num_input_tokens_seen": 243239855, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.38476562, "step": 11270, "time_per_iteration": 2.3385002613067627 }, { "auxiliary_loss_clip": 0.01053778, "auxiliary_loss_mlp": 0.01039292, "balance_loss_clip": 1.01356673, "balance_loss_mlp": 1.01712251, "epoch": 0.6776491808206824, "flos": 25335317047680.0, "grad_norm": 1.7617927050073507, "language_loss": 0.73063993, "learning_rate": 9.94383881378756e-07, "loss": 0.75157058, "num_input_tokens_seen": 243260085, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3671875, "step": 11271, "time_per_iteration": 2.4167957305908203 }, { "auxiliary_loss_clip": 0.01054766, "auxiliary_loss_mlp": 0.01038748, "balance_loss_clip": 1.01445317, "balance_loss_mlp": 1.01685202, "epoch": 0.6777093040733504, "flos": 26027121104640.0, "grad_norm": 1.7515857738516651, "language_loss": 0.68855506, "learning_rate": 9.94047250514387e-07, "loss": 0.70949018, "num_input_tokens_seen": 243280065, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.37890625, "step": 11272, "time_per_iteration": 2.4052112102508545 }, { "auxiliary_loss_clip": 0.01056218, "auxiliary_loss_mlp": 0.01043533, "balance_loss_clip": 1.01592433, "balance_loss_mlp": 1.01702034, "epoch": 0.6777694273260183, "flos": 18002648638080.0, "grad_norm": 1.8895045822361585, "language_loss": 0.75351954, "learning_rate": 9.937106577958481e-07, "loss": 0.774517, "num_input_tokens_seen": 243297775, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.390625, "step": 11273, "time_per_iteration": 2.363518238067627 }, { "auxiliary_loss_clip": 0.01051835, "auxiliary_loss_mlp": 0.01044938, "balance_loss_clip": 1.02055943, "balance_loss_mlp": 1.01611567, "epoch": 0.6778295505786863, "flos": 23439952091520.0, "grad_norm": 1.7445604503654453, "language_loss": 0.70837665, "learning_rate": 9.933741032359015e-07, "loss": 0.72934437, "num_input_tokens_seen": 243315760, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.35742188, "step": 11274, "time_per_iteration": 2.3861422538757324 }, { "auxiliary_loss_clip": 0.01056165, "auxiliary_loss_mlp": 0.01041283, "balance_loss_clip": 1.01400733, "balance_loss_mlp": 1.01811242, "epoch": 0.6778896738313542, "flos": 19097420140800.0, "grad_norm": 1.547533175675112, "language_loss": 0.67390382, "learning_rate": 9.930375868473093e-07, "loss": 0.69487834, "num_input_tokens_seen": 243335715, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.38085938, "step": 11275, "time_per_iteration": 2.4139182567596436 }, { "auxiliary_loss_clip": 0.01055886, "auxiliary_loss_mlp": 0.01038602, "balance_loss_clip": 1.01435518, "balance_loss_mlp": 1.0188179, "epoch": 0.6779497970840223, "flos": 26102742842880.0, "grad_norm": 2.179060574676903, "language_loss": 0.73494768, "learning_rate": 9.927011086428335e-07, "loss": 0.75589257, "num_input_tokens_seen": 243356935, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37109375, "step": 11276, "time_per_iteration": 2.4253876209259033 }, { "auxiliary_loss_clip": 0.01053492, "auxiliary_loss_mlp": 0.01042398, "balance_loss_clip": 1.0160532, "balance_loss_mlp": 1.01668525, "epoch": 0.6780099203366902, "flos": 19718210759040.0, "grad_norm": 1.7095118807542862, "language_loss": 0.77528596, "learning_rate": 9.923646686352317e-07, "loss": 0.79624486, "num_input_tokens_seen": 243375625, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.36914062, "step": 11277, "time_per_iteration": 2.3601174354553223 }, { "auxiliary_loss_clip": 0.01054054, "auxiliary_loss_mlp": 0.0103857, "balance_loss_clip": 1.01425159, "balance_loss_mlp": 1.01632261, "epoch": 0.6780700435893582, "flos": 18213803740800.0, "grad_norm": 2.579252165846449, "language_loss": 0.85349035, "learning_rate": 9.920282668372627e-07, "loss": 0.87441659, "num_input_tokens_seen": 243390195, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.37695312, "step": 11278, "time_per_iteration": 2.3053858280181885 }, { "auxiliary_loss_clip": 0.0105086, "auxiliary_loss_mlp": 0.0103436, "balance_loss_clip": 1.01296175, "balance_loss_mlp": 1.01654005, "epoch": 0.6781301668420262, "flos": 25375013130240.0, "grad_norm": 1.6275199152831021, "language_loss": 0.7088905, "learning_rate": 9.916919032616844e-07, "loss": 0.72974271, "num_input_tokens_seen": 243411690, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34179688, "step": 11279, "time_per_iteration": 2.467942237854004 }, { "auxiliary_loss_clip": 0.01055205, "auxiliary_loss_mlp": 0.01039781, "balance_loss_clip": 1.01404333, "balance_loss_mlp": 1.01787639, "epoch": 0.6781902900946941, "flos": 24019405793280.0, "grad_norm": 1.9048314269531397, "language_loss": 0.75921476, "learning_rate": 9.913555779212485e-07, "loss": 0.7801646, "num_input_tokens_seen": 243430280, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37304688, "step": 11280, "time_per_iteration": 2.373027801513672 }, { "auxiliary_loss_clip": 0.01053004, "auxiliary_loss_mlp": 0.01046597, "balance_loss_clip": 1.02101493, "balance_loss_mlp": 1.01571667, "epoch": 0.6782504133473621, "flos": 19645731043200.0, "grad_norm": 1.933457490988833, "language_loss": 0.71751374, "learning_rate": 9.910192908287104e-07, "loss": 0.73850977, "num_input_tokens_seen": 243448690, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37304688, "step": 11281, "time_per_iteration": 2.3672471046447754 }, { "auxiliary_loss_clip": 0.01051609, "auxiliary_loss_mlp": 0.01040307, "balance_loss_clip": 1.01778805, "balance_loss_mlp": 1.01540756, "epoch": 0.67831053660003, "flos": 24931686286080.0, "grad_norm": 1.574980703608313, "language_loss": 0.65066314, "learning_rate": 9.906830419968217e-07, "loss": 0.67158228, "num_input_tokens_seen": 243470695, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.36328125, "step": 11282, "time_per_iteration": 2.4322304725646973 }, { "auxiliary_loss_clip": 0.01056606, "auxiliary_loss_mlp": 0.01050129, "balance_loss_clip": 1.02331901, "balance_loss_mlp": 1.01752567, "epoch": 0.6783706598526981, "flos": 31207149682560.0, "grad_norm": 1.5794368277177209, "language_loss": 0.75686443, "learning_rate": 9.90346831438334e-07, "loss": 0.77793181, "num_input_tokens_seen": 243493345, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.390625, "step": 11283, "time_per_iteration": 2.437342643737793 }, { "auxiliary_loss_clip": 0.01051822, "auxiliary_loss_mlp": 0.01035921, "balance_loss_clip": 1.01377225, "balance_loss_mlp": 1.01679659, "epoch": 0.678430783105366, "flos": 35439949630080.0, "grad_norm": 1.623192298622196, "language_loss": 0.57496369, "learning_rate": 9.900106591659948e-07, "loss": 0.59584117, "num_input_tokens_seen": 243515670, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34960938, "step": 11284, "time_per_iteration": 2.4764459133148193 }, { "auxiliary_loss_clip": 0.01052937, "auxiliary_loss_mlp": 0.01037287, "balance_loss_clip": 1.01383805, "balance_loss_mlp": 1.01598859, "epoch": 0.678490906358034, "flos": 14427926507520.0, "grad_norm": 2.0865480729638732, "language_loss": 0.76822889, "learning_rate": 9.896745251925535e-07, "loss": 0.78913116, "num_input_tokens_seen": 243533625, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36914062, "step": 11285, "time_per_iteration": 2.3642964363098145 }, { "auxiliary_loss_clip": 0.01052086, "auxiliary_loss_mlp": 0.01042527, "balance_loss_clip": 1.01831532, "balance_loss_mlp": 1.01640415, "epoch": 0.6785510296107019, "flos": 24310232352000.0, "grad_norm": 1.9595682947868875, "language_loss": 0.67616379, "learning_rate": 9.893384295307557e-07, "loss": 0.69710994, "num_input_tokens_seen": 243553040, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.35546875, "step": 11286, "time_per_iteration": 2.3742895126342773 }, { "auxiliary_loss_clip": 0.01052049, "auxiliary_loss_mlp": 0.01041359, "balance_loss_clip": 1.01677799, "balance_loss_mlp": 1.01536, "epoch": 0.6786111528633699, "flos": 26976095303040.0, "grad_norm": 2.204643939786479, "language_loss": 0.53921473, "learning_rate": 9.890023721933447e-07, "loss": 0.56014872, "num_input_tokens_seen": 243572590, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3671875, "step": 11287, "time_per_iteration": 2.4239234924316406 }, { "auxiliary_loss_clip": 0.01053209, "auxiliary_loss_mlp": 0.01039124, "balance_loss_clip": 1.01602173, "balance_loss_mlp": 1.01763463, "epoch": 0.6786712761160378, "flos": 24316376751360.0, "grad_norm": 1.518466352182726, "language_loss": 0.77912945, "learning_rate": 9.886663531930655e-07, "loss": 0.80005276, "num_input_tokens_seen": 243594140, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35546875, "step": 11288, "time_per_iteration": 3.675100803375244 }, { "auxiliary_loss_clip": 0.01053751, "auxiliary_loss_mlp": 0.01044296, "balance_loss_clip": 1.01960802, "balance_loss_mlp": 1.01668131, "epoch": 0.6787313993687059, "flos": 22929312412800.0, "grad_norm": 2.472114699776041, "language_loss": 0.74512041, "learning_rate": 9.883303725426593e-07, "loss": 0.76610088, "num_input_tokens_seen": 243615170, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37109375, "step": 11289, "time_per_iteration": 2.428295135498047 }, { "auxiliary_loss_clip": 0.01052362, "auxiliary_loss_mlp": 0.01039552, "balance_loss_clip": 1.01660419, "balance_loss_mlp": 1.01623309, "epoch": 0.6787915226213738, "flos": 26867270995200.0, "grad_norm": 1.4191528310959205, "language_loss": 0.80967057, "learning_rate": 9.879944302548682e-07, "loss": 0.83058971, "num_input_tokens_seen": 243635675, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.36132812, "step": 11290, "time_per_iteration": 2.3952667713165283 }, { "auxiliary_loss_clip": 0.0105147, "auxiliary_loss_mlp": 0.01033795, "balance_loss_clip": 1.01262355, "balance_loss_mlp": 1.01670134, "epoch": 0.6788516458740418, "flos": 20007885242880.0, "grad_norm": 1.4823884987894118, "language_loss": 0.75626892, "learning_rate": 9.87658526342428e-07, "loss": 0.77712154, "num_input_tokens_seen": 243654950, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34765625, "step": 11291, "time_per_iteration": 2.3917276859283447 }, { "auxiliary_loss_clip": 0.01053293, "auxiliary_loss_mlp": 0.01039161, "balance_loss_clip": 1.01527095, "balance_loss_mlp": 1.01712668, "epoch": 0.6789117691267098, "flos": 28725942245760.0, "grad_norm": 2.3918937027607154, "language_loss": 0.76264089, "learning_rate": 9.873226608180785e-07, "loss": 0.78356546, "num_input_tokens_seen": 243674970, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36328125, "step": 11292, "time_per_iteration": 3.9469332695007324 }, { "auxiliary_loss_clip": 0.01052127, "auxiliary_loss_mlp": 0.01039968, "balance_loss_clip": 1.01464844, "balance_loss_mlp": 1.01591349, "epoch": 0.6789718923793777, "flos": 23402350690560.0, "grad_norm": 2.296129383324774, "language_loss": 0.85437715, "learning_rate": 9.869868336945556e-07, "loss": 0.87529814, "num_input_tokens_seen": 243693440, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36328125, "step": 11293, "time_per_iteration": 2.3850936889648438 }, { "auxiliary_loss_clip": 0.01056657, "auxiliary_loss_mlp": 0.01043644, "balance_loss_clip": 1.01641655, "balance_loss_mlp": 1.0176754, "epoch": 0.6790320156320457, "flos": 20447825685120.0, "grad_norm": 2.0774252407999225, "language_loss": 0.80777556, "learning_rate": 9.866510449845929e-07, "loss": 0.82877862, "num_input_tokens_seen": 243710055, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.390625, "step": 11294, "time_per_iteration": 2.3601133823394775 }, { "auxiliary_loss_clip": 0.01052478, "auxiliary_loss_mlp": 0.0103597, "balance_loss_clip": 1.01342797, "balance_loss_mlp": 1.01683903, "epoch": 0.6790921388847136, "flos": 24166145704320.0, "grad_norm": 1.8230657832273485, "language_loss": 0.79918033, "learning_rate": 9.86315294700924e-07, "loss": 0.82006478, "num_input_tokens_seen": 243728635, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35546875, "step": 11295, "time_per_iteration": 2.392289400100708 }, { "auxiliary_loss_clip": 0.0105049, "auxiliary_loss_mlp": 0.01033559, "balance_loss_clip": 1.01309061, "balance_loss_mlp": 1.0164479, "epoch": 0.6791522621373817, "flos": 21907020625920.0, "grad_norm": 1.7376730216707605, "language_loss": 0.72434521, "learning_rate": 9.859795828562823e-07, "loss": 0.74518573, "num_input_tokens_seen": 243748330, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.33984375, "step": 11296, "time_per_iteration": 2.3648340702056885 }, { "auxiliary_loss_clip": 0.01051624, "auxiliary_loss_mlp": 0.01038323, "balance_loss_clip": 1.01579261, "balance_loss_mlp": 1.01652813, "epoch": 0.6792123853900496, "flos": 24825375596160.0, "grad_norm": 1.6101181970844585, "language_loss": 0.71841669, "learning_rate": 9.856439094633949e-07, "loss": 0.73931611, "num_input_tokens_seen": 243769380, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3515625, "step": 11297, "time_per_iteration": 2.4036624431610107 }, { "auxiliary_loss_clip": 0.01054103, "auxiliary_loss_mlp": 0.01037614, "balance_loss_clip": 1.01333082, "balance_loss_mlp": 1.01636243, "epoch": 0.6792725086427176, "flos": 17565326547840.0, "grad_norm": 1.9507210384189233, "language_loss": 0.67675126, "learning_rate": 9.853082745349918e-07, "loss": 0.69766843, "num_input_tokens_seen": 243785510, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37695312, "step": 11298, "time_per_iteration": 2.3248016834259033 }, { "auxiliary_loss_clip": 0.01054078, "auxiliary_loss_mlp": 0.01038007, "balance_loss_clip": 1.01642966, "balance_loss_mlp": 1.01756382, "epoch": 0.6793326318953855, "flos": 26940658406400.0, "grad_norm": 1.9121428418216855, "language_loss": 0.73110199, "learning_rate": 9.84972678083801e-07, "loss": 0.75202286, "num_input_tokens_seen": 243805545, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.36523438, "step": 11299, "time_per_iteration": 2.4153406620025635 }, { "auxiliary_loss_clip": 0.01053092, "auxiliary_loss_mlp": 0.01040864, "balance_loss_clip": 1.01671267, "balance_loss_mlp": 1.01679838, "epoch": 0.6793927551480535, "flos": 24317074978560.0, "grad_norm": 1.4209445640763256, "language_loss": 0.77907449, "learning_rate": 9.846371201225488e-07, "loss": 0.80001402, "num_input_tokens_seen": 243825185, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36328125, "step": 11300, "time_per_iteration": 2.4375524520874023 }, { "auxiliary_loss_clip": 0.01051945, "auxiliary_loss_mlp": 0.01037068, "balance_loss_clip": 1.0137862, "balance_loss_mlp": 1.01626587, "epoch": 0.6794528784007214, "flos": 11435835012480.0, "grad_norm": 2.1343495402906036, "language_loss": 0.64181101, "learning_rate": 9.843016006639577e-07, "loss": 0.66270113, "num_input_tokens_seen": 243841600, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35546875, "step": 11301, "time_per_iteration": 2.4139604568481445 }, { "auxiliary_loss_clip": 0.01052558, "auxiliary_loss_mlp": 0.01033163, "balance_loss_clip": 1.01151466, "balance_loss_mlp": 1.01651525, "epoch": 0.6795130016533895, "flos": 25228482687360.0, "grad_norm": 1.804331287032592, "language_loss": 0.83806932, "learning_rate": 9.839661197207525e-07, "loss": 0.85892653, "num_input_tokens_seen": 243862250, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.359375, "step": 11302, "time_per_iteration": 2.385406017303467 }, { "auxiliary_loss_clip": 0.01052603, "auxiliary_loss_mlp": 0.01040459, "balance_loss_clip": 1.01627195, "balance_loss_mlp": 1.01615071, "epoch": 0.6795731249060574, "flos": 18295430232960.0, "grad_norm": 1.8396475543618434, "language_loss": 0.71670312, "learning_rate": 9.83630677305654e-07, "loss": 0.73763376, "num_input_tokens_seen": 243880560, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36523438, "step": 11303, "time_per_iteration": 3.7659027576446533 }, { "auxiliary_loss_clip": 0.01054309, "auxiliary_loss_mlp": 0.01036781, "balance_loss_clip": 1.0124383, "balance_loss_mlp": 1.01661503, "epoch": 0.6796332481587254, "flos": 20299410028800.0, "grad_norm": 2.434726827501631, "language_loss": 0.71962571, "learning_rate": 9.832952734313813e-07, "loss": 0.74053669, "num_input_tokens_seen": 243900635, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37695312, "step": 11304, "time_per_iteration": 2.3472208976745605 }, { "auxiliary_loss_clip": 0.01054115, "auxiliary_loss_mlp": 0.01040597, "balance_loss_clip": 1.01615953, "balance_loss_mlp": 1.01719534, "epoch": 0.6796933714113934, "flos": 23585714484480.0, "grad_norm": 1.919155189417444, "language_loss": 0.73916245, "learning_rate": 9.829599081106536e-07, "loss": 0.76010954, "num_input_tokens_seen": 243920160, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36914062, "step": 11305, "time_per_iteration": 2.3813540935516357 }, { "auxiliary_loss_clip": 0.01053604, "auxiliary_loss_mlp": 0.01036424, "balance_loss_clip": 1.01171196, "balance_loss_mlp": 1.01684117, "epoch": 0.6797534946640613, "flos": 27118855319040.0, "grad_norm": 2.1868048299752973, "language_loss": 0.67097187, "learning_rate": 9.826245813561882e-07, "loss": 0.69187224, "num_input_tokens_seen": 243939015, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.3671875, "step": 11306, "time_per_iteration": 2.3829691410064697 }, { "auxiliary_loss_clip": 0.01051347, "auxiliary_loss_mlp": 0.01035748, "balance_loss_clip": 1.01316977, "balance_loss_mlp": 1.01591039, "epoch": 0.6798136179167293, "flos": 22126344986880.0, "grad_norm": 1.5553161400698892, "language_loss": 0.81303728, "learning_rate": 9.822892931807021e-07, "loss": 0.8339082, "num_input_tokens_seen": 243958470, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35351562, "step": 11307, "time_per_iteration": 2.3684165477752686 }, { "auxiliary_loss_clip": 0.0105324, "auxiliary_loss_mlp": 0.01041093, "balance_loss_clip": 1.01850343, "balance_loss_mlp": 1.01688075, "epoch": 0.6798737411693972, "flos": 17487819596160.0, "grad_norm": 1.6848261866720162, "language_loss": 0.8996042, "learning_rate": 9.819540435969066e-07, "loss": 0.92054749, "num_input_tokens_seen": 243975450, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36328125, "step": 11308, "time_per_iteration": 2.330977439880371 }, { "auxiliary_loss_clip": 0.01052924, "auxiliary_loss_mlp": 0.01038604, "balance_loss_clip": 1.01449966, "balance_loss_mlp": 1.01680899, "epoch": 0.6799338644220653, "flos": 22891187341440.0, "grad_norm": 2.0323730365446466, "language_loss": 0.72306037, "learning_rate": 9.816188326175154e-07, "loss": 0.74397564, "num_input_tokens_seen": 243994355, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.359375, "step": 11309, "time_per_iteration": 2.3653910160064697 }, { "auxiliary_loss_clip": 0.01052142, "auxiliary_loss_mlp": 0.01042025, "balance_loss_clip": 1.01672864, "balance_loss_mlp": 1.01588535, "epoch": 0.6799939876747332, "flos": 23179430459520.0, "grad_norm": 2.0673285615712387, "language_loss": 0.85508621, "learning_rate": 9.812836602552411e-07, "loss": 0.87602782, "num_input_tokens_seen": 244011620, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36328125, "step": 11310, "time_per_iteration": 2.3496391773223877 }, { "auxiliary_loss_clip": 0.01052577, "auxiliary_loss_mlp": 0.01033445, "balance_loss_clip": 1.01139092, "balance_loss_mlp": 1.01714158, "epoch": 0.6800541109274012, "flos": 19498921309440.0, "grad_norm": 2.461221733558314, "language_loss": 0.84251785, "learning_rate": 9.80948526522792e-07, "loss": 0.86337811, "num_input_tokens_seen": 244029925, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35351562, "step": 11311, "time_per_iteration": 2.3542094230651855 }, { "auxiliary_loss_clip": 0.01055588, "auxiliary_loss_mlp": 0.01041199, "balance_loss_clip": 1.01488972, "balance_loss_mlp": 1.01657796, "epoch": 0.6801142341800691, "flos": 22276436388480.0, "grad_norm": 1.7579058547909288, "language_loss": 0.77063441, "learning_rate": 9.806134314328767e-07, "loss": 0.79160225, "num_input_tokens_seen": 244051225, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.390625, "step": 11312, "time_per_iteration": 2.392918109893799 }, { "auxiliary_loss_clip": 0.01009025, "auxiliary_loss_mlp": 0.01004238, "balance_loss_clip": 1.00174665, "balance_loss_mlp": 1.00214028, "epoch": 0.6801743574327371, "flos": 68711547450240.0, "grad_norm": 0.6599246814647423, "language_loss": 0.57335389, "learning_rate": 9.802783749982038e-07, "loss": 0.59348649, "num_input_tokens_seen": 244115930, "router_z_loss_clip": 0.02490234, "router_z_loss_mlp": 0.06933594, "step": 11313, "time_per_iteration": 3.1280665397644043 }, { "auxiliary_loss_clip": 0.01051917, "auxiliary_loss_mlp": 0.01032955, "balance_loss_clip": 1.00877905, "balance_loss_mlp": 1.0156343, "epoch": 0.680234480685405, "flos": 29459187953280.0, "grad_norm": 2.087798068426426, "language_loss": 0.69325662, "learning_rate": 9.799433572314754e-07, "loss": 0.71410531, "num_input_tokens_seen": 244137320, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36328125, "step": 11314, "time_per_iteration": 2.4193294048309326 }, { "auxiliary_loss_clip": 0.0105117, "auxiliary_loss_mlp": 0.0103727, "balance_loss_clip": 1.01653981, "balance_loss_mlp": 1.01615846, "epoch": 0.6802946039380731, "flos": 15916169566080.0, "grad_norm": 1.8198767109516336, "language_loss": 0.82100427, "learning_rate": 9.796083781453972e-07, "loss": 0.84188867, "num_input_tokens_seen": 244152755, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.34960938, "step": 11315, "time_per_iteration": 2.3320631980895996 }, { "auxiliary_loss_clip": 0.01052958, "auxiliary_loss_mlp": 0.01034847, "balance_loss_clip": 1.01064742, "balance_loss_mlp": 1.01656938, "epoch": 0.680354727190741, "flos": 22017555590400.0, "grad_norm": 1.6326475927579498, "language_loss": 0.70885432, "learning_rate": 9.792734377526718e-07, "loss": 0.72973233, "num_input_tokens_seen": 244171480, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36328125, "step": 11316, "time_per_iteration": 2.376171588897705 }, { "auxiliary_loss_clip": 0.01052362, "auxiliary_loss_mlp": 0.01039245, "balance_loss_clip": 1.01585579, "balance_loss_mlp": 1.01652789, "epoch": 0.680414850443409, "flos": 18440529310080.0, "grad_norm": 2.1486012271944084, "language_loss": 0.67608082, "learning_rate": 9.789385360660003e-07, "loss": 0.69699681, "num_input_tokens_seen": 244187920, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 11317, "time_per_iteration": 2.3507442474365234 }, { "auxiliary_loss_clip": 0.01054259, "auxiliary_loss_mlp": 0.01050419, "balance_loss_clip": 1.02674437, "balance_loss_mlp": 1.01735544, "epoch": 0.680474973696077, "flos": 26357434277760.0, "grad_norm": 1.427726714094424, "language_loss": 0.76189744, "learning_rate": 9.78603673098082e-07, "loss": 0.7829442, "num_input_tokens_seen": 244209565, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.36914062, "step": 11318, "time_per_iteration": 2.4003381729125977 }, { "auxiliary_loss_clip": 0.01049767, "auxiliary_loss_mlp": 0.01035915, "balance_loss_clip": 1.01270461, "balance_loss_mlp": 1.01536596, "epoch": 0.6805350969487449, "flos": 18332123938560.0, "grad_norm": 1.5612113322352652, "language_loss": 0.68884349, "learning_rate": 9.782688488616143e-07, "loss": 0.70970035, "num_input_tokens_seen": 244228015, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.34375, "step": 11319, "time_per_iteration": 2.366793394088745 }, { "auxiliary_loss_clip": 0.01052697, "auxiliary_loss_mlp": 0.01037653, "balance_loss_clip": 1.01474094, "balance_loss_mlp": 1.01586473, "epoch": 0.6805952202014129, "flos": 19936487779200.0, "grad_norm": 1.9590695334177763, "language_loss": 0.778593, "learning_rate": 9.779340633692945e-07, "loss": 0.79949653, "num_input_tokens_seen": 244245615, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.3671875, "step": 11320, "time_per_iteration": 2.341738224029541 }, { "auxiliary_loss_clip": 0.01051464, "auxiliary_loss_mlp": 0.01039399, "balance_loss_clip": 1.01485419, "balance_loss_mlp": 1.01578641, "epoch": 0.6806553434540809, "flos": 25223245983360.0, "grad_norm": 1.9941067757150803, "language_loss": 0.76120341, "learning_rate": 9.77599316633817e-07, "loss": 0.782112, "num_input_tokens_seen": 244263625, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.35546875, "step": 11321, "time_per_iteration": 2.4126908779144287 }, { "auxiliary_loss_clip": 0.01053287, "auxiliary_loss_mlp": 0.01045625, "balance_loss_clip": 1.02143741, "balance_loss_mlp": 1.0170145, "epoch": 0.6807154667067489, "flos": 17784615997440.0, "grad_norm": 1.8758312289261039, "language_loss": 0.74230587, "learning_rate": 9.772646086678758e-07, "loss": 0.76329499, "num_input_tokens_seen": 244282745, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36328125, "step": 11322, "time_per_iteration": 2.3454885482788086 }, { "auxiliary_loss_clip": 0.01051834, "auxiliary_loss_mlp": 0.01034713, "balance_loss_clip": 1.01071644, "balance_loss_mlp": 1.0158776, "epoch": 0.6807755899594168, "flos": 22198824702720.0, "grad_norm": 1.573839321786856, "language_loss": 0.7952615, "learning_rate": 9.769299394841638e-07, "loss": 0.816127, "num_input_tokens_seen": 244303770, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.359375, "step": 11323, "time_per_iteration": 2.3848392963409424 }, { "auxiliary_loss_clip": 0.01007802, "auxiliary_loss_mlp": 0.01003447, "balance_loss_clip": 1.00139701, "balance_loss_mlp": 1.00098681, "epoch": 0.6808357132120848, "flos": 68628105567360.0, "grad_norm": 0.7705674752663615, "language_loss": 0.57208943, "learning_rate": 9.765953090953714e-07, "loss": 0.59220195, "num_input_tokens_seen": 244355910, "router_z_loss_clip": 0.02050781, "router_z_loss_mlp": 0.06835938, "step": 11324, "time_per_iteration": 2.795022964477539 }, { "auxiliary_loss_clip": 0.0105355, "auxiliary_loss_mlp": 0.0104013, "balance_loss_clip": 1.0136652, "balance_loss_mlp": 1.01712179, "epoch": 0.6808958364647527, "flos": 23842186398720.0, "grad_norm": 2.007031615039834, "language_loss": 0.69869846, "learning_rate": 9.76260717514186e-07, "loss": 0.71963531, "num_input_tokens_seen": 244376610, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.36328125, "step": 11325, "time_per_iteration": 2.4232065677642822 }, { "auxiliary_loss_clip": 0.0105418, "auxiliary_loss_mlp": 0.01037201, "balance_loss_clip": 1.01264393, "balance_loss_mlp": 1.01595712, "epoch": 0.6809559597174207, "flos": 17710774738560.0, "grad_norm": 2.2087568090806293, "language_loss": 0.71746671, "learning_rate": 9.759261647532974e-07, "loss": 0.73838055, "num_input_tokens_seen": 244393000, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3828125, "step": 11326, "time_per_iteration": 2.3051655292510986 }, { "auxiliary_loss_clip": 0.01053347, "auxiliary_loss_mlp": 0.01040444, "balance_loss_clip": 1.01666188, "balance_loss_mlp": 1.01588583, "epoch": 0.6810160829700886, "flos": 22490803336320.0, "grad_norm": 1.9710726230785698, "language_loss": 0.7456708, "learning_rate": 9.75591650825392e-07, "loss": 0.76660872, "num_input_tokens_seen": 244409515, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.375, "step": 11327, "time_per_iteration": 2.372598648071289 }, { "auxiliary_loss_clip": 0.0105078, "auxiliary_loss_mlp": 0.0103469, "balance_loss_clip": 1.01139617, "balance_loss_mlp": 1.01547813, "epoch": 0.6810762062227567, "flos": 16832045928960.0, "grad_norm": 1.8022847802693245, "language_loss": 0.78302848, "learning_rate": 9.752571757431526e-07, "loss": 0.80388319, "num_input_tokens_seen": 244427165, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35351562, "step": 11328, "time_per_iteration": 3.5912673473358154 }, { "auxiliary_loss_clip": 0.01053214, "auxiliary_loss_mlp": 0.01038345, "balance_loss_clip": 1.01438427, "balance_loss_mlp": 1.01614332, "epoch": 0.6811363294754246, "flos": 12713830663680.0, "grad_norm": 1.9867205532036336, "language_loss": 0.65883005, "learning_rate": 9.74922739519265e-07, "loss": 0.67974561, "num_input_tokens_seen": 244445705, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37109375, "step": 11329, "time_per_iteration": 2.341482162475586 }, { "auxiliary_loss_clip": 0.01054363, "auxiliary_loss_mlp": 0.01037632, "balance_loss_clip": 1.01377773, "balance_loss_mlp": 1.017012, "epoch": 0.6811964527280926, "flos": 17711019118080.0, "grad_norm": 1.9106282524070042, "language_loss": 0.80071592, "learning_rate": 9.745883421664096e-07, "loss": 0.8216359, "num_input_tokens_seen": 244460415, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.375, "step": 11330, "time_per_iteration": 2.3177225589752197 }, { "auxiliary_loss_clip": 0.010534, "auxiliary_loss_mlp": 0.0103745, "balance_loss_clip": 1.01242816, "balance_loss_mlp": 1.01686656, "epoch": 0.6812565759807605, "flos": 24862313681280.0, "grad_norm": 1.7928339999133578, "language_loss": 0.64917552, "learning_rate": 9.742539836972665e-07, "loss": 0.670084, "num_input_tokens_seen": 244480555, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36523438, "step": 11331, "time_per_iteration": 3.8097686767578125 }, { "auxiliary_loss_clip": 0.01051712, "auxiliary_loss_mlp": 0.01036703, "balance_loss_clip": 1.01333761, "balance_loss_mlp": 1.01602399, "epoch": 0.6813166992334285, "flos": 17165047276800.0, "grad_norm": 1.689369831393743, "language_loss": 0.73483318, "learning_rate": 9.739196641245148e-07, "loss": 0.75571734, "num_input_tokens_seen": 244498540, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.35742188, "step": 11332, "time_per_iteration": 3.730844020843506 }, { "auxiliary_loss_clip": 0.01054633, "auxiliary_loss_mlp": 0.01039958, "balance_loss_clip": 1.01592517, "balance_loss_mlp": 1.01661801, "epoch": 0.6813768224860965, "flos": 18842554149120.0, "grad_norm": 2.4554504090549854, "language_loss": 0.76488346, "learning_rate": 9.735853834608326e-07, "loss": 0.78582937, "num_input_tokens_seen": 244517015, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37890625, "step": 11333, "time_per_iteration": 2.339313268661499 }, { "auxiliary_loss_clip": 0.01055695, "auxiliary_loss_mlp": 0.01036267, "balance_loss_clip": 1.0105654, "balance_loss_mlp": 1.01713085, "epoch": 0.6814369457387645, "flos": 24531651394560.0, "grad_norm": 1.4305273344234692, "language_loss": 0.72545427, "learning_rate": 9.732511417188963e-07, "loss": 0.74637389, "num_input_tokens_seen": 244537450, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38671875, "step": 11334, "time_per_iteration": 2.422800064086914 }, { "auxiliary_loss_clip": 0.01050348, "auxiliary_loss_mlp": 0.01037895, "balance_loss_clip": 1.01433885, "balance_loss_mlp": 1.01486874, "epoch": 0.6814970689914325, "flos": 18222007910400.0, "grad_norm": 1.6455207138431267, "language_loss": 0.8672936, "learning_rate": 9.729169389113791e-07, "loss": 0.88817602, "num_input_tokens_seen": 244555640, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.35546875, "step": 11335, "time_per_iteration": 2.3310294151306152 }, { "auxiliary_loss_clip": 0.01048957, "auxiliary_loss_mlp": 0.01028379, "balance_loss_clip": 1.00744617, "balance_loss_mlp": 1.01486683, "epoch": 0.6815571922441004, "flos": 25227609903360.0, "grad_norm": 1.8234258559114502, "language_loss": 0.83389199, "learning_rate": 9.725827750509542e-07, "loss": 0.8546654, "num_input_tokens_seen": 244574005, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.33984375, "step": 11336, "time_per_iteration": 2.4328370094299316 }, { "auxiliary_loss_clip": 0.01051431, "auxiliary_loss_mlp": 0.01033704, "balance_loss_clip": 1.01173389, "balance_loss_mlp": 1.0158205, "epoch": 0.6816173154967684, "flos": 19455280243200.0, "grad_norm": 1.812462853137593, "language_loss": 0.82813579, "learning_rate": 9.72248650150294e-07, "loss": 0.8489871, "num_input_tokens_seen": 244591395, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35546875, "step": 11337, "time_per_iteration": 2.3259758949279785 }, { "auxiliary_loss_clip": 0.01050418, "auxiliary_loss_mlp": 0.01036071, "balance_loss_clip": 1.01466107, "balance_loss_mlp": 1.01571894, "epoch": 0.6816774387494363, "flos": 17930483124480.0, "grad_norm": 1.5548121892171674, "language_loss": 0.73011971, "learning_rate": 9.719145642220673e-07, "loss": 0.75098467, "num_input_tokens_seen": 244610400, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34765625, "step": 11338, "time_per_iteration": 2.3799033164978027 }, { "auxiliary_loss_clip": 0.01052779, "auxiliary_loss_mlp": 0.01037984, "balance_loss_clip": 1.01427341, "balance_loss_mlp": 1.01705265, "epoch": 0.6817375620021043, "flos": 22232027272320.0, "grad_norm": 1.4589737679692956, "language_loss": 0.78951555, "learning_rate": 9.715805172789435e-07, "loss": 0.8104232, "num_input_tokens_seen": 244630400, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.35742188, "step": 11339, "time_per_iteration": 2.355992317199707 }, { "auxiliary_loss_clip": 0.01053085, "auxiliary_loss_mlp": 0.01042715, "balance_loss_clip": 1.01859868, "balance_loss_mlp": 1.01653409, "epoch": 0.6817976852547722, "flos": 25373232650880.0, "grad_norm": 3.552380950005672, "language_loss": 0.7151162, "learning_rate": 9.712465093335901e-07, "loss": 0.73607421, "num_input_tokens_seen": 244649155, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36523438, "step": 11340, "time_per_iteration": 2.4233591556549072 }, { "auxiliary_loss_clip": 0.01055985, "auxiliary_loss_mlp": 0.01044256, "balance_loss_clip": 1.01938903, "balance_loss_mlp": 1.01769495, "epoch": 0.6818578085074403, "flos": 22264880728320.0, "grad_norm": 2.3550485434979884, "language_loss": 0.84839368, "learning_rate": 9.709125403986722e-07, "loss": 0.86939609, "num_input_tokens_seen": 244665470, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.3828125, "step": 11341, "time_per_iteration": 2.3447048664093018 }, { "auxiliary_loss_clip": 0.01053461, "auxiliary_loss_mlp": 0.01045125, "balance_loss_clip": 1.01941168, "balance_loss_mlp": 1.01675189, "epoch": 0.6819179317601082, "flos": 19317128526720.0, "grad_norm": 2.0356370174976095, "language_loss": 0.6935879, "learning_rate": 9.705786104868531e-07, "loss": 0.71457374, "num_input_tokens_seen": 244684390, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3671875, "step": 11342, "time_per_iteration": 3.799626350402832 }, { "auxiliary_loss_clip": 0.01051757, "auxiliary_loss_mlp": 0.01041375, "balance_loss_clip": 1.01613879, "balance_loss_mlp": 1.01549602, "epoch": 0.6819780550127762, "flos": 21103110593280.0, "grad_norm": 1.6064830238785888, "language_loss": 0.75567943, "learning_rate": 9.702447196107963e-07, "loss": 0.77661073, "num_input_tokens_seen": 244703370, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36328125, "step": 11343, "time_per_iteration": 2.359135150909424 }, { "auxiliary_loss_clip": 0.0105454, "auxiliary_loss_mlp": 0.01044045, "balance_loss_clip": 1.01921368, "balance_loss_mlp": 1.01740527, "epoch": 0.6820381782654441, "flos": 29715101285760.0, "grad_norm": 1.6437877363621936, "language_loss": 0.80945468, "learning_rate": 9.699108677831639e-07, "loss": 0.83044052, "num_input_tokens_seen": 244723325, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37109375, "step": 11344, "time_per_iteration": 2.4701366424560547 }, { "auxiliary_loss_clip": 0.01054017, "auxiliary_loss_mlp": 0.01043399, "balance_loss_clip": 1.01835322, "balance_loss_mlp": 1.0168891, "epoch": 0.6820983015181121, "flos": 29240841110400.0, "grad_norm": 3.3114406747545013, "language_loss": 0.67253613, "learning_rate": 9.695770550166136e-07, "loss": 0.69351029, "num_input_tokens_seen": 244745650, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37109375, "step": 11345, "time_per_iteration": 2.4269392490386963 }, { "auxiliary_loss_clip": 0.01055092, "auxiliary_loss_mlp": 0.01043908, "balance_loss_clip": 1.01836133, "balance_loss_mlp": 1.0175004, "epoch": 0.6821584247707801, "flos": 18871008773760.0, "grad_norm": 2.340703834828303, "language_loss": 0.65980124, "learning_rate": 9.692432813238054e-07, "loss": 0.6807912, "num_input_tokens_seen": 244760270, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.375, "step": 11346, "time_per_iteration": 2.3652656078338623 }, { "auxiliary_loss_clip": 0.01053302, "auxiliary_loss_mlp": 0.0103986, "balance_loss_clip": 1.01448059, "balance_loss_mlp": 1.01705098, "epoch": 0.6822185480234481, "flos": 21323517206400.0, "grad_norm": 1.5298447802888782, "language_loss": 0.79407704, "learning_rate": 9.689095467173952e-07, "loss": 0.81500864, "num_input_tokens_seen": 244779565, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36328125, "step": 11347, "time_per_iteration": 2.3567261695861816 }, { "auxiliary_loss_clip": 0.01008141, "auxiliary_loss_mlp": 0.01002297, "balance_loss_clip": 1.00005543, "balance_loss_mlp": 1.00124049, "epoch": 0.6822786712761161, "flos": 63485434010880.0, "grad_norm": 0.728293305653424, "language_loss": 0.52573669, "learning_rate": 9.685758512100378e-07, "loss": 0.5458411, "num_input_tokens_seen": 244838480, "router_z_loss_clip": 0.02246094, "router_z_loss_mlp": 0.06933594, "step": 11348, "time_per_iteration": 3.01145076751709 }, { "auxiliary_loss_clip": 0.01051568, "auxiliary_loss_mlp": 0.01037831, "balance_loss_clip": 1.01508629, "balance_loss_mlp": 1.01614904, "epoch": 0.682338794528784, "flos": 21067883164800.0, "grad_norm": 1.770305780845218, "language_loss": 0.80625159, "learning_rate": 9.682421948143873e-07, "loss": 0.82714558, "num_input_tokens_seen": 244855265, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35546875, "step": 11349, "time_per_iteration": 2.3378827571868896 }, { "auxiliary_loss_clip": 0.01057666, "auxiliary_loss_mlp": 0.01040501, "balance_loss_clip": 1.01006722, "balance_loss_mlp": 1.01699567, "epoch": 0.682398917781452, "flos": 36281775265920.0, "grad_norm": 2.1031734446507304, "language_loss": 0.74915248, "learning_rate": 9.67908577543096e-07, "loss": 0.77013421, "num_input_tokens_seen": 244875555, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 0.40820312, "step": 11350, "time_per_iteration": 2.4916343688964844 }, { "auxiliary_loss_clip": 0.01052453, "auxiliary_loss_mlp": 0.01039309, "balance_loss_clip": 1.01522851, "balance_loss_mlp": 1.01691389, "epoch": 0.6824590410341199, "flos": 24858159229440.0, "grad_norm": 2.9763238195965145, "language_loss": 0.80416662, "learning_rate": 9.675749994088161e-07, "loss": 0.82508421, "num_input_tokens_seen": 244895270, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.35546875, "step": 11351, "time_per_iteration": 2.3824033737182617 }, { "auxiliary_loss_clip": 0.01051914, "auxiliary_loss_mlp": 0.01033755, "balance_loss_clip": 1.01219022, "balance_loss_mlp": 1.01547277, "epoch": 0.6825191642867879, "flos": 22451386544640.0, "grad_norm": 1.8477021283931088, "language_loss": 0.74282789, "learning_rate": 9.672414604241954e-07, "loss": 0.76368457, "num_input_tokens_seen": 244914535, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.36523438, "step": 11352, "time_per_iteration": 2.4317715167999268 }, { "auxiliary_loss_clip": 0.01054761, "auxiliary_loss_mlp": 0.0103955, "balance_loss_clip": 1.0150646, "balance_loss_mlp": 1.01696849, "epoch": 0.6825792875394558, "flos": 29423087740800.0, "grad_norm": 1.6703696153045267, "language_loss": 0.80787569, "learning_rate": 9.669079606018814e-07, "loss": 0.8288188, "num_input_tokens_seen": 244936095, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.37890625, "step": 11353, "time_per_iteration": 2.4259681701660156 }, { "auxiliary_loss_clip": 0.01051617, "auxiliary_loss_mlp": 0.01032471, "balance_loss_clip": 1.01039314, "balance_loss_mlp": 1.01617074, "epoch": 0.6826394107921239, "flos": 18769969699200.0, "grad_norm": 1.6783306293963058, "language_loss": 0.795811, "learning_rate": 9.665744999545218e-07, "loss": 0.81665188, "num_input_tokens_seen": 244955290, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35546875, "step": 11354, "time_per_iteration": 2.375084638595581 }, { "auxiliary_loss_clip": 0.01052657, "auxiliary_loss_mlp": 0.0103885, "balance_loss_clip": 1.0166769, "balance_loss_mlp": 1.0171026, "epoch": 0.6826995340447918, "flos": 16616666551680.0, "grad_norm": 1.8896232226626437, "language_loss": 0.62531334, "learning_rate": 9.662410784947599e-07, "loss": 0.64622843, "num_input_tokens_seen": 244972935, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.35546875, "step": 11355, "time_per_iteration": 2.3192830085754395 }, { "auxiliary_loss_clip": 0.01050132, "auxiliary_loss_mlp": 0.01038547, "balance_loss_clip": 1.01564682, "balance_loss_mlp": 1.01498508, "epoch": 0.6827596572974598, "flos": 20847301994880.0, "grad_norm": 1.9518129522602248, "language_loss": 0.83647722, "learning_rate": 9.659076962352398e-07, "loss": 0.85736406, "num_input_tokens_seen": 244989440, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3515625, "step": 11356, "time_per_iteration": 2.4157068729400635 }, { "auxiliary_loss_clip": 0.01053777, "auxiliary_loss_mlp": 0.01046057, "balance_loss_clip": 1.01955652, "balance_loss_mlp": 1.01665974, "epoch": 0.6828197805501277, "flos": 22746961048320.0, "grad_norm": 1.8109027517291565, "language_loss": 0.79695088, "learning_rate": 9.655743531886052e-07, "loss": 0.81794924, "num_input_tokens_seen": 245007830, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37109375, "step": 11357, "time_per_iteration": 2.39264178276062 }, { "auxiliary_loss_clip": 0.01008161, "auxiliary_loss_mlp": 0.01004194, "balance_loss_clip": 1.00204802, "balance_loss_mlp": 1.00128829, "epoch": 0.6828799038027957, "flos": 71642854535040.0, "grad_norm": 0.8266562495881349, "language_loss": 0.59640837, "learning_rate": 9.65241049367493e-07, "loss": 0.61653185, "num_input_tokens_seen": 245070720, "router_z_loss_clip": 0.02148438, "router_z_loss_mlp": 0.06884766, "step": 11358, "time_per_iteration": 3.0967319011688232 }, { "auxiliary_loss_clip": 0.01056333, "auxiliary_loss_mlp": 0.01042594, "balance_loss_clip": 1.01704788, "balance_loss_mlp": 1.01758206, "epoch": 0.6829400270554637, "flos": 19828117319040.0, "grad_norm": 1.6284598962430035, "language_loss": 0.79510963, "learning_rate": 9.64907784784544e-07, "loss": 0.81609893, "num_input_tokens_seen": 245089070, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38671875, "step": 11359, "time_per_iteration": 2.357759952545166 }, { "auxiliary_loss_clip": 0.01053748, "auxiliary_loss_mlp": 0.01041158, "balance_loss_clip": 1.01674414, "balance_loss_mlp": 1.01633477, "epoch": 0.6830001503081317, "flos": 21979570164480.0, "grad_norm": 2.1086087243854847, "language_loss": 0.83126533, "learning_rate": 9.645745594523958e-07, "loss": 0.85221446, "num_input_tokens_seen": 245106500, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.375, "step": 11360, "time_per_iteration": 2.389735698699951 }, { "auxiliary_loss_clip": 0.0105665, "auxiliary_loss_mlp": 0.01040918, "balance_loss_clip": 1.01549053, "balance_loss_mlp": 1.01923323, "epoch": 0.6830602735607997, "flos": 24315608701440.0, "grad_norm": 1.5989442437006638, "language_loss": 0.75674617, "learning_rate": 9.642413733836844e-07, "loss": 0.77772182, "num_input_tokens_seen": 245125260, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.375, "step": 11361, "time_per_iteration": 2.3768551349639893 }, { "auxiliary_loss_clip": 0.01007899, "auxiliary_loss_mlp": 0.01002844, "balance_loss_clip": 1.00059092, "balance_loss_mlp": 1.00116277, "epoch": 0.6831203968134676, "flos": 57687268078080.0, "grad_norm": 0.8634784272620708, "language_loss": 0.59820902, "learning_rate": 9.639082265910437e-07, "loss": 0.61831653, "num_input_tokens_seen": 245188730, "router_z_loss_clip": 0.02258301, "router_z_loss_mlp": 0.06738281, "step": 11362, "time_per_iteration": 3.1297991275787354 }, { "auxiliary_loss_clip": 0.01055362, "auxiliary_loss_mlp": 0.01039491, "balance_loss_clip": 1.01350331, "balance_loss_mlp": 1.01724708, "epoch": 0.6831805200661356, "flos": 14387671843200.0, "grad_norm": 2.343762547009656, "language_loss": 0.76665699, "learning_rate": 9.635751190871074e-07, "loss": 0.78760552, "num_input_tokens_seen": 245205065, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.38085938, "step": 11363, "time_per_iteration": 2.3158111572265625 }, { "auxiliary_loss_clip": 0.01053648, "auxiliary_loss_mlp": 0.01041948, "balance_loss_clip": 1.01864266, "balance_loss_mlp": 1.01655674, "epoch": 0.6832406433188035, "flos": 22819196384640.0, "grad_norm": 3.936751934995813, "language_loss": 0.90873158, "learning_rate": 9.632420508845063e-07, "loss": 0.9296875, "num_input_tokens_seen": 245224265, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.37109375, "step": 11364, "time_per_iteration": 2.4252119064331055 }, { "auxiliary_loss_clip": 0.01050886, "auxiliary_loss_mlp": 0.01031632, "balance_loss_clip": 1.01016283, "balance_loss_mlp": 1.01595521, "epoch": 0.6833007665714715, "flos": 17560892805120.0, "grad_norm": 1.7910296885189234, "language_loss": 0.89342546, "learning_rate": 9.629090219958697e-07, "loss": 0.91425067, "num_input_tokens_seen": 245243360, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34960938, "step": 11365, "time_per_iteration": 2.353181838989258 }, { "auxiliary_loss_clip": 0.01056667, "auxiliary_loss_mlp": 0.01041342, "balance_loss_clip": 1.01415062, "balance_loss_mlp": 1.01705575, "epoch": 0.6833608898241395, "flos": 22445102499840.0, "grad_norm": 2.2021279066629065, "language_loss": 0.81566691, "learning_rate": 9.625760324338272e-07, "loss": 0.83664703, "num_input_tokens_seen": 245256350, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.39648438, "step": 11366, "time_per_iteration": 2.3499133586883545 }, { "auxiliary_loss_clip": 0.01053915, "auxiliary_loss_mlp": 0.01041375, "balance_loss_clip": 1.01501787, "balance_loss_mlp": 1.01619911, "epoch": 0.6834210130768075, "flos": 24533501696640.0, "grad_norm": 1.6222430056311468, "language_loss": 0.77719277, "learning_rate": 9.622430822110062e-07, "loss": 0.79814565, "num_input_tokens_seen": 245277575, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.37695312, "step": 11367, "time_per_iteration": 3.6529366970062256 }, { "auxiliary_loss_clip": 0.01053966, "auxiliary_loss_mlp": 0.0105029, "balance_loss_clip": 1.02356291, "balance_loss_mlp": 1.01701808, "epoch": 0.6834811363294754, "flos": 20046115048320.0, "grad_norm": 1.6672231246096054, "language_loss": 0.70510632, "learning_rate": 9.619101713400312e-07, "loss": 0.72614884, "num_input_tokens_seen": 245296615, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.36914062, "step": 11368, "time_per_iteration": 2.369992971420288 }, { "auxiliary_loss_clip": 0.01053455, "auxiliary_loss_mlp": 0.01037818, "balance_loss_clip": 1.01445317, "balance_loss_mlp": 1.01644075, "epoch": 0.6835412595821434, "flos": 24789589585920.0, "grad_norm": 1.876659628855962, "language_loss": 0.74168968, "learning_rate": 9.615772998335261e-07, "loss": 0.76260245, "num_input_tokens_seen": 245316275, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.37109375, "step": 11369, "time_per_iteration": 2.3782598972320557 }, { "auxiliary_loss_clip": 0.01052705, "auxiliary_loss_mlp": 0.01041199, "balance_loss_clip": 1.01558113, "balance_loss_mlp": 1.0154742, "epoch": 0.6836013828348113, "flos": 19499340245760.0, "grad_norm": 3.147366781903276, "language_loss": 0.80007488, "learning_rate": 9.612444677041138e-07, "loss": 0.82101393, "num_input_tokens_seen": 245334595, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37109375, "step": 11370, "time_per_iteration": 2.351530075073242 }, { "auxiliary_loss_clip": 0.01007832, "auxiliary_loss_mlp": 0.01003233, "balance_loss_clip": 1.00070572, "balance_loss_mlp": 1.00093269, "epoch": 0.6836615060874793, "flos": 58360706893440.0, "grad_norm": 0.7499592677271231, "language_loss": 0.59844542, "learning_rate": 9.609116749644162e-07, "loss": 0.61855602, "num_input_tokens_seen": 245389750, "router_z_loss_clip": 0.02526855, "router_z_loss_mlp": 0.06933594, "step": 11371, "time_per_iteration": 5.550664186477661 }, { "auxiliary_loss_clip": 0.01051212, "auxiliary_loss_mlp": 0.01033265, "balance_loss_clip": 1.01111591, "balance_loss_mlp": 1.01614666, "epoch": 0.6837216293401474, "flos": 12166078343040.0, "grad_norm": 1.8018436660183397, "language_loss": 0.65006596, "learning_rate": 9.605789216270511e-07, "loss": 0.67091078, "num_input_tokens_seen": 245407530, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.3515625, "step": 11372, "time_per_iteration": 2.3247623443603516 }, { "auxiliary_loss_clip": 0.01051522, "auxiliary_loss_mlp": 0.01036154, "balance_loss_clip": 1.013111, "balance_loss_mlp": 1.01568675, "epoch": 0.6837817525928153, "flos": 22126484632320.0, "grad_norm": 1.5755480603599943, "language_loss": 0.72922432, "learning_rate": 9.602462077046375e-07, "loss": 0.75010109, "num_input_tokens_seen": 245427000, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.359375, "step": 11373, "time_per_iteration": 2.373307704925537 }, { "auxiliary_loss_clip": 0.01007293, "auxiliary_loss_mlp": 0.01003694, "balance_loss_clip": 1.00105965, "balance_loss_mlp": 1.00047231, "epoch": 0.6838418758454833, "flos": 65002409118720.0, "grad_norm": 1.23504323957892, "language_loss": 0.56612694, "learning_rate": 9.599135332097935e-07, "loss": 0.58623683, "num_input_tokens_seen": 245491620, "router_z_loss_clip": 0.02636719, "router_z_loss_mlp": 0.06835938, "step": 11374, "time_per_iteration": 3.1802003383636475 }, { "auxiliary_loss_clip": 0.01054251, "auxiliary_loss_mlp": 0.01036156, "balance_loss_clip": 1.0113126, "balance_loss_mlp": 1.01643813, "epoch": 0.6839019990981512, "flos": 21029827916160.0, "grad_norm": 1.6616161163100196, "language_loss": 0.74834883, "learning_rate": 9.595808981551312e-07, "loss": 0.7692529, "num_input_tokens_seen": 245511285, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37890625, "step": 11375, "time_per_iteration": 2.411531686782837 }, { "auxiliary_loss_clip": 0.01052545, "auxiliary_loss_mlp": 0.01036775, "balance_loss_clip": 1.01398206, "balance_loss_mlp": 1.01608419, "epoch": 0.6839621223508192, "flos": 24934409372160.0, "grad_norm": 1.556788079524683, "language_loss": 0.71406019, "learning_rate": 9.592483025532651e-07, "loss": 0.7349534, "num_input_tokens_seen": 245532910, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.36523438, "step": 11376, "time_per_iteration": 2.407729148864746 }, { "auxiliary_loss_clip": 0.01054937, "auxiliary_loss_mlp": 0.01041323, "balance_loss_clip": 1.01502502, "balance_loss_mlp": 1.01690638, "epoch": 0.6840222456034871, "flos": 26357643745920.0, "grad_norm": 1.9219086966868084, "language_loss": 0.7500093, "learning_rate": 9.58915746416808e-07, "loss": 0.77097189, "num_input_tokens_seen": 245550540, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38085938, "step": 11377, "time_per_iteration": 2.389582633972168 }, { "auxiliary_loss_clip": 0.01007625, "auxiliary_loss_mlp": 0.01006377, "balance_loss_clip": 1.00370705, "balance_loss_mlp": 1.00080597, "epoch": 0.6840823688561551, "flos": 65984865177600.0, "grad_norm": 0.7266071172549926, "language_loss": 0.56967378, "learning_rate": 9.585832297583707e-07, "loss": 0.58981371, "num_input_tokens_seen": 245619570, "router_z_loss_clip": 0.0267334, "router_z_loss_mlp": 0.06835938, "step": 11378, "time_per_iteration": 3.1209464073181152 }, { "auxiliary_loss_clip": 0.01053253, "auxiliary_loss_mlp": 0.01042626, "balance_loss_clip": 1.0153991, "balance_loss_mlp": 1.01580346, "epoch": 0.684142492108823, "flos": 21396520592640.0, "grad_norm": 1.604804490683469, "language_loss": 0.79544926, "learning_rate": 9.58250752590561e-07, "loss": 0.81640804, "num_input_tokens_seen": 245637980, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.375, "step": 11379, "time_per_iteration": 2.359462022781372 }, { "auxiliary_loss_clip": 0.01049984, "auxiliary_loss_mlp": 0.01030399, "balance_loss_clip": 1.00883377, "balance_loss_mlp": 1.01616418, "epoch": 0.6842026153614911, "flos": 18800588828160.0, "grad_norm": 2.085792465910839, "language_loss": 0.70464754, "learning_rate": 9.57918314925988e-07, "loss": 0.72545135, "num_input_tokens_seen": 245655690, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.33984375, "step": 11380, "time_per_iteration": 2.376437187194824 }, { "auxiliary_loss_clip": 0.01051609, "auxiliary_loss_mlp": 0.01034914, "balance_loss_clip": 1.01169217, "balance_loss_mlp": 1.01505899, "epoch": 0.684262738614159, "flos": 19645381929600.0, "grad_norm": 2.129986725740952, "language_loss": 0.79649007, "learning_rate": 9.575859167772568e-07, "loss": 0.81735533, "num_input_tokens_seen": 245671525, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36523438, "step": 11381, "time_per_iteration": 3.8441781997680664 }, { "auxiliary_loss_clip": 0.01008078, "auxiliary_loss_mlp": 0.01003237, "balance_loss_clip": 1.00112712, "balance_loss_mlp": 1.00106585, "epoch": 0.684322861866827, "flos": 62351699564160.0, "grad_norm": 0.876433460935326, "language_loss": 0.67290825, "learning_rate": 9.572535581569713e-07, "loss": 0.69302136, "num_input_tokens_seen": 245724115, "router_z_loss_clip": 0.02111816, "router_z_loss_mlp": 0.0703125, "step": 11382, "time_per_iteration": 2.8873586654663086 }, { "auxiliary_loss_clip": 0.01008802, "auxiliary_loss_mlp": 0.01002122, "balance_loss_clip": 0.99967843, "balance_loss_mlp": 1.00172222, "epoch": 0.6843829851194949, "flos": 65801606117760.0, "grad_norm": 0.8273041813436298, "language_loss": 0.58214319, "learning_rate": 9.569212390777356e-07, "loss": 0.60225242, "num_input_tokens_seen": 245789245, "router_z_loss_clip": 0.02441406, "router_z_loss_mlp": 0.07080078, "step": 11383, "time_per_iteration": 3.116757392883301 }, { "auxiliary_loss_clip": 0.01052572, "auxiliary_loss_mlp": 0.01038837, "balance_loss_clip": 1.01565051, "balance_loss_mlp": 1.01554716, "epoch": 0.6844431083721629, "flos": 27853916417280.0, "grad_norm": 1.6594473975741142, "language_loss": 0.80916452, "learning_rate": 9.565889595521517e-07, "loss": 0.83007866, "num_input_tokens_seen": 245812420, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36914062, "step": 11384, "time_per_iteration": 2.4293770790100098 }, { "auxiliary_loss_clip": 0.01055965, "auxiliary_loss_mlp": 0.01037908, "balance_loss_clip": 1.01289785, "balance_loss_mlp": 1.01669025, "epoch": 0.684503231624831, "flos": 18254163139200.0, "grad_norm": 1.9324826564400799, "language_loss": 0.78837657, "learning_rate": 9.562567195928187e-07, "loss": 0.80931532, "num_input_tokens_seen": 245829135, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.39257812, "step": 11385, "time_per_iteration": 2.375232696533203 }, { "auxiliary_loss_clip": 0.01057887, "auxiliary_loss_mlp": 0.01043492, "balance_loss_clip": 1.01490569, "balance_loss_mlp": 1.01784146, "epoch": 0.6845633548774989, "flos": 17638713959040.0, "grad_norm": 1.8890213519401367, "language_loss": 0.8533538, "learning_rate": 9.55924519212335e-07, "loss": 0.87436759, "num_input_tokens_seen": 245847140, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40039062, "step": 11386, "time_per_iteration": 2.3406333923339844 }, { "auxiliary_loss_clip": 0.01053173, "auxiliary_loss_mlp": 0.01041077, "balance_loss_clip": 1.01852298, "balance_loss_mlp": 1.01709688, "epoch": 0.6846234781301669, "flos": 20806698216960.0, "grad_norm": 1.9772526472984346, "language_loss": 0.83759701, "learning_rate": 9.555923584232984e-07, "loss": 0.85853946, "num_input_tokens_seen": 245862855, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.359375, "step": 11387, "time_per_iteration": 2.3421337604522705 }, { "auxiliary_loss_clip": 0.01053142, "auxiliary_loss_mlp": 0.01038881, "balance_loss_clip": 1.01488447, "balance_loss_mlp": 1.01670218, "epoch": 0.6846836013828348, "flos": 36099703192320.0, "grad_norm": 1.502170825808821, "language_loss": 0.73184991, "learning_rate": 9.552602372383047e-07, "loss": 0.75277013, "num_input_tokens_seen": 245885415, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36523438, "step": 11388, "time_per_iteration": 2.486311674118042 }, { "auxiliary_loss_clip": 0.01052096, "auxiliary_loss_mlp": 0.01032856, "balance_loss_clip": 1.01068282, "balance_loss_mlp": 1.01628017, "epoch": 0.6847437246355028, "flos": 43140811904640.0, "grad_norm": 1.90970081294091, "language_loss": 0.63388002, "learning_rate": 9.549281556699469e-07, "loss": 0.6547296, "num_input_tokens_seen": 245906285, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.35742188, "step": 11389, "time_per_iteration": 2.5340685844421387 }, { "auxiliary_loss_clip": 0.01007786, "auxiliary_loss_mlp": 0.0100547, "balance_loss_clip": 1.00330007, "balance_loss_mlp": 1.00081158, "epoch": 0.6848038478881707, "flos": 71660556460800.0, "grad_norm": 0.7358965959007148, "language_loss": 0.56055951, "learning_rate": 9.54596113730818e-07, "loss": 0.58069205, "num_input_tokens_seen": 245967620, "router_z_loss_clip": 0.02172852, "router_z_loss_mlp": 0.06982422, "step": 11390, "time_per_iteration": 3.096839666366577 }, { "auxiliary_loss_clip": 0.01053999, "auxiliary_loss_mlp": 0.01039432, "balance_loss_clip": 1.01600718, "balance_loss_mlp": 1.01767051, "epoch": 0.6848639711408387, "flos": 19936801981440.0, "grad_norm": 2.1161401688098045, "language_loss": 0.88333428, "learning_rate": 9.542641114335109e-07, "loss": 0.90426862, "num_input_tokens_seen": 245985075, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 11391, "time_per_iteration": 2.3745217323303223 }, { "auxiliary_loss_clip": 0.01056127, "auxiliary_loss_mlp": 0.01041467, "balance_loss_clip": 1.01640987, "balance_loss_mlp": 1.01808608, "epoch": 0.6849240943935067, "flos": 26866363299840.0, "grad_norm": 1.616583042120885, "language_loss": 0.80129439, "learning_rate": 9.539321487906117e-07, "loss": 0.82227039, "num_input_tokens_seen": 246003560, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37890625, "step": 11392, "time_per_iteration": 2.420243978500366 }, { "auxiliary_loss_clip": 0.01051678, "auxiliary_loss_mlp": 0.01038019, "balance_loss_clip": 1.01350927, "balance_loss_mlp": 1.01661479, "epoch": 0.6849842176461747, "flos": 13734516528000.0, "grad_norm": 2.9713834552248644, "language_loss": 0.72076523, "learning_rate": 9.536002258147104e-07, "loss": 0.7416622, "num_input_tokens_seen": 246019600, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.34960938, "step": 11393, "time_per_iteration": 2.3382303714752197 }, { "auxiliary_loss_clip": 0.01055633, "auxiliary_loss_mlp": 0.01041797, "balance_loss_clip": 1.01554668, "balance_loss_mlp": 1.01761603, "epoch": 0.6850443408988426, "flos": 24971906039040.0, "grad_norm": 1.5918988199169637, "language_loss": 0.65063387, "learning_rate": 9.532683425183936e-07, "loss": 0.67160821, "num_input_tokens_seen": 246038920, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.38085938, "step": 11394, "time_per_iteration": 2.405317544937134 }, { "auxiliary_loss_clip": 0.01055443, "auxiliary_loss_mlp": 0.01043705, "balance_loss_clip": 1.01998258, "balance_loss_mlp": 1.01704121, "epoch": 0.6851044641515106, "flos": 27743032339200.0, "grad_norm": 1.5229700433464899, "language_loss": 0.81700999, "learning_rate": 9.529364989142468e-07, "loss": 0.83800143, "num_input_tokens_seen": 246060490, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3828125, "step": 11395, "time_per_iteration": 2.4437007904052734 }, { "auxiliary_loss_clip": 0.01053918, "auxiliary_loss_mlp": 0.01047866, "balance_loss_clip": 1.02261734, "balance_loss_mlp": 1.01748514, "epoch": 0.6851645874041785, "flos": 24349963345920.0, "grad_norm": 1.9005084333620816, "language_loss": 0.73970962, "learning_rate": 9.526046950148527e-07, "loss": 0.76072741, "num_input_tokens_seen": 246081465, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36328125, "step": 11396, "time_per_iteration": 2.4277310371398926 }, { "auxiliary_loss_clip": 0.01054989, "auxiliary_loss_mlp": 0.01043981, "balance_loss_clip": 1.0170989, "balance_loss_mlp": 1.01720333, "epoch": 0.6852247106568465, "flos": 15077171750400.0, "grad_norm": 2.4587053463050297, "language_loss": 0.79926455, "learning_rate": 9.522729308327931e-07, "loss": 0.82025433, "num_input_tokens_seen": 246096110, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.37695312, "step": 11397, "time_per_iteration": 2.3470871448516846 }, { "auxiliary_loss_clip": 0.01054603, "auxiliary_loss_mlp": 0.01039798, "balance_loss_clip": 1.01470435, "balance_loss_mlp": 1.01659846, "epoch": 0.6852848339095146, "flos": 18769027092480.0, "grad_norm": 2.04043766801662, "language_loss": 0.73115468, "learning_rate": 9.519412063806493e-07, "loss": 0.75209868, "num_input_tokens_seen": 246114785, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38085938, "step": 11398, "time_per_iteration": 2.358701705932617 }, { "auxiliary_loss_clip": 0.01051911, "auxiliary_loss_mlp": 0.01038929, "balance_loss_clip": 1.01564717, "balance_loss_mlp": 1.01521575, "epoch": 0.6853449571621825, "flos": 27853148367360.0, "grad_norm": 1.8171658081085251, "language_loss": 0.72207129, "learning_rate": 9.516095216709996e-07, "loss": 0.74297965, "num_input_tokens_seen": 246136375, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3671875, "step": 11399, "time_per_iteration": 2.4178152084350586 }, { "auxiliary_loss_clip": 0.01052927, "auxiliary_loss_mlp": 0.0104469, "balance_loss_clip": 1.02008474, "balance_loss_mlp": 1.01632285, "epoch": 0.6854050804148505, "flos": 18149528194560.0, "grad_norm": 1.7504873925681992, "language_loss": 0.71053958, "learning_rate": 9.512778767164217e-07, "loss": 0.73151577, "num_input_tokens_seen": 246155090, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36523438, "step": 11400, "time_per_iteration": 2.3521294593811035 }, { "auxiliary_loss_clip": 0.01058554, "auxiliary_loss_mlp": 0.01043988, "balance_loss_clip": 1.01502049, "balance_loss_mlp": 1.01766253, "epoch": 0.6854652036675184, "flos": 16325281411200.0, "grad_norm": 1.8245004352550898, "language_loss": 0.79666638, "learning_rate": 9.509462715294927e-07, "loss": 0.8176918, "num_input_tokens_seen": 246172645, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.40820312, "step": 11401, "time_per_iteration": 2.3620312213897705 }, { "auxiliary_loss_clip": 0.01052535, "auxiliary_loss_mlp": 0.01037859, "balance_loss_clip": 1.01576924, "balance_loss_mlp": 1.0166049, "epoch": 0.6855253269201864, "flos": 14939892817920.0, "grad_norm": 1.8095769572688383, "language_loss": 0.76579332, "learning_rate": 9.50614706122786e-07, "loss": 0.78669727, "num_input_tokens_seen": 246189055, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.359375, "step": 11402, "time_per_iteration": 2.3734965324401855 }, { "auxiliary_loss_clip": 0.01053568, "auxiliary_loss_mlp": 0.01042131, "balance_loss_clip": 1.01727557, "balance_loss_mlp": 1.01692629, "epoch": 0.6855854501728543, "flos": 23036670443520.0, "grad_norm": 2.049032637188324, "language_loss": 0.73968315, "learning_rate": 9.502831805088742e-07, "loss": 0.76064014, "num_input_tokens_seen": 246207990, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3671875, "step": 11403, "time_per_iteration": 2.3925039768218994 }, { "auxiliary_loss_clip": 0.01052459, "auxiliary_loss_mlp": 0.0103277, "balance_loss_clip": 1.01045394, "balance_loss_mlp": 1.0172044, "epoch": 0.6856455734255223, "flos": 13252994789760.0, "grad_norm": 2.095374277071407, "language_loss": 0.82569098, "learning_rate": 9.499516947003294e-07, "loss": 0.84654325, "num_input_tokens_seen": 246221595, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.3515625, "step": 11404, "time_per_iteration": 2.357356309890747 }, { "auxiliary_loss_clip": 0.01052477, "auxiliary_loss_mlp": 0.010356, "balance_loss_clip": 1.01256871, "balance_loss_mlp": 1.01684678, "epoch": 0.6857056966781903, "flos": 23332279858560.0, "grad_norm": 1.7751694383968486, "language_loss": 0.7865603, "learning_rate": 9.496202487097222e-07, "loss": 0.80744112, "num_input_tokens_seen": 246242970, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35546875, "step": 11405, "time_per_iteration": 2.393982172012329 }, { "auxiliary_loss_clip": 0.01007099, "auxiliary_loss_mlp": 0.01003223, "balance_loss_clip": 1.00071931, "balance_loss_mlp": 1.00031745, "epoch": 0.6857658199308583, "flos": 61849648080000.0, "grad_norm": 0.7983949674638053, "language_loss": 0.61073905, "learning_rate": 9.492888425496199e-07, "loss": 0.63084227, "num_input_tokens_seen": 246300405, "router_z_loss_clip": 0.02502441, "router_z_loss_mlp": 0.06787109, "step": 11406, "time_per_iteration": 3.0813186168670654 }, { "auxiliary_loss_clip": 0.0105291, "auxiliary_loss_mlp": 0.01040665, "balance_loss_clip": 1.01647711, "balance_loss_mlp": 1.01673388, "epoch": 0.6858259431835262, "flos": 16653604636800.0, "grad_norm": 1.6710915142503449, "language_loss": 0.771905, "learning_rate": 9.489574762325907e-07, "loss": 0.79284072, "num_input_tokens_seen": 246318780, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36132812, "step": 11407, "time_per_iteration": 3.5456738471984863 }, { "auxiliary_loss_clip": 0.01054731, "auxiliary_loss_mlp": 0.0104468, "balance_loss_clip": 1.01895463, "balance_loss_mlp": 1.01655579, "epoch": 0.6858860664361942, "flos": 21871863020160.0, "grad_norm": 2.182440221219345, "language_loss": 0.73620093, "learning_rate": 9.486261497711991e-07, "loss": 0.75719506, "num_input_tokens_seen": 246339405, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3828125, "step": 11408, "time_per_iteration": 2.3730273246765137 }, { "auxiliary_loss_clip": 0.01054421, "auxiliary_loss_mlp": 0.01038999, "balance_loss_clip": 1.01435852, "balance_loss_mlp": 1.01691341, "epoch": 0.6859461896888621, "flos": 15266749766400.0, "grad_norm": 1.8033499426626518, "language_loss": 0.7091217, "learning_rate": 9.482948631780087e-07, "loss": 0.73005581, "num_input_tokens_seen": 246357055, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.375, "step": 11409, "time_per_iteration": 2.333545207977295 }, { "auxiliary_loss_clip": 0.01049634, "auxiliary_loss_mlp": 0.01033003, "balance_loss_clip": 1.0125463, "balance_loss_mlp": 1.01608276, "epoch": 0.6860063129415301, "flos": 18619424449920.0, "grad_norm": 1.6981971754958989, "language_loss": 0.78515393, "learning_rate": 9.479636164655825e-07, "loss": 0.80598027, "num_input_tokens_seen": 246374050, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.3359375, "step": 11410, "time_per_iteration": 3.910554885864258 }, { "auxiliary_loss_clip": 0.01054953, "auxiliary_loss_mlp": 0.01052347, "balance_loss_clip": 1.02548897, "balance_loss_mlp": 1.0163064, "epoch": 0.6860664361941982, "flos": 23950242656640.0, "grad_norm": 2.0066568529767452, "language_loss": 0.73147869, "learning_rate": 9.476324096464821e-07, "loss": 0.75255167, "num_input_tokens_seen": 246392910, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.38671875, "step": 11411, "time_per_iteration": 3.818382501602173 }, { "auxiliary_loss_clip": 0.01053397, "auxiliary_loss_mlp": 0.01040799, "balance_loss_clip": 1.01525283, "balance_loss_mlp": 1.01621962, "epoch": 0.6861265594468661, "flos": 20406872793600.0, "grad_norm": 2.0717552477297487, "language_loss": 0.71665782, "learning_rate": 9.473012427332654e-07, "loss": 0.73759973, "num_input_tokens_seen": 246411540, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37109375, "step": 11412, "time_per_iteration": 2.355769157409668 }, { "auxiliary_loss_clip": 0.01053485, "auxiliary_loss_mlp": 0.0103775, "balance_loss_clip": 1.01395559, "balance_loss_mlp": 1.01695812, "epoch": 0.6861866826995341, "flos": 11428014867840.0, "grad_norm": 3.10557366282532, "language_loss": 0.73879433, "learning_rate": 9.469701157384919e-07, "loss": 0.75970662, "num_input_tokens_seen": 246423295, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36523438, "step": 11413, "time_per_iteration": 2.311180591583252 }, { "auxiliary_loss_clip": 0.01052051, "auxiliary_loss_mlp": 0.01041332, "balance_loss_clip": 1.01808608, "balance_loss_mlp": 1.01610494, "epoch": 0.686246805952202, "flos": 15996678894720.0, "grad_norm": 1.878246766086936, "language_loss": 0.7460165, "learning_rate": 9.466390286747164e-07, "loss": 0.76695025, "num_input_tokens_seen": 246441045, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.359375, "step": 11414, "time_per_iteration": 2.34238862991333 }, { "auxiliary_loss_clip": 0.01055012, "auxiliary_loss_mlp": 0.01038136, "balance_loss_clip": 1.01325703, "balance_loss_mlp": 1.01747167, "epoch": 0.68630692920487, "flos": 19825743346560.0, "grad_norm": 2.1046613464676973, "language_loss": 0.89145231, "learning_rate": 9.46307981554495e-07, "loss": 0.91238379, "num_input_tokens_seen": 246456905, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.375, "step": 11415, "time_per_iteration": 2.343322515487671 }, { "auxiliary_loss_clip": 0.0105543, "auxiliary_loss_mlp": 0.01042469, "balance_loss_clip": 1.0174228, "balance_loss_mlp": 1.01711655, "epoch": 0.6863670524575379, "flos": 26285024384640.0, "grad_norm": 1.6120753942973145, "language_loss": 0.6777283, "learning_rate": 9.459769743903801e-07, "loss": 0.69870722, "num_input_tokens_seen": 246477545, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3828125, "step": 11416, "time_per_iteration": 2.405473232269287 }, { "auxiliary_loss_clip": 0.01054142, "auxiliary_loss_mlp": 0.01036108, "balance_loss_clip": 1.01288629, "balance_loss_mlp": 1.0168128, "epoch": 0.686427175710206, "flos": 19172099272320.0, "grad_norm": 1.3240030716321713, "language_loss": 0.77222443, "learning_rate": 9.456460071949237e-07, "loss": 0.79312694, "num_input_tokens_seen": 246496705, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37304688, "step": 11417, "time_per_iteration": 2.3662796020507812 }, { "auxiliary_loss_clip": 0.01053027, "auxiliary_loss_mlp": 0.01039997, "balance_loss_clip": 1.01712012, "balance_loss_mlp": 1.01623583, "epoch": 0.6864872989628739, "flos": 18915627358080.0, "grad_norm": 1.7723684080542992, "language_loss": 0.78753871, "learning_rate": 9.45315079980678e-07, "loss": 0.80846894, "num_input_tokens_seen": 246514860, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3671875, "step": 11418, "time_per_iteration": 2.338643789291382 }, { "auxiliary_loss_clip": 0.01052672, "auxiliary_loss_mlp": 0.01029989, "balance_loss_clip": 1.0072794, "balance_loss_mlp": 1.01661801, "epoch": 0.6865474222155419, "flos": 25955060325120.0, "grad_norm": 1.7581327647645042, "language_loss": 0.77606386, "learning_rate": 9.449841927601887e-07, "loss": 0.79689044, "num_input_tokens_seen": 246536145, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36132812, "step": 11419, "time_per_iteration": 2.415435791015625 }, { "auxiliary_loss_clip": 0.01052467, "auxiliary_loss_mlp": 0.01039227, "balance_loss_clip": 1.01661336, "balance_loss_mlp": 1.01685405, "epoch": 0.6866075454682098, "flos": 18477118281600.0, "grad_norm": 2.0046152572094798, "language_loss": 0.72498465, "learning_rate": 9.446533455460044e-07, "loss": 0.74590158, "num_input_tokens_seen": 246553265, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35546875, "step": 11420, "time_per_iteration": 2.32309889793396 }, { "auxiliary_loss_clip": 0.01052122, "auxiliary_loss_mlp": 0.01036967, "balance_loss_clip": 1.01364958, "balance_loss_mlp": 1.0162586, "epoch": 0.6866676687208778, "flos": 34238588146560.0, "grad_norm": 1.391249037503435, "language_loss": 0.75273812, "learning_rate": 9.443225383506712e-07, "loss": 0.77362895, "num_input_tokens_seen": 246575130, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 11421, "time_per_iteration": 3.901359796524048 }, { "auxiliary_loss_clip": 0.01050059, "auxiliary_loss_mlp": 0.01034654, "balance_loss_clip": 1.01229, "balance_loss_mlp": 1.01591301, "epoch": 0.6867277919735457, "flos": 21720794100480.0, "grad_norm": 2.4750192652790175, "language_loss": 0.77915734, "learning_rate": 9.439917711867338e-07, "loss": 0.80000448, "num_input_tokens_seen": 246593095, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34179688, "step": 11422, "time_per_iteration": 2.3792598247528076 }, { "auxiliary_loss_clip": 0.01053669, "auxiliary_loss_mlp": 0.01040332, "balance_loss_clip": 1.01520288, "balance_loss_mlp": 1.01668179, "epoch": 0.6867879152262137, "flos": 24096842922240.0, "grad_norm": 2.0320622025122552, "language_loss": 0.77850974, "learning_rate": 9.436610440667334e-07, "loss": 0.79944974, "num_input_tokens_seen": 246612165, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36914062, "step": 11423, "time_per_iteration": 2.4101178646087646 }, { "auxiliary_loss_clip": 0.01054316, "auxiliary_loss_mlp": 0.01038901, "balance_loss_clip": 1.01469004, "balance_loss_mlp": 1.01740718, "epoch": 0.6868480384788818, "flos": 21614762701440.0, "grad_norm": 1.417671774622705, "language_loss": 0.73900366, "learning_rate": 9.433303570032129e-07, "loss": 0.75993586, "num_input_tokens_seen": 246632065, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36914062, "step": 11424, "time_per_iteration": 2.3900701999664307 }, { "auxiliary_loss_clip": 0.01052121, "auxiliary_loss_mlp": 0.01038074, "balance_loss_clip": 1.01476836, "balance_loss_mlp": 1.01612568, "epoch": 0.6869081617315497, "flos": 26284954561920.0, "grad_norm": 1.7933362047044097, "language_loss": 0.66261768, "learning_rate": 9.429997100087112e-07, "loss": 0.6835196, "num_input_tokens_seen": 246651245, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 11425, "time_per_iteration": 2.4188790321350098 }, { "auxiliary_loss_clip": 0.01052606, "auxiliary_loss_mlp": 0.01036285, "balance_loss_clip": 1.01297975, "balance_loss_mlp": 1.01696014, "epoch": 0.6869682849842177, "flos": 21104053200000.0, "grad_norm": 1.7534829414093445, "language_loss": 0.72768205, "learning_rate": 9.426691030957657e-07, "loss": 0.74857092, "num_input_tokens_seen": 246672225, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.35546875, "step": 11426, "time_per_iteration": 2.3830528259277344 }, { "auxiliary_loss_clip": 0.01052863, "auxiliary_loss_mlp": 0.01036779, "balance_loss_clip": 1.01397467, "balance_loss_mlp": 1.01647925, "epoch": 0.6870284082368856, "flos": 17091694776960.0, "grad_norm": 2.184818056179269, "language_loss": 0.85850322, "learning_rate": 9.423385362769136e-07, "loss": 0.87939966, "num_input_tokens_seen": 246688385, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36328125, "step": 11427, "time_per_iteration": 2.3857760429382324 }, { "auxiliary_loss_clip": 0.01053074, "auxiliary_loss_mlp": 0.01036169, "balance_loss_clip": 1.01305449, "balance_loss_mlp": 1.01697314, "epoch": 0.6870885314895536, "flos": 27306862323840.0, "grad_norm": 1.4852306847482033, "language_loss": 0.7707994, "learning_rate": 9.420080095646909e-07, "loss": 0.79169178, "num_input_tokens_seen": 246710730, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.36132812, "step": 11428, "time_per_iteration": 2.411097764968872 }, { "auxiliary_loss_clip": 0.01055896, "auxiliary_loss_mlp": 0.01045039, "balance_loss_clip": 1.01865816, "balance_loss_mlp": 1.01793098, "epoch": 0.6871486547422215, "flos": 20813471020800.0, "grad_norm": 1.878803217663699, "language_loss": 0.74445891, "learning_rate": 9.4167752297163e-07, "loss": 0.76546824, "num_input_tokens_seen": 246730350, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.37890625, "step": 11429, "time_per_iteration": 2.36568021774292 }, { "auxiliary_loss_clip": 0.01054269, "auxiliary_loss_mlp": 0.01035358, "balance_loss_clip": 1.01138496, "balance_loss_mlp": 1.01710343, "epoch": 0.6872087779948896, "flos": 30152807400960.0, "grad_norm": 1.6865181414198038, "language_loss": 0.84157979, "learning_rate": 9.413470765102643e-07, "loss": 0.86247599, "num_input_tokens_seen": 246751700, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37109375, "step": 11430, "time_per_iteration": 2.4159929752349854 }, { "auxiliary_loss_clip": 0.0105331, "auxiliary_loss_mlp": 0.01042503, "balance_loss_clip": 1.01822019, "balance_loss_mlp": 1.01665962, "epoch": 0.6872689012475575, "flos": 20703529549440.0, "grad_norm": 2.0313860215205923, "language_loss": 0.71394795, "learning_rate": 9.410166701931225e-07, "loss": 0.73490608, "num_input_tokens_seen": 246769860, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.3671875, "step": 11431, "time_per_iteration": 2.363665819168091 }, { "auxiliary_loss_clip": 0.0105311, "auxiliary_loss_mlp": 0.0103287, "balance_loss_clip": 1.00900424, "balance_loss_mlp": 1.01600242, "epoch": 0.6873290245002255, "flos": 25519658359680.0, "grad_norm": 1.7519579095684388, "language_loss": 0.81252372, "learning_rate": 9.406863040327355e-07, "loss": 0.83338356, "num_input_tokens_seen": 246789905, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 11432, "time_per_iteration": 2.391531467437744 }, { "auxiliary_loss_clip": 0.01050838, "auxiliary_loss_mlp": 0.01036675, "balance_loss_clip": 1.0136795, "balance_loss_mlp": 1.01673579, "epoch": 0.6873891477528934, "flos": 25190322704640.0, "grad_norm": 1.7252539829248246, "language_loss": 0.68783939, "learning_rate": 9.403559780416295e-07, "loss": 0.70871449, "num_input_tokens_seen": 246808815, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.33984375, "step": 11433, "time_per_iteration": 2.3989617824554443 }, { "auxiliary_loss_clip": 0.0105378, "auxiliary_loss_mlp": 0.01048398, "balance_loss_clip": 1.02469969, "balance_loss_mlp": 1.01749587, "epoch": 0.6874492710055614, "flos": 35150938462080.0, "grad_norm": 1.7886679608656766, "language_loss": 0.73253757, "learning_rate": 9.400256922323309e-07, "loss": 0.75355935, "num_input_tokens_seen": 246829775, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.36328125, "step": 11434, "time_per_iteration": 2.4657020568847656 }, { "auxiliary_loss_clip": 0.01052602, "auxiliary_loss_mlp": 0.0103302, "balance_loss_clip": 1.01067972, "balance_loss_mlp": 1.01653969, "epoch": 0.6875093942582293, "flos": 17821239880320.0, "grad_norm": 1.8108592304889672, "language_loss": 0.81900543, "learning_rate": 9.396954466173657e-07, "loss": 0.83986163, "num_input_tokens_seen": 246848045, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.36132812, "step": 11435, "time_per_iteration": 2.3510630130767822 }, { "auxiliary_loss_clip": 0.01054081, "auxiliary_loss_mlp": 0.0104377, "balance_loss_clip": 1.01954675, "balance_loss_mlp": 1.01652932, "epoch": 0.6875695175108973, "flos": 20703494638080.0, "grad_norm": 2.13403768972144, "language_loss": 0.81747961, "learning_rate": 9.393652412092538e-07, "loss": 0.83845812, "num_input_tokens_seen": 246866095, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.375, "step": 11436, "time_per_iteration": 2.348726987838745 }, { "auxiliary_loss_clip": 0.01046728, "auxiliary_loss_mlp": 0.01031507, "balance_loss_clip": 1.0119921, "balance_loss_mlp": 1.01481235, "epoch": 0.6876296407635654, "flos": 25372848625920.0, "grad_norm": 5.492950686616412, "language_loss": 0.83031064, "learning_rate": 9.390350760205183e-07, "loss": 0.85109299, "num_input_tokens_seen": 246883975, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.31835938, "step": 11437, "time_per_iteration": 2.437922954559326 }, { "auxiliary_loss_clip": 0.01057944, "auxiliary_loss_mlp": 0.01041869, "balance_loss_clip": 1.01588154, "balance_loss_mlp": 1.01822531, "epoch": 0.6876897640162333, "flos": 23221186312320.0, "grad_norm": 3.2994476053622996, "language_loss": 0.79323244, "learning_rate": 9.387049510636793e-07, "loss": 0.81423056, "num_input_tokens_seen": 246901560, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39648438, "step": 11438, "time_per_iteration": 2.369091510772705 }, { "auxiliary_loss_clip": 0.01049504, "auxiliary_loss_mlp": 0.01036263, "balance_loss_clip": 1.01344669, "balance_loss_mlp": 1.0150044, "epoch": 0.6877498872689013, "flos": 27123149416320.0, "grad_norm": 1.643890579892767, "language_loss": 0.73123777, "learning_rate": 9.383748663512554e-07, "loss": 0.75209546, "num_input_tokens_seen": 246922655, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34570312, "step": 11439, "time_per_iteration": 2.4404802322387695 }, { "auxiliary_loss_clip": 0.01052096, "auxiliary_loss_mlp": 0.01038652, "balance_loss_clip": 1.01459575, "balance_loss_mlp": 1.01565886, "epoch": 0.6878100105215692, "flos": 11580899178240.0, "grad_norm": 1.7652085383842524, "language_loss": 0.77155674, "learning_rate": 9.380448218957623e-07, "loss": 0.7924642, "num_input_tokens_seen": 246940100, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36328125, "step": 11440, "time_per_iteration": 2.3258981704711914 }, { "auxiliary_loss_clip": 0.01051063, "auxiliary_loss_mlp": 0.01042771, "balance_loss_clip": 1.01962066, "balance_loss_mlp": 1.01581872, "epoch": 0.6878701337742372, "flos": 20302133114880.0, "grad_norm": 1.8455345028598718, "language_loss": 0.7286045, "learning_rate": 9.377148177097167e-07, "loss": 0.74954283, "num_input_tokens_seen": 246958545, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.3515625, "step": 11441, "time_per_iteration": 2.3673832416534424 }, { "auxiliary_loss_clip": 0.01056743, "auxiliary_loss_mlp": 0.01043689, "balance_loss_clip": 1.01612735, "balance_loss_mlp": 1.01766801, "epoch": 0.6879302570269051, "flos": 13839360940800.0, "grad_norm": 1.7667219109599703, "language_loss": 0.67867965, "learning_rate": 9.373848538056317e-07, "loss": 0.69968396, "num_input_tokens_seen": 246974805, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.390625, "step": 11442, "time_per_iteration": 2.335947036743164 }, { "auxiliary_loss_clip": 0.01052252, "auxiliary_loss_mlp": 0.01047595, "balance_loss_clip": 1.02501678, "balance_loss_mlp": 1.01655579, "epoch": 0.6879903802795732, "flos": 21323587029120.0, "grad_norm": 2.314234502644966, "language_loss": 0.70618325, "learning_rate": 9.370549301960189e-07, "loss": 0.72718173, "num_input_tokens_seen": 246992505, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35742188, "step": 11443, "time_per_iteration": 2.3682708740234375 }, { "auxiliary_loss_clip": 0.01055037, "auxiliary_loss_mlp": 0.01045442, "balance_loss_clip": 1.01845264, "balance_loss_mlp": 1.01770663, "epoch": 0.6880505035322411, "flos": 25150975735680.0, "grad_norm": 1.518761797283864, "language_loss": 0.76974887, "learning_rate": 9.367250468933893e-07, "loss": 0.7907536, "num_input_tokens_seen": 247013370, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.37304688, "step": 11444, "time_per_iteration": 2.404855728149414 }, { "auxiliary_loss_clip": 0.0105065, "auxiliary_loss_mlp": 0.01037058, "balance_loss_clip": 1.01356232, "balance_loss_mlp": 1.01500034, "epoch": 0.6881106267849091, "flos": 23214588065280.0, "grad_norm": 2.7124635523575575, "language_loss": 0.77575308, "learning_rate": 9.363952039102536e-07, "loss": 0.79663014, "num_input_tokens_seen": 247029855, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.35546875, "step": 11445, "time_per_iteration": 2.3630080223083496 }, { "auxiliary_loss_clip": 0.0100844, "auxiliary_loss_mlp": 0.01004678, "balance_loss_clip": 1.0024842, "balance_loss_mlp": 1.00134933, "epoch": 0.688170750037577, "flos": 48482173342080.0, "grad_norm": 0.820205972513339, "language_loss": 0.58464539, "learning_rate": 9.360654012591183e-07, "loss": 0.60477656, "num_input_tokens_seen": 247085030, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.07080078, "step": 11446, "time_per_iteration": 4.205999374389648 }, { "auxiliary_loss_clip": 0.01054128, "auxiliary_loss_mlp": 0.01042102, "balance_loss_clip": 1.01524436, "balance_loss_mlp": 1.01660872, "epoch": 0.688230873290245, "flos": 22782537590400.0, "grad_norm": 1.5371582173993914, "language_loss": 0.761603, "learning_rate": 9.357356389524886e-07, "loss": 0.78256536, "num_input_tokens_seen": 247104840, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.375, "step": 11447, "time_per_iteration": 2.386584758758545 }, { "auxiliary_loss_clip": 0.01054147, "auxiliary_loss_mlp": 0.01041421, "balance_loss_clip": 1.01617289, "balance_loss_mlp": 1.0165211, "epoch": 0.6882909965429129, "flos": 22454563478400.0, "grad_norm": 2.004466807386617, "language_loss": 0.74444127, "learning_rate": 9.354059170028705e-07, "loss": 0.76539695, "num_input_tokens_seen": 247121905, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37695312, "step": 11448, "time_per_iteration": 2.3885252475738525 }, { "auxiliary_loss_clip": 0.01055107, "auxiliary_loss_mlp": 0.0104065, "balance_loss_clip": 1.01422083, "balance_loss_mlp": 1.01626348, "epoch": 0.688351119795581, "flos": 26212928693760.0, "grad_norm": 1.6024194882552285, "language_loss": 0.76119238, "learning_rate": 9.350762354227673e-07, "loss": 0.78214997, "num_input_tokens_seen": 247142375, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.38867188, "step": 11449, "time_per_iteration": 2.4111266136169434 }, { "auxiliary_loss_clip": 0.01050851, "auxiliary_loss_mlp": 0.01039973, "balance_loss_clip": 1.0173347, "balance_loss_mlp": 1.01572585, "epoch": 0.6884112430482489, "flos": 22564155836160.0, "grad_norm": 1.7285089577011743, "language_loss": 0.70851457, "learning_rate": 9.34746594224679e-07, "loss": 0.72942281, "num_input_tokens_seen": 247161095, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 11450, "time_per_iteration": 3.792769432067871 }, { "auxiliary_loss_clip": 0.01055444, "auxiliary_loss_mlp": 0.01041846, "balance_loss_clip": 1.01548862, "balance_loss_mlp": 1.01658726, "epoch": 0.6884713663009169, "flos": 17340276723840.0, "grad_norm": 2.8015795758218713, "language_loss": 0.78045934, "learning_rate": 9.344169934211068e-07, "loss": 0.80143225, "num_input_tokens_seen": 247178565, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.38867188, "step": 11451, "time_per_iteration": 2.341939687728882 }, { "auxiliary_loss_clip": 0.0105338, "auxiliary_loss_mlp": 0.01033558, "balance_loss_clip": 1.0106343, "balance_loss_mlp": 1.01653934, "epoch": 0.6885314895535849, "flos": 26469575164800.0, "grad_norm": 1.840430705850967, "language_loss": 0.69953501, "learning_rate": 9.340874330245505e-07, "loss": 0.72040445, "num_input_tokens_seen": 247202345, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3671875, "step": 11452, "time_per_iteration": 2.420421838760376 }, { "auxiliary_loss_clip": 0.01051723, "auxiliary_loss_mlp": 0.01044606, "balance_loss_clip": 1.01994133, "balance_loss_mlp": 1.01578641, "epoch": 0.6885916128062528, "flos": 20520514869120.0, "grad_norm": 1.8168821672280988, "language_loss": 0.73187315, "learning_rate": 9.337579130475042e-07, "loss": 0.75283647, "num_input_tokens_seen": 247219240, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.359375, "step": 11453, "time_per_iteration": 2.3810086250305176 }, { "auxiliary_loss_clip": 0.01008182, "auxiliary_loss_mlp": 0.01002868, "balance_loss_clip": 1.00081778, "balance_loss_mlp": 1.00134516, "epoch": 0.6886517360589208, "flos": 70712629603200.0, "grad_norm": 0.7773188015408529, "language_loss": 0.5068633, "learning_rate": 9.334284335024644e-07, "loss": 0.52697384, "num_input_tokens_seen": 247272010, "router_z_loss_clip": 0.02050781, "router_z_loss_mlp": 0.06835938, "step": 11454, "time_per_iteration": 2.8703362941741943 }, { "auxiliary_loss_clip": 0.01051407, "auxiliary_loss_mlp": 0.0103554, "balance_loss_clip": 1.0128305, "balance_loss_mlp": 1.01673853, "epoch": 0.6887118593115887, "flos": 17892602432640.0, "grad_norm": 2.0796890364802523, "language_loss": 0.76109898, "learning_rate": 9.330989944019263e-07, "loss": 0.78196841, "num_input_tokens_seen": 247290630, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.34765625, "step": 11455, "time_per_iteration": 2.341731309890747 }, { "auxiliary_loss_clip": 0.01053776, "auxiliary_loss_mlp": 0.01047496, "balance_loss_clip": 1.02110291, "balance_loss_mlp": 1.01558304, "epoch": 0.6887719825642568, "flos": 17452173231360.0, "grad_norm": 2.5039186098344723, "language_loss": 0.7492305, "learning_rate": 9.327695957583803e-07, "loss": 0.77024323, "num_input_tokens_seen": 247304800, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.3828125, "step": 11456, "time_per_iteration": 2.304070234298706 }, { "auxiliary_loss_clip": 0.01050787, "auxiliary_loss_mlp": 0.01037632, "balance_loss_clip": 1.01462436, "balance_loss_mlp": 1.01669312, "epoch": 0.6888321058169247, "flos": 23069244608640.0, "grad_norm": 1.6372825780488538, "language_loss": 0.81922424, "learning_rate": 9.32440237584319e-07, "loss": 0.84010845, "num_input_tokens_seen": 247323450, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.33984375, "step": 11457, "time_per_iteration": 2.3790359497070312 }, { "auxiliary_loss_clip": 0.01055367, "auxiliary_loss_mlp": 0.01039505, "balance_loss_clip": 1.01443529, "balance_loss_mlp": 1.01750338, "epoch": 0.6888922290695927, "flos": 23367681843840.0, "grad_norm": 1.7289092911399988, "language_loss": 0.77299583, "learning_rate": 9.321109198922301e-07, "loss": 0.79394448, "num_input_tokens_seen": 247343845, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37890625, "step": 11458, "time_per_iteration": 2.3676681518554688 }, { "auxiliary_loss_clip": 0.01053397, "auxiliary_loss_mlp": 0.01035676, "balance_loss_clip": 1.01320481, "balance_loss_mlp": 1.01720428, "epoch": 0.6889523523222606, "flos": 17630893814400.0, "grad_norm": 2.2255707534935887, "language_loss": 0.68766475, "learning_rate": 9.31781642694603e-07, "loss": 0.70855552, "num_input_tokens_seen": 247356650, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.36132812, "step": 11459, "time_per_iteration": 2.3116209506988525 }, { "auxiliary_loss_clip": 0.01053753, "auxiliary_loss_mlp": 0.01037105, "balance_loss_clip": 1.01409781, "balance_loss_mlp": 1.01746273, "epoch": 0.6890124755749286, "flos": 25226981498880.0, "grad_norm": 1.467589670020225, "language_loss": 0.69702542, "learning_rate": 9.314524060039221e-07, "loss": 0.71793401, "num_input_tokens_seen": 247377340, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36328125, "step": 11460, "time_per_iteration": 2.4163267612457275 }, { "auxiliary_loss_clip": 0.01056675, "auxiliary_loss_mlp": 0.0104201, "balance_loss_clip": 1.0152837, "balance_loss_mlp": 1.01764452, "epoch": 0.6890725988275965, "flos": 20229199551360.0, "grad_norm": 1.6390567231115725, "language_loss": 0.78558654, "learning_rate": 9.311232098326731e-07, "loss": 0.80657339, "num_input_tokens_seen": 247395805, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.390625, "step": 11461, "time_per_iteration": 3.7757821083068848 }, { "auxiliary_loss_clip": 0.01054376, "auxiliary_loss_mlp": 0.01042701, "balance_loss_clip": 1.01887083, "balance_loss_mlp": 1.01778841, "epoch": 0.6891327220802645, "flos": 14534516488320.0, "grad_norm": 1.6325304259324493, "language_loss": 0.71034712, "learning_rate": 9.307940541933401e-07, "loss": 0.73131788, "num_input_tokens_seen": 247413165, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36523438, "step": 11462, "time_per_iteration": 2.3606221675872803 }, { "auxiliary_loss_clip": 0.01053214, "auxiliary_loss_mlp": 0.01033304, "balance_loss_clip": 1.0100348, "balance_loss_mlp": 1.01701248, "epoch": 0.6891928453329325, "flos": 21138163464960.0, "grad_norm": 1.4076762683006872, "language_loss": 0.8804251, "learning_rate": 9.304649390984034e-07, "loss": 0.90129024, "num_input_tokens_seen": 247433140, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36132812, "step": 11463, "time_per_iteration": 2.3563036918640137 }, { "auxiliary_loss_clip": 0.01051153, "auxiliary_loss_mlp": 0.0103551, "balance_loss_clip": 1.01388502, "balance_loss_mlp": 1.0167191, "epoch": 0.6892529685856005, "flos": 17857549560960.0, "grad_norm": 1.761349322399204, "language_loss": 0.69400972, "learning_rate": 9.301358645603428e-07, "loss": 0.71487629, "num_input_tokens_seen": 247451265, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34375, "step": 11464, "time_per_iteration": 2.3600642681121826 }, { "auxiliary_loss_clip": 0.01054238, "auxiliary_loss_mlp": 0.01039445, "balance_loss_clip": 1.016819, "balance_loss_mlp": 1.01742911, "epoch": 0.6893130918382685, "flos": 29933517951360.0, "grad_norm": 1.9555964864017952, "language_loss": 0.6650064, "learning_rate": 9.298068305916373e-07, "loss": 0.68594325, "num_input_tokens_seen": 247471645, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.3671875, "step": 11465, "time_per_iteration": 2.4247515201568604 }, { "auxiliary_loss_clip": 0.01054572, "auxiliary_loss_mlp": 0.01038666, "balance_loss_clip": 1.01432347, "balance_loss_mlp": 1.0168848, "epoch": 0.6893732150909364, "flos": 24387390190080.0, "grad_norm": 1.4080244407263103, "language_loss": 0.74039596, "learning_rate": 9.294778372047649e-07, "loss": 0.76132834, "num_input_tokens_seen": 247491170, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.37695312, "step": 11466, "time_per_iteration": 2.421003580093384 }, { "auxiliary_loss_clip": 0.0105427, "auxiliary_loss_mlp": 0.01042685, "balance_loss_clip": 1.01746035, "balance_loss_mlp": 1.01721883, "epoch": 0.6894333383436044, "flos": 16981927862400.0, "grad_norm": 1.663787210997787, "language_loss": 0.734384, "learning_rate": 9.291488844121995e-07, "loss": 0.75535357, "num_input_tokens_seen": 247509005, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37109375, "step": 11467, "time_per_iteration": 2.3372459411621094 }, { "auxiliary_loss_clip": 0.01055973, "auxiliary_loss_mlp": 0.0104292, "balance_loss_clip": 1.01696825, "balance_loss_mlp": 1.01697814, "epoch": 0.6894934615962723, "flos": 18984650849280.0, "grad_norm": 2.016854929595268, "language_loss": 0.82030994, "learning_rate": 9.288199722264156e-07, "loss": 0.84129888, "num_input_tokens_seen": 247527050, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.390625, "step": 11468, "time_per_iteration": 2.3383443355560303 }, { "auxiliary_loss_clip": 0.01054868, "auxiliary_loss_mlp": 0.01044614, "balance_loss_clip": 1.0192821, "balance_loss_mlp": 1.0171746, "epoch": 0.6895535848489404, "flos": 34530252577920.0, "grad_norm": 1.6608558158114934, "language_loss": 0.66898555, "learning_rate": 9.284911006598875e-07, "loss": 0.68998039, "num_input_tokens_seen": 247547765, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37890625, "step": 11469, "time_per_iteration": 2.4896671772003174 }, { "auxiliary_loss_clip": 0.01008282, "auxiliary_loss_mlp": 0.01004005, "balance_loss_clip": 1.0015012, "balance_loss_mlp": 1.00137043, "epoch": 0.6896137081016083, "flos": 50072954797440.0, "grad_norm": 0.804907892950301, "language_loss": 0.55250263, "learning_rate": 9.281622697250824e-07, "loss": 0.57262546, "num_input_tokens_seen": 247603515, "router_z_loss_clip": 0.02502441, "router_z_loss_mlp": 0.06933594, "step": 11470, "time_per_iteration": 2.9047701358795166 }, { "auxiliary_loss_clip": 0.0104879, "auxiliary_loss_mlp": 0.01036013, "balance_loss_clip": 1.01603377, "balance_loss_mlp": 1.01581764, "epoch": 0.6896738313542763, "flos": 19937186006400.0, "grad_norm": 1.7090542292587518, "language_loss": 0.78932106, "learning_rate": 9.278334794344715e-07, "loss": 0.81016904, "num_input_tokens_seen": 247622110, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.33007812, "step": 11471, "time_per_iteration": 2.349907875061035 }, { "auxiliary_loss_clip": 0.01052643, "auxiliary_loss_mlp": 0.01043136, "balance_loss_clip": 1.01776838, "balance_loss_mlp": 1.01594555, "epoch": 0.6897339546069442, "flos": 21724424881920.0, "grad_norm": 2.2095753139025938, "language_loss": 0.79525906, "learning_rate": 9.275047298005232e-07, "loss": 0.81621683, "num_input_tokens_seen": 247641905, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3671875, "step": 11472, "time_per_iteration": 2.3949384689331055 }, { "auxiliary_loss_clip": 0.01051558, "auxiliary_loss_mlp": 0.01035967, "balance_loss_clip": 1.0147357, "balance_loss_mlp": 1.01636577, "epoch": 0.6897940778596122, "flos": 19825533878400.0, "grad_norm": 1.5907336707154234, "language_loss": 0.76839924, "learning_rate": 9.271760208357024e-07, "loss": 0.78927445, "num_input_tokens_seen": 247660945, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.3515625, "step": 11473, "time_per_iteration": 2.3631575107574463 }, { "auxiliary_loss_clip": 0.01054343, "auxiliary_loss_mlp": 0.0104092, "balance_loss_clip": 1.01468229, "balance_loss_mlp": 1.01684141, "epoch": 0.6898542011122801, "flos": 17309133924480.0, "grad_norm": 2.0122353641413384, "language_loss": 0.76908076, "learning_rate": 9.268473525524751e-07, "loss": 0.79003334, "num_input_tokens_seen": 247678395, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.375, "step": 11474, "time_per_iteration": 2.3419392108917236 }, { "auxiliary_loss_clip": 0.01053683, "auxiliary_loss_mlp": 0.01042074, "balance_loss_clip": 1.01707625, "balance_loss_mlp": 1.0182023, "epoch": 0.6899143243649482, "flos": 24752895880320.0, "grad_norm": 1.56706670131835, "language_loss": 0.75660127, "learning_rate": 9.26518724963303e-07, "loss": 0.77755886, "num_input_tokens_seen": 247698380, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.35546875, "step": 11475, "time_per_iteration": 2.4613237380981445 }, { "auxiliary_loss_clip": 0.0105393, "auxiliary_loss_mlp": 0.01037948, "balance_loss_clip": 1.01340318, "balance_loss_mlp": 1.01732278, "epoch": 0.6899744476176161, "flos": 17233686743040.0, "grad_norm": 2.1384181042250443, "language_loss": 0.89444441, "learning_rate": 9.261901380806491e-07, "loss": 0.91536313, "num_input_tokens_seen": 247716370, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3671875, "step": 11476, "time_per_iteration": 2.347249984741211 }, { "auxiliary_loss_clip": 0.01051671, "auxiliary_loss_mlp": 0.01041225, "balance_loss_clip": 1.01685846, "balance_loss_mlp": 1.01619065, "epoch": 0.6900345708702841, "flos": 25409507420160.0, "grad_norm": 1.3347988916410705, "language_loss": 0.71568525, "learning_rate": 9.258615919169724e-07, "loss": 0.73661423, "num_input_tokens_seen": 247737335, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.35546875, "step": 11477, "time_per_iteration": 2.408958673477173 }, { "auxiliary_loss_clip": 0.01054506, "auxiliary_loss_mlp": 0.01044193, "balance_loss_clip": 1.01884961, "balance_loss_mlp": 1.01629567, "epoch": 0.6900946941229521, "flos": 23433249110400.0, "grad_norm": 2.1747924529570977, "language_loss": 0.68517017, "learning_rate": 9.255330864847313e-07, "loss": 0.70615715, "num_input_tokens_seen": 247756680, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3828125, "step": 11478, "time_per_iteration": 2.37276291847229 }, { "auxiliary_loss_clip": 0.01053824, "auxiliary_loss_mlp": 0.01046221, "balance_loss_clip": 1.02094817, "balance_loss_mlp": 1.01676917, "epoch": 0.69015481737562, "flos": 17819180110080.0, "grad_norm": 2.046820502063544, "language_loss": 0.77392203, "learning_rate": 9.252046217963843e-07, "loss": 0.79492247, "num_input_tokens_seen": 247774265, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37109375, "step": 11479, "time_per_iteration": 2.375174045562744 }, { "auxiliary_loss_clip": 0.01054657, "auxiliary_loss_mlp": 0.01040682, "balance_loss_clip": 1.0148015, "balance_loss_mlp": 1.01764238, "epoch": 0.690214940628288, "flos": 17455559633280.0, "grad_norm": 1.6176880431254026, "language_loss": 0.80087149, "learning_rate": 9.248761978643856e-07, "loss": 0.82182491, "num_input_tokens_seen": 247792395, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37109375, "step": 11480, "time_per_iteration": 2.3398263454437256 }, { "auxiliary_loss_clip": 0.01054423, "auxiliary_loss_mlp": 0.01040788, "balance_loss_clip": 1.01692259, "balance_loss_mlp": 1.01818442, "epoch": 0.6902750638809559, "flos": 29565498643200.0, "grad_norm": 2.2548350015536274, "language_loss": 0.76715171, "learning_rate": 9.245478147011885e-07, "loss": 0.78810382, "num_input_tokens_seen": 247811985, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36132812, "step": 11481, "time_per_iteration": 2.4338207244873047 }, { "auxiliary_loss_clip": 0.01052414, "auxiliary_loss_mlp": 0.01034464, "balance_loss_clip": 1.00931048, "balance_loss_mlp": 1.01629472, "epoch": 0.690335187133624, "flos": 25555933128960.0, "grad_norm": 1.7334447765601162, "language_loss": 0.70254874, "learning_rate": 9.24219472319246e-07, "loss": 0.72341752, "num_input_tokens_seen": 247831880, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36132812, "step": 11482, "time_per_iteration": 2.3930554389953613 }, { "auxiliary_loss_clip": 0.01053187, "auxiliary_loss_mlp": 0.01039956, "balance_loss_clip": 1.01532722, "balance_loss_mlp": 1.01698375, "epoch": 0.6903953103862919, "flos": 22487451845760.0, "grad_norm": 2.039627890317923, "language_loss": 0.8332603, "learning_rate": 9.238911707310096e-07, "loss": 0.85419172, "num_input_tokens_seen": 247851170, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36132812, "step": 11483, "time_per_iteration": 2.391796112060547 }, { "auxiliary_loss_clip": 0.01054048, "auxiliary_loss_mlp": 0.01036621, "balance_loss_clip": 1.01386344, "balance_loss_mlp": 1.0174222, "epoch": 0.6904554336389599, "flos": 26099426263680.0, "grad_norm": 1.7918064897953652, "language_loss": 0.66441119, "learning_rate": 9.235629099489273e-07, "loss": 0.68531787, "num_input_tokens_seen": 247868950, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.3671875, "step": 11484, "time_per_iteration": 2.3765573501586914 }, { "auxiliary_loss_clip": 0.01053184, "auxiliary_loss_mlp": 0.01039775, "balance_loss_clip": 1.01651716, "balance_loss_mlp": 1.01690745, "epoch": 0.6905155568916278, "flos": 31170525799680.0, "grad_norm": 1.5742287109196982, "language_loss": 0.74475849, "learning_rate": 9.232346899854479e-07, "loss": 0.76568806, "num_input_tokens_seen": 247889805, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36328125, "step": 11485, "time_per_iteration": 3.750086784362793 }, { "auxiliary_loss_clip": 0.01055012, "auxiliary_loss_mlp": 0.0104282, "balance_loss_clip": 1.01823914, "balance_loss_mlp": 1.01731288, "epoch": 0.6905756801442958, "flos": 17638713959040.0, "grad_norm": 1.8893143199063736, "language_loss": 0.86112505, "learning_rate": 9.22906510853017e-07, "loss": 0.88210338, "num_input_tokens_seen": 247908585, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37695312, "step": 11486, "time_per_iteration": 2.3484222888946533 }, { "auxiliary_loss_clip": 0.01051412, "auxiliary_loss_mlp": 0.01036196, "balance_loss_clip": 1.01328444, "balance_loss_mlp": 1.01514494, "epoch": 0.6906358033969637, "flos": 22342666970880.0, "grad_norm": 1.4254920715551358, "language_loss": 0.73115838, "learning_rate": 9.225783725640786e-07, "loss": 0.75203449, "num_input_tokens_seen": 247928480, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.36328125, "step": 11487, "time_per_iteration": 2.4042537212371826 }, { "auxiliary_loss_clip": 0.01009493, "auxiliary_loss_mlp": 0.01006194, "balance_loss_clip": 1.00385785, "balance_loss_mlp": 1.00211453, "epoch": 0.6906959266496318, "flos": 69744172999680.0, "grad_norm": 0.9018722559669676, "language_loss": 0.66812789, "learning_rate": 9.222502751310759e-07, "loss": 0.68828475, "num_input_tokens_seen": 247988855, "router_z_loss_clip": 0.02331543, "router_z_loss_mlp": 0.07421875, "step": 11488, "time_per_iteration": 3.038252353668213 }, { "auxiliary_loss_clip": 0.01057443, "auxiliary_loss_mlp": 0.0104217, "balance_loss_clip": 1.01552713, "balance_loss_mlp": 1.01791835, "epoch": 0.6907560499022997, "flos": 21433179386880.0, "grad_norm": 1.653110569691665, "language_loss": 0.75834525, "learning_rate": 9.219222185664519e-07, "loss": 0.77934134, "num_input_tokens_seen": 248007685, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39453125, "step": 11489, "time_per_iteration": 3.7904717922210693 }, { "auxiliary_loss_clip": 0.0105559, "auxiliary_loss_mlp": 0.01045123, "balance_loss_clip": 1.01825368, "balance_loss_mlp": 1.0171963, "epoch": 0.6908161731549677, "flos": 14391337536000.0, "grad_norm": 2.4907561228356747, "language_loss": 0.63248217, "learning_rate": 9.215942028826445e-07, "loss": 0.65348923, "num_input_tokens_seen": 248025145, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.38476562, "step": 11490, "time_per_iteration": 3.7150027751922607 }, { "auxiliary_loss_clip": 0.0105382, "auxiliary_loss_mlp": 0.01042776, "balance_loss_clip": 1.01665759, "balance_loss_mlp": 1.0170778, "epoch": 0.6908762964076357, "flos": 20009945013120.0, "grad_norm": 1.7516940389577251, "language_loss": 0.73282564, "learning_rate": 9.212662280920937e-07, "loss": 0.75379169, "num_input_tokens_seen": 248043750, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3671875, "step": 11491, "time_per_iteration": 2.3741273880004883 }, { "auxiliary_loss_clip": 0.01052346, "auxiliary_loss_mlp": 0.01039008, "balance_loss_clip": 1.01668048, "balance_loss_mlp": 1.01609099, "epoch": 0.6909364196603036, "flos": 28767767921280.0, "grad_norm": 1.3819539689568943, "language_loss": 0.71237773, "learning_rate": 9.20938294207235e-07, "loss": 0.73329127, "num_input_tokens_seen": 248065765, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.36328125, "step": 11492, "time_per_iteration": 2.431334972381592 }, { "auxiliary_loss_clip": 0.01055498, "auxiliary_loss_mlp": 0.01049607, "balance_loss_clip": 1.02370334, "balance_loss_mlp": 1.01769495, "epoch": 0.6909965429129716, "flos": 22527043194240.0, "grad_norm": 1.935535657016633, "language_loss": 0.7581138, "learning_rate": 9.206104012405049e-07, "loss": 0.77916485, "num_input_tokens_seen": 248083810, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37890625, "step": 11493, "time_per_iteration": 2.3616623878479004 }, { "auxiliary_loss_clip": 0.0105228, "auxiliary_loss_mlp": 0.01037533, "balance_loss_clip": 1.01376247, "balance_loss_mlp": 1.0167563, "epoch": 0.6910566661656395, "flos": 18404952768000.0, "grad_norm": 1.9672020221519684, "language_loss": 0.75939214, "learning_rate": 9.20282549204336e-07, "loss": 0.78029025, "num_input_tokens_seen": 248103185, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35546875, "step": 11494, "time_per_iteration": 2.364143133163452 }, { "auxiliary_loss_clip": 0.01053, "auxiliary_loss_mlp": 0.01036056, "balance_loss_clip": 1.01245224, "balance_loss_mlp": 1.01648808, "epoch": 0.6911167894183076, "flos": 30772655412480.0, "grad_norm": 1.7676081989034769, "language_loss": 0.69207048, "learning_rate": 9.19954738111161e-07, "loss": 0.71296108, "num_input_tokens_seen": 248125665, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36523438, "step": 11495, "time_per_iteration": 2.4957523345947266 }, { "auxiliary_loss_clip": 0.01052623, "auxiliary_loss_mlp": 0.01038527, "balance_loss_clip": 1.01375484, "balance_loss_mlp": 1.01528084, "epoch": 0.6911769126709755, "flos": 13734865641600.0, "grad_norm": 1.7191030994691852, "language_loss": 0.75231993, "learning_rate": 9.196269679734119e-07, "loss": 0.77323145, "num_input_tokens_seen": 248142545, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.375, "step": 11496, "time_per_iteration": 2.469289779663086 }, { "auxiliary_loss_clip": 0.01051543, "auxiliary_loss_mlp": 0.0104028, "balance_loss_clip": 1.016891, "balance_loss_mlp": 1.01595998, "epoch": 0.6912370359236435, "flos": 17565885129600.0, "grad_norm": 1.6227945126849885, "language_loss": 0.81131971, "learning_rate": 9.19299238803515e-07, "loss": 0.83223796, "num_input_tokens_seen": 248160225, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35546875, "step": 11497, "time_per_iteration": 2.328131914138794 }, { "auxiliary_loss_clip": 0.01054471, "auxiliary_loss_mlp": 0.0104146, "balance_loss_clip": 1.01542485, "balance_loss_mlp": 1.01655626, "epoch": 0.6912971591763114, "flos": 22089686192640.0, "grad_norm": 1.6770496565923623, "language_loss": 0.81315827, "learning_rate": 9.189715506138993e-07, "loss": 0.83411759, "num_input_tokens_seen": 248180430, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37890625, "step": 11498, "time_per_iteration": 2.428844451904297 }, { "auxiliary_loss_clip": 0.01050609, "auxiliary_loss_mlp": 0.01038858, "balance_loss_clip": 1.0162437, "balance_loss_mlp": 1.01497531, "epoch": 0.6913572824289794, "flos": 29970176745600.0, "grad_norm": 1.483604674078266, "language_loss": 0.86791927, "learning_rate": 9.186439034169915e-07, "loss": 0.88881397, "num_input_tokens_seen": 248202365, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35546875, "step": 11499, "time_per_iteration": 2.521315813064575 }, { "auxiliary_loss_clip": 0.01051259, "auxiliary_loss_mlp": 0.01037286, "balance_loss_clip": 1.01458895, "balance_loss_mlp": 1.01607227, "epoch": 0.6914174056816473, "flos": 20447895507840.0, "grad_norm": 1.627785158530618, "language_loss": 0.76495326, "learning_rate": 9.183162972252145e-07, "loss": 0.78583872, "num_input_tokens_seen": 248221750, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 11500, "time_per_iteration": 2.462388277053833 }, { "auxiliary_loss_clip": 0.01054725, "auxiliary_loss_mlp": 0.01048852, "balance_loss_clip": 1.02331758, "balance_loss_mlp": 1.01672268, "epoch": 0.6914775289343154, "flos": 21281621708160.0, "grad_norm": 1.7821653587911734, "language_loss": 0.78537363, "learning_rate": 9.179887320509921e-07, "loss": 0.80640936, "num_input_tokens_seen": 248239535, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37890625, "step": 11501, "time_per_iteration": 3.768042802810669 }, { "auxiliary_loss_clip": 0.01054586, "auxiliary_loss_mlp": 0.01039923, "balance_loss_clip": 1.01500821, "balance_loss_mlp": 1.01694179, "epoch": 0.6915376521869833, "flos": 23876994890880.0, "grad_norm": 1.77767064320417, "language_loss": 0.74992776, "learning_rate": 9.176612079067458e-07, "loss": 0.77087283, "num_input_tokens_seen": 248259055, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.375, "step": 11502, "time_per_iteration": 2.37093186378479 }, { "auxiliary_loss_clip": 0.01056306, "auxiliary_loss_mlp": 0.01039321, "balance_loss_clip": 1.01303554, "balance_loss_mlp": 1.01754487, "epoch": 0.6915977754396513, "flos": 11509466803200.0, "grad_norm": 1.8528140598975427, "language_loss": 0.75759959, "learning_rate": 9.173337248048953e-07, "loss": 0.77855581, "num_input_tokens_seen": 248276765, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.38867188, "step": 11503, "time_per_iteration": 2.3967771530151367 }, { "auxiliary_loss_clip": 0.01053452, "auxiliary_loss_mlp": 0.0104107, "balance_loss_clip": 1.01636994, "balance_loss_mlp": 1.01668644, "epoch": 0.6916578986923193, "flos": 22600186225920.0, "grad_norm": 1.6678818941826938, "language_loss": 0.78282362, "learning_rate": 9.170062827578575e-07, "loss": 0.80376887, "num_input_tokens_seen": 248295310, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3671875, "step": 11504, "time_per_iteration": 2.428076982498169 }, { "auxiliary_loss_clip": 0.01052478, "auxiliary_loss_mlp": 0.0103734, "balance_loss_clip": 1.01421356, "balance_loss_mlp": 1.01629591, "epoch": 0.6917180219449872, "flos": 23476226860800.0, "grad_norm": 1.6545849003663962, "language_loss": 0.74881482, "learning_rate": 9.166788817780499e-07, "loss": 0.76971304, "num_input_tokens_seen": 248315230, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.36132812, "step": 11505, "time_per_iteration": 2.5259311199188232 }, { "auxiliary_loss_clip": 0.01053395, "auxiliary_loss_mlp": 0.01037702, "balance_loss_clip": 1.0127511, "balance_loss_mlp": 1.01667619, "epoch": 0.6917781451976552, "flos": 23731407054720.0, "grad_norm": 1.8181830886906272, "language_loss": 0.89358616, "learning_rate": 9.163515218778886e-07, "loss": 0.91449714, "num_input_tokens_seen": 248332980, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3671875, "step": 11506, "time_per_iteration": 2.376441478729248 }, { "auxiliary_loss_clip": 0.01052915, "auxiliary_loss_mlp": 0.01034566, "balance_loss_clip": 1.01042604, "balance_loss_mlp": 1.01644421, "epoch": 0.6918382684503231, "flos": 31465436987520.0, "grad_norm": 2.0782644769115373, "language_loss": 0.7204963, "learning_rate": 9.160242030697856e-07, "loss": 0.7413711, "num_input_tokens_seen": 248352865, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36523438, "step": 11507, "time_per_iteration": 2.485802412033081 }, { "auxiliary_loss_clip": 0.01054546, "auxiliary_loss_mlp": 0.01038628, "balance_loss_clip": 1.01342702, "balance_loss_mlp": 1.0162077, "epoch": 0.6918983917029912, "flos": 21649466459520.0, "grad_norm": 1.912275326820313, "language_loss": 0.77865887, "learning_rate": 9.156969253661538e-07, "loss": 0.79959059, "num_input_tokens_seen": 248371125, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38476562, "step": 11508, "time_per_iteration": 2.375572681427002 }, { "auxiliary_loss_clip": 0.01050488, "auxiliary_loss_mlp": 0.01039996, "balance_loss_clip": 1.01963544, "balance_loss_mlp": 1.01649427, "epoch": 0.6919585149556591, "flos": 25549090502400.0, "grad_norm": 1.6245268004366946, "language_loss": 0.75629115, "learning_rate": 9.153696887794027e-07, "loss": 0.77719599, "num_input_tokens_seen": 248390455, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.33984375, "step": 11509, "time_per_iteration": 2.4217801094055176 }, { "auxiliary_loss_clip": 0.01054651, "auxiliary_loss_mlp": 0.01041606, "balance_loss_clip": 1.01745462, "balance_loss_mlp": 1.01814604, "epoch": 0.6920186382083271, "flos": 23658648048000.0, "grad_norm": 2.163997069355503, "language_loss": 0.65747499, "learning_rate": 9.150424933219425e-07, "loss": 0.67843759, "num_input_tokens_seen": 248411305, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36523438, "step": 11510, "time_per_iteration": 2.387324810028076 }, { "auxiliary_loss_clip": 0.01056841, "auxiliary_loss_mlp": 0.01043709, "balance_loss_clip": 1.01651788, "balance_loss_mlp": 1.01812196, "epoch": 0.692078761460995, "flos": 19060970814720.0, "grad_norm": 1.760363731643227, "language_loss": 0.7659409, "learning_rate": 9.147153390061788e-07, "loss": 0.78694636, "num_input_tokens_seen": 248430190, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.38671875, "step": 11511, "time_per_iteration": 2.346987247467041 }, { "auxiliary_loss_clip": 0.01051843, "auxiliary_loss_mlp": 0.01041169, "balance_loss_clip": 1.01898444, "balance_loss_mlp": 1.0159061, "epoch": 0.692138884713663, "flos": 29022005508480.0, "grad_norm": 1.523499138118957, "language_loss": 0.63514161, "learning_rate": 9.143882258445184e-07, "loss": 0.65607172, "num_input_tokens_seen": 248450830, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.359375, "step": 11512, "time_per_iteration": 2.4706127643585205 }, { "auxiliary_loss_clip": 0.01054457, "auxiliary_loss_mlp": 0.01045983, "balance_loss_clip": 1.01976883, "balance_loss_mlp": 1.01604617, "epoch": 0.6921990079663309, "flos": 14756947960320.0, "grad_norm": 1.9557598606053521, "language_loss": 0.84036356, "learning_rate": 9.140611538493666e-07, "loss": 0.86136794, "num_input_tokens_seen": 248468585, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3828125, "step": 11513, "time_per_iteration": 2.344864845275879 }, { "auxiliary_loss_clip": 0.01053053, "auxiliary_loss_mlp": 0.01034922, "balance_loss_clip": 1.01216507, "balance_loss_mlp": 1.01698565, "epoch": 0.692259131218999, "flos": 23840720121600.0, "grad_norm": 1.452809205272252, "language_loss": 0.78952283, "learning_rate": 9.137341230331233e-07, "loss": 0.81040263, "num_input_tokens_seen": 248490535, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.36132812, "step": 11514, "time_per_iteration": 2.398527145385742 }, { "auxiliary_loss_clip": 0.01055194, "auxiliary_loss_mlp": 0.01040179, "balance_loss_clip": 1.01545501, "balance_loss_mlp": 1.01720011, "epoch": 0.6923192544716669, "flos": 19134078935040.0, "grad_norm": 1.7809348444448574, "language_loss": 0.75921881, "learning_rate": 9.134071334081907e-07, "loss": 0.78017253, "num_input_tokens_seen": 248508575, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37890625, "step": 11515, "time_per_iteration": 2.3349075317382812 }, { "auxiliary_loss_clip": 0.01051319, "auxiliary_loss_mlp": 0.01041984, "balance_loss_clip": 1.01941752, "balance_loss_mlp": 1.0165689, "epoch": 0.6923793777243349, "flos": 28073380423680.0, "grad_norm": 1.8845986350118744, "language_loss": 0.54892719, "learning_rate": 9.130801849869694e-07, "loss": 0.56986022, "num_input_tokens_seen": 248527025, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34765625, "step": 11516, "time_per_iteration": 2.4056625366210938 }, { "auxiliary_loss_clip": 0.01051409, "auxiliary_loss_mlp": 0.01037388, "balance_loss_clip": 1.01309347, "balance_loss_mlp": 1.0159229, "epoch": 0.6924395009770029, "flos": 16580321959680.0, "grad_norm": 1.9989449485315323, "language_loss": 0.74210232, "learning_rate": 9.127532777818557e-07, "loss": 0.7629903, "num_input_tokens_seen": 248544275, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.35546875, "step": 11517, "time_per_iteration": 2.3381876945495605 }, { "auxiliary_loss_clip": 0.01054126, "auxiliary_loss_mlp": 0.01039653, "balance_loss_clip": 1.01411867, "balance_loss_mlp": 1.01634848, "epoch": 0.6924996242296708, "flos": 16654337775360.0, "grad_norm": 1.8207956481489977, "language_loss": 0.76904309, "learning_rate": 9.124264118052465e-07, "loss": 0.78998089, "num_input_tokens_seen": 248561870, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37695312, "step": 11518, "time_per_iteration": 2.346539258956909 }, { "auxiliary_loss_clip": 0.0105545, "auxiliary_loss_mlp": 0.01039507, "balance_loss_clip": 1.01126647, "balance_loss_mlp": 1.01686549, "epoch": 0.6925597474823388, "flos": 34752649138560.0, "grad_norm": 1.3639793011519266, "language_loss": 0.65606904, "learning_rate": 9.120995870695376e-07, "loss": 0.67701864, "num_input_tokens_seen": 248588190, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.38671875, "step": 11519, "time_per_iteration": 2.4963490962982178 }, { "auxiliary_loss_clip": 0.01052634, "auxiliary_loss_mlp": 0.01036745, "balance_loss_clip": 1.01323676, "balance_loss_mlp": 1.01595378, "epoch": 0.6926198707350067, "flos": 21870641122560.0, "grad_norm": 2.4054172443288366, "language_loss": 0.6390357, "learning_rate": 9.117728035871212e-07, "loss": 0.65992945, "num_input_tokens_seen": 248606460, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.3671875, "step": 11520, "time_per_iteration": 2.3828723430633545 }, { "auxiliary_loss_clip": 0.01057008, "auxiliary_loss_mlp": 0.01042111, "balance_loss_clip": 1.01429987, "balance_loss_mlp": 1.01704192, "epoch": 0.6926799939876748, "flos": 13005425272320.0, "grad_norm": 2.0515745868003243, "language_loss": 0.79133517, "learning_rate": 9.114460613703887e-07, "loss": 0.81232637, "num_input_tokens_seen": 248623715, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.3984375, "step": 11521, "time_per_iteration": 2.3506743907928467 }, { "auxiliary_loss_clip": 0.01055366, "auxiliary_loss_mlp": 0.01040596, "balance_loss_clip": 1.01386881, "balance_loss_mlp": 1.01643813, "epoch": 0.6927401172403427, "flos": 16760369174400.0, "grad_norm": 1.7679972627436769, "language_loss": 0.83077717, "learning_rate": 9.111193604317304e-07, "loss": 0.85173678, "num_input_tokens_seen": 248640575, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.390625, "step": 11522, "time_per_iteration": 2.3741588592529297 }, { "auxiliary_loss_clip": 0.01053924, "auxiliary_loss_mlp": 0.01039479, "balance_loss_clip": 1.01587558, "balance_loss_mlp": 1.01708961, "epoch": 0.6928002404930107, "flos": 25704383696640.0, "grad_norm": 1.3406284512889908, "language_loss": 0.77326965, "learning_rate": 9.107927007835361e-07, "loss": 0.79420364, "num_input_tokens_seen": 248663535, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3671875, "step": 11523, "time_per_iteration": 2.483132839202881 }, { "auxiliary_loss_clip": 0.01052467, "auxiliary_loss_mlp": 0.01041037, "balance_loss_clip": 1.01829147, "balance_loss_mlp": 1.01713181, "epoch": 0.6928603637456786, "flos": 18587269221120.0, "grad_norm": 2.1678184695236538, "language_loss": 0.69566643, "learning_rate": 9.104660824381915e-07, "loss": 0.71660143, "num_input_tokens_seen": 248681125, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35351562, "step": 11524, "time_per_iteration": 2.474879503250122 }, { "auxiliary_loss_clip": 0.01055548, "auxiliary_loss_mlp": 0.01038192, "balance_loss_clip": 1.01289606, "balance_loss_mlp": 1.01779008, "epoch": 0.6929204869983466, "flos": 22199767309440.0, "grad_norm": 1.7672332924600918, "language_loss": 0.65489113, "learning_rate": 9.101395054080815e-07, "loss": 0.67582858, "num_input_tokens_seen": 248700555, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37695312, "step": 11525, "time_per_iteration": 3.6500043869018555 }, { "auxiliary_loss_clip": 0.0105408, "auxiliary_loss_mlp": 0.01037754, "balance_loss_clip": 1.01334, "balance_loss_mlp": 1.01769793, "epoch": 0.6929806102510145, "flos": 17893789418880.0, "grad_norm": 2.267936681730246, "language_loss": 0.7126348, "learning_rate": 9.098129697055907e-07, "loss": 0.73355317, "num_input_tokens_seen": 248716095, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36328125, "step": 11526, "time_per_iteration": 2.3763978481292725 }, { "auxiliary_loss_clip": 0.01052861, "auxiliary_loss_mlp": 0.0103686, "balance_loss_clip": 1.01336336, "balance_loss_mlp": 1.0169102, "epoch": 0.6930407335036826, "flos": 19754171326080.0, "grad_norm": 1.6154588566355925, "language_loss": 0.77333808, "learning_rate": 9.094864753431022e-07, "loss": 0.79423529, "num_input_tokens_seen": 248735330, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.359375, "step": 11527, "time_per_iteration": 2.381542444229126 }, { "auxiliary_loss_clip": 0.01050725, "auxiliary_loss_mlp": 0.01034845, "balance_loss_clip": 1.01161134, "balance_loss_mlp": 1.01527357, "epoch": 0.6931008567563505, "flos": 21543155769600.0, "grad_norm": 3.4644728567690333, "language_loss": 0.80603522, "learning_rate": 9.091600223329952e-07, "loss": 0.82689095, "num_input_tokens_seen": 248754530, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35351562, "step": 11528, "time_per_iteration": 2.3859262466430664 }, { "auxiliary_loss_clip": 0.01049192, "auxiliary_loss_mlp": 0.01034036, "balance_loss_clip": 1.01237535, "balance_loss_mlp": 1.01513124, "epoch": 0.6931609800090185, "flos": 26248819438080.0, "grad_norm": 1.3403837894507182, "language_loss": 0.76606548, "learning_rate": 9.088336106876491e-07, "loss": 0.78689772, "num_input_tokens_seen": 248775825, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.33984375, "step": 11529, "time_per_iteration": 5.276731252670288 }, { "auxiliary_loss_clip": 0.0105262, "auxiliary_loss_mlp": 0.01042602, "balance_loss_clip": 1.01794946, "balance_loss_mlp": 1.01648974, "epoch": 0.6932211032616865, "flos": 32342001292800.0, "grad_norm": 1.5733469137996845, "language_loss": 0.73508871, "learning_rate": 9.085072404194436e-07, "loss": 0.75604093, "num_input_tokens_seen": 248796180, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36132812, "step": 11530, "time_per_iteration": 2.446352243423462 }, { "auxiliary_loss_clip": 0.01057251, "auxiliary_loss_mlp": 0.01046355, "balance_loss_clip": 1.01758933, "balance_loss_mlp": 1.0170449, "epoch": 0.6932812265143544, "flos": 22048139808000.0, "grad_norm": 1.863708378601082, "language_loss": 0.79226214, "learning_rate": 9.081809115407513e-07, "loss": 0.81329823, "num_input_tokens_seen": 248814735, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.40234375, "step": 11531, "time_per_iteration": 2.374729871749878 }, { "auxiliary_loss_clip": 0.0105049, "auxiliary_loss_mlp": 0.01036141, "balance_loss_clip": 1.0128237, "balance_loss_mlp": 1.01504636, "epoch": 0.6933413497670224, "flos": 26255243128320.0, "grad_norm": 1.4978426653986063, "language_loss": 0.70222282, "learning_rate": 9.078546240639484e-07, "loss": 0.7230891, "num_input_tokens_seen": 248839140, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35546875, "step": 11532, "time_per_iteration": 2.444790840148926 }, { "auxiliary_loss_clip": 0.01053336, "auxiliary_loss_mlp": 0.01041588, "balance_loss_clip": 1.01686406, "balance_loss_mlp": 1.01600349, "epoch": 0.6934014730196904, "flos": 19571994518400.0, "grad_norm": 1.447808405394924, "language_loss": 0.67992598, "learning_rate": 9.075283780014082e-07, "loss": 0.70087522, "num_input_tokens_seen": 248858300, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.37304688, "step": 11533, "time_per_iteration": 2.3924949169158936 }, { "auxiliary_loss_clip": 0.01054217, "auxiliary_loss_mlp": 0.01042668, "balance_loss_clip": 1.01622772, "balance_loss_mlp": 1.01632917, "epoch": 0.6934615962723584, "flos": 22118385196800.0, "grad_norm": 2.6109034263162325, "language_loss": 0.59700918, "learning_rate": 9.072021733655007e-07, "loss": 0.61797804, "num_input_tokens_seen": 248876310, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.37890625, "step": 11534, "time_per_iteration": 2.3471391201019287 }, { "auxiliary_loss_clip": 0.01052942, "auxiliary_loss_mlp": 0.01037827, "balance_loss_clip": 1.01397324, "balance_loss_mlp": 1.01566362, "epoch": 0.6935217195250263, "flos": 21359757064320.0, "grad_norm": 2.075226532966578, "language_loss": 0.73003173, "learning_rate": 9.068760101685971e-07, "loss": 0.75093937, "num_input_tokens_seen": 248895650, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37304688, "step": 11535, "time_per_iteration": 2.379699945449829 }, { "auxiliary_loss_clip": 0.01008821, "auxiliary_loss_mlp": 0.0100352, "balance_loss_clip": 1.00102818, "balance_loss_mlp": 1.00156891, "epoch": 0.6935818427776943, "flos": 64060137901440.0, "grad_norm": 0.7154913137288637, "language_loss": 0.59215784, "learning_rate": 9.065498884230638e-07, "loss": 0.61228132, "num_input_tokens_seen": 248963920, "router_z_loss_clip": 0.02490234, "router_z_loss_mlp": 0.07226562, "step": 11536, "time_per_iteration": 3.109936237335205 }, { "auxiliary_loss_clip": 0.01055531, "auxiliary_loss_mlp": 0.01041966, "balance_loss_clip": 1.01720643, "balance_loss_mlp": 1.01692808, "epoch": 0.6936419660303622, "flos": 20301539621760.0, "grad_norm": 2.1399357195898716, "language_loss": 0.74490905, "learning_rate": 9.062238081412692e-07, "loss": 0.76588404, "num_input_tokens_seen": 248983380, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38671875, "step": 11537, "time_per_iteration": 2.3765084743499756 }, { "auxiliary_loss_clip": 0.01008346, "auxiliary_loss_mlp": 0.01006957, "balance_loss_clip": 1.00451326, "balance_loss_mlp": 1.00139725, "epoch": 0.6937020892830302, "flos": 67179349123200.0, "grad_norm": 0.7493355353136424, "language_loss": 0.55623138, "learning_rate": 9.058977693355767e-07, "loss": 0.57638437, "num_input_tokens_seen": 249044680, "router_z_loss_clip": 0.02441406, "router_z_loss_mlp": 0.06933594, "step": 11538, "time_per_iteration": 3.033482313156128 }, { "auxiliary_loss_clip": 0.01050248, "auxiliary_loss_mlp": 0.01034023, "balance_loss_clip": 1.01324463, "balance_loss_mlp": 1.0157305, "epoch": 0.6937622125356981, "flos": 23877064713600.0, "grad_norm": 1.5646476447307522, "language_loss": 0.78478098, "learning_rate": 9.055717720183505e-07, "loss": 0.80562365, "num_input_tokens_seen": 249061060, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.34570312, "step": 11539, "time_per_iteration": 2.3856000900268555 }, { "auxiliary_loss_clip": 0.01053166, "auxiliary_loss_mlp": 0.01036516, "balance_loss_clip": 1.01309144, "balance_loss_mlp": 1.01686406, "epoch": 0.6938223357883662, "flos": 28729363559040.0, "grad_norm": 1.7575681836264927, "language_loss": 0.65471381, "learning_rate": 9.05245816201953e-07, "loss": 0.67561066, "num_input_tokens_seen": 249081430, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36328125, "step": 11540, "time_per_iteration": 3.8264381885528564 }, { "auxiliary_loss_clip": 0.01051326, "auxiliary_loss_mlp": 0.01041339, "balance_loss_clip": 1.01989365, "balance_loss_mlp": 1.01571155, "epoch": 0.6938824590410341, "flos": 28653846554880.0, "grad_norm": 1.3814787782319071, "language_loss": 0.87313873, "learning_rate": 9.049199018987437e-07, "loss": 0.89406538, "num_input_tokens_seen": 249103020, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.35546875, "step": 11541, "time_per_iteration": 2.417262077331543 }, { "auxiliary_loss_clip": 0.01052681, "auxiliary_loss_mlp": 0.01036666, "balance_loss_clip": 1.01262128, "balance_loss_mlp": 1.01689243, "epoch": 0.6939425822937021, "flos": 18982241965440.0, "grad_norm": 4.103317544760109, "language_loss": 0.84892511, "learning_rate": 9.04594029121081e-07, "loss": 0.86981857, "num_input_tokens_seen": 249120810, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.359375, "step": 11542, "time_per_iteration": 2.362509250640869 }, { "auxiliary_loss_clip": 0.01053471, "auxiliary_loss_mlp": 0.01035282, "balance_loss_clip": 1.01004541, "balance_loss_mlp": 1.01608729, "epoch": 0.6940027055463701, "flos": 23074725692160.0, "grad_norm": 2.1140565481878033, "language_loss": 0.76116383, "learning_rate": 9.04268197881323e-07, "loss": 0.78205138, "num_input_tokens_seen": 249138050, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.375, "step": 11543, "time_per_iteration": 2.365304470062256 }, { "auxiliary_loss_clip": 0.01051256, "auxiliary_loss_mlp": 0.01039654, "balance_loss_clip": 1.01755285, "balance_loss_mlp": 1.01562309, "epoch": 0.694062828799038, "flos": 18185593495680.0, "grad_norm": 1.6184320028998964, "language_loss": 0.7710138, "learning_rate": 9.039424081918241e-07, "loss": 0.79192287, "num_input_tokens_seen": 249155570, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35546875, "step": 11544, "time_per_iteration": 2.384392499923706 }, { "auxiliary_loss_clip": 0.01055462, "auxiliary_loss_mlp": 0.01039511, "balance_loss_clip": 1.01515675, "balance_loss_mlp": 1.01751649, "epoch": 0.694122952051706, "flos": 17820576564480.0, "grad_norm": 2.2194374601727658, "language_loss": 0.73188519, "learning_rate": 9.036166600649388e-07, "loss": 0.75283492, "num_input_tokens_seen": 249172960, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.38085938, "step": 11545, "time_per_iteration": 2.3280136585235596 }, { "auxiliary_loss_clip": 0.01050396, "auxiliary_loss_mlp": 0.01029578, "balance_loss_clip": 1.00915742, "balance_loss_mlp": 1.01589465, "epoch": 0.694183075304374, "flos": 21214239050880.0, "grad_norm": 1.519953841412578, "language_loss": 0.80632961, "learning_rate": 9.0329095351302e-07, "loss": 0.82712924, "num_input_tokens_seen": 249192450, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.34570312, "step": 11546, "time_per_iteration": 2.3915302753448486 }, { "auxiliary_loss_clip": 0.01052365, "auxiliary_loss_mlp": 0.010389, "balance_loss_clip": 1.01622605, "balance_loss_mlp": 1.01610088, "epoch": 0.694243198557042, "flos": 24059381166720.0, "grad_norm": 1.4332717864269475, "language_loss": 0.79557991, "learning_rate": 9.029652885484194e-07, "loss": 0.81649256, "num_input_tokens_seen": 249214320, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36328125, "step": 11547, "time_per_iteration": 2.487957000732422 }, { "auxiliary_loss_clip": 0.0105308, "auxiliary_loss_mlp": 0.01044844, "balance_loss_clip": 1.02033448, "balance_loss_mlp": 1.01681316, "epoch": 0.6943033218097099, "flos": 21140816728320.0, "grad_norm": 2.4564028622434995, "language_loss": 0.82227969, "learning_rate": 9.026396651834834e-07, "loss": 0.84325898, "num_input_tokens_seen": 249230925, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36132812, "step": 11548, "time_per_iteration": 2.39040470123291 }, { "auxiliary_loss_clip": 0.01007973, "auxiliary_loss_mlp": 0.01007907, "balance_loss_clip": 1.00542748, "balance_loss_mlp": 1.00102448, "epoch": 0.6943634450623779, "flos": 57808869943680.0, "grad_norm": 0.6983198094806087, "language_loss": 0.53850627, "learning_rate": 9.023140834305613e-07, "loss": 0.55866516, "num_input_tokens_seen": 249293975, "router_z_loss_clip": 0.02478027, "router_z_loss_mlp": 0.06933594, "step": 11549, "time_per_iteration": 3.0069692134857178 }, { "auxiliary_loss_clip": 0.01052708, "auxiliary_loss_mlp": 0.01038739, "balance_loss_clip": 1.01393104, "balance_loss_mlp": 1.01535416, "epoch": 0.6944235683150458, "flos": 30589396352640.0, "grad_norm": 1.412146031347397, "language_loss": 0.74335611, "learning_rate": 9.01988543302e-07, "loss": 0.76427054, "num_input_tokens_seen": 249315285, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37304688, "step": 11550, "time_per_iteration": 2.459702730178833 }, { "auxiliary_loss_clip": 0.01054823, "auxiliary_loss_mlp": 0.01039934, "balance_loss_clip": 1.0152458, "balance_loss_mlp": 1.01747942, "epoch": 0.6944836915677138, "flos": 19718420227200.0, "grad_norm": 1.89412770928752, "language_loss": 0.74995404, "learning_rate": 9.016630448101425e-07, "loss": 0.77090168, "num_input_tokens_seen": 249333505, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37304688, "step": 11551, "time_per_iteration": 2.349748134613037 }, { "auxiliary_loss_clip": 0.01052402, "auxiliary_loss_mlp": 0.01042071, "balance_loss_clip": 1.01909924, "balance_loss_mlp": 1.01697409, "epoch": 0.6945438148203817, "flos": 24862418415360.0, "grad_norm": 1.5626797211838361, "language_loss": 0.85417622, "learning_rate": 9.01337587967333e-07, "loss": 0.875121, "num_input_tokens_seen": 249354180, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35546875, "step": 11552, "time_per_iteration": 2.426356077194214 }, { "auxiliary_loss_clip": 0.01052849, "auxiliary_loss_mlp": 0.01043526, "balance_loss_clip": 1.02093542, "balance_loss_mlp": 1.01709199, "epoch": 0.6946039380730498, "flos": 33325295224320.0, "grad_norm": 1.5285701443228237, "language_loss": 0.68160129, "learning_rate": 9.010121727859117e-07, "loss": 0.70256501, "num_input_tokens_seen": 249377035, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35742188, "step": 11553, "time_per_iteration": 2.459977388381958 }, { "auxiliary_loss_clip": 0.0105463, "auxiliary_loss_mlp": 0.01042969, "balance_loss_clip": 1.01605153, "balance_loss_mlp": 1.01690841, "epoch": 0.6946640613257177, "flos": 20849885435520.0, "grad_norm": 1.857417230654762, "language_loss": 0.80495417, "learning_rate": 9.006867992782195e-07, "loss": 0.82593012, "num_input_tokens_seen": 249396155, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.37695312, "step": 11554, "time_per_iteration": 2.4154860973358154 }, { "auxiliary_loss_clip": 0.01054613, "auxiliary_loss_mlp": 0.01039829, "balance_loss_clip": 1.01535511, "balance_loss_mlp": 1.01688063, "epoch": 0.6947241845783857, "flos": 19353822232320.0, "grad_norm": 2.0491083285994507, "language_loss": 0.73560435, "learning_rate": 9.003614674565934e-07, "loss": 0.75654876, "num_input_tokens_seen": 249414555, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.37695312, "step": 11555, "time_per_iteration": 2.3440942764282227 }, { "auxiliary_loss_clip": 0.0105274, "auxiliary_loss_mlp": 0.01036236, "balance_loss_clip": 1.0122273, "balance_loss_mlp": 1.01703429, "epoch": 0.6947843078310536, "flos": 27119169521280.0, "grad_norm": 1.707371308402884, "language_loss": 0.78913736, "learning_rate": 9.000361773333705e-07, "loss": 0.81002712, "num_input_tokens_seen": 249433570, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.35546875, "step": 11556, "time_per_iteration": 2.5135788917541504 }, { "auxiliary_loss_clip": 0.01053462, "auxiliary_loss_mlp": 0.01036169, "balance_loss_clip": 1.01313818, "balance_loss_mlp": 1.01634407, "epoch": 0.6948444310837216, "flos": 28583845545600.0, "grad_norm": 3.6184923015996877, "language_loss": 0.62668383, "learning_rate": 8.997109289208869e-07, "loss": 0.64758015, "num_input_tokens_seen": 249453735, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.37109375, "step": 11557, "time_per_iteration": 2.4196648597717285 }, { "auxiliary_loss_clip": 0.0105175, "auxiliary_loss_mlp": 0.0104093, "balance_loss_clip": 1.01731515, "balance_loss_mlp": 1.01612246, "epoch": 0.6949045543363896, "flos": 15668355669120.0, "grad_norm": 2.8992106941674667, "language_loss": 0.86499155, "learning_rate": 8.993857222314752e-07, "loss": 0.88591832, "num_input_tokens_seen": 249470805, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35546875, "step": 11558, "time_per_iteration": 2.3283145427703857 }, { "auxiliary_loss_clip": 0.01054136, "auxiliary_loss_mlp": 0.0103558, "balance_loss_clip": 1.01217914, "balance_loss_mlp": 1.01690412, "epoch": 0.6949646775890576, "flos": 23258264042880.0, "grad_norm": 1.467116628542879, "language_loss": 0.7149756, "learning_rate": 8.990605572774664e-07, "loss": 0.73587275, "num_input_tokens_seen": 249491150, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.37109375, "step": 11559, "time_per_iteration": 2.3957700729370117 }, { "auxiliary_loss_clip": 0.0105213, "auxiliary_loss_mlp": 0.01039604, "balance_loss_clip": 1.01778865, "balance_loss_mlp": 1.01646686, "epoch": 0.6950248008417256, "flos": 22381455358080.0, "grad_norm": 2.015518713171025, "language_loss": 0.80035937, "learning_rate": 8.987354340711921e-07, "loss": 0.82127678, "num_input_tokens_seen": 249511560, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.35546875, "step": 11560, "time_per_iteration": 2.3796913623809814 }, { "auxiliary_loss_clip": 0.01051527, "auxiliary_loss_mlp": 0.01040502, "balance_loss_clip": 1.01849639, "balance_loss_mlp": 1.01674032, "epoch": 0.6950849240943935, "flos": 23476226860800.0, "grad_norm": 1.6898196034068835, "language_loss": 0.77923167, "learning_rate": 8.9841035262498e-07, "loss": 0.80015194, "num_input_tokens_seen": 249531910, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34765625, "step": 11561, "time_per_iteration": 2.397343158721924 }, { "auxiliary_loss_clip": 0.01051078, "auxiliary_loss_mlp": 0.01043588, "balance_loss_clip": 1.01620603, "balance_loss_mlp": 1.01472545, "epoch": 0.6951450473470615, "flos": 17419599066240.0, "grad_norm": 2.3780493968595393, "language_loss": 0.79800677, "learning_rate": 8.980853129511577e-07, "loss": 0.81895345, "num_input_tokens_seen": 249550300, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.36328125, "step": 11562, "time_per_iteration": 2.3227412700653076 }, { "auxiliary_loss_clip": 0.01053656, "auxiliary_loss_mlp": 0.01039077, "balance_loss_clip": 1.01506829, "balance_loss_mlp": 1.01640046, "epoch": 0.6952051705997294, "flos": 20484693947520.0, "grad_norm": 1.9896241968635169, "language_loss": 0.70326614, "learning_rate": 8.977603150620515e-07, "loss": 0.72419339, "num_input_tokens_seen": 249567740, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37304688, "step": 11563, "time_per_iteration": 2.3537721633911133 }, { "auxiliary_loss_clip": 0.0104889, "auxiliary_loss_mlp": 0.01035051, "balance_loss_clip": 1.01318765, "balance_loss_mlp": 1.01470327, "epoch": 0.6952652938523974, "flos": 13988719203840.0, "grad_norm": 2.483716085772271, "language_loss": 0.75223881, "learning_rate": 8.974353589699846e-07, "loss": 0.77307826, "num_input_tokens_seen": 249582700, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34179688, "step": 11564, "time_per_iteration": 2.3247358798980713 }, { "auxiliary_loss_clip": 0.01060664, "auxiliary_loss_mlp": 0.01044232, "balance_loss_clip": 1.01452541, "balance_loss_mlp": 1.01928401, "epoch": 0.6953254171050653, "flos": 30952702627200.0, "grad_norm": 2.1352724731006476, "language_loss": 0.74116719, "learning_rate": 8.971104446872785e-07, "loss": 0.76221615, "num_input_tokens_seen": 249602920, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.4140625, "step": 11565, "time_per_iteration": 3.739166736602783 }, { "auxiliary_loss_clip": 0.01008751, "auxiliary_loss_mlp": 0.01005758, "balance_loss_clip": 1.00339794, "balance_loss_mlp": 1.00162244, "epoch": 0.6953855403577334, "flos": 61667261804160.0, "grad_norm": 0.8967487992596903, "language_loss": 0.58546269, "learning_rate": 8.96785572226255e-07, "loss": 0.60560775, "num_input_tokens_seen": 249660400, "router_z_loss_clip": 0.02355957, "router_z_loss_mlp": 0.07128906, "step": 11566, "time_per_iteration": 2.8646254539489746 }, { "auxiliary_loss_clip": 0.01054958, "auxiliary_loss_mlp": 0.01043455, "balance_loss_clip": 1.01700282, "balance_loss_mlp": 1.01521158, "epoch": 0.6954456636104013, "flos": 23037927252480.0, "grad_norm": 1.873272202675458, "language_loss": 0.75109261, "learning_rate": 8.964607415992338e-07, "loss": 0.77207673, "num_input_tokens_seen": 249679335, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.3984375, "step": 11567, "time_per_iteration": 2.3713159561157227 }, { "auxiliary_loss_clip": 0.01051096, "auxiliary_loss_mlp": 0.0103865, "balance_loss_clip": 1.01485598, "balance_loss_mlp": 1.0155617, "epoch": 0.6955057868630693, "flos": 23917284466560.0, "grad_norm": 1.2990935562318244, "language_loss": 0.77224946, "learning_rate": 8.961359528185313e-07, "loss": 0.79314697, "num_input_tokens_seen": 249701805, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35546875, "step": 11568, "time_per_iteration": 3.7449910640716553 }, { "auxiliary_loss_clip": 0.01054957, "auxiliary_loss_mlp": 0.01037528, "balance_loss_clip": 1.01490188, "balance_loss_mlp": 1.01867414, "epoch": 0.6955659101157372, "flos": 22593727624320.0, "grad_norm": 1.7162374412704953, "language_loss": 0.73217404, "learning_rate": 8.958112058964649e-07, "loss": 0.75309891, "num_input_tokens_seen": 249720550, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36328125, "step": 11569, "time_per_iteration": 3.808018445968628 }, { "auxiliary_loss_clip": 0.01055036, "auxiliary_loss_mlp": 0.01039289, "balance_loss_clip": 1.01502991, "balance_loss_mlp": 1.01773596, "epoch": 0.6956260333684052, "flos": 24571347477120.0, "grad_norm": 1.5833263591246132, "language_loss": 0.77963126, "learning_rate": 8.954865008453471e-07, "loss": 0.80057454, "num_input_tokens_seen": 249740325, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37304688, "step": 11570, "time_per_iteration": 2.387924909591675 }, { "auxiliary_loss_clip": 0.01052277, "auxiliary_loss_mlp": 0.01038352, "balance_loss_clip": 1.01366329, "balance_loss_mlp": 1.01566505, "epoch": 0.6956861566210732, "flos": 25844944296960.0, "grad_norm": 1.8163462212446806, "language_loss": 0.75876367, "learning_rate": 8.95161837677493e-07, "loss": 0.77966994, "num_input_tokens_seen": 249760570, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3671875, "step": 11571, "time_per_iteration": 2.4393911361694336 }, { "auxiliary_loss_clip": 0.0104984, "auxiliary_loss_mlp": 0.01035159, "balance_loss_clip": 1.01356995, "balance_loss_mlp": 1.01535988, "epoch": 0.6957462798737412, "flos": 15300580740480.0, "grad_norm": 2.241990276879471, "language_loss": 0.75529879, "learning_rate": 8.948372164052118e-07, "loss": 0.77614874, "num_input_tokens_seen": 249778290, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.34375, "step": 11572, "time_per_iteration": 2.33640718460083 }, { "auxiliary_loss_clip": 0.01052861, "auxiliary_loss_mlp": 0.01032869, "balance_loss_clip": 1.01002812, "balance_loss_mlp": 1.0160948, "epoch": 0.6958064031264092, "flos": 36245360851200.0, "grad_norm": 1.855673932407113, "language_loss": 0.71470255, "learning_rate": 8.94512637040814e-07, "loss": 0.73555982, "num_input_tokens_seen": 249800925, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3671875, "step": 11573, "time_per_iteration": 2.511404275894165 }, { "auxiliary_loss_clip": 0.01055492, "auxiliary_loss_mlp": 0.01041419, "balance_loss_clip": 1.01580119, "balance_loss_mlp": 1.0175581, "epoch": 0.6958665263790771, "flos": 19207710725760.0, "grad_norm": 1.7300041225976708, "language_loss": 0.7612859, "learning_rate": 8.941880995966095e-07, "loss": 0.78225505, "num_input_tokens_seen": 249820500, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37890625, "step": 11574, "time_per_iteration": 2.366525888442993 }, { "auxiliary_loss_clip": 0.01053295, "auxiliary_loss_mlp": 0.01035171, "balance_loss_clip": 1.01281953, "balance_loss_mlp": 1.01668727, "epoch": 0.6959266496317451, "flos": 21794844827520.0, "grad_norm": 1.6027373067451658, "language_loss": 0.75645202, "learning_rate": 8.938636040849014e-07, "loss": 0.77733672, "num_input_tokens_seen": 249839845, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.3671875, "step": 11575, "time_per_iteration": 2.3972008228302 }, { "auxiliary_loss_clip": 0.01052382, "auxiliary_loss_mlp": 0.01037984, "balance_loss_clip": 1.0131166, "balance_loss_mlp": 1.01559424, "epoch": 0.695986772884413, "flos": 20557208574720.0, "grad_norm": 1.91399863718846, "language_loss": 0.80278832, "learning_rate": 8.935391505179966e-07, "loss": 0.82369196, "num_input_tokens_seen": 249857400, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3671875, "step": 11576, "time_per_iteration": 2.3558080196380615 }, { "auxiliary_loss_clip": 0.0105455, "auxiliary_loss_mlp": 0.0103837, "balance_loss_clip": 1.01297808, "balance_loss_mlp": 1.01688349, "epoch": 0.696046896137081, "flos": 14935424163840.0, "grad_norm": 2.4174822470824457, "language_loss": 0.5787378, "learning_rate": 8.932147389081985e-07, "loss": 0.59966701, "num_input_tokens_seen": 249871645, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37695312, "step": 11577, "time_per_iteration": 2.323350667953491 }, { "auxiliary_loss_clip": 0.01050072, "auxiliary_loss_mlp": 0.01033309, "balance_loss_clip": 1.01324594, "balance_loss_mlp": 1.01555908, "epoch": 0.696107019389749, "flos": 30738824438400.0, "grad_norm": 1.5175528897674602, "language_loss": 0.77425075, "learning_rate": 8.928903692678081e-07, "loss": 0.79508454, "num_input_tokens_seen": 249894215, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.34570312, "step": 11578, "time_per_iteration": 2.464705467224121 }, { "auxiliary_loss_clip": 0.01053925, "auxiliary_loss_mlp": 0.01036804, "balance_loss_clip": 1.01411891, "balance_loss_mlp": 1.01746643, "epoch": 0.696167142642417, "flos": 20775695063040.0, "grad_norm": 1.8427457013325552, "language_loss": 0.80399108, "learning_rate": 8.925660416091254e-07, "loss": 0.82489836, "num_input_tokens_seen": 249912850, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.36523438, "step": 11579, "time_per_iteration": 2.3513641357421875 }, { "auxiliary_loss_clip": 0.01051056, "auxiliary_loss_mlp": 0.01036398, "balance_loss_clip": 1.012568, "balance_loss_mlp": 1.01513803, "epoch": 0.6962272658950849, "flos": 22564051102080.0, "grad_norm": 1.8541760369824505, "language_loss": 0.73697233, "learning_rate": 8.922417559444502e-07, "loss": 0.75784695, "num_input_tokens_seen": 249932650, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.359375, "step": 11580, "time_per_iteration": 3.825021743774414 }, { "auxiliary_loss_clip": 0.01052627, "auxiliary_loss_mlp": 0.01034561, "balance_loss_clip": 1.01090956, "balance_loss_mlp": 1.01609635, "epoch": 0.6962873891477529, "flos": 22199069082240.0, "grad_norm": 1.8628750519224015, "language_loss": 0.6714623, "learning_rate": 8.919175122860787e-07, "loss": 0.69233412, "num_input_tokens_seen": 249951205, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36523438, "step": 11581, "time_per_iteration": 2.3771555423736572 }, { "auxiliary_loss_clip": 0.01053543, "auxiliary_loss_mlp": 0.01033858, "balance_loss_clip": 1.01174498, "balance_loss_mlp": 1.01704204, "epoch": 0.6963475124004208, "flos": 12489025219200.0, "grad_norm": 2.4860546287487995, "language_loss": 0.77812243, "learning_rate": 8.915933106463056e-07, "loss": 0.79899639, "num_input_tokens_seen": 249967045, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.36523438, "step": 11582, "time_per_iteration": 2.3225293159484863 }, { "auxiliary_loss_clip": 0.01051644, "auxiliary_loss_mlp": 0.01035377, "balance_loss_clip": 1.01276338, "balance_loss_mlp": 1.01536644, "epoch": 0.6964076356530888, "flos": 17164139581440.0, "grad_norm": 2.7401986135400924, "language_loss": 0.70797729, "learning_rate": 8.91269151037425e-07, "loss": 0.7288475, "num_input_tokens_seen": 249984565, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.36328125, "step": 11583, "time_per_iteration": 2.3888485431671143 }, { "auxiliary_loss_clip": 0.01054069, "auxiliary_loss_mlp": 0.01038452, "balance_loss_clip": 1.01488471, "balance_loss_mlp": 1.01799941, "epoch": 0.6964677589057569, "flos": 19936313222400.0, "grad_norm": 2.108932441149312, "language_loss": 0.83433253, "learning_rate": 8.909450334717301e-07, "loss": 0.85525775, "num_input_tokens_seen": 250004235, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36132812, "step": 11584, "time_per_iteration": 2.3491950035095215 }, { "auxiliary_loss_clip": 0.0105487, "auxiliary_loss_mlp": 0.01037191, "balance_loss_clip": 1.01147723, "balance_loss_mlp": 1.01739788, "epoch": 0.6965278821584248, "flos": 22782956526720.0, "grad_norm": 2.323242599946581, "language_loss": 0.81395125, "learning_rate": 8.906209579615107e-07, "loss": 0.83487189, "num_input_tokens_seen": 250017645, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.375, "step": 11585, "time_per_iteration": 2.38248872756958 }, { "auxiliary_loss_clip": 0.01050525, "auxiliary_loss_mlp": 0.01035191, "balance_loss_clip": 1.01324487, "balance_loss_mlp": 1.01602852, "epoch": 0.6965880054110928, "flos": 20046533984640.0, "grad_norm": 1.6123779950085897, "language_loss": 0.79137683, "learning_rate": 8.90296924519055e-07, "loss": 0.81223398, "num_input_tokens_seen": 250037640, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34570312, "step": 11586, "time_per_iteration": 2.376453161239624 }, { "auxiliary_loss_clip": 0.0104794, "auxiliary_loss_mlp": 0.01033942, "balance_loss_clip": 1.01440382, "balance_loss_mlp": 1.01522136, "epoch": 0.6966481286637607, "flos": 21907160271360.0, "grad_norm": 1.5638269576469919, "language_loss": 0.79637337, "learning_rate": 8.899729331566519e-07, "loss": 0.8171922, "num_input_tokens_seen": 250056490, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.328125, "step": 11587, "time_per_iteration": 2.3909571170806885 }, { "auxiliary_loss_clip": 0.01050835, "auxiliary_loss_mlp": 0.01036586, "balance_loss_clip": 1.01405525, "balance_loss_mlp": 1.01640391, "epoch": 0.6967082519164287, "flos": 15632255456640.0, "grad_norm": 1.9309067872880417, "language_loss": 0.74090803, "learning_rate": 8.896489838865857e-07, "loss": 0.76178229, "num_input_tokens_seen": 250074285, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34375, "step": 11588, "time_per_iteration": 2.350559949874878 }, { "auxiliary_loss_clip": 0.01050887, "auxiliary_loss_mlp": 0.0103233, "balance_loss_clip": 1.01173091, "balance_loss_mlp": 1.01564848, "epoch": 0.6967683751690966, "flos": 24023455511040.0, "grad_norm": 1.7213783218890435, "language_loss": 0.76645297, "learning_rate": 8.893250767211413e-07, "loss": 0.78728515, "num_input_tokens_seen": 250093350, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.3515625, "step": 11589, "time_per_iteration": 2.3964054584503174 }, { "auxiliary_loss_clip": 0.01052819, "auxiliary_loss_mlp": 0.01034937, "balance_loss_clip": 1.01202524, "balance_loss_mlp": 1.01623726, "epoch": 0.6968284984217646, "flos": 31023506597760.0, "grad_norm": 2.0513123346592765, "language_loss": 0.6500628, "learning_rate": 8.890012116726012e-07, "loss": 0.67094034, "num_input_tokens_seen": 250114170, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3671875, "step": 11590, "time_per_iteration": 2.4435911178588867 }, { "auxiliary_loss_clip": 0.01008463, "auxiliary_loss_mlp": 0.01005014, "balance_loss_clip": 1.00243902, "balance_loss_mlp": 1.00149894, "epoch": 0.6968886216744326, "flos": 67619673590400.0, "grad_norm": 0.7559601346058112, "language_loss": 0.61363077, "learning_rate": 8.88677388753248e-07, "loss": 0.63376546, "num_input_tokens_seen": 250178250, "router_z_loss_clip": 0.02575684, "router_z_loss_mlp": 0.06933594, "step": 11591, "time_per_iteration": 3.0924127101898193 }, { "auxiliary_loss_clip": 0.01053749, "auxiliary_loss_mlp": 0.0103689, "balance_loss_clip": 1.01321507, "balance_loss_mlp": 1.0168519, "epoch": 0.6969487449271006, "flos": 24862523149440.0, "grad_norm": 1.6961223723905692, "language_loss": 0.70548522, "learning_rate": 8.883536079753582e-07, "loss": 0.72639167, "num_input_tokens_seen": 250198420, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36914062, "step": 11592, "time_per_iteration": 2.385193347930908 }, { "auxiliary_loss_clip": 0.01050862, "auxiliary_loss_mlp": 0.01033944, "balance_loss_clip": 1.01166344, "balance_loss_mlp": 1.01627302, "epoch": 0.6970088681797685, "flos": 28766580935040.0, "grad_norm": 1.6868821010494794, "language_loss": 0.63442284, "learning_rate": 8.880298693512109e-07, "loss": 0.65527093, "num_input_tokens_seen": 250220650, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34570312, "step": 11593, "time_per_iteration": 2.459378719329834 }, { "auxiliary_loss_clip": 0.01049411, "auxiliary_loss_mlp": 0.01029235, "balance_loss_clip": 1.00873137, "balance_loss_mlp": 1.0161531, "epoch": 0.6970689914324365, "flos": 27307316171520.0, "grad_norm": 1.4571851987143956, "language_loss": 0.55676532, "learning_rate": 8.877061728930832e-07, "loss": 0.57755184, "num_input_tokens_seen": 250241750, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.33203125, "step": 11594, "time_per_iteration": 2.40832257270813 }, { "auxiliary_loss_clip": 0.01051507, "auxiliary_loss_mlp": 0.01034566, "balance_loss_clip": 1.01248837, "balance_loss_mlp": 1.01622808, "epoch": 0.6971291146851044, "flos": 19135231009920.0, "grad_norm": 2.4614175461899492, "language_loss": 0.78304631, "learning_rate": 8.87382518613248e-07, "loss": 0.80390704, "num_input_tokens_seen": 250259445, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35351562, "step": 11595, "time_per_iteration": 2.358025312423706 }, { "auxiliary_loss_clip": 0.0105458, "auxiliary_loss_mlp": 0.01038692, "balance_loss_clip": 1.01545811, "balance_loss_mlp": 1.01763475, "epoch": 0.6971892379377724, "flos": 14609649467520.0, "grad_norm": 2.2759913012179487, "language_loss": 0.72565949, "learning_rate": 8.870589065239793e-07, "loss": 0.74659228, "num_input_tokens_seen": 250275640, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36914062, "step": 11596, "time_per_iteration": 2.3428988456726074 }, { "auxiliary_loss_clip": 0.01053255, "auxiliary_loss_mlp": 0.01044909, "balance_loss_clip": 1.02011335, "balance_loss_mlp": 1.01709199, "epoch": 0.6972493611904405, "flos": 22306427112960.0, "grad_norm": 1.969685580749523, "language_loss": 0.76872182, "learning_rate": 8.867353366375492e-07, "loss": 0.78970349, "num_input_tokens_seen": 250296435, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36132812, "step": 11597, "time_per_iteration": 2.37727427482605 }, { "auxiliary_loss_clip": 0.01050605, "auxiliary_loss_mlp": 0.01035568, "balance_loss_clip": 1.01481342, "balance_loss_mlp": 1.0155859, "epoch": 0.6973094844431084, "flos": 17419424509440.0, "grad_norm": 1.8621659776966977, "language_loss": 0.76309967, "learning_rate": 8.864118089662267e-07, "loss": 0.78396147, "num_input_tokens_seen": 250314035, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.34960938, "step": 11598, "time_per_iteration": 2.3557000160217285 }, { "auxiliary_loss_clip": 0.01053884, "auxiliary_loss_mlp": 0.01040364, "balance_loss_clip": 1.01581919, "balance_loss_mlp": 1.01642501, "epoch": 0.6973696076957764, "flos": 27234138228480.0, "grad_norm": 1.778545507796275, "language_loss": 0.90925097, "learning_rate": 8.860883235222791e-07, "loss": 0.93019348, "num_input_tokens_seen": 250332995, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.375, "step": 11599, "time_per_iteration": 2.396793842315674 }, { "auxiliary_loss_clip": 0.0105599, "auxiliary_loss_mlp": 0.01046434, "balance_loss_clip": 1.01979101, "balance_loss_mlp": 1.01768637, "epoch": 0.6974297309484443, "flos": 22016997008640.0, "grad_norm": 1.830212777454065, "language_loss": 0.7096566, "learning_rate": 8.85764880317974e-07, "loss": 0.73068082, "num_input_tokens_seen": 250352120, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.3828125, "step": 11600, "time_per_iteration": 2.3984360694885254 }, { "auxiliary_loss_clip": 0.01052319, "auxiliary_loss_mlp": 0.01038023, "balance_loss_clip": 1.01518297, "balance_loss_mlp": 1.01559258, "epoch": 0.6974898542011123, "flos": 28365184500480.0, "grad_norm": 1.68095654370787, "language_loss": 0.77759576, "learning_rate": 8.854414793655771e-07, "loss": 0.79849923, "num_input_tokens_seen": 250371705, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3671875, "step": 11601, "time_per_iteration": 2.4187092781066895 }, { "auxiliary_loss_clip": 0.01049317, "auxiliary_loss_mlp": 0.0103747, "balance_loss_clip": 1.01714492, "balance_loss_mlp": 1.01526141, "epoch": 0.6975499774537802, "flos": 15231138312960.0, "grad_norm": 1.695083918114348, "language_loss": 0.73400986, "learning_rate": 8.851181206773508e-07, "loss": 0.75487775, "num_input_tokens_seen": 250390485, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.33984375, "step": 11602, "time_per_iteration": 2.3627638816833496 }, { "auxiliary_loss_clip": 0.0105175, "auxiliary_loss_mlp": 0.01035019, "balance_loss_clip": 1.01271462, "balance_loss_mlp": 1.0159936, "epoch": 0.6976101007064482, "flos": 22156510268160.0, "grad_norm": 2.153134496457893, "language_loss": 0.77250421, "learning_rate": 8.847948042655567e-07, "loss": 0.79337192, "num_input_tokens_seen": 250407020, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35742188, "step": 11603, "time_per_iteration": 2.3522982597351074 }, { "auxiliary_loss_clip": 0.01051509, "auxiliary_loss_mlp": 0.01039436, "balance_loss_clip": 1.01571321, "balance_loss_mlp": 1.01517427, "epoch": 0.6976702239591162, "flos": 22272421582080.0, "grad_norm": 1.7172762318889199, "language_loss": 0.62932587, "learning_rate": 8.844715301424557e-07, "loss": 0.65023535, "num_input_tokens_seen": 250425880, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.36328125, "step": 11604, "time_per_iteration": 3.639749765396118 }, { "auxiliary_loss_clip": 0.01052163, "auxiliary_loss_mlp": 0.01040448, "balance_loss_clip": 1.01583099, "balance_loss_mlp": 1.01596582, "epoch": 0.6977303472117842, "flos": 25847423003520.0, "grad_norm": 3.4041046785085847, "language_loss": 0.81933475, "learning_rate": 8.841482983203057e-07, "loss": 0.8402608, "num_input_tokens_seen": 250442925, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36328125, "step": 11605, "time_per_iteration": 2.420377016067505 }, { "auxiliary_loss_clip": 0.01051642, "auxiliary_loss_mlp": 0.01035884, "balance_loss_clip": 1.01352024, "balance_loss_mlp": 1.01583934, "epoch": 0.6977904704644521, "flos": 20958535186560.0, "grad_norm": 1.6572227326874687, "language_loss": 0.71576542, "learning_rate": 8.838251088113638e-07, "loss": 0.73664069, "num_input_tokens_seen": 250461220, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35742188, "step": 11606, "time_per_iteration": 2.3563010692596436 }, { "auxiliary_loss_clip": 0.01053801, "auxiliary_loss_mlp": 0.01037086, "balance_loss_clip": 1.01348245, "balance_loss_mlp": 1.01682091, "epoch": 0.6978505937171201, "flos": 22053935093760.0, "grad_norm": 1.9572113513859628, "language_loss": 0.83534712, "learning_rate": 8.835019616278856e-07, "loss": 0.85625595, "num_input_tokens_seen": 250480975, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36914062, "step": 11607, "time_per_iteration": 2.38275146484375 }, { "auxiliary_loss_clip": 0.01054739, "auxiliary_loss_mlp": 0.01042091, "balance_loss_clip": 1.01821351, "balance_loss_mlp": 1.01742339, "epoch": 0.697910716969788, "flos": 20042798469120.0, "grad_norm": 1.8769507605945905, "language_loss": 0.80212808, "learning_rate": 8.831788567821265e-07, "loss": 0.82309639, "num_input_tokens_seen": 250497980, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.375, "step": 11608, "time_per_iteration": 3.74326753616333 }, { "auxiliary_loss_clip": 0.01051946, "auxiliary_loss_mlp": 0.01036398, "balance_loss_clip": 1.01359367, "balance_loss_mlp": 1.01650035, "epoch": 0.697970840222456, "flos": 15887330916480.0, "grad_norm": 1.966320485106676, "language_loss": 0.91428769, "learning_rate": 8.828557942863357e-07, "loss": 0.93517113, "num_input_tokens_seen": 250511910, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35546875, "step": 11609, "time_per_iteration": 3.709801435470581 }, { "auxiliary_loss_clip": 0.01052633, "auxiliary_loss_mlp": 0.0103697, "balance_loss_clip": 1.01368809, "balance_loss_mlp": 1.01542008, "epoch": 0.698030963475124, "flos": 21214553253120.0, "grad_norm": 1.5866161236653757, "language_loss": 0.65249598, "learning_rate": 8.82532774152765e-07, "loss": 0.67339194, "num_input_tokens_seen": 250531090, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37109375, "step": 11610, "time_per_iteration": 2.360164165496826 }, { "auxiliary_loss_clip": 0.01051783, "auxiliary_loss_mlp": 0.01037394, "balance_loss_clip": 1.01555538, "balance_loss_mlp": 1.01644874, "epoch": 0.698091086727792, "flos": 33758497774080.0, "grad_norm": 2.0159556774404424, "language_loss": 0.85570145, "learning_rate": 8.822097963936643e-07, "loss": 0.87659323, "num_input_tokens_seen": 250551565, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.35351562, "step": 11611, "time_per_iteration": 2.4812636375427246 }, { "auxiliary_loss_clip": 0.01052301, "auxiliary_loss_mlp": 0.01038114, "balance_loss_clip": 1.01514244, "balance_loss_mlp": 1.01566696, "epoch": 0.69815120998046, "flos": 15886946891520.0, "grad_norm": 1.940930807163004, "language_loss": 0.72228658, "learning_rate": 8.818868610212793e-07, "loss": 0.74319077, "num_input_tokens_seen": 250569625, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.3671875, "step": 11612, "time_per_iteration": 2.3347463607788086 }, { "auxiliary_loss_clip": 0.01050338, "auxiliary_loss_mlp": 0.01040248, "balance_loss_clip": 1.01783717, "balance_loss_mlp": 1.01575708, "epoch": 0.6982113332331279, "flos": 18946211575680.0, "grad_norm": 1.4985155717040275, "language_loss": 0.82066548, "learning_rate": 8.815639680478573e-07, "loss": 0.84157133, "num_input_tokens_seen": 250586960, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34570312, "step": 11613, "time_per_iteration": 2.4345881938934326 }, { "auxiliary_loss_clip": 0.01050306, "auxiliary_loss_mlp": 0.01031128, "balance_loss_clip": 1.01006341, "balance_loss_mlp": 1.01645041, "epoch": 0.6982714564857959, "flos": 24388437530880.0, "grad_norm": 2.026493467018975, "language_loss": 0.76821673, "learning_rate": 8.812411174856411e-07, "loss": 0.78903109, "num_input_tokens_seen": 250605080, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.33789062, "step": 11614, "time_per_iteration": 2.4006752967834473 }, { "auxiliary_loss_clip": 0.01054306, "auxiliary_loss_mlp": 0.01033963, "balance_loss_clip": 1.01292253, "balance_loss_mlp": 1.01798868, "epoch": 0.6983315797384638, "flos": 20082704019840.0, "grad_norm": 2.7625709647120655, "language_loss": 0.79290974, "learning_rate": 8.809183093468746e-07, "loss": 0.81379247, "num_input_tokens_seen": 250623965, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.36328125, "step": 11615, "time_per_iteration": 2.419020891189575 }, { "auxiliary_loss_clip": 0.01049267, "auxiliary_loss_mlp": 0.01038956, "balance_loss_clip": 1.01773679, "balance_loss_mlp": 1.01533461, "epoch": 0.6983917029911318, "flos": 13511701031040.0, "grad_norm": 2.4175594533015867, "language_loss": 0.73687935, "learning_rate": 8.80595543643797e-07, "loss": 0.7577616, "num_input_tokens_seen": 250640675, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.33984375, "step": 11616, "time_per_iteration": 2.3636999130249023 }, { "auxiliary_loss_clip": 0.01051111, "auxiliary_loss_mlp": 0.0103951, "balance_loss_clip": 1.01773047, "balance_loss_mlp": 1.01655447, "epoch": 0.6984518262437998, "flos": 22017311210880.0, "grad_norm": 1.7883712298753913, "language_loss": 0.85105097, "learning_rate": 8.802728203886487e-07, "loss": 0.87195718, "num_input_tokens_seen": 250660295, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.34570312, "step": 11617, "time_per_iteration": 2.4609696865081787 }, { "auxiliary_loss_clip": 0.0105371, "auxiliary_loss_mlp": 0.01042002, "balance_loss_clip": 1.01934075, "balance_loss_mlp": 1.01708412, "epoch": 0.6985119494964678, "flos": 18769620585600.0, "grad_norm": 2.213873187580799, "language_loss": 0.60273886, "learning_rate": 8.799501395936682e-07, "loss": 0.62369597, "num_input_tokens_seen": 250678155, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3671875, "step": 11618, "time_per_iteration": 2.4006450176239014 }, { "auxiliary_loss_clip": 0.0105148, "auxiliary_loss_mlp": 0.01039177, "balance_loss_clip": 1.01761174, "balance_loss_mlp": 1.01616359, "epoch": 0.6985720727491357, "flos": 22381734648960.0, "grad_norm": 1.7803714561394302, "language_loss": 0.83944619, "learning_rate": 8.796275012710903e-07, "loss": 0.86035275, "num_input_tokens_seen": 250697230, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.35351562, "step": 11619, "time_per_iteration": 3.872917890548706 }, { "auxiliary_loss_clip": 0.01047816, "auxiliary_loss_mlp": 0.01034777, "balance_loss_clip": 1.01534605, "balance_loss_mlp": 1.01447678, "epoch": 0.6986321960018037, "flos": 39566299242240.0, "grad_norm": 1.937177192152036, "language_loss": 0.68072248, "learning_rate": 8.793049054331494e-07, "loss": 0.7015484, "num_input_tokens_seen": 250719865, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.33398438, "step": 11620, "time_per_iteration": 2.56880784034729 }, { "auxiliary_loss_clip": 0.01053126, "auxiliary_loss_mlp": 0.01037669, "balance_loss_clip": 1.01343417, "balance_loss_mlp": 1.01608884, "epoch": 0.6986923192544716, "flos": 17966757893760.0, "grad_norm": 2.080679860087928, "language_loss": 0.73882598, "learning_rate": 8.789823520920794e-07, "loss": 0.75973392, "num_input_tokens_seen": 250736565, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37109375, "step": 11621, "time_per_iteration": 2.364098310470581 }, { "auxiliary_loss_clip": 0.01053412, "auxiliary_loss_mlp": 0.01041538, "balance_loss_clip": 1.01905489, "balance_loss_mlp": 1.01641786, "epoch": 0.6987524425071396, "flos": 25593115593600.0, "grad_norm": 1.6028099462176315, "language_loss": 0.69929129, "learning_rate": 8.7865984126011e-07, "loss": 0.72024071, "num_input_tokens_seen": 250757235, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.37109375, "step": 11622, "time_per_iteration": 2.44592547416687 }, { "auxiliary_loss_clip": 0.01048771, "auxiliary_loss_mlp": 0.0103551, "balance_loss_clip": 1.01373076, "balance_loss_mlp": 1.01501608, "epoch": 0.6988125657598077, "flos": 17529121601280.0, "grad_norm": 1.6909221715128162, "language_loss": 0.63526857, "learning_rate": 8.783373729494721e-07, "loss": 0.65611136, "num_input_tokens_seen": 250775585, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.33789062, "step": 11623, "time_per_iteration": 2.3564870357513428 }, { "auxiliary_loss_clip": 0.01052982, "auxiliary_loss_mlp": 0.01036019, "balance_loss_clip": 1.0132854, "balance_loss_mlp": 1.01539755, "epoch": 0.6988726890124756, "flos": 39164169669120.0, "grad_norm": 1.7662552665025708, "language_loss": 0.61916733, "learning_rate": 8.780149471723932e-07, "loss": 0.64005733, "num_input_tokens_seen": 250795725, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.37695312, "step": 11624, "time_per_iteration": 2.5177547931671143 }, { "auxiliary_loss_clip": 0.01053799, "auxiliary_loss_mlp": 0.01047604, "balance_loss_clip": 1.02116299, "balance_loss_mlp": 1.0165751, "epoch": 0.6989328122651436, "flos": 20192436023040.0, "grad_norm": 1.9220385006714453, "language_loss": 0.7941035, "learning_rate": 8.776925639411017e-07, "loss": 0.81511748, "num_input_tokens_seen": 250814555, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.37109375, "step": 11625, "time_per_iteration": 2.354222059249878 }, { "auxiliary_loss_clip": 0.01049788, "auxiliary_loss_mlp": 0.01031596, "balance_loss_clip": 1.01136589, "balance_loss_mlp": 1.01543176, "epoch": 0.6989929355178115, "flos": 21833807771520.0, "grad_norm": 1.8709478912433393, "language_loss": 0.67617595, "learning_rate": 8.773702232678188e-07, "loss": 0.69698983, "num_input_tokens_seen": 250833105, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.34375, "step": 11626, "time_per_iteration": 2.378319501876831 }, { "auxiliary_loss_clip": 0.01053093, "auxiliary_loss_mlp": 0.01041967, "balance_loss_clip": 1.01776767, "balance_loss_mlp": 1.0168283, "epoch": 0.6990530587704795, "flos": 26321683178880.0, "grad_norm": 2.168284421833644, "language_loss": 0.71998459, "learning_rate": 8.770479251647697e-07, "loss": 0.74093521, "num_input_tokens_seen": 250852570, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36328125, "step": 11627, "time_per_iteration": 2.4146807193756104 }, { "auxiliary_loss_clip": 0.01051709, "auxiliary_loss_mlp": 0.01033265, "balance_loss_clip": 1.01330924, "balance_loss_mlp": 1.01745558, "epoch": 0.6991131820231474, "flos": 19827942762240.0, "grad_norm": 2.1061698962593645, "language_loss": 0.6348002, "learning_rate": 8.767256696441768e-07, "loss": 0.6556499, "num_input_tokens_seen": 250870500, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.34179688, "step": 11628, "time_per_iteration": 2.377227544784546 }, { "auxiliary_loss_clip": 0.0105289, "auxiliary_loss_mlp": 0.01039354, "balance_loss_clip": 1.01672769, "balance_loss_mlp": 1.01664519, "epoch": 0.6991733052758154, "flos": 33983407952640.0, "grad_norm": 1.9618913309861241, "language_loss": 0.69874573, "learning_rate": 8.764034567182581e-07, "loss": 0.71966815, "num_input_tokens_seen": 250892745, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36328125, "step": 11629, "time_per_iteration": 2.5003819465637207 }, { "auxiliary_loss_clip": 0.0105226, "auxiliary_loss_mlp": 0.01037555, "balance_loss_clip": 1.01522732, "balance_loss_mlp": 1.01654291, "epoch": 0.6992334285284834, "flos": 15632220545280.0, "grad_norm": 1.6736693820148398, "language_loss": 0.73622537, "learning_rate": 8.760812863992337e-07, "loss": 0.75712347, "num_input_tokens_seen": 250910225, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35546875, "step": 11630, "time_per_iteration": 2.351524591445923 }, { "auxiliary_loss_clip": 0.01050674, "auxiliary_loss_mlp": 0.01037689, "balance_loss_clip": 1.01564717, "balance_loss_mlp": 1.01609206, "epoch": 0.6992935517811514, "flos": 21725192931840.0, "grad_norm": 1.9047625800670134, "language_loss": 0.75181299, "learning_rate": 8.757591586993196e-07, "loss": 0.77269661, "num_input_tokens_seen": 250929715, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34570312, "step": 11631, "time_per_iteration": 2.3895297050476074 }, { "auxiliary_loss_clip": 0.01055257, "auxiliary_loss_mlp": 0.01036984, "balance_loss_clip": 1.01209331, "balance_loss_mlp": 1.0181793, "epoch": 0.6993536750338193, "flos": 20114370489600.0, "grad_norm": 2.0447692505600825, "language_loss": 0.90306181, "learning_rate": 8.7543707363073e-07, "loss": 0.92398417, "num_input_tokens_seen": 250944230, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.37109375, "step": 11632, "time_per_iteration": 2.3458268642425537 }, { "auxiliary_loss_clip": 0.01053149, "auxiliary_loss_mlp": 0.01039603, "balance_loss_clip": 1.01760924, "balance_loss_mlp": 1.01671338, "epoch": 0.6994137982864873, "flos": 22009665623040.0, "grad_norm": 1.605118778338367, "language_loss": 0.80616403, "learning_rate": 8.751150312056792e-07, "loss": 0.82709157, "num_input_tokens_seen": 250961865, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.36328125, "step": 11633, "time_per_iteration": 2.412066698074341 }, { "auxiliary_loss_clip": 0.01054897, "auxiliary_loss_mlp": 0.01040186, "balance_loss_clip": 1.01486635, "balance_loss_mlp": 1.01705873, "epoch": 0.6994739215391552, "flos": 25517877880320.0, "grad_norm": 1.8589679345341321, "language_loss": 0.68695641, "learning_rate": 8.747930314363794e-07, "loss": 0.70790726, "num_input_tokens_seen": 250982025, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37890625, "step": 11634, "time_per_iteration": 2.4107253551483154 }, { "auxiliary_loss_clip": 0.01009069, "auxiliary_loss_mlp": 0.01002702, "balance_loss_clip": 1.00018644, "balance_loss_mlp": 1.00208354, "epoch": 0.6995340447918232, "flos": 59125095400320.0, "grad_norm": 0.6864107652113981, "language_loss": 0.531546, "learning_rate": 8.744710743350412e-07, "loss": 0.55166364, "num_input_tokens_seen": 251046900, "router_z_loss_clip": 0.02514648, "router_z_loss_mlp": 0.06982422, "step": 11635, "time_per_iteration": 3.1277272701263428 }, { "auxiliary_loss_clip": 0.01052414, "auxiliary_loss_mlp": 0.01036367, "balance_loss_clip": 1.01294255, "balance_loss_mlp": 1.01615739, "epoch": 0.6995941680444913, "flos": 17966862627840.0, "grad_norm": 1.8063572211320686, "language_loss": 0.8227706, "learning_rate": 8.741491599138726e-07, "loss": 0.84365839, "num_input_tokens_seen": 251065050, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 11636, "time_per_iteration": 2.3450889587402344 }, { "auxiliary_loss_clip": 0.01053395, "auxiliary_loss_mlp": 0.01041973, "balance_loss_clip": 1.01772594, "balance_loss_mlp": 1.01597524, "epoch": 0.6996542912971592, "flos": 21979046494080.0, "grad_norm": 3.6704368814587904, "language_loss": 0.84197581, "learning_rate": 8.738272881850801e-07, "loss": 0.86292946, "num_input_tokens_seen": 251083355, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.375, "step": 11637, "time_per_iteration": 2.3990094661712646 }, { "auxiliary_loss_clip": 0.01052056, "auxiliary_loss_mlp": 0.0103247, "balance_loss_clip": 1.00945103, "balance_loss_mlp": 1.01601028, "epoch": 0.6997144145498272, "flos": 11685534122880.0, "grad_norm": 2.3280504823733676, "language_loss": 0.68976474, "learning_rate": 8.735054591608704e-07, "loss": 0.71061003, "num_input_tokens_seen": 251096420, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.359375, "step": 11638, "time_per_iteration": 2.365551471710205 }, { "auxiliary_loss_clip": 0.01055083, "auxiliary_loss_mlp": 0.01045323, "balance_loss_clip": 1.0182029, "balance_loss_mlp": 1.01699924, "epoch": 0.6997745378024951, "flos": 29605858041600.0, "grad_norm": 11.855144041182964, "language_loss": 0.79052615, "learning_rate": 8.731836728534459e-07, "loss": 0.81153023, "num_input_tokens_seen": 251115410, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.38085938, "step": 11639, "time_per_iteration": 2.4729456901550293 }, { "auxiliary_loss_clip": 0.01053782, "auxiliary_loss_mlp": 0.01042639, "balance_loss_clip": 1.01886892, "balance_loss_mlp": 1.01728201, "epoch": 0.6998346610551631, "flos": 20885566711680.0, "grad_norm": 2.214195161931057, "language_loss": 0.84653401, "learning_rate": 8.728619292750093e-07, "loss": 0.86749828, "num_input_tokens_seen": 251133530, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36523438, "step": 11640, "time_per_iteration": 2.381373882293701 }, { "auxiliary_loss_clip": 0.01052229, "auxiliary_loss_mlp": 0.01035805, "balance_loss_clip": 1.01391816, "balance_loss_mlp": 1.01579547, "epoch": 0.699894784307831, "flos": 27161798158080.0, "grad_norm": 2.303276877340575, "language_loss": 0.76479125, "learning_rate": 8.725402284377619e-07, "loss": 0.78567159, "num_input_tokens_seen": 251153985, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.36328125, "step": 11641, "time_per_iteration": 2.4226763248443604 }, { "auxiliary_loss_clip": 0.01052765, "auxiliary_loss_mlp": 0.01034225, "balance_loss_clip": 1.00957274, "balance_loss_mlp": 1.01650465, "epoch": 0.699954907560499, "flos": 20922574619520.0, "grad_norm": 1.899966531906291, "language_loss": 0.78706491, "learning_rate": 8.722185703539022e-07, "loss": 0.80793476, "num_input_tokens_seen": 251173225, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.36328125, "step": 11642, "time_per_iteration": 2.3795344829559326 }, { "auxiliary_loss_clip": 0.01056758, "auxiliary_loss_mlp": 0.01043162, "balance_loss_clip": 1.01508868, "balance_loss_mlp": 1.01702726, "epoch": 0.700015030813167, "flos": 28656534729600.0, "grad_norm": 4.47725169375736, "language_loss": 0.7628879, "learning_rate": 8.718969550356266e-07, "loss": 0.78388709, "num_input_tokens_seen": 251192485, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.3984375, "step": 11643, "time_per_iteration": 2.4311883449554443 }, { "auxiliary_loss_clip": 0.01053808, "auxiliary_loss_mlp": 0.01036132, "balance_loss_clip": 1.01172972, "balance_loss_mlp": 1.01658177, "epoch": 0.700075154065835, "flos": 29204007759360.0, "grad_norm": 2.9454503166943606, "language_loss": 0.61389166, "learning_rate": 8.715753824951315e-07, "loss": 0.63479114, "num_input_tokens_seen": 251214965, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37109375, "step": 11644, "time_per_iteration": 3.7346742153167725 }, { "auxiliary_loss_clip": 0.01052072, "auxiliary_loss_mlp": 0.01035121, "balance_loss_clip": 1.01154137, "balance_loss_mlp": 1.01673627, "epoch": 0.7001352773185029, "flos": 23111314663680.0, "grad_norm": 1.670992255123713, "language_loss": 0.82716888, "learning_rate": 8.712538527446119e-07, "loss": 0.84804082, "num_input_tokens_seen": 251234500, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3515625, "step": 11645, "time_per_iteration": 2.3787055015563965 }, { "auxiliary_loss_clip": 0.01052196, "auxiliary_loss_mlp": 0.01042175, "balance_loss_clip": 1.01755857, "balance_loss_mlp": 1.01556015, "epoch": 0.7001954005711709, "flos": 21321841461120.0, "grad_norm": 1.8596787889723019, "language_loss": 0.68985921, "learning_rate": 8.709323657962584e-07, "loss": 0.71080291, "num_input_tokens_seen": 251254360, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3671875, "step": 11646, "time_per_iteration": 2.3904736042022705 }, { "auxiliary_loss_clip": 0.01052906, "auxiliary_loss_mlp": 0.01036257, "balance_loss_clip": 1.01404881, "balance_loss_mlp": 1.01672792, "epoch": 0.7002555238238388, "flos": 24534653771520.0, "grad_norm": 1.529638644803851, "language_loss": 0.72159827, "learning_rate": 8.706109216622635e-07, "loss": 0.74248993, "num_input_tokens_seen": 251274790, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.36328125, "step": 11647, "time_per_iteration": 3.8580915927886963 }, { "auxiliary_loss_clip": 0.01055729, "auxiliary_loss_mlp": 0.01039395, "balance_loss_clip": 1.01420641, "balance_loss_mlp": 1.01738393, "epoch": 0.7003156470765068, "flos": 39054996247680.0, "grad_norm": 1.7755579417219047, "language_loss": 0.72782332, "learning_rate": 8.702895203548155e-07, "loss": 0.74877453, "num_input_tokens_seen": 251296275, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3828125, "step": 11648, "time_per_iteration": 2.540794849395752 }, { "auxiliary_loss_clip": 0.01051891, "auxiliary_loss_mlp": 0.0103802, "balance_loss_clip": 1.01331961, "balance_loss_mlp": 1.01567996, "epoch": 0.7003757703291749, "flos": 28802820792960.0, "grad_norm": 1.515977300836846, "language_loss": 0.7855401, "learning_rate": 8.699681618861014e-07, "loss": 0.80643922, "num_input_tokens_seen": 251317375, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.36328125, "step": 11649, "time_per_iteration": 3.8794047832489014 }, { "auxiliary_loss_clip": 0.01053623, "auxiliary_loss_mlp": 0.01041781, "balance_loss_clip": 1.01799834, "balance_loss_mlp": 1.01681304, "epoch": 0.7004358935818428, "flos": 15953142562560.0, "grad_norm": 1.6478815874091366, "language_loss": 0.79224283, "learning_rate": 8.69646846268308e-07, "loss": 0.81319684, "num_input_tokens_seen": 251333570, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3671875, "step": 11650, "time_per_iteration": 2.3217427730560303 }, { "auxiliary_loss_clip": 0.01052474, "auxiliary_loss_mlp": 0.01037858, "balance_loss_clip": 1.01315749, "balance_loss_mlp": 1.01592875, "epoch": 0.7004960168345108, "flos": 20410957422720.0, "grad_norm": 2.129889873976562, "language_loss": 0.79411435, "learning_rate": 8.693255735136194e-07, "loss": 0.8150177, "num_input_tokens_seen": 251351070, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.36523438, "step": 11651, "time_per_iteration": 2.38931941986084 }, { "auxiliary_loss_clip": 0.0105552, "auxiliary_loss_mlp": 0.01036294, "balance_loss_clip": 1.01307178, "balance_loss_mlp": 1.01723444, "epoch": 0.7005561400871787, "flos": 17346595680000.0, "grad_norm": 1.7432764385854966, "language_loss": 0.7153815, "learning_rate": 8.690043436342198e-07, "loss": 0.73629969, "num_input_tokens_seen": 251370005, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3828125, "step": 11652, "time_per_iteration": 2.379960775375366 }, { "auxiliary_loss_clip": 0.01053514, "auxiliary_loss_mlp": 0.0104252, "balance_loss_clip": 1.01810563, "balance_loss_mlp": 1.01696134, "epoch": 0.7006162633398467, "flos": 25300927491840.0, "grad_norm": 1.6462672250919261, "language_loss": 0.75166285, "learning_rate": 8.686831566422874e-07, "loss": 0.77262318, "num_input_tokens_seen": 251391210, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36523438, "step": 11653, "time_per_iteration": 2.433234930038452 }, { "auxiliary_loss_clip": 0.01053826, "auxiliary_loss_mlp": 0.01039732, "balance_loss_clip": 1.01325548, "balance_loss_mlp": 1.01607072, "epoch": 0.7006763865925146, "flos": 20667918096000.0, "grad_norm": 1.9462396809942573, "language_loss": 0.72059357, "learning_rate": 8.68362012550003e-07, "loss": 0.74152923, "num_input_tokens_seen": 251411505, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.37890625, "step": 11654, "time_per_iteration": 2.3964052200317383 }, { "auxiliary_loss_clip": 0.01054431, "auxiliary_loss_mlp": 0.01039376, "balance_loss_clip": 1.01204085, "balance_loss_mlp": 1.01589751, "epoch": 0.7007365098451827, "flos": 20045451732480.0, "grad_norm": 2.785867635805633, "language_loss": 0.74970877, "learning_rate": 8.680409113695453e-07, "loss": 0.77064687, "num_input_tokens_seen": 251428975, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.38671875, "step": 11655, "time_per_iteration": 2.3460280895233154 }, { "auxiliary_loss_clip": 0.01058287, "auxiliary_loss_mlp": 0.01043544, "balance_loss_clip": 1.01572084, "balance_loss_mlp": 1.01820397, "epoch": 0.7007966330978506, "flos": 20776323467520.0, "grad_norm": 2.0464353274645455, "language_loss": 0.71654683, "learning_rate": 8.677198531130889e-07, "loss": 0.73756516, "num_input_tokens_seen": 251446940, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.40039062, "step": 11656, "time_per_iteration": 2.392951488494873 }, { "auxiliary_loss_clip": 0.01051346, "auxiliary_loss_mlp": 0.0103814, "balance_loss_clip": 1.01512074, "balance_loss_mlp": 1.01537013, "epoch": 0.7008567563505186, "flos": 29637035752320.0, "grad_norm": 1.5162099575520795, "language_loss": 0.78773332, "learning_rate": 8.673988377928092e-07, "loss": 0.8086282, "num_input_tokens_seen": 251466205, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.359375, "step": 11657, "time_per_iteration": 2.4203271865844727 }, { "auxiliary_loss_clip": 0.01056203, "auxiliary_loss_mlp": 0.01044444, "balance_loss_clip": 1.01794362, "balance_loss_mlp": 1.01703501, "epoch": 0.7009168796031865, "flos": 17091066372480.0, "grad_norm": 2.1362766900840127, "language_loss": 0.79599524, "learning_rate": 8.670778654208797e-07, "loss": 0.8170017, "num_input_tokens_seen": 251484820, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39257812, "step": 11658, "time_per_iteration": 2.403447389602661 }, { "auxiliary_loss_clip": 0.01051411, "auxiliary_loss_mlp": 0.01036931, "balance_loss_clip": 1.0128032, "balance_loss_mlp": 1.01608396, "epoch": 0.7009770028558545, "flos": 20447930419200.0, "grad_norm": 1.7403631691447954, "language_loss": 0.83572906, "learning_rate": 8.667569360094713e-07, "loss": 0.85661256, "num_input_tokens_seen": 251502670, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.35351562, "step": 11659, "time_per_iteration": 3.785346031188965 }, { "auxiliary_loss_clip": 0.0105034, "auxiliary_loss_mlp": 0.01033008, "balance_loss_clip": 1.01059663, "balance_loss_mlp": 1.01486742, "epoch": 0.7010371261085224, "flos": 19244125140480.0, "grad_norm": 2.1222440042605752, "language_loss": 0.71123558, "learning_rate": 8.664360495707526e-07, "loss": 0.73206902, "num_input_tokens_seen": 251521630, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35546875, "step": 11660, "time_per_iteration": 2.3738138675689697 }, { "auxiliary_loss_clip": 0.01054342, "auxiliary_loss_mlp": 0.01043184, "balance_loss_clip": 1.0176729, "balance_loss_mlp": 1.01666152, "epoch": 0.7010972493611904, "flos": 22126484632320.0, "grad_norm": 2.1979808825325815, "language_loss": 0.82587749, "learning_rate": 8.661152061168924e-07, "loss": 0.84685272, "num_input_tokens_seen": 251540105, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37695312, "step": 11661, "time_per_iteration": 2.39043927192688 }, { "auxiliary_loss_clip": 0.01052014, "auxiliary_loss_mlp": 0.01039524, "balance_loss_clip": 1.01677847, "balance_loss_mlp": 1.01533985, "epoch": 0.7011573726138585, "flos": 31389885072000.0, "grad_norm": 2.38469615613404, "language_loss": 0.80677056, "learning_rate": 8.657944056600579e-07, "loss": 0.82768595, "num_input_tokens_seen": 251560530, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.3671875, "step": 11662, "time_per_iteration": 2.429936408996582 }, { "auxiliary_loss_clip": 0.01054702, "auxiliary_loss_mlp": 0.01036462, "balance_loss_clip": 1.01272738, "balance_loss_mlp": 1.01644778, "epoch": 0.7012174958665264, "flos": 18149598017280.0, "grad_norm": 1.7279675924616176, "language_loss": 0.83996689, "learning_rate": 8.654736482124134e-07, "loss": 0.86087859, "num_input_tokens_seen": 251577930, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3828125, "step": 11663, "time_per_iteration": 2.3359127044677734 }, { "auxiliary_loss_clip": 0.01008196, "auxiliary_loss_mlp": 0.0100814, "balance_loss_clip": 1.00575578, "balance_loss_mlp": 1.00145268, "epoch": 0.7012776191191944, "flos": 60648216773760.0, "grad_norm": 0.8219505640328004, "language_loss": 0.53739923, "learning_rate": 8.651529337861209e-07, "loss": 0.55756259, "num_input_tokens_seen": 251638820, "router_z_loss_clip": 0.02380371, "router_z_loss_mlp": 0.06738281, "step": 11664, "time_per_iteration": 2.995934009552002 }, { "auxiliary_loss_clip": 0.01054092, "auxiliary_loss_mlp": 0.0103898, "balance_loss_clip": 1.01193166, "balance_loss_mlp": 1.01674676, "epoch": 0.7013377423718623, "flos": 27197374700160.0, "grad_norm": 2.049169491218709, "language_loss": 0.81315339, "learning_rate": 8.64832262393344e-07, "loss": 0.83408409, "num_input_tokens_seen": 251658070, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.375, "step": 11665, "time_per_iteration": 2.454685688018799 }, { "auxiliary_loss_clip": 0.01052978, "auxiliary_loss_mlp": 0.01034772, "balance_loss_clip": 1.01156211, "balance_loss_mlp": 1.01652241, "epoch": 0.7013978656245303, "flos": 16542650736000.0, "grad_norm": 2.1394588576838083, "language_loss": 0.78863728, "learning_rate": 8.645116340462404e-07, "loss": 0.80951476, "num_input_tokens_seen": 251671575, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36328125, "step": 11666, "time_per_iteration": 2.306657552719116 }, { "auxiliary_loss_clip": 0.01052887, "auxiliary_loss_mlp": 0.01040882, "balance_loss_clip": 1.01674211, "balance_loss_mlp": 1.01689947, "epoch": 0.7014579888771982, "flos": 23142806576640.0, "grad_norm": 1.736986141174474, "language_loss": 0.82378578, "learning_rate": 8.641910487569695e-07, "loss": 0.84472346, "num_input_tokens_seen": 251689350, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.359375, "step": 11667, "time_per_iteration": 2.377025842666626 }, { "auxiliary_loss_clip": 0.01052251, "auxiliary_loss_mlp": 0.01039162, "balance_loss_clip": 1.01481926, "balance_loss_mlp": 1.01597154, "epoch": 0.7015181121298663, "flos": 25080939815040.0, "grad_norm": 2.7597718497773536, "language_loss": 0.65940952, "learning_rate": 8.638705065376879e-07, "loss": 0.68032372, "num_input_tokens_seen": 251704635, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36328125, "step": 11668, "time_per_iteration": 2.3921072483062744 }, { "auxiliary_loss_clip": 0.01054823, "auxiliary_loss_mlp": 0.01035744, "balance_loss_clip": 1.01071048, "balance_loss_mlp": 1.01657605, "epoch": 0.7015782353825342, "flos": 23326868597760.0, "grad_norm": 1.735423967205533, "language_loss": 0.7767638, "learning_rate": 8.635500074005519e-07, "loss": 0.79766941, "num_input_tokens_seen": 251723035, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3828125, "step": 11669, "time_per_iteration": 2.4696245193481445 }, { "auxiliary_loss_clip": 0.01008064, "auxiliary_loss_mlp": 0.01005728, "balance_loss_clip": 1.00302231, "balance_loss_mlp": 1.0010916, "epoch": 0.7016383586352022, "flos": 70393732444800.0, "grad_norm": 0.7004618485629884, "language_loss": 0.54537761, "learning_rate": 8.632295513577122e-07, "loss": 0.56551552, "num_input_tokens_seen": 251791630, "router_z_loss_clip": 0.02709961, "router_z_loss_mlp": 0.06982422, "step": 11670, "time_per_iteration": 3.129777431488037 }, { "auxiliary_loss_clip": 0.01053548, "auxiliary_loss_mlp": 0.01037443, "balance_loss_clip": 1.01339841, "balance_loss_mlp": 1.0168643, "epoch": 0.7016984818878701, "flos": 19791249056640.0, "grad_norm": 1.807700361279175, "language_loss": 0.82868499, "learning_rate": 8.629091384213218e-07, "loss": 0.84959489, "num_input_tokens_seen": 251809840, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 11671, "time_per_iteration": 2.370938301086426 }, { "auxiliary_loss_clip": 0.01054296, "auxiliary_loss_mlp": 0.010358, "balance_loss_clip": 1.01210117, "balance_loss_mlp": 1.01749933, "epoch": 0.7017586051405381, "flos": 12896077294080.0, "grad_norm": 2.0225843641789503, "language_loss": 0.76885897, "learning_rate": 8.625887686035313e-07, "loss": 0.78975987, "num_input_tokens_seen": 251827550, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.3671875, "step": 11672, "time_per_iteration": 2.3380470275878906 }, { "auxiliary_loss_clip": 0.0105154, "auxiliary_loss_mlp": 0.01037331, "balance_loss_clip": 1.01387024, "balance_loss_mlp": 1.01566935, "epoch": 0.701818728393206, "flos": 18331844647680.0, "grad_norm": 1.7926303568800686, "language_loss": 0.87995046, "learning_rate": 8.622684419164883e-07, "loss": 0.90083909, "num_input_tokens_seen": 251844880, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35742188, "step": 11673, "time_per_iteration": 2.3659539222717285 }, { "auxiliary_loss_clip": 0.01051718, "auxiliary_loss_mlp": 0.01038329, "balance_loss_clip": 1.01521432, "balance_loss_mlp": 1.01654625, "epoch": 0.701878851645874, "flos": 17383254474240.0, "grad_norm": 1.7687659679367151, "language_loss": 0.74608481, "learning_rate": 8.619481583723399e-07, "loss": 0.7669853, "num_input_tokens_seen": 251861025, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.3515625, "step": 11674, "time_per_iteration": 2.3332138061523438 }, { "auxiliary_loss_clip": 0.01051125, "auxiliary_loss_mlp": 0.01036653, "balance_loss_clip": 1.0145998, "balance_loss_mlp": 1.01689661, "epoch": 0.701938974898542, "flos": 23914351912320.0, "grad_norm": 1.9146178962144484, "language_loss": 0.7323283, "learning_rate": 8.616279179832329e-07, "loss": 0.75320613, "num_input_tokens_seen": 251880175, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34179688, "step": 11675, "time_per_iteration": 2.397435426712036 }, { "auxiliary_loss_clip": 0.01052378, "auxiliary_loss_mlp": 0.01039471, "balance_loss_clip": 1.01572466, "balance_loss_mlp": 1.01555574, "epoch": 0.70199909815121, "flos": 21794600448000.0, "grad_norm": 2.0233199127695785, "language_loss": 0.51929373, "learning_rate": 8.613077207613078e-07, "loss": 0.54021221, "num_input_tokens_seen": 251899005, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3671875, "step": 11676, "time_per_iteration": 2.364105701446533 }, { "auxiliary_loss_clip": 0.01008362, "auxiliary_loss_mlp": 0.01004279, "balance_loss_clip": 1.00158513, "balance_loss_mlp": 1.00137579, "epoch": 0.702059221403878, "flos": 71711459089920.0, "grad_norm": 0.7287796981515064, "language_loss": 0.59266067, "learning_rate": 8.609875667187079e-07, "loss": 0.61278701, "num_input_tokens_seen": 251966790, "router_z_loss_clip": 0.02697754, "router_z_loss_mlp": 0.06982422, "step": 11677, "time_per_iteration": 3.07122540473938 }, { "auxiliary_loss_clip": 0.01053441, "auxiliary_loss_mlp": 0.01033547, "balance_loss_clip": 1.0091449, "balance_loss_mlp": 1.0159297, "epoch": 0.7021193446565459, "flos": 28109794838400.0, "grad_norm": 2.224457071584434, "language_loss": 0.63638902, "learning_rate": 8.606674558675737e-07, "loss": 0.65725887, "num_input_tokens_seen": 251989315, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.375, "step": 11678, "time_per_iteration": 2.4134421348571777 }, { "auxiliary_loss_clip": 0.01051261, "auxiliary_loss_mlp": 0.01040422, "balance_loss_clip": 1.01648486, "balance_loss_mlp": 1.01589, "epoch": 0.7021794679092139, "flos": 22923936063360.0, "grad_norm": 1.6151568489866754, "language_loss": 0.80101788, "learning_rate": 8.603473882200444e-07, "loss": 0.82193476, "num_input_tokens_seen": 252006620, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.35351562, "step": 11679, "time_per_iteration": 2.447798252105713 }, { "auxiliary_loss_clip": 0.01052409, "auxiliary_loss_mlp": 0.01038383, "balance_loss_clip": 1.01613879, "balance_loss_mlp": 1.01690245, "epoch": 0.7022395911618818, "flos": 18076839010560.0, "grad_norm": 2.2825911743132714, "language_loss": 0.71877038, "learning_rate": 8.600273637882567e-07, "loss": 0.73967826, "num_input_tokens_seen": 252024570, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35546875, "step": 11680, "time_per_iteration": 2.404818058013916 }, { "auxiliary_loss_clip": 0.01054808, "auxiliary_loss_mlp": 0.01041088, "balance_loss_clip": 1.01668549, "balance_loss_mlp": 1.0167073, "epoch": 0.7022997144145499, "flos": 16033372600320.0, "grad_norm": 1.754227999388068, "language_loss": 0.75999808, "learning_rate": 8.597073825843446e-07, "loss": 0.7809571, "num_input_tokens_seen": 252042775, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.38085938, "step": 11681, "time_per_iteration": 2.35341215133667 }, { "auxiliary_loss_clip": 0.01054322, "auxiliary_loss_mlp": 0.01036418, "balance_loss_clip": 1.01347041, "balance_loss_mlp": 1.01676917, "epoch": 0.7023598376672178, "flos": 26467480483200.0, "grad_norm": 1.944087849389389, "language_loss": 0.77723658, "learning_rate": 8.593874446204434e-07, "loss": 0.79814392, "num_input_tokens_seen": 252063690, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.375, "step": 11682, "time_per_iteration": 2.4471960067749023 }, { "auxiliary_loss_clip": 0.01053471, "auxiliary_loss_mlp": 0.01041374, "balance_loss_clip": 1.01619744, "balance_loss_mlp": 1.01583982, "epoch": 0.7024199609198858, "flos": 17054966160000.0, "grad_norm": 2.0439080109216925, "language_loss": 0.75194108, "learning_rate": 8.590675499086841e-07, "loss": 0.77288949, "num_input_tokens_seen": 252080335, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37695312, "step": 11683, "time_per_iteration": 3.5945653915405273 }, { "auxiliary_loss_clip": 0.01052328, "auxiliary_loss_mlp": 0.01040177, "balance_loss_clip": 1.01366484, "balance_loss_mlp": 1.01618123, "epoch": 0.7024800841725537, "flos": 25847841939840.0, "grad_norm": 1.8006752924284735, "language_loss": 0.72663105, "learning_rate": 8.587476984611976e-07, "loss": 0.74755609, "num_input_tokens_seen": 252101075, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.36328125, "step": 11684, "time_per_iteration": 2.394998073577881 }, { "auxiliary_loss_clip": 0.01052189, "auxiliary_loss_mlp": 0.01036561, "balance_loss_clip": 1.0139823, "balance_loss_mlp": 1.01596713, "epoch": 0.7025402074252217, "flos": 23511908136960.0, "grad_norm": 1.8863236225247524, "language_loss": 0.72987747, "learning_rate": 8.584278902901128e-07, "loss": 0.75076497, "num_input_tokens_seen": 252120510, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.36328125, "step": 11685, "time_per_iteration": 2.4023122787475586 }, { "auxiliary_loss_clip": 0.01052517, "auxiliary_loss_mlp": 0.01034665, "balance_loss_clip": 1.01258719, "balance_loss_mlp": 1.01580393, "epoch": 0.7026003306778896, "flos": 20150121588480.0, "grad_norm": 1.5880218901273346, "language_loss": 0.85316288, "learning_rate": 8.581081254075582e-07, "loss": 0.87403464, "num_input_tokens_seen": 252137590, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3671875, "step": 11686, "time_per_iteration": 2.3437767028808594 }, { "auxiliary_loss_clip": 0.01008047, "auxiliary_loss_mlp": 0.0100451, "balance_loss_clip": 1.00197053, "balance_loss_mlp": 1.00098813, "epoch": 0.7026604539305576, "flos": 64769294770560.0, "grad_norm": 0.987636016672983, "language_loss": 0.70034885, "learning_rate": 8.577884038256566e-07, "loss": 0.72047448, "num_input_tokens_seen": 252199830, "router_z_loss_clip": 0.02539062, "router_z_loss_mlp": 0.0703125, "step": 11687, "time_per_iteration": 4.430670261383057 }, { "auxiliary_loss_clip": 0.01052834, "auxiliary_loss_mlp": 0.01039826, "balance_loss_clip": 1.01781988, "balance_loss_mlp": 1.01641047, "epoch": 0.7027205771832256, "flos": 21870396743040.0, "grad_norm": 3.7394662544084385, "language_loss": 0.78750795, "learning_rate": 8.574687255565329e-07, "loss": 0.80843455, "num_input_tokens_seen": 252217200, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.36328125, "step": 11688, "time_per_iteration": 2.3935906887054443 }, { "auxiliary_loss_clip": 0.01053648, "auxiliary_loss_mlp": 0.01041446, "balance_loss_clip": 1.01694894, "balance_loss_mlp": 1.01700306, "epoch": 0.7027807004358936, "flos": 23366669414400.0, "grad_norm": 2.0287205181243086, "language_loss": 0.69971073, "learning_rate": 8.571490906123107e-07, "loss": 0.7206617, "num_input_tokens_seen": 252236105, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3671875, "step": 11689, "time_per_iteration": 3.7493700981140137 }, { "auxiliary_loss_clip": 0.01054959, "auxiliary_loss_mlp": 0.01041622, "balance_loss_clip": 1.01443005, "balance_loss_mlp": 1.01686561, "epoch": 0.7028408236885616, "flos": 15303373649280.0, "grad_norm": 1.994691472760705, "language_loss": 0.80797231, "learning_rate": 8.568294990051086e-07, "loss": 0.82893801, "num_input_tokens_seen": 252253315, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.38085938, "step": 11690, "time_per_iteration": 2.401571750640869 }, { "auxiliary_loss_clip": 0.01055415, "auxiliary_loss_mlp": 0.01040764, "balance_loss_clip": 1.01717281, "balance_loss_mlp": 1.01825333, "epoch": 0.7029009469412295, "flos": 22017101742720.0, "grad_norm": 1.6180703425643823, "language_loss": 0.7659539, "learning_rate": 8.56509950747047e-07, "loss": 0.78691566, "num_input_tokens_seen": 252272765, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37109375, "step": 11691, "time_per_iteration": 2.3610615730285645 }, { "auxiliary_loss_clip": 0.01053923, "auxiliary_loss_mlp": 0.01037635, "balance_loss_clip": 1.01545, "balance_loss_mlp": 1.01790309, "epoch": 0.7029610701938975, "flos": 21834436176000.0, "grad_norm": 1.949335260845897, "language_loss": 0.82377321, "learning_rate": 8.561904458502429e-07, "loss": 0.84468877, "num_input_tokens_seen": 252290510, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.359375, "step": 11692, "time_per_iteration": 2.384661912918091 }, { "auxiliary_loss_clip": 0.01052965, "auxiliary_loss_mlp": 0.0103534, "balance_loss_clip": 1.01123548, "balance_loss_mlp": 1.01648045, "epoch": 0.7030211934465654, "flos": 19134637516800.0, "grad_norm": 1.586962564984575, "language_loss": 0.77468365, "learning_rate": 8.558709843268111e-07, "loss": 0.79556674, "num_input_tokens_seen": 252309365, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36523438, "step": 11693, "time_per_iteration": 2.3609442710876465 }, { "auxiliary_loss_clip": 0.01052874, "auxiliary_loss_mlp": 0.01037622, "balance_loss_clip": 1.01498389, "balance_loss_mlp": 1.01769912, "epoch": 0.7030813166992335, "flos": 38544461303040.0, "grad_norm": 1.636306874966264, "language_loss": 0.69897205, "learning_rate": 8.55551566188866e-07, "loss": 0.719877, "num_input_tokens_seen": 252333010, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 11694, "time_per_iteration": 2.5595955848693848 }, { "auxiliary_loss_clip": 0.01052828, "auxiliary_loss_mlp": 0.01039446, "balance_loss_clip": 1.01515126, "balance_loss_mlp": 1.0158031, "epoch": 0.7031414399519014, "flos": 14720009875200.0, "grad_norm": 2.2611275335301477, "language_loss": 0.77200365, "learning_rate": 8.552321914485203e-07, "loss": 0.79292643, "num_input_tokens_seen": 252351330, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.37109375, "step": 11695, "time_per_iteration": 2.3596301078796387 }, { "auxiliary_loss_clip": 0.01054529, "auxiliary_loss_mlp": 0.01039853, "balance_loss_clip": 1.01613009, "balance_loss_mlp": 1.01727057, "epoch": 0.7032015632045694, "flos": 14026390427520.0, "grad_norm": 1.9152164946404682, "language_loss": 0.75332648, "learning_rate": 8.549128601178852e-07, "loss": 0.7742703, "num_input_tokens_seen": 252369580, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.37304688, "step": 11696, "time_per_iteration": 2.424807071685791 }, { "auxiliary_loss_clip": 0.01054149, "auxiliary_loss_mlp": 0.01039045, "balance_loss_clip": 1.01328444, "balance_loss_mlp": 1.01713729, "epoch": 0.7032616864572373, "flos": 27635918688000.0, "grad_norm": 1.4727917707338387, "language_loss": 0.76258242, "learning_rate": 8.545935722090693e-07, "loss": 0.78351432, "num_input_tokens_seen": 252390525, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37109375, "step": 11697, "time_per_iteration": 2.4145584106445312 }, { "auxiliary_loss_clip": 0.01054696, "auxiliary_loss_mlp": 0.01042733, "balance_loss_clip": 1.01678073, "balance_loss_mlp": 1.01738429, "epoch": 0.7033218097099053, "flos": 17966338957440.0, "grad_norm": 3.0251722270196444, "language_loss": 0.81481159, "learning_rate": 8.542743277341793e-07, "loss": 0.83578587, "num_input_tokens_seen": 252407470, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37304688, "step": 11698, "time_per_iteration": 2.3612053394317627 }, { "auxiliary_loss_clip": 0.01053592, "auxiliary_loss_mlp": 0.01041706, "balance_loss_clip": 1.01749492, "balance_loss_mlp": 1.01649189, "epoch": 0.7033819329625732, "flos": 19500666877440.0, "grad_norm": 1.386048467884942, "language_loss": 0.85611272, "learning_rate": 8.539551267053222e-07, "loss": 0.87706566, "num_input_tokens_seen": 252427025, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37109375, "step": 11699, "time_per_iteration": 3.759413242340088 }, { "auxiliary_loss_clip": 0.01052437, "auxiliary_loss_mlp": 0.01037059, "balance_loss_clip": 1.01167893, "balance_loss_mlp": 1.01626837, "epoch": 0.7034420562152413, "flos": 23986517425920.0, "grad_norm": 1.9493389858323087, "language_loss": 0.80508983, "learning_rate": 8.53635969134601e-07, "loss": 0.82598472, "num_input_tokens_seen": 252445410, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36132812, "step": 11700, "time_per_iteration": 2.4684903621673584 }, { "auxiliary_loss_clip": 0.0105222, "auxiliary_loss_mlp": 0.01034093, "balance_loss_clip": 1.00953603, "balance_loss_mlp": 1.01539016, "epoch": 0.7035021794679092, "flos": 35041974508800.0, "grad_norm": 1.7442158572046753, "language_loss": 0.7584734, "learning_rate": 8.533168550341186e-07, "loss": 0.77933651, "num_input_tokens_seen": 252463905, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3671875, "step": 11701, "time_per_iteration": 2.45867919921875 }, { "auxiliary_loss_clip": 0.01054391, "auxiliary_loss_mlp": 0.01040659, "balance_loss_clip": 1.01585197, "balance_loss_mlp": 1.01705348, "epoch": 0.7035623027205772, "flos": 10996697531520.0, "grad_norm": 2.501398189330933, "language_loss": 0.85194129, "learning_rate": 8.529977844159769e-07, "loss": 0.87289178, "num_input_tokens_seen": 252478655, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37304688, "step": 11702, "time_per_iteration": 2.343057870864868 }, { "auxiliary_loss_clip": 0.01053578, "auxiliary_loss_mlp": 0.01034694, "balance_loss_clip": 1.01177001, "balance_loss_mlp": 1.01724744, "epoch": 0.7036224259732452, "flos": 23622582746880.0, "grad_norm": 1.9476649228554805, "language_loss": 0.61636961, "learning_rate": 8.526787572922738e-07, "loss": 0.63725233, "num_input_tokens_seen": 252498740, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.36328125, "step": 11703, "time_per_iteration": 2.431807518005371 }, { "auxiliary_loss_clip": 0.01053349, "auxiliary_loss_mlp": 0.01035472, "balance_loss_clip": 1.01198816, "balance_loss_mlp": 1.01628935, "epoch": 0.7036825492259131, "flos": 31684831171200.0, "grad_norm": 2.7293669882190517, "language_loss": 0.63105142, "learning_rate": 8.523597736751067e-07, "loss": 0.65193957, "num_input_tokens_seen": 252517800, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.37109375, "step": 11704, "time_per_iteration": 2.4580395221710205 }, { "auxiliary_loss_clip": 0.01050063, "auxiliary_loss_mlp": 0.01037356, "balance_loss_clip": 1.01616096, "balance_loss_mlp": 1.0152998, "epoch": 0.7037426724785811, "flos": 30191491054080.0, "grad_norm": 1.5777958823256464, "language_loss": 0.72068524, "learning_rate": 8.520408335765719e-07, "loss": 0.74155945, "num_input_tokens_seen": 252539620, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34765625, "step": 11705, "time_per_iteration": 2.441856622695923 }, { "auxiliary_loss_clip": 0.01051592, "auxiliary_loss_mlp": 0.01038633, "balance_loss_clip": 1.01786685, "balance_loss_mlp": 1.0161581, "epoch": 0.703802795731249, "flos": 24310511642880.0, "grad_norm": 1.945010893284573, "language_loss": 0.63391471, "learning_rate": 8.517219370087645e-07, "loss": 0.65481699, "num_input_tokens_seen": 252557300, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.35351562, "step": 11706, "time_per_iteration": 2.412151575088501 }, { "auxiliary_loss_clip": 0.01053769, "auxiliary_loss_mlp": 0.01041972, "balance_loss_clip": 1.01677084, "balance_loss_mlp": 1.01631236, "epoch": 0.7038629189839171, "flos": 22527846155520.0, "grad_norm": 1.8185552558901321, "language_loss": 0.69524527, "learning_rate": 8.514030839837756e-07, "loss": 0.71620262, "num_input_tokens_seen": 252576715, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.375, "step": 11707, "time_per_iteration": 2.3653564453125 }, { "auxiliary_loss_clip": 0.01049823, "auxiliary_loss_mlp": 0.01031562, "balance_loss_clip": 1.01039028, "balance_loss_mlp": 1.01565313, "epoch": 0.703923042236585, "flos": 26249273285760.0, "grad_norm": 1.8867188749519992, "language_loss": 0.77393579, "learning_rate": 8.510842745136974e-07, "loss": 0.79474962, "num_input_tokens_seen": 252596190, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34179688, "step": 11708, "time_per_iteration": 2.418410062789917 }, { "auxiliary_loss_clip": 0.01050953, "auxiliary_loss_mlp": 0.01033037, "balance_loss_clip": 1.01252079, "balance_loss_mlp": 1.01660752, "epoch": 0.703983165489253, "flos": 19389259128960.0, "grad_norm": 1.8118395101186702, "language_loss": 0.73314255, "learning_rate": 8.50765508610619e-07, "loss": 0.75398248, "num_input_tokens_seen": 252613410, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.34375, "step": 11709, "time_per_iteration": 2.3824455738067627 }, { "auxiliary_loss_clip": 0.01050228, "auxiliary_loss_mlp": 0.01037012, "balance_loss_clip": 1.01463652, "balance_loss_mlp": 1.01484287, "epoch": 0.7040432887419209, "flos": 16682897134080.0, "grad_norm": 2.2652831965756985, "language_loss": 0.80498421, "learning_rate": 8.504467862866267e-07, "loss": 0.82585669, "num_input_tokens_seen": 252629150, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35546875, "step": 11710, "time_per_iteration": 2.3531723022460938 }, { "auxiliary_loss_clip": 0.01053613, "auxiliary_loss_mlp": 0.01045001, "balance_loss_clip": 1.02157629, "balance_loss_mlp": 1.01698327, "epoch": 0.7041034119945889, "flos": 21140362880640.0, "grad_norm": 1.6177818638504742, "language_loss": 0.78719282, "learning_rate": 8.501281075538076e-07, "loss": 0.80817896, "num_input_tokens_seen": 252648225, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3671875, "step": 11711, "time_per_iteration": 2.3527982234954834 }, { "auxiliary_loss_clip": 0.01051071, "auxiliary_loss_mlp": 0.01038144, "balance_loss_clip": 1.01523209, "balance_loss_mlp": 1.01531303, "epoch": 0.7041635352472568, "flos": 16909343412480.0, "grad_norm": 2.3815037394255025, "language_loss": 0.75866103, "learning_rate": 8.498094724242457e-07, "loss": 0.77955317, "num_input_tokens_seen": 252665380, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35742188, "step": 11712, "time_per_iteration": 2.35577392578125 }, { "auxiliary_loss_clip": 0.01007762, "auxiliary_loss_mlp": 0.0100522, "balance_loss_clip": 1.00282359, "balance_loss_mlp": 1.00075316, "epoch": 0.7042236584999249, "flos": 71677558293120.0, "grad_norm": 0.900332642961795, "language_loss": 0.64789879, "learning_rate": 8.494908809100247e-07, "loss": 0.66802859, "num_input_tokens_seen": 252727950, "router_z_loss_clip": 0.02392578, "router_z_loss_mlp": 0.0703125, "step": 11713, "time_per_iteration": 3.052063465118408 }, { "auxiliary_loss_clip": 0.01050945, "auxiliary_loss_mlp": 0.01036425, "balance_loss_clip": 1.01537251, "balance_loss_mlp": 1.01594591, "epoch": 0.7042837817525928, "flos": 28656918754560.0, "grad_norm": 1.8685914882095165, "language_loss": 0.73534989, "learning_rate": 8.49172333023225e-07, "loss": 0.75622362, "num_input_tokens_seen": 252746770, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.34960938, "step": 11714, "time_per_iteration": 2.474581003189087 }, { "auxiliary_loss_clip": 0.01052544, "auxiliary_loss_mlp": 0.01047397, "balance_loss_clip": 1.02273273, "balance_loss_mlp": 1.01628745, "epoch": 0.7043439050052608, "flos": 19752600314880.0, "grad_norm": 1.9388971082739306, "language_loss": 0.81345153, "learning_rate": 8.488538287759248e-07, "loss": 0.83445096, "num_input_tokens_seen": 252765610, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36328125, "step": 11715, "time_per_iteration": 2.3424272537231445 }, { "auxiliary_loss_clip": 0.01053779, "auxiliary_loss_mlp": 0.01042158, "balance_loss_clip": 1.01856649, "balance_loss_mlp": 1.01634264, "epoch": 0.7044040282579288, "flos": 11537956339200.0, "grad_norm": 2.2194174379105327, "language_loss": 0.72750384, "learning_rate": 8.485353681802037e-07, "loss": 0.74846315, "num_input_tokens_seen": 252781610, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.375, "step": 11716, "time_per_iteration": 2.3767900466918945 }, { "auxiliary_loss_clip": 0.01054908, "auxiliary_loss_mlp": 0.01039372, "balance_loss_clip": 1.01611435, "balance_loss_mlp": 1.01731181, "epoch": 0.7044641515105967, "flos": 33654735613440.0, "grad_norm": 2.189494639803364, "language_loss": 0.68131471, "learning_rate": 8.482169512481358e-07, "loss": 0.70225751, "num_input_tokens_seen": 252800600, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.375, "step": 11717, "time_per_iteration": 2.4795823097229004 }, { "auxiliary_loss_clip": 0.01051115, "auxiliary_loss_mlp": 0.01041673, "balance_loss_clip": 1.01938033, "balance_loss_mlp": 1.014925, "epoch": 0.7045242747632647, "flos": 26722660677120.0, "grad_norm": 1.5905799496760784, "language_loss": 0.74836373, "learning_rate": 8.478985779917967e-07, "loss": 0.76929164, "num_input_tokens_seen": 252822310, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.36132812, "step": 11718, "time_per_iteration": 2.4388298988342285 }, { "auxiliary_loss_clip": 0.01052838, "auxiliary_loss_mlp": 0.01034699, "balance_loss_clip": 1.01393294, "balance_loss_mlp": 1.01652408, "epoch": 0.7045843980159326, "flos": 26796432113280.0, "grad_norm": 1.6767661602901758, "language_loss": 0.81281412, "learning_rate": 8.475802484232606e-07, "loss": 0.83368945, "num_input_tokens_seen": 252842355, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.36328125, "step": 11719, "time_per_iteration": 2.439176321029663 }, { "auxiliary_loss_clip": 0.01051972, "auxiliary_loss_mlp": 0.01041331, "balance_loss_clip": 1.01736999, "balance_loss_mlp": 1.01675725, "epoch": 0.7046445212686007, "flos": 41573176680960.0, "grad_norm": 1.820200825915218, "language_loss": 0.66773838, "learning_rate": 8.472619625545951e-07, "loss": 0.68867135, "num_input_tokens_seen": 252866785, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3515625, "step": 11720, "time_per_iteration": 2.5654046535491943 }, { "auxiliary_loss_clip": 0.01055906, "auxiliary_loss_mlp": 0.01036369, "balance_loss_clip": 1.01207399, "balance_loss_mlp": 1.01862812, "epoch": 0.7047046445212686, "flos": 15559252070400.0, "grad_norm": 2.3312572898047557, "language_loss": 0.81849861, "learning_rate": 8.46943720397872e-07, "loss": 0.83942133, "num_input_tokens_seen": 252881870, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37304688, "step": 11721, "time_per_iteration": 2.338749647140503 }, { "auxiliary_loss_clip": 0.01008393, "auxiliary_loss_mlp": 0.0100394, "balance_loss_clip": 1.00140119, "balance_loss_mlp": 1.00143445, "epoch": 0.7047647677739366, "flos": 70406475091200.0, "grad_norm": 0.7623445852027596, "language_loss": 0.64851409, "learning_rate": 8.466255219651582e-07, "loss": 0.66863739, "num_input_tokens_seen": 252951300, "router_z_loss_clip": 0.02539062, "router_z_loss_mlp": 0.06933594, "step": 11722, "time_per_iteration": 3.140843152999878 }, { "auxiliary_loss_clip": 0.01052142, "auxiliary_loss_mlp": 0.01041343, "balance_loss_clip": 1.01834774, "balance_loss_mlp": 1.01657653, "epoch": 0.7048248910266045, "flos": 23658892427520.0, "grad_norm": 1.5342205969690794, "language_loss": 0.6644429, "learning_rate": 8.463073672685211e-07, "loss": 0.68537772, "num_input_tokens_seen": 252971400, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35546875, "step": 11723, "time_per_iteration": 3.6164069175720215 }, { "auxiliary_loss_clip": 0.01053383, "auxiliary_loss_mlp": 0.01039503, "balance_loss_clip": 1.01541078, "balance_loss_mlp": 1.01631069, "epoch": 0.7048850142792725, "flos": 21396101656320.0, "grad_norm": 1.7276378329871664, "language_loss": 0.8153336, "learning_rate": 8.459892563200235e-07, "loss": 0.83626252, "num_input_tokens_seen": 252989475, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37109375, "step": 11724, "time_per_iteration": 2.402494192123413 }, { "auxiliary_loss_clip": 0.01053534, "auxiliary_loss_mlp": 0.01043565, "balance_loss_clip": 1.01885319, "balance_loss_mlp": 1.01630878, "epoch": 0.7049451375319404, "flos": 21647162309760.0, "grad_norm": 1.7689261293706857, "language_loss": 0.73741221, "learning_rate": 8.456711891317296e-07, "loss": 0.75838321, "num_input_tokens_seen": 253007220, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.37109375, "step": 11725, "time_per_iteration": 2.3724658489227295 }, { "auxiliary_loss_clip": 0.01055136, "auxiliary_loss_mlp": 0.01044044, "balance_loss_clip": 1.01837814, "balance_loss_mlp": 1.01763439, "epoch": 0.7050052607846085, "flos": 14865911913600.0, "grad_norm": 1.8982171600694007, "language_loss": 0.79078114, "learning_rate": 8.453531657156998e-07, "loss": 0.811773, "num_input_tokens_seen": 253025410, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.375, "step": 11726, "time_per_iteration": 3.711406707763672 }, { "auxiliary_loss_clip": 0.01052597, "auxiliary_loss_mlp": 0.0103743, "balance_loss_clip": 1.014256, "balance_loss_mlp": 1.01560664, "epoch": 0.7050653840372764, "flos": 19240843472640.0, "grad_norm": 2.024646677267195, "language_loss": 0.71123385, "learning_rate": 8.450351860839931e-07, "loss": 0.7321341, "num_input_tokens_seen": 253043305, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36914062, "step": 11727, "time_per_iteration": 2.3739194869995117 }, { "auxiliary_loss_clip": 0.01047846, "auxiliary_loss_mlp": 0.01033496, "balance_loss_clip": 1.01255071, "balance_loss_mlp": 1.01521909, "epoch": 0.7051255072899444, "flos": 27779237285760.0, "grad_norm": 1.5562806674674543, "language_loss": 0.69711185, "learning_rate": 8.44717250248668e-07, "loss": 0.71792531, "num_input_tokens_seen": 253062790, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.32617188, "step": 11728, "time_per_iteration": 3.79042387008667 }, { "auxiliary_loss_clip": 0.01052147, "auxiliary_loss_mlp": 0.01039216, "balance_loss_clip": 1.01417017, "balance_loss_mlp": 1.01653671, "epoch": 0.7051856305426124, "flos": 27890784679680.0, "grad_norm": 1.8526011904576865, "language_loss": 0.74001926, "learning_rate": 8.443993582217803e-07, "loss": 0.76093292, "num_input_tokens_seen": 253082055, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.35546875, "step": 11729, "time_per_iteration": 2.4454312324523926 }, { "auxiliary_loss_clip": 0.0105642, "auxiliary_loss_mlp": 0.01044466, "balance_loss_clip": 1.01866901, "balance_loss_mlp": 1.01716316, "epoch": 0.7052457537952803, "flos": 25042465630080.0, "grad_norm": 1.5844329301097693, "language_loss": 0.79485786, "learning_rate": 8.440815100153862e-07, "loss": 0.81586671, "num_input_tokens_seen": 253102575, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.39257812, "step": 11730, "time_per_iteration": 2.386445999145508 }, { "auxiliary_loss_clip": 0.01054023, "auxiliary_loss_mlp": 0.01040796, "balance_loss_clip": 1.01682353, "balance_loss_mlp": 1.01672113, "epoch": 0.7053058770479483, "flos": 21870641122560.0, "grad_norm": 2.2551294367164108, "language_loss": 0.64129955, "learning_rate": 8.437637056415359e-07, "loss": 0.66224778, "num_input_tokens_seen": 253121290, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37304688, "step": 11731, "time_per_iteration": 2.378030300140381 }, { "auxiliary_loss_clip": 0.01054382, "auxiliary_loss_mlp": 0.0103909, "balance_loss_clip": 1.01517689, "balance_loss_mlp": 1.01650429, "epoch": 0.7053660003006162, "flos": 16397796038400.0, "grad_norm": 2.0361613327326036, "language_loss": 0.75795561, "learning_rate": 8.434459451122815e-07, "loss": 0.77889037, "num_input_tokens_seen": 253139720, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37890625, "step": 11732, "time_per_iteration": 2.354156970977783 }, { "auxiliary_loss_clip": 0.01051857, "auxiliary_loss_mlp": 0.01033704, "balance_loss_clip": 1.01166248, "balance_loss_mlp": 1.01606059, "epoch": 0.7054261235532843, "flos": 22710441899520.0, "grad_norm": 1.6107121533854414, "language_loss": 0.71990073, "learning_rate": 8.431282284396735e-07, "loss": 0.74075627, "num_input_tokens_seen": 253160250, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35742188, "step": 11733, "time_per_iteration": 2.384000062942505 }, { "auxiliary_loss_clip": 0.01050801, "auxiliary_loss_mlp": 0.01035656, "balance_loss_clip": 1.01288748, "balance_loss_mlp": 1.01529193, "epoch": 0.7054862468059522, "flos": 13588858869120.0, "grad_norm": 2.038418524907823, "language_loss": 0.7430315, "learning_rate": 8.428105556357583e-07, "loss": 0.76389605, "num_input_tokens_seen": 253178710, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35546875, "step": 11734, "time_per_iteration": 2.348458766937256 }, { "auxiliary_loss_clip": 0.01054689, "auxiliary_loss_mlp": 0.01043123, "balance_loss_clip": 1.01669455, "balance_loss_mlp": 1.01620948, "epoch": 0.7055463700586202, "flos": 15879999530880.0, "grad_norm": 2.709815294222579, "language_loss": 0.7173273, "learning_rate": 8.424929267125829e-07, "loss": 0.73830545, "num_input_tokens_seen": 253194805, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.38476562, "step": 11735, "time_per_iteration": 2.3873109817504883 }, { "auxiliary_loss_clip": 0.01053144, "auxiliary_loss_mlp": 0.01044363, "balance_loss_clip": 1.01781487, "balance_loss_mlp": 1.01614249, "epoch": 0.7056064933112881, "flos": 23075039894400.0, "grad_norm": 1.9699608725937527, "language_loss": 0.72873485, "learning_rate": 8.421753416821933e-07, "loss": 0.74970996, "num_input_tokens_seen": 253213895, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37109375, "step": 11736, "time_per_iteration": 2.3784830570220947 }, { "auxiliary_loss_clip": 0.01052579, "auxiliary_loss_mlp": 0.01036434, "balance_loss_clip": 1.01373649, "balance_loss_mlp": 1.01689613, "epoch": 0.7056666165639561, "flos": 24056134410240.0, "grad_norm": 1.8111662053652924, "language_loss": 0.69764507, "learning_rate": 8.41857800556629e-07, "loss": 0.71853518, "num_input_tokens_seen": 253231620, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35546875, "step": 11737, "time_per_iteration": 2.4847075939178467 }, { "auxiliary_loss_clip": 0.01054412, "auxiliary_loss_mlp": 0.01043897, "balance_loss_clip": 1.01750433, "balance_loss_mlp": 1.01649904, "epoch": 0.705726739816624, "flos": 17492288250240.0, "grad_norm": 2.205681679520115, "language_loss": 0.69035101, "learning_rate": 8.415403033479332e-07, "loss": 0.71133411, "num_input_tokens_seen": 253249590, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.37890625, "step": 11738, "time_per_iteration": 2.3752493858337402 }, { "auxiliary_loss_clip": 0.01052854, "auxiliary_loss_mlp": 0.01036949, "balance_loss_clip": 1.01159298, "balance_loss_mlp": 1.01627469, "epoch": 0.7057868630692921, "flos": 51348578342400.0, "grad_norm": 3.6350415176944737, "language_loss": 0.76195836, "learning_rate": 8.41222850068145e-07, "loss": 0.7828564, "num_input_tokens_seen": 253273870, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36523438, "step": 11739, "time_per_iteration": 3.9973809719085693 }, { "auxiliary_loss_clip": 0.01050631, "auxiliary_loss_mlp": 0.01036213, "balance_loss_clip": 1.01289582, "balance_loss_mlp": 1.01590383, "epoch": 0.70584698632196, "flos": 26101800236160.0, "grad_norm": 1.9220311291067262, "language_loss": 0.72662526, "learning_rate": 8.409054407293032e-07, "loss": 0.74749368, "num_input_tokens_seen": 253293720, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.34765625, "step": 11740, "time_per_iteration": 2.417091131210327 }, { "auxiliary_loss_clip": 0.01051442, "auxiliary_loss_mlp": 0.0103653, "balance_loss_clip": 1.01483369, "balance_loss_mlp": 1.01599026, "epoch": 0.705907109574628, "flos": 21542073517440.0, "grad_norm": 1.668167657436761, "language_loss": 0.83325875, "learning_rate": 8.405880753434434e-07, "loss": 0.85413849, "num_input_tokens_seen": 253313700, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.35546875, "step": 11741, "time_per_iteration": 2.375587224960327 }, { "auxiliary_loss_clip": 0.01053107, "auxiliary_loss_mlp": 0.01038886, "balance_loss_clip": 1.01509202, "balance_loss_mlp": 1.01590705, "epoch": 0.705967232827296, "flos": 22709743672320.0, "grad_norm": 3.262959062923107, "language_loss": 0.79346871, "learning_rate": 8.402707539225993e-07, "loss": 0.81438863, "num_input_tokens_seen": 253332425, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37304688, "step": 11742, "time_per_iteration": 2.3679652214050293 }, { "auxiliary_loss_clip": 0.01053485, "auxiliary_loss_mlp": 0.01035601, "balance_loss_clip": 1.01223624, "balance_loss_mlp": 1.01629066, "epoch": 0.7060273560799639, "flos": 28690051501440.0, "grad_norm": 2.1797390349779344, "language_loss": 0.65530407, "learning_rate": 8.39953476478805e-07, "loss": 0.67619491, "num_input_tokens_seen": 253353620, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.37304688, "step": 11743, "time_per_iteration": 2.4386730194091797 }, { "auxiliary_loss_clip": 0.01054095, "auxiliary_loss_mlp": 0.01038736, "balance_loss_clip": 1.01425016, "balance_loss_mlp": 1.01635742, "epoch": 0.7060874793326319, "flos": 15705258842880.0, "grad_norm": 3.853423696676147, "language_loss": 0.66619825, "learning_rate": 8.396362430240902e-07, "loss": 0.68712664, "num_input_tokens_seen": 253370930, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37695312, "step": 11744, "time_per_iteration": 2.335339307785034 }, { "auxiliary_loss_clip": 0.01051742, "auxiliary_loss_mlp": 0.01033839, "balance_loss_clip": 1.01073623, "balance_loss_mlp": 1.01684022, "epoch": 0.7061476025852998, "flos": 21505694014080.0, "grad_norm": 1.738053208148986, "language_loss": 0.64446819, "learning_rate": 8.393190535704857e-07, "loss": 0.66532397, "num_input_tokens_seen": 253389810, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.34765625, "step": 11745, "time_per_iteration": 2.377833366394043 }, { "auxiliary_loss_clip": 0.01052531, "auxiliary_loss_mlp": 0.01039941, "balance_loss_clip": 1.01661205, "balance_loss_mlp": 1.01657355, "epoch": 0.7062077258379679, "flos": 28180633720320.0, "grad_norm": 1.747770203780208, "language_loss": 0.72033614, "learning_rate": 8.390019081300188e-07, "loss": 0.74126089, "num_input_tokens_seen": 253408685, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 11746, "time_per_iteration": 2.4467108249664307 }, { "auxiliary_loss_clip": 0.01052989, "auxiliary_loss_mlp": 0.01034969, "balance_loss_clip": 1.01034045, "balance_loss_mlp": 1.01695347, "epoch": 0.7062678490906358, "flos": 27852485051520.0, "grad_norm": 1.4539536746232318, "language_loss": 0.79746258, "learning_rate": 8.386848067147175e-07, "loss": 0.81834209, "num_input_tokens_seen": 253429685, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36132812, "step": 11747, "time_per_iteration": 2.4083681106567383 }, { "auxiliary_loss_clip": 0.01052747, "auxiliary_loss_mlp": 0.01035658, "balance_loss_clip": 1.0139147, "balance_loss_mlp": 1.01746702, "epoch": 0.7063279723433038, "flos": 23183759468160.0, "grad_norm": 1.6176620314447931, "language_loss": 0.65990281, "learning_rate": 8.383677493366031e-07, "loss": 0.68078679, "num_input_tokens_seen": 253448260, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.35351562, "step": 11748, "time_per_iteration": 2.362006902694702 }, { "auxiliary_loss_clip": 0.01053215, "auxiliary_loss_mlp": 0.01041991, "balance_loss_clip": 1.01761305, "balance_loss_mlp": 1.01581407, "epoch": 0.7063880955959717, "flos": 20187757900800.0, "grad_norm": 2.4043278808613553, "language_loss": 0.80669975, "learning_rate": 8.380507360077003e-07, "loss": 0.82765186, "num_input_tokens_seen": 253467725, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.375, "step": 11749, "time_per_iteration": 2.3967604637145996 }, { "auxiliary_loss_clip": 0.01008845, "auxiliary_loss_mlp": 0.01004941, "balance_loss_clip": 1.00204432, "balance_loss_mlp": 1.00194597, "epoch": 0.7064482188486397, "flos": 63665376491520.0, "grad_norm": 0.7886036334288581, "language_loss": 0.54043287, "learning_rate": 8.377337667400304e-07, "loss": 0.56057072, "num_input_tokens_seen": 253526940, "router_z_loss_clip": 0.02893066, "router_z_loss_mlp": 0.06884766, "step": 11750, "time_per_iteration": 2.965235948562622 }, { "auxiliary_loss_clip": 0.01052379, "auxiliary_loss_mlp": 0.01040272, "balance_loss_clip": 1.01588225, "balance_loss_mlp": 1.0163213, "epoch": 0.7065083421013076, "flos": 25190078325120.0, "grad_norm": 1.8539040465360817, "language_loss": 0.80089486, "learning_rate": 8.37416841545612e-07, "loss": 0.82182133, "num_input_tokens_seen": 253546160, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36132812, "step": 11751, "time_per_iteration": 2.423936367034912 }, { "auxiliary_loss_clip": 0.01049263, "auxiliary_loss_mlp": 0.01030663, "balance_loss_clip": 1.00951493, "balance_loss_mlp": 1.01501894, "epoch": 0.7065684653539757, "flos": 22892583795840.0, "grad_norm": 1.582121074810804, "language_loss": 0.69077885, "learning_rate": 8.370999604364634e-07, "loss": 0.71157813, "num_input_tokens_seen": 253565505, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34179688, "step": 11752, "time_per_iteration": 2.3553566932678223 }, { "auxiliary_loss_clip": 0.0105162, "auxiliary_loss_mlp": 0.01037391, "balance_loss_clip": 1.01565886, "balance_loss_mlp": 1.01676202, "epoch": 0.7066285886066436, "flos": 23549125512960.0, "grad_norm": 1.9697778450507746, "language_loss": 0.7821213, "learning_rate": 8.367831234246025e-07, "loss": 0.80301142, "num_input_tokens_seen": 253585125, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34765625, "step": 11753, "time_per_iteration": 2.392807722091675 }, { "auxiliary_loss_clip": 0.01050907, "auxiliary_loss_mlp": 0.01033276, "balance_loss_clip": 1.01110315, "balance_loss_mlp": 1.01634669, "epoch": 0.7066887118593116, "flos": 21068232278400.0, "grad_norm": 1.4706552864781466, "language_loss": 0.71584558, "learning_rate": 8.364663305220405e-07, "loss": 0.73668742, "num_input_tokens_seen": 253604815, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34570312, "step": 11754, "time_per_iteration": 2.3911681175231934 }, { "auxiliary_loss_clip": 0.01050634, "auxiliary_loss_mlp": 0.01037046, "balance_loss_clip": 1.01595783, "balance_loss_mlp": 1.01517665, "epoch": 0.7067488351119796, "flos": 21175311018240.0, "grad_norm": 2.098645692349254, "language_loss": 0.90119553, "learning_rate": 8.361495817407919e-07, "loss": 0.92207229, "num_input_tokens_seen": 253622855, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.35546875, "step": 11755, "time_per_iteration": 2.4026427268981934 }, { "auxiliary_loss_clip": 0.0105131, "auxiliary_loss_mlp": 0.01036638, "balance_loss_clip": 1.01457214, "balance_loss_mlp": 1.01614821, "epoch": 0.7068089583646475, "flos": 20448174798720.0, "grad_norm": 2.0813737317661802, "language_loss": 0.80679893, "learning_rate": 8.358328770928678e-07, "loss": 0.82767838, "num_input_tokens_seen": 253642760, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3515625, "step": 11756, "time_per_iteration": 2.360037088394165 }, { "auxiliary_loss_clip": 0.01008247, "auxiliary_loss_mlp": 0.01002655, "balance_loss_clip": 1.00030637, "balance_loss_mlp": 1.00123167, "epoch": 0.7068690816173155, "flos": 59106452734080.0, "grad_norm": 0.8259914294892376, "language_loss": 0.60522997, "learning_rate": 8.355162165902785e-07, "loss": 0.62533903, "num_input_tokens_seen": 253695685, "router_z_loss_clip": 0.0234375, "router_z_loss_mlp": 0.0703125, "step": 11757, "time_per_iteration": 2.7885687351226807 }, { "auxiliary_loss_clip": 0.01052011, "auxiliary_loss_mlp": 0.01039451, "balance_loss_clip": 1.01549029, "balance_loss_mlp": 1.01629376, "epoch": 0.7069292048699835, "flos": 16250672102400.0, "grad_norm": 1.976031762051657, "language_loss": 0.80938447, "learning_rate": 8.351996002450307e-07, "loss": 0.83029914, "num_input_tokens_seen": 253713305, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.35742188, "step": 11758, "time_per_iteration": 2.3350675106048584 }, { "auxiliary_loss_clip": 0.01050061, "auxiliary_loss_mlp": 0.0104011, "balance_loss_clip": 1.01618457, "balance_loss_mlp": 1.01496708, "epoch": 0.7069893281226515, "flos": 41171151841920.0, "grad_norm": 1.6859831124471654, "language_loss": 0.78883684, "learning_rate": 8.348830280691304e-07, "loss": 0.80973858, "num_input_tokens_seen": 253736100, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3515625, "step": 11759, "time_per_iteration": 2.5645201206207275 }, { "auxiliary_loss_clip": 0.01051618, "auxiliary_loss_mlp": 0.0103737, "balance_loss_clip": 1.01351607, "balance_loss_mlp": 1.01585054, "epoch": 0.7070494513753194, "flos": 24206121077760.0, "grad_norm": 1.8064623637159547, "language_loss": 0.6893096, "learning_rate": 8.34566500074583e-07, "loss": 0.71019948, "num_input_tokens_seen": 253757350, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35742188, "step": 11760, "time_per_iteration": 2.4318594932556152 }, { "auxiliary_loss_clip": 0.01053711, "auxiliary_loss_mlp": 0.01039158, "balance_loss_clip": 1.01637745, "balance_loss_mlp": 1.01692188, "epoch": 0.7071095746279874, "flos": 20184860257920.0, "grad_norm": 1.9839216648768392, "language_loss": 0.81314677, "learning_rate": 8.342500162733899e-07, "loss": 0.83407545, "num_input_tokens_seen": 253772855, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.3671875, "step": 11761, "time_per_iteration": 2.4425573348999023 }, { "auxiliary_loss_clip": 0.01052066, "auxiliary_loss_mlp": 0.0103538, "balance_loss_clip": 1.01321936, "balance_loss_mlp": 1.01666284, "epoch": 0.7071696978806553, "flos": 18182172182400.0, "grad_norm": 2.45721572435099, "language_loss": 0.76029181, "learning_rate": 8.33933576677553e-07, "loss": 0.7811662, "num_input_tokens_seen": 253790360, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.35351562, "step": 11762, "time_per_iteration": 3.6084506511688232 }, { "auxiliary_loss_clip": 0.01051105, "auxiliary_loss_mlp": 0.01038193, "balance_loss_clip": 1.01710486, "balance_loss_mlp": 1.0160023, "epoch": 0.7072298211333233, "flos": 24130638984960.0, "grad_norm": 1.746028787070688, "language_loss": 0.77796984, "learning_rate": 8.336171812990724e-07, "loss": 0.79886281, "num_input_tokens_seen": 253810585, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.3515625, "step": 11763, "time_per_iteration": 2.4882588386535645 }, { "auxiliary_loss_clip": 0.01051798, "auxiliary_loss_mlp": 0.01042247, "balance_loss_clip": 1.02015722, "balance_loss_mlp": 1.01663888, "epoch": 0.7072899443859912, "flos": 27197200143360.0, "grad_norm": 2.1905391442756117, "language_loss": 0.7966907, "learning_rate": 8.333008301499453e-07, "loss": 0.81763119, "num_input_tokens_seen": 253829080, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.3515625, "step": 11764, "time_per_iteration": 2.4255123138427734 }, { "auxiliary_loss_clip": 0.01053808, "auxiliary_loss_mlp": 0.01044859, "balance_loss_clip": 1.0202775, "balance_loss_mlp": 1.01691926, "epoch": 0.7073500676386593, "flos": 16434664300800.0, "grad_norm": 1.6063257086964478, "language_loss": 0.80631995, "learning_rate": 8.32984523242167e-07, "loss": 0.82730663, "num_input_tokens_seen": 253846780, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36914062, "step": 11765, "time_per_iteration": 2.3693227767944336 }, { "auxiliary_loss_clip": 0.01049861, "auxiliary_loss_mlp": 0.01036342, "balance_loss_clip": 1.01481247, "balance_loss_mlp": 1.01533604, "epoch": 0.7074101908913272, "flos": 27672472748160.0, "grad_norm": 1.7228984531456335, "language_loss": 0.69259846, "learning_rate": 8.326682605877324e-07, "loss": 0.71346045, "num_input_tokens_seen": 253867075, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34570312, "step": 11766, "time_per_iteration": 3.9094440937042236 }, { "auxiliary_loss_clip": 0.01053034, "auxiliary_loss_mlp": 0.01042003, "balance_loss_clip": 1.01714826, "balance_loss_mlp": 1.01634836, "epoch": 0.7074703141439952, "flos": 22236949774080.0, "grad_norm": 2.631140019346152, "language_loss": 0.65305328, "learning_rate": 8.323520421986352e-07, "loss": 0.67400372, "num_input_tokens_seen": 253885790, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.3671875, "step": 11767, "time_per_iteration": 3.8022382259368896 }, { "auxiliary_loss_clip": 0.01051145, "auxiliary_loss_mlp": 0.01034441, "balance_loss_clip": 1.01016986, "balance_loss_mlp": 1.01504266, "epoch": 0.7075304373966632, "flos": 29641923342720.0, "grad_norm": 1.5061615662929713, "language_loss": 0.53865635, "learning_rate": 8.320358680868646e-07, "loss": 0.55951232, "num_input_tokens_seen": 253907070, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36132812, "step": 11768, "time_per_iteration": 2.4309890270233154 }, { "auxiliary_loss_clip": 0.01049394, "auxiliary_loss_mlp": 0.01036006, "balance_loss_clip": 1.01510882, "balance_loss_mlp": 1.01523137, "epoch": 0.7075905606493311, "flos": 19754206237440.0, "grad_norm": 1.691726033111157, "language_loss": 0.76908928, "learning_rate": 8.317197382644119e-07, "loss": 0.78994322, "num_input_tokens_seen": 253927290, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34179688, "step": 11769, "time_per_iteration": 2.3710732460021973 }, { "auxiliary_loss_clip": 0.01008358, "auxiliary_loss_mlp": 0.01004843, "balance_loss_clip": 1.00236332, "balance_loss_mlp": 1.0011642, "epoch": 0.7076506839019991, "flos": 65713136999040.0, "grad_norm": 0.8690675832735557, "language_loss": 0.62068832, "learning_rate": 8.314036527432637e-07, "loss": 0.64082032, "num_input_tokens_seen": 253983440, "router_z_loss_clip": 0.02478027, "router_z_loss_mlp": 0.07226562, "step": 11770, "time_per_iteration": 2.970346212387085 }, { "auxiliary_loss_clip": 0.01053385, "auxiliary_loss_mlp": 0.01041015, "balance_loss_clip": 1.01730466, "balance_loss_mlp": 1.01536906, "epoch": 0.707710807154667, "flos": 23764260510720.0, "grad_norm": 1.8978259127695623, "language_loss": 0.76787513, "learning_rate": 8.310876115354055e-07, "loss": 0.78881913, "num_input_tokens_seen": 254003825, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.38085938, "step": 11771, "time_per_iteration": 2.394069194793701 }, { "auxiliary_loss_clip": 0.01050127, "auxiliary_loss_mlp": 0.01035003, "balance_loss_clip": 1.01482129, "balance_loss_mlp": 1.01568627, "epoch": 0.7077709304073351, "flos": 21250304352000.0, "grad_norm": 1.5085800385246073, "language_loss": 0.72749782, "learning_rate": 8.307716146528221e-07, "loss": 0.74834913, "num_input_tokens_seen": 254023345, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.34375, "step": 11772, "time_per_iteration": 2.3766093254089355 }, { "auxiliary_loss_clip": 0.01054562, "auxiliary_loss_mlp": 0.0103845, "balance_loss_clip": 1.01383352, "balance_loss_mlp": 1.01648521, "epoch": 0.707831053660003, "flos": 20739699584640.0, "grad_norm": 1.940522791989844, "language_loss": 0.71248955, "learning_rate": 8.30455662107496e-07, "loss": 0.73341966, "num_input_tokens_seen": 254041815, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.38085938, "step": 11773, "time_per_iteration": 2.3562734127044678 }, { "auxiliary_loss_clip": 0.01051896, "auxiliary_loss_mlp": 0.01041658, "balance_loss_clip": 1.0180192, "balance_loss_mlp": 1.01579118, "epoch": 0.707891176912671, "flos": 21979919278080.0, "grad_norm": 1.6051061858747486, "language_loss": 0.71711624, "learning_rate": 8.301397539114095e-07, "loss": 0.73805177, "num_input_tokens_seen": 254062065, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36132812, "step": 11774, "time_per_iteration": 2.3715498447418213 }, { "auxiliary_loss_clip": 0.01051162, "auxiliary_loss_mlp": 0.01037354, "balance_loss_clip": 1.01634955, "balance_loss_mlp": 1.01692224, "epoch": 0.7079513001653389, "flos": 21067918076160.0, "grad_norm": 1.5465960276307475, "language_loss": 0.75334442, "learning_rate": 8.298238900765407e-07, "loss": 0.77422953, "num_input_tokens_seen": 254080605, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34179688, "step": 11775, "time_per_iteration": 2.3623576164245605 }, { "auxiliary_loss_clip": 0.01053321, "auxiliary_loss_mlp": 0.01032304, "balance_loss_clip": 1.0109539, "balance_loss_mlp": 1.01782632, "epoch": 0.7080114234180069, "flos": 18039691457280.0, "grad_norm": 1.6996297622343441, "language_loss": 0.87977207, "learning_rate": 8.295080706148665e-07, "loss": 0.90062833, "num_input_tokens_seen": 254098710, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.35546875, "step": 11776, "time_per_iteration": 2.3654417991638184 }, { "auxiliary_loss_clip": 0.01051945, "auxiliary_loss_mlp": 0.01033947, "balance_loss_clip": 1.01176238, "balance_loss_mlp": 1.01681757, "epoch": 0.7080715466706748, "flos": 15121371398400.0, "grad_norm": 1.4799550903346335, "language_loss": 0.75699854, "learning_rate": 8.291922955383641e-07, "loss": 0.77785742, "num_input_tokens_seen": 254117200, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.3515625, "step": 11777, "time_per_iteration": 2.3275177478790283 }, { "auxiliary_loss_clip": 0.01056125, "auxiliary_loss_mlp": 0.0103743, "balance_loss_clip": 1.01169276, "balance_loss_mlp": 1.01724148, "epoch": 0.7081316699233429, "flos": 14422096310400.0, "grad_norm": 2.5104688766882073, "language_loss": 0.8321209, "learning_rate": 8.288765648590066e-07, "loss": 0.85305643, "num_input_tokens_seen": 254132115, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.390625, "step": 11778, "time_per_iteration": 3.7400238513946533 }, { "auxiliary_loss_clip": 0.01048316, "auxiliary_loss_mlp": 0.01035782, "balance_loss_clip": 1.01629162, "balance_loss_mlp": 1.01531208, "epoch": 0.7081917931760108, "flos": 23221256135040.0, "grad_norm": 1.494611289325078, "language_loss": 0.8579818, "learning_rate": 8.285608785887673e-07, "loss": 0.87882274, "num_input_tokens_seen": 254152285, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.33007812, "step": 11779, "time_per_iteration": 2.3920862674713135 }, { "auxiliary_loss_clip": 0.01053283, "auxiliary_loss_mlp": 0.01039789, "balance_loss_clip": 1.01661456, "balance_loss_mlp": 1.01723146, "epoch": 0.7082519164286788, "flos": 39306964596480.0, "grad_norm": 8.041743902085788, "language_loss": 0.72827667, "learning_rate": 8.28245236739618e-07, "loss": 0.74920738, "num_input_tokens_seen": 254172805, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.359375, "step": 11780, "time_per_iteration": 2.507024049758911 }, { "auxiliary_loss_clip": 0.01051099, "auxiliary_loss_mlp": 0.01034454, "balance_loss_clip": 1.01294827, "balance_loss_mlp": 1.01650262, "epoch": 0.7083120396813467, "flos": 21650129775360.0, "grad_norm": 1.38527338383707, "language_loss": 0.74354845, "learning_rate": 8.279296393235256e-07, "loss": 0.76440394, "num_input_tokens_seen": 254191890, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34570312, "step": 11781, "time_per_iteration": 2.395230293273926 }, { "auxiliary_loss_clip": 0.01050886, "auxiliary_loss_mlp": 0.01036954, "balance_loss_clip": 1.01509142, "balance_loss_mlp": 1.01641214, "epoch": 0.7083721629340147, "flos": 17566059686400.0, "grad_norm": 1.6623694958798765, "language_loss": 0.79056793, "learning_rate": 8.276140863524585e-07, "loss": 0.81144631, "num_input_tokens_seen": 254210150, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34375, "step": 11782, "time_per_iteration": 2.3439900875091553 }, { "auxiliary_loss_clip": 0.01051043, "auxiliary_loss_mlp": 0.01035088, "balance_loss_clip": 1.0135355, "balance_loss_mlp": 1.01593542, "epoch": 0.7084322861866827, "flos": 29349246481920.0, "grad_norm": 1.5111525782890733, "language_loss": 0.70654982, "learning_rate": 8.272985778383828e-07, "loss": 0.72741115, "num_input_tokens_seen": 254233015, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.3515625, "step": 11783, "time_per_iteration": 2.443906307220459 }, { "auxiliary_loss_clip": 0.01053323, "auxiliary_loss_mlp": 0.01037357, "balance_loss_clip": 1.01492167, "balance_loss_mlp": 1.01726246, "epoch": 0.7084924094393507, "flos": 20193238984320.0, "grad_norm": 1.7318878831963107, "language_loss": 0.79798138, "learning_rate": 8.269831137932632e-07, "loss": 0.81888819, "num_input_tokens_seen": 254251345, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.36132812, "step": 11784, "time_per_iteration": 2.365013360977173 }, { "auxiliary_loss_clip": 0.01052042, "auxiliary_loss_mlp": 0.01037563, "balance_loss_clip": 1.01538968, "balance_loss_mlp": 1.01665521, "epoch": 0.7085525326920187, "flos": 23476087215360.0, "grad_norm": 2.0061065629957584, "language_loss": 0.78901613, "learning_rate": 8.266676942290609e-07, "loss": 0.8099122, "num_input_tokens_seen": 254269905, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.35351562, "step": 11785, "time_per_iteration": 2.3966333866119385 }, { "auxiliary_loss_clip": 0.01052309, "auxiliary_loss_mlp": 0.01041368, "balance_loss_clip": 1.01732361, "balance_loss_mlp": 1.01656318, "epoch": 0.7086126559446866, "flos": 25957608854400.0, "grad_norm": 1.6205595262910608, "language_loss": 0.78471369, "learning_rate": 8.26352319157738e-07, "loss": 0.80565047, "num_input_tokens_seen": 254289990, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.35742188, "step": 11786, "time_per_iteration": 2.423788547515869 }, { "auxiliary_loss_clip": 0.01052325, "auxiliary_loss_mlp": 0.01036519, "balance_loss_clip": 1.01278472, "balance_loss_mlp": 1.01566899, "epoch": 0.7086727791973546, "flos": 26723568372480.0, "grad_norm": 3.1214970019678816, "language_loss": 0.80192709, "learning_rate": 8.260369885912526e-07, "loss": 0.82281554, "num_input_tokens_seen": 254309085, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.3671875, "step": 11787, "time_per_iteration": 2.3953213691711426 }, { "auxiliary_loss_clip": 0.01052912, "auxiliary_loss_mlp": 0.01037936, "balance_loss_clip": 1.01539314, "balance_loss_mlp": 1.01632452, "epoch": 0.7087329024500225, "flos": 21682459560960.0, "grad_norm": 1.786788798996011, "language_loss": 0.78178191, "learning_rate": 8.257217025415615e-07, "loss": 0.80269033, "num_input_tokens_seen": 254327045, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.36523438, "step": 11788, "time_per_iteration": 2.423516273498535 }, { "auxiliary_loss_clip": 0.01053832, "auxiliary_loss_mlp": 0.01047353, "balance_loss_clip": 1.01993477, "balance_loss_mlp": 1.01601648, "epoch": 0.7087930257026905, "flos": 17930099099520.0, "grad_norm": 1.82743873836133, "language_loss": 0.69151783, "learning_rate": 8.254064610206212e-07, "loss": 0.71252972, "num_input_tokens_seen": 254344585, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.37890625, "step": 11789, "time_per_iteration": 2.339289903640747 }, { "auxiliary_loss_clip": 0.01053532, "auxiliary_loss_mlp": 0.01036878, "balance_loss_clip": 1.01326275, "balance_loss_mlp": 1.01513386, "epoch": 0.7088531489553584, "flos": 18910669944960.0, "grad_norm": 1.7166653975179405, "language_loss": 0.78701413, "learning_rate": 8.250912640403858e-07, "loss": 0.80791819, "num_input_tokens_seen": 254362470, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.38476562, "step": 11790, "time_per_iteration": 2.394822597503662 }, { "auxiliary_loss_clip": 0.01054387, "auxiliary_loss_mlp": 0.01034717, "balance_loss_clip": 1.01023102, "balance_loss_mlp": 1.01584721, "epoch": 0.7089132722080265, "flos": 27379656241920.0, "grad_norm": 1.7299221792417945, "language_loss": 0.72542918, "learning_rate": 8.247761116128085e-07, "loss": 0.74632019, "num_input_tokens_seen": 254383190, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.38476562, "step": 11791, "time_per_iteration": 2.3981773853302 }, { "auxiliary_loss_clip": 0.01053717, "auxiliary_loss_mlp": 0.01040536, "balance_loss_clip": 1.01628923, "balance_loss_mlp": 1.01715708, "epoch": 0.7089733954606944, "flos": 22161851706240.0, "grad_norm": 1.8265431646639285, "language_loss": 0.82909667, "learning_rate": 8.244610037498376e-07, "loss": 0.85003918, "num_input_tokens_seen": 254403115, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36523438, "step": 11792, "time_per_iteration": 2.3898210525512695 }, { "auxiliary_loss_clip": 0.01053667, "auxiliary_loss_mlp": 0.01034939, "balance_loss_clip": 1.01040578, "balance_loss_mlp": 1.0156852, "epoch": 0.7090335187133624, "flos": 24424677388800.0, "grad_norm": 2.21715149643487, "language_loss": 0.6540556, "learning_rate": 8.241459404634232e-07, "loss": 0.67494166, "num_input_tokens_seen": 254421875, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.37890625, "step": 11793, "time_per_iteration": 2.3949155807495117 }, { "auxiliary_loss_clip": 0.01052179, "auxiliary_loss_mlp": 0.01037625, "balance_loss_clip": 1.01571417, "balance_loss_mlp": 1.01592767, "epoch": 0.7090936419660303, "flos": 21834156885120.0, "grad_norm": 2.054363048594997, "language_loss": 0.71501166, "learning_rate": 8.238309217655133e-07, "loss": 0.7359097, "num_input_tokens_seen": 254440765, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.36328125, "step": 11794, "time_per_iteration": 2.404747486114502 }, { "auxiliary_loss_clip": 0.01053113, "auxiliary_loss_mlp": 0.01041033, "balance_loss_clip": 1.01796556, "balance_loss_mlp": 1.01699734, "epoch": 0.7091537652186983, "flos": 20081377388160.0, "grad_norm": 1.8665998911082642, "language_loss": 0.77487469, "learning_rate": 8.23515947668052e-07, "loss": 0.79581618, "num_input_tokens_seen": 254459480, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36132812, "step": 11795, "time_per_iteration": 2.3456039428710938 }, { "auxiliary_loss_clip": 0.01053207, "auxiliary_loss_mlp": 0.01038934, "balance_loss_clip": 1.01522338, "balance_loss_mlp": 1.0167439, "epoch": 0.7092138884713663, "flos": 13150733817600.0, "grad_norm": 2.5206315900615097, "language_loss": 0.7558378, "learning_rate": 8.232010181829838e-07, "loss": 0.77675921, "num_input_tokens_seen": 254473985, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.36523438, "step": 11796, "time_per_iteration": 2.3349499702453613 }, { "auxiliary_loss_clip": 0.01057412, "auxiliary_loss_mlp": 0.01044344, "balance_loss_clip": 1.01686668, "balance_loss_mlp": 1.01859426, "epoch": 0.7092740117240343, "flos": 21645102539520.0, "grad_norm": 1.9897182748920894, "language_loss": 0.74625701, "learning_rate": 8.228861333222523e-07, "loss": 0.76727456, "num_input_tokens_seen": 254492135, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.38867188, "step": 11797, "time_per_iteration": 2.3709568977355957 }, { "auxiliary_loss_clip": 0.01053191, "auxiliary_loss_mlp": 0.01039499, "balance_loss_clip": 1.01591921, "balance_loss_mlp": 1.0169518, "epoch": 0.7093341349767023, "flos": 21031468750080.0, "grad_norm": 1.5117565906790191, "language_loss": 0.80353796, "learning_rate": 8.225712930977953e-07, "loss": 0.82446486, "num_input_tokens_seen": 254512865, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36328125, "step": 11798, "time_per_iteration": 2.409475564956665 }, { "auxiliary_loss_clip": 0.01052081, "auxiliary_loss_mlp": 0.0103764, "balance_loss_clip": 1.0130235, "balance_loss_mlp": 1.01579773, "epoch": 0.7093942582293702, "flos": 22016578072320.0, "grad_norm": 2.6272341974738906, "language_loss": 0.67372739, "learning_rate": 8.222564975215529e-07, "loss": 0.69462454, "num_input_tokens_seen": 254532605, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36328125, "step": 11799, "time_per_iteration": 2.3579423427581787 }, { "auxiliary_loss_clip": 0.01052532, "auxiliary_loss_mlp": 0.01037095, "balance_loss_clip": 1.0120374, "balance_loss_mlp": 1.01610088, "epoch": 0.7094543814820382, "flos": 27234347696640.0, "grad_norm": 1.6558766189975322, "language_loss": 0.82733184, "learning_rate": 8.219417466054622e-07, "loss": 0.84822816, "num_input_tokens_seen": 254553780, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36523438, "step": 11800, "time_per_iteration": 2.439335823059082 }, { "auxiliary_loss_clip": 0.0105094, "auxiliary_loss_mlp": 0.0103529, "balance_loss_clip": 1.01349831, "balance_loss_mlp": 1.01536012, "epoch": 0.7095145047347061, "flos": 12088466657280.0, "grad_norm": 1.7515215071970471, "language_loss": 0.87231195, "learning_rate": 8.21627040361459e-07, "loss": 0.89317429, "num_input_tokens_seen": 254567510, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.35546875, "step": 11801, "time_per_iteration": 2.3191843032836914 }, { "auxiliary_loss_clip": 0.01053753, "auxiliary_loss_mlp": 0.0104051, "balance_loss_clip": 1.01701379, "balance_loss_mlp": 1.0163312, "epoch": 0.7095746279873741, "flos": 19382975084160.0, "grad_norm": 1.6886098558063236, "language_loss": 0.77545333, "learning_rate": 8.213123788014758e-07, "loss": 0.7963959, "num_input_tokens_seen": 254585565, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.375, "step": 11802, "time_per_iteration": 3.6549456119537354 }, { "auxiliary_loss_clip": 0.01055262, "auxiliary_loss_mlp": 0.01044021, "balance_loss_clip": 1.0185101, "balance_loss_mlp": 1.01696086, "epoch": 0.709634751240042, "flos": 21359547596160.0, "grad_norm": 1.9402741640486607, "language_loss": 0.8291418, "learning_rate": 8.209977619374462e-07, "loss": 0.85013461, "num_input_tokens_seen": 254603465, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3828125, "step": 11803, "time_per_iteration": 2.362514019012451 }, { "auxiliary_loss_clip": 0.01054289, "auxiliary_loss_mlp": 0.01038677, "balance_loss_clip": 1.01227212, "balance_loss_mlp": 1.01658463, "epoch": 0.7096948744927101, "flos": 13916204576640.0, "grad_norm": 2.2450633639231086, "language_loss": 0.69171333, "learning_rate": 8.206831897812995e-07, "loss": 0.71264303, "num_input_tokens_seen": 254620500, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.37695312, "step": 11804, "time_per_iteration": 2.367338180541992 }, { "auxiliary_loss_clip": 0.01049132, "auxiliary_loss_mlp": 0.01035566, "balance_loss_clip": 1.01441813, "balance_loss_mlp": 1.01534033, "epoch": 0.709754997745378, "flos": 30297068605440.0, "grad_norm": 1.7216809123045562, "language_loss": 0.79318404, "learning_rate": 8.203686623449637e-07, "loss": 0.81403106, "num_input_tokens_seen": 254638565, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.33789062, "step": 11805, "time_per_iteration": 2.4112584590911865 }, { "auxiliary_loss_clip": 0.01053836, "auxiliary_loss_mlp": 0.01036455, "balance_loss_clip": 1.01264858, "balance_loss_mlp": 1.01699126, "epoch": 0.709815120998046, "flos": 18514161100800.0, "grad_norm": 2.6009074625166853, "language_loss": 0.80357552, "learning_rate": 8.200541796403667e-07, "loss": 0.82447845, "num_input_tokens_seen": 254657505, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36914062, "step": 11806, "time_per_iteration": 3.7489686012268066 }, { "auxiliary_loss_clip": 0.01053371, "auxiliary_loss_mlp": 0.01037373, "balance_loss_clip": 1.01306653, "balance_loss_mlp": 1.0171715, "epoch": 0.7098752442507139, "flos": 22271513886720.0, "grad_norm": 2.2622721222705207, "language_loss": 0.57757169, "learning_rate": 8.197397416794332e-07, "loss": 0.59847915, "num_input_tokens_seen": 254674730, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36328125, "step": 11807, "time_per_iteration": 3.7828528881073 }, { "auxiliary_loss_clip": 0.01055884, "auxiliary_loss_mlp": 0.01046189, "balance_loss_clip": 1.02021313, "balance_loss_mlp": 1.0167191, "epoch": 0.7099353675033819, "flos": 19274604624000.0, "grad_norm": 1.9683750271505465, "language_loss": 0.69746155, "learning_rate": 8.194253484740882e-07, "loss": 0.71848226, "num_input_tokens_seen": 254691665, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.390625, "step": 11808, "time_per_iteration": 2.3475189208984375 }, { "auxiliary_loss_clip": 0.01055168, "auxiliary_loss_mlp": 0.01040575, "balance_loss_clip": 1.01685286, "balance_loss_mlp": 1.01685953, "epoch": 0.70999549075605, "flos": 21907439562240.0, "grad_norm": 1.8606070957251337, "language_loss": 0.72059739, "learning_rate": 8.191110000362513e-07, "loss": 0.74155474, "num_input_tokens_seen": 254711610, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.3828125, "step": 11809, "time_per_iteration": 2.4748032093048096 }, { "auxiliary_loss_clip": 0.01009313, "auxiliary_loss_mlp": 0.01004393, "balance_loss_clip": 1.00190198, "balance_loss_mlp": 1.00222421, "epoch": 0.7100556140087179, "flos": 70453015666560.0, "grad_norm": 0.7529155436134074, "language_loss": 0.59499109, "learning_rate": 8.187966963778435e-07, "loss": 0.61512816, "num_input_tokens_seen": 254772615, "router_z_loss_clip": 0.02490234, "router_z_loss_mlp": 0.07080078, "step": 11810, "time_per_iteration": 3.116567850112915 }, { "auxiliary_loss_clip": 0.01054068, "auxiliary_loss_mlp": 0.01040299, "balance_loss_clip": 1.016958, "balance_loss_mlp": 1.01729238, "epoch": 0.7101157372613859, "flos": 23038450922880.0, "grad_norm": 1.95434771323913, "language_loss": 0.75013268, "learning_rate": 8.18482437510784e-07, "loss": 0.77107632, "num_input_tokens_seen": 254791375, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.3671875, "step": 11811, "time_per_iteration": 2.415149450302124 }, { "auxiliary_loss_clip": 0.01050933, "auxiliary_loss_mlp": 0.01035733, "balance_loss_clip": 1.01416802, "balance_loss_mlp": 1.01570809, "epoch": 0.7101758605140538, "flos": 23184213315840.0, "grad_norm": 2.137783006700291, "language_loss": 0.84548557, "learning_rate": 8.181682234469882e-07, "loss": 0.8663522, "num_input_tokens_seen": 254809300, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.3515625, "step": 11812, "time_per_iteration": 2.3637962341308594 }, { "auxiliary_loss_clip": 0.01054831, "auxiliary_loss_mlp": 0.01039176, "balance_loss_clip": 1.01495242, "balance_loss_mlp": 1.01720953, "epoch": 0.7102359837667218, "flos": 23694992640000.0, "grad_norm": 1.5145816481083905, "language_loss": 0.71389449, "learning_rate": 8.178540541983716e-07, "loss": 0.73483455, "num_input_tokens_seen": 254829325, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.375, "step": 11813, "time_per_iteration": 2.392571210861206 }, { "auxiliary_loss_clip": 0.01051996, "auxiliary_loss_mlp": 0.01033305, "balance_loss_clip": 1.01008272, "balance_loss_mlp": 1.01617861, "epoch": 0.7102961070193897, "flos": 19390097001600.0, "grad_norm": 1.9385817572530637, "language_loss": 0.82557893, "learning_rate": 8.175399297768495e-07, "loss": 0.84643191, "num_input_tokens_seen": 254847690, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.359375, "step": 11814, "time_per_iteration": 2.345689058303833 }, { "auxiliary_loss_clip": 0.01052996, "auxiliary_loss_mlp": 0.01039924, "balance_loss_clip": 1.01499736, "balance_loss_mlp": 1.01671255, "epoch": 0.7103562302720577, "flos": 21506427152640.0, "grad_norm": 1.6960117772966579, "language_loss": 0.77349728, "learning_rate": 8.172258501943301e-07, "loss": 0.79442644, "num_input_tokens_seen": 254865960, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.36328125, "step": 11815, "time_per_iteration": 2.360841989517212 }, { "auxiliary_loss_clip": 0.01051579, "auxiliary_loss_mlp": 0.01038094, "balance_loss_clip": 1.01419282, "balance_loss_mlp": 1.01562798, "epoch": 0.7104163535247257, "flos": 14534272108800.0, "grad_norm": 1.977096361405698, "language_loss": 0.79466844, "learning_rate": 8.16911815462725e-07, "loss": 0.81556523, "num_input_tokens_seen": 254882815, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.359375, "step": 11816, "time_per_iteration": 2.332798719406128 }, { "auxiliary_loss_clip": 0.01055517, "auxiliary_loss_mlp": 0.01038888, "balance_loss_clip": 1.01361585, "balance_loss_mlp": 1.01791859, "epoch": 0.7104764767773937, "flos": 11399525331840.0, "grad_norm": 1.7757270705156833, "language_loss": 0.87784827, "learning_rate": 8.165978255939426e-07, "loss": 0.89879227, "num_input_tokens_seen": 254898705, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.375, "step": 11817, "time_per_iteration": 3.758486270904541 }, { "auxiliary_loss_clip": 0.01050688, "auxiliary_loss_mlp": 0.01039904, "balance_loss_clip": 1.01619279, "balance_loss_mlp": 1.01511598, "epoch": 0.7105366000300616, "flos": 11689688574720.0, "grad_norm": 2.29149237709545, "language_loss": 0.86196685, "learning_rate": 8.162838805998897e-07, "loss": 0.88287276, "num_input_tokens_seen": 254913665, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.35546875, "step": 11818, "time_per_iteration": 2.3107738494873047 }, { "auxiliary_loss_clip": 0.01053735, "auxiliary_loss_mlp": 0.01039345, "balance_loss_clip": 1.01367879, "balance_loss_mlp": 1.01559103, "epoch": 0.7105967232827296, "flos": 19353019271040.0, "grad_norm": 3.291294109363843, "language_loss": 0.76012814, "learning_rate": 8.159699804924709e-07, "loss": 0.78105891, "num_input_tokens_seen": 254932140, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.3828125, "step": 11819, "time_per_iteration": 2.3576385974884033 }, { "auxiliary_loss_clip": 0.01053785, "auxiliary_loss_mlp": 0.01044223, "balance_loss_clip": 1.01635206, "balance_loss_mlp": 1.01534438, "epoch": 0.7106568465353975, "flos": 22929277501440.0, "grad_norm": 1.6598459892288746, "language_loss": 0.71730822, "learning_rate": 8.156561252835883e-07, "loss": 0.73828828, "num_input_tokens_seen": 254951580, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.38476562, "step": 11820, "time_per_iteration": 2.370387554168701 }, { "auxiliary_loss_clip": 0.01053516, "auxiliary_loss_mlp": 0.01042043, "balance_loss_clip": 1.0165205, "balance_loss_mlp": 1.01567912, "epoch": 0.7107169697880655, "flos": 19098642038400.0, "grad_norm": 1.8006683421361198, "language_loss": 0.77270901, "learning_rate": 8.153423149851449e-07, "loss": 0.79366463, "num_input_tokens_seen": 254969425, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37890625, "step": 11821, "time_per_iteration": 2.4090652465820312 }, { "auxiliary_loss_clip": 0.01008544, "auxiliary_loss_mlp": 0.01003215, "balance_loss_clip": 1.00090265, "balance_loss_mlp": 1.00133824, "epoch": 0.7107770930407336, "flos": 63635071564800.0, "grad_norm": 0.7815052358258732, "language_loss": 0.55160952, "learning_rate": 8.150285496090388e-07, "loss": 0.57172716, "num_input_tokens_seen": 255032680, "router_z_loss_clip": 0.02307129, "router_z_loss_mlp": 0.07226562, "step": 11822, "time_per_iteration": 3.0461642742156982 }, { "auxiliary_loss_clip": 0.01050005, "auxiliary_loss_mlp": 0.01036143, "balance_loss_clip": 1.01292074, "balance_loss_mlp": 1.01526189, "epoch": 0.7108372162934015, "flos": 22053376512000.0, "grad_norm": 2.257354919920837, "language_loss": 0.61468285, "learning_rate": 8.147148291671688e-07, "loss": 0.63554442, "num_input_tokens_seen": 255054400, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.34765625, "step": 11823, "time_per_iteration": 2.4178292751312256 }, { "auxiliary_loss_clip": 0.01052709, "auxiliary_loss_mlp": 0.01038223, "balance_loss_clip": 1.01444101, "balance_loss_mlp": 1.01586986, "epoch": 0.7108973395460695, "flos": 19134148757760.0, "grad_norm": 2.1816076088165626, "language_loss": 0.72369707, "learning_rate": 8.144011536714322e-07, "loss": 0.74460644, "num_input_tokens_seen": 255072785, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.36914062, "step": 11824, "time_per_iteration": 2.3468844890594482 }, { "auxiliary_loss_clip": 0.01050445, "auxiliary_loss_mlp": 0.01038106, "balance_loss_clip": 1.01550424, "balance_loss_mlp": 1.01565206, "epoch": 0.7109574627987374, "flos": 17893475216640.0, "grad_norm": 1.5961935145497086, "language_loss": 0.73953509, "learning_rate": 8.140875231337223e-07, "loss": 0.76042056, "num_input_tokens_seen": 255091820, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34765625, "step": 11825, "time_per_iteration": 2.3686375617980957 }, { "auxiliary_loss_clip": 0.01054911, "auxiliary_loss_mlp": 0.01042243, "balance_loss_clip": 1.01718497, "balance_loss_mlp": 1.01659989, "epoch": 0.7110175860514054, "flos": 28978538999040.0, "grad_norm": 4.235957620496351, "language_loss": 0.80123591, "learning_rate": 8.137739375659321e-07, "loss": 0.82220745, "num_input_tokens_seen": 255111720, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3828125, "step": 11826, "time_per_iteration": 2.427446126937866 }, { "auxiliary_loss_clip": 0.01050444, "auxiliary_loss_mlp": 0.01037511, "balance_loss_clip": 1.01446748, "balance_loss_mlp": 1.01499915, "epoch": 0.7110777093040733, "flos": 26172220181760.0, "grad_norm": 1.443999521377218, "language_loss": 0.83608449, "learning_rate": 8.134603969799527e-07, "loss": 0.85696405, "num_input_tokens_seen": 255133495, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35351562, "step": 11827, "time_per_iteration": 2.433148145675659 }, { "auxiliary_loss_clip": 0.01054491, "auxiliary_loss_mlp": 0.0103642, "balance_loss_clip": 1.01137424, "balance_loss_mlp": 1.01682425, "epoch": 0.7111378325567413, "flos": 26868737272320.0, "grad_norm": 1.322285469478923, "language_loss": 0.63244617, "learning_rate": 8.131469013876748e-07, "loss": 0.6533553, "num_input_tokens_seen": 255156880, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37695312, "step": 11828, "time_per_iteration": 2.4376096725463867 }, { "auxiliary_loss_clip": 0.01053182, "auxiliary_loss_mlp": 0.01038291, "balance_loss_clip": 1.01459253, "balance_loss_mlp": 1.01665401, "epoch": 0.7111979558094093, "flos": 27270587554560.0, "grad_norm": 1.5288110270526933, "language_loss": 0.72952187, "learning_rate": 8.128334508009846e-07, "loss": 0.75043654, "num_input_tokens_seen": 255178920, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36523438, "step": 11829, "time_per_iteration": 2.4639363288879395 }, { "auxiliary_loss_clip": 0.01053313, "auxiliary_loss_mlp": 0.01037985, "balance_loss_clip": 1.01562154, "balance_loss_mlp": 1.01687789, "epoch": 0.7112580790620773, "flos": 25045747297920.0, "grad_norm": 1.8648012586162592, "language_loss": 0.81462806, "learning_rate": 8.125200452317697e-07, "loss": 0.83554107, "num_input_tokens_seen": 255198095, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.36523438, "step": 11830, "time_per_iteration": 2.4137136936187744 }, { "auxiliary_loss_clip": 0.01053187, "auxiliary_loss_mlp": 0.0104021, "balance_loss_clip": 1.01528299, "balance_loss_mlp": 1.01671004, "epoch": 0.7113182023147452, "flos": 21645730944000.0, "grad_norm": 1.6939791598698022, "language_loss": 0.84663773, "learning_rate": 8.122066846919138e-07, "loss": 0.86757171, "num_input_tokens_seen": 255215860, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.36328125, "step": 11831, "time_per_iteration": 2.4023618698120117 }, { "auxiliary_loss_clip": 0.01054045, "auxiliary_loss_mlp": 0.01036944, "balance_loss_clip": 1.01324487, "balance_loss_mlp": 1.01682043, "epoch": 0.7113783255674132, "flos": 20995228892160.0, "grad_norm": 1.8245952064802147, "language_loss": 0.7858299, "learning_rate": 8.118933691932985e-07, "loss": 0.80673981, "num_input_tokens_seen": 255235425, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37109375, "step": 11832, "time_per_iteration": 2.3468964099884033 }, { "auxiliary_loss_clip": 0.01008344, "auxiliary_loss_mlp": 0.01010358, "balance_loss_clip": 1.00799811, "balance_loss_mlp": 1.00119781, "epoch": 0.7114384488200811, "flos": 66768142596480.0, "grad_norm": 0.7533420308465638, "language_loss": 0.56598341, "learning_rate": 8.115800987478059e-07, "loss": 0.58617043, "num_input_tokens_seen": 255291680, "router_z_loss_clip": 0.02355957, "router_z_loss_mlp": 0.07128906, "step": 11833, "time_per_iteration": 2.929084062576294 }, { "auxiliary_loss_clip": 0.01052332, "auxiliary_loss_mlp": 0.01038079, "balance_loss_clip": 1.0159533, "balance_loss_mlp": 1.01622248, "epoch": 0.7114985720727491, "flos": 25008879035520.0, "grad_norm": 1.7082744670809078, "language_loss": 0.72317809, "learning_rate": 8.11266873367315e-07, "loss": 0.74408221, "num_input_tokens_seen": 255313880, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.359375, "step": 11834, "time_per_iteration": 2.393937826156616 }, { "auxiliary_loss_clip": 0.01055083, "auxiliary_loss_mlp": 0.01042294, "balance_loss_clip": 1.01638961, "balance_loss_mlp": 1.01713729, "epoch": 0.7115586953254172, "flos": 21469070131200.0, "grad_norm": 1.8943181923480448, "language_loss": 0.80764985, "learning_rate": 8.10953693063704e-07, "loss": 0.82862353, "num_input_tokens_seen": 255332390, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37890625, "step": 11835, "time_per_iteration": 2.370798110961914 }, { "auxiliary_loss_clip": 0.01052218, "auxiliary_loss_mlp": 0.01036151, "balance_loss_clip": 1.01238084, "balance_loss_mlp": 1.01577806, "epoch": 0.7116188185780851, "flos": 28621307301120.0, "grad_norm": 1.8168836050959192, "language_loss": 0.76630044, "learning_rate": 8.10640557848848e-07, "loss": 0.78718412, "num_input_tokens_seen": 255354025, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.36523438, "step": 11836, "time_per_iteration": 2.4083449840545654 }, { "auxiliary_loss_clip": 0.01052992, "auxiliary_loss_mlp": 0.0103695, "balance_loss_clip": 1.01319122, "balance_loss_mlp": 1.01620841, "epoch": 0.7116789418307531, "flos": 25292653499520.0, "grad_norm": 1.6659384490398124, "language_loss": 0.70413941, "learning_rate": 8.103274677346208e-07, "loss": 0.72503883, "num_input_tokens_seen": 255371400, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.3671875, "step": 11837, "time_per_iteration": 2.4145584106445312 }, { "auxiliary_loss_clip": 0.01056852, "auxiliary_loss_mlp": 0.01046284, "balance_loss_clip": 1.01564777, "balance_loss_mlp": 1.01788867, "epoch": 0.711739065083421, "flos": 25556107685760.0, "grad_norm": 1.8966925709202866, "language_loss": 0.62819695, "learning_rate": 8.100144227328958e-07, "loss": 0.64922833, "num_input_tokens_seen": 255390710, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 0.390625, "step": 11838, "time_per_iteration": 2.3927271366119385 }, { "auxiliary_loss_clip": 0.01053791, "auxiliary_loss_mlp": 0.01036907, "balance_loss_clip": 1.01419806, "balance_loss_mlp": 1.01751518, "epoch": 0.711799188336089, "flos": 26139785662080.0, "grad_norm": 2.564655143347502, "language_loss": 0.68120611, "learning_rate": 8.097014228555426e-07, "loss": 0.70211309, "num_input_tokens_seen": 255408790, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.36328125, "step": 11839, "time_per_iteration": 2.4034202098846436 }, { "auxiliary_loss_clip": 0.01054622, "auxiliary_loss_mlp": 0.01048527, "balance_loss_clip": 1.02271783, "balance_loss_mlp": 1.01737976, "epoch": 0.7118593115887569, "flos": 21139629742080.0, "grad_norm": 1.9855986135745387, "language_loss": 0.85100806, "learning_rate": 8.093884681144305e-07, "loss": 0.87203956, "num_input_tokens_seen": 255426280, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37109375, "step": 11840, "time_per_iteration": 2.346921920776367 }, { "auxiliary_loss_clip": 0.01054952, "auxiliary_loss_mlp": 0.0103975, "balance_loss_clip": 1.01530027, "balance_loss_mlp": 1.01698589, "epoch": 0.711919434841425, "flos": 14974806044160.0, "grad_norm": 2.6567593877409386, "language_loss": 0.78235215, "learning_rate": 8.090755585214277e-07, "loss": 0.80329913, "num_input_tokens_seen": 255442935, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37890625, "step": 11841, "time_per_iteration": 2.3458197116851807 }, { "auxiliary_loss_clip": 0.01054365, "auxiliary_loss_mlp": 0.01049259, "balance_loss_clip": 1.0229497, "balance_loss_mlp": 1.01627362, "epoch": 0.7119795580940929, "flos": 16508051712000.0, "grad_norm": 1.8669589235139754, "language_loss": 0.76292896, "learning_rate": 8.087626940883994e-07, "loss": 0.78396523, "num_input_tokens_seen": 255460925, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38085938, "step": 11842, "time_per_iteration": 3.5786001682281494 }, { "auxiliary_loss_clip": 0.01008148, "auxiliary_loss_mlp": 0.0100263, "balance_loss_clip": 0.99986464, "balance_loss_mlp": 1.00091767, "epoch": 0.7120396813467609, "flos": 66567286344960.0, "grad_norm": 0.8035538260306488, "language_loss": 0.6179502, "learning_rate": 8.084498748272082e-07, "loss": 0.63805795, "num_input_tokens_seen": 255521360, "router_z_loss_clip": 0.02770996, "router_z_loss_mlp": 0.07226562, "step": 11843, "time_per_iteration": 3.000540256500244 }, { "auxiliary_loss_clip": 0.01052503, "auxiliary_loss_mlp": 0.01033735, "balance_loss_clip": 1.0104655, "balance_loss_mlp": 1.016011, "epoch": 0.7120998045994288, "flos": 26431519916160.0, "grad_norm": 1.6271905717956923, "language_loss": 0.81356245, "learning_rate": 8.081371007497171e-07, "loss": 0.83442479, "num_input_tokens_seen": 255541435, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36328125, "step": 11844, "time_per_iteration": 2.4270031452178955 }, { "auxiliary_loss_clip": 0.01052381, "auxiliary_loss_mlp": 0.01039593, "balance_loss_clip": 1.01498866, "balance_loss_mlp": 1.01593709, "epoch": 0.7121599278520968, "flos": 16427263092480.0, "grad_norm": 2.0881061576832587, "language_loss": 0.80066031, "learning_rate": 8.078243718677873e-07, "loss": 0.82158005, "num_input_tokens_seen": 255558505, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36328125, "step": 11845, "time_per_iteration": 3.6863064765930176 }, { "auxiliary_loss_clip": 0.01053117, "auxiliary_loss_mlp": 0.01041387, "balance_loss_clip": 1.01523256, "balance_loss_mlp": 1.01765549, "epoch": 0.7122200511047647, "flos": 28948618097280.0, "grad_norm": 3.567338195116469, "language_loss": 0.78347707, "learning_rate": 8.075116881932762e-07, "loss": 0.80442214, "num_input_tokens_seen": 255577815, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.35351562, "step": 11846, "time_per_iteration": 2.481381893157959 }, { "auxiliary_loss_clip": 0.01053778, "auxiliary_loss_mlp": 0.01041668, "balance_loss_clip": 1.01761174, "balance_loss_mlp": 1.01634169, "epoch": 0.7122801743574327, "flos": 16470939070080.0, "grad_norm": 2.1749225225629645, "language_loss": 0.60300916, "learning_rate": 8.071990497380421e-07, "loss": 0.62396359, "num_input_tokens_seen": 255595885, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.375, "step": 11847, "time_per_iteration": 3.7581920623779297 }, { "auxiliary_loss_clip": 0.01050593, "auxiliary_loss_mlp": 0.01041558, "balance_loss_clip": 1.01696563, "balance_loss_mlp": 1.01574326, "epoch": 0.7123402976101008, "flos": 20630002492800.0, "grad_norm": 1.3812932199983736, "language_loss": 0.72235864, "learning_rate": 8.068864565139395e-07, "loss": 0.74328017, "num_input_tokens_seen": 255616750, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.34765625, "step": 11848, "time_per_iteration": 2.3804235458374023 }, { "auxiliary_loss_clip": 0.01008184, "auxiliary_loss_mlp": 0.01002534, "balance_loss_clip": 1.00013804, "balance_loss_mlp": 1.00100279, "epoch": 0.7124004208627687, "flos": 62322756180480.0, "grad_norm": 0.85055997830156, "language_loss": 0.63179433, "learning_rate": 8.065739085328211e-07, "loss": 0.65190148, "num_input_tokens_seen": 255677900, "router_z_loss_clip": 0.02392578, "router_z_loss_mlp": 0.07177734, "step": 11849, "time_per_iteration": 2.9836130142211914 }, { "auxiliary_loss_clip": 0.01052939, "auxiliary_loss_mlp": 0.01044188, "balance_loss_clip": 1.02019119, "balance_loss_mlp": 1.01571965, "epoch": 0.7124605441154367, "flos": 39674425322880.0, "grad_norm": 1.4432348969734283, "language_loss": 0.6483168, "learning_rate": 8.0626140580654e-07, "loss": 0.66928804, "num_input_tokens_seen": 255699140, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37109375, "step": 11850, "time_per_iteration": 2.5119211673736572 }, { "auxiliary_loss_clip": 0.01053655, "auxiliary_loss_mlp": 0.01037485, "balance_loss_clip": 1.01409602, "balance_loss_mlp": 1.01687169, "epoch": 0.7125206673681046, "flos": 28180668631680.0, "grad_norm": 1.5439779473449253, "language_loss": 0.70404005, "learning_rate": 8.05948948346946e-07, "loss": 0.72495151, "num_input_tokens_seen": 255719640, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3671875, "step": 11851, "time_per_iteration": 2.4293599128723145 }, { "auxiliary_loss_clip": 0.01053889, "auxiliary_loss_mlp": 0.0104095, "balance_loss_clip": 1.01779985, "balance_loss_mlp": 1.01735699, "epoch": 0.7125807906207726, "flos": 26175746229120.0, "grad_norm": 1.6463536105689336, "language_loss": 0.84043032, "learning_rate": 8.056365361658882e-07, "loss": 0.86137873, "num_input_tokens_seen": 255740450, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.36523438, "step": 11852, "time_per_iteration": 2.4106132984161377 }, { "auxiliary_loss_clip": 0.01055127, "auxiliary_loss_mlp": 0.01041665, "balance_loss_clip": 1.0154624, "balance_loss_mlp": 1.01741195, "epoch": 0.7126409138734405, "flos": 17156598727680.0, "grad_norm": 2.3572061130468103, "language_loss": 0.73679173, "learning_rate": 8.053241692752126e-07, "loss": 0.75775969, "num_input_tokens_seen": 255758070, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37695312, "step": 11853, "time_per_iteration": 2.3418514728546143 }, { "auxiliary_loss_clip": 0.01050994, "auxiliary_loss_mlp": 0.01037019, "balance_loss_clip": 1.01497746, "balance_loss_mlp": 1.01573384, "epoch": 0.7127010371261085, "flos": 18768957269760.0, "grad_norm": 1.9306988248057908, "language_loss": 0.94091797, "learning_rate": 8.050118476867635e-07, "loss": 0.96179813, "num_input_tokens_seen": 255775685, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3515625, "step": 11854, "time_per_iteration": 2.3264312744140625 }, { "auxiliary_loss_clip": 0.01052008, "auxiliary_loss_mlp": 0.01039363, "balance_loss_clip": 1.01651049, "balance_loss_mlp": 1.01595116, "epoch": 0.7127611603787765, "flos": 20375380880640.0, "grad_norm": 1.7407743762622219, "language_loss": 0.8052572, "learning_rate": 8.046995714123856e-07, "loss": 0.82617092, "num_input_tokens_seen": 255794750, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.359375, "step": 11855, "time_per_iteration": 2.3533198833465576 }, { "auxiliary_loss_clip": 0.01053834, "auxiliary_loss_mlp": 0.01040523, "balance_loss_clip": 1.01485682, "balance_loss_mlp": 1.01653075, "epoch": 0.7128212836314445, "flos": 20447965330560.0, "grad_norm": 1.6212280999031499, "language_loss": 0.7453531, "learning_rate": 8.043873404639192e-07, "loss": 0.76629674, "num_input_tokens_seen": 255813325, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.375, "step": 11856, "time_per_iteration": 2.3821818828582764 }, { "auxiliary_loss_clip": 0.01055799, "auxiliary_loss_mlp": 0.01038812, "balance_loss_clip": 1.01377821, "balance_loss_mlp": 1.01766098, "epoch": 0.7128814068841124, "flos": 23439707712000.0, "grad_norm": 1.5928830073211622, "language_loss": 0.70896649, "learning_rate": 8.040751548532046e-07, "loss": 0.72991264, "num_input_tokens_seen": 255832470, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3828125, "step": 11857, "time_per_iteration": 3.850458860397339 }, { "auxiliary_loss_clip": 0.01051436, "auxiliary_loss_mlp": 0.01036222, "balance_loss_clip": 1.01167703, "balance_loss_mlp": 1.01575899, "epoch": 0.7129415301367804, "flos": 18221972999040.0, "grad_norm": 2.004843749115044, "language_loss": 0.86186087, "learning_rate": 8.03763014592081e-07, "loss": 0.8827374, "num_input_tokens_seen": 255849740, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.35742188, "step": 11858, "time_per_iteration": 2.3199496269226074 }, { "auxiliary_loss_clip": 0.01057214, "auxiliary_loss_mlp": 0.01044428, "balance_loss_clip": 1.0173552, "balance_loss_mlp": 1.01818168, "epoch": 0.7130016533894483, "flos": 15522977301120.0, "grad_norm": 1.6195539983187577, "language_loss": 0.81727087, "learning_rate": 8.034509196923829e-07, "loss": 0.83828723, "num_input_tokens_seen": 255866975, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.390625, "step": 11859, "time_per_iteration": 2.359506845474243 }, { "auxiliary_loss_clip": 0.01051925, "auxiliary_loss_mlp": 0.01034781, "balance_loss_clip": 1.01182175, "balance_loss_mlp": 1.01651144, "epoch": 0.7130617766421163, "flos": 57113646439680.0, "grad_norm": 1.2341453534003826, "language_loss": 0.69521618, "learning_rate": 8.031388701659456e-07, "loss": 0.71608323, "num_input_tokens_seen": 255892915, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35546875, "step": 11860, "time_per_iteration": 2.6778433322906494 }, { "auxiliary_loss_clip": 0.01054958, "auxiliary_loss_mlp": 0.0104122, "balance_loss_clip": 1.01481557, "balance_loss_mlp": 1.01746118, "epoch": 0.7131218998947844, "flos": 19787338984320.0, "grad_norm": 2.220392626822036, "language_loss": 0.65475869, "learning_rate": 8.028268660246023e-07, "loss": 0.67572051, "num_input_tokens_seen": 255911480, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.375, "step": 11861, "time_per_iteration": 2.4142262935638428 }, { "auxiliary_loss_clip": 0.01057121, "auxiliary_loss_mlp": 0.01042313, "balance_loss_clip": 1.01596808, "balance_loss_mlp": 1.01792288, "epoch": 0.7131820231474523, "flos": 26650669720320.0, "grad_norm": 2.8425097367141667, "language_loss": 0.68557107, "learning_rate": 8.025149072801849e-07, "loss": 0.7065655, "num_input_tokens_seen": 255931140, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.39257812, "step": 11862, "time_per_iteration": 2.4201173782348633 }, { "auxiliary_loss_clip": 0.01053455, "auxiliary_loss_mlp": 0.01044601, "balance_loss_clip": 1.02181983, "balance_loss_mlp": 1.01761949, "epoch": 0.7132421464001203, "flos": 29204321961600.0, "grad_norm": 2.535254190885049, "language_loss": 0.68714559, "learning_rate": 8.022029939445214e-07, "loss": 0.70812619, "num_input_tokens_seen": 255951665, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.359375, "step": 11863, "time_per_iteration": 2.4689650535583496 }, { "auxiliary_loss_clip": 0.01057686, "auxiliary_loss_mlp": 0.01044222, "balance_loss_clip": 1.0182941, "balance_loss_mlp": 1.01851773, "epoch": 0.7133022696527882, "flos": 23072561187840.0, "grad_norm": 3.3224899482546686, "language_loss": 0.66867781, "learning_rate": 8.018911260294414e-07, "loss": 0.68969685, "num_input_tokens_seen": 255970055, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.390625, "step": 11864, "time_per_iteration": 2.373098850250244 }, { "auxiliary_loss_clip": 0.01055095, "auxiliary_loss_mlp": 0.01042003, "balance_loss_clip": 1.01553869, "balance_loss_mlp": 1.01721251, "epoch": 0.7133623929054562, "flos": 17456153126400.0, "grad_norm": 2.7339158769046707, "language_loss": 0.87589896, "learning_rate": 8.015793035467697e-07, "loss": 0.8968699, "num_input_tokens_seen": 255987720, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.37890625, "step": 11865, "time_per_iteration": 2.3892478942871094 }, { "auxiliary_loss_clip": 0.01053232, "auxiliary_loss_mlp": 0.01040578, "balance_loss_clip": 1.01442361, "balance_loss_mlp": 1.01695395, "epoch": 0.7134225161581241, "flos": 19535545192320.0, "grad_norm": 2.0149381006534854, "language_loss": 0.76357067, "learning_rate": 8.012675265083304e-07, "loss": 0.78450882, "num_input_tokens_seen": 256005490, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.36328125, "step": 11866, "time_per_iteration": 2.3340768814086914 }, { "auxiliary_loss_clip": 0.01055922, "auxiliary_loss_mlp": 0.01044282, "balance_loss_clip": 1.01802063, "balance_loss_mlp": 1.0186547, "epoch": 0.7134826394107922, "flos": 26248889260800.0, "grad_norm": 2.0135751985455643, "language_loss": 0.7205888, "learning_rate": 8.009557949259464e-07, "loss": 0.74159086, "num_input_tokens_seen": 256026030, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37304688, "step": 11867, "time_per_iteration": 2.426377296447754 }, { "auxiliary_loss_clip": 0.01051156, "auxiliary_loss_mlp": 0.0103577, "balance_loss_clip": 1.01279867, "balance_loss_mlp": 1.01564527, "epoch": 0.7135427626634601, "flos": 15814397352960.0, "grad_norm": 1.89859548975805, "language_loss": 0.7274636, "learning_rate": 8.006441088114397e-07, "loss": 0.74833286, "num_input_tokens_seen": 256043680, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35546875, "step": 11868, "time_per_iteration": 2.350062608718872 }, { "auxiliary_loss_clip": 0.0105688, "auxiliary_loss_mlp": 0.01037577, "balance_loss_clip": 1.01037383, "balance_loss_mlp": 1.01773524, "epoch": 0.7136028859161281, "flos": 18222426846720.0, "grad_norm": 1.9996559523091737, "language_loss": 0.67905056, "learning_rate": 8.003324681766286e-07, "loss": 0.69999516, "num_input_tokens_seen": 256059705, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.390625, "step": 11869, "time_per_iteration": 2.35369873046875 }, { "auxiliary_loss_clip": 0.0105252, "auxiliary_loss_mlp": 0.01035873, "balance_loss_clip": 1.01204324, "balance_loss_mlp": 1.01548159, "epoch": 0.713663009168796, "flos": 24313723488000.0, "grad_norm": 1.7560283737634654, "language_loss": 0.78858411, "learning_rate": 8.000208730333298e-07, "loss": 0.80946803, "num_input_tokens_seen": 256079785, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 11870, "time_per_iteration": 2.4271585941314697 }, { "auxiliary_loss_clip": 0.01054042, "auxiliary_loss_mlp": 0.01040934, "balance_loss_clip": 1.01510179, "balance_loss_mlp": 1.0171591, "epoch": 0.713723132421464, "flos": 26537376758400.0, "grad_norm": 2.3680105818073844, "language_loss": 0.82067823, "learning_rate": 7.997093233933597e-07, "loss": 0.84162796, "num_input_tokens_seen": 256099000, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.36914062, "step": 11871, "time_per_iteration": 2.3964641094207764 }, { "auxiliary_loss_clip": 0.01054788, "auxiliary_loss_mlp": 0.0104114, "balance_loss_clip": 1.01599908, "balance_loss_mlp": 1.01713395, "epoch": 0.7137832556741319, "flos": 19864636467840.0, "grad_norm": 1.6274968594032253, "language_loss": 0.79802841, "learning_rate": 7.993978192685331e-07, "loss": 0.81898773, "num_input_tokens_seen": 256117985, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37695312, "step": 11872, "time_per_iteration": 2.4025540351867676 }, { "auxiliary_loss_clip": 0.01054303, "auxiliary_loss_mlp": 0.01040257, "balance_loss_clip": 1.01492476, "balance_loss_mlp": 1.01622987, "epoch": 0.7138433789267999, "flos": 21687870821760.0, "grad_norm": 2.397034497709944, "language_loss": 0.85303712, "learning_rate": 7.990863606706606e-07, "loss": 0.87398273, "num_input_tokens_seen": 256134350, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38085938, "step": 11873, "time_per_iteration": 2.420586585998535 }, { "auxiliary_loss_clip": 0.01050872, "auxiliary_loss_mlp": 0.01037112, "balance_loss_clip": 1.0137943, "balance_loss_mlp": 1.01516581, "epoch": 0.713903502179468, "flos": 17601775873920.0, "grad_norm": 1.9009093574987883, "language_loss": 0.87453163, "learning_rate": 7.987749476115539e-07, "loss": 0.89541149, "num_input_tokens_seen": 256150610, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.35742188, "step": 11874, "time_per_iteration": 2.3548779487609863 }, { "auxiliary_loss_clip": 0.01054062, "auxiliary_loss_mlp": 0.0103288, "balance_loss_clip": 1.00806046, "balance_loss_mlp": 1.01657057, "epoch": 0.7139636254321359, "flos": 18039377255040.0, "grad_norm": 1.8422142836138475, "language_loss": 0.84181136, "learning_rate": 7.984635801030228e-07, "loss": 0.86268079, "num_input_tokens_seen": 256168620, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.375, "step": 11875, "time_per_iteration": 2.3589565753936768 }, { "auxiliary_loss_clip": 0.01057862, "auxiliary_loss_mlp": 0.01043337, "balance_loss_clip": 1.01533461, "balance_loss_mlp": 1.01742339, "epoch": 0.7140237486848039, "flos": 23330010620160.0, "grad_norm": 1.749646364249995, "language_loss": 0.71005362, "learning_rate": 7.981522581568721e-07, "loss": 0.73106569, "num_input_tokens_seen": 256186700, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.40429688, "step": 11876, "time_per_iteration": 2.3878326416015625 }, { "auxiliary_loss_clip": 0.01055176, "auxiliary_loss_mlp": 0.01039271, "balance_loss_clip": 1.0147258, "balance_loss_mlp": 1.01733553, "epoch": 0.7140838719374718, "flos": 16836130558080.0, "grad_norm": 2.0812151517657425, "language_loss": 0.79385018, "learning_rate": 7.978409817849079e-07, "loss": 0.81479466, "num_input_tokens_seen": 256205390, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.37890625, "step": 11877, "time_per_iteration": 2.331735610961914 }, { "auxiliary_loss_clip": 0.01054196, "auxiliary_loss_mlp": 0.01037573, "balance_loss_clip": 1.01473236, "balance_loss_mlp": 1.01743329, "epoch": 0.7141439951901398, "flos": 21140956373760.0, "grad_norm": 1.9055146279761588, "language_loss": 0.70231712, "learning_rate": 7.97529750998934e-07, "loss": 0.72323483, "num_input_tokens_seen": 256224575, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3671875, "step": 11878, "time_per_iteration": 2.382715940475464 }, { "auxiliary_loss_clip": 0.01050689, "auxiliary_loss_mlp": 0.0103937, "balance_loss_clip": 1.01636291, "balance_loss_mlp": 1.01555109, "epoch": 0.7142041184428077, "flos": 24716551288320.0, "grad_norm": 1.991704391828737, "language_loss": 0.69166028, "learning_rate": 7.972185658107535e-07, "loss": 0.71256089, "num_input_tokens_seen": 256242130, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.3515625, "step": 11879, "time_per_iteration": 2.376295804977417 }, { "auxiliary_loss_clip": 0.01053071, "auxiliary_loss_mlp": 0.01038744, "balance_loss_clip": 1.01432991, "balance_loss_mlp": 1.01701498, "epoch": 0.7142642416954758, "flos": 21907125360000.0, "grad_norm": 1.580571788914813, "language_loss": 0.70452547, "learning_rate": 7.969074262321646e-07, "loss": 0.7254436, "num_input_tokens_seen": 256261920, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.359375, "step": 11880, "time_per_iteration": 2.39935040473938 }, { "auxiliary_loss_clip": 0.01054987, "auxiliary_loss_mlp": 0.01037615, "balance_loss_clip": 1.01330853, "balance_loss_mlp": 1.01684308, "epoch": 0.7143243649481437, "flos": 20804813003520.0, "grad_norm": 2.2635017809985123, "language_loss": 0.82569766, "learning_rate": 7.965963322749674e-07, "loss": 0.84662372, "num_input_tokens_seen": 256277970, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.3828125, "step": 11881, "time_per_iteration": 3.631772518157959 }, { "auxiliary_loss_clip": 0.01052701, "auxiliary_loss_mlp": 0.01035211, "balance_loss_clip": 1.01235855, "balance_loss_mlp": 1.01578641, "epoch": 0.7143844882008117, "flos": 27233789114880.0, "grad_norm": 1.4770511163556561, "language_loss": 0.64622909, "learning_rate": 7.962852839509579e-07, "loss": 0.66710818, "num_input_tokens_seen": 256298205, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36914062, "step": 11882, "time_per_iteration": 2.449892282485962 }, { "auxiliary_loss_clip": 0.01054231, "auxiliary_loss_mlp": 0.01039067, "balance_loss_clip": 1.0145818, "balance_loss_mlp": 1.01635957, "epoch": 0.7144446114534796, "flos": 17928702645120.0, "grad_norm": 2.405004496271898, "language_loss": 0.70740336, "learning_rate": 7.959742812719304e-07, "loss": 0.72833633, "num_input_tokens_seen": 256316685, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.37890625, "step": 11883, "time_per_iteration": 2.3299291133880615 }, { "auxiliary_loss_clip": 0.01051752, "auxiliary_loss_mlp": 0.01042639, "balance_loss_clip": 1.01735449, "balance_loss_mlp": 1.01566041, "epoch": 0.7145047347061476, "flos": 20739909052800.0, "grad_norm": 1.90368465480192, "language_loss": 0.79038656, "learning_rate": 7.956633242496788e-07, "loss": 0.81133044, "num_input_tokens_seen": 256334205, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36132812, "step": 11884, "time_per_iteration": 2.3630733489990234 }, { "auxiliary_loss_clip": 0.0105651, "auxiliary_loss_mlp": 0.01040507, "balance_loss_clip": 1.01294541, "balance_loss_mlp": 1.01642895, "epoch": 0.7145648579588155, "flos": 21177545345280.0, "grad_norm": 1.8955374970064391, "language_loss": 0.75451511, "learning_rate": 7.953524128959954e-07, "loss": 0.77548528, "num_input_tokens_seen": 256353340, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.40039062, "step": 11885, "time_per_iteration": 3.8235628604888916 }, { "auxiliary_loss_clip": 0.01008648, "auxiliary_loss_mlp": 0.01007268, "balance_loss_clip": 1.0047766, "balance_loss_mlp": 1.00135326, "epoch": 0.7146249812114835, "flos": 64781094810240.0, "grad_norm": 0.9208247648721084, "language_loss": 0.6637603, "learning_rate": 7.95041547222669e-07, "loss": 0.68391949, "num_input_tokens_seen": 256411550, "router_z_loss_clip": 0.02490234, "router_z_loss_mlp": 0.07324219, "step": 11886, "time_per_iteration": 2.976095199584961 }, { "auxiliary_loss_clip": 0.01052977, "auxiliary_loss_mlp": 0.01039111, "balance_loss_clip": 1.01423192, "balance_loss_mlp": 1.01679134, "epoch": 0.7146851044641516, "flos": 18112904311680.0, "grad_norm": 2.205232448517343, "language_loss": 0.76203895, "learning_rate": 7.947307272414874e-07, "loss": 0.78295982, "num_input_tokens_seen": 256430360, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.36132812, "step": 11887, "time_per_iteration": 3.762922525405884 }, { "auxiliary_loss_clip": 0.01052624, "auxiliary_loss_mlp": 0.01034952, "balance_loss_clip": 1.01180196, "balance_loss_mlp": 1.01639402, "epoch": 0.7147452277168195, "flos": 19242868331520.0, "grad_norm": 1.486527951297893, "language_loss": 0.72448158, "learning_rate": 7.944199529642372e-07, "loss": 0.74535728, "num_input_tokens_seen": 256449750, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.36132812, "step": 11888, "time_per_iteration": 2.3544697761535645 }, { "auxiliary_loss_clip": 0.01055366, "auxiliary_loss_mlp": 0.01042752, "balance_loss_clip": 1.01836157, "balance_loss_mlp": 1.01727557, "epoch": 0.7148053509694875, "flos": 23763701928960.0, "grad_norm": 1.971117309098969, "language_loss": 0.84921646, "learning_rate": 7.941092244027041e-07, "loss": 0.87019765, "num_input_tokens_seen": 256467330, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.38085938, "step": 11889, "time_per_iteration": 2.4031383991241455 }, { "auxiliary_loss_clip": 0.01054035, "auxiliary_loss_mlp": 0.01036318, "balance_loss_clip": 1.01233268, "balance_loss_mlp": 1.01707983, "epoch": 0.7148654742221554, "flos": 22484414557440.0, "grad_norm": 1.7508442990826478, "language_loss": 0.7658236, "learning_rate": 7.937985415686695e-07, "loss": 0.78672707, "num_input_tokens_seen": 256485705, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36914062, "step": 11890, "time_per_iteration": 2.3596489429473877 }, { "auxiliary_loss_clip": 0.01052305, "auxiliary_loss_mlp": 0.01039036, "balance_loss_clip": 1.01658869, "balance_loss_mlp": 1.0163393, "epoch": 0.7149255974748234, "flos": 24678112014720.0, "grad_norm": 1.554043841399718, "language_loss": 0.74796075, "learning_rate": 7.934879044739147e-07, "loss": 0.76887405, "num_input_tokens_seen": 256504755, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.359375, "step": 11891, "time_per_iteration": 2.410285472869873 }, { "auxiliary_loss_clip": 0.0105547, "auxiliary_loss_mlp": 0.01040436, "balance_loss_clip": 1.01651049, "balance_loss_mlp": 1.01740289, "epoch": 0.7149857207274913, "flos": 18404603654400.0, "grad_norm": 1.8249831424908263, "language_loss": 0.69784009, "learning_rate": 7.931773131302211e-07, "loss": 0.71879923, "num_input_tokens_seen": 256523670, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.38085938, "step": 11892, "time_per_iteration": 2.3318088054656982 }, { "auxiliary_loss_clip": 0.01054749, "auxiliary_loss_mlp": 0.01040221, "balance_loss_clip": 1.01286292, "balance_loss_mlp": 1.01584935, "epoch": 0.7150458439801594, "flos": 24968449814400.0, "grad_norm": 1.8279378043048726, "language_loss": 0.74157131, "learning_rate": 7.928667675493632e-07, "loss": 0.76252103, "num_input_tokens_seen": 256542225, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.390625, "step": 11893, "time_per_iteration": 2.3881959915161133 }, { "auxiliary_loss_clip": 0.01055453, "auxiliary_loss_mlp": 0.01036703, "balance_loss_clip": 1.00933266, "balance_loss_mlp": 1.01708114, "epoch": 0.7151059672328273, "flos": 16689844494720.0, "grad_norm": 2.157579857911161, "language_loss": 0.67599124, "learning_rate": 7.925562677431185e-07, "loss": 0.69691277, "num_input_tokens_seen": 256560730, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.38476562, "step": 11894, "time_per_iteration": 2.3656461238861084 }, { "auxiliary_loss_clip": 0.01056196, "auxiliary_loss_mlp": 0.01035605, "balance_loss_clip": 1.01094055, "balance_loss_mlp": 1.01767898, "epoch": 0.7151660904854953, "flos": 27270587554560.0, "grad_norm": 1.6384705456494812, "language_loss": 0.78773969, "learning_rate": 7.922458137232613e-07, "loss": 0.80865765, "num_input_tokens_seen": 256580505, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.38476562, "step": 11895, "time_per_iteration": 2.4224467277526855 }, { "auxiliary_loss_clip": 0.01053809, "auxiliary_loss_mlp": 0.01039735, "balance_loss_clip": 1.01353264, "balance_loss_mlp": 1.01630926, "epoch": 0.7152262137381632, "flos": 18331286065920.0, "grad_norm": 2.881337486785914, "language_loss": 0.71118844, "learning_rate": 7.919354055015643e-07, "loss": 0.73212385, "num_input_tokens_seen": 256597330, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.375, "step": 11896, "time_per_iteration": 2.319547653198242 }, { "auxiliary_loss_clip": 0.01054722, "auxiliary_loss_mlp": 0.01045085, "balance_loss_clip": 1.01819134, "balance_loss_mlp": 1.01625323, "epoch": 0.7152863369908312, "flos": 21798196318080.0, "grad_norm": 1.753127327340692, "language_loss": 0.8713094, "learning_rate": 7.91625043089798e-07, "loss": 0.89230746, "num_input_tokens_seen": 256616030, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.38476562, "step": 11897, "time_per_iteration": 3.81473970413208 }, { "auxiliary_loss_clip": 0.01052064, "auxiliary_loss_mlp": 0.01035537, "balance_loss_clip": 1.01125455, "balance_loss_mlp": 1.01584387, "epoch": 0.7153464602434991, "flos": 22157487786240.0, "grad_norm": 1.8635272476912215, "language_loss": 0.79111207, "learning_rate": 7.913147264997304e-07, "loss": 0.811988, "num_input_tokens_seen": 256635570, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36328125, "step": 11898, "time_per_iteration": 2.455075979232788 }, { "auxiliary_loss_clip": 0.0105586, "auxiliary_loss_mlp": 0.01042595, "balance_loss_clip": 1.01472378, "balance_loss_mlp": 1.01676691, "epoch": 0.7154065834961671, "flos": 24714945365760.0, "grad_norm": 1.8021853187560672, "language_loss": 0.74120688, "learning_rate": 7.910044557431302e-07, "loss": 0.76219141, "num_input_tokens_seen": 256655290, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.390625, "step": 11899, "time_per_iteration": 2.4003450870513916 }, { "auxiliary_loss_clip": 0.01052575, "auxiliary_loss_mlp": 0.01037031, "balance_loss_clip": 1.01254511, "balance_loss_mlp": 1.0158813, "epoch": 0.7154667067488351, "flos": 22600395694080.0, "grad_norm": 1.9244268577211279, "language_loss": 0.77209234, "learning_rate": 7.906942308317614e-07, "loss": 0.79298842, "num_input_tokens_seen": 256671605, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.3671875, "step": 11900, "time_per_iteration": 2.35492205619812 }, { "auxiliary_loss_clip": 0.01054219, "auxiliary_loss_mlp": 0.0103866, "balance_loss_clip": 1.01261258, "balance_loss_mlp": 1.01667213, "epoch": 0.7155268300015031, "flos": 18770144256000.0, "grad_norm": 2.260139258466625, "language_loss": 0.82078892, "learning_rate": 7.903840517773886e-07, "loss": 0.84171766, "num_input_tokens_seen": 256689680, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.375, "step": 11901, "time_per_iteration": 2.3544371128082275 }, { "auxiliary_loss_clip": 0.01056849, "auxiliary_loss_mlp": 0.01042106, "balance_loss_clip": 1.01621389, "balance_loss_mlp": 1.0173732, "epoch": 0.7155869532541711, "flos": 18295360410240.0, "grad_norm": 1.9134314305306346, "language_loss": 0.82263792, "learning_rate": 7.900739185917744e-07, "loss": 0.84362751, "num_input_tokens_seen": 256707760, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.39453125, "step": 11902, "time_per_iteration": 2.3189008235931396 }, { "auxiliary_loss_clip": 0.01053962, "auxiliary_loss_mlp": 0.01039158, "balance_loss_clip": 1.01427925, "balance_loss_mlp": 1.01647186, "epoch": 0.715647076506839, "flos": 11980096197120.0, "grad_norm": 1.7928297718535435, "language_loss": 0.68597579, "learning_rate": 7.897638312866785e-07, "loss": 0.70690691, "num_input_tokens_seen": 256724150, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.375, "step": 11903, "time_per_iteration": 2.3301796913146973 }, { "auxiliary_loss_clip": 0.01050917, "auxiliary_loss_mlp": 0.01039332, "balance_loss_clip": 1.01515603, "balance_loss_mlp": 1.01526523, "epoch": 0.715707199759507, "flos": 18950680229760.0, "grad_norm": 1.611236137726872, "language_loss": 0.76968664, "learning_rate": 7.894537898738589e-07, "loss": 0.79058909, "num_input_tokens_seen": 256742780, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.35546875, "step": 11904, "time_per_iteration": 2.3475146293640137 }, { "auxiliary_loss_clip": 0.01053757, "auxiliary_loss_mlp": 0.01042216, "balance_loss_clip": 1.0167048, "balance_loss_mlp": 1.01623142, "epoch": 0.7157673230121749, "flos": 15303513294720.0, "grad_norm": 1.877231757526791, "language_loss": 0.73510969, "learning_rate": 7.891437943650727e-07, "loss": 0.75606936, "num_input_tokens_seen": 256761355, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.375, "step": 11905, "time_per_iteration": 2.354663848876953 }, { "auxiliary_loss_clip": 0.01052344, "auxiliary_loss_mlp": 0.01038432, "balance_loss_clip": 1.01424408, "balance_loss_mlp": 1.0157274, "epoch": 0.715827446264843, "flos": 23220732464640.0, "grad_norm": 1.475831858447686, "language_loss": 0.78819311, "learning_rate": 7.88833844772076e-07, "loss": 0.80910087, "num_input_tokens_seen": 256781335, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3671875, "step": 11906, "time_per_iteration": 2.373241901397705 }, { "auxiliary_loss_clip": 0.01008611, "auxiliary_loss_mlp": 0.0100873, "balance_loss_clip": 1.00635755, "balance_loss_mlp": 1.0014751, "epoch": 0.7158875695175109, "flos": 60972490281600.0, "grad_norm": 0.7367735577011942, "language_loss": 0.55333072, "learning_rate": 7.885239411066205e-07, "loss": 0.57350415, "num_input_tokens_seen": 256838890, "router_z_loss_clip": 0.02368164, "router_z_loss_mlp": 0.07128906, "step": 11907, "time_per_iteration": 2.931745767593384 }, { "auxiliary_loss_clip": 0.0105468, "auxiliary_loss_mlp": 0.01039662, "balance_loss_clip": 1.01410413, "balance_loss_mlp": 1.01667857, "epoch": 0.7159476927701789, "flos": 17127829900800.0, "grad_norm": 1.7958833879673786, "language_loss": 0.7044282, "learning_rate": 7.882140833804593e-07, "loss": 0.7253716, "num_input_tokens_seen": 256858145, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38085938, "step": 11908, "time_per_iteration": 2.3553526401519775 }, { "auxiliary_loss_clip": 0.0105469, "auxiliary_loss_mlp": 0.01040662, "balance_loss_clip": 1.01465034, "balance_loss_mlp": 1.0172075, "epoch": 0.7160078160228468, "flos": 22489546527360.0, "grad_norm": 1.630228873617626, "language_loss": 0.72466838, "learning_rate": 7.879042716053415e-07, "loss": 0.74562192, "num_input_tokens_seen": 256878545, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.375, "step": 11909, "time_per_iteration": 2.396656036376953 }, { "auxiliary_loss_clip": 0.01054176, "auxiliary_loss_mlp": 0.0103624, "balance_loss_clip": 1.01077688, "balance_loss_mlp": 1.01685905, "epoch": 0.7160679392755148, "flos": 30589640732160.0, "grad_norm": 1.6979647105857998, "language_loss": 0.75840217, "learning_rate": 7.875945057930144e-07, "loss": 0.77930629, "num_input_tokens_seen": 256899920, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37304688, "step": 11910, "time_per_iteration": 2.4298155307769775 }, { "auxiliary_loss_clip": 0.01053231, "auxiliary_loss_mlp": 0.01039379, "balance_loss_clip": 1.0170157, "balance_loss_mlp": 1.01644349, "epoch": 0.7161280625281827, "flos": 21322609511040.0, "grad_norm": 1.4809652261212423, "language_loss": 0.76911259, "learning_rate": 7.872847859552251e-07, "loss": 0.7900387, "num_input_tokens_seen": 256918460, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.36914062, "step": 11911, "time_per_iteration": 2.402038335800171 }, { "auxiliary_loss_clip": 0.01054518, "auxiliary_loss_mlp": 0.01040844, "balance_loss_clip": 1.01595354, "balance_loss_mlp": 1.01679683, "epoch": 0.7161881857808508, "flos": 61857889027200.0, "grad_norm": 1.6877721068727722, "language_loss": 0.59714204, "learning_rate": 7.869751121037192e-07, "loss": 0.61809576, "num_input_tokens_seen": 256942015, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.37695312, "step": 11912, "time_per_iteration": 2.740537643432617 }, { "auxiliary_loss_clip": 0.01053135, "auxiliary_loss_mlp": 0.01038756, "balance_loss_clip": 1.01270914, "balance_loss_mlp": 1.01583362, "epoch": 0.7162483090335187, "flos": 20811097048320.0, "grad_norm": 1.8251757135616946, "language_loss": 0.79312789, "learning_rate": 7.866654842502376e-07, "loss": 0.81404674, "num_input_tokens_seen": 256961065, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37304688, "step": 11913, "time_per_iteration": 2.3910491466522217 }, { "auxiliary_loss_clip": 0.01050904, "auxiliary_loss_mlp": 0.01034194, "balance_loss_clip": 1.0122354, "balance_loss_mlp": 1.01564074, "epoch": 0.7163084322861867, "flos": 24096389074560.0, "grad_norm": 1.6175526203266204, "language_loss": 0.75415677, "learning_rate": 7.863559024065234e-07, "loss": 0.77500778, "num_input_tokens_seen": 256982165, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35351562, "step": 11914, "time_per_iteration": 2.4009103775024414 }, { "auxiliary_loss_clip": 0.01051631, "auxiliary_loss_mlp": 0.01038539, "balance_loss_clip": 1.0154953, "balance_loss_mlp": 1.01587009, "epoch": 0.7163685555388547, "flos": 20079946022400.0, "grad_norm": 1.667927293032064, "language_loss": 0.74499023, "learning_rate": 7.860463665843143e-07, "loss": 0.76589191, "num_input_tokens_seen": 256999825, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35742188, "step": 11915, "time_per_iteration": 2.35589599609375 }, { "auxiliary_loss_clip": 0.01055517, "auxiliary_loss_mlp": 0.01040448, "balance_loss_clip": 1.01628423, "balance_loss_mlp": 1.01706171, "epoch": 0.7164286787915226, "flos": 17456013480960.0, "grad_norm": 1.8038938455563338, "language_loss": 0.81393492, "learning_rate": 7.85736876795349e-07, "loss": 0.83489454, "num_input_tokens_seen": 257017450, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3828125, "step": 11916, "time_per_iteration": 2.3357057571411133 }, { "auxiliary_loss_clip": 0.01053833, "auxiliary_loss_mlp": 0.01038651, "balance_loss_clip": 1.01577437, "balance_loss_mlp": 1.01678741, "epoch": 0.7164888020441906, "flos": 19717896556800.0, "grad_norm": 2.7767659762949024, "language_loss": 0.69258165, "learning_rate": 7.854274330513626e-07, "loss": 0.71350646, "num_input_tokens_seen": 257035465, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.37109375, "step": 11917, "time_per_iteration": 2.420375347137451 }, { "auxiliary_loss_clip": 0.01053695, "auxiliary_loss_mlp": 0.0103724, "balance_loss_clip": 1.01158595, "balance_loss_mlp": 1.01726949, "epoch": 0.7165489252968585, "flos": 21469454156160.0, "grad_norm": 1.6268505297474343, "language_loss": 0.7693702, "learning_rate": 7.851180353640896e-07, "loss": 0.79027957, "num_input_tokens_seen": 257053750, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.36328125, "step": 11918, "time_per_iteration": 2.354374408721924 }, { "auxiliary_loss_clip": 0.01008884, "auxiliary_loss_mlp": 0.01003052, "balance_loss_clip": 1.00081134, "balance_loss_mlp": 1.00178492, "epoch": 0.7166090485495266, "flos": 69924499505280.0, "grad_norm": 0.6356037168803202, "language_loss": 0.53992254, "learning_rate": 7.848086837452639e-07, "loss": 0.5600419, "num_input_tokens_seen": 257121215, "router_z_loss_clip": 0.02246094, "router_z_loss_mlp": 0.07128906, "step": 11919, "time_per_iteration": 3.053708553314209 }, { "auxiliary_loss_clip": 0.01054008, "auxiliary_loss_mlp": 0.01034485, "balance_loss_clip": 1.00974894, "balance_loss_mlp": 1.01619995, "epoch": 0.7166691718021945, "flos": 27342683245440.0, "grad_norm": 1.892105717126141, "language_loss": 0.69411701, "learning_rate": 7.844993782066132e-07, "loss": 0.71500194, "num_input_tokens_seen": 257143370, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.37890625, "step": 11920, "time_per_iteration": 3.6935031414031982 }, { "auxiliary_loss_clip": 0.01053514, "auxiliary_loss_mlp": 0.01041424, "balance_loss_clip": 1.0171169, "balance_loss_mlp": 1.01617503, "epoch": 0.7167292950548625, "flos": 30407568658560.0, "grad_norm": 2.06067406660684, "language_loss": 0.75605363, "learning_rate": 7.841901187598678e-07, "loss": 0.77700299, "num_input_tokens_seen": 257162160, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.37304688, "step": 11921, "time_per_iteration": 2.4161055088043213 }, { "auxiliary_loss_clip": 0.0105709, "auxiliary_loss_mlp": 0.01045217, "balance_loss_clip": 1.01543891, "balance_loss_mlp": 1.01681757, "epoch": 0.7167894183075304, "flos": 14570477055360.0, "grad_norm": 3.738916611887295, "language_loss": 0.77171904, "learning_rate": 7.83880905416755e-07, "loss": 0.79274213, "num_input_tokens_seen": 257179300, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.40234375, "step": 11922, "time_per_iteration": 2.346613883972168 }, { "auxiliary_loss_clip": 0.01008172, "auxiliary_loss_mlp": 0.01002411, "balance_loss_clip": 0.99997926, "balance_loss_mlp": 1.00103688, "epoch": 0.7168495415601984, "flos": 64107725817600.0, "grad_norm": 0.7549146162207981, "language_loss": 0.55155635, "learning_rate": 7.83571738189001e-07, "loss": 0.57166219, "num_input_tokens_seen": 257235470, "router_z_loss_clip": 0.02429199, "router_z_loss_mlp": 0.07128906, "step": 11923, "time_per_iteration": 2.8438587188720703 }, { "auxiliary_loss_clip": 0.01054039, "auxiliary_loss_mlp": 0.01039246, "balance_loss_clip": 1.01415288, "balance_loss_mlp": 1.01625443, "epoch": 0.7169096648128663, "flos": 24680276519040.0, "grad_norm": 1.493813367560006, "language_loss": 0.77968585, "learning_rate": 7.832626170883279e-07, "loss": 0.80061865, "num_input_tokens_seen": 257255850, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37890625, "step": 11924, "time_per_iteration": 3.7888686656951904 }, { "auxiliary_loss_clip": 0.010532, "auxiliary_loss_mlp": 0.01036252, "balance_loss_clip": 1.01193285, "balance_loss_mlp": 1.01594067, "epoch": 0.7169697880655344, "flos": 20666486730240.0, "grad_norm": 1.5962721560709763, "language_loss": 0.69385773, "learning_rate": 7.829535421264588e-07, "loss": 0.71475226, "num_input_tokens_seen": 257275425, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.37304688, "step": 11925, "time_per_iteration": 2.354365825653076 }, { "auxiliary_loss_clip": 0.01050718, "auxiliary_loss_mlp": 0.01035136, "balance_loss_clip": 1.01261759, "balance_loss_mlp": 1.01578498, "epoch": 0.7170299113182023, "flos": 21031643306880.0, "grad_norm": 1.6429501986462847, "language_loss": 0.77969003, "learning_rate": 7.826445133151133e-07, "loss": 0.80054867, "num_input_tokens_seen": 257295740, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34960938, "step": 11926, "time_per_iteration": 3.855084180831909 }, { "auxiliary_loss_clip": 0.01056242, "auxiliary_loss_mlp": 0.01040525, "balance_loss_clip": 1.01347613, "balance_loss_mlp": 1.01652765, "epoch": 0.7170900345708703, "flos": 22892199770880.0, "grad_norm": 2.3581731993644097, "language_loss": 0.78086019, "learning_rate": 7.823355306660093e-07, "loss": 0.80182791, "num_input_tokens_seen": 257315970, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.39648438, "step": 11927, "time_per_iteration": 2.399858236312866 }, { "auxiliary_loss_clip": 0.01053903, "auxiliary_loss_mlp": 0.01039781, "balance_loss_clip": 1.01627326, "balance_loss_mlp": 1.01735067, "epoch": 0.7171501578235383, "flos": 15517915153920.0, "grad_norm": 2.5595873646871707, "language_loss": 0.70616472, "learning_rate": 7.820265941908642e-07, "loss": 0.7271015, "num_input_tokens_seen": 257334230, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36523438, "step": 11928, "time_per_iteration": 2.3427693843841553 }, { "auxiliary_loss_clip": 0.01051313, "auxiliary_loss_mlp": 0.01032532, "balance_loss_clip": 1.01090729, "balance_loss_mlp": 1.01662171, "epoch": 0.7172102810762062, "flos": 26103091956480.0, "grad_norm": 1.930114897444092, "language_loss": 0.65919942, "learning_rate": 7.817177039013931e-07, "loss": 0.68003786, "num_input_tokens_seen": 257352145, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34765625, "step": 11929, "time_per_iteration": 2.4334864616394043 }, { "auxiliary_loss_clip": 0.01053627, "auxiliary_loss_mlp": 0.01039366, "balance_loss_clip": 1.01397407, "balance_loss_mlp": 1.01594615, "epoch": 0.7172704043288742, "flos": 21505589280000.0, "grad_norm": 2.094070257135292, "language_loss": 0.72018504, "learning_rate": 7.81408859809308e-07, "loss": 0.74111497, "num_input_tokens_seen": 257371460, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37695312, "step": 11930, "time_per_iteration": 2.395768165588379 }, { "auxiliary_loss_clip": 0.01054263, "auxiliary_loss_mlp": 0.01040738, "balance_loss_clip": 1.01509631, "balance_loss_mlp": 1.01642513, "epoch": 0.7173305275815421, "flos": 18769934787840.0, "grad_norm": 2.2075402918474754, "language_loss": 0.8300283, "learning_rate": 7.811000619263219e-07, "loss": 0.85097826, "num_input_tokens_seen": 257390800, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37890625, "step": 11931, "time_per_iteration": 2.3533082008361816 }, { "auxiliary_loss_clip": 0.01052964, "auxiliary_loss_mlp": 0.01037583, "balance_loss_clip": 1.01424193, "balance_loss_mlp": 1.01698101, "epoch": 0.7173906508342102, "flos": 16178960436480.0, "grad_norm": 2.0338899567733093, "language_loss": 0.7989018, "learning_rate": 7.80791310264143e-07, "loss": 0.81980729, "num_input_tokens_seen": 257407495, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 11932, "time_per_iteration": 2.3625948429107666 }, { "auxiliary_loss_clip": 0.0105231, "auxiliary_loss_mlp": 0.0104298, "balance_loss_clip": 1.01866126, "balance_loss_mlp": 1.01545274, "epoch": 0.7174507740868781, "flos": 26612684294400.0, "grad_norm": 1.4527995951453703, "language_loss": 0.75857818, "learning_rate": 7.804826048344803e-07, "loss": 0.77953112, "num_input_tokens_seen": 257429675, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36914062, "step": 11933, "time_per_iteration": 2.4043662548065186 }, { "auxiliary_loss_clip": 0.01059722, "auxiliary_loss_mlp": 0.01051767, "balance_loss_clip": 1.02231097, "balance_loss_mlp": 1.01928353, "epoch": 0.7175108973395461, "flos": 18432185495040.0, "grad_norm": 3.1991848194251133, "language_loss": 0.71437359, "learning_rate": 7.801739456490388e-07, "loss": 0.73548847, "num_input_tokens_seen": 257442765, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.40429688, "step": 11934, "time_per_iteration": 2.3425910472869873 }, { "auxiliary_loss_clip": 0.01053109, "auxiliary_loss_mlp": 0.01038343, "balance_loss_clip": 1.0130105, "balance_loss_mlp": 1.01659107, "epoch": 0.717571020592214, "flos": 23913828241920.0, "grad_norm": 1.8933338131269595, "language_loss": 0.86776423, "learning_rate": 7.798653327195237e-07, "loss": 0.88867867, "num_input_tokens_seen": 257459310, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36523438, "step": 11935, "time_per_iteration": 2.3742575645446777 }, { "auxiliary_loss_clip": 0.0105368, "auxiliary_loss_mlp": 0.01038904, "balance_loss_clip": 1.01468027, "balance_loss_mlp": 1.01570523, "epoch": 0.717631143844882, "flos": 38255310489600.0, "grad_norm": 1.522305562615613, "language_loss": 0.74724817, "learning_rate": 7.795567660576388e-07, "loss": 0.76817399, "num_input_tokens_seen": 257484750, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37890625, "step": 11936, "time_per_iteration": 3.9423036575317383 }, { "auxiliary_loss_clip": 0.01008651, "auxiliary_loss_mlp": 0.01005169, "balance_loss_clip": 1.00286806, "balance_loss_mlp": 1.00181448, "epoch": 0.7176912670975499, "flos": 65512385481600.0, "grad_norm": 0.7631400997413118, "language_loss": 0.55964816, "learning_rate": 7.79248245675082e-07, "loss": 0.5797863, "num_input_tokens_seen": 257543110, "router_z_loss_clip": 0.02294922, "router_z_loss_mlp": 0.06835938, "step": 11937, "time_per_iteration": 3.0053932666778564 }, { "auxiliary_loss_clip": 0.01055644, "auxiliary_loss_mlp": 0.0104114, "balance_loss_clip": 1.01528335, "balance_loss_mlp": 1.01726091, "epoch": 0.717751390350218, "flos": 31279838866560.0, "grad_norm": 1.9372424019651522, "language_loss": 0.56104833, "learning_rate": 7.789397715835542e-07, "loss": 0.58201617, "num_input_tokens_seen": 257567410, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3828125, "step": 11938, "time_per_iteration": 2.430328607559204 }, { "auxiliary_loss_clip": 0.01050119, "auxiliary_loss_mlp": 0.01035489, "balance_loss_clip": 1.01125336, "balance_loss_mlp": 1.01508152, "epoch": 0.7178115136028859, "flos": 19858177866240.0, "grad_norm": 1.7071248202052391, "language_loss": 0.77493095, "learning_rate": 7.786313437947527e-07, "loss": 0.79578704, "num_input_tokens_seen": 257586270, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3515625, "step": 11939, "time_per_iteration": 2.4006354808807373 }, { "auxiliary_loss_clip": 0.01008042, "auxiliary_loss_mlp": 0.01002698, "balance_loss_clip": 0.99995577, "balance_loss_mlp": 1.00101674, "epoch": 0.7178716368555539, "flos": 64345169174400.0, "grad_norm": 0.7609847851267416, "language_loss": 0.61511916, "learning_rate": 7.783229623203738e-07, "loss": 0.63522655, "num_input_tokens_seen": 257647415, "router_z_loss_clip": 0.02746582, "router_z_loss_mlp": 0.0703125, "step": 11940, "time_per_iteration": 2.9766199588775635 }, { "auxiliary_loss_clip": 0.01052848, "auxiliary_loss_mlp": 0.01034542, "balance_loss_clip": 1.01154697, "balance_loss_mlp": 1.01676607, "epoch": 0.7179317601082219, "flos": 26761344330240.0, "grad_norm": 1.6843430054340716, "language_loss": 0.59733325, "learning_rate": 7.780146271721097e-07, "loss": 0.6182071, "num_input_tokens_seen": 257669795, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36132812, "step": 11941, "time_per_iteration": 2.4296348094940186 }, { "auxiliary_loss_clip": 0.0105297, "auxiliary_loss_mlp": 0.01037024, "balance_loss_clip": 1.01510167, "balance_loss_mlp": 1.01689816, "epoch": 0.7179918833608898, "flos": 23512676186880.0, "grad_norm": 1.960921552566401, "language_loss": 0.80910772, "learning_rate": 7.777063383616543e-07, "loss": 0.83000767, "num_input_tokens_seen": 257687415, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.36132812, "step": 11942, "time_per_iteration": 2.367962598800659 }, { "auxiliary_loss_clip": 0.0105315, "auxiliary_loss_mlp": 0.01040814, "balance_loss_clip": 1.01653123, "balance_loss_mlp": 1.01578879, "epoch": 0.7180520066135578, "flos": 17164628340480.0, "grad_norm": 2.0463187170356676, "language_loss": 0.67257977, "learning_rate": 7.773980959006968e-07, "loss": 0.69351941, "num_input_tokens_seen": 257706215, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37304688, "step": 11943, "time_per_iteration": 2.3607850074768066 }, { "auxiliary_loss_clip": 0.01051207, "auxiliary_loss_mlp": 0.01035502, "balance_loss_clip": 1.01262546, "balance_loss_mlp": 1.01574993, "epoch": 0.7181121298662257, "flos": 17565675661440.0, "grad_norm": 1.8801878690085914, "language_loss": 0.79583013, "learning_rate": 7.770898998009254e-07, "loss": 0.81669718, "num_input_tokens_seen": 257724740, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35546875, "step": 11944, "time_per_iteration": 2.3384201526641846 }, { "auxiliary_loss_clip": 0.01054514, "auxiliary_loss_mlp": 0.01042517, "balance_loss_clip": 1.01303625, "balance_loss_mlp": 1.01615167, "epoch": 0.7181722531188938, "flos": 11946858716160.0, "grad_norm": 2.279536159072007, "language_loss": 0.64352202, "learning_rate": 7.767817500740277e-07, "loss": 0.66449237, "num_input_tokens_seen": 257742060, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 0.38476562, "step": 11945, "time_per_iteration": 2.3585660457611084 }, { "auxiliary_loss_clip": 0.01008851, "auxiliary_loss_mlp": 0.01002866, "balance_loss_clip": 1.00037479, "balance_loss_mlp": 1.00162268, "epoch": 0.7182323763715617, "flos": 65500480707840.0, "grad_norm": 0.7008793365898902, "language_loss": 0.51095617, "learning_rate": 7.76473646731689e-07, "loss": 0.53107333, "num_input_tokens_seen": 257802250, "router_z_loss_clip": 0.02490234, "router_z_loss_mlp": 0.07226562, "step": 11946, "time_per_iteration": 2.9282047748565674 }, { "auxiliary_loss_clip": 0.01055756, "auxiliary_loss_mlp": 0.01043653, "balance_loss_clip": 1.01549625, "balance_loss_mlp": 1.01719546, "epoch": 0.7182924996242297, "flos": 20629897758720.0, "grad_norm": 1.8962867946838917, "language_loss": 0.76085496, "learning_rate": 7.761655897855925e-07, "loss": 0.78184909, "num_input_tokens_seen": 257821155, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.38476562, "step": 11947, "time_per_iteration": 2.3767342567443848 }, { "auxiliary_loss_clip": 0.01051932, "auxiliary_loss_mlp": 0.01037549, "balance_loss_clip": 1.01323032, "balance_loss_mlp": 1.0155338, "epoch": 0.7183526228768976, "flos": 16215514496640.0, "grad_norm": 1.5446156454343203, "language_loss": 0.73799312, "learning_rate": 7.758575792474187e-07, "loss": 0.75888795, "num_input_tokens_seen": 257839905, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36328125, "step": 11948, "time_per_iteration": 2.3336637020111084 }, { "auxiliary_loss_clip": 0.01055742, "auxiliary_loss_mlp": 0.01039921, "balance_loss_clip": 1.01320577, "balance_loss_mlp": 1.01735556, "epoch": 0.7184127461295656, "flos": 22231678158720.0, "grad_norm": 1.678960312271913, "language_loss": 0.72420967, "learning_rate": 7.755496151288483e-07, "loss": 0.7451663, "num_input_tokens_seen": 257860055, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.3828125, "step": 11949, "time_per_iteration": 2.3958375453948975 }, { "auxiliary_loss_clip": 0.01052726, "auxiliary_loss_mlp": 0.0104255, "balance_loss_clip": 1.01843429, "balance_loss_mlp": 1.01621652, "epoch": 0.7184728693822335, "flos": 27343276738560.0, "grad_norm": 2.1414179747306488, "language_loss": 0.77705991, "learning_rate": 7.752416974415598e-07, "loss": 0.79801267, "num_input_tokens_seen": 257879315, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36523438, "step": 11950, "time_per_iteration": 2.4081759452819824 }, { "auxiliary_loss_clip": 0.01057036, "auxiliary_loss_mlp": 0.01041522, "balance_loss_clip": 1.01516485, "balance_loss_mlp": 1.01839602, "epoch": 0.7185329926349016, "flos": 16507597864320.0, "grad_norm": 2.343878592511926, "language_loss": 0.68770343, "learning_rate": 7.749338261972282e-07, "loss": 0.70868909, "num_input_tokens_seen": 257896570, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.38671875, "step": 11951, "time_per_iteration": 2.492427349090576 }, { "auxiliary_loss_clip": 0.01056194, "auxiliary_loss_mlp": 0.01037036, "balance_loss_clip": 1.01003528, "balance_loss_mlp": 1.01699662, "epoch": 0.7185931158875695, "flos": 23949928454400.0, "grad_norm": 1.9195771849374768, "language_loss": 0.79453957, "learning_rate": 7.746260014075286e-07, "loss": 0.81547183, "num_input_tokens_seen": 257916855, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.39257812, "step": 11952, "time_per_iteration": 2.375363826751709 }, { "auxiliary_loss_clip": 0.01053966, "auxiliary_loss_mlp": 0.01037669, "balance_loss_clip": 1.01340997, "balance_loss_mlp": 1.0162493, "epoch": 0.7186532391402375, "flos": 26540798071680.0, "grad_norm": 3.626053853604953, "language_loss": 0.752666, "learning_rate": 7.743182230841352e-07, "loss": 0.77358234, "num_input_tokens_seen": 257937140, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.37890625, "step": 11953, "time_per_iteration": 2.46132755279541 }, { "auxiliary_loss_clip": 0.01054177, "auxiliary_loss_mlp": 0.01041729, "balance_loss_clip": 1.01605129, "balance_loss_mlp": 1.01633668, "epoch": 0.7187133623929055, "flos": 22381944117120.0, "grad_norm": 1.8775416390822386, "language_loss": 0.74702042, "learning_rate": 7.740104912387164e-07, "loss": 0.7679795, "num_input_tokens_seen": 257956785, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37890625, "step": 11954, "time_per_iteration": 2.3528268337249756 }, { "auxiliary_loss_clip": 0.01055006, "auxiliary_loss_mlp": 0.01040261, "balance_loss_clip": 1.01331949, "balance_loss_mlp": 1.0178113, "epoch": 0.7187734856455734, "flos": 15778646254080.0, "grad_norm": 1.7618427741766383, "language_loss": 0.75681221, "learning_rate": 7.737028058829425e-07, "loss": 0.77776492, "num_input_tokens_seen": 257975455, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.37109375, "step": 11955, "time_per_iteration": 2.3788928985595703 }, { "auxiliary_loss_clip": 0.01053755, "auxiliary_loss_mlp": 0.01041215, "balance_loss_clip": 1.01796973, "balance_loss_mlp": 1.01755023, "epoch": 0.7188336088982414, "flos": 31758253493760.0, "grad_norm": 1.726781872740489, "language_loss": 0.74300635, "learning_rate": 7.733951670284817e-07, "loss": 0.76395607, "num_input_tokens_seen": 257996850, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36328125, "step": 11956, "time_per_iteration": 2.4412827491760254 }, { "auxiliary_loss_clip": 0.01054026, "auxiliary_loss_mlp": 0.01038707, "balance_loss_clip": 1.01349461, "balance_loss_mlp": 1.01632929, "epoch": 0.7188937321509093, "flos": 21464287274880.0, "grad_norm": 2.2849262260537833, "language_loss": 0.72866446, "learning_rate": 7.730875746869987e-07, "loss": 0.74959177, "num_input_tokens_seen": 258016145, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37695312, "step": 11957, "time_per_iteration": 2.4339489936828613 }, { "auxiliary_loss_clip": 0.01055075, "auxiliary_loss_mlp": 0.01047287, "balance_loss_clip": 1.0209533, "balance_loss_mlp": 1.01715493, "epoch": 0.7189538554035774, "flos": 27270273352320.0, "grad_norm": 1.7924875861589642, "language_loss": 0.74425012, "learning_rate": 7.727800288701582e-07, "loss": 0.76527369, "num_input_tokens_seen": 258035420, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.37890625, "step": 11958, "time_per_iteration": 2.409494161605835 }, { "auxiliary_loss_clip": 0.01052235, "auxiliary_loss_mlp": 0.01037316, "balance_loss_clip": 1.01391542, "balance_loss_mlp": 1.01659739, "epoch": 0.7190139786562453, "flos": 21579535272960.0, "grad_norm": 1.5841216251836978, "language_loss": 0.8494823, "learning_rate": 7.724725295896215e-07, "loss": 0.87037778, "num_input_tokens_seen": 258053520, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35546875, "step": 11959, "time_per_iteration": 2.4106082916259766 }, { "auxiliary_loss_clip": 0.01056798, "auxiliary_loss_mlp": 0.01044471, "balance_loss_clip": 1.01838756, "balance_loss_mlp": 1.01894557, "epoch": 0.7190741019089133, "flos": 26720112147840.0, "grad_norm": 1.69250507523006, "language_loss": 0.82856858, "learning_rate": 7.7216507685705e-07, "loss": 0.8495813, "num_input_tokens_seen": 258073020, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37890625, "step": 11960, "time_per_iteration": 3.747187852859497 }, { "auxiliary_loss_clip": 0.01051647, "auxiliary_loss_mlp": 0.01044274, "balance_loss_clip": 1.02047944, "balance_loss_mlp": 1.01622295, "epoch": 0.7191342251615812, "flos": 26103545804160.0, "grad_norm": 1.5593598496783287, "language_loss": 0.78778815, "learning_rate": 7.718576706841013e-07, "loss": 0.80874729, "num_input_tokens_seen": 258093155, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35351562, "step": 11961, "time_per_iteration": 2.440626859664917 }, { "auxiliary_loss_clip": 0.0105026, "auxiliary_loss_mlp": 0.01036405, "balance_loss_clip": 1.01467264, "balance_loss_mlp": 1.01596355, "epoch": 0.7191943484142492, "flos": 22965901384320.0, "grad_norm": 1.382497935960697, "language_loss": 0.75895411, "learning_rate": 7.715503110824326e-07, "loss": 0.77982074, "num_input_tokens_seen": 258113905, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34375, "step": 11962, "time_per_iteration": 2.371434211730957 }, { "auxiliary_loss_clip": 0.01054436, "auxiliary_loss_mlp": 0.01038804, "balance_loss_clip": 1.01343608, "balance_loss_mlp": 1.01693869, "epoch": 0.7192544716669171, "flos": 22564225658880.0, "grad_norm": 1.764319293597679, "language_loss": 0.76312578, "learning_rate": 7.712429980637001e-07, "loss": 0.78405821, "num_input_tokens_seen": 258132820, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.375, "step": 11963, "time_per_iteration": 3.90022611618042 }, { "auxiliary_loss_clip": 0.01056197, "auxiliary_loss_mlp": 0.01040355, "balance_loss_clip": 1.0138427, "balance_loss_mlp": 1.01673746, "epoch": 0.7193145949195852, "flos": 18981404092800.0, "grad_norm": 2.3611239439004774, "language_loss": 0.81766492, "learning_rate": 7.709357316395564e-07, "loss": 0.83863044, "num_input_tokens_seen": 258148055, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39453125, "step": 11964, "time_per_iteration": 2.4728188514709473 }, { "auxiliary_loss_clip": 0.01052446, "auxiliary_loss_mlp": 0.01039804, "balance_loss_clip": 1.01552105, "balance_loss_mlp": 1.01565218, "epoch": 0.7193747181722531, "flos": 18003277042560.0, "grad_norm": 1.6677009931322888, "language_loss": 0.75482452, "learning_rate": 7.70628511821652e-07, "loss": 0.77574694, "num_input_tokens_seen": 258165995, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.3671875, "step": 11965, "time_per_iteration": 3.7361624240875244 }, { "auxiliary_loss_clip": 0.01055186, "auxiliary_loss_mlp": 0.0104098, "balance_loss_clip": 1.01443172, "balance_loss_mlp": 1.01737046, "epoch": 0.7194348414249211, "flos": 24388262974080.0, "grad_norm": 1.5252076593587447, "language_loss": 0.78589404, "learning_rate": 7.703213386216377e-07, "loss": 0.80685568, "num_input_tokens_seen": 258186165, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37890625, "step": 11966, "time_per_iteration": 2.397975444793701 }, { "auxiliary_loss_clip": 0.01053776, "auxiliary_loss_mlp": 0.01036795, "balance_loss_clip": 1.01360869, "balance_loss_mlp": 1.016222, "epoch": 0.7194949646775891, "flos": 22162375376640.0, "grad_norm": 1.8119571516834474, "language_loss": 0.74561691, "learning_rate": 7.700142120511619e-07, "loss": 0.76652265, "num_input_tokens_seen": 258204595, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.375, "step": 11967, "time_per_iteration": 2.3485090732574463 }, { "auxiliary_loss_clip": 0.01052082, "auxiliary_loss_mlp": 0.01035215, "balance_loss_clip": 1.01447225, "balance_loss_mlp": 1.01785517, "epoch": 0.719555087930257, "flos": 20265334675200.0, "grad_norm": 1.6531668939611108, "language_loss": 0.82585615, "learning_rate": 7.6970713212187e-07, "loss": 0.8467291, "num_input_tokens_seen": 258223110, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.34375, "step": 11968, "time_per_iteration": 2.387608766555786 }, { "auxiliary_loss_clip": 0.01053262, "auxiliary_loss_mlp": 0.01040613, "balance_loss_clip": 1.01697338, "balance_loss_mlp": 1.01646733, "epoch": 0.719615211182925, "flos": 24715189745280.0, "grad_norm": 2.155043888478388, "language_loss": 0.77264768, "learning_rate": 7.69400098845407e-07, "loss": 0.79358643, "num_input_tokens_seen": 258242660, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3671875, "step": 11969, "time_per_iteration": 2.3975765705108643 }, { "auxiliary_loss_clip": 0.01052175, "auxiliary_loss_mlp": 0.01039568, "balance_loss_clip": 1.01430798, "balance_loss_mlp": 1.01447511, "epoch": 0.719675334435593, "flos": 20008653292800.0, "grad_norm": 1.6230802807348554, "language_loss": 0.71818924, "learning_rate": 7.69093112233417e-07, "loss": 0.73910666, "num_input_tokens_seen": 258261850, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37695312, "step": 11970, "time_per_iteration": 2.4073703289031982 }, { "auxiliary_loss_clip": 0.01007844, "auxiliary_loss_mlp": 0.01003451, "balance_loss_clip": 1.00092411, "balance_loss_mlp": 1.00079215, "epoch": 0.719735457688261, "flos": 44197177178880.0, "grad_norm": 0.9100888829564044, "language_loss": 0.60965145, "learning_rate": 7.68786172297538e-07, "loss": 0.62976444, "num_input_tokens_seen": 258312570, "router_z_loss_clip": 0.02526855, "router_z_loss_mlp": 0.0703125, "step": 11971, "time_per_iteration": 2.8934988975524902 }, { "auxiliary_loss_clip": 0.01057041, "auxiliary_loss_mlp": 0.01038198, "balance_loss_clip": 1.01297295, "balance_loss_mlp": 1.01699507, "epoch": 0.7197955809409289, "flos": 16801880647680.0, "grad_norm": 2.0756426613618113, "language_loss": 0.81305552, "learning_rate": 7.684792790494105e-07, "loss": 0.83400792, "num_input_tokens_seen": 258331600, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.40039062, "step": 11972, "time_per_iteration": 2.3897409439086914 }, { "auxiliary_loss_clip": 0.01055336, "auxiliary_loss_mlp": 0.01046162, "balance_loss_clip": 1.01912487, "balance_loss_mlp": 1.01685607, "epoch": 0.7198557041935969, "flos": 24534234835200.0, "grad_norm": 1.5150031038827514, "language_loss": 0.76514536, "learning_rate": 7.681724325006733e-07, "loss": 0.78616035, "num_input_tokens_seen": 258351785, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.38476562, "step": 11973, "time_per_iteration": 2.3833138942718506 }, { "auxiliary_loss_clip": 0.01008025, "auxiliary_loss_mlp": 0.01003921, "balance_loss_clip": 1.00150084, "balance_loss_mlp": 1.001302, "epoch": 0.7199158274462648, "flos": 70707811835520.0, "grad_norm": 0.8540206319147171, "language_loss": 0.57356817, "learning_rate": 7.6786563266296e-07, "loss": 0.59368765, "num_input_tokens_seen": 258404035, "router_z_loss_clip": 0.02416992, "router_z_loss_mlp": 0.06738281, "step": 11974, "time_per_iteration": 2.9001505374908447 }, { "auxiliary_loss_clip": 0.01054308, "auxiliary_loss_mlp": 0.01042922, "balance_loss_clip": 1.01654077, "balance_loss_mlp": 1.01555538, "epoch": 0.7199759506989328, "flos": 29346802686720.0, "grad_norm": 2.125579031994053, "language_loss": 0.62005568, "learning_rate": 7.675588795479062e-07, "loss": 0.64102793, "num_input_tokens_seen": 258424850, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.38671875, "step": 11975, "time_per_iteration": 2.421797513961792 }, { "auxiliary_loss_clip": 0.01052022, "auxiliary_loss_mlp": 0.01035975, "balance_loss_clip": 1.01262236, "balance_loss_mlp": 1.01585555, "epoch": 0.7200360739516007, "flos": 24639428361600.0, "grad_norm": 2.4174172413754693, "language_loss": 0.6866554, "learning_rate": 7.672521731671425e-07, "loss": 0.70753539, "num_input_tokens_seen": 258445485, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36132812, "step": 11976, "time_per_iteration": 3.8162434101104736 }, { "auxiliary_loss_clip": 0.01053742, "auxiliary_loss_mlp": 0.01040541, "balance_loss_clip": 1.01758146, "balance_loss_mlp": 1.0160892, "epoch": 0.7200961972042688, "flos": 20811830186880.0, "grad_norm": 2.210778157598444, "language_loss": 0.68678719, "learning_rate": 7.669455135323004e-07, "loss": 0.70773005, "num_input_tokens_seen": 258464505, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.375, "step": 11977, "time_per_iteration": 2.3939740657806396 }, { "auxiliary_loss_clip": 0.01053193, "auxiliary_loss_mlp": 0.01043367, "balance_loss_clip": 1.01636636, "balance_loss_mlp": 1.01617217, "epoch": 0.7201563204569367, "flos": 31244646349440.0, "grad_norm": 1.8588845561870584, "language_loss": 0.75969887, "learning_rate": 7.666389006550074e-07, "loss": 0.7806645, "num_input_tokens_seen": 258487190, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.37109375, "step": 11978, "time_per_iteration": 2.477902412414551 }, { "auxiliary_loss_clip": 0.01051056, "auxiliary_loss_mlp": 0.01036578, "balance_loss_clip": 1.01299846, "balance_loss_mlp": 1.01435697, "epoch": 0.7202164437096047, "flos": 26650180961280.0, "grad_norm": 1.9113385895577413, "language_loss": 0.79744542, "learning_rate": 7.663323345468908e-07, "loss": 0.8183217, "num_input_tokens_seen": 258503790, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3671875, "step": 11979, "time_per_iteration": 2.4896886348724365 }, { "auxiliary_loss_clip": 0.01054275, "auxiliary_loss_mlp": 0.01042594, "balance_loss_clip": 1.01587892, "balance_loss_mlp": 1.01671791, "epoch": 0.7202765669622727, "flos": 25958376904320.0, "grad_norm": 1.6634359015684512, "language_loss": 0.65837288, "learning_rate": 7.660258152195767e-07, "loss": 0.67934155, "num_input_tokens_seen": 258527335, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.375, "step": 11980, "time_per_iteration": 2.4429006576538086 }, { "auxiliary_loss_clip": 0.01054771, "auxiliary_loss_mlp": 0.01041682, "balance_loss_clip": 1.01595712, "balance_loss_mlp": 1.01721275, "epoch": 0.7203366902149406, "flos": 28511086538880.0, "grad_norm": 1.9854119615832166, "language_loss": 0.68820059, "learning_rate": 7.657193426846871e-07, "loss": 0.7091651, "num_input_tokens_seen": 258546690, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.375, "step": 11981, "time_per_iteration": 2.41070556640625 }, { "auxiliary_loss_clip": 0.01053209, "auxiliary_loss_mlp": 0.01047123, "balance_loss_clip": 1.02114677, "balance_loss_mlp": 1.01596904, "epoch": 0.7203968134676086, "flos": 21104960895360.0, "grad_norm": 2.4722264333092867, "language_loss": 0.74566853, "learning_rate": 7.65412916953843e-07, "loss": 0.76667184, "num_input_tokens_seen": 258566340, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37304688, "step": 11982, "time_per_iteration": 2.381371021270752 }, { "auxiliary_loss_clip": 0.01052812, "auxiliary_loss_mlp": 0.01046305, "balance_loss_clip": 1.02165282, "balance_loss_mlp": 1.01553047, "epoch": 0.7204569367202766, "flos": 18331181331840.0, "grad_norm": 1.9963513575188736, "language_loss": 0.67086816, "learning_rate": 7.65106538038665e-07, "loss": 0.69185936, "num_input_tokens_seen": 258584455, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37304688, "step": 11983, "time_per_iteration": 2.33769154548645 }, { "auxiliary_loss_clip": 0.01055446, "auxiliary_loss_mlp": 0.01041731, "balance_loss_clip": 1.01467073, "balance_loss_mlp": 1.01750576, "epoch": 0.7205170599729446, "flos": 23254074679680.0, "grad_norm": 1.4850932009149553, "language_loss": 0.67922169, "learning_rate": 7.648002059507715e-07, "loss": 0.70019346, "num_input_tokens_seen": 258604725, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.37890625, "step": 11984, "time_per_iteration": 2.391268730163574 }, { "auxiliary_loss_clip": 0.0105651, "auxiliary_loss_mlp": 0.01040217, "balance_loss_clip": 1.01370478, "balance_loss_mlp": 1.01747608, "epoch": 0.7205771832256125, "flos": 20119851573120.0, "grad_norm": 1.7783176491211077, "language_loss": 0.75603855, "learning_rate": 7.644939207017771e-07, "loss": 0.77700585, "num_input_tokens_seen": 258622885, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.390625, "step": 11985, "time_per_iteration": 2.350484848022461 }, { "auxiliary_loss_clip": 0.01053412, "auxiliary_loss_mlp": 0.01039525, "balance_loss_clip": 1.01619554, "balance_loss_mlp": 1.01700389, "epoch": 0.7206373064782805, "flos": 27702184181760.0, "grad_norm": 1.9314544199988615, "language_loss": 0.64037144, "learning_rate": 7.641876823032977e-07, "loss": 0.66130078, "num_input_tokens_seen": 258644305, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36328125, "step": 11986, "time_per_iteration": 2.415400505065918 }, { "auxiliary_loss_clip": 0.01053944, "auxiliary_loss_mlp": 0.01043518, "balance_loss_clip": 1.01583731, "balance_loss_mlp": 1.01654482, "epoch": 0.7206974297309484, "flos": 17967176830080.0, "grad_norm": 2.7867265608282703, "language_loss": 0.73893905, "learning_rate": 7.638814907669455e-07, "loss": 0.75991368, "num_input_tokens_seen": 258661775, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.37304688, "step": 11987, "time_per_iteration": 2.346979856491089 }, { "auxiliary_loss_clip": 0.01055945, "auxiliary_loss_mlp": 0.010414, "balance_loss_clip": 1.01463795, "balance_loss_mlp": 1.01687241, "epoch": 0.7207575529836164, "flos": 16982207153280.0, "grad_norm": 1.7892883883651076, "language_loss": 0.78826511, "learning_rate": 7.635753461043301e-07, "loss": 0.80923855, "num_input_tokens_seen": 258679830, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.390625, "step": 11988, "time_per_iteration": 2.345564126968384 }, { "auxiliary_loss_clip": 0.01053237, "auxiliary_loss_mlp": 0.01042282, "balance_loss_clip": 1.01807094, "balance_loss_mlp": 1.01670885, "epoch": 0.7208176762362843, "flos": 18726782480640.0, "grad_norm": 1.7601927296909525, "language_loss": 0.79185599, "learning_rate": 7.632692483270618e-07, "loss": 0.8128112, "num_input_tokens_seen": 258697415, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36523438, "step": 11989, "time_per_iteration": 2.3650481700897217 }, { "auxiliary_loss_clip": 0.01053034, "auxiliary_loss_mlp": 0.01036886, "balance_loss_clip": 1.01414084, "balance_loss_mlp": 1.01606929, "epoch": 0.7208777994889524, "flos": 18733485461760.0, "grad_norm": 2.3412785312890843, "language_loss": 0.84015334, "learning_rate": 7.629631974467481e-07, "loss": 0.86105251, "num_input_tokens_seen": 258716755, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.36914062, "step": 11990, "time_per_iteration": 2.375626802444458 }, { "auxiliary_loss_clip": 0.01053748, "auxiliary_loss_mlp": 0.01039539, "balance_loss_clip": 1.01437366, "balance_loss_mlp": 1.0160197, "epoch": 0.7209379227416203, "flos": 14792559413760.0, "grad_norm": 2.0479073334589772, "language_loss": 0.77249652, "learning_rate": 7.626571934749931e-07, "loss": 0.79342937, "num_input_tokens_seen": 258733270, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37890625, "step": 11991, "time_per_iteration": 2.3367466926574707 }, { "auxiliary_loss_clip": 0.01052155, "auxiliary_loss_mlp": 0.01034327, "balance_loss_clip": 1.01067567, "balance_loss_mlp": 1.01648283, "epoch": 0.7209980459942883, "flos": 29635744032000.0, "grad_norm": 1.5168038881090715, "language_loss": 0.73637944, "learning_rate": 7.623512364234022e-07, "loss": 0.75724429, "num_input_tokens_seen": 258755270, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35546875, "step": 11992, "time_per_iteration": 2.4698843955993652 }, { "auxiliary_loss_clip": 0.01054092, "auxiliary_loss_mlp": 0.01034991, "balance_loss_clip": 1.00907516, "balance_loss_mlp": 1.01606166, "epoch": 0.7210581692469563, "flos": 23476052304000.0, "grad_norm": 1.7453684045481757, "language_loss": 0.67305815, "learning_rate": 7.620453263035755e-07, "loss": 0.69394898, "num_input_tokens_seen": 258775340, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37890625, "step": 11993, "time_per_iteration": 2.3863577842712402 }, { "auxiliary_loss_clip": 0.01052155, "auxiliary_loss_mlp": 0.01037776, "balance_loss_clip": 1.01355302, "balance_loss_mlp": 1.01565933, "epoch": 0.7211182924996242, "flos": 26098762947840.0, "grad_norm": 2.1289505227203724, "language_loss": 0.66740471, "learning_rate": 7.61739463127115e-07, "loss": 0.68830401, "num_input_tokens_seen": 258794580, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36523438, "step": 11994, "time_per_iteration": 2.4278411865234375 }, { "auxiliary_loss_clip": 0.01053927, "auxiliary_loss_mlp": 0.01040395, "balance_loss_clip": 1.01388288, "balance_loss_mlp": 1.01636243, "epoch": 0.7211784157522922, "flos": 17711123852160.0, "grad_norm": 1.6785060013619115, "language_loss": 0.6801253, "learning_rate": 7.614336469056172e-07, "loss": 0.70106852, "num_input_tokens_seen": 258812330, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.375, "step": 11995, "time_per_iteration": 2.3310282230377197 }, { "auxiliary_loss_clip": 0.0105214, "auxiliary_loss_mlp": 0.01034334, "balance_loss_clip": 1.00977635, "balance_loss_mlp": 1.0164367, "epoch": 0.7212385390049602, "flos": 24422547795840.0, "grad_norm": 1.849895504389214, "language_loss": 0.80675745, "learning_rate": 7.6112787765068e-07, "loss": 0.82762218, "num_input_tokens_seen": 258831770, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.35742188, "step": 11996, "time_per_iteration": 2.4340925216674805 }, { "auxiliary_loss_clip": 0.01053251, "auxiliary_loss_mlp": 0.01037492, "balance_loss_clip": 1.01421046, "balance_loss_mlp": 1.01620626, "epoch": 0.7212986622576282, "flos": 28145999784960.0, "grad_norm": 2.0188149231410497, "language_loss": 0.82528478, "learning_rate": 7.60822155373899e-07, "loss": 0.84619218, "num_input_tokens_seen": 258849090, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37109375, "step": 11997, "time_per_iteration": 2.435441493988037 }, { "auxiliary_loss_clip": 0.01054402, "auxiliary_loss_mlp": 0.01038476, "balance_loss_clip": 1.01390719, "balance_loss_mlp": 1.01642203, "epoch": 0.7213587855102961, "flos": 21834680555520.0, "grad_norm": 1.8995033626514313, "language_loss": 0.67921126, "learning_rate": 7.605164800868646e-07, "loss": 0.70014, "num_input_tokens_seen": 258868230, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.38085938, "step": 11998, "time_per_iteration": 2.4133143424987793 }, { "auxiliary_loss_clip": 0.0105368, "auxiliary_loss_mlp": 0.0103959, "balance_loss_clip": 1.0162369, "balance_loss_mlp": 1.01738584, "epoch": 0.7214189087629641, "flos": 14610661896960.0, "grad_norm": 1.842671374753031, "language_loss": 0.73497361, "learning_rate": 7.602108518011696e-07, "loss": 0.75590634, "num_input_tokens_seen": 258885525, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36328125, "step": 11999, "time_per_iteration": 3.6233370304107666 }, { "auxiliary_loss_clip": 0.0105311, "auxiliary_loss_mlp": 0.01036709, "balance_loss_clip": 1.0126884, "balance_loss_mlp": 1.01602614, "epoch": 0.721479032015632, "flos": 19389852622080.0, "grad_norm": 1.974739078758876, "language_loss": 0.83903158, "learning_rate": 7.599052705284039e-07, "loss": 0.8599298, "num_input_tokens_seen": 258903245, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37109375, "step": 12000, "time_per_iteration": 2.3890182971954346 }, { "auxiliary_loss_clip": 0.01055537, "auxiliary_loss_mlp": 0.01038565, "balance_loss_clip": 1.01370955, "balance_loss_mlp": 1.01693618, "epoch": 0.7215391552683, "flos": 18511961685120.0, "grad_norm": 1.8303983279488911, "language_loss": 0.78496301, "learning_rate": 7.59599736280154e-07, "loss": 0.80590403, "num_input_tokens_seen": 258921245, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38671875, "step": 12001, "time_per_iteration": 2.357947826385498 }, { "auxiliary_loss_clip": 0.01055135, "auxiliary_loss_mlp": 0.01044189, "balance_loss_clip": 1.02021599, "balance_loss_mlp": 1.01872945, "epoch": 0.721599278520968, "flos": 23257600727040.0, "grad_norm": 1.636984280462141, "language_loss": 0.82514274, "learning_rate": 7.592942490680066e-07, "loss": 0.84613597, "num_input_tokens_seen": 258939425, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36328125, "step": 12002, "time_per_iteration": 2.410571575164795 }, { "auxiliary_loss_clip": 0.01054656, "auxiliary_loss_mlp": 0.01038923, "balance_loss_clip": 1.01484299, "balance_loss_mlp": 1.0169512, "epoch": 0.721659401773636, "flos": 39197581706880.0, "grad_norm": 1.9145231271958676, "language_loss": 0.63921863, "learning_rate": 7.589888089035462e-07, "loss": 0.6601544, "num_input_tokens_seen": 258960710, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.37695312, "step": 12003, "time_per_iteration": 4.020500659942627 }, { "auxiliary_loss_clip": 0.0105317, "auxiliary_loss_mlp": 0.01036115, "balance_loss_clip": 1.0104847, "balance_loss_mlp": 1.01572394, "epoch": 0.7217195250263039, "flos": 14939020033920.0, "grad_norm": 2.666621813777241, "language_loss": 0.70296913, "learning_rate": 7.586834157983544e-07, "loss": 0.72386205, "num_input_tokens_seen": 258978475, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.375, "step": 12004, "time_per_iteration": 3.7463862895965576 }, { "auxiliary_loss_clip": 0.01008355, "auxiliary_loss_mlp": 0.0100263, "balance_loss_clip": 1.00000763, "balance_loss_mlp": 1.00151205, "epoch": 0.7217796482789719, "flos": 70865828115840.0, "grad_norm": 0.8626593854858787, "language_loss": 0.54214859, "learning_rate": 7.583780697640112e-07, "loss": 0.56225848, "num_input_tokens_seen": 259037520, "router_z_loss_clip": 0.02624512, "router_z_loss_mlp": 0.06835938, "step": 12005, "time_per_iteration": 2.945441961288452 }, { "auxiliary_loss_clip": 0.01055724, "auxiliary_loss_mlp": 0.01035501, "balance_loss_clip": 1.01109862, "balance_loss_mlp": 1.01791537, "epoch": 0.7218397715316398, "flos": 37450004002560.0, "grad_norm": 1.7419866791108258, "language_loss": 0.64012468, "learning_rate": 7.580727708120962e-07, "loss": 0.66103697, "num_input_tokens_seen": 259061325, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37890625, "step": 12006, "time_per_iteration": 2.4981517791748047 }, { "auxiliary_loss_clip": 0.01053209, "auxiliary_loss_mlp": 0.01038978, "balance_loss_clip": 1.01623249, "balance_loss_mlp": 1.01688647, "epoch": 0.7218998947843078, "flos": 22709569115520.0, "grad_norm": 1.8355458316295166, "language_loss": 0.92420161, "learning_rate": 7.577675189541865e-07, "loss": 0.94512343, "num_input_tokens_seen": 259078135, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.36328125, "step": 12007, "time_per_iteration": 2.377774238586426 }, { "auxiliary_loss_clip": 0.01055166, "auxiliary_loss_mlp": 0.01034644, "balance_loss_clip": 1.00931168, "balance_loss_mlp": 1.0169332, "epoch": 0.7219600180369758, "flos": 12166357633920.0, "grad_norm": 2.2889206398610122, "language_loss": 0.65166932, "learning_rate": 7.574623142018568e-07, "loss": 0.67256749, "num_input_tokens_seen": 259095910, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3828125, "step": 12008, "time_per_iteration": 2.3323090076446533 }, { "auxiliary_loss_clip": 0.01055178, "auxiliary_loss_mlp": 0.01039407, "balance_loss_clip": 1.01423049, "balance_loss_mlp": 1.01698947, "epoch": 0.7220201412896438, "flos": 22595612837760.0, "grad_norm": 1.9651084271322778, "language_loss": 0.79567116, "learning_rate": 7.57157156566681e-07, "loss": 0.81661701, "num_input_tokens_seen": 259114225, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38085938, "step": 12009, "time_per_iteration": 2.388071060180664 }, { "auxiliary_loss_clip": 0.0105455, "auxiliary_loss_mlp": 0.01043602, "balance_loss_clip": 1.0162673, "balance_loss_mlp": 1.01692963, "epoch": 0.7220802645423118, "flos": 26717598529920.0, "grad_norm": 1.7895329032554002, "language_loss": 0.64878559, "learning_rate": 7.568520460602297e-07, "loss": 0.66976702, "num_input_tokens_seen": 259134660, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.37695312, "step": 12010, "time_per_iteration": 2.4065637588500977 }, { "auxiliary_loss_clip": 0.01052972, "auxiliary_loss_mlp": 0.01044234, "balance_loss_clip": 1.02036858, "balance_loss_mlp": 1.01630187, "epoch": 0.7221403877949797, "flos": 24419545418880.0, "grad_norm": 1.7121941797403988, "language_loss": 0.78301489, "learning_rate": 7.565469826940742e-07, "loss": 0.80398697, "num_input_tokens_seen": 259153300, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3671875, "step": 12011, "time_per_iteration": 2.411363363265991 }, { "auxiliary_loss_clip": 0.01052393, "auxiliary_loss_mlp": 0.01040889, "balance_loss_clip": 1.01758349, "balance_loss_mlp": 1.01586914, "epoch": 0.7222005110476477, "flos": 23513234768640.0, "grad_norm": 1.6280293488370976, "language_loss": 0.80165696, "learning_rate": 7.56241966479781e-07, "loss": 0.82258976, "num_input_tokens_seen": 259172115, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36523438, "step": 12012, "time_per_iteration": 2.3806228637695312 }, { "auxiliary_loss_clip": 0.01054144, "auxiliary_loss_mlp": 0.01033137, "balance_loss_clip": 1.01008153, "balance_loss_mlp": 1.01778412, "epoch": 0.7222606343003156, "flos": 23111419397760.0, "grad_norm": 2.322727075288452, "language_loss": 0.77259326, "learning_rate": 7.559369974289171e-07, "loss": 0.79346609, "num_input_tokens_seen": 259191345, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36328125, "step": 12013, "time_per_iteration": 2.431779623031616 }, { "auxiliary_loss_clip": 0.01052707, "auxiliary_loss_mlp": 0.01040953, "balance_loss_clip": 1.01677752, "balance_loss_mlp": 1.01620531, "epoch": 0.7223207575529836, "flos": 24350068080000.0, "grad_norm": 1.6243945235067492, "language_loss": 0.76412785, "learning_rate": 7.556320755530484e-07, "loss": 0.78506446, "num_input_tokens_seen": 259211700, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36523438, "step": 12014, "time_per_iteration": 2.4370534420013428 }, { "auxiliary_loss_clip": 0.01053639, "auxiliary_loss_mlp": 0.01038478, "balance_loss_clip": 1.01395631, "balance_loss_mlp": 1.01603842, "epoch": 0.7223808808056515, "flos": 28328909731200.0, "grad_norm": 2.932188511865818, "language_loss": 0.87701035, "learning_rate": 7.553272008637346e-07, "loss": 0.89793146, "num_input_tokens_seen": 259233825, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.375, "step": 12015, "time_per_iteration": 4.001673221588135 }, { "auxiliary_loss_clip": 0.0105229, "auxiliary_loss_mlp": 0.01034264, "balance_loss_clip": 1.01151919, "balance_loss_mlp": 1.01650369, "epoch": 0.7224410040583196, "flos": 21068371923840.0, "grad_norm": 2.0983426712511055, "language_loss": 0.79057992, "learning_rate": 7.55022373372538e-07, "loss": 0.81144547, "num_input_tokens_seen": 259253055, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35742188, "step": 12016, "time_per_iteration": 2.39625883102417 }, { "auxiliary_loss_clip": 0.01052752, "auxiliary_loss_mlp": 0.0104133, "balance_loss_clip": 1.01800108, "balance_loss_mlp": 1.01690972, "epoch": 0.7225011273109875, "flos": 26794267608960.0, "grad_norm": 3.848054357596503, "language_loss": 0.78271389, "learning_rate": 7.547175930910186e-07, "loss": 0.80365467, "num_input_tokens_seen": 259273420, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 12017, "time_per_iteration": 2.4486989974975586 }, { "auxiliary_loss_clip": 0.0105155, "auxiliary_loss_mlp": 0.01035594, "balance_loss_clip": 1.01429105, "balance_loss_mlp": 1.01651216, "epoch": 0.7225612505636555, "flos": 23582537550720.0, "grad_norm": 1.9479048549375, "language_loss": 0.75946629, "learning_rate": 7.54412860030732e-07, "loss": 0.78033769, "num_input_tokens_seen": 259291000, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.3515625, "step": 12018, "time_per_iteration": 2.410247325897217 }, { "auxiliary_loss_clip": 0.01053261, "auxiliary_loss_mlp": 0.01036199, "balance_loss_clip": 1.01514673, "balance_loss_mlp": 1.0186224, "epoch": 0.7226213738163234, "flos": 20776567847040.0, "grad_norm": 1.5867880252190933, "language_loss": 0.78386313, "learning_rate": 7.541081742032347e-07, "loss": 0.80475771, "num_input_tokens_seen": 259312390, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34570312, "step": 12019, "time_per_iteration": 2.3982675075531006 }, { "auxiliary_loss_clip": 0.010524, "auxiliary_loss_mlp": 0.01034816, "balance_loss_clip": 1.01035392, "balance_loss_mlp": 1.01588202, "epoch": 0.7226814970689914, "flos": 32634433774080.0, "grad_norm": 1.7050633194903118, "language_loss": 0.74920118, "learning_rate": 7.53803535620081e-07, "loss": 0.77007341, "num_input_tokens_seen": 259332645, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36523438, "step": 12020, "time_per_iteration": 2.4524645805358887 }, { "auxiliary_loss_clip": 0.01051661, "auxiliary_loss_mlp": 0.01041722, "balance_loss_clip": 1.01901269, "balance_loss_mlp": 1.01506066, "epoch": 0.7227416203216595, "flos": 22453306669440.0, "grad_norm": 1.5401606246048691, "language_loss": 0.78320009, "learning_rate": 7.534989442928219e-07, "loss": 0.80413401, "num_input_tokens_seen": 259353810, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36523438, "step": 12021, "time_per_iteration": 2.4439949989318848 }, { "auxiliary_loss_clip": 0.01052973, "auxiliary_loss_mlp": 0.01033364, "balance_loss_clip": 1.01061916, "balance_loss_mlp": 1.01673567, "epoch": 0.7228017435743274, "flos": 21651246938880.0, "grad_norm": 1.5673201571299449, "language_loss": 0.69007051, "learning_rate": 7.531944002330073e-07, "loss": 0.71093392, "num_input_tokens_seen": 259372460, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.36328125, "step": 12022, "time_per_iteration": 2.492393970489502 }, { "auxiliary_loss_clip": 0.01052849, "auxiliary_loss_mlp": 0.01038912, "balance_loss_clip": 1.01448631, "balance_loss_mlp": 1.01615286, "epoch": 0.7228618668269954, "flos": 29532191339520.0, "grad_norm": 2.3299153418465335, "language_loss": 0.70493263, "learning_rate": 7.528899034521858e-07, "loss": 0.72585022, "num_input_tokens_seen": 259393275, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.3671875, "step": 12023, "time_per_iteration": 2.446632146835327 }, { "auxiliary_loss_clip": 0.01051175, "auxiliary_loss_mlp": 0.01035414, "balance_loss_clip": 1.01351511, "balance_loss_mlp": 1.01496732, "epoch": 0.7229219900796633, "flos": 27452589805440.0, "grad_norm": 1.690268852215826, "language_loss": 0.71730083, "learning_rate": 7.525854539619052e-07, "loss": 0.73816669, "num_input_tokens_seen": 259416205, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.36328125, "step": 12024, "time_per_iteration": 2.4376330375671387 }, { "auxiliary_loss_clip": 0.01052177, "auxiliary_loss_mlp": 0.01039692, "balance_loss_clip": 1.01624322, "balance_loss_mlp": 1.01618385, "epoch": 0.7229821133323313, "flos": 16288448060160.0, "grad_norm": 2.3587374880666383, "language_loss": 0.76484615, "learning_rate": 7.522810517737089e-07, "loss": 0.78576481, "num_input_tokens_seen": 259433115, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.359375, "step": 12025, "time_per_iteration": 2.4284820556640625 }, { "auxiliary_loss_clip": 0.01052077, "auxiliary_loss_mlp": 0.01039752, "balance_loss_clip": 1.01802063, "balance_loss_mlp": 1.0168221, "epoch": 0.7230422365849992, "flos": 20411306536320.0, "grad_norm": 2.01142172097412, "language_loss": 0.77849817, "learning_rate": 7.519766968991395e-07, "loss": 0.79941648, "num_input_tokens_seen": 259450475, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.3515625, "step": 12026, "time_per_iteration": 2.400914430618286 }, { "auxiliary_loss_clip": 0.01054422, "auxiliary_loss_mlp": 0.01042357, "balance_loss_clip": 1.01970685, "balance_loss_mlp": 1.01720726, "epoch": 0.7231023598376672, "flos": 25592312632320.0, "grad_norm": 1.8861964181346198, "language_loss": 0.68669629, "learning_rate": 7.516723893497388e-07, "loss": 0.70766413, "num_input_tokens_seen": 259469355, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.37109375, "step": 12027, "time_per_iteration": 2.407539129257202 }, { "auxiliary_loss_clip": 0.01055101, "auxiliary_loss_mlp": 0.0104059, "balance_loss_clip": 1.01615202, "balance_loss_mlp": 1.0178839, "epoch": 0.7231624830903352, "flos": 25148601763200.0, "grad_norm": 1.9354894703746335, "language_loss": 0.7991432, "learning_rate": 7.513681291370469e-07, "loss": 0.82010007, "num_input_tokens_seen": 259486565, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37109375, "step": 12028, "time_per_iteration": 2.389491558074951 }, { "auxiliary_loss_clip": 0.01051141, "auxiliary_loss_mlp": 0.01041536, "balance_loss_clip": 1.01739585, "balance_loss_mlp": 1.01520848, "epoch": 0.7232226063430032, "flos": 21724669261440.0, "grad_norm": 1.6429415965799696, "language_loss": 0.83192849, "learning_rate": 7.510639162726e-07, "loss": 0.85285527, "num_input_tokens_seen": 259505070, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.359375, "step": 12029, "time_per_iteration": 2.381150245666504 }, { "auxiliary_loss_clip": 0.01008374, "auxiliary_loss_mlp": 0.01003163, "balance_loss_clip": 1.00082672, "balance_loss_mlp": 1.00148034, "epoch": 0.7232827295956711, "flos": 68433149335680.0, "grad_norm": 0.8144714647787783, "language_loss": 0.61797929, "learning_rate": 7.507597507679347e-07, "loss": 0.63809466, "num_input_tokens_seen": 259569135, "router_z_loss_clip": 0.02331543, "router_z_loss_mlp": 0.06884766, "step": 12030, "time_per_iteration": 3.064298629760742 }, { "auxiliary_loss_clip": 0.01051171, "auxiliary_loss_mlp": 0.01034536, "balance_loss_clip": 1.01093221, "balance_loss_mlp": 1.016168, "epoch": 0.7233428528483391, "flos": 20191633061760.0, "grad_norm": 1.6544791415979463, "language_loss": 0.78783238, "learning_rate": 7.504556326345859e-07, "loss": 0.80868942, "num_input_tokens_seen": 259587035, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.34960938, "step": 12031, "time_per_iteration": 2.3678088188171387 }, { "auxiliary_loss_clip": 0.01054638, "auxiliary_loss_mlp": 0.010376, "balance_loss_clip": 1.0129596, "balance_loss_mlp": 1.01653934, "epoch": 0.723402976101007, "flos": 23948392354560.0, "grad_norm": 4.407850369051631, "language_loss": 0.82023871, "learning_rate": 7.501515618840834e-07, "loss": 0.84116113, "num_input_tokens_seen": 259606140, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.38085938, "step": 12032, "time_per_iteration": 2.374979257583618 }, { "auxiliary_loss_clip": 0.01055255, "auxiliary_loss_mlp": 0.01040235, "balance_loss_clip": 1.01528454, "balance_loss_mlp": 1.01707792, "epoch": 0.723463099353675, "flos": 20812353857280.0, "grad_norm": 1.7516410225949417, "language_loss": 0.76793724, "learning_rate": 7.498475385279592e-07, "loss": 0.78889215, "num_input_tokens_seen": 259624275, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3828125, "step": 12033, "time_per_iteration": 2.4330575466156006 }, { "auxiliary_loss_clip": 0.01049842, "auxiliary_loss_mlp": 0.01034904, "balance_loss_clip": 1.0140183, "balance_loss_mlp": 1.01566434, "epoch": 0.723523222606343, "flos": 19097036115840.0, "grad_norm": 1.6619912827899956, "language_loss": 0.75903165, "learning_rate": 7.495435625777423e-07, "loss": 0.77987909, "num_input_tokens_seen": 259643465, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34179688, "step": 12034, "time_per_iteration": 2.353130578994751 }, { "auxiliary_loss_clip": 0.01052011, "auxiliary_loss_mlp": 0.01034914, "balance_loss_clip": 1.01239514, "balance_loss_mlp": 1.01620114, "epoch": 0.723583345859011, "flos": 26505745200000.0, "grad_norm": 1.699265621845893, "language_loss": 0.81980938, "learning_rate": 7.492396340449578e-07, "loss": 0.84067863, "num_input_tokens_seen": 259662500, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35742188, "step": 12035, "time_per_iteration": 2.408832550048828 }, { "auxiliary_loss_clip": 0.01054008, "auxiliary_loss_mlp": 0.0103793, "balance_loss_clip": 1.01291966, "balance_loss_mlp": 1.01725364, "epoch": 0.723643469111679, "flos": 16032953664000.0, "grad_norm": 1.9073542576646292, "language_loss": 0.61547095, "learning_rate": 7.489357529411326e-07, "loss": 0.63639033, "num_input_tokens_seen": 259680140, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3671875, "step": 12036, "time_per_iteration": 2.36421799659729 }, { "auxiliary_loss_clip": 0.01051088, "auxiliary_loss_mlp": 0.01037081, "balance_loss_clip": 1.01544464, "balance_loss_mlp": 1.01565909, "epoch": 0.7237035923643469, "flos": 21944447470080.0, "grad_norm": 1.8098894200382531, "language_loss": 0.68601871, "learning_rate": 7.486319192777883e-07, "loss": 0.70690042, "num_input_tokens_seen": 259700160, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.35351562, "step": 12037, "time_per_iteration": 2.404141902923584 }, { "auxiliary_loss_clip": 0.01051461, "auxiliary_loss_mlp": 0.01038678, "balance_loss_clip": 1.01511037, "balance_loss_mlp": 1.01609731, "epoch": 0.7237637156170149, "flos": 23582083703040.0, "grad_norm": 1.890443736986783, "language_loss": 0.73114383, "learning_rate": 7.483281330664479e-07, "loss": 0.75204527, "num_input_tokens_seen": 259720525, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35351562, "step": 12038, "time_per_iteration": 3.8177239894866943 }, { "auxiliary_loss_clip": 0.01050505, "auxiliary_loss_mlp": 0.01036117, "balance_loss_clip": 1.01039207, "balance_loss_mlp": 1.01510191, "epoch": 0.7238238388696828, "flos": 20593657900800.0, "grad_norm": 2.4477891122618156, "language_loss": 0.73285878, "learning_rate": 7.480243943186293e-07, "loss": 0.75372505, "num_input_tokens_seen": 259738680, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.35351562, "step": 12039, "time_per_iteration": 2.351316213607788 }, { "auxiliary_loss_clip": 0.01052435, "auxiliary_loss_mlp": 0.0103663, "balance_loss_clip": 1.01400399, "balance_loss_mlp": 1.01587248, "epoch": 0.7238839621223508, "flos": 24205876698240.0, "grad_norm": 1.8859031143739322, "language_loss": 0.77911317, "learning_rate": 7.477207030458513e-07, "loss": 0.80000377, "num_input_tokens_seen": 259758790, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3671875, "step": 12040, "time_per_iteration": 2.3983163833618164 }, { "auxiliary_loss_clip": 0.01052242, "auxiliary_loss_mlp": 0.01034682, "balance_loss_clip": 1.01123357, "balance_loss_mlp": 1.01634622, "epoch": 0.7239440853750188, "flos": 14208881437440.0, "grad_norm": 1.558034112884099, "language_loss": 0.77983773, "learning_rate": 7.474170592596301e-07, "loss": 0.80070698, "num_input_tokens_seen": 259777370, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.359375, "step": 12041, "time_per_iteration": 2.3562402725219727 }, { "auxiliary_loss_clip": 0.01053976, "auxiliary_loss_mlp": 0.0103236, "balance_loss_clip": 1.00881577, "balance_loss_mlp": 1.0166285, "epoch": 0.7240042086276868, "flos": 21613785183360.0, "grad_norm": 2.0168015194994986, "language_loss": 0.65290904, "learning_rate": 7.471134629714797e-07, "loss": 0.67377234, "num_input_tokens_seen": 259794665, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.37304688, "step": 12042, "time_per_iteration": 2.3466567993164062 }, { "auxiliary_loss_clip": 0.01053721, "auxiliary_loss_mlp": 0.01035872, "balance_loss_clip": 1.01131463, "balance_loss_mlp": 1.01716328, "epoch": 0.7240643318803547, "flos": 23330324822400.0, "grad_norm": 1.8497188374293847, "language_loss": 0.85401845, "learning_rate": 7.468099141929116e-07, "loss": 0.87491441, "num_input_tokens_seen": 259811110, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36523438, "step": 12043, "time_per_iteration": 3.7557036876678467 }, { "auxiliary_loss_clip": 0.01053009, "auxiliary_loss_mlp": 0.01041645, "balance_loss_clip": 1.0160625, "balance_loss_mlp": 1.01603484, "epoch": 0.7241244551330227, "flos": 24023699890560.0, "grad_norm": 2.177223986283052, "language_loss": 0.65252042, "learning_rate": 7.465064129354379e-07, "loss": 0.67346692, "num_input_tokens_seen": 259831080, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.36914062, "step": 12044, "time_per_iteration": 3.7909982204437256 }, { "auxiliary_loss_clip": 0.0105349, "auxiliary_loss_mlp": 0.01035603, "balance_loss_clip": 1.01060438, "balance_loss_mlp": 1.01671529, "epoch": 0.7241845783856906, "flos": 18729435744000.0, "grad_norm": 1.462338527519926, "language_loss": 0.82610637, "learning_rate": 7.462029592105658e-07, "loss": 0.84699732, "num_input_tokens_seen": 259850135, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3671875, "step": 12045, "time_per_iteration": 2.353032350540161 }, { "auxiliary_loss_clip": 0.01051003, "auxiliary_loss_mlp": 0.01035922, "balance_loss_clip": 1.01323605, "balance_loss_mlp": 1.01565921, "epoch": 0.7242447016383586, "flos": 19497699411840.0, "grad_norm": 1.6742718187972638, "language_loss": 0.73049754, "learning_rate": 7.458995530298034e-07, "loss": 0.75136679, "num_input_tokens_seen": 259868185, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35351562, "step": 12046, "time_per_iteration": 2.35311222076416 }, { "auxiliary_loss_clip": 0.01051884, "auxiliary_loss_mlp": 0.01037403, "balance_loss_clip": 1.01372766, "balance_loss_mlp": 1.01571834, "epoch": 0.7243048248910267, "flos": 22162410288000.0, "grad_norm": 1.974345585291145, "language_loss": 0.7199558, "learning_rate": 7.455961944046553e-07, "loss": 0.74084866, "num_input_tokens_seen": 259887055, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36132812, "step": 12047, "time_per_iteration": 2.3511464595794678 }, { "auxiliary_loss_clip": 0.0105474, "auxiliary_loss_mlp": 0.01039108, "balance_loss_clip": 1.01408625, "balance_loss_mlp": 1.01648855, "epoch": 0.7243649481436946, "flos": 27671530141440.0, "grad_norm": 1.846978142498591, "language_loss": 0.71133107, "learning_rate": 7.45292883346627e-07, "loss": 0.73226953, "num_input_tokens_seen": 259908295, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3828125, "step": 12048, "time_per_iteration": 2.400956153869629 }, { "auxiliary_loss_clip": 0.01007113, "auxiliary_loss_mlp": 0.01002323, "balance_loss_clip": 0.99990326, "balance_loss_mlp": 1.00076365, "epoch": 0.7244250713963626, "flos": 63241355629440.0, "grad_norm": 0.8212147456630677, "language_loss": 0.53803277, "learning_rate": 7.449896198672168e-07, "loss": 0.55812716, "num_input_tokens_seen": 259968475, "router_z_loss_clip": 0.02416992, "router_z_loss_mlp": 0.06347656, "step": 12049, "time_per_iteration": 3.0113911628723145 }, { "auxiliary_loss_clip": 0.01057285, "auxiliary_loss_mlp": 0.01040833, "balance_loss_clip": 1.01353455, "balance_loss_mlp": 1.0181309, "epoch": 0.7244851946490305, "flos": 17966164400640.0, "grad_norm": 2.911385660831529, "language_loss": 0.61522996, "learning_rate": 7.446864039779258e-07, "loss": 0.6362111, "num_input_tokens_seen": 259984865, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.390625, "step": 12050, "time_per_iteration": 2.3526384830474854 }, { "auxiliary_loss_clip": 0.01007534, "auxiliary_loss_mlp": 0.01004833, "balance_loss_clip": 1.00246084, "balance_loss_mlp": 1.00104117, "epoch": 0.7245453179016985, "flos": 70940227956480.0, "grad_norm": 0.7191020026741499, "language_loss": 0.53373086, "learning_rate": 7.443832356902528e-07, "loss": 0.55385458, "num_input_tokens_seen": 260046735, "router_z_loss_clip": 0.02368164, "router_z_loss_mlp": 0.06494141, "step": 12051, "time_per_iteration": 3.0231308937072754 }, { "auxiliary_loss_clip": 0.01051253, "auxiliary_loss_mlp": 0.01040309, "balance_loss_clip": 1.01866078, "balance_loss_mlp": 1.01620173, "epoch": 0.7246054411543664, "flos": 24567402493440.0, "grad_norm": 1.5003777698814518, "language_loss": 0.73075581, "learning_rate": 7.440801150156927e-07, "loss": 0.75167143, "num_input_tokens_seen": 260067950, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34960938, "step": 12052, "time_per_iteration": 2.3925609588623047 }, { "auxiliary_loss_clip": 0.01054335, "auxiliary_loss_mlp": 0.01042765, "balance_loss_clip": 1.01529896, "balance_loss_mlp": 1.01698434, "epoch": 0.7246655644070344, "flos": 32337078791040.0, "grad_norm": 1.6780682577225594, "language_loss": 0.75192642, "learning_rate": 7.437770419657415e-07, "loss": 0.77289736, "num_input_tokens_seen": 260087730, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.37304688, "step": 12053, "time_per_iteration": 2.436213493347168 }, { "auxiliary_loss_clip": 0.01052927, "auxiliary_loss_mlp": 0.01039567, "balance_loss_clip": 1.01440167, "balance_loss_mlp": 1.01590657, "epoch": 0.7247256876597024, "flos": 21871374261120.0, "grad_norm": 1.755486221385532, "language_loss": 0.79320729, "learning_rate": 7.434740165518898e-07, "loss": 0.81413215, "num_input_tokens_seen": 260107760, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37109375, "step": 12054, "time_per_iteration": 2.3751819133758545 }, { "auxiliary_loss_clip": 0.0105379, "auxiliary_loss_mlp": 0.01038583, "balance_loss_clip": 1.0150032, "balance_loss_mlp": 1.01722789, "epoch": 0.7247858109123704, "flos": 16212267740160.0, "grad_norm": 2.4282352239537675, "language_loss": 0.69258434, "learning_rate": 7.431710387856301e-07, "loss": 0.71350807, "num_input_tokens_seen": 260123660, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36523438, "step": 12055, "time_per_iteration": 3.735994577407837 }, { "auxiliary_loss_clip": 0.01050421, "auxiliary_loss_mlp": 0.01040032, "balance_loss_clip": 1.0189321, "balance_loss_mlp": 1.01511645, "epoch": 0.7248459341650383, "flos": 20849641056000.0, "grad_norm": 1.6951718835499987, "language_loss": 0.74938786, "learning_rate": 7.428681086784496e-07, "loss": 0.7702924, "num_input_tokens_seen": 260142690, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.35351562, "step": 12056, "time_per_iteration": 2.35951828956604 }, { "auxiliary_loss_clip": 0.01050195, "auxiliary_loss_mlp": 0.0103211, "balance_loss_clip": 1.00975847, "balance_loss_mlp": 1.01590943, "epoch": 0.7249060574177063, "flos": 25920600946560.0, "grad_norm": 1.525419390507607, "language_loss": 0.71692538, "learning_rate": 7.425652262418368e-07, "loss": 0.7377485, "num_input_tokens_seen": 260162590, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34375, "step": 12057, "time_per_iteration": 2.4205920696258545 }, { "auxiliary_loss_clip": 0.01055102, "auxiliary_loss_mlp": 0.01042058, "balance_loss_clip": 1.01794195, "balance_loss_mlp": 1.01772594, "epoch": 0.7249661806703742, "flos": 17344640643840.0, "grad_norm": 3.4693858979660726, "language_loss": 0.63440645, "learning_rate": 7.42262391487277e-07, "loss": 0.65537804, "num_input_tokens_seen": 260181065, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.37304688, "step": 12058, "time_per_iteration": 2.339470624923706 }, { "auxiliary_loss_clip": 0.01053164, "auxiliary_loss_mlp": 0.01038953, "balance_loss_clip": 1.01441979, "balance_loss_mlp": 1.01689327, "epoch": 0.7250263039230422, "flos": 19573111681920.0, "grad_norm": 2.0216155927510164, "language_loss": 0.75426292, "learning_rate": 7.419596044262535e-07, "loss": 0.77518409, "num_input_tokens_seen": 260200330, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.36328125, "step": 12059, "time_per_iteration": 2.3768556118011475 }, { "auxiliary_loss_clip": 0.0105234, "auxiliary_loss_mlp": 0.01040151, "balance_loss_clip": 1.01770377, "balance_loss_mlp": 1.01603913, "epoch": 0.7250864271757103, "flos": 21975695003520.0, "grad_norm": 1.7090598811318478, "language_loss": 0.80122244, "learning_rate": 7.416568650702472e-07, "loss": 0.82214737, "num_input_tokens_seen": 260219975, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.36328125, "step": 12060, "time_per_iteration": 2.3914859294891357 }, { "auxiliary_loss_clip": 0.01052405, "auxiliary_loss_mlp": 0.01034478, "balance_loss_clip": 1.00937223, "balance_loss_mlp": 1.01659799, "epoch": 0.7251465504283782, "flos": 25011357742080.0, "grad_norm": 4.592777554610926, "language_loss": 0.77673817, "learning_rate": 7.413541734307393e-07, "loss": 0.797607, "num_input_tokens_seen": 260242025, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.35742188, "step": 12061, "time_per_iteration": 2.460577964782715 }, { "auxiliary_loss_clip": 0.01050061, "auxiliary_loss_mlp": 0.01032681, "balance_loss_clip": 1.01181948, "balance_loss_mlp": 1.01613593, "epoch": 0.7252066736810462, "flos": 16689216090240.0, "grad_norm": 1.6195488097620234, "language_loss": 0.81911254, "learning_rate": 7.410515295192068e-07, "loss": 0.83993995, "num_input_tokens_seen": 260260015, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.33984375, "step": 12062, "time_per_iteration": 2.330665349960327 }, { "auxiliary_loss_clip": 0.01056575, "auxiliary_loss_mlp": 0.01041106, "balance_loss_clip": 1.01455832, "balance_loss_mlp": 1.01818371, "epoch": 0.7252667969337141, "flos": 25701835167360.0, "grad_norm": 2.481886655950343, "language_loss": 0.71220726, "learning_rate": 7.407489333471262e-07, "loss": 0.73318404, "num_input_tokens_seen": 260278635, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.38476562, "step": 12063, "time_per_iteration": 2.473924398422241 }, { "auxiliary_loss_clip": 0.0105087, "auxiliary_loss_mlp": 0.01033708, "balance_loss_clip": 1.01116586, "balance_loss_mlp": 1.01570535, "epoch": 0.7253269201863821, "flos": 18258945995520.0, "grad_norm": 1.514710653909478, "language_loss": 0.71150589, "learning_rate": 7.40446384925973e-07, "loss": 0.73235166, "num_input_tokens_seen": 260298510, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.3515625, "step": 12064, "time_per_iteration": 2.366633653640747 }, { "auxiliary_loss_clip": 0.0105461, "auxiliary_loss_mlp": 0.0103436, "balance_loss_clip": 1.01112604, "balance_loss_mlp": 1.01791215, "epoch": 0.72538704343905, "flos": 20410782865920.0, "grad_norm": 1.904574771201914, "language_loss": 0.91760802, "learning_rate": 7.401438842672192e-07, "loss": 0.93849772, "num_input_tokens_seen": 260317405, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3671875, "step": 12065, "time_per_iteration": 2.3930938243865967 }, { "auxiliary_loss_clip": 0.01007736, "auxiliary_loss_mlp": 0.01003843, "balance_loss_clip": 1.00126839, "balance_loss_mlp": 1.00110173, "epoch": 0.725447166691718, "flos": 70147524470400.0, "grad_norm": 0.7266824611170254, "language_loss": 0.56078953, "learning_rate": 7.398414313823349e-07, "loss": 0.58090538, "num_input_tokens_seen": 260388085, "router_z_loss_clip": 0.02575684, "router_z_loss_mlp": 0.06640625, "step": 12066, "time_per_iteration": 3.1739885807037354 }, { "auxiliary_loss_clip": 0.01051299, "auxiliary_loss_mlp": 0.01038778, "balance_loss_clip": 1.01619959, "balance_loss_mlp": 1.01545858, "epoch": 0.725507289944386, "flos": 27051123548160.0, "grad_norm": 1.9210499970111274, "language_loss": 0.77645183, "learning_rate": 7.395390262827897e-07, "loss": 0.79735267, "num_input_tokens_seen": 260406165, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35742188, "step": 12067, "time_per_iteration": 2.402179002761841 }, { "auxiliary_loss_clip": 0.01007953, "auxiliary_loss_mlp": 0.01004614, "balance_loss_clip": 1.00199163, "balance_loss_mlp": 1.00123119, "epoch": 0.725567413197054, "flos": 62918583310080.0, "grad_norm": 0.7293650450051737, "language_loss": 0.5714488, "learning_rate": 7.392366689800515e-07, "loss": 0.59157449, "num_input_tokens_seen": 260461365, "router_z_loss_clip": 0.02624512, "router_z_loss_mlp": 0.06738281, "step": 12068, "time_per_iteration": 2.931572437286377 }, { "auxiliary_loss_clip": 0.01007817, "auxiliary_loss_mlp": 0.01003178, "balance_loss_clip": 1.0008055, "balance_loss_mlp": 1.0011251, "epoch": 0.7256275364497219, "flos": 60292660821120.0, "grad_norm": 0.6627269736067521, "language_loss": 0.55505764, "learning_rate": 7.389343594855848e-07, "loss": 0.57516754, "num_input_tokens_seen": 260523795, "router_z_loss_clip": 0.02368164, "router_z_loss_mlp": 0.06689453, "step": 12069, "time_per_iteration": 3.0457684993743896 }, { "auxiliary_loss_clip": 0.010493, "auxiliary_loss_mlp": 0.01038798, "balance_loss_clip": 1.01711392, "balance_loss_mlp": 1.01508129, "epoch": 0.7256876597023899, "flos": 24497366572800.0, "grad_norm": 1.817564220487277, "language_loss": 0.80875921, "learning_rate": 7.38632097810854e-07, "loss": 0.82964021, "num_input_tokens_seen": 260544765, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34179688, "step": 12070, "time_per_iteration": 2.3818652629852295 }, { "auxiliary_loss_clip": 0.01049581, "auxiliary_loss_mlp": 0.01039205, "balance_loss_clip": 1.01667452, "balance_loss_mlp": 1.01595068, "epoch": 0.7257477829550578, "flos": 24351604179840.0, "grad_norm": 1.7187918768560777, "language_loss": 0.73026597, "learning_rate": 7.383298839673197e-07, "loss": 0.75115383, "num_input_tokens_seen": 260564340, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3359375, "step": 12071, "time_per_iteration": 2.3952651023864746 }, { "auxiliary_loss_clip": 0.0105201, "auxiliary_loss_mlp": 0.01041526, "balance_loss_clip": 1.01849508, "balance_loss_mlp": 1.01651657, "epoch": 0.7258079062077258, "flos": 17201252223360.0, "grad_norm": 1.7701672165431859, "language_loss": 0.70376706, "learning_rate": 7.380277179664436e-07, "loss": 0.72470248, "num_input_tokens_seen": 260582565, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35546875, "step": 12072, "time_per_iteration": 2.3528645038604736 }, { "auxiliary_loss_clip": 0.01053678, "auxiliary_loss_mlp": 0.01037352, "balance_loss_clip": 1.01336682, "balance_loss_mlp": 1.01612926, "epoch": 0.7258680294603939, "flos": 21579255982080.0, "grad_norm": 1.7219521399119946, "language_loss": 0.79114377, "learning_rate": 7.377255998196821e-07, "loss": 0.8120541, "num_input_tokens_seen": 260601700, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.375, "step": 12073, "time_per_iteration": 2.4200217723846436 }, { "auxiliary_loss_clip": 0.01051202, "auxiliary_loss_mlp": 0.01035112, "balance_loss_clip": 1.01141298, "balance_loss_mlp": 1.01641202, "epoch": 0.7259281527130618, "flos": 34854107149440.0, "grad_norm": 1.3919328880711501, "language_loss": 0.71109104, "learning_rate": 7.374235295384923e-07, "loss": 0.7319541, "num_input_tokens_seen": 260623040, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.34765625, "step": 12074, "time_per_iteration": 2.4574480056762695 }, { "auxiliary_loss_clip": 0.01053047, "auxiliary_loss_mlp": 0.01037164, "balance_loss_clip": 1.01370382, "balance_loss_mlp": 1.01622427, "epoch": 0.7259882759657298, "flos": 25403642311680.0, "grad_norm": 1.6314678999765204, "language_loss": 0.75358188, "learning_rate": 7.371215071343302e-07, "loss": 0.77448404, "num_input_tokens_seen": 260642735, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36914062, "step": 12075, "time_per_iteration": 2.400881052017212 }, { "auxiliary_loss_clip": 0.01052944, "auxiliary_loss_mlp": 0.01045353, "balance_loss_clip": 1.0205338, "balance_loss_mlp": 1.0162394, "epoch": 0.7260483992183977, "flos": 62951438632320.0, "grad_norm": 1.5546676816335279, "language_loss": 0.64523792, "learning_rate": 7.368195326186458e-07, "loss": 0.6662209, "num_input_tokens_seen": 260669935, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3671875, "step": 12076, "time_per_iteration": 2.7409183979034424 }, { "auxiliary_loss_clip": 0.01053123, "auxiliary_loss_mlp": 0.01038102, "balance_loss_clip": 1.01402187, "balance_loss_mlp": 1.0163281, "epoch": 0.7261085224710657, "flos": 26466363319680.0, "grad_norm": 2.043483374514626, "language_loss": 0.79148972, "learning_rate": 7.365176060028912e-07, "loss": 0.81240201, "num_input_tokens_seen": 260689605, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.3671875, "step": 12077, "time_per_iteration": 2.42322039604187 }, { "auxiliary_loss_clip": 0.01008115, "auxiliary_loss_mlp": 0.01003701, "balance_loss_clip": 1.00088811, "balance_loss_mlp": 1.00126696, "epoch": 0.7261686457237336, "flos": 66769748519040.0, "grad_norm": 0.8889327412650297, "language_loss": 0.65127778, "learning_rate": 7.362157272985163e-07, "loss": 0.67139602, "num_input_tokens_seen": 260748265, "router_z_loss_clip": 0.02807617, "router_z_loss_mlp": 0.06835938, "step": 12078, "time_per_iteration": 4.3908514976501465 }, { "auxiliary_loss_clip": 0.01007821, "auxiliary_loss_mlp": 0.0100404, "balance_loss_clip": 1.00129783, "balance_loss_mlp": 1.00107729, "epoch": 0.7262287689764017, "flos": 69996071525760.0, "grad_norm": 0.7182445247988756, "language_loss": 0.59316051, "learning_rate": 7.359138965169671e-07, "loss": 0.6132791, "num_input_tokens_seen": 260816715, "router_z_loss_clip": 0.02746582, "router_z_loss_mlp": 0.06738281, "step": 12079, "time_per_iteration": 3.1319475173950195 }, { "auxiliary_loss_clip": 0.01051966, "auxiliary_loss_mlp": 0.01037702, "balance_loss_clip": 1.01341915, "balance_loss_mlp": 1.01601601, "epoch": 0.7262888922290696, "flos": 23804305706880.0, "grad_norm": 2.340765176572212, "language_loss": 0.65823865, "learning_rate": 7.356121136696895e-07, "loss": 0.67913532, "num_input_tokens_seen": 260836765, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.359375, "step": 12080, "time_per_iteration": 2.3932976722717285 }, { "auxiliary_loss_clip": 0.01052141, "auxiliary_loss_mlp": 0.01036134, "balance_loss_clip": 1.01206565, "balance_loss_mlp": 1.01536369, "epoch": 0.7263490154817376, "flos": 19499305334400.0, "grad_norm": 2.285142198759051, "language_loss": 0.71551055, "learning_rate": 7.35310378768128e-07, "loss": 0.73639327, "num_input_tokens_seen": 260854610, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.3671875, "step": 12081, "time_per_iteration": 2.345550537109375 }, { "auxiliary_loss_clip": 0.01056084, "auxiliary_loss_mlp": 0.01041871, "balance_loss_clip": 1.01696777, "balance_loss_mlp": 1.01741064, "epoch": 0.7264091387344055, "flos": 16285410771840.0, "grad_norm": 1.7399318877580308, "language_loss": 0.8245008, "learning_rate": 7.350086918237237e-07, "loss": 0.84548032, "num_input_tokens_seen": 260871620, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.38671875, "step": 12082, "time_per_iteration": 3.651632070541382 }, { "auxiliary_loss_clip": 0.01055842, "auxiliary_loss_mlp": 0.01039328, "balance_loss_clip": 1.01318514, "balance_loss_mlp": 1.01702797, "epoch": 0.7264692619870735, "flos": 24350905952640.0, "grad_norm": 1.6563487724561143, "language_loss": 0.78192961, "learning_rate": 7.347070528479158e-07, "loss": 0.80288136, "num_input_tokens_seen": 260890490, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38867188, "step": 12083, "time_per_iteration": 2.3894126415252686 }, { "auxiliary_loss_clip": 0.01054568, "auxiliary_loss_mlp": 0.01039268, "balance_loss_clip": 1.01440144, "balance_loss_mlp": 1.01743078, "epoch": 0.7265293852397414, "flos": 25118296836480.0, "grad_norm": 1.954459923552141, "language_loss": 0.73861039, "learning_rate": 7.344054618521433e-07, "loss": 0.75954878, "num_input_tokens_seen": 260909700, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37109375, "step": 12084, "time_per_iteration": 3.880523681640625 }, { "auxiliary_loss_clip": 0.01055904, "auxiliary_loss_mlp": 0.01040448, "balance_loss_clip": 1.0160464, "balance_loss_mlp": 1.01734459, "epoch": 0.7265895084924094, "flos": 22637124311040.0, "grad_norm": 1.7719063824584447, "language_loss": 0.78790271, "learning_rate": 7.34103918847843e-07, "loss": 0.80886626, "num_input_tokens_seen": 260929090, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.38476562, "step": 12085, "time_per_iteration": 2.4177029132843018 }, { "auxiliary_loss_clip": 0.010542, "auxiliary_loss_mlp": 0.01044265, "balance_loss_clip": 1.02042305, "balance_loss_mlp": 1.01681602, "epoch": 0.7266496317450775, "flos": 23367088350720.0, "grad_norm": 2.054156698492528, "language_loss": 0.73498178, "learning_rate": 7.338024238464493e-07, "loss": 0.75596642, "num_input_tokens_seen": 260946615, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.375, "step": 12086, "time_per_iteration": 2.3588883876800537 }, { "auxiliary_loss_clip": 0.01053392, "auxiliary_loss_mlp": 0.01039883, "balance_loss_clip": 1.01624393, "balance_loss_mlp": 1.0169189, "epoch": 0.7267097549977454, "flos": 28073345512320.0, "grad_norm": 1.6480431378526412, "language_loss": 0.70691156, "learning_rate": 7.335009768593938e-07, "loss": 0.72784424, "num_input_tokens_seen": 260968515, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36523438, "step": 12087, "time_per_iteration": 2.4229543209075928 }, { "auxiliary_loss_clip": 0.01053736, "auxiliary_loss_mlp": 0.0104421, "balance_loss_clip": 1.01879501, "balance_loss_mlp": 1.01680052, "epoch": 0.7267698782504134, "flos": 22194565516800.0, "grad_norm": 1.7200362704572183, "language_loss": 0.79806316, "learning_rate": 7.331995778981088e-07, "loss": 0.81904256, "num_input_tokens_seen": 260986790, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36914062, "step": 12088, "time_per_iteration": 2.3559067249298096 }, { "auxiliary_loss_clip": 0.01052677, "auxiliary_loss_mlp": 0.01036891, "balance_loss_clip": 1.01391888, "balance_loss_mlp": 1.01575303, "epoch": 0.7268300015030813, "flos": 18513881809920.0, "grad_norm": 1.7455296845044221, "language_loss": 0.74667805, "learning_rate": 7.328982269740221e-07, "loss": 0.76757371, "num_input_tokens_seen": 261004925, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.36914062, "step": 12089, "time_per_iteration": 2.3432788848876953 }, { "auxiliary_loss_clip": 0.01053388, "auxiliary_loss_mlp": 0.01036019, "balance_loss_clip": 1.01401341, "balance_loss_mlp": 1.01655746, "epoch": 0.7268901247557493, "flos": 23984946414720.0, "grad_norm": 2.199131952528735, "language_loss": 0.71970463, "learning_rate": 7.325969240985616e-07, "loss": 0.74059868, "num_input_tokens_seen": 261023895, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.3671875, "step": 12090, "time_per_iteration": 2.369866371154785 }, { "auxiliary_loss_clip": 0.01054037, "auxiliary_loss_mlp": 0.01046391, "balance_loss_clip": 1.02065349, "balance_loss_mlp": 1.01670098, "epoch": 0.7269502480084172, "flos": 32086716364800.0, "grad_norm": 1.5934558642885301, "language_loss": 0.77605617, "learning_rate": 7.322956692831528e-07, "loss": 0.79706043, "num_input_tokens_seen": 261045445, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37304688, "step": 12091, "time_per_iteration": 2.456702709197998 }, { "auxiliary_loss_clip": 0.01051087, "auxiliary_loss_mlp": 0.01037368, "balance_loss_clip": 1.01353765, "balance_loss_mlp": 1.01516771, "epoch": 0.7270103712610853, "flos": 19061773776000.0, "grad_norm": 1.916543567140568, "language_loss": 0.72589922, "learning_rate": 7.319944625392205e-07, "loss": 0.74678379, "num_input_tokens_seen": 261064275, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.359375, "step": 12092, "time_per_iteration": 2.358867883682251 }, { "auxiliary_loss_clip": 0.01051876, "auxiliary_loss_mlp": 0.01034984, "balance_loss_clip": 1.01147568, "balance_loss_mlp": 1.01614594, "epoch": 0.7270704945137532, "flos": 34531474475520.0, "grad_norm": 2.114893176169776, "language_loss": 0.62292612, "learning_rate": 7.31693303878184e-07, "loss": 0.64379472, "num_input_tokens_seen": 261083310, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35742188, "step": 12093, "time_per_iteration": 2.4600799083709717 }, { "auxiliary_loss_clip": 0.0105222, "auxiliary_loss_mlp": 0.01033891, "balance_loss_clip": 1.01037073, "balance_loss_mlp": 1.01629817, "epoch": 0.7271306177664212, "flos": 21506496975360.0, "grad_norm": 1.6679603936347902, "language_loss": 0.76016557, "learning_rate": 7.313921933114644e-07, "loss": 0.78102672, "num_input_tokens_seen": 261103460, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.359375, "step": 12094, "time_per_iteration": 2.3829593658447266 }, { "auxiliary_loss_clip": 0.01052097, "auxiliary_loss_mlp": 0.01037572, "balance_loss_clip": 1.01560187, "balance_loss_mlp": 1.01675797, "epoch": 0.7271907410190891, "flos": 22271374241280.0, "grad_norm": 1.8263038390688884, "language_loss": 0.86081052, "learning_rate": 7.310911308504808e-07, "loss": 0.88170719, "num_input_tokens_seen": 261121375, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35351562, "step": 12095, "time_per_iteration": 3.8250982761383057 }, { "auxiliary_loss_clip": 0.01051753, "auxiliary_loss_mlp": 0.01038468, "balance_loss_clip": 1.01443529, "balance_loss_mlp": 1.01521516, "epoch": 0.7272508642717571, "flos": 22892025214080.0, "grad_norm": 1.7387656873931483, "language_loss": 0.79225683, "learning_rate": 7.307901165066479e-07, "loss": 0.81315911, "num_input_tokens_seen": 261141105, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36523438, "step": 12096, "time_per_iteration": 2.3697991371154785 }, { "auxiliary_loss_clip": 0.01052696, "auxiliary_loss_mlp": 0.01044006, "balance_loss_clip": 1.02046239, "balance_loss_mlp": 1.01661634, "epoch": 0.727310987524425, "flos": 11655089550720.0, "grad_norm": 2.1922625805428666, "language_loss": 0.73473608, "learning_rate": 7.30489150291381e-07, "loss": 0.75570309, "num_input_tokens_seen": 261159255, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.359375, "step": 12097, "time_per_iteration": 2.3568108081817627 }, { "auxiliary_loss_clip": 0.01053691, "auxiliary_loss_mlp": 0.01043046, "balance_loss_clip": 1.01848865, "balance_loss_mlp": 1.0163337, "epoch": 0.727371110777093, "flos": 24534165012480.0, "grad_norm": 1.8088019000227107, "language_loss": 0.78396338, "learning_rate": 7.301882322160935e-07, "loss": 0.80493081, "num_input_tokens_seen": 261177960, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37304688, "step": 12098, "time_per_iteration": 2.373223066329956 }, { "auxiliary_loss_clip": 0.01053542, "auxiliary_loss_mlp": 0.01037033, "balance_loss_clip": 1.01313174, "balance_loss_mlp": 1.01583362, "epoch": 0.7274312340297611, "flos": 74737278691200.0, "grad_norm": 1.6455616993379154, "language_loss": 0.68152779, "learning_rate": 7.298873622921952e-07, "loss": 0.70243353, "num_input_tokens_seen": 261205660, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37695312, "step": 12099, "time_per_iteration": 2.778770923614502 }, { "auxiliary_loss_clip": 0.01054778, "auxiliary_loss_mlp": 0.01039392, "balance_loss_clip": 1.01360714, "balance_loss_mlp": 1.01603878, "epoch": 0.727491357282429, "flos": 22341864009600.0, "grad_norm": 1.9425455507501495, "language_loss": 0.73617136, "learning_rate": 7.29586540531095e-07, "loss": 0.7571131, "num_input_tokens_seen": 261225185, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38867188, "step": 12100, "time_per_iteration": 2.357714891433716 }, { "auxiliary_loss_clip": 0.01051692, "auxiliary_loss_mlp": 0.01039157, "balance_loss_clip": 1.01657867, "balance_loss_mlp": 1.01634181, "epoch": 0.727551480535097, "flos": 23296354202880.0, "grad_norm": 1.4150905993705087, "language_loss": 0.7553128, "learning_rate": 7.292857669442005e-07, "loss": 0.77622122, "num_input_tokens_seen": 261247965, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35351562, "step": 12101, "time_per_iteration": 2.4215524196624756 }, { "auxiliary_loss_clip": 0.01053869, "auxiliary_loss_mlp": 0.01038284, "balance_loss_clip": 1.01673138, "balance_loss_mlp": 1.01759315, "epoch": 0.7276116037877649, "flos": 21469489067520.0, "grad_norm": 1.7845676767026417, "language_loss": 0.83884335, "learning_rate": 7.289850415429177e-07, "loss": 0.85976487, "num_input_tokens_seen": 261267585, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.36328125, "step": 12102, "time_per_iteration": 2.352703332901001 }, { "auxiliary_loss_clip": 0.01050704, "auxiliary_loss_mlp": 0.01033474, "balance_loss_clip": 1.01143265, "balance_loss_mlp": 1.01546502, "epoch": 0.7276717270404329, "flos": 21463170111360.0, "grad_norm": 1.9998261212245683, "language_loss": 0.83337152, "learning_rate": 7.286843643386495e-07, "loss": 0.8542133, "num_input_tokens_seen": 261285200, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35351562, "step": 12103, "time_per_iteration": 2.3677995204925537 }, { "auxiliary_loss_clip": 0.01053213, "auxiliary_loss_mlp": 0.01034106, "balance_loss_clip": 1.01118183, "balance_loss_mlp": 1.01757193, "epoch": 0.7277318502931008, "flos": 16836270203520.0, "grad_norm": 1.689926741909336, "language_loss": 0.6802603, "learning_rate": 7.283837353427968e-07, "loss": 0.70113349, "num_input_tokens_seen": 261303645, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35546875, "step": 12104, "time_per_iteration": 2.34486722946167 }, { "auxiliary_loss_clip": 0.01049634, "auxiliary_loss_mlp": 0.01038003, "balance_loss_clip": 1.01528192, "balance_loss_mlp": 1.0155282, "epoch": 0.7277919735457689, "flos": 33399171394560.0, "grad_norm": 2.119222022032666, "language_loss": 0.66784686, "learning_rate": 7.280831545667611e-07, "loss": 0.68872321, "num_input_tokens_seen": 261323265, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.33984375, "step": 12105, "time_per_iteration": 2.4764440059661865 }, { "auxiliary_loss_clip": 0.01052466, "auxiliary_loss_mlp": 0.01040376, "balance_loss_clip": 1.0171901, "balance_loss_mlp": 1.01674438, "epoch": 0.7278520967984368, "flos": 19205546221440.0, "grad_norm": 2.9474830135266354, "language_loss": 0.76902974, "learning_rate": 7.27782622021939e-07, "loss": 0.78995812, "num_input_tokens_seen": 261339745, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35742188, "step": 12106, "time_per_iteration": 2.3332674503326416 }, { "auxiliary_loss_clip": 0.0105446, "auxiliary_loss_mlp": 0.01041508, "balance_loss_clip": 1.01763034, "balance_loss_mlp": 1.01655746, "epoch": 0.7279122200511048, "flos": 34093244689920.0, "grad_norm": 2.062094764470932, "language_loss": 0.71528685, "learning_rate": 7.274821377197273e-07, "loss": 0.73624647, "num_input_tokens_seen": 261359310, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37890625, "step": 12107, "time_per_iteration": 2.4844741821289062 }, { "auxiliary_loss_clip": 0.01051513, "auxiliary_loss_mlp": 0.01038014, "balance_loss_clip": 1.01487517, "balance_loss_mlp": 1.01613927, "epoch": 0.7279723433037727, "flos": 54597071928960.0, "grad_norm": 1.4390570258688409, "language_loss": 0.76178432, "learning_rate": 7.271817016715205e-07, "loss": 0.78267956, "num_input_tokens_seen": 261384640, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35351562, "step": 12108, "time_per_iteration": 2.69358229637146 }, { "auxiliary_loss_clip": 0.01052055, "auxiliary_loss_mlp": 0.01034195, "balance_loss_clip": 1.01117516, "balance_loss_mlp": 1.0157783, "epoch": 0.7280324665564407, "flos": 36136012872960.0, "grad_norm": 1.4242662346110704, "language_loss": 0.67831951, "learning_rate": 7.268813138887124e-07, "loss": 0.69918203, "num_input_tokens_seen": 261405290, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36328125, "step": 12109, "time_per_iteration": 2.4926018714904785 }, { "auxiliary_loss_clip": 0.01053314, "auxiliary_loss_mlp": 0.0104122, "balance_loss_clip": 1.0167942, "balance_loss_mlp": 1.01640105, "epoch": 0.7280925898091086, "flos": 11617767440640.0, "grad_norm": 1.9398687614112124, "language_loss": 0.64514256, "learning_rate": 7.265809743826912e-07, "loss": 0.66608787, "num_input_tokens_seen": 261419710, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36914062, "step": 12110, "time_per_iteration": 2.319761276245117 }, { "auxiliary_loss_clip": 0.01052573, "auxiliary_loss_mlp": 0.01036884, "balance_loss_clip": 1.01092005, "balance_loss_mlp": 1.01533711, "epoch": 0.7281527130617766, "flos": 34275665877120.0, "grad_norm": 2.4885693530315787, "language_loss": 0.59938371, "learning_rate": 7.26280683164847e-07, "loss": 0.62027824, "num_input_tokens_seen": 261442385, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37304688, "step": 12111, "time_per_iteration": 2.4759747982025146 }, { "auxiliary_loss_clip": 0.01053507, "auxiliary_loss_mlp": 0.01037767, "balance_loss_clip": 1.01287627, "balance_loss_mlp": 1.01648378, "epoch": 0.7282128363144446, "flos": 13917182094720.0, "grad_norm": 2.238178897883566, "language_loss": 0.75717986, "learning_rate": 7.259804402465677e-07, "loss": 0.77809262, "num_input_tokens_seen": 261459805, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36914062, "step": 12112, "time_per_iteration": 2.3309481143951416 }, { "auxiliary_loss_clip": 0.01050775, "auxiliary_loss_mlp": 0.01034056, "balance_loss_clip": 1.01197886, "balance_loss_mlp": 1.01585388, "epoch": 0.7282729595671126, "flos": 20776567847040.0, "grad_norm": 2.5295613431338535, "language_loss": 0.67944014, "learning_rate": 7.25680245639237e-07, "loss": 0.70028841, "num_input_tokens_seen": 261477175, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34960938, "step": 12113, "time_per_iteration": 2.349595546722412 }, { "auxiliary_loss_clip": 0.01053263, "auxiliary_loss_mlp": 0.01035216, "balance_loss_clip": 1.0121727, "balance_loss_mlp": 1.01676393, "epoch": 0.7283330828197806, "flos": 16324513361280.0, "grad_norm": 1.6347277970916918, "language_loss": 0.74227595, "learning_rate": 7.253800993542399e-07, "loss": 0.76316071, "num_input_tokens_seen": 261494990, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36523438, "step": 12114, "time_per_iteration": 2.3481242656707764 }, { "auxiliary_loss_clip": 0.01052308, "auxiliary_loss_mlp": 0.01037394, "balance_loss_clip": 1.01389742, "balance_loss_mlp": 1.01607859, "epoch": 0.7283932060724485, "flos": 27488969308800.0, "grad_norm": 1.9288072852027034, "language_loss": 0.69325888, "learning_rate": 7.250800014029564e-07, "loss": 0.71415597, "num_input_tokens_seen": 261514445, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 12115, "time_per_iteration": 2.404755115509033 }, { "auxiliary_loss_clip": 0.01055823, "auxiliary_loss_mlp": 0.01038915, "balance_loss_clip": 1.01431, "balance_loss_mlp": 1.01658392, "epoch": 0.7284533293251165, "flos": 18366932430720.0, "grad_norm": 1.8805989086384283, "language_loss": 0.61362433, "learning_rate": 7.247799517967674e-07, "loss": 0.63457167, "num_input_tokens_seen": 261533565, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.39257812, "step": 12116, "time_per_iteration": 2.379642963409424 }, { "auxiliary_loss_clip": 0.01051501, "auxiliary_loss_mlp": 0.01037569, "balance_loss_clip": 1.01283252, "balance_loss_mlp": 1.01586294, "epoch": 0.7285134525777844, "flos": 21724459793280.0, "grad_norm": 1.8120744669971138, "language_loss": 0.74126792, "learning_rate": 7.2447995054705e-07, "loss": 0.76215863, "num_input_tokens_seen": 261553795, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.35546875, "step": 12117, "time_per_iteration": 2.3625385761260986 }, { "auxiliary_loss_clip": 0.01053218, "auxiliary_loss_mlp": 0.0103596, "balance_loss_clip": 1.01359642, "balance_loss_mlp": 1.01666307, "epoch": 0.7285735758304525, "flos": 20740293077760.0, "grad_norm": 1.8225282082865497, "language_loss": 0.70657814, "learning_rate": 7.241799976651807e-07, "loss": 0.72746992, "num_input_tokens_seen": 261572565, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.36523438, "step": 12118, "time_per_iteration": 3.6344995498657227 }, { "auxiliary_loss_clip": 0.01048907, "auxiliary_loss_mlp": 0.01034447, "balance_loss_clip": 1.01381195, "balance_loss_mlp": 1.01460993, "epoch": 0.7286336990831204, "flos": 17310006708480.0, "grad_norm": 1.641483226712756, "language_loss": 0.85310364, "learning_rate": 7.238800931625346e-07, "loss": 0.87393719, "num_input_tokens_seen": 261590910, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.34375, "step": 12119, "time_per_iteration": 2.376063108444214 }, { "auxiliary_loss_clip": 0.01053845, "auxiliary_loss_mlp": 0.01036121, "balance_loss_clip": 1.01233888, "balance_loss_mlp": 1.01712704, "epoch": 0.7286938223357884, "flos": 19786501111680.0, "grad_norm": 2.134266128104077, "language_loss": 0.82924283, "learning_rate": 7.235802370504831e-07, "loss": 0.85014248, "num_input_tokens_seen": 261606005, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3671875, "step": 12120, "time_per_iteration": 2.326993465423584 }, { "auxiliary_loss_clip": 0.01052828, "auxiliary_loss_mlp": 0.01041863, "balance_loss_clip": 1.01872396, "balance_loss_mlp": 1.01661038, "epoch": 0.7287539455884563, "flos": 15339962620800.0, "grad_norm": 1.8083110069762862, "language_loss": 0.80019456, "learning_rate": 7.232804293403963e-07, "loss": 0.82114142, "num_input_tokens_seen": 261622305, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.36132812, "step": 12121, "time_per_iteration": 2.3832247257232666 }, { "auxiliary_loss_clip": 0.01054174, "auxiliary_loss_mlp": 0.01036622, "balance_loss_clip": 1.01167178, "balance_loss_mlp": 1.01671278, "epoch": 0.7288140688411243, "flos": 25191300222720.0, "grad_norm": 1.6364896179781765, "language_loss": 0.70514709, "learning_rate": 7.229806700436441e-07, "loss": 0.72605503, "num_input_tokens_seen": 261642465, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.375, "step": 12122, "time_per_iteration": 3.801983594894409 }, { "auxiliary_loss_clip": 0.01049839, "auxiliary_loss_mlp": 0.01037071, "balance_loss_clip": 1.01383722, "balance_loss_mlp": 1.01483274, "epoch": 0.7288741920937922, "flos": 23983131024000.0, "grad_norm": 1.67795094646173, "language_loss": 0.87962389, "learning_rate": 7.226809591715923e-07, "loss": 0.90049297, "num_input_tokens_seen": 261661420, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.34960938, "step": 12123, "time_per_iteration": 2.38325572013855 }, { "auxiliary_loss_clip": 0.01052874, "auxiliary_loss_mlp": 0.01038665, "balance_loss_clip": 1.01639676, "balance_loss_mlp": 1.01651716, "epoch": 0.7289343153464602, "flos": 22743888848640.0, "grad_norm": 1.8092577592495658, "language_loss": 0.83109975, "learning_rate": 7.223812967356065e-07, "loss": 0.85201514, "num_input_tokens_seen": 261680865, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.36328125, "step": 12124, "time_per_iteration": 3.8626708984375 }, { "auxiliary_loss_clip": 0.01051706, "auxiliary_loss_mlp": 0.01035993, "balance_loss_clip": 1.01256824, "balance_loss_mlp": 1.01554108, "epoch": 0.7289944385991282, "flos": 24898867741440.0, "grad_norm": 1.9118809039784581, "language_loss": 0.68659747, "learning_rate": 7.220816827470499e-07, "loss": 0.70747441, "num_input_tokens_seen": 261701455, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36132812, "step": 12125, "time_per_iteration": 2.4090561866760254 }, { "auxiliary_loss_clip": 0.01053152, "auxiliary_loss_mlp": 0.01040972, "balance_loss_clip": 1.01596165, "balance_loss_mlp": 1.01505375, "epoch": 0.7290545618517962, "flos": 22965936295680.0, "grad_norm": 1.7384787789636396, "language_loss": 0.7634294, "learning_rate": 7.217821172172855e-07, "loss": 0.78437066, "num_input_tokens_seen": 261721260, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38085938, "step": 12126, "time_per_iteration": 2.3931398391723633 }, { "auxiliary_loss_clip": 0.01008365, "auxiliary_loss_mlp": 0.01003958, "balance_loss_clip": 1.00120389, "balance_loss_mlp": 1.00120711, "epoch": 0.7291146851044642, "flos": 61898176736640.0, "grad_norm": 0.8190739063758912, "language_loss": 0.58714592, "learning_rate": 7.2148260015767e-07, "loss": 0.60726917, "num_input_tokens_seen": 261779370, "router_z_loss_clip": 0.02758789, "router_z_loss_mlp": 0.07128906, "step": 12127, "time_per_iteration": 2.946786642074585 }, { "auxiliary_loss_clip": 0.0105279, "auxiliary_loss_mlp": 0.01039738, "balance_loss_clip": 1.01720715, "balance_loss_mlp": 1.01755691, "epoch": 0.7291748083571321, "flos": 23329836063360.0, "grad_norm": 2.2902889624258544, "language_loss": 0.70507169, "learning_rate": 7.21183131579562e-07, "loss": 0.72599697, "num_input_tokens_seen": 261798050, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.3515625, "step": 12128, "time_per_iteration": 2.417213201522827 }, { "auxiliary_loss_clip": 0.01053958, "auxiliary_loss_mlp": 0.01048737, "balance_loss_clip": 1.02419186, "balance_loss_mlp": 1.01682711, "epoch": 0.7292349316098001, "flos": 28328735174400.0, "grad_norm": 2.2452095585546683, "language_loss": 0.66475266, "learning_rate": 7.20883711494319e-07, "loss": 0.68577957, "num_input_tokens_seen": 261817660, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37109375, "step": 12129, "time_per_iteration": 2.406158685684204 }, { "auxiliary_loss_clip": 0.01049401, "auxiliary_loss_mlp": 0.01034044, "balance_loss_clip": 1.01110816, "balance_loss_mlp": 1.01516449, "epoch": 0.729295054862468, "flos": 24131127744000.0, "grad_norm": 2.009497000525627, "language_loss": 0.74955618, "learning_rate": 7.205843399132927e-07, "loss": 0.77039063, "num_input_tokens_seen": 261837935, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.34179688, "step": 12130, "time_per_iteration": 2.4020779132843018 }, { "auxiliary_loss_clip": 0.01052175, "auxiliary_loss_mlp": 0.01040227, "balance_loss_clip": 1.01538396, "balance_loss_mlp": 1.01576877, "epoch": 0.7293551781151361, "flos": 22815251400960.0, "grad_norm": 1.7034359172970466, "language_loss": 0.70810544, "learning_rate": 7.202850168478374e-07, "loss": 0.72902942, "num_input_tokens_seen": 261857575, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36328125, "step": 12131, "time_per_iteration": 2.365910291671753 }, { "auxiliary_loss_clip": 0.01052142, "auxiliary_loss_mlp": 0.01034153, "balance_loss_clip": 1.01163483, "balance_loss_mlp": 1.01686263, "epoch": 0.729415301367804, "flos": 22125611848320.0, "grad_norm": 1.5720379445330195, "language_loss": 0.7788192, "learning_rate": 7.199857423093025e-07, "loss": 0.79968214, "num_input_tokens_seen": 261877265, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3515625, "step": 12132, "time_per_iteration": 2.36641263961792 }, { "auxiliary_loss_clip": 0.01054054, "auxiliary_loss_mlp": 0.01045134, "balance_loss_clip": 1.02119684, "balance_loss_mlp": 1.01795161, "epoch": 0.729475424620472, "flos": 12348778821120.0, "grad_norm": 1.9856790212136735, "language_loss": 0.80133665, "learning_rate": 7.196865163090358e-07, "loss": 0.82232845, "num_input_tokens_seen": 261893695, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36132812, "step": 12133, "time_per_iteration": 2.343222141265869 }, { "auxiliary_loss_clip": 0.01053462, "auxiliary_loss_mlp": 0.01038818, "balance_loss_clip": 1.01573968, "balance_loss_mlp": 1.01601577, "epoch": 0.7295355478731399, "flos": 22194356048640.0, "grad_norm": 1.887189697844182, "language_loss": 0.72904593, "learning_rate": 7.193873388583846e-07, "loss": 0.74996877, "num_input_tokens_seen": 261911825, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.375, "step": 12134, "time_per_iteration": 3.7896244525909424 }, { "auxiliary_loss_clip": 0.01053597, "auxiliary_loss_mlp": 0.01040639, "balance_loss_clip": 1.01651144, "balance_loss_mlp": 1.01684678, "epoch": 0.7295956711258079, "flos": 23220907021440.0, "grad_norm": 1.610580743693987, "language_loss": 0.72308946, "learning_rate": 7.190882099686939e-07, "loss": 0.74403191, "num_input_tokens_seen": 261931190, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.3671875, "step": 12135, "time_per_iteration": 2.366248846054077 }, { "auxiliary_loss_clip": 0.0105384, "auxiliary_loss_mlp": 0.01038385, "balance_loss_clip": 1.01403058, "balance_loss_mlp": 1.01635671, "epoch": 0.7296557943784758, "flos": 31867741117440.0, "grad_norm": 2.1379055162319354, "language_loss": 0.64860904, "learning_rate": 7.187891296513075e-07, "loss": 0.66953129, "num_input_tokens_seen": 261951240, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.375, "step": 12136, "time_per_iteration": 2.4587676525115967 }, { "auxiliary_loss_clip": 0.01051475, "auxiliary_loss_mlp": 0.01037216, "balance_loss_clip": 1.01468539, "balance_loss_mlp": 1.01530385, "epoch": 0.7297159176311439, "flos": 26650495163520.0, "grad_norm": 2.096864151026659, "language_loss": 0.7623964, "learning_rate": 7.184900979175654e-07, "loss": 0.78328335, "num_input_tokens_seen": 261971605, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.36132812, "step": 12137, "time_per_iteration": 2.3877265453338623 }, { "auxiliary_loss_clip": 0.01053438, "auxiliary_loss_mlp": 0.01044498, "balance_loss_clip": 1.02169323, "balance_loss_mlp": 1.01667011, "epoch": 0.7297760408838118, "flos": 24748531960320.0, "grad_norm": 1.5823016494549402, "language_loss": 0.74860394, "learning_rate": 7.181911147788069e-07, "loss": 0.76958323, "num_input_tokens_seen": 261990830, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.3671875, "step": 12138, "time_per_iteration": 2.427959680557251 }, { "auxiliary_loss_clip": 0.01051552, "auxiliary_loss_mlp": 0.01035776, "balance_loss_clip": 1.0127331, "balance_loss_mlp": 1.01578474, "epoch": 0.7298361641364798, "flos": 18072894026880.0, "grad_norm": 2.0727470501264698, "language_loss": 0.73585564, "learning_rate": 7.178921802463702e-07, "loss": 0.75672895, "num_input_tokens_seen": 262008190, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35742188, "step": 12139, "time_per_iteration": 2.33278226852417 }, { "auxiliary_loss_clip": 0.01049833, "auxiliary_loss_mlp": 0.01032682, "balance_loss_clip": 1.01146221, "balance_loss_mlp": 1.01550817, "epoch": 0.7298962873891478, "flos": 29894380450560.0, "grad_norm": 1.5262762734107822, "language_loss": 0.74271703, "learning_rate": 7.175932943315898e-07, "loss": 0.76354223, "num_input_tokens_seen": 262030460, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34375, "step": 12140, "time_per_iteration": 2.48923397064209 }, { "auxiliary_loss_clip": 0.01053382, "auxiliary_loss_mlp": 0.01040271, "balance_loss_clip": 1.01466501, "balance_loss_mlp": 1.01621878, "epoch": 0.7299564106418157, "flos": 32264843454720.0, "grad_norm": 1.6579350473179726, "language_loss": 0.56565136, "learning_rate": 7.172944570458003e-07, "loss": 0.58658791, "num_input_tokens_seen": 262050830, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37109375, "step": 12141, "time_per_iteration": 2.4417004585266113 }, { "auxiliary_loss_clip": 0.0105066, "auxiliary_loss_mlp": 0.01032161, "balance_loss_clip": 1.0114665, "balance_loss_mlp": 1.01617229, "epoch": 0.7300165338944837, "flos": 22929172767360.0, "grad_norm": 1.4471005937910635, "language_loss": 0.73447037, "learning_rate": 7.169956684003342e-07, "loss": 0.75529861, "num_input_tokens_seen": 262071245, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.34375, "step": 12142, "time_per_iteration": 2.4076027870178223 }, { "auxiliary_loss_clip": 0.01051556, "auxiliary_loss_mlp": 0.01034573, "balance_loss_clip": 1.01340103, "balance_loss_mlp": 1.01654005, "epoch": 0.7300766571471516, "flos": 19827768205440.0, "grad_norm": 1.830109783907101, "language_loss": 0.74811351, "learning_rate": 7.16696928406521e-07, "loss": 0.76897478, "num_input_tokens_seen": 262087525, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.3515625, "step": 12143, "time_per_iteration": 2.337013006210327 }, { "auxiliary_loss_clip": 0.01052994, "auxiliary_loss_mlp": 0.01034527, "balance_loss_clip": 1.01144803, "balance_loss_mlp": 1.01625526, "epoch": 0.7301367803998197, "flos": 24346821323520.0, "grad_norm": 2.0409701863013963, "language_loss": 0.67999107, "learning_rate": 7.163982370756882e-07, "loss": 0.70086634, "num_input_tokens_seen": 262107355, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.3671875, "step": 12144, "time_per_iteration": 2.431495428085327 }, { "auxiliary_loss_clip": 0.01051722, "auxiliary_loss_mlp": 0.01039081, "balance_loss_clip": 1.01532245, "balance_loss_mlp": 1.01588094, "epoch": 0.7301969036524876, "flos": 15303618028800.0, "grad_norm": 1.7055539242422253, "language_loss": 0.80285585, "learning_rate": 7.160995944191627e-07, "loss": 0.82376385, "num_input_tokens_seen": 262125645, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.359375, "step": 12145, "time_per_iteration": 2.3372957706451416 }, { "auxiliary_loss_clip": 0.01051711, "auxiliary_loss_mlp": 0.01036739, "balance_loss_clip": 1.01433945, "balance_loss_mlp": 1.01706028, "epoch": 0.7302570269051556, "flos": 23506322319360.0, "grad_norm": 2.37061920027568, "language_loss": 0.92440832, "learning_rate": 7.158010004482702e-07, "loss": 0.94529277, "num_input_tokens_seen": 262144075, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34765625, "step": 12146, "time_per_iteration": 2.374751567840576 }, { "auxiliary_loss_clip": 0.01050162, "auxiliary_loss_mlp": 0.01037707, "balance_loss_clip": 1.01506889, "balance_loss_mlp": 1.01630998, "epoch": 0.7303171501578235, "flos": 20521981146240.0, "grad_norm": 1.7212784606519054, "language_loss": 0.62862587, "learning_rate": 7.155024551743316e-07, "loss": 0.64950448, "num_input_tokens_seen": 262165940, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.33984375, "step": 12147, "time_per_iteration": 2.439755916595459 }, { "auxiliary_loss_clip": 0.01054066, "auxiliary_loss_mlp": 0.01042975, "balance_loss_clip": 1.01841795, "balance_loss_mlp": 1.01722121, "epoch": 0.7303772734104915, "flos": 18331635179520.0, "grad_norm": 1.8801477896779148, "language_loss": 0.76363701, "learning_rate": 7.152039586086693e-07, "loss": 0.78460741, "num_input_tokens_seen": 262184520, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.36914062, "step": 12148, "time_per_iteration": 2.328608512878418 }, { "auxiliary_loss_clip": 0.01008656, "auxiliary_loss_mlp": 0.01002708, "balance_loss_clip": 1.00016856, "balance_loss_mlp": 1.00163913, "epoch": 0.7304373966631594, "flos": 60651638087040.0, "grad_norm": 0.6927880124219913, "language_loss": 0.56760401, "learning_rate": 7.149055107626017e-07, "loss": 0.58771759, "num_input_tokens_seen": 262247070, "router_z_loss_clip": 0.02539062, "router_z_loss_mlp": 0.0703125, "step": 12149, "time_per_iteration": 2.997512102127075 }, { "auxiliary_loss_clip": 0.01054409, "auxiliary_loss_mlp": 0.01038339, "balance_loss_clip": 1.01346016, "balance_loss_mlp": 1.01665008, "epoch": 0.7304975199158275, "flos": 19827069978240.0, "grad_norm": 1.781961016535131, "language_loss": 0.75387698, "learning_rate": 7.146071116474451e-07, "loss": 0.77480447, "num_input_tokens_seen": 262266605, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.37890625, "step": 12150, "time_per_iteration": 2.3715248107910156 }, { "auxiliary_loss_clip": 0.010539, "auxiliary_loss_mlp": 0.0104396, "balance_loss_clip": 1.01879454, "balance_loss_mlp": 1.01647902, "epoch": 0.7305576431684954, "flos": 13223178622080.0, "grad_norm": 2.132401682765272, "language_loss": 0.84743559, "learning_rate": 7.143087612745158e-07, "loss": 0.86841422, "num_input_tokens_seen": 262283880, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.375, "step": 12151, "time_per_iteration": 2.3284645080566406 }, { "auxiliary_loss_clip": 0.01053229, "auxiliary_loss_mlp": 0.01042766, "balance_loss_clip": 1.01805425, "balance_loss_mlp": 1.01622748, "epoch": 0.7306177664211634, "flos": 24059346255360.0, "grad_norm": 2.1976542905522916, "language_loss": 0.789639, "learning_rate": 7.14010459655127e-07, "loss": 0.81059897, "num_input_tokens_seen": 262304155, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.36914062, "step": 12152, "time_per_iteration": 2.3799543380737305 }, { "auxiliary_loss_clip": 0.01052877, "auxiliary_loss_mlp": 0.01038505, "balance_loss_clip": 1.01418614, "balance_loss_mlp": 1.01633668, "epoch": 0.7306778896738314, "flos": 27087887076480.0, "grad_norm": 1.5619615004039853, "language_loss": 0.80621451, "learning_rate": 7.137122068005919e-07, "loss": 0.82712841, "num_input_tokens_seen": 262325660, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36523438, "step": 12153, "time_per_iteration": 2.414236068725586 }, { "auxiliary_loss_clip": 0.01054747, "auxiliary_loss_mlp": 0.01039387, "balance_loss_clip": 1.01527059, "balance_loss_mlp": 1.01694918, "epoch": 0.7307380129264993, "flos": 16689739760640.0, "grad_norm": 1.7592985797593879, "language_loss": 0.68409568, "learning_rate": 7.134140027222173e-07, "loss": 0.705037, "num_input_tokens_seen": 262344075, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.37890625, "step": 12154, "time_per_iteration": 2.3269240856170654 }, { "auxiliary_loss_clip": 0.01052343, "auxiliary_loss_mlp": 0.01038139, "balance_loss_clip": 1.01355827, "balance_loss_mlp": 1.01588726, "epoch": 0.7307981361791673, "flos": 21724669261440.0, "grad_norm": 1.8236270557253473, "language_loss": 0.66827881, "learning_rate": 7.131158474313128e-07, "loss": 0.68918359, "num_input_tokens_seen": 262363305, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36328125, "step": 12155, "time_per_iteration": 2.3728766441345215 }, { "auxiliary_loss_clip": 0.01050443, "auxiliary_loss_mlp": 0.01034332, "balance_loss_clip": 1.01243377, "balance_loss_mlp": 1.01541591, "epoch": 0.7308582594318352, "flos": 18039691457280.0, "grad_norm": 2.025576139984311, "language_loss": 0.82765412, "learning_rate": 7.128177409391851e-07, "loss": 0.84850186, "num_input_tokens_seen": 262380730, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.3515625, "step": 12156, "time_per_iteration": 2.3345067501068115 }, { "auxiliary_loss_clip": 0.01050864, "auxiliary_loss_mlp": 0.01039737, "balance_loss_clip": 1.01789749, "balance_loss_mlp": 1.01603198, "epoch": 0.7309183826845033, "flos": 13844108885760.0, "grad_norm": 4.6114895222421755, "language_loss": 0.76603365, "learning_rate": 7.125196832571367e-07, "loss": 0.78693962, "num_input_tokens_seen": 262395480, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34765625, "step": 12157, "time_per_iteration": 2.3798465728759766 }, { "auxiliary_loss_clip": 0.01049259, "auxiliary_loss_mlp": 0.01032462, "balance_loss_clip": 1.01164842, "balance_loss_mlp": 1.01504803, "epoch": 0.7309785059371712, "flos": 17018272454400.0, "grad_norm": 2.6726121283910746, "language_loss": 0.75021327, "learning_rate": 7.122216743964713e-07, "loss": 0.77103049, "num_input_tokens_seen": 262413340, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.34179688, "step": 12158, "time_per_iteration": 3.7539024353027344 }, { "auxiliary_loss_clip": 0.01054235, "auxiliary_loss_mlp": 0.01039387, "balance_loss_clip": 1.01535416, "balance_loss_mlp": 1.01752424, "epoch": 0.7310386291898392, "flos": 26501276545920.0, "grad_norm": 1.7725098474960317, "language_loss": 0.86525285, "learning_rate": 7.119237143684896e-07, "loss": 0.8861891, "num_input_tokens_seen": 262433455, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 12159, "time_per_iteration": 2.398587703704834 }, { "auxiliary_loss_clip": 0.01055276, "auxiliary_loss_mlp": 0.01040947, "balance_loss_clip": 1.01599669, "balance_loss_mlp": 1.01707339, "epoch": 0.7310987524425071, "flos": 16944989777280.0, "grad_norm": 2.2488148973857847, "language_loss": 0.74616665, "learning_rate": 7.116258031844895e-07, "loss": 0.76712883, "num_input_tokens_seen": 262450335, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3828125, "step": 12160, "time_per_iteration": 2.3653366565704346 }, { "auxiliary_loss_clip": 0.0105551, "auxiliary_loss_mlp": 0.01039913, "balance_loss_clip": 1.01551104, "balance_loss_mlp": 1.01731038, "epoch": 0.7311588756951751, "flos": 13844423088000.0, "grad_norm": 2.159069857584061, "language_loss": 0.74897248, "learning_rate": 7.113279408557675e-07, "loss": 0.76992667, "num_input_tokens_seen": 262468240, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.3828125, "step": 12161, "time_per_iteration": 3.7169902324676514 }, { "auxiliary_loss_clip": 0.01056626, "auxiliary_loss_mlp": 0.01039966, "balance_loss_clip": 1.01223791, "balance_loss_mlp": 1.01731789, "epoch": 0.731218998947843, "flos": 28766615846400.0, "grad_norm": 1.6924777205830486, "language_loss": 0.71533179, "learning_rate": 7.110301273936192e-07, "loss": 0.73629767, "num_input_tokens_seen": 262487045, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.39257812, "step": 12162, "time_per_iteration": 2.471043825149536 }, { "auxiliary_loss_clip": 0.01055843, "auxiliary_loss_mlp": 0.01037224, "balance_loss_clip": 1.01307273, "balance_loss_mlp": 1.01798892, "epoch": 0.7312791222005111, "flos": 27087572874240.0, "grad_norm": 1.8808691744394173, "language_loss": 0.67896938, "learning_rate": 7.107323628093382e-07, "loss": 0.69990003, "num_input_tokens_seen": 262504855, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37890625, "step": 12163, "time_per_iteration": 3.9231553077697754 }, { "auxiliary_loss_clip": 0.01053946, "auxiliary_loss_mlp": 0.01033074, "balance_loss_clip": 1.01118767, "balance_loss_mlp": 1.01678014, "epoch": 0.731339245453179, "flos": 20922958644480.0, "grad_norm": 1.927385033528841, "language_loss": 0.69746381, "learning_rate": 7.104346471142153e-07, "loss": 0.71833408, "num_input_tokens_seen": 262524920, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.37109375, "step": 12164, "time_per_iteration": 2.3704469203948975 }, { "auxiliary_loss_clip": 0.01051574, "auxiliary_loss_mlp": 0.01036687, "balance_loss_clip": 1.01435947, "balance_loss_mlp": 1.01680171, "epoch": 0.731399368705847, "flos": 23074586046720.0, "grad_norm": 3.9296420524584903, "language_loss": 0.74474329, "learning_rate": 7.101369803195391e-07, "loss": 0.76562589, "num_input_tokens_seen": 262545725, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34765625, "step": 12165, "time_per_iteration": 2.4035120010375977 }, { "auxiliary_loss_clip": 0.0105243, "auxiliary_loss_mlp": 0.0103741, "balance_loss_clip": 1.01409233, "balance_loss_mlp": 1.01561618, "epoch": 0.731459491958515, "flos": 23581664766720.0, "grad_norm": 2.027818882402817, "language_loss": 0.77679157, "learning_rate": 7.098393624365988e-07, "loss": 0.79769003, "num_input_tokens_seen": 262565480, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.3671875, "step": 12166, "time_per_iteration": 2.438708782196045 }, { "auxiliary_loss_clip": 0.01052963, "auxiliary_loss_mlp": 0.01038658, "balance_loss_clip": 1.01541257, "balance_loss_mlp": 1.01703167, "epoch": 0.7315196152111829, "flos": 22378278424320.0, "grad_norm": 1.7194521406162961, "language_loss": 0.80383903, "learning_rate": 7.095417934766781e-07, "loss": 0.82475531, "num_input_tokens_seen": 262584145, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.359375, "step": 12167, "time_per_iteration": 2.5277140140533447 }, { "auxiliary_loss_clip": 0.01051892, "auxiliary_loss_mlp": 0.01040753, "balance_loss_clip": 1.0183301, "balance_loss_mlp": 1.01679027, "epoch": 0.7315797384638509, "flos": 26175850963200.0, "grad_norm": 3.4224931836249017, "language_loss": 0.77819633, "learning_rate": 7.092442734510622e-07, "loss": 0.79912269, "num_input_tokens_seen": 262604045, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3515625, "step": 12168, "time_per_iteration": 2.429852247238159 }, { "auxiliary_loss_clip": 0.01054514, "auxiliary_loss_mlp": 0.01040633, "balance_loss_clip": 1.01581371, "balance_loss_mlp": 1.01758146, "epoch": 0.7316398617165188, "flos": 21505275077760.0, "grad_norm": 1.8768507313890366, "language_loss": 0.82825613, "learning_rate": 7.089468023710326e-07, "loss": 0.84920758, "num_input_tokens_seen": 262624540, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36914062, "step": 12169, "time_per_iteration": 2.379197120666504 }, { "auxiliary_loss_clip": 0.01053744, "auxiliary_loss_mlp": 0.01046064, "balance_loss_clip": 1.02247262, "balance_loss_mlp": 1.01684403, "epoch": 0.7316999849691869, "flos": 30481235360640.0, "grad_norm": 1.8613885281469587, "language_loss": 0.71064311, "learning_rate": 7.08649380247871e-07, "loss": 0.73164117, "num_input_tokens_seen": 262644545, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36914062, "step": 12170, "time_per_iteration": 2.4633593559265137 }, { "auxiliary_loss_clip": 0.01051856, "auxiliary_loss_mlp": 0.01040168, "balance_loss_clip": 1.01481211, "balance_loss_mlp": 1.0165441, "epoch": 0.7317601082218548, "flos": 21542701921920.0, "grad_norm": 2.6192195724201386, "language_loss": 0.71360052, "learning_rate": 7.083520070928533e-07, "loss": 0.73452079, "num_input_tokens_seen": 262662570, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.35351562, "step": 12171, "time_per_iteration": 2.3543953895568848 }, { "auxiliary_loss_clip": 0.01051729, "auxiliary_loss_mlp": 0.01041125, "balance_loss_clip": 1.01886833, "balance_loss_mlp": 1.01693487, "epoch": 0.7318202314745228, "flos": 33250301890560.0, "grad_norm": 1.7258000291085507, "language_loss": 0.67323607, "learning_rate": 7.080546829172564e-07, "loss": 0.69416463, "num_input_tokens_seen": 262683245, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34765625, "step": 12172, "time_per_iteration": 2.456505298614502 }, { "auxiliary_loss_clip": 0.01052659, "auxiliary_loss_mlp": 0.01032156, "balance_loss_clip": 1.00944674, "balance_loss_mlp": 1.01609218, "epoch": 0.7318803547271907, "flos": 20156021608320.0, "grad_norm": 2.334987018147699, "language_loss": 0.64571899, "learning_rate": 7.077574077323564e-07, "loss": 0.66656715, "num_input_tokens_seen": 262701585, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.36523438, "step": 12173, "time_per_iteration": 2.3396220207214355 }, { "auxiliary_loss_clip": 0.01053238, "auxiliary_loss_mlp": 0.01036214, "balance_loss_clip": 1.01395774, "balance_loss_mlp": 1.01696622, "epoch": 0.7319404779798587, "flos": 20557487865600.0, "grad_norm": 2.267903788203601, "language_loss": 0.75873303, "learning_rate": 7.074601815494243e-07, "loss": 0.77962756, "num_input_tokens_seen": 262719295, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.36132812, "step": 12174, "time_per_iteration": 3.8076095581054688 }, { "auxiliary_loss_clip": 0.01051358, "auxiliary_loss_mlp": 0.01036626, "balance_loss_clip": 1.01522815, "balance_loss_mlp": 1.01625562, "epoch": 0.7320006012325266, "flos": 28694101219200.0, "grad_norm": 1.547452310288421, "language_loss": 0.819929, "learning_rate": 7.071630043797317e-07, "loss": 0.84080887, "num_input_tokens_seen": 262739995, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.3515625, "step": 12175, "time_per_iteration": 2.461665630340576 }, { "auxiliary_loss_clip": 0.01052446, "auxiliary_loss_mlp": 0.01034034, "balance_loss_clip": 1.01143169, "balance_loss_mlp": 1.01634276, "epoch": 0.7320607244851947, "flos": 16361765648640.0, "grad_norm": 2.111999908178241, "language_loss": 0.776474, "learning_rate": 7.068658762345488e-07, "loss": 0.79733872, "num_input_tokens_seen": 262757680, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36132812, "step": 12176, "time_per_iteration": 2.327890157699585 }, { "auxiliary_loss_clip": 0.01053281, "auxiliary_loss_mlp": 0.01034086, "balance_loss_clip": 1.0119133, "balance_loss_mlp": 1.01679754, "epoch": 0.7321208477378626, "flos": 20954171266560.0, "grad_norm": 1.8435931236284973, "language_loss": 0.77361143, "learning_rate": 7.065687971251399e-07, "loss": 0.79448509, "num_input_tokens_seen": 262776990, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.36523438, "step": 12177, "time_per_iteration": 2.3974416255950928 }, { "auxiliary_loss_clip": 0.01051764, "auxiliary_loss_mlp": 0.01031374, "balance_loss_clip": 1.00880766, "balance_loss_mlp": 1.01565683, "epoch": 0.7321809709905306, "flos": 13844213619840.0, "grad_norm": 1.9741330530629164, "language_loss": 0.75333691, "learning_rate": 7.06271767062772e-07, "loss": 0.77416825, "num_input_tokens_seen": 262795440, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.359375, "step": 12178, "time_per_iteration": 2.322317123413086 }, { "auxiliary_loss_clip": 0.01054516, "auxiliary_loss_mlp": 0.01034637, "balance_loss_clip": 1.0117135, "balance_loss_mlp": 1.01707816, "epoch": 0.7322410942431986, "flos": 26978713655040.0, "grad_norm": 1.947493245643442, "language_loss": 0.83851564, "learning_rate": 7.059747860587084e-07, "loss": 0.85940719, "num_input_tokens_seen": 262816385, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.375, "step": 12179, "time_per_iteration": 2.425567626953125 }, { "auxiliary_loss_clip": 0.01050267, "auxiliary_loss_mlp": 0.01036605, "balance_loss_clip": 1.01633978, "balance_loss_mlp": 1.01640153, "epoch": 0.7323012174958665, "flos": 17638748870400.0, "grad_norm": 1.9329409004327058, "language_loss": 0.76255906, "learning_rate": 7.056778541242115e-07, "loss": 0.78342783, "num_input_tokens_seen": 262834955, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.33789062, "step": 12180, "time_per_iteration": 2.348221778869629 }, { "auxiliary_loss_clip": 0.01053748, "auxiliary_loss_mlp": 0.01033952, "balance_loss_clip": 1.0101577, "balance_loss_mlp": 1.01618111, "epoch": 0.7323613407485345, "flos": 32341407799680.0, "grad_norm": 1.8456225227488154, "language_loss": 0.80346775, "learning_rate": 7.053809712705396e-07, "loss": 0.82434475, "num_input_tokens_seen": 262853555, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.375, "step": 12181, "time_per_iteration": 2.462688446044922 }, { "auxiliary_loss_clip": 0.0105467, "auxiliary_loss_mlp": 0.01037606, "balance_loss_clip": 1.0141573, "balance_loss_mlp": 1.01746368, "epoch": 0.7324214640012024, "flos": 18361975017600.0, "grad_norm": 1.7364640988605713, "language_loss": 0.73097569, "learning_rate": 7.050841375089506e-07, "loss": 0.75189847, "num_input_tokens_seen": 262870975, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.37304688, "step": 12182, "time_per_iteration": 2.3621416091918945 }, { "auxiliary_loss_clip": 0.0105507, "auxiliary_loss_mlp": 0.0104058, "balance_loss_clip": 1.01682162, "balance_loss_mlp": 1.01757395, "epoch": 0.7324815872538705, "flos": 30810920129280.0, "grad_norm": 1.4970771806136376, "language_loss": 0.72297704, "learning_rate": 7.047873528507015e-07, "loss": 0.74393356, "num_input_tokens_seen": 262892635, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.375, "step": 12183, "time_per_iteration": 2.4357213973999023 }, { "auxiliary_loss_clip": 0.01054908, "auxiliary_loss_mlp": 0.01045114, "balance_loss_clip": 1.02100956, "balance_loss_mlp": 1.01730359, "epoch": 0.7325417105065384, "flos": 21504053180160.0, "grad_norm": 1.7723969659481718, "language_loss": 0.74042225, "learning_rate": 7.04490617307045e-07, "loss": 0.76142246, "num_input_tokens_seen": 262910725, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.375, "step": 12184, "time_per_iteration": 2.3853797912597656 }, { "auxiliary_loss_clip": 0.01008296, "auxiliary_loss_mlp": 0.01003445, "balance_loss_clip": 1.00095379, "balance_loss_mlp": 1.00143504, "epoch": 0.7326018337592064, "flos": 67254447191040.0, "grad_norm": 0.7605067815141181, "language_loss": 0.65228069, "learning_rate": 7.041939308892344e-07, "loss": 0.67239809, "num_input_tokens_seen": 262974150, "router_z_loss_clip": 0.02490234, "router_z_loss_mlp": 0.06835938, "step": 12185, "time_per_iteration": 3.014910936355591 }, { "auxiliary_loss_clip": 0.01052596, "auxiliary_loss_mlp": 0.01034882, "balance_loss_clip": 1.01127863, "balance_loss_mlp": 1.01544738, "epoch": 0.7326619570118743, "flos": 22855959912960.0, "grad_norm": 1.9530815698409731, "language_loss": 0.82324749, "learning_rate": 7.038972936085197e-07, "loss": 0.84412229, "num_input_tokens_seen": 262993370, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37109375, "step": 12186, "time_per_iteration": 2.363873243331909 }, { "auxiliary_loss_clip": 0.01051773, "auxiliary_loss_mlp": 0.01040759, "balance_loss_clip": 1.01674998, "balance_loss_mlp": 1.01568437, "epoch": 0.7327220802645423, "flos": 23326484572800.0, "grad_norm": 1.735707726524004, "language_loss": 0.74601215, "learning_rate": 7.036007054761508e-07, "loss": 0.76693749, "num_input_tokens_seen": 263012665, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.359375, "step": 12187, "time_per_iteration": 2.3823039531707764 }, { "auxiliary_loss_clip": 0.01055259, "auxiliary_loss_mlp": 0.01046803, "balance_loss_clip": 1.02077949, "balance_loss_mlp": 1.01758313, "epoch": 0.7327822035172102, "flos": 23179674839040.0, "grad_norm": 1.8825897214537683, "language_loss": 0.90903044, "learning_rate": 7.033041665033716e-07, "loss": 0.93005109, "num_input_tokens_seen": 263031475, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37695312, "step": 12188, "time_per_iteration": 2.393327474594116 }, { "auxiliary_loss_clip": 0.01054172, "auxiliary_loss_mlp": 0.01035792, "balance_loss_clip": 1.01075792, "balance_loss_mlp": 1.01679325, "epoch": 0.7328423267698783, "flos": 21065613926400.0, "grad_norm": 1.9523384078448847, "language_loss": 0.75984287, "learning_rate": 7.030076767014284e-07, "loss": 0.78074253, "num_input_tokens_seen": 263051445, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37304688, "step": 12189, "time_per_iteration": 2.364394187927246 }, { "auxiliary_loss_clip": 0.01052379, "auxiliary_loss_mlp": 0.01032537, "balance_loss_clip": 1.01014936, "balance_loss_mlp": 1.01580441, "epoch": 0.7329024500225462, "flos": 21688499226240.0, "grad_norm": 5.0639985530296885, "language_loss": 0.83145559, "learning_rate": 7.027112360815648e-07, "loss": 0.8523047, "num_input_tokens_seen": 263070835, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.36523438, "step": 12190, "time_per_iteration": 2.418689012527466 }, { "auxiliary_loss_clip": 0.01053529, "auxiliary_loss_mlp": 0.01041506, "balance_loss_clip": 1.01746178, "balance_loss_mlp": 1.01660228, "epoch": 0.7329625732752142, "flos": 24163073504640.0, "grad_norm": 1.719604316873844, "language_loss": 0.73007613, "learning_rate": 7.024148446550204e-07, "loss": 0.75102645, "num_input_tokens_seen": 263090070, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36914062, "step": 12191, "time_per_iteration": 2.455463409423828 }, { "auxiliary_loss_clip": 0.01052999, "auxiliary_loss_mlp": 0.01034589, "balance_loss_clip": 1.01166511, "balance_loss_mlp": 1.01664186, "epoch": 0.7330226965278822, "flos": 30076696903680.0, "grad_norm": 2.183197278769759, "language_loss": 0.6996612, "learning_rate": 7.021185024330361e-07, "loss": 0.72053707, "num_input_tokens_seen": 263110030, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.36328125, "step": 12192, "time_per_iteration": 2.443463087081909 }, { "auxiliary_loss_clip": 0.01050877, "auxiliary_loss_mlp": 0.01035106, "balance_loss_clip": 1.01320708, "balance_loss_mlp": 1.01564825, "epoch": 0.7330828197805501, "flos": 23367158173440.0, "grad_norm": 1.737783957420689, "language_loss": 0.74518692, "learning_rate": 7.01822209426848e-07, "loss": 0.7660467, "num_input_tokens_seen": 263129735, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.3515625, "step": 12193, "time_per_iteration": 2.379871368408203 }, { "auxiliary_loss_clip": 0.01052805, "auxiliary_loss_mlp": 0.01040914, "balance_loss_clip": 1.01530766, "balance_loss_mlp": 1.01601148, "epoch": 0.7331429430332181, "flos": 21031748040960.0, "grad_norm": 1.7159705974222188, "language_loss": 0.78532863, "learning_rate": 7.015259656476911e-07, "loss": 0.80626583, "num_input_tokens_seen": 263149100, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.3671875, "step": 12194, "time_per_iteration": 2.3885979652404785 }, { "auxiliary_loss_clip": 0.01051723, "auxiliary_loss_mlp": 0.01035274, "balance_loss_clip": 1.012362, "balance_loss_mlp": 1.01627731, "epoch": 0.733203066285886, "flos": 14647006488960.0, "grad_norm": 1.6587890429035141, "language_loss": 0.71204573, "learning_rate": 7.012297711067998e-07, "loss": 0.7329157, "num_input_tokens_seen": 263166620, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35351562, "step": 12195, "time_per_iteration": 2.3389620780944824 }, { "auxiliary_loss_clip": 0.01054518, "auxiliary_loss_mlp": 0.01042614, "balance_loss_clip": 1.01989269, "balance_loss_mlp": 1.0173378, "epoch": 0.7332631895385541, "flos": 17164349049600.0, "grad_norm": 1.8641467648867205, "language_loss": 0.73723984, "learning_rate": 7.009336258154057e-07, "loss": 0.75821114, "num_input_tokens_seen": 263184780, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.37109375, "step": 12196, "time_per_iteration": 2.391292095184326 }, { "auxiliary_loss_clip": 0.01052707, "auxiliary_loss_mlp": 0.01038009, "balance_loss_clip": 1.01519287, "balance_loss_mlp": 1.01700914, "epoch": 0.733323312791222, "flos": 28656883843200.0, "grad_norm": 2.3533086309267235, "language_loss": 0.7271263, "learning_rate": 7.006375297847394e-07, "loss": 0.7480334, "num_input_tokens_seen": 263204625, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35742188, "step": 12197, "time_per_iteration": 2.405515193939209 }, { "auxiliary_loss_clip": 0.01055349, "auxiliary_loss_mlp": 0.01044719, "balance_loss_clip": 1.01840997, "balance_loss_mlp": 1.01646924, "epoch": 0.73338343604389, "flos": 16617469512960.0, "grad_norm": 2.037183122961024, "language_loss": 0.79669297, "learning_rate": 7.003414830260282e-07, "loss": 0.81769371, "num_input_tokens_seen": 263221565, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.38867188, "step": 12198, "time_per_iteration": 3.607362747192383 }, { "auxiliary_loss_clip": 0.01052574, "auxiliary_loss_mlp": 0.01038866, "balance_loss_clip": 1.01674032, "balance_loss_mlp": 1.01630616, "epoch": 0.7334435592965579, "flos": 21141026196480.0, "grad_norm": 1.884267483486003, "language_loss": 0.75528157, "learning_rate": 7.000454855504974e-07, "loss": 0.776196, "num_input_tokens_seen": 263240620, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.36328125, "step": 12199, "time_per_iteration": 2.3580985069274902 }, { "auxiliary_loss_clip": 0.01054562, "auxiliary_loss_mlp": 0.01044532, "balance_loss_clip": 1.01823473, "balance_loss_mlp": 1.01743269, "epoch": 0.7335036825492259, "flos": 17124478410240.0, "grad_norm": 2.5164756158998767, "language_loss": 0.78080463, "learning_rate": 6.997495373693729e-07, "loss": 0.80179554, "num_input_tokens_seen": 263254365, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.37109375, "step": 12200, "time_per_iteration": 2.3273849487304688 }, { "auxiliary_loss_clip": 0.01051373, "auxiliary_loss_mlp": 0.01031901, "balance_loss_clip": 1.01032412, "balance_loss_mlp": 1.01621163, "epoch": 0.7335638058018938, "flos": 23730708827520.0, "grad_norm": 1.556493829591751, "language_loss": 0.62739575, "learning_rate": 6.994536384938754e-07, "loss": 0.64822847, "num_input_tokens_seen": 263275880, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.3515625, "step": 12201, "time_per_iteration": 3.8360352516174316 }, { "auxiliary_loss_clip": 0.01052502, "auxiliary_loss_mlp": 0.01032797, "balance_loss_clip": 1.01073182, "balance_loss_mlp": 1.01740646, "epoch": 0.7336239290545619, "flos": 34931858480640.0, "grad_norm": 2.595453945809166, "language_loss": 0.53314841, "learning_rate": 6.991577889352264e-07, "loss": 0.55400145, "num_input_tokens_seen": 263298315, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3515625, "step": 12202, "time_per_iteration": 2.47965145111084 }, { "auxiliary_loss_clip": 0.01051467, "auxiliary_loss_mlp": 0.01034122, "balance_loss_clip": 1.01174641, "balance_loss_mlp": 1.01652765, "epoch": 0.7336840523072298, "flos": 21102063252480.0, "grad_norm": 1.9934573211191406, "language_loss": 0.6980474, "learning_rate": 6.98861988704645e-07, "loss": 0.7189033, "num_input_tokens_seen": 263318615, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34960938, "step": 12203, "time_per_iteration": 3.8799729347229004 }, { "auxiliary_loss_clip": 0.01054454, "auxiliary_loss_mlp": 0.01043061, "balance_loss_clip": 1.01986265, "balance_loss_mlp": 1.01666427, "epoch": 0.7337441755598978, "flos": 24023280954240.0, "grad_norm": 1.9727383820314361, "language_loss": 0.67507905, "learning_rate": 6.985662378133474e-07, "loss": 0.69605422, "num_input_tokens_seen": 263336705, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37890625, "step": 12204, "time_per_iteration": 2.376086711883545 }, { "auxiliary_loss_clip": 0.0105304, "auxiliary_loss_mlp": 0.01040578, "balance_loss_clip": 1.01921582, "balance_loss_mlp": 1.01751113, "epoch": 0.7338042988125658, "flos": 22710197520000.0, "grad_norm": 1.955522457370784, "language_loss": 0.78340423, "learning_rate": 6.982705362725479e-07, "loss": 0.80434042, "num_input_tokens_seen": 263355065, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.35546875, "step": 12205, "time_per_iteration": 2.3886218070983887 }, { "auxiliary_loss_clip": 0.01051952, "auxiliary_loss_mlp": 0.01029897, "balance_loss_clip": 1.00984645, "balance_loss_mlp": 1.01759326, "epoch": 0.7338644220652337, "flos": 21359931621120.0, "grad_norm": 1.8829385843235025, "language_loss": 0.80917513, "learning_rate": 6.979748840934601e-07, "loss": 0.82999361, "num_input_tokens_seen": 263374460, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.34375, "step": 12206, "time_per_iteration": 2.3651058673858643 }, { "auxiliary_loss_clip": 0.01051852, "auxiliary_loss_mlp": 0.0103475, "balance_loss_clip": 1.01250601, "balance_loss_mlp": 1.01602471, "epoch": 0.7339245453179017, "flos": 30918906564480.0, "grad_norm": 1.8834612871506102, "language_loss": 0.71910298, "learning_rate": 6.976792812872958e-07, "loss": 0.73996896, "num_input_tokens_seen": 263393610, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35742188, "step": 12207, "time_per_iteration": 2.443502426147461 }, { "auxiliary_loss_clip": 0.01008279, "auxiliary_loss_mlp": 0.01004129, "balance_loss_clip": 1.00162613, "balance_loss_mlp": 1.00139499, "epoch": 0.7339846685705697, "flos": 67896535605120.0, "grad_norm": 0.7836927948084487, "language_loss": 0.54856026, "learning_rate": 6.97383727865263e-07, "loss": 0.56868434, "num_input_tokens_seen": 263450340, "router_z_loss_clip": 0.02502441, "router_z_loss_mlp": 0.06933594, "step": 12208, "time_per_iteration": 3.1158335208892822 }, { "auxiliary_loss_clip": 0.01050794, "auxiliary_loss_mlp": 0.01035467, "balance_loss_clip": 1.01441455, "balance_loss_mlp": 1.01599085, "epoch": 0.7340447918232377, "flos": 22235658053760.0, "grad_norm": 1.6814089464009954, "language_loss": 0.81096482, "learning_rate": 6.970882238385703e-07, "loss": 0.8318274, "num_input_tokens_seen": 263471735, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34765625, "step": 12209, "time_per_iteration": 2.3908092975616455 }, { "auxiliary_loss_clip": 0.01048936, "auxiliary_loss_mlp": 0.01033098, "balance_loss_clip": 1.01307082, "balance_loss_mlp": 1.01491308, "epoch": 0.7341049150759056, "flos": 23763771751680.0, "grad_norm": 1.3918954254862488, "language_loss": 0.79723406, "learning_rate": 6.96792769218423e-07, "loss": 0.81805432, "num_input_tokens_seen": 263493245, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.33984375, "step": 12210, "time_per_iteration": 2.3856420516967773 }, { "auxiliary_loss_clip": 0.01049058, "auxiliary_loss_mlp": 0.0103622, "balance_loss_clip": 1.0136416, "balance_loss_mlp": 1.01515973, "epoch": 0.7341650383285736, "flos": 17235641779200.0, "grad_norm": 1.8960322664534088, "language_loss": 0.77883339, "learning_rate": 6.964973640160236e-07, "loss": 0.79968619, "num_input_tokens_seen": 263511660, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.33789062, "step": 12211, "time_per_iteration": 2.3717644214630127 }, { "auxiliary_loss_clip": 0.01053206, "auxiliary_loss_mlp": 0.01037659, "balance_loss_clip": 1.01474667, "balance_loss_mlp": 1.01695919, "epoch": 0.7342251615812415, "flos": 23402839449600.0, "grad_norm": 2.3598813747762333, "language_loss": 0.73243237, "learning_rate": 6.962020082425748e-07, "loss": 0.75334102, "num_input_tokens_seen": 263530875, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36132812, "step": 12212, "time_per_iteration": 2.3778247833251953 }, { "auxiliary_loss_clip": 0.0105293, "auxiliary_loss_mlp": 0.01036058, "balance_loss_clip": 1.01365888, "balance_loss_mlp": 1.01724255, "epoch": 0.7342852848339095, "flos": 22746088264320.0, "grad_norm": 1.6016104630095909, "language_loss": 0.70209485, "learning_rate": 6.959067019092766e-07, "loss": 0.72298473, "num_input_tokens_seen": 263551585, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35742188, "step": 12213, "time_per_iteration": 3.874692678451538 }, { "auxiliary_loss_clip": 0.01008047, "auxiliary_loss_mlp": 0.01005986, "balance_loss_clip": 1.00361383, "balance_loss_mlp": 1.00124168, "epoch": 0.7343454080865774, "flos": 53939376270720.0, "grad_norm": 0.7343095905964789, "language_loss": 0.54396403, "learning_rate": 6.956114450273276e-07, "loss": 0.56410432, "num_input_tokens_seen": 263609545, "router_z_loss_clip": 0.02368164, "router_z_loss_mlp": 0.06835938, "step": 12214, "time_per_iteration": 2.9529306888580322 }, { "auxiliary_loss_clip": 0.01053602, "auxiliary_loss_mlp": 0.01035773, "balance_loss_clip": 1.01282489, "balance_loss_mlp": 1.01646411, "epoch": 0.7344055313392455, "flos": 12166043431680.0, "grad_norm": 1.9971553792133507, "language_loss": 0.71083963, "learning_rate": 6.953162376079233e-07, "loss": 0.73173338, "num_input_tokens_seen": 263627880, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.37109375, "step": 12215, "time_per_iteration": 2.3388352394104004 }, { "auxiliary_loss_clip": 0.0105, "auxiliary_loss_mlp": 0.01036586, "balance_loss_clip": 1.01701164, "balance_loss_mlp": 1.01632762, "epoch": 0.7344656545919134, "flos": 18549109238400.0, "grad_norm": 2.9585043399433752, "language_loss": 0.73858917, "learning_rate": 6.950210796622573e-07, "loss": 0.75945497, "num_input_tokens_seen": 263645665, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.3359375, "step": 12216, "time_per_iteration": 2.403745651245117 }, { "auxiliary_loss_clip": 0.01056149, "auxiliary_loss_mlp": 0.01043072, "balance_loss_clip": 1.0153079, "balance_loss_mlp": 1.01737404, "epoch": 0.7345257778445814, "flos": 23660463438720.0, "grad_norm": 1.7618868782017676, "language_loss": 0.78903764, "learning_rate": 6.947259712015236e-07, "loss": 0.81002986, "num_input_tokens_seen": 263668170, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.38671875, "step": 12217, "time_per_iteration": 2.4626104831695557 }, { "auxiliary_loss_clip": 0.01049347, "auxiliary_loss_mlp": 0.01033242, "balance_loss_clip": 1.01360869, "balance_loss_mlp": 1.01526749, "epoch": 0.7345859010972494, "flos": 13807799205120.0, "grad_norm": 2.0962944343401486, "language_loss": 0.79129952, "learning_rate": 6.94430912236911e-07, "loss": 0.81212538, "num_input_tokens_seen": 263684190, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.34179688, "step": 12218, "time_per_iteration": 2.399146318435669 }, { "auxiliary_loss_clip": 0.01050191, "auxiliary_loss_mlp": 0.01039939, "balance_loss_clip": 1.01690805, "balance_loss_mlp": 1.01588631, "epoch": 0.7346460243499173, "flos": 22271653532160.0, "grad_norm": 1.978918353028586, "language_loss": 0.73620117, "learning_rate": 6.941359027796092e-07, "loss": 0.75710249, "num_input_tokens_seen": 263702095, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.34375, "step": 12219, "time_per_iteration": 2.3605458736419678 }, { "auxiliary_loss_clip": 0.01049852, "auxiliary_loss_mlp": 0.01037943, "balance_loss_clip": 1.01743901, "balance_loss_mlp": 1.01555943, "epoch": 0.7347061476025853, "flos": 23254214325120.0, "grad_norm": 1.7317830754446983, "language_loss": 0.76024616, "learning_rate": 6.938409428408061e-07, "loss": 0.78112411, "num_input_tokens_seen": 263721385, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.34375, "step": 12220, "time_per_iteration": 2.407951593399048 }, { "auxiliary_loss_clip": 0.01053983, "auxiliary_loss_mlp": 0.01040517, "balance_loss_clip": 1.01653183, "balance_loss_mlp": 1.01720893, "epoch": 0.7347662708552533, "flos": 15266679943680.0, "grad_norm": 1.6072022240743673, "language_loss": 0.67095149, "learning_rate": 6.93546032431684e-07, "loss": 0.6918965, "num_input_tokens_seen": 263737835, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 12221, "time_per_iteration": 2.374692440032959 }, { "auxiliary_loss_clip": 0.0105149, "auxiliary_loss_mlp": 0.01031227, "balance_loss_clip": 1.01007903, "balance_loss_mlp": 1.01618361, "epoch": 0.7348263941079213, "flos": 24858927279360.0, "grad_norm": 1.7441848018998098, "language_loss": 0.70759952, "learning_rate": 6.932511715634273e-07, "loss": 0.72842669, "num_input_tokens_seen": 263756480, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.35351562, "step": 12222, "time_per_iteration": 2.426853895187378 }, { "auxiliary_loss_clip": 0.01050748, "auxiliary_loss_mlp": 0.01035238, "balance_loss_clip": 1.0148294, "balance_loss_mlp": 1.01525319, "epoch": 0.7348865173605892, "flos": 24350975775360.0, "grad_norm": 1.6314777475129116, "language_loss": 0.67379713, "learning_rate": 6.92956360247217e-07, "loss": 0.69465697, "num_input_tokens_seen": 263776440, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.35546875, "step": 12223, "time_per_iteration": 2.379265308380127 }, { "auxiliary_loss_clip": 0.01052099, "auxiliary_loss_mlp": 0.01033676, "balance_loss_clip": 1.01171756, "balance_loss_mlp": 1.0161351, "epoch": 0.7349466406132572, "flos": 20003765702400.0, "grad_norm": 1.837513783058394, "language_loss": 0.73646593, "learning_rate": 6.926615984942332e-07, "loss": 0.75732368, "num_input_tokens_seen": 263793700, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.359375, "step": 12224, "time_per_iteration": 2.392704725265503 }, { "auxiliary_loss_clip": 0.01053925, "auxiliary_loss_mlp": 0.01040791, "balance_loss_clip": 1.01722288, "balance_loss_mlp": 1.01699567, "epoch": 0.7350067638659251, "flos": 29823785948160.0, "grad_norm": 1.6863028748354423, "language_loss": 0.73683965, "learning_rate": 6.92366886315652e-07, "loss": 0.75778681, "num_input_tokens_seen": 263814620, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36914062, "step": 12225, "time_per_iteration": 2.436452627182007 }, { "auxiliary_loss_clip": 0.01054899, "auxiliary_loss_mlp": 0.01044066, "balance_loss_clip": 1.01902032, "balance_loss_mlp": 1.01638544, "epoch": 0.7350668871185931, "flos": 21865229861760.0, "grad_norm": 1.7156839668408816, "language_loss": 0.77315736, "learning_rate": 6.920722237226501e-07, "loss": 0.79414696, "num_input_tokens_seen": 263832725, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38671875, "step": 12226, "time_per_iteration": 2.395659923553467 }, { "auxiliary_loss_clip": 0.01051302, "auxiliary_loss_mlp": 0.01035754, "balance_loss_clip": 1.01331842, "balance_loss_mlp": 1.01654935, "epoch": 0.735127010371261, "flos": 22564993708800.0, "grad_norm": 5.93572330296042, "language_loss": 0.67975837, "learning_rate": 6.917776107264008e-07, "loss": 0.70062894, "num_input_tokens_seen": 263853850, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34765625, "step": 12227, "time_per_iteration": 2.3797426223754883 }, { "auxiliary_loss_clip": 0.01051227, "auxiliary_loss_mlp": 0.0104036, "balance_loss_clip": 1.01684022, "balance_loss_mlp": 1.0151732, "epoch": 0.7351871336239291, "flos": 25883174102400.0, "grad_norm": 1.4133334840830094, "language_loss": 0.64713699, "learning_rate": 6.914830473380749e-07, "loss": 0.66805279, "num_input_tokens_seen": 263874760, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36132812, "step": 12228, "time_per_iteration": 2.4294989109039307 }, { "auxiliary_loss_clip": 0.0105198, "auxiliary_loss_mlp": 0.0103703, "balance_loss_clip": 1.01610911, "balance_loss_mlp": 1.01611853, "epoch": 0.735247256876597, "flos": 17931181351680.0, "grad_norm": 2.0640722392032664, "language_loss": 0.64144838, "learning_rate": 6.911885335688427e-07, "loss": 0.66233844, "num_input_tokens_seen": 263893390, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.359375, "step": 12229, "time_per_iteration": 2.387708902359009 }, { "auxiliary_loss_clip": 0.01053382, "auxiliary_loss_mlp": 0.01039779, "balance_loss_clip": 1.0171535, "balance_loss_mlp": 1.01610565, "epoch": 0.735307380129265, "flos": 28873938965760.0, "grad_norm": 3.9099454884027876, "language_loss": 0.74756289, "learning_rate": 6.908940694298726e-07, "loss": 0.76849449, "num_input_tokens_seen": 263911180, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.37109375, "step": 12230, "time_per_iteration": 2.401040554046631 }, { "auxiliary_loss_clip": 0.01054022, "auxiliary_loss_mlp": 0.01040186, "balance_loss_clip": 1.01556957, "balance_loss_mlp": 1.01680923, "epoch": 0.7353675033819329, "flos": 13624819436160.0, "grad_norm": 1.9864673203866536, "language_loss": 0.74145848, "learning_rate": 6.90599654932332e-07, "loss": 0.76240057, "num_input_tokens_seen": 263928975, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37109375, "step": 12231, "time_per_iteration": 2.387418031692505 }, { "auxiliary_loss_clip": 0.01054153, "auxiliary_loss_mlp": 0.0103974, "balance_loss_clip": 1.01382339, "balance_loss_mlp": 1.01661325, "epoch": 0.7354276266346009, "flos": 19462087958400.0, "grad_norm": 2.8744683338816115, "language_loss": 0.64734149, "learning_rate": 6.903052900873823e-07, "loss": 0.66828048, "num_input_tokens_seen": 263944495, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.375, "step": 12232, "time_per_iteration": 2.3440961837768555 }, { "auxiliary_loss_clip": 0.01053717, "auxiliary_loss_mlp": 0.01040887, "balance_loss_clip": 1.01764131, "balance_loss_mlp": 1.01691973, "epoch": 0.735487749887269, "flos": 15771140311680.0, "grad_norm": 1.8134867098816378, "language_loss": 0.76310432, "learning_rate": 6.900109749061874e-07, "loss": 0.78405035, "num_input_tokens_seen": 263961325, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3671875, "step": 12233, "time_per_iteration": 2.3689301013946533 }, { "auxiliary_loss_clip": 0.01052566, "auxiliary_loss_mlp": 0.01039144, "balance_loss_clip": 1.01519465, "balance_loss_mlp": 1.01618838, "epoch": 0.7355478731399369, "flos": 18259644222720.0, "grad_norm": 1.5434505874737896, "language_loss": 0.74848628, "learning_rate": 6.897167093999079e-07, "loss": 0.7694034, "num_input_tokens_seen": 263980445, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36328125, "step": 12234, "time_per_iteration": 2.367719888687134 }, { "auxiliary_loss_clip": 0.01053315, "auxiliary_loss_mlp": 0.01037056, "balance_loss_clip": 1.01450157, "balance_loss_mlp": 1.01668847, "epoch": 0.7356079963926049, "flos": 26540832983040.0, "grad_norm": 2.0209935084659216, "language_loss": 0.60912573, "learning_rate": 6.894224935797017e-07, "loss": 0.63002944, "num_input_tokens_seen": 263999330, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.3671875, "step": 12235, "time_per_iteration": 2.4186418056488037 }, { "auxiliary_loss_clip": 0.01052532, "auxiliary_loss_mlp": 0.01037, "balance_loss_clip": 1.01424336, "balance_loss_mlp": 1.01699376, "epoch": 0.7356681196452728, "flos": 10777896840960.0, "grad_norm": 2.1215473747761355, "language_loss": 0.87502873, "learning_rate": 6.891283274567259e-07, "loss": 0.89592397, "num_input_tokens_seen": 264014150, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35546875, "step": 12236, "time_per_iteration": 2.330629348754883 }, { "auxiliary_loss_clip": 0.0105279, "auxiliary_loss_mlp": 0.01033686, "balance_loss_clip": 1.01107192, "balance_loss_mlp": 1.01651514, "epoch": 0.7357282428979408, "flos": 19717687088640.0, "grad_norm": 1.8230683079055678, "language_loss": 0.70427561, "learning_rate": 6.888342110421364e-07, "loss": 0.72514045, "num_input_tokens_seen": 264033140, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.36328125, "step": 12237, "time_per_iteration": 2.3682234287261963 }, { "auxiliary_loss_clip": 0.01051647, "auxiliary_loss_mlp": 0.01034965, "balance_loss_clip": 1.01370955, "balance_loss_mlp": 1.01554847, "epoch": 0.7357883661506087, "flos": 19462995653760.0, "grad_norm": 1.868610878331933, "language_loss": 0.72937906, "learning_rate": 6.885401443470839e-07, "loss": 0.75024509, "num_input_tokens_seen": 264052105, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.36132812, "step": 12238, "time_per_iteration": 3.5560007095336914 }, { "auxiliary_loss_clip": 0.01053855, "auxiliary_loss_mlp": 0.01035006, "balance_loss_clip": 1.0111165, "balance_loss_mlp": 1.01561105, "epoch": 0.7358484894032767, "flos": 27121857696000.0, "grad_norm": 1.7954210816532172, "language_loss": 0.73773438, "learning_rate": 6.882461273827205e-07, "loss": 0.758623, "num_input_tokens_seen": 264070690, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.3828125, "step": 12239, "time_per_iteration": 2.403902769088745 }, { "auxiliary_loss_clip": 0.01050168, "auxiliary_loss_mlp": 0.01037909, "balance_loss_clip": 1.01551008, "balance_loss_mlp": 1.01522398, "epoch": 0.7359086126559446, "flos": 24501032265600.0, "grad_norm": 1.4285044965877596, "language_loss": 0.79952258, "learning_rate": 6.879521601601954e-07, "loss": 0.82040334, "num_input_tokens_seen": 264094225, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34765625, "step": 12240, "time_per_iteration": 3.881805419921875 }, { "auxiliary_loss_clip": 0.01052274, "auxiliary_loss_mlp": 0.01038891, "balance_loss_clip": 1.01614594, "balance_loss_mlp": 1.01683331, "epoch": 0.7359687359086127, "flos": 23330150265600.0, "grad_norm": 1.7159160274811243, "language_loss": 0.84396267, "learning_rate": 6.876582426906565e-07, "loss": 0.8648743, "num_input_tokens_seen": 264113190, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35546875, "step": 12241, "time_per_iteration": 2.384986639022827 }, { "auxiliary_loss_clip": 0.01051256, "auxiliary_loss_mlp": 0.01037604, "balance_loss_clip": 1.01527631, "balance_loss_mlp": 1.01607335, "epoch": 0.7360288591612806, "flos": 20192366200320.0, "grad_norm": 1.988105886287044, "language_loss": 0.79796016, "learning_rate": 6.873643749852484e-07, "loss": 0.81884873, "num_input_tokens_seen": 264132050, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.3515625, "step": 12242, "time_per_iteration": 3.777843952178955 }, { "auxiliary_loss_clip": 0.01051931, "auxiliary_loss_mlp": 0.01036651, "balance_loss_clip": 1.01481152, "balance_loss_mlp": 1.01605844, "epoch": 0.7360889824139486, "flos": 24971626748160.0, "grad_norm": 1.7524527978224644, "language_loss": 0.80462056, "learning_rate": 6.870705570551145e-07, "loss": 0.82550639, "num_input_tokens_seen": 264152800, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.359375, "step": 12243, "time_per_iteration": 2.391782760620117 }, { "auxiliary_loss_clip": 0.01053025, "auxiliary_loss_mlp": 0.01040818, "balance_loss_clip": 1.01653552, "balance_loss_mlp": 1.01597738, "epoch": 0.7361491056666165, "flos": 15011429927040.0, "grad_norm": 2.7446895340229305, "language_loss": 0.76550829, "learning_rate": 6.867767889113969e-07, "loss": 0.78644669, "num_input_tokens_seen": 264169650, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36914062, "step": 12244, "time_per_iteration": 2.3524110317230225 }, { "auxiliary_loss_clip": 0.01052638, "auxiliary_loss_mlp": 0.01040016, "balance_loss_clip": 1.01623392, "balance_loss_mlp": 1.0157311, "epoch": 0.7362092289192845, "flos": 22929277501440.0, "grad_norm": 1.7705886594120857, "language_loss": 0.70750082, "learning_rate": 6.864830705652347e-07, "loss": 0.72842741, "num_input_tokens_seen": 264190530, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36914062, "step": 12245, "time_per_iteration": 2.376838445663452 }, { "auxiliary_loss_clip": 0.01049976, "auxiliary_loss_mlp": 0.01042881, "balance_loss_clip": 1.01931334, "balance_loss_mlp": 1.01540983, "epoch": 0.7362693521719526, "flos": 20701679247360.0, "grad_norm": 1.8619028947142098, "language_loss": 0.75112683, "learning_rate": 6.861894020277658e-07, "loss": 0.77205539, "num_input_tokens_seen": 264210820, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.34570312, "step": 12246, "time_per_iteration": 2.406189441680908 }, { "auxiliary_loss_clip": 0.01049777, "auxiliary_loss_mlp": 0.01033698, "balance_loss_clip": 1.01269364, "balance_loss_mlp": 1.01559448, "epoch": 0.7363294754246205, "flos": 13110653710080.0, "grad_norm": 2.3335912960277003, "language_loss": 0.74436235, "learning_rate": 6.858957833101266e-07, "loss": 0.76519716, "num_input_tokens_seen": 264227430, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.33984375, "step": 12247, "time_per_iteration": 2.4332563877105713 }, { "auxiliary_loss_clip": 0.01050196, "auxiliary_loss_mlp": 0.01033887, "balance_loss_clip": 1.01190531, "balance_loss_mlp": 1.01667261, "epoch": 0.7363895986772885, "flos": 14026564984320.0, "grad_norm": 1.6815085120733877, "language_loss": 0.75247186, "learning_rate": 6.856022144234526e-07, "loss": 0.77331269, "num_input_tokens_seen": 264245230, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.3359375, "step": 12248, "time_per_iteration": 2.3795406818389893 }, { "auxiliary_loss_clip": 0.01051232, "auxiliary_loss_mlp": 0.0103772, "balance_loss_clip": 1.01335323, "balance_loss_mlp": 1.01483142, "epoch": 0.7364497219299564, "flos": 19718943897600.0, "grad_norm": 1.806204533516511, "language_loss": 0.74700296, "learning_rate": 6.853086953788727e-07, "loss": 0.76789248, "num_input_tokens_seen": 264263945, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36328125, "step": 12249, "time_per_iteration": 2.352381706237793 }, { "auxiliary_loss_clip": 0.01053517, "auxiliary_loss_mlp": 0.01034639, "balance_loss_clip": 1.01016498, "balance_loss_mlp": 1.01706958, "epoch": 0.7365098451826244, "flos": 21360315646080.0, "grad_norm": 2.0574157710162515, "language_loss": 0.78190631, "learning_rate": 6.850152261875189e-07, "loss": 0.80278784, "num_input_tokens_seen": 264281500, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36328125, "step": 12250, "time_per_iteration": 2.3913090229034424 }, { "auxiliary_loss_clip": 0.01055005, "auxiliary_loss_mlp": 0.01038697, "balance_loss_clip": 1.01478374, "balance_loss_mlp": 1.01804972, "epoch": 0.7365699684352923, "flos": 23367088350720.0, "grad_norm": 1.6139780580453422, "language_loss": 0.72257513, "learning_rate": 6.8472180686052e-07, "loss": 0.74351215, "num_input_tokens_seen": 264301625, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36914062, "step": 12251, "time_per_iteration": 2.381366729736328 }, { "auxiliary_loss_clip": 0.0105063, "auxiliary_loss_mlp": 0.01036682, "balance_loss_clip": 1.01490259, "balance_loss_mlp": 1.01600182, "epoch": 0.7366300916879603, "flos": 59522758185600.0, "grad_norm": 1.4654645283759766, "language_loss": 0.66728556, "learning_rate": 6.844284374090015e-07, "loss": 0.68815869, "num_input_tokens_seen": 264323975, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.34570312, "step": 12252, "time_per_iteration": 2.7115306854248047 }, { "auxiliary_loss_clip": 0.01056293, "auxiliary_loss_mlp": 0.01037484, "balance_loss_clip": 1.01433349, "balance_loss_mlp": 1.01843596, "epoch": 0.7366902149406283, "flos": 20922085860480.0, "grad_norm": 1.5278282315981666, "language_loss": 0.80125546, "learning_rate": 6.841351178440884e-07, "loss": 0.82219326, "num_input_tokens_seen": 264343785, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.37890625, "step": 12253, "time_per_iteration": 2.384110450744629 }, { "auxiliary_loss_clip": 0.01049083, "auxiliary_loss_mlp": 0.01031077, "balance_loss_clip": 1.01014447, "balance_loss_mlp": 1.01531208, "epoch": 0.7367503381932963, "flos": 17347189173120.0, "grad_norm": 2.0454598331339553, "language_loss": 0.77916348, "learning_rate": 6.83841848176905e-07, "loss": 0.79996514, "num_input_tokens_seen": 264361130, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.33789062, "step": 12254, "time_per_iteration": 3.831697702407837 }, { "auxiliary_loss_clip": 0.01052966, "auxiliary_loss_mlp": 0.01038148, "balance_loss_clip": 1.01555777, "balance_loss_mlp": 1.01651001, "epoch": 0.7368104614459642, "flos": 17820367096320.0, "grad_norm": 2.4543238737404782, "language_loss": 0.7025193, "learning_rate": 6.835486284185692e-07, "loss": 0.72343045, "num_input_tokens_seen": 264376965, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.36328125, "step": 12255, "time_per_iteration": 2.35434889793396 }, { "auxiliary_loss_clip": 0.01051832, "auxiliary_loss_mlp": 0.01036139, "balance_loss_clip": 1.01298845, "balance_loss_mlp": 1.01585078, "epoch": 0.7368705846986322, "flos": 24605003894400.0, "grad_norm": 2.0955857525256967, "language_loss": 0.76241112, "learning_rate": 6.832554585802012e-07, "loss": 0.78329074, "num_input_tokens_seen": 264396310, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.359375, "step": 12256, "time_per_iteration": 2.3822531700134277 }, { "auxiliary_loss_clip": 0.01052375, "auxiliary_loss_mlp": 0.01034594, "balance_loss_clip": 1.01258755, "balance_loss_mlp": 1.01661706, "epoch": 0.7369307079513001, "flos": 34968726743040.0, "grad_norm": 1.766669201740906, "language_loss": 0.74661982, "learning_rate": 6.829623386729182e-07, "loss": 0.76748955, "num_input_tokens_seen": 264418085, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35742188, "step": 12257, "time_per_iteration": 2.4921045303344727 }, { "auxiliary_loss_clip": 0.01050517, "auxiliary_loss_mlp": 0.01038229, "balance_loss_clip": 1.01669943, "balance_loss_mlp": 1.01544726, "epoch": 0.7369908312039681, "flos": 21213540823680.0, "grad_norm": 1.4546555039469709, "language_loss": 0.78687739, "learning_rate": 6.826692687078362e-07, "loss": 0.80776489, "num_input_tokens_seen": 264437595, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.3515625, "step": 12258, "time_per_iteration": 2.3701868057250977 }, { "auxiliary_loss_clip": 0.01053172, "auxiliary_loss_mlp": 0.01039209, "balance_loss_clip": 1.01633239, "balance_loss_mlp": 1.0168792, "epoch": 0.7370509544566362, "flos": 23622512924160.0, "grad_norm": 1.4677573794601317, "language_loss": 0.66994655, "learning_rate": 6.823762486960674e-07, "loss": 0.6908704, "num_input_tokens_seen": 264457385, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.36328125, "step": 12259, "time_per_iteration": 2.3873655796051025 }, { "auxiliary_loss_clip": 0.01053431, "auxiliary_loss_mlp": 0.0103637, "balance_loss_clip": 1.01388669, "balance_loss_mlp": 1.01676416, "epoch": 0.7371110777093041, "flos": 24826527671040.0, "grad_norm": 1.760069963994392, "language_loss": 0.742423, "learning_rate": 6.820832786487225e-07, "loss": 0.76332098, "num_input_tokens_seen": 264477205, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3671875, "step": 12260, "time_per_iteration": 2.4112956523895264 }, { "auxiliary_loss_clip": 0.01053208, "auxiliary_loss_mlp": 0.01037118, "balance_loss_clip": 1.01399183, "balance_loss_mlp": 1.01635218, "epoch": 0.7371712009619721, "flos": 23148357482880.0, "grad_norm": 1.5989140688923762, "language_loss": 0.74011093, "learning_rate": 6.817903585769125e-07, "loss": 0.76101422, "num_input_tokens_seen": 264497195, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.36914062, "step": 12261, "time_per_iteration": 2.3695507049560547 }, { "auxiliary_loss_clip": 0.01054639, "auxiliary_loss_mlp": 0.01041121, "balance_loss_clip": 1.0143106, "balance_loss_mlp": 1.01663256, "epoch": 0.73723132421464, "flos": 23111768511360.0, "grad_norm": 2.2526523443432573, "language_loss": 0.69152111, "learning_rate": 6.814974884917438e-07, "loss": 0.7124787, "num_input_tokens_seen": 264516950, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.38085938, "step": 12262, "time_per_iteration": 2.3942551612854004 }, { "auxiliary_loss_clip": 0.01052737, "auxiliary_loss_mlp": 0.01034665, "balance_loss_clip": 1.00991678, "balance_loss_mlp": 1.01557398, "epoch": 0.737291447467308, "flos": 19272544853760.0, "grad_norm": 2.0624457893724824, "language_loss": 0.9003275, "learning_rate": 6.81204668404322e-07, "loss": 0.92120147, "num_input_tokens_seen": 264532675, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.37109375, "step": 12263, "time_per_iteration": 2.3306210041046143 }, { "auxiliary_loss_clip": 0.01047675, "auxiliary_loss_mlp": 0.01031861, "balance_loss_clip": 1.01203668, "balance_loss_mlp": 1.01466227, "epoch": 0.7373515707199759, "flos": 25117109850240.0, "grad_norm": 1.5315135299265008, "language_loss": 0.68197447, "learning_rate": 6.809118983257522e-07, "loss": 0.70276982, "num_input_tokens_seen": 264555635, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.33007812, "step": 12264, "time_per_iteration": 2.428191900253296 }, { "auxiliary_loss_clip": 0.01050722, "auxiliary_loss_mlp": 0.0103494, "balance_loss_clip": 1.01296973, "balance_loss_mlp": 1.01593041, "epoch": 0.737411693972644, "flos": 32407324179840.0, "grad_norm": 1.729231402033943, "language_loss": 0.81209403, "learning_rate": 6.806191782671356e-07, "loss": 0.83295065, "num_input_tokens_seen": 264573140, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34765625, "step": 12265, "time_per_iteration": 2.460345983505249 }, { "auxiliary_loss_clip": 0.01055367, "auxiliary_loss_mlp": 0.01042556, "balance_loss_clip": 1.0181179, "balance_loss_mlp": 1.01718533, "epoch": 0.7374718172253119, "flos": 24314666094720.0, "grad_norm": 1.5538462428975695, "language_loss": 0.74857712, "learning_rate": 6.803265082395711e-07, "loss": 0.76955634, "num_input_tokens_seen": 264591610, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.3828125, "step": 12266, "time_per_iteration": 2.3893086910247803 }, { "auxiliary_loss_clip": 0.0105254, "auxiliary_loss_mlp": 0.01044463, "balance_loss_clip": 1.0200851, "balance_loss_mlp": 1.01621497, "epoch": 0.7375319404779799, "flos": 27155060265600.0, "grad_norm": 1.5790958869998546, "language_loss": 0.74175572, "learning_rate": 6.800338882541576e-07, "loss": 0.76272571, "num_input_tokens_seen": 264611170, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36328125, "step": 12267, "time_per_iteration": 2.3892602920532227 }, { "auxiliary_loss_clip": 0.01050178, "auxiliary_loss_mlp": 0.01036844, "balance_loss_clip": 1.01549363, "balance_loss_mlp": 1.01591337, "epoch": 0.7375920637306478, "flos": 18879003475200.0, "grad_norm": 2.0013710798930773, "language_loss": 0.84719586, "learning_rate": 6.797413183219923e-07, "loss": 0.86806607, "num_input_tokens_seen": 264629365, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34375, "step": 12268, "time_per_iteration": 2.363011598587036 }, { "auxiliary_loss_clip": 0.01051609, "auxiliary_loss_mlp": 0.01041273, "balance_loss_clip": 1.01787257, "balance_loss_mlp": 1.01604676, "epoch": 0.7376521869833158, "flos": 15668844428160.0, "grad_norm": 1.7160116861224517, "language_loss": 0.7399739, "learning_rate": 6.794487984541677e-07, "loss": 0.76090276, "num_input_tokens_seen": 264647915, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35546875, "step": 12269, "time_per_iteration": 2.3553900718688965 }, { "auxiliary_loss_clip": 0.01053231, "auxiliary_loss_mlp": 0.01039189, "balance_loss_clip": 1.01556206, "balance_loss_mlp": 1.01619339, "epoch": 0.7377123102359837, "flos": 36970611857280.0, "grad_norm": 2.042598676428294, "language_loss": 0.71379024, "learning_rate": 6.791563286617776e-07, "loss": 0.73471445, "num_input_tokens_seen": 264669620, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37109375, "step": 12270, "time_per_iteration": 2.556297540664673 }, { "auxiliary_loss_clip": 0.01049927, "auxiliary_loss_mlp": 0.01033739, "balance_loss_clip": 1.01343799, "balance_loss_mlp": 1.01527262, "epoch": 0.7377724334886517, "flos": 24495202068480.0, "grad_norm": 1.5697328621948805, "language_loss": 0.69989598, "learning_rate": 6.788639089559119e-07, "loss": 0.72073269, "num_input_tokens_seen": 264689345, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.34570312, "step": 12271, "time_per_iteration": 2.384168863296509 }, { "auxiliary_loss_clip": 0.0105335, "auxiliary_loss_mlp": 0.01033224, "balance_loss_clip": 1.00871468, "balance_loss_mlp": 1.01576352, "epoch": 0.7378325567413198, "flos": 24388856467200.0, "grad_norm": 2.2496425068828554, "language_loss": 0.69117451, "learning_rate": 6.785715393476586e-07, "loss": 0.71204025, "num_input_tokens_seen": 264707625, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.37695312, "step": 12272, "time_per_iteration": 2.4765048027038574 }, { "auxiliary_loss_clip": 0.01051074, "auxiliary_loss_mlp": 0.01034191, "balance_loss_clip": 1.01309109, "balance_loss_mlp": 1.01620948, "epoch": 0.7378926799939877, "flos": 17415549348480.0, "grad_norm": 1.736047022973496, "language_loss": 0.79134107, "learning_rate": 6.782792198481049e-07, "loss": 0.81219375, "num_input_tokens_seen": 264725575, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34765625, "step": 12273, "time_per_iteration": 2.377392530441284 }, { "auxiliary_loss_clip": 0.01050807, "auxiliary_loss_mlp": 0.01033622, "balance_loss_clip": 1.01111543, "balance_loss_mlp": 1.01529789, "epoch": 0.7379528032466557, "flos": 18473347854720.0, "grad_norm": 2.7021599139941097, "language_loss": 0.83717191, "learning_rate": 6.779869504683355e-07, "loss": 0.85801619, "num_input_tokens_seen": 264742855, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35546875, "step": 12274, "time_per_iteration": 2.4021050930023193 }, { "auxiliary_loss_clip": 0.01055698, "auxiliary_loss_mlp": 0.01037784, "balance_loss_clip": 1.01118815, "balance_loss_mlp": 1.0176034, "epoch": 0.7380129264993236, "flos": 17821030412160.0, "grad_norm": 1.841075170255253, "language_loss": 0.7521897, "learning_rate": 6.776947312194341e-07, "loss": 0.77312446, "num_input_tokens_seen": 264761155, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.38085938, "step": 12275, "time_per_iteration": 2.3512771129608154 }, { "auxiliary_loss_clip": 0.01054942, "auxiliary_loss_mlp": 0.01039524, "balance_loss_clip": 1.01511002, "balance_loss_mlp": 1.01744485, "epoch": 0.7380730497519916, "flos": 22996415779200.0, "grad_norm": 1.8224260122589107, "language_loss": 0.74575394, "learning_rate": 6.774025621124813e-07, "loss": 0.7666986, "num_input_tokens_seen": 264780660, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.375, "step": 12276, "time_per_iteration": 2.411278486251831 }, { "auxiliary_loss_clip": 0.01052797, "auxiliary_loss_mlp": 0.01038337, "balance_loss_clip": 1.01277852, "balance_loss_mlp": 1.01534688, "epoch": 0.7381331730046595, "flos": 20265229941120.0, "grad_norm": 1.9828965914458558, "language_loss": 0.7902956, "learning_rate": 6.771104431585551e-07, "loss": 0.81120694, "num_input_tokens_seen": 264798850, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.375, "step": 12277, "time_per_iteration": 3.594449043273926 }, { "auxiliary_loss_clip": 0.0105057, "auxiliary_loss_mlp": 0.01040507, "balance_loss_clip": 1.01592648, "balance_loss_mlp": 1.01627743, "epoch": 0.7381932962573275, "flos": 19753542921600.0, "grad_norm": 1.8253454143029726, "language_loss": 0.80283105, "learning_rate": 6.768183743687338e-07, "loss": 0.82374185, "num_input_tokens_seen": 264816795, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.34179688, "step": 12278, "time_per_iteration": 2.368837833404541 }, { "auxiliary_loss_clip": 0.01051177, "auxiliary_loss_mlp": 0.01035587, "balance_loss_clip": 1.01253223, "balance_loss_mlp": 1.01540327, "epoch": 0.7382534195099955, "flos": 17304525624960.0, "grad_norm": 2.3104646252812366, "language_loss": 0.72945493, "learning_rate": 6.765263557540921e-07, "loss": 0.75032258, "num_input_tokens_seen": 264834105, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35742188, "step": 12279, "time_per_iteration": 2.3451077938079834 }, { "auxiliary_loss_clip": 0.01052557, "auxiliary_loss_mlp": 0.01035197, "balance_loss_clip": 1.00994825, "balance_loss_mlp": 1.01505566, "epoch": 0.7383135427626635, "flos": 18696372819840.0, "grad_norm": 2.0846026821050065, "language_loss": 0.8711642, "learning_rate": 6.762343873257034e-07, "loss": 0.89204174, "num_input_tokens_seen": 264850895, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.375, "step": 12280, "time_per_iteration": 3.6808841228485107 }, { "auxiliary_loss_clip": 0.01053169, "auxiliary_loss_mlp": 0.01032129, "balance_loss_clip": 1.00904989, "balance_loss_mlp": 1.01703107, "epoch": 0.7383736660153314, "flos": 20880399830400.0, "grad_norm": 1.841172642786682, "language_loss": 0.73714989, "learning_rate": 6.759424690946408e-07, "loss": 0.75800288, "num_input_tokens_seen": 264869505, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36132812, "step": 12281, "time_per_iteration": 3.769367218017578 }, { "auxiliary_loss_clip": 0.01052602, "auxiliary_loss_mlp": 0.01038349, "balance_loss_clip": 1.0145911, "balance_loss_mlp": 1.01557302, "epoch": 0.7384337892679994, "flos": 20662297367040.0, "grad_norm": 2.307664295285107, "language_loss": 0.61993289, "learning_rate": 6.756506010719711e-07, "loss": 0.64084238, "num_input_tokens_seen": 264886915, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.37109375, "step": 12282, "time_per_iteration": 2.3468971252441406 }, { "auxiliary_loss_clip": 0.01052085, "auxiliary_loss_mlp": 0.01038516, "balance_loss_clip": 1.01375651, "balance_loss_mlp": 1.01537371, "epoch": 0.7384939125206673, "flos": 29168326483200.0, "grad_norm": 1.7642854426265835, "language_loss": 0.68837154, "learning_rate": 6.753587832687632e-07, "loss": 0.70927763, "num_input_tokens_seen": 264910350, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3671875, "step": 12283, "time_per_iteration": 2.440141439437866 }, { "auxiliary_loss_clip": 0.01052263, "auxiliary_loss_mlp": 0.010341, "balance_loss_clip": 1.01198626, "balance_loss_mlp": 1.0161078, "epoch": 0.7385540357733353, "flos": 36311556522240.0, "grad_norm": 1.8412387816742841, "language_loss": 0.77219319, "learning_rate": 6.750670156960832e-07, "loss": 0.79305673, "num_input_tokens_seen": 264930705, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.36132812, "step": 12284, "time_per_iteration": 2.476750612258911 }, { "auxiliary_loss_clip": 0.01052135, "auxiliary_loss_mlp": 0.0103892, "balance_loss_clip": 1.01284885, "balance_loss_mlp": 1.01494431, "epoch": 0.7386141590260034, "flos": 20301574533120.0, "grad_norm": 1.789793724573719, "language_loss": 0.70329249, "learning_rate": 6.747752983649954e-07, "loss": 0.72420299, "num_input_tokens_seen": 264946975, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37109375, "step": 12285, "time_per_iteration": 2.3714308738708496 }, { "auxiliary_loss_clip": 0.01055063, "auxiliary_loss_mlp": 0.01041142, "balance_loss_clip": 1.01436806, "balance_loss_mlp": 1.01627183, "epoch": 0.7386742822786713, "flos": 25482615540480.0, "grad_norm": 2.0140333893314932, "language_loss": 0.81012303, "learning_rate": 6.744836312865602e-07, "loss": 0.83108509, "num_input_tokens_seen": 264967665, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.38867188, "step": 12286, "time_per_iteration": 2.388735771179199 }, { "auxiliary_loss_clip": 0.01049212, "auxiliary_loss_mlp": 0.01034482, "balance_loss_clip": 1.01242828, "balance_loss_mlp": 1.01443684, "epoch": 0.7387344055313393, "flos": 13771105499520.0, "grad_norm": 15.410354649919997, "language_loss": 0.66510284, "learning_rate": 6.741920144718396e-07, "loss": 0.68593979, "num_input_tokens_seen": 264985480, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34960938, "step": 12287, "time_per_iteration": 2.3568530082702637 }, { "auxiliary_loss_clip": 0.01050315, "auxiliary_loss_mlp": 0.01037401, "balance_loss_clip": 1.01549077, "balance_loss_mlp": 1.0150317, "epoch": 0.7387945287840072, "flos": 27853951328640.0, "grad_norm": 1.87496616039231, "language_loss": 0.77523607, "learning_rate": 6.739004479318903e-07, "loss": 0.79611325, "num_input_tokens_seen": 265004790, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.3515625, "step": 12288, "time_per_iteration": 2.404808521270752 }, { "auxiliary_loss_clip": 0.01054428, "auxiliary_loss_mlp": 0.01040133, "balance_loss_clip": 1.01373982, "balance_loss_mlp": 1.01670194, "epoch": 0.7388546520366752, "flos": 44231463866880.0, "grad_norm": 2.1319256775021898, "language_loss": 0.59171951, "learning_rate": 6.736089316777684e-07, "loss": 0.61266512, "num_input_tokens_seen": 265028790, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.37695312, "step": 12289, "time_per_iteration": 2.6246416568756104 }, { "auxiliary_loss_clip": 0.01008483, "auxiliary_loss_mlp": 0.01006017, "balance_loss_clip": 1.00348938, "balance_loss_mlp": 1.00160456, "epoch": 0.7389147752893431, "flos": 70677681465600.0, "grad_norm": 0.6540303527809055, "language_loss": 0.49382591, "learning_rate": 6.733174657205287e-07, "loss": 0.51397085, "num_input_tokens_seen": 265096660, "router_z_loss_clip": 0.02526855, "router_z_loss_mlp": 0.06884766, "step": 12290, "time_per_iteration": 3.1272263526916504 }, { "auxiliary_loss_clip": 0.01055362, "auxiliary_loss_mlp": 0.01043928, "balance_loss_clip": 1.01800036, "balance_loss_mlp": 1.01712, "epoch": 0.7389748985420111, "flos": 25993778889600.0, "grad_norm": 1.99510775178788, "language_loss": 0.69124138, "learning_rate": 6.730260500712237e-07, "loss": 0.71223438, "num_input_tokens_seen": 265116375, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3828125, "step": 12291, "time_per_iteration": 2.4210011959075928 }, { "auxiliary_loss_clip": 0.0100797, "auxiliary_loss_mlp": 0.01002561, "balance_loss_clip": 1.00012887, "balance_loss_mlp": 1.00096941, "epoch": 0.7390350217946791, "flos": 54401033445120.0, "grad_norm": 1.001495113853988, "language_loss": 0.61013007, "learning_rate": 6.727346847409052e-07, "loss": 0.63023537, "num_input_tokens_seen": 265161230, "router_z_loss_clip": 0.02429199, "router_z_loss_mlp": 0.0703125, "step": 12292, "time_per_iteration": 2.676886558532715 }, { "auxiliary_loss_clip": 0.01054409, "auxiliary_loss_mlp": 0.01044939, "balance_loss_clip": 1.02224183, "balance_loss_mlp": 1.01779127, "epoch": 0.7390951450473471, "flos": 32195610495360.0, "grad_norm": 1.7224096610705013, "language_loss": 0.68018168, "learning_rate": 6.724433697406191e-07, "loss": 0.70117509, "num_input_tokens_seen": 265182515, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.3671875, "step": 12293, "time_per_iteration": 3.996119260787964 }, { "auxiliary_loss_clip": 0.010519, "auxiliary_loss_mlp": 0.01036074, "balance_loss_clip": 1.01261401, "balance_loss_mlp": 1.01551878, "epoch": 0.739155268300015, "flos": 16683490627200.0, "grad_norm": 1.8591678506051779, "language_loss": 0.83884001, "learning_rate": 6.721521050814134e-07, "loss": 0.85971975, "num_input_tokens_seen": 265198160, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 12294, "time_per_iteration": 2.359832286834717 }, { "auxiliary_loss_clip": 0.01050716, "auxiliary_loss_mlp": 0.0103674, "balance_loss_clip": 1.01296997, "balance_loss_mlp": 1.01565206, "epoch": 0.739215391552683, "flos": 31648416756480.0, "grad_norm": 1.5909578658695285, "language_loss": 0.73809862, "learning_rate": 6.718608907743337e-07, "loss": 0.75897318, "num_input_tokens_seen": 265218480, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3515625, "step": 12295, "time_per_iteration": 2.4244799613952637 }, { "auxiliary_loss_clip": 0.01049327, "auxiliary_loss_mlp": 0.01042903, "balance_loss_clip": 1.0200386, "balance_loss_mlp": 1.01534653, "epoch": 0.7392755148053509, "flos": 29717161056000.0, "grad_norm": 1.7256630041040149, "language_loss": 0.79541993, "learning_rate": 6.715697268304215e-07, "loss": 0.81634223, "num_input_tokens_seen": 265240165, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.33984375, "step": 12296, "time_per_iteration": 2.4644765853881836 }, { "auxiliary_loss_clip": 0.01052146, "auxiliary_loss_mlp": 0.01038712, "balance_loss_clip": 1.0137732, "balance_loss_mlp": 1.0161767, "epoch": 0.7393356380580189, "flos": 37048956681600.0, "grad_norm": 1.912791715558873, "language_loss": 0.68081999, "learning_rate": 6.712786132607182e-07, "loss": 0.70172858, "num_input_tokens_seen": 265263295, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.359375, "step": 12297, "time_per_iteration": 2.49284291267395 }, { "auxiliary_loss_clip": 0.01053755, "auxiliary_loss_mlp": 0.0104484, "balance_loss_clip": 1.02062821, "balance_loss_mlp": 1.01684117, "epoch": 0.739395761310687, "flos": 19718594784000.0, "grad_norm": 2.3157523829520894, "language_loss": 0.70384437, "learning_rate": 6.709875500762645e-07, "loss": 0.72483033, "num_input_tokens_seen": 265282740, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36914062, "step": 12298, "time_per_iteration": 2.373307704925537 }, { "auxiliary_loss_clip": 0.01053234, "auxiliary_loss_mlp": 0.01044694, "balance_loss_clip": 1.01967192, "balance_loss_mlp": 1.01562095, "epoch": 0.7394558845633549, "flos": 11800712298240.0, "grad_norm": 2.2957259574104576, "language_loss": 0.75430846, "learning_rate": 6.706965372880946e-07, "loss": 0.77528769, "num_input_tokens_seen": 265300175, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37695312, "step": 12299, "time_per_iteration": 2.3240997791290283 }, { "auxiliary_loss_clip": 0.01008379, "auxiliary_loss_mlp": 0.0100315, "balance_loss_clip": 1.00063455, "balance_loss_mlp": 1.00135958, "epoch": 0.7395160078160229, "flos": 66192668789760.0, "grad_norm": 0.7317688276372185, "language_loss": 0.60955781, "learning_rate": 6.704055749072455e-07, "loss": 0.62967312, "num_input_tokens_seen": 265363275, "router_z_loss_clip": 0.02514648, "router_z_loss_mlp": 0.0703125, "step": 12300, "time_per_iteration": 3.0516090393066406 }, { "auxiliary_loss_clip": 0.01051528, "auxiliary_loss_mlp": 0.01035547, "balance_loss_clip": 1.0132668, "balance_loss_mlp": 1.01625812, "epoch": 0.7395761310686908, "flos": 21248698429440.0, "grad_norm": 1.8174176058332792, "language_loss": 0.81591374, "learning_rate": 6.7011466294475e-07, "loss": 0.83678448, "num_input_tokens_seen": 265382935, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35351562, "step": 12301, "time_per_iteration": 2.3594186305999756 }, { "auxiliary_loss_clip": 0.0105123, "auxiliary_loss_mlp": 0.01038521, "balance_loss_clip": 1.01568079, "balance_loss_mlp": 1.0153507, "epoch": 0.7396362543213588, "flos": 25954187541120.0, "grad_norm": 1.5192143132850227, "language_loss": 0.73587525, "learning_rate": 6.698238014116406e-07, "loss": 0.75677276, "num_input_tokens_seen": 265403245, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35742188, "step": 12302, "time_per_iteration": 2.4284322261810303 }, { "auxiliary_loss_clip": 0.01052929, "auxiliary_loss_mlp": 0.01043835, "balance_loss_clip": 1.01909947, "balance_loss_mlp": 1.01598692, "epoch": 0.7396963775740267, "flos": 27376793510400.0, "grad_norm": 2.044110895928742, "language_loss": 0.750166, "learning_rate": 6.695329903189451e-07, "loss": 0.7711336, "num_input_tokens_seen": 265423105, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.36914062, "step": 12303, "time_per_iteration": 2.429917097091675 }, { "auxiliary_loss_clip": 0.01049633, "auxiliary_loss_mlp": 0.01038106, "balance_loss_clip": 1.01661229, "balance_loss_mlp": 1.0155108, "epoch": 0.7397565008266948, "flos": 25518960132480.0, "grad_norm": 1.8295115371562056, "language_loss": 0.55461991, "learning_rate": 6.692422296776927e-07, "loss": 0.57549727, "num_input_tokens_seen": 265443445, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34179688, "step": 12304, "time_per_iteration": 2.418330192565918 }, { "auxiliary_loss_clip": 0.01052803, "auxiliary_loss_mlp": 0.0103775, "balance_loss_clip": 1.01468277, "balance_loss_mlp": 1.01708984, "epoch": 0.7398166240793627, "flos": 23726763843840.0, "grad_norm": 2.0069684368794642, "language_loss": 0.8541643, "learning_rate": 6.689515194989084e-07, "loss": 0.87506986, "num_input_tokens_seen": 265462085, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35742188, "step": 12305, "time_per_iteration": 2.403287887573242 }, { "auxiliary_loss_clip": 0.01007899, "auxiliary_loss_mlp": 0.01003392, "balance_loss_clip": 1.00099635, "balance_loss_mlp": 1.00080538, "epoch": 0.7398767473320307, "flos": 67264012903680.0, "grad_norm": 0.874389388802186, "language_loss": 0.57693034, "learning_rate": 6.68660859793615e-07, "loss": 0.59704322, "num_input_tokens_seen": 265521190, "router_z_loss_clip": 0.02392578, "router_z_loss_mlp": 0.07128906, "step": 12306, "time_per_iteration": 3.011460304260254 }, { "auxiliary_loss_clip": 0.01054364, "auxiliary_loss_mlp": 0.01040944, "balance_loss_clip": 1.01525426, "balance_loss_mlp": 1.01724637, "epoch": 0.7399368705846986, "flos": 22017590501760.0, "grad_norm": 2.9309154700949698, "language_loss": 0.83004665, "learning_rate": 6.683702505728355e-07, "loss": 0.85099977, "num_input_tokens_seen": 265539705, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37109375, "step": 12307, "time_per_iteration": 2.38409423828125 }, { "auxiliary_loss_clip": 0.01051864, "auxiliary_loss_mlp": 0.01036119, "balance_loss_clip": 1.01397038, "balance_loss_mlp": 1.01662016, "epoch": 0.7399969938373666, "flos": 14172990693120.0, "grad_norm": 1.7732504270805958, "language_loss": 0.70882916, "learning_rate": 6.680796918475893e-07, "loss": 0.72970891, "num_input_tokens_seen": 265555855, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.3515625, "step": 12308, "time_per_iteration": 2.3597514629364014 }, { "auxiliary_loss_clip": 0.01050544, "auxiliary_loss_mlp": 0.01034509, "balance_loss_clip": 1.01283658, "balance_loss_mlp": 1.01556814, "epoch": 0.7400571170900345, "flos": 25300299087360.0, "grad_norm": 1.864283248202594, "language_loss": 0.83615649, "learning_rate": 6.67789183628896e-07, "loss": 0.85700703, "num_input_tokens_seen": 265575455, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34960938, "step": 12309, "time_per_iteration": 2.407780170440674 }, { "auxiliary_loss_clip": 0.01054041, "auxiliary_loss_mlp": 0.01040758, "balance_loss_clip": 1.01547384, "balance_loss_mlp": 1.01669812, "epoch": 0.7401172403427025, "flos": 22710232431360.0, "grad_norm": 1.746097291855091, "language_loss": 0.74041682, "learning_rate": 6.674987259277692e-07, "loss": 0.76136482, "num_input_tokens_seen": 265595250, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.375, "step": 12310, "time_per_iteration": 2.3972346782684326 }, { "auxiliary_loss_clip": 0.01053, "auxiliary_loss_mlp": 0.01043692, "balance_loss_clip": 1.01926589, "balance_loss_mlp": 1.01613128, "epoch": 0.7401773635953706, "flos": 18066749627520.0, "grad_norm": 2.6048512246936264, "language_loss": 0.89639866, "learning_rate": 6.672083187552239e-07, "loss": 0.91736567, "num_input_tokens_seen": 265606945, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36914062, "step": 12311, "time_per_iteration": 2.3199305534362793 }, { "auxiliary_loss_clip": 0.01051793, "auxiliary_loss_mlp": 0.0103585, "balance_loss_clip": 1.01293778, "balance_loss_mlp": 1.0154115, "epoch": 0.7402374868480385, "flos": 22711000481280.0, "grad_norm": 1.5695396751609656, "language_loss": 0.80713809, "learning_rate": 6.669179621222738e-07, "loss": 0.82801449, "num_input_tokens_seen": 265626115, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36328125, "step": 12312, "time_per_iteration": 2.4128103256225586 }, { "auxiliary_loss_clip": 0.01052261, "auxiliary_loss_mlp": 0.01035085, "balance_loss_clip": 1.01243556, "balance_loss_mlp": 1.01665664, "epoch": 0.7402976101007065, "flos": 22855575888000.0, "grad_norm": 1.7555384579414623, "language_loss": 0.79099536, "learning_rate": 6.666276560399273e-07, "loss": 0.81186879, "num_input_tokens_seen": 265646520, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35546875, "step": 12313, "time_per_iteration": 2.484036445617676 }, { "auxiliary_loss_clip": 0.01053749, "auxiliary_loss_mlp": 0.01039144, "balance_loss_clip": 1.01365685, "balance_loss_mlp": 1.01592231, "epoch": 0.7403577333533744, "flos": 12345078216960.0, "grad_norm": 1.9867782372546265, "language_loss": 0.79809225, "learning_rate": 6.663374005191937e-07, "loss": 0.81902117, "num_input_tokens_seen": 265661875, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37890625, "step": 12314, "time_per_iteration": 2.394526720046997 }, { "auxiliary_loss_clip": 0.01007726, "auxiliary_loss_mlp": 0.01005248, "balance_loss_clip": 1.00266135, "balance_loss_mlp": 1.00075984, "epoch": 0.7404178566060424, "flos": 60324117822720.0, "grad_norm": 0.8391677859923466, "language_loss": 0.55187953, "learning_rate": 6.660471955710809e-07, "loss": 0.57200927, "num_input_tokens_seen": 265721255, "router_z_loss_clip": 0.02587891, "router_z_loss_mlp": 0.06982422, "step": 12315, "time_per_iteration": 2.9966282844543457 }, { "auxiliary_loss_clip": 0.01050212, "auxiliary_loss_mlp": 0.01036217, "balance_loss_clip": 1.01479506, "balance_loss_mlp": 1.01582122, "epoch": 0.7404779798587103, "flos": 32013294042240.0, "grad_norm": 2.0759418043614306, "language_loss": 0.80621094, "learning_rate": 6.65757041206591e-07, "loss": 0.82707524, "num_input_tokens_seen": 265743970, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34375, "step": 12316, "time_per_iteration": 3.7717604637145996 }, { "auxiliary_loss_clip": 0.0105265, "auxiliary_loss_mlp": 0.01040142, "balance_loss_clip": 1.01564503, "balance_loss_mlp": 1.0160023, "epoch": 0.7405381031113784, "flos": 12889060110720.0, "grad_norm": 2.244258140439781, "language_loss": 0.76507872, "learning_rate": 6.654669374367275e-07, "loss": 0.78600669, "num_input_tokens_seen": 265760890, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.3671875, "step": 12317, "time_per_iteration": 2.409691095352173 }, { "auxiliary_loss_clip": 0.01049623, "auxiliary_loss_mlp": 0.0103611, "balance_loss_clip": 1.01383007, "balance_loss_mlp": 1.01574945, "epoch": 0.7405982263640463, "flos": 20228117299200.0, "grad_norm": 1.6193641654479036, "language_loss": 0.82642639, "learning_rate": 6.651768842724917e-07, "loss": 0.84728372, "num_input_tokens_seen": 265779600, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.33789062, "step": 12318, "time_per_iteration": 2.3854904174804688 }, { "auxiliary_loss_clip": 0.01052947, "auxiliary_loss_mlp": 0.01037862, "balance_loss_clip": 1.0136863, "balance_loss_mlp": 1.0161891, "epoch": 0.7406583496167143, "flos": 17566234243200.0, "grad_norm": 1.8807667297716892, "language_loss": 0.78144628, "learning_rate": 6.648868817248827e-07, "loss": 0.80235434, "num_input_tokens_seen": 265797030, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3671875, "step": 12319, "time_per_iteration": 2.362156629562378 }, { "auxiliary_loss_clip": 0.01051614, "auxiliary_loss_mlp": 0.01035623, "balance_loss_clip": 1.01404572, "balance_loss_mlp": 1.01597714, "epoch": 0.7407184728693822, "flos": 18294766917120.0, "grad_norm": 1.9659102506674795, "language_loss": 0.64723104, "learning_rate": 6.64596929804897e-07, "loss": 0.66810346, "num_input_tokens_seen": 265815055, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.35546875, "step": 12320, "time_per_iteration": 3.839484691619873 }, { "auxiliary_loss_clip": 0.01054513, "auxiliary_loss_mlp": 0.01045054, "balance_loss_clip": 1.0197463, "balance_loss_mlp": 1.01675534, "epoch": 0.7407785961220502, "flos": 16689635026560.0, "grad_norm": 2.7061522583031845, "language_loss": 0.84959066, "learning_rate": 6.643070285235288e-07, "loss": 0.8705864, "num_input_tokens_seen": 265828480, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37695312, "step": 12321, "time_per_iteration": 3.7310774326324463 }, { "auxiliary_loss_clip": 0.01057025, "auxiliary_loss_mlp": 0.0104908, "balance_loss_clip": 1.02052963, "balance_loss_mlp": 1.01699233, "epoch": 0.7408387193747181, "flos": 22087312220160.0, "grad_norm": 2.23583245639914, "language_loss": 0.73305702, "learning_rate": 6.640171778917727e-07, "loss": 0.75411808, "num_input_tokens_seen": 265845825, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.40039062, "step": 12322, "time_per_iteration": 2.3885817527770996 }, { "auxiliary_loss_clip": 0.01053062, "auxiliary_loss_mlp": 0.01036605, "balance_loss_clip": 1.01217914, "balance_loss_mlp": 1.01663494, "epoch": 0.7408988426273861, "flos": 24235762688640.0, "grad_norm": 1.6102347956366845, "language_loss": 0.65126592, "learning_rate": 6.637273779206183e-07, "loss": 0.67216259, "num_input_tokens_seen": 265866335, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36328125, "step": 12323, "time_per_iteration": 2.4394478797912598 }, { "auxiliary_loss_clip": 0.01053943, "auxiliary_loss_mlp": 0.01037292, "balance_loss_clip": 1.01263928, "balance_loss_mlp": 1.01651001, "epoch": 0.7409589658800542, "flos": 29021726217600.0, "grad_norm": 1.435176551450173, "language_loss": 0.76843822, "learning_rate": 6.634376286210559e-07, "loss": 0.78935057, "num_input_tokens_seen": 265888945, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.375, "step": 12324, "time_per_iteration": 2.4576776027679443 }, { "auxiliary_loss_clip": 0.01053354, "auxiliary_loss_mlp": 0.01035258, "balance_loss_clip": 1.01157069, "balance_loss_mlp": 1.0161556, "epoch": 0.7410190891327221, "flos": 19350435830400.0, "grad_norm": 1.5909859358250389, "language_loss": 0.75416207, "learning_rate": 6.63147930004073e-07, "loss": 0.77504814, "num_input_tokens_seen": 265908030, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.37109375, "step": 12325, "time_per_iteration": 2.3958988189697266 }, { "auxiliary_loss_clip": 0.01055674, "auxiliary_loss_mlp": 0.01039762, "balance_loss_clip": 1.01528871, "balance_loss_mlp": 1.01669312, "epoch": 0.7410792123853901, "flos": 22746542112000.0, "grad_norm": 2.843428424858071, "language_loss": 0.69652534, "learning_rate": 6.628582820806545e-07, "loss": 0.71747971, "num_input_tokens_seen": 265927030, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.390625, "step": 12326, "time_per_iteration": 2.364818811416626 }, { "auxiliary_loss_clip": 0.01054225, "auxiliary_loss_mlp": 0.01038161, "balance_loss_clip": 1.01429486, "balance_loss_mlp": 1.01707518, "epoch": 0.741139335638058, "flos": 25371312526080.0, "grad_norm": 1.8991485734240783, "language_loss": 0.90676212, "learning_rate": 6.625686848617835e-07, "loss": 0.92768598, "num_input_tokens_seen": 265945490, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37109375, "step": 12327, "time_per_iteration": 2.4255197048187256 }, { "auxiliary_loss_clip": 0.01053734, "auxiliary_loss_mlp": 0.01041922, "balance_loss_clip": 1.01676869, "balance_loss_mlp": 1.0164516, "epoch": 0.741199458890726, "flos": 18584720691840.0, "grad_norm": 7.852205349883515, "language_loss": 0.86401272, "learning_rate": 6.62279138358442e-07, "loss": 0.88496923, "num_input_tokens_seen": 265963265, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37304688, "step": 12328, "time_per_iteration": 2.337928056716919 }, { "auxiliary_loss_clip": 0.01050957, "auxiliary_loss_mlp": 0.01039749, "balance_loss_clip": 1.01386893, "balance_loss_mlp": 1.01526654, "epoch": 0.7412595821433939, "flos": 22125995873280.0, "grad_norm": 1.7806623842358433, "language_loss": 0.68253714, "learning_rate": 6.619896425816103e-07, "loss": 0.70344424, "num_input_tokens_seen": 265982270, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.35546875, "step": 12329, "time_per_iteration": 2.3928465843200684 }, { "auxiliary_loss_clip": 0.01056745, "auxiliary_loss_mlp": 0.01039983, "balance_loss_clip": 1.01351857, "balance_loss_mlp": 1.017874, "epoch": 0.741319705396062, "flos": 29168396305920.0, "grad_norm": 1.6191955276238443, "language_loss": 0.67933249, "learning_rate": 6.617001975422647e-07, "loss": 0.70029974, "num_input_tokens_seen": 266003835, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.38867188, "step": 12330, "time_per_iteration": 2.442047357559204 }, { "auxiliary_loss_clip": 0.01057387, "auxiliary_loss_mlp": 0.01039675, "balance_loss_clip": 1.01301932, "balance_loss_mlp": 1.01814318, "epoch": 0.7413798286487299, "flos": 20666451818880.0, "grad_norm": 3.261086818843126, "language_loss": 0.86500913, "learning_rate": 6.614108032513823e-07, "loss": 0.88597971, "num_input_tokens_seen": 266021595, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39257812, "step": 12331, "time_per_iteration": 2.4118335247039795 }, { "auxiliary_loss_clip": 0.01052483, "auxiliary_loss_mlp": 0.01037362, "balance_loss_clip": 1.01367521, "balance_loss_mlp": 1.01587069, "epoch": 0.7414399519013979, "flos": 16397970595200.0, "grad_norm": 2.088278761056931, "language_loss": 0.71522546, "learning_rate": 6.611214597199364e-07, "loss": 0.73612392, "num_input_tokens_seen": 266039860, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3671875, "step": 12332, "time_per_iteration": 2.3334925174713135 }, { "auxiliary_loss_clip": 0.01053865, "auxiliary_loss_mlp": 0.01036853, "balance_loss_clip": 1.01240301, "balance_loss_mlp": 1.01711249, "epoch": 0.7415000751540658, "flos": 25629041249280.0, "grad_norm": 2.0358766828881176, "language_loss": 0.64458454, "learning_rate": 6.608321669588984e-07, "loss": 0.66549176, "num_input_tokens_seen": 266058050, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.3671875, "step": 12333, "time_per_iteration": 3.945812463760376 }, { "auxiliary_loss_clip": 0.0105335, "auxiliary_loss_mlp": 0.01041187, "balance_loss_clip": 1.01708293, "balance_loss_mlp": 1.01814353, "epoch": 0.7415601984067338, "flos": 24498553559040.0, "grad_norm": 1.5920177376131113, "language_loss": 0.72199893, "learning_rate": 6.605429249792387e-07, "loss": 0.7429443, "num_input_tokens_seen": 266078060, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3515625, "step": 12334, "time_per_iteration": 2.395211935043335 }, { "auxiliary_loss_clip": 0.01051372, "auxiliary_loss_mlp": 0.01036347, "balance_loss_clip": 1.01298189, "balance_loss_mlp": 1.01556945, "epoch": 0.7416203216594017, "flos": 20886090382080.0, "grad_norm": 1.8313194478437247, "language_loss": 0.83589768, "learning_rate": 6.602537337919257e-07, "loss": 0.85677493, "num_input_tokens_seen": 266097110, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.35742188, "step": 12335, "time_per_iteration": 2.3779118061065674 }, { "auxiliary_loss_clip": 0.01053667, "auxiliary_loss_mlp": 0.01037644, "balance_loss_clip": 1.01147771, "balance_loss_mlp": 1.01613462, "epoch": 0.7416804449120697, "flos": 15623597439360.0, "grad_norm": 2.894733825010716, "language_loss": 0.75575751, "learning_rate": 6.599645934079259e-07, "loss": 0.77667058, "num_input_tokens_seen": 266110870, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.375, "step": 12336, "time_per_iteration": 2.34045147895813 }, { "auxiliary_loss_clip": 0.01053508, "auxiliary_loss_mlp": 0.01041129, "balance_loss_clip": 1.01623774, "balance_loss_mlp": 1.01658726, "epoch": 0.7417405681647377, "flos": 17119765376640.0, "grad_norm": 1.8622308305333641, "language_loss": 0.74998736, "learning_rate": 6.596755038382029e-07, "loss": 0.77093369, "num_input_tokens_seen": 266127845, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.36914062, "step": 12337, "time_per_iteration": 2.4045145511627197 }, { "auxiliary_loss_clip": 0.01051149, "auxiliary_loss_mlp": 0.01038349, "balance_loss_clip": 1.01515102, "balance_loss_mlp": 1.01655173, "epoch": 0.7418006914174057, "flos": 18879317677440.0, "grad_norm": 1.9426077341530699, "language_loss": 0.77716017, "learning_rate": 6.593864650937186e-07, "loss": 0.79805517, "num_input_tokens_seen": 266145400, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.34570312, "step": 12338, "time_per_iteration": 2.344142198562622 }, { "auxiliary_loss_clip": 0.01051226, "auxiliary_loss_mlp": 0.01031777, "balance_loss_clip": 1.0096643, "balance_loss_mlp": 1.01589561, "epoch": 0.7418608146700737, "flos": 21579640007040.0, "grad_norm": 1.6306689457415364, "language_loss": 0.73552251, "learning_rate": 6.590974771854345e-07, "loss": 0.75635254, "num_input_tokens_seen": 266164430, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35351562, "step": 12339, "time_per_iteration": 2.414616823196411 }, { "auxiliary_loss_clip": 0.01052693, "auxiliary_loss_mlp": 0.01038253, "balance_loss_clip": 1.0143044, "balance_loss_mlp": 1.01599729, "epoch": 0.7419209379227416, "flos": 22339524948480.0, "grad_norm": 1.6448661067173334, "language_loss": 0.81125784, "learning_rate": 6.588085401243077e-07, "loss": 0.83216739, "num_input_tokens_seen": 266183855, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.3671875, "step": 12340, "time_per_iteration": 2.36594820022583 }, { "auxiliary_loss_clip": 0.01052852, "auxiliary_loss_mlp": 0.01041282, "balance_loss_clip": 1.01682019, "balance_loss_mlp": 1.01627958, "epoch": 0.7419810611754096, "flos": 16761381603840.0, "grad_norm": 1.4595827395578265, "language_loss": 0.7631948, "learning_rate": 6.585196539212958e-07, "loss": 0.78413618, "num_input_tokens_seen": 266202085, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36523438, "step": 12341, "time_per_iteration": 2.376251459121704 }, { "auxiliary_loss_clip": 0.01049735, "auxiliary_loss_mlp": 0.01036077, "balance_loss_clip": 1.01498854, "balance_loss_mlp": 1.01614559, "epoch": 0.7420411844280775, "flos": 26211776618880.0, "grad_norm": 1.4601450328008798, "language_loss": 0.80502105, "learning_rate": 6.582308185873535e-07, "loss": 0.82587922, "num_input_tokens_seen": 266223445, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.3359375, "step": 12342, "time_per_iteration": 2.4010541439056396 }, { "auxiliary_loss_clip": 0.01051378, "auxiliary_loss_mlp": 0.01035298, "balance_loss_clip": 1.01205206, "balance_loss_mlp": 1.01577425, "epoch": 0.7421013076807456, "flos": 68527208004480.0, "grad_norm": 1.6391546240381698, "language_loss": 0.78109562, "learning_rate": 6.57942034133433e-07, "loss": 0.80196238, "num_input_tokens_seen": 266246575, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35546875, "step": 12343, "time_per_iteration": 2.8015100955963135 }, { "auxiliary_loss_clip": 0.01050555, "auxiliary_loss_mlp": 0.01039737, "balance_loss_clip": 1.01581192, "balance_loss_mlp": 1.01475775, "epoch": 0.7421614309334135, "flos": 24424188629760.0, "grad_norm": 1.7234761750976315, "language_loss": 0.68616974, "learning_rate": 6.576533005704843e-07, "loss": 0.70707273, "num_input_tokens_seen": 266266055, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.35742188, "step": 12344, "time_per_iteration": 2.422603130340576 }, { "auxiliary_loss_clip": 0.01052777, "auxiliary_loss_mlp": 0.01038507, "balance_loss_clip": 1.01390195, "balance_loss_mlp": 1.0156734, "epoch": 0.7422215541860815, "flos": 12310304636160.0, "grad_norm": 2.1599223949429165, "language_loss": 0.82397503, "learning_rate": 6.573646179094572e-07, "loss": 0.84488791, "num_input_tokens_seen": 266282240, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37109375, "step": 12345, "time_per_iteration": 2.35941743850708 }, { "auxiliary_loss_clip": 0.01053611, "auxiliary_loss_mlp": 0.01038864, "balance_loss_clip": 1.01507032, "balance_loss_mlp": 1.01726913, "epoch": 0.7422816774387494, "flos": 19644578968320.0, "grad_norm": 2.5209104723034925, "language_loss": 0.72414982, "learning_rate": 6.570759861612988e-07, "loss": 0.74507457, "num_input_tokens_seen": 266300980, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36328125, "step": 12346, "time_per_iteration": 2.3424322605133057 }, { "auxiliary_loss_clip": 0.01055033, "auxiliary_loss_mlp": 0.01036073, "balance_loss_clip": 1.01387596, "balance_loss_mlp": 1.01902378, "epoch": 0.7423418006914174, "flos": 32014585762560.0, "grad_norm": 1.547598575245381, "language_loss": 0.74559426, "learning_rate": 6.56787405336953e-07, "loss": 0.7665053, "num_input_tokens_seen": 266322215, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.359375, "step": 12347, "time_per_iteration": 2.497457981109619 }, { "auxiliary_loss_clip": 0.01053532, "auxiliary_loss_mlp": 0.01039194, "balance_loss_clip": 1.01534033, "balance_loss_mlp": 1.01587367, "epoch": 0.7424019239440853, "flos": 18915941560320.0, "grad_norm": 1.8362913198629662, "language_loss": 0.82561374, "learning_rate": 6.564988754473642e-07, "loss": 0.84654099, "num_input_tokens_seen": 266341600, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.375, "step": 12348, "time_per_iteration": 2.343688726425171 }, { "auxiliary_loss_clip": 0.01051937, "auxiliary_loss_mlp": 0.01035997, "balance_loss_clip": 1.01344275, "balance_loss_mlp": 1.01619828, "epoch": 0.7424620471967533, "flos": 35875211950080.0, "grad_norm": 1.7085776743143948, "language_loss": 0.7368924, "learning_rate": 6.562103965034724e-07, "loss": 0.75777173, "num_input_tokens_seen": 266362895, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35742188, "step": 12349, "time_per_iteration": 2.5097122192382812 }, { "auxiliary_loss_clip": 0.01055356, "auxiliary_loss_mlp": 0.01047557, "balance_loss_clip": 1.02108073, "balance_loss_mlp": 1.0166595, "epoch": 0.7425221704494213, "flos": 27015372449280.0, "grad_norm": 1.948379118141713, "language_loss": 0.80583537, "learning_rate": 6.559219685162165e-07, "loss": 0.82686448, "num_input_tokens_seen": 266384015, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.38671875, "step": 12350, "time_per_iteration": 2.410724401473999 }, { "auxiliary_loss_clip": 0.01051536, "auxiliary_loss_mlp": 0.01036764, "balance_loss_clip": 1.01341093, "balance_loss_mlp": 1.01574636, "epoch": 0.7425822937020893, "flos": 34165724405760.0, "grad_norm": 1.5760664330347598, "language_loss": 0.76249307, "learning_rate": 6.556335914965343e-07, "loss": 0.78337604, "num_input_tokens_seen": 266405990, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.35742188, "step": 12351, "time_per_iteration": 2.4868342876434326 }, { "auxiliary_loss_clip": 0.01051972, "auxiliary_loss_mlp": 0.01032603, "balance_loss_clip": 1.01048934, "balance_loss_mlp": 1.01624537, "epoch": 0.7426424169547573, "flos": 21282634137600.0, "grad_norm": 2.162353101103212, "language_loss": 0.82299042, "learning_rate": 6.553452654553611e-07, "loss": 0.84383619, "num_input_tokens_seen": 266424260, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35742188, "step": 12352, "time_per_iteration": 2.3509767055511475 }, { "auxiliary_loss_clip": 0.0105224, "auxiliary_loss_mlp": 0.01041663, "balance_loss_clip": 1.01801181, "balance_loss_mlp": 1.01667476, "epoch": 0.7427025402074252, "flos": 22447546295040.0, "grad_norm": 2.021593502043992, "language_loss": 0.7304647, "learning_rate": 6.550569904036307e-07, "loss": 0.75140369, "num_input_tokens_seen": 266444580, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35546875, "step": 12353, "time_per_iteration": 2.423694610595703 }, { "auxiliary_loss_clip": 0.01051617, "auxiliary_loss_mlp": 0.01033741, "balance_loss_clip": 1.01142526, "balance_loss_mlp": 1.01662207, "epoch": 0.7427626634600932, "flos": 22523621880960.0, "grad_norm": 1.784544271221922, "language_loss": 0.73377889, "learning_rate": 6.547687663522739e-07, "loss": 0.75463247, "num_input_tokens_seen": 266465640, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34960938, "step": 12354, "time_per_iteration": 2.3915696144104004 }, { "auxiliary_loss_clip": 0.01008938, "auxiliary_loss_mlp": 0.01003157, "balance_loss_clip": 1.00078499, "balance_loss_mlp": 1.00173426, "epoch": 0.7428227867127611, "flos": 67206512027520.0, "grad_norm": 0.6977736480650875, "language_loss": 0.59606385, "learning_rate": 6.544805933122199e-07, "loss": 0.61618483, "num_input_tokens_seen": 266531950, "router_z_loss_clip": 0.02368164, "router_z_loss_mlp": 0.07226562, "step": 12355, "time_per_iteration": 4.35331130027771 }, { "auxiliary_loss_clip": 0.01052662, "auxiliary_loss_mlp": 0.01038839, "balance_loss_clip": 1.01350665, "balance_loss_mlp": 1.01594234, "epoch": 0.7428829099654292, "flos": 14720324077440.0, "grad_norm": 1.8407729507083679, "language_loss": 0.68433392, "learning_rate": 6.541924712943971e-07, "loss": 0.70524889, "num_input_tokens_seen": 266550665, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3671875, "step": 12356, "time_per_iteration": 2.3380463123321533 }, { "auxiliary_loss_clip": 0.01053442, "auxiliary_loss_mlp": 0.01037111, "balance_loss_clip": 1.01313806, "balance_loss_mlp": 1.01544595, "epoch": 0.7429430332180971, "flos": 48644834699520.0, "grad_norm": 1.789260016500064, "language_loss": 0.73660135, "learning_rate": 6.539044003097301e-07, "loss": 0.75750685, "num_input_tokens_seen": 266572455, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37890625, "step": 12357, "time_per_iteration": 2.6221981048583984 }, { "auxiliary_loss_clip": 0.01051732, "auxiliary_loss_mlp": 0.01030951, "balance_loss_clip": 1.00859928, "balance_loss_mlp": 1.01718307, "epoch": 0.7430031564707651, "flos": 16763127171840.0, "grad_norm": 1.7150353191870682, "language_loss": 0.66033006, "learning_rate": 6.53616380369143e-07, "loss": 0.68115693, "num_input_tokens_seen": 266590895, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34570312, "step": 12358, "time_per_iteration": 2.3928449153900146 }, { "auxiliary_loss_clip": 0.01055453, "auxiliary_loss_mlp": 0.01045143, "balance_loss_clip": 1.01809406, "balance_loss_mlp": 1.01748943, "epoch": 0.743063279723433, "flos": 23869663505280.0, "grad_norm": 2.0155731063254576, "language_loss": 0.81730795, "learning_rate": 6.533284114835591e-07, "loss": 0.83831394, "num_input_tokens_seen": 266607660, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.38085938, "step": 12359, "time_per_iteration": 3.7173728942871094 }, { "auxiliary_loss_clip": 0.01051511, "auxiliary_loss_mlp": 0.01036253, "balance_loss_clip": 1.01255393, "balance_loss_mlp": 1.0149411, "epoch": 0.743123402976101, "flos": 14390848776960.0, "grad_norm": 2.1338565946521832, "language_loss": 0.68960541, "learning_rate": 6.530404936638956e-07, "loss": 0.71048307, "num_input_tokens_seen": 266624260, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36523438, "step": 12360, "time_per_iteration": 2.387791872024536 }, { "auxiliary_loss_clip": 0.01052487, "auxiliary_loss_mlp": 0.01039162, "balance_loss_clip": 1.01466465, "balance_loss_mlp": 1.01624107, "epoch": 0.7431835262287689, "flos": 27453078564480.0, "grad_norm": 1.7053082523738714, "language_loss": 0.73601925, "learning_rate": 6.527526269210715e-07, "loss": 0.75693572, "num_input_tokens_seen": 266644210, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36328125, "step": 12361, "time_per_iteration": 3.785395860671997 }, { "auxiliary_loss_clip": 0.01054436, "auxiliary_loss_mlp": 0.01037802, "balance_loss_clip": 1.01233935, "balance_loss_mlp": 1.01686573, "epoch": 0.743243649481437, "flos": 20958465363840.0, "grad_norm": 1.933892611428438, "language_loss": 0.5717535, "learning_rate": 6.524648112660027e-07, "loss": 0.59267592, "num_input_tokens_seen": 266664230, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.375, "step": 12362, "time_per_iteration": 2.374314785003662 }, { "auxiliary_loss_clip": 0.01053333, "auxiliary_loss_mlp": 0.01038923, "balance_loss_clip": 1.01450896, "balance_loss_mlp": 1.01728785, "epoch": 0.7433037727341049, "flos": 22782083742720.0, "grad_norm": 1.7393540430537877, "language_loss": 0.78477496, "learning_rate": 6.521770467096039e-07, "loss": 0.8056975, "num_input_tokens_seen": 266683270, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.359375, "step": 12363, "time_per_iteration": 2.374274253845215 }, { "auxiliary_loss_clip": 0.01052873, "auxiliary_loss_mlp": 0.01033102, "balance_loss_clip": 1.01088095, "balance_loss_mlp": 1.01643384, "epoch": 0.7433638959867729, "flos": 22195717591680.0, "grad_norm": 1.7666183646023021, "language_loss": 0.78874165, "learning_rate": 6.518893332627862e-07, "loss": 0.80960131, "num_input_tokens_seen": 266701235, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.36328125, "step": 12364, "time_per_iteration": 2.4018077850341797 }, { "auxiliary_loss_clip": 0.01052486, "auxiliary_loss_mlp": 0.01038483, "balance_loss_clip": 1.01589251, "balance_loss_mlp": 1.01605797, "epoch": 0.7434240192394409, "flos": 23295586152960.0, "grad_norm": 2.094247699476492, "language_loss": 0.80100262, "learning_rate": 6.516016709364604e-07, "loss": 0.82191235, "num_input_tokens_seen": 266721495, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.36523438, "step": 12365, "time_per_iteration": 2.390216112136841 }, { "auxiliary_loss_clip": 0.01055327, "auxiliary_loss_mlp": 0.0103441, "balance_loss_clip": 1.00943589, "balance_loss_mlp": 1.01730251, "epoch": 0.7434841424921088, "flos": 54007773223680.0, "grad_norm": 1.8075821772955203, "language_loss": 0.77613378, "learning_rate": 6.513140597415346e-07, "loss": 0.79703116, "num_input_tokens_seen": 266747400, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.37890625, "step": 12366, "time_per_iteration": 2.6761667728424072 }, { "auxiliary_loss_clip": 0.01051862, "auxiliary_loss_mlp": 0.01035556, "balance_loss_clip": 1.01383638, "balance_loss_mlp": 1.01688576, "epoch": 0.7435442657447768, "flos": 21432900096000.0, "grad_norm": 1.3986391827696896, "language_loss": 0.72512901, "learning_rate": 6.510264996889141e-07, "loss": 0.74600327, "num_input_tokens_seen": 266767630, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34960938, "step": 12367, "time_per_iteration": 2.3684988021850586 }, { "auxiliary_loss_clip": 0.01053974, "auxiliary_loss_mlp": 0.0104426, "balance_loss_clip": 1.01876152, "balance_loss_mlp": 1.01660681, "epoch": 0.7436043889974447, "flos": 24498239356800.0, "grad_norm": 1.6033397570480457, "language_loss": 0.75456405, "learning_rate": 6.507389907895038e-07, "loss": 0.77554643, "num_input_tokens_seen": 266788015, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.375, "step": 12368, "time_per_iteration": 2.4226863384246826 }, { "auxiliary_loss_clip": 0.0105073, "auxiliary_loss_mlp": 0.01036072, "balance_loss_clip": 1.01583016, "balance_loss_mlp": 1.01612699, "epoch": 0.7436645122501128, "flos": 40696786932480.0, "grad_norm": 1.7172499318536323, "language_loss": 0.70535076, "learning_rate": 6.50451533054207e-07, "loss": 0.72621882, "num_input_tokens_seen": 266809010, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.34570312, "step": 12369, "time_per_iteration": 2.5139408111572266 }, { "auxiliary_loss_clip": 0.01052781, "auxiliary_loss_mlp": 0.01037646, "balance_loss_clip": 1.01368511, "balance_loss_mlp": 1.01670754, "epoch": 0.7437246355027807, "flos": 18908051592960.0, "grad_norm": 1.8041741136718719, "language_loss": 0.76403213, "learning_rate": 6.501641264939233e-07, "loss": 0.78493643, "num_input_tokens_seen": 266825390, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36132812, "step": 12370, "time_per_iteration": 2.3806397914886475 }, { "auxiliary_loss_clip": 0.01054424, "auxiliary_loss_mlp": 0.01039289, "balance_loss_clip": 1.01536322, "balance_loss_mlp": 1.0176487, "epoch": 0.7437847587554487, "flos": 21542736833280.0, "grad_norm": 1.5294628906323284, "language_loss": 0.79596376, "learning_rate": 6.498767711195503e-07, "loss": 0.81690085, "num_input_tokens_seen": 266844675, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.3671875, "step": 12371, "time_per_iteration": 2.4101314544677734 }, { "auxiliary_loss_clip": 0.01052313, "auxiliary_loss_mlp": 0.01032945, "balance_loss_clip": 1.0095439, "balance_loss_mlp": 1.01601243, "epoch": 0.7438448820081166, "flos": 27781227233280.0, "grad_norm": 1.7877383865794667, "language_loss": 0.70888162, "learning_rate": 6.495894669419857e-07, "loss": 0.72973418, "num_input_tokens_seen": 266865160, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36328125, "step": 12372, "time_per_iteration": 2.420164108276367 }, { "auxiliary_loss_clip": 0.0105048, "auxiliary_loss_mlp": 0.0104069, "balance_loss_clip": 1.01720619, "balance_loss_mlp": 1.01530182, "epoch": 0.7439050052607846, "flos": 17966862627840.0, "grad_norm": 2.6344505033229986, "language_loss": 0.7679888, "learning_rate": 6.493022139721245e-07, "loss": 0.78890049, "num_input_tokens_seen": 266883285, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3515625, "step": 12373, "time_per_iteration": 3.781064510345459 }, { "auxiliary_loss_clip": 0.01054128, "auxiliary_loss_mlp": 0.01037203, "balance_loss_clip": 1.01129866, "balance_loss_mlp": 1.01594639, "epoch": 0.7439651285134525, "flos": 22957801948800.0, "grad_norm": 1.6066675841772189, "language_loss": 0.78264225, "learning_rate": 6.49015012220858e-07, "loss": 0.80355561, "num_input_tokens_seen": 266900960, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.38085938, "step": 12374, "time_per_iteration": 2.3914132118225098 }, { "auxiliary_loss_clip": 0.01053884, "auxiliary_loss_mlp": 0.01036271, "balance_loss_clip": 1.01128483, "balance_loss_mlp": 1.01678681, "epoch": 0.7440252517661206, "flos": 18805790620800.0, "grad_norm": 1.938198050439268, "language_loss": 0.77878362, "learning_rate": 6.487278616990774e-07, "loss": 0.79968524, "num_input_tokens_seen": 266917710, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37109375, "step": 12375, "time_per_iteration": 2.3307554721832275 }, { "auxiliary_loss_clip": 0.01050399, "auxiliary_loss_mlp": 0.01034376, "balance_loss_clip": 1.01272798, "balance_loss_mlp": 1.01540756, "epoch": 0.7440853750187885, "flos": 20265264852480.0, "grad_norm": 1.8641000532155652, "language_loss": 0.78305972, "learning_rate": 6.484407624176733e-07, "loss": 0.80390745, "num_input_tokens_seen": 266934220, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34960938, "step": 12376, "time_per_iteration": 2.3791542053222656 }, { "auxiliary_loss_clip": 0.01054686, "auxiliary_loss_mlp": 0.0103463, "balance_loss_clip": 1.0104897, "balance_loss_mlp": 1.01682377, "epoch": 0.7441454982714565, "flos": 25336120008960.0, "grad_norm": 1.7132908296862734, "language_loss": 0.80991191, "learning_rate": 6.481537143875296e-07, "loss": 0.83080506, "num_input_tokens_seen": 266955210, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37890625, "step": 12377, "time_per_iteration": 2.4048173427581787 }, { "auxiliary_loss_clip": 0.01054369, "auxiliary_loss_mlp": 0.01037865, "balance_loss_clip": 1.01250935, "balance_loss_mlp": 1.0164361, "epoch": 0.7442056215241245, "flos": 64478819191680.0, "grad_norm": 2.09682031153249, "language_loss": 0.68137443, "learning_rate": 6.478667176195322e-07, "loss": 0.70229685, "num_input_tokens_seen": 266976555, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37890625, "step": 12378, "time_per_iteration": 2.7550666332244873 }, { "auxiliary_loss_clip": 0.01055739, "auxiliary_loss_mlp": 0.01039727, "balance_loss_clip": 1.01336944, "balance_loss_mlp": 1.01768243, "epoch": 0.7442657447767924, "flos": 31284028229760.0, "grad_norm": 1.7347902738127314, "language_loss": 0.73457348, "learning_rate": 6.475797721245648e-07, "loss": 0.75552815, "num_input_tokens_seen": 266997640, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.38085938, "step": 12379, "time_per_iteration": 2.436588764190674 }, { "auxiliary_loss_clip": 0.01051544, "auxiliary_loss_mlp": 0.01039991, "balance_loss_clip": 1.01595807, "balance_loss_mlp": 1.01526129, "epoch": 0.7443258680294604, "flos": 20807012419200.0, "grad_norm": 1.702551044152397, "language_loss": 0.67004979, "learning_rate": 6.472928779135085e-07, "loss": 0.69096506, "num_input_tokens_seen": 267016165, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36328125, "step": 12380, "time_per_iteration": 2.374053478240967 }, { "auxiliary_loss_clip": 0.01055112, "auxiliary_loss_mlp": 0.0103937, "balance_loss_clip": 1.01351321, "balance_loss_mlp": 1.01746452, "epoch": 0.7443859912821283, "flos": 22198754880000.0, "grad_norm": 2.603210842350457, "language_loss": 0.79654247, "learning_rate": 6.470060349972411e-07, "loss": 0.81748724, "num_input_tokens_seen": 267034075, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37695312, "step": 12381, "time_per_iteration": 2.382394790649414 }, { "auxiliary_loss_clip": 0.01055683, "auxiliary_loss_mlp": 0.01039152, "balance_loss_clip": 1.01358151, "balance_loss_mlp": 1.01819301, "epoch": 0.7444461145347964, "flos": 22016752629120.0, "grad_norm": 2.1985689671067576, "language_loss": 0.74396622, "learning_rate": 6.467192433866411e-07, "loss": 0.76491451, "num_input_tokens_seen": 267053645, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.375, "step": 12382, "time_per_iteration": 2.416293144226074 }, { "auxiliary_loss_clip": 0.01008621, "auxiliary_loss_mlp": 0.01005798, "balance_loss_clip": 1.00349772, "balance_loss_mlp": 1.00140762, "epoch": 0.7445062377874643, "flos": 70555170038400.0, "grad_norm": 0.6521747393750501, "language_loss": 0.54709947, "learning_rate": 6.464325030925831e-07, "loss": 0.5672437, "num_input_tokens_seen": 267121830, "router_z_loss_clip": 0.02294922, "router_z_loss_mlp": 0.07226562, "step": 12383, "time_per_iteration": 3.166435956954956 }, { "auxiliary_loss_clip": 0.0105243, "auxiliary_loss_mlp": 0.01036746, "balance_loss_clip": 1.01252294, "balance_loss_mlp": 1.01561689, "epoch": 0.7445663610401323, "flos": 22163317983360.0, "grad_norm": 2.4999856358896966, "language_loss": 0.77297169, "learning_rate": 6.461458141259395e-07, "loss": 0.79386353, "num_input_tokens_seen": 267141145, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3671875, "step": 12384, "time_per_iteration": 2.4000794887542725 }, { "auxiliary_loss_clip": 0.01051835, "auxiliary_loss_mlp": 0.0103608, "balance_loss_clip": 1.01276231, "balance_loss_mlp": 1.0160656, "epoch": 0.7446264842928002, "flos": 24169113169920.0, "grad_norm": 2.085130650991691, "language_loss": 0.80961853, "learning_rate": 6.458591764975823e-07, "loss": 0.83049768, "num_input_tokens_seen": 267159280, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 12385, "time_per_iteration": 2.3751606941223145 }, { "auxiliary_loss_clip": 0.01056219, "auxiliary_loss_mlp": 0.01043026, "balance_loss_clip": 1.01778948, "balance_loss_mlp": 1.01753998, "epoch": 0.7446866075454682, "flos": 24133396982400.0, "grad_norm": 1.5069223172032695, "language_loss": 0.82778263, "learning_rate": 6.455725902183813e-07, "loss": 0.84877509, "num_input_tokens_seen": 267179390, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38671875, "step": 12386, "time_per_iteration": 2.426088809967041 }, { "auxiliary_loss_clip": 0.01051459, "auxiliary_loss_mlp": 0.01036085, "balance_loss_clip": 1.0131495, "balance_loss_mlp": 1.01577902, "epoch": 0.7447467307981361, "flos": 23546995920000.0, "grad_norm": 1.7060408085044414, "language_loss": 0.72501689, "learning_rate": 6.452860552992037e-07, "loss": 0.74589235, "num_input_tokens_seen": 267198165, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35742188, "step": 12387, "time_per_iteration": 2.3630526065826416 }, { "auxiliary_loss_clip": 0.01052644, "auxiliary_loss_mlp": 0.01041119, "balance_loss_clip": 1.01711071, "balance_loss_mlp": 1.01567149, "epoch": 0.7448068540508042, "flos": 19566967282560.0, "grad_norm": 2.108164264658429, "language_loss": 0.71428734, "learning_rate": 6.449995717509138e-07, "loss": 0.73522502, "num_input_tokens_seen": 267214520, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36914062, "step": 12388, "time_per_iteration": 2.372028350830078 }, { "auxiliary_loss_clip": 0.01053129, "auxiliary_loss_mlp": 0.01037535, "balance_loss_clip": 1.01402712, "balance_loss_mlp": 1.01660669, "epoch": 0.7448669773034721, "flos": 21838520805120.0, "grad_norm": 1.7039838150947855, "language_loss": 0.859707, "learning_rate": 6.447131395843761e-07, "loss": 0.88061368, "num_input_tokens_seen": 267236555, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36523438, "step": 12389, "time_per_iteration": 2.3968961238861084 }, { "auxiliary_loss_clip": 0.01052154, "auxiliary_loss_mlp": 0.01037311, "balance_loss_clip": 1.01298022, "balance_loss_mlp": 1.01587987, "epoch": 0.7449271005561401, "flos": 25154222492160.0, "grad_norm": 1.663255110091735, "language_loss": 0.80076051, "learning_rate": 6.444267588104526e-07, "loss": 0.82165515, "num_input_tokens_seen": 267254800, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36328125, "step": 12390, "time_per_iteration": 2.4019064903259277 }, { "auxiliary_loss_clip": 0.01054134, "auxiliary_loss_mlp": 0.01037553, "balance_loss_clip": 1.01093388, "balance_loss_mlp": 1.01681614, "epoch": 0.7449872238088081, "flos": 22272247025280.0, "grad_norm": 1.933938879252367, "language_loss": 0.85600114, "learning_rate": 6.441404294400014e-07, "loss": 0.87691796, "num_input_tokens_seen": 267274610, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37304688, "step": 12391, "time_per_iteration": 2.3619091510772705 }, { "auxiliary_loss_clip": 0.01053019, "auxiliary_loss_mlp": 0.01035137, "balance_loss_clip": 1.01238012, "balance_loss_mlp": 1.01711726, "epoch": 0.745047347061476, "flos": 20593762634880.0, "grad_norm": 2.1015099766976935, "language_loss": 0.74930096, "learning_rate": 6.438541514838811e-07, "loss": 0.77018249, "num_input_tokens_seen": 267292600, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.359375, "step": 12392, "time_per_iteration": 2.374443769454956 }, { "auxiliary_loss_clip": 0.0105184, "auxiliary_loss_mlp": 0.01036172, "balance_loss_clip": 1.01303315, "balance_loss_mlp": 1.01644731, "epoch": 0.745107470314144, "flos": 22126449720960.0, "grad_norm": 1.536976181545661, "language_loss": 0.77774245, "learning_rate": 6.435679249529487e-07, "loss": 0.79862261, "num_input_tokens_seen": 267311295, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.35351562, "step": 12393, "time_per_iteration": 2.362616539001465 }, { "auxiliary_loss_clip": 0.01053143, "auxiliary_loss_mlp": 0.01040183, "balance_loss_clip": 1.01417184, "balance_loss_mlp": 1.01623762, "epoch": 0.745167593566812, "flos": 22235413674240.0, "grad_norm": 1.916510112714912, "language_loss": 0.73634279, "learning_rate": 6.432817498580552e-07, "loss": 0.75727594, "num_input_tokens_seen": 267328390, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.36914062, "step": 12394, "time_per_iteration": 2.398754596710205 }, { "auxiliary_loss_clip": 0.01054782, "auxiliary_loss_mlp": 0.01035345, "balance_loss_clip": 1.01124048, "balance_loss_mlp": 1.01718211, "epoch": 0.74522771681948, "flos": 20665229921280.0, "grad_norm": 1.925012861887302, "language_loss": 0.82507205, "learning_rate": 6.429956262100535e-07, "loss": 0.84597337, "num_input_tokens_seen": 267348185, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.375, "step": 12395, "time_per_iteration": 3.6122326850891113 }, { "auxiliary_loss_clip": 0.01056541, "auxiliary_loss_mlp": 0.01038893, "balance_loss_clip": 1.01351345, "balance_loss_mlp": 1.01766396, "epoch": 0.7452878400721479, "flos": 21105903502080.0, "grad_norm": 1.937468894476774, "language_loss": 0.72569531, "learning_rate": 6.427095540197937e-07, "loss": 0.74664968, "num_input_tokens_seen": 267367010, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38867188, "step": 12396, "time_per_iteration": 2.4092438220977783 }, { "auxiliary_loss_clip": 0.01054513, "auxiliary_loss_mlp": 0.01037372, "balance_loss_clip": 1.01375675, "balance_loss_mlp": 1.01623225, "epoch": 0.7453479633248159, "flos": 26686839755520.0, "grad_norm": 2.1400836276841595, "language_loss": 0.69419158, "learning_rate": 6.424235332981245e-07, "loss": 0.71511042, "num_input_tokens_seen": 267386605, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3828125, "step": 12397, "time_per_iteration": 2.416828155517578 }, { "auxiliary_loss_clip": 0.01051985, "auxiliary_loss_mlp": 0.01038393, "balance_loss_clip": 1.01394343, "balance_loss_mlp": 1.01588547, "epoch": 0.7454080865774838, "flos": 17015200254720.0, "grad_norm": 1.7190349195940162, "language_loss": 0.77937388, "learning_rate": 6.421375640558908e-07, "loss": 0.80027771, "num_input_tokens_seen": 267404135, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36132812, "step": 12398, "time_per_iteration": 2.345754623413086 }, { "auxiliary_loss_clip": 0.01051579, "auxiliary_loss_mlp": 0.01036374, "balance_loss_clip": 1.01283073, "balance_loss_mlp": 1.01618695, "epoch": 0.7454682098301518, "flos": 21322853890560.0, "grad_norm": 1.6833532291748674, "language_loss": 0.78868937, "learning_rate": 6.418516463039363e-07, "loss": 0.80956888, "num_input_tokens_seen": 267423120, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.35351562, "step": 12399, "time_per_iteration": 2.3521833419799805 }, { "auxiliary_loss_clip": 0.0104799, "auxiliary_loss_mlp": 0.01040262, "balance_loss_clip": 1.01681328, "balance_loss_mlp": 1.01448464, "epoch": 0.7455283330828197, "flos": 17857375004160.0, "grad_norm": 2.262590771497466, "language_loss": 0.74778455, "learning_rate": 6.415657800531038e-07, "loss": 0.7686671, "num_input_tokens_seen": 267441250, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3359375, "step": 12400, "time_per_iteration": 5.154583930969238 }, { "auxiliary_loss_clip": 0.01051771, "auxiliary_loss_mlp": 0.01034161, "balance_loss_clip": 1.01272666, "balance_loss_mlp": 1.01572084, "epoch": 0.7455884563354878, "flos": 30772934703360.0, "grad_norm": 3.137162313988889, "language_loss": 0.83278787, "learning_rate": 6.412799653142327e-07, "loss": 0.85364717, "num_input_tokens_seen": 267462820, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.36132812, "step": 12401, "time_per_iteration": 2.451383590698242 }, { "auxiliary_loss_clip": 0.01050971, "auxiliary_loss_mlp": 0.01040338, "balance_loss_clip": 1.01747358, "balance_loss_mlp": 1.01586986, "epoch": 0.7456485795881557, "flos": 23184422784000.0, "grad_norm": 1.774307292989828, "language_loss": 0.66462004, "learning_rate": 6.409942020981611e-07, "loss": 0.68553317, "num_input_tokens_seen": 267483065, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3515625, "step": 12402, "time_per_iteration": 2.384031057357788 }, { "auxiliary_loss_clip": 0.0105226, "auxiliary_loss_mlp": 0.01034951, "balance_loss_clip": 1.01197946, "balance_loss_mlp": 1.01606441, "epoch": 0.7457087028408237, "flos": 38725625681280.0, "grad_norm": 1.535604730759279, "language_loss": 0.74125832, "learning_rate": 6.407084904157265e-07, "loss": 0.7621305, "num_input_tokens_seen": 267504825, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36328125, "step": 12403, "time_per_iteration": 2.503023147583008 }, { "auxiliary_loss_clip": 0.01008268, "auxiliary_loss_mlp": 0.01006027, "balance_loss_clip": 1.00363076, "balance_loss_mlp": 1.00121999, "epoch": 0.7457688260934917, "flos": 56041113473280.0, "grad_norm": 0.8352573538604305, "language_loss": 0.58929932, "learning_rate": 6.404228302777621e-07, "loss": 0.60944223, "num_input_tokens_seen": 267559260, "router_z_loss_clip": 0.02392578, "router_z_loss_mlp": 0.0703125, "step": 12404, "time_per_iteration": 2.822359323501587 }, { "auxiliary_loss_clip": 0.01051848, "auxiliary_loss_mlp": 0.01037995, "balance_loss_clip": 1.01467717, "balance_loss_mlp": 1.01633871, "epoch": 0.7458289493461596, "flos": 20115243273600.0, "grad_norm": 1.5373707025809125, "language_loss": 0.78353882, "learning_rate": 6.401372216950995e-07, "loss": 0.80443728, "num_input_tokens_seen": 267578720, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35546875, "step": 12405, "time_per_iteration": 2.3925161361694336 }, { "auxiliary_loss_clip": 0.01051643, "auxiliary_loss_mlp": 0.01034482, "balance_loss_clip": 1.0102942, "balance_loss_mlp": 1.01618886, "epoch": 0.7458890725988276, "flos": 20192051998080.0, "grad_norm": 1.5239705491963482, "language_loss": 0.6959362, "learning_rate": 6.398516646785698e-07, "loss": 0.71679747, "num_input_tokens_seen": 267598250, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.35351562, "step": 12406, "time_per_iteration": 2.3469395637512207 }, { "auxiliary_loss_clip": 0.01054979, "auxiliary_loss_mlp": 0.01039199, "balance_loss_clip": 1.01390266, "balance_loss_mlp": 1.01666427, "epoch": 0.7459491958514956, "flos": 17017818606720.0, "grad_norm": 1.7541117882371835, "language_loss": 0.65895623, "learning_rate": 6.39566159239002e-07, "loss": 0.67989802, "num_input_tokens_seen": 267615430, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3828125, "step": 12407, "time_per_iteration": 2.3639416694641113 }, { "auxiliary_loss_clip": 0.01053877, "auxiliary_loss_mlp": 0.01042012, "balance_loss_clip": 1.01650178, "balance_loss_mlp": 1.0163151, "epoch": 0.7460093191041636, "flos": 25077658147200.0, "grad_norm": 1.6353743250358053, "language_loss": 0.73364109, "learning_rate": 6.392807053872212e-07, "loss": 0.75459999, "num_input_tokens_seen": 267635075, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.375, "step": 12408, "time_per_iteration": 2.4010226726531982 }, { "auxiliary_loss_clip": 0.01056788, "auxiliary_loss_mlp": 0.01039636, "balance_loss_clip": 1.01468539, "balance_loss_mlp": 1.0181129, "epoch": 0.7460694423568315, "flos": 21907439562240.0, "grad_norm": 2.350989120322418, "language_loss": 0.74087274, "learning_rate": 6.38995303134053e-07, "loss": 0.76183701, "num_input_tokens_seen": 267654105, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.38671875, "step": 12409, "time_per_iteration": 2.392117738723755 }, { "auxiliary_loss_clip": 0.01049505, "auxiliary_loss_mlp": 0.01035086, "balance_loss_clip": 1.01375949, "balance_loss_mlp": 1.01539969, "epoch": 0.7461295656094995, "flos": 21214657987200.0, "grad_norm": 1.5675124721226341, "language_loss": 0.66597044, "learning_rate": 6.38709952490319e-07, "loss": 0.68681633, "num_input_tokens_seen": 267673090, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34179688, "step": 12410, "time_per_iteration": 2.3584184646606445 }, { "auxiliary_loss_clip": 0.01050841, "auxiliary_loss_mlp": 0.01032088, "balance_loss_clip": 1.01018953, "balance_loss_mlp": 1.01620448, "epoch": 0.7461896888621674, "flos": 22345739170560.0, "grad_norm": 2.0192846791070576, "language_loss": 0.84430289, "learning_rate": 6.384246534668396e-07, "loss": 0.86513221, "num_input_tokens_seen": 267690605, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34570312, "step": 12411, "time_per_iteration": 2.380342721939087 }, { "auxiliary_loss_clip": 0.01052683, "auxiliary_loss_mlp": 0.01036448, "balance_loss_clip": 1.01231992, "balance_loss_mlp": 1.01622415, "epoch": 0.7462498121148354, "flos": 25481777667840.0, "grad_norm": 1.4888062649272975, "language_loss": 0.78840947, "learning_rate": 6.381394060744339e-07, "loss": 0.80930078, "num_input_tokens_seen": 267710540, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36523438, "step": 12412, "time_per_iteration": 3.8741352558135986 }, { "auxiliary_loss_clip": 0.0105343, "auxiliary_loss_mlp": 0.01040366, "balance_loss_clip": 1.01681066, "balance_loss_mlp": 1.01667774, "epoch": 0.7463099353675033, "flos": 33946539690240.0, "grad_norm": 1.8024195493762916, "language_loss": 0.63276732, "learning_rate": 6.378542103239188e-07, "loss": 0.65370524, "num_input_tokens_seen": 267730780, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.3671875, "step": 12413, "time_per_iteration": 2.469855308532715 }, { "auxiliary_loss_clip": 0.01008267, "auxiliary_loss_mlp": 0.01009357, "balance_loss_clip": 1.00697279, "balance_loss_mlp": 1.00133896, "epoch": 0.7463700586201714, "flos": 62764616747520.0, "grad_norm": 0.723949473974381, "language_loss": 0.54895949, "learning_rate": 6.375690662261082e-07, "loss": 0.56913573, "num_input_tokens_seen": 267794240, "router_z_loss_clip": 0.02380371, "router_z_loss_mlp": 0.06933594, "step": 12414, "time_per_iteration": 3.070646047592163 }, { "auxiliary_loss_clip": 0.01052548, "auxiliary_loss_mlp": 0.01034203, "balance_loss_clip": 1.01019454, "balance_loss_mlp": 1.0157392, "epoch": 0.7464301818728393, "flos": 33431396446080.0, "grad_norm": 1.7940283134453394, "language_loss": 0.55926943, "learning_rate": 6.372839737918154e-07, "loss": 0.58013701, "num_input_tokens_seen": 267817190, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 12415, "time_per_iteration": 2.459868907928467 }, { "auxiliary_loss_clip": 0.01053219, "auxiliary_loss_mlp": 0.01040339, "balance_loss_clip": 1.01710522, "balance_loss_mlp": 1.01708388, "epoch": 0.7464903051255073, "flos": 26868667449600.0, "grad_norm": 1.4494394481777901, "language_loss": 0.75480181, "learning_rate": 6.369989330318506e-07, "loss": 0.7757374, "num_input_tokens_seen": 267836245, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36132812, "step": 12416, "time_per_iteration": 2.419088125228882 }, { "auxiliary_loss_clip": 0.01053778, "auxiliary_loss_mlp": 0.0104025, "balance_loss_clip": 1.01546681, "balance_loss_mlp": 1.01686847, "epoch": 0.7465504283781753, "flos": 44085387271680.0, "grad_norm": 1.6744309602791239, "language_loss": 0.70200956, "learning_rate": 6.367139439570233e-07, "loss": 0.72294986, "num_input_tokens_seen": 267858310, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36914062, "step": 12417, "time_per_iteration": 2.549337387084961 }, { "auxiliary_loss_clip": 0.0105399, "auxiliary_loss_mlp": 0.01037063, "balance_loss_clip": 1.01438904, "balance_loss_mlp": 1.01720369, "epoch": 0.7466105516308432, "flos": 19675267920000.0, "grad_norm": 1.731607483149177, "language_loss": 0.75271821, "learning_rate": 6.364290065781392e-07, "loss": 0.77362871, "num_input_tokens_seen": 267876345, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3671875, "step": 12418, "time_per_iteration": 2.392871379852295 }, { "auxiliary_loss_clip": 0.01052824, "auxiliary_loss_mlp": 0.01039121, "balance_loss_clip": 1.01574385, "balance_loss_mlp": 1.0166173, "epoch": 0.7466706748835112, "flos": 20519711907840.0, "grad_norm": 1.5963750114971749, "language_loss": 0.70328259, "learning_rate": 6.361441209060039e-07, "loss": 0.72420198, "num_input_tokens_seen": 267896740, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36132812, "step": 12419, "time_per_iteration": 2.3679373264312744 }, { "auxiliary_loss_clip": 0.01048784, "auxiliary_loss_mlp": 0.0103764, "balance_loss_clip": 1.01569414, "balance_loss_mlp": 1.01517236, "epoch": 0.7467307981361792, "flos": 21689162542080.0, "grad_norm": 1.8976279750879554, "language_loss": 0.75543422, "learning_rate": 6.358592869514216e-07, "loss": 0.77629846, "num_input_tokens_seen": 267914765, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.3359375, "step": 12420, "time_per_iteration": 2.4110121726989746 }, { "auxiliary_loss_clip": 0.0105532, "auxiliary_loss_mlp": 0.01042223, "balance_loss_clip": 1.01720059, "balance_loss_mlp": 1.0181309, "epoch": 0.7467909213888472, "flos": 19572657834240.0, "grad_norm": 3.5091054774431507, "language_loss": 0.686396, "learning_rate": 6.355745047251904e-07, "loss": 0.70737147, "num_input_tokens_seen": 267934085, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37109375, "step": 12421, "time_per_iteration": 2.379145860671997 }, { "auxiliary_loss_clip": 0.0105454, "auxiliary_loss_mlp": 0.01042039, "balance_loss_clip": 1.01552677, "balance_loss_mlp": 1.01647139, "epoch": 0.7468510446415151, "flos": 23694119856000.0, "grad_norm": 1.5988337052739932, "language_loss": 0.73425555, "learning_rate": 6.352897742381107e-07, "loss": 0.75522137, "num_input_tokens_seen": 267955170, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.38085938, "step": 12422, "time_per_iteration": 2.4188194274902344 }, { "auxiliary_loss_clip": 0.01051304, "auxiliary_loss_mlp": 0.01035019, "balance_loss_clip": 1.01118922, "balance_loss_mlp": 1.01573634, "epoch": 0.7469111678941831, "flos": 29314472901120.0, "grad_norm": 2.3107416866431976, "language_loss": 0.75640869, "learning_rate": 6.350050955009796e-07, "loss": 0.77727193, "num_input_tokens_seen": 267974980, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35546875, "step": 12423, "time_per_iteration": 2.4373514652252197 }, { "auxiliary_loss_clip": 0.01049723, "auxiliary_loss_mlp": 0.01030856, "balance_loss_clip": 1.00880194, "balance_loss_mlp": 1.01443124, "epoch": 0.746971291146851, "flos": 21797567913600.0, "grad_norm": 1.3174064564151, "language_loss": 0.68717623, "learning_rate": 6.347204685245929e-07, "loss": 0.70798206, "num_input_tokens_seen": 267994985, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35351562, "step": 12424, "time_per_iteration": 2.401118516921997 }, { "auxiliary_loss_clip": 0.01053493, "auxiliary_loss_mlp": 0.01039482, "balance_loss_clip": 1.01599813, "balance_loss_mlp": 1.01649928, "epoch": 0.747031414399519, "flos": 36243650194560.0, "grad_norm": 1.7627875263322692, "language_loss": 0.75738668, "learning_rate": 6.344358933197418e-07, "loss": 0.77831644, "num_input_tokens_seen": 268014985, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.37109375, "step": 12425, "time_per_iteration": 2.5140438079833984 }, { "auxiliary_loss_clip": 0.01052063, "auxiliary_loss_mlp": 0.01031811, "balance_loss_clip": 1.00987649, "balance_loss_mlp": 1.01748037, "epoch": 0.7470915376521869, "flos": 19973879712000.0, "grad_norm": 1.9387515545433291, "language_loss": 0.71978164, "learning_rate": 6.341513698972194e-07, "loss": 0.74062037, "num_input_tokens_seen": 268034395, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34570312, "step": 12426, "time_per_iteration": 2.360137939453125 }, { "auxiliary_loss_clip": 0.01050162, "auxiliary_loss_mlp": 0.0104197, "balance_loss_clip": 1.01853311, "balance_loss_mlp": 1.0154711, "epoch": 0.747151660904855, "flos": 20083262601600.0, "grad_norm": 1.390102824358665, "language_loss": 0.65687621, "learning_rate": 6.338668982678139e-07, "loss": 0.67779756, "num_input_tokens_seen": 268054485, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.34765625, "step": 12427, "time_per_iteration": 2.389765977859497 }, { "auxiliary_loss_clip": 0.01052617, "auxiliary_loss_mlp": 0.01036934, "balance_loss_clip": 1.01323509, "balance_loss_mlp": 1.01672029, "epoch": 0.7472117841575229, "flos": 16289425578240.0, "grad_norm": 1.570123428769494, "language_loss": 0.75300717, "learning_rate": 6.335824784423118e-07, "loss": 0.77390265, "num_input_tokens_seen": 268072250, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.359375, "step": 12428, "time_per_iteration": 2.3262596130371094 }, { "auxiliary_loss_clip": 0.01055711, "auxiliary_loss_mlp": 0.01037939, "balance_loss_clip": 1.01227307, "balance_loss_mlp": 1.01741838, "epoch": 0.7472719074101909, "flos": 21389084472960.0, "grad_norm": 1.9066267886275796, "language_loss": 0.59155571, "learning_rate": 6.33298110431499e-07, "loss": 0.6124922, "num_input_tokens_seen": 268089840, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.3828125, "step": 12429, "time_per_iteration": 2.4026732444763184 }, { "auxiliary_loss_clip": 0.01055651, "auxiliary_loss_mlp": 0.01037, "balance_loss_clip": 1.0136466, "balance_loss_mlp": 1.01707757, "epoch": 0.7473320306628589, "flos": 29641993165440.0, "grad_norm": 1.9292129391456336, "language_loss": 0.62139767, "learning_rate": 6.330137942461595e-07, "loss": 0.64232415, "num_input_tokens_seen": 268109360, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.38671875, "step": 12430, "time_per_iteration": 2.423727035522461 }, { "auxiliary_loss_clip": 0.01052081, "auxiliary_loss_mlp": 0.01035421, "balance_loss_clip": 1.01335514, "balance_loss_mlp": 1.01671255, "epoch": 0.7473921539155268, "flos": 24134898170880.0, "grad_norm": 1.3856718291695302, "language_loss": 0.76050919, "learning_rate": 6.327295298970734e-07, "loss": 0.78138417, "num_input_tokens_seen": 268131840, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35351562, "step": 12431, "time_per_iteration": 2.427079677581787 }, { "auxiliary_loss_clip": 0.01052291, "auxiliary_loss_mlp": 0.01035312, "balance_loss_clip": 1.01300776, "balance_loss_mlp": 1.01569521, "epoch": 0.7474522771681948, "flos": 17487156280320.0, "grad_norm": 1.8915791579097403, "language_loss": 0.76385272, "learning_rate": 6.32445317395021e-07, "loss": 0.78472871, "num_input_tokens_seen": 268148300, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.36523438, "step": 12432, "time_per_iteration": 2.320988416671753 }, { "auxiliary_loss_clip": 0.01054875, "auxiliary_loss_mlp": 0.01039306, "balance_loss_clip": 1.012079, "balance_loss_mlp": 1.0157907, "epoch": 0.7475124004208628, "flos": 16726363643520.0, "grad_norm": 1.8243589528574065, "language_loss": 0.70935893, "learning_rate": 6.321611567507787e-07, "loss": 0.73030078, "num_input_tokens_seen": 268166450, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.390625, "step": 12433, "time_per_iteration": 2.384920120239258 }, { "auxiliary_loss_clip": 0.01052227, "auxiliary_loss_mlp": 0.01037156, "balance_loss_clip": 1.01290917, "balance_loss_mlp": 1.01570857, "epoch": 0.7475725236735308, "flos": 19719188277120.0, "grad_norm": 1.6712211378964323, "language_loss": 0.68431354, "learning_rate": 6.318770479751232e-07, "loss": 0.70520735, "num_input_tokens_seen": 268186165, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36523438, "step": 12434, "time_per_iteration": 3.6710000038146973 }, { "auxiliary_loss_clip": 0.01048822, "auxiliary_loss_mlp": 0.0103493, "balance_loss_clip": 1.01419914, "balance_loss_mlp": 1.01600528, "epoch": 0.7476326469261987, "flos": 26284814916480.0, "grad_norm": 1.447182308639515, "language_loss": 0.80543637, "learning_rate": 6.315929910788263e-07, "loss": 0.82627392, "num_input_tokens_seen": 268208145, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.328125, "step": 12435, "time_per_iteration": 2.412912130355835 }, { "auxiliary_loss_clip": 0.01054388, "auxiliary_loss_mlp": 0.0103619, "balance_loss_clip": 1.01302755, "balance_loss_mlp": 1.01704311, "epoch": 0.7476927701788667, "flos": 31830488830080.0, "grad_norm": 1.6874808513904371, "language_loss": 0.69138527, "learning_rate": 6.313089860726604e-07, "loss": 0.71229106, "num_input_tokens_seen": 268228345, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.37304688, "step": 12436, "time_per_iteration": 2.4799132347106934 }, { "auxiliary_loss_clip": 0.0105466, "auxiliary_loss_mlp": 0.01039323, "balance_loss_clip": 1.01512396, "balance_loss_mlp": 1.01703715, "epoch": 0.7477528934315346, "flos": 31794144238080.0, "grad_norm": 1.5092589997394859, "language_loss": 0.71793187, "learning_rate": 6.31025032967396e-07, "loss": 0.73887163, "num_input_tokens_seen": 268250260, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.375, "step": 12437, "time_per_iteration": 2.4558494091033936 }, { "auxiliary_loss_clip": 0.01049074, "auxiliary_loss_mlp": 0.01034443, "balance_loss_clip": 1.01346207, "balance_loss_mlp": 1.01456165, "epoch": 0.7478130166842026, "flos": 20371051872000.0, "grad_norm": 1.9804116280304538, "language_loss": 0.68755651, "learning_rate": 6.307411317737986e-07, "loss": 0.70839167, "num_input_tokens_seen": 268268440, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.34570312, "step": 12438, "time_per_iteration": 2.3995018005371094 }, { "auxiliary_loss_clip": 0.01052279, "auxiliary_loss_mlp": 0.01034789, "balance_loss_clip": 1.01163816, "balance_loss_mlp": 1.01596272, "epoch": 0.7478731399368705, "flos": 18147992094720.0, "grad_norm": 1.7257608317602304, "language_loss": 0.812033, "learning_rate": 6.304572825026344e-07, "loss": 0.83290362, "num_input_tokens_seen": 268285765, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36328125, "step": 12439, "time_per_iteration": 5.055356740951538 }, { "auxiliary_loss_clip": 0.01052758, "auxiliary_loss_mlp": 0.01035731, "balance_loss_clip": 1.01388013, "balance_loss_mlp": 1.01650214, "epoch": 0.7479332631895386, "flos": 15266994145920.0, "grad_norm": 2.31616099872374, "language_loss": 0.72367632, "learning_rate": 6.301734851646674e-07, "loss": 0.7445612, "num_input_tokens_seen": 268304015, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.36328125, "step": 12440, "time_per_iteration": 2.3562893867492676 }, { "auxiliary_loss_clip": 0.01051119, "auxiliary_loss_mlp": 0.0103584, "balance_loss_clip": 1.01370311, "balance_loss_mlp": 1.01716518, "epoch": 0.7479933864422065, "flos": 21141445132800.0, "grad_norm": 1.6439077457450508, "language_loss": 0.7546128, "learning_rate": 6.298897397706597e-07, "loss": 0.77548242, "num_input_tokens_seen": 268323290, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.33984375, "step": 12441, "time_per_iteration": 2.361109972000122 }, { "auxiliary_loss_clip": 0.0105451, "auxiliary_loss_mlp": 0.01038456, "balance_loss_clip": 1.01488864, "balance_loss_mlp": 1.01748919, "epoch": 0.7480535096948745, "flos": 14391162979200.0, "grad_norm": 2.001860709397914, "language_loss": 0.8350122, "learning_rate": 6.296060463313698e-07, "loss": 0.85594189, "num_input_tokens_seen": 268339490, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37109375, "step": 12442, "time_per_iteration": 2.355724334716797 }, { "auxiliary_loss_clip": 0.01053088, "auxiliary_loss_mlp": 0.0104137, "balance_loss_clip": 1.01629996, "balance_loss_mlp": 1.01618695, "epoch": 0.7481136329475425, "flos": 27343451295360.0, "grad_norm": 2.46853129116258, "language_loss": 0.63977551, "learning_rate": 6.293224048575565e-07, "loss": 0.66072011, "num_input_tokens_seen": 268359865, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36914062, "step": 12443, "time_per_iteration": 2.415341854095459 }, { "auxiliary_loss_clip": 0.01050901, "auxiliary_loss_mlp": 0.01034232, "balance_loss_clip": 1.01291728, "balance_loss_mlp": 1.01561356, "epoch": 0.7481737562002104, "flos": 19530587779200.0, "grad_norm": 1.9231871923458341, "language_loss": 0.72931045, "learning_rate": 6.29038815359975e-07, "loss": 0.75016177, "num_input_tokens_seen": 268377065, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.35351562, "step": 12444, "time_per_iteration": 2.365589141845703 }, { "auxiliary_loss_clip": 0.01051164, "auxiliary_loss_mlp": 0.01035334, "balance_loss_clip": 1.01152813, "balance_loss_mlp": 1.01586771, "epoch": 0.7482338794528784, "flos": 21759023905920.0, "grad_norm": 1.3723783903237237, "language_loss": 0.69572556, "learning_rate": 6.287552778493786e-07, "loss": 0.71659052, "num_input_tokens_seen": 268396935, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35351562, "step": 12445, "time_per_iteration": 2.376697540283203 }, { "auxiliary_loss_clip": 0.01049497, "auxiliary_loss_mlp": 0.0103446, "balance_loss_clip": 1.01152372, "balance_loss_mlp": 1.01508653, "epoch": 0.7482940027055464, "flos": 18696372819840.0, "grad_norm": 1.9580184937349534, "language_loss": 0.75173855, "learning_rate": 6.28471792336519e-07, "loss": 0.77257818, "num_input_tokens_seen": 268414460, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34375, "step": 12446, "time_per_iteration": 2.363255262374878 }, { "auxiliary_loss_clip": 0.01055465, "auxiliary_loss_mlp": 0.01042408, "balance_loss_clip": 1.01587248, "balance_loss_mlp": 1.01632833, "epoch": 0.7483541259582144, "flos": 15997097831040.0, "grad_norm": 2.1832138540012824, "language_loss": 0.74338037, "learning_rate": 6.281883588321475e-07, "loss": 0.76435912, "num_input_tokens_seen": 268432225, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39257812, "step": 12447, "time_per_iteration": 2.3159799575805664 }, { "auxiliary_loss_clip": 0.01051149, "auxiliary_loss_mlp": 0.0103673, "balance_loss_clip": 1.01484275, "balance_loss_mlp": 1.01528668, "epoch": 0.7484142492108823, "flos": 25555130167680.0, "grad_norm": 3.4553609144978386, "language_loss": 0.73699647, "learning_rate": 6.279049773470109e-07, "loss": 0.75787526, "num_input_tokens_seen": 268449270, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.359375, "step": 12448, "time_per_iteration": 2.4142839908599854 }, { "auxiliary_loss_clip": 0.01053014, "auxiliary_loss_mlp": 0.0104364, "balance_loss_clip": 1.02051377, "balance_loss_mlp": 1.01691377, "epoch": 0.7484743724635503, "flos": 22886788510080.0, "grad_norm": 2.476837360108069, "language_loss": 0.75060284, "learning_rate": 6.276216478918543e-07, "loss": 0.77156937, "num_input_tokens_seen": 268467250, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.36132812, "step": 12449, "time_per_iteration": 2.3679232597351074 }, { "auxiliary_loss_clip": 0.0105599, "auxiliary_loss_mlp": 0.0103795, "balance_loss_clip": 1.01477528, "balance_loss_mlp": 1.01815343, "epoch": 0.7485344957162182, "flos": 25299147012480.0, "grad_norm": 2.1465246557706044, "language_loss": 0.6343075, "learning_rate": 6.273383704774225e-07, "loss": 0.65524691, "num_input_tokens_seen": 268487270, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37890625, "step": 12450, "time_per_iteration": 2.4396159648895264 }, { "auxiliary_loss_clip": 0.01048308, "auxiliary_loss_mlp": 0.01032059, "balance_loss_clip": 1.01200819, "balance_loss_mlp": 1.01469994, "epoch": 0.7485946189688862, "flos": 27051786864000.0, "grad_norm": 2.0859385264812094, "language_loss": 0.71180999, "learning_rate": 6.270551451144577e-07, "loss": 0.73261368, "num_input_tokens_seen": 268508020, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.3359375, "step": 12451, "time_per_iteration": 2.399294853210449 }, { "auxiliary_loss_clip": 0.01053514, "auxiliary_loss_mlp": 0.01038765, "balance_loss_clip": 1.01418447, "balance_loss_mlp": 1.01682174, "epoch": 0.7486547422215541, "flos": 26905535712000.0, "grad_norm": 3.8230313605256367, "language_loss": 0.81181842, "learning_rate": 6.267719718136988e-07, "loss": 0.83274126, "num_input_tokens_seen": 268527375, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3671875, "step": 12452, "time_per_iteration": 3.8183603286743164 }, { "auxiliary_loss_clip": 0.01054638, "auxiliary_loss_mlp": 0.01039377, "balance_loss_clip": 1.0153327, "balance_loss_mlp": 1.01743793, "epoch": 0.7487148654742222, "flos": 22345180588800.0, "grad_norm": 2.275808008116949, "language_loss": 0.72695917, "learning_rate": 6.264888505858843e-07, "loss": 0.74789929, "num_input_tokens_seen": 268544870, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37109375, "step": 12453, "time_per_iteration": 2.3526365756988525 }, { "auxiliary_loss_clip": 0.01052179, "auxiliary_loss_mlp": 0.01037741, "balance_loss_clip": 1.01577091, "balance_loss_mlp": 1.01593065, "epoch": 0.7487749887268901, "flos": 23037717784320.0, "grad_norm": 1.7390676564516472, "language_loss": 0.75031334, "learning_rate": 6.262057814417517e-07, "loss": 0.77121258, "num_input_tokens_seen": 268564580, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.36328125, "step": 12454, "time_per_iteration": 2.398498058319092 }, { "auxiliary_loss_clip": 0.01008112, "auxiliary_loss_mlp": 0.01002309, "balance_loss_clip": 0.99996078, "balance_loss_mlp": 1.00126076, "epoch": 0.7488351119795581, "flos": 71521915985280.0, "grad_norm": 0.733774119604931, "language_loss": 0.59418893, "learning_rate": 6.259227643920322e-07, "loss": 0.6142931, "num_input_tokens_seen": 268629550, "router_z_loss_clip": 0.0234375, "router_z_loss_mlp": 0.06835938, "step": 12455, "time_per_iteration": 3.165294647216797 }, { "auxiliary_loss_clip": 0.01052135, "auxiliary_loss_mlp": 0.01042776, "balance_loss_clip": 1.02082908, "balance_loss_mlp": 1.01681316, "epoch": 0.748895235232226, "flos": 17195456937600.0, "grad_norm": 1.8571525995965872, "language_loss": 0.80472636, "learning_rate": 6.256397994474592e-07, "loss": 0.82567543, "num_input_tokens_seen": 268646645, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35351562, "step": 12456, "time_per_iteration": 2.32230281829834 }, { "auxiliary_loss_clip": 0.01008395, "auxiliary_loss_mlp": 0.01004988, "balance_loss_clip": 1.0027349, "balance_loss_mlp": 1.00150442, "epoch": 0.748955358484894, "flos": 58976086250880.0, "grad_norm": 0.8386605832083294, "language_loss": 0.61447418, "learning_rate": 6.25356886618763e-07, "loss": 0.63460803, "num_input_tokens_seen": 268702275, "router_z_loss_clip": 0.02258301, "router_z_loss_mlp": 0.06884766, "step": 12457, "time_per_iteration": 2.94266414642334 }, { "auxiliary_loss_clip": 0.01053616, "auxiliary_loss_mlp": 0.01032753, "balance_loss_clip": 1.0102942, "balance_loss_mlp": 1.01745999, "epoch": 0.749015481737562, "flos": 11359724515200.0, "grad_norm": 2.9119262264564743, "language_loss": 0.68418843, "learning_rate": 6.250740259166711e-07, "loss": 0.70505208, "num_input_tokens_seen": 268716265, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.36132812, "step": 12458, "time_per_iteration": 2.3119161128997803 }, { "auxiliary_loss_clip": 0.01050815, "auxiliary_loss_mlp": 0.01038541, "balance_loss_clip": 1.0169282, "balance_loss_mlp": 1.01585126, "epoch": 0.74907560499023, "flos": 21105414743040.0, "grad_norm": 1.9735432571073708, "language_loss": 0.8042208, "learning_rate": 6.247912173519106e-07, "loss": 0.82511437, "num_input_tokens_seen": 268734330, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34960938, "step": 12459, "time_per_iteration": 2.3782150745391846 }, { "auxiliary_loss_clip": 0.01051433, "auxiliary_loss_mlp": 0.01041628, "balance_loss_clip": 1.01816797, "balance_loss_mlp": 1.01625586, "epoch": 0.749135728242898, "flos": 22267080144000.0, "grad_norm": 1.869124539789917, "language_loss": 0.81040013, "learning_rate": 6.245084609352043e-07, "loss": 0.83133078, "num_input_tokens_seen": 268753500, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3515625, "step": 12460, "time_per_iteration": 2.364835739135742 }, { "auxiliary_loss_clip": 0.01052313, "auxiliary_loss_mlp": 0.01037768, "balance_loss_clip": 1.0143795, "balance_loss_mlp": 1.01628113, "epoch": 0.7491958514955659, "flos": 24056483523840.0, "grad_norm": 1.8676211947868342, "language_loss": 0.86867917, "learning_rate": 6.242257566772755e-07, "loss": 0.88958001, "num_input_tokens_seen": 268772055, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 12461, "time_per_iteration": 2.409587860107422 }, { "auxiliary_loss_clip": 0.01050322, "auxiliary_loss_mlp": 0.01036876, "balance_loss_clip": 1.01645577, "balance_loss_mlp": 1.01582527, "epoch": 0.7492559747482339, "flos": 24491152350720.0, "grad_norm": 1.9263708291082304, "language_loss": 0.70792753, "learning_rate": 6.239431045888435e-07, "loss": 0.72879946, "num_input_tokens_seen": 268792265, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.34570312, "step": 12462, "time_per_iteration": 2.407344102859497 }, { "auxiliary_loss_clip": 0.01052141, "auxiliary_loss_mlp": 0.01042643, "balance_loss_clip": 1.01710868, "balance_loss_mlp": 1.01581931, "epoch": 0.7493160980009018, "flos": 27744114591360.0, "grad_norm": 1.9611855701202572, "language_loss": 0.71166098, "learning_rate": 6.236605046806267e-07, "loss": 0.7326088, "num_input_tokens_seen": 268812735, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.36328125, "step": 12463, "time_per_iteration": 2.41162109375 }, { "auxiliary_loss_clip": 0.01052636, "auxiliary_loss_mlp": 0.0103655, "balance_loss_clip": 1.0148654, "balance_loss_mlp": 1.01722932, "epoch": 0.7493762212535698, "flos": 30224903091840.0, "grad_norm": 2.0923394788370313, "language_loss": 0.79009974, "learning_rate": 6.233779569633419e-07, "loss": 0.81099153, "num_input_tokens_seen": 268833090, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.35546875, "step": 12464, "time_per_iteration": 2.4782421588897705 }, { "auxiliary_loss_clip": 0.01051025, "auxiliary_loss_mlp": 0.01036878, "balance_loss_clip": 1.01506329, "balance_loss_mlp": 1.01566839, "epoch": 0.7494363445062378, "flos": 21943400129280.0, "grad_norm": 1.8869620798325428, "language_loss": 0.79940766, "learning_rate": 6.230954614477034e-07, "loss": 0.82028669, "num_input_tokens_seen": 268851880, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.35351562, "step": 12465, "time_per_iteration": 2.3632140159606934 }, { "auxiliary_loss_clip": 0.01056935, "auxiliary_loss_mlp": 0.01042149, "balance_loss_clip": 1.01661479, "balance_loss_mlp": 1.01754153, "epoch": 0.7494964677589058, "flos": 12489653623680.0, "grad_norm": 2.832878361071903, "language_loss": 0.75100154, "learning_rate": 6.22813018144422e-07, "loss": 0.77199239, "num_input_tokens_seen": 268867910, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.39453125, "step": 12466, "time_per_iteration": 2.3843109607696533 }, { "auxiliary_loss_clip": 0.01052204, "auxiliary_loss_mlp": 0.01035325, "balance_loss_clip": 1.01327109, "balance_loss_mlp": 1.01607466, "epoch": 0.7495565910115737, "flos": 21651980077440.0, "grad_norm": 2.2510430722444617, "language_loss": 0.67295957, "learning_rate": 6.22530627064209e-07, "loss": 0.6938349, "num_input_tokens_seen": 268887260, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.36132812, "step": 12467, "time_per_iteration": 2.367713212966919 }, { "auxiliary_loss_clip": 0.01051703, "auxiliary_loss_mlp": 0.01035985, "balance_loss_clip": 1.01294208, "balance_loss_mlp": 1.01604986, "epoch": 0.7496167142642417, "flos": 15267622550400.0, "grad_norm": 2.6688479273867705, "language_loss": 0.78069305, "learning_rate": 6.222482882177735e-07, "loss": 0.80156994, "num_input_tokens_seen": 268902520, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35546875, "step": 12468, "time_per_iteration": 2.3621609210968018 }, { "auxiliary_loss_clip": 0.01050851, "auxiliary_loss_mlp": 0.0103564, "balance_loss_clip": 1.01297808, "balance_loss_mlp": 1.01625657, "epoch": 0.7496768375169096, "flos": 22053830359680.0, "grad_norm": 2.2713805863433008, "language_loss": 0.70285594, "learning_rate": 6.219660016158201e-07, "loss": 0.72372091, "num_input_tokens_seen": 268920970, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34765625, "step": 12469, "time_per_iteration": 2.3959569931030273 }, { "auxiliary_loss_clip": 0.01054407, "auxiliary_loss_mlp": 0.01037282, "balance_loss_clip": 1.01398873, "balance_loss_mlp": 1.01721668, "epoch": 0.7497369607695776, "flos": 19056187958400.0, "grad_norm": 1.9426787872142965, "language_loss": 0.70463216, "learning_rate": 6.216837672690543e-07, "loss": 0.7255491, "num_input_tokens_seen": 268936600, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37109375, "step": 12470, "time_per_iteration": 2.3620617389678955 }, { "auxiliary_loss_clip": 0.01054583, "auxiliary_loss_mlp": 0.01040774, "balance_loss_clip": 1.01495314, "balance_loss_mlp": 1.01665473, "epoch": 0.7497970840222457, "flos": 21616333712640.0, "grad_norm": 2.5737424131442, "language_loss": 0.76029015, "learning_rate": 6.214015851881793e-07, "loss": 0.78124368, "num_input_tokens_seen": 268956560, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37890625, "step": 12471, "time_per_iteration": 2.3594651222229004 }, { "auxiliary_loss_clip": 0.01052717, "auxiliary_loss_mlp": 0.01033263, "balance_loss_clip": 1.00980222, "balance_loss_mlp": 1.01635814, "epoch": 0.7498572072749136, "flos": 13734725996160.0, "grad_norm": 4.3338726673471015, "language_loss": 0.78617728, "learning_rate": 6.211194553838929e-07, "loss": 0.807037, "num_input_tokens_seen": 268973945, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 12472, "time_per_iteration": 2.3438053131103516 }, { "auxiliary_loss_clip": 0.0105121, "auxiliary_loss_mlp": 0.01032059, "balance_loss_clip": 1.01094723, "balance_loss_mlp": 1.01539123, "epoch": 0.7499173305275816, "flos": 22965412625280.0, "grad_norm": 1.6119644705502016, "language_loss": 0.8501358, "learning_rate": 6.208373778668951e-07, "loss": 0.87096846, "num_input_tokens_seen": 268993245, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.359375, "step": 12473, "time_per_iteration": 3.6447575092315674 }, { "auxiliary_loss_clip": 0.01055165, "auxiliary_loss_mlp": 0.01041795, "balance_loss_clip": 1.01585555, "balance_loss_mlp": 1.01723731, "epoch": 0.7499774537802495, "flos": 22739559840000.0, "grad_norm": 2.3152023353326707, "language_loss": 0.75504935, "learning_rate": 6.205553526478829e-07, "loss": 0.77601898, "num_input_tokens_seen": 269012125, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37890625, "step": 12474, "time_per_iteration": 2.410468578338623 }, { "auxiliary_loss_clip": 0.01054726, "auxiliary_loss_mlp": 0.01038052, "balance_loss_clip": 1.01354241, "balance_loss_mlp": 1.01633096, "epoch": 0.7500375770329175, "flos": 18295569878400.0, "grad_norm": 1.9714629024381431, "language_loss": 0.75282323, "learning_rate": 6.202733797375492e-07, "loss": 0.77375102, "num_input_tokens_seen": 269030545, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.38476562, "step": 12475, "time_per_iteration": 2.3493924140930176 }, { "auxiliary_loss_clip": 0.01055486, "auxiliary_loss_mlp": 0.01039321, "balance_loss_clip": 1.01239169, "balance_loss_mlp": 1.01661849, "epoch": 0.7500977002855854, "flos": 19169027072640.0, "grad_norm": 1.9511248900722415, "language_loss": 0.81376183, "learning_rate": 6.199914591465878e-07, "loss": 0.83470994, "num_input_tokens_seen": 269048180, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.38867188, "step": 12476, "time_per_iteration": 2.400913715362549 }, { "auxiliary_loss_clip": 0.01051436, "auxiliary_loss_mlp": 0.0103929, "balance_loss_clip": 1.01551986, "balance_loss_mlp": 1.01558018, "epoch": 0.7501578235382534, "flos": 22162794312960.0, "grad_norm": 4.442719893819561, "language_loss": 0.79156005, "learning_rate": 6.19709590885688e-07, "loss": 0.81246734, "num_input_tokens_seen": 269068600, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.359375, "step": 12477, "time_per_iteration": 2.3656206130981445 }, { "auxiliary_loss_clip": 0.01008814, "auxiliary_loss_mlp": 0.01002452, "balance_loss_clip": 1.00007951, "balance_loss_mlp": 1.00169456, "epoch": 0.7502179467909214, "flos": 64462374074880.0, "grad_norm": 0.8087064905181407, "language_loss": 0.54541051, "learning_rate": 6.194277749655394e-07, "loss": 0.56552315, "num_input_tokens_seen": 269119045, "router_z_loss_clip": 0.02368164, "router_z_loss_mlp": 0.07128906, "step": 12478, "time_per_iteration": 4.239579200744629 }, { "auxiliary_loss_clip": 0.01051499, "auxiliary_loss_mlp": 0.01037131, "balance_loss_clip": 1.01481509, "balance_loss_mlp": 1.01711679, "epoch": 0.7502780700435894, "flos": 20477432384640.0, "grad_norm": 1.842790307447499, "language_loss": 0.80996758, "learning_rate": 6.191460113968272e-07, "loss": 0.83085388, "num_input_tokens_seen": 269136755, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34375, "step": 12479, "time_per_iteration": 3.8660566806793213 }, { "auxiliary_loss_clip": 0.01055275, "auxiliary_loss_mlp": 0.01040334, "balance_loss_clip": 1.01622939, "balance_loss_mlp": 1.01799202, "epoch": 0.7503381932962573, "flos": 20444334549120.0, "grad_norm": 1.9689502705691357, "language_loss": 0.65477443, "learning_rate": 6.188643001902369e-07, "loss": 0.67573059, "num_input_tokens_seen": 269156120, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.37304688, "step": 12480, "time_per_iteration": 2.411593198776245 }, { "auxiliary_loss_clip": 0.01049942, "auxiliary_loss_mlp": 0.01035699, "balance_loss_clip": 1.01548076, "balance_loss_mlp": 1.015764, "epoch": 0.7503983165489253, "flos": 22380861864960.0, "grad_norm": 1.8275288854566887, "language_loss": 0.78267688, "learning_rate": 6.185826413564512e-07, "loss": 0.80353326, "num_input_tokens_seen": 269175650, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.34179688, "step": 12481, "time_per_iteration": 2.369314670562744 }, { "auxiliary_loss_clip": 0.01053435, "auxiliary_loss_mlp": 0.0103821, "balance_loss_clip": 1.01422477, "balance_loss_mlp": 1.01576877, "epoch": 0.7504584398015932, "flos": 24898309159680.0, "grad_norm": 1.6581793000644778, "language_loss": 0.72064519, "learning_rate": 6.183010349061501e-07, "loss": 0.74156165, "num_input_tokens_seen": 269197080, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37695312, "step": 12482, "time_per_iteration": 2.3868229389190674 }, { "auxiliary_loss_clip": 0.01053257, "auxiliary_loss_mlp": 0.01039012, "balance_loss_clip": 1.01346529, "balance_loss_mlp": 1.01712346, "epoch": 0.7505185630542612, "flos": 25884046886400.0, "grad_norm": 1.707358230876113, "language_loss": 0.7123096, "learning_rate": 6.180194808500118e-07, "loss": 0.73323226, "num_input_tokens_seen": 269218600, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.359375, "step": 12483, "time_per_iteration": 2.4292056560516357 }, { "auxiliary_loss_clip": 0.01052223, "auxiliary_loss_mlp": 0.01036191, "balance_loss_clip": 1.01292121, "balance_loss_mlp": 1.01629043, "epoch": 0.7505786863069293, "flos": 23142876399360.0, "grad_norm": 1.7537840143334733, "language_loss": 0.75978637, "learning_rate": 6.177379791987131e-07, "loss": 0.78067052, "num_input_tokens_seen": 269239245, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.359375, "step": 12484, "time_per_iteration": 2.377311944961548 }, { "auxiliary_loss_clip": 0.01051641, "auxiliary_loss_mlp": 0.01039728, "balance_loss_clip": 1.01667285, "balance_loss_mlp": 1.01611412, "epoch": 0.7506388095595972, "flos": 16982416621440.0, "grad_norm": 1.8122834202668094, "language_loss": 0.86052936, "learning_rate": 6.174565299629295e-07, "loss": 0.88144308, "num_input_tokens_seen": 269258520, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35546875, "step": 12485, "time_per_iteration": 2.3662514686584473 }, { "auxiliary_loss_clip": 0.01052026, "auxiliary_loss_mlp": 0.01035929, "balance_loss_clip": 1.01436436, "balance_loss_mlp": 1.01687968, "epoch": 0.7506989328122652, "flos": 22343923779840.0, "grad_norm": 1.583703267143101, "language_loss": 0.78662485, "learning_rate": 6.171751331533323e-07, "loss": 0.80750442, "num_input_tokens_seen": 269278320, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.3515625, "step": 12486, "time_per_iteration": 2.362147092819214 }, { "auxiliary_loss_clip": 0.01054819, "auxiliary_loss_mlp": 0.0103587, "balance_loss_clip": 1.01305366, "balance_loss_mlp": 1.01734638, "epoch": 0.7507590560649331, "flos": 25774873464960.0, "grad_norm": 2.304987391871904, "language_loss": 0.73707044, "learning_rate": 6.168937887805932e-07, "loss": 0.75797737, "num_input_tokens_seen": 269298025, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.375, "step": 12487, "time_per_iteration": 2.4182982444763184 }, { "auxiliary_loss_clip": 0.01053268, "auxiliary_loss_mlp": 0.01033462, "balance_loss_clip": 1.01080012, "balance_loss_mlp": 1.01604152, "epoch": 0.7508191793176011, "flos": 24278286591360.0, "grad_norm": 2.2163661815682905, "language_loss": 0.68605685, "learning_rate": 6.166124968553801e-07, "loss": 0.70692408, "num_input_tokens_seen": 269316770, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.37304688, "step": 12488, "time_per_iteration": 2.421172857284546 }, { "auxiliary_loss_clip": 0.01052045, "auxiliary_loss_mlp": 0.010367, "balance_loss_clip": 1.01213062, "balance_loss_mlp": 1.01629996, "epoch": 0.750879302570269, "flos": 19898607087360.0, "grad_norm": 1.9990932064711822, "language_loss": 0.78583944, "learning_rate": 6.163312573883592e-07, "loss": 0.80672693, "num_input_tokens_seen": 269334755, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.35742188, "step": 12489, "time_per_iteration": 2.360384702682495 }, { "auxiliary_loss_clip": 0.01050721, "auxiliary_loss_mlp": 0.01041887, "balance_loss_clip": 1.01861703, "balance_loss_mlp": 1.01515269, "epoch": 0.750939425822937, "flos": 29204391784320.0, "grad_norm": 1.8435559595526412, "language_loss": 0.76464999, "learning_rate": 6.160500703901956e-07, "loss": 0.78557611, "num_input_tokens_seen": 269353810, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35546875, "step": 12490, "time_per_iteration": 2.44028902053833 }, { "auxiliary_loss_clip": 0.01051087, "auxiliary_loss_mlp": 0.01034341, "balance_loss_clip": 1.01163208, "balance_loss_mlp": 1.01591396, "epoch": 0.750999549075605, "flos": 21141235664640.0, "grad_norm": 1.5744094111060265, "language_loss": 0.79617691, "learning_rate": 6.157689358715527e-07, "loss": 0.8170312, "num_input_tokens_seen": 269372910, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 12491, "time_per_iteration": 3.83137845993042 }, { "auxiliary_loss_clip": 0.0104881, "auxiliary_loss_mlp": 0.01036482, "balance_loss_clip": 1.01526296, "balance_loss_mlp": 1.0147028, "epoch": 0.751059672328273, "flos": 23546681717760.0, "grad_norm": 1.869787844391525, "language_loss": 0.77290964, "learning_rate": 6.154878538430899e-07, "loss": 0.79376251, "num_input_tokens_seen": 269391545, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34179688, "step": 12492, "time_per_iteration": 2.401094675064087 }, { "auxiliary_loss_clip": 0.01051019, "auxiliary_loss_mlp": 0.01033031, "balance_loss_clip": 1.0116334, "balance_loss_mlp": 1.01558399, "epoch": 0.7511197955809409, "flos": 18988735478400.0, "grad_norm": 1.892584456461371, "language_loss": 0.7225765, "learning_rate": 6.152068243154671e-07, "loss": 0.74341697, "num_input_tokens_seen": 269408530, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.35351562, "step": 12493, "time_per_iteration": 2.36411190032959 }, { "auxiliary_loss_clip": 0.01052719, "auxiliary_loss_mlp": 0.01035297, "balance_loss_clip": 1.01234925, "balance_loss_mlp": 1.01669669, "epoch": 0.7511799188336089, "flos": 22046080037760.0, "grad_norm": 1.6626626808198401, "language_loss": 0.81411946, "learning_rate": 6.149258472993395e-07, "loss": 0.83499968, "num_input_tokens_seen": 269425930, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.359375, "step": 12494, "time_per_iteration": 2.38456392288208 }, { "auxiliary_loss_clip": 0.01053972, "auxiliary_loss_mlp": 0.0104094, "balance_loss_clip": 1.01532209, "balance_loss_mlp": 1.01671922, "epoch": 0.7512400420862768, "flos": 16466330770560.0, "grad_norm": 1.7361912941426558, "language_loss": 0.80148375, "learning_rate": 6.146449228053634e-07, "loss": 0.82243288, "num_input_tokens_seen": 269443945, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37109375, "step": 12495, "time_per_iteration": 2.3650412559509277 }, { "auxiliary_loss_clip": 0.01052907, "auxiliary_loss_mlp": 0.01039258, "balance_loss_clip": 1.01694191, "balance_loss_mlp": 1.01576662, "epoch": 0.7513001653389448, "flos": 20447302014720.0, "grad_norm": 2.241029849699323, "language_loss": 0.7152195, "learning_rate": 6.143640508441898e-07, "loss": 0.73614109, "num_input_tokens_seen": 269463625, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.37109375, "step": 12496, "time_per_iteration": 2.3668289184570312 }, { "auxiliary_loss_clip": 0.01052679, "auxiliary_loss_mlp": 0.01037241, "balance_loss_clip": 1.01610494, "balance_loss_mlp": 1.01663876, "epoch": 0.7513602885916129, "flos": 23475703190400.0, "grad_norm": 1.5490331166045441, "language_loss": 0.79001486, "learning_rate": 6.140832314264705e-07, "loss": 0.81091404, "num_input_tokens_seen": 269483415, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.359375, "step": 12497, "time_per_iteration": 2.4206466674804688 }, { "auxiliary_loss_clip": 0.01053035, "auxiliary_loss_mlp": 0.01043413, "balance_loss_clip": 1.01932108, "balance_loss_mlp": 1.01662588, "epoch": 0.7514204118442808, "flos": 26796013176960.0, "grad_norm": 1.6992893310447554, "language_loss": 0.77896118, "learning_rate": 6.13802464562855e-07, "loss": 0.79992568, "num_input_tokens_seen": 269504635, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36328125, "step": 12498, "time_per_iteration": 2.395012617111206 }, { "auxiliary_loss_clip": 0.01050749, "auxiliary_loss_mlp": 0.01036126, "balance_loss_clip": 1.01419115, "balance_loss_mlp": 1.01670146, "epoch": 0.7514805350969488, "flos": 19864601556480.0, "grad_norm": 1.7489822417009924, "language_loss": 0.74474418, "learning_rate": 6.135217502639878e-07, "loss": 0.7656129, "num_input_tokens_seen": 269523955, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.33984375, "step": 12499, "time_per_iteration": 2.3836405277252197 }, { "auxiliary_loss_clip": 0.01050739, "auxiliary_loss_mlp": 0.01031021, "balance_loss_clip": 1.0105288, "balance_loss_mlp": 1.01510406, "epoch": 0.7515406583496167, "flos": 24570404870400.0, "grad_norm": 2.5471870755155654, "language_loss": 0.80414855, "learning_rate": 6.132410885405148e-07, "loss": 0.82496619, "num_input_tokens_seen": 269544410, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.35546875, "step": 12500, "time_per_iteration": 2.3805432319641113 }, { "auxiliary_loss_clip": 0.01055847, "auxiliary_loss_mlp": 0.01042642, "balance_loss_clip": 1.01447272, "balance_loss_mlp": 1.0170784, "epoch": 0.7516007816022847, "flos": 20119223168640.0, "grad_norm": 2.2964602961715994, "language_loss": 0.74736398, "learning_rate": 6.129604794030794e-07, "loss": 0.76834887, "num_input_tokens_seen": 269563315, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.38671875, "step": 12501, "time_per_iteration": 2.3835690021514893 }, { "auxiliary_loss_clip": 0.01051665, "auxiliary_loss_mlp": 0.01033043, "balance_loss_clip": 1.01069105, "balance_loss_mlp": 1.01589012, "epoch": 0.7516609048549526, "flos": 22783515108480.0, "grad_norm": 1.705486259478708, "language_loss": 0.78999037, "learning_rate": 6.126799228623207e-07, "loss": 0.81083745, "num_input_tokens_seen": 269583950, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35742188, "step": 12502, "time_per_iteration": 2.3851442337036133 }, { "auxiliary_loss_clip": 0.0105306, "auxiliary_loss_mlp": 0.01037579, "balance_loss_clip": 1.01302183, "balance_loss_mlp": 1.01632738, "epoch": 0.7517210281076206, "flos": 10633251611520.0, "grad_norm": 2.21844763504999, "language_loss": 0.7307421, "learning_rate": 6.123994189288786e-07, "loss": 0.75164843, "num_input_tokens_seen": 269600120, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.3671875, "step": 12503, "time_per_iteration": 2.3683152198791504 }, { "auxiliary_loss_clip": 0.01008427, "auxiliary_loss_mlp": 0.01003545, "balance_loss_clip": 1.00098217, "balance_loss_mlp": 1.00137234, "epoch": 0.7517811513602886, "flos": 66049001078400.0, "grad_norm": 1.2206335361431635, "language_loss": 0.64133179, "learning_rate": 6.121189676133903e-07, "loss": 0.66145152, "num_input_tokens_seen": 269659815, "router_z_loss_clip": 0.02563477, "router_z_loss_mlp": 0.0703125, "step": 12504, "time_per_iteration": 2.9171221256256104 }, { "auxiliary_loss_clip": 0.01049366, "auxiliary_loss_mlp": 0.01036251, "balance_loss_clip": 1.01344657, "balance_loss_mlp": 1.01526046, "epoch": 0.7518412746129566, "flos": 37266849676800.0, "grad_norm": 1.678412278779344, "language_loss": 0.69442666, "learning_rate": 6.118385689264896e-07, "loss": 0.71528286, "num_input_tokens_seen": 269684565, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.33984375, "step": 12505, "time_per_iteration": 2.551549196243286 }, { "auxiliary_loss_clip": 0.01008812, "auxiliary_loss_mlp": 0.01002811, "balance_loss_clip": 1.00053442, "balance_loss_mlp": 1.00198722, "epoch": 0.7519013978656245, "flos": 60515162115840.0, "grad_norm": 0.6469383450057987, "language_loss": 0.55169588, "learning_rate": 6.11558222878809e-07, "loss": 0.57181215, "num_input_tokens_seen": 269752325, "router_z_loss_clip": 0.02282715, "router_z_loss_mlp": 0.06835938, "step": 12506, "time_per_iteration": 3.181152105331421 }, { "auxiliary_loss_clip": 0.01053246, "auxiliary_loss_mlp": 0.0103801, "balance_loss_clip": 1.01373875, "balance_loss_mlp": 1.01722634, "epoch": 0.7519615211182925, "flos": 18805895354880.0, "grad_norm": 1.9794624955949431, "language_loss": 0.78732973, "learning_rate": 6.112779294809796e-07, "loss": 0.80824226, "num_input_tokens_seen": 269770630, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.359375, "step": 12507, "time_per_iteration": 2.3636245727539062 }, { "auxiliary_loss_clip": 0.01051882, "auxiliary_loss_mlp": 0.01038216, "balance_loss_clip": 1.01762843, "balance_loss_mlp": 1.01662087, "epoch": 0.7520216443709604, "flos": 14574352216320.0, "grad_norm": 1.6448454233968035, "language_loss": 0.72157389, "learning_rate": 6.10997688743631e-07, "loss": 0.74247485, "num_input_tokens_seen": 269787280, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.35351562, "step": 12508, "time_per_iteration": 2.3395674228668213 }, { "auxiliary_loss_clip": 0.01049656, "auxiliary_loss_mlp": 0.01038333, "balance_loss_clip": 1.01668501, "balance_loss_mlp": 1.01533604, "epoch": 0.7520817676236284, "flos": 17055629475840.0, "grad_norm": 1.6645974282641212, "language_loss": 0.72816771, "learning_rate": 6.107175006773885e-07, "loss": 0.74904764, "num_input_tokens_seen": 269805205, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34375, "step": 12509, "time_per_iteration": 2.39573335647583 }, { "auxiliary_loss_clip": 0.01055317, "auxiliary_loss_mlp": 0.0104693, "balance_loss_clip": 1.0210855, "balance_loss_mlp": 1.01748514, "epoch": 0.7521418908762965, "flos": 25665211284480.0, "grad_norm": 1.593829834748856, "language_loss": 0.63319081, "learning_rate": 6.104373652928785e-07, "loss": 0.65421331, "num_input_tokens_seen": 269824820, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37890625, "step": 12510, "time_per_iteration": 2.3912620544433594 }, { "auxiliary_loss_clip": 0.01049719, "auxiliary_loss_mlp": 0.01034976, "balance_loss_clip": 1.01183796, "balance_loss_mlp": 1.01543307, "epoch": 0.7522020141289644, "flos": 20885706357120.0, "grad_norm": 1.6644618795926276, "language_loss": 0.8274101, "learning_rate": 6.10157282600722e-07, "loss": 0.84825712, "num_input_tokens_seen": 269842825, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.34375, "step": 12511, "time_per_iteration": 2.3777265548706055 }, { "auxiliary_loss_clip": 0.01055984, "auxiliary_loss_mlp": 0.01039397, "balance_loss_clip": 1.01442301, "balance_loss_mlp": 1.01712751, "epoch": 0.7522621373816324, "flos": 12639500645760.0, "grad_norm": 2.501082134804211, "language_loss": 0.77364349, "learning_rate": 6.098772526115412e-07, "loss": 0.79459721, "num_input_tokens_seen": 269859000, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38867188, "step": 12512, "time_per_iteration": 3.8453569412231445 }, { "auxiliary_loss_clip": 0.01048872, "auxiliary_loss_mlp": 0.01037453, "balance_loss_clip": 1.01615024, "balance_loss_mlp": 1.01497865, "epoch": 0.7523222606343003, "flos": 25625061354240.0, "grad_norm": 1.6913116739396399, "language_loss": 0.83044237, "learning_rate": 6.095972753359537e-07, "loss": 0.8513056, "num_input_tokens_seen": 269878895, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.33984375, "step": 12513, "time_per_iteration": 2.4073643684387207 }, { "auxiliary_loss_clip": 0.01053451, "auxiliary_loss_mlp": 0.01043619, "balance_loss_clip": 1.01896679, "balance_loss_mlp": 1.01634455, "epoch": 0.7523823838869683, "flos": 20447860596480.0, "grad_norm": 1.8945618221370861, "language_loss": 0.76095283, "learning_rate": 6.093173507845771e-07, "loss": 0.78192359, "num_input_tokens_seen": 269897280, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37109375, "step": 12514, "time_per_iteration": 2.3888628482818604 }, { "auxiliary_loss_clip": 0.01049865, "auxiliary_loss_mlp": 0.01036028, "balance_loss_clip": 1.01477313, "balance_loss_mlp": 1.01578176, "epoch": 0.7524425071396362, "flos": 14719730584320.0, "grad_norm": 2.1600545091254686, "language_loss": 0.70270824, "learning_rate": 6.090374789680271e-07, "loss": 0.72356719, "num_input_tokens_seen": 269914640, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.33984375, "step": 12515, "time_per_iteration": 2.3318045139312744 }, { "auxiliary_loss_clip": 0.01052756, "auxiliary_loss_mlp": 0.01033747, "balance_loss_clip": 1.01015532, "balance_loss_mlp": 1.01665735, "epoch": 0.7525026303923043, "flos": 30590722984320.0, "grad_norm": 1.7797888344747634, "language_loss": 0.71904838, "learning_rate": 6.087576598969137e-07, "loss": 0.73991334, "num_input_tokens_seen": 269934960, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36132812, "step": 12516, "time_per_iteration": 2.4364535808563232 }, { "auxiliary_loss_clip": 0.01050901, "auxiliary_loss_mlp": 0.01037026, "balance_loss_clip": 1.01547313, "balance_loss_mlp": 1.0168668, "epoch": 0.7525627536449722, "flos": 24790567104000.0, "grad_norm": 1.439568493768495, "language_loss": 0.90008628, "learning_rate": 6.084778935818495e-07, "loss": 0.92096549, "num_input_tokens_seen": 269956655, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.33984375, "step": 12517, "time_per_iteration": 2.4371795654296875 }, { "auxiliary_loss_clip": 0.01052595, "auxiliary_loss_mlp": 0.0104241, "balance_loss_clip": 1.01986718, "balance_loss_mlp": 1.01610911, "epoch": 0.7526228768976402, "flos": 20778662528640.0, "grad_norm": 1.5002886508535487, "language_loss": 0.75509942, "learning_rate": 6.081981800334437e-07, "loss": 0.77604944, "num_input_tokens_seen": 269976835, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.36523438, "step": 12518, "time_per_iteration": 3.8496694564819336 }, { "auxiliary_loss_clip": 0.01008149, "auxiliary_loss_mlp": 0.01002022, "balance_loss_clip": 0.99966151, "balance_loss_mlp": 1.00135398, "epoch": 0.7526830001503081, "flos": 66556114709760.0, "grad_norm": 0.7052594109419351, "language_loss": 0.55833805, "learning_rate": 6.079185192623017e-07, "loss": 0.57843971, "num_input_tokens_seen": 270040630, "router_z_loss_clip": 0.02355957, "router_z_loss_mlp": 0.06835938, "step": 12519, "time_per_iteration": 4.463867902755737 }, { "auxiliary_loss_clip": 0.01050832, "auxiliary_loss_mlp": 0.01041513, "balance_loss_clip": 1.01908946, "balance_loss_mlp": 1.01572144, "epoch": 0.7527431234029761, "flos": 23476750531200.0, "grad_norm": 1.4308760017128805, "language_loss": 0.78461421, "learning_rate": 6.07638911279029e-07, "loss": 0.8055377, "num_input_tokens_seen": 270059695, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34960938, "step": 12520, "time_per_iteration": 2.3873491287231445 }, { "auxiliary_loss_clip": 0.01050494, "auxiliary_loss_mlp": 0.01035899, "balance_loss_clip": 1.01322544, "balance_loss_mlp": 1.01514089, "epoch": 0.752803246655644, "flos": 22048593655680.0, "grad_norm": 2.132872101161886, "language_loss": 0.7547555, "learning_rate": 6.07359356094229e-07, "loss": 0.77561939, "num_input_tokens_seen": 270078420, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35351562, "step": 12521, "time_per_iteration": 2.403851270675659 }, { "auxiliary_loss_clip": 0.01054166, "auxiliary_loss_mlp": 0.01044786, "balance_loss_clip": 1.01932311, "balance_loss_mlp": 1.01626313, "epoch": 0.752863369908312, "flos": 30152493198720.0, "grad_norm": 1.8078920691633977, "language_loss": 0.68359113, "learning_rate": 6.070798537185016e-07, "loss": 0.70458066, "num_input_tokens_seen": 270097040, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37890625, "step": 12522, "time_per_iteration": 2.4342448711395264 }, { "auxiliary_loss_clip": 0.01054317, "auxiliary_loss_mlp": 0.01047721, "balance_loss_clip": 1.02577496, "balance_loss_mlp": 1.01732492, "epoch": 0.7529234931609801, "flos": 24566599532160.0, "grad_norm": 1.5171288825226044, "language_loss": 0.79356694, "learning_rate": 6.068004041624453e-07, "loss": 0.81458735, "num_input_tokens_seen": 270116365, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.37109375, "step": 12523, "time_per_iteration": 2.49700927734375 }, { "auxiliary_loss_clip": 0.01051982, "auxiliary_loss_mlp": 0.0104037, "balance_loss_clip": 1.01736271, "balance_loss_mlp": 1.01647866, "epoch": 0.752983616413648, "flos": 23111279752320.0, "grad_norm": 2.005956101264723, "language_loss": 0.81010365, "learning_rate": 6.065210074366571e-07, "loss": 0.83102715, "num_input_tokens_seen": 270135395, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35546875, "step": 12524, "time_per_iteration": 2.365661382675171 }, { "auxiliary_loss_clip": 0.01049848, "auxiliary_loss_mlp": 0.0103682, "balance_loss_clip": 1.01564884, "balance_loss_mlp": 1.01599145, "epoch": 0.753043739666316, "flos": 24315783258240.0, "grad_norm": 1.7998712914615127, "language_loss": 0.75009322, "learning_rate": 6.062416635517326e-07, "loss": 0.77095985, "num_input_tokens_seen": 270156425, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.33984375, "step": 12525, "time_per_iteration": 2.5296785831451416 }, { "auxiliary_loss_clip": 0.01052382, "auxiliary_loss_mlp": 0.01037858, "balance_loss_clip": 1.01548219, "balance_loss_mlp": 1.01636517, "epoch": 0.7531038629189839, "flos": 24242151467520.0, "grad_norm": 1.7785876418073239, "language_loss": 0.73595655, "learning_rate": 6.059623725182641e-07, "loss": 0.756859, "num_input_tokens_seen": 270176905, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.359375, "step": 12526, "time_per_iteration": 2.4295523166656494 }, { "auxiliary_loss_clip": 0.01051082, "auxiliary_loss_mlp": 0.01037313, "balance_loss_clip": 1.01533055, "balance_loss_mlp": 1.01660705, "epoch": 0.7531639861716519, "flos": 30187546070400.0, "grad_norm": 2.4175987618796806, "language_loss": 0.73785019, "learning_rate": 6.056831343468414e-07, "loss": 0.75873411, "num_input_tokens_seen": 270196640, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34375, "step": 12527, "time_per_iteration": 2.487762928009033 }, { "auxiliary_loss_clip": 0.0105013, "auxiliary_loss_mlp": 0.01030715, "balance_loss_clip": 1.00987697, "balance_loss_mlp": 1.01587045, "epoch": 0.7532241094243198, "flos": 18222217378560.0, "grad_norm": 1.9588177759441452, "language_loss": 0.82358384, "learning_rate": 6.054039490480539e-07, "loss": 0.8443923, "num_input_tokens_seen": 270213905, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.34375, "step": 12528, "time_per_iteration": 2.3890132904052734 }, { "auxiliary_loss_clip": 0.01053239, "auxiliary_loss_mlp": 0.01041322, "balance_loss_clip": 1.01639509, "balance_loss_mlp": 1.01680732, "epoch": 0.7532842326769879, "flos": 20880155450880.0, "grad_norm": 2.0096926084576205, "language_loss": 0.86341596, "learning_rate": 6.051248166324892e-07, "loss": 0.88436157, "num_input_tokens_seen": 270231995, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.36328125, "step": 12529, "time_per_iteration": 2.42287540435791 }, { "auxiliary_loss_clip": 0.01054521, "auxiliary_loss_mlp": 0.01042496, "balance_loss_clip": 1.01773667, "balance_loss_mlp": 1.01724553, "epoch": 0.7533443559296558, "flos": 18077676883200.0, "grad_norm": 1.9710426650878614, "language_loss": 0.75477099, "learning_rate": 6.048457371107303e-07, "loss": 0.7757411, "num_input_tokens_seen": 270251480, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.37304688, "step": 12530, "time_per_iteration": 2.4096360206604004 }, { "auxiliary_loss_clip": 0.01008317, "auxiliary_loss_mlp": 0.01004782, "balance_loss_clip": 1.00225461, "balance_loss_mlp": 1.00131083, "epoch": 0.7534044791823238, "flos": 50252024494080.0, "grad_norm": 0.8267312357156381, "language_loss": 0.63711703, "learning_rate": 6.045667104933612e-07, "loss": 0.65724802, "num_input_tokens_seen": 270306480, "router_z_loss_clip": 0.02526855, "router_z_loss_mlp": 0.0703125, "step": 12531, "time_per_iteration": 4.293577432632446 }, { "auxiliary_loss_clip": 0.01053858, "auxiliary_loss_mlp": 0.0103716, "balance_loss_clip": 1.01300812, "balance_loss_mlp": 1.01640654, "epoch": 0.7534646024349917, "flos": 20849222119680.0, "grad_norm": 1.8150094370486118, "language_loss": 0.71357971, "learning_rate": 6.042877367909633e-07, "loss": 0.73448992, "num_input_tokens_seen": 270324595, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.37304688, "step": 12532, "time_per_iteration": 2.3581719398498535 }, { "auxiliary_loss_clip": 0.01050123, "auxiliary_loss_mlp": 0.01038997, "balance_loss_clip": 1.01719356, "balance_loss_mlp": 1.01637125, "epoch": 0.7535247256876597, "flos": 23070780708480.0, "grad_norm": 1.8557082007596366, "language_loss": 0.78130305, "learning_rate": 6.040088160141132e-07, "loss": 0.80219418, "num_input_tokens_seen": 270344375, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.3359375, "step": 12533, "time_per_iteration": 2.4118728637695312 }, { "auxiliary_loss_clip": 0.01008173, "auxiliary_loss_mlp": 0.01004366, "balance_loss_clip": 1.00200605, "balance_loss_mlp": 1.00137746, "epoch": 0.7535848489403276, "flos": 58620006627840.0, "grad_norm": 0.7963334510997042, "language_loss": 0.57392788, "learning_rate": 6.037299481733886e-07, "loss": 0.59405327, "num_input_tokens_seen": 270405235, "router_z_loss_clip": 0.02355957, "router_z_loss_mlp": 0.06787109, "step": 12534, "time_per_iteration": 3.081427812576294 }, { "auxiliary_loss_clip": 0.01050216, "auxiliary_loss_mlp": 0.01036685, "balance_loss_clip": 1.01434469, "balance_loss_mlp": 1.01480889, "epoch": 0.7536449721929956, "flos": 26576688816000.0, "grad_norm": 1.5620860264675178, "language_loss": 0.72359294, "learning_rate": 6.03451133279365e-07, "loss": 0.74446201, "num_input_tokens_seen": 270425820, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35351562, "step": 12535, "time_per_iteration": 2.4551658630371094 }, { "auxiliary_loss_clip": 0.01052958, "auxiliary_loss_mlp": 0.01043084, "balance_loss_clip": 1.01679826, "balance_loss_mlp": 1.01555717, "epoch": 0.7537050954456637, "flos": 25734898091520.0, "grad_norm": 1.527479504455474, "language_loss": 0.81787694, "learning_rate": 6.031723713426135e-07, "loss": 0.83883739, "num_input_tokens_seen": 270447120, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.375, "step": 12536, "time_per_iteration": 2.4245450496673584 }, { "auxiliary_loss_clip": 0.01049273, "auxiliary_loss_mlp": 0.01035936, "balance_loss_clip": 1.0131309, "balance_loss_mlp": 1.015136, "epoch": 0.7537652186983316, "flos": 30223192435200.0, "grad_norm": 1.7186734083211472, "language_loss": 0.75545728, "learning_rate": 6.028936623737067e-07, "loss": 0.77630937, "num_input_tokens_seen": 270468680, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34179688, "step": 12537, "time_per_iteration": 2.4724762439727783 }, { "auxiliary_loss_clip": 0.01052259, "auxiliary_loss_mlp": 0.01034787, "balance_loss_clip": 1.01098132, "balance_loss_mlp": 1.01594496, "epoch": 0.7538253419509996, "flos": 12640408341120.0, "grad_norm": 1.8185892959672139, "language_loss": 0.75447774, "learning_rate": 6.026150063832111e-07, "loss": 0.77534813, "num_input_tokens_seen": 270486310, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36328125, "step": 12538, "time_per_iteration": 2.351973056793213 }, { "auxiliary_loss_clip": 0.01051737, "auxiliary_loss_mlp": 0.01036793, "balance_loss_clip": 1.01386857, "balance_loss_mlp": 1.01636839, "epoch": 0.7538854652036675, "flos": 23184841720320.0, "grad_norm": 1.9871801469524246, "language_loss": 0.68278456, "learning_rate": 6.023364033816956e-07, "loss": 0.70366985, "num_input_tokens_seen": 270507210, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35351562, "step": 12539, "time_per_iteration": 2.442000150680542 }, { "auxiliary_loss_clip": 0.01051969, "auxiliary_loss_mlp": 0.01035129, "balance_loss_clip": 1.0124197, "balance_loss_mlp": 1.01709008, "epoch": 0.7539455884563355, "flos": 23185086099840.0, "grad_norm": 1.6563816130931548, "language_loss": 0.75696772, "learning_rate": 6.020578533797229e-07, "loss": 0.77783871, "num_input_tokens_seen": 270525250, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34765625, "step": 12540, "time_per_iteration": 2.406379461288452 }, { "auxiliary_loss_clip": 0.01052434, "auxiliary_loss_mlp": 0.01035279, "balance_loss_clip": 1.01152062, "balance_loss_mlp": 1.01569963, "epoch": 0.7540057117090034, "flos": 13180515073920.0, "grad_norm": 1.9908026309125533, "language_loss": 0.74500149, "learning_rate": 6.017793563878566e-07, "loss": 0.76587856, "num_input_tokens_seen": 270539295, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.3671875, "step": 12541, "time_per_iteration": 2.375148296356201 }, { "auxiliary_loss_clip": 0.01051298, "auxiliary_loss_mlp": 0.01033947, "balance_loss_clip": 1.01045096, "balance_loss_mlp": 1.0162102, "epoch": 0.7540658349616715, "flos": 45476396593920.0, "grad_norm": 1.5880193505441267, "language_loss": 0.73038387, "learning_rate": 6.015009124166576e-07, "loss": 0.75123632, "num_input_tokens_seen": 270562815, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.3515625, "step": 12542, "time_per_iteration": 2.617290496826172 }, { "auxiliary_loss_clip": 0.01050937, "auxiliary_loss_mlp": 0.01032237, "balance_loss_clip": 1.00939667, "balance_loss_mlp": 1.01585984, "epoch": 0.7541259582143394, "flos": 19929994266240.0, "grad_norm": 2.550459430125662, "language_loss": 0.85966635, "learning_rate": 6.012225214766844e-07, "loss": 0.88049805, "num_input_tokens_seen": 270579055, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3515625, "step": 12543, "time_per_iteration": 2.4195051193237305 }, { "auxiliary_loss_clip": 0.01052838, "auxiliary_loss_mlp": 0.01034977, "balance_loss_clip": 1.01143265, "balance_loss_mlp": 1.01758838, "epoch": 0.7541860814670074, "flos": 27197025586560.0, "grad_norm": 2.463117662276628, "language_loss": 0.75048798, "learning_rate": 6.009441835784927e-07, "loss": 0.77136612, "num_input_tokens_seen": 270599080, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3515625, "step": 12544, "time_per_iteration": 2.4434478282928467 }, { "auxiliary_loss_clip": 0.01050577, "auxiliary_loss_mlp": 0.01035418, "balance_loss_clip": 1.01286364, "balance_loss_mlp": 1.01535845, "epoch": 0.7542462047196753, "flos": 21323098270080.0, "grad_norm": 1.8823328425445394, "language_loss": 0.7022832, "learning_rate": 6.006658987326383e-07, "loss": 0.7231431, "num_input_tokens_seen": 270618715, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.3515625, "step": 12545, "time_per_iteration": 2.447777271270752 }, { "auxiliary_loss_clip": 0.01050172, "auxiliary_loss_mlp": 0.01037136, "balance_loss_clip": 1.01436758, "balance_loss_mlp": 1.01534712, "epoch": 0.7543063279723433, "flos": 11940330291840.0, "grad_norm": 1.9742601714330743, "language_loss": 0.70754349, "learning_rate": 6.003876669496728e-07, "loss": 0.72841656, "num_input_tokens_seen": 270635695, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.34765625, "step": 12546, "time_per_iteration": 2.3720648288726807 }, { "auxiliary_loss_clip": 0.01052968, "auxiliary_loss_mlp": 0.01039355, "balance_loss_clip": 1.01516747, "balance_loss_mlp": 1.01623046, "epoch": 0.7543664512250112, "flos": 22818882182400.0, "grad_norm": 2.1237248508987197, "language_loss": 0.75436819, "learning_rate": 6.00109488240147e-07, "loss": 0.77529138, "num_input_tokens_seen": 270654325, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3671875, "step": 12547, "time_per_iteration": 2.395521640777588 }, { "auxiliary_loss_clip": 0.01052222, "auxiliary_loss_mlp": 0.01038153, "balance_loss_clip": 1.0133456, "balance_loss_mlp": 1.01640034, "epoch": 0.7544265744776792, "flos": 20922784087680.0, "grad_norm": 1.8168276630838271, "language_loss": 0.6847899, "learning_rate": 5.998313626146099e-07, "loss": 0.70569366, "num_input_tokens_seen": 270674260, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.359375, "step": 12548, "time_per_iteration": 2.3880324363708496 }, { "auxiliary_loss_clip": 0.01053457, "auxiliary_loss_mlp": 0.01033586, "balance_loss_clip": 1.00884986, "balance_loss_mlp": 1.01615644, "epoch": 0.7544866977303473, "flos": 15194584252800.0, "grad_norm": 1.8912743803450867, "language_loss": 0.876302, "learning_rate": 5.995532900836088e-07, "loss": 0.89717245, "num_input_tokens_seen": 270692200, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.37304688, "step": 12549, "time_per_iteration": 2.365236282348633 }, { "auxiliary_loss_clip": 0.01048966, "auxiliary_loss_mlp": 0.0103663, "balance_loss_clip": 1.01345551, "balance_loss_mlp": 1.01493835, "epoch": 0.7545468209830152, "flos": 27082615461120.0, "grad_norm": 1.6931000095884086, "language_loss": 0.78390276, "learning_rate": 5.992752706576865e-07, "loss": 0.80475879, "num_input_tokens_seen": 270709675, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.33984375, "step": 12550, "time_per_iteration": 2.4013724327087402 }, { "auxiliary_loss_clip": 0.01051854, "auxiliary_loss_mlp": 0.01032231, "balance_loss_clip": 1.01026034, "balance_loss_mlp": 1.01610982, "epoch": 0.7546069442356832, "flos": 26870447928960.0, "grad_norm": 1.4095999150504634, "language_loss": 0.70162749, "learning_rate": 5.98997304347386e-07, "loss": 0.72246826, "num_input_tokens_seen": 270733055, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35742188, "step": 12551, "time_per_iteration": 2.4543628692626953 }, { "auxiliary_loss_clip": 0.01053962, "auxiliary_loss_mlp": 0.01038258, "balance_loss_clip": 1.01410615, "balance_loss_mlp": 1.01766741, "epoch": 0.7546670674883511, "flos": 15742196928000.0, "grad_norm": 2.1097407869695566, "language_loss": 0.8711127, "learning_rate": 5.987193911632487e-07, "loss": 0.89203501, "num_input_tokens_seen": 270749275, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36328125, "step": 12552, "time_per_iteration": 3.59165096282959 }, { "auxiliary_loss_clip": 0.01052721, "auxiliary_loss_mlp": 0.01038111, "balance_loss_clip": 1.01450765, "balance_loss_mlp": 1.01662266, "epoch": 0.7547271907410191, "flos": 23476575974400.0, "grad_norm": 1.6838345207498164, "language_loss": 0.79692757, "learning_rate": 5.98441531115812e-07, "loss": 0.81783587, "num_input_tokens_seen": 270768230, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36132812, "step": 12553, "time_per_iteration": 2.4197230339050293 }, { "auxiliary_loss_clip": 0.01052372, "auxiliary_loss_mlp": 0.01040041, "balance_loss_clip": 1.01588929, "balance_loss_mlp": 1.01583827, "epoch": 0.754787313993687, "flos": 31721455054080.0, "grad_norm": 2.377922187168272, "language_loss": 0.64757091, "learning_rate": 5.981637242156135e-07, "loss": 0.66849506, "num_input_tokens_seen": 270786285, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36523438, "step": 12554, "time_per_iteration": 2.4552290439605713 }, { "auxiliary_loss_clip": 0.0105248, "auxiliary_loss_mlp": 0.01039191, "balance_loss_clip": 1.01536107, "balance_loss_mlp": 1.01671255, "epoch": 0.7548474372463551, "flos": 27561833049600.0, "grad_norm": 1.6637711263967532, "language_loss": 0.74044049, "learning_rate": 5.978859704731864e-07, "loss": 0.76135719, "num_input_tokens_seen": 270805505, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35742188, "step": 12555, "time_per_iteration": 2.460484743118286 }, { "auxiliary_loss_clip": 0.01054608, "auxiliary_loss_mlp": 0.01037669, "balance_loss_clip": 1.01236069, "balance_loss_mlp": 1.0171392, "epoch": 0.754907560499023, "flos": 19317547463040.0, "grad_norm": 1.7125545547400447, "language_loss": 0.80081761, "learning_rate": 5.976082698990645e-07, "loss": 0.82174039, "num_input_tokens_seen": 270824610, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.375, "step": 12556, "time_per_iteration": 2.3539249897003174 }, { "auxiliary_loss_clip": 0.01008569, "auxiliary_loss_mlp": 0.01002092, "balance_loss_clip": 0.99981493, "balance_loss_mlp": 1.00153208, "epoch": 0.754967683751691, "flos": 69741100800000.0, "grad_norm": 0.7422118952474142, "language_loss": 0.50469643, "learning_rate": 5.973306225037769e-07, "loss": 0.52480304, "num_input_tokens_seen": 270886155, "router_z_loss_clip": 0.02282715, "router_z_loss_mlp": 0.0703125, "step": 12557, "time_per_iteration": 4.374576807022095 }, { "auxiliary_loss_clip": 0.01053687, "auxiliary_loss_mlp": 0.01035231, "balance_loss_clip": 1.00985169, "balance_loss_mlp": 1.01653719, "epoch": 0.7550278070043589, "flos": 24420872050560.0, "grad_norm": 1.5803874855307685, "language_loss": 0.72675377, "learning_rate": 5.970530282978525e-07, "loss": 0.74764299, "num_input_tokens_seen": 270905325, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37109375, "step": 12558, "time_per_iteration": 3.8435425758361816 }, { "auxiliary_loss_clip": 0.01051526, "auxiliary_loss_mlp": 0.01033212, "balance_loss_clip": 1.01195741, "balance_loss_mlp": 1.01644361, "epoch": 0.7550879302570269, "flos": 32633246787840.0, "grad_norm": 2.935641351520168, "language_loss": 0.81219047, "learning_rate": 5.967754872918187e-07, "loss": 0.83303785, "num_input_tokens_seen": 270927535, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.3515625, "step": 12559, "time_per_iteration": 2.480257034301758 }, { "auxiliary_loss_clip": 0.01053291, "auxiliary_loss_mlp": 0.01034905, "balance_loss_clip": 1.01140881, "balance_loss_mlp": 1.01624417, "epoch": 0.7551480535096948, "flos": 21794565536640.0, "grad_norm": 2.3322215228471843, "language_loss": 0.79806304, "learning_rate": 5.96497999496199e-07, "loss": 0.81894499, "num_input_tokens_seen": 270946920, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.37109375, "step": 12560, "time_per_iteration": 2.3976242542266846 }, { "auxiliary_loss_clip": 0.01050277, "auxiliary_loss_mlp": 0.01036718, "balance_loss_clip": 1.01543951, "balance_loss_mlp": 1.01611781, "epoch": 0.7552081767623628, "flos": 18514126189440.0, "grad_norm": 1.6806169942466942, "language_loss": 0.72221422, "learning_rate": 5.96220564921515e-07, "loss": 0.74308419, "num_input_tokens_seen": 270965705, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34179688, "step": 12561, "time_per_iteration": 2.390883445739746 }, { "auxiliary_loss_clip": 0.01051836, "auxiliary_loss_mlp": 0.01034968, "balance_loss_clip": 1.01212764, "balance_loss_mlp": 1.01544094, "epoch": 0.7552683000150308, "flos": 27633614538240.0, "grad_norm": 1.6075129737101073, "language_loss": 0.77034283, "learning_rate": 5.959431835782889e-07, "loss": 0.79121089, "num_input_tokens_seen": 270986550, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36328125, "step": 12562, "time_per_iteration": 2.4276885986328125 }, { "auxiliary_loss_clip": 0.01051104, "auxiliary_loss_mlp": 0.01038522, "balance_loss_clip": 1.01532352, "balance_loss_mlp": 1.01524985, "epoch": 0.7553284232676988, "flos": 20301888735360.0, "grad_norm": 2.049202068091425, "language_loss": 0.76852477, "learning_rate": 5.956658554770371e-07, "loss": 0.78942102, "num_input_tokens_seen": 271006250, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.359375, "step": 12563, "time_per_iteration": 2.4069366455078125 }, { "auxiliary_loss_clip": 0.01055737, "auxiliary_loss_mlp": 0.01047457, "balance_loss_clip": 1.02108765, "balance_loss_mlp": 1.016047, "epoch": 0.7553885465203668, "flos": 33254072317440.0, "grad_norm": 2.301263179847579, "language_loss": 0.695539, "learning_rate": 5.953885806282768e-07, "loss": 0.71657097, "num_input_tokens_seen": 271025575, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.39648438, "step": 12564, "time_per_iteration": 2.511375904083252 }, { "auxiliary_loss_clip": 0.01053579, "auxiliary_loss_mlp": 0.01040013, "balance_loss_clip": 1.01428795, "balance_loss_mlp": 1.01633191, "epoch": 0.7554486697730347, "flos": 21615181637760.0, "grad_norm": 2.932863721180713, "language_loss": 0.69791436, "learning_rate": 5.951113590425228e-07, "loss": 0.71885026, "num_input_tokens_seen": 271045805, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37109375, "step": 12565, "time_per_iteration": 2.4608850479125977 }, { "auxiliary_loss_clip": 0.01055591, "auxiliary_loss_mlp": 0.01038318, "balance_loss_clip": 1.01316476, "balance_loss_mlp": 1.01693201, "epoch": 0.7555087930257027, "flos": 27631834058880.0, "grad_norm": 1.5078230082223012, "language_loss": 0.7577095, "learning_rate": 5.94834190730287e-07, "loss": 0.77864861, "num_input_tokens_seen": 271066065, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38671875, "step": 12566, "time_per_iteration": 2.4740943908691406 }, { "auxiliary_loss_clip": 0.01054749, "auxiliary_loss_mlp": 0.01046762, "balance_loss_clip": 1.02059555, "balance_loss_mlp": 1.0166223, "epoch": 0.7555689162783706, "flos": 23620557888000.0, "grad_norm": 2.3780071904064353, "language_loss": 0.75050056, "learning_rate": 5.945570757020789e-07, "loss": 0.77151573, "num_input_tokens_seen": 271085870, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38085938, "step": 12567, "time_per_iteration": 2.3886756896972656 }, { "auxiliary_loss_clip": 0.01052621, "auxiliary_loss_mlp": 0.01035582, "balance_loss_clip": 1.01239562, "balance_loss_mlp": 1.01659799, "epoch": 0.7556290395310387, "flos": 24861929656320.0, "grad_norm": 1.9924222408612715, "language_loss": 0.64157999, "learning_rate": 5.942800139684073e-07, "loss": 0.662462, "num_input_tokens_seen": 271104260, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.359375, "step": 12568, "time_per_iteration": 2.4692583084106445 }, { "auxiliary_loss_clip": 0.01051165, "auxiliary_loss_mlp": 0.0103891, "balance_loss_clip": 1.01687968, "balance_loss_mlp": 1.01596165, "epoch": 0.7556891627837066, "flos": 43542103605120.0, "grad_norm": 1.7775138923897402, "language_loss": 0.67392147, "learning_rate": 5.940030055397789e-07, "loss": 0.69482219, "num_input_tokens_seen": 271125745, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35351562, "step": 12569, "time_per_iteration": 2.571235418319702 }, { "auxiliary_loss_clip": 0.01055799, "auxiliary_loss_mlp": 0.01041746, "balance_loss_clip": 1.01507854, "balance_loss_mlp": 1.0168736, "epoch": 0.7557492860363746, "flos": 26649727113600.0, "grad_norm": 1.7797614892613784, "language_loss": 0.68246788, "learning_rate": 5.93726050426697e-07, "loss": 0.70344329, "num_input_tokens_seen": 271147145, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.38867188, "step": 12570, "time_per_iteration": 3.9672651290893555 }, { "auxiliary_loss_clip": 0.01053397, "auxiliary_loss_mlp": 0.01034207, "balance_loss_clip": 1.008196, "balance_loss_mlp": 1.01675844, "epoch": 0.7558094092890425, "flos": 55180889550720.0, "grad_norm": 1.858876083293177, "language_loss": 0.72753829, "learning_rate": 5.934491486396647e-07, "loss": 0.74841434, "num_input_tokens_seen": 271170865, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3671875, "step": 12571, "time_per_iteration": 2.697713851928711 }, { "auxiliary_loss_clip": 0.01054112, "auxiliary_loss_mlp": 0.01039241, "balance_loss_clip": 1.01396847, "balance_loss_mlp": 1.01614416, "epoch": 0.7558695325417105, "flos": 23987145830400.0, "grad_norm": 1.7703581139518583, "language_loss": 0.74726391, "learning_rate": 5.931723001891811e-07, "loss": 0.76819748, "num_input_tokens_seen": 271191450, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37890625, "step": 12572, "time_per_iteration": 2.483304977416992 }, { "auxiliary_loss_clip": 0.01055268, "auxiliary_loss_mlp": 0.01042351, "balance_loss_clip": 1.01846194, "balance_loss_mlp": 1.01754117, "epoch": 0.7559296557943784, "flos": 14610382606080.0, "grad_norm": 2.088030550892795, "language_loss": 0.77809834, "learning_rate": 5.928955050857456e-07, "loss": 0.79907453, "num_input_tokens_seen": 271207335, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37695312, "step": 12573, "time_per_iteration": 2.5594263076782227 }, { "auxiliary_loss_clip": 0.01054353, "auxiliary_loss_mlp": 0.01042548, "balance_loss_clip": 1.01756132, "balance_loss_mlp": 1.01548874, "epoch": 0.7559897790470465, "flos": 18549528174720.0, "grad_norm": 1.622856970935265, "language_loss": 0.6964258, "learning_rate": 5.926187633398527e-07, "loss": 0.71739483, "num_input_tokens_seen": 271226895, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38867188, "step": 12574, "time_per_iteration": 2.4467971324920654 }, { "auxiliary_loss_clip": 0.01050958, "auxiliary_loss_mlp": 0.01039149, "balance_loss_clip": 1.01542652, "balance_loss_mlp": 1.01556015, "epoch": 0.7560499022997144, "flos": 17966897539200.0, "grad_norm": 2.270159299372539, "language_loss": 0.73860627, "learning_rate": 5.923420749619974e-07, "loss": 0.75950736, "num_input_tokens_seen": 271244375, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.35546875, "step": 12575, "time_per_iteration": 2.4479002952575684 }, { "auxiliary_loss_clip": 0.01051406, "auxiliary_loss_mlp": 0.01042265, "balance_loss_clip": 1.01928151, "balance_loss_mlp": 1.01587069, "epoch": 0.7561100255523824, "flos": 15737030046720.0, "grad_norm": 2.2294623127545634, "language_loss": 0.73786789, "learning_rate": 5.92065439962673e-07, "loss": 0.75880462, "num_input_tokens_seen": 271259530, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35546875, "step": 12576, "time_per_iteration": 2.3828234672546387 }, { "auxiliary_loss_clip": 0.01052182, "auxiliary_loss_mlp": 0.01038733, "balance_loss_clip": 1.0156188, "balance_loss_mlp": 1.01669157, "epoch": 0.7561701488050504, "flos": 15887191271040.0, "grad_norm": 1.8308687767308962, "language_loss": 0.68457657, "learning_rate": 5.917888583523669e-07, "loss": 0.7054857, "num_input_tokens_seen": 271276835, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.35546875, "step": 12577, "time_per_iteration": 2.3886446952819824 }, { "auxiliary_loss_clip": 0.01051669, "auxiliary_loss_mlp": 0.01037682, "balance_loss_clip": 1.01587892, "balance_loss_mlp": 1.01590014, "epoch": 0.7562302720577183, "flos": 20338128593280.0, "grad_norm": 2.0238211397279677, "language_loss": 0.79147708, "learning_rate": 5.915123301415685e-07, "loss": 0.81237054, "num_input_tokens_seen": 271296275, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.35742188, "step": 12578, "time_per_iteration": 2.4017505645751953 }, { "auxiliary_loss_clip": 0.0105217, "auxiliary_loss_mlp": 0.01040575, "balance_loss_clip": 1.01629186, "balance_loss_mlp": 1.0154047, "epoch": 0.7562903953103863, "flos": 20811201782400.0, "grad_norm": 1.5150385376026165, "language_loss": 0.76423585, "learning_rate": 5.912358553407641e-07, "loss": 0.78516328, "num_input_tokens_seen": 271315685, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.3671875, "step": 12579, "time_per_iteration": 2.4347798824310303 }, { "auxiliary_loss_clip": 0.01055519, "auxiliary_loss_mlp": 0.01041869, "balance_loss_clip": 1.01658428, "balance_loss_mlp": 1.01640153, "epoch": 0.7563505185630542, "flos": 37595487104640.0, "grad_norm": 1.8717892959707667, "language_loss": 0.63957512, "learning_rate": 5.90959433960437e-07, "loss": 0.66054893, "num_input_tokens_seen": 271336790, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.390625, "step": 12580, "time_per_iteration": 2.507239818572998 }, { "auxiliary_loss_clip": 0.01053539, "auxiliary_loss_mlp": 0.01032197, "balance_loss_clip": 1.00909448, "balance_loss_mlp": 1.01734614, "epoch": 0.7564106418157223, "flos": 20229932689920.0, "grad_norm": 2.189514300205276, "language_loss": 0.76127678, "learning_rate": 5.906830660110691e-07, "loss": 0.78213418, "num_input_tokens_seen": 271355470, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36328125, "step": 12581, "time_per_iteration": 2.4476778507232666 }, { "auxiliary_loss_clip": 0.01051552, "auxiliary_loss_mlp": 0.01038651, "balance_loss_clip": 1.01564336, "balance_loss_mlp": 1.01503682, "epoch": 0.7564707650683902, "flos": 24753698841600.0, "grad_norm": 2.2629812107112897, "language_loss": 0.6405021, "learning_rate": 5.904067515031412e-07, "loss": 0.66140413, "num_input_tokens_seen": 271375810, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36523438, "step": 12582, "time_per_iteration": 2.395158290863037 }, { "auxiliary_loss_clip": 0.01008671, "auxiliary_loss_mlp": 0.01004513, "balance_loss_clip": 1.00192654, "balance_loss_mlp": 1.00148892, "epoch": 0.7565308883210582, "flos": 48527594887680.0, "grad_norm": 0.945847273377463, "language_loss": 0.60670328, "learning_rate": 5.901304904471307e-07, "loss": 0.62683511, "num_input_tokens_seen": 271424775, "router_z_loss_clip": 0.02587891, "router_z_loss_mlp": 0.07177734, "step": 12583, "time_per_iteration": 2.8325090408325195 }, { "auxiliary_loss_clip": 0.0105409, "auxiliary_loss_mlp": 0.01041094, "balance_loss_clip": 1.01651323, "balance_loss_mlp": 1.01642299, "epoch": 0.7565910115737261, "flos": 12494261923200.0, "grad_norm": 1.9747874918623742, "language_loss": 0.80435079, "learning_rate": 5.898542828535125e-07, "loss": 0.82530266, "num_input_tokens_seen": 271440500, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.375, "step": 12584, "time_per_iteration": 2.3499972820281982 }, { "auxiliary_loss_clip": 0.010507, "auxiliary_loss_mlp": 0.01034464, "balance_loss_clip": 1.01217163, "balance_loss_mlp": 1.01583719, "epoch": 0.7566511348263941, "flos": 21172099173120.0, "grad_norm": 2.4801759039610745, "language_loss": 0.78104901, "learning_rate": 5.895781287327612e-07, "loss": 0.80190063, "num_input_tokens_seen": 271458180, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34960938, "step": 12585, "time_per_iteration": 2.423314094543457 }, { "auxiliary_loss_clip": 0.01055152, "auxiliary_loss_mlp": 0.01038137, "balance_loss_clip": 1.01374674, "balance_loss_mlp": 1.01740324, "epoch": 0.756711258079062, "flos": 21753961758720.0, "grad_norm": 1.6882541611631476, "language_loss": 0.84630626, "learning_rate": 5.893020280953493e-07, "loss": 0.86723912, "num_input_tokens_seen": 271475730, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37695312, "step": 12586, "time_per_iteration": 2.3527650833129883 }, { "auxiliary_loss_clip": 0.0105433, "auxiliary_loss_mlp": 0.01036314, "balance_loss_clip": 1.01291347, "balance_loss_mlp": 1.01696277, "epoch": 0.75677138133173, "flos": 22381804471680.0, "grad_norm": 2.324012972716028, "language_loss": 0.85135579, "learning_rate": 5.890259809517459e-07, "loss": 0.87226224, "num_input_tokens_seen": 271495030, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.375, "step": 12587, "time_per_iteration": 2.4331281185150146 }, { "auxiliary_loss_clip": 0.01051219, "auxiliary_loss_mlp": 0.0103516, "balance_loss_clip": 1.01185489, "balance_loss_mlp": 1.01578295, "epoch": 0.756831504584398, "flos": 22707928281600.0, "grad_norm": 1.792715653228071, "language_loss": 0.71916819, "learning_rate": 5.88749987312418e-07, "loss": 0.74003196, "num_input_tokens_seen": 271515355, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35351562, "step": 12588, "time_per_iteration": 2.5149829387664795 }, { "auxiliary_loss_clip": 0.01053482, "auxiliary_loss_mlp": 0.01037429, "balance_loss_clip": 1.01347971, "balance_loss_mlp": 1.01662016, "epoch": 0.756891627837066, "flos": 24097192035840.0, "grad_norm": 3.9273913905253828, "language_loss": 0.70407307, "learning_rate": 5.884740471878327e-07, "loss": 0.72498226, "num_input_tokens_seen": 271535090, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36914062, "step": 12589, "time_per_iteration": 2.4207379817962646 }, { "auxiliary_loss_clip": 0.01053824, "auxiliary_loss_mlp": 0.01038761, "balance_loss_clip": 1.01407242, "balance_loss_mlp": 1.01634479, "epoch": 0.756951751089734, "flos": 19748166572160.0, "grad_norm": 1.6924315941977943, "language_loss": 0.93045932, "learning_rate": 5.881981605884522e-07, "loss": 0.95138514, "num_input_tokens_seen": 271551075, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.375, "step": 12590, "time_per_iteration": 2.405069351196289 }, { "auxiliary_loss_clip": 0.01051004, "auxiliary_loss_mlp": 0.01035649, "balance_loss_clip": 1.01295137, "balance_loss_mlp": 1.01588237, "epoch": 0.7570118743424019, "flos": 35077830341760.0, "grad_norm": 1.9847884533682028, "language_loss": 0.66234982, "learning_rate": 5.879223275247391e-07, "loss": 0.68321633, "num_input_tokens_seen": 271571035, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.3515625, "step": 12591, "time_per_iteration": 3.7293288707733154 }, { "auxiliary_loss_clip": 0.01052289, "auxiliary_loss_mlp": 0.01031289, "balance_loss_clip": 1.00929523, "balance_loss_mlp": 1.0169189, "epoch": 0.7570719975950699, "flos": 25593325061760.0, "grad_norm": 1.5421673371489821, "language_loss": 0.75709641, "learning_rate": 5.876465480071528e-07, "loss": 0.77793223, "num_input_tokens_seen": 271592950, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35351562, "step": 12592, "time_per_iteration": 2.4673094749450684 }, { "auxiliary_loss_clip": 0.01052302, "auxiliary_loss_mlp": 0.01040255, "balance_loss_clip": 1.01623404, "balance_loss_mlp": 1.01570153, "epoch": 0.7571321208477378, "flos": 10815463330560.0, "grad_norm": 2.5496933745105093, "language_loss": 0.72908777, "learning_rate": 5.873708220461522e-07, "loss": 0.75001335, "num_input_tokens_seen": 271608835, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 12593, "time_per_iteration": 2.416538953781128 }, { "auxiliary_loss_clip": 0.01053755, "auxiliary_loss_mlp": 0.01040627, "balance_loss_clip": 1.01623702, "balance_loss_mlp": 1.01605904, "epoch": 0.7571922441004059, "flos": 18259120552320.0, "grad_norm": 2.344541736379826, "language_loss": 0.67487955, "learning_rate": 5.870951496521903e-07, "loss": 0.69582337, "num_input_tokens_seen": 271627730, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37695312, "step": 12594, "time_per_iteration": 2.3820931911468506 }, { "auxiliary_loss_clip": 0.01054456, "auxiliary_loss_mlp": 0.01038344, "balance_loss_clip": 1.01409698, "balance_loss_mlp": 1.01680291, "epoch": 0.7572523673530738, "flos": 22889476684800.0, "grad_norm": 1.9751125145847035, "language_loss": 0.81531274, "learning_rate": 5.86819530835722e-07, "loss": 0.83624077, "num_input_tokens_seen": 271646415, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37695312, "step": 12595, "time_per_iteration": 2.4210867881774902 }, { "auxiliary_loss_clip": 0.01052201, "auxiliary_loss_mlp": 0.01036397, "balance_loss_clip": 1.01445031, "balance_loss_mlp": 1.01651359, "epoch": 0.7573124906057418, "flos": 20995263803520.0, "grad_norm": 2.0090268050523368, "language_loss": 0.72715831, "learning_rate": 5.865439656071993e-07, "loss": 0.74804425, "num_input_tokens_seen": 271666240, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.35742188, "step": 12596, "time_per_iteration": 2.4222259521484375 }, { "auxiliary_loss_clip": 0.01051053, "auxiliary_loss_mlp": 0.01032979, "balance_loss_clip": 1.01026917, "balance_loss_mlp": 1.01596403, "epoch": 0.7573726138584097, "flos": 20885252509440.0, "grad_norm": 3.1984769106012743, "language_loss": 0.82085133, "learning_rate": 5.862684539770706e-07, "loss": 0.84169167, "num_input_tokens_seen": 271686370, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 12597, "time_per_iteration": 3.8836419582366943 }, { "auxiliary_loss_clip": 0.01056108, "auxiliary_loss_mlp": 0.01039344, "balance_loss_clip": 1.01262879, "balance_loss_mlp": 1.01807892, "epoch": 0.7574327371110777, "flos": 24529486890240.0, "grad_norm": 1.8450985515331628, "language_loss": 0.83532596, "learning_rate": 5.859929959557835e-07, "loss": 0.85628051, "num_input_tokens_seen": 271705050, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.38085938, "step": 12598, "time_per_iteration": 2.4708263874053955 }, { "auxiliary_loss_clip": 0.01051977, "auxiliary_loss_mlp": 0.01032355, "balance_loss_clip": 1.01028943, "balance_loss_mlp": 1.01669455, "epoch": 0.7574928603637456, "flos": 23363492480640.0, "grad_norm": 1.564969128359294, "language_loss": 0.63856089, "learning_rate": 5.857175915537845e-07, "loss": 0.65940422, "num_input_tokens_seen": 271724915, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35351562, "step": 12599, "time_per_iteration": 2.4094395637512207 }, { "auxiliary_loss_clip": 0.01055481, "auxiliary_loss_mlp": 0.01038251, "balance_loss_clip": 1.0120846, "balance_loss_mlp": 1.01743495, "epoch": 0.7575529836164137, "flos": 13515436546560.0, "grad_norm": 2.3364843261246495, "language_loss": 0.6543417, "learning_rate": 5.854422407815161e-07, "loss": 0.67527902, "num_input_tokens_seen": 271742410, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38085938, "step": 12600, "time_per_iteration": 2.408433675765991 }, { "auxiliary_loss_clip": 0.01051727, "auxiliary_loss_mlp": 0.01033608, "balance_loss_clip": 1.01246071, "balance_loss_mlp": 1.01682782, "epoch": 0.7576131068690816, "flos": 19645556486400.0, "grad_norm": 2.104252771477835, "language_loss": 0.67357445, "learning_rate": 5.851669436494191e-07, "loss": 0.69442779, "num_input_tokens_seen": 271761425, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34960938, "step": 12601, "time_per_iteration": 2.388345718383789 }, { "auxiliary_loss_clip": 0.01051628, "auxiliary_loss_mlp": 0.01039077, "balance_loss_clip": 1.01816809, "balance_loss_mlp": 1.01666594, "epoch": 0.7576732301217496, "flos": 20047197300480.0, "grad_norm": 1.5828370897945256, "language_loss": 0.69280565, "learning_rate": 5.848917001679335e-07, "loss": 0.71371269, "num_input_tokens_seen": 271780875, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34960938, "step": 12602, "time_per_iteration": 2.4322757720947266 }, { "auxiliary_loss_clip": 0.01051189, "auxiliary_loss_mlp": 0.01039624, "balance_loss_clip": 1.01554418, "balance_loss_mlp": 1.01590216, "epoch": 0.7577333533744176, "flos": 15376202478720.0, "grad_norm": 1.7995432895370804, "language_loss": 0.68332636, "learning_rate": 5.846165103474967e-07, "loss": 0.70423448, "num_input_tokens_seen": 271799490, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.35351562, "step": 12603, "time_per_iteration": 2.380073308944702 }, { "auxiliary_loss_clip": 0.01049417, "auxiliary_loss_mlp": 0.01035267, "balance_loss_clip": 1.01425028, "balance_loss_mlp": 1.01514983, "epoch": 0.7577934766270855, "flos": 17893894152960.0, "grad_norm": 1.9597389789789155, "language_loss": 0.63141966, "learning_rate": 5.843413741985439e-07, "loss": 0.6522665, "num_input_tokens_seen": 271817040, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.34179688, "step": 12604, "time_per_iteration": 2.353097915649414 }, { "auxiliary_loss_clip": 0.0105283, "auxiliary_loss_mlp": 0.01035322, "balance_loss_clip": 1.0128032, "balance_loss_mlp": 1.01679075, "epoch": 0.7578535998797535, "flos": 21612772753920.0, "grad_norm": 1.862593364300812, "language_loss": 0.81059808, "learning_rate": 5.840662917315076e-07, "loss": 0.83147967, "num_input_tokens_seen": 271835480, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.359375, "step": 12605, "time_per_iteration": 2.4288647174835205 }, { "auxiliary_loss_clip": 0.01054543, "auxiliary_loss_mlp": 0.01041941, "balance_loss_clip": 1.01623964, "balance_loss_mlp": 1.01564264, "epoch": 0.7579137231324214, "flos": 18477397572480.0, "grad_norm": 3.3023209589049607, "language_loss": 0.80740035, "learning_rate": 5.837912629568198e-07, "loss": 0.82836521, "num_input_tokens_seen": 271849835, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38867188, "step": 12606, "time_per_iteration": 2.3573949337005615 }, { "auxiliary_loss_clip": 0.01051101, "auxiliary_loss_mlp": 0.01035127, "balance_loss_clip": 1.01592255, "balance_loss_mlp": 1.01666009, "epoch": 0.7579738463850895, "flos": 23254004856960.0, "grad_norm": 1.3766187284215885, "language_loss": 0.73606139, "learning_rate": 5.835162878849087e-07, "loss": 0.75692368, "num_input_tokens_seen": 271869560, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.34375, "step": 12607, "time_per_iteration": 2.427187204360962 }, { "auxiliary_loss_clip": 0.01053003, "auxiliary_loss_mlp": 0.01042027, "balance_loss_clip": 1.01741052, "balance_loss_mlp": 1.01545894, "epoch": 0.7580339696377574, "flos": 14026180959360.0, "grad_norm": 1.959243076848414, "language_loss": 0.75718975, "learning_rate": 5.83241366526202e-07, "loss": 0.77814007, "num_input_tokens_seen": 271887950, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.375, "step": 12608, "time_per_iteration": 2.394636869430542 }, { "auxiliary_loss_clip": 0.01051763, "auxiliary_loss_mlp": 0.01040891, "balance_loss_clip": 1.01552355, "balance_loss_mlp": 1.01688516, "epoch": 0.7580940928904254, "flos": 25081603130880.0, "grad_norm": 1.7068109579343937, "language_loss": 0.72890043, "learning_rate": 5.829664988911245e-07, "loss": 0.74982691, "num_input_tokens_seen": 271907700, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.34960938, "step": 12609, "time_per_iteration": 2.43148136138916 }, { "auxiliary_loss_clip": 0.01053412, "auxiliary_loss_mlp": 0.01038142, "balance_loss_clip": 1.01278579, "balance_loss_mlp": 1.01562297, "epoch": 0.7581542161430933, "flos": 23835448506240.0, "grad_norm": 1.6308651912054826, "language_loss": 0.82232749, "learning_rate": 5.826916849901007e-07, "loss": 0.843243, "num_input_tokens_seen": 271926840, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37695312, "step": 12610, "time_per_iteration": 3.9108715057373047 }, { "auxiliary_loss_clip": 0.01054926, "auxiliary_loss_mlp": 0.0103845, "balance_loss_clip": 1.01442933, "balance_loss_mlp": 1.01681376, "epoch": 0.7582143393957613, "flos": 22235902433280.0, "grad_norm": 1.6711675764362164, "language_loss": 0.71260351, "learning_rate": 5.824169248335488e-07, "loss": 0.73353726, "num_input_tokens_seen": 271946465, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.38085938, "step": 12611, "time_per_iteration": 2.4045212268829346 }, { "auxiliary_loss_clip": 0.01052426, "auxiliary_loss_mlp": 0.01033261, "balance_loss_clip": 1.01027727, "balance_loss_mlp": 1.01645172, "epoch": 0.7582744626484292, "flos": 21105310008960.0, "grad_norm": 1.7245988890993413, "language_loss": 0.71947581, "learning_rate": 5.821422184318893e-07, "loss": 0.74033272, "num_input_tokens_seen": 271967295, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.359375, "step": 12612, "time_per_iteration": 2.4358608722686768 }, { "auxiliary_loss_clip": 0.01055341, "auxiliary_loss_mlp": 0.01042731, "balance_loss_clip": 1.01925921, "balance_loss_mlp": 1.01781678, "epoch": 0.7583345859010973, "flos": 24603712174080.0, "grad_norm": 1.463220223020773, "language_loss": 0.60627794, "learning_rate": 5.818675657955397e-07, "loss": 0.62725866, "num_input_tokens_seen": 271987960, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.375, "step": 12613, "time_per_iteration": 2.422016143798828 }, { "auxiliary_loss_clip": 0.0105301, "auxiliary_loss_mlp": 0.01038531, "balance_loss_clip": 1.01476097, "balance_loss_mlp": 1.0163517, "epoch": 0.7583947091537652, "flos": 33545422546560.0, "grad_norm": 1.8184402422983665, "language_loss": 0.60951322, "learning_rate": 5.815929669349135e-07, "loss": 0.63042867, "num_input_tokens_seen": 272011780, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.3671875, "step": 12614, "time_per_iteration": 2.526916742324829 }, { "auxiliary_loss_clip": 0.0105302, "auxiliary_loss_mlp": 0.01035903, "balance_loss_clip": 1.01357484, "balance_loss_mlp": 1.01632905, "epoch": 0.7584548324064332, "flos": 20119956307200.0, "grad_norm": 1.6742489226330188, "language_loss": 0.73890758, "learning_rate": 5.813184218604246e-07, "loss": 0.7597968, "num_input_tokens_seen": 272030825, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3671875, "step": 12615, "time_per_iteration": 2.3701961040496826 }, { "auxiliary_loss_clip": 0.01008792, "auxiliary_loss_mlp": 0.01004399, "balance_loss_clip": 1.00197923, "balance_loss_mlp": 1.0016222, "epoch": 0.7585149556591012, "flos": 70399004060160.0, "grad_norm": 0.813807366047582, "language_loss": 0.67798907, "learning_rate": 5.810439305824828e-07, "loss": 0.69812095, "num_input_tokens_seen": 272095825, "router_z_loss_clip": 0.02416992, "router_z_loss_mlp": 0.07177734, "step": 12616, "time_per_iteration": 3.0687966346740723 }, { "auxiliary_loss_clip": 0.01055517, "auxiliary_loss_mlp": 0.01040811, "balance_loss_clip": 1.01478815, "balance_loss_mlp": 1.01730728, "epoch": 0.7585750789117691, "flos": 16142860224000.0, "grad_norm": 1.8058177760210061, "language_loss": 0.85245633, "learning_rate": 5.807694931114979e-07, "loss": 0.87341964, "num_input_tokens_seen": 272113950, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3828125, "step": 12617, "time_per_iteration": 2.3727877140045166 }, { "auxiliary_loss_clip": 0.01054382, "auxiliary_loss_mlp": 0.0103786, "balance_loss_clip": 1.01549602, "balance_loss_mlp": 1.01693702, "epoch": 0.7586352021644371, "flos": 17492218427520.0, "grad_norm": 2.26123035208268, "language_loss": 0.76460135, "learning_rate": 5.804951094578757e-07, "loss": 0.78552377, "num_input_tokens_seen": 272130315, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.375, "step": 12618, "time_per_iteration": 2.3974733352661133 }, { "auxiliary_loss_clip": 0.01054742, "auxiliary_loss_mlp": 0.01044083, "balance_loss_clip": 1.01857233, "balance_loss_mlp": 1.01617122, "epoch": 0.758695325417105, "flos": 17274220698240.0, "grad_norm": 1.9713821444274184, "language_loss": 0.79036748, "learning_rate": 5.802207796320209e-07, "loss": 0.81135571, "num_input_tokens_seen": 272149080, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38671875, "step": 12619, "time_per_iteration": 2.362710475921631 }, { "auxiliary_loss_clip": 0.01052881, "auxiliary_loss_mlp": 0.01035599, "balance_loss_clip": 1.01062429, "balance_loss_mlp": 1.01685262, "epoch": 0.7587554486697731, "flos": 29494415381760.0, "grad_norm": 1.6271759503793115, "language_loss": 0.83167231, "learning_rate": 5.79946503644337e-07, "loss": 0.85255718, "num_input_tokens_seen": 272168285, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.359375, "step": 12620, "time_per_iteration": 2.4521214962005615 }, { "auxiliary_loss_clip": 0.01054066, "auxiliary_loss_mlp": 0.01041763, "balance_loss_clip": 1.01591861, "balance_loss_mlp": 1.01609373, "epoch": 0.758815571922441, "flos": 16100057030400.0, "grad_norm": 4.942242850813006, "language_loss": 0.83610427, "learning_rate": 5.796722815052242e-07, "loss": 0.85706258, "num_input_tokens_seen": 272184585, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37890625, "step": 12621, "time_per_iteration": 2.333015203475952 }, { "auxiliary_loss_clip": 0.01052588, "auxiliary_loss_mlp": 0.01039462, "balance_loss_clip": 1.01598942, "balance_loss_mlp": 1.01632345, "epoch": 0.758875695175109, "flos": 16142790401280.0, "grad_norm": 2.050092714871132, "language_loss": 0.75111836, "learning_rate": 5.7939811322508e-07, "loss": 0.77203882, "num_input_tokens_seen": 272200205, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 12622, "time_per_iteration": 2.410367488861084 }, { "auxiliary_loss_clip": 0.01008556, "auxiliary_loss_mlp": 0.01002838, "balance_loss_clip": 1.00047719, "balance_loss_mlp": 1.00146306, "epoch": 0.7589358184277769, "flos": 68458671406080.0, "grad_norm": 0.8440297375844196, "language_loss": 0.60833216, "learning_rate": 5.791239988143024e-07, "loss": 0.6284461, "num_input_tokens_seen": 272259670, "router_z_loss_clip": 0.02355957, "router_z_loss_mlp": 0.07080078, "step": 12623, "time_per_iteration": 3.03436017036438 }, { "auxiliary_loss_clip": 0.01052269, "auxiliary_loss_mlp": 0.01034768, "balance_loss_clip": 1.01352501, "balance_loss_mlp": 1.01703107, "epoch": 0.7589959416804449, "flos": 20046289605120.0, "grad_norm": 2.4516521903050514, "language_loss": 0.68688917, "learning_rate": 5.788499382832847e-07, "loss": 0.70775962, "num_input_tokens_seen": 272277925, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.3515625, "step": 12624, "time_per_iteration": 2.44975209236145 }, { "auxiliary_loss_clip": 0.01050946, "auxiliary_loss_mlp": 0.01037232, "balance_loss_clip": 1.01595283, "balance_loss_mlp": 1.01644194, "epoch": 0.7590560649331128, "flos": 18770772660480.0, "grad_norm": 1.7763545047130584, "language_loss": 0.77273613, "learning_rate": 5.785759316424196e-07, "loss": 0.7936179, "num_input_tokens_seen": 272296010, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34375, "step": 12625, "time_per_iteration": 2.5047059059143066 }, { "auxiliary_loss_clip": 0.01053531, "auxiliary_loss_mlp": 0.01040671, "balance_loss_clip": 1.01632881, "balance_loss_mlp": 1.01752877, "epoch": 0.7591161881857809, "flos": 29823995416320.0, "grad_norm": 1.993863544775772, "language_loss": 0.64106381, "learning_rate": 5.783019789020977e-07, "loss": 0.66200584, "num_input_tokens_seen": 272318330, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.359375, "step": 12626, "time_per_iteration": 2.6232433319091797 }, { "auxiliary_loss_clip": 0.01053527, "auxiliary_loss_mlp": 0.01044523, "balance_loss_clip": 1.01711726, "balance_loss_mlp": 1.0172205, "epoch": 0.7591763114384488, "flos": 20301679267200.0, "grad_norm": 1.9218332381082242, "language_loss": 0.75191373, "learning_rate": 5.780280800727084e-07, "loss": 0.77289426, "num_input_tokens_seen": 272335265, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.36328125, "step": 12627, "time_per_iteration": 2.473642110824585 }, { "auxiliary_loss_clip": 0.01054867, "auxiliary_loss_mlp": 0.01035343, "balance_loss_clip": 1.01159668, "balance_loss_mlp": 1.0163312, "epoch": 0.7592364346911168, "flos": 20812563325440.0, "grad_norm": 6.511439016625779, "language_loss": 0.69724894, "learning_rate": 5.777542351646356e-07, "loss": 0.71815097, "num_input_tokens_seen": 272354795, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.38476562, "step": 12628, "time_per_iteration": 2.4487459659576416 }, { "auxiliary_loss_clip": 0.01057347, "auxiliary_loss_mlp": 0.01043985, "balance_loss_clip": 1.01744878, "balance_loss_mlp": 1.01712036, "epoch": 0.7592965579437848, "flos": 21250443997440.0, "grad_norm": 2.006054057483096, "language_loss": 0.63970375, "learning_rate": 5.774804441882648e-07, "loss": 0.66071707, "num_input_tokens_seen": 272372875, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.40234375, "step": 12629, "time_per_iteration": 2.4446065425872803 }, { "auxiliary_loss_clip": 0.01049798, "auxiliary_loss_mlp": 0.01037139, "balance_loss_clip": 1.01417887, "balance_loss_mlp": 1.01516557, "epoch": 0.7593566811964527, "flos": 26212405023360.0, "grad_norm": 1.6522217497268992, "language_loss": 0.79197502, "learning_rate": 5.772067071539786e-07, "loss": 0.8128444, "num_input_tokens_seen": 272394715, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.34765625, "step": 12630, "time_per_iteration": 2.626588821411133 }, { "auxiliary_loss_clip": 0.01008581, "auxiliary_loss_mlp": 0.01005769, "balance_loss_clip": 1.00357556, "balance_loss_mlp": 1.00147486, "epoch": 0.7594168044491207, "flos": 71233777601280.0, "grad_norm": 0.9982977486307736, "language_loss": 0.61576617, "learning_rate": 5.769330240721562e-07, "loss": 0.63590968, "num_input_tokens_seen": 272458775, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.07128906, "step": 12631, "time_per_iteration": 4.304818391799927 }, { "auxiliary_loss_clip": 0.01056864, "auxiliary_loss_mlp": 0.0104535, "balance_loss_clip": 1.01729989, "balance_loss_mlp": 1.018538, "epoch": 0.7594769277017887, "flos": 26612160624000.0, "grad_norm": 1.780526134385108, "language_loss": 0.75197297, "learning_rate": 5.766593949531767e-07, "loss": 0.77299511, "num_input_tokens_seen": 272479355, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.3828125, "step": 12632, "time_per_iteration": 2.42234206199646 }, { "auxiliary_loss_clip": 0.01054323, "auxiliary_loss_mlp": 0.01038249, "balance_loss_clip": 1.01317966, "balance_loss_mlp": 1.01730394, "epoch": 0.7595370509544567, "flos": 17595177626880.0, "grad_norm": 2.056443341151545, "language_loss": 0.75332689, "learning_rate": 5.763858198074154e-07, "loss": 0.77425253, "num_input_tokens_seen": 272493555, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36914062, "step": 12633, "time_per_iteration": 2.400239944458008 }, { "auxiliary_loss_clip": 0.01051119, "auxiliary_loss_mlp": 0.01036076, "balance_loss_clip": 1.01375985, "balance_loss_mlp": 1.01603961, "epoch": 0.7595971742071246, "flos": 18002020233600.0, "grad_norm": 2.158577484574086, "language_loss": 0.74417555, "learning_rate": 5.76112298645246e-07, "loss": 0.76504749, "num_input_tokens_seen": 272508925, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.3515625, "step": 12634, "time_per_iteration": 2.370774507522583 }, { "auxiliary_loss_clip": 0.01053613, "auxiliary_loss_mlp": 0.01036813, "balance_loss_clip": 1.01259017, "balance_loss_mlp": 1.01739931, "epoch": 0.7596572974597926, "flos": 28839060650880.0, "grad_norm": 1.7350354429533523, "language_loss": 0.65959454, "learning_rate": 5.758388314770408e-07, "loss": 0.68049878, "num_input_tokens_seen": 272528805, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36328125, "step": 12635, "time_per_iteration": 2.4188039302825928 }, { "auxiliary_loss_clip": 0.01055022, "auxiliary_loss_mlp": 0.01041541, "balance_loss_clip": 1.01518369, "balance_loss_mlp": 1.01711726, "epoch": 0.7597174207124605, "flos": 14281954646400.0, "grad_norm": 1.686316050820359, "language_loss": 0.70490086, "learning_rate": 5.7556541831317e-07, "loss": 0.72586656, "num_input_tokens_seen": 272546655, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.37890625, "step": 12636, "time_per_iteration": 3.799367666244507 }, { "auxiliary_loss_clip": 0.01054583, "auxiliary_loss_mlp": 0.01042425, "balance_loss_clip": 1.01778412, "balance_loss_mlp": 1.01727688, "epoch": 0.7597775439651285, "flos": 21687870821760.0, "grad_norm": 1.8044394555550034, "language_loss": 0.82159936, "learning_rate": 5.752920591640018e-07, "loss": 0.84256941, "num_input_tokens_seen": 272564010, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37304688, "step": 12637, "time_per_iteration": 3.907252073287964 }, { "auxiliary_loss_clip": 0.01053812, "auxiliary_loss_mlp": 0.01040709, "balance_loss_clip": 1.01610446, "balance_loss_mlp": 1.01671195, "epoch": 0.7598376672177964, "flos": 36099773015040.0, "grad_norm": 1.8018279202719802, "language_loss": 0.67252958, "learning_rate": 5.750187540399017e-07, "loss": 0.69347489, "num_input_tokens_seen": 272585840, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37109375, "step": 12638, "time_per_iteration": 2.566556930541992 }, { "auxiliary_loss_clip": 0.01053685, "auxiliary_loss_mlp": 0.01043044, "balance_loss_clip": 1.01684213, "balance_loss_mlp": 1.01693928, "epoch": 0.7598977904704645, "flos": 18331355888640.0, "grad_norm": 2.285753481577789, "language_loss": 0.6710968, "learning_rate": 5.747455029512323e-07, "loss": 0.69206417, "num_input_tokens_seen": 272602300, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3671875, "step": 12639, "time_per_iteration": 2.360905408859253 }, { "auxiliary_loss_clip": 0.01053212, "auxiliary_loss_mlp": 0.01037067, "balance_loss_clip": 1.01152062, "balance_loss_mlp": 1.01638043, "epoch": 0.7599579137231324, "flos": 20191633061760.0, "grad_norm": 2.1959489046411393, "language_loss": 0.71130884, "learning_rate": 5.744723059083572e-07, "loss": 0.73221171, "num_input_tokens_seen": 272619595, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.3671875, "step": 12640, "time_per_iteration": 2.3644821643829346 }, { "auxiliary_loss_clip": 0.01054343, "auxiliary_loss_mlp": 0.0103934, "balance_loss_clip": 1.01310182, "balance_loss_mlp": 1.01648211, "epoch": 0.7600180369758004, "flos": 24023699890560.0, "grad_norm": 1.9185500389468169, "language_loss": 0.67818403, "learning_rate": 5.741991629216343e-07, "loss": 0.69912088, "num_input_tokens_seen": 272638825, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37890625, "step": 12641, "time_per_iteration": 2.427910327911377 }, { "auxiliary_loss_clip": 0.01053606, "auxiliary_loss_mlp": 0.01040835, "balance_loss_clip": 1.01531243, "balance_loss_mlp": 1.0157361, "epoch": 0.7600781602284684, "flos": 18988526010240.0, "grad_norm": 2.5604654026865483, "language_loss": 0.68470198, "learning_rate": 5.73926074001422e-07, "loss": 0.70564634, "num_input_tokens_seen": 272657240, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37890625, "step": 12642, "time_per_iteration": 2.3517918586730957 }, { "auxiliary_loss_clip": 0.01052074, "auxiliary_loss_mlp": 0.0103636, "balance_loss_clip": 1.01248264, "balance_loss_mlp": 1.0166316, "epoch": 0.7601382834811363, "flos": 26066328428160.0, "grad_norm": 2.00950115997072, "language_loss": 0.77358735, "learning_rate": 5.736530391580765e-07, "loss": 0.79447162, "num_input_tokens_seen": 272677520, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.35351562, "step": 12643, "time_per_iteration": 2.4327619075775146 }, { "auxiliary_loss_clip": 0.01054657, "auxiliary_loss_mlp": 0.01038636, "balance_loss_clip": 1.01412654, "balance_loss_mlp": 1.017537, "epoch": 0.7601984067338043, "flos": 18843217464960.0, "grad_norm": 1.6706111747663748, "language_loss": 0.79388511, "learning_rate": 5.733800584019508e-07, "loss": 0.81481797, "num_input_tokens_seen": 272696770, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37109375, "step": 12644, "time_per_iteration": 2.3700451850891113 }, { "auxiliary_loss_clip": 0.01052778, "auxiliary_loss_mlp": 0.01039077, "balance_loss_clip": 1.01605749, "balance_loss_mlp": 1.01629782, "epoch": 0.7602585299864723, "flos": 24645188736000.0, "grad_norm": 1.7147490522588766, "language_loss": 0.80883205, "learning_rate": 5.731071317433957e-07, "loss": 0.82975054, "num_input_tokens_seen": 272718340, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36523438, "step": 12645, "time_per_iteration": 2.428832769393921 }, { "auxiliary_loss_clip": 0.01054254, "auxiliary_loss_mlp": 0.01042426, "balance_loss_clip": 1.01752329, "balance_loss_mlp": 1.01669109, "epoch": 0.7603186532391403, "flos": 23840964501120.0, "grad_norm": 1.9175231543078288, "language_loss": 0.74124062, "learning_rate": 5.728342591927611e-07, "loss": 0.76220739, "num_input_tokens_seen": 272739575, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.375, "step": 12646, "time_per_iteration": 2.3785500526428223 }, { "auxiliary_loss_clip": 0.01051008, "auxiliary_loss_mlp": 0.01038343, "balance_loss_clip": 1.01483464, "balance_loss_mlp": 1.01622188, "epoch": 0.7603787764918082, "flos": 22198824702720.0, "grad_norm": 2.0389568632739925, "language_loss": 0.69068867, "learning_rate": 5.725614407603949e-07, "loss": 0.71158218, "num_input_tokens_seen": 272758710, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.34765625, "step": 12647, "time_per_iteration": 2.393958330154419 }, { "auxiliary_loss_clip": 0.01008247, "auxiliary_loss_mlp": 0.01002945, "balance_loss_clip": 1.00053668, "balance_loss_mlp": 1.00108314, "epoch": 0.7604388997444762, "flos": 54083951677440.0, "grad_norm": 0.7221198814539381, "language_loss": 0.4903208, "learning_rate": 5.722886764566415e-07, "loss": 0.51043272, "num_input_tokens_seen": 272814855, "router_z_loss_clip": 0.02404785, "router_z_loss_mlp": 0.07177734, "step": 12648, "time_per_iteration": 2.9570586681365967 }, { "auxiliary_loss_clip": 0.01051681, "auxiliary_loss_mlp": 0.01037851, "balance_loss_clip": 1.01461744, "balance_loss_mlp": 1.01636279, "epoch": 0.7604990229971441, "flos": 19680923560320.0, "grad_norm": 1.674493310670297, "language_loss": 0.77286011, "learning_rate": 5.720159662918451e-07, "loss": 0.79375541, "num_input_tokens_seen": 272834400, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3515625, "step": 12649, "time_per_iteration": 2.399400472640991 }, { "auxiliary_loss_clip": 0.01051372, "auxiliary_loss_mlp": 0.01037464, "balance_loss_clip": 1.0148977, "balance_loss_mlp": 1.01568043, "epoch": 0.7605591462498121, "flos": 25226876764800.0, "grad_norm": 1.5391888866727546, "language_loss": 0.69696319, "learning_rate": 5.717433102763462e-07, "loss": 0.71785152, "num_input_tokens_seen": 272854760, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35742188, "step": 12650, "time_per_iteration": 3.9584317207336426 }, { "auxiliary_loss_clip": 0.01008004, "auxiliary_loss_mlp": 0.01003619, "balance_loss_clip": 1.00102043, "balance_loss_mlp": 1.00092804, "epoch": 0.76061926950248, "flos": 66780466306560.0, "grad_norm": 0.7537745542921246, "language_loss": 0.62835455, "learning_rate": 5.714707084204838e-07, "loss": 0.64847076, "num_input_tokens_seen": 272919030, "router_z_loss_clip": 0.02600098, "router_z_loss_mlp": 0.07080078, "step": 12651, "time_per_iteration": 3.0211620330810547 }, { "auxiliary_loss_clip": 0.01052206, "auxiliary_loss_mlp": 0.01038817, "balance_loss_clip": 1.01495171, "balance_loss_mlp": 1.01643407, "epoch": 0.7606793927551481, "flos": 25337167349760.0, "grad_norm": 1.6700590408743354, "language_loss": 0.72378695, "learning_rate": 5.711981607345951e-07, "loss": 0.74469721, "num_input_tokens_seen": 272938925, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35742188, "step": 12652, "time_per_iteration": 2.4361419677734375 }, { "auxiliary_loss_clip": 0.01055433, "auxiliary_loss_mlp": 0.01041115, "balance_loss_clip": 1.01552081, "balance_loss_mlp": 1.01802921, "epoch": 0.760739516007816, "flos": 18222636314880.0, "grad_norm": 2.068787206577919, "language_loss": 0.8095659, "learning_rate": 5.709256672290152e-07, "loss": 0.83053142, "num_input_tokens_seen": 272954945, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.375, "step": 12653, "time_per_iteration": 2.350820541381836 }, { "auxiliary_loss_clip": 0.01056099, "auxiliary_loss_mlp": 0.01040059, "balance_loss_clip": 1.01569247, "balance_loss_mlp": 1.01719344, "epoch": 0.760799639260484, "flos": 22558185993600.0, "grad_norm": 1.6524386643108755, "language_loss": 0.81139696, "learning_rate": 5.706532279140785e-07, "loss": 0.83235848, "num_input_tokens_seen": 272972855, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.38867188, "step": 12654, "time_per_iteration": 2.4235339164733887 }, { "auxiliary_loss_clip": 0.0105471, "auxiliary_loss_mlp": 0.01045477, "balance_loss_clip": 1.01912034, "balance_loss_mlp": 1.01678848, "epoch": 0.760859762513152, "flos": 22308242503680.0, "grad_norm": 2.0710776742886705, "language_loss": 0.80286604, "learning_rate": 5.703808428001136e-07, "loss": 0.82386792, "num_input_tokens_seen": 272989895, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.37890625, "step": 12655, "time_per_iteration": 2.3636739253997803 }, { "auxiliary_loss_clip": 0.01050151, "auxiliary_loss_mlp": 0.01030499, "balance_loss_clip": 1.01034093, "balance_loss_mlp": 1.01604056, "epoch": 0.7609198857658199, "flos": 24862732617600.0, "grad_norm": 1.850155118476888, "language_loss": 0.69280386, "learning_rate": 5.701085118974505e-07, "loss": 0.71361041, "num_input_tokens_seen": 273011695, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.34179688, "step": 12656, "time_per_iteration": 2.487635612487793 }, { "auxiliary_loss_clip": 0.01052765, "auxiliary_loss_mlp": 0.0104, "balance_loss_clip": 1.01617026, "balance_loss_mlp": 1.01562667, "epoch": 0.760980009018488, "flos": 16835851267200.0, "grad_norm": 2.1153968165761174, "language_loss": 0.74830973, "learning_rate": 5.698362352164164e-07, "loss": 0.76923746, "num_input_tokens_seen": 273028815, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 12657, "time_per_iteration": 2.357116460800171 }, { "auxiliary_loss_clip": 0.01007789, "auxiliary_loss_mlp": 0.0100329, "balance_loss_clip": 1.00065541, "balance_loss_mlp": 1.00072777, "epoch": 0.7610401322711559, "flos": 61227670475520.0, "grad_norm": 0.8659976329393893, "language_loss": 0.65004146, "learning_rate": 5.695640127673347e-07, "loss": 0.67015219, "num_input_tokens_seen": 273084080, "router_z_loss_clip": 0.02636719, "router_z_loss_mlp": 0.0703125, "step": 12658, "time_per_iteration": 2.997544288635254 }, { "auxiliary_loss_clip": 0.01051116, "auxiliary_loss_mlp": 0.01037361, "balance_loss_clip": 1.01548624, "balance_loss_mlp": 1.01597106, "epoch": 0.7611002555238239, "flos": 19639865934720.0, "grad_norm": 1.620463708611102, "language_loss": 0.80464888, "learning_rate": 5.692918445605293e-07, "loss": 0.82553363, "num_input_tokens_seen": 273102295, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.3515625, "step": 12659, "time_per_iteration": 2.363365888595581 }, { "auxiliary_loss_clip": 0.0105017, "auxiliary_loss_mlp": 0.01037143, "balance_loss_clip": 1.01508904, "balance_loss_mlp": 1.01515615, "epoch": 0.7611603787764918, "flos": 26870936688000.0, "grad_norm": 1.5576986862404827, "language_loss": 0.69783539, "learning_rate": 5.690197306063209e-07, "loss": 0.71870852, "num_input_tokens_seen": 273123400, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34960938, "step": 12660, "time_per_iteration": 2.468965768814087 }, { "auxiliary_loss_clip": 0.0105238, "auxiliary_loss_mlp": 0.010354, "balance_loss_clip": 1.01120007, "balance_loss_mlp": 1.01555443, "epoch": 0.7612205020291598, "flos": 27343032359040.0, "grad_norm": 1.8998456713682055, "language_loss": 0.71587253, "learning_rate": 5.687476709150281e-07, "loss": 0.73675036, "num_input_tokens_seen": 273145150, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36914062, "step": 12661, "time_per_iteration": 2.4072518348693848 }, { "auxiliary_loss_clip": 0.01052249, "auxiliary_loss_mlp": 0.01035876, "balance_loss_clip": 1.01233244, "balance_loss_mlp": 1.01580095, "epoch": 0.7612806252818277, "flos": 29313320826240.0, "grad_norm": 1.5795593840352138, "language_loss": 0.84286189, "learning_rate": 5.68475665496966e-07, "loss": 0.86374319, "num_input_tokens_seen": 273165180, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36328125, "step": 12662, "time_per_iteration": 2.462442398071289 }, { "auxiliary_loss_clip": 0.01051646, "auxiliary_loss_mlp": 0.01045097, "balance_loss_clip": 1.02306712, "balance_loss_mlp": 1.01527989, "epoch": 0.7613407485344957, "flos": 19025045159040.0, "grad_norm": 1.7813096401851325, "language_loss": 0.69413006, "learning_rate": 5.682037143624505e-07, "loss": 0.71509743, "num_input_tokens_seen": 273184005, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.36328125, "step": 12663, "time_per_iteration": 2.3431339263916016 }, { "auxiliary_loss_clip": 0.01051323, "auxiliary_loss_mlp": 0.01027117, "balance_loss_clip": 1.00562406, "balance_loss_mlp": 1.01632166, "epoch": 0.7614008717871636, "flos": 23254982375040.0, "grad_norm": 1.644411530972499, "language_loss": 0.70525503, "learning_rate": 5.67931817521794e-07, "loss": 0.72603947, "num_input_tokens_seen": 273203565, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34960938, "step": 12664, "time_per_iteration": 2.409648895263672 }, { "auxiliary_loss_clip": 0.01055933, "auxiliary_loss_mlp": 0.01045747, "balance_loss_clip": 1.01948488, "balance_loss_mlp": 1.0177232, "epoch": 0.7614609950398317, "flos": 21578837045760.0, "grad_norm": 1.715878078569503, "language_loss": 0.7996192, "learning_rate": 5.676599749853066e-07, "loss": 0.82063603, "num_input_tokens_seen": 273221645, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.3828125, "step": 12665, "time_per_iteration": 2.3870623111724854 }, { "auxiliary_loss_clip": 0.01052246, "auxiliary_loss_mlp": 0.01040645, "balance_loss_clip": 1.01844859, "balance_loss_mlp": 1.01681733, "epoch": 0.7615211182924996, "flos": 29276627120640.0, "grad_norm": 1.8813872264883427, "language_loss": 0.88796765, "learning_rate": 5.673881867632959e-07, "loss": 0.90889657, "num_input_tokens_seen": 273242040, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.35351562, "step": 12666, "time_per_iteration": 2.4651947021484375 }, { "auxiliary_loss_clip": 0.0105337, "auxiliary_loss_mlp": 0.01037802, "balance_loss_clip": 1.01398396, "balance_loss_mlp": 1.01658595, "epoch": 0.7615812415451676, "flos": 13260291264000.0, "grad_norm": 2.122823261277655, "language_loss": 0.84252107, "learning_rate": 5.671164528660693e-07, "loss": 0.86343282, "num_input_tokens_seen": 273257365, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3671875, "step": 12667, "time_per_iteration": 2.343019723892212 }, { "auxiliary_loss_clip": 0.01051894, "auxiliary_loss_mlp": 0.01043916, "balance_loss_clip": 1.02120638, "balance_loss_mlp": 1.01700044, "epoch": 0.7616413647978356, "flos": 18583847907840.0, "grad_norm": 1.5688748420895675, "language_loss": 0.79450893, "learning_rate": 5.668447733039296e-07, "loss": 0.81546706, "num_input_tokens_seen": 273274710, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34960938, "step": 12668, "time_per_iteration": 2.383962631225586 }, { "auxiliary_loss_clip": 0.01050482, "auxiliary_loss_mlp": 0.01035423, "balance_loss_clip": 1.01336908, "balance_loss_mlp": 1.01562929, "epoch": 0.7617014880505035, "flos": 18515173530240.0, "grad_norm": 1.7488940507255015, "language_loss": 0.64905894, "learning_rate": 5.6657314808718e-07, "loss": 0.66991794, "num_input_tokens_seen": 273292870, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34765625, "step": 12669, "time_per_iteration": 2.3505423069000244 }, { "auxiliary_loss_clip": 0.0105445, "auxiliary_loss_mlp": 0.01038596, "balance_loss_clip": 1.01361012, "balance_loss_mlp": 1.01691127, "epoch": 0.7617616113031715, "flos": 24972010773120.0, "grad_norm": 1.6617799422315742, "language_loss": 0.67075706, "learning_rate": 5.663015772261202e-07, "loss": 0.69168746, "num_input_tokens_seen": 273312375, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.375, "step": 12670, "time_per_iteration": 3.744530439376831 }, { "auxiliary_loss_clip": 0.01053099, "auxiliary_loss_mlp": 0.01043305, "balance_loss_clip": 1.0184375, "balance_loss_mlp": 1.01583827, "epoch": 0.7618217345558395, "flos": 23293910407680.0, "grad_norm": 1.658549459691781, "language_loss": 0.73993468, "learning_rate": 5.660300607310493e-07, "loss": 0.76089871, "num_input_tokens_seen": 273332590, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37304688, "step": 12671, "time_per_iteration": 2.4137401580810547 }, { "auxiliary_loss_clip": 0.01050789, "auxiliary_loss_mlp": 0.01037718, "balance_loss_clip": 1.01511621, "balance_loss_mlp": 1.01563096, "epoch": 0.7618818578085075, "flos": 25481742756480.0, "grad_norm": 1.6175520579860723, "language_loss": 0.73781574, "learning_rate": 5.657585986122613e-07, "loss": 0.75870085, "num_input_tokens_seen": 273352885, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 12672, "time_per_iteration": 2.447244167327881 }, { "auxiliary_loss_clip": 0.01008351, "auxiliary_loss_mlp": 0.01002794, "balance_loss_clip": 1.00027835, "balance_loss_mlp": 1.00116289, "epoch": 0.7619419810611754, "flos": 61149220917120.0, "grad_norm": 0.7646802531981837, "language_loss": 0.56747508, "learning_rate": 5.654871908800506e-07, "loss": 0.58758652, "num_input_tokens_seen": 273411730, "router_z_loss_clip": 0.02514648, "router_z_loss_mlp": 0.07226562, "step": 12673, "time_per_iteration": 3.0157690048217773 }, { "auxiliary_loss_clip": 0.01053711, "auxiliary_loss_mlp": 0.01040532, "balance_loss_clip": 1.01550984, "balance_loss_mlp": 1.01642966, "epoch": 0.7620021043138434, "flos": 23257530904320.0, "grad_norm": 2.1786857019842096, "language_loss": 0.76026046, "learning_rate": 5.652158375447102e-07, "loss": 0.78120291, "num_input_tokens_seen": 273430020, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37304688, "step": 12674, "time_per_iteration": 2.4133248329162598 }, { "auxiliary_loss_clip": 0.01049761, "auxiliary_loss_mlp": 0.01033305, "balance_loss_clip": 1.01278901, "balance_loss_mlp": 1.0150423, "epoch": 0.7620622275665113, "flos": 25081323840000.0, "grad_norm": 2.3052539006888044, "language_loss": 0.74017286, "learning_rate": 5.649445386165286e-07, "loss": 0.76100349, "num_input_tokens_seen": 273448690, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.34765625, "step": 12675, "time_per_iteration": 2.4667890071868896 }, { "auxiliary_loss_clip": 0.01050983, "auxiliary_loss_mlp": 0.01036662, "balance_loss_clip": 1.01503754, "balance_loss_mlp": 1.01637733, "epoch": 0.7621223508191793, "flos": 20154031660800.0, "grad_norm": 2.160556439869312, "language_loss": 0.73988503, "learning_rate": 5.646732941057936e-07, "loss": 0.7607615, "num_input_tokens_seen": 273465190, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34570312, "step": 12676, "time_per_iteration": 3.776190996170044 }, { "auxiliary_loss_clip": 0.01054767, "auxiliary_loss_mlp": 0.01039319, "balance_loss_clip": 1.01528633, "balance_loss_mlp": 1.01585901, "epoch": 0.7621824740718472, "flos": 17999332058880.0, "grad_norm": 2.556186114783881, "language_loss": 0.55824554, "learning_rate": 5.644021040227927e-07, "loss": 0.57918644, "num_input_tokens_seen": 273478620, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.38867188, "step": 12677, "time_per_iteration": 3.7703723907470703 }, { "auxiliary_loss_clip": 0.01053659, "auxiliary_loss_mlp": 0.01040289, "balance_loss_clip": 1.01527929, "balance_loss_mlp": 1.01642418, "epoch": 0.7622425973245153, "flos": 21724599438720.0, "grad_norm": 2.1460659306714165, "language_loss": 0.81995821, "learning_rate": 5.641309683778064e-07, "loss": 0.84089768, "num_input_tokens_seen": 273497635, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37109375, "step": 12678, "time_per_iteration": 2.383782148361206 }, { "auxiliary_loss_clip": 0.01054095, "auxiliary_loss_mlp": 0.01038886, "balance_loss_clip": 1.0140667, "balance_loss_mlp": 1.01650095, "epoch": 0.7623027205771832, "flos": 19717547443200.0, "grad_norm": 1.7439544716004574, "language_loss": 0.77446562, "learning_rate": 5.638598871811175e-07, "loss": 0.79539549, "num_input_tokens_seen": 273513955, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.375, "step": 12679, "time_per_iteration": 2.414151191711426 }, { "auxiliary_loss_clip": 0.01052277, "auxiliary_loss_mlp": 0.01034882, "balance_loss_clip": 1.01313865, "balance_loss_mlp": 1.0163213, "epoch": 0.7623628438298512, "flos": 23987669500800.0, "grad_norm": 1.3672936133807874, "language_loss": 0.80233836, "learning_rate": 5.635888604430059e-07, "loss": 0.82320994, "num_input_tokens_seen": 273533970, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.359375, "step": 12680, "time_per_iteration": 2.394313335418701 }, { "auxiliary_loss_clip": 0.01053949, "auxiliary_loss_mlp": 0.01041309, "balance_loss_clip": 1.01600051, "balance_loss_mlp": 1.01687336, "epoch": 0.7624229670825191, "flos": 22344622007040.0, "grad_norm": 2.3173460082403796, "language_loss": 0.64383471, "learning_rate": 5.633178881737493e-07, "loss": 0.66478723, "num_input_tokens_seen": 273553090, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37109375, "step": 12681, "time_per_iteration": 2.4145023822784424 }, { "auxiliary_loss_clip": 0.01051902, "auxiliary_loss_mlp": 0.01034705, "balance_loss_clip": 1.01259184, "balance_loss_mlp": 1.01705849, "epoch": 0.7624830903351871, "flos": 22710651367680.0, "grad_norm": 2.175298268097647, "language_loss": 0.77113378, "learning_rate": 5.63046970383622e-07, "loss": 0.79199988, "num_input_tokens_seen": 273572460, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34765625, "step": 12682, "time_per_iteration": 2.3797357082366943 }, { "auxiliary_loss_clip": 0.01050432, "auxiliary_loss_mlp": 0.01031191, "balance_loss_clip": 1.01017404, "balance_loss_mlp": 1.01598167, "epoch": 0.7625432135878552, "flos": 25592522100480.0, "grad_norm": 1.4641744586198198, "language_loss": 0.69250208, "learning_rate": 5.627761070828974e-07, "loss": 0.71331829, "num_input_tokens_seen": 273592815, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34375, "step": 12683, "time_per_iteration": 2.452089786529541 }, { "auxiliary_loss_clip": 0.0105237, "auxiliary_loss_mlp": 0.01038245, "balance_loss_clip": 1.01524949, "balance_loss_mlp": 1.01641273, "epoch": 0.7626033368405231, "flos": 23986517425920.0, "grad_norm": 2.615747008894074, "language_loss": 0.8436445, "learning_rate": 5.625052982818472e-07, "loss": 0.86455059, "num_input_tokens_seen": 273611790, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.359375, "step": 12684, "time_per_iteration": 2.3668100833892822 }, { "auxiliary_loss_clip": 0.01053623, "auxiliary_loss_mlp": 0.01034675, "balance_loss_clip": 1.01182246, "balance_loss_mlp": 1.01718593, "epoch": 0.7626634600931911, "flos": 12598443020160.0, "grad_norm": 1.8425421971329572, "language_loss": 0.83634102, "learning_rate": 5.622345439907396e-07, "loss": 0.85722399, "num_input_tokens_seen": 273628340, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36523438, "step": 12685, "time_per_iteration": 2.399744987487793 }, { "auxiliary_loss_clip": 0.01052708, "auxiliary_loss_mlp": 0.01035742, "balance_loss_clip": 1.01225805, "balance_loss_mlp": 1.01573634, "epoch": 0.762723583345859, "flos": 26321403888000.0, "grad_norm": 1.9236919545568707, "language_loss": 0.78966963, "learning_rate": 5.619638442198422e-07, "loss": 0.81055415, "num_input_tokens_seen": 273646585, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36914062, "step": 12686, "time_per_iteration": 2.396284580230713 }, { "auxiliary_loss_clip": 0.01054465, "auxiliary_loss_mlp": 0.0104115, "balance_loss_clip": 1.01317191, "balance_loss_mlp": 1.01659989, "epoch": 0.762783706598527, "flos": 21906008196480.0, "grad_norm": 2.009977745773024, "language_loss": 0.73793316, "learning_rate": 5.616931989794198e-07, "loss": 0.75888932, "num_input_tokens_seen": 273665410, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.37890625, "step": 12687, "time_per_iteration": 2.4410672187805176 }, { "auxiliary_loss_clip": 0.01053326, "auxiliary_loss_mlp": 0.01046875, "balance_loss_clip": 1.02089953, "balance_loss_mlp": 1.01578426, "epoch": 0.7628438298511949, "flos": 15338915280000.0, "grad_norm": 1.7891592602529882, "language_loss": 0.66085035, "learning_rate": 5.614226082797369e-07, "loss": 0.6818524, "num_input_tokens_seen": 273683035, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.375, "step": 12688, "time_per_iteration": 2.3476152420043945 }, { "auxiliary_loss_clip": 0.0105138, "auxiliary_loss_mlp": 0.01041581, "balance_loss_clip": 1.01863372, "balance_loss_mlp": 1.01604903, "epoch": 0.7629039531038629, "flos": 13005460183680.0, "grad_norm": 2.066488529333495, "language_loss": 0.71912789, "learning_rate": 5.611520721310515e-07, "loss": 0.74005747, "num_input_tokens_seen": 273700130, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35351562, "step": 12689, "time_per_iteration": 3.817654609680176 }, { "auxiliary_loss_clip": 0.01056861, "auxiliary_loss_mlp": 0.01041311, "balance_loss_clip": 1.01574063, "balance_loss_mlp": 1.01825047, "epoch": 0.7629640763565309, "flos": 26170614259200.0, "grad_norm": 1.8497906731091749, "language_loss": 0.70415008, "learning_rate": 5.608815905436238e-07, "loss": 0.72513175, "num_input_tokens_seen": 273720310, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38671875, "step": 12690, "time_per_iteration": 2.4117329120635986 }, { "auxiliary_loss_clip": 0.01052813, "auxiliary_loss_mlp": 0.01039466, "balance_loss_clip": 1.01567245, "balance_loss_mlp": 1.01615202, "epoch": 0.7630241996091989, "flos": 36792240387840.0, "grad_norm": 1.4887908522426623, "language_loss": 0.70714259, "learning_rate": 5.606111635277109e-07, "loss": 0.72806537, "num_input_tokens_seen": 273744475, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3671875, "step": 12691, "time_per_iteration": 2.541783332824707 }, { "auxiliary_loss_clip": 0.01052112, "auxiliary_loss_mlp": 0.01040665, "balance_loss_clip": 1.01864743, "balance_loss_mlp": 1.01636231, "epoch": 0.7630843228618668, "flos": 21834087062400.0, "grad_norm": 1.5785602620097348, "language_loss": 0.83230388, "learning_rate": 5.603407910935662e-07, "loss": 0.85323167, "num_input_tokens_seen": 273764635, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35742188, "step": 12692, "time_per_iteration": 2.3721070289611816 }, { "auxiliary_loss_clip": 0.01056218, "auxiliary_loss_mlp": 0.01037071, "balance_loss_clip": 1.01442075, "balance_loss_mlp": 1.01842856, "epoch": 0.7631444461145348, "flos": 12639710113920.0, "grad_norm": 2.2031498521762383, "language_loss": 0.78318578, "learning_rate": 5.600704732514438e-07, "loss": 0.80411869, "num_input_tokens_seen": 273780115, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.37890625, "step": 12693, "time_per_iteration": 2.376164197921753 }, { "auxiliary_loss_clip": 0.01052133, "auxiliary_loss_mlp": 0.01035389, "balance_loss_clip": 1.01079655, "balance_loss_mlp": 1.01608157, "epoch": 0.7632045693672027, "flos": 16835676710400.0, "grad_norm": 2.5020725687393632, "language_loss": 0.7513935, "learning_rate": 5.598002100115933e-07, "loss": 0.77226877, "num_input_tokens_seen": 273796605, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.359375, "step": 12694, "time_per_iteration": 2.3585779666900635 }, { "auxiliary_loss_clip": 0.01050588, "auxiliary_loss_mlp": 0.01038, "balance_loss_clip": 1.01561213, "balance_loss_mlp": 1.01577139, "epoch": 0.7632646926198707, "flos": 22016263870080.0, "grad_norm": 2.497537138623823, "language_loss": 0.71363729, "learning_rate": 5.595300013842625e-07, "loss": 0.73452318, "num_input_tokens_seen": 273816515, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34765625, "step": 12695, "time_per_iteration": 2.403942346572876 }, { "auxiliary_loss_clip": 0.01053045, "auxiliary_loss_mlp": 0.01035544, "balance_loss_clip": 1.01167846, "balance_loss_mlp": 1.01670122, "epoch": 0.7633248158725388, "flos": 23112850763520.0, "grad_norm": 1.5496705724898812, "language_loss": 0.73109579, "learning_rate": 5.592598473796985e-07, "loss": 0.75198174, "num_input_tokens_seen": 273837060, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36328125, "step": 12696, "time_per_iteration": 2.4131603240966797 }, { "auxiliary_loss_clip": 0.01053403, "auxiliary_loss_mlp": 0.01039751, "balance_loss_clip": 1.01505041, "balance_loss_mlp": 1.01664853, "epoch": 0.7633849391252067, "flos": 10889060209920.0, "grad_norm": 2.270334776894465, "language_loss": 0.73081052, "learning_rate": 5.589897480081453e-07, "loss": 0.75174206, "num_input_tokens_seen": 273853365, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.3671875, "step": 12697, "time_per_iteration": 2.4284164905548096 }, { "auxiliary_loss_clip": 0.01052769, "auxiliary_loss_mlp": 0.01040537, "balance_loss_clip": 1.01619446, "balance_loss_mlp": 1.01663971, "epoch": 0.7634450623778747, "flos": 20993169121920.0, "grad_norm": 1.9814556628050048, "language_loss": 0.67730367, "learning_rate": 5.587197032798461e-07, "loss": 0.69823676, "num_input_tokens_seen": 273870750, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36132812, "step": 12698, "time_per_iteration": 2.357182502746582 }, { "auxiliary_loss_clip": 0.01053092, "auxiliary_loss_mlp": 0.01041473, "balance_loss_clip": 1.01724982, "balance_loss_mlp": 1.01628065, "epoch": 0.7635051856305426, "flos": 18880993422720.0, "grad_norm": 1.6141414593281105, "language_loss": 0.73200005, "learning_rate": 5.5844971320504e-07, "loss": 0.75294572, "num_input_tokens_seen": 273890890, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3671875, "step": 12699, "time_per_iteration": 2.4349634647369385 }, { "auxiliary_loss_clip": 0.01051737, "auxiliary_loss_mlp": 0.01037664, "balance_loss_clip": 1.01530051, "balance_loss_mlp": 1.01650023, "epoch": 0.7635653088832106, "flos": 34785572417280.0, "grad_norm": 1.6658134833266927, "language_loss": 0.74098921, "learning_rate": 5.581797777939648e-07, "loss": 0.76188314, "num_input_tokens_seen": 273914015, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.3515625, "step": 12700, "time_per_iteration": 2.4759881496429443 }, { "auxiliary_loss_clip": 0.01052458, "auxiliary_loss_mlp": 0.01032694, "balance_loss_clip": 1.01067638, "balance_loss_mlp": 1.01572704, "epoch": 0.7636254321358785, "flos": 23177510334720.0, "grad_norm": 2.0645689459595227, "language_loss": 0.69883156, "learning_rate": 5.579098970568574e-07, "loss": 0.71968311, "num_input_tokens_seen": 273927415, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3671875, "step": 12701, "time_per_iteration": 2.387096881866455 }, { "auxiliary_loss_clip": 0.01053389, "auxiliary_loss_mlp": 0.01040811, "balance_loss_clip": 1.01695752, "balance_loss_mlp": 1.01742661, "epoch": 0.7636855553885465, "flos": 21324145610880.0, "grad_norm": 1.6513640265973635, "language_loss": 0.65743196, "learning_rate": 5.576400710039508e-07, "loss": 0.67837399, "num_input_tokens_seen": 273946690, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.359375, "step": 12702, "time_per_iteration": 2.36795711517334 }, { "auxiliary_loss_clip": 0.01051651, "auxiliary_loss_mlp": 0.01033751, "balance_loss_clip": 1.01145864, "balance_loss_mlp": 1.01587427, "epoch": 0.7637456786412145, "flos": 28656814020480.0, "grad_norm": 2.296785135016134, "language_loss": 0.67187858, "learning_rate": 5.57370299645477e-07, "loss": 0.69273263, "num_input_tokens_seen": 273966870, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35742188, "step": 12703, "time_per_iteration": 2.449291229248047 }, { "auxiliary_loss_clip": 0.01052012, "auxiliary_loss_mlp": 0.01034192, "balance_loss_clip": 1.01213789, "balance_loss_mlp": 1.01635885, "epoch": 0.7638058018938825, "flos": 21906217664640.0, "grad_norm": 1.8020107562838503, "language_loss": 0.85654652, "learning_rate": 5.571005829916668e-07, "loss": 0.87740862, "num_input_tokens_seen": 273986360, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35546875, "step": 12704, "time_per_iteration": 2.365377426147461 }, { "auxiliary_loss_clip": 0.01055244, "auxiliary_loss_mlp": 0.01038715, "balance_loss_clip": 1.01472998, "balance_loss_mlp": 1.01771307, "epoch": 0.7638659251465504, "flos": 29642586658560.0, "grad_norm": 1.5806921423314881, "language_loss": 0.68311286, "learning_rate": 5.568309210527469e-07, "loss": 0.70405239, "num_input_tokens_seen": 274009745, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.375, "step": 12705, "time_per_iteration": 2.4613256454467773 }, { "auxiliary_loss_clip": 0.01051241, "auxiliary_loss_mlp": 0.01038204, "balance_loss_clip": 1.01446915, "balance_loss_mlp": 1.01567221, "epoch": 0.7639260483992184, "flos": 26139960218880.0, "grad_norm": 1.7547885622047517, "language_loss": 0.74671996, "learning_rate": 5.565613138389427e-07, "loss": 0.76761436, "num_input_tokens_seen": 274028775, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35546875, "step": 12706, "time_per_iteration": 2.4175801277160645 }, { "auxiliary_loss_clip": 0.01053402, "auxiliary_loss_mlp": 0.01036456, "balance_loss_clip": 1.01323438, "balance_loss_mlp": 1.01640916, "epoch": 0.7639861716518863, "flos": 20155672494720.0, "grad_norm": 2.179134861409055, "language_loss": 0.79184139, "learning_rate": 5.562917613604781e-07, "loss": 0.81274003, "num_input_tokens_seen": 274047520, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36914062, "step": 12707, "time_per_iteration": 2.4321393966674805 }, { "auxiliary_loss_clip": 0.01051687, "auxiliary_loss_mlp": 0.01036542, "balance_loss_clip": 1.0140835, "balance_loss_mlp": 1.01560378, "epoch": 0.7640462949045543, "flos": 18582276896640.0, "grad_norm": 1.707753405944436, "language_loss": 0.81012082, "learning_rate": 5.560222636275751e-07, "loss": 0.83100307, "num_input_tokens_seen": 274065350, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.36132812, "step": 12708, "time_per_iteration": 2.350581407546997 }, { "auxiliary_loss_clip": 0.01008398, "auxiliary_loss_mlp": 0.01004435, "balance_loss_clip": 1.00213444, "balance_loss_mlp": 1.00116038, "epoch": 0.7641064181572224, "flos": 68318494830720.0, "grad_norm": 0.8172562742613345, "language_loss": 0.56749892, "learning_rate": 5.557528206504521e-07, "loss": 0.58762717, "num_input_tokens_seen": 274122315, "router_z_loss_clip": 0.02294922, "router_z_loss_mlp": 0.07226562, "step": 12709, "time_per_iteration": 3.073061466217041 }, { "auxiliary_loss_clip": 0.01054787, "auxiliary_loss_mlp": 0.01047193, "balance_loss_clip": 1.02034688, "balance_loss_mlp": 1.01683807, "epoch": 0.7641665414098903, "flos": 17967979791360.0, "grad_norm": 1.911256015092136, "language_loss": 0.65651661, "learning_rate": 5.554834324393271e-07, "loss": 0.67753643, "num_input_tokens_seen": 274140555, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.37890625, "step": 12710, "time_per_iteration": 3.671189308166504 }, { "auxiliary_loss_clip": 0.01053545, "auxiliary_loss_mlp": 0.0103986, "balance_loss_clip": 1.01490903, "balance_loss_mlp": 1.01545584, "epoch": 0.7642266646625583, "flos": 21251002579200.0, "grad_norm": 2.5982299650325276, "language_loss": 0.66108644, "learning_rate": 5.552140990044154e-07, "loss": 0.68202049, "num_input_tokens_seen": 274161125, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38085938, "step": 12711, "time_per_iteration": 2.4329936504364014 }, { "auxiliary_loss_clip": 0.01051692, "auxiliary_loss_mlp": 0.01035638, "balance_loss_clip": 1.01414454, "balance_loss_mlp": 1.01592588, "epoch": 0.7642867879152262, "flos": 22746681757440.0, "grad_norm": 1.4733777421418268, "language_loss": 0.73927653, "learning_rate": 5.549448203559293e-07, "loss": 0.7601499, "num_input_tokens_seen": 274180835, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.35742188, "step": 12712, "time_per_iteration": 2.3774375915527344 }, { "auxiliary_loss_clip": 0.01051917, "auxiliary_loss_mlp": 0.01033465, "balance_loss_clip": 1.01130366, "balance_loss_mlp": 1.01630974, "epoch": 0.7643469111678942, "flos": 23330988138240.0, "grad_norm": 1.4898537185426444, "language_loss": 0.81991601, "learning_rate": 5.546755965040804e-07, "loss": 0.84076989, "num_input_tokens_seen": 274201190, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.35546875, "step": 12713, "time_per_iteration": 2.419034957885742 }, { "auxiliary_loss_clip": 0.01054532, "auxiliary_loss_mlp": 0.0104365, "balance_loss_clip": 1.01927161, "balance_loss_mlp": 1.0169425, "epoch": 0.7644070344205621, "flos": 19856292652800.0, "grad_norm": 2.122576191121344, "language_loss": 0.84195119, "learning_rate": 5.544064274590776e-07, "loss": 0.86293298, "num_input_tokens_seen": 274217595, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37695312, "step": 12714, "time_per_iteration": 2.3625900745391846 }, { "auxiliary_loss_clip": 0.01055568, "auxiliary_loss_mlp": 0.01044934, "balance_loss_clip": 1.01873171, "balance_loss_mlp": 1.01780045, "epoch": 0.7644671576732301, "flos": 22089546547200.0, "grad_norm": 1.492321626693684, "language_loss": 0.73872554, "learning_rate": 5.541373132311287e-07, "loss": 0.75973058, "num_input_tokens_seen": 274237885, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37695312, "step": 12715, "time_per_iteration": 3.8207969665527344 }, { "auxiliary_loss_clip": 0.01051021, "auxiliary_loss_mlp": 0.01035702, "balance_loss_clip": 1.01361263, "balance_loss_mlp": 1.0153538, "epoch": 0.7645272809258981, "flos": 25480311390720.0, "grad_norm": 1.8674236514965161, "language_loss": 0.64298147, "learning_rate": 5.538682538304376e-07, "loss": 0.6638487, "num_input_tokens_seen": 274258820, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35742188, "step": 12716, "time_per_iteration": 3.8544297218322754 }, { "auxiliary_loss_clip": 0.01055434, "auxiliary_loss_mlp": 0.01045992, "balance_loss_clip": 1.02093482, "balance_loss_mlp": 1.01746929, "epoch": 0.7645874041785661, "flos": 21540851619840.0, "grad_norm": 1.543555107035806, "language_loss": 0.80581546, "learning_rate": 5.535992492672068e-07, "loss": 0.82682967, "num_input_tokens_seen": 274278835, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37890625, "step": 12717, "time_per_iteration": 2.4164652824401855 }, { "auxiliary_loss_clip": 0.01052391, "auxiliary_loss_mlp": 0.01037132, "balance_loss_clip": 1.0149827, "balance_loss_mlp": 1.01648092, "epoch": 0.764647527431234, "flos": 20629862847360.0, "grad_norm": 2.2652140878879115, "language_loss": 0.67677051, "learning_rate": 5.53330299551638e-07, "loss": 0.69766575, "num_input_tokens_seen": 274297110, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.359375, "step": 12718, "time_per_iteration": 2.4390506744384766 }, { "auxiliary_loss_clip": 0.01049466, "auxiliary_loss_mlp": 0.01036387, "balance_loss_clip": 1.01352239, "balance_loss_mlp": 1.01526213, "epoch": 0.764707650683902, "flos": 21433004830080.0, "grad_norm": 1.9507226433490688, "language_loss": 0.79005158, "learning_rate": 5.530614046939286e-07, "loss": 0.81091017, "num_input_tokens_seen": 274315610, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34179688, "step": 12719, "time_per_iteration": 2.39414381980896 }, { "auxiliary_loss_clip": 0.01052328, "auxiliary_loss_mlp": 0.01032777, "balance_loss_clip": 1.01040184, "balance_loss_mlp": 1.01563716, "epoch": 0.7647677739365699, "flos": 22710092785920.0, "grad_norm": 3.7017122461565286, "language_loss": 0.71426004, "learning_rate": 5.527925647042754e-07, "loss": 0.73511112, "num_input_tokens_seen": 274333975, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.3671875, "step": 12720, "time_per_iteration": 2.412700653076172 }, { "auxiliary_loss_clip": 0.01053899, "auxiliary_loss_mlp": 0.01042854, "balance_loss_clip": 1.01838088, "balance_loss_mlp": 1.01674628, "epoch": 0.7648278971892379, "flos": 21323063358720.0, "grad_norm": 1.621789069798318, "language_loss": 0.74662113, "learning_rate": 5.52523779592875e-07, "loss": 0.76758868, "num_input_tokens_seen": 274353695, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.37109375, "step": 12721, "time_per_iteration": 2.3606297969818115 }, { "auxiliary_loss_clip": 0.01051855, "auxiliary_loss_mlp": 0.01033904, "balance_loss_clip": 1.01094389, "balance_loss_mlp": 1.01632452, "epoch": 0.764888020441906, "flos": 20666312173440.0, "grad_norm": 1.7707633347751002, "language_loss": 0.74651849, "learning_rate": 5.522550493699163e-07, "loss": 0.76737612, "num_input_tokens_seen": 274371120, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35546875, "step": 12722, "time_per_iteration": 2.3680193424224854 }, { "auxiliary_loss_clip": 0.01051961, "auxiliary_loss_mlp": 0.01036676, "balance_loss_clip": 1.01433647, "balance_loss_mlp": 1.0154599, "epoch": 0.7649481436945739, "flos": 25081358751360.0, "grad_norm": 2.0910766918435897, "language_loss": 0.75219625, "learning_rate": 5.519863740455912e-07, "loss": 0.77308261, "num_input_tokens_seen": 274389665, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.36523438, "step": 12723, "time_per_iteration": 2.402522087097168 }, { "auxiliary_loss_clip": 0.01052632, "auxiliary_loss_mlp": 0.01035864, "balance_loss_clip": 1.01106882, "balance_loss_mlp": 1.01495934, "epoch": 0.7650082669472419, "flos": 24899705614080.0, "grad_norm": 1.934159051449939, "language_loss": 0.73682135, "learning_rate": 5.517177536300881e-07, "loss": 0.75770628, "num_input_tokens_seen": 274408750, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37695312, "step": 12724, "time_per_iteration": 2.4418442249298096 }, { "auxiliary_loss_clip": 0.01050584, "auxiliary_loss_mlp": 0.01030274, "balance_loss_clip": 1.00979424, "balance_loss_mlp": 1.01622891, "epoch": 0.7650683901999098, "flos": 14646517729920.0, "grad_norm": 1.9568577826270541, "language_loss": 0.84919059, "learning_rate": 5.514491881335935e-07, "loss": 0.86999917, "num_input_tokens_seen": 274424600, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.34375, "step": 12725, "time_per_iteration": 2.3476407527923584 }, { "auxiliary_loss_clip": 0.01050666, "auxiliary_loss_mlp": 0.01038888, "balance_loss_clip": 1.01516545, "balance_loss_mlp": 1.01515424, "epoch": 0.7651285134525778, "flos": 26351429523840.0, "grad_norm": 1.7784091742384471, "language_loss": 0.78904974, "learning_rate": 5.511806775662901e-07, "loss": 0.80994529, "num_input_tokens_seen": 274443075, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.35546875, "step": 12726, "time_per_iteration": 2.4907591342926025 }, { "auxiliary_loss_clip": 0.01052049, "auxiliary_loss_mlp": 0.01040651, "balance_loss_clip": 1.01745319, "balance_loss_mlp": 1.015697, "epoch": 0.7651886367052457, "flos": 26645782129920.0, "grad_norm": 1.6751538669022377, "language_loss": 0.7135675, "learning_rate": 5.509122219383615e-07, "loss": 0.73449451, "num_input_tokens_seen": 274463240, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36328125, "step": 12727, "time_per_iteration": 2.4009079933166504 }, { "auxiliary_loss_clip": 0.01050588, "auxiliary_loss_mlp": 0.01032037, "balance_loss_clip": 1.01109195, "balance_loss_mlp": 1.01615882, "epoch": 0.7652487599579137, "flos": 25701660610560.0, "grad_norm": 1.632910883198225, "language_loss": 0.80403376, "learning_rate": 5.506438212599864e-07, "loss": 0.82485998, "num_input_tokens_seen": 274482750, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.34375, "step": 12728, "time_per_iteration": 2.4325737953186035 }, { "auxiliary_loss_clip": 0.01053783, "auxiliary_loss_mlp": 0.01034391, "balance_loss_clip": 1.01014352, "balance_loss_mlp": 1.01675272, "epoch": 0.7653088832105817, "flos": 28584299393280.0, "grad_norm": 1.9625544226296037, "language_loss": 0.57336682, "learning_rate": 5.503754755413424e-07, "loss": 0.59424853, "num_input_tokens_seen": 274503545, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37109375, "step": 12729, "time_per_iteration": 3.8606715202331543 }, { "auxiliary_loss_clip": 0.01051346, "auxiliary_loss_mlp": 0.0103659, "balance_loss_clip": 1.01439285, "balance_loss_mlp": 1.01554883, "epoch": 0.7653690064632497, "flos": 23365656984960.0, "grad_norm": 1.743031029575185, "language_loss": 0.79082859, "learning_rate": 5.501071847926055e-07, "loss": 0.81170797, "num_input_tokens_seen": 274523825, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35742188, "step": 12730, "time_per_iteration": 2.4229512214660645 }, { "auxiliary_loss_clip": 0.01056335, "auxiliary_loss_mlp": 0.01041904, "balance_loss_clip": 1.01685798, "balance_loss_mlp": 1.01863694, "epoch": 0.7654291297159176, "flos": 15773130259200.0, "grad_norm": 1.6531436130550428, "language_loss": 0.69279879, "learning_rate": 5.498389490239495e-07, "loss": 0.71378118, "num_input_tokens_seen": 274541625, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.375, "step": 12731, "time_per_iteration": 2.375032663345337 }, { "auxiliary_loss_clip": 0.01053286, "auxiliary_loss_mlp": 0.01037281, "balance_loss_clip": 1.01475048, "balance_loss_mlp": 1.01679683, "epoch": 0.7654892529685856, "flos": 18033023387520.0, "grad_norm": 1.9164556269048885, "language_loss": 0.71480262, "learning_rate": 5.495707682455471e-07, "loss": 0.73570824, "num_input_tokens_seen": 274557580, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.36523438, "step": 12732, "time_per_iteration": 2.3706295490264893 }, { "auxiliary_loss_clip": 0.01053568, "auxiliary_loss_mlp": 0.01042387, "balance_loss_clip": 1.01740098, "balance_loss_mlp": 1.01611948, "epoch": 0.7655493762212535, "flos": 27234766632960.0, "grad_norm": 1.9880893527950658, "language_loss": 0.79184449, "learning_rate": 5.493026424675653e-07, "loss": 0.8128041, "num_input_tokens_seen": 274578135, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.375, "step": 12733, "time_per_iteration": 2.422466278076172 }, { "auxiliary_loss_clip": 0.01051967, "auxiliary_loss_mlp": 0.01040562, "balance_loss_clip": 1.0172565, "balance_loss_mlp": 1.01661301, "epoch": 0.7656094994739215, "flos": 20773006888320.0, "grad_norm": 1.8561667467510132, "language_loss": 0.78583199, "learning_rate": 5.490345717001726e-07, "loss": 0.80675733, "num_input_tokens_seen": 274595655, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35351562, "step": 12734, "time_per_iteration": 2.3961362838745117 }, { "auxiliary_loss_clip": 0.01054104, "auxiliary_loss_mlp": 0.01037764, "balance_loss_clip": 1.01280212, "balance_loss_mlp": 1.01610255, "epoch": 0.7656696227265896, "flos": 23038136720640.0, "grad_norm": 2.040923309914301, "language_loss": 0.74535573, "learning_rate": 5.48766555953535e-07, "loss": 0.76627433, "num_input_tokens_seen": 274616305, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37890625, "step": 12735, "time_per_iteration": 2.4182705879211426 }, { "auxiliary_loss_clip": 0.01051407, "auxiliary_loss_mlp": 0.01037711, "balance_loss_clip": 1.01411986, "balance_loss_mlp": 1.01610637, "epoch": 0.7657297459792575, "flos": 27524441116800.0, "grad_norm": 10.337123658166268, "language_loss": 0.73430741, "learning_rate": 5.484985952378145e-07, "loss": 0.75519854, "num_input_tokens_seen": 274638110, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.35351562, "step": 12736, "time_per_iteration": 2.448190927505493 }, { "auxiliary_loss_clip": 0.0105571, "auxiliary_loss_mlp": 0.01048461, "balance_loss_clip": 1.01946974, "balance_loss_mlp": 1.01716483, "epoch": 0.7657898692319255, "flos": 17127515698560.0, "grad_norm": 2.2458308097029835, "language_loss": 0.79294765, "learning_rate": 5.482306895631728e-07, "loss": 0.81398934, "num_input_tokens_seen": 274656565, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 0.38671875, "step": 12737, "time_per_iteration": 2.353928565979004 }, { "auxiliary_loss_clip": 0.01054456, "auxiliary_loss_mlp": 0.01039997, "balance_loss_clip": 1.0140934, "balance_loss_mlp": 1.01684535, "epoch": 0.7658499924845934, "flos": 21464810945280.0, "grad_norm": 1.7936976658135209, "language_loss": 0.77156317, "learning_rate": 5.479628389397699e-07, "loss": 0.79250777, "num_input_tokens_seen": 274674215, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.375, "step": 12738, "time_per_iteration": 2.3898069858551025 }, { "auxiliary_loss_clip": 0.01054284, "auxiliary_loss_mlp": 0.01034851, "balance_loss_clip": 1.00972116, "balance_loss_mlp": 1.01719928, "epoch": 0.7659101157372614, "flos": 29495392899840.0, "grad_norm": 1.9773731579296154, "language_loss": 0.63672233, "learning_rate": 5.476950433777603e-07, "loss": 0.65761364, "num_input_tokens_seen": 274693445, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37109375, "step": 12739, "time_per_iteration": 2.418248176574707 }, { "auxiliary_loss_clip": 0.01054076, "auxiliary_loss_mlp": 0.0104074, "balance_loss_clip": 1.01611125, "balance_loss_mlp": 1.01684892, "epoch": 0.7659702389899293, "flos": 18550819895040.0, "grad_norm": 1.9853247425954526, "language_loss": 0.81421787, "learning_rate": 5.474273028873004e-07, "loss": 0.83516604, "num_input_tokens_seen": 274712815, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37304688, "step": 12740, "time_per_iteration": 2.391996383666992 }, { "auxiliary_loss_clip": 0.0105375, "auxiliary_loss_mlp": 0.01037936, "balance_loss_clip": 1.01196003, "balance_loss_mlp": 1.01706684, "epoch": 0.7660303622425974, "flos": 23548078172160.0, "grad_norm": 1.5648815822931679, "language_loss": 0.66120541, "learning_rate": 5.471596174785429e-07, "loss": 0.68212223, "num_input_tokens_seen": 274732690, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3671875, "step": 12741, "time_per_iteration": 2.417964220046997 }, { "auxiliary_loss_clip": 0.01051397, "auxiliary_loss_mlp": 0.01034834, "balance_loss_clip": 1.01083708, "balance_loss_mlp": 1.01589942, "epoch": 0.7660904854952653, "flos": 18915732092160.0, "grad_norm": 1.5957519796768864, "language_loss": 0.7690801, "learning_rate": 5.468919871616386e-07, "loss": 0.78994244, "num_input_tokens_seen": 274752460, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.35546875, "step": 12742, "time_per_iteration": 2.5376827716827393 }, { "auxiliary_loss_clip": 0.01050977, "auxiliary_loss_mlp": 0.01040976, "balance_loss_clip": 1.01755178, "balance_loss_mlp": 1.01632416, "epoch": 0.7661506087479333, "flos": 23146437358080.0, "grad_norm": 1.3737761269619064, "language_loss": 0.77106643, "learning_rate": 5.46624411946736e-07, "loss": 0.79198599, "num_input_tokens_seen": 274773070, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.34765625, "step": 12743, "time_per_iteration": 2.398345470428467 }, { "auxiliary_loss_clip": 0.010519, "auxiliary_loss_mlp": 0.01040092, "balance_loss_clip": 1.01754963, "balance_loss_mlp": 1.01584959, "epoch": 0.7662107320006012, "flos": 17564837788800.0, "grad_norm": 1.7956684689844167, "language_loss": 0.7585845, "learning_rate": 5.463568918439805e-07, "loss": 0.77950442, "num_input_tokens_seen": 274790220, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.359375, "step": 12744, "time_per_iteration": 2.361865997314453 }, { "auxiliary_loss_clip": 0.01053309, "auxiliary_loss_mlp": 0.01035202, "balance_loss_clip": 1.00914311, "balance_loss_mlp": 1.01622605, "epoch": 0.7662708552532692, "flos": 22302167927040.0, "grad_norm": 2.460852042496772, "language_loss": 0.7218529, "learning_rate": 5.460894268635181e-07, "loss": 0.74273801, "num_input_tokens_seen": 274805095, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37109375, "step": 12745, "time_per_iteration": 2.512939453125 }, { "auxiliary_loss_clip": 0.01053182, "auxiliary_loss_mlp": 0.01044019, "balance_loss_clip": 1.01866293, "balance_loss_mlp": 1.01669538, "epoch": 0.7663309785059371, "flos": 15741149587200.0, "grad_norm": 2.50198242975992, "language_loss": 0.79356194, "learning_rate": 5.458220170154896e-07, "loss": 0.81453395, "num_input_tokens_seen": 274821800, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36523438, "step": 12746, "time_per_iteration": 2.5211589336395264 }, { "auxiliary_loss_clip": 0.01008351, "auxiliary_loss_mlp": 0.01002605, "balance_loss_clip": 1.00020897, "balance_loss_mlp": 1.00146973, "epoch": 0.7663911017586051, "flos": 62159780620800.0, "grad_norm": 0.6676416255623655, "language_loss": 0.56862473, "learning_rate": 5.455546623100362e-07, "loss": 0.58873427, "num_input_tokens_seen": 274886970, "router_z_loss_clip": 0.02392578, "router_z_loss_mlp": 0.06884766, "step": 12747, "time_per_iteration": 3.0681021213531494 }, { "auxiliary_loss_clip": 0.01049923, "auxiliary_loss_mlp": 0.01037926, "balance_loss_clip": 1.01720786, "balance_loss_mlp": 1.01512587, "epoch": 0.7664512250112732, "flos": 26504802593280.0, "grad_norm": 1.5479623995954555, "language_loss": 0.73195833, "learning_rate": 5.452873627572956e-07, "loss": 0.75283682, "num_input_tokens_seen": 274907240, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.34765625, "step": 12748, "time_per_iteration": 2.407489776611328 }, { "auxiliary_loss_clip": 0.01052213, "auxiliary_loss_mlp": 0.01035704, "balance_loss_clip": 1.01020527, "balance_loss_mlp": 1.01650405, "epoch": 0.7665113482639411, "flos": 16248717066240.0, "grad_norm": 2.6606575171335156, "language_loss": 0.71115494, "learning_rate": 5.450201183674052e-07, "loss": 0.73203409, "num_input_tokens_seen": 274924650, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.35742188, "step": 12749, "time_per_iteration": 3.6445515155792236 }, { "auxiliary_loss_clip": 0.01051716, "auxiliary_loss_mlp": 0.01040362, "balance_loss_clip": 1.01557803, "balance_loss_mlp": 1.01540399, "epoch": 0.7665714715166091, "flos": 27196676472960.0, "grad_norm": 1.5818622426615363, "language_loss": 0.74772489, "learning_rate": 5.447529291504967e-07, "loss": 0.76864564, "num_input_tokens_seen": 274944550, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36328125, "step": 12750, "time_per_iteration": 2.4174180030822754 }, { "auxiliary_loss_clip": 0.01050568, "auxiliary_loss_mlp": 0.01032416, "balance_loss_clip": 1.01162612, "balance_loss_mlp": 1.01595283, "epoch": 0.766631594769277, "flos": 21066766001280.0, "grad_norm": 2.3122808269325748, "language_loss": 0.76593566, "learning_rate": 5.444857951167026e-07, "loss": 0.78676552, "num_input_tokens_seen": 274961330, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.34570312, "step": 12751, "time_per_iteration": 2.3904788494110107 }, { "auxiliary_loss_clip": 0.01053473, "auxiliary_loss_mlp": 0.01040282, "balance_loss_clip": 1.01643991, "balance_loss_mlp": 1.01690125, "epoch": 0.766691718021945, "flos": 24096808010880.0, "grad_norm": 1.815475002164816, "language_loss": 0.6259141, "learning_rate": 5.442187162761537e-07, "loss": 0.64685166, "num_input_tokens_seen": 274981655, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3671875, "step": 12752, "time_per_iteration": 2.4050722122192383 }, { "auxiliary_loss_clip": 0.01053801, "auxiliary_loss_mlp": 0.01040332, "balance_loss_clip": 1.01278305, "balance_loss_mlp": 1.01643038, "epoch": 0.7667518412746129, "flos": 23439533155200.0, "grad_norm": 1.9116312387886978, "language_loss": 0.70526487, "learning_rate": 5.439516926389767e-07, "loss": 0.72620612, "num_input_tokens_seen": 274999970, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.375, "step": 12753, "time_per_iteration": 2.3904943466186523 }, { "auxiliary_loss_clip": 0.01052105, "auxiliary_loss_mlp": 0.01046878, "balance_loss_clip": 1.02245212, "balance_loss_mlp": 1.01566136, "epoch": 0.766811964527281, "flos": 18147852449280.0, "grad_norm": 2.712687608843921, "language_loss": 0.63473821, "learning_rate": 5.436847242152971e-07, "loss": 0.6557281, "num_input_tokens_seen": 275015805, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36523438, "step": 12754, "time_per_iteration": 3.7526795864105225 }, { "auxiliary_loss_clip": 0.01052283, "auxiliary_loss_mlp": 0.01033596, "balance_loss_clip": 1.00932491, "balance_loss_mlp": 1.0164268, "epoch": 0.7668720877799489, "flos": 19535056433280.0, "grad_norm": 3.035460768437019, "language_loss": 0.81481415, "learning_rate": 5.434178110152401e-07, "loss": 0.83567291, "num_input_tokens_seen": 275031810, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.359375, "step": 12755, "time_per_iteration": 2.4135172367095947 }, { "auxiliary_loss_clip": 0.01052055, "auxiliary_loss_mlp": 0.0104123, "balance_loss_clip": 1.0164938, "balance_loss_mlp": 1.01632559, "epoch": 0.7669322110326169, "flos": 22673224523520.0, "grad_norm": 1.868365133016885, "language_loss": 0.71516085, "learning_rate": 5.431509530489242e-07, "loss": 0.73609376, "num_input_tokens_seen": 275049325, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.35546875, "step": 12756, "time_per_iteration": 3.757382869720459 }, { "auxiliary_loss_clip": 0.01051801, "auxiliary_loss_mlp": 0.01042629, "balance_loss_clip": 1.0194788, "balance_loss_mlp": 1.01563263, "epoch": 0.7669923342852848, "flos": 26468178710400.0, "grad_norm": 1.5892951497498515, "language_loss": 0.70694375, "learning_rate": 5.428841503264706e-07, "loss": 0.72788805, "num_input_tokens_seen": 275070865, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.36132812, "step": 12757, "time_per_iteration": 2.4543678760528564 }, { "auxiliary_loss_clip": 0.01052853, "auxiliary_loss_mlp": 0.01039961, "balance_loss_clip": 1.01455772, "balance_loss_mlp": 1.01682651, "epoch": 0.7670524575379528, "flos": 22855052217600.0, "grad_norm": 1.9865678024804194, "language_loss": 0.7767092, "learning_rate": 5.426174028579955e-07, "loss": 0.79763734, "num_input_tokens_seen": 275088015, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.359375, "step": 12758, "time_per_iteration": 2.3725173473358154 }, { "auxiliary_loss_clip": 0.01051244, "auxiliary_loss_mlp": 0.01041858, "balance_loss_clip": 1.01894557, "balance_loss_mlp": 1.01602697, "epoch": 0.7671125807906207, "flos": 22451142165120.0, "grad_norm": 3.333139095823116, "language_loss": 0.77201629, "learning_rate": 5.423507106536156e-07, "loss": 0.79294729, "num_input_tokens_seen": 275106975, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3515625, "step": 12759, "time_per_iteration": 2.4158780574798584 }, { "auxiliary_loss_clip": 0.01052527, "auxiliary_loss_mlp": 0.0103186, "balance_loss_clip": 1.00863838, "balance_loss_mlp": 1.01582098, "epoch": 0.7671727040432887, "flos": 35370088266240.0, "grad_norm": 2.099909262958268, "language_loss": 0.69422191, "learning_rate": 5.420840737234425e-07, "loss": 0.71506578, "num_input_tokens_seen": 275129560, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3671875, "step": 12760, "time_per_iteration": 2.4774861335754395 }, { "auxiliary_loss_clip": 0.01054594, "auxiliary_loss_mlp": 0.010397, "balance_loss_clip": 1.01443958, "balance_loss_mlp": 1.01788092, "epoch": 0.7672328272959568, "flos": 22493770801920.0, "grad_norm": 1.5219371926517717, "language_loss": 0.80042726, "learning_rate": 5.418174920775871e-07, "loss": 0.82137018, "num_input_tokens_seen": 275151180, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3671875, "step": 12761, "time_per_iteration": 2.4428563117980957 }, { "auxiliary_loss_clip": 0.01051065, "auxiliary_loss_mlp": 0.01041595, "balance_loss_clip": 1.0179801, "balance_loss_mlp": 1.01562738, "epoch": 0.7672929505486247, "flos": 22814588085120.0, "grad_norm": 2.507823835597998, "language_loss": 0.67715144, "learning_rate": 5.415509657261589e-07, "loss": 0.69807804, "num_input_tokens_seen": 275170605, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35546875, "step": 12762, "time_per_iteration": 2.3837387561798096 }, { "auxiliary_loss_clip": 0.01053182, "auxiliary_loss_mlp": 0.01035432, "balance_loss_clip": 1.01094604, "balance_loss_mlp": 1.0154078, "epoch": 0.7673530738012927, "flos": 20337814391040.0, "grad_norm": 1.6921245092834227, "language_loss": 0.7565887, "learning_rate": 5.412844946792639e-07, "loss": 0.77747488, "num_input_tokens_seen": 275188750, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.37695312, "step": 12763, "time_per_iteration": 2.3854806423187256 }, { "auxiliary_loss_clip": 0.01053091, "auxiliary_loss_mlp": 0.01042905, "balance_loss_clip": 1.01896775, "balance_loss_mlp": 1.01640809, "epoch": 0.7674131970539606, "flos": 34932137771520.0, "grad_norm": 1.5086013313442899, "language_loss": 0.71658784, "learning_rate": 5.410180789470067e-07, "loss": 0.73754781, "num_input_tokens_seen": 275211365, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.3671875, "step": 12764, "time_per_iteration": 2.479351758956909 }, { "auxiliary_loss_clip": 0.01052038, "auxiliary_loss_mlp": 0.01037476, "balance_loss_clip": 1.01549363, "balance_loss_mlp": 1.01591229, "epoch": 0.7674733203066286, "flos": 28327618010880.0, "grad_norm": 1.560054839445609, "language_loss": 0.69897044, "learning_rate": 5.40751718539491e-07, "loss": 0.71986562, "num_input_tokens_seen": 275231670, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.36132812, "step": 12765, "time_per_iteration": 2.4481985569000244 }, { "auxiliary_loss_clip": 0.01049319, "auxiliary_loss_mlp": 0.01033204, "balance_loss_clip": 1.0114007, "balance_loss_mlp": 1.01422358, "epoch": 0.7675334435592965, "flos": 16288797173760.0, "grad_norm": 2.529219918468953, "language_loss": 0.62026083, "learning_rate": 5.404854134668162e-07, "loss": 0.64108604, "num_input_tokens_seen": 275249425, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.3515625, "step": 12766, "time_per_iteration": 2.3527235984802246 }, { "auxiliary_loss_clip": 0.01008075, "auxiliary_loss_mlp": 0.0100228, "balance_loss_clip": 0.99994349, "balance_loss_mlp": 1.001001, "epoch": 0.7675935668119646, "flos": 64822641194880.0, "grad_norm": 0.7367794062559335, "language_loss": 0.6086452, "learning_rate": 5.402191637390803e-07, "loss": 0.62874877, "num_input_tokens_seen": 275312485, "router_z_loss_clip": 0.02331543, "router_z_loss_mlp": 0.07080078, "step": 12767, "time_per_iteration": 3.156628370285034 }, { "auxiliary_loss_clip": 0.01050727, "auxiliary_loss_mlp": 0.01032974, "balance_loss_clip": 1.01075339, "balance_loss_mlp": 1.01627517, "epoch": 0.7676536900646325, "flos": 22674271864320.0, "grad_norm": 1.8341283066430407, "language_loss": 0.69992578, "learning_rate": 5.399529693663801e-07, "loss": 0.72076285, "num_input_tokens_seen": 275331680, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34375, "step": 12768, "time_per_iteration": 3.8093857765197754 }, { "auxiliary_loss_clip": 0.01055718, "auxiliary_loss_mlp": 0.01036979, "balance_loss_clip": 1.01184928, "balance_loss_mlp": 1.01674533, "epoch": 0.7677138133173005, "flos": 26938563724800.0, "grad_norm": 1.7207351733516836, "language_loss": 0.71781516, "learning_rate": 5.3968683035881e-07, "loss": 0.73874217, "num_input_tokens_seen": 275351615, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.390625, "step": 12769, "time_per_iteration": 2.419064521789551 }, { "auxiliary_loss_clip": 0.01053173, "auxiliary_loss_mlp": 0.01039807, "balance_loss_clip": 1.01538157, "balance_loss_mlp": 1.01616681, "epoch": 0.7677739365699684, "flos": 23798580243840.0, "grad_norm": 2.030546886349247, "language_loss": 0.81322211, "learning_rate": 5.394207467264611e-07, "loss": 0.83415186, "num_input_tokens_seen": 275368815, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36914062, "step": 12770, "time_per_iteration": 2.4146625995635986 }, { "auxiliary_loss_clip": 0.01051657, "auxiliary_loss_mlp": 0.01035297, "balance_loss_clip": 1.01426876, "balance_loss_mlp": 1.01649952, "epoch": 0.7678340598226364, "flos": 34454176992000.0, "grad_norm": 1.600062406333276, "language_loss": 0.79435003, "learning_rate": 5.391547184794245e-07, "loss": 0.81521958, "num_input_tokens_seen": 275389345, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.3515625, "step": 12771, "time_per_iteration": 2.492295742034912 }, { "auxiliary_loss_clip": 0.0105236, "auxiliary_loss_mlp": 0.01036328, "balance_loss_clip": 1.01364243, "balance_loss_mlp": 1.01622462, "epoch": 0.7678941830753043, "flos": 23840615387520.0, "grad_norm": 1.4538967992058485, "language_loss": 0.69252163, "learning_rate": 5.388887456277876e-07, "loss": 0.71340853, "num_input_tokens_seen": 275411240, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36132812, "step": 12772, "time_per_iteration": 2.440007448196411 }, { "auxiliary_loss_clip": 0.01049769, "auxiliary_loss_mlp": 0.01030682, "balance_loss_clip": 1.00861669, "balance_loss_mlp": 1.01546884, "epoch": 0.7679543063279723, "flos": 25409751799680.0, "grad_norm": 1.6942182341047893, "language_loss": 0.74205542, "learning_rate": 5.386228281816349e-07, "loss": 0.76286, "num_input_tokens_seen": 275432010, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34375, "step": 12773, "time_per_iteration": 2.411992073059082 }, { "auxiliary_loss_clip": 0.01050269, "auxiliary_loss_mlp": 0.01032809, "balance_loss_clip": 1.01236486, "balance_loss_mlp": 1.01566625, "epoch": 0.7680144295806404, "flos": 27961204625280.0, "grad_norm": 2.053486309858787, "language_loss": 0.81806445, "learning_rate": 5.383569661510512e-07, "loss": 0.83889526, "num_input_tokens_seen": 275453710, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.34570312, "step": 12774, "time_per_iteration": 2.423240900039673 }, { "auxiliary_loss_clip": 0.01053097, "auxiliary_loss_mlp": 0.01036342, "balance_loss_clip": 1.0126555, "balance_loss_mlp": 1.01693726, "epoch": 0.7680745528333083, "flos": 20411760384000.0, "grad_norm": 1.8165699595014853, "language_loss": 0.71028203, "learning_rate": 5.380911595461177e-07, "loss": 0.73117638, "num_input_tokens_seen": 275472915, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.36132812, "step": 12775, "time_per_iteration": 2.371366262435913 }, { "auxiliary_loss_clip": 0.01008171, "auxiliary_loss_mlp": 0.01003181, "balance_loss_clip": 1.00071323, "balance_loss_mlp": 1.00117838, "epoch": 0.7681346760859763, "flos": 68397433148160.0, "grad_norm": 0.6958216998138518, "language_loss": 0.56889433, "learning_rate": 5.378254083769147e-07, "loss": 0.58900785, "num_input_tokens_seen": 275534785, "router_z_loss_clip": 0.0246582, "router_z_loss_mlp": 0.0703125, "step": 12776, "time_per_iteration": 3.1368296146392822 }, { "auxiliary_loss_clip": 0.01051858, "auxiliary_loss_mlp": 0.0103732, "balance_loss_clip": 1.01447916, "balance_loss_mlp": 1.0160985, "epoch": 0.7681947993386442, "flos": 21250409086080.0, "grad_norm": 2.049606451018449, "language_loss": 0.75509572, "learning_rate": 5.375597126535188e-07, "loss": 0.77598751, "num_input_tokens_seen": 275553205, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35742188, "step": 12777, "time_per_iteration": 2.387273073196411 }, { "auxiliary_loss_clip": 0.01052812, "auxiliary_loss_mlp": 0.01034484, "balance_loss_clip": 1.01244199, "balance_loss_mlp": 1.01743245, "epoch": 0.7682549225913122, "flos": 21396625326720.0, "grad_norm": 2.1388700507850777, "language_loss": 0.71925449, "learning_rate": 5.372940723860043e-07, "loss": 0.74012744, "num_input_tokens_seen": 275571490, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35351562, "step": 12778, "time_per_iteration": 2.3946659564971924 }, { "auxiliary_loss_clip": 0.01052644, "auxiliary_loss_mlp": 0.01037746, "balance_loss_clip": 1.0142138, "balance_loss_mlp": 1.01670337, "epoch": 0.7683150458439801, "flos": 23037822518400.0, "grad_norm": 1.8574763484267862, "language_loss": 0.7132566, "learning_rate": 5.37028487584446e-07, "loss": 0.73416054, "num_input_tokens_seen": 275589665, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.359375, "step": 12779, "time_per_iteration": 2.414494276046753 }, { "auxiliary_loss_clip": 0.01054077, "auxiliary_loss_mlp": 0.01036889, "balance_loss_clip": 1.01272559, "balance_loss_mlp": 1.01746321, "epoch": 0.7683751690966482, "flos": 67330070795520.0, "grad_norm": 1.695606352259959, "language_loss": 0.59771073, "learning_rate": 5.367629582589133e-07, "loss": 0.61862046, "num_input_tokens_seen": 275615605, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3671875, "step": 12780, "time_per_iteration": 2.7964818477630615 }, { "auxiliary_loss_clip": 0.01054772, "auxiliary_loss_mlp": 0.01039959, "balance_loss_clip": 1.01462722, "balance_loss_mlp": 1.01718545, "epoch": 0.7684352923493161, "flos": 21797812293120.0, "grad_norm": 1.8521937739231058, "language_loss": 0.69332767, "learning_rate": 5.364974844194759e-07, "loss": 0.714275, "num_input_tokens_seen": 275634965, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.375, "step": 12781, "time_per_iteration": 2.4283041954040527 }, { "auxiliary_loss_clip": 0.010513, "auxiliary_loss_mlp": 0.01036158, "balance_loss_clip": 1.01294804, "balance_loss_mlp": 1.01588929, "epoch": 0.7684954156019841, "flos": 25846445485440.0, "grad_norm": 1.7980633099844932, "language_loss": 0.80329686, "learning_rate": 5.362320660762016e-07, "loss": 0.82417142, "num_input_tokens_seen": 275655785, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35351562, "step": 12782, "time_per_iteration": 2.4206135272979736 }, { "auxiliary_loss_clip": 0.01053342, "auxiliary_loss_mlp": 0.01042063, "balance_loss_clip": 1.01524091, "balance_loss_mlp": 1.01630044, "epoch": 0.768555538854652, "flos": 25446201125760.0, "grad_norm": 1.7644960834046488, "language_loss": 0.68392754, "learning_rate": 5.35966703239153e-07, "loss": 0.70488161, "num_input_tokens_seen": 275676160, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.37109375, "step": 12783, "time_per_iteration": 2.397552251815796 }, { "auxiliary_loss_clip": 0.01052853, "auxiliary_loss_mlp": 0.01044204, "balance_loss_clip": 1.0191102, "balance_loss_mlp": 1.01587963, "epoch": 0.76861566210732, "flos": 19645347018240.0, "grad_norm": 1.609071405988719, "language_loss": 0.70643139, "learning_rate": 5.357013959183938e-07, "loss": 0.72740197, "num_input_tokens_seen": 275695660, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37109375, "step": 12784, "time_per_iteration": 2.4014434814453125 }, { "auxiliary_loss_clip": 0.0105057, "auxiliary_loss_mlp": 0.01028527, "balance_loss_clip": 1.00801122, "balance_loss_mlp": 1.01540089, "epoch": 0.7686757853599879, "flos": 22417939595520.0, "grad_norm": 1.6769175162814445, "language_loss": 0.81331909, "learning_rate": 5.354361441239843e-07, "loss": 0.83411002, "num_input_tokens_seen": 275714025, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.3515625, "step": 12785, "time_per_iteration": 2.3646156787872314 }, { "auxiliary_loss_clip": 0.0105335, "auxiliary_loss_mlp": 0.01044768, "balance_loss_clip": 1.01857734, "balance_loss_mlp": 1.01689184, "epoch": 0.768735908612656, "flos": 47772529580160.0, "grad_norm": 1.6468120002815054, "language_loss": 0.78399998, "learning_rate": 5.351709478659836e-07, "loss": 0.80498111, "num_input_tokens_seen": 275737300, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.36328125, "step": 12786, "time_per_iteration": 2.639465570449829 }, { "auxiliary_loss_clip": 0.01052215, "auxiliary_loss_mlp": 0.01037509, "balance_loss_clip": 1.01282048, "balance_loss_mlp": 1.01581752, "epoch": 0.7687960318653239, "flos": 30261876088320.0, "grad_norm": 2.046991099904427, "language_loss": 0.59640676, "learning_rate": 5.349058071544468e-07, "loss": 0.61730397, "num_input_tokens_seen": 275757895, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.36523438, "step": 12787, "time_per_iteration": 2.424450159072876 }, { "auxiliary_loss_clip": 0.01050458, "auxiliary_loss_mlp": 0.0103703, "balance_loss_clip": 1.01504838, "balance_loss_mlp": 1.01470113, "epoch": 0.7688561551179919, "flos": 19572413454720.0, "grad_norm": 1.970844897828011, "language_loss": 0.77272522, "learning_rate": 5.346407219994292e-07, "loss": 0.7936002, "num_input_tokens_seen": 275776745, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35742188, "step": 12788, "time_per_iteration": 2.389676809310913 }, { "auxiliary_loss_clip": 0.01051697, "auxiliary_loss_mlp": 0.01040323, "balance_loss_clip": 1.0156225, "balance_loss_mlp": 1.01529145, "epoch": 0.7689162783706599, "flos": 22782677235840.0, "grad_norm": 1.7346455297798944, "language_loss": 0.68451273, "learning_rate": 5.343756924109821e-07, "loss": 0.70543289, "num_input_tokens_seen": 275797205, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.36328125, "step": 12789, "time_per_iteration": 3.615783452987671 }, { "auxiliary_loss_clip": 0.01052431, "auxiliary_loss_mlp": 0.01037637, "balance_loss_clip": 1.01365185, "balance_loss_mlp": 1.01651621, "epoch": 0.7689764016233278, "flos": 34202767224960.0, "grad_norm": 1.8013026820104745, "language_loss": 0.70215207, "learning_rate": 5.341107183991553e-07, "loss": 0.72305274, "num_input_tokens_seen": 275817935, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.359375, "step": 12790, "time_per_iteration": 2.4757652282714844 }, { "auxiliary_loss_clip": 0.01050289, "auxiliary_loss_mlp": 0.01037339, "balance_loss_clip": 1.01476121, "balance_loss_mlp": 1.01525199, "epoch": 0.7690365248759958, "flos": 17273522471040.0, "grad_norm": 1.6952376053007376, "language_loss": 0.69511169, "learning_rate": 5.338457999739969e-07, "loss": 0.71598804, "num_input_tokens_seen": 275837145, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34960938, "step": 12791, "time_per_iteration": 2.3556394577026367 }, { "auxiliary_loss_clip": 0.0105065, "auxiliary_loss_mlp": 0.01037855, "balance_loss_clip": 1.01588511, "balance_loss_mlp": 1.01618099, "epoch": 0.7690966481286637, "flos": 18222182467200.0, "grad_norm": 1.815549463488252, "language_loss": 0.8039391, "learning_rate": 5.335809371455526e-07, "loss": 0.82482409, "num_input_tokens_seen": 275855705, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34375, "step": 12792, "time_per_iteration": 2.3782742023468018 }, { "auxiliary_loss_clip": 0.01056239, "auxiliary_loss_mlp": 0.01045584, "balance_loss_clip": 1.01789141, "balance_loss_mlp": 1.01783717, "epoch": 0.7691567713813318, "flos": 21536662256640.0, "grad_norm": 1.767144221021261, "language_loss": 0.73774689, "learning_rate": 5.333161299238673e-07, "loss": 0.75876516, "num_input_tokens_seen": 275873930, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.38476562, "step": 12793, "time_per_iteration": 2.371781826019287 }, { "auxiliary_loss_clip": 0.01052133, "auxiliary_loss_mlp": 0.0103915, "balance_loss_clip": 1.01516533, "balance_loss_mlp": 1.01632786, "epoch": 0.7692168946339997, "flos": 39378571528320.0, "grad_norm": 2.4227261962831648, "language_loss": 0.64831752, "learning_rate": 5.330513783189803e-07, "loss": 0.66923034, "num_input_tokens_seen": 275895895, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.35742188, "step": 12794, "time_per_iteration": 3.9140408039093018 }, { "auxiliary_loss_clip": 0.01054649, "auxiliary_loss_mlp": 0.01038848, "balance_loss_clip": 1.01381373, "balance_loss_mlp": 1.01742339, "epoch": 0.7692770178866677, "flos": 25008774301440.0, "grad_norm": 1.3884154727048397, "language_loss": 0.77108246, "learning_rate": 5.327866823409319e-07, "loss": 0.79201746, "num_input_tokens_seen": 275917825, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37304688, "step": 12795, "time_per_iteration": 3.8382973670959473 }, { "auxiliary_loss_clip": 0.0105209, "auxiliary_loss_mlp": 0.01036031, "balance_loss_clip": 1.01190352, "balance_loss_mlp": 1.01565361, "epoch": 0.7693371411393356, "flos": 24715154833920.0, "grad_norm": 1.6099524937954728, "language_loss": 0.72300649, "learning_rate": 5.325220419997601e-07, "loss": 0.74388766, "num_input_tokens_seen": 275937890, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36328125, "step": 12796, "time_per_iteration": 2.3973636627197266 }, { "auxiliary_loss_clip": 0.01051456, "auxiliary_loss_mlp": 0.01033796, "balance_loss_clip": 1.01287484, "balance_loss_mlp": 1.01645398, "epoch": 0.7693972643920036, "flos": 15923884976640.0, "grad_norm": 3.21601125369956, "language_loss": 0.66374737, "learning_rate": 5.32257457305499e-07, "loss": 0.68459994, "num_input_tokens_seen": 275954495, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34960938, "step": 12797, "time_per_iteration": 2.3735015392303467 }, { "auxiliary_loss_clip": 0.01053117, "auxiliary_loss_mlp": 0.01042494, "balance_loss_clip": 1.0186646, "balance_loss_mlp": 1.01658368, "epoch": 0.7694573876446715, "flos": 25404864209280.0, "grad_norm": 1.9611755044806252, "language_loss": 0.92037773, "learning_rate": 5.319929282681823e-07, "loss": 0.94133389, "num_input_tokens_seen": 275972395, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36523438, "step": 12798, "time_per_iteration": 2.395881175994873 }, { "auxiliary_loss_clip": 0.01052868, "auxiliary_loss_mlp": 0.01033952, "balance_loss_clip": 1.01115918, "balance_loss_mlp": 1.01631832, "epoch": 0.7695175108973396, "flos": 16653290434560.0, "grad_norm": 2.090271963159474, "language_loss": 0.83160913, "learning_rate": 5.317284548978418e-07, "loss": 0.85247737, "num_input_tokens_seen": 275989020, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36523438, "step": 12799, "time_per_iteration": 2.362508535385132 }, { "auxiliary_loss_clip": 0.01053787, "auxiliary_loss_mlp": 0.01038325, "balance_loss_clip": 1.01410174, "balance_loss_mlp": 1.01654649, "epoch": 0.7695776341500075, "flos": 13625657308800.0, "grad_norm": 2.7519058004355155, "language_loss": 0.79213768, "learning_rate": 5.314640372045045e-07, "loss": 0.81305879, "num_input_tokens_seen": 276006525, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37304688, "step": 12800, "time_per_iteration": 2.3397159576416016 }, { "auxiliary_loss_clip": 0.01055653, "auxiliary_loss_mlp": 0.01039673, "balance_loss_clip": 1.01307797, "balance_loss_mlp": 1.01671731, "epoch": 0.7696377574026755, "flos": 24275633328000.0, "grad_norm": 2.884319974988817, "language_loss": 0.84863222, "learning_rate": 5.31199675198198e-07, "loss": 0.86958551, "num_input_tokens_seen": 276027130, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.390625, "step": 12801, "time_per_iteration": 2.4041638374328613 }, { "auxiliary_loss_clip": 0.01051626, "auxiliary_loss_mlp": 0.01037875, "balance_loss_clip": 1.01523709, "balance_loss_mlp": 1.01626956, "epoch": 0.7696978806553435, "flos": 20922085860480.0, "grad_norm": 1.899259041430755, "language_loss": 0.729397, "learning_rate": 5.30935368888947e-07, "loss": 0.75029194, "num_input_tokens_seen": 276045715, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35351562, "step": 12802, "time_per_iteration": 2.3861377239227295 }, { "auxiliary_loss_clip": 0.01050767, "auxiliary_loss_mlp": 0.01038096, "balance_loss_clip": 1.01573288, "balance_loss_mlp": 1.01587737, "epoch": 0.7697580039080114, "flos": 22928509451520.0, "grad_norm": 1.951869540926632, "language_loss": 0.76845777, "learning_rate": 5.306711182867747e-07, "loss": 0.7893464, "num_input_tokens_seen": 276065375, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34765625, "step": 12803, "time_per_iteration": 2.4393162727355957 }, { "auxiliary_loss_clip": 0.01008295, "auxiliary_loss_mlp": 0.01003285, "balance_loss_clip": 1.00047207, "balance_loss_mlp": 1.00118542, "epoch": 0.7698181271606794, "flos": 68714305447680.0, "grad_norm": 0.7280577046798192, "language_loss": 0.55849123, "learning_rate": 5.304069234017001e-07, "loss": 0.57860696, "num_input_tokens_seen": 276131405, "router_z_loss_clip": 0.02807617, "router_z_loss_mlp": 0.07128906, "step": 12804, "time_per_iteration": 3.026589870452881 }, { "auxiliary_loss_clip": 0.01007847, "auxiliary_loss_mlp": 0.01003421, "balance_loss_clip": 1.00127542, "balance_loss_mlp": 1.00103855, "epoch": 0.7698782504133473, "flos": 67406249249280.0, "grad_norm": 0.736818595495299, "language_loss": 0.54085201, "learning_rate": 5.301427842437429e-07, "loss": 0.5609647, "num_input_tokens_seen": 276200755, "router_z_loss_clip": 0.02148438, "router_z_loss_mlp": 0.06835938, "step": 12805, "time_per_iteration": 3.176511764526367 }, { "auxiliary_loss_clip": 0.01055776, "auxiliary_loss_mlp": 0.01039282, "balance_loss_clip": 1.01384318, "balance_loss_mlp": 1.01837063, "epoch": 0.7699383736660154, "flos": 22487835870720.0, "grad_norm": 2.024235065155715, "language_loss": 0.74210751, "learning_rate": 5.298787008229187e-07, "loss": 0.76305807, "num_input_tokens_seen": 276217880, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.375, "step": 12806, "time_per_iteration": 2.3517298698425293 }, { "auxiliary_loss_clip": 0.01051571, "auxiliary_loss_mlp": 0.01038958, "balance_loss_clip": 1.01499653, "balance_loss_mlp": 1.01558053, "epoch": 0.7699984969186833, "flos": 21538756938240.0, "grad_norm": 1.7947686277494967, "language_loss": 0.75643051, "learning_rate": 5.296146731492408e-07, "loss": 0.77733582, "num_input_tokens_seen": 276234810, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.359375, "step": 12807, "time_per_iteration": 2.368196487426758 }, { "auxiliary_loss_clip": 0.01055154, "auxiliary_loss_mlp": 0.01042343, "balance_loss_clip": 1.01760662, "balance_loss_mlp": 1.01698804, "epoch": 0.7700586201713513, "flos": 21718210659840.0, "grad_norm": 2.100813498011537, "language_loss": 0.81017172, "learning_rate": 5.293507012327218e-07, "loss": 0.83114666, "num_input_tokens_seen": 276252850, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.3828125, "step": 12808, "time_per_iteration": 3.868474245071411 }, { "auxiliary_loss_clip": 0.01054371, "auxiliary_loss_mlp": 0.01041406, "balance_loss_clip": 1.01591897, "balance_loss_mlp": 1.01693463, "epoch": 0.7701187434240192, "flos": 27854754289920.0, "grad_norm": 2.529079254403235, "language_loss": 0.80043608, "learning_rate": 5.290867850833718e-07, "loss": 0.82139385, "num_input_tokens_seen": 276272525, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.375, "step": 12809, "time_per_iteration": 2.4175643920898438 }, { "auxiliary_loss_clip": 0.01048572, "auxiliary_loss_mlp": 0.01033909, "balance_loss_clip": 1.01142585, "balance_loss_mlp": 1.01515698, "epoch": 0.7701788666766872, "flos": 28620050492160.0, "grad_norm": 1.490041181068458, "language_loss": 0.71194577, "learning_rate": 5.288229247111993e-07, "loss": 0.73277056, "num_input_tokens_seen": 276294210, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.33398438, "step": 12810, "time_per_iteration": 2.442531108856201 }, { "auxiliary_loss_clip": 0.01055775, "auxiliary_loss_mlp": 0.01040094, "balance_loss_clip": 1.01349819, "balance_loss_mlp": 1.01788187, "epoch": 0.7702389899293551, "flos": 14245575143040.0, "grad_norm": 4.34944225124783, "language_loss": 0.79704463, "learning_rate": 5.285591201262079e-07, "loss": 0.81800336, "num_input_tokens_seen": 276310290, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37890625, "step": 12811, "time_per_iteration": 2.3264055252075195 }, { "auxiliary_loss_clip": 0.01007816, "auxiliary_loss_mlp": 0.01003356, "balance_loss_clip": 1.00087607, "balance_loss_mlp": 1.00091696, "epoch": 0.7702991131820232, "flos": 70570847105280.0, "grad_norm": 0.8147873411196881, "language_loss": 0.56761456, "learning_rate": 5.28295371338402e-07, "loss": 0.58772629, "num_input_tokens_seen": 276371715, "router_z_loss_clip": 0.02478027, "router_z_loss_mlp": 0.06884766, "step": 12812, "time_per_iteration": 3.0539534091949463 }, { "auxiliary_loss_clip": 0.01052914, "auxiliary_loss_mlp": 0.01040136, "balance_loss_clip": 1.01537657, "balance_loss_mlp": 1.01627398, "epoch": 0.7703592364346911, "flos": 25478949847680.0, "grad_norm": 1.7214166247144893, "language_loss": 0.72989988, "learning_rate": 5.280316783577836e-07, "loss": 0.75083041, "num_input_tokens_seen": 276389895, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3671875, "step": 12813, "time_per_iteration": 2.3852429389953613 }, { "auxiliary_loss_clip": 0.01052656, "auxiliary_loss_mlp": 0.01037329, "balance_loss_clip": 1.01178277, "balance_loss_mlp": 1.01614904, "epoch": 0.7704193596873591, "flos": 19279911150720.0, "grad_norm": 1.7031747956235483, "language_loss": 0.68124878, "learning_rate": 5.27768041194351e-07, "loss": 0.70214868, "num_input_tokens_seen": 276408990, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.36523438, "step": 12814, "time_per_iteration": 2.4301040172576904 }, { "auxiliary_loss_clip": 0.01052043, "auxiliary_loss_mlp": 0.01037493, "balance_loss_clip": 1.01396132, "balance_loss_mlp": 1.01634622, "epoch": 0.7704794829400271, "flos": 23657356327680.0, "grad_norm": 3.958660952340384, "language_loss": 0.67156947, "learning_rate": 5.275044598581018e-07, "loss": 0.69246483, "num_input_tokens_seen": 276428190, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.35742188, "step": 12815, "time_per_iteration": 2.392043352127075 }, { "auxiliary_loss_clip": 0.01052334, "auxiliary_loss_mlp": 0.01035346, "balance_loss_clip": 1.01137257, "balance_loss_mlp": 1.01583147, "epoch": 0.770539606192695, "flos": 18988316542080.0, "grad_norm": 2.5264897192823055, "language_loss": 0.67347741, "learning_rate": 5.272409343590322e-07, "loss": 0.69435424, "num_input_tokens_seen": 276446855, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36523438, "step": 12816, "time_per_iteration": 2.405233144760132 }, { "auxiliary_loss_clip": 0.01052986, "auxiliary_loss_mlp": 0.01038519, "balance_loss_clip": 1.01495171, "balance_loss_mlp": 1.01714921, "epoch": 0.770599729445363, "flos": 11829585859200.0, "grad_norm": 6.186916995105536, "language_loss": 0.73441637, "learning_rate": 5.26977464707133e-07, "loss": 0.7553314, "num_input_tokens_seen": 276462000, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.359375, "step": 12817, "time_per_iteration": 2.325200319290161 }, { "auxiliary_loss_clip": 0.01052022, "auxiliary_loss_mlp": 0.01039689, "balance_loss_clip": 1.01649094, "balance_loss_mlp": 1.0164001, "epoch": 0.770659852698031, "flos": 17821623905280.0, "grad_norm": 1.8060620214351653, "language_loss": 0.62463856, "learning_rate": 5.267140509123957e-07, "loss": 0.64555568, "num_input_tokens_seen": 276481190, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35546875, "step": 12818, "time_per_iteration": 2.3811862468719482 }, { "auxiliary_loss_clip": 0.01051828, "auxiliary_loss_mlp": 0.01030816, "balance_loss_clip": 1.01047909, "balance_loss_mlp": 1.0168364, "epoch": 0.770719975950699, "flos": 21870885502080.0, "grad_norm": 1.637011415399494, "language_loss": 0.67693859, "learning_rate": 5.264506929848093e-07, "loss": 0.69776505, "num_input_tokens_seen": 276499520, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.34960938, "step": 12819, "time_per_iteration": 2.39805006980896 }, { "auxiliary_loss_clip": 0.01053966, "auxiliary_loss_mlp": 0.01039513, "balance_loss_clip": 1.01509929, "balance_loss_mlp": 1.01700592, "epoch": 0.7707800992033669, "flos": 21323971054080.0, "grad_norm": 2.620768702349678, "language_loss": 0.58366895, "learning_rate": 5.261873909343608e-07, "loss": 0.60460377, "num_input_tokens_seen": 276519110, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36914062, "step": 12820, "time_per_iteration": 2.364400863647461 }, { "auxiliary_loss_clip": 0.01051014, "auxiliary_loss_mlp": 0.01037093, "balance_loss_clip": 1.01211905, "balance_loss_mlp": 1.01512384, "epoch": 0.7708402224560349, "flos": 28178294659200.0, "grad_norm": 2.202115342948981, "language_loss": 0.8205328, "learning_rate": 5.259241447710343e-07, "loss": 0.8414138, "num_input_tokens_seen": 276538805, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.359375, "step": 12821, "time_per_iteration": 2.4637911319732666 }, { "auxiliary_loss_clip": 0.01051888, "auxiliary_loss_mlp": 0.01033575, "balance_loss_clip": 1.01013827, "balance_loss_mlp": 1.01594996, "epoch": 0.7709003457087028, "flos": 15376167567360.0, "grad_norm": 2.2035550323589224, "language_loss": 0.69788241, "learning_rate": 5.256609545048114e-07, "loss": 0.71873701, "num_input_tokens_seen": 276554770, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.359375, "step": 12822, "time_per_iteration": 2.3525612354278564 }, { "auxiliary_loss_clip": 0.01051598, "auxiliary_loss_mlp": 0.01038331, "balance_loss_clip": 1.01427472, "balance_loss_mlp": 1.01634955, "epoch": 0.7709604689613708, "flos": 30620713708800.0, "grad_norm": 1.551146522078857, "language_loss": 0.72752166, "learning_rate": 5.253978201456733e-07, "loss": 0.74842095, "num_input_tokens_seen": 276574535, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.35351562, "step": 12823, "time_per_iteration": 2.4726462364196777 }, { "auxiliary_loss_clip": 0.01057196, "auxiliary_loss_mlp": 0.01038389, "balance_loss_clip": 1.01199651, "balance_loss_mlp": 1.01756465, "epoch": 0.7710205922140387, "flos": 20300282812800.0, "grad_norm": 2.9006244831450836, "language_loss": 0.77478862, "learning_rate": 5.251347417035969e-07, "loss": 0.79574448, "num_input_tokens_seen": 276592925, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.39648438, "step": 12824, "time_per_iteration": 2.366564989089966 }, { "auxiliary_loss_clip": 0.0105437, "auxiliary_loss_mlp": 0.01036239, "balance_loss_clip": 1.01223063, "balance_loss_mlp": 1.01767492, "epoch": 0.7710807154667068, "flos": 19643252336640.0, "grad_norm": 2.1421638816044433, "language_loss": 0.74010253, "learning_rate": 5.248717191885592e-07, "loss": 0.76100862, "num_input_tokens_seen": 276610540, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 12825, "time_per_iteration": 2.403416395187378 }, { "auxiliary_loss_clip": 0.01050288, "auxiliary_loss_mlp": 0.01034467, "balance_loss_clip": 1.01430857, "balance_loss_mlp": 1.01624751, "epoch": 0.7711408387193747, "flos": 20005441447680.0, "grad_norm": 1.4544235905386564, "language_loss": 0.74494129, "learning_rate": 5.246087526105343e-07, "loss": 0.76578885, "num_input_tokens_seen": 276629200, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.33984375, "step": 12826, "time_per_iteration": 2.4249658584594727 }, { "auxiliary_loss_clip": 0.01053496, "auxiliary_loss_mlp": 0.01038768, "balance_loss_clip": 1.01367497, "balance_loss_mlp": 1.01696479, "epoch": 0.7712009619720427, "flos": 24970020825600.0, "grad_norm": 1.5528736633342006, "language_loss": 0.82364774, "learning_rate": 5.243458419794933e-07, "loss": 0.8445704, "num_input_tokens_seen": 276648655, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36523438, "step": 12827, "time_per_iteration": 2.43949556350708 }, { "auxiliary_loss_clip": 0.01007692, "auxiliary_loss_mlp": 0.01003163, "balance_loss_clip": 1.00077868, "balance_loss_mlp": 1.00072134, "epoch": 0.7712610852247107, "flos": 63246347953920.0, "grad_norm": 0.8638866278180034, "language_loss": 0.55187881, "learning_rate": 5.240829873054051e-07, "loss": 0.57198739, "num_input_tokens_seen": 276716500, "router_z_loss_clip": 0.02380371, "router_z_loss_mlp": 0.06982422, "step": 12828, "time_per_iteration": 4.457131385803223 }, { "auxiliary_loss_clip": 0.01051115, "auxiliary_loss_mlp": 0.01032289, "balance_loss_clip": 1.01060438, "balance_loss_mlp": 1.01625443, "epoch": 0.7713212084773786, "flos": 18696861578880.0, "grad_norm": 1.9755529744036866, "language_loss": 0.70944107, "learning_rate": 5.23820188598238e-07, "loss": 0.73027515, "num_input_tokens_seen": 276733535, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34765625, "step": 12829, "time_per_iteration": 2.3447306156158447 }, { "auxiliary_loss_clip": 0.01054902, "auxiliary_loss_mlp": 0.01041772, "balance_loss_clip": 1.01706004, "balance_loss_mlp": 1.01716113, "epoch": 0.7713813317300466, "flos": 14172501934080.0, "grad_norm": 2.367205141035658, "language_loss": 0.81077015, "learning_rate": 5.235574458679579e-07, "loss": 0.83173692, "num_input_tokens_seen": 276749575, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37695312, "step": 12830, "time_per_iteration": 2.3815901279449463 }, { "auxiliary_loss_clip": 0.01054868, "auxiliary_loss_mlp": 0.01042693, "balance_loss_clip": 1.01535797, "balance_loss_mlp": 1.01671195, "epoch": 0.7714414549827145, "flos": 25702742862720.0, "grad_norm": 1.6010278965915659, "language_loss": 0.79221988, "learning_rate": 5.232947591245269e-07, "loss": 0.81319547, "num_input_tokens_seen": 276769460, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.38085938, "step": 12831, "time_per_iteration": 2.3912479877471924 }, { "auxiliary_loss_clip": 0.01051865, "auxiliary_loss_mlp": 0.01038088, "balance_loss_clip": 1.01452041, "balance_loss_mlp": 1.01604009, "epoch": 0.7715015782353826, "flos": 30553994367360.0, "grad_norm": 1.510688916968866, "language_loss": 0.62101132, "learning_rate": 5.230321283779071e-07, "loss": 0.64191085, "num_input_tokens_seen": 276790820, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.35742188, "step": 12832, "time_per_iteration": 2.4633495807647705 }, { "auxiliary_loss_clip": 0.01054016, "auxiliary_loss_mlp": 0.01038786, "balance_loss_clip": 1.01489639, "balance_loss_mlp": 1.01562142, "epoch": 0.7715617014880505, "flos": 20228326767360.0, "grad_norm": 1.6790874095979706, "language_loss": 0.80527878, "learning_rate": 5.227695536380572e-07, "loss": 0.8262068, "num_input_tokens_seen": 276811345, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.3828125, "step": 12833, "time_per_iteration": 3.771522045135498 }, { "auxiliary_loss_clip": 0.01008084, "auxiliary_loss_mlp": 0.01003116, "balance_loss_clip": 1.00067246, "balance_loss_mlp": 1.00103617, "epoch": 0.7716218247407185, "flos": 63662059912320.0, "grad_norm": 0.8592873767362482, "language_loss": 0.55643904, "learning_rate": 5.22507034914933e-07, "loss": 0.57655096, "num_input_tokens_seen": 276870950, "router_z_loss_clip": 0.02441406, "router_z_loss_mlp": 0.0703125, "step": 12834, "time_per_iteration": 4.357520818710327 }, { "auxiliary_loss_clip": 0.0105288, "auxiliary_loss_mlp": 0.01035467, "balance_loss_clip": 1.01244736, "balance_loss_mlp": 1.01676393, "epoch": 0.7716819479933864, "flos": 19790795208960.0, "grad_norm": 2.6026694299596067, "language_loss": 0.7439667, "learning_rate": 5.222445722184903e-07, "loss": 0.7648502, "num_input_tokens_seen": 276890760, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36132812, "step": 12835, "time_per_iteration": 2.4197685718536377 }, { "auxiliary_loss_clip": 0.01053278, "auxiliary_loss_mlp": 0.01040365, "balance_loss_clip": 1.01435328, "balance_loss_mlp": 1.01655853, "epoch": 0.7717420712460544, "flos": 18441192625920.0, "grad_norm": 1.8750246564537822, "language_loss": 0.72284818, "learning_rate": 5.219821655586814e-07, "loss": 0.74378467, "num_input_tokens_seen": 276909625, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3671875, "step": 12836, "time_per_iteration": 2.3590691089630127 }, { "auxiliary_loss_clip": 0.01050776, "auxiliary_loss_mlp": 0.01035809, "balance_loss_clip": 1.01252723, "balance_loss_mlp": 1.01593161, "epoch": 0.7718021944987223, "flos": 35188016192640.0, "grad_norm": 2.333447654738038, "language_loss": 0.61298108, "learning_rate": 5.217198149454575e-07, "loss": 0.63384688, "num_input_tokens_seen": 276930760, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.34765625, "step": 12837, "time_per_iteration": 2.53525710105896 }, { "auxiliary_loss_clip": 0.01008152, "auxiliary_loss_mlp": 0.01002423, "balance_loss_clip": 0.99984759, "balance_loss_mlp": 1.00119495, "epoch": 0.7718623177513904, "flos": 67919612014080.0, "grad_norm": 0.8615660925894095, "language_loss": 0.5589202, "learning_rate": 5.214575203887666e-07, "loss": 0.57902598, "num_input_tokens_seen": 276989580, "router_z_loss_clip": 0.02575684, "router_z_loss_mlp": 0.06933594, "step": 12838, "time_per_iteration": 2.989227294921875 }, { "auxiliary_loss_clip": 0.0105077, "auxiliary_loss_mlp": 0.01036132, "balance_loss_clip": 1.01366091, "balance_loss_mlp": 1.01586413, "epoch": 0.7719224410040583, "flos": 18580601151360.0, "grad_norm": 2.8110305887644365, "language_loss": 0.70369387, "learning_rate": 5.211952818985538e-07, "loss": 0.72456294, "num_input_tokens_seen": 277005450, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34765625, "step": 12839, "time_per_iteration": 2.34385085105896 }, { "auxiliary_loss_clip": 0.01051562, "auxiliary_loss_mlp": 0.01034856, "balance_loss_clip": 1.01264715, "balance_loss_mlp": 1.01718462, "epoch": 0.7719825642567263, "flos": 23074690780800.0, "grad_norm": 1.9464568162900922, "language_loss": 0.81200135, "learning_rate": 5.209330994847647e-07, "loss": 0.83286554, "num_input_tokens_seen": 277023055, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34375, "step": 12840, "time_per_iteration": 2.3669679164886475 }, { "auxiliary_loss_clip": 0.01053553, "auxiliary_loss_mlp": 0.01037929, "balance_loss_clip": 1.01494551, "balance_loss_mlp": 1.01712787, "epoch": 0.7720426875093943, "flos": 20338058770560.0, "grad_norm": 2.078362666931096, "language_loss": 0.80924785, "learning_rate": 5.206709731573402e-07, "loss": 0.83016264, "num_input_tokens_seen": 277041150, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36328125, "step": 12841, "time_per_iteration": 2.384835720062256 }, { "auxiliary_loss_clip": 0.01051281, "auxiliary_loss_mlp": 0.01036942, "balance_loss_clip": 1.0136013, "balance_loss_mlp": 1.01572669, "epoch": 0.7721028107620622, "flos": 23879508508800.0, "grad_norm": 1.4600064951566638, "language_loss": 0.77899218, "learning_rate": 5.204089029262208e-07, "loss": 0.79987442, "num_input_tokens_seen": 277063895, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35546875, "step": 12842, "time_per_iteration": 2.4024016857147217 }, { "auxiliary_loss_clip": 0.01054298, "auxiliary_loss_mlp": 0.01040981, "balance_loss_clip": 1.01781881, "balance_loss_mlp": 1.01725483, "epoch": 0.7721629340147302, "flos": 26650355518080.0, "grad_norm": 1.908205047247588, "language_loss": 0.69685727, "learning_rate": 5.201468888013445e-07, "loss": 0.71781003, "num_input_tokens_seen": 277084045, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.37109375, "step": 12843, "time_per_iteration": 2.444014072418213 }, { "auxiliary_loss_clip": 0.01054906, "auxiliary_loss_mlp": 0.01037002, "balance_loss_clip": 1.01392317, "balance_loss_mlp": 1.01672316, "epoch": 0.7722230572673981, "flos": 21177789724800.0, "grad_norm": 2.9027927685910573, "language_loss": 0.75909221, "learning_rate": 5.198849307926465e-07, "loss": 0.7800113, "num_input_tokens_seen": 277102625, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.3828125, "step": 12844, "time_per_iteration": 2.363124370574951 }, { "auxiliary_loss_clip": 0.01050981, "auxiliary_loss_mlp": 0.01035361, "balance_loss_clip": 1.01330721, "balance_loss_mlp": 1.01595199, "epoch": 0.7722831805200662, "flos": 27963404040960.0, "grad_norm": 1.5045451223593904, "language_loss": 0.72815681, "learning_rate": 5.196230289100596e-07, "loss": 0.74902022, "num_input_tokens_seen": 277123210, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34960938, "step": 12845, "time_per_iteration": 2.4450411796569824 }, { "auxiliary_loss_clip": 0.01049609, "auxiliary_loss_mlp": 0.01036109, "balance_loss_clip": 1.01503301, "balance_loss_mlp": 1.01552212, "epoch": 0.7723433037727341, "flos": 33874164708480.0, "grad_norm": 1.7363221210393276, "language_loss": 0.65487683, "learning_rate": 5.193611831635159e-07, "loss": 0.67573404, "num_input_tokens_seen": 277144895, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.33984375, "step": 12846, "time_per_iteration": 2.5328209400177 }, { "auxiliary_loss_clip": 0.01008172, "auxiliary_loss_mlp": 0.01004369, "balance_loss_clip": 1.00212765, "balance_loss_mlp": 1.00130725, "epoch": 0.7724034270254021, "flos": 62844951519360.0, "grad_norm": 0.7804982211816042, "language_loss": 0.6186744, "learning_rate": 5.19099393562945e-07, "loss": 0.63879979, "num_input_tokens_seen": 277205160, "router_z_loss_clip": 0.02246094, "router_z_loss_mlp": 0.06884766, "step": 12847, "time_per_iteration": 2.972306728363037 }, { "auxiliary_loss_clip": 0.0105099, "auxiliary_loss_mlp": 0.0103332, "balance_loss_clip": 1.01093304, "balance_loss_mlp": 1.01496732, "epoch": 0.77246355027807, "flos": 23294329344000.0, "grad_norm": 1.7862877024684414, "language_loss": 0.80250037, "learning_rate": 5.188376601182732e-07, "loss": 0.82334346, "num_input_tokens_seen": 277223005, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.359375, "step": 12848, "time_per_iteration": 3.8425590991973877 }, { "auxiliary_loss_clip": 0.01053662, "auxiliary_loss_mlp": 0.01041948, "balance_loss_clip": 1.0188334, "balance_loss_mlp": 1.01611149, "epoch": 0.772523673530738, "flos": 20120235598080.0, "grad_norm": 1.7261201562374693, "language_loss": 0.74056941, "learning_rate": 5.185759828394261e-07, "loss": 0.76152551, "num_input_tokens_seen": 277241785, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.375, "step": 12849, "time_per_iteration": 2.3561737537384033 }, { "auxiliary_loss_clip": 0.0105089, "auxiliary_loss_mlp": 0.01032016, "balance_loss_clip": 1.00927091, "balance_loss_mlp": 1.01519334, "epoch": 0.7725837967834059, "flos": 17819180110080.0, "grad_norm": 1.8588748537990467, "language_loss": 0.79715395, "learning_rate": 5.183143617363261e-07, "loss": 0.81798303, "num_input_tokens_seen": 277259050, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35742188, "step": 12850, "time_per_iteration": 2.3530564308166504 }, { "auxiliary_loss_clip": 0.0105368, "auxiliary_loss_mlp": 0.01040719, "balance_loss_clip": 1.01662707, "balance_loss_mlp": 1.01634538, "epoch": 0.772643920036074, "flos": 27197688902400.0, "grad_norm": 1.6952698406929767, "language_loss": 0.80780017, "learning_rate": 5.180527968188935e-07, "loss": 0.82874417, "num_input_tokens_seen": 277278235, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.37304688, "step": 12851, "time_per_iteration": 2.4131436347961426 }, { "auxiliary_loss_clip": 0.01051516, "auxiliary_loss_mlp": 0.01043336, "balance_loss_clip": 1.0198046, "balance_loss_mlp": 1.01619339, "epoch": 0.7727040432887419, "flos": 21578453020800.0, "grad_norm": 1.7032113347659417, "language_loss": 0.75116277, "learning_rate": 5.177912880970474e-07, "loss": 0.7721113, "num_input_tokens_seen": 277298355, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.35351562, "step": 12852, "time_per_iteration": 2.3662424087524414 }, { "auxiliary_loss_clip": 0.01050599, "auxiliary_loss_mlp": 0.01041933, "balance_loss_clip": 1.01948595, "balance_loss_mlp": 1.01495886, "epoch": 0.7727641665414099, "flos": 22235553319680.0, "grad_norm": 1.698800758306886, "language_loss": 0.82873321, "learning_rate": 5.17529835580704e-07, "loss": 0.84965861, "num_input_tokens_seen": 277316095, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35546875, "step": 12853, "time_per_iteration": 2.3861634731292725 }, { "auxiliary_loss_clip": 0.01007661, "auxiliary_loss_mlp": 0.01001397, "balance_loss_clip": 0.99915552, "balance_loss_mlp": 1.00070858, "epoch": 0.7728242897940779, "flos": 54828822867840.0, "grad_norm": 0.8933027017113838, "language_loss": 0.54607069, "learning_rate": 5.172684392797786e-07, "loss": 0.56616122, "num_input_tokens_seen": 277380130, "router_z_loss_clip": 0.02246094, "router_z_loss_mlp": 0.06933594, "step": 12854, "time_per_iteration": 3.0723183155059814 }, { "auxiliary_loss_clip": 0.0105312, "auxiliary_loss_mlp": 0.01036372, "balance_loss_clip": 1.01239955, "balance_loss_mlp": 1.01586354, "epoch": 0.7728844130467458, "flos": 34460461036800.0, "grad_norm": 1.564832345684066, "language_loss": 0.7306686, "learning_rate": 5.170070992041826e-07, "loss": 0.75156355, "num_input_tokens_seen": 277404015, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37304688, "step": 12855, "time_per_iteration": 2.4712162017822266 }, { "auxiliary_loss_clip": 0.01051935, "auxiliary_loss_mlp": 0.01037957, "balance_loss_clip": 1.01359081, "balance_loss_mlp": 1.01620448, "epoch": 0.7729445362994138, "flos": 18915348067200.0, "grad_norm": 1.6112444619286324, "language_loss": 0.69203258, "learning_rate": 5.167458153638254e-07, "loss": 0.71293151, "num_input_tokens_seen": 277421375, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.35742188, "step": 12856, "time_per_iteration": 2.3786211013793945 }, { "auxiliary_loss_clip": 0.0105293, "auxiliary_loss_mlp": 0.01040752, "balance_loss_clip": 1.01692176, "balance_loss_mlp": 1.01620889, "epoch": 0.7730046595520818, "flos": 22198964348160.0, "grad_norm": 1.7862089198120918, "language_loss": 0.80312634, "learning_rate": 5.164845877686162e-07, "loss": 0.82406312, "num_input_tokens_seen": 277440170, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3671875, "step": 12857, "time_per_iteration": 2.369840145111084 }, { "auxiliary_loss_clip": 0.01052001, "auxiliary_loss_mlp": 0.0103713, "balance_loss_clip": 1.01228678, "balance_loss_mlp": 1.01559269, "epoch": 0.7730647828047498, "flos": 13551501847680.0, "grad_norm": 1.7513197449145859, "language_loss": 0.79368901, "learning_rate": 5.162234164284591e-07, "loss": 0.81458032, "num_input_tokens_seen": 277456880, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36328125, "step": 12858, "time_per_iteration": 2.317845344543457 }, { "auxiliary_loss_clip": 0.01050231, "auxiliary_loss_mlp": 0.0103081, "balance_loss_clip": 1.00836277, "balance_loss_mlp": 1.01442778, "epoch": 0.7731249060574177, "flos": 21975101510400.0, "grad_norm": 2.3901923447547655, "language_loss": 0.78583872, "learning_rate": 5.159623013532591e-07, "loss": 0.80664909, "num_input_tokens_seen": 277475365, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.359375, "step": 12859, "time_per_iteration": 2.3920135498046875 }, { "auxiliary_loss_clip": 0.01048989, "auxiliary_loss_mlp": 0.01033701, "balance_loss_clip": 1.01375723, "balance_loss_mlp": 1.01603341, "epoch": 0.7731850293100857, "flos": 22600709896320.0, "grad_norm": 1.416694556896704, "language_loss": 0.6877768, "learning_rate": 5.157012425529186e-07, "loss": 0.70860374, "num_input_tokens_seen": 277494975, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.33007812, "step": 12860, "time_per_iteration": 2.3788037300109863 }, { "auxiliary_loss_clip": 0.01053428, "auxiliary_loss_mlp": 0.01039647, "balance_loss_clip": 1.0149473, "balance_loss_mlp": 1.01617646, "epoch": 0.7732451525627536, "flos": 14097613334400.0, "grad_norm": 2.3162201134554885, "language_loss": 0.77072227, "learning_rate": 5.154402400373343e-07, "loss": 0.79165304, "num_input_tokens_seen": 277510520, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.37304688, "step": 12861, "time_per_iteration": 2.3552823066711426 }, { "auxiliary_loss_clip": 0.01053551, "auxiliary_loss_mlp": 0.01036335, "balance_loss_clip": 1.01285052, "balance_loss_mlp": 1.01643538, "epoch": 0.7733052758154216, "flos": 21468965397120.0, "grad_norm": 1.4950012925137461, "language_loss": 0.7585361, "learning_rate": 5.15179293816405e-07, "loss": 0.77943498, "num_input_tokens_seen": 277530505, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.37109375, "step": 12862, "time_per_iteration": 2.4103405475616455 }, { "auxiliary_loss_clip": 0.01050814, "auxiliary_loss_mlp": 0.01038001, "balance_loss_clip": 1.01789093, "balance_loss_mlp": 1.01627231, "epoch": 0.7733653990680895, "flos": 21393308747520.0, "grad_norm": 1.851449653157231, "language_loss": 0.83214897, "learning_rate": 5.149184039000256e-07, "loss": 0.85303712, "num_input_tokens_seen": 277550810, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.34570312, "step": 12863, "time_per_iteration": 2.383023738861084 }, { "auxiliary_loss_clip": 0.01050595, "auxiliary_loss_mlp": 0.01038791, "balance_loss_clip": 1.01618934, "balance_loss_mlp": 1.01570368, "epoch": 0.7734255223207576, "flos": 17675093462400.0, "grad_norm": 1.7240945722970054, "language_loss": 0.74317873, "learning_rate": 5.146575702980898e-07, "loss": 0.76407254, "num_input_tokens_seen": 277567680, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34765625, "step": 12864, "time_per_iteration": 2.3390071392059326 }, { "auxiliary_loss_clip": 0.01050948, "auxiliary_loss_mlp": 0.01037076, "balance_loss_clip": 1.01510549, "balance_loss_mlp": 1.01517391, "epoch": 0.7734856455734255, "flos": 25229599850880.0, "grad_norm": 2.0591636920830747, "language_loss": 0.82701981, "learning_rate": 5.143967930204871e-07, "loss": 0.84790003, "num_input_tokens_seen": 277588970, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35742188, "step": 12865, "time_per_iteration": 2.4269559383392334 }, { "auxiliary_loss_clip": 0.01055531, "auxiliary_loss_mlp": 0.01045903, "balance_loss_clip": 1.01928377, "balance_loss_mlp": 1.01718795, "epoch": 0.7735457688260935, "flos": 23432201769600.0, "grad_norm": 2.2329713907406044, "language_loss": 0.73490822, "learning_rate": 5.141360720771077e-07, "loss": 0.75592256, "num_input_tokens_seen": 277605450, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.3828125, "step": 12866, "time_per_iteration": 2.376915454864502 }, { "auxiliary_loss_clip": 0.01054194, "auxiliary_loss_mlp": 0.01040096, "balance_loss_clip": 1.01439476, "balance_loss_mlp": 1.01681185, "epoch": 0.7736058920787615, "flos": 18728388403200.0, "grad_norm": 2.682029344651614, "language_loss": 0.66259706, "learning_rate": 5.138754074778371e-07, "loss": 0.68353999, "num_input_tokens_seen": 277622530, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37304688, "step": 12867, "time_per_iteration": 2.3612287044525146 }, { "auxiliary_loss_clip": 0.01051564, "auxiliary_loss_mlp": 0.01044474, "balance_loss_clip": 1.0200603, "balance_loss_mlp": 1.01567614, "epoch": 0.7736660153314294, "flos": 22892199770880.0, "grad_norm": 1.4504178332617506, "language_loss": 0.7155931, "learning_rate": 5.136147992325595e-07, "loss": 0.73655343, "num_input_tokens_seen": 277642700, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.359375, "step": 12868, "time_per_iteration": 3.6806933879852295 }, { "auxiliary_loss_clip": 0.01053692, "auxiliary_loss_mlp": 0.01040881, "balance_loss_clip": 1.01620507, "balance_loss_mlp": 1.01662111, "epoch": 0.7737261385840974, "flos": 13800258351360.0, "grad_norm": 2.053678107914755, "language_loss": 0.7815389, "learning_rate": 5.133542473511578e-07, "loss": 0.80248463, "num_input_tokens_seen": 277660005, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37109375, "step": 12869, "time_per_iteration": 2.351701498031616 }, { "auxiliary_loss_clip": 0.01050009, "auxiliary_loss_mlp": 0.01033536, "balance_loss_clip": 1.01185179, "balance_loss_mlp": 1.01527619, "epoch": 0.7737862618367654, "flos": 28729468293120.0, "grad_norm": 1.5966029996509639, "language_loss": 0.75044596, "learning_rate": 5.130937518435124e-07, "loss": 0.77128136, "num_input_tokens_seen": 277682890, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34765625, "step": 12870, "time_per_iteration": 2.425358533859253 }, { "auxiliary_loss_clip": 0.01052687, "auxiliary_loss_mlp": 0.01037035, "balance_loss_clip": 1.01413512, "balance_loss_mlp": 1.01629448, "epoch": 0.7738463850894334, "flos": 17017644049920.0, "grad_norm": 2.430606237463681, "language_loss": 0.75929749, "learning_rate": 5.12833312719501e-07, "loss": 0.78019476, "num_input_tokens_seen": 277699330, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.36328125, "step": 12871, "time_per_iteration": 2.357271671295166 }, { "auxiliary_loss_clip": 0.01052032, "auxiliary_loss_mlp": 0.01038181, "balance_loss_clip": 1.01538849, "balance_loss_mlp": 1.0160737, "epoch": 0.7739065083421013, "flos": 20702970967680.0, "grad_norm": 1.7179228867822032, "language_loss": 0.69838125, "learning_rate": 5.12572929988999e-07, "loss": 0.7192834, "num_input_tokens_seen": 277718750, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.359375, "step": 12872, "time_per_iteration": 2.3658108711242676 }, { "auxiliary_loss_clip": 0.01051548, "auxiliary_loss_mlp": 0.01039732, "balance_loss_clip": 1.01516342, "balance_loss_mlp": 1.01551116, "epoch": 0.7739666315947693, "flos": 20696372720640.0, "grad_norm": 2.38281755833363, "language_loss": 0.85829043, "learning_rate": 5.123126036618804e-07, "loss": 0.8792032, "num_input_tokens_seen": 277734645, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.359375, "step": 12873, "time_per_iteration": 3.778946876525879 }, { "auxiliary_loss_clip": 0.01052889, "auxiliary_loss_mlp": 0.01039862, "balance_loss_clip": 1.01612771, "balance_loss_mlp": 1.01628578, "epoch": 0.7740267548474372, "flos": 29569373804160.0, "grad_norm": 2.601235743130282, "language_loss": 0.6711762, "learning_rate": 5.120523337480174e-07, "loss": 0.69210368, "num_input_tokens_seen": 277755535, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.3671875, "step": 12874, "time_per_iteration": 3.856391668319702 }, { "auxiliary_loss_clip": 0.01051127, "auxiliary_loss_mlp": 0.01035122, "balance_loss_clip": 1.01334202, "balance_loss_mlp": 1.01653934, "epoch": 0.7740868781001052, "flos": 23657984732160.0, "grad_norm": 1.6729804350386472, "language_loss": 0.63168305, "learning_rate": 5.117921202572785e-07, "loss": 0.65254551, "num_input_tokens_seen": 277775585, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.34570312, "step": 12875, "time_per_iteration": 2.424463987350464 }, { "auxiliary_loss_clip": 0.01052858, "auxiliary_loss_mlp": 0.01036744, "balance_loss_clip": 1.01404631, "balance_loss_mlp": 1.0168556, "epoch": 0.7741470013527731, "flos": 24716167263360.0, "grad_norm": 1.8044391197565974, "language_loss": 0.66467619, "learning_rate": 5.115319631995318e-07, "loss": 0.68557221, "num_input_tokens_seen": 277794795, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.359375, "step": 12876, "time_per_iteration": 2.440645694732666 }, { "auxiliary_loss_clip": 0.01049717, "auxiliary_loss_mlp": 0.01036042, "balance_loss_clip": 1.01380932, "balance_loss_mlp": 1.01551795, "epoch": 0.7742071246054412, "flos": 21870571299840.0, "grad_norm": 2.6720451575143946, "language_loss": 0.72567046, "learning_rate": 5.112718625846433e-07, "loss": 0.74652803, "num_input_tokens_seen": 277813235, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34179688, "step": 12877, "time_per_iteration": 2.3866336345672607 }, { "auxiliary_loss_clip": 0.01052598, "auxiliary_loss_mlp": 0.01039998, "balance_loss_clip": 1.0155127, "balance_loss_mlp": 1.01602983, "epoch": 0.7742672478581091, "flos": 22673154700800.0, "grad_norm": 1.7526462561663059, "language_loss": 0.84164, "learning_rate": 5.110118184224736e-07, "loss": 0.86256593, "num_input_tokens_seen": 277832560, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.3671875, "step": 12878, "time_per_iteration": 2.371530055999756 }, { "auxiliary_loss_clip": 0.01054018, "auxiliary_loss_mlp": 0.01042084, "balance_loss_clip": 1.01676428, "balance_loss_mlp": 1.01649809, "epoch": 0.7743273711107771, "flos": 18839970708480.0, "grad_norm": 1.7068660672880127, "language_loss": 0.74375457, "learning_rate": 5.10751830722885e-07, "loss": 0.76471555, "num_input_tokens_seen": 277850120, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.375, "step": 12879, "time_per_iteration": 2.3368301391601562 }, { "auxiliary_loss_clip": 0.01049731, "auxiliary_loss_mlp": 0.01034816, "balance_loss_clip": 1.01338243, "balance_loss_mlp": 1.0155251, "epoch": 0.7743874943634451, "flos": 28728106750080.0, "grad_norm": 1.5812827137235541, "language_loss": 0.8054828, "learning_rate": 5.104918994957364e-07, "loss": 0.82632828, "num_input_tokens_seen": 277871020, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34179688, "step": 12880, "time_per_iteration": 2.444201707839966 }, { "auxiliary_loss_clip": 0.01052826, "auxiliary_loss_mlp": 0.0103688, "balance_loss_clip": 1.01253724, "balance_loss_mlp": 1.01735306, "epoch": 0.774447617616113, "flos": 21908521814400.0, "grad_norm": 1.4484094464069734, "language_loss": 0.7172482, "learning_rate": 5.102320247508847e-07, "loss": 0.73814523, "num_input_tokens_seen": 277891525, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.35546875, "step": 12881, "time_per_iteration": 2.3950397968292236 }, { "auxiliary_loss_clip": 0.01054341, "auxiliary_loss_mlp": 0.01047917, "balance_loss_clip": 1.02005744, "balance_loss_mlp": 1.01693726, "epoch": 0.774507740868781, "flos": 19499619536640.0, "grad_norm": 1.817782284184215, "language_loss": 0.85813612, "learning_rate": 5.099722064981832e-07, "loss": 0.87915874, "num_input_tokens_seen": 277910425, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.375, "step": 12882, "time_per_iteration": 2.382051944732666 }, { "auxiliary_loss_clip": 0.01007603, "auxiliary_loss_mlp": 0.01004043, "balance_loss_clip": 1.00155175, "balance_loss_mlp": 1.00086057, "epoch": 0.774567864121449, "flos": 59423113699200.0, "grad_norm": 0.7666922445536866, "language_loss": 0.60586309, "learning_rate": 5.097124447474858e-07, "loss": 0.62597954, "num_input_tokens_seen": 277972795, "router_z_loss_clip": 0.02490234, "router_z_loss_mlp": 0.06738281, "step": 12883, "time_per_iteration": 2.9852066040039062 }, { "auxiliary_loss_clip": 0.01054172, "auxiliary_loss_mlp": 0.01040526, "balance_loss_clip": 1.01513469, "balance_loss_mlp": 1.01699328, "epoch": 0.774627987374117, "flos": 13224470342400.0, "grad_norm": 1.8923077093120344, "language_loss": 0.74163908, "learning_rate": 5.094527395086416e-07, "loss": 0.762586, "num_input_tokens_seen": 277990675, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37109375, "step": 12884, "time_per_iteration": 2.365237236022949 }, { "auxiliary_loss_clip": 0.01050848, "auxiliary_loss_mlp": 0.01032748, "balance_loss_clip": 1.01186299, "balance_loss_mlp": 1.01616085, "epoch": 0.7746881106267849, "flos": 21393064368000.0, "grad_norm": 1.5608728541543393, "language_loss": 0.81751239, "learning_rate": 5.091930907914986e-07, "loss": 0.83834827, "num_input_tokens_seen": 278010050, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34765625, "step": 12885, "time_per_iteration": 2.3604304790496826 }, { "auxiliary_loss_clip": 0.01051075, "auxiliary_loss_mlp": 0.01034004, "balance_loss_clip": 1.01214075, "balance_loss_mlp": 1.0157392, "epoch": 0.7747482338794529, "flos": 25628168465280.0, "grad_norm": 1.8140905135789172, "language_loss": 0.65662062, "learning_rate": 5.089334986059029e-07, "loss": 0.67747134, "num_input_tokens_seen": 278030660, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.35351562, "step": 12886, "time_per_iteration": 2.420694351196289 }, { "auxiliary_loss_clip": 0.01051683, "auxiliary_loss_mlp": 0.01034469, "balance_loss_clip": 1.01363075, "balance_loss_mlp": 1.0159514, "epoch": 0.7748083571321208, "flos": 11546125597440.0, "grad_norm": 1.8134728979125818, "language_loss": 0.7076689, "learning_rate": 5.086739629616987e-07, "loss": 0.72853041, "num_input_tokens_seen": 278047645, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.359375, "step": 12887, "time_per_iteration": 3.81010103225708 }, { "auxiliary_loss_clip": 0.01051121, "auxiliary_loss_mlp": 0.01033623, "balance_loss_clip": 1.01273751, "balance_loss_mlp": 1.01646733, "epoch": 0.7748684803847888, "flos": 19061424662400.0, "grad_norm": 1.7831589608994476, "language_loss": 0.71947134, "learning_rate": 5.084144838687275e-07, "loss": 0.74031883, "num_input_tokens_seen": 278066170, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34570312, "step": 12888, "time_per_iteration": 2.361963987350464 }, { "auxiliary_loss_clip": 0.01053148, "auxiliary_loss_mlp": 0.01039045, "balance_loss_clip": 1.0142616, "balance_loss_mlp": 1.0160532, "epoch": 0.7749286036374567, "flos": 22272072468480.0, "grad_norm": 1.5906132683902054, "language_loss": 0.8264569, "learning_rate": 5.081550613368279e-07, "loss": 0.84737885, "num_input_tokens_seen": 278085545, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37109375, "step": 12889, "time_per_iteration": 2.3900370597839355 }, { "auxiliary_loss_clip": 0.01053011, "auxiliary_loss_mlp": 0.01037859, "balance_loss_clip": 1.01475644, "balance_loss_mlp": 1.01736975, "epoch": 0.7749887268901248, "flos": 20191458504960.0, "grad_norm": 1.9042650216199808, "language_loss": 0.8053987, "learning_rate": 5.07895695375838e-07, "loss": 0.82630742, "num_input_tokens_seen": 278102995, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35546875, "step": 12890, "time_per_iteration": 2.3576319217681885 }, { "auxiliary_loss_clip": 0.01055421, "auxiliary_loss_mlp": 0.01044779, "balance_loss_clip": 1.02068639, "balance_loss_mlp": 1.01797342, "epoch": 0.7750488501427927, "flos": 20336557582080.0, "grad_norm": 1.991701789202154, "language_loss": 0.6786406, "learning_rate": 5.076363859955932e-07, "loss": 0.69964254, "num_input_tokens_seen": 278121460, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.375, "step": 12891, "time_per_iteration": 2.3536574840545654 }, { "auxiliary_loss_clip": 0.01050777, "auxiliary_loss_mlp": 0.01034692, "balance_loss_clip": 1.01260304, "balance_loss_mlp": 1.01577067, "epoch": 0.7751089733954607, "flos": 28362845439360.0, "grad_norm": 1.3634390965247747, "language_loss": 0.79309654, "learning_rate": 5.073771332059257e-07, "loss": 0.81395125, "num_input_tokens_seen": 278143905, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34960938, "step": 12892, "time_per_iteration": 2.467600107192993 }, { "auxiliary_loss_clip": 0.01054083, "auxiliary_loss_mlp": 0.01041326, "balance_loss_clip": 1.0178653, "balance_loss_mlp": 1.01681471, "epoch": 0.7751690966481286, "flos": 16942930007040.0, "grad_norm": 2.366343943947146, "language_loss": 0.69445324, "learning_rate": 5.071179370166669e-07, "loss": 0.71540725, "num_input_tokens_seen": 278160850, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.37304688, "step": 12893, "time_per_iteration": 2.342428207397461 }, { "auxiliary_loss_clip": 0.01007667, "auxiliary_loss_mlp": 0.01003193, "balance_loss_clip": 1.00082064, "balance_loss_mlp": 1.00093341, "epoch": 0.7752292199007966, "flos": 65664362096640.0, "grad_norm": 0.8042684634737156, "language_loss": 0.58592463, "learning_rate": 5.068587974376468e-07, "loss": 0.60603321, "num_input_tokens_seen": 278219950, "router_z_loss_clip": 0.02368164, "router_z_loss_mlp": 0.06738281, "step": 12894, "time_per_iteration": 3.0615978240966797 }, { "auxiliary_loss_clip": 0.01052932, "auxiliary_loss_mlp": 0.01039562, "balance_loss_clip": 1.01438546, "balance_loss_mlp": 1.01647782, "epoch": 0.7752893431534646, "flos": 20593622989440.0, "grad_norm": 2.037849871611893, "language_loss": 0.79056108, "learning_rate": 5.065997144786895e-07, "loss": 0.81148601, "num_input_tokens_seen": 278237805, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36523438, "step": 12895, "time_per_iteration": 2.356987237930298 }, { "auxiliary_loss_clip": 0.01053698, "auxiliary_loss_mlp": 0.01041866, "balance_loss_clip": 1.01686835, "balance_loss_mlp": 1.01726604, "epoch": 0.7753494664061326, "flos": 20484309922560.0, "grad_norm": 1.7329966931794394, "language_loss": 0.68774021, "learning_rate": 5.063406881496209e-07, "loss": 0.70869589, "num_input_tokens_seen": 278257660, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36328125, "step": 12896, "time_per_iteration": 2.3843705654144287 }, { "auxiliary_loss_clip": 0.01052034, "auxiliary_loss_mlp": 0.01038089, "balance_loss_clip": 1.01598787, "balance_loss_mlp": 1.01599514, "epoch": 0.7754095896588006, "flos": 20264880827520.0, "grad_norm": 2.090730312411665, "language_loss": 0.70021105, "learning_rate": 5.060817184602629e-07, "loss": 0.72111237, "num_input_tokens_seen": 278275110, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.359375, "step": 12897, "time_per_iteration": 2.361423969268799 }, { "auxiliary_loss_clip": 0.01054072, "auxiliary_loss_mlp": 0.01042475, "balance_loss_clip": 1.01558101, "balance_loss_mlp": 1.01711845, "epoch": 0.7754697129114685, "flos": 23329975708800.0, "grad_norm": 1.8815937487118626, "language_loss": 0.7603507, "learning_rate": 5.058228054204364e-07, "loss": 0.78131616, "num_input_tokens_seen": 278293035, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.36914062, "step": 12898, "time_per_iteration": 2.4217164516448975 }, { "auxiliary_loss_clip": 0.01052372, "auxiliary_loss_mlp": 0.01035063, "balance_loss_clip": 1.00960016, "balance_loss_mlp": 1.01645589, "epoch": 0.7755298361641365, "flos": 17346665502720.0, "grad_norm": 2.31457495365626, "language_loss": 0.71052235, "learning_rate": 5.055639490399588e-07, "loss": 0.73139668, "num_input_tokens_seen": 278311010, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.359375, "step": 12899, "time_per_iteration": 2.316455125808716 }, { "auxiliary_loss_clip": 0.0105176, "auxiliary_loss_mlp": 0.01038751, "balance_loss_clip": 1.01448011, "balance_loss_mlp": 1.01614618, "epoch": 0.7755899594168044, "flos": 19644858259200.0, "grad_norm": 2.0723310012022007, "language_loss": 0.76098967, "learning_rate": 5.053051493286453e-07, "loss": 0.7818948, "num_input_tokens_seen": 278329900, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.35546875, "step": 12900, "time_per_iteration": 2.364990234375 }, { "auxiliary_loss_clip": 0.01051247, "auxiliary_loss_mlp": 0.01044344, "balance_loss_clip": 1.02301741, "balance_loss_mlp": 1.01602983, "epoch": 0.7756500826694724, "flos": 27413312659200.0, "grad_norm": 1.699091932561673, "language_loss": 0.78321576, "learning_rate": 5.050464062963113e-07, "loss": 0.80417168, "num_input_tokens_seen": 278349980, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.3515625, "step": 12901, "time_per_iteration": 2.4199023246765137 }, { "auxiliary_loss_clip": 0.01052656, "auxiliary_loss_mlp": 0.01037228, "balance_loss_clip": 1.01516247, "balance_loss_mlp": 1.01682055, "epoch": 0.7757102059221404, "flos": 28729258824960.0, "grad_norm": 2.1326453517712767, "language_loss": 0.77868909, "learning_rate": 5.047877199527666e-07, "loss": 0.79958791, "num_input_tokens_seen": 278372485, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.359375, "step": 12902, "time_per_iteration": 2.418944835662842 }, { "auxiliary_loss_clip": 0.01051899, "auxiliary_loss_mlp": 0.01037938, "balance_loss_clip": 1.01537168, "balance_loss_mlp": 1.01650405, "epoch": 0.7757703291748084, "flos": 22485845923200.0, "grad_norm": 1.7511137470427962, "language_loss": 0.74280632, "learning_rate": 5.045290903078215e-07, "loss": 0.76370466, "num_input_tokens_seen": 278391660, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35351562, "step": 12903, "time_per_iteration": 2.381988763809204 }, { "auxiliary_loss_clip": 0.01051066, "auxiliary_loss_mlp": 0.01034016, "balance_loss_clip": 1.01103234, "balance_loss_mlp": 1.01593673, "epoch": 0.7758304524274763, "flos": 21429199491840.0, "grad_norm": 1.968568562417605, "language_loss": 0.77174616, "learning_rate": 5.042705173712835e-07, "loss": 0.79259706, "num_input_tokens_seen": 278409125, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.3515625, "step": 12904, "time_per_iteration": 2.3860630989074707 }, { "auxiliary_loss_clip": 0.01048592, "auxiliary_loss_mlp": 0.01032029, "balance_loss_clip": 1.0112505, "balance_loss_mlp": 1.01469374, "epoch": 0.7758905756801443, "flos": 23658124377600.0, "grad_norm": 2.0397343292278527, "language_loss": 0.69438595, "learning_rate": 5.040120011529576e-07, "loss": 0.71519214, "num_input_tokens_seen": 278429450, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.33984375, "step": 12905, "time_per_iteration": 2.377030611038208 }, { "auxiliary_loss_clip": 0.01050525, "auxiliary_loss_mlp": 0.01033008, "balance_loss_clip": 1.01116836, "balance_loss_mlp": 1.01572418, "epoch": 0.7759506989328122, "flos": 28364241893760.0, "grad_norm": 1.580558261177534, "language_loss": 0.68506843, "learning_rate": 5.037535416626459e-07, "loss": 0.70590377, "num_input_tokens_seen": 278449925, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34765625, "step": 12906, "time_per_iteration": 2.454740047454834 }, { "auxiliary_loss_clip": 0.01052335, "auxiliary_loss_mlp": 0.01040902, "balance_loss_clip": 1.01759624, "balance_loss_mlp": 1.01627171, "epoch": 0.7760108221854802, "flos": 14901907392000.0, "grad_norm": 1.8837795102591517, "language_loss": 0.82568949, "learning_rate": 5.034951389101498e-07, "loss": 0.84662181, "num_input_tokens_seen": 278467255, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.359375, "step": 12907, "time_per_iteration": 3.640040397644043 }, { "auxiliary_loss_clip": 0.01050686, "auxiliary_loss_mlp": 0.01038694, "balance_loss_clip": 1.01698637, "balance_loss_mlp": 1.01687217, "epoch": 0.7760709454381483, "flos": 14791651718400.0, "grad_norm": 2.2882048483231068, "language_loss": 0.68491274, "learning_rate": 5.032367929052685e-07, "loss": 0.70580661, "num_input_tokens_seen": 278484250, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.33789062, "step": 12908, "time_per_iteration": 2.3452656269073486 }, { "auxiliary_loss_clip": 0.01054359, "auxiliary_loss_mlp": 0.01041312, "balance_loss_clip": 1.016505, "balance_loss_mlp": 1.01830888, "epoch": 0.7761310686908162, "flos": 17378995288320.0, "grad_norm": 1.760629757446218, "language_loss": 0.71051896, "learning_rate": 5.029785036577976e-07, "loss": 0.73147571, "num_input_tokens_seen": 278502740, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36132812, "step": 12909, "time_per_iteration": 2.3647639751434326 }, { "auxiliary_loss_clip": 0.01050609, "auxiliary_loss_mlp": 0.01037108, "balance_loss_clip": 1.01477981, "balance_loss_mlp": 1.0159049, "epoch": 0.7761911919434842, "flos": 25555374547200.0, "grad_norm": 1.585784995960741, "language_loss": 0.68751609, "learning_rate": 5.027202711775324e-07, "loss": 0.70839328, "num_input_tokens_seen": 278523890, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34765625, "step": 12910, "time_per_iteration": 2.438021421432495 }, { "auxiliary_loss_clip": 0.0105327, "auxiliary_loss_mlp": 0.01044847, "balance_loss_clip": 1.02063596, "balance_loss_mlp": 1.01724195, "epoch": 0.7762513151961521, "flos": 23178802055040.0, "grad_norm": 1.6899973124752887, "language_loss": 0.72882444, "learning_rate": 5.024620954742646e-07, "loss": 0.74980563, "num_input_tokens_seen": 278543185, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36132812, "step": 12911, "time_per_iteration": 2.371218681335449 }, { "auxiliary_loss_clip": 0.01054078, "auxiliary_loss_mlp": 0.01041113, "balance_loss_clip": 1.01570928, "balance_loss_mlp": 1.0172981, "epoch": 0.7763114384488201, "flos": 21688534137600.0, "grad_norm": 2.640333321389839, "language_loss": 0.64535403, "learning_rate": 5.022039765577836e-07, "loss": 0.66630596, "num_input_tokens_seen": 278559220, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3671875, "step": 12912, "time_per_iteration": 2.3640151023864746 }, { "auxiliary_loss_clip": 0.0100786, "auxiliary_loss_mlp": 0.0100178, "balance_loss_clip": 0.99945509, "balance_loss_mlp": 1.00119972, "epoch": 0.776371561701488, "flos": 69021749813760.0, "grad_norm": 0.7791379205014731, "language_loss": 0.53334588, "learning_rate": 5.019459144378779e-07, "loss": 0.5534423, "num_input_tokens_seen": 278618185, "router_z_loss_clip": 0.02319336, "router_z_loss_mlp": 0.06640625, "step": 12913, "time_per_iteration": 4.469132661819458 }, { "auxiliary_loss_clip": 0.01052144, "auxiliary_loss_mlp": 0.01038844, "balance_loss_clip": 1.01429844, "balance_loss_mlp": 1.01637781, "epoch": 0.776431684954156, "flos": 22892793264000.0, "grad_norm": 1.669233211173606, "language_loss": 0.63244879, "learning_rate": 5.016879091243338e-07, "loss": 0.6533587, "num_input_tokens_seen": 278636210, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.359375, "step": 12914, "time_per_iteration": 3.8122642040252686 }, { "auxiliary_loss_clip": 0.01051852, "auxiliary_loss_mlp": 0.01035959, "balance_loss_clip": 1.01218832, "balance_loss_mlp": 1.01609564, "epoch": 0.776491808206824, "flos": 20260656552960.0, "grad_norm": 1.9886137369188122, "language_loss": 0.83642626, "learning_rate": 5.014299606269339e-07, "loss": 0.85730439, "num_input_tokens_seen": 278653305, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.35742188, "step": 12915, "time_per_iteration": 2.350306749343872 }, { "auxiliary_loss_clip": 0.01054006, "auxiliary_loss_mlp": 0.01044829, "balance_loss_clip": 1.02065325, "balance_loss_mlp": 1.01657176, "epoch": 0.776551931459492, "flos": 26757888105600.0, "grad_norm": 1.7303296645467678, "language_loss": 0.76195043, "learning_rate": 5.011720689554603e-07, "loss": 0.78293884, "num_input_tokens_seen": 278671850, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.375, "step": 12916, "time_per_iteration": 2.402024984359741 }, { "auxiliary_loss_clip": 0.01052286, "auxiliary_loss_mlp": 0.0103731, "balance_loss_clip": 1.01443398, "balance_loss_mlp": 1.01588154, "epoch": 0.7766120547121599, "flos": 52663162965120.0, "grad_norm": 1.4543409344000637, "language_loss": 0.6622197, "learning_rate": 5.009142341196919e-07, "loss": 0.68311566, "num_input_tokens_seen": 278697860, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36328125, "step": 12917, "time_per_iteration": 2.6602914333343506 }, { "auxiliary_loss_clip": 0.01051156, "auxiliary_loss_mlp": 0.01034665, "balance_loss_clip": 1.01231313, "balance_loss_mlp": 1.01488924, "epoch": 0.7766721779648279, "flos": 25155025453440.0, "grad_norm": 1.5187151199136055, "language_loss": 0.65021104, "learning_rate": 5.006564561294065e-07, "loss": 0.6710692, "num_input_tokens_seen": 278720655, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.36328125, "step": 12918, "time_per_iteration": 2.422973394393921 }, { "auxiliary_loss_clip": 0.01051621, "auxiliary_loss_mlp": 0.0103312, "balance_loss_clip": 1.01236558, "balance_loss_mlp": 1.01653481, "epoch": 0.7767323012174958, "flos": 23759861679360.0, "grad_norm": 2.844342850974983, "language_loss": 0.74163324, "learning_rate": 5.003987349943777e-07, "loss": 0.76248062, "num_input_tokens_seen": 278737375, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.3515625, "step": 12919, "time_per_iteration": 2.4093081951141357 }, { "auxiliary_loss_clip": 0.01052617, "auxiliary_loss_mlp": 0.0103744, "balance_loss_clip": 1.01180971, "balance_loss_mlp": 1.01588702, "epoch": 0.7767924244701638, "flos": 22085671386240.0, "grad_norm": 2.1266588183221917, "language_loss": 0.80562222, "learning_rate": 5.001410707243792e-07, "loss": 0.82652283, "num_input_tokens_seen": 278756510, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3671875, "step": 12920, "time_per_iteration": 2.3687024116516113 }, { "auxiliary_loss_clip": 0.01053648, "auxiliary_loss_mlp": 0.01035972, "balance_loss_clip": 1.01328683, "balance_loss_mlp": 1.01756573, "epoch": 0.7768525477228319, "flos": 21980547682560.0, "grad_norm": 3.9206323607336686, "language_loss": 0.7127676, "learning_rate": 4.998834633291829e-07, "loss": 0.7336638, "num_input_tokens_seen": 278775410, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.36132812, "step": 12921, "time_per_iteration": 2.430094003677368 }, { "auxiliary_loss_clip": 0.01055958, "auxiliary_loss_mlp": 0.01038842, "balance_loss_clip": 1.01268721, "balance_loss_mlp": 1.01696181, "epoch": 0.7769126709754998, "flos": 21793622929920.0, "grad_norm": 1.8951623079927071, "language_loss": 0.77146804, "learning_rate": 4.996259128185547e-07, "loss": 0.79241604, "num_input_tokens_seen": 278794260, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.390625, "step": 12922, "time_per_iteration": 2.373117685317993 }, { "auxiliary_loss_clip": 0.01053727, "auxiliary_loss_mlp": 0.01035652, "balance_loss_clip": 1.01300287, "balance_loss_mlp": 1.01678216, "epoch": 0.7769727942281678, "flos": 20046952920960.0, "grad_norm": 1.6840534402370615, "language_loss": 0.82196158, "learning_rate": 4.993684192022625e-07, "loss": 0.84285533, "num_input_tokens_seen": 278813290, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36914062, "step": 12923, "time_per_iteration": 2.3952529430389404 }, { "auxiliary_loss_clip": 0.01053184, "auxiliary_loss_mlp": 0.0103595, "balance_loss_clip": 1.01399207, "balance_loss_mlp": 1.01694977, "epoch": 0.7770329174808357, "flos": 21685776140160.0, "grad_norm": 1.8730452819331227, "language_loss": 0.93310988, "learning_rate": 4.991109824900699e-07, "loss": 0.95400119, "num_input_tokens_seen": 278830610, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.36328125, "step": 12924, "time_per_iteration": 2.342617988586426 }, { "auxiliary_loss_clip": 0.01052523, "auxiliary_loss_mlp": 0.01038475, "balance_loss_clip": 1.01416862, "balance_loss_mlp": 1.01581335, "epoch": 0.7770930407335037, "flos": 25848051408000.0, "grad_norm": 2.1253082185761896, "language_loss": 0.67157412, "learning_rate": 4.988536026917401e-07, "loss": 0.69248402, "num_input_tokens_seen": 278849530, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.3671875, "step": 12925, "time_per_iteration": 2.421358346939087 }, { "auxiliary_loss_clip": 0.01053701, "auxiliary_loss_mlp": 0.01039002, "balance_loss_clip": 1.01560104, "balance_loss_mlp": 1.01760602, "epoch": 0.7771531639861716, "flos": 24346856234880.0, "grad_norm": 1.9325192512002993, "language_loss": 0.73424125, "learning_rate": 4.985962798170314e-07, "loss": 0.75516832, "num_input_tokens_seen": 278869005, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.359375, "step": 12926, "time_per_iteration": 2.3806920051574707 }, { "auxiliary_loss_clip": 0.01053831, "auxiliary_loss_mlp": 0.01041031, "balance_loss_clip": 1.01542532, "balance_loss_mlp": 1.01630688, "epoch": 0.7772132872388396, "flos": 25628761958400.0, "grad_norm": 1.7355182327894574, "language_loss": 0.66986299, "learning_rate": 4.983390138757027e-07, "loss": 0.69081163, "num_input_tokens_seen": 278888790, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.375, "step": 12927, "time_per_iteration": 3.848069190979004 }, { "auxiliary_loss_clip": 0.01053571, "auxiliary_loss_mlp": 0.01042368, "balance_loss_clip": 1.0141511, "balance_loss_mlp": 1.01646817, "epoch": 0.7772734104915076, "flos": 26066223694080.0, "grad_norm": 1.9326274239754369, "language_loss": 0.73113, "learning_rate": 4.980818048775093e-07, "loss": 0.75208932, "num_input_tokens_seen": 278908150, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.37109375, "step": 12928, "time_per_iteration": 2.455474615097046 }, { "auxiliary_loss_clip": 0.01050206, "auxiliary_loss_mlp": 0.01041367, "balance_loss_clip": 1.01832342, "balance_loss_mlp": 1.01493597, "epoch": 0.7773335337441756, "flos": 22924075708800.0, "grad_norm": 1.5824460832897658, "language_loss": 0.75124943, "learning_rate": 4.978246528322036e-07, "loss": 0.77216512, "num_input_tokens_seen": 278927425, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35351562, "step": 12929, "time_per_iteration": 2.3935930728912354 }, { "auxiliary_loss_clip": 0.01051711, "auxiliary_loss_mlp": 0.01038164, "balance_loss_clip": 1.01394033, "balance_loss_mlp": 1.0163157, "epoch": 0.7773936569968435, "flos": 20775729974400.0, "grad_norm": 1.9584208957755669, "language_loss": 0.78431565, "learning_rate": 4.975675577495377e-07, "loss": 0.80521441, "num_input_tokens_seen": 278946475, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.35351562, "step": 12930, "time_per_iteration": 2.3866143226623535 }, { "auxiliary_loss_clip": 0.0105302, "auxiliary_loss_mlp": 0.01040381, "balance_loss_clip": 1.01650357, "balance_loss_mlp": 1.01702523, "epoch": 0.7774537802495115, "flos": 20371331162880.0, "grad_norm": 2.0416206354074995, "language_loss": 0.80485678, "learning_rate": 4.973105196392613e-07, "loss": 0.82579082, "num_input_tokens_seen": 278964345, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.359375, "step": 12931, "time_per_iteration": 2.401240348815918 }, { "auxiliary_loss_clip": 0.01008899, "auxiliary_loss_mlp": 0.01003083, "balance_loss_clip": 1.00085366, "balance_loss_mlp": 1.00188041, "epoch": 0.7775139035021794, "flos": 53908931698560.0, "grad_norm": 0.8115363137729467, "language_loss": 0.59847128, "learning_rate": 4.970535385111199e-07, "loss": 0.61859107, "num_input_tokens_seen": 279022380, "router_z_loss_clip": 0.02233887, "router_z_loss_mlp": 0.0703125, "step": 12932, "time_per_iteration": 2.9641058444976807 }, { "auxiliary_loss_clip": 0.01054412, "auxiliary_loss_mlp": 0.01041299, "balance_loss_clip": 1.0158484, "balance_loss_mlp": 1.01710749, "epoch": 0.7775740267548474, "flos": 28841155332480.0, "grad_norm": 1.526981197217689, "language_loss": 0.77455181, "learning_rate": 4.967966143748595e-07, "loss": 0.79550898, "num_input_tokens_seen": 279044275, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37304688, "step": 12933, "time_per_iteration": 2.4534475803375244 }, { "auxiliary_loss_clip": 0.01053541, "auxiliary_loss_mlp": 0.01043737, "balance_loss_clip": 1.01685584, "balance_loss_mlp": 1.01583254, "epoch": 0.7776341500075155, "flos": 21871374261120.0, "grad_norm": 1.956232730229949, "language_loss": 0.74476123, "learning_rate": 4.965397472402215e-07, "loss": 0.76573396, "num_input_tokens_seen": 279063375, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.37695312, "step": 12934, "time_per_iteration": 2.3785595893859863 }, { "auxiliary_loss_clip": 0.01052617, "auxiliary_loss_mlp": 0.01039911, "balance_loss_clip": 1.01417327, "balance_loss_mlp": 1.0155735, "epoch": 0.7776942732601834, "flos": 20228815526400.0, "grad_norm": 1.7391452333577067, "language_loss": 0.71727246, "learning_rate": 4.962829371169475e-07, "loss": 0.73819774, "num_input_tokens_seen": 279082680, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37109375, "step": 12935, "time_per_iteration": 2.3711256980895996 }, { "auxiliary_loss_clip": 0.01052192, "auxiliary_loss_mlp": 0.01039814, "balance_loss_clip": 1.01572227, "balance_loss_mlp": 1.01569521, "epoch": 0.7777543965128514, "flos": 22230875197440.0, "grad_norm": 1.9660806122941459, "language_loss": 0.84575409, "learning_rate": 4.960261840147746e-07, "loss": 0.86667418, "num_input_tokens_seen": 279099805, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36523438, "step": 12936, "time_per_iteration": 2.3894755840301514 }, { "auxiliary_loss_clip": 0.01054103, "auxiliary_loss_mlp": 0.01036715, "balance_loss_clip": 1.01414907, "balance_loss_mlp": 1.01686907, "epoch": 0.7778145197655193, "flos": 14501069539200.0, "grad_norm": 2.0885410986212536, "language_loss": 0.69395691, "learning_rate": 4.957694879434397e-07, "loss": 0.71486509, "num_input_tokens_seen": 279117975, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.37304688, "step": 12937, "time_per_iteration": 2.368121862411499 }, { "auxiliary_loss_clip": 0.01054773, "auxiliary_loss_mlp": 0.01037016, "balance_loss_clip": 1.01244724, "balance_loss_mlp": 1.01728237, "epoch": 0.7778746430181873, "flos": 21139280628480.0, "grad_norm": 1.4273398449966652, "language_loss": 0.882065, "learning_rate": 4.955128489126777e-07, "loss": 0.90298289, "num_input_tokens_seen": 279137255, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.375, "step": 12938, "time_per_iteration": 2.369689464569092 }, { "auxiliary_loss_clip": 0.01053278, "auxiliary_loss_mlp": 0.01038351, "balance_loss_clip": 1.01443791, "balance_loss_mlp": 1.01622415, "epoch": 0.7779347662708552, "flos": 20265334675200.0, "grad_norm": 2.22685019474679, "language_loss": 0.86355805, "learning_rate": 4.95256266932218e-07, "loss": 0.88447428, "num_input_tokens_seen": 279154500, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37109375, "step": 12939, "time_per_iteration": 2.388915538787842 }, { "auxiliary_loss_clip": 0.01049379, "auxiliary_loss_mlp": 0.01036786, "balance_loss_clip": 1.01426697, "balance_loss_mlp": 1.01508498, "epoch": 0.7779948895235232, "flos": 19207990016640.0, "grad_norm": 2.468166313043386, "language_loss": 0.70473742, "learning_rate": 4.949997420117915e-07, "loss": 0.72559905, "num_input_tokens_seen": 279173635, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34375, "step": 12940, "time_per_iteration": 2.3624417781829834 }, { "auxiliary_loss_clip": 0.01052745, "auxiliary_loss_mlp": 0.01035331, "balance_loss_clip": 1.01026094, "balance_loss_mlp": 1.01617098, "epoch": 0.7780550127761912, "flos": 23913583862400.0, "grad_norm": 1.5420912309938426, "language_loss": 0.78676128, "learning_rate": 4.947432741611255e-07, "loss": 0.80764204, "num_input_tokens_seen": 279194430, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36523438, "step": 12941, "time_per_iteration": 2.430081367492676 }, { "auxiliary_loss_clip": 0.01054714, "auxiliary_loss_mlp": 0.01040822, "balance_loss_clip": 1.01390433, "balance_loss_mlp": 1.01644063, "epoch": 0.7781151360288592, "flos": 32414585742720.0, "grad_norm": 2.490981068374838, "language_loss": 0.73807395, "learning_rate": 4.944868633899462e-07, "loss": 0.75902927, "num_input_tokens_seen": 279212920, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.3828125, "step": 12942, "time_per_iteration": 2.4553780555725098 }, { "auxiliary_loss_clip": 0.01052177, "auxiliary_loss_mlp": 0.01041254, "balance_loss_clip": 1.01763916, "balance_loss_mlp": 1.01602149, "epoch": 0.7781752592815271, "flos": 22345285322880.0, "grad_norm": 4.078136476515427, "language_loss": 0.69030631, "learning_rate": 4.942305097079751e-07, "loss": 0.71124059, "num_input_tokens_seen": 279232310, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36132812, "step": 12943, "time_per_iteration": 2.4106791019439697 }, { "auxiliary_loss_clip": 0.01007945, "auxiliary_loss_mlp": 0.01003103, "balance_loss_clip": 1.00079048, "balance_loss_mlp": 1.00096869, "epoch": 0.7782353825341951, "flos": 70457030472960.0, "grad_norm": 0.7906399592527179, "language_loss": 0.58646619, "learning_rate": 4.939742131249347e-07, "loss": 0.60657668, "num_input_tokens_seen": 279295375, "router_z_loss_clip": 0.02307129, "router_z_loss_mlp": 0.06982422, "step": 12944, "time_per_iteration": 3.1938228607177734 }, { "auxiliary_loss_clip": 0.01053983, "auxiliary_loss_mlp": 0.0104485, "balance_loss_clip": 1.01765847, "balance_loss_mlp": 1.01596546, "epoch": 0.778295505786863, "flos": 19061564307840.0, "grad_norm": 2.387708957732708, "language_loss": 0.69024652, "learning_rate": 4.937179736505428e-07, "loss": 0.71123487, "num_input_tokens_seen": 279313660, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.37890625, "step": 12945, "time_per_iteration": 2.3716681003570557 }, { "auxiliary_loss_clip": 0.01053905, "auxiliary_loss_mlp": 0.01041504, "balance_loss_clip": 1.01662505, "balance_loss_mlp": 1.01733637, "epoch": 0.778355629039531, "flos": 20998580382720.0, "grad_norm": 2.038182182658981, "language_loss": 0.70411515, "learning_rate": 4.93461791294516e-07, "loss": 0.72506928, "num_input_tokens_seen": 279334495, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.36523438, "step": 12946, "time_per_iteration": 2.4352924823760986 }, { "auxiliary_loss_clip": 0.01052036, "auxiliary_loss_mlp": 0.01038084, "balance_loss_clip": 1.01328814, "balance_loss_mlp": 1.01542687, "epoch": 0.7784157522921991, "flos": 21397009351680.0, "grad_norm": 1.794963237878672, "language_loss": 0.65869302, "learning_rate": 4.932056660665689e-07, "loss": 0.67959428, "num_input_tokens_seen": 279352985, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3671875, "step": 12947, "time_per_iteration": 3.6577954292297363 }, { "auxiliary_loss_clip": 0.01052636, "auxiliary_loss_mlp": 0.0103966, "balance_loss_clip": 1.01519799, "balance_loss_mlp": 1.01603734, "epoch": 0.778475875544867, "flos": 20812807704960.0, "grad_norm": 1.9009834586929546, "language_loss": 0.66661572, "learning_rate": 4.929495979764147e-07, "loss": 0.68753874, "num_input_tokens_seen": 279371360, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36523438, "step": 12948, "time_per_iteration": 2.37751841545105 }, { "auxiliary_loss_clip": 0.01054268, "auxiliary_loss_mlp": 0.01042037, "balance_loss_clip": 1.01480985, "balance_loss_mlp": 1.01685274, "epoch": 0.778535998797535, "flos": 14354504184960.0, "grad_norm": 1.699825791111126, "language_loss": 0.75743449, "learning_rate": 4.926935870337625e-07, "loss": 0.77839756, "num_input_tokens_seen": 279389400, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.375, "step": 12949, "time_per_iteration": 2.354609727859497 }, { "auxiliary_loss_clip": 0.01055931, "auxiliary_loss_mlp": 0.01041995, "balance_loss_clip": 1.01539946, "balance_loss_mlp": 1.01769698, "epoch": 0.7785961220502029, "flos": 19208513687040.0, "grad_norm": 1.6619516410191302, "language_loss": 0.69546062, "learning_rate": 4.924376332483202e-07, "loss": 0.71643984, "num_input_tokens_seen": 279409715, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.3828125, "step": 12950, "time_per_iteration": 2.371218681335449 }, { "auxiliary_loss_clip": 0.01053981, "auxiliary_loss_mlp": 0.01037432, "balance_loss_clip": 1.01386452, "balance_loss_mlp": 1.01672292, "epoch": 0.7786562453028709, "flos": 25737586266240.0, "grad_norm": 1.815805072154146, "language_loss": 0.72807407, "learning_rate": 4.921817366297938e-07, "loss": 0.74898815, "num_input_tokens_seen": 279427705, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37304688, "step": 12951, "time_per_iteration": 2.3961660861968994 }, { "auxiliary_loss_clip": 0.01052327, "auxiliary_loss_mlp": 0.01041087, "balance_loss_clip": 1.01489663, "balance_loss_mlp": 1.01681077, "epoch": 0.7787163685555388, "flos": 25738249582080.0, "grad_norm": 1.798974456971515, "language_loss": 0.66949618, "learning_rate": 4.919258971878877e-07, "loss": 0.69043028, "num_input_tokens_seen": 279448215, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.35546875, "step": 12952, "time_per_iteration": 2.4522953033447266 }, { "auxiliary_loss_clip": 0.0104772, "auxiliary_loss_mlp": 0.01036569, "balance_loss_clip": 1.01453948, "balance_loss_mlp": 1.0142833, "epoch": 0.7787764918082068, "flos": 22746611934720.0, "grad_norm": 1.5875385924726528, "language_loss": 0.82116336, "learning_rate": 4.916701149323022e-07, "loss": 0.84200627, "num_input_tokens_seen": 279466260, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.33398438, "step": 12953, "time_per_iteration": 3.7386562824249268 }, { "auxiliary_loss_clip": 0.01055239, "auxiliary_loss_mlp": 0.01041218, "balance_loss_clip": 1.01641035, "balance_loss_mlp": 1.01745546, "epoch": 0.7788366150608748, "flos": 15190080687360.0, "grad_norm": 3.256129659782537, "language_loss": 0.78317726, "learning_rate": 4.91414389872737e-07, "loss": 0.80414182, "num_input_tokens_seen": 279484520, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37695312, "step": 12954, "time_per_iteration": 3.830127239227295 }, { "auxiliary_loss_clip": 0.01053095, "auxiliary_loss_mlp": 0.01035625, "balance_loss_clip": 1.0124507, "balance_loss_mlp": 1.01643622, "epoch": 0.7788967383135428, "flos": 21209316549120.0, "grad_norm": 1.5773422814989122, "language_loss": 0.73576027, "learning_rate": 4.911587220188905e-07, "loss": 0.75664741, "num_input_tokens_seen": 279503130, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.36523438, "step": 12955, "time_per_iteration": 2.390669584274292 }, { "auxiliary_loss_clip": 0.0105378, "auxiliary_loss_mlp": 0.01035983, "balance_loss_clip": 1.01137817, "balance_loss_mlp": 1.01636863, "epoch": 0.7789568615662107, "flos": 21682075536000.0, "grad_norm": 1.477404264375413, "language_loss": 0.69020224, "learning_rate": 4.909031113804551e-07, "loss": 0.71109986, "num_input_tokens_seen": 279521930, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.375, "step": 12956, "time_per_iteration": 2.3824710845947266 }, { "auxiliary_loss_clip": 0.01053766, "auxiliary_loss_mlp": 0.01034928, "balance_loss_clip": 1.0123024, "balance_loss_mlp": 1.01749682, "epoch": 0.7790169848188787, "flos": 26359144934400.0, "grad_norm": 1.474200238222114, "language_loss": 0.77228093, "learning_rate": 4.906475579671252e-07, "loss": 0.79316783, "num_input_tokens_seen": 279542375, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36328125, "step": 12957, "time_per_iteration": 2.455156087875366 }, { "auxiliary_loss_clip": 0.01052599, "auxiliary_loss_mlp": 0.01035343, "balance_loss_clip": 1.0098207, "balance_loss_mlp": 1.01675606, "epoch": 0.7790771080715466, "flos": 25515119882880.0, "grad_norm": 1.4829990844731398, "language_loss": 0.78314668, "learning_rate": 4.903920617885917e-07, "loss": 0.80402613, "num_input_tokens_seen": 279561885, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.359375, "step": 12958, "time_per_iteration": 2.4076333045959473 }, { "auxiliary_loss_clip": 0.0105351, "auxiliary_loss_mlp": 0.0103763, "balance_loss_clip": 1.01308489, "balance_loss_mlp": 1.01670825, "epoch": 0.7791372313242146, "flos": 16033267866240.0, "grad_norm": 1.8609690398876075, "language_loss": 0.72703373, "learning_rate": 4.901366228545418e-07, "loss": 0.74794519, "num_input_tokens_seen": 279579965, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.3671875, "step": 12959, "time_per_iteration": 2.447665214538574 }, { "auxiliary_loss_clip": 0.01052693, "auxiliary_loss_mlp": 0.01043243, "balance_loss_clip": 1.01795852, "balance_loss_mlp": 1.01620078, "epoch": 0.7791973545768827, "flos": 23841069235200.0, "grad_norm": 1.6081758495031038, "language_loss": 0.78718901, "learning_rate": 4.898812411746632e-07, "loss": 0.80814838, "num_input_tokens_seen": 279599030, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36523438, "step": 12960, "time_per_iteration": 2.392622947692871 }, { "auxiliary_loss_clip": 0.01054583, "auxiliary_loss_mlp": 0.01042061, "balance_loss_clip": 1.01460719, "balance_loss_mlp": 1.01705563, "epoch": 0.7792574778295506, "flos": 24167297779200.0, "grad_norm": 1.8656938016393536, "language_loss": 0.76292652, "learning_rate": 4.896259167586385e-07, "loss": 0.78389299, "num_input_tokens_seen": 279614400, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.375, "step": 12961, "time_per_iteration": 2.4209365844726562 }, { "auxiliary_loss_clip": 0.01051106, "auxiliary_loss_mlp": 0.01036657, "balance_loss_clip": 1.01360238, "balance_loss_mlp": 1.01690292, "epoch": 0.7793176010822186, "flos": 21464007984000.0, "grad_norm": 1.6368517716269988, "language_loss": 0.74833816, "learning_rate": 4.893706496161511e-07, "loss": 0.76921582, "num_input_tokens_seen": 279633745, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.34179688, "step": 12962, "time_per_iteration": 2.376387596130371 }, { "auxiliary_loss_clip": 0.01052202, "auxiliary_loss_mlp": 0.01032801, "balance_loss_clip": 1.01105762, "balance_loss_mlp": 1.01687801, "epoch": 0.7793777243348865, "flos": 20665683768960.0, "grad_norm": 1.8482307900987534, "language_loss": 0.71038151, "learning_rate": 4.891154397568795e-07, "loss": 0.73123151, "num_input_tokens_seen": 279651165, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.35351562, "step": 12963, "time_per_iteration": 2.362128496170044 }, { "auxiliary_loss_clip": 0.0105313, "auxiliary_loss_mlp": 0.01041243, "balance_loss_clip": 1.01587498, "balance_loss_mlp": 1.01692557, "epoch": 0.7794378475875545, "flos": 27124545870720.0, "grad_norm": 1.7765380299037952, "language_loss": 0.6478045, "learning_rate": 4.888602871905019e-07, "loss": 0.66874826, "num_input_tokens_seen": 279671175, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36328125, "step": 12964, "time_per_iteration": 2.472013235092163 }, { "auxiliary_loss_clip": 0.01053426, "auxiliary_loss_mlp": 0.01039117, "balance_loss_clip": 1.01552522, "balance_loss_mlp": 1.0160954, "epoch": 0.7794979708402224, "flos": 28072891664640.0, "grad_norm": 1.6241874677285475, "language_loss": 0.77382791, "learning_rate": 4.88605191926694e-07, "loss": 0.79475337, "num_input_tokens_seen": 279688675, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.37304688, "step": 12965, "time_per_iteration": 2.433488368988037 }, { "auxiliary_loss_clip": 0.01049298, "auxiliary_loss_mlp": 0.01036677, "balance_loss_clip": 1.01499248, "balance_loss_mlp": 1.01591814, "epoch": 0.7795580940928905, "flos": 26868353247360.0, "grad_norm": 1.6171405143071382, "language_loss": 0.73185658, "learning_rate": 4.883501539751289e-07, "loss": 0.7527163, "num_input_tokens_seen": 279710245, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.33398438, "step": 12966, "time_per_iteration": 2.4479498863220215 }, { "auxiliary_loss_clip": 0.01050149, "auxiliary_loss_mlp": 0.0103417, "balance_loss_clip": 1.01360607, "balance_loss_mlp": 1.01552904, "epoch": 0.7796182173455584, "flos": 23834436076800.0, "grad_norm": 1.4486078474039017, "language_loss": 0.75028342, "learning_rate": 4.880951733454768e-07, "loss": 0.77112663, "num_input_tokens_seen": 279729045, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.34570312, "step": 12967, "time_per_iteration": 3.8182566165924072 }, { "auxiliary_loss_clip": 0.01055445, "auxiliary_loss_mlp": 0.01037497, "balance_loss_clip": 1.01272476, "balance_loss_mlp": 1.01722348, "epoch": 0.7796783405982264, "flos": 19791214145280.0, "grad_norm": 2.3678204411972055, "language_loss": 0.74042755, "learning_rate": 4.878402500474073e-07, "loss": 0.76135695, "num_input_tokens_seen": 279748350, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3828125, "step": 12968, "time_per_iteration": 2.3740506172180176 }, { "auxiliary_loss_clip": 0.01054103, "auxiliary_loss_mlp": 0.01044912, "balance_loss_clip": 1.01947284, "balance_loss_mlp": 1.01764297, "epoch": 0.7797384638508943, "flos": 15449310599040.0, "grad_norm": 2.0432609050183843, "language_loss": 0.62341404, "learning_rate": 4.875853840905874e-07, "loss": 0.64440417, "num_input_tokens_seen": 279765620, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36328125, "step": 12969, "time_per_iteration": 2.3779804706573486 }, { "auxiliary_loss_clip": 0.01050887, "auxiliary_loss_mlp": 0.01037144, "balance_loss_clip": 1.01487613, "balance_loss_mlp": 1.01614404, "epoch": 0.7797985871035623, "flos": 20921701835520.0, "grad_norm": 1.7124044813444863, "language_loss": 0.70941275, "learning_rate": 4.873305754846811e-07, "loss": 0.73029304, "num_input_tokens_seen": 279782485, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34765625, "step": 12970, "time_per_iteration": 2.344909429550171 }, { "auxiliary_loss_clip": 0.01053964, "auxiliary_loss_mlp": 0.01046504, "balance_loss_clip": 1.02113593, "balance_loss_mlp": 1.01740074, "epoch": 0.7798587103562302, "flos": 36935803365120.0, "grad_norm": 2.1315656591042416, "language_loss": 0.72758532, "learning_rate": 4.870758242393507e-07, "loss": 0.74858999, "num_input_tokens_seen": 279804170, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36523438, "step": 12971, "time_per_iteration": 2.5021302700042725 }, { "auxiliary_loss_clip": 0.0105545, "auxiliary_loss_mlp": 0.010402, "balance_loss_clip": 1.01434326, "balance_loss_mlp": 1.01717758, "epoch": 0.7799188336088982, "flos": 22418183975040.0, "grad_norm": 1.7142791028704145, "language_loss": 0.75154686, "learning_rate": 4.868211303642578e-07, "loss": 0.77250332, "num_input_tokens_seen": 279823730, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3828125, "step": 12972, "time_per_iteration": 2.372159242630005 }, { "auxiliary_loss_clip": 0.01053163, "auxiliary_loss_mlp": 0.01033689, "balance_loss_clip": 1.00985861, "balance_loss_mlp": 1.01620138, "epoch": 0.7799789568615663, "flos": 18879457322880.0, "grad_norm": 1.958113986023157, "language_loss": 0.73226428, "learning_rate": 4.865664938690584e-07, "loss": 0.75313276, "num_input_tokens_seen": 279843035, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36914062, "step": 12973, "time_per_iteration": 2.376121759414673 }, { "auxiliary_loss_clip": 0.0105039, "auxiliary_loss_mlp": 0.01036119, "balance_loss_clip": 1.01345682, "balance_loss_mlp": 1.01546931, "epoch": 0.7800390801142342, "flos": 20261354780160.0, "grad_norm": 1.9297566212048989, "language_loss": 0.7890622, "learning_rate": 4.863119147634089e-07, "loss": 0.80992728, "num_input_tokens_seen": 279861450, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34960938, "step": 12974, "time_per_iteration": 2.3408312797546387 }, { "auxiliary_loss_clip": 0.0105153, "auxiliary_loss_mlp": 0.01034685, "balance_loss_clip": 1.01076019, "balance_loss_mlp": 1.01589251, "epoch": 0.7800992033669022, "flos": 16689390647040.0, "grad_norm": 1.6567666565964025, "language_loss": 0.70193344, "learning_rate": 4.86057393056964e-07, "loss": 0.72279561, "num_input_tokens_seen": 279878660, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.35546875, "step": 12975, "time_per_iteration": 2.3465535640716553 }, { "auxiliary_loss_clip": 0.01051774, "auxiliary_loss_mlp": 0.01040144, "balance_loss_clip": 1.01679122, "balance_loss_mlp": 1.01670349, "epoch": 0.7801593266195701, "flos": 18584301755520.0, "grad_norm": 2.0487160554580797, "language_loss": 0.83767319, "learning_rate": 4.858029287593739e-07, "loss": 0.85859233, "num_input_tokens_seen": 279895685, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.3515625, "step": 12976, "time_per_iteration": 2.355829954147339 }, { "auxiliary_loss_clip": 0.01055106, "auxiliary_loss_mlp": 0.010385, "balance_loss_clip": 1.01108146, "balance_loss_mlp": 1.01603127, "epoch": 0.7802194498722381, "flos": 25483732704000.0, "grad_norm": 1.6664681951412832, "language_loss": 0.66384941, "learning_rate": 4.85548521880289e-07, "loss": 0.68478549, "num_input_tokens_seen": 279917240, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.390625, "step": 12977, "time_per_iteration": 2.3913071155548096 }, { "auxiliary_loss_clip": 0.01052103, "auxiliary_loss_mlp": 0.010343, "balance_loss_clip": 1.01358175, "balance_loss_mlp": 1.01668048, "epoch": 0.780279573124906, "flos": 31174959542400.0, "grad_norm": 2.2973478430694723, "language_loss": 0.75258064, "learning_rate": 4.852941724293554e-07, "loss": 0.77344465, "num_input_tokens_seen": 279938665, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.35546875, "step": 12978, "time_per_iteration": 2.4588351249694824 }, { "auxiliary_loss_clip": 0.01054493, "auxiliary_loss_mlp": 0.01041627, "balance_loss_clip": 1.01463783, "balance_loss_mlp": 1.01634359, "epoch": 0.780339696377574, "flos": 26942787999360.0, "grad_norm": 1.79945675806762, "language_loss": 0.62852883, "learning_rate": 4.85039880416219e-07, "loss": 0.64949, "num_input_tokens_seen": 279957965, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.38085938, "step": 12979, "time_per_iteration": 2.4333090782165527 }, { "auxiliary_loss_clip": 0.01052827, "auxiliary_loss_mlp": 0.01037973, "balance_loss_clip": 1.01477528, "balance_loss_mlp": 1.01720965, "epoch": 0.780399819630242, "flos": 27956386857600.0, "grad_norm": 2.0123200799250625, "language_loss": 0.78312516, "learning_rate": 4.847856458505217e-07, "loss": 0.80403316, "num_input_tokens_seen": 279977490, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.35546875, "step": 12980, "time_per_iteration": 2.491596221923828 }, { "auxiliary_loss_clip": 0.01054613, "auxiliary_loss_mlp": 0.01042189, "balance_loss_clip": 1.01784658, "balance_loss_mlp": 1.01722956, "epoch": 0.78045994288291, "flos": 22485845923200.0, "grad_norm": 1.8391255266892814, "language_loss": 0.7817173, "learning_rate": 4.845314687419046e-07, "loss": 0.80268532, "num_input_tokens_seen": 279994220, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.375, "step": 12981, "time_per_iteration": 2.363982677459717 }, { "auxiliary_loss_clip": 0.01055554, "auxiliary_loss_mlp": 0.0103877, "balance_loss_clip": 1.01350975, "balance_loss_mlp": 1.01853967, "epoch": 0.7805200661355779, "flos": 20849780701440.0, "grad_norm": 1.7791610757518848, "language_loss": 0.73769152, "learning_rate": 4.842773491000067e-07, "loss": 0.75863481, "num_input_tokens_seen": 280012590, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37109375, "step": 12982, "time_per_iteration": 2.412245988845825 }, { "auxiliary_loss_clip": 0.01052536, "auxiliary_loss_mlp": 0.01034546, "balance_loss_clip": 1.01105034, "balance_loss_mlp": 1.01601315, "epoch": 0.7805801893882459, "flos": 25664792348160.0, "grad_norm": 1.3614217809370464, "language_loss": 0.74254191, "learning_rate": 4.840232869344636e-07, "loss": 0.76341271, "num_input_tokens_seen": 280033700, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36523438, "step": 12983, "time_per_iteration": 2.398818016052246 }, { "auxiliary_loss_clip": 0.01052953, "auxiliary_loss_mlp": 0.0104049, "balance_loss_clip": 1.01606369, "balance_loss_mlp": 1.01626945, "epoch": 0.7806403126409138, "flos": 11327010704640.0, "grad_norm": 1.8921076450987002, "language_loss": 0.75738883, "learning_rate": 4.837692822549086e-07, "loss": 0.77832329, "num_input_tokens_seen": 280052215, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.3671875, "step": 12984, "time_per_iteration": 2.3546581268310547 }, { "auxiliary_loss_clip": 0.01050236, "auxiliary_loss_mlp": 0.01038581, "balance_loss_clip": 1.015347, "balance_loss_mlp": 1.01542032, "epoch": 0.7807004358935818, "flos": 19572343632000.0, "grad_norm": 2.147578814122469, "language_loss": 0.82743549, "learning_rate": 4.835153350709746e-07, "loss": 0.8483237, "num_input_tokens_seen": 280070525, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.34765625, "step": 12985, "time_per_iteration": 2.332937240600586 }, { "auxiliary_loss_clip": 0.01050877, "auxiliary_loss_mlp": 0.01041364, "balance_loss_clip": 1.0185591, "balance_loss_mlp": 1.01553583, "epoch": 0.7807605591462499, "flos": 19134812073600.0, "grad_norm": 1.7646744407216968, "language_loss": 0.77927089, "learning_rate": 4.832614453922915e-07, "loss": 0.80019319, "num_input_tokens_seen": 280089855, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35351562, "step": 12986, "time_per_iteration": 2.3722445964813232 }, { "auxiliary_loss_clip": 0.01052894, "auxiliary_loss_mlp": 0.01041126, "balance_loss_clip": 1.01697373, "balance_loss_mlp": 1.01662993, "epoch": 0.7808206823989178, "flos": 32373423383040.0, "grad_norm": 1.7025866819770958, "language_loss": 0.75962085, "learning_rate": 4.830076132284859e-07, "loss": 0.78056103, "num_input_tokens_seen": 280109960, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36328125, "step": 12987, "time_per_iteration": 3.8618781566619873 }, { "auxiliary_loss_clip": 0.01008477, "auxiliary_loss_mlp": 0.01002049, "balance_loss_clip": 0.99991506, "balance_loss_mlp": 1.00106966, "epoch": 0.7808808056515858, "flos": 55046855508480.0, "grad_norm": 0.7273101898096401, "language_loss": 0.55157185, "learning_rate": 4.82753838589184e-07, "loss": 0.57167709, "num_input_tokens_seen": 280169805, "router_z_loss_clip": 0.0213623, "router_z_loss_mlp": 0.07421875, "step": 12988, "time_per_iteration": 3.009838819503784 }, { "auxiliary_loss_clip": 0.0105223, "auxiliary_loss_mlp": 0.01037035, "balance_loss_clip": 1.01401591, "balance_loss_mlp": 1.01677454, "epoch": 0.7809409289042537, "flos": 12858650449920.0, "grad_norm": 2.462816083519718, "language_loss": 0.81373769, "learning_rate": 4.82500121484009e-07, "loss": 0.83463037, "num_input_tokens_seen": 280184630, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35351562, "step": 12989, "time_per_iteration": 2.354731798171997 }, { "auxiliary_loss_clip": 0.01050396, "auxiliary_loss_mlp": 0.01036311, "balance_loss_clip": 1.01158714, "balance_loss_mlp": 1.01513982, "epoch": 0.7810010521569217, "flos": 21686229987840.0, "grad_norm": 1.520186259180861, "language_loss": 0.71930969, "learning_rate": 4.822464619225806e-07, "loss": 0.74017674, "num_input_tokens_seen": 280203880, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.3515625, "step": 12990, "time_per_iteration": 2.35052490234375 }, { "auxiliary_loss_clip": 0.01052931, "auxiliary_loss_mlp": 0.01038553, "balance_loss_clip": 1.01297069, "balance_loss_mlp": 1.01603234, "epoch": 0.7810611754095896, "flos": 16756319456640.0, "grad_norm": 2.0728907077183862, "language_loss": 0.78788888, "learning_rate": 4.819928599145184e-07, "loss": 0.80880368, "num_input_tokens_seen": 280220460, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37109375, "step": 12991, "time_per_iteration": 2.3377022743225098 }, { "auxiliary_loss_clip": 0.01052533, "auxiliary_loss_mlp": 0.01038214, "balance_loss_clip": 1.01428843, "balance_loss_mlp": 1.0162673, "epoch": 0.7811212986622577, "flos": 43505793924480.0, "grad_norm": 1.4981515046829033, "language_loss": 0.67180073, "learning_rate": 4.817393154694398e-07, "loss": 0.69270819, "num_input_tokens_seen": 280242680, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36328125, "step": 12992, "time_per_iteration": 2.548750162124634 }, { "auxiliary_loss_clip": 0.01053069, "auxiliary_loss_mlp": 0.01038603, "balance_loss_clip": 1.01424837, "balance_loss_mlp": 1.01683867, "epoch": 0.7811814219149256, "flos": 21756754667520.0, "grad_norm": 3.457431991218914, "language_loss": 0.63152206, "learning_rate": 4.814858285969578e-07, "loss": 0.65243876, "num_input_tokens_seen": 280260655, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36328125, "step": 12993, "time_per_iteration": 3.8565962314605713 }, { "auxiliary_loss_clip": 0.01052447, "auxiliary_loss_mlp": 0.01035745, "balance_loss_clip": 1.01146221, "balance_loss_mlp": 1.01702154, "epoch": 0.7812415451675936, "flos": 24060358684800.0, "grad_norm": 1.504628641647026, "language_loss": 0.70003659, "learning_rate": 4.812323993066862e-07, "loss": 0.72091854, "num_input_tokens_seen": 280281185, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.35546875, "step": 12994, "time_per_iteration": 3.695828676223755 }, { "auxiliary_loss_clip": 0.01050551, "auxiliary_loss_mlp": 0.01035245, "balance_loss_clip": 1.01334596, "balance_loss_mlp": 1.01562238, "epoch": 0.7813016684202615, "flos": 18988700567040.0, "grad_norm": 1.984117456026829, "language_loss": 0.70793998, "learning_rate": 4.809790276082335e-07, "loss": 0.72879791, "num_input_tokens_seen": 280298255, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34765625, "step": 12995, "time_per_iteration": 2.331050395965576 }, { "auxiliary_loss_clip": 0.01050364, "auxiliary_loss_mlp": 0.01034767, "balance_loss_clip": 1.01304734, "balance_loss_mlp": 1.01518536, "epoch": 0.7813617916729295, "flos": 25259730220800.0, "grad_norm": 1.6833966057689018, "language_loss": 0.76165801, "learning_rate": 4.807257135112088e-07, "loss": 0.78250933, "num_input_tokens_seen": 280319000, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.3515625, "step": 12996, "time_per_iteration": 2.393113136291504 }, { "auxiliary_loss_clip": 0.01056003, "auxiliary_loss_mlp": 0.01036695, "balance_loss_clip": 1.01198316, "balance_loss_mlp": 1.01783383, "epoch": 0.7814219149255974, "flos": 17965117059840.0, "grad_norm": 3.2422271928248234, "language_loss": 0.71272677, "learning_rate": 4.804724570252167e-07, "loss": 0.73365378, "num_input_tokens_seen": 280336375, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.3828125, "step": 12997, "time_per_iteration": 2.3168907165527344 }, { "auxiliary_loss_clip": 0.01054456, "auxiliary_loss_mlp": 0.01038491, "balance_loss_clip": 1.01270616, "balance_loss_mlp": 1.016572, "epoch": 0.7814820381782654, "flos": 25774978199040.0, "grad_norm": 1.7040123812732635, "language_loss": 0.82643092, "learning_rate": 4.802192581598614e-07, "loss": 0.84736037, "num_input_tokens_seen": 280358760, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37890625, "step": 12998, "time_per_iteration": 2.3963725566864014 }, { "auxiliary_loss_clip": 0.01053375, "auxiliary_loss_mlp": 0.010425, "balance_loss_clip": 1.01739478, "balance_loss_mlp": 1.01626563, "epoch": 0.7815421614309335, "flos": 20518594744320.0, "grad_norm": 1.9075536354174656, "language_loss": 0.75770849, "learning_rate": 4.799661169247453e-07, "loss": 0.77866721, "num_input_tokens_seen": 280377085, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37109375, "step": 12999, "time_per_iteration": 2.3990745544433594 }, { "auxiliary_loss_clip": 0.01056141, "auxiliary_loss_mlp": 0.01046254, "balance_loss_clip": 1.02173269, "balance_loss_mlp": 1.01805043, "epoch": 0.7816022846836014, "flos": 21286614032640.0, "grad_norm": 1.543685426734998, "language_loss": 0.85354936, "learning_rate": 4.797130333294652e-07, "loss": 0.87457335, "num_input_tokens_seen": 280395465, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.38085938, "step": 13000, "time_per_iteration": 2.3662493228912354 }, { "auxiliary_loss_clip": 0.01053074, "auxiliary_loss_mlp": 0.01035901, "balance_loss_clip": 1.01209497, "balance_loss_mlp": 1.01608229, "epoch": 0.7816624079362694, "flos": 19207396523520.0, "grad_norm": 2.322699996032797, "language_loss": 0.67208648, "learning_rate": 4.794600073836192e-07, "loss": 0.69297624, "num_input_tokens_seen": 280412775, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36914062, "step": 13001, "time_per_iteration": 2.3683700561523438 }, { "auxiliary_loss_clip": 0.01053219, "auxiliary_loss_mlp": 0.01040138, "balance_loss_clip": 1.01699901, "balance_loss_mlp": 1.01655293, "epoch": 0.7817225311889373, "flos": 26103475981440.0, "grad_norm": 1.5890480476548756, "language_loss": 0.67368996, "learning_rate": 4.792070390968027e-07, "loss": 0.69462353, "num_input_tokens_seen": 280432905, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3671875, "step": 13002, "time_per_iteration": 2.3884780406951904 }, { "auxiliary_loss_clip": 0.01054266, "auxiliary_loss_mlp": 0.01041314, "balance_loss_clip": 1.01488507, "balance_loss_mlp": 1.01647758, "epoch": 0.7817826544416053, "flos": 21249885415680.0, "grad_norm": 3.7697316708603386, "language_loss": 0.7498455, "learning_rate": 4.78954128478607e-07, "loss": 0.77080137, "num_input_tokens_seen": 280450785, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.37890625, "step": 13003, "time_per_iteration": 2.361257553100586 }, { "auxiliary_loss_clip": 0.01054896, "auxiliary_loss_mlp": 0.01039699, "balance_loss_clip": 1.01549923, "balance_loss_mlp": 1.01768208, "epoch": 0.7818427776942732, "flos": 19931320897920.0, "grad_norm": 1.7306638562413317, "language_loss": 0.62836438, "learning_rate": 4.787012755386233e-07, "loss": 0.64931035, "num_input_tokens_seen": 280468400, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37109375, "step": 13004, "time_per_iteration": 2.3482108116149902 }, { "auxiliary_loss_clip": 0.01047103, "auxiliary_loss_mlp": 0.01031215, "balance_loss_clip": 1.00999522, "balance_loss_mlp": 1.01411724, "epoch": 0.7819029009469413, "flos": 11362971271680.0, "grad_norm": 1.7784787287866872, "language_loss": 0.83469927, "learning_rate": 4.784484802864403e-07, "loss": 0.85548246, "num_input_tokens_seen": 280483930, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.33007812, "step": 13005, "time_per_iteration": 2.346139669418335 }, { "auxiliary_loss_clip": 0.01050456, "auxiliary_loss_mlp": 0.01040707, "balance_loss_clip": 1.01671052, "balance_loss_mlp": 1.01487947, "epoch": 0.7819630241996092, "flos": 24278146945920.0, "grad_norm": 1.888021263342438, "language_loss": 0.73758239, "learning_rate": 4.781957427316432e-07, "loss": 0.75849402, "num_input_tokens_seen": 280503465, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.35546875, "step": 13006, "time_per_iteration": 3.796691417694092 }, { "auxiliary_loss_clip": 0.01055457, "auxiliary_loss_mlp": 0.01041767, "balance_loss_clip": 1.01455116, "balance_loss_mlp": 1.01755035, "epoch": 0.7820231474522772, "flos": 22707858458880.0, "grad_norm": 1.5899160664711856, "language_loss": 0.72480214, "learning_rate": 4.779430628838157e-07, "loss": 0.74577439, "num_input_tokens_seen": 280523375, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.37890625, "step": 13007, "time_per_iteration": 2.3757400512695312 }, { "auxiliary_loss_clip": 0.01054272, "auxiliary_loss_mlp": 0.01041065, "balance_loss_clip": 1.01514864, "balance_loss_mlp": 1.01667309, "epoch": 0.7820832707049451, "flos": 20046394339200.0, "grad_norm": 2.0757124608080795, "language_loss": 0.70181072, "learning_rate": 4.776904407525397e-07, "loss": 0.72276413, "num_input_tokens_seen": 280542920, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.375, "step": 13008, "time_per_iteration": 2.363776206970215 }, { "auxiliary_loss_clip": 0.01052158, "auxiliary_loss_mlp": 0.0103679, "balance_loss_clip": 1.0122447, "balance_loss_mlp": 1.01573217, "epoch": 0.7821433939576131, "flos": 27161553778560.0, "grad_norm": 1.8192873198521333, "language_loss": 0.71012348, "learning_rate": 4.774378763473954e-07, "loss": 0.73101294, "num_input_tokens_seen": 280561700, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36328125, "step": 13009, "time_per_iteration": 2.445284366607666 }, { "auxiliary_loss_clip": 0.01051311, "auxiliary_loss_mlp": 0.01040356, "balance_loss_clip": 1.01809955, "balance_loss_mlp": 1.01533937, "epoch": 0.782203517210281, "flos": 22600954275840.0, "grad_norm": 1.6641998872744002, "language_loss": 0.8236649, "learning_rate": 4.771853696779586e-07, "loss": 0.8445816, "num_input_tokens_seen": 280580605, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.359375, "step": 13010, "time_per_iteration": 2.3654022216796875 }, { "auxiliary_loss_clip": 0.01049711, "auxiliary_loss_mlp": 0.01037808, "balance_loss_clip": 1.01613569, "balance_loss_mlp": 1.01509464, "epoch": 0.782263640462949, "flos": 29058524657280.0, "grad_norm": 2.1978155193701157, "language_loss": 0.63017571, "learning_rate": 4.76932920753806e-07, "loss": 0.65105087, "num_input_tokens_seen": 280601495, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34570312, "step": 13011, "time_per_iteration": 2.4796652793884277 }, { "auxiliary_loss_clip": 0.01049795, "auxiliary_loss_mlp": 0.01029549, "balance_loss_clip": 1.00962949, "balance_loss_mlp": 1.01524043, "epoch": 0.782323763715617, "flos": 25298378962560.0, "grad_norm": 2.073686044641185, "language_loss": 0.70683527, "learning_rate": 4.7668052958450913e-07, "loss": 0.72762865, "num_input_tokens_seen": 280622760, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.34570312, "step": 13012, "time_per_iteration": 2.388181686401367 }, { "auxiliary_loss_clip": 0.01008746, "auxiliary_loss_mlp": 0.01003265, "balance_loss_clip": 1.00098777, "balance_loss_mlp": 1.00137901, "epoch": 0.782383886968285, "flos": 65190695414400.0, "grad_norm": 0.7054228088600454, "language_loss": 0.55074829, "learning_rate": 4.764281961796395e-07, "loss": 0.57086837, "num_input_tokens_seen": 280687115, "router_z_loss_clip": 0.02282715, "router_z_loss_mlp": 0.07373047, "step": 13013, "time_per_iteration": 3.1194424629211426 }, { "auxiliary_loss_clip": 0.01054295, "auxiliary_loss_mlp": 0.01039559, "balance_loss_clip": 1.01508498, "balance_loss_mlp": 1.01677489, "epoch": 0.782444010220953, "flos": 18404464008960.0, "grad_norm": 1.7167253766814454, "language_loss": 0.66672647, "learning_rate": 4.76175920548765e-07, "loss": 0.68766505, "num_input_tokens_seen": 280705000, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.375, "step": 13014, "time_per_iteration": 2.3541159629821777 }, { "auxiliary_loss_clip": 0.01008337, "auxiliary_loss_mlp": 0.01002856, "balance_loss_clip": 1.00061476, "balance_loss_mlp": 1.00118518, "epoch": 0.7825041334736209, "flos": 63951313593600.0, "grad_norm": 0.7308556053282091, "language_loss": 0.58571571, "learning_rate": 4.759237027014524e-07, "loss": 0.60582763, "num_input_tokens_seen": 280773525, "router_z_loss_clip": 0.02246094, "router_z_loss_mlp": 0.07128906, "step": 13015, "time_per_iteration": 3.112224578857422 }, { "auxiliary_loss_clip": 0.01051479, "auxiliary_loss_mlp": 0.01038316, "balance_loss_clip": 1.01622701, "balance_loss_mlp": 1.01608551, "epoch": 0.7825642567262889, "flos": 20338338061440.0, "grad_norm": 1.615309249854176, "language_loss": 0.75211251, "learning_rate": 4.756715426472666e-07, "loss": 0.77301049, "num_input_tokens_seen": 280791915, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35351562, "step": 13016, "time_per_iteration": 2.358203172683716 }, { "auxiliary_loss_clip": 0.01052659, "auxiliary_loss_mlp": 0.01039585, "balance_loss_clip": 1.01433647, "balance_loss_mlp": 1.01598823, "epoch": 0.7826243799789568, "flos": 20262018096000.0, "grad_norm": 1.6321316400525547, "language_loss": 0.76250046, "learning_rate": 4.7541944039576766e-07, "loss": 0.78342295, "num_input_tokens_seen": 280811460, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3671875, "step": 13017, "time_per_iteration": 2.3476481437683105 }, { "auxiliary_loss_clip": 0.0105387, "auxiliary_loss_mlp": 0.01041735, "balance_loss_clip": 1.01808405, "balance_loss_mlp": 1.01582432, "epoch": 0.7826845032316249, "flos": 21132123799680.0, "grad_norm": 2.0122831293630212, "language_loss": 0.76448011, "learning_rate": 4.7516739595651636e-07, "loss": 0.78543615, "num_input_tokens_seen": 280825415, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.38085938, "step": 13018, "time_per_iteration": 2.3517165184020996 }, { "auxiliary_loss_clip": 0.01052468, "auxiliary_loss_mlp": 0.01037804, "balance_loss_clip": 1.01544023, "balance_loss_mlp": 1.01629245, "epoch": 0.7827446264842928, "flos": 22491152449920.0, "grad_norm": 1.4420829899616083, "language_loss": 0.77757972, "learning_rate": 4.749154093390708e-07, "loss": 0.79848254, "num_input_tokens_seen": 280845335, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.36132812, "step": 13019, "time_per_iteration": 2.370814800262451 }, { "auxiliary_loss_clip": 0.01049971, "auxiliary_loss_mlp": 0.01033985, "balance_loss_clip": 1.0108583, "balance_loss_mlp": 1.01520348, "epoch": 0.7828047497369608, "flos": 28839374853120.0, "grad_norm": 1.5515975900530117, "language_loss": 0.68316472, "learning_rate": 4.746634805529852e-07, "loss": 0.70400429, "num_input_tokens_seen": 280867145, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.34765625, "step": 13020, "time_per_iteration": 2.4722111225128174 }, { "auxiliary_loss_clip": 0.01052839, "auxiliary_loss_mlp": 0.0104093, "balance_loss_clip": 1.01811314, "balance_loss_mlp": 1.01674151, "epoch": 0.7828648729896287, "flos": 23256588297600.0, "grad_norm": 3.5672304469740226, "language_loss": 0.635364, "learning_rate": 4.7441160960781325e-07, "loss": 0.65630162, "num_input_tokens_seen": 280886185, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.359375, "step": 13021, "time_per_iteration": 2.353741407394409 }, { "auxiliary_loss_clip": 0.01050677, "auxiliary_loss_mlp": 0.01038582, "balance_loss_clip": 1.01679039, "balance_loss_mlp": 1.01612568, "epoch": 0.7829249962422967, "flos": 25264478165760.0, "grad_norm": 1.564535045723806, "language_loss": 0.70229626, "learning_rate": 4.7415979651310636e-07, "loss": 0.72318876, "num_input_tokens_seen": 280907665, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34570312, "step": 13022, "time_per_iteration": 2.41540789604187 }, { "auxiliary_loss_clip": 0.01008292, "auxiliary_loss_mlp": 0.01009261, "balance_loss_clip": 1.00709176, "balance_loss_mlp": 1.00132942, "epoch": 0.7829851194949646, "flos": 70718704179840.0, "grad_norm": 0.6456304402234454, "language_loss": 0.56220782, "learning_rate": 4.739080412784131e-07, "loss": 0.58238328, "num_input_tokens_seen": 280971405, "router_z_loss_clip": 0.02172852, "router_z_loss_mlp": 0.06982422, "step": 13023, "time_per_iteration": 3.143916606903076 }, { "auxiliary_loss_clip": 0.0104795, "auxiliary_loss_mlp": 0.01030364, "balance_loss_clip": 1.01076591, "balance_loss_mlp": 1.01408887, "epoch": 0.7830452427476327, "flos": 25659765112320.0, "grad_norm": 1.6769362943205355, "language_loss": 0.67621005, "learning_rate": 4.736563439132792e-07, "loss": 0.69699317, "num_input_tokens_seen": 280989615, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.33789062, "step": 13024, "time_per_iteration": 2.4001219272613525 }, { "auxiliary_loss_clip": 0.01053768, "auxiliary_loss_mlp": 0.01035559, "balance_loss_clip": 1.01081097, "balance_loss_mlp": 1.0163635, "epoch": 0.7831053660003006, "flos": 22783200906240.0, "grad_norm": 1.5933757518534784, "language_loss": 0.78760844, "learning_rate": 4.734047044272498e-07, "loss": 0.80850172, "num_input_tokens_seen": 281009450, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.375, "step": 13025, "time_per_iteration": 2.3863184452056885 }, { "auxiliary_loss_clip": 0.01052777, "auxiliary_loss_mlp": 0.01037251, "balance_loss_clip": 1.01543546, "balance_loss_mlp": 1.01687622, "epoch": 0.7831654892529686, "flos": 25811078411520.0, "grad_norm": 1.737249421752044, "language_loss": 0.79358274, "learning_rate": 4.731531228298673e-07, "loss": 0.81448299, "num_input_tokens_seen": 281028120, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.359375, "step": 13026, "time_per_iteration": 3.6791322231292725 }, { "auxiliary_loss_clip": 0.01051039, "auxiliary_loss_mlp": 0.01031324, "balance_loss_clip": 1.00929463, "balance_loss_mlp": 1.0168345, "epoch": 0.7832256125056366, "flos": 20770667827200.0, "grad_norm": 1.9919547802765962, "language_loss": 0.7596162, "learning_rate": 4.729015991306715e-07, "loss": 0.78043985, "num_input_tokens_seen": 281042130, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34179688, "step": 13027, "time_per_iteration": 2.3278372287750244 }, { "auxiliary_loss_clip": 0.01052414, "auxiliary_loss_mlp": 0.0103852, "balance_loss_clip": 1.01553631, "balance_loss_mlp": 1.01714361, "epoch": 0.7832857357583045, "flos": 21505484545920.0, "grad_norm": 1.8131012619816327, "language_loss": 0.71487939, "learning_rate": 4.726501333391997e-07, "loss": 0.7357887, "num_input_tokens_seen": 281060945, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35351562, "step": 13028, "time_per_iteration": 2.3874857425689697 }, { "auxiliary_loss_clip": 0.01053215, "auxiliary_loss_mlp": 0.01040928, "balance_loss_clip": 1.01534581, "balance_loss_mlp": 1.01655936, "epoch": 0.7833458590109725, "flos": 18076804099200.0, "grad_norm": 1.921685083288741, "language_loss": 0.69525027, "learning_rate": 4.7239872546498774e-07, "loss": 0.71619171, "num_input_tokens_seen": 281079270, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.3671875, "step": 13029, "time_per_iteration": 2.3223965167999268 }, { "auxiliary_loss_clip": 0.01053329, "auxiliary_loss_mlp": 0.01044989, "balance_loss_clip": 1.01913249, "balance_loss_mlp": 1.01606917, "epoch": 0.7834059822636404, "flos": 28287607726080.0, "grad_norm": 2.709339855277154, "language_loss": 0.82230043, "learning_rate": 4.721473755175698e-07, "loss": 0.84328353, "num_input_tokens_seen": 281099500, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37304688, "step": 13030, "time_per_iteration": 2.5288212299346924 }, { "auxiliary_loss_clip": 0.01053832, "auxiliary_loss_mlp": 0.01033166, "balance_loss_clip": 1.01063585, "balance_loss_mlp": 1.01645947, "epoch": 0.7834661055163085, "flos": 31684866082560.0, "grad_norm": 1.990914456147986, "language_loss": 0.7228893, "learning_rate": 4.71896083506476e-07, "loss": 0.74375927, "num_input_tokens_seen": 281121250, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.375, "step": 13031, "time_per_iteration": 2.450366735458374 }, { "auxiliary_loss_clip": 0.01051695, "auxiliary_loss_mlp": 0.01038584, "balance_loss_clip": 1.01443195, "balance_loss_mlp": 1.01553297, "epoch": 0.7835262287689764, "flos": 12932352063360.0, "grad_norm": 1.7303883207339177, "language_loss": 0.80208516, "learning_rate": 4.7164484944123574e-07, "loss": 0.82298797, "num_input_tokens_seen": 281138760, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36132812, "step": 13032, "time_per_iteration": 3.765632390975952 }, { "auxiliary_loss_clip": 0.01055537, "auxiliary_loss_mlp": 0.01040879, "balance_loss_clip": 1.01751423, "balance_loss_mlp": 1.01761103, "epoch": 0.7835863520216444, "flos": 16142301642240.0, "grad_norm": 2.151569053410961, "language_loss": 0.64346468, "learning_rate": 4.7139367333137726e-07, "loss": 0.66442883, "num_input_tokens_seen": 281157420, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.37890625, "step": 13033, "time_per_iteration": 3.8196401596069336 }, { "auxiliary_loss_clip": 0.01052522, "auxiliary_loss_mlp": 0.01039412, "balance_loss_clip": 1.01359153, "balance_loss_mlp": 1.0165, "epoch": 0.7836464752743123, "flos": 11509117689600.0, "grad_norm": 1.6986639487001598, "language_loss": 0.7272895, "learning_rate": 4.7114255518642255e-07, "loss": 0.74820882, "num_input_tokens_seen": 281174620, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.359375, "step": 13034, "time_per_iteration": 2.342808723449707 }, { "auxiliary_loss_clip": 0.01054845, "auxiliary_loss_mlp": 0.0104546, "balance_loss_clip": 1.02046239, "balance_loss_mlp": 1.0180459, "epoch": 0.7837065985269803, "flos": 18222706137600.0, "grad_norm": 1.7841344524585203, "language_loss": 0.73579574, "learning_rate": 4.7089149501589555e-07, "loss": 0.75679886, "num_input_tokens_seen": 281193865, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36914062, "step": 13035, "time_per_iteration": 2.3902299404144287 }, { "auxiliary_loss_clip": 0.01053535, "auxiliary_loss_mlp": 0.01042174, "balance_loss_clip": 1.01576948, "balance_loss_mlp": 1.01701558, "epoch": 0.7837667217796482, "flos": 24753244993920.0, "grad_norm": 1.9708394179175979, "language_loss": 0.67412829, "learning_rate": 4.7064049282931664e-07, "loss": 0.69508541, "num_input_tokens_seen": 281212250, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.36523438, "step": 13036, "time_per_iteration": 2.420297384262085 }, { "auxiliary_loss_clip": 0.01056107, "auxiliary_loss_mlp": 0.01040986, "balance_loss_clip": 1.01440227, "balance_loss_mlp": 1.01752913, "epoch": 0.7838268450323163, "flos": 22382013939840.0, "grad_norm": 3.7765432074586878, "language_loss": 0.73163444, "learning_rate": 4.703895486362031e-07, "loss": 0.75260538, "num_input_tokens_seen": 281230850, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.38476562, "step": 13037, "time_per_iteration": 2.381239652633667 }, { "auxiliary_loss_clip": 0.01051304, "auxiliary_loss_mlp": 0.01039721, "balance_loss_clip": 1.0171783, "balance_loss_mlp": 1.016078, "epoch": 0.7838869682849842, "flos": 19499270423040.0, "grad_norm": 2.681516541368443, "language_loss": 0.62095875, "learning_rate": 4.701386624460717e-07, "loss": 0.64186901, "num_input_tokens_seen": 281249810, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.3515625, "step": 13038, "time_per_iteration": 2.35034441947937 }, { "auxiliary_loss_clip": 0.01050776, "auxiliary_loss_mlp": 0.01037146, "balance_loss_clip": 1.0146991, "balance_loss_mlp": 1.01576757, "epoch": 0.7839470915376522, "flos": 32891394447360.0, "grad_norm": 1.4974727359841413, "language_loss": 0.69120222, "learning_rate": 4.698878342684349e-07, "loss": 0.71208143, "num_input_tokens_seen": 281273730, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34960938, "step": 13039, "time_per_iteration": 2.504504442214966 }, { "auxiliary_loss_clip": 0.01049326, "auxiliary_loss_mlp": 0.01031043, "balance_loss_clip": 1.01015782, "balance_loss_mlp": 1.01459658, "epoch": 0.7840072147903202, "flos": 29674811710080.0, "grad_norm": 2.214159387984537, "language_loss": 0.70378006, "learning_rate": 4.6963706411280537e-07, "loss": 0.72458375, "num_input_tokens_seen": 281293670, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34765625, "step": 13040, "time_per_iteration": 2.41644287109375 }, { "auxiliary_loss_clip": 0.01052548, "auxiliary_loss_mlp": 0.01041174, "balance_loss_clip": 1.01844072, "balance_loss_mlp": 1.01603103, "epoch": 0.7840673380429881, "flos": 18185768052480.0, "grad_norm": 1.5040597121049684, "language_loss": 0.69166911, "learning_rate": 4.6938635198869116e-07, "loss": 0.71260631, "num_input_tokens_seen": 281313070, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.36523438, "step": 13041, "time_per_iteration": 2.40739369392395 }, { "auxiliary_loss_clip": 0.0100828, "auxiliary_loss_mlp": 0.01002302, "balance_loss_clip": 1.00010848, "balance_loss_mlp": 1.00103045, "epoch": 0.7841274612956561, "flos": 66342725280000.0, "grad_norm": 0.806008262590338, "language_loss": 0.57505232, "learning_rate": 4.691356979055998e-07, "loss": 0.5951581, "num_input_tokens_seen": 281374880, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.07226562, "step": 13042, "time_per_iteration": 2.9753613471984863 }, { "auxiliary_loss_clip": 0.0105243, "auxiliary_loss_mlp": 0.01035214, "balance_loss_clip": 1.01157475, "balance_loss_mlp": 1.01658368, "epoch": 0.784187584548324, "flos": 26647353141120.0, "grad_norm": 2.438606993246624, "language_loss": 0.85278559, "learning_rate": 4.688851018730369e-07, "loss": 0.87366199, "num_input_tokens_seen": 281392620, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.359375, "step": 13043, "time_per_iteration": 2.4851784706115723 }, { "auxiliary_loss_clip": 0.01050392, "auxiliary_loss_mlp": 0.01036645, "balance_loss_clip": 1.01533067, "balance_loss_mlp": 1.01557791, "epoch": 0.7842477078009921, "flos": 25738947809280.0, "grad_norm": 1.401136453681194, "language_loss": 0.88979518, "learning_rate": 4.6863456390050425e-07, "loss": 0.91066563, "num_input_tokens_seen": 281413140, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34765625, "step": 13044, "time_per_iteration": 2.404571294784546 }, { "auxiliary_loss_clip": 0.01055037, "auxiliary_loss_mlp": 0.01043613, "balance_loss_clip": 1.01824474, "balance_loss_mlp": 1.01710296, "epoch": 0.78430783105366, "flos": 21979884366720.0, "grad_norm": 1.7122297087846543, "language_loss": 0.80347717, "learning_rate": 4.6838408399750195e-07, "loss": 0.82446373, "num_input_tokens_seen": 281430860, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37890625, "step": 13045, "time_per_iteration": 2.4352242946624756 }, { "auxiliary_loss_clip": 0.01050653, "auxiliary_loss_mlp": 0.01033074, "balance_loss_clip": 1.0114975, "balance_loss_mlp": 1.01561081, "epoch": 0.784367954306328, "flos": 23841139057920.0, "grad_norm": 1.5395511279233067, "language_loss": 0.73390871, "learning_rate": 4.6813366217352925e-07, "loss": 0.75474596, "num_input_tokens_seen": 281451385, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.34960938, "step": 13046, "time_per_iteration": 3.941368341445923 }, { "auxiliary_loss_clip": 0.01051464, "auxiliary_loss_mlp": 0.01038764, "balance_loss_clip": 1.01492178, "balance_loss_mlp": 1.01670456, "epoch": 0.7844280775589959, "flos": 24825515241600.0, "grad_norm": 1.6142623351116727, "language_loss": 0.63719612, "learning_rate": 4.678832984380809e-07, "loss": 0.65809834, "num_input_tokens_seen": 281472255, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.34765625, "step": 13047, "time_per_iteration": 2.463311195373535 }, { "auxiliary_loss_clip": 0.01052122, "auxiliary_loss_mlp": 0.01033549, "balance_loss_clip": 1.01160312, "balance_loss_mlp": 1.01638985, "epoch": 0.7844882008116639, "flos": 22454563478400.0, "grad_norm": 1.5167197139199908, "language_loss": 0.73745096, "learning_rate": 4.676329928006515e-07, "loss": 0.7583077, "num_input_tokens_seen": 281492860, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35742188, "step": 13048, "time_per_iteration": 2.441051721572876 }, { "auxiliary_loss_clip": 0.01053693, "auxiliary_loss_mlp": 0.0103853, "balance_loss_clip": 1.01282883, "balance_loss_mlp": 1.01766062, "epoch": 0.7845483240643318, "flos": 26102847576960.0, "grad_norm": 1.9682780060945169, "language_loss": 0.75212604, "learning_rate": 4.6738274527073243e-07, "loss": 0.77304828, "num_input_tokens_seen": 281511815, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.36132812, "step": 13049, "time_per_iteration": 2.431675672531128 }, { "auxiliary_loss_clip": 0.01053972, "auxiliary_loss_mlp": 0.01041963, "balance_loss_clip": 1.01483107, "balance_loss_mlp": 1.01597083, "epoch": 0.7846084473169999, "flos": 19353298561920.0, "grad_norm": 1.7080967357273877, "language_loss": 0.73565543, "learning_rate": 4.6713255585781454e-07, "loss": 0.7566148, "num_input_tokens_seen": 281530090, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.38085938, "step": 13050, "time_per_iteration": 2.362901449203491 }, { "auxiliary_loss_clip": 0.01051462, "auxiliary_loss_mlp": 0.01042362, "balance_loss_clip": 1.02072561, "balance_loss_mlp": 1.01639581, "epoch": 0.7846685705696678, "flos": 23324843738880.0, "grad_norm": 2.0862481426322366, "language_loss": 0.75144833, "learning_rate": 4.668824245713825e-07, "loss": 0.77238655, "num_input_tokens_seen": 281547075, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.34960938, "step": 13051, "time_per_iteration": 2.401057243347168 }, { "auxiliary_loss_clip": 0.01053745, "auxiliary_loss_mlp": 0.0103906, "balance_loss_clip": 1.01226163, "balance_loss_mlp": 1.01703787, "epoch": 0.7847286938223358, "flos": 35808073672320.0, "grad_norm": 2.882832962588486, "language_loss": 0.73877299, "learning_rate": 4.666323514209227e-07, "loss": 0.75970101, "num_input_tokens_seen": 281568080, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.3671875, "step": 13052, "time_per_iteration": 2.4924302101135254 }, { "auxiliary_loss_clip": 0.01048885, "auxiliary_loss_mlp": 0.01036316, "balance_loss_clip": 1.0149653, "balance_loss_mlp": 1.0150131, "epoch": 0.7847888170750038, "flos": 18477188104320.0, "grad_norm": 1.8583694523886467, "language_loss": 0.70631516, "learning_rate": 4.663823364159183e-07, "loss": 0.72716719, "num_input_tokens_seen": 281586925, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.33984375, "step": 13053, "time_per_iteration": 2.485466241836548 }, { "auxiliary_loss_clip": 0.0105021, "auxiliary_loss_mlp": 0.01031526, "balance_loss_clip": 1.00992513, "balance_loss_mlp": 1.01619518, "epoch": 0.7848489403276717, "flos": 25117982634240.0, "grad_norm": 2.0774545171421432, "language_loss": 0.70989227, "learning_rate": 4.6613237956584893e-07, "loss": 0.73070961, "num_input_tokens_seen": 281603915, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.33984375, "step": 13054, "time_per_iteration": 2.4205853939056396 }, { "auxiliary_loss_clip": 0.01054518, "auxiliary_loss_mlp": 0.01041165, "balance_loss_clip": 1.01613069, "balance_loss_mlp": 1.01658106, "epoch": 0.7849090635803397, "flos": 26501311457280.0, "grad_norm": 1.5500339872307878, "language_loss": 0.76782793, "learning_rate": 4.658824808801938e-07, "loss": 0.78878474, "num_input_tokens_seen": 281624220, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37890625, "step": 13055, "time_per_iteration": 2.450080633163452 }, { "auxiliary_loss_clip": 0.01056146, "auxiliary_loss_mlp": 0.01042025, "balance_loss_clip": 1.01546526, "balance_loss_mlp": 1.01693654, "epoch": 0.7849691868330076, "flos": 20958605009280.0, "grad_norm": 1.9502113610033256, "language_loss": 0.7578243, "learning_rate": 4.656326403684283e-07, "loss": 0.77880597, "num_input_tokens_seen": 281642325, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39257812, "step": 13056, "time_per_iteration": 2.3737223148345947 }, { "auxiliary_loss_clip": 0.01051977, "auxiliary_loss_mlp": 0.01034538, "balance_loss_clip": 1.01063669, "balance_loss_mlp": 1.01622415, "epoch": 0.7850293100856757, "flos": 26066293516800.0, "grad_norm": 1.642859046430362, "language_loss": 0.71396005, "learning_rate": 4.6538285804002744e-07, "loss": 0.73482519, "num_input_tokens_seen": 281663065, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35742188, "step": 13057, "time_per_iteration": 2.432526111602783 }, { "auxiliary_loss_clip": 0.01050699, "auxiliary_loss_mlp": 0.01031606, "balance_loss_clip": 1.00913465, "balance_loss_mlp": 1.01560473, "epoch": 0.7850894333383436, "flos": 22490803336320.0, "grad_norm": 1.7718411885756047, "language_loss": 0.7796911, "learning_rate": 4.6513313390446175e-07, "loss": 0.80051422, "num_input_tokens_seen": 281681005, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3515625, "step": 13058, "time_per_iteration": 2.3655998706817627 }, { "auxiliary_loss_clip": 0.01053516, "auxiliary_loss_mlp": 0.01043427, "balance_loss_clip": 1.01891708, "balance_loss_mlp": 1.01745391, "epoch": 0.7851495565910116, "flos": 20557592599680.0, "grad_norm": 2.0340615255799634, "language_loss": 0.71981347, "learning_rate": 4.6488346797120146e-07, "loss": 0.74078292, "num_input_tokens_seen": 281697965, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.36132812, "step": 13059, "time_per_iteration": 2.421283721923828 }, { "auxiliary_loss_clip": 0.01055414, "auxiliary_loss_mlp": 0.01042103, "balance_loss_clip": 1.01591253, "balance_loss_mlp": 1.01746058, "epoch": 0.7852096798436795, "flos": 15923919888000.0, "grad_norm": 1.8584801608138541, "language_loss": 0.77628922, "learning_rate": 4.646338602497144e-07, "loss": 0.7972644, "num_input_tokens_seen": 281716035, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37890625, "step": 13060, "time_per_iteration": 2.3508081436157227 }, { "auxiliary_loss_clip": 0.01054091, "auxiliary_loss_mlp": 0.0103922, "balance_loss_clip": 1.01391172, "balance_loss_mlp": 1.01754785, "epoch": 0.7852698030963475, "flos": 19061285016960.0, "grad_norm": 2.5356908410048664, "language_loss": 0.77512443, "learning_rate": 4.643843107494654e-07, "loss": 0.79605758, "num_input_tokens_seen": 281732815, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36523438, "step": 13061, "time_per_iteration": 2.350389003753662 }, { "auxiliary_loss_clip": 0.01051785, "auxiliary_loss_mlp": 0.01040406, "balance_loss_clip": 1.01632524, "balance_loss_mlp": 1.01584339, "epoch": 0.7853299263490154, "flos": 24643233699840.0, "grad_norm": 1.893948364582214, "language_loss": 0.75009966, "learning_rate": 4.641348194799164e-07, "loss": 0.77102149, "num_input_tokens_seen": 281751980, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.359375, "step": 13062, "time_per_iteration": 2.380682945251465 }, { "auxiliary_loss_clip": 0.01049909, "auxiliary_loss_mlp": 0.01034685, "balance_loss_clip": 1.01372766, "balance_loss_mlp": 1.01567841, "epoch": 0.7853900496016835, "flos": 22016892274560.0, "grad_norm": 1.5440720420859588, "language_loss": 0.70105278, "learning_rate": 4.638853864505297e-07, "loss": 0.72189879, "num_input_tokens_seen": 281772670, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34375, "step": 13063, "time_per_iteration": 2.388108968734741 }, { "auxiliary_loss_clip": 0.01052838, "auxiliary_loss_mlp": 0.0104299, "balance_loss_clip": 1.02008963, "balance_loss_mlp": 1.01771057, "epoch": 0.7854501728543514, "flos": 30226090078080.0, "grad_norm": 1.864747014705704, "language_loss": 0.74627858, "learning_rate": 4.636360116707625e-07, "loss": 0.76723683, "num_input_tokens_seen": 281792930, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.3515625, "step": 13064, "time_per_iteration": 2.4215452671051025 }, { "auxiliary_loss_clip": 0.01054541, "auxiliary_loss_mlp": 0.01038582, "balance_loss_clip": 1.0153842, "balance_loss_mlp": 1.01733482, "epoch": 0.7855102961070194, "flos": 18842693794560.0, "grad_norm": 1.5710335813462553, "language_loss": 0.68899775, "learning_rate": 4.633866951500718e-07, "loss": 0.70992899, "num_input_tokens_seen": 281811805, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37109375, "step": 13065, "time_per_iteration": 3.5785601139068604 }, { "auxiliary_loss_clip": 0.01052898, "auxiliary_loss_mlp": 0.01042975, "balance_loss_clip": 1.01968169, "balance_loss_mlp": 1.01690495, "epoch": 0.7855704193596874, "flos": 22308870908160.0, "grad_norm": 1.7667515823612099, "language_loss": 0.77220309, "learning_rate": 4.6313743689791196e-07, "loss": 0.79316175, "num_input_tokens_seen": 281831885, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 13066, "time_per_iteration": 2.36228084564209 }, { "auxiliary_loss_clip": 0.01008124, "auxiliary_loss_mlp": 0.01003202, "balance_loss_clip": 1.00081742, "balance_loss_mlp": 1.00112677, "epoch": 0.7856305426123553, "flos": 60001136035200.0, "grad_norm": 0.8813964842510358, "language_loss": 0.5343895, "learning_rate": 4.628882369237346e-07, "loss": 0.55450279, "num_input_tokens_seen": 281900310, "router_z_loss_clip": 0.02380371, "router_z_loss_mlp": 0.0703125, "step": 13067, "time_per_iteration": 3.0697269439697266 }, { "auxiliary_loss_clip": 0.01051177, "auxiliary_loss_mlp": 0.0104374, "balance_loss_clip": 1.02020812, "balance_loss_mlp": 1.01545763, "epoch": 0.7856906658650233, "flos": 21867603834240.0, "grad_norm": 1.4626506944138644, "language_loss": 0.6804359, "learning_rate": 4.62639095236989e-07, "loss": 0.70138502, "num_input_tokens_seen": 281918870, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.35742188, "step": 13068, "time_per_iteration": 2.3634679317474365 }, { "auxiliary_loss_clip": 0.01051294, "auxiliary_loss_mlp": 0.01033551, "balance_loss_clip": 1.01173544, "balance_loss_mlp": 1.01660693, "epoch": 0.7857507891176913, "flos": 23621814696960.0, "grad_norm": 1.7463223755337627, "language_loss": 0.6906597, "learning_rate": 4.6239001184712267e-07, "loss": 0.71150815, "num_input_tokens_seen": 281936905, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.34765625, "step": 13069, "time_per_iteration": 2.388317346572876 }, { "auxiliary_loss_clip": 0.01052699, "auxiliary_loss_mlp": 0.01035632, "balance_loss_clip": 1.01230288, "balance_loss_mlp": 1.01705956, "epoch": 0.7858109123703593, "flos": 25518890309760.0, "grad_norm": 1.6791370314961107, "language_loss": 0.77150494, "learning_rate": 4.6214098676358195e-07, "loss": 0.7923882, "num_input_tokens_seen": 281955625, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.35546875, "step": 13070, "time_per_iteration": 2.383430004119873 }, { "auxiliary_loss_clip": 0.0105129, "auxiliary_loss_mlp": 0.01034561, "balance_loss_clip": 1.01245999, "balance_loss_mlp": 1.01617754, "epoch": 0.7858710356230272, "flos": 17456432417280.0, "grad_norm": 1.639937926779383, "language_loss": 0.6651777, "learning_rate": 4.618920199958083e-07, "loss": 0.68603623, "num_input_tokens_seen": 281973285, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3515625, "step": 13071, "time_per_iteration": 2.3524727821350098 }, { "auxiliary_loss_clip": 0.01052099, "auxiliary_loss_mlp": 0.01037699, "balance_loss_clip": 1.01422644, "balance_loss_mlp": 1.01630378, "epoch": 0.7859311588756952, "flos": 24678565862400.0, "grad_norm": 1.5455397300355838, "language_loss": 0.75047386, "learning_rate": 4.616431115532442e-07, "loss": 0.77137184, "num_input_tokens_seen": 281991410, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35742188, "step": 13072, "time_per_iteration": 3.8536064624786377 }, { "auxiliary_loss_clip": 0.01052764, "auxiliary_loss_mlp": 0.01038558, "balance_loss_clip": 1.0122962, "balance_loss_mlp": 1.01658773, "epoch": 0.7859912821283631, "flos": 21798056672640.0, "grad_norm": 1.7122432014676567, "language_loss": 0.72359234, "learning_rate": 4.613942614453268e-07, "loss": 0.74450552, "num_input_tokens_seen": 282010845, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.36328125, "step": 13073, "time_per_iteration": 3.6731374263763428 }, { "auxiliary_loss_clip": 0.0105179, "auxiliary_loss_mlp": 0.01036385, "balance_loss_clip": 1.01246011, "balance_loss_mlp": 1.01631045, "epoch": 0.7860514053810311, "flos": 20846324476800.0, "grad_norm": 1.6080883264708346, "language_loss": 0.77651227, "learning_rate": 4.611454696814938e-07, "loss": 0.79739398, "num_input_tokens_seen": 282029635, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.35546875, "step": 13074, "time_per_iteration": 2.369936227798462 }, { "auxiliary_loss_clip": 0.01049639, "auxiliary_loss_mlp": 0.01033, "balance_loss_clip": 1.01168501, "balance_loss_mlp": 1.01566434, "epoch": 0.786111528633699, "flos": 24314561360640.0, "grad_norm": 1.8461845892796624, "language_loss": 0.76040113, "learning_rate": 4.608967362711782e-07, "loss": 0.78122747, "num_input_tokens_seen": 282050285, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.33984375, "step": 13075, "time_per_iteration": 2.3769848346710205 }, { "auxiliary_loss_clip": 0.01051077, "auxiliary_loss_mlp": 0.01033018, "balance_loss_clip": 1.01200175, "balance_loss_mlp": 1.01603723, "epoch": 0.7861716518863671, "flos": 24352023116160.0, "grad_norm": 1.7402161765821813, "language_loss": 0.70226657, "learning_rate": 4.6064806122381283e-07, "loss": 0.72310752, "num_input_tokens_seen": 282071040, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.34960938, "step": 13076, "time_per_iteration": 2.408473491668701 }, { "auxiliary_loss_clip": 0.01050048, "auxiliary_loss_mlp": 0.01038516, "balance_loss_clip": 1.01474595, "balance_loss_mlp": 1.01575065, "epoch": 0.786231775139035, "flos": 14021677393920.0, "grad_norm": 1.967780386936859, "language_loss": 0.8166151, "learning_rate": 4.603994445488282e-07, "loss": 0.83750069, "num_input_tokens_seen": 282086610, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.34375, "step": 13077, "time_per_iteration": 2.347256660461426 }, { "auxiliary_loss_clip": 0.01051062, "auxiliary_loss_mlp": 0.01034915, "balance_loss_clip": 1.01294422, "balance_loss_mlp": 1.01581311, "epoch": 0.786291898391703, "flos": 33722991054720.0, "grad_norm": 1.682118932510479, "language_loss": 0.71205139, "learning_rate": 4.6015088625564956e-07, "loss": 0.73291117, "num_input_tokens_seen": 282107440, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.3515625, "step": 13078, "time_per_iteration": 2.5068202018737793 }, { "auxiliary_loss_clip": 0.0105126, "auxiliary_loss_mlp": 0.01037442, "balance_loss_clip": 1.01466119, "balance_loss_mlp": 1.01640201, "epoch": 0.786352021644371, "flos": 25810310361600.0, "grad_norm": 1.5264657681525122, "language_loss": 0.82308853, "learning_rate": 4.599023863537039e-07, "loss": 0.84397554, "num_input_tokens_seen": 282127290, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.34765625, "step": 13079, "time_per_iteration": 2.382204294204712 }, { "auxiliary_loss_clip": 0.01049543, "auxiliary_loss_mlp": 0.01035588, "balance_loss_clip": 1.01411879, "balance_loss_mlp": 1.01614285, "epoch": 0.7864121448970389, "flos": 28909620241920.0, "grad_norm": 1.5966149409264057, "language_loss": 0.69008911, "learning_rate": 4.596539448524146e-07, "loss": 0.71094042, "num_input_tokens_seen": 282147505, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.33398438, "step": 13080, "time_per_iteration": 2.4558568000793457 }, { "auxiliary_loss_clip": 0.01052192, "auxiliary_loss_mlp": 0.01036175, "balance_loss_clip": 1.01321507, "balance_loss_mlp": 1.01587784, "epoch": 0.7864722681497069, "flos": 19207815459840.0, "grad_norm": 1.6437292341142, "language_loss": 0.71028119, "learning_rate": 4.594055617612016e-07, "loss": 0.73116481, "num_input_tokens_seen": 282166450, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36328125, "step": 13081, "time_per_iteration": 2.4095816612243652 }, { "auxiliary_loss_clip": 0.01050751, "auxiliary_loss_mlp": 0.01037603, "balance_loss_clip": 1.01548982, "balance_loss_mlp": 1.01567686, "epoch": 0.7865323914023749, "flos": 21870501477120.0, "grad_norm": 1.4814050838637678, "language_loss": 0.69421005, "learning_rate": 4.591572370894838e-07, "loss": 0.71509361, "num_input_tokens_seen": 282186465, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3515625, "step": 13082, "time_per_iteration": 2.4049739837646484 }, { "auxiliary_loss_clip": 0.01052916, "auxiliary_loss_mlp": 0.01037688, "balance_loss_clip": 1.01578879, "balance_loss_mlp": 1.01679003, "epoch": 0.7865925146550429, "flos": 25519134689280.0, "grad_norm": 1.8120468943595218, "language_loss": 0.67559838, "learning_rate": 4.589089708466789e-07, "loss": 0.69650447, "num_input_tokens_seen": 282207180, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.36132812, "step": 13083, "time_per_iteration": 2.396791934967041 }, { "auxiliary_loss_clip": 0.01054612, "auxiliary_loss_mlp": 0.01039331, "balance_loss_clip": 1.01452363, "balance_loss_mlp": 1.01739001, "epoch": 0.7866526379077108, "flos": 19096407711360.0, "grad_norm": 2.256877418901593, "language_loss": 0.76725686, "learning_rate": 4.5866076304220015e-07, "loss": 0.78819633, "num_input_tokens_seen": 282225865, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37304688, "step": 13084, "time_per_iteration": 2.3946433067321777 }, { "auxiliary_loss_clip": 0.01051233, "auxiliary_loss_mlp": 0.0103679, "balance_loss_clip": 1.01557124, "balance_loss_mlp": 1.01687729, "epoch": 0.7867127611603788, "flos": 16173025505280.0, "grad_norm": 1.8368751499841338, "language_loss": 0.71262664, "learning_rate": 4.584126136854591e-07, "loss": 0.73350692, "num_input_tokens_seen": 282242895, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34375, "step": 13085, "time_per_iteration": 3.7150564193725586 }, { "auxiliary_loss_clip": 0.0105199, "auxiliary_loss_mlp": 0.01036232, "balance_loss_clip": 1.0124017, "balance_loss_mlp": 1.01566529, "epoch": 0.7867728844130467, "flos": 20772692686080.0, "grad_norm": 1.9572648997454458, "language_loss": 0.73083544, "learning_rate": 4.5816452278586617e-07, "loss": 0.75171769, "num_input_tokens_seen": 282260425, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36328125, "step": 13086, "time_per_iteration": 2.3497791290283203 }, { "auxiliary_loss_clip": 0.01050966, "auxiliary_loss_mlp": 0.01033785, "balance_loss_clip": 1.01330495, "balance_loss_mlp": 1.0162704, "epoch": 0.7868330076657147, "flos": 21759093728640.0, "grad_norm": 1.7871805258435856, "language_loss": 0.75335598, "learning_rate": 4.5791649035282965e-07, "loss": 0.77420354, "num_input_tokens_seen": 282279335, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.34765625, "step": 13087, "time_per_iteration": 2.35309100151062 }, { "auxiliary_loss_clip": 0.01051605, "auxiliary_loss_mlp": 0.01037349, "balance_loss_clip": 1.01533115, "balance_loss_mlp": 1.01623464, "epoch": 0.7868931309183826, "flos": 25699565928960.0, "grad_norm": 1.5893351560019862, "language_loss": 0.71819979, "learning_rate": 4.5766851639575456e-07, "loss": 0.73908931, "num_input_tokens_seen": 282299905, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35351562, "step": 13088, "time_per_iteration": 2.3978638648986816 }, { "auxiliary_loss_clip": 0.01007937, "auxiliary_loss_mlp": 0.01001784, "balance_loss_clip": 0.99976945, "balance_loss_mlp": 1.00116611, "epoch": 0.7869532541710507, "flos": 64641267348480.0, "grad_norm": 0.6733849772074044, "language_loss": 0.55560607, "learning_rate": 4.574206009240431e-07, "loss": 0.57570332, "num_input_tokens_seen": 282367620, "router_z_loss_clip": 0.0201416, "router_z_loss_mlp": 0.06787109, "step": 13089, "time_per_iteration": 3.0756068229675293 }, { "auxiliary_loss_clip": 0.01008047, "auxiliary_loss_mlp": 0.0100223, "balance_loss_clip": 0.99998885, "balance_loss_mlp": 1.00120878, "epoch": 0.7870133774237186, "flos": 67449925226880.0, "grad_norm": 0.7326027158982577, "language_loss": 0.50175047, "learning_rate": 4.571727439470976e-07, "loss": 0.52185327, "num_input_tokens_seen": 282435695, "router_z_loss_clip": 0.02246094, "router_z_loss_mlp": 0.06835938, "step": 13090, "time_per_iteration": 3.1257426738739014 }, { "auxiliary_loss_clip": 0.01049743, "auxiliary_loss_mlp": 0.01035522, "balance_loss_clip": 1.01427865, "balance_loss_mlp": 1.01571727, "epoch": 0.7870735006763866, "flos": 26067096478080.0, "grad_norm": 1.7013846994246846, "language_loss": 0.84805304, "learning_rate": 4.5692494547431583e-07, "loss": 0.86890566, "num_input_tokens_seen": 282456025, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.33984375, "step": 13091, "time_per_iteration": 2.420112133026123 }, { "auxiliary_loss_clip": 0.01007986, "auxiliary_loss_mlp": 0.01002369, "balance_loss_clip": 1.00004411, "balance_loss_mlp": 1.0012598, "epoch": 0.7871336239290546, "flos": 70286095123200.0, "grad_norm": 0.7179978777224387, "language_loss": 0.63997459, "learning_rate": 4.566772055150947e-07, "loss": 0.66007817, "num_input_tokens_seen": 282520995, "router_z_loss_clip": 0.02319336, "router_z_loss_mlp": 0.06738281, "step": 13092, "time_per_iteration": 3.0552520751953125 }, { "auxiliary_loss_clip": 0.01053069, "auxiliary_loss_mlp": 0.01043049, "balance_loss_clip": 1.0189929, "balance_loss_mlp": 1.01686287, "epoch": 0.7871937471817225, "flos": 15777668736000.0, "grad_norm": 1.9819361630003371, "language_loss": 0.80222678, "learning_rate": 4.564295240788285e-07, "loss": 0.82318795, "num_input_tokens_seen": 282539355, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36132812, "step": 13093, "time_per_iteration": 2.3523361682891846 }, { "auxiliary_loss_clip": 0.01050786, "auxiliary_loss_mlp": 0.01032363, "balance_loss_clip": 1.01073861, "balance_loss_mlp": 1.01637375, "epoch": 0.7872538704343905, "flos": 20484205188480.0, "grad_norm": 2.0353476908739125, "language_loss": 0.77170861, "learning_rate": 4.561819011749106e-07, "loss": 0.79254007, "num_input_tokens_seen": 282555735, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.34375, "step": 13094, "time_per_iteration": 2.3489818572998047 }, { "auxiliary_loss_clip": 0.01053139, "auxiliary_loss_mlp": 0.01036671, "balance_loss_clip": 1.01458097, "balance_loss_mlp": 1.01714551, "epoch": 0.7873139936870585, "flos": 25081498396800.0, "grad_norm": 1.7904680846948264, "language_loss": 0.80082059, "learning_rate": 4.5593433681272884e-07, "loss": 0.82171869, "num_input_tokens_seen": 282574550, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.359375, "step": 13095, "time_per_iteration": 2.4295496940612793 }, { "auxiliary_loss_clip": 0.01053859, "auxiliary_loss_mlp": 0.01037782, "balance_loss_clip": 1.01434588, "balance_loss_mlp": 1.01679885, "epoch": 0.7873741169397265, "flos": 30881863745280.0, "grad_norm": 1.9429240520599127, "language_loss": 0.69096887, "learning_rate": 4.556868310016715e-07, "loss": 0.71188533, "num_input_tokens_seen": 282596520, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.37109375, "step": 13096, "time_per_iteration": 2.439267158508301 }, { "auxiliary_loss_clip": 0.0104867, "auxiliary_loss_mlp": 0.01030657, "balance_loss_clip": 1.0107131, "balance_loss_mlp": 1.01505208, "epoch": 0.7874342401923944, "flos": 46790178255360.0, "grad_norm": 1.9710723788122153, "language_loss": 0.70784414, "learning_rate": 4.55439383751125e-07, "loss": 0.72863746, "num_input_tokens_seen": 282620560, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.3359375, "step": 13097, "time_per_iteration": 2.6201696395874023 }, { "auxiliary_loss_clip": 0.01052939, "auxiliary_loss_mlp": 0.01037454, "balance_loss_clip": 1.01435137, "balance_loss_mlp": 1.01712251, "epoch": 0.7874943634450624, "flos": 23583480157440.0, "grad_norm": 1.687857701535057, "language_loss": 0.82022941, "learning_rate": 4.5519199507047126e-07, "loss": 0.84113336, "num_input_tokens_seen": 282639830, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.359375, "step": 13098, "time_per_iteration": 2.383824586868286 }, { "auxiliary_loss_clip": 0.01050305, "auxiliary_loss_mlp": 0.0103287, "balance_loss_clip": 1.01130509, "balance_loss_mlp": 1.01599538, "epoch": 0.7875544866977303, "flos": 20190201696000.0, "grad_norm": 2.4211965624443605, "language_loss": 0.7542699, "learning_rate": 4.5494466496909177e-07, "loss": 0.77510166, "num_input_tokens_seen": 282660130, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.34375, "step": 13099, "time_per_iteration": 2.421232223510742 }, { "auxiliary_loss_clip": 0.01051914, "auxiliary_loss_mlp": 0.01036868, "balance_loss_clip": 1.01242995, "balance_loss_mlp": 1.016325, "epoch": 0.7876146099503983, "flos": 22601443034880.0, "grad_norm": 1.5558773667089265, "language_loss": 0.78647089, "learning_rate": 4.5469739345636603e-07, "loss": 0.80735874, "num_input_tokens_seen": 282681125, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.35546875, "step": 13100, "time_per_iteration": 2.3794920444488525 }, { "auxiliary_loss_clip": 0.01056916, "auxiliary_loss_mlp": 0.01035098, "balance_loss_clip": 1.0096823, "balance_loss_mlp": 1.01769781, "epoch": 0.7876747332030662, "flos": 10705102922880.0, "grad_norm": 2.2445722479293457, "language_loss": 0.67953789, "learning_rate": 4.5445018054167007e-07, "loss": 0.70045799, "num_input_tokens_seen": 282696690, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.390625, "step": 13101, "time_per_iteration": 2.3516077995300293 }, { "auxiliary_loss_clip": 0.01050548, "auxiliary_loss_mlp": 0.0104208, "balance_loss_clip": 1.01979971, "balance_loss_mlp": 1.015396, "epoch": 0.7877348564557343, "flos": 38397791214720.0, "grad_norm": 1.4086819059027418, "language_loss": 0.78668201, "learning_rate": 4.5420302623437745e-07, "loss": 0.80760825, "num_input_tokens_seen": 282721210, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3515625, "step": 13102, "time_per_iteration": 2.527534246444702 }, { "auxiliary_loss_clip": 0.01052072, "auxiliary_loss_mlp": 0.01036087, "balance_loss_clip": 1.01473689, "balance_loss_mlp": 1.01659536, "epoch": 0.7877949797084022, "flos": 18328632802560.0, "grad_norm": 1.926526288258514, "language_loss": 0.8301053, "learning_rate": 4.5395593054386093e-07, "loss": 0.8509869, "num_input_tokens_seen": 282738505, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.35546875, "step": 13103, "time_per_iteration": 2.3296546936035156 }, { "auxiliary_loss_clip": 0.01054394, "auxiliary_loss_mlp": 0.01040131, "balance_loss_clip": 1.01668227, "balance_loss_mlp": 1.01791251, "epoch": 0.7878551029610702, "flos": 25805702062080.0, "grad_norm": 2.1766963061682394, "language_loss": 0.8184312, "learning_rate": 4.537088934794913e-07, "loss": 0.83937645, "num_input_tokens_seen": 282756895, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36523438, "step": 13104, "time_per_iteration": 2.4109456539154053 }, { "auxiliary_loss_clip": 0.01051675, "auxiliary_loss_mlp": 0.01038935, "balance_loss_clip": 1.01681018, "balance_loss_mlp": 1.01577902, "epoch": 0.7879152262137382, "flos": 22341689452800.0, "grad_norm": 2.3284621814727737, "language_loss": 0.74878001, "learning_rate": 4.5346191505063515e-07, "loss": 0.7696861, "num_input_tokens_seen": 282774955, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.359375, "step": 13105, "time_per_iteration": 3.590615749359131 }, { "auxiliary_loss_clip": 0.01053073, "auxiliary_loss_mlp": 0.01044349, "balance_loss_clip": 1.02011371, "balance_loss_mlp": 1.01622987, "epoch": 0.7879753494664061, "flos": 24784317970560.0, "grad_norm": 1.9370318236933564, "language_loss": 0.7664668, "learning_rate": 4.5321499526665776e-07, "loss": 0.78744096, "num_input_tokens_seen": 282793165, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36914062, "step": 13106, "time_per_iteration": 2.3771698474884033 }, { "auxiliary_loss_clip": 0.01053937, "auxiliary_loss_mlp": 0.01037198, "balance_loss_clip": 1.01250982, "balance_loss_mlp": 1.01758265, "epoch": 0.7880354727190741, "flos": 16908156426240.0, "grad_norm": 5.522013413253047, "language_loss": 0.74648869, "learning_rate": 4.5296813413692337e-07, "loss": 0.76740009, "num_input_tokens_seen": 282809820, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.36328125, "step": 13107, "time_per_iteration": 2.367156505584717 }, { "auxiliary_loss_clip": 0.01050116, "auxiliary_loss_mlp": 0.01037602, "balance_loss_clip": 1.01514304, "balance_loss_mlp": 1.01523352, "epoch": 0.7880955959717421, "flos": 22229583477120.0, "grad_norm": 2.43072378147805, "language_loss": 0.74471951, "learning_rate": 4.5272133167079165e-07, "loss": 0.76559663, "num_input_tokens_seen": 282828600, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34960938, "step": 13108, "time_per_iteration": 2.3572044372558594 }, { "auxiliary_loss_clip": 0.01007702, "auxiliary_loss_mlp": 0.01003476, "balance_loss_clip": 1.00136626, "balance_loss_mlp": 1.00091481, "epoch": 0.7881557192244101, "flos": 69180082162560.0, "grad_norm": 0.8823129055400022, "language_loss": 0.6038065, "learning_rate": 4.5247458787762216e-07, "loss": 0.62391818, "num_input_tokens_seen": 282882775, "router_z_loss_clip": 0.02111816, "router_z_loss_mlp": 0.06787109, "step": 13109, "time_per_iteration": 2.9582979679107666 }, { "auxiliary_loss_clip": 0.01049663, "auxiliary_loss_mlp": 0.01038901, "balance_loss_clip": 1.01807535, "balance_loss_mlp": 1.01560163, "epoch": 0.788215842477078, "flos": 24934304638080.0, "grad_norm": 1.9327957359709333, "language_loss": 0.73287439, "learning_rate": 4.5222790276677126e-07, "loss": 0.75376004, "num_input_tokens_seen": 282902680, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.33984375, "step": 13110, "time_per_iteration": 2.4374961853027344 }, { "auxiliary_loss_clip": 0.01050012, "auxiliary_loss_mlp": 0.01032559, "balance_loss_clip": 1.01160204, "balance_loss_mlp": 1.0160749, "epoch": 0.788275965729746, "flos": 26105221549440.0, "grad_norm": 1.3631490689235013, "language_loss": 0.75903934, "learning_rate": 4.5198127634759455e-07, "loss": 0.77986503, "num_input_tokens_seen": 282923625, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.33984375, "step": 13111, "time_per_iteration": 2.447052240371704 }, { "auxiliary_loss_clip": 0.01051959, "auxiliary_loss_mlp": 0.0103882, "balance_loss_clip": 1.01622963, "balance_loss_mlp": 1.01672256, "epoch": 0.7883360889824139, "flos": 21213750291840.0, "grad_norm": 1.8219600402557927, "language_loss": 0.62956607, "learning_rate": 4.5173470862944206e-07, "loss": 0.65047389, "num_input_tokens_seen": 282941955, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 13112, "time_per_iteration": 5.254083156585693 }, { "auxiliary_loss_clip": 0.01051069, "auxiliary_loss_mlp": 0.01038308, "balance_loss_clip": 1.01621878, "balance_loss_mlp": 1.01495183, "epoch": 0.7883962122350819, "flos": 21141480044160.0, "grad_norm": 2.340802663422589, "language_loss": 0.68977153, "learning_rate": 4.514881996216644e-07, "loss": 0.71066523, "num_input_tokens_seen": 282961280, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.36132812, "step": 13113, "time_per_iteration": 2.383671522140503 }, { "auxiliary_loss_clip": 0.01051387, "auxiliary_loss_mlp": 0.01036388, "balance_loss_clip": 1.01327312, "balance_loss_mlp": 1.01669097, "epoch": 0.7884563354877498, "flos": 15302047017600.0, "grad_norm": 2.776431184247072, "language_loss": 0.60928011, "learning_rate": 4.5124174933361e-07, "loss": 0.63015789, "num_input_tokens_seen": 282978210, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.34765625, "step": 13114, "time_per_iteration": 2.38053297996521 }, { "auxiliary_loss_clip": 0.01053374, "auxiliary_loss_mlp": 0.01041548, "balance_loss_clip": 1.01719308, "balance_loss_mlp": 1.01708031, "epoch": 0.7885164587404179, "flos": 24387180721920.0, "grad_norm": 1.621968881487676, "language_loss": 0.66944063, "learning_rate": 4.5099535777462306e-07, "loss": 0.69038975, "num_input_tokens_seen": 282998845, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36328125, "step": 13115, "time_per_iteration": 2.389820098876953 }, { "auxiliary_loss_clip": 0.01051934, "auxiliary_loss_mlp": 0.01035701, "balance_loss_clip": 1.01357603, "balance_loss_mlp": 1.01638472, "epoch": 0.7885765819930858, "flos": 14385193136640.0, "grad_norm": 1.8128310065291144, "language_loss": 0.88783896, "learning_rate": 4.50749024954048e-07, "loss": 0.90871525, "num_input_tokens_seen": 283015200, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.35546875, "step": 13116, "time_per_iteration": 2.376133441925049 }, { "auxiliary_loss_clip": 0.01056228, "auxiliary_loss_mlp": 0.01046346, "balance_loss_clip": 1.01978612, "balance_loss_mlp": 1.01709843, "epoch": 0.7886367052457538, "flos": 18258945995520.0, "grad_norm": 1.7954699259788616, "language_loss": 0.73776799, "learning_rate": 4.505027508812245e-07, "loss": 0.75879371, "num_input_tokens_seen": 283033680, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.390625, "step": 13117, "time_per_iteration": 2.3407537937164307 }, { "auxiliary_loss_clip": 0.01049885, "auxiliary_loss_mlp": 0.0103558, "balance_loss_clip": 1.01307368, "balance_loss_mlp": 1.01629186, "epoch": 0.7886968284984217, "flos": 15304176610560.0, "grad_norm": 1.4032688456367244, "language_loss": 0.80989122, "learning_rate": 4.502565355654926e-07, "loss": 0.83074582, "num_input_tokens_seen": 283050620, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3359375, "step": 13118, "time_per_iteration": 2.39129638671875 }, { "auxiliary_loss_clip": 0.01050735, "auxiliary_loss_mlp": 0.01034999, "balance_loss_clip": 1.01177669, "balance_loss_mlp": 1.01553559, "epoch": 0.7887569517510897, "flos": 21214378696320.0, "grad_norm": 1.6724015476361158, "language_loss": 0.74436247, "learning_rate": 4.500103790161878e-07, "loss": 0.76521981, "num_input_tokens_seen": 283070215, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.3515625, "step": 13119, "time_per_iteration": 2.38012957572937 }, { "auxiliary_loss_clip": 0.01053391, "auxiliary_loss_mlp": 0.01037631, "balance_loss_clip": 1.01389647, "balance_loss_mlp": 1.01701355, "epoch": 0.7888170750037578, "flos": 22710127697280.0, "grad_norm": 1.3864186895434452, "language_loss": 0.73204195, "learning_rate": 4.4976428124264454e-07, "loss": 0.75295216, "num_input_tokens_seen": 283091485, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36328125, "step": 13120, "time_per_iteration": 2.4412171840667725 }, { "auxiliary_loss_clip": 0.01051512, "auxiliary_loss_mlp": 0.01039938, "balance_loss_clip": 1.01607192, "balance_loss_mlp": 1.01585615, "epoch": 0.7888771982564257, "flos": 36427677304320.0, "grad_norm": 1.4864492503930322, "language_loss": 0.79700148, "learning_rate": 4.4951824225419564e-07, "loss": 0.81791598, "num_input_tokens_seen": 283115040, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35546875, "step": 13121, "time_per_iteration": 2.49934720993042 }, { "auxiliary_loss_clip": 0.01050276, "auxiliary_loss_mlp": 0.01035121, "balance_loss_clip": 1.01163697, "balance_loss_mlp": 1.01552427, "epoch": 0.7889373215090937, "flos": 27308712625920.0, "grad_norm": 1.3574115686135173, "language_loss": 0.80964649, "learning_rate": 4.4927226206017057e-07, "loss": 0.83050048, "num_input_tokens_seen": 283136925, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.34765625, "step": 13122, "time_per_iteration": 2.4726414680480957 }, { "auxiliary_loss_clip": 0.01051818, "auxiliary_loss_mlp": 0.01034779, "balance_loss_clip": 1.01288056, "balance_loss_mlp": 1.01553631, "epoch": 0.7889974447617616, "flos": 19827977673600.0, "grad_norm": 2.0469188013146358, "language_loss": 0.78674906, "learning_rate": 4.4902634066989597e-07, "loss": 0.80761504, "num_input_tokens_seen": 283155725, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.36328125, "step": 13123, "time_per_iteration": 2.359224319458008 }, { "auxiliary_loss_clip": 0.01053716, "auxiliary_loss_mlp": 0.01035754, "balance_loss_clip": 1.01044595, "balance_loss_mlp": 1.01669896, "epoch": 0.7890575680144296, "flos": 17270345537280.0, "grad_norm": 1.7961183703832058, "language_loss": 0.68260324, "learning_rate": 4.487804780926985e-07, "loss": 0.70349801, "num_input_tokens_seen": 283173845, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37109375, "step": 13124, "time_per_iteration": 2.350379467010498 }, { "auxiliary_loss_clip": 0.01053626, "auxiliary_loss_mlp": 0.01036183, "balance_loss_clip": 1.01166224, "balance_loss_mlp": 1.01704597, "epoch": 0.7891176912670975, "flos": 27598910780160.0, "grad_norm": 1.9592621888966097, "language_loss": 0.73419499, "learning_rate": 4.4853467433790036e-07, "loss": 0.7550931, "num_input_tokens_seen": 283191985, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.36523438, "step": 13125, "time_per_iteration": 3.906547784805298 }, { "auxiliary_loss_clip": 0.01052088, "auxiliary_loss_mlp": 0.0103561, "balance_loss_clip": 1.0127939, "balance_loss_mlp": 1.0152868, "epoch": 0.7891778145197655, "flos": 22710546633600.0, "grad_norm": 1.9006836912922955, "language_loss": 0.73861682, "learning_rate": 4.4828892941482267e-07, "loss": 0.75949383, "num_input_tokens_seen": 283210855, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3671875, "step": 13126, "time_per_iteration": 2.4091665744781494 }, { "auxiliary_loss_clip": 0.01053007, "auxiliary_loss_mlp": 0.01033098, "balance_loss_clip": 1.00777829, "balance_loss_mlp": 1.01608682, "epoch": 0.7892379377724335, "flos": 17309832151680.0, "grad_norm": 1.816202912893336, "language_loss": 0.77789676, "learning_rate": 4.480432433327845e-07, "loss": 0.79875779, "num_input_tokens_seen": 283229665, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36914062, "step": 13127, "time_per_iteration": 2.330759286880493 }, { "auxiliary_loss_clip": 0.01050681, "auxiliary_loss_mlp": 0.01036964, "balance_loss_clip": 1.01411164, "balance_loss_mlp": 1.01653779, "epoch": 0.7892980610251015, "flos": 25774489440000.0, "grad_norm": 1.5641376762579695, "language_loss": 0.86398518, "learning_rate": 4.47797616101103e-07, "loss": 0.88486159, "num_input_tokens_seen": 283248615, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34179688, "step": 13128, "time_per_iteration": 2.432013750076294 }, { "auxiliary_loss_clip": 0.01051161, "auxiliary_loss_mlp": 0.01039059, "balance_loss_clip": 1.01646852, "balance_loss_mlp": 1.01628184, "epoch": 0.7893581842777694, "flos": 21578871957120.0, "grad_norm": 2.099712731481739, "language_loss": 0.7067216, "learning_rate": 4.475520477290904e-07, "loss": 0.72762382, "num_input_tokens_seen": 283267135, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34960938, "step": 13129, "time_per_iteration": 2.36267352104187 }, { "auxiliary_loss_clip": 0.01008138, "auxiliary_loss_mlp": 0.0100193, "balance_loss_clip": 0.99984419, "balance_loss_mlp": 1.00113475, "epoch": 0.7894183075304374, "flos": 69012917239680.0, "grad_norm": 0.7239290699226015, "language_loss": 0.61704433, "learning_rate": 4.473065382260597e-07, "loss": 0.63714504, "num_input_tokens_seen": 283328940, "router_z_loss_clip": 0.02087402, "router_z_loss_mlp": 0.0703125, "step": 13130, "time_per_iteration": 3.018737316131592 }, { "auxiliary_loss_clip": 0.01053417, "auxiliary_loss_mlp": 0.01032529, "balance_loss_clip": 1.00959349, "balance_loss_mlp": 1.01755273, "epoch": 0.7894784307831053, "flos": 24242116556160.0, "grad_norm": 4.920897525876791, "language_loss": 0.74324864, "learning_rate": 4.4706108760132124e-07, "loss": 0.76410806, "num_input_tokens_seen": 283350000, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.359375, "step": 13131, "time_per_iteration": 2.4414162635803223 }, { "auxiliary_loss_clip": 0.01057637, "auxiliary_loss_mlp": 0.01036444, "balance_loss_clip": 1.00943112, "balance_loss_mlp": 1.01723003, "epoch": 0.7895385540357733, "flos": 20265509232000.0, "grad_norm": 2.9615169153554666, "language_loss": 0.71552098, "learning_rate": 4.4681569586418153e-07, "loss": 0.73646176, "num_input_tokens_seen": 283368020, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.40429688, "step": 13132, "time_per_iteration": 2.398754119873047 }, { "auxiliary_loss_clip": 0.0105509, "auxiliary_loss_mlp": 0.01046918, "balance_loss_clip": 1.01996517, "balance_loss_mlp": 1.01843858, "epoch": 0.7895986772884414, "flos": 20995508183040.0, "grad_norm": 1.864015286096193, "language_loss": 0.62986702, "learning_rate": 4.465703630239468e-07, "loss": 0.65088713, "num_input_tokens_seen": 283387030, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.3671875, "step": 13133, "time_per_iteration": 2.3838300704956055 }, { "auxiliary_loss_clip": 0.01055531, "auxiliary_loss_mlp": 0.01039823, "balance_loss_clip": 1.013978, "balance_loss_mlp": 1.0169853, "epoch": 0.7896588005411093, "flos": 18657095673600.0, "grad_norm": 2.0506530195686348, "language_loss": 0.81527114, "learning_rate": 4.463250890899195e-07, "loss": 0.83622468, "num_input_tokens_seen": 283402090, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38476562, "step": 13134, "time_per_iteration": 2.373037338256836 }, { "auxiliary_loss_clip": 0.01052759, "auxiliary_loss_mlp": 0.01040788, "balance_loss_clip": 1.01757765, "balance_loss_mlp": 1.0170486, "epoch": 0.7897189237937773, "flos": 18404917856640.0, "grad_norm": 1.7509394274774932, "language_loss": 0.81635243, "learning_rate": 4.460798740713998e-07, "loss": 0.8372879, "num_input_tokens_seen": 283421035, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35742188, "step": 13135, "time_per_iteration": 2.370577096939087 }, { "auxiliary_loss_clip": 0.01051384, "auxiliary_loss_mlp": 0.01041571, "balance_loss_clip": 1.01659632, "balance_loss_mlp": 1.01543379, "epoch": 0.7897790470464452, "flos": 23730499359360.0, "grad_norm": 1.692349122535223, "language_loss": 0.72926819, "learning_rate": 4.4583471797768733e-07, "loss": 0.75019777, "num_input_tokens_seen": 283441830, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.359375, "step": 13136, "time_per_iteration": 2.44268798828125 }, { "auxiliary_loss_clip": 0.01056598, "auxiliary_loss_mlp": 0.01042394, "balance_loss_clip": 1.01616836, "balance_loss_mlp": 1.01642692, "epoch": 0.7898391702991132, "flos": 15918194424960.0, "grad_norm": 2.220319343795838, "language_loss": 0.72598553, "learning_rate": 4.455896208180778e-07, "loss": 0.74697542, "num_input_tokens_seen": 283459540, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.40234375, "step": 13137, "time_per_iteration": 2.358534574508667 }, { "auxiliary_loss_clip": 0.01050756, "auxiliary_loss_mlp": 0.01039883, "balance_loss_clip": 1.01661336, "balance_loss_mlp": 1.01579583, "epoch": 0.7898992935517811, "flos": 19828012584960.0, "grad_norm": 1.8653329526671472, "language_loss": 0.75592989, "learning_rate": 4.4534458260186645e-07, "loss": 0.77683628, "num_input_tokens_seen": 283478790, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.34960938, "step": 13138, "time_per_iteration": 2.4070839881896973 }, { "auxiliary_loss_clip": 0.0105186, "auxiliary_loss_mlp": 0.0103681, "balance_loss_clip": 1.01386237, "balance_loss_mlp": 1.01561236, "epoch": 0.7899594168044491, "flos": 16215339939840.0, "grad_norm": 2.2085614442270125, "language_loss": 0.69705069, "learning_rate": 4.4509960333834426e-07, "loss": 0.71793735, "num_input_tokens_seen": 283495720, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.36328125, "step": 13139, "time_per_iteration": 2.3544600009918213 }, { "auxiliary_loss_clip": 0.01008043, "auxiliary_loss_mlp": 0.01004179, "balance_loss_clip": 1.0023309, "balance_loss_mlp": 1.00107837, "epoch": 0.790019540057117, "flos": 68327257582080.0, "grad_norm": 0.8741404112023985, "language_loss": 0.60288221, "learning_rate": 4.448546830368003e-07, "loss": 0.62300444, "num_input_tokens_seen": 283558795, "router_z_loss_clip": 0.01843262, "router_z_loss_mlp": 0.06933594, "step": 13140, "time_per_iteration": 3.1090009212493896 }, { "auxiliary_loss_clip": 0.01052773, "auxiliary_loss_mlp": 0.01040344, "balance_loss_clip": 1.01681221, "balance_loss_mlp": 1.01645684, "epoch": 0.7900796633097851, "flos": 30331562895360.0, "grad_norm": 1.609796185412189, "language_loss": 0.77899468, "learning_rate": 4.4460982170652304e-07, "loss": 0.7999258, "num_input_tokens_seen": 283579305, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36328125, "step": 13141, "time_per_iteration": 2.4656901359558105 }, { "auxiliary_loss_clip": 0.01054251, "auxiliary_loss_mlp": 0.01043354, "balance_loss_clip": 1.01978588, "balance_loss_mlp": 1.01691246, "epoch": 0.790139786562453, "flos": 22125716582400.0, "grad_norm": 1.9501693479290214, "language_loss": 0.69355023, "learning_rate": 4.4436501935679694e-07, "loss": 0.7145263, "num_input_tokens_seen": 283597840, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37304688, "step": 13142, "time_per_iteration": 2.3962888717651367 }, { "auxiliary_loss_clip": 0.01007714, "auxiliary_loss_mlp": 0.01002997, "balance_loss_clip": 1.00080395, "balance_loss_mlp": 1.00091171, "epoch": 0.790199909815121, "flos": 58204296535680.0, "grad_norm": 0.8128750405646906, "language_loss": 0.60113227, "learning_rate": 4.441202759969049e-07, "loss": 0.62123936, "num_input_tokens_seen": 283647950, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.06787109, "step": 13143, "time_per_iteration": 2.830991268157959 }, { "auxiliary_loss_clip": 0.01054602, "auxiliary_loss_mlp": 0.01036951, "balance_loss_clip": 1.01283526, "balance_loss_mlp": 1.01750004, "epoch": 0.7902600330677889, "flos": 34531858500480.0, "grad_norm": 1.8528931249460574, "language_loss": 0.75698566, "learning_rate": 4.4387559163612875e-07, "loss": 0.77790117, "num_input_tokens_seen": 283670645, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37109375, "step": 13144, "time_per_iteration": 2.5148165225982666 }, { "auxiliary_loss_clip": 0.01055378, "auxiliary_loss_mlp": 0.01039694, "balance_loss_clip": 1.0139209, "balance_loss_mlp": 1.01698279, "epoch": 0.7903201563204569, "flos": 22345285322880.0, "grad_norm": 1.891803898500093, "language_loss": 0.84017539, "learning_rate": 4.4363096628374605e-07, "loss": 0.86112607, "num_input_tokens_seen": 283688830, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3828125, "step": 13145, "time_per_iteration": 3.6872386932373047 }, { "auxiliary_loss_clip": 0.01048442, "auxiliary_loss_mlp": 0.01032072, "balance_loss_clip": 1.01142466, "balance_loss_mlp": 1.01441324, "epoch": 0.790380279573125, "flos": 22052468816640.0, "grad_norm": 1.627646293057472, "language_loss": 0.74121606, "learning_rate": 4.4338639994903235e-07, "loss": 0.76202118, "num_input_tokens_seen": 283708625, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.33984375, "step": 13146, "time_per_iteration": 2.386735439300537 }, { "auxiliary_loss_clip": 0.01053745, "auxiliary_loss_mlp": 0.01038038, "balance_loss_clip": 1.01394594, "balance_loss_mlp": 1.01623857, "epoch": 0.7904404028257929, "flos": 20301574533120.0, "grad_norm": 1.9101783126782728, "language_loss": 0.76474506, "learning_rate": 4.4314189264126246e-07, "loss": 0.78566289, "num_input_tokens_seen": 283725710, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.375, "step": 13147, "time_per_iteration": 2.3629133701324463 }, { "auxiliary_loss_clip": 0.01051712, "auxiliary_loss_mlp": 0.01042798, "balance_loss_clip": 1.01759684, "balance_loss_mlp": 1.01596212, "epoch": 0.7905005260784609, "flos": 20007955065600.0, "grad_norm": 1.8147438417060116, "language_loss": 0.73103535, "learning_rate": 4.428974443697087e-07, "loss": 0.75198054, "num_input_tokens_seen": 283744150, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.35742188, "step": 13148, "time_per_iteration": 2.386448621749878 }, { "auxiliary_loss_clip": 0.01051397, "auxiliary_loss_mlp": 0.01038726, "balance_loss_clip": 1.01564717, "balance_loss_mlp": 1.01483488, "epoch": 0.7905606493311288, "flos": 26904732750720.0, "grad_norm": 1.6104705326619932, "language_loss": 0.72916663, "learning_rate": 4.4265305514363913e-07, "loss": 0.75006783, "num_input_tokens_seen": 283764170, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36523438, "step": 13149, "time_per_iteration": 2.4476826190948486 }, { "auxiliary_loss_clip": 0.01054639, "auxiliary_loss_mlp": 0.01041772, "balance_loss_clip": 1.01481915, "balance_loss_mlp": 1.01753139, "epoch": 0.7906207725837968, "flos": 23695097374080.0, "grad_norm": 1.9280106282514626, "language_loss": 0.66010129, "learning_rate": 4.424087249723225e-07, "loss": 0.68106538, "num_input_tokens_seen": 283784305, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.37109375, "step": 13150, "time_per_iteration": 2.3981781005859375 }, { "auxiliary_loss_clip": 0.01051524, "auxiliary_loss_mlp": 0.01034258, "balance_loss_clip": 1.01157272, "balance_loss_mlp": 1.01590824, "epoch": 0.7906808958364647, "flos": 20847825665280.0, "grad_norm": 1.739017235685906, "language_loss": 0.71336436, "learning_rate": 4.421644538650231e-07, "loss": 0.73422223, "num_input_tokens_seen": 283804040, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35546875, "step": 13151, "time_per_iteration": 3.807577610015869 }, { "auxiliary_loss_clip": 0.01054872, "auxiliary_loss_mlp": 0.01047052, "balance_loss_clip": 1.02067113, "balance_loss_mlp": 1.01691103, "epoch": 0.7907410190891327, "flos": 40733585372160.0, "grad_norm": 1.7948704825507629, "language_loss": 0.71017283, "learning_rate": 4.4192024183100306e-07, "loss": 0.73119205, "num_input_tokens_seen": 283827120, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.37890625, "step": 13152, "time_per_iteration": 3.8467752933502197 }, { "auxiliary_loss_clip": 0.01052528, "auxiliary_loss_mlp": 0.01034565, "balance_loss_clip": 1.01147389, "balance_loss_mlp": 1.01713204, "epoch": 0.7908011423418007, "flos": 13260326175360.0, "grad_norm": 1.7719191729595685, "language_loss": 0.73787349, "learning_rate": 4.4167608887952367e-07, "loss": 0.75874436, "num_input_tokens_seen": 283844820, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.35546875, "step": 13153, "time_per_iteration": 2.3916752338409424 }, { "auxiliary_loss_clip": 0.0105195, "auxiliary_loss_mlp": 0.01032056, "balance_loss_clip": 1.00866699, "balance_loss_mlp": 1.01597345, "epoch": 0.7908612655944687, "flos": 19753752389760.0, "grad_norm": 1.5204414732063258, "language_loss": 0.79609656, "learning_rate": 4.4143199501984306e-07, "loss": 0.81693661, "num_input_tokens_seen": 283862870, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.359375, "step": 13154, "time_per_iteration": 2.3500359058380127 }, { "auxiliary_loss_clip": 0.01056239, "auxiliary_loss_mlp": 0.01039596, "balance_loss_clip": 1.01307154, "balance_loss_mlp": 1.01737535, "epoch": 0.7909213888471366, "flos": 21286683855360.0, "grad_norm": 2.122793219092856, "language_loss": 0.7122348, "learning_rate": 4.411879602612185e-07, "loss": 0.73319316, "num_input_tokens_seen": 283882405, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.38867188, "step": 13155, "time_per_iteration": 2.38899827003479 }, { "auxiliary_loss_clip": 0.0105269, "auxiliary_loss_mlp": 0.01032168, "balance_loss_clip": 1.00829017, "balance_loss_mlp": 1.01635623, "epoch": 0.7909815120998046, "flos": 22527776332800.0, "grad_norm": 1.5589271657483865, "language_loss": 0.78022236, "learning_rate": 4.4094398461290174e-07, "loss": 0.80107093, "num_input_tokens_seen": 283902070, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36328125, "step": 13156, "time_per_iteration": 2.364368200302124 }, { "auxiliary_loss_clip": 0.01051596, "auxiliary_loss_mlp": 0.01032094, "balance_loss_clip": 1.00939655, "balance_loss_mlp": 1.01607168, "epoch": 0.7910416353524725, "flos": 26726396192640.0, "grad_norm": 2.91416012422953, "language_loss": 0.67056084, "learning_rate": 4.4070006808414526e-07, "loss": 0.69139779, "num_input_tokens_seen": 283924100, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35546875, "step": 13157, "time_per_iteration": 2.50380539894104 }, { "auxiliary_loss_clip": 0.01053651, "auxiliary_loss_mlp": 0.01040102, "balance_loss_clip": 1.01595044, "balance_loss_mlp": 1.01715994, "epoch": 0.7911017586051405, "flos": 24643687547520.0, "grad_norm": 1.7318591129986136, "language_loss": 0.75544858, "learning_rate": 4.4045621068419894e-07, "loss": 0.77638614, "num_input_tokens_seen": 283944955, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36523438, "step": 13158, "time_per_iteration": 2.393826723098755 }, { "auxiliary_loss_clip": 0.01049007, "auxiliary_loss_mlp": 0.01032112, "balance_loss_clip": 1.01213253, "balance_loss_mlp": 1.01501441, "epoch": 0.7911618818578086, "flos": 17564558497920.0, "grad_norm": 2.0311819075175648, "language_loss": 0.68602896, "learning_rate": 4.40212412422309e-07, "loss": 0.7068401, "num_input_tokens_seen": 283963125, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.33984375, "step": 13159, "time_per_iteration": 2.4017014503479004 }, { "auxiliary_loss_clip": 0.01052432, "auxiliary_loss_mlp": 0.01035878, "balance_loss_clip": 1.01352644, "balance_loss_mlp": 1.01697421, "epoch": 0.7912220051104765, "flos": 16720882560000.0, "grad_norm": 1.689647334024438, "language_loss": 0.6791535, "learning_rate": 4.399686733077206e-07, "loss": 0.70003664, "num_input_tokens_seen": 283982850, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35546875, "step": 13160, "time_per_iteration": 2.3631880283355713 }, { "auxiliary_loss_clip": 0.01049237, "auxiliary_loss_mlp": 0.01033414, "balance_loss_clip": 1.01261234, "balance_loss_mlp": 1.01541138, "epoch": 0.7912821283631445, "flos": 13697892645120.0, "grad_norm": 1.9281286032409604, "language_loss": 0.73644626, "learning_rate": 4.3972499334967694e-07, "loss": 0.75727272, "num_input_tokens_seen": 283998275, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.33789062, "step": 13161, "time_per_iteration": 2.4395992755889893 }, { "auxiliary_loss_clip": 0.01051033, "auxiliary_loss_mlp": 0.01034225, "balance_loss_clip": 1.01022804, "balance_loss_mlp": 1.01586318, "epoch": 0.7913422516158124, "flos": 23767891292160.0, "grad_norm": 1.5900258742175382, "language_loss": 0.74707425, "learning_rate": 4.39481372557418e-07, "loss": 0.76792681, "num_input_tokens_seen": 284018750, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3515625, "step": 13162, "time_per_iteration": 2.383955240249634 }, { "auxiliary_loss_clip": 0.01053095, "auxiliary_loss_mlp": 0.01038359, "balance_loss_clip": 1.01362371, "balance_loss_mlp": 1.01621687, "epoch": 0.7914023748684804, "flos": 19937220917760.0, "grad_norm": 1.6632050340241853, "language_loss": 0.72456253, "learning_rate": 4.392378109401811e-07, "loss": 0.74547708, "num_input_tokens_seen": 284037850, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36914062, "step": 13163, "time_per_iteration": 2.4014806747436523 }, { "auxiliary_loss_clip": 0.01051113, "auxiliary_loss_mlp": 0.01036701, "balance_loss_clip": 1.01306164, "balance_loss_mlp": 1.01515412, "epoch": 0.7914624981211483, "flos": 20593762634880.0, "grad_norm": 1.9813852279033242, "language_loss": 0.71193159, "learning_rate": 4.3899430850720296e-07, "loss": 0.73280978, "num_input_tokens_seen": 284056380, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.359375, "step": 13164, "time_per_iteration": 2.3362114429473877 }, { "auxiliary_loss_clip": 0.01051464, "auxiliary_loss_mlp": 0.01038655, "balance_loss_clip": 1.01490903, "balance_loss_mlp": 1.01555693, "epoch": 0.7915226213738163, "flos": 21798370874880.0, "grad_norm": 1.953204233391065, "language_loss": 0.68153822, "learning_rate": 4.387508652677177e-07, "loss": 0.70243943, "num_input_tokens_seen": 284074945, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.359375, "step": 13165, "time_per_iteration": 3.770310401916504 }, { "auxiliary_loss_clip": 0.0104885, "auxiliary_loss_mlp": 0.01034252, "balance_loss_clip": 1.01286542, "balance_loss_mlp": 1.01490402, "epoch": 0.7915827446264843, "flos": 16287470542080.0, "grad_norm": 1.7626309783929734, "language_loss": 0.7333138, "learning_rate": 4.385074812309557e-07, "loss": 0.75414479, "num_input_tokens_seen": 284092070, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.33984375, "step": 13166, "time_per_iteration": 2.315321445465088 }, { "auxiliary_loss_clip": 0.01050645, "auxiliary_loss_mlp": 0.01039993, "balance_loss_clip": 1.01486409, "balance_loss_mlp": 1.01478457, "epoch": 0.7916428678791523, "flos": 25701416231040.0, "grad_norm": 1.7400472570786014, "language_loss": 0.78686684, "learning_rate": 4.382641564061462e-07, "loss": 0.80777323, "num_input_tokens_seen": 284112255, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.359375, "step": 13167, "time_per_iteration": 2.410093307495117 }, { "auxiliary_loss_clip": 0.0105081, "auxiliary_loss_mlp": 0.01034342, "balance_loss_clip": 1.01218152, "balance_loss_mlp": 1.01597726, "epoch": 0.7917029911318202, "flos": 23877378915840.0, "grad_norm": 1.5988320687027526, "language_loss": 0.85265124, "learning_rate": 4.3802089080251713e-07, "loss": 0.87350273, "num_input_tokens_seen": 284132330, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34960938, "step": 13168, "time_per_iteration": 2.3919177055358887 }, { "auxiliary_loss_clip": 0.010527, "auxiliary_loss_mlp": 0.01034525, "balance_loss_clip": 1.01030159, "balance_loss_mlp": 1.0162816, "epoch": 0.7917631143844882, "flos": 21645696032640.0, "grad_norm": 2.631962691081977, "language_loss": 0.73616087, "learning_rate": 4.3777768442929155e-07, "loss": 0.75703311, "num_input_tokens_seen": 284150640, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36523438, "step": 13169, "time_per_iteration": 2.361884832382202 }, { "auxiliary_loss_clip": 0.01054635, "auxiliary_loss_mlp": 0.01042422, "balance_loss_clip": 1.01543295, "balance_loss_mlp": 1.01675665, "epoch": 0.7918232376371561, "flos": 38872644883200.0, "grad_norm": 1.6941845569344933, "language_loss": 0.68614775, "learning_rate": 4.3753453729569287e-07, "loss": 0.70711833, "num_input_tokens_seen": 284171910, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.37890625, "step": 13170, "time_per_iteration": 2.5291528701782227 }, { "auxiliary_loss_clip": 0.01052629, "auxiliary_loss_mlp": 0.01032841, "balance_loss_clip": 1.01132393, "balance_loss_mlp": 1.01582301, "epoch": 0.7918833608898241, "flos": 20774542988160.0, "grad_norm": 1.6173816799506826, "language_loss": 0.71457916, "learning_rate": 4.372914494109412e-07, "loss": 0.73543388, "num_input_tokens_seen": 284191340, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.36914062, "step": 13171, "time_per_iteration": 2.386094570159912 }, { "auxiliary_loss_clip": 0.01051212, "auxiliary_loss_mlp": 0.0103765, "balance_loss_clip": 1.01281857, "balance_loss_mlp": 1.01565385, "epoch": 0.7919434841424922, "flos": 33908763732480.0, "grad_norm": 1.8872248507030527, "language_loss": 0.68349189, "learning_rate": 4.370484207842553e-07, "loss": 0.70438051, "num_input_tokens_seen": 284212495, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.35546875, "step": 13172, "time_per_iteration": 2.457155227661133 }, { "auxiliary_loss_clip": 0.01051473, "auxiliary_loss_mlp": 0.01035106, "balance_loss_clip": 1.01208663, "balance_loss_mlp": 1.01571465, "epoch": 0.7920036073951601, "flos": 21063728712960.0, "grad_norm": 1.6971220705413375, "language_loss": 0.80506819, "learning_rate": 4.3680545142484893e-07, "loss": 0.82593405, "num_input_tokens_seen": 284230825, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35742188, "step": 13173, "time_per_iteration": 2.4437193870544434 }, { "auxiliary_loss_clip": 0.01052051, "auxiliary_loss_mlp": 0.01037253, "balance_loss_clip": 1.01479387, "balance_loss_mlp": 1.01621366, "epoch": 0.7920637306478281, "flos": 23654947443840.0, "grad_norm": 1.8152328390984418, "language_loss": 0.78048539, "learning_rate": 4.365625413419365e-07, "loss": 0.80137849, "num_input_tokens_seen": 284250365, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.359375, "step": 13174, "time_per_iteration": 2.4004623889923096 }, { "auxiliary_loss_clip": 0.01049307, "auxiliary_loss_mlp": 0.0103834, "balance_loss_clip": 1.01597619, "balance_loss_mlp": 1.01531458, "epoch": 0.792123853900496, "flos": 27194302500480.0, "grad_norm": 1.7631516706074781, "language_loss": 0.72776866, "learning_rate": 4.363196905447297e-07, "loss": 0.74864513, "num_input_tokens_seen": 284269635, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.33984375, "step": 13175, "time_per_iteration": 2.570491075515747 }, { "auxiliary_loss_clip": 0.01052528, "auxiliary_loss_mlp": 0.01040632, "balance_loss_clip": 1.01652837, "balance_loss_mlp": 1.01613426, "epoch": 0.792183977153164, "flos": 19097664520320.0, "grad_norm": 2.920604200740116, "language_loss": 0.60317653, "learning_rate": 4.360768990424364e-07, "loss": 0.62410814, "num_input_tokens_seen": 284288380, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36328125, "step": 13176, "time_per_iteration": 2.3686742782592773 }, { "auxiliary_loss_clip": 0.01052966, "auxiliary_loss_mlp": 0.01037515, "balance_loss_clip": 1.01419783, "balance_loss_mlp": 1.01693892, "epoch": 0.7922441004058319, "flos": 17127899723520.0, "grad_norm": 1.6907674350315618, "language_loss": 0.74645841, "learning_rate": 4.3583416684426376e-07, "loss": 0.76736331, "num_input_tokens_seen": 284306920, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 13177, "time_per_iteration": 2.3941731452941895 }, { "auxiliary_loss_clip": 0.01052038, "auxiliary_loss_mlp": 0.01038522, "balance_loss_clip": 1.0147754, "balance_loss_mlp": 1.01713979, "epoch": 0.7923042236585, "flos": 17820681298560.0, "grad_norm": 8.830392110926448, "language_loss": 0.65161747, "learning_rate": 4.355914939594174e-07, "loss": 0.67252302, "num_input_tokens_seen": 284324700, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.34765625, "step": 13178, "time_per_iteration": 2.3704211711883545 }, { "auxiliary_loss_clip": 0.01051816, "auxiliary_loss_mlp": 0.01033197, "balance_loss_clip": 1.01218033, "balance_loss_mlp": 1.01617765, "epoch": 0.7923643469111679, "flos": 29933901976320.0, "grad_norm": 1.480724941203536, "language_loss": 0.69477057, "learning_rate": 4.3534888039709726e-07, "loss": 0.7156207, "num_input_tokens_seen": 284345985, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.35546875, "step": 13179, "time_per_iteration": 2.4881720542907715 }, { "auxiliary_loss_clip": 0.01050888, "auxiliary_loss_mlp": 0.01034533, "balance_loss_clip": 1.01145411, "balance_loss_mlp": 1.01535177, "epoch": 0.7924244701638359, "flos": 22673608548480.0, "grad_norm": 2.1054216243464645, "language_loss": 0.75616181, "learning_rate": 4.3510632616650444e-07, "loss": 0.77701604, "num_input_tokens_seen": 284364475, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35546875, "step": 13180, "time_per_iteration": 2.3702096939086914 }, { "auxiliary_loss_clip": 0.01053816, "auxiliary_loss_mlp": 0.01036502, "balance_loss_clip": 1.01268411, "balance_loss_mlp": 1.01677394, "epoch": 0.7924845934165038, "flos": 17967176830080.0, "grad_norm": 2.0059046304641543, "language_loss": 0.82907581, "learning_rate": 4.3486383127683646e-07, "loss": 0.84997892, "num_input_tokens_seen": 284382125, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 13181, "time_per_iteration": 2.3908777236938477 }, { "auxiliary_loss_clip": 0.0105154, "auxiliary_loss_mlp": 0.01042335, "balance_loss_clip": 1.01804042, "balance_loss_mlp": 1.01534986, "epoch": 0.7925447166691718, "flos": 23475842835840.0, "grad_norm": 1.8147211796278988, "language_loss": 0.7834174, "learning_rate": 4.346213957372895e-07, "loss": 0.80435622, "num_input_tokens_seen": 284401585, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36132812, "step": 13182, "time_per_iteration": 2.4420676231384277 }, { "auxiliary_loss_clip": 0.01056353, "auxiliary_loss_mlp": 0.01042337, "balance_loss_clip": 1.01596785, "balance_loss_mlp": 1.01723135, "epoch": 0.7926048399218397, "flos": 20446568876160.0, "grad_norm": 2.412028183117106, "language_loss": 0.75988305, "learning_rate": 4.34379019557056e-07, "loss": 0.78086996, "num_input_tokens_seen": 284419125, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.390625, "step": 13183, "time_per_iteration": 2.4970364570617676 }, { "auxiliary_loss_clip": 0.01050601, "auxiliary_loss_mlp": 0.01035294, "balance_loss_clip": 1.01122618, "balance_loss_mlp": 1.01485634, "epoch": 0.7926649631745077, "flos": 37158514128000.0, "grad_norm": 2.696979578853507, "language_loss": 0.70025992, "learning_rate": 4.341367027453264e-07, "loss": 0.72111887, "num_input_tokens_seen": 284440445, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.35742188, "step": 13184, "time_per_iteration": 3.7900891304016113 }, { "auxiliary_loss_clip": 0.01053778, "auxiliary_loss_mlp": 0.01037399, "balance_loss_clip": 1.014117, "balance_loss_mlp": 1.0167284, "epoch": 0.7927250864271758, "flos": 17017678961280.0, "grad_norm": 2.0346207222938477, "language_loss": 0.72058332, "learning_rate": 4.338944453112907e-07, "loss": 0.74149507, "num_input_tokens_seen": 284459370, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37109375, "step": 13185, "time_per_iteration": 2.3930413722991943 }, { "auxiliary_loss_clip": 0.01053, "auxiliary_loss_mlp": 0.01043321, "balance_loss_clip": 1.01770246, "balance_loss_mlp": 1.01564586, "epoch": 0.7927852096798437, "flos": 17748236494080.0, "grad_norm": 2.002771972067062, "language_loss": 0.66977918, "learning_rate": 4.3365224726413375e-07, "loss": 0.69074237, "num_input_tokens_seen": 284477525, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37304688, "step": 13186, "time_per_iteration": 2.347581148147583 }, { "auxiliary_loss_clip": 0.01050671, "auxiliary_loss_mlp": 0.01037771, "balance_loss_clip": 1.01689792, "balance_loss_mlp": 1.01623154, "epoch": 0.7928453329325117, "flos": 23837403542400.0, "grad_norm": 1.802596826220055, "language_loss": 0.77500284, "learning_rate": 4.334101086130408e-07, "loss": 0.79588729, "num_input_tokens_seen": 284496590, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34375, "step": 13187, "time_per_iteration": 2.410280227661133 }, { "auxiliary_loss_clip": 0.01050975, "auxiliary_loss_mlp": 0.01034864, "balance_loss_clip": 1.01333523, "balance_loss_mlp": 1.01600718, "epoch": 0.7929054561851796, "flos": 17454023533440.0, "grad_norm": 2.223922955777871, "language_loss": 0.7349422, "learning_rate": 4.3316802936719334e-07, "loss": 0.7558006, "num_input_tokens_seen": 284511470, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34960938, "step": 13188, "time_per_iteration": 2.3225107192993164 }, { "auxiliary_loss_clip": 0.01054417, "auxiliary_loss_mlp": 0.01045602, "balance_loss_clip": 1.01945901, "balance_loss_mlp": 1.01634967, "epoch": 0.7929655794378476, "flos": 21980198568960.0, "grad_norm": 2.5216683712881562, "language_loss": 0.64399487, "learning_rate": 4.329260095357725e-07, "loss": 0.66499507, "num_input_tokens_seen": 284531125, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38085938, "step": 13189, "time_per_iteration": 2.436572551727295 }, { "auxiliary_loss_clip": 0.01051396, "auxiliary_loss_mlp": 0.01038648, "balance_loss_clip": 1.01522374, "balance_loss_mlp": 1.01637387, "epoch": 0.7930257026905155, "flos": 17272998800640.0, "grad_norm": 1.8553171385029397, "language_loss": 0.74057662, "learning_rate": 4.3268404912795307e-07, "loss": 0.76147711, "num_input_tokens_seen": 284549340, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3515625, "step": 13190, "time_per_iteration": 2.3367631435394287 }, { "auxiliary_loss_clip": 0.01049571, "auxiliary_loss_mlp": 0.01036428, "balance_loss_clip": 1.01580477, "balance_loss_mlp": 1.01567268, "epoch": 0.7930858259431836, "flos": 27299565849600.0, "grad_norm": 1.739541764862739, "language_loss": 0.7394352, "learning_rate": 4.3244214815291166e-07, "loss": 0.76029515, "num_input_tokens_seen": 284567060, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.33984375, "step": 13191, "time_per_iteration": 3.831942319869995 }, { "auxiliary_loss_clip": 0.01050869, "auxiliary_loss_mlp": 0.01040921, "balance_loss_clip": 1.01699543, "balance_loss_mlp": 1.01522946, "epoch": 0.7931459491958515, "flos": 19862751254400.0, "grad_norm": 1.7816878512190157, "language_loss": 0.70457256, "learning_rate": 4.322003066198219e-07, "loss": 0.72549045, "num_input_tokens_seen": 284586600, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.35546875, "step": 13192, "time_per_iteration": 3.713895797729492 }, { "auxiliary_loss_clip": 0.01052523, "auxiliary_loss_mlp": 0.01034705, "balance_loss_clip": 1.01266336, "balance_loss_mlp": 1.01630509, "epoch": 0.7932060724485195, "flos": 23146053333120.0, "grad_norm": 1.7455021671000863, "language_loss": 0.75942492, "learning_rate": 4.3195852453785274e-07, "loss": 0.78029728, "num_input_tokens_seen": 284605715, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.36328125, "step": 13193, "time_per_iteration": 2.3818862438201904 }, { "auxiliary_loss_clip": 0.01053614, "auxiliary_loss_mlp": 0.01039789, "balance_loss_clip": 1.01456392, "balance_loss_mlp": 1.01764631, "epoch": 0.7932661957011874, "flos": 29933552862720.0, "grad_norm": 1.8368595266389598, "language_loss": 0.73221481, "learning_rate": 4.317168019161741e-07, "loss": 0.75314885, "num_input_tokens_seen": 284628540, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.359375, "step": 13194, "time_per_iteration": 2.4834189414978027 }, { "auxiliary_loss_clip": 0.01053973, "auxiliary_loss_mlp": 0.01038791, "balance_loss_clip": 1.01297045, "balance_loss_mlp": 1.016186, "epoch": 0.7933263189538554, "flos": 22558185993600.0, "grad_norm": 2.0182067654354228, "language_loss": 0.70719105, "learning_rate": 4.314751387639517e-07, "loss": 0.72811866, "num_input_tokens_seen": 284646040, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37890625, "step": 13195, "time_per_iteration": 2.393211841583252 }, { "auxiliary_loss_clip": 0.01051682, "auxiliary_loss_mlp": 0.01035093, "balance_loss_clip": 1.01188302, "balance_loss_mlp": 1.01615477, "epoch": 0.7933864422065233, "flos": 25478007240960.0, "grad_norm": 1.5707810157651836, "language_loss": 0.77736688, "learning_rate": 4.3123353509034844e-07, "loss": 0.79823458, "num_input_tokens_seen": 284665110, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35546875, "step": 13196, "time_per_iteration": 2.426161050796509 }, { "auxiliary_loss_clip": 0.01054214, "auxiliary_loss_mlp": 0.01041642, "balance_loss_clip": 1.01617837, "balance_loss_mlp": 1.01699054, "epoch": 0.7934465654591913, "flos": 33581767138560.0, "grad_norm": 1.5579550222133354, "language_loss": 0.69811469, "learning_rate": 4.309919909045268e-07, "loss": 0.7190733, "num_input_tokens_seen": 284686515, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37109375, "step": 13197, "time_per_iteration": 2.5184993743896484 }, { "auxiliary_loss_clip": 0.01051739, "auxiliary_loss_mlp": 0.01034847, "balance_loss_clip": 1.01223278, "balance_loss_mlp": 1.01618254, "epoch": 0.7935066887118594, "flos": 31431152165760.0, "grad_norm": 2.3523200804655473, "language_loss": 0.65952468, "learning_rate": 4.30750506215646e-07, "loss": 0.68039048, "num_input_tokens_seen": 284707300, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35546875, "step": 13198, "time_per_iteration": 2.476816415786743 }, { "auxiliary_loss_clip": 0.01056227, "auxiliary_loss_mlp": 0.01044083, "balance_loss_clip": 1.01820266, "balance_loss_mlp": 1.01829696, "epoch": 0.7935668119645273, "flos": 14681780069760.0, "grad_norm": 2.0238564274784, "language_loss": 0.73973191, "learning_rate": 4.30509081032864e-07, "loss": 0.76073503, "num_input_tokens_seen": 284723545, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37890625, "step": 13199, "time_per_iteration": 2.3761396408081055 }, { "auxiliary_loss_clip": 0.01052558, "auxiliary_loss_mlp": 0.01040566, "balance_loss_clip": 1.01660466, "balance_loss_mlp": 1.01632595, "epoch": 0.7936269352171953, "flos": 18003277042560.0, "grad_norm": 2.040327663530415, "language_loss": 0.81556559, "learning_rate": 4.302677153653349e-07, "loss": 0.83649677, "num_input_tokens_seen": 284742650, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36132812, "step": 13200, "time_per_iteration": 2.4006898403167725 }, { "auxiliary_loss_clip": 0.01051348, "auxiliary_loss_mlp": 0.01038035, "balance_loss_clip": 1.01517045, "balance_loss_mlp": 1.01633179, "epoch": 0.7936870584698632, "flos": 18879212943360.0, "grad_norm": 1.796680248725001, "language_loss": 0.77799594, "learning_rate": 4.3002640922221077e-07, "loss": 0.79888976, "num_input_tokens_seen": 284760955, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34960938, "step": 13201, "time_per_iteration": 2.387350082397461 }, { "auxiliary_loss_clip": 0.01051333, "auxiliary_loss_mlp": 0.01039683, "balance_loss_clip": 1.01644921, "balance_loss_mlp": 1.01588511, "epoch": 0.7937471817225312, "flos": 23365901364480.0, "grad_norm": 1.6711730482115423, "language_loss": 0.67435521, "learning_rate": 4.2978516261264296e-07, "loss": 0.69526529, "num_input_tokens_seen": 284780745, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35546875, "step": 13202, "time_per_iteration": 2.4267404079437256 }, { "auxiliary_loss_clip": 0.01053334, "auxiliary_loss_mlp": 0.01035419, "balance_loss_clip": 1.0111239, "balance_loss_mlp": 1.01617336, "epoch": 0.7938073049751991, "flos": 22673329257600.0, "grad_norm": 1.9564292706442066, "language_loss": 0.75462556, "learning_rate": 4.2954397554577884e-07, "loss": 0.77551305, "num_input_tokens_seen": 284799000, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37109375, "step": 13203, "time_per_iteration": 2.391225576400757 }, { "auxiliary_loss_clip": 0.01051256, "auxiliary_loss_mlp": 0.01035277, "balance_loss_clip": 1.01195955, "balance_loss_mlp": 1.01566947, "epoch": 0.7938674282278672, "flos": 22850478829440.0, "grad_norm": 1.9098061085753986, "language_loss": 0.67777848, "learning_rate": 4.293028480307643e-07, "loss": 0.6986438, "num_input_tokens_seen": 284817450, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.35546875, "step": 13204, "time_per_iteration": 2.3808417320251465 }, { "auxiliary_loss_clip": 0.01050124, "auxiliary_loss_mlp": 0.01042127, "balance_loss_clip": 1.01898825, "balance_loss_mlp": 1.01518679, "epoch": 0.7939275514805351, "flos": 27011392554240.0, "grad_norm": 1.408836384406725, "language_loss": 0.79867774, "learning_rate": 4.290617800767438e-07, "loss": 0.81960022, "num_input_tokens_seen": 284838865, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.34765625, "step": 13205, "time_per_iteration": 3.8605754375457764 }, { "auxiliary_loss_clip": 0.01049723, "auxiliary_loss_mlp": 0.01039267, "balance_loss_clip": 1.0168314, "balance_loss_mlp": 1.014606, "epoch": 0.7939876747332031, "flos": 21141759335040.0, "grad_norm": 1.8606133237099647, "language_loss": 0.78945887, "learning_rate": 4.28820771692858e-07, "loss": 0.81034875, "num_input_tokens_seen": 284857975, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3515625, "step": 13206, "time_per_iteration": 2.447221517562866 }, { "auxiliary_loss_clip": 0.01055401, "auxiliary_loss_mlp": 0.01040109, "balance_loss_clip": 1.01328743, "balance_loss_mlp": 1.01742601, "epoch": 0.794047797985871, "flos": 23288115121920.0, "grad_norm": 2.3552185659632117, "language_loss": 0.8065694, "learning_rate": 4.285798228882456e-07, "loss": 0.82752448, "num_input_tokens_seen": 284877145, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.37890625, "step": 13207, "time_per_iteration": 2.405446767807007 }, { "auxiliary_loss_clip": 0.01051711, "auxiliary_loss_mlp": 0.01038212, "balance_loss_clip": 1.0155623, "balance_loss_mlp": 1.01556909, "epoch": 0.794107921238539, "flos": 24606924019200.0, "grad_norm": 1.8478018583175289, "language_loss": 0.84746504, "learning_rate": 4.2833893367204375e-07, "loss": 0.86836427, "num_input_tokens_seen": 284895560, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36132812, "step": 13208, "time_per_iteration": 2.4263768196105957 }, { "auxiliary_loss_clip": 0.01007821, "auxiliary_loss_mlp": 0.01003184, "balance_loss_clip": 1.00091875, "balance_loss_mlp": 1.00110936, "epoch": 0.7941680444912069, "flos": 64090198448640.0, "grad_norm": 0.775098410322917, "language_loss": 0.58350468, "learning_rate": 4.280981040533875e-07, "loss": 0.60361469, "num_input_tokens_seen": 284963135, "router_z_loss_clip": 0.02270508, "router_z_loss_mlp": 0.06738281, "step": 13209, "time_per_iteration": 3.1642396450042725 }, { "auxiliary_loss_clip": 0.01054896, "auxiliary_loss_mlp": 0.01040202, "balance_loss_clip": 1.01476312, "balance_loss_mlp": 1.01740599, "epoch": 0.794228167743875, "flos": 24387704392320.0, "grad_norm": 2.173564310863978, "language_loss": 0.64073908, "learning_rate": 4.2785733404140825e-07, "loss": 0.66169012, "num_input_tokens_seen": 284981755, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.375, "step": 13210, "time_per_iteration": 2.427150249481201 }, { "auxiliary_loss_clip": 0.01051567, "auxiliary_loss_mlp": 0.01041571, "balance_loss_clip": 1.02011323, "balance_loss_mlp": 1.01584697, "epoch": 0.794288290996543, "flos": 28511226184320.0, "grad_norm": 1.5172336907186137, "language_loss": 0.70157486, "learning_rate": 4.2761662364523676e-07, "loss": 0.72250617, "num_input_tokens_seen": 285003060, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.35742188, "step": 13211, "time_per_iteration": 2.430877923965454 }, { "auxiliary_loss_clip": 0.01054366, "auxiliary_loss_mlp": 0.01047301, "balance_loss_clip": 1.02134967, "balance_loss_mlp": 1.01683474, "epoch": 0.7943484142492109, "flos": 25920915148800.0, "grad_norm": 1.6605743558221266, "language_loss": 0.73420042, "learning_rate": 4.2737597287400074e-07, "loss": 0.75521708, "num_input_tokens_seen": 285021640, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.375, "step": 13212, "time_per_iteration": 2.422306776046753 }, { "auxiliary_loss_clip": 0.01050819, "auxiliary_loss_mlp": 0.0103315, "balance_loss_clip": 1.01150155, "balance_loss_mlp": 1.01629496, "epoch": 0.7944085375018789, "flos": 23914142444160.0, "grad_norm": 1.5723543833328346, "language_loss": 0.80848604, "learning_rate": 4.271353817368246e-07, "loss": 0.82932574, "num_input_tokens_seen": 285040490, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34570312, "step": 13213, "time_per_iteration": 2.3897955417633057 }, { "auxiliary_loss_clip": 0.010548, "auxiliary_loss_mlp": 0.01037432, "balance_loss_clip": 1.01213622, "balance_loss_mlp": 1.01679349, "epoch": 0.7944686607545468, "flos": 20228920260480.0, "grad_norm": 2.389444456671732, "language_loss": 0.69294846, "learning_rate": 4.268948502428327e-07, "loss": 0.71387076, "num_input_tokens_seen": 285059270, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37890625, "step": 13214, "time_per_iteration": 2.365450859069824 }, { "auxiliary_loss_clip": 0.01050161, "auxiliary_loss_mlp": 0.01032444, "balance_loss_clip": 1.01050961, "balance_loss_mlp": 1.01525354, "epoch": 0.7945287840072148, "flos": 21979919278080.0, "grad_norm": 1.756461279648147, "language_loss": 0.73187661, "learning_rate": 4.2665437840114535e-07, "loss": 0.75270265, "num_input_tokens_seen": 285075390, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34960938, "step": 13215, "time_per_iteration": 2.3719539642333984 }, { "auxiliary_loss_clip": 0.01052655, "auxiliary_loss_mlp": 0.0103528, "balance_loss_clip": 1.01303554, "balance_loss_mlp": 1.01691866, "epoch": 0.7945889072598827, "flos": 26396467044480.0, "grad_norm": 1.5997530441770806, "language_loss": 0.79929084, "learning_rate": 4.2641396622088253e-07, "loss": 0.82017016, "num_input_tokens_seen": 285096290, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35742188, "step": 13216, "time_per_iteration": 2.4333584308624268 }, { "auxiliary_loss_clip": 0.01051795, "auxiliary_loss_mlp": 0.01036447, "balance_loss_clip": 1.01314199, "balance_loss_mlp": 1.01576269, "epoch": 0.7946490305125508, "flos": 25809123375360.0, "grad_norm": 1.831357188272053, "language_loss": 0.74851483, "learning_rate": 4.261736137111598e-07, "loss": 0.76939726, "num_input_tokens_seen": 285116020, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 13217, "time_per_iteration": 2.3970069885253906 }, { "auxiliary_loss_clip": 0.01050269, "auxiliary_loss_mlp": 0.01037234, "balance_loss_clip": 1.01591933, "balance_loss_mlp": 1.01602292, "epoch": 0.7947091537652187, "flos": 15960055011840.0, "grad_norm": 1.897438560728471, "language_loss": 0.7477839, "learning_rate": 4.259333208810907e-07, "loss": 0.76865888, "num_input_tokens_seen": 285133510, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34375, "step": 13218, "time_per_iteration": 2.3656723499298096 }, { "auxiliary_loss_clip": 0.01054954, "auxiliary_loss_mlp": 0.01047184, "balance_loss_clip": 1.02247202, "balance_loss_mlp": 1.0168153, "epoch": 0.7947692770178867, "flos": 18586885196160.0, "grad_norm": 1.933882970918329, "language_loss": 0.85320532, "learning_rate": 4.2569308773978817e-07, "loss": 0.87422669, "num_input_tokens_seen": 285151690, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3828125, "step": 13219, "time_per_iteration": 2.340045690536499 }, { "auxiliary_loss_clip": 0.01055467, "auxiliary_loss_mlp": 0.01041109, "balance_loss_clip": 1.01361918, "balance_loss_mlp": 1.01762938, "epoch": 0.7948294002705546, "flos": 20441367083520.0, "grad_norm": 1.9232535381441154, "language_loss": 0.76423484, "learning_rate": 4.2545291429636123e-07, "loss": 0.7852006, "num_input_tokens_seen": 285170485, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.37890625, "step": 13220, "time_per_iteration": 2.3990962505340576 }, { "auxiliary_loss_clip": 0.01054244, "auxiliary_loss_mlp": 0.01038269, "balance_loss_clip": 1.01465368, "balance_loss_mlp": 1.01706946, "epoch": 0.7948895235232226, "flos": 38179653840000.0, "grad_norm": 1.7769890584710024, "language_loss": 0.73338026, "learning_rate": 4.252128005599176e-07, "loss": 0.75430536, "num_input_tokens_seen": 285191050, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37109375, "step": 13221, "time_per_iteration": 2.498739719390869 }, { "auxiliary_loss_clip": 0.01050023, "auxiliary_loss_mlp": 0.01035555, "balance_loss_clip": 1.01358521, "balance_loss_mlp": 1.01564944, "epoch": 0.7949496467758905, "flos": 15558902956800.0, "grad_norm": 1.9211936492821031, "language_loss": 0.76051068, "learning_rate": 4.249727465395634e-07, "loss": 0.78136647, "num_input_tokens_seen": 285208750, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34375, "step": 13222, "time_per_iteration": 2.400449514389038 }, { "auxiliary_loss_clip": 0.01007787, "auxiliary_loss_mlp": 0.0100253, "balance_loss_clip": 1.00082564, "balance_loss_mlp": 1.00099754, "epoch": 0.7950097700285585, "flos": 70893898715520.0, "grad_norm": 0.769830955911655, "language_loss": 0.67202109, "learning_rate": 4.247327522443993e-07, "loss": 0.69212425, "num_input_tokens_seen": 285264605, "router_z_loss_clip": 0.01708984, "router_z_loss_mlp": 0.06787109, "step": 13223, "time_per_iteration": 2.8637583255767822 }, { "auxiliary_loss_clip": 0.01052431, "auxiliary_loss_mlp": 0.01039634, "balance_loss_clip": 1.01495767, "balance_loss_mlp": 1.0162077, "epoch": 0.7950698932812266, "flos": 23950487036160.0, "grad_norm": 1.8422853974359592, "language_loss": 0.72251922, "learning_rate": 4.2449281768352717e-07, "loss": 0.74343985, "num_input_tokens_seen": 285283940, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.36328125, "step": 13224, "time_per_iteration": 3.612887382507324 }, { "auxiliary_loss_clip": 0.01007425, "auxiliary_loss_mlp": 0.01002199, "balance_loss_clip": 1.00006533, "balance_loss_mlp": 1.00076056, "epoch": 0.7951300165338945, "flos": 60279638883840.0, "grad_norm": 0.6692413730035823, "language_loss": 0.55124021, "learning_rate": 4.2425294286604527e-07, "loss": 0.57133639, "num_input_tokens_seen": 285349525, "router_z_loss_clip": 0.0213623, "router_z_loss_mlp": 0.06640625, "step": 13225, "time_per_iteration": 3.093139410018921 }, { "auxiliary_loss_clip": 0.01049337, "auxiliary_loss_mlp": 0.01034713, "balance_loss_clip": 1.01215839, "balance_loss_mlp": 1.01472759, "epoch": 0.7951901397865625, "flos": 22817939575680.0, "grad_norm": 1.8993546841533853, "language_loss": 0.65856194, "learning_rate": 4.2401312780105034e-07, "loss": 0.67940247, "num_input_tokens_seen": 285367355, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34570312, "step": 13226, "time_per_iteration": 2.3524067401885986 }, { "auxiliary_loss_clip": 0.01053841, "auxiliary_loss_mlp": 0.01042971, "balance_loss_clip": 1.01860476, "balance_loss_mlp": 1.01629472, "epoch": 0.7952502630392304, "flos": 35694326862720.0, "grad_norm": 2.313079447413372, "language_loss": 0.71219361, "learning_rate": 4.237733724976349e-07, "loss": 0.73316169, "num_input_tokens_seen": 285386190, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.375, "step": 13227, "time_per_iteration": 2.5145812034606934 }, { "auxiliary_loss_clip": 0.01049158, "auxiliary_loss_mlp": 0.01039702, "balance_loss_clip": 1.01843512, "balance_loss_mlp": 1.01528084, "epoch": 0.7953103862918984, "flos": 25628657224320.0, "grad_norm": 2.935239577945484, "language_loss": 0.71335185, "learning_rate": 4.2353367696489184e-07, "loss": 0.73424047, "num_input_tokens_seen": 285406150, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.33984375, "step": 13228, "time_per_iteration": 2.4003708362579346 }, { "auxiliary_loss_clip": 0.01053136, "auxiliary_loss_mlp": 0.01043396, "balance_loss_clip": 1.0195539, "balance_loss_mlp": 1.01625752, "epoch": 0.7953705095445663, "flos": 40550396135040.0, "grad_norm": 1.4565672181769005, "language_loss": 0.71362174, "learning_rate": 4.232940412119095e-07, "loss": 0.73458701, "num_input_tokens_seen": 285429900, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36914062, "step": 13229, "time_per_iteration": 2.567638635635376 }, { "auxiliary_loss_clip": 0.01055448, "auxiliary_loss_mlp": 0.01040148, "balance_loss_clip": 1.01441038, "balance_loss_mlp": 1.0171423, "epoch": 0.7954306327972344, "flos": 27635429928960.0, "grad_norm": 2.907856099591437, "language_loss": 0.73052227, "learning_rate": 4.2305446524777457e-07, "loss": 0.75147825, "num_input_tokens_seen": 285452555, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3828125, "step": 13230, "time_per_iteration": 3.812774419784546 }, { "auxiliary_loss_clip": 0.01008005, "auxiliary_loss_mlp": 0.0100514, "balance_loss_clip": 1.00305426, "balance_loss_mlp": 1.00077677, "epoch": 0.7954907560499023, "flos": 59500481005440.0, "grad_norm": 0.8985191451752992, "language_loss": 0.63660574, "learning_rate": 4.2281494908157247e-07, "loss": 0.65673721, "num_input_tokens_seen": 285515700, "router_z_loss_clip": 0.02087402, "router_z_loss_mlp": 0.07226562, "step": 13231, "time_per_iteration": 4.457201242446899 }, { "auxiliary_loss_clip": 0.01053436, "auxiliary_loss_mlp": 0.01038656, "balance_loss_clip": 1.01595879, "balance_loss_mlp": 1.01733518, "epoch": 0.7955508793025703, "flos": 20119502459520.0, "grad_norm": 1.638163003632879, "language_loss": 0.70327628, "learning_rate": 4.2257549272238566e-07, "loss": 0.72419721, "num_input_tokens_seen": 285533910, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36132812, "step": 13232, "time_per_iteration": 2.4699573516845703 }, { "auxiliary_loss_clip": 0.01051284, "auxiliary_loss_mlp": 0.01031634, "balance_loss_clip": 1.00853133, "balance_loss_mlp": 1.01577961, "epoch": 0.7956110025552382, "flos": 26504174188800.0, "grad_norm": 1.5809829052088047, "language_loss": 0.78815711, "learning_rate": 4.223360961792952e-07, "loss": 0.80898631, "num_input_tokens_seen": 285554080, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.35546875, "step": 13233, "time_per_iteration": 2.3929152488708496 }, { "auxiliary_loss_clip": 0.01052868, "auxiliary_loss_mlp": 0.01041092, "balance_loss_clip": 1.01608229, "balance_loss_mlp": 1.01634395, "epoch": 0.7956711258079062, "flos": 22564365304320.0, "grad_norm": 2.7164320185865467, "language_loss": 0.79070431, "learning_rate": 4.220967594613769e-07, "loss": 0.81164396, "num_input_tokens_seen": 285572325, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36523438, "step": 13234, "time_per_iteration": 2.3869435787200928 }, { "auxiliary_loss_clip": 0.010511, "auxiliary_loss_mlp": 0.01033185, "balance_loss_clip": 1.01234722, "balance_loss_mlp": 1.01632571, "epoch": 0.7957312490605741, "flos": 17378192327040.0, "grad_norm": 1.5711083594471142, "language_loss": 0.70821357, "learning_rate": 4.218574825777077e-07, "loss": 0.72905642, "num_input_tokens_seen": 285589770, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.34765625, "step": 13235, "time_per_iteration": 2.324753522872925 }, { "auxiliary_loss_clip": 0.01052122, "auxiliary_loss_mlp": 0.01032418, "balance_loss_clip": 1.00793254, "balance_loss_mlp": 1.01581573, "epoch": 0.7957913723132422, "flos": 22490349488640.0, "grad_norm": 1.7230194039366533, "language_loss": 0.69516337, "learning_rate": 4.2161826553736145e-07, "loss": 0.71600878, "num_input_tokens_seen": 285610065, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.36328125, "step": 13236, "time_per_iteration": 2.382568836212158 }, { "auxiliary_loss_clip": 0.0105116, "auxiliary_loss_mlp": 0.01036975, "balance_loss_clip": 1.01337147, "balance_loss_mlp": 1.01626348, "epoch": 0.7958514955659101, "flos": 22636984665600.0, "grad_norm": 1.579744751238117, "language_loss": 0.7598331, "learning_rate": 4.2137910834940826e-07, "loss": 0.78071451, "num_input_tokens_seen": 285628480, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.34765625, "step": 13237, "time_per_iteration": 2.3580968379974365 }, { "auxiliary_loss_clip": 0.01052985, "auxiliary_loss_mlp": 0.01036368, "balance_loss_clip": 1.01157308, "balance_loss_mlp": 1.01681471, "epoch": 0.7959116188185781, "flos": 20703180435840.0, "grad_norm": 1.926957361610192, "language_loss": 0.72006035, "learning_rate": 4.211400110229175e-07, "loss": 0.74095392, "num_input_tokens_seen": 285647805, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36328125, "step": 13238, "time_per_iteration": 2.364130973815918 }, { "auxiliary_loss_clip": 0.01053811, "auxiliary_loss_mlp": 0.0103496, "balance_loss_clip": 1.01198828, "balance_loss_mlp": 1.01676846, "epoch": 0.7959717420712461, "flos": 19023718527360.0, "grad_norm": 1.6268585816864745, "language_loss": 0.75529695, "learning_rate": 4.2090097356695684e-07, "loss": 0.77618468, "num_input_tokens_seen": 285665505, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.36914062, "step": 13239, "time_per_iteration": 2.3379557132720947 }, { "auxiliary_loss_clip": 0.01053731, "auxiliary_loss_mlp": 0.01038086, "balance_loss_clip": 1.01411319, "balance_loss_mlp": 1.0164485, "epoch": 0.796031865323914, "flos": 26355514152960.0, "grad_norm": 1.8427786457791275, "language_loss": 0.71509844, "learning_rate": 4.2066199599058814e-07, "loss": 0.73601663, "num_input_tokens_seen": 285685855, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37304688, "step": 13240, "time_per_iteration": 2.4375221729278564 }, { "auxiliary_loss_clip": 0.01007564, "auxiliary_loss_mlp": 0.01002352, "balance_loss_clip": 1.00037301, "balance_loss_mlp": 1.00069809, "epoch": 0.796091988576582, "flos": 62066493734400.0, "grad_norm": 0.8910606473468836, "language_loss": 0.58797693, "learning_rate": 4.2042307830287526e-07, "loss": 0.6080761, "num_input_tokens_seen": 285735710, "router_z_loss_clip": 0.01977539, "router_z_loss_mlp": 0.06884766, "step": 13241, "time_per_iteration": 2.820869207382202 }, { "auxiliary_loss_clip": 0.01051979, "auxiliary_loss_mlp": 0.0103466, "balance_loss_clip": 1.01240361, "balance_loss_mlp": 1.01676428, "epoch": 0.7961521118292499, "flos": 39018546921600.0, "grad_norm": 2.084570476609054, "language_loss": 0.64882237, "learning_rate": 4.201842205128772e-07, "loss": 0.66968876, "num_input_tokens_seen": 285757045, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3515625, "step": 13242, "time_per_iteration": 2.5433218479156494 }, { "auxiliary_loss_clip": 0.0105121, "auxiliary_loss_mlp": 0.01037408, "balance_loss_clip": 1.0130055, "balance_loss_mlp": 1.01565385, "epoch": 0.796212235081918, "flos": 21761746992000.0, "grad_norm": 1.9075277797537118, "language_loss": 0.78138423, "learning_rate": 4.199454226296526e-07, "loss": 0.80227041, "num_input_tokens_seen": 285776050, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.35546875, "step": 13243, "time_per_iteration": 2.356863498687744 }, { "auxiliary_loss_clip": 0.0105382, "auxiliary_loss_mlp": 0.01037656, "balance_loss_clip": 1.01362324, "balance_loss_mlp": 1.01684189, "epoch": 0.7962723583345859, "flos": 21177789724800.0, "grad_norm": 1.7073804368590668, "language_loss": 0.80208415, "learning_rate": 4.1970668466225565e-07, "loss": 0.82299888, "num_input_tokens_seen": 285796830, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36914062, "step": 13244, "time_per_iteration": 2.415593147277832 }, { "auxiliary_loss_clip": 0.01054201, "auxiliary_loss_mlp": 0.01041082, "balance_loss_clip": 1.01627517, "balance_loss_mlp": 1.01645625, "epoch": 0.7963324815872539, "flos": 17127690255360.0, "grad_norm": 2.2756100199076017, "language_loss": 0.69407392, "learning_rate": 4.1946800661973934e-07, "loss": 0.71502668, "num_input_tokens_seen": 285814755, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37695312, "step": 13245, "time_per_iteration": 3.916099786758423 }, { "auxiliary_loss_clip": 0.01052525, "auxiliary_loss_mlp": 0.01036635, "balance_loss_clip": 1.01269782, "balance_loss_mlp": 1.01676321, "epoch": 0.7963926048399218, "flos": 21396415858560.0, "grad_norm": 1.4482659719424709, "language_loss": 0.79917824, "learning_rate": 4.192293885111549e-07, "loss": 0.82006979, "num_input_tokens_seen": 285834255, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.35742188, "step": 13246, "time_per_iteration": 2.381047487258911 }, { "auxiliary_loss_clip": 0.01053174, "auxiliary_loss_mlp": 0.01039505, "balance_loss_clip": 1.01484084, "balance_loss_mlp": 1.01553106, "epoch": 0.7964527280925898, "flos": 25183235698560.0, "grad_norm": 1.8442923674323632, "language_loss": 0.67287505, "learning_rate": 4.1899083034555007e-07, "loss": 0.69380182, "num_input_tokens_seen": 285853540, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.375, "step": 13247, "time_per_iteration": 2.385470390319824 }, { "auxiliary_loss_clip": 0.01050456, "auxiliary_loss_mlp": 0.01035524, "balance_loss_clip": 1.01460266, "balance_loss_mlp": 1.01580858, "epoch": 0.7965128513452577, "flos": 27014674222080.0, "grad_norm": 1.993119670895604, "language_loss": 0.7277658, "learning_rate": 4.1875233213197123e-07, "loss": 0.74862564, "num_input_tokens_seen": 285872705, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34570312, "step": 13248, "time_per_iteration": 2.424541473388672 }, { "auxiliary_loss_clip": 0.01052451, "auxiliary_loss_mlp": 0.01038201, "balance_loss_clip": 1.01423955, "balance_loss_mlp": 1.01648939, "epoch": 0.7965729745979258, "flos": 24418602812160.0, "grad_norm": 2.3794005788624912, "language_loss": 0.76910281, "learning_rate": 4.1851389387946255e-07, "loss": 0.79000938, "num_input_tokens_seen": 285890290, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.359375, "step": 13249, "time_per_iteration": 2.3677263259887695 }, { "auxiliary_loss_clip": 0.01051595, "auxiliary_loss_mlp": 0.01033984, "balance_loss_clip": 1.01305091, "balance_loss_mlp": 1.01646578, "epoch": 0.7966330978505937, "flos": 18839481949440.0, "grad_norm": 1.829123814736729, "language_loss": 0.62970346, "learning_rate": 4.1827551559706674e-07, "loss": 0.65055931, "num_input_tokens_seen": 285909190, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.3515625, "step": 13250, "time_per_iteration": 2.3596856594085693 }, { "auxiliary_loss_clip": 0.01051862, "auxiliary_loss_mlp": 0.01036258, "balance_loss_clip": 1.01234436, "balance_loss_mlp": 1.01613712, "epoch": 0.7966932211032617, "flos": 13151466956160.0, "grad_norm": 2.2159276915535013, "language_loss": 0.74390173, "learning_rate": 4.180371972938206e-07, "loss": 0.76478302, "num_input_tokens_seen": 285927570, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35742188, "step": 13251, "time_per_iteration": 2.3301806449890137 }, { "auxiliary_loss_clip": 0.01055583, "auxiliary_loss_mlp": 0.010404, "balance_loss_clip": 1.01367378, "balance_loss_mlp": 1.01763511, "epoch": 0.7967533443559297, "flos": 23948671645440.0, "grad_norm": 1.9036812125298026, "language_loss": 0.74564517, "learning_rate": 4.177989389787624e-07, "loss": 0.76660502, "num_input_tokens_seen": 285945810, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.38085938, "step": 13252, "time_per_iteration": 2.397923231124878 }, { "auxiliary_loss_clip": 0.01050022, "auxiliary_loss_mlp": 0.01033355, "balance_loss_clip": 1.01140893, "balance_loss_mlp": 1.01560175, "epoch": 0.7968134676085976, "flos": 30367593285120.0, "grad_norm": 1.9475907562746544, "language_loss": 0.67526472, "learning_rate": 4.175607406609278e-07, "loss": 0.69609851, "num_input_tokens_seen": 285964235, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34375, "step": 13253, "time_per_iteration": 2.4117307662963867 }, { "auxiliary_loss_clip": 0.01053451, "auxiliary_loss_mlp": 0.01043411, "balance_loss_clip": 1.01906896, "balance_loss_mlp": 1.01594114, "epoch": 0.7968735908612656, "flos": 23073957642240.0, "grad_norm": 1.5177408422808698, "language_loss": 0.68037808, "learning_rate": 4.1732260234934767e-07, "loss": 0.70134676, "num_input_tokens_seen": 285983710, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.375, "step": 13254, "time_per_iteration": 2.385810613632202 }, { "auxiliary_loss_clip": 0.01051503, "auxiliary_loss_mlp": 0.01036628, "balance_loss_clip": 1.0151937, "balance_loss_mlp": 1.01648712, "epoch": 0.7969337141139335, "flos": 23581245830400.0, "grad_norm": 1.937319370982777, "language_loss": 0.70927429, "learning_rate": 4.1708452405305314e-07, "loss": 0.73015559, "num_input_tokens_seen": 286003425, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.3515625, "step": 13255, "time_per_iteration": 2.3797271251678467 }, { "auxiliary_loss_clip": 0.01050171, "auxiliary_loss_mlp": 0.01036901, "balance_loss_clip": 1.01417959, "balance_loss_mlp": 1.01515269, "epoch": 0.7969938373666016, "flos": 19754834641920.0, "grad_norm": 2.06828992935669, "language_loss": 0.80050921, "learning_rate": 4.168465057810733e-07, "loss": 0.82137996, "num_input_tokens_seen": 286020130, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34960938, "step": 13256, "time_per_iteration": 2.397908926010132 }, { "auxiliary_loss_clip": 0.01052595, "auxiliary_loss_mlp": 0.01033566, "balance_loss_clip": 1.01147652, "balance_loss_mlp": 1.01687825, "epoch": 0.7970539606192695, "flos": 24132943134720.0, "grad_norm": 2.319243762144068, "language_loss": 0.66957754, "learning_rate": 4.166085475424315e-07, "loss": 0.69043916, "num_input_tokens_seen": 286040230, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35742188, "step": 13257, "time_per_iteration": 2.3795344829559326 }, { "auxiliary_loss_clip": 0.01055608, "auxiliary_loss_mlp": 0.01039527, "balance_loss_clip": 1.01473117, "balance_loss_mlp": 1.01801133, "epoch": 0.7971140838719375, "flos": 17967630677760.0, "grad_norm": 2.242274490301799, "language_loss": 0.73383546, "learning_rate": 4.163706493461523e-07, "loss": 0.75478685, "num_input_tokens_seen": 286059475, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.375, "step": 13258, "time_per_iteration": 2.370718479156494 }, { "auxiliary_loss_clip": 0.01053589, "auxiliary_loss_mlp": 0.01039609, "balance_loss_clip": 1.01438439, "balance_loss_mlp": 1.01610327, "epoch": 0.7971742071246054, "flos": 19168608136320.0, "grad_norm": 1.7743104184605019, "language_loss": 0.69913208, "learning_rate": 4.1613281120125655e-07, "loss": 0.7200641, "num_input_tokens_seen": 286077820, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.375, "step": 13259, "time_per_iteration": 2.3417551517486572 }, { "auxiliary_loss_clip": 0.0105048, "auxiliary_loss_mlp": 0.0103483, "balance_loss_clip": 1.01356316, "balance_loss_mlp": 1.01665258, "epoch": 0.7972343303772734, "flos": 27124720427520.0, "grad_norm": 1.7400792396879277, "language_loss": 0.74243873, "learning_rate": 4.158950331167641e-07, "loss": 0.76329184, "num_input_tokens_seen": 286097285, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.33789062, "step": 13260, "time_per_iteration": 2.442868709564209 }, { "auxiliary_loss_clip": 0.01049739, "auxiliary_loss_mlp": 0.01033536, "balance_loss_clip": 1.01174474, "balance_loss_mlp": 1.0157187, "epoch": 0.7972944536299413, "flos": 20995578005760.0, "grad_norm": 1.7357628757408043, "language_loss": 0.78967923, "learning_rate": 4.1565731510169065e-07, "loss": 0.81051195, "num_input_tokens_seen": 286116000, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.33984375, "step": 13261, "time_per_iteration": 2.3762500286102295 }, { "auxiliary_loss_clip": 0.01046924, "auxiliary_loss_mlp": 0.01032015, "balance_loss_clip": 1.01294112, "balance_loss_mlp": 1.01516879, "epoch": 0.7973545768826094, "flos": 21578941779840.0, "grad_norm": 1.4381669705601923, "language_loss": 0.76426309, "learning_rate": 4.154196571650501e-07, "loss": 0.78505248, "num_input_tokens_seen": 286135110, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.31640625, "step": 13262, "time_per_iteration": 2.3652896881103516 }, { "auxiliary_loss_clip": 0.01056695, "auxiliary_loss_mlp": 0.01040039, "balance_loss_clip": 1.01415884, "balance_loss_mlp": 1.0175333, "epoch": 0.7974147001352773, "flos": 20557487865600.0, "grad_norm": 7.693261206777027, "language_loss": 0.72272789, "learning_rate": 4.1518205931585524e-07, "loss": 0.7436952, "num_input_tokens_seen": 286152835, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.390625, "step": 13263, "time_per_iteration": 2.379946708679199 }, { "auxiliary_loss_clip": 0.0105609, "auxiliary_loss_mlp": 0.01041191, "balance_loss_clip": 1.01291478, "balance_loss_mlp": 1.01672125, "epoch": 0.7974748233879453, "flos": 20995717651200.0, "grad_norm": 2.0376970207944103, "language_loss": 0.72482866, "learning_rate": 4.149445215631153e-07, "loss": 0.74580151, "num_input_tokens_seen": 286171785, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 0.39453125, "step": 13264, "time_per_iteration": 3.600358247756958 }, { "auxiliary_loss_clip": 0.01051916, "auxiliary_loss_mlp": 0.01038878, "balance_loss_clip": 1.01658618, "balance_loss_mlp": 1.01738918, "epoch": 0.7975349466406133, "flos": 22564086013440.0, "grad_norm": 4.822821879876, "language_loss": 0.78583938, "learning_rate": 4.1470704391583776e-07, "loss": 0.80674732, "num_input_tokens_seen": 286190420, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34570312, "step": 13265, "time_per_iteration": 2.360214948654175 }, { "auxiliary_loss_clip": 0.01052124, "auxiliary_loss_mlp": 0.0103601, "balance_loss_clip": 1.01188171, "balance_loss_mlp": 1.01561534, "epoch": 0.7975950698932812, "flos": 21688464314880.0, "grad_norm": 1.9750045779861756, "language_loss": 0.7646758, "learning_rate": 4.144696263830285e-07, "loss": 0.78555715, "num_input_tokens_seen": 286210105, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36523438, "step": 13266, "time_per_iteration": 2.3817176818847656 }, { "auxiliary_loss_clip": 0.01049679, "auxiliary_loss_mlp": 0.01033965, "balance_loss_clip": 1.01316357, "balance_loss_mlp": 1.01501071, "epoch": 0.7976551931459492, "flos": 19603695899520.0, "grad_norm": 2.0083429520661586, "language_loss": 0.85068834, "learning_rate": 4.1423226897369015e-07, "loss": 0.87152481, "num_input_tokens_seen": 286228180, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.34765625, "step": 13267, "time_per_iteration": 2.3383820056915283 }, { "auxiliary_loss_clip": 0.01050191, "auxiliary_loss_mlp": 0.01041513, "balance_loss_clip": 1.01818371, "balance_loss_mlp": 1.01515198, "epoch": 0.7977153163986171, "flos": 21686579101440.0, "grad_norm": 1.5750003407604996, "language_loss": 0.77433693, "learning_rate": 4.139949716968223e-07, "loss": 0.79525405, "num_input_tokens_seen": 286247305, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3515625, "step": 13268, "time_per_iteration": 2.38385009765625 }, { "auxiliary_loss_clip": 0.01052311, "auxiliary_loss_mlp": 0.01040321, "balance_loss_clip": 1.01657414, "balance_loss_mlp": 1.01680326, "epoch": 0.7977754396512852, "flos": 23475668279040.0, "grad_norm": 1.5670755254388924, "language_loss": 0.78494412, "learning_rate": 4.1375773456142403e-07, "loss": 0.80587041, "num_input_tokens_seen": 286268145, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35546875, "step": 13269, "time_per_iteration": 2.4109268188476562 }, { "auxiliary_loss_clip": 0.01048661, "auxiliary_loss_mlp": 0.01033055, "balance_loss_clip": 1.01166916, "balance_loss_mlp": 1.01456594, "epoch": 0.7978355629039531, "flos": 22381141155840.0, "grad_norm": 1.7371468787981643, "language_loss": 0.82729828, "learning_rate": 4.135205575764922e-07, "loss": 0.84811544, "num_input_tokens_seen": 286286775, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.33984375, "step": 13270, "time_per_iteration": 3.7910163402557373 }, { "auxiliary_loss_clip": 0.01052374, "auxiliary_loss_mlp": 0.01042855, "balance_loss_clip": 1.0197283, "balance_loss_mlp": 1.01690185, "epoch": 0.7978956861566211, "flos": 20265299763840.0, "grad_norm": 1.859571550011915, "language_loss": 0.61421633, "learning_rate": 4.1328344075101905e-07, "loss": 0.63516861, "num_input_tokens_seen": 286305590, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.35546875, "step": 13271, "time_per_iteration": 3.622788906097412 }, { "auxiliary_loss_clip": 0.01054921, "auxiliary_loss_mlp": 0.01041155, "balance_loss_clip": 1.017838, "balance_loss_mlp": 1.01727295, "epoch": 0.797955809409289, "flos": 28111121470080.0, "grad_norm": 1.404414393111034, "language_loss": 0.74601805, "learning_rate": 4.130463840939975e-07, "loss": 0.7669788, "num_input_tokens_seen": 286328050, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37695312, "step": 13272, "time_per_iteration": 2.4462101459503174 }, { "auxiliary_loss_clip": 0.01051877, "auxiliary_loss_mlp": 0.01040411, "balance_loss_clip": 1.0176419, "balance_loss_mlp": 1.01660323, "epoch": 0.798015932661957, "flos": 15558693488640.0, "grad_norm": 1.7901144621597667, "language_loss": 0.72731823, "learning_rate": 4.128093876144161e-07, "loss": 0.74824107, "num_input_tokens_seen": 286345265, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35351562, "step": 13273, "time_per_iteration": 2.3188564777374268 }, { "auxiliary_loss_clip": 0.0105446, "auxiliary_loss_mlp": 0.01038089, "balance_loss_clip": 1.01331675, "balance_loss_mlp": 1.01675427, "epoch": 0.7980760559146249, "flos": 23950068099840.0, "grad_norm": 1.7629296304887276, "language_loss": 0.76840639, "learning_rate": 4.1257245132126117e-07, "loss": 0.78933197, "num_input_tokens_seen": 286364465, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37695312, "step": 13274, "time_per_iteration": 2.3892621994018555 }, { "auxiliary_loss_clip": 0.01047634, "auxiliary_loss_mlp": 0.0103297, "balance_loss_clip": 1.01412272, "balance_loss_mlp": 1.01506269, "epoch": 0.798136179167293, "flos": 28036826363520.0, "grad_norm": 1.4557828108372064, "language_loss": 0.78684211, "learning_rate": 4.12335575223518e-07, "loss": 0.80764806, "num_input_tokens_seen": 286385565, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.32617188, "step": 13275, "time_per_iteration": 2.403825521469116 }, { "auxiliary_loss_clip": 0.01053662, "auxiliary_loss_mlp": 0.01043817, "balance_loss_clip": 1.0186758, "balance_loss_mlp": 1.01696301, "epoch": 0.7981963024199609, "flos": 35983268208000.0, "grad_norm": 1.9114617806166778, "language_loss": 0.65069616, "learning_rate": 4.1209875933016877e-07, "loss": 0.67167091, "num_input_tokens_seen": 286403950, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3671875, "step": 13276, "time_per_iteration": 2.4594767093658447 }, { "auxiliary_loss_clip": 0.01049911, "auxiliary_loss_mlp": 0.01036249, "balance_loss_clip": 1.0144577, "balance_loss_mlp": 1.01547539, "epoch": 0.7982564256726289, "flos": 25883732684160.0, "grad_norm": 1.6715999353810436, "language_loss": 0.61900806, "learning_rate": 4.118620036501945e-07, "loss": 0.63986969, "num_input_tokens_seen": 286426160, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.34375, "step": 13277, "time_per_iteration": 2.388949394226074 }, { "auxiliary_loss_clip": 0.01054508, "auxiliary_loss_mlp": 0.01040808, "balance_loss_clip": 1.01812208, "balance_loss_mlp": 1.01830757, "epoch": 0.7983165489252969, "flos": 25737970291200.0, "grad_norm": 2.2707400897593617, "language_loss": 0.80056548, "learning_rate": 4.1162530819257227e-07, "loss": 0.8215186, "num_input_tokens_seen": 286446610, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36328125, "step": 13278, "time_per_iteration": 2.3857619762420654 }, { "auxiliary_loss_clip": 0.01051724, "auxiliary_loss_mlp": 0.01040648, "balance_loss_clip": 1.01694918, "balance_loss_mlp": 1.01522076, "epoch": 0.7983766721779648, "flos": 21907125360000.0, "grad_norm": 1.9860509955635113, "language_loss": 0.65209228, "learning_rate": 4.113886729662768e-07, "loss": 0.67301595, "num_input_tokens_seen": 286465460, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.36523438, "step": 13279, "time_per_iteration": 2.3512537479400635 }, { "auxiliary_loss_clip": 0.01048036, "auxiliary_loss_mlp": 0.01030242, "balance_loss_clip": 1.01124048, "balance_loss_mlp": 1.01568198, "epoch": 0.7984367954306328, "flos": 29346244104960.0, "grad_norm": 5.057200327146317, "language_loss": 0.71716905, "learning_rate": 4.111520979802825e-07, "loss": 0.73795187, "num_input_tokens_seen": 286485720, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.32421875, "step": 13280, "time_per_iteration": 2.4374735355377197 }, { "auxiliary_loss_clip": 0.01054556, "auxiliary_loss_mlp": 0.01044128, "balance_loss_clip": 1.01896262, "balance_loss_mlp": 1.01780105, "epoch": 0.7984969186833007, "flos": 31356438122880.0, "grad_norm": 1.6677192533140794, "language_loss": 0.64004934, "learning_rate": 4.1091558324355955e-07, "loss": 0.66103613, "num_input_tokens_seen": 286507465, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3671875, "step": 13281, "time_per_iteration": 2.450786590576172 }, { "auxiliary_loss_clip": 0.01055588, "auxiliary_loss_mlp": 0.01037899, "balance_loss_clip": 1.01405668, "balance_loss_mlp": 1.01741457, "epoch": 0.7985570419359688, "flos": 24311873185920.0, "grad_norm": 2.9835402957080004, "language_loss": 0.80979908, "learning_rate": 4.1067912876507683e-07, "loss": 0.83073395, "num_input_tokens_seen": 286526345, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3828125, "step": 13282, "time_per_iteration": 2.4314935207366943 }, { "auxiliary_loss_clip": 0.01052454, "auxiliary_loss_mlp": 0.01041744, "balance_loss_clip": 1.015661, "balance_loss_mlp": 1.0157373, "epoch": 0.7986171651886367, "flos": 15741324144000.0, "grad_norm": 1.8793573951121325, "language_loss": 0.72700977, "learning_rate": 4.10442734553802e-07, "loss": 0.74795175, "num_input_tokens_seen": 286544095, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3671875, "step": 13283, "time_per_iteration": 2.33414626121521 }, { "auxiliary_loss_clip": 0.01049564, "auxiliary_loss_mlp": 0.01034913, "balance_loss_clip": 1.01239443, "balance_loss_mlp": 1.01467037, "epoch": 0.7986772884413047, "flos": 11618605313280.0, "grad_norm": 2.0159860053140597, "language_loss": 0.7481488, "learning_rate": 4.102064006186967e-07, "loss": 0.7689935, "num_input_tokens_seen": 286560960, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34960938, "step": 13284, "time_per_iteration": 3.8019633293151855 }, { "auxiliary_loss_clip": 0.0105144, "auxiliary_loss_mlp": 0.01036019, "balance_loss_clip": 1.01499093, "balance_loss_mlp": 1.01638794, "epoch": 0.7987374116939726, "flos": 22089965483520.0, "grad_norm": 1.51894464273185, "language_loss": 0.7127853, "learning_rate": 4.0997012696872415e-07, "loss": 0.73365986, "num_input_tokens_seen": 286579865, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.34960938, "step": 13285, "time_per_iteration": 2.3694911003112793 }, { "auxiliary_loss_clip": 0.01051972, "auxiliary_loss_mlp": 0.01036836, "balance_loss_clip": 1.01552165, "balance_loss_mlp": 1.01673126, "epoch": 0.7987975349466406, "flos": 17889844435200.0, "grad_norm": 1.740550284959867, "language_loss": 0.74463177, "learning_rate": 4.097339136128437e-07, "loss": 0.76551986, "num_input_tokens_seen": 286597295, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.3515625, "step": 13286, "time_per_iteration": 2.3713886737823486 }, { "auxiliary_loss_clip": 0.01052331, "auxiliary_loss_mlp": 0.01039093, "balance_loss_clip": 1.01540589, "balance_loss_mlp": 1.01634514, "epoch": 0.7988576581993085, "flos": 19718210759040.0, "grad_norm": 1.679771072439159, "language_loss": 0.76276016, "learning_rate": 4.0949776056001296e-07, "loss": 0.78367442, "num_input_tokens_seen": 286616270, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.359375, "step": 13287, "time_per_iteration": 2.3512861728668213 }, { "auxiliary_loss_clip": 0.01050252, "auxiliary_loss_mlp": 0.01039364, "balance_loss_clip": 1.01629722, "balance_loss_mlp": 1.01576567, "epoch": 0.7989177814519766, "flos": 28035150618240.0, "grad_norm": 1.694206096404595, "language_loss": 0.63562685, "learning_rate": 4.092616678191863e-07, "loss": 0.65652305, "num_input_tokens_seen": 286638315, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.34570312, "step": 13288, "time_per_iteration": 2.431684970855713 }, { "auxiliary_loss_clip": 0.01051295, "auxiliary_loss_mlp": 0.0103384, "balance_loss_clip": 1.01293075, "balance_loss_mlp": 1.01669765, "epoch": 0.7989779047046445, "flos": 28869924159360.0, "grad_norm": 1.9305931523380229, "language_loss": 0.71624994, "learning_rate": 4.090256353993169e-07, "loss": 0.73710132, "num_input_tokens_seen": 286658630, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34570312, "step": 13289, "time_per_iteration": 2.4135990142822266 }, { "auxiliary_loss_clip": 0.01052324, "auxiliary_loss_mlp": 0.01036874, "balance_loss_clip": 1.01391423, "balance_loss_mlp": 1.0170536, "epoch": 0.7990380279573125, "flos": 18185907697920.0, "grad_norm": 2.161247359392258, "language_loss": 0.64189857, "learning_rate": 4.0878966330935506e-07, "loss": 0.66279054, "num_input_tokens_seen": 286676870, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.3515625, "step": 13290, "time_per_iteration": 2.369813919067383 }, { "auxiliary_loss_clip": 0.01053756, "auxiliary_loss_mlp": 0.01039058, "balance_loss_clip": 1.01479936, "balance_loss_mlp": 1.01738274, "epoch": 0.7990981512099805, "flos": 20879073198720.0, "grad_norm": 1.811899500995961, "language_loss": 0.72307897, "learning_rate": 4.08553751558248e-07, "loss": 0.74400711, "num_input_tokens_seen": 286694300, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36328125, "step": 13291, "time_per_iteration": 2.398078203201294 }, { "auxiliary_loss_clip": 0.01049861, "auxiliary_loss_mlp": 0.0103558, "balance_loss_clip": 1.01332355, "balance_loss_mlp": 1.01503563, "epoch": 0.7991582744626484, "flos": 26098832770560.0, "grad_norm": 1.517377807024678, "language_loss": 0.64520466, "learning_rate": 4.083179001549422e-07, "loss": 0.66605902, "num_input_tokens_seen": 286714545, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34765625, "step": 13292, "time_per_iteration": 2.4021806716918945 }, { "auxiliary_loss_clip": 0.01051096, "auxiliary_loss_mlp": 0.01039033, "balance_loss_clip": 1.01613295, "balance_loss_mlp": 1.01540554, "epoch": 0.7992183977153164, "flos": 35294466528000.0, "grad_norm": 1.6831265157801256, "language_loss": 0.57058632, "learning_rate": 4.0808210910838105e-07, "loss": 0.59148765, "num_input_tokens_seen": 286734525, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35742188, "step": 13293, "time_per_iteration": 2.467203140258789 }, { "auxiliary_loss_clip": 0.01052685, "auxiliary_loss_mlp": 0.01038319, "balance_loss_clip": 1.01566958, "balance_loss_mlp": 1.01672387, "epoch": 0.7992785209679844, "flos": 51851781901440.0, "grad_norm": 2.4893545735043725, "language_loss": 0.73294914, "learning_rate": 4.0784637842750704e-07, "loss": 0.75385916, "num_input_tokens_seen": 286753430, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.359375, "step": 13294, "time_per_iteration": 2.6070194244384766 }, { "auxiliary_loss_clip": 0.01052739, "auxiliary_loss_mlp": 0.01037512, "balance_loss_clip": 1.01394391, "balance_loss_mlp": 1.01699102, "epoch": 0.7993386442206524, "flos": 22564016190720.0, "grad_norm": 1.9323523451731863, "language_loss": 0.74189967, "learning_rate": 4.0761070812125675e-07, "loss": 0.76280218, "num_input_tokens_seen": 286771915, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35742188, "step": 13295, "time_per_iteration": 2.349834442138672 }, { "auxiliary_loss_clip": 0.01050577, "auxiliary_loss_mlp": 0.01035344, "balance_loss_clip": 1.01365972, "balance_loss_mlp": 1.01651311, "epoch": 0.7993987674733203, "flos": 18799471664640.0, "grad_norm": 1.853084231220753, "language_loss": 0.77595979, "learning_rate": 4.0737509819856797e-07, "loss": 0.79681897, "num_input_tokens_seen": 286789835, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.33984375, "step": 13296, "time_per_iteration": 2.348269462585449 }, { "auxiliary_loss_clip": 0.01008434, "auxiliary_loss_mlp": 0.01003446, "balance_loss_clip": 1.00113297, "balance_loss_mlp": 1.00147092, "epoch": 0.7994588907259883, "flos": 69418049189760.0, "grad_norm": 0.6969778992458188, "language_loss": 0.60854387, "learning_rate": 4.0713954866837573e-07, "loss": 0.62866271, "num_input_tokens_seen": 286855580, "router_z_loss_clip": 0.02307129, "router_z_loss_mlp": 0.06933594, "step": 13297, "time_per_iteration": 3.06969952583313 }, { "auxiliary_loss_clip": 0.01050747, "auxiliary_loss_mlp": 0.01033661, "balance_loss_clip": 1.01033151, "balance_loss_mlp": 1.01556528, "epoch": 0.7995190139786562, "flos": 13479475979520.0, "grad_norm": 1.796971138702444, "language_loss": 0.71694171, "learning_rate": 4.0690405953961073e-07, "loss": 0.73778582, "num_input_tokens_seen": 286874360, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.3515625, "step": 13298, "time_per_iteration": 2.3446595668792725 }, { "auxiliary_loss_clip": 0.01053751, "auxiliary_loss_mlp": 0.010393, "balance_loss_clip": 1.01407528, "balance_loss_mlp": 1.01595545, "epoch": 0.7995791372313242, "flos": 21651770609280.0, "grad_norm": 2.173547047635548, "language_loss": 0.77314067, "learning_rate": 4.066686308212037e-07, "loss": 0.7940712, "num_input_tokens_seen": 286891950, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37695312, "step": 13299, "time_per_iteration": 2.3445301055908203 }, { "auxiliary_loss_clip": 0.01049429, "auxiliary_loss_mlp": 0.01035576, "balance_loss_clip": 1.01336741, "balance_loss_mlp": 1.0148387, "epoch": 0.7996392604839921, "flos": 26066921921280.0, "grad_norm": 1.8671734474411203, "language_loss": 0.78713655, "learning_rate": 4.064332625220828e-07, "loss": 0.80798662, "num_input_tokens_seen": 286911725, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34570312, "step": 13300, "time_per_iteration": 2.395376205444336 }, { "auxiliary_loss_clip": 0.01053124, "auxiliary_loss_mlp": 0.01034817, "balance_loss_clip": 1.01065338, "balance_loss_mlp": 1.0164963, "epoch": 0.7996993837366602, "flos": 24605771944320.0, "grad_norm": 1.8614662200580108, "language_loss": 0.64657557, "learning_rate": 4.0619795465117115e-07, "loss": 0.66745496, "num_input_tokens_seen": 286931400, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3671875, "step": 13301, "time_per_iteration": 2.377619981765747 }, { "auxiliary_loss_clip": 0.01051142, "auxiliary_loss_mlp": 0.01037795, "balance_loss_clip": 1.01499057, "balance_loss_mlp": 1.01628506, "epoch": 0.7997595069893281, "flos": 20991109351680.0, "grad_norm": 1.6204099228793187, "language_loss": 0.72755659, "learning_rate": 4.059627072173928e-07, "loss": 0.74844599, "num_input_tokens_seen": 286949795, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.34765625, "step": 13302, "time_per_iteration": 2.426759719848633 }, { "auxiliary_loss_clip": 0.01053556, "auxiliary_loss_mlp": 0.01039178, "balance_loss_clip": 1.01414394, "balance_loss_mlp": 1.01588821, "epoch": 0.7998196302419961, "flos": 24425340704640.0, "grad_norm": 1.9949205832254464, "language_loss": 0.8462621, "learning_rate": 4.057275202296684e-07, "loss": 0.86718941, "num_input_tokens_seen": 286968805, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.375, "step": 13303, "time_per_iteration": 3.7507333755493164 }, { "auxiliary_loss_clip": 0.01049892, "auxiliary_loss_mlp": 0.01033613, "balance_loss_clip": 1.01327634, "balance_loss_mlp": 1.01560068, "epoch": 0.7998797534946641, "flos": 30263307454080.0, "grad_norm": 1.8718222049139581, "language_loss": 0.60119116, "learning_rate": 4.054923936969166e-07, "loss": 0.62202621, "num_input_tokens_seen": 286990235, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.34375, "step": 13304, "time_per_iteration": 2.4264776706695557 }, { "auxiliary_loss_clip": 0.0105383, "auxiliary_loss_mlp": 0.0103409, "balance_loss_clip": 1.01016462, "balance_loss_mlp": 1.01644993, "epoch": 0.799939876747332, "flos": 23512850743680.0, "grad_norm": 1.5642649892440927, "language_loss": 0.7007395, "learning_rate": 4.0525732762805265e-07, "loss": 0.72161871, "num_input_tokens_seen": 287011060, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37304688, "step": 13305, "time_per_iteration": 2.4059605598449707 }, { "auxiliary_loss_clip": 0.01050226, "auxiliary_loss_mlp": 0.01032105, "balance_loss_clip": 1.01124299, "balance_loss_mlp": 1.01606202, "epoch": 0.8, "flos": 19317093615360.0, "grad_norm": 1.6194933232609776, "language_loss": 0.70251852, "learning_rate": 4.0502232203199107e-07, "loss": 0.72334182, "num_input_tokens_seen": 287029215, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34179688, "step": 13306, "time_per_iteration": 2.404358386993408 }, { "auxiliary_loss_clip": 0.01052726, "auxiliary_loss_mlp": 0.01035294, "balance_loss_clip": 1.0119288, "balance_loss_mlp": 1.01646233, "epoch": 0.800060123252668, "flos": 32411164429440.0, "grad_norm": 1.7356377387484745, "language_loss": 0.70373166, "learning_rate": 4.0478737691764286e-07, "loss": 0.72461188, "num_input_tokens_seen": 287050855, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 13307, "time_per_iteration": 2.4474194049835205 }, { "auxiliary_loss_clip": 0.01052686, "auxiliary_loss_mlp": 0.01041141, "balance_loss_clip": 1.01925457, "balance_loss_mlp": 1.01661849, "epoch": 0.800120246505336, "flos": 20009595899520.0, "grad_norm": 1.823540997590929, "language_loss": 0.78222638, "learning_rate": 4.0455249229391677e-07, "loss": 0.80316466, "num_input_tokens_seen": 287069915, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.36132812, "step": 13308, "time_per_iteration": 2.371959924697876 }, { "auxiliary_loss_clip": 0.01054298, "auxiliary_loss_mlp": 0.01040126, "balance_loss_clip": 1.01447177, "balance_loss_mlp": 1.01617599, "epoch": 0.8001803697580039, "flos": 31866938156160.0, "grad_norm": 1.4712778817623702, "language_loss": 0.79427356, "learning_rate": 4.0431766816972e-07, "loss": 0.81521773, "num_input_tokens_seen": 287091450, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3828125, "step": 13309, "time_per_iteration": 3.8319997787475586 }, { "auxiliary_loss_clip": 0.01008013, "auxiliary_loss_mlp": 0.01007251, "balance_loss_clip": 1.00468755, "balance_loss_mlp": 1.00122499, "epoch": 0.8002404930106719, "flos": 63388828679040.0, "grad_norm": 0.9329854659128911, "language_loss": 0.64786059, "learning_rate": 4.040829045539571e-07, "loss": 0.66801322, "num_input_tokens_seen": 287148365, "router_z_loss_clip": 0.02563477, "router_z_loss_mlp": 0.06835938, "step": 13310, "time_per_iteration": 4.4200804233551025 }, { "auxiliary_loss_clip": 0.0105208, "auxiliary_loss_mlp": 0.01040693, "balance_loss_clip": 1.01773334, "balance_loss_mlp": 1.01611304, "epoch": 0.8003006162633398, "flos": 27854230619520.0, "grad_norm": 2.0076774504859523, "language_loss": 0.84582543, "learning_rate": 4.0384820145553156e-07, "loss": 0.86675316, "num_input_tokens_seen": 287168280, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.359375, "step": 13311, "time_per_iteration": 2.3890202045440674 }, { "auxiliary_loss_clip": 0.0105134, "auxiliary_loss_mlp": 0.01039127, "balance_loss_clip": 1.01468933, "balance_loss_mlp": 1.01616049, "epoch": 0.8003607395160078, "flos": 18222357024000.0, "grad_norm": 2.5040984923492777, "language_loss": 0.67877114, "learning_rate": 4.0361355888334116e-07, "loss": 0.6996758, "num_input_tokens_seen": 287185980, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.3515625, "step": 13312, "time_per_iteration": 2.3646042346954346 }, { "auxiliary_loss_clip": 0.01056058, "auxiliary_loss_mlp": 0.01038516, "balance_loss_clip": 1.01246858, "balance_loss_mlp": 1.0185473, "epoch": 0.8004208627686757, "flos": 20885915825280.0, "grad_norm": 1.7459612539790001, "language_loss": 0.7636776, "learning_rate": 4.033789768462843e-07, "loss": 0.78462338, "num_input_tokens_seen": 287203875, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.375, "step": 13313, "time_per_iteration": 2.509648084640503 }, { "auxiliary_loss_clip": 0.01052269, "auxiliary_loss_mlp": 0.01037635, "balance_loss_clip": 1.01504469, "balance_loss_mlp": 1.01616359, "epoch": 0.8004809860213438, "flos": 26434836495360.0, "grad_norm": 1.320888692004062, "language_loss": 0.76254028, "learning_rate": 4.031444553532575e-07, "loss": 0.78343934, "num_input_tokens_seen": 287226445, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.36132812, "step": 13314, "time_per_iteration": 2.4302194118499756 }, { "auxiliary_loss_clip": 0.01008051, "auxiliary_loss_mlp": 0.01002174, "balance_loss_clip": 0.99994493, "balance_loss_mlp": 1.00098968, "epoch": 0.8005411092740117, "flos": 63645335504640.0, "grad_norm": 0.7849755740285583, "language_loss": 0.53750026, "learning_rate": 4.029099944131522e-07, "loss": 0.55760252, "num_input_tokens_seen": 287286240, "router_z_loss_clip": 0.02233887, "router_z_loss_mlp": 0.07080078, "step": 13315, "time_per_iteration": 2.9406778812408447 }, { "auxiliary_loss_clip": 0.01052694, "auxiliary_loss_mlp": 0.01035504, "balance_loss_clip": 1.01309288, "balance_loss_mlp": 1.01690245, "epoch": 0.8006012325266797, "flos": 36136571454720.0, "grad_norm": 1.5026080924211764, "language_loss": 0.72322857, "learning_rate": 4.026755940348603e-07, "loss": 0.74411052, "num_input_tokens_seen": 287310265, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35742188, "step": 13316, "time_per_iteration": 2.5139594078063965 }, { "auxiliary_loss_clip": 0.01053962, "auxiliary_loss_mlp": 0.01044399, "balance_loss_clip": 1.01922214, "balance_loss_mlp": 1.01621783, "epoch": 0.8006613557793477, "flos": 33837540825600.0, "grad_norm": 2.107985108984309, "language_loss": 0.65915465, "learning_rate": 4.024412542272706e-07, "loss": 0.68013823, "num_input_tokens_seen": 287331610, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37890625, "step": 13317, "time_per_iteration": 2.5097193717956543 }, { "auxiliary_loss_clip": 0.01008046, "auxiliary_loss_mlp": 0.01002888, "balance_loss_clip": 1.00069451, "balance_loss_mlp": 1.00115693, "epoch": 0.8007214790320156, "flos": 67344522232320.0, "grad_norm": 0.7709447474508325, "language_loss": 0.59140134, "learning_rate": 4.0220697499926783e-07, "loss": 0.61151075, "num_input_tokens_seen": 287394795, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.06884766, "step": 13318, "time_per_iteration": 3.1183090209960938 }, { "auxiliary_loss_clip": 0.01051068, "auxiliary_loss_mlp": 0.01035789, "balance_loss_clip": 1.01318741, "balance_loss_mlp": 1.01603055, "epoch": 0.8007816022846836, "flos": 23184527518080.0, "grad_norm": 1.5696038789644844, "language_loss": 0.66907722, "learning_rate": 4.019727563597366e-07, "loss": 0.68994582, "num_input_tokens_seen": 287414595, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 13319, "time_per_iteration": 2.3894925117492676 }, { "auxiliary_loss_clip": 0.01054559, "auxiliary_loss_mlp": 0.01040667, "balance_loss_clip": 1.01377392, "balance_loss_mlp": 1.01694345, "epoch": 0.8008417255373516, "flos": 21980303303040.0, "grad_norm": 1.89950748796875, "language_loss": 0.74772179, "learning_rate": 4.0173859831755873e-07, "loss": 0.76867402, "num_input_tokens_seen": 287434395, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.375, "step": 13320, "time_per_iteration": 2.401658535003662 }, { "auxiliary_loss_clip": 0.01054125, "auxiliary_loss_mlp": 0.01039708, "balance_loss_clip": 1.01635551, "balance_loss_mlp": 1.017084, "epoch": 0.8009018487900196, "flos": 16726398554880.0, "grad_norm": 1.9267544552564826, "language_loss": 0.81560904, "learning_rate": 4.015045008816138e-07, "loss": 0.83654737, "num_input_tokens_seen": 287450590, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.37109375, "step": 13321, "time_per_iteration": 2.359140396118164 }, { "auxiliary_loss_clip": 0.01048825, "auxiliary_loss_mlp": 0.01032154, "balance_loss_clip": 1.01104164, "balance_loss_mlp": 1.01433873, "epoch": 0.8009619720426875, "flos": 20812563325440.0, "grad_norm": 1.860669395848053, "language_loss": 0.66980684, "learning_rate": 4.0127046406077825e-07, "loss": 0.69061661, "num_input_tokens_seen": 287468455, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34375, "step": 13322, "time_per_iteration": 2.348242998123169 }, { "auxiliary_loss_clip": 0.01051492, "auxiliary_loss_mlp": 0.01037257, "balance_loss_clip": 1.01432085, "balance_loss_mlp": 1.01611876, "epoch": 0.8010220952953555, "flos": 17930134010880.0, "grad_norm": 1.7501099757362812, "language_loss": 0.78971565, "learning_rate": 4.010364878639265e-07, "loss": 0.81060314, "num_input_tokens_seen": 287486485, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35351562, "step": 13323, "time_per_iteration": 2.360288381576538 }, { "auxiliary_loss_clip": 0.01053798, "auxiliary_loss_mlp": 0.01036675, "balance_loss_clip": 1.01173639, "balance_loss_mlp": 1.01602066, "epoch": 0.8010822185480234, "flos": 24571068186240.0, "grad_norm": 2.2340522059976675, "language_loss": 0.73352575, "learning_rate": 4.00802572299932e-07, "loss": 0.75443053, "num_input_tokens_seen": 287503940, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37695312, "step": 13324, "time_per_iteration": 3.8230392932891846 }, { "auxiliary_loss_clip": 0.01053816, "auxiliary_loss_mlp": 0.01038019, "balance_loss_clip": 1.01235318, "balance_loss_mlp": 1.0159018, "epoch": 0.8011423418006914, "flos": 21829059826560.0, "grad_norm": 1.7224440933386287, "language_loss": 0.7705043, "learning_rate": 4.005687173776635e-07, "loss": 0.79142261, "num_input_tokens_seen": 287521660, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37890625, "step": 13325, "time_per_iteration": 2.3857614994049072 }, { "auxiliary_loss_clip": 0.01047502, "auxiliary_loss_mlp": 0.01034817, "balance_loss_clip": 1.01285911, "balance_loss_mlp": 1.01406908, "epoch": 0.8012024650533593, "flos": 23914037710080.0, "grad_norm": 1.6374828178326157, "language_loss": 0.80544239, "learning_rate": 4.003349231059898e-07, "loss": 0.82626557, "num_input_tokens_seen": 287541505, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.33398438, "step": 13326, "time_per_iteration": 2.379748821258545 }, { "auxiliary_loss_clip": 0.01050569, "auxiliary_loss_mlp": 0.01041937, "balance_loss_clip": 1.02002633, "balance_loss_mlp": 1.01590276, "epoch": 0.8012625883060274, "flos": 23585923952640.0, "grad_norm": 2.120157775548809, "language_loss": 0.67181134, "learning_rate": 4.001011894937765e-07, "loss": 0.69273639, "num_input_tokens_seen": 287560015, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34570312, "step": 13327, "time_per_iteration": 2.3907110691070557 }, { "auxiliary_loss_clip": 0.01049639, "auxiliary_loss_mlp": 0.01036789, "balance_loss_clip": 1.01423478, "balance_loss_mlp": 1.01561117, "epoch": 0.8013227115586953, "flos": 20812877527680.0, "grad_norm": 1.5469651349580793, "language_loss": 0.74252659, "learning_rate": 3.9986751654988636e-07, "loss": 0.7633909, "num_input_tokens_seen": 287579150, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.33984375, "step": 13328, "time_per_iteration": 2.3585877418518066 }, { "auxiliary_loss_clip": 0.01052366, "auxiliary_loss_mlp": 0.01042959, "balance_loss_clip": 1.0178895, "balance_loss_mlp": 1.01599586, "epoch": 0.8013828348113633, "flos": 15887400739200.0, "grad_norm": 1.9683315520131375, "language_loss": 0.75541466, "learning_rate": 3.996339042831798e-07, "loss": 0.77636796, "num_input_tokens_seen": 287597420, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36328125, "step": 13329, "time_per_iteration": 2.3579304218292236 }, { "auxiliary_loss_clip": 0.01007896, "auxiliary_loss_mlp": 0.01003801, "balance_loss_clip": 1.00158405, "balance_loss_mlp": 1.00105822, "epoch": 0.8014429580640313, "flos": 71059281292800.0, "grad_norm": 0.6947462736174085, "language_loss": 0.53083634, "learning_rate": 3.9940035270251605e-07, "loss": 0.55095327, "num_input_tokens_seen": 287667280, "router_z_loss_clip": 0.0222168, "router_z_loss_mlp": 0.06835938, "step": 13330, "time_per_iteration": 3.0835137367248535 }, { "auxiliary_loss_clip": 0.01053917, "auxiliary_loss_mlp": 0.0104191, "balance_loss_clip": 1.01415849, "balance_loss_mlp": 1.01613927, "epoch": 0.8015030813166992, "flos": 23075214451200.0, "grad_norm": 1.7232341271587923, "language_loss": 0.74295926, "learning_rate": 3.991668618167519e-07, "loss": 0.76391751, "num_input_tokens_seen": 287687375, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.37890625, "step": 13331, "time_per_iteration": 2.3858482837677 }, { "auxiliary_loss_clip": 0.01050671, "auxiliary_loss_mlp": 0.01035365, "balance_loss_clip": 1.01424146, "balance_loss_mlp": 1.01630259, "epoch": 0.8015632045693672, "flos": 21871234615680.0, "grad_norm": 1.8245883531802316, "language_loss": 0.77988845, "learning_rate": 3.989334316347401e-07, "loss": 0.80074883, "num_input_tokens_seen": 287707895, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34375, "step": 13332, "time_per_iteration": 2.412506580352783 }, { "auxiliary_loss_clip": 0.01052378, "auxiliary_loss_mlp": 0.01036361, "balance_loss_clip": 1.0137589, "balance_loss_mlp": 1.01654899, "epoch": 0.8016233278220352, "flos": 23655680582400.0, "grad_norm": 1.8287657830796051, "language_loss": 0.84158325, "learning_rate": 3.987000621653338e-07, "loss": 0.86247063, "num_input_tokens_seen": 287723990, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35742188, "step": 13333, "time_per_iteration": 2.3736159801483154 }, { "auxiliary_loss_clip": 0.01052545, "auxiliary_loss_mlp": 0.01041898, "balance_loss_clip": 1.0167923, "balance_loss_mlp": 1.01549244, "epoch": 0.8016834510747032, "flos": 16252243113600.0, "grad_norm": 1.6290612763086185, "language_loss": 0.74924469, "learning_rate": 3.9846675341738133e-07, "loss": 0.77018911, "num_input_tokens_seen": 287742380, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37109375, "step": 13334, "time_per_iteration": 2.3352394104003906 }, { "auxiliary_loss_clip": 0.01051079, "auxiliary_loss_mlp": 0.01032383, "balance_loss_clip": 1.00970936, "balance_loss_mlp": 1.01619148, "epoch": 0.8017435743273711, "flos": 12275216853120.0, "grad_norm": 2.2138216747296457, "language_loss": 0.7603662, "learning_rate": 3.9823350539972967e-07, "loss": 0.78120083, "num_input_tokens_seen": 287760130, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34765625, "step": 13335, "time_per_iteration": 2.3888309001922607 }, { "auxiliary_loss_clip": 0.01051595, "auxiliary_loss_mlp": 0.01034877, "balance_loss_clip": 1.01144052, "balance_loss_mlp": 1.01578081, "epoch": 0.8018036975800391, "flos": 17195317292160.0, "grad_norm": 1.8837422668642683, "language_loss": 0.77163488, "learning_rate": 3.9800031812122416e-07, "loss": 0.79249954, "num_input_tokens_seen": 287777565, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.359375, "step": 13336, "time_per_iteration": 2.334930181503296 }, { "auxiliary_loss_clip": 0.01057078, "auxiliary_loss_mlp": 0.01040056, "balance_loss_clip": 1.01411593, "balance_loss_mlp": 1.01797438, "epoch": 0.801863820832707, "flos": 20630805454080.0, "grad_norm": 2.0479402315508883, "language_loss": 0.76641911, "learning_rate": 3.977671915907068e-07, "loss": 0.78739047, "num_input_tokens_seen": 287796310, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.390625, "step": 13337, "time_per_iteration": 2.3789114952087402 }, { "auxiliary_loss_clip": 0.0105402, "auxiliary_loss_mlp": 0.01042037, "balance_loss_clip": 1.01826656, "balance_loss_mlp": 1.01695466, "epoch": 0.801923944085375, "flos": 30444262364160.0, "grad_norm": 1.7224452421461776, "language_loss": 0.81142694, "learning_rate": 3.9753412581701883e-07, "loss": 0.83238745, "num_input_tokens_seen": 287817330, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.37109375, "step": 13338, "time_per_iteration": 2.41096830368042 }, { "auxiliary_loss_clip": 0.01052113, "auxiliary_loss_mlp": 0.01037902, "balance_loss_clip": 1.01365519, "balance_loss_mlp": 1.0153352, "epoch": 0.801984067338043, "flos": 20009560988160.0, "grad_norm": 1.952040912500146, "language_loss": 0.75693399, "learning_rate": 3.9730112080899733e-07, "loss": 0.77783418, "num_input_tokens_seen": 287835095, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36914062, "step": 13339, "time_per_iteration": 2.3934712409973145 }, { "auxiliary_loss_clip": 0.01050728, "auxiliary_loss_mlp": 0.01034347, "balance_loss_clip": 1.01246023, "balance_loss_mlp": 1.01689601, "epoch": 0.802044190590711, "flos": 22782921615360.0, "grad_norm": 1.620327394438716, "language_loss": 0.79667127, "learning_rate": 3.970681765754775e-07, "loss": 0.81752205, "num_input_tokens_seen": 287854595, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.33984375, "step": 13340, "time_per_iteration": 2.39589262008667 }, { "auxiliary_loss_clip": 0.01051442, "auxiliary_loss_mlp": 0.01039098, "balance_loss_clip": 1.01734281, "balance_loss_mlp": 1.01558757, "epoch": 0.8021043138433789, "flos": 27598875868800.0, "grad_norm": 1.831597165258107, "language_loss": 0.68651378, "learning_rate": 3.968352931252936e-07, "loss": 0.70741922, "num_input_tokens_seen": 287876960, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.359375, "step": 13341, "time_per_iteration": 2.434870958328247 }, { "auxiliary_loss_clip": 0.01007948, "auxiliary_loss_mlp": 0.01007703, "balance_loss_clip": 1.00539064, "balance_loss_mlp": 1.00099778, "epoch": 0.8021644370960469, "flos": 62060942828160.0, "grad_norm": 0.8205224505314526, "language_loss": 0.61909592, "learning_rate": 3.9660247046727547e-07, "loss": 0.63925248, "num_input_tokens_seen": 287936530, "router_z_loss_clip": 0.02307129, "router_z_loss_mlp": 0.06933594, "step": 13342, "time_per_iteration": 4.291916370391846 }, { "auxiliary_loss_clip": 0.01053948, "auxiliary_loss_mlp": 0.01040342, "balance_loss_clip": 1.01398492, "balance_loss_mlp": 1.01713276, "epoch": 0.8022245603487148, "flos": 23360839217280.0, "grad_norm": 1.8345905560279367, "language_loss": 0.64450687, "learning_rate": 3.963697086102522e-07, "loss": 0.6654498, "num_input_tokens_seen": 287954285, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.3671875, "step": 13343, "time_per_iteration": 2.386303663253784 }, { "auxiliary_loss_clip": 0.01048874, "auxiliary_loss_mlp": 0.01033493, "balance_loss_clip": 1.01161766, "balance_loss_mlp": 1.01547432, "epoch": 0.8022846836013828, "flos": 10852575972480.0, "grad_norm": 1.8874222884025036, "language_loss": 0.69124103, "learning_rate": 3.96137007563051e-07, "loss": 0.71206462, "num_input_tokens_seen": 287971595, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.33398438, "step": 13344, "time_per_iteration": 2.3325257301330566 }, { "auxiliary_loss_clip": 0.01053036, "auxiliary_loss_mlp": 0.01036636, "balance_loss_clip": 1.01379538, "balance_loss_mlp": 1.01685929, "epoch": 0.8023448068540509, "flos": 29239200276480.0, "grad_norm": 1.7017643353638807, "language_loss": 0.71682376, "learning_rate": 3.9590436733449506e-07, "loss": 0.73772049, "num_input_tokens_seen": 287992540, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36328125, "step": 13345, "time_per_iteration": 2.4552292823791504 }, { "auxiliary_loss_clip": 0.01007927, "auxiliary_loss_mlp": 0.01002321, "balance_loss_clip": 1.00000882, "balance_loss_mlp": 1.0008738, "epoch": 0.8024049301067188, "flos": 64150459188480.0, "grad_norm": 0.9235124125767383, "language_loss": 0.63053, "learning_rate": 3.956717879334059e-07, "loss": 0.6506325, "num_input_tokens_seen": 288052810, "router_z_loss_clip": 0.02307129, "router_z_loss_mlp": 0.0703125, "step": 13346, "time_per_iteration": 3.0966389179229736 }, { "auxiliary_loss_clip": 0.01050293, "auxiliary_loss_mlp": 0.01034037, "balance_loss_clip": 1.01287758, "balance_loss_mlp": 1.01593232, "epoch": 0.8024650533593868, "flos": 28584089925120.0, "grad_norm": 1.4661475628143572, "language_loss": 0.73177934, "learning_rate": 3.9543926936860327e-07, "loss": 0.75262266, "num_input_tokens_seen": 288073045, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34375, "step": 13347, "time_per_iteration": 2.431648015975952 }, { "auxiliary_loss_clip": 0.01051577, "auxiliary_loss_mlp": 0.01035846, "balance_loss_clip": 1.01341116, "balance_loss_mlp": 1.01549089, "epoch": 0.8025251766120547, "flos": 16981334369280.0, "grad_norm": 1.7848041742851792, "language_loss": 0.74173111, "learning_rate": 3.9520681164890493e-07, "loss": 0.76260531, "num_input_tokens_seen": 288091165, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.359375, "step": 13348, "time_per_iteration": 3.7682647705078125 }, { "auxiliary_loss_clip": 0.01051815, "auxiliary_loss_mlp": 0.01036453, "balance_loss_clip": 1.01342189, "balance_loss_mlp": 1.01616299, "epoch": 0.8025852998647227, "flos": 22162584844800.0, "grad_norm": 2.633565589423174, "language_loss": 0.7729876, "learning_rate": 3.9497441478312444e-07, "loss": 0.79387027, "num_input_tokens_seen": 288110595, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35546875, "step": 13349, "time_per_iteration": 2.353545904159546 }, { "auxiliary_loss_clip": 0.01051609, "auxiliary_loss_mlp": 0.01038766, "balance_loss_clip": 1.01727271, "balance_loss_mlp": 1.01680374, "epoch": 0.8026454231173906, "flos": 22015949667840.0, "grad_norm": 2.114923139987294, "language_loss": 0.84553611, "learning_rate": 3.947420787800755e-07, "loss": 0.86643988, "num_input_tokens_seen": 288128995, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34765625, "step": 13350, "time_per_iteration": 3.6777758598327637 }, { "auxiliary_loss_clip": 0.01052193, "auxiliary_loss_mlp": 0.01036385, "balance_loss_clip": 1.01422381, "balance_loss_mlp": 1.0167346, "epoch": 0.8027055463700586, "flos": 22490244754560.0, "grad_norm": 1.6401560057988513, "language_loss": 0.7245332, "learning_rate": 3.945098036485679e-07, "loss": 0.74541897, "num_input_tokens_seen": 288149265, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35546875, "step": 13351, "time_per_iteration": 2.361637830734253 }, { "auxiliary_loss_clip": 0.01050812, "auxiliary_loss_mlp": 0.0103426, "balance_loss_clip": 1.01307666, "balance_loss_mlp": 1.01674318, "epoch": 0.8027656696227266, "flos": 28911645100800.0, "grad_norm": 4.160200651377108, "language_loss": 0.62426066, "learning_rate": 3.9427758939740885e-07, "loss": 0.64511138, "num_input_tokens_seen": 288170745, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34179688, "step": 13352, "time_per_iteration": 2.4500269889831543 }, { "auxiliary_loss_clip": 0.01051646, "auxiliary_loss_mlp": 0.0103417, "balance_loss_clip": 1.01125801, "balance_loss_mlp": 1.01628006, "epoch": 0.8028257928753946, "flos": 18588351473280.0, "grad_norm": 1.9956389786070885, "language_loss": 0.78019881, "learning_rate": 3.940454360354046e-07, "loss": 0.80105692, "num_input_tokens_seen": 288189415, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35351562, "step": 13353, "time_per_iteration": 2.3454771041870117 }, { "auxiliary_loss_clip": 0.01055744, "auxiliary_loss_mlp": 0.01041053, "balance_loss_clip": 1.01476741, "balance_loss_mlp": 1.01707458, "epoch": 0.8028859161280625, "flos": 19129156433280.0, "grad_norm": 2.2114956149866685, "language_loss": 0.74152648, "learning_rate": 3.938133435713582e-07, "loss": 0.76249444, "num_input_tokens_seen": 288206900, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38671875, "step": 13354, "time_per_iteration": 2.3706772327423096 }, { "auxiliary_loss_clip": 0.01052977, "auxiliary_loss_mlp": 0.01038542, "balance_loss_clip": 1.01691699, "balance_loss_mlp": 1.01592541, "epoch": 0.8029460393807305, "flos": 20228850437760.0, "grad_norm": 2.7240161130778486, "language_loss": 0.66969424, "learning_rate": 3.935813120140714e-07, "loss": 0.69060946, "num_input_tokens_seen": 288224800, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.37109375, "step": 13355, "time_per_iteration": 2.3424570560455322 }, { "auxiliary_loss_clip": 0.01054902, "auxiliary_loss_mlp": 0.01039631, "balance_loss_clip": 1.01421583, "balance_loss_mlp": 1.01692808, "epoch": 0.8030061626333984, "flos": 49781466789120.0, "grad_norm": 2.9230096449408243, "language_loss": 0.70407653, "learning_rate": 3.9334934137234235e-07, "loss": 0.7250219, "num_input_tokens_seen": 288249400, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38085938, "step": 13356, "time_per_iteration": 2.609100103378296 }, { "auxiliary_loss_clip": 0.01053576, "auxiliary_loss_mlp": 0.01036834, "balance_loss_clip": 1.01512587, "balance_loss_mlp": 1.01732349, "epoch": 0.8030662858860664, "flos": 21614204119680.0, "grad_norm": 1.5703299637043893, "language_loss": 0.77969623, "learning_rate": 3.931174316549666e-07, "loss": 0.80060035, "num_input_tokens_seen": 288268780, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.36328125, "step": 13357, "time_per_iteration": 2.347810983657837 }, { "auxiliary_loss_clip": 0.01054253, "auxiliary_loss_mlp": 0.0103845, "balance_loss_clip": 1.01498961, "balance_loss_mlp": 1.01680017, "epoch": 0.8031264091387345, "flos": 25628866692480.0, "grad_norm": 1.5413428871589125, "language_loss": 0.78116179, "learning_rate": 3.9288558287073937e-07, "loss": 0.80208886, "num_input_tokens_seen": 288290830, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.375, "step": 13358, "time_per_iteration": 2.416837215423584 }, { "auxiliary_loss_clip": 0.0105065, "auxiliary_loss_mlp": 0.01033908, "balance_loss_clip": 1.01192582, "balance_loss_mlp": 1.01536763, "epoch": 0.8031865323914024, "flos": 19645207372800.0, "grad_norm": 1.7178316680600967, "language_loss": 0.85859776, "learning_rate": 3.9265379502845143e-07, "loss": 0.87944329, "num_input_tokens_seen": 288308865, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35351562, "step": 13359, "time_per_iteration": 2.3387808799743652 }, { "auxiliary_loss_clip": 0.01050743, "auxiliary_loss_mlp": 0.01039165, "balance_loss_clip": 1.01719451, "balance_loss_mlp": 1.01611495, "epoch": 0.8032466556440704, "flos": 26168135552640.0, "grad_norm": 2.2326316017315104, "language_loss": 0.74481964, "learning_rate": 3.924220681368928e-07, "loss": 0.7657187, "num_input_tokens_seen": 288327325, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34765625, "step": 13360, "time_per_iteration": 2.3980977535247803 }, { "auxiliary_loss_clip": 0.0105231, "auxiliary_loss_mlp": 0.01036519, "balance_loss_clip": 1.01410758, "balance_loss_mlp": 1.01612628, "epoch": 0.8033067788967383, "flos": 25518855398400.0, "grad_norm": 1.7692255626222524, "language_loss": 0.70806348, "learning_rate": 3.921904022048512e-07, "loss": 0.72895181, "num_input_tokens_seen": 288347285, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.36132812, "step": 13361, "time_per_iteration": 2.3794262409210205 }, { "auxiliary_loss_clip": 0.01054874, "auxiliary_loss_mlp": 0.01043849, "balance_loss_clip": 1.01911294, "balance_loss_mlp": 1.01690078, "epoch": 0.8033669021494063, "flos": 24023141308800.0, "grad_norm": 1.5344241936815886, "language_loss": 0.71472299, "learning_rate": 3.919587972411098e-07, "loss": 0.73571026, "num_input_tokens_seen": 288367785, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37890625, "step": 13362, "time_per_iteration": 2.4027817249298096 }, { "auxiliary_loss_clip": 0.0105661, "auxiliary_loss_mlp": 0.01046482, "balance_loss_clip": 1.01884937, "balance_loss_mlp": 1.01694763, "epoch": 0.8034270254020742, "flos": 13587252946560.0, "grad_norm": 2.4743359371358133, "language_loss": 0.81032646, "learning_rate": 3.91727253254452e-07, "loss": 0.83135736, "num_input_tokens_seen": 288384135, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.3984375, "step": 13363, "time_per_iteration": 3.9008572101593018 }, { "auxiliary_loss_clip": 0.01052716, "auxiliary_loss_mlp": 0.01036657, "balance_loss_clip": 1.01239765, "balance_loss_mlp": 1.01650953, "epoch": 0.8034871486547422, "flos": 27411567091200.0, "grad_norm": 2.0974019649309845, "language_loss": 0.76328588, "learning_rate": 3.9149577025365787e-07, "loss": 0.78417957, "num_input_tokens_seen": 288403805, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36132812, "step": 13364, "time_per_iteration": 2.411773920059204 }, { "auxiliary_loss_clip": 0.01052496, "auxiliary_loss_mlp": 0.01033175, "balance_loss_clip": 1.01008391, "balance_loss_mlp": 1.01742244, "epoch": 0.8035472719074102, "flos": 32597216398080.0, "grad_norm": 2.352804596033289, "language_loss": 0.6291002, "learning_rate": 3.9126434824750596e-07, "loss": 0.64995694, "num_input_tokens_seen": 288424895, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.3515625, "step": 13365, "time_per_iteration": 2.4736979007720947 }, { "auxiliary_loss_clip": 0.01053668, "auxiliary_loss_mlp": 0.01041508, "balance_loss_clip": 1.01790404, "balance_loss_mlp": 1.01674676, "epoch": 0.8036073951600782, "flos": 21286928234880.0, "grad_norm": 1.7492410949288033, "language_loss": 0.67413259, "learning_rate": 3.910329872447706e-07, "loss": 0.69508427, "num_input_tokens_seen": 288443865, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36914062, "step": 13366, "time_per_iteration": 2.3595855236053467 }, { "auxiliary_loss_clip": 0.01052224, "auxiliary_loss_mlp": 0.01034725, "balance_loss_clip": 1.01115751, "balance_loss_mlp": 1.01682925, "epoch": 0.8036675184127461, "flos": 18112869400320.0, "grad_norm": 1.91564155329551, "language_loss": 0.74958402, "learning_rate": 3.908016872542259e-07, "loss": 0.77045351, "num_input_tokens_seen": 288461065, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.35351562, "step": 13367, "time_per_iteration": 2.340679407119751 }, { "auxiliary_loss_clip": 0.01050659, "auxiliary_loss_mlp": 0.01031942, "balance_loss_clip": 1.00990009, "balance_loss_mlp": 1.01583803, "epoch": 0.8037276416654141, "flos": 26029111052160.0, "grad_norm": 1.5102241490083312, "language_loss": 0.75102967, "learning_rate": 3.905704482846428e-07, "loss": 0.77185571, "num_input_tokens_seen": 288481865, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34765625, "step": 13368, "time_per_iteration": 2.4152610301971436 }, { "auxiliary_loss_clip": 0.01054877, "auxiliary_loss_mlp": 0.01040366, "balance_loss_clip": 1.01588082, "balance_loss_mlp": 1.01681185, "epoch": 0.803787764918082, "flos": 18801321966720.0, "grad_norm": 2.0659227123862047, "language_loss": 0.71523106, "learning_rate": 3.90339270344789e-07, "loss": 0.73618346, "num_input_tokens_seen": 288499345, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.38085938, "step": 13369, "time_per_iteration": 2.346855401992798 }, { "auxiliary_loss_clip": 0.01050276, "auxiliary_loss_mlp": 0.01038915, "balance_loss_clip": 1.01644444, "balance_loss_mlp": 1.01567674, "epoch": 0.80384788817075, "flos": 20224870542720.0, "grad_norm": 2.332070694829283, "language_loss": 0.74709988, "learning_rate": 3.901081534434312e-07, "loss": 0.76799178, "num_input_tokens_seen": 288517660, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34570312, "step": 13370, "time_per_iteration": 2.3632099628448486 }, { "auxiliary_loss_clip": 0.01054533, "auxiliary_loss_mlp": 0.01044536, "balance_loss_clip": 1.01745129, "balance_loss_mlp": 1.01686764, "epoch": 0.8039080114234181, "flos": 18514300746240.0, "grad_norm": 2.4378821026155664, "language_loss": 0.87594378, "learning_rate": 3.898770975893342e-07, "loss": 0.89693451, "num_input_tokens_seen": 288534180, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.37695312, "step": 13371, "time_per_iteration": 2.333332061767578 }, { "auxiliary_loss_clip": 0.01055475, "auxiliary_loss_mlp": 0.01037821, "balance_loss_clip": 1.01145196, "balance_loss_mlp": 1.01699102, "epoch": 0.803968134676086, "flos": 22381420446720.0, "grad_norm": 1.927583345598428, "language_loss": 0.75550526, "learning_rate": 3.89646102791259e-07, "loss": 0.77643824, "num_input_tokens_seen": 288553350, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.38476562, "step": 13372, "time_per_iteration": 2.3718769550323486 }, { "auxiliary_loss_clip": 0.01051343, "auxiliary_loss_mlp": 0.01040084, "balance_loss_clip": 1.01502633, "balance_loss_mlp": 1.01518607, "epoch": 0.804028257928754, "flos": 23841418348800.0, "grad_norm": 2.052047545042118, "language_loss": 0.79891813, "learning_rate": 3.894151690579646e-07, "loss": 0.81983232, "num_input_tokens_seen": 288571325, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36132812, "step": 13373, "time_per_iteration": 2.392277717590332 }, { "auxiliary_loss_clip": 0.01049686, "auxiliary_loss_mlp": 0.01041061, "balance_loss_clip": 1.01880431, "balance_loss_mlp": 1.01562333, "epoch": 0.8040883811814219, "flos": 23549579360640.0, "grad_norm": 1.7688629261445346, "language_loss": 0.756464, "learning_rate": 3.8918429639820815e-07, "loss": 0.77737153, "num_input_tokens_seen": 288592100, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.33984375, "step": 13374, "time_per_iteration": 2.3956637382507324 }, { "auxiliary_loss_clip": 0.01053063, "auxiliary_loss_mlp": 0.01037299, "balance_loss_clip": 1.01174021, "balance_loss_mlp": 1.01568484, "epoch": 0.8041485044340899, "flos": 19025254627200.0, "grad_norm": 1.9928352070748598, "language_loss": 0.70245135, "learning_rate": 3.889534848207452e-07, "loss": 0.723355, "num_input_tokens_seen": 288612305, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37304688, "step": 13375, "time_per_iteration": 2.3643784523010254 }, { "auxiliary_loss_clip": 0.01007722, "auxiliary_loss_mlp": 0.01003464, "balance_loss_clip": 1.0010438, "balance_loss_mlp": 1.00080132, "epoch": 0.8042086276867578, "flos": 70001971545600.0, "grad_norm": 0.7263777541205627, "language_loss": 0.55774832, "learning_rate": 3.887227343343271e-07, "loss": 0.57786024, "num_input_tokens_seen": 288676015, "router_z_loss_clip": 0.02416992, "router_z_loss_mlp": 0.06933594, "step": 13376, "time_per_iteration": 3.098686456680298 }, { "auxiliary_loss_clip": 0.0105414, "auxiliary_loss_mlp": 0.0103495, "balance_loss_clip": 1.01233625, "balance_loss_mlp": 1.01734209, "epoch": 0.8042687509394258, "flos": 21871339349760.0, "grad_norm": 1.5031579436964206, "language_loss": 0.73897696, "learning_rate": 3.8849204494770425e-07, "loss": 0.75986779, "num_input_tokens_seen": 288696455, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3671875, "step": 13377, "time_per_iteration": 2.3809173107147217 }, { "auxiliary_loss_clip": 0.01051863, "auxiliary_loss_mlp": 0.01039281, "balance_loss_clip": 1.01559401, "balance_loss_mlp": 1.01578736, "epoch": 0.8043288741920938, "flos": 26613661812480.0, "grad_norm": 1.8159647882692662, "language_loss": 0.7215395, "learning_rate": 3.8826141666962567e-07, "loss": 0.74245101, "num_input_tokens_seen": 288715560, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36132812, "step": 13378, "time_per_iteration": 2.4194066524505615 }, { "auxiliary_loss_clip": 0.0105445, "auxiliary_loss_mlp": 0.01033601, "balance_loss_clip": 1.00888896, "balance_loss_mlp": 1.01740837, "epoch": 0.8043889974447618, "flos": 33401929392000.0, "grad_norm": 1.396960069121706, "language_loss": 0.69803643, "learning_rate": 3.880308495088347e-07, "loss": 0.71891689, "num_input_tokens_seen": 288739485, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.37109375, "step": 13379, "time_per_iteration": 2.4932446479797363 }, { "auxiliary_loss_clip": 0.01054826, "auxiliary_loss_mlp": 0.01039483, "balance_loss_clip": 1.01492631, "balance_loss_mlp": 1.01732111, "epoch": 0.8044491206974297, "flos": 20374927032960.0, "grad_norm": 1.6944671937747975, "language_loss": 0.77746379, "learning_rate": 3.8780034347407533e-07, "loss": 0.7984069, "num_input_tokens_seen": 288757420, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.375, "step": 13380, "time_per_iteration": 2.481358528137207 }, { "auxiliary_loss_clip": 0.01050191, "auxiliary_loss_mlp": 0.01033559, "balance_loss_clip": 1.0114336, "balance_loss_mlp": 1.01452518, "epoch": 0.8045092439500977, "flos": 23402699804160.0, "grad_norm": 1.9414360163556912, "language_loss": 0.69967115, "learning_rate": 3.875698985740887e-07, "loss": 0.72050858, "num_input_tokens_seen": 288775535, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.35742188, "step": 13381, "time_per_iteration": 2.3922390937805176 }, { "auxiliary_loss_clip": 0.01054063, "auxiliary_loss_mlp": 0.01039188, "balance_loss_clip": 1.01490545, "balance_loss_mlp": 1.01741099, "epoch": 0.8045693672027656, "flos": 24096109783680.0, "grad_norm": 2.0034411651396673, "language_loss": 0.65305787, "learning_rate": 3.873395148176135e-07, "loss": 0.67399037, "num_input_tokens_seen": 288795035, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.3671875, "step": 13382, "time_per_iteration": 3.6145522594451904 }, { "auxiliary_loss_clip": 0.01051722, "auxiliary_loss_mlp": 0.01039409, "balance_loss_clip": 1.01489925, "balance_loss_mlp": 1.01606131, "epoch": 0.8046294904554336, "flos": 27705989520000.0, "grad_norm": 1.8127103148146533, "language_loss": 0.77267605, "learning_rate": 3.8710919221338487e-07, "loss": 0.79358733, "num_input_tokens_seen": 288816270, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.35546875, "step": 13383, "time_per_iteration": 2.4675276279449463 }, { "auxiliary_loss_clip": 0.01053159, "auxiliary_loss_mlp": 0.01041432, "balance_loss_clip": 1.01625454, "balance_loss_mlp": 1.01638174, "epoch": 0.8046896137081017, "flos": 24971836216320.0, "grad_norm": 1.8775215459795844, "language_loss": 0.70712167, "learning_rate": 3.868789307701381e-07, "loss": 0.72806758, "num_input_tokens_seen": 288836050, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3671875, "step": 13384, "time_per_iteration": 2.3875222206115723 }, { "auxiliary_loss_clip": 0.01052398, "auxiliary_loss_mlp": 0.01038343, "balance_loss_clip": 1.01397705, "balance_loss_mlp": 1.01527214, "epoch": 0.8047497369607696, "flos": 17674534880640.0, "grad_norm": 2.060524913877213, "language_loss": 0.8104583, "learning_rate": 3.8664873049660375e-07, "loss": 0.8313657, "num_input_tokens_seen": 288852900, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37109375, "step": 13385, "time_per_iteration": 2.3387656211853027 }, { "auxiliary_loss_clip": 0.01052889, "auxiliary_loss_mlp": 0.010332, "balance_loss_clip": 1.00952482, "balance_loss_mlp": 1.01622629, "epoch": 0.8048098602134376, "flos": 22381001510400.0, "grad_norm": 1.8774025184774346, "language_loss": 0.73294789, "learning_rate": 3.864185914015108e-07, "loss": 0.7538088, "num_input_tokens_seen": 288872625, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.3671875, "step": 13386, "time_per_iteration": 2.379587411880493 }, { "auxiliary_loss_clip": 0.01007939, "auxiliary_loss_mlp": 0.01002985, "balance_loss_clip": 1.00093508, "balance_loss_mlp": 1.0010066, "epoch": 0.8048699834661055, "flos": 71197467920640.0, "grad_norm": 0.6643578532361207, "language_loss": 0.51362604, "learning_rate": 3.861885134935865e-07, "loss": 0.53373528, "num_input_tokens_seen": 288939180, "router_z_loss_clip": 0.02050781, "router_z_loss_mlp": 0.06933594, "step": 13387, "time_per_iteration": 3.0725021362304688 }, { "auxiliary_loss_clip": 0.01052946, "auxiliary_loss_mlp": 0.01038989, "balance_loss_clip": 1.01310849, "balance_loss_mlp": 1.01607084, "epoch": 0.8049301067187735, "flos": 23659171718400.0, "grad_norm": 1.8147618151520588, "language_loss": 0.74790478, "learning_rate": 3.859584967815559e-07, "loss": 0.76882422, "num_input_tokens_seen": 288958925, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3671875, "step": 13388, "time_per_iteration": 3.846576452255249 }, { "auxiliary_loss_clip": 0.01050355, "auxiliary_loss_mlp": 0.01035029, "balance_loss_clip": 1.01137781, "balance_loss_mlp": 1.01571655, "epoch": 0.8049902299714414, "flos": 24425166147840.0, "grad_norm": 1.4706113700411905, "language_loss": 0.72493446, "learning_rate": 3.857285412741411e-07, "loss": 0.74578834, "num_input_tokens_seen": 288980935, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.34570312, "step": 13389, "time_per_iteration": 3.799210786819458 }, { "auxiliary_loss_clip": 0.01054071, "auxiliary_loss_mlp": 0.01039379, "balance_loss_clip": 1.01445282, "balance_loss_mlp": 1.01705313, "epoch": 0.8050503532241094, "flos": 17491694757120.0, "grad_norm": 1.9054734320014055, "language_loss": 0.83945084, "learning_rate": 3.8549864698006097e-07, "loss": 0.86038536, "num_input_tokens_seen": 288996780, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.37109375, "step": 13390, "time_per_iteration": 2.350457191467285 }, { "auxiliary_loss_clip": 0.01007573, "auxiliary_loss_mlp": 0.01002285, "balance_loss_clip": 1.00010335, "balance_loss_mlp": 1.00078273, "epoch": 0.8051104764767774, "flos": 57655112849280.0, "grad_norm": 0.7873959840666273, "language_loss": 0.55531448, "learning_rate": 3.8526881390803424e-07, "loss": 0.57541305, "num_input_tokens_seen": 289057590, "router_z_loss_clip": 0.02185059, "router_z_loss_mlp": 0.06787109, "step": 13391, "time_per_iteration": 3.0432040691375732 }, { "auxiliary_loss_clip": 0.01050749, "auxiliary_loss_mlp": 0.01033737, "balance_loss_clip": 1.01170683, "balance_loss_mlp": 1.01603794, "epoch": 0.8051705997294454, "flos": 18002508992640.0, "grad_norm": 1.5424382181754943, "language_loss": 0.85650045, "learning_rate": 3.850390420667762e-07, "loss": 0.87734532, "num_input_tokens_seen": 289076285, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34765625, "step": 13392, "time_per_iteration": 2.380176305770874 }, { "auxiliary_loss_clip": 0.0105232, "auxiliary_loss_mlp": 0.01038247, "balance_loss_clip": 1.01549029, "balance_loss_mlp": 1.0162921, "epoch": 0.8052307229821133, "flos": 26396501955840.0, "grad_norm": 1.3886930886977327, "language_loss": 0.70785856, "learning_rate": 3.8480933146499914e-07, "loss": 0.72876418, "num_input_tokens_seen": 289097585, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.36132812, "step": 13393, "time_per_iteration": 2.4559824466705322 }, { "auxiliary_loss_clip": 0.01053645, "auxiliary_loss_mlp": 0.01040649, "balance_loss_clip": 1.01726055, "balance_loss_mlp": 1.01697659, "epoch": 0.8052908462347813, "flos": 21756091351680.0, "grad_norm": 2.088333915745371, "language_loss": 0.76889896, "learning_rate": 3.84579682111414e-07, "loss": 0.78984189, "num_input_tokens_seen": 289116890, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36523438, "step": 13394, "time_per_iteration": 2.402182102203369 }, { "auxiliary_loss_clip": 0.01052257, "auxiliary_loss_mlp": 0.01037404, "balance_loss_clip": 1.01289499, "balance_loss_mlp": 1.01652479, "epoch": 0.8053509694874492, "flos": 25441243712640.0, "grad_norm": 1.572886581653276, "language_loss": 0.65252447, "learning_rate": 3.843500940147304e-07, "loss": 0.67342103, "num_input_tokens_seen": 289136670, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.35546875, "step": 13395, "time_per_iteration": 2.4021615982055664 }, { "auxiliary_loss_clip": 0.01008093, "auxiliary_loss_mlp": 0.01004344, "balance_loss_clip": 1.00198412, "balance_loss_mlp": 1.00133109, "epoch": 0.8054110927401172, "flos": 57664922941440.0, "grad_norm": 0.7543522517932097, "language_loss": 0.57589275, "learning_rate": 3.8412056718365206e-07, "loss": 0.59601712, "num_input_tokens_seen": 289200150, "router_z_loss_clip": 0.02355957, "router_z_loss_mlp": 0.06738281, "step": 13396, "time_per_iteration": 3.1906328201293945 }, { "auxiliary_loss_clip": 0.01051695, "auxiliary_loss_mlp": 0.01046145, "balance_loss_clip": 1.02208853, "balance_loss_mlp": 1.016078, "epoch": 0.8054712159927853, "flos": 19275337762560.0, "grad_norm": 1.7290534082398585, "language_loss": 0.78522277, "learning_rate": 3.8389110162688353e-07, "loss": 0.80620116, "num_input_tokens_seen": 289218125, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.35546875, "step": 13397, "time_per_iteration": 2.383577346801758 }, { "auxiliary_loss_clip": 0.01053221, "auxiliary_loss_mlp": 0.01033689, "balance_loss_clip": 1.0103476, "balance_loss_mlp": 1.01682353, "epoch": 0.8055313392454532, "flos": 17966653159680.0, "grad_norm": 1.5463885250522846, "language_loss": 0.71287692, "learning_rate": 3.836616973531266e-07, "loss": 0.73374599, "num_input_tokens_seen": 289237115, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36328125, "step": 13398, "time_per_iteration": 2.369751453399658 }, { "auxiliary_loss_clip": 0.0105158, "auxiliary_loss_mlp": 0.01036939, "balance_loss_clip": 1.01585102, "balance_loss_mlp": 1.01602089, "epoch": 0.8055914624981212, "flos": 13477555854720.0, "grad_norm": 2.0701267560785066, "language_loss": 0.70478308, "learning_rate": 3.834323543710805e-07, "loss": 0.72566831, "num_input_tokens_seen": 289253635, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.35546875, "step": 13399, "time_per_iteration": 2.3394076824188232 }, { "auxiliary_loss_clip": 0.01052943, "auxiliary_loss_mlp": 0.01036554, "balance_loss_clip": 1.01346362, "balance_loss_mlp": 1.01643705, "epoch": 0.8056515857507891, "flos": 13223946672000.0, "grad_norm": 2.261179531703943, "language_loss": 0.73212171, "learning_rate": 3.8320307268944153e-07, "loss": 0.75301665, "num_input_tokens_seen": 289270085, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36523438, "step": 13400, "time_per_iteration": 2.360828161239624 }, { "auxiliary_loss_clip": 0.01050417, "auxiliary_loss_mlp": 0.01037389, "balance_loss_clip": 1.01432228, "balance_loss_mlp": 1.01488662, "epoch": 0.8057117090034571, "flos": 23877064713600.0, "grad_norm": 1.903386264412526, "language_loss": 0.65134025, "learning_rate": 3.829738523169037e-07, "loss": 0.67221832, "num_input_tokens_seen": 289289645, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.35546875, "step": 13401, "time_per_iteration": 2.374128580093384 }, { "auxiliary_loss_clip": 0.01053868, "auxiliary_loss_mlp": 0.01038785, "balance_loss_clip": 1.01456201, "balance_loss_mlp": 1.01669204, "epoch": 0.805771832256125, "flos": 21213261532800.0, "grad_norm": 2.3779464637656584, "language_loss": 0.8587656, "learning_rate": 3.8274469326215985e-07, "loss": 0.8796922, "num_input_tokens_seen": 289306630, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37109375, "step": 13402, "time_per_iteration": 2.390835762023926 }, { "auxiliary_loss_clip": 0.01051935, "auxiliary_loss_mlp": 0.01039414, "balance_loss_clip": 1.01466656, "balance_loss_mlp": 1.01600766, "epoch": 0.805831955508793, "flos": 17565850218240.0, "grad_norm": 2.8536328263173383, "language_loss": 0.6956023, "learning_rate": 3.8251559553389876e-07, "loss": 0.71651578, "num_input_tokens_seen": 289324960, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.359375, "step": 13403, "time_per_iteration": 3.88546085357666 }, { "auxiliary_loss_clip": 0.01051982, "auxiliary_loss_mlp": 0.01035275, "balance_loss_clip": 1.01328063, "balance_loss_mlp": 1.01689351, "epoch": 0.805892078761461, "flos": 26906303761920.0, "grad_norm": 1.5781766394764625, "language_loss": 0.86019218, "learning_rate": 3.822865591408084e-07, "loss": 0.88106465, "num_input_tokens_seen": 289344980, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.3515625, "step": 13404, "time_per_iteration": 2.41072416305542 }, { "auxiliary_loss_clip": 0.010486, "auxiliary_loss_mlp": 0.01037564, "balance_loss_clip": 1.01644027, "balance_loss_mlp": 1.01473927, "epoch": 0.805952202014129, "flos": 31505028336000.0, "grad_norm": 1.6165632436593849, "language_loss": 0.70553911, "learning_rate": 3.820575840915743e-07, "loss": 0.72640073, "num_input_tokens_seen": 289367500, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.33789062, "step": 13405, "time_per_iteration": 2.4690568447113037 }, { "auxiliary_loss_clip": 0.01051705, "auxiliary_loss_mlp": 0.01036294, "balance_loss_clip": 1.01404929, "balance_loss_mlp": 1.01652431, "epoch": 0.8060123252667969, "flos": 24388786644480.0, "grad_norm": 2.778448600905664, "language_loss": 0.77020901, "learning_rate": 3.818286703948788e-07, "loss": 0.79108906, "num_input_tokens_seen": 289385930, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3515625, "step": 13406, "time_per_iteration": 2.4174351692199707 }, { "auxiliary_loss_clip": 0.01053341, "auxiliary_loss_mlp": 0.01038085, "balance_loss_clip": 1.01407588, "balance_loss_mlp": 1.01660216, "epoch": 0.8060724485194649, "flos": 23478740478720.0, "grad_norm": 1.786210236832377, "language_loss": 0.76273715, "learning_rate": 3.815998180594018e-07, "loss": 0.78365135, "num_input_tokens_seen": 289408025, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 13407, "time_per_iteration": 2.412099599838257 }, { "auxiliary_loss_clip": 0.01051489, "auxiliary_loss_mlp": 0.01037769, "balance_loss_clip": 1.0136764, "balance_loss_mlp": 1.01570451, "epoch": 0.8061325717721328, "flos": 18623509079040.0, "grad_norm": 1.7269267979454246, "language_loss": 0.75258756, "learning_rate": 3.81371027093822e-07, "loss": 0.77348012, "num_input_tokens_seen": 289426575, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.35742188, "step": 13408, "time_per_iteration": 2.369699478149414 }, { "auxiliary_loss_clip": 0.01053029, "auxiliary_loss_mlp": 0.01039318, "balance_loss_clip": 1.01440334, "balance_loss_mlp": 1.01713753, "epoch": 0.8061926950248008, "flos": 23581734589440.0, "grad_norm": 1.8914670654570915, "language_loss": 0.71046233, "learning_rate": 3.8114229750681523e-07, "loss": 0.73138577, "num_input_tokens_seen": 289447760, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.359375, "step": 13409, "time_per_iteration": 2.4111976623535156 }, { "auxiliary_loss_clip": 0.01051322, "auxiliary_loss_mlp": 0.01035499, "balance_loss_clip": 1.01027453, "balance_loss_mlp": 1.01525402, "epoch": 0.8062528182774689, "flos": 11142599569920.0, "grad_norm": 2.274251987702002, "language_loss": 0.78541946, "learning_rate": 3.809136293070545e-07, "loss": 0.80628765, "num_input_tokens_seen": 289463920, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.359375, "step": 13410, "time_per_iteration": 2.346719264984131 }, { "auxiliary_loss_clip": 0.01052047, "auxiliary_loss_mlp": 0.01041218, "balance_loss_clip": 1.01735258, "balance_loss_mlp": 1.01646018, "epoch": 0.8063129415301368, "flos": 22345704259200.0, "grad_norm": 2.262038431030081, "language_loss": 0.70072436, "learning_rate": 3.806850225032117e-07, "loss": 0.72165704, "num_input_tokens_seen": 289482635, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35546875, "step": 13411, "time_per_iteration": 2.356912612915039 }, { "auxiliary_loss_clip": 0.01050363, "auxiliary_loss_mlp": 0.01036615, "balance_loss_clip": 1.01373911, "balance_loss_mlp": 1.01574874, "epoch": 0.8063730647828048, "flos": 23987250564480.0, "grad_norm": 1.9311076418588045, "language_loss": 0.69143856, "learning_rate": 3.804564771039551e-07, "loss": 0.71230829, "num_input_tokens_seen": 289502040, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34570312, "step": 13412, "time_per_iteration": 2.3882107734680176 }, { "auxiliary_loss_clip": 0.01055072, "auxiliary_loss_mlp": 0.01040078, "balance_loss_clip": 1.01347065, "balance_loss_mlp": 1.01764512, "epoch": 0.8064331880354727, "flos": 21320514829440.0, "grad_norm": 5.67426528533234, "language_loss": 0.82153034, "learning_rate": 3.8022799311795064e-07, "loss": 0.84248185, "num_input_tokens_seen": 289520740, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.375, "step": 13413, "time_per_iteration": 2.359790086746216 }, { "auxiliary_loss_clip": 0.01051822, "auxiliary_loss_mlp": 0.01038728, "balance_loss_clip": 1.0161376, "balance_loss_mlp": 1.01684749, "epoch": 0.8064933112881407, "flos": 19681866167040.0, "grad_norm": 1.8733685655180479, "language_loss": 0.85800618, "learning_rate": 3.7999957055386303e-07, "loss": 0.87891167, "num_input_tokens_seen": 289535840, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34960938, "step": 13414, "time_per_iteration": 2.3614144325256348 }, { "auxiliary_loss_clip": 0.01050314, "auxiliary_loss_mlp": 0.01034968, "balance_loss_clip": 1.01361799, "balance_loss_mlp": 1.01504827, "epoch": 0.8065534345408086, "flos": 19278759075840.0, "grad_norm": 1.9165217691272494, "language_loss": 0.68507355, "learning_rate": 3.7977120942035467e-07, "loss": 0.70592636, "num_input_tokens_seen": 289555205, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.3515625, "step": 13415, "time_per_iteration": 2.344886064529419 }, { "auxiliary_loss_clip": 0.01049914, "auxiliary_loss_mlp": 0.01034627, "balance_loss_clip": 1.01371741, "balance_loss_mlp": 1.01615536, "epoch": 0.8066135577934767, "flos": 19676838931200.0, "grad_norm": 1.6110398097091148, "language_loss": 0.77341998, "learning_rate": 3.7954290972608383e-07, "loss": 0.79426539, "num_input_tokens_seen": 289573000, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.33789062, "step": 13416, "time_per_iteration": 2.3797390460968018 }, { "auxiliary_loss_clip": 0.01053928, "auxiliary_loss_mlp": 0.01036415, "balance_loss_clip": 1.01297879, "balance_loss_mlp": 1.01648784, "epoch": 0.8066736810461446, "flos": 21142492473600.0, "grad_norm": 1.7851227184126095, "language_loss": 0.65994352, "learning_rate": 3.793146714797086e-07, "loss": 0.68084693, "num_input_tokens_seen": 289592625, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.375, "step": 13417, "time_per_iteration": 2.366173028945923 }, { "auxiliary_loss_clip": 0.01053894, "auxiliary_loss_mlp": 0.01043674, "balance_loss_clip": 1.02115536, "balance_loss_mlp": 1.01782215, "epoch": 0.8067338042988126, "flos": 22597253671680.0, "grad_norm": 1.7009651041422313, "language_loss": 0.81892478, "learning_rate": 3.7908649468988306e-07, "loss": 0.83990049, "num_input_tokens_seen": 289610780, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.359375, "step": 13418, "time_per_iteration": 2.414052724838257 }, { "auxiliary_loss_clip": 0.0105323, "auxiliary_loss_mlp": 0.01035858, "balance_loss_clip": 1.01229048, "balance_loss_mlp": 1.01639307, "epoch": 0.8067939275514805, "flos": 16507493130240.0, "grad_norm": 1.481544153823053, "language_loss": 0.85721123, "learning_rate": 3.7885837936526066e-07, "loss": 0.87810212, "num_input_tokens_seen": 289628890, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36914062, "step": 13419, "time_per_iteration": 2.3421061038970947 }, { "auxiliary_loss_clip": 0.0105422, "auxiliary_loss_mlp": 0.01038684, "balance_loss_clip": 1.01608205, "balance_loss_mlp": 1.01713061, "epoch": 0.8068540508041485, "flos": 28540763061120.0, "grad_norm": 1.6447227824985062, "language_loss": 0.7710138, "learning_rate": 3.7863032551449047e-07, "loss": 0.79194283, "num_input_tokens_seen": 289647220, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.37109375, "step": 13420, "time_per_iteration": 2.4338624477386475 }, { "auxiliary_loss_clip": 0.01050086, "auxiliary_loss_mlp": 0.01032247, "balance_loss_clip": 1.01201761, "balance_loss_mlp": 1.01520944, "epoch": 0.8069141740568164, "flos": 21651421495680.0, "grad_norm": 1.767170700296049, "language_loss": 0.79414082, "learning_rate": 3.784023331462207e-07, "loss": 0.81496418, "num_input_tokens_seen": 289665800, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.34765625, "step": 13421, "time_per_iteration": 3.7605481147766113 }, { "auxiliary_loss_clip": 0.01054041, "auxiliary_loss_mlp": 0.01034964, "balance_loss_clip": 1.01153994, "balance_loss_mlp": 1.01706195, "epoch": 0.8069742973094844, "flos": 17528388462720.0, "grad_norm": 1.9705695317516094, "language_loss": 0.80515051, "learning_rate": 3.78174402269098e-07, "loss": 0.82604063, "num_input_tokens_seen": 289682705, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36914062, "step": 13422, "time_per_iteration": 2.409130334854126 }, { "auxiliary_loss_clip": 0.01051443, "auxiliary_loss_mlp": 0.0103393, "balance_loss_clip": 1.01285362, "balance_loss_mlp": 1.01594055, "epoch": 0.8070344205621525, "flos": 23365936275840.0, "grad_norm": 1.5932816533872878, "language_loss": 0.69434214, "learning_rate": 3.7794653289176347e-07, "loss": 0.71519583, "num_input_tokens_seen": 289702920, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.35546875, "step": 13423, "time_per_iteration": 2.3649544715881348 }, { "auxiliary_loss_clip": 0.01053003, "auxiliary_loss_mlp": 0.0103812, "balance_loss_clip": 1.01330066, "balance_loss_mlp": 1.01635456, "epoch": 0.8070945438148204, "flos": 22929068033280.0, "grad_norm": 1.847940134715173, "language_loss": 0.8111701, "learning_rate": 3.7771872502285904e-07, "loss": 0.83208138, "num_input_tokens_seen": 289723280, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3671875, "step": 13424, "time_per_iteration": 2.398940086364746 }, { "auxiliary_loss_clip": 0.0105355, "auxiliary_loss_mlp": 0.01035845, "balance_loss_clip": 1.01292109, "balance_loss_mlp": 1.01545405, "epoch": 0.8071546670674884, "flos": 25299531037440.0, "grad_norm": 1.5238216838263303, "language_loss": 0.79820281, "learning_rate": 3.774909786710232e-07, "loss": 0.8190968, "num_input_tokens_seen": 289743475, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.38085938, "step": 13425, "time_per_iteration": 2.3807871341705322 }, { "auxiliary_loss_clip": 0.01051259, "auxiliary_loss_mlp": 0.01037689, "balance_loss_clip": 1.01530159, "balance_loss_mlp": 1.01590419, "epoch": 0.8072147903201563, "flos": 18112729754880.0, "grad_norm": 3.283867708872226, "language_loss": 0.78196514, "learning_rate": 3.772632938448923e-07, "loss": 0.8028546, "num_input_tokens_seen": 289761400, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35351562, "step": 13426, "time_per_iteration": 2.3485589027404785 }, { "auxiliary_loss_clip": 0.01051617, "auxiliary_loss_mlp": 0.01032796, "balance_loss_clip": 1.00922859, "balance_loss_mlp": 1.01543391, "epoch": 0.8072749135728243, "flos": 26686944489600.0, "grad_norm": 1.8424019134496252, "language_loss": 0.73808229, "learning_rate": 3.770356705530997e-07, "loss": 0.75892651, "num_input_tokens_seen": 289781025, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36132812, "step": 13427, "time_per_iteration": 3.8178462982177734 }, { "auxiliary_loss_clip": 0.0105185, "auxiliary_loss_mlp": 0.01045442, "balance_loss_clip": 1.02140975, "balance_loss_mlp": 1.01654732, "epoch": 0.8073350368254922, "flos": 19239412106880.0, "grad_norm": 1.827176644457688, "language_loss": 0.70866156, "learning_rate": 3.768081088042774e-07, "loss": 0.72963452, "num_input_tokens_seen": 289798380, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3515625, "step": 13428, "time_per_iteration": 2.375622034072876 }, { "auxiliary_loss_clip": 0.01051919, "auxiliary_loss_mlp": 0.0103526, "balance_loss_clip": 1.01326561, "balance_loss_mlp": 1.01558208, "epoch": 0.8073951600781603, "flos": 13333678675200.0, "grad_norm": 2.0540338257865773, "language_loss": 0.76249719, "learning_rate": 3.765806086070544e-07, "loss": 0.78336895, "num_input_tokens_seen": 289814515, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.36328125, "step": 13429, "time_per_iteration": 3.7719759941101074 }, { "auxiliary_loss_clip": 0.01050056, "auxiliary_loss_mlp": 0.01039622, "balance_loss_clip": 1.01676929, "balance_loss_mlp": 1.01578832, "epoch": 0.8074552833308282, "flos": 22852189486080.0, "grad_norm": 2.128931171493489, "language_loss": 0.68129158, "learning_rate": 3.763531699700568e-07, "loss": 0.70218837, "num_input_tokens_seen": 289834315, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34375, "step": 13430, "time_per_iteration": 2.4939937591552734 }, { "auxiliary_loss_clip": 0.01051083, "auxiliary_loss_mlp": 0.01034914, "balance_loss_clip": 1.01159644, "balance_loss_mlp": 1.01632977, "epoch": 0.8075154065834962, "flos": 20338372972800.0, "grad_norm": 1.7287073923273004, "language_loss": 0.80708373, "learning_rate": 3.7612579290190994e-07, "loss": 0.82794374, "num_input_tokens_seen": 289853770, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.34765625, "step": 13431, "time_per_iteration": 2.3655526638031006 }, { "auxiliary_loss_clip": 0.01050484, "auxiliary_loss_mlp": 0.01035901, "balance_loss_clip": 1.01320374, "balance_loss_mlp": 1.01611698, "epoch": 0.8075755298361641, "flos": 21906985714560.0, "grad_norm": 1.8875366249539414, "language_loss": 0.81560194, "learning_rate": 3.7589847741123593e-07, "loss": 0.83646584, "num_input_tokens_seen": 289870480, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.34375, "step": 13432, "time_per_iteration": 2.355210542678833 }, { "auxiliary_loss_clip": 0.01057745, "auxiliary_loss_mlp": 0.01039276, "balance_loss_clip": 1.01358652, "balance_loss_mlp": 1.01847219, "epoch": 0.8076356530888321, "flos": 15668390580480.0, "grad_norm": 2.5332593497044993, "language_loss": 0.71750236, "learning_rate": 3.7567122350665415e-07, "loss": 0.73847258, "num_input_tokens_seen": 289888275, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.39257812, "step": 13433, "time_per_iteration": 2.5151493549346924 }, { "auxiliary_loss_clip": 0.01051245, "auxiliary_loss_mlp": 0.01038329, "balance_loss_clip": 1.0159893, "balance_loss_mlp": 1.01573801, "epoch": 0.8076957763415, "flos": 37775953255680.0, "grad_norm": 2.0090479797311884, "language_loss": 0.73452079, "learning_rate": 3.754440311967828e-07, "loss": 0.75541651, "num_input_tokens_seen": 289911495, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35546875, "step": 13434, "time_per_iteration": 2.508953809738159 }, { "auxiliary_loss_clip": 0.0105295, "auxiliary_loss_mlp": 0.01035106, "balance_loss_clip": 1.01251626, "balance_loss_mlp": 1.01700342, "epoch": 0.807755899594168, "flos": 19609735564800.0, "grad_norm": 1.7348824420220699, "language_loss": 0.69324529, "learning_rate": 3.752169004902361e-07, "loss": 0.71412587, "num_input_tokens_seen": 289930045, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.359375, "step": 13435, "time_per_iteration": 2.3647587299346924 }, { "auxiliary_loss_clip": 0.01054624, "auxiliary_loss_mlp": 0.01044926, "balance_loss_clip": 1.0168643, "balance_loss_mlp": 1.01702309, "epoch": 0.8078160228468361, "flos": 23293770762240.0, "grad_norm": 1.45012187079473, "language_loss": 0.75856638, "learning_rate": 3.749898313956279e-07, "loss": 0.77956194, "num_input_tokens_seen": 289950815, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.37695312, "step": 13436, "time_per_iteration": 2.3801019191741943 }, { "auxiliary_loss_clip": 0.01049002, "auxiliary_loss_mlp": 0.01035362, "balance_loss_clip": 1.01317668, "balance_loss_mlp": 1.01467729, "epoch": 0.807876146099504, "flos": 27161414133120.0, "grad_norm": 1.6770231779432119, "language_loss": 0.71901429, "learning_rate": 3.747628239215674e-07, "loss": 0.73985791, "num_input_tokens_seen": 289971730, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34375, "step": 13437, "time_per_iteration": 2.4290661811828613 }, { "auxiliary_loss_clip": 0.01053065, "auxiliary_loss_mlp": 0.01039467, "balance_loss_clip": 1.01698422, "balance_loss_mlp": 1.01805353, "epoch": 0.807936269352172, "flos": 27158865603840.0, "grad_norm": 2.0961456721926814, "language_loss": 0.73628068, "learning_rate": 3.745358780766636e-07, "loss": 0.75720596, "num_input_tokens_seen": 289992995, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34960938, "step": 13438, "time_per_iteration": 2.4366037845611572 }, { "auxiliary_loss_clip": 0.0105045, "auxiliary_loss_mlp": 0.01039368, "balance_loss_clip": 1.01540661, "balance_loss_mlp": 1.01548314, "epoch": 0.8079963926048399, "flos": 20739385382400.0, "grad_norm": 2.3165735096225664, "language_loss": 0.78450239, "learning_rate": 3.7430899386952344e-07, "loss": 0.80540061, "num_input_tokens_seen": 290009405, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.34960938, "step": 13439, "time_per_iteration": 2.371670722961426 }, { "auxiliary_loss_clip": 0.01051602, "auxiliary_loss_mlp": 0.01037166, "balance_loss_clip": 1.01527882, "balance_loss_mlp": 1.01565337, "epoch": 0.8080565158575079, "flos": 25008495010560.0, "grad_norm": 1.473915016231069, "language_loss": 0.7925241, "learning_rate": 3.7408217130874786e-07, "loss": 0.81341177, "num_input_tokens_seen": 290031085, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.359375, "step": 13440, "time_per_iteration": 2.435093879699707 }, { "auxiliary_loss_clip": 0.01053355, "auxiliary_loss_mlp": 0.01031729, "balance_loss_clip": 1.00826871, "balance_loss_mlp": 1.01628852, "epoch": 0.8081166391101758, "flos": 18697071047040.0, "grad_norm": 1.8491130069484432, "language_loss": 0.60486245, "learning_rate": 3.7385541040293946e-07, "loss": 0.62571329, "num_input_tokens_seen": 290048670, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.37109375, "step": 13441, "time_per_iteration": 2.3419432640075684 }, { "auxiliary_loss_clip": 0.0105065, "auxiliary_loss_mlp": 0.01035236, "balance_loss_clip": 1.01157355, "balance_loss_mlp": 1.01577163, "epoch": 0.8081767623628439, "flos": 19827628560000.0, "grad_norm": 1.9026548207241158, "language_loss": 0.77337962, "learning_rate": 3.7362871116069684e-07, "loss": 0.79423851, "num_input_tokens_seen": 290064085, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.34960938, "step": 13442, "time_per_iteration": 2.390512228012085 }, { "auxiliary_loss_clip": 0.01052495, "auxiliary_loss_mlp": 0.01037903, "balance_loss_clip": 1.01475286, "balance_loss_mlp": 1.01675606, "epoch": 0.8082368856155118, "flos": 35771484700800.0, "grad_norm": 1.5731403435637927, "language_loss": 0.71661943, "learning_rate": 3.734020735906169e-07, "loss": 0.73752344, "num_input_tokens_seen": 290086255, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.35742188, "step": 13443, "time_per_iteration": 3.9548356533050537 }, { "auxiliary_loss_clip": 0.01050838, "auxiliary_loss_mlp": 0.01041757, "balance_loss_clip": 1.01964402, "balance_loss_mlp": 1.01659334, "epoch": 0.8082970088681798, "flos": 17197167594240.0, "grad_norm": 1.9672518992512014, "language_loss": 0.83458567, "learning_rate": 3.7317549770129286e-07, "loss": 0.85551161, "num_input_tokens_seen": 290103995, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34179688, "step": 13444, "time_per_iteration": 2.320800542831421 }, { "auxiliary_loss_clip": 0.01008032, "auxiliary_loss_mlp": 0.01002526, "balance_loss_clip": 1.0002017, "balance_loss_mlp": 1.00142503, "epoch": 0.8083571321208477, "flos": 63552502465920.0, "grad_norm": 0.818557242727839, "language_loss": 0.53649545, "learning_rate": 3.7294898350131754e-07, "loss": 0.55660105, "num_input_tokens_seen": 290157245, "router_z_loss_clip": 0.02319336, "router_z_loss_mlp": 0.06640625, "step": 13445, "time_per_iteration": 2.913663864135742 }, { "auxiliary_loss_clip": 0.01050864, "auxiliary_loss_mlp": 0.01039235, "balance_loss_clip": 1.01570332, "balance_loss_mlp": 1.01576197, "epoch": 0.8084172553735157, "flos": 17929749985920.0, "grad_norm": 2.0194740649784237, "language_loss": 0.73146737, "learning_rate": 3.7272253099927964e-07, "loss": 0.75236839, "num_input_tokens_seen": 290174970, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.3515625, "step": 13446, "time_per_iteration": 2.33137845993042 }, { "auxiliary_loss_clip": 0.01053506, "auxiliary_loss_mlp": 0.0103762, "balance_loss_clip": 1.0136348, "balance_loss_mlp": 1.01690662, "epoch": 0.8084773786261836, "flos": 24096842922240.0, "grad_norm": 1.6718605100323016, "language_loss": 0.72282398, "learning_rate": 3.7249614020376606e-07, "loss": 0.74373525, "num_input_tokens_seen": 290194395, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36523438, "step": 13447, "time_per_iteration": 2.4201507568359375 }, { "auxiliary_loss_clip": 0.01053789, "auxiliary_loss_mlp": 0.01040188, "balance_loss_clip": 1.01677501, "balance_loss_mlp": 1.01630545, "epoch": 0.8085375018788516, "flos": 15587532138240.0, "grad_norm": 2.269326596072333, "language_loss": 0.75950468, "learning_rate": 3.7226981112336197e-07, "loss": 0.78044444, "num_input_tokens_seen": 290209200, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.375, "step": 13448, "time_per_iteration": 2.31272554397583 }, { "auxiliary_loss_clip": 0.01007884, "auxiliary_loss_mlp": 0.01002318, "balance_loss_clip": 0.99982673, "balance_loss_mlp": 1.00116611, "epoch": 0.8085976251315197, "flos": 67557667282560.0, "grad_norm": 0.7380583066120563, "language_loss": 0.63960838, "learning_rate": 3.7204354376665024e-07, "loss": 0.65971041, "num_input_tokens_seen": 290274565, "router_z_loss_clip": 0.02490234, "router_z_loss_mlp": 0.06738281, "step": 13449, "time_per_iteration": 3.0928773880004883 }, { "auxiliary_loss_clip": 0.01054691, "auxiliary_loss_mlp": 0.01035674, "balance_loss_clip": 1.01124847, "balance_loss_mlp": 1.01750958, "epoch": 0.8086577483841876, "flos": 22560455232000.0, "grad_norm": 1.9634590941012067, "language_loss": 0.75485468, "learning_rate": 3.718173381422105e-07, "loss": 0.77575833, "num_input_tokens_seen": 290293630, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37109375, "step": 13450, "time_per_iteration": 2.3544061183929443 }, { "auxiliary_loss_clip": 0.0105084, "auxiliary_loss_mlp": 0.01035318, "balance_loss_clip": 1.01294231, "balance_loss_mlp": 1.01562858, "epoch": 0.8087178716368556, "flos": 17967107007360.0, "grad_norm": 1.8640227758140813, "language_loss": 0.74621117, "learning_rate": 3.7159119425861986e-07, "loss": 0.76707274, "num_input_tokens_seen": 290311450, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3515625, "step": 13451, "time_per_iteration": 2.3836257457733154 }, { "auxiliary_loss_clip": 0.01054465, "auxiliary_loss_mlp": 0.01040395, "balance_loss_clip": 1.01416898, "balance_loss_mlp": 1.0154438, "epoch": 0.8087779948895235, "flos": 21718629596160.0, "grad_norm": 1.66352841104271, "language_loss": 0.80956572, "learning_rate": 3.713651121244543e-07, "loss": 0.83051431, "num_input_tokens_seen": 290330165, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.390625, "step": 13452, "time_per_iteration": 2.355262279510498 }, { "auxiliary_loss_clip": 0.01053232, "auxiliary_loss_mlp": 0.01036815, "balance_loss_clip": 1.01492798, "balance_loss_mlp": 1.01688564, "epoch": 0.8088381181421915, "flos": 29091692315520.0, "grad_norm": 1.666224452431801, "language_loss": 0.79475772, "learning_rate": 3.711390917482875e-07, "loss": 0.81565827, "num_input_tokens_seen": 290350815, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.36328125, "step": 13453, "time_per_iteration": 2.516028881072998 }, { "auxiliary_loss_clip": 0.01052682, "auxiliary_loss_mlp": 0.01036064, "balance_loss_clip": 1.01167405, "balance_loss_mlp": 1.01623583, "epoch": 0.8088982413948594, "flos": 22197393336960.0, "grad_norm": 2.373523065528196, "language_loss": 0.78647554, "learning_rate": 3.709131331386892e-07, "loss": 0.80736303, "num_input_tokens_seen": 290367380, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36328125, "step": 13454, "time_per_iteration": 2.343926191329956 }, { "auxiliary_loss_clip": 0.01051979, "auxiliary_loss_mlp": 0.01039459, "balance_loss_clip": 1.01676178, "balance_loss_mlp": 1.01649499, "epoch": 0.8089583646475275, "flos": 28035499731840.0, "grad_norm": 1.83512587741484, "language_loss": 0.78372794, "learning_rate": 3.7068723630422795e-07, "loss": 0.80464232, "num_input_tokens_seen": 290387965, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35546875, "step": 13455, "time_per_iteration": 2.443310499191284 }, { "auxiliary_loss_clip": 0.01051054, "auxiliary_loss_mlp": 0.01035045, "balance_loss_clip": 1.01239491, "balance_loss_mlp": 1.01576197, "epoch": 0.8090184879001954, "flos": 16616806197120.0, "grad_norm": 1.9617370196375727, "language_loss": 0.80017108, "learning_rate": 3.70461401253471e-07, "loss": 0.82103205, "num_input_tokens_seen": 290404150, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 13456, "time_per_iteration": 2.3204727172851562 }, { "auxiliary_loss_clip": 0.01052036, "auxiliary_loss_mlp": 0.01038225, "balance_loss_clip": 1.01555133, "balance_loss_mlp": 1.01724839, "epoch": 0.8090786111528634, "flos": 27339680868480.0, "grad_norm": 2.1415195756234797, "language_loss": 0.72925925, "learning_rate": 3.702356279949801e-07, "loss": 0.75016189, "num_input_tokens_seen": 290422370, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34765625, "step": 13457, "time_per_iteration": 2.4659838676452637 }, { "auxiliary_loss_clip": 0.01051489, "auxiliary_loss_mlp": 0.01035855, "balance_loss_clip": 1.01332486, "balance_loss_mlp": 1.01610494, "epoch": 0.8091387344055313, "flos": 21104681604480.0, "grad_norm": 2.0741949312442913, "language_loss": 0.73757219, "learning_rate": 3.700099165373176e-07, "loss": 0.75844562, "num_input_tokens_seen": 290442645, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35351562, "step": 13458, "time_per_iteration": 2.3565473556518555 }, { "auxiliary_loss_clip": 0.01052263, "auxiliary_loss_mlp": 0.0103867, "balance_loss_clip": 1.01622319, "balance_loss_mlp": 1.01652861, "epoch": 0.8091988576581993, "flos": 11654286589440.0, "grad_norm": 2.1891455450077655, "language_loss": 0.80917591, "learning_rate": 3.6978426688904275e-07, "loss": 0.83008528, "num_input_tokens_seen": 290458520, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35742188, "step": 13459, "time_per_iteration": 2.379330635070801 }, { "auxiliary_loss_clip": 0.01051843, "auxiliary_loss_mlp": 0.01037451, "balance_loss_clip": 1.01307297, "balance_loss_mlp": 1.01583242, "epoch": 0.8092589809108672, "flos": 22962305514240.0, "grad_norm": 2.0510997858079216, "language_loss": 0.8136006, "learning_rate": 3.695586790587113e-07, "loss": 0.83449358, "num_input_tokens_seen": 290474465, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.359375, "step": 13460, "time_per_iteration": 2.3499274253845215 }, { "auxiliary_loss_clip": 0.01052426, "auxiliary_loss_mlp": 0.01036978, "balance_loss_clip": 1.01279044, "balance_loss_mlp": 1.01566994, "epoch": 0.8093191041635353, "flos": 13260151618560.0, "grad_norm": 2.0535902385791447, "language_loss": 0.85104781, "learning_rate": 3.693331530548789e-07, "loss": 0.8719418, "num_input_tokens_seen": 290492060, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3671875, "step": 13461, "time_per_iteration": 3.5869734287261963 }, { "auxiliary_loss_clip": 0.01054426, "auxiliary_loss_mlp": 0.01038076, "balance_loss_clip": 1.01435328, "balance_loss_mlp": 1.01744437, "epoch": 0.8093792274162032, "flos": 25514945326080.0, "grad_norm": 1.8283591985176675, "language_loss": 0.77877307, "learning_rate": 3.69107688886096e-07, "loss": 0.79969811, "num_input_tokens_seen": 290511510, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.37109375, "step": 13462, "time_per_iteration": 2.3893134593963623 }, { "auxiliary_loss_clip": 0.01052647, "auxiliary_loss_mlp": 0.01036994, "balance_loss_clip": 1.0133791, "balance_loss_mlp": 1.01669121, "epoch": 0.8094393506688712, "flos": 23545459820160.0, "grad_norm": 1.5854310543937062, "language_loss": 0.83947957, "learning_rate": 3.6888228656091357e-07, "loss": 0.86037594, "num_input_tokens_seen": 290530035, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.359375, "step": 13463, "time_per_iteration": 2.4299380779266357 }, { "auxiliary_loss_clip": 0.01051043, "auxiliary_loss_mlp": 0.01038771, "balance_loss_clip": 1.01769483, "balance_loss_mlp": 1.01640463, "epoch": 0.8094994739215392, "flos": 17054966160000.0, "grad_norm": 1.8202108728101964, "language_loss": 0.63531476, "learning_rate": 3.686569460878779e-07, "loss": 0.65621293, "num_input_tokens_seen": 290548245, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34570312, "step": 13464, "time_per_iteration": 2.3412845134735107 }, { "auxiliary_loss_clip": 0.01050785, "auxiliary_loss_mlp": 0.01035921, "balance_loss_clip": 1.0138309, "balance_loss_mlp": 1.01558614, "epoch": 0.8095595971742071, "flos": 23550068119680.0, "grad_norm": 1.599761510553912, "language_loss": 0.62940919, "learning_rate": 3.684316674755341e-07, "loss": 0.6502763, "num_input_tokens_seen": 290568625, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3515625, "step": 13465, "time_per_iteration": 2.444350481033325 }, { "auxiliary_loss_clip": 0.01053208, "auxiliary_loss_mlp": 0.01039095, "balance_loss_clip": 1.01625419, "balance_loss_mlp": 1.01793396, "epoch": 0.8096197204268751, "flos": 20372238858240.0, "grad_norm": 3.95783447622993, "language_loss": 0.83310199, "learning_rate": 3.682064507324256e-07, "loss": 0.85402501, "num_input_tokens_seen": 290586575, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35351562, "step": 13466, "time_per_iteration": 2.348133087158203 }, { "auxiliary_loss_clip": 0.01053623, "auxiliary_loss_mlp": 0.01042555, "balance_loss_clip": 1.01861787, "balance_loss_mlp": 1.01698732, "epoch": 0.809679843679543, "flos": 27817536913920.0, "grad_norm": 1.823235149185082, "language_loss": 0.77583516, "learning_rate": 3.6798129586709204e-07, "loss": 0.79679692, "num_input_tokens_seen": 290606790, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.3671875, "step": 13467, "time_per_iteration": 3.793680191040039 }, { "auxiliary_loss_clip": 0.01051318, "auxiliary_loss_mlp": 0.01034634, "balance_loss_clip": 1.01171017, "balance_loss_mlp": 1.01505053, "epoch": 0.8097399669322111, "flos": 22013121847680.0, "grad_norm": 1.7904436798361845, "language_loss": 0.796363, "learning_rate": 3.6775620288807073e-07, "loss": 0.81722248, "num_input_tokens_seen": 290625525, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.36328125, "step": 13468, "time_per_iteration": 2.3501980304718018 }, { "auxiliary_loss_clip": 0.01048339, "auxiliary_loss_mlp": 0.01035336, "balance_loss_clip": 1.01441503, "balance_loss_mlp": 1.01386046, "epoch": 0.809800090184879, "flos": 18988002339840.0, "grad_norm": 1.8069876621597911, "language_loss": 0.69087934, "learning_rate": 3.675311718038978e-07, "loss": 0.71171606, "num_input_tokens_seen": 290644935, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34375, "step": 13469, "time_per_iteration": 3.7157602310180664 }, { "auxiliary_loss_clip": 0.01007499, "auxiliary_loss_mlp": 0.0100289, "balance_loss_clip": 1.00048161, "balance_loss_mlp": 1.00082314, "epoch": 0.809860213437547, "flos": 66095993635200.0, "grad_norm": 0.6921169878327617, "language_loss": 0.547786, "learning_rate": 3.6730620262310683e-07, "loss": 0.56788987, "num_input_tokens_seen": 290710735, "router_z_loss_clip": 0.02404785, "router_z_loss_mlp": 0.06640625, "step": 13470, "time_per_iteration": 3.093693971633911 }, { "auxiliary_loss_clip": 0.01050852, "auxiliary_loss_mlp": 0.0103744, "balance_loss_clip": 1.01561236, "balance_loss_mlp": 1.01558042, "epoch": 0.8099203366902149, "flos": 20881551905280.0, "grad_norm": 1.8097297454159427, "language_loss": 0.70574093, "learning_rate": 3.670812953542279e-07, "loss": 0.72662383, "num_input_tokens_seen": 290729565, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.35351562, "step": 13471, "time_per_iteration": 2.3706915378570557 }, { "auxiliary_loss_clip": 0.01051414, "auxiliary_loss_mlp": 0.01036043, "balance_loss_clip": 1.01428771, "balance_loss_mlp": 1.01573825, "epoch": 0.8099804599428829, "flos": 26029739456640.0, "grad_norm": 1.6724812220380318, "language_loss": 0.81317949, "learning_rate": 3.6685645000579003e-07, "loss": 0.83405405, "num_input_tokens_seen": 290749360, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.35742188, "step": 13472, "time_per_iteration": 2.422808885574341 }, { "auxiliary_loss_clip": 0.0100768, "auxiliary_loss_mlp": 0.01004119, "balance_loss_clip": 1.00188982, "balance_loss_mlp": 1.00076342, "epoch": 0.8100405831955508, "flos": 69300147928320.0, "grad_norm": 0.7509216571401784, "language_loss": 0.57882953, "learning_rate": 3.666316665863201e-07, "loss": 0.59894753, "num_input_tokens_seen": 290812145, "router_z_loss_clip": 0.02233887, "router_z_loss_mlp": 0.06933594, "step": 13473, "time_per_iteration": 2.9940404891967773 }, { "auxiliary_loss_clip": 0.01052256, "auxiliary_loss_mlp": 0.01035171, "balance_loss_clip": 1.01181769, "balance_loss_mlp": 1.01565754, "epoch": 0.8101007064482189, "flos": 15011604483840.0, "grad_norm": 2.486543532130282, "language_loss": 0.75355697, "learning_rate": 3.664069451043399e-07, "loss": 0.77443129, "num_input_tokens_seen": 290829845, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36523438, "step": 13474, "time_per_iteration": 2.3868792057037354 }, { "auxiliary_loss_clip": 0.01055264, "auxiliary_loss_mlp": 0.01043116, "balance_loss_clip": 1.01957238, "balance_loss_mlp": 1.01791835, "epoch": 0.8101608297008868, "flos": 21066207419520.0, "grad_norm": 1.6582642377009045, "language_loss": 0.79815245, "learning_rate": 3.661822855683723e-07, "loss": 0.81913626, "num_input_tokens_seen": 290848815, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.37304688, "step": 13475, "time_per_iteration": 2.3599956035614014 }, { "auxiliary_loss_clip": 0.01051253, "auxiliary_loss_mlp": 0.01038426, "balance_loss_clip": 1.01609814, "balance_loss_mlp": 1.01595402, "epoch": 0.8102209529535548, "flos": 23730185157120.0, "grad_norm": 1.5651317865772785, "language_loss": 0.76163375, "learning_rate": 3.659576879869364e-07, "loss": 0.78253055, "num_input_tokens_seen": 290868580, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35351562, "step": 13476, "time_per_iteration": 2.424445867538452 }, { "auxiliary_loss_clip": 0.0105414, "auxiliary_loss_mlp": 0.01039947, "balance_loss_clip": 1.0143888, "balance_loss_mlp": 1.01651227, "epoch": 0.8102810762062228, "flos": 10955290792320.0, "grad_norm": 2.4009112248604865, "language_loss": 0.7545836, "learning_rate": 3.657331523685485e-07, "loss": 0.7755245, "num_input_tokens_seen": 290883540, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.375, "step": 13477, "time_per_iteration": 2.3566675186157227 }, { "auxiliary_loss_clip": 0.01053276, "auxiliary_loss_mlp": 0.01039881, "balance_loss_clip": 1.01521611, "balance_loss_mlp": 1.01684058, "epoch": 0.8103411994588907, "flos": 14647914184320.0, "grad_norm": 2.1724278491283626, "language_loss": 0.71089494, "learning_rate": 3.6550867872172365e-07, "loss": 0.73182654, "num_input_tokens_seen": 290901560, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36328125, "step": 13478, "time_per_iteration": 2.4069743156433105 }, { "auxiliary_loss_clip": 0.01007383, "auxiliary_loss_mlp": 0.0100206, "balance_loss_clip": 0.99986672, "balance_loss_mlp": 1.0006249, "epoch": 0.8104013227115587, "flos": 59150373091200.0, "grad_norm": 0.6824996788942684, "language_loss": 0.52202493, "learning_rate": 3.6528426705497293e-07, "loss": 0.54211938, "num_input_tokens_seen": 290959185, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.06738281, "step": 13479, "time_per_iteration": 2.962573289871216 }, { "auxiliary_loss_clip": 0.01051322, "auxiliary_loss_mlp": 0.01037824, "balance_loss_clip": 1.01572311, "balance_loss_mlp": 1.01641846, "epoch": 0.8104614459642266, "flos": 19827663471360.0, "grad_norm": 1.5278416475389829, "language_loss": 0.72396559, "learning_rate": 3.650599173768072e-07, "loss": 0.74485707, "num_input_tokens_seen": 290979585, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34960938, "step": 13480, "time_per_iteration": 2.4261791706085205 }, { "auxiliary_loss_clip": 0.0105236, "auxiliary_loss_mlp": 0.01033916, "balance_loss_clip": 1.01065814, "balance_loss_mlp": 1.01662302, "epoch": 0.8105215692168947, "flos": 25373093005440.0, "grad_norm": 1.6432636483005112, "language_loss": 0.80783296, "learning_rate": 3.648356296957327e-07, "loss": 0.82869577, "num_input_tokens_seen": 291000865, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35742188, "step": 13481, "time_per_iteration": 2.461930751800537 }, { "auxiliary_loss_clip": 0.01050998, "auxiliary_loss_mlp": 0.01036643, "balance_loss_clip": 1.01348042, "balance_loss_mlp": 1.01554048, "epoch": 0.8105816924695626, "flos": 20480783875200.0, "grad_norm": 1.7500613318569513, "language_loss": 0.73616648, "learning_rate": 3.646114040202548e-07, "loss": 0.75704294, "num_input_tokens_seen": 291018285, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.35546875, "step": 13482, "time_per_iteration": 2.3791632652282715 }, { "auxiliary_loss_clip": 0.01051383, "auxiliary_loss_mlp": 0.01036245, "balance_loss_clip": 1.01144898, "balance_loss_mlp": 1.01388383, "epoch": 0.8106418157222306, "flos": 14537798156160.0, "grad_norm": 1.8647961783016305, "language_loss": 0.66465789, "learning_rate": 3.6438724035887705e-07, "loss": 0.68553418, "num_input_tokens_seen": 291035745, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.375, "step": 13483, "time_per_iteration": 3.8133251667022705 }, { "auxiliary_loss_clip": 0.0105166, "auxiliary_loss_mlp": 0.01033431, "balance_loss_clip": 1.01099539, "balance_loss_mlp": 1.01600957, "epoch": 0.8107019389748985, "flos": 22563387786240.0, "grad_norm": 1.6513257929728962, "language_loss": 0.77261209, "learning_rate": 3.641631387200992e-07, "loss": 0.79346299, "num_input_tokens_seen": 291053280, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35742188, "step": 13484, "time_per_iteration": 2.409259557723999 }, { "auxiliary_loss_clip": 0.01052932, "auxiliary_loss_mlp": 0.01046121, "balance_loss_clip": 1.02117109, "balance_loss_mlp": 1.01534081, "epoch": 0.8107620622275665, "flos": 19608548578560.0, "grad_norm": 1.4566329147103592, "language_loss": 0.72447902, "learning_rate": 3.639390991124183e-07, "loss": 0.74546957, "num_input_tokens_seen": 291072855, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.375, "step": 13485, "time_per_iteration": 2.395188808441162 }, { "auxiliary_loss_clip": 0.01049214, "auxiliary_loss_mlp": 0.01031092, "balance_loss_clip": 1.01061213, "balance_loss_mlp": 1.01538193, "epoch": 0.8108221854802344, "flos": 16142580933120.0, "grad_norm": 1.844981401577872, "language_loss": 0.76913261, "learning_rate": 3.637151215443308e-07, "loss": 0.78993565, "num_input_tokens_seen": 291090285, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.33984375, "step": 13486, "time_per_iteration": 2.3798279762268066 }, { "auxiliary_loss_clip": 0.01054075, "auxiliary_loss_mlp": 0.01036192, "balance_loss_clip": 1.01151609, "balance_loss_mlp": 1.01713586, "epoch": 0.8108823087329025, "flos": 21105135452160.0, "grad_norm": 2.075121872883423, "language_loss": 0.72762674, "learning_rate": 3.6349120602433045e-07, "loss": 0.74852943, "num_input_tokens_seen": 291107675, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.36914062, "step": 13487, "time_per_iteration": 2.3722336292266846 }, { "auxiliary_loss_clip": 0.01050512, "auxiliary_loss_mlp": 0.01035944, "balance_loss_clip": 1.01396143, "balance_loss_mlp": 1.01639688, "epoch": 0.8109424319855704, "flos": 29198526675840.0, "grad_norm": 1.9598983542899595, "language_loss": 0.84612966, "learning_rate": 3.6326735256090715e-07, "loss": 0.86699426, "num_input_tokens_seen": 291126900, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.33984375, "step": 13488, "time_per_iteration": 2.455432176589966 }, { "auxiliary_loss_clip": 0.01053456, "auxiliary_loss_mlp": 0.01033516, "balance_loss_clip": 1.00931644, "balance_loss_mlp": 1.01705015, "epoch": 0.8110025552382384, "flos": 23110756081920.0, "grad_norm": 1.9099899920037717, "language_loss": 0.7501657, "learning_rate": 3.630435611625502e-07, "loss": 0.77103537, "num_input_tokens_seen": 291145285, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36328125, "step": 13489, "time_per_iteration": 2.388122797012329 }, { "auxiliary_loss_clip": 0.01049403, "auxiliary_loss_mlp": 0.01033907, "balance_loss_clip": 1.01223505, "balance_loss_mlp": 1.01506925, "epoch": 0.8110626784909064, "flos": 22378941740160.0, "grad_norm": 2.019206930527245, "language_loss": 0.72326833, "learning_rate": 3.628198318377453e-07, "loss": 0.74410146, "num_input_tokens_seen": 291163485, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34375, "step": 13490, "time_per_iteration": 2.408686876296997 }, { "auxiliary_loss_clip": 0.01054578, "auxiliary_loss_mlp": 0.01045179, "balance_loss_clip": 1.02035987, "balance_loss_mlp": 1.01738286, "epoch": 0.8111228017435743, "flos": 23366913793920.0, "grad_norm": 2.4761056101267958, "language_loss": 0.73051226, "learning_rate": 3.625961645949762e-07, "loss": 0.75150979, "num_input_tokens_seen": 291182215, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37109375, "step": 13491, "time_per_iteration": 2.385023593902588 }, { "auxiliary_loss_clip": 0.01051652, "auxiliary_loss_mlp": 0.01040733, "balance_loss_clip": 1.01518679, "balance_loss_mlp": 1.01536965, "epoch": 0.8111829249962423, "flos": 21285531780480.0, "grad_norm": 3.1744406885480383, "language_loss": 0.68619561, "learning_rate": 3.623725594427245e-07, "loss": 0.70711946, "num_input_tokens_seen": 291203145, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.36328125, "step": 13492, "time_per_iteration": 2.4424562454223633 }, { "auxiliary_loss_clip": 0.01053808, "auxiliary_loss_mlp": 0.01035292, "balance_loss_clip": 1.00885153, "balance_loss_mlp": 1.01744628, "epoch": 0.8112430482489102, "flos": 22344482361600.0, "grad_norm": 1.5670282002172735, "language_loss": 0.72893906, "learning_rate": 3.6214901638947006e-07, "loss": 0.74983007, "num_input_tokens_seen": 291220600, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.36328125, "step": 13493, "time_per_iteration": 2.3798625469207764 }, { "auxiliary_loss_clip": 0.01051509, "auxiliary_loss_mlp": 0.0105075, "balance_loss_clip": 1.02585864, "balance_loss_mlp": 1.01563668, "epoch": 0.8113031715015783, "flos": 31137009027840.0, "grad_norm": 2.3978676558188203, "language_loss": 0.72281736, "learning_rate": 3.619255354436885e-07, "loss": 0.74383992, "num_input_tokens_seen": 291241195, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.359375, "step": 13494, "time_per_iteration": 2.451765537261963 }, { "auxiliary_loss_clip": 0.01053459, "auxiliary_loss_mlp": 0.01040944, "balance_loss_clip": 1.01511121, "balance_loss_mlp": 1.01627874, "epoch": 0.8113632947542462, "flos": 25334339529600.0, "grad_norm": 1.9822628315179835, "language_loss": 0.77722144, "learning_rate": 3.6170211661385543e-07, "loss": 0.79816544, "num_input_tokens_seen": 291258715, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37109375, "step": 13495, "time_per_iteration": 2.3901774883270264 }, { "auxiliary_loss_clip": 0.01051894, "auxiliary_loss_mlp": 0.01038724, "balance_loss_clip": 1.01520419, "balance_loss_mlp": 1.01639342, "epoch": 0.8114234180069142, "flos": 28437908595840.0, "grad_norm": 1.8296677739920801, "language_loss": 0.80696929, "learning_rate": 3.614787599084417e-07, "loss": 0.82787549, "num_input_tokens_seen": 291278030, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.35546875, "step": 13496, "time_per_iteration": 2.454524040222168 }, { "auxiliary_loss_clip": 0.01050514, "auxiliary_loss_mlp": 0.01042353, "balance_loss_clip": 1.01977468, "balance_loss_mlp": 1.01531005, "epoch": 0.8114835412595821, "flos": 20337849302400.0, "grad_norm": 1.8939580372360536, "language_loss": 0.7199719, "learning_rate": 3.6125546533591787e-07, "loss": 0.74090052, "num_input_tokens_seen": 291296740, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.3515625, "step": 13497, "time_per_iteration": 2.367025375366211 }, { "auxiliary_loss_clip": 0.0105388, "auxiliary_loss_mlp": 0.01037017, "balance_loss_clip": 1.01498663, "balance_loss_mlp": 1.01755798, "epoch": 0.8115436645122501, "flos": 22489825818240.0, "grad_norm": 1.5605294786677866, "language_loss": 0.77590692, "learning_rate": 3.610322329047508e-07, "loss": 0.79681587, "num_input_tokens_seen": 291318730, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.36328125, "step": 13498, "time_per_iteration": 2.4254791736602783 }, { "auxiliary_loss_clip": 0.01052127, "auxiliary_loss_mlp": 0.01036694, "balance_loss_clip": 1.0144254, "balance_loss_mlp": 1.01575541, "epoch": 0.811603787764918, "flos": 13844423088000.0, "grad_norm": 2.2844471554442496, "language_loss": 0.85605907, "learning_rate": 3.608090626234055e-07, "loss": 0.87694728, "num_input_tokens_seen": 291336755, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.36328125, "step": 13499, "time_per_iteration": 2.369807004928589 }, { "auxiliary_loss_clip": 0.01052664, "auxiliary_loss_mlp": 0.01033536, "balance_loss_clip": 1.00962245, "balance_loss_mlp": 1.01732314, "epoch": 0.8116639110175861, "flos": 21613610626560.0, "grad_norm": 1.6585851560856377, "language_loss": 0.77154529, "learning_rate": 3.6058595450034603e-07, "loss": 0.79240733, "num_input_tokens_seen": 291356795, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.35351562, "step": 13500, "time_per_iteration": 3.6528820991516113 }, { "auxiliary_loss_clip": 0.01007753, "auxiliary_loss_mlp": 0.01004026, "balance_loss_clip": 1.00141549, "balance_loss_mlp": 1.00112391, "epoch": 0.811724034270254, "flos": 64456262720640.0, "grad_norm": 0.8293201544946872, "language_loss": 0.60053706, "learning_rate": 3.603629085440303e-07, "loss": 0.62065488, "num_input_tokens_seen": 291416005, "router_z_loss_clip": 0.02612305, "router_z_loss_mlp": 0.06640625, "step": 13501, "time_per_iteration": 3.0851199626922607 }, { "auxiliary_loss_clip": 0.0105045, "auxiliary_loss_mlp": 0.01037723, "balance_loss_clip": 1.01614618, "balance_loss_mlp": 1.01567364, "epoch": 0.811784157522922, "flos": 24752965703040.0, "grad_norm": 1.5677583484510333, "language_loss": 0.80235982, "learning_rate": 3.6013992476291753e-07, "loss": 0.82324159, "num_input_tokens_seen": 291434870, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.34765625, "step": 13502, "time_per_iteration": 2.397235870361328 }, { "auxiliary_loss_clip": 0.01052106, "auxiliary_loss_mlp": 0.01040123, "balance_loss_clip": 1.01795053, "balance_loss_mlp": 1.01640379, "epoch": 0.81184428077559, "flos": 12166322722560.0, "grad_norm": 2.1812457371275906, "language_loss": 0.71959537, "learning_rate": 3.599170031654635e-07, "loss": 0.74051762, "num_input_tokens_seen": 291452230, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.35742188, "step": 13503, "time_per_iteration": 2.3779139518737793 }, { "auxiliary_loss_clip": 0.01052121, "auxiliary_loss_mlp": 0.01035757, "balance_loss_clip": 1.01044869, "balance_loss_mlp": 1.01552618, "epoch": 0.8119044040282579, "flos": 44420273832960.0, "grad_norm": 1.6002346354477133, "language_loss": 0.67999053, "learning_rate": 3.5969414376012065e-07, "loss": 0.70086932, "num_input_tokens_seen": 291477425, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36523438, "step": 13504, "time_per_iteration": 2.59869647026062 }, { "auxiliary_loss_clip": 0.01052913, "auxiliary_loss_mlp": 0.0103819, "balance_loss_clip": 1.01328683, "balance_loss_mlp": 1.01615763, "epoch": 0.8119645272809259, "flos": 52153570627200.0, "grad_norm": 2.001128685107157, "language_loss": 0.75889254, "learning_rate": 3.594713465553403e-07, "loss": 0.77980357, "num_input_tokens_seen": 291501070, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.3671875, "step": 13505, "time_per_iteration": 2.6889119148254395 }, { "auxiliary_loss_clip": 0.0105299, "auxiliary_loss_mlp": 0.01040595, "balance_loss_clip": 1.01590681, "balance_loss_mlp": 1.01649666, "epoch": 0.8120246505335939, "flos": 30231501338880.0, "grad_norm": 1.929583342232816, "language_loss": 0.73554623, "learning_rate": 3.5924861155957123e-07, "loss": 0.75648212, "num_input_tokens_seen": 291524945, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36523438, "step": 13506, "time_per_iteration": 3.9361581802368164 }, { "auxiliary_loss_clip": 0.01055247, "auxiliary_loss_mlp": 0.01037508, "balance_loss_clip": 1.01442862, "balance_loss_mlp": 1.01661158, "epoch": 0.8120847737862619, "flos": 22126554455040.0, "grad_norm": 2.1825343954717056, "language_loss": 0.77496439, "learning_rate": 3.590259387812593e-07, "loss": 0.79589194, "num_input_tokens_seen": 291544605, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.38671875, "step": 13507, "time_per_iteration": 2.4447824954986572 }, { "auxiliary_loss_clip": 0.0105416, "auxiliary_loss_mlp": 0.01035087, "balance_loss_clip": 1.01141167, "balance_loss_mlp": 1.01676154, "epoch": 0.8121448970389298, "flos": 23294050053120.0, "grad_norm": 1.6301429739898425, "language_loss": 0.71110338, "learning_rate": 3.5880332822884783e-07, "loss": 0.73199588, "num_input_tokens_seen": 291563850, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37304688, "step": 13508, "time_per_iteration": 2.358341932296753 }, { "auxiliary_loss_clip": 0.01051589, "auxiliary_loss_mlp": 0.01037008, "balance_loss_clip": 1.01445389, "balance_loss_mlp": 1.01599896, "epoch": 0.8122050202915978, "flos": 22163038692480.0, "grad_norm": 1.6293695425107122, "language_loss": 0.76950395, "learning_rate": 3.585807799107785e-07, "loss": 0.7903899, "num_input_tokens_seen": 291581730, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35546875, "step": 13509, "time_per_iteration": 3.7622995376586914 }, { "auxiliary_loss_clip": 0.01052682, "auxiliary_loss_mlp": 0.01040341, "balance_loss_clip": 1.01541471, "balance_loss_mlp": 1.01665974, "epoch": 0.8122651435442657, "flos": 23257810195200.0, "grad_norm": 1.7752687898820978, "language_loss": 0.77776998, "learning_rate": 3.58358293835491e-07, "loss": 0.79870021, "num_input_tokens_seen": 291601225, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36132812, "step": 13510, "time_per_iteration": 2.4313602447509766 }, { "auxiliary_loss_clip": 0.01054483, "auxiliary_loss_mlp": 0.01042624, "balance_loss_clip": 1.01793587, "balance_loss_mlp": 1.01659822, "epoch": 0.8123252667969337, "flos": 16139194531200.0, "grad_norm": 2.0818175883858787, "language_loss": 0.71315646, "learning_rate": 3.581358700114212e-07, "loss": 0.73412752, "num_input_tokens_seen": 291616995, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.37890625, "step": 13511, "time_per_iteration": 2.334489107131958 }, { "auxiliary_loss_clip": 0.01054688, "auxiliary_loss_mlp": 0.01040275, "balance_loss_clip": 1.01669574, "balance_loss_mlp": 1.01826835, "epoch": 0.8123853900496016, "flos": 21244509066240.0, "grad_norm": 1.5952553666239175, "language_loss": 0.80115497, "learning_rate": 3.57913508447004e-07, "loss": 0.82210457, "num_input_tokens_seen": 291636145, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36328125, "step": 13512, "time_per_iteration": 2.3634531497955322 }, { "auxiliary_loss_clip": 0.01049888, "auxiliary_loss_mlp": 0.01033792, "balance_loss_clip": 1.01252508, "balance_loss_mlp": 1.01506376, "epoch": 0.8124455133022697, "flos": 64375336321920.0, "grad_norm": 3.8053443541740926, "language_loss": 0.64675748, "learning_rate": 3.5769120915067076e-07, "loss": 0.66759431, "num_input_tokens_seen": 291662440, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34765625, "step": 13513, "time_per_iteration": 2.7906720638275146 }, { "auxiliary_loss_clip": 0.01054325, "auxiliary_loss_mlp": 0.01043798, "balance_loss_clip": 1.0177865, "balance_loss_mlp": 1.01669586, "epoch": 0.8125056365549376, "flos": 23841069235200.0, "grad_norm": 1.7657388120435031, "language_loss": 0.73030961, "learning_rate": 3.5746897213085194e-07, "loss": 0.7512908, "num_input_tokens_seen": 291680950, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37695312, "step": 13514, "time_per_iteration": 2.387388229370117 }, { "auxiliary_loss_clip": 0.01049874, "auxiliary_loss_mlp": 0.0103589, "balance_loss_clip": 1.01403868, "balance_loss_mlp": 1.01527429, "epoch": 0.8125657598076056, "flos": 23549195335680.0, "grad_norm": 1.5825781166789274, "language_loss": 0.63775265, "learning_rate": 3.5724679739597364e-07, "loss": 0.65861022, "num_input_tokens_seen": 291702395, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34570312, "step": 13515, "time_per_iteration": 2.3835203647613525 }, { "auxiliary_loss_clip": 0.01048463, "auxiliary_loss_mlp": 0.01031649, "balance_loss_clip": 1.01076329, "balance_loss_mlp": 1.01541793, "epoch": 0.8126258830602736, "flos": 20703180435840.0, "grad_norm": 1.5616580773025324, "language_loss": 0.76888514, "learning_rate": 3.570246849544616e-07, "loss": 0.7896862, "num_input_tokens_seen": 291721135, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.33007812, "step": 13516, "time_per_iteration": 2.3521318435668945 }, { "auxiliary_loss_clip": 0.01052653, "auxiliary_loss_mlp": 0.01037934, "balance_loss_clip": 1.01574922, "balance_loss_mlp": 1.01661611, "epoch": 0.8126860063129415, "flos": 23617171486080.0, "grad_norm": 1.541194857436985, "language_loss": 0.92223287, "learning_rate": 3.5680263481473907e-07, "loss": 0.94313872, "num_input_tokens_seen": 291741235, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.359375, "step": 13517, "time_per_iteration": 2.421895980834961 }, { "auxiliary_loss_clip": 0.01054977, "auxiliary_loss_mlp": 0.01039705, "balance_loss_clip": 1.01544559, "balance_loss_mlp": 1.01841378, "epoch": 0.8127461295656095, "flos": 25006051215360.0, "grad_norm": 1.5596552306167573, "language_loss": 0.79886985, "learning_rate": 3.565806469852244e-07, "loss": 0.81981671, "num_input_tokens_seen": 291761430, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36523438, "step": 13518, "time_per_iteration": 2.398237466812134 }, { "auxiliary_loss_clip": 0.01053162, "auxiliary_loss_mlp": 0.01037977, "balance_loss_clip": 1.01715076, "balance_loss_mlp": 1.01744449, "epoch": 0.8128062528182775, "flos": 27341007500160.0, "grad_norm": 1.7013068763810537, "language_loss": 0.79995811, "learning_rate": 3.56358721474336e-07, "loss": 0.82086957, "num_input_tokens_seen": 291781755, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.35742188, "step": 13519, "time_per_iteration": 2.4675660133361816 }, { "auxiliary_loss_clip": 0.01051979, "auxiliary_loss_mlp": 0.01046235, "balance_loss_clip": 1.02183306, "balance_loss_mlp": 1.01563084, "epoch": 0.8128663760709455, "flos": 26505081884160.0, "grad_norm": 2.59480442654366, "language_loss": 0.71344197, "learning_rate": 3.561368582904905e-07, "loss": 0.73442411, "num_input_tokens_seen": 291804410, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36328125, "step": 13520, "time_per_iteration": 2.4032235145568848 }, { "auxiliary_loss_clip": 0.01051843, "auxiliary_loss_mlp": 0.01040374, "balance_loss_clip": 1.01623404, "balance_loss_mlp": 1.01573122, "epoch": 0.8129264993236134, "flos": 17930273656320.0, "grad_norm": 1.4658530317155931, "language_loss": 0.73707551, "learning_rate": 3.5591505744209925e-07, "loss": 0.75799763, "num_input_tokens_seen": 291823285, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36132812, "step": 13521, "time_per_iteration": 2.381408452987671 }, { "auxiliary_loss_clip": 0.01052482, "auxiliary_loss_mlp": 0.01040545, "balance_loss_clip": 1.01708508, "balance_loss_mlp": 1.0157547, "epoch": 0.8129866225762814, "flos": 26176479367680.0, "grad_norm": 1.8682915482212052, "language_loss": 0.70856267, "learning_rate": 3.5569331893757394e-07, "loss": 0.72949296, "num_input_tokens_seen": 291845305, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3671875, "step": 13522, "time_per_iteration": 2.4156973361968994 }, { "auxiliary_loss_clip": 0.01050648, "auxiliary_loss_mlp": 0.01033591, "balance_loss_clip": 1.01183534, "balance_loss_mlp": 1.01644754, "epoch": 0.8130467458289493, "flos": 21031154547840.0, "grad_norm": 1.6531894352189063, "language_loss": 0.71691978, "learning_rate": 3.554716427853233e-07, "loss": 0.73776215, "num_input_tokens_seen": 291863715, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.34179688, "step": 13523, "time_per_iteration": 3.8518593311309814 }, { "auxiliary_loss_clip": 0.01050618, "auxiliary_loss_mlp": 0.01035354, "balance_loss_clip": 1.01327634, "balance_loss_mlp": 1.01498199, "epoch": 0.8131068690816173, "flos": 15486143950080.0, "grad_norm": 2.369689808823012, "language_loss": 0.72749823, "learning_rate": 3.5525002899375256e-07, "loss": 0.74835795, "num_input_tokens_seen": 291880735, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35546875, "step": 13524, "time_per_iteration": 2.3644704818725586 }, { "auxiliary_loss_clip": 0.01051777, "auxiliary_loss_mlp": 0.01036589, "balance_loss_clip": 1.01354623, "balance_loss_mlp": 1.01623631, "epoch": 0.8131669923342852, "flos": 29349944709120.0, "grad_norm": 1.817712042755706, "language_loss": 0.63446099, "learning_rate": 3.550284775712653e-07, "loss": 0.65534467, "num_input_tokens_seen": 291900535, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35546875, "step": 13525, "time_per_iteration": 2.407771587371826 }, { "auxiliary_loss_clip": 0.01051875, "auxiliary_loss_mlp": 0.01040179, "balance_loss_clip": 1.01826787, "balance_loss_mlp": 1.01649117, "epoch": 0.8132271155869533, "flos": 35254875179520.0, "grad_norm": 1.4887728766590236, "language_loss": 0.66296411, "learning_rate": 3.548069885262628e-07, "loss": 0.68388462, "num_input_tokens_seen": 291919760, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.35351562, "step": 13526, "time_per_iteration": 2.501500368118286 }, { "auxiliary_loss_clip": 0.01050901, "auxiliary_loss_mlp": 0.01037748, "balance_loss_clip": 1.01516998, "balance_loss_mlp": 1.01539016, "epoch": 0.8132872388396212, "flos": 27780668651520.0, "grad_norm": 1.5567901967433475, "language_loss": 0.76312745, "learning_rate": 3.5458556186714473e-07, "loss": 0.78401399, "num_input_tokens_seen": 291938915, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35546875, "step": 13527, "time_per_iteration": 2.4133453369140625 }, { "auxiliary_loss_clip": 0.01051226, "auxiliary_loss_mlp": 0.01033849, "balance_loss_clip": 1.01196265, "balance_loss_mlp": 1.01596248, "epoch": 0.8133473620922892, "flos": 27818339875200.0, "grad_norm": 1.63907167142735, "language_loss": 0.71497017, "learning_rate": 3.5436419760230706e-07, "loss": 0.73582089, "num_input_tokens_seen": 291958145, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.3515625, "step": 13528, "time_per_iteration": 2.4191091060638428 }, { "auxiliary_loss_clip": 0.01052595, "auxiliary_loss_mlp": 0.01032628, "balance_loss_clip": 1.01045465, "balance_loss_mlp": 1.01626956, "epoch": 0.8134074853449572, "flos": 18988526010240.0, "grad_norm": 1.8837506848037462, "language_loss": 0.70316994, "learning_rate": 3.5414289574014357e-07, "loss": 0.72402215, "num_input_tokens_seen": 291976860, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.36328125, "step": 13529, "time_per_iteration": 2.3479716777801514 }, { "auxiliary_loss_clip": 0.01050403, "auxiliary_loss_mlp": 0.01035145, "balance_loss_clip": 1.01191139, "balance_loss_mlp": 1.01492441, "epoch": 0.8134676085976251, "flos": 24241732531200.0, "grad_norm": 4.008560124843101, "language_loss": 0.78802991, "learning_rate": 3.5392165628904635e-07, "loss": 0.8088854, "num_input_tokens_seen": 291998085, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35546875, "step": 13530, "time_per_iteration": 2.404996871948242 }, { "auxiliary_loss_clip": 0.01050288, "auxiliary_loss_mlp": 0.01036797, "balance_loss_clip": 1.01462436, "balance_loss_mlp": 1.01559186, "epoch": 0.8135277318502931, "flos": 19061389751040.0, "grad_norm": 1.8469040292830101, "language_loss": 0.83156568, "learning_rate": 3.537004792574052e-07, "loss": 0.85243654, "num_input_tokens_seen": 292016585, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34765625, "step": 13531, "time_per_iteration": 2.3804564476013184 }, { "auxiliary_loss_clip": 0.01052424, "auxiliary_loss_mlp": 0.01036471, "balance_loss_clip": 1.01236713, "balance_loss_mlp": 1.01580632, "epoch": 0.813587855102961, "flos": 17268914171520.0, "grad_norm": 2.1064489572198237, "language_loss": 0.72816312, "learning_rate": 3.534793646536065e-07, "loss": 0.74905205, "num_input_tokens_seen": 292033255, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 13532, "time_per_iteration": 2.3814139366149902 }, { "auxiliary_loss_clip": 0.01050945, "auxiliary_loss_mlp": 0.01032981, "balance_loss_clip": 1.01186872, "balance_loss_mlp": 1.0155195, "epoch": 0.8136479783556291, "flos": 20156545278720.0, "grad_norm": 1.7697029046039667, "language_loss": 0.77514768, "learning_rate": 3.5325831248603533e-07, "loss": 0.79598695, "num_input_tokens_seen": 292051800, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.35546875, "step": 13533, "time_per_iteration": 2.3758068084716797 }, { "auxiliary_loss_clip": 0.01054076, "auxiliary_loss_mlp": 0.01038832, "balance_loss_clip": 1.01526487, "balance_loss_mlp": 1.01646471, "epoch": 0.813708101608297, "flos": 22051316741760.0, "grad_norm": 1.9513024546803832, "language_loss": 0.77537799, "learning_rate": 3.5303732276307495e-07, "loss": 0.79630697, "num_input_tokens_seen": 292072215, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.375, "step": 13534, "time_per_iteration": 2.366466760635376 }, { "auxiliary_loss_clip": 0.01050572, "auxiliary_loss_mlp": 0.01033645, "balance_loss_clip": 1.01285481, "balance_loss_mlp": 1.01569533, "epoch": 0.813768224860965, "flos": 16172676391680.0, "grad_norm": 2.153063723209549, "language_loss": 0.94472957, "learning_rate": 3.5281639549310336e-07, "loss": 0.96557176, "num_input_tokens_seen": 292088830, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.34960938, "step": 13535, "time_per_iteration": 2.381228446960449 }, { "auxiliary_loss_clip": 0.01050253, "auxiliary_loss_mlp": 0.01033836, "balance_loss_clip": 1.01264024, "balance_loss_mlp": 1.01647806, "epoch": 0.8138283481136329, "flos": 24351185243520.0, "grad_norm": 1.5690707448072008, "language_loss": 0.71253937, "learning_rate": 3.52595530684499e-07, "loss": 0.73338032, "num_input_tokens_seen": 292109225, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.33789062, "step": 13536, "time_per_iteration": 2.3988044261932373 }, { "auxiliary_loss_clip": 0.01051575, "auxiliary_loss_mlp": 0.01035673, "balance_loss_clip": 1.01298761, "balance_loss_mlp": 1.01651192, "epoch": 0.8138884713663009, "flos": 25515294439680.0, "grad_norm": 1.510979866330716, "language_loss": 0.76390481, "learning_rate": 3.5237472834563775e-07, "loss": 0.78477728, "num_input_tokens_seen": 292129660, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 13537, "time_per_iteration": 2.406149387359619 }, { "auxiliary_loss_clip": 0.01049398, "auxiliary_loss_mlp": 0.01034855, "balance_loss_clip": 1.01317108, "balance_loss_mlp": 1.0154655, "epoch": 0.8139485946189688, "flos": 22453306669440.0, "grad_norm": 1.419799020829966, "language_loss": 0.7696321, "learning_rate": 3.5215398848489163e-07, "loss": 0.79047465, "num_input_tokens_seen": 292149090, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.33984375, "step": 13538, "time_per_iteration": 2.3804585933685303 }, { "auxiliary_loss_clip": 0.0104924, "auxiliary_loss_mlp": 0.01038043, "balance_loss_clip": 1.01578641, "balance_loss_mlp": 1.01406169, "epoch": 0.8140087178716369, "flos": 21249361745280.0, "grad_norm": 1.9953764546150417, "language_loss": 0.78219187, "learning_rate": 3.5193331111063176e-07, "loss": 0.80306464, "num_input_tokens_seen": 292169260, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3515625, "step": 13539, "time_per_iteration": 2.418666124343872 }, { "auxiliary_loss_clip": 0.0105005, "auxiliary_loss_mlp": 0.01036653, "balance_loss_clip": 1.01604152, "balance_loss_mlp": 1.01600194, "epoch": 0.8140688411243048, "flos": 39414322627200.0, "grad_norm": 2.3175401776949442, "language_loss": 0.66691983, "learning_rate": 3.5171269623122533e-07, "loss": 0.68778682, "num_input_tokens_seen": 292188145, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.33984375, "step": 13540, "time_per_iteration": 3.7718703746795654 }, { "auxiliary_loss_clip": 0.01052065, "auxiliary_loss_mlp": 0.01038164, "balance_loss_clip": 1.01619351, "balance_loss_mlp": 1.01664329, "epoch": 0.8141289643769728, "flos": 25414569567360.0, "grad_norm": 1.4650682655010114, "language_loss": 0.68241799, "learning_rate": 3.5149214385503913e-07, "loss": 0.70332032, "num_input_tokens_seen": 292212135, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.35351562, "step": 13541, "time_per_iteration": 2.492431402206421 }, { "auxiliary_loss_clip": 0.01051192, "auxiliary_loss_mlp": 0.01037792, "balance_loss_clip": 1.01436687, "balance_loss_mlp": 1.01588547, "epoch": 0.8141890876296408, "flos": 12567230398080.0, "grad_norm": 1.9250763809513327, "language_loss": 0.7021842, "learning_rate": 3.512716539904355e-07, "loss": 0.72307402, "num_input_tokens_seen": 292230645, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35351562, "step": 13542, "time_per_iteration": 2.366791248321533 }, { "auxiliary_loss_clip": 0.01053346, "auxiliary_loss_mlp": 0.0103718, "balance_loss_clip": 1.01346982, "balance_loss_mlp": 1.01541853, "epoch": 0.8142492108823087, "flos": 14966532051840.0, "grad_norm": 3.2273664054725786, "language_loss": 0.82325196, "learning_rate": 3.5105122664577613e-07, "loss": 0.84415716, "num_input_tokens_seen": 292243540, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.37890625, "step": 13543, "time_per_iteration": 2.3668298721313477 }, { "auxiliary_loss_clip": 0.01054961, "auxiliary_loss_mlp": 0.01041219, "balance_loss_clip": 1.01719856, "balance_loss_mlp": 1.0171299, "epoch": 0.8143093341349767, "flos": 12421188714240.0, "grad_norm": 2.4103747844110464, "language_loss": 0.79253727, "learning_rate": 3.5083086182942003e-07, "loss": 0.81349909, "num_input_tokens_seen": 292261715, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37890625, "step": 13544, "time_per_iteration": 2.333771228790283 }, { "auxiliary_loss_clip": 0.01057144, "auxiliary_loss_mlp": 0.01044611, "balance_loss_clip": 1.01844501, "balance_loss_mlp": 1.01747894, "epoch": 0.8143694573876447, "flos": 11909780985600.0, "grad_norm": 3.305300743797227, "language_loss": 0.75446594, "learning_rate": 3.5061055954972264e-07, "loss": 0.77548349, "num_input_tokens_seen": 292275080, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.39648438, "step": 13545, "time_per_iteration": 3.824995279312134 }, { "auxiliary_loss_clip": 0.01050637, "auxiliary_loss_mlp": 0.01037226, "balance_loss_clip": 1.01593518, "balance_loss_mlp": 1.01652038, "epoch": 0.8144295806403127, "flos": 21211899989760.0, "grad_norm": 1.6136383837846529, "language_loss": 0.77209616, "learning_rate": 3.5039031981503776e-07, "loss": 0.79297471, "num_input_tokens_seen": 292294635, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34179688, "step": 13546, "time_per_iteration": 2.4031004905700684 }, { "auxiliary_loss_clip": 0.01053159, "auxiliary_loss_mlp": 0.01034752, "balance_loss_clip": 1.01296031, "balance_loss_mlp": 1.01747024, "epoch": 0.8144897038929806, "flos": 19864252442880.0, "grad_norm": 2.7624588036965667, "language_loss": 0.71705019, "learning_rate": 3.501701426337178e-07, "loss": 0.73792922, "num_input_tokens_seen": 292312695, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.35546875, "step": 13547, "time_per_iteration": 2.3999722003936768 }, { "auxiliary_loss_clip": 0.01052641, "auxiliary_loss_mlp": 0.01042577, "balance_loss_clip": 1.01768589, "balance_loss_mlp": 1.01543725, "epoch": 0.8145498271456486, "flos": 24570579427200.0, "grad_norm": 2.138067396654817, "language_loss": 0.71958071, "learning_rate": 3.49950028014111e-07, "loss": 0.74053288, "num_input_tokens_seen": 292332005, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.37304688, "step": 13548, "time_per_iteration": 2.4348106384277344 }, { "auxiliary_loss_clip": 0.01054282, "auxiliary_loss_mlp": 0.01032308, "balance_loss_clip": 1.00921774, "balance_loss_mlp": 1.01784909, "epoch": 0.8146099503983165, "flos": 20192017086720.0, "grad_norm": 2.1651073749746015, "language_loss": 0.77705657, "learning_rate": 3.4972997596456444e-07, "loss": 0.79792249, "num_input_tokens_seen": 292348365, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36523438, "step": 13549, "time_per_iteration": 3.7097134590148926 }, { "auxiliary_loss_clip": 0.01052258, "auxiliary_loss_mlp": 0.01032947, "balance_loss_clip": 1.0110122, "balance_loss_mlp": 1.01736259, "epoch": 0.8146700736509845, "flos": 19535929217280.0, "grad_norm": 2.188480958689104, "language_loss": 0.72328448, "learning_rate": 3.4950998649342233e-07, "loss": 0.74413651, "num_input_tokens_seen": 292368050, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34765625, "step": 13550, "time_per_iteration": 2.3845138549804688 }, { "auxiliary_loss_clip": 0.01048342, "auxiliary_loss_mlp": 0.01033315, "balance_loss_clip": 1.01223826, "balance_loss_mlp": 1.01515746, "epoch": 0.8147301969036524, "flos": 18040354773120.0, "grad_norm": 1.8130579905223816, "language_loss": 0.72570181, "learning_rate": 3.4929005960902826e-07, "loss": 0.74651837, "num_input_tokens_seen": 292385315, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.33203125, "step": 13551, "time_per_iteration": 2.329204559326172 }, { "auxiliary_loss_clip": 0.01055672, "auxiliary_loss_mlp": 0.0104118, "balance_loss_clip": 1.01576471, "balance_loss_mlp": 1.01696575, "epoch": 0.8147903201563205, "flos": 18003730890240.0, "grad_norm": 1.841978339294827, "language_loss": 0.70359874, "learning_rate": 3.4907019531971926e-07, "loss": 0.72456723, "num_input_tokens_seen": 292403375, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38671875, "step": 13552, "time_per_iteration": 2.366551399230957 }, { "auxiliary_loss_clip": 0.01050897, "auxiliary_loss_mlp": 0.01036396, "balance_loss_clip": 1.01540327, "balance_loss_mlp": 1.0158757, "epoch": 0.8148504434089884, "flos": 20258492048640.0, "grad_norm": 1.761135494890966, "language_loss": 0.83697844, "learning_rate": 3.4885039363383407e-07, "loss": 0.85785139, "num_input_tokens_seen": 292419260, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.3515625, "step": 13553, "time_per_iteration": 2.3371598720550537 }, { "auxiliary_loss_clip": 0.0105206, "auxiliary_loss_mlp": 0.01037078, "balance_loss_clip": 1.01532197, "balance_loss_mlp": 1.01627445, "epoch": 0.8149105666616564, "flos": 12493912809600.0, "grad_norm": 1.8611218206453928, "language_loss": 0.69753301, "learning_rate": 3.4863065455970795e-07, "loss": 0.71842432, "num_input_tokens_seen": 292436095, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.35742188, "step": 13554, "time_per_iteration": 2.341367721557617 }, { "auxiliary_loss_clip": 0.01054716, "auxiliary_loss_mlp": 0.01039973, "balance_loss_clip": 1.01574969, "balance_loss_mlp": 1.01786709, "epoch": 0.8149706899143244, "flos": 32522362709760.0, "grad_norm": 1.7722381470650175, "language_loss": 0.66824615, "learning_rate": 3.484109781056723e-07, "loss": 0.68919301, "num_input_tokens_seen": 292457190, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36914062, "step": 13555, "time_per_iteration": 2.447854995727539 }, { "auxiliary_loss_clip": 0.01053553, "auxiliary_loss_mlp": 0.01038459, "balance_loss_clip": 1.01483202, "balance_loss_mlp": 1.0167191, "epoch": 0.8150308131669923, "flos": 19385209411200.0, "grad_norm": 2.10813950931355, "language_loss": 0.74726385, "learning_rate": 3.4819136428005844e-07, "loss": 0.76818395, "num_input_tokens_seen": 292474300, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36914062, "step": 13556, "time_per_iteration": 2.3482964038848877 }, { "auxiliary_loss_clip": 0.01051499, "auxiliary_loss_mlp": 0.01030432, "balance_loss_clip": 1.00911689, "balance_loss_mlp": 1.01680207, "epoch": 0.8150909364196604, "flos": 17420402027520.0, "grad_norm": 1.7266485040492248, "language_loss": 0.81536818, "learning_rate": 3.4797181309119307e-07, "loss": 0.83618748, "num_input_tokens_seen": 292492420, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34570312, "step": 13557, "time_per_iteration": 2.3261830806732178 }, { "auxiliary_loss_clip": 0.01055064, "auxiliary_loss_mlp": 0.01037983, "balance_loss_clip": 1.01560783, "balance_loss_mlp": 1.01751971, "epoch": 0.8151510596723283, "flos": 27161553778560.0, "grad_norm": 1.7627503519466825, "language_loss": 0.66273719, "learning_rate": 3.4775232454740255e-07, "loss": 0.68366766, "num_input_tokens_seen": 292512895, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.375, "step": 13558, "time_per_iteration": 2.441406488418579 }, { "auxiliary_loss_clip": 0.01007438, "auxiliary_loss_mlp": 0.01003979, "balance_loss_clip": 1.00155902, "balance_loss_mlp": 1.00081003, "epoch": 0.8152111829249963, "flos": 64216131189120.0, "grad_norm": 0.7972983853127706, "language_loss": 0.56947351, "learning_rate": 3.4753289865700896e-07, "loss": 0.58958769, "num_input_tokens_seen": 292566580, "router_z_loss_clip": 0.02416992, "router_z_loss_mlp": 0.06640625, "step": 13559, "time_per_iteration": 2.9274091720581055 }, { "auxiliary_loss_clip": 0.01007528, "auxiliary_loss_mlp": 0.01002694, "balance_loss_clip": 1.00024986, "balance_loss_mlp": 1.00099587, "epoch": 0.8152713061776642, "flos": 67068814158720.0, "grad_norm": 0.6810392485807253, "language_loss": 0.55292958, "learning_rate": 3.473135354283334e-07, "loss": 0.57303178, "num_input_tokens_seen": 292621490, "router_z_loss_clip": 0.02441406, "router_z_loss_mlp": 0.06542969, "step": 13560, "time_per_iteration": 2.9027628898620605 }, { "auxiliary_loss_clip": 0.01051265, "auxiliary_loss_mlp": 0.01039672, "balance_loss_clip": 1.01796412, "balance_loss_mlp": 1.01650715, "epoch": 0.8153314294303322, "flos": 14390290195200.0, "grad_norm": 2.015411938969973, "language_loss": 0.68192828, "learning_rate": 3.470942348696948e-07, "loss": 0.70283759, "num_input_tokens_seen": 292638660, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34765625, "step": 13561, "time_per_iteration": 2.3333899974823 }, { "auxiliary_loss_clip": 0.01054149, "auxiliary_loss_mlp": 0.01040312, "balance_loss_clip": 1.01691103, "balance_loss_mlp": 1.01724339, "epoch": 0.8153915526830001, "flos": 25622512824960.0, "grad_norm": 1.6694564893323505, "language_loss": 0.827191, "learning_rate": 3.468749969894085e-07, "loss": 0.84813565, "num_input_tokens_seen": 292658545, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36914062, "step": 13562, "time_per_iteration": 3.9167189598083496 }, { "auxiliary_loss_clip": 0.01052465, "auxiliary_loss_mlp": 0.01037724, "balance_loss_clip": 1.01522946, "balance_loss_mlp": 1.01637352, "epoch": 0.8154516759356681, "flos": 23367996046080.0, "grad_norm": 1.6037662751293285, "language_loss": 0.72856116, "learning_rate": 3.4665582179578734e-07, "loss": 0.74946308, "num_input_tokens_seen": 292678460, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.36132812, "step": 13563, "time_per_iteration": 2.4224705696105957 }, { "auxiliary_loss_clip": 0.01051443, "auxiliary_loss_mlp": 0.01038853, "balance_loss_clip": 1.01281738, "balance_loss_mlp": 1.01510966, "epoch": 0.815511799188336, "flos": 28147884998400.0, "grad_norm": 1.5513606898108148, "language_loss": 0.71345663, "learning_rate": 3.4643670929714387e-07, "loss": 0.73435956, "num_input_tokens_seen": 292699815, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.36328125, "step": 13564, "time_per_iteration": 2.4412033557891846 }, { "auxiliary_loss_clip": 0.01050871, "auxiliary_loss_mlp": 0.01035301, "balance_loss_clip": 1.0116024, "balance_loss_mlp": 1.01570749, "epoch": 0.8155719224410041, "flos": 16982451532800.0, "grad_norm": 1.8965648677071556, "language_loss": 0.71329212, "learning_rate": 3.462176595017854e-07, "loss": 0.73415387, "num_input_tokens_seen": 292717370, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3515625, "step": 13565, "time_per_iteration": 2.3872175216674805 }, { "auxiliary_loss_clip": 0.01050633, "auxiliary_loss_mlp": 0.01038148, "balance_loss_clip": 1.01509273, "balance_loss_mlp": 1.01558781, "epoch": 0.815632045693672, "flos": 24680555809920.0, "grad_norm": 2.166154139237371, "language_loss": 0.79519135, "learning_rate": 3.459986724180188e-07, "loss": 0.81607914, "num_input_tokens_seen": 292737110, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.3515625, "step": 13566, "time_per_iteration": 2.4251151084899902 }, { "auxiliary_loss_clip": 0.01050376, "auxiliary_loss_mlp": 0.01034298, "balance_loss_clip": 1.01331675, "balance_loss_mlp": 1.01568747, "epoch": 0.81569216894634, "flos": 19937290740480.0, "grad_norm": 1.5902230688323156, "language_loss": 0.83354002, "learning_rate": 3.457797480541491e-07, "loss": 0.85438675, "num_input_tokens_seen": 292756510, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.34570312, "step": 13567, "time_per_iteration": 2.4290928840637207 }, { "auxiliary_loss_clip": 0.01050496, "auxiliary_loss_mlp": 0.01033353, "balance_loss_clip": 1.01369548, "balance_loss_mlp": 1.01610744, "epoch": 0.8157522921990079, "flos": 21798301052160.0, "grad_norm": 1.8466935633714183, "language_loss": 0.81168956, "learning_rate": 3.455608864184771e-07, "loss": 0.83252811, "num_input_tokens_seen": 292776710, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.34375, "step": 13568, "time_per_iteration": 2.3617868423461914 }, { "auxiliary_loss_clip": 0.01050071, "auxiliary_loss_mlp": 0.01038182, "balance_loss_clip": 1.01707006, "balance_loss_mlp": 1.0162003, "epoch": 0.8158124154516759, "flos": 18507527942400.0, "grad_norm": 1.8908761617352146, "language_loss": 0.78155303, "learning_rate": 3.453420875193016e-07, "loss": 0.80243552, "num_input_tokens_seen": 292794350, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.33984375, "step": 13569, "time_per_iteration": 2.3679440021514893 }, { "auxiliary_loss_clip": 0.01051159, "auxiliary_loss_mlp": 0.01034277, "balance_loss_clip": 1.01360631, "balance_loss_mlp": 1.01613665, "epoch": 0.815872538704344, "flos": 26829669594240.0, "grad_norm": 3.4690948255190364, "language_loss": 0.60956621, "learning_rate": 3.451233513649199e-07, "loss": 0.63042057, "num_input_tokens_seen": 292814005, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.3515625, "step": 13570, "time_per_iteration": 2.4402389526367188 }, { "auxiliary_loss_clip": 0.01055031, "auxiliary_loss_mlp": 0.01043783, "balance_loss_clip": 1.02004838, "balance_loss_mlp": 1.01731706, "epoch": 0.8159326619570119, "flos": 21724634350080.0, "grad_norm": 1.9568364540703478, "language_loss": 0.83227783, "learning_rate": 3.4490467796362687e-07, "loss": 0.853266, "num_input_tokens_seen": 292833485, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.37695312, "step": 13571, "time_per_iteration": 2.4104504585266113 }, { "auxiliary_loss_clip": 0.0105187, "auxiliary_loss_mlp": 0.01042944, "balance_loss_clip": 1.0194242, "balance_loss_mlp": 1.01699352, "epoch": 0.8159927852096799, "flos": 13839989345280.0, "grad_norm": 2.666068385840137, "language_loss": 0.79993945, "learning_rate": 3.446860673237142e-07, "loss": 0.82088763, "num_input_tokens_seen": 292848045, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.34765625, "step": 13572, "time_per_iteration": 2.340604305267334 }, { "auxiliary_loss_clip": 0.01052109, "auxiliary_loss_mlp": 0.0103896, "balance_loss_clip": 1.0162034, "balance_loss_mlp": 1.01561487, "epoch": 0.8160529084623478, "flos": 24498344090880.0, "grad_norm": 1.4352363991449348, "language_loss": 0.66042233, "learning_rate": 3.4446751945347186e-07, "loss": 0.68133307, "num_input_tokens_seen": 292869965, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.36523438, "step": 13573, "time_per_iteration": 2.400564670562744 }, { "auxiliary_loss_clip": 0.01051566, "auxiliary_loss_mlp": 0.01038, "balance_loss_clip": 1.01496899, "balance_loss_mlp": 1.01624191, "epoch": 0.8161130317150158, "flos": 24825201039360.0, "grad_norm": 1.5685274942666432, "language_loss": 0.76394951, "learning_rate": 3.442490343611868e-07, "loss": 0.78484523, "num_input_tokens_seen": 292889680, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35351562, "step": 13574, "time_per_iteration": 2.4113190174102783 }, { "auxiliary_loss_clip": 0.01054113, "auxiliary_loss_mlp": 0.01039646, "balance_loss_clip": 1.01662719, "balance_loss_mlp": 1.01744914, "epoch": 0.8161731549676837, "flos": 30955216245120.0, "grad_norm": 1.9952365591395962, "language_loss": 0.60713041, "learning_rate": 3.4403061205514485e-07, "loss": 0.62806797, "num_input_tokens_seen": 292912360, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.3671875, "step": 13575, "time_per_iteration": 2.4556126594543457 }, { "auxiliary_loss_clip": 0.01051854, "auxiliary_loss_mlp": 0.01037317, "balance_loss_clip": 1.01298642, "balance_loss_mlp": 1.01526785, "epoch": 0.8162332782203517, "flos": 18550994451840.0, "grad_norm": 1.8271538169213215, "language_loss": 0.75476044, "learning_rate": 3.4381225254362736e-07, "loss": 0.77565217, "num_input_tokens_seen": 292928325, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36523438, "step": 13576, "time_per_iteration": 2.357179880142212 }, { "auxiliary_loss_clip": 0.01007508, "auxiliary_loss_mlp": 0.01002397, "balance_loss_clip": 1.00006068, "balance_loss_mlp": 1.00079572, "epoch": 0.8162934014730197, "flos": 70383712884480.0, "grad_norm": 0.81990881981954, "language_loss": 0.58724636, "learning_rate": 3.435939558349155e-07, "loss": 0.6073454, "num_input_tokens_seen": 292992795, "router_z_loss_clip": 0.02331543, "router_z_loss_mlp": 0.06738281, "step": 13577, "time_per_iteration": 2.992936372756958 }, { "auxiliary_loss_clip": 0.01049776, "auxiliary_loss_mlp": 0.01035298, "balance_loss_clip": 1.01476979, "balance_loss_mlp": 1.01596355, "epoch": 0.8163535247256877, "flos": 21213785203200.0, "grad_norm": 1.7503215275339672, "language_loss": 0.71985441, "learning_rate": 3.4337572193728747e-07, "loss": 0.74070519, "num_input_tokens_seen": 293011950, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.33789062, "step": 13578, "time_per_iteration": 2.3833415508270264 }, { "auxiliary_loss_clip": 0.01051367, "auxiliary_loss_mlp": 0.01039307, "balance_loss_clip": 1.01587033, "balance_loss_mlp": 1.01593661, "epoch": 0.8164136479783556, "flos": 21097978623360.0, "grad_norm": 2.1133151107034207, "language_loss": 0.74602854, "learning_rate": 3.431575508590172e-07, "loss": 0.76693535, "num_input_tokens_seen": 293030175, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35351562, "step": 13579, "time_per_iteration": 2.371922254562378 }, { "auxiliary_loss_clip": 0.01052346, "auxiliary_loss_mlp": 0.01037391, "balance_loss_clip": 1.01508737, "balance_loss_mlp": 1.01567245, "epoch": 0.8164737712310236, "flos": 21719711848320.0, "grad_norm": 1.7385024794369723, "language_loss": 0.7986455, "learning_rate": 3.4293944260837873e-07, "loss": 0.819543, "num_input_tokens_seen": 293047980, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3671875, "step": 13580, "time_per_iteration": 2.362642765045166 }, { "auxiliary_loss_clip": 0.01051261, "auxiliary_loss_mlp": 0.01041534, "balance_loss_clip": 1.01713169, "balance_loss_mlp": 1.0161227, "epoch": 0.8165338944836915, "flos": 19535789571840.0, "grad_norm": 1.7889952510031795, "language_loss": 0.70002759, "learning_rate": 3.4272139719364314e-07, "loss": 0.72095549, "num_input_tokens_seen": 293067030, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.3515625, "step": 13581, "time_per_iteration": 3.7475593090057373 }, { "auxiliary_loss_clip": 0.01052591, "auxiliary_loss_mlp": 0.01037685, "balance_loss_clip": 1.01628757, "balance_loss_mlp": 1.01685905, "epoch": 0.8165940177363595, "flos": 22927497022080.0, "grad_norm": 1.5818595699036935, "language_loss": 0.61098534, "learning_rate": 3.4250341462307786e-07, "loss": 0.63188815, "num_input_tokens_seen": 293085575, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.35742188, "step": 13582, "time_per_iteration": 2.3591346740722656 }, { "auxiliary_loss_clip": 0.01049185, "auxiliary_loss_mlp": 0.01034758, "balance_loss_clip": 1.01365769, "balance_loss_mlp": 1.01579428, "epoch": 0.8166541409890276, "flos": 23369183032320.0, "grad_norm": 1.4164408708332663, "language_loss": 0.82710385, "learning_rate": 3.4228549490494897e-07, "loss": 0.84794319, "num_input_tokens_seen": 293108200, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.33203125, "step": 13583, "time_per_iteration": 2.5053625106811523 }, { "auxiliary_loss_clip": 0.01052327, "auxiliary_loss_mlp": 0.01039412, "balance_loss_clip": 1.01701283, "balance_loss_mlp": 1.01648581, "epoch": 0.8167142642416955, "flos": 18441018069120.0, "grad_norm": 1.6575511828714975, "language_loss": 0.75341833, "learning_rate": 3.4206763804752093e-07, "loss": 0.77433574, "num_input_tokens_seen": 293126020, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35742188, "step": 13584, "time_per_iteration": 2.3637771606445312 }, { "auxiliary_loss_clip": 0.01053834, "auxiliary_loss_mlp": 0.01035974, "balance_loss_clip": 1.01202488, "balance_loss_mlp": 1.0175395, "epoch": 0.8167743874943635, "flos": 21213924848640.0, "grad_norm": 1.6763142196510445, "language_loss": 0.76075673, "learning_rate": 3.4184984405905405e-07, "loss": 0.78165483, "num_input_tokens_seen": 293144620, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36328125, "step": 13585, "time_per_iteration": 3.786125421524048 }, { "auxiliary_loss_clip": 0.01052172, "auxiliary_loss_mlp": 0.0103772, "balance_loss_clip": 1.01349711, "balance_loss_mlp": 1.01632822, "epoch": 0.8168345107470314, "flos": 18696687022080.0, "grad_norm": 1.4952209622197266, "language_loss": 0.70830315, "learning_rate": 3.416321129478068e-07, "loss": 0.72920203, "num_input_tokens_seen": 293162850, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.359375, "step": 13586, "time_per_iteration": 2.3586063385009766 }, { "auxiliary_loss_clip": 0.01051277, "auxiliary_loss_mlp": 0.01037774, "balance_loss_clip": 1.01688862, "balance_loss_mlp": 1.01647627, "epoch": 0.8168946339996994, "flos": 16252173290880.0, "grad_norm": 1.533375632817559, "language_loss": 0.62017381, "learning_rate": 3.4141444472203594e-07, "loss": 0.64106429, "num_input_tokens_seen": 293181620, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34765625, "step": 13587, "time_per_iteration": 2.4120731353759766 }, { "auxiliary_loss_clip": 0.01053448, "auxiliary_loss_mlp": 0.01042651, "balance_loss_clip": 1.01895201, "balance_loss_mlp": 1.01659369, "epoch": 0.8169547572523673, "flos": 26940414026880.0, "grad_norm": 2.4038260781074134, "language_loss": 0.71933842, "learning_rate": 3.4119683938999624e-07, "loss": 0.7402994, "num_input_tokens_seen": 293200270, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.3671875, "step": 13588, "time_per_iteration": 3.7814080715179443 }, { "auxiliary_loss_clip": 0.01053603, "auxiliary_loss_mlp": 0.01039377, "balance_loss_clip": 1.01402068, "balance_loss_mlp": 1.01632857, "epoch": 0.8170148805050353, "flos": 18951343545600.0, "grad_norm": 1.6545328170050868, "language_loss": 0.73360538, "learning_rate": 3.4097929695993854e-07, "loss": 0.7545352, "num_input_tokens_seen": 293218960, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37304688, "step": 13589, "time_per_iteration": 2.3737802505493164 }, { "auxiliary_loss_clip": 0.01049563, "auxiliary_loss_mlp": 0.01034724, "balance_loss_clip": 1.0123601, "balance_loss_mlp": 1.01557326, "epoch": 0.8170750037577033, "flos": 21833842682880.0, "grad_norm": 1.6547593022799678, "language_loss": 0.74307901, "learning_rate": 3.4076181744011166e-07, "loss": 0.76392186, "num_input_tokens_seen": 293236450, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.33984375, "step": 13590, "time_per_iteration": 2.385528326034546 }, { "auxiliary_loss_clip": 0.01054914, "auxiliary_loss_mlp": 0.0103827, "balance_loss_clip": 1.01348591, "balance_loss_mlp": 1.01667237, "epoch": 0.8171351270103713, "flos": 33505866109440.0, "grad_norm": 2.1130855391711756, "language_loss": 0.66480505, "learning_rate": 3.4054440083876345e-07, "loss": 0.68573689, "num_input_tokens_seen": 293256480, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3828125, "step": 13591, "time_per_iteration": 2.5209076404571533 }, { "auxiliary_loss_clip": 0.01053126, "auxiliary_loss_mlp": 0.01040052, "balance_loss_clip": 1.016747, "balance_loss_mlp": 1.01548862, "epoch": 0.8171952502630392, "flos": 22707160231680.0, "grad_norm": 2.362921484514171, "language_loss": 0.69628417, "learning_rate": 3.403270471641373e-07, "loss": 0.71721601, "num_input_tokens_seen": 293274960, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37695312, "step": 13592, "time_per_iteration": 2.398068428039551 }, { "auxiliary_loss_clip": 0.01051832, "auxiliary_loss_mlp": 0.01033827, "balance_loss_clip": 1.01085567, "balance_loss_mlp": 1.01628125, "epoch": 0.8172553735157072, "flos": 26722521031680.0, "grad_norm": 1.7439637730372297, "language_loss": 0.67617238, "learning_rate": 3.401097564244759e-07, "loss": 0.69702899, "num_input_tokens_seen": 293295945, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35546875, "step": 13593, "time_per_iteration": 2.4385716915130615 }, { "auxiliary_loss_clip": 0.01050312, "auxiliary_loss_mlp": 0.01039502, "balance_loss_clip": 1.01738882, "balance_loss_mlp": 1.01545954, "epoch": 0.8173154967683751, "flos": 15960159745920.0, "grad_norm": 1.9964396917297764, "language_loss": 0.70399493, "learning_rate": 3.398925286280188e-07, "loss": 0.72489303, "num_input_tokens_seen": 293313300, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34765625, "step": 13594, "time_per_iteration": 2.3737223148345947 }, { "auxiliary_loss_clip": 0.01053208, "auxiliary_loss_mlp": 0.01036259, "balance_loss_clip": 1.01308513, "balance_loss_mlp": 1.01609039, "epoch": 0.8173756200210431, "flos": 25985749276800.0, "grad_norm": 1.7206889258545275, "language_loss": 0.67469984, "learning_rate": 3.3967536378300456e-07, "loss": 0.69559455, "num_input_tokens_seen": 293333085, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.37109375, "step": 13595, "time_per_iteration": 2.408904790878296 }, { "auxiliary_loss_clip": 0.01054055, "auxiliary_loss_mlp": 0.01039327, "balance_loss_clip": 1.01459122, "balance_loss_mlp": 1.01595783, "epoch": 0.8174357432737112, "flos": 25663291159680.0, "grad_norm": 1.5045288856974015, "language_loss": 0.7915405, "learning_rate": 3.394582618976658e-07, "loss": 0.81247431, "num_input_tokens_seen": 293351895, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38085938, "step": 13596, "time_per_iteration": 2.412522554397583 }, { "auxiliary_loss_clip": 0.01050146, "auxiliary_loss_mlp": 0.01035893, "balance_loss_clip": 1.01165783, "balance_loss_mlp": 1.01540613, "epoch": 0.8174958665263791, "flos": 21834017239680.0, "grad_norm": 2.290887761898957, "language_loss": 0.60057598, "learning_rate": 3.392412229802362e-07, "loss": 0.62143636, "num_input_tokens_seen": 293371165, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.34765625, "step": 13597, "time_per_iteration": 2.3650574684143066 }, { "auxiliary_loss_clip": 0.01051284, "auxiliary_loss_mlp": 0.01036118, "balance_loss_clip": 1.01348066, "balance_loss_mlp": 1.01634288, "epoch": 0.8175559897790471, "flos": 22454423832960.0, "grad_norm": 1.6007137291326945, "language_loss": 0.8332001, "learning_rate": 3.390242470389462e-07, "loss": 0.85407412, "num_input_tokens_seen": 293391150, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34960938, "step": 13598, "time_per_iteration": 2.3521604537963867 }, { "auxiliary_loss_clip": 0.01053349, "auxiliary_loss_mlp": 0.01039555, "balance_loss_clip": 1.01531959, "balance_loss_mlp": 1.01627183, "epoch": 0.817616113031715, "flos": 23614867336320.0, "grad_norm": 1.8565326052887317, "language_loss": 0.83389139, "learning_rate": 3.3880733408202277e-07, "loss": 0.85482037, "num_input_tokens_seen": 293409440, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.37109375, "step": 13599, "time_per_iteration": 2.4214560985565186 }, { "auxiliary_loss_clip": 0.01052585, "auxiliary_loss_mlp": 0.01037433, "balance_loss_clip": 1.01435447, "balance_loss_mlp": 1.01721275, "epoch": 0.817676236284383, "flos": 27671041382400.0, "grad_norm": 1.8640402980228714, "language_loss": 0.8468678, "learning_rate": 3.3859048411769186e-07, "loss": 0.86776805, "num_input_tokens_seen": 293428995, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35351562, "step": 13600, "time_per_iteration": 2.39587664604187 }, { "auxiliary_loss_clip": 0.01051927, "auxiliary_loss_mlp": 0.01040698, "balance_loss_clip": 1.01550889, "balance_loss_mlp": 1.01525164, "epoch": 0.8177363595370509, "flos": 24679857582720.0, "grad_norm": 1.6635421744101178, "language_loss": 0.75699025, "learning_rate": 3.383736971541766e-07, "loss": 0.77791655, "num_input_tokens_seen": 293449155, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3671875, "step": 13601, "time_per_iteration": 3.7969820499420166 }, { "auxiliary_loss_clip": 0.01053923, "auxiliary_loss_mlp": 0.01039256, "balance_loss_clip": 1.01400721, "balance_loss_mlp": 1.01632774, "epoch": 0.817796482789719, "flos": 17345408693760.0, "grad_norm": 1.962285562400752, "language_loss": 0.69738245, "learning_rate": 3.3815697319969737e-07, "loss": 0.71831423, "num_input_tokens_seen": 293466125, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.375, "step": 13602, "time_per_iteration": 2.3606979846954346 }, { "auxiliary_loss_clip": 0.01049966, "auxiliary_loss_mlp": 0.01037216, "balance_loss_clip": 1.01461363, "balance_loss_mlp": 1.01502156, "epoch": 0.8178566060423869, "flos": 17777703548160.0, "grad_norm": 1.971335101617719, "language_loss": 0.84880936, "learning_rate": 3.379403122624718e-07, "loss": 0.86968118, "num_input_tokens_seen": 293481345, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34960938, "step": 13603, "time_per_iteration": 2.316683292388916 }, { "auxiliary_loss_clip": 0.0105062, "auxiliary_loss_mlp": 0.0103418, "balance_loss_clip": 1.01194715, "balance_loss_mlp": 1.01556301, "epoch": 0.8179167292950549, "flos": 24972080595840.0, "grad_norm": 1.7155010049840889, "language_loss": 0.70624733, "learning_rate": 3.377237143507159e-07, "loss": 0.72709537, "num_input_tokens_seen": 293502330, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3515625, "step": 13604, "time_per_iteration": 2.4230597019195557 }, { "auxiliary_loss_clip": 0.0105243, "auxiliary_loss_mlp": 0.01035974, "balance_loss_clip": 1.01241791, "balance_loss_mlp": 1.01741362, "epoch": 0.8179768525477228, "flos": 22855680622080.0, "grad_norm": 1.5862534360134182, "language_loss": 0.75336683, "learning_rate": 3.3750717947264406e-07, "loss": 0.77425086, "num_input_tokens_seen": 293521415, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.34960938, "step": 13605, "time_per_iteration": 2.36867618560791 }, { "auxiliary_loss_clip": 0.0105202, "auxiliary_loss_mlp": 0.01042532, "balance_loss_clip": 1.01956034, "balance_loss_mlp": 1.01737714, "epoch": 0.8180369758003908, "flos": 18514161100800.0, "grad_norm": 1.8667053482499978, "language_loss": 0.75461507, "learning_rate": 3.372907076364666e-07, "loss": 0.77556062, "num_input_tokens_seen": 293539245, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.34765625, "step": 13606, "time_per_iteration": 2.3294615745544434 }, { "auxiliary_loss_clip": 0.01051636, "auxiliary_loss_mlp": 0.01040624, "balance_loss_clip": 1.01785553, "balance_loss_mlp": 1.01607656, "epoch": 0.8180970990530587, "flos": 33180719817600.0, "grad_norm": 1.967820453200513, "language_loss": 0.66809225, "learning_rate": 3.370742988503916e-07, "loss": 0.68901485, "num_input_tokens_seen": 293560640, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35546875, "step": 13607, "time_per_iteration": 2.480757236480713 }, { "auxiliary_loss_clip": 0.01052571, "auxiliary_loss_mlp": 0.01037855, "balance_loss_clip": 1.01484811, "balance_loss_mlp": 1.01644397, "epoch": 0.8181572223057267, "flos": 25008844124160.0, "grad_norm": 1.8040400678898556, "language_loss": 0.71274543, "learning_rate": 3.3685795312262634e-07, "loss": 0.73364967, "num_input_tokens_seen": 293579465, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36132812, "step": 13608, "time_per_iteration": 2.387209177017212 }, { "auxiliary_loss_clip": 0.01050846, "auxiliary_loss_mlp": 0.01042777, "balance_loss_clip": 1.01969838, "balance_loss_mlp": 1.01575804, "epoch": 0.8182173455583948, "flos": 28547466042240.0, "grad_norm": 1.8247789316187035, "language_loss": 0.81087816, "learning_rate": 3.366416704613735e-07, "loss": 0.83181441, "num_input_tokens_seen": 293600540, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.34960938, "step": 13609, "time_per_iteration": 2.464195489883423 }, { "auxiliary_loss_clip": 0.01007352, "auxiliary_loss_mlp": 0.0100225, "balance_loss_clip": 0.9999972, "balance_loss_mlp": 1.00083733, "epoch": 0.8182774688110627, "flos": 72024875164800.0, "grad_norm": 0.749750175017467, "language_loss": 0.56006736, "learning_rate": 3.3642545087483544e-07, "loss": 0.58016336, "num_input_tokens_seen": 293665160, "router_z_loss_clip": 0.02258301, "router_z_loss_mlp": 0.06542969, "step": 13610, "time_per_iteration": 3.1334807872772217 }, { "auxiliary_loss_clip": 0.01048207, "auxiliary_loss_mlp": 0.01032076, "balance_loss_clip": 1.01144087, "balance_loss_mlp": 1.01508594, "epoch": 0.8183375920637307, "flos": 19754345882880.0, "grad_norm": 2.142040947001422, "language_loss": 0.79348654, "learning_rate": 3.362092943712107e-07, "loss": 0.81428933, "num_input_tokens_seen": 293683995, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.33203125, "step": 13611, "time_per_iteration": 2.343662738800049 }, { "auxiliary_loss_clip": 0.01054179, "auxiliary_loss_mlp": 0.01044465, "balance_loss_clip": 1.0164752, "balance_loss_mlp": 1.01597667, "epoch": 0.8183977153163986, "flos": 22340921402880.0, "grad_norm": 1.93234131009409, "language_loss": 0.78712237, "learning_rate": 3.3599320095869745e-07, "loss": 0.80810887, "num_input_tokens_seen": 293704115, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.38085938, "step": 13612, "time_per_iteration": 2.4059505462646484 }, { "auxiliary_loss_clip": 0.0104914, "auxiliary_loss_mlp": 0.01032625, "balance_loss_clip": 1.01057124, "balance_loss_mlp": 1.01523638, "epoch": 0.8184578385690666, "flos": 17711507877120.0, "grad_norm": 2.398121193626671, "language_loss": 0.87617505, "learning_rate": 3.3577717064548793e-07, "loss": 0.89699268, "num_input_tokens_seen": 293722225, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.33984375, "step": 13613, "time_per_iteration": 2.344506025314331 }, { "auxiliary_loss_clip": 0.01053358, "auxiliary_loss_mlp": 0.01042751, "balance_loss_clip": 1.02043533, "balance_loss_mlp": 1.01776814, "epoch": 0.8185179618217345, "flos": 25700019776640.0, "grad_norm": 1.379317113646301, "language_loss": 0.73609698, "learning_rate": 3.355612034397746e-07, "loss": 0.75705802, "num_input_tokens_seen": 293743995, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35546875, "step": 13614, "time_per_iteration": 2.4300644397735596 }, { "auxiliary_loss_clip": 0.01053099, "auxiliary_loss_mlp": 0.01038036, "balance_loss_clip": 1.01521921, "balance_loss_mlp": 1.01671529, "epoch": 0.8185780850744026, "flos": 25959075131520.0, "grad_norm": 1.6645383539122192, "language_loss": 0.82181466, "learning_rate": 3.353452993497479e-07, "loss": 0.84272605, "num_input_tokens_seen": 293764935, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36328125, "step": 13615, "time_per_iteration": 2.4384942054748535 }, { "auxiliary_loss_clip": 0.01051903, "auxiliary_loss_mlp": 0.01040144, "balance_loss_clip": 1.01720798, "balance_loss_mlp": 1.01595986, "epoch": 0.8186382083270705, "flos": 25227260789760.0, "grad_norm": 1.8894138733150982, "language_loss": 0.7658788, "learning_rate": 3.3512945838359375e-07, "loss": 0.78679925, "num_input_tokens_seen": 293784035, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.359375, "step": 13616, "time_per_iteration": 2.37628436088562 }, { "auxiliary_loss_clip": 0.01050294, "auxiliary_loss_mlp": 0.01040687, "balance_loss_clip": 1.01608253, "balance_loss_mlp": 1.01537776, "epoch": 0.8186983315797385, "flos": 22414029523200.0, "grad_norm": 1.66345882841564, "language_loss": 0.75921977, "learning_rate": 3.349136805494979e-07, "loss": 0.78012955, "num_input_tokens_seen": 293803360, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.34960938, "step": 13617, "time_per_iteration": 2.379288911819458 }, { "auxiliary_loss_clip": 0.01048335, "auxiliary_loss_mlp": 0.01033288, "balance_loss_clip": 1.013309, "balance_loss_mlp": 1.01457334, "epoch": 0.8187584548324064, "flos": 22016927185920.0, "grad_norm": 1.9284174328306187, "language_loss": 0.69024992, "learning_rate": 3.346979658556415e-07, "loss": 0.71106613, "num_input_tokens_seen": 293821325, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.33789062, "step": 13618, "time_per_iteration": 2.3553390502929688 }, { "auxiliary_loss_clip": 0.01054741, "auxiliary_loss_mlp": 0.01041905, "balance_loss_clip": 1.0179081, "balance_loss_mlp": 1.01693916, "epoch": 0.8188185780850744, "flos": 29240387262720.0, "grad_norm": 1.9606441973917461, "language_loss": 0.70442343, "learning_rate": 3.344823143102058e-07, "loss": 0.72538996, "num_input_tokens_seen": 293840315, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37890625, "step": 13619, "time_per_iteration": 3.83598256111145 }, { "auxiliary_loss_clip": 0.01053371, "auxiliary_loss_mlp": 0.01037225, "balance_loss_clip": 1.01395535, "balance_loss_mlp": 1.01682043, "epoch": 0.8188787013377423, "flos": 20695674493440.0, "grad_norm": 1.7356634050670037, "language_loss": 0.744587, "learning_rate": 3.3426672592136694e-07, "loss": 0.76549292, "num_input_tokens_seen": 293855685, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36523438, "step": 13620, "time_per_iteration": 2.382858991622925 }, { "auxiliary_loss_clip": 0.01049403, "auxiliary_loss_mlp": 0.01031031, "balance_loss_clip": 1.00945365, "balance_loss_mlp": 1.01544619, "epoch": 0.8189388245904103, "flos": 23731825991040.0, "grad_norm": 1.5478853590725143, "language_loss": 0.76712209, "learning_rate": 3.340512006973011e-07, "loss": 0.78792644, "num_input_tokens_seen": 293875540, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.33789062, "step": 13621, "time_per_iteration": 2.380847454071045 }, { "auxiliary_loss_clip": 0.0105013, "auxiliary_loss_mlp": 0.01031834, "balance_loss_clip": 1.00908875, "balance_loss_mlp": 1.01507783, "epoch": 0.8189989478430784, "flos": 28253881486080.0, "grad_norm": 2.3474034889523687, "language_loss": 0.67362976, "learning_rate": 3.3383573864618076e-07, "loss": 0.69444942, "num_input_tokens_seen": 293896570, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.34960938, "step": 13622, "time_per_iteration": 2.468069314956665 }, { "auxiliary_loss_clip": 0.01053237, "auxiliary_loss_mlp": 0.01035504, "balance_loss_clip": 1.01030326, "balance_loss_mlp": 1.01713836, "epoch": 0.8190590710957463, "flos": 21396625326720.0, "grad_norm": 1.8823720170211955, "language_loss": 0.76386696, "learning_rate": 3.3362033977617653e-07, "loss": 0.78475434, "num_input_tokens_seen": 293914680, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36132812, "step": 13623, "time_per_iteration": 2.3773505687713623 }, { "auxiliary_loss_clip": 0.0105218, "auxiliary_loss_mlp": 0.01036659, "balance_loss_clip": 1.01349628, "balance_loss_mlp": 1.01598954, "epoch": 0.8191191943484143, "flos": 38795033197440.0, "grad_norm": 2.0686613226596524, "language_loss": 0.64567065, "learning_rate": 3.3340500409545527e-07, "loss": 0.66655904, "num_input_tokens_seen": 293936480, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.36328125, "step": 13624, "time_per_iteration": 2.5161044597625732 }, { "auxiliary_loss_clip": 0.01050145, "auxiliary_loss_mlp": 0.01035684, "balance_loss_clip": 1.01331997, "balance_loss_mlp": 1.01517975, "epoch": 0.8191793176010822, "flos": 25445363253120.0, "grad_norm": 2.3529825890341476, "language_loss": 0.79795986, "learning_rate": 3.3318973161218386e-07, "loss": 0.81881815, "num_input_tokens_seen": 293957815, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.3515625, "step": 13625, "time_per_iteration": 3.837476968765259 }, { "auxiliary_loss_clip": 0.01055289, "auxiliary_loss_mlp": 0.0104218, "balance_loss_clip": 1.01818275, "balance_loss_mlp": 1.01657236, "epoch": 0.8192394408537502, "flos": 25081847510400.0, "grad_norm": 2.320522951694003, "language_loss": 0.77455622, "learning_rate": 3.329745223345244e-07, "loss": 0.79553092, "num_input_tokens_seen": 293975440, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.38671875, "step": 13626, "time_per_iteration": 2.4145047664642334 }, { "auxiliary_loss_clip": 0.01051399, "auxiliary_loss_mlp": 0.01038376, "balance_loss_clip": 1.01685917, "balance_loss_mlp": 1.01689982, "epoch": 0.8192995641064181, "flos": 27672472748160.0, "grad_norm": 1.4979403470115598, "language_loss": 0.74739105, "learning_rate": 3.3275937627063823e-07, "loss": 0.76828879, "num_input_tokens_seen": 293997540, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34570312, "step": 13627, "time_per_iteration": 2.454164743423462 }, { "auxiliary_loss_clip": 0.0105286, "auxiliary_loss_mlp": 0.01035017, "balance_loss_clip": 1.0117116, "balance_loss_mlp": 1.01701128, "epoch": 0.8193596873590862, "flos": 21287416993920.0, "grad_norm": 1.5057113259201131, "language_loss": 0.70063066, "learning_rate": 3.3254429342868353e-07, "loss": 0.72150946, "num_input_tokens_seen": 294017030, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 13628, "time_per_iteration": 3.6954305171966553 }, { "auxiliary_loss_clip": 0.01055868, "auxiliary_loss_mlp": 0.01035683, "balance_loss_clip": 1.01058984, "balance_loss_mlp": 1.01797557, "epoch": 0.8194198106117541, "flos": 17491694757120.0, "grad_norm": 1.6090719499399269, "language_loss": 0.8593697, "learning_rate": 3.323292738168171e-07, "loss": 0.8802852, "num_input_tokens_seen": 294035700, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37890625, "step": 13629, "time_per_iteration": 2.3951497077941895 }, { "auxiliary_loss_clip": 0.01051668, "auxiliary_loss_mlp": 0.01037761, "balance_loss_clip": 1.01587474, "balance_loss_mlp": 1.01550519, "epoch": 0.8194799338644221, "flos": 15267029057280.0, "grad_norm": 2.306338351042696, "language_loss": 0.74949354, "learning_rate": 3.3211431744319084e-07, "loss": 0.77038783, "num_input_tokens_seen": 294049730, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.36132812, "step": 13630, "time_per_iteration": 2.3223626613616943 }, { "auxiliary_loss_clip": 0.01054298, "auxiliary_loss_mlp": 0.01040144, "balance_loss_clip": 1.01525331, "balance_loss_mlp": 1.01742971, "epoch": 0.81954005711709, "flos": 14717985016320.0, "grad_norm": 1.7963438164423489, "language_loss": 0.72829461, "learning_rate": 3.31899424315957e-07, "loss": 0.74923897, "num_input_tokens_seen": 294066545, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3671875, "step": 13631, "time_per_iteration": 2.3344781398773193 }, { "auxiliary_loss_clip": 0.01052668, "auxiliary_loss_mlp": 0.01038108, "balance_loss_clip": 1.01452839, "balance_loss_mlp": 1.01622975, "epoch": 0.819600180369758, "flos": 23072980124160.0, "grad_norm": 1.5060742121739281, "language_loss": 0.77223766, "learning_rate": 3.3168459444326447e-07, "loss": 0.79314542, "num_input_tokens_seen": 294087455, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36523438, "step": 13632, "time_per_iteration": 2.391266345977783 }, { "auxiliary_loss_clip": 0.01050495, "auxiliary_loss_mlp": 0.01032472, "balance_loss_clip": 1.01060867, "balance_loss_mlp": 1.01526058, "epoch": 0.8196603036224259, "flos": 27598561666560.0, "grad_norm": 1.5940225972081272, "language_loss": 0.67072642, "learning_rate": 3.314698278332588e-07, "loss": 0.69155604, "num_input_tokens_seen": 294107480, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.35351562, "step": 13633, "time_per_iteration": 2.399674654006958 }, { "auxiliary_loss_clip": 0.0104908, "auxiliary_loss_mlp": 0.01038193, "balance_loss_clip": 1.01631808, "balance_loss_mlp": 1.01519191, "epoch": 0.8197204268750939, "flos": 28580843168640.0, "grad_norm": 1.6283420262251564, "language_loss": 0.76385939, "learning_rate": 3.3125512449408513e-07, "loss": 0.78473216, "num_input_tokens_seen": 294130115, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.33984375, "step": 13634, "time_per_iteration": 2.4749715328216553 }, { "auxiliary_loss_clip": 0.01049413, "auxiliary_loss_mlp": 0.01034633, "balance_loss_clip": 1.01228166, "balance_loss_mlp": 1.01543999, "epoch": 0.819780550127762, "flos": 23257181790720.0, "grad_norm": 1.8999218493955068, "language_loss": 0.82405239, "learning_rate": 3.310404844338841e-07, "loss": 0.84489286, "num_input_tokens_seen": 294148495, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.33984375, "step": 13635, "time_per_iteration": 2.359858751296997 }, { "auxiliary_loss_clip": 0.01051191, "auxiliary_loss_mlp": 0.01039724, "balance_loss_clip": 1.01532161, "balance_loss_mlp": 1.01514769, "epoch": 0.8198406733804299, "flos": 26684116669440.0, "grad_norm": 1.6374307372761028, "language_loss": 0.76832461, "learning_rate": 3.308259076607949e-07, "loss": 0.7892338, "num_input_tokens_seen": 294169595, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36132812, "step": 13636, "time_per_iteration": 2.398360252380371 }, { "auxiliary_loss_clip": 0.01049484, "auxiliary_loss_mlp": 0.01036237, "balance_loss_clip": 1.0142312, "balance_loss_mlp": 1.01524186, "epoch": 0.8199007966330979, "flos": 20083053133440.0, "grad_norm": 1.9014296813170228, "language_loss": 0.82381427, "learning_rate": 3.3061139418295445e-07, "loss": 0.84467149, "num_input_tokens_seen": 294183885, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34179688, "step": 13637, "time_per_iteration": 2.354078769683838 }, { "auxiliary_loss_clip": 0.01050614, "auxiliary_loss_mlp": 0.01033511, "balance_loss_clip": 1.01143324, "balance_loss_mlp": 1.01533985, "epoch": 0.8199609198857658, "flos": 31901502268800.0, "grad_norm": 2.216425319537988, "language_loss": 0.72927469, "learning_rate": 3.3039694400849725e-07, "loss": 0.75011593, "num_input_tokens_seen": 294200150, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35351562, "step": 13638, "time_per_iteration": 2.4448435306549072 }, { "auxiliary_loss_clip": 0.01053208, "auxiliary_loss_mlp": 0.01036018, "balance_loss_clip": 1.01043582, "balance_loss_mlp": 1.01566124, "epoch": 0.8200210431384338, "flos": 26468911848960.0, "grad_norm": 2.006578319183566, "language_loss": 0.80338019, "learning_rate": 3.3018255714555564e-07, "loss": 0.82427245, "num_input_tokens_seen": 294220385, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.375, "step": 13639, "time_per_iteration": 2.386749505996704 }, { "auxiliary_loss_clip": 0.01050869, "auxiliary_loss_mlp": 0.01033985, "balance_loss_clip": 1.01109695, "balance_loss_mlp": 1.01579821, "epoch": 0.8200811663911017, "flos": 22090349508480.0, "grad_norm": 1.5767737129119914, "language_loss": 0.80111724, "learning_rate": 3.299682336022589e-07, "loss": 0.82196575, "num_input_tokens_seen": 294239355, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3515625, "step": 13640, "time_per_iteration": 2.3856165409088135 }, { "auxiliary_loss_clip": 0.0105509, "auxiliary_loss_mlp": 0.01041786, "balance_loss_clip": 1.01560783, "balance_loss_mlp": 1.01623034, "epoch": 0.8201412896437698, "flos": 37592240348160.0, "grad_norm": 1.7378667689231904, "language_loss": 0.6367203, "learning_rate": 3.297539733867336e-07, "loss": 0.65768909, "num_input_tokens_seen": 294259395, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38867188, "step": 13641, "time_per_iteration": 3.9470717906951904 }, { "auxiliary_loss_clip": 0.01050683, "auxiliary_loss_mlp": 0.01035085, "balance_loss_clip": 1.0124824, "balance_loss_mlp": 1.01564419, "epoch": 0.8202014128964377, "flos": 19645312106880.0, "grad_norm": 1.7802339919079493, "language_loss": 0.74411994, "learning_rate": 3.295397765071055e-07, "loss": 0.76497757, "num_input_tokens_seen": 294277365, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.3515625, "step": 13642, "time_per_iteration": 2.3357226848602295 }, { "auxiliary_loss_clip": 0.01052997, "auxiliary_loss_mlp": 0.01036954, "balance_loss_clip": 1.01347041, "balance_loss_mlp": 1.0174439, "epoch": 0.8202615361491057, "flos": 31465995569280.0, "grad_norm": 1.7405154711992856, "language_loss": 0.71722579, "learning_rate": 3.2932564297149615e-07, "loss": 0.73812538, "num_input_tokens_seen": 294297555, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35546875, "step": 13643, "time_per_iteration": 2.4540491104125977 }, { "auxiliary_loss_clip": 0.01051301, "auxiliary_loss_mlp": 0.01035106, "balance_loss_clip": 1.0127306, "balance_loss_mlp": 1.01639354, "epoch": 0.8203216594017736, "flos": 24714456606720.0, "grad_norm": 2.109123865090378, "language_loss": 0.66278529, "learning_rate": 3.291115727880256e-07, "loss": 0.68364936, "num_input_tokens_seen": 294317600, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34960938, "step": 13644, "time_per_iteration": 2.3942558765411377 }, { "auxiliary_loss_clip": 0.01053266, "auxiliary_loss_mlp": 0.01035413, "balance_loss_clip": 1.01229787, "balance_loss_mlp": 1.01650798, "epoch": 0.8203817826544416, "flos": 26030612240640.0, "grad_norm": 1.4050123235852237, "language_loss": 0.71522141, "learning_rate": 3.2889756596481234e-07, "loss": 0.73610824, "num_input_tokens_seen": 294340215, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.3671875, "step": 13645, "time_per_iteration": 2.399430513381958 }, { "auxiliary_loss_clip": 0.01050033, "auxiliary_loss_mlp": 0.01037055, "balance_loss_clip": 1.01527524, "balance_loss_mlp": 1.01503658, "epoch": 0.8204419059071095, "flos": 25953454402560.0, "grad_norm": 1.7367781586935402, "language_loss": 0.71933597, "learning_rate": 3.286836225099707e-07, "loss": 0.74020684, "num_input_tokens_seen": 294358590, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.34960938, "step": 13646, "time_per_iteration": 2.4114530086517334 }, { "auxiliary_loss_clip": 0.01051869, "auxiliary_loss_mlp": 0.0103983, "balance_loss_clip": 1.01617849, "balance_loss_mlp": 1.01589632, "epoch": 0.8205020291597775, "flos": 23579116237440.0, "grad_norm": 2.432418979718719, "language_loss": 0.8047933, "learning_rate": 3.284697424316132e-07, "loss": 0.8257103, "num_input_tokens_seen": 294375825, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.359375, "step": 13647, "time_per_iteration": 2.3573670387268066 }, { "auxiliary_loss_clip": 0.01050381, "auxiliary_loss_mlp": 0.01038128, "balance_loss_clip": 1.015836, "balance_loss_mlp": 1.01613474, "epoch": 0.8205621524124456, "flos": 26797898390400.0, "grad_norm": 1.4463165037124226, "language_loss": 0.69187403, "learning_rate": 3.2825592573785034e-07, "loss": 0.71275914, "num_input_tokens_seen": 294398500, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34179688, "step": 13648, "time_per_iteration": 2.447502851486206 }, { "auxiliary_loss_clip": 0.01050857, "auxiliary_loss_mlp": 0.01035254, "balance_loss_clip": 1.01293778, "balance_loss_mlp": 1.01526821, "epoch": 0.8206222756651135, "flos": 27526605621120.0, "grad_norm": 1.7552021762172534, "language_loss": 0.80988908, "learning_rate": 3.28042172436791e-07, "loss": 0.83075011, "num_input_tokens_seen": 294418840, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35546875, "step": 13649, "time_per_iteration": 2.4110682010650635 }, { "auxiliary_loss_clip": 0.01052699, "auxiliary_loss_mlp": 0.01039758, "balance_loss_clip": 1.01552248, "balance_loss_mlp": 1.01712382, "epoch": 0.8206823989177815, "flos": 21177545345280.0, "grad_norm": 1.7849611286931002, "language_loss": 0.69653559, "learning_rate": 3.278284825365396e-07, "loss": 0.71746016, "num_input_tokens_seen": 294438215, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.35546875, "step": 13650, "time_per_iteration": 2.3758177757263184 }, { "auxiliary_loss_clip": 0.01054068, "auxiliary_loss_mlp": 0.01033417, "balance_loss_clip": 1.00969422, "balance_loss_mlp": 1.01670206, "epoch": 0.8207425221704494, "flos": 11508838398720.0, "grad_norm": 2.4164552698358093, "language_loss": 0.6226548, "learning_rate": 3.276148560452001e-07, "loss": 0.64352965, "num_input_tokens_seen": 294455260, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.375, "step": 13651, "time_per_iteration": 2.3309831619262695 }, { "auxiliary_loss_clip": 0.01053787, "auxiliary_loss_mlp": 0.01041017, "balance_loss_clip": 1.01783085, "balance_loss_mlp": 1.01708078, "epoch": 0.8208026454231174, "flos": 19790969765760.0, "grad_norm": 1.9620790485356434, "language_loss": 0.73194975, "learning_rate": 3.2740129297087293e-07, "loss": 0.7528978, "num_input_tokens_seen": 294473205, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3671875, "step": 13652, "time_per_iteration": 2.419726610183716 }, { "auxiliary_loss_clip": 0.01048927, "auxiliary_loss_mlp": 0.01032245, "balance_loss_clip": 1.01000059, "balance_loss_mlp": 1.01581001, "epoch": 0.8208627686757853, "flos": 15666679923840.0, "grad_norm": 1.879729385048439, "language_loss": 0.73662359, "learning_rate": 3.271877933216558e-07, "loss": 0.75743532, "num_input_tokens_seen": 294490645, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.33203125, "step": 13653, "time_per_iteration": 2.325052261352539 }, { "auxiliary_loss_clip": 0.01054125, "auxiliary_loss_mlp": 0.01040013, "balance_loss_clip": 1.01432359, "balance_loss_mlp": 1.01675344, "epoch": 0.8209228919284534, "flos": 37481286447360.0, "grad_norm": 2.4210461574398323, "language_loss": 0.64397931, "learning_rate": 3.269743571056451e-07, "loss": 0.66492069, "num_input_tokens_seen": 294513500, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37304688, "step": 13654, "time_per_iteration": 2.518131971359253 }, { "auxiliary_loss_clip": 0.01053519, "auxiliary_loss_mlp": 0.01036812, "balance_loss_clip": 1.01451957, "balance_loss_mlp": 1.01555622, "epoch": 0.8209830151811213, "flos": 23111838334080.0, "grad_norm": 1.573878586221211, "language_loss": 0.7097019, "learning_rate": 3.2676098433093447e-07, "loss": 0.73060519, "num_input_tokens_seen": 294535710, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.37890625, "step": 13655, "time_per_iteration": 2.374218463897705 }, { "auxiliary_loss_clip": 0.01051606, "auxiliary_loss_mlp": 0.01038536, "balance_loss_clip": 1.01543355, "balance_loss_mlp": 1.01632094, "epoch": 0.8210431384337893, "flos": 21287102791680.0, "grad_norm": 2.1167436207498036, "language_loss": 0.83530742, "learning_rate": 3.265476750056162e-07, "loss": 0.85620886, "num_input_tokens_seen": 294554055, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.3515625, "step": 13656, "time_per_iteration": 2.3978655338287354 }, { "auxiliary_loss_clip": 0.01048442, "auxiliary_loss_mlp": 0.01033802, "balance_loss_clip": 1.01348901, "balance_loss_mlp": 1.01607013, "epoch": 0.8211032616864572, "flos": 11501821215360.0, "grad_norm": 3.101973590447265, "language_loss": 0.7497052, "learning_rate": 3.2633442913777654e-07, "loss": 0.7705276, "num_input_tokens_seen": 294570390, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.32421875, "step": 13657, "time_per_iteration": 2.346237897872925 }, { "auxiliary_loss_clip": 0.01050945, "auxiliary_loss_mlp": 0.0103727, "balance_loss_clip": 1.01377368, "balance_loss_mlp": 1.01568151, "epoch": 0.8211633849391252, "flos": 29820294812160.0, "grad_norm": 1.7592275728503664, "language_loss": 0.569278, "learning_rate": 3.2612124673550325e-07, "loss": 0.59016013, "num_input_tokens_seen": 294593050, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3515625, "step": 13658, "time_per_iteration": 2.432565450668335 }, { "auxiliary_loss_clip": 0.01052458, "auxiliary_loss_mlp": 0.01036111, "balance_loss_clip": 1.01329422, "balance_loss_mlp": 1.01639915, "epoch": 0.8212235081917931, "flos": 13114598693760.0, "grad_norm": 2.2456863124925484, "language_loss": 0.80314744, "learning_rate": 3.259081278068805e-07, "loss": 0.82403314, "num_input_tokens_seen": 294608550, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36132812, "step": 13659, "time_per_iteration": 3.616368532180786 }, { "auxiliary_loss_clip": 0.01046958, "auxiliary_loss_mlp": 0.0102825, "balance_loss_clip": 1.00841379, "balance_loss_mlp": 1.01454568, "epoch": 0.8212836314444611, "flos": 40513772252160.0, "grad_norm": 1.6977335711673756, "language_loss": 0.60716242, "learning_rate": 3.256950723599887e-07, "loss": 0.62791455, "num_input_tokens_seen": 294630380, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.32421875, "step": 13660, "time_per_iteration": 2.532332181930542 }, { "auxiliary_loss_clip": 0.01052209, "auxiliary_loss_mlp": 0.01038461, "balance_loss_clip": 1.01504779, "balance_loss_mlp": 1.01577711, "epoch": 0.8213437546971292, "flos": 18769550762880.0, "grad_norm": 1.8607194847785282, "language_loss": 0.73910785, "learning_rate": 3.254820804029075e-07, "loss": 0.76001453, "num_input_tokens_seen": 294648655, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 13661, "time_per_iteration": 2.3380308151245117 }, { "auxiliary_loss_clip": 0.01051333, "auxiliary_loss_mlp": 0.01037316, "balance_loss_clip": 1.01401043, "balance_loss_mlp": 1.01492476, "epoch": 0.8214038779497971, "flos": 19681272673920.0, "grad_norm": 2.189305543722619, "language_loss": 0.76009095, "learning_rate": 3.252691519437143e-07, "loss": 0.78097749, "num_input_tokens_seen": 294666915, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36328125, "step": 13662, "time_per_iteration": 2.375488519668579 }, { "auxiliary_loss_clip": 0.01007668, "auxiliary_loss_mlp": 0.01002759, "balance_loss_clip": 1.00045872, "balance_loss_mlp": 1.00100398, "epoch": 0.8214640012024651, "flos": 71599457848320.0, "grad_norm": 0.7420737585933828, "language_loss": 0.54107761, "learning_rate": 3.250562869904825e-07, "loss": 0.56118184, "num_input_tokens_seen": 294731545, "router_z_loss_clip": 0.02294922, "router_z_loss_mlp": 0.06640625, "step": 13663, "time_per_iteration": 3.1538243293762207 }, { "auxiliary_loss_clip": 0.01051303, "auxiliary_loss_mlp": 0.01040294, "balance_loss_clip": 1.01682162, "balance_loss_mlp": 1.01489711, "epoch": 0.821524124455133, "flos": 14756319555840.0, "grad_norm": 2.0252667493238854, "language_loss": 0.67170167, "learning_rate": 3.248434855512838e-07, "loss": 0.69261765, "num_input_tokens_seen": 294748745, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 13664, "time_per_iteration": 3.8017237186431885 }, { "auxiliary_loss_clip": 0.01049642, "auxiliary_loss_mlp": 0.01033978, "balance_loss_clip": 1.01324737, "balance_loss_mlp": 1.01655626, "epoch": 0.821584247707801, "flos": 25081114371840.0, "grad_norm": 1.430025292777419, "language_loss": 0.75514597, "learning_rate": 3.246307476341881e-07, "loss": 0.77598214, "num_input_tokens_seen": 294768955, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.33203125, "step": 13665, "time_per_iteration": 2.3990750312805176 }, { "auxiliary_loss_clip": 0.01050885, "auxiliary_loss_mlp": 0.01035987, "balance_loss_clip": 1.0124433, "balance_loss_mlp": 1.01570559, "epoch": 0.8216443709604689, "flos": 36829213384320.0, "grad_norm": 2.1342566014921975, "language_loss": 0.66423064, "learning_rate": 3.2441807324726256e-07, "loss": 0.68509936, "num_input_tokens_seen": 294789250, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.3515625, "step": 13666, "time_per_iteration": 2.4688801765441895 }, { "auxiliary_loss_clip": 0.01051038, "auxiliary_loss_mlp": 0.01037169, "balance_loss_clip": 1.01521039, "balance_loss_mlp": 1.01682162, "epoch": 0.821704494213137, "flos": 25080660524160.0, "grad_norm": 1.7857118734320716, "language_loss": 0.77399957, "learning_rate": 3.2420546239857174e-07, "loss": 0.79488158, "num_input_tokens_seen": 294809760, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34179688, "step": 13667, "time_per_iteration": 2.401698112487793 }, { "auxiliary_loss_clip": 0.01052756, "auxiliary_loss_mlp": 0.0103698, "balance_loss_clip": 1.01315045, "balance_loss_mlp": 1.01629221, "epoch": 0.8217646174658049, "flos": 14355237323520.0, "grad_norm": 1.8006380230745018, "language_loss": 0.78316867, "learning_rate": 3.239929150961773e-07, "loss": 0.80406594, "num_input_tokens_seen": 294826495, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36523438, "step": 13668, "time_per_iteration": 3.638336420059204 }, { "auxiliary_loss_clip": 0.01051074, "auxiliary_loss_mlp": 0.01036507, "balance_loss_clip": 1.01525223, "balance_loss_mlp": 1.01564789, "epoch": 0.8218247407184729, "flos": 22089476724480.0, "grad_norm": 2.0100748046801606, "language_loss": 0.75601101, "learning_rate": 3.2378043134813984e-07, "loss": 0.77688688, "num_input_tokens_seen": 294845370, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.35546875, "step": 13669, "time_per_iteration": 2.387239456176758 }, { "auxiliary_loss_clip": 0.01050515, "auxiliary_loss_mlp": 0.01034499, "balance_loss_clip": 1.01219463, "balance_loss_mlp": 1.01562881, "epoch": 0.8218848639711408, "flos": 16763092260480.0, "grad_norm": 2.4468980491540977, "language_loss": 0.79291874, "learning_rate": 3.235680111625161e-07, "loss": 0.81376886, "num_input_tokens_seen": 294863740, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34960938, "step": 13670, "time_per_iteration": 2.3346989154815674 }, { "auxiliary_loss_clip": 0.01055168, "auxiliary_loss_mlp": 0.01037544, "balance_loss_clip": 1.01240265, "balance_loss_mlp": 1.01705599, "epoch": 0.8219449872238088, "flos": 25993604332800.0, "grad_norm": 1.8258222579839392, "language_loss": 0.76102912, "learning_rate": 3.2335565454736123e-07, "loss": 0.78195626, "num_input_tokens_seen": 294882815, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38085938, "step": 13671, "time_per_iteration": 2.4013445377349854 }, { "auxiliary_loss_clip": 0.01054108, "auxiliary_loss_mlp": 0.01037429, "balance_loss_clip": 1.01213324, "balance_loss_mlp": 1.0159483, "epoch": 0.8220051104764767, "flos": 20777021694720.0, "grad_norm": 1.7824943752643938, "language_loss": 0.78080714, "learning_rate": 3.23143361510728e-07, "loss": 0.80172253, "num_input_tokens_seen": 294901985, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3828125, "step": 13672, "time_per_iteration": 2.3632864952087402 }, { "auxiliary_loss_clip": 0.01051913, "auxiliary_loss_mlp": 0.01040347, "balance_loss_clip": 1.01548052, "balance_loss_mlp": 1.0159359, "epoch": 0.8220652337291448, "flos": 14573968191360.0, "grad_norm": 2.073093528543402, "language_loss": 0.7617296, "learning_rate": 3.2293113206066733e-07, "loss": 0.78265226, "num_input_tokens_seen": 294919705, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.359375, "step": 13673, "time_per_iteration": 2.349086284637451 }, { "auxiliary_loss_clip": 0.01053444, "auxiliary_loss_mlp": 0.01038652, "balance_loss_clip": 1.01417875, "balance_loss_mlp": 1.01659727, "epoch": 0.8221253569818128, "flos": 23804724643200.0, "grad_norm": 1.615873367473832, "language_loss": 0.80120802, "learning_rate": 3.227189662052254e-07, "loss": 0.82212907, "num_input_tokens_seen": 294939900, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36914062, "step": 13674, "time_per_iteration": 2.379401683807373 }, { "auxiliary_loss_clip": 0.01050769, "auxiliary_loss_mlp": 0.01037696, "balance_loss_clip": 1.01343679, "balance_loss_mlp": 1.0157249, "epoch": 0.8221854802344807, "flos": 21287172614400.0, "grad_norm": 1.7303943918478601, "language_loss": 0.71977067, "learning_rate": 3.225068639524484e-07, "loss": 0.7406553, "num_input_tokens_seen": 294959110, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.34960938, "step": 13675, "time_per_iteration": 2.363065004348755 }, { "auxiliary_loss_clip": 0.01050633, "auxiliary_loss_mlp": 0.01035168, "balance_loss_clip": 1.013901, "balance_loss_mlp": 1.01612639, "epoch": 0.8222456034871487, "flos": 20955812100480.0, "grad_norm": 2.6098853065842924, "language_loss": 0.75069153, "learning_rate": 3.2229482531037965e-07, "loss": 0.77154952, "num_input_tokens_seen": 294978660, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34570312, "step": 13676, "time_per_iteration": 2.3545026779174805 }, { "auxiliary_loss_clip": 0.01052071, "auxiliary_loss_mlp": 0.01034813, "balance_loss_clip": 1.01237726, "balance_loss_mlp": 1.01603508, "epoch": 0.8223057267398166, "flos": 21396450769920.0, "grad_norm": 1.809011174321349, "language_loss": 0.8122431, "learning_rate": 3.2208285028705893e-07, "loss": 0.833112, "num_input_tokens_seen": 294998075, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.36132812, "step": 13677, "time_per_iteration": 2.4093449115753174 }, { "auxiliary_loss_clip": 0.01052746, "auxiliary_loss_mlp": 0.01038322, "balance_loss_clip": 1.01579142, "balance_loss_mlp": 1.0159483, "epoch": 0.8223658499924846, "flos": 15267308348160.0, "grad_norm": 1.7549310263364226, "language_loss": 0.7133922, "learning_rate": 3.218709388905245e-07, "loss": 0.73430282, "num_input_tokens_seen": 295015950, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3671875, "step": 13678, "time_per_iteration": 2.328145980834961 }, { "auxiliary_loss_clip": 0.01050454, "auxiliary_loss_mlp": 0.01038285, "balance_loss_clip": 1.01602888, "balance_loss_mlp": 1.01560354, "epoch": 0.8224259732451525, "flos": 31248172396800.0, "grad_norm": 1.4069972995959763, "language_loss": 0.72585535, "learning_rate": 3.216590911288133e-07, "loss": 0.74674273, "num_input_tokens_seen": 295036800, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34960938, "step": 13679, "time_per_iteration": 2.472179412841797 }, { "auxiliary_loss_clip": 0.01050499, "auxiliary_loss_mlp": 0.01030969, "balance_loss_clip": 1.00834274, "balance_loss_mlp": 1.01498365, "epoch": 0.8224860964978206, "flos": 21573705075840.0, "grad_norm": 2.02662258294978, "language_loss": 0.71132404, "learning_rate": 3.214473070099564e-07, "loss": 0.73213869, "num_input_tokens_seen": 295055300, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35546875, "step": 13680, "time_per_iteration": 2.34512996673584 }, { "auxiliary_loss_clip": 0.01051273, "auxiliary_loss_mlp": 0.01034785, "balance_loss_clip": 1.01201606, "balance_loss_mlp": 1.01563263, "epoch": 0.8225462197504885, "flos": 25482056958720.0, "grad_norm": 1.9510840540928875, "language_loss": 0.60647523, "learning_rate": 3.21235586541986e-07, "loss": 0.62733579, "num_input_tokens_seen": 295076420, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35546875, "step": 13681, "time_per_iteration": 3.8736627101898193 }, { "auxiliary_loss_clip": 0.01053923, "auxiliary_loss_mlp": 0.01035557, "balance_loss_clip": 1.01165557, "balance_loss_mlp": 1.01605809, "epoch": 0.8226063430031565, "flos": 39383878055040.0, "grad_norm": 1.5680507605801115, "language_loss": 0.70608044, "learning_rate": 3.2102392973293047e-07, "loss": 0.72697532, "num_input_tokens_seen": 295100540, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37890625, "step": 13682, "time_per_iteration": 2.5191221237182617 }, { "auxiliary_loss_clip": 0.01053147, "auxiliary_loss_mlp": 0.01043877, "balance_loss_clip": 1.01784194, "balance_loss_mlp": 1.01644659, "epoch": 0.8226664662558244, "flos": 22814308794240.0, "grad_norm": 1.8693623397348311, "language_loss": 0.80388629, "learning_rate": 3.20812336590816e-07, "loss": 0.82485664, "num_input_tokens_seen": 295120180, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3671875, "step": 13683, "time_per_iteration": 2.4139251708984375 }, { "auxiliary_loss_clip": 0.01048606, "auxiliary_loss_mlp": 0.01036652, "balance_loss_clip": 1.01546884, "balance_loss_mlp": 1.0151968, "epoch": 0.8227265895084924, "flos": 25664443234560.0, "grad_norm": 1.9077755716809752, "language_loss": 0.87738049, "learning_rate": 3.206008071236661e-07, "loss": 0.89823306, "num_input_tokens_seen": 295138530, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.33398438, "step": 13684, "time_per_iteration": 2.378774642944336 }, { "auxiliary_loss_clip": 0.01049373, "auxiliary_loss_mlp": 0.01036183, "balance_loss_clip": 1.01389074, "balance_loss_mlp": 1.01505518, "epoch": 0.8227867127611603, "flos": 26178015467520.0, "grad_norm": 1.4507026702346044, "language_loss": 0.80641437, "learning_rate": 3.2038934133950157e-07, "loss": 0.82726991, "num_input_tokens_seen": 295160260, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34375, "step": 13685, "time_per_iteration": 2.4716949462890625 }, { "auxiliary_loss_clip": 0.01052829, "auxiliary_loss_mlp": 0.01040829, "balance_loss_clip": 1.01872778, "balance_loss_mlp": 1.0160706, "epoch": 0.8228468360138284, "flos": 22016962097280.0, "grad_norm": 1.5525981825681079, "language_loss": 0.69723439, "learning_rate": 3.2017793924634194e-07, "loss": 0.718171, "num_input_tokens_seen": 295177055, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3671875, "step": 13686, "time_per_iteration": 2.3744308948516846 }, { "auxiliary_loss_clip": 0.01052174, "auxiliary_loss_mlp": 0.01039551, "balance_loss_clip": 1.0151248, "balance_loss_mlp": 1.01495135, "epoch": 0.8229069592664963, "flos": 14902465973760.0, "grad_norm": 1.930919656493472, "language_loss": 0.78772295, "learning_rate": 3.1996660085220263e-07, "loss": 0.80864024, "num_input_tokens_seen": 295193870, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37109375, "step": 13687, "time_per_iteration": 2.3539745807647705 }, { "auxiliary_loss_clip": 0.0105235, "auxiliary_loss_mlp": 0.01038446, "balance_loss_clip": 1.01430631, "balance_loss_mlp": 1.01625586, "epoch": 0.8229670825191643, "flos": 15668565137280.0, "grad_norm": 1.7758782421818677, "language_loss": 0.73762667, "learning_rate": 3.1975532616509825e-07, "loss": 0.75853467, "num_input_tokens_seen": 295211040, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36132812, "step": 13688, "time_per_iteration": 2.313310146331787 }, { "auxiliary_loss_clip": 0.01051268, "auxiliary_loss_mlp": 0.01036988, "balance_loss_clip": 1.01481509, "balance_loss_mlp": 1.01615644, "epoch": 0.8230272057718323, "flos": 23182432836480.0, "grad_norm": 1.6903391610442347, "language_loss": 0.74015427, "learning_rate": 3.1954411519304025e-07, "loss": 0.76103687, "num_input_tokens_seen": 295231300, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3515625, "step": 13689, "time_per_iteration": 2.4368338584899902 }, { "auxiliary_loss_clip": 0.01051561, "auxiliary_loss_mlp": 0.01037974, "balance_loss_clip": 1.01475191, "balance_loss_mlp": 1.01543069, "epoch": 0.8230873290245002, "flos": 21031364016000.0, "grad_norm": 1.8818249213994498, "language_loss": 0.69594038, "learning_rate": 3.1933296794403887e-07, "loss": 0.71683574, "num_input_tokens_seen": 295251045, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.359375, "step": 13690, "time_per_iteration": 2.362656354904175 }, { "auxiliary_loss_clip": 0.01052087, "auxiliary_loss_mlp": 0.01038551, "balance_loss_clip": 1.0155673, "balance_loss_mlp": 1.01586604, "epoch": 0.8231474522771682, "flos": 21249117365760.0, "grad_norm": 1.6793630829494646, "language_loss": 0.86230582, "learning_rate": 3.191218844260988e-07, "loss": 0.88321221, "num_input_tokens_seen": 295270225, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36132812, "step": 13691, "time_per_iteration": 2.4108457565307617 }, { "auxiliary_loss_clip": 0.01051606, "auxiliary_loss_mlp": 0.01032243, "balance_loss_clip": 1.01021302, "balance_loss_mlp": 1.01633859, "epoch": 0.8232075755298361, "flos": 23840894678400.0, "grad_norm": 2.559678180374444, "language_loss": 0.78346282, "learning_rate": 3.189108646472252e-07, "loss": 0.80430126, "num_input_tokens_seen": 295288950, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3515625, "step": 13692, "time_per_iteration": 2.374525308609009 }, { "auxiliary_loss_clip": 0.01050421, "auxiliary_loss_mlp": 0.01031967, "balance_loss_clip": 1.00974655, "balance_loss_mlp": 1.01560473, "epoch": 0.8232676987825042, "flos": 21652852861440.0, "grad_norm": 1.4796838820272846, "language_loss": 0.72551727, "learning_rate": 3.186999086154205e-07, "loss": 0.74634123, "num_input_tokens_seen": 295309405, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34765625, "step": 13693, "time_per_iteration": 2.4065725803375244 }, { "auxiliary_loss_clip": 0.0104909, "auxiliary_loss_mlp": 0.01034599, "balance_loss_clip": 1.01364207, "balance_loss_mlp": 1.0145061, "epoch": 0.8233278220351721, "flos": 26321508622080.0, "grad_norm": 1.3351774240298224, "language_loss": 0.844064, "learning_rate": 3.1848901633868355e-07, "loss": 0.86490089, "num_input_tokens_seen": 295331115, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34570312, "step": 13694, "time_per_iteration": 2.398897409439087 }, { "auxiliary_loss_clip": 0.01052029, "auxiliary_loss_mlp": 0.01035898, "balance_loss_clip": 1.01283145, "balance_loss_mlp": 1.01610231, "epoch": 0.8233879452878401, "flos": 21724739084160.0, "grad_norm": 2.0761203967320543, "language_loss": 0.78034443, "learning_rate": 3.182781878250118e-07, "loss": 0.8012237, "num_input_tokens_seen": 295350495, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.359375, "step": 13695, "time_per_iteration": 2.4076693058013916 }, { "auxiliary_loss_clip": 0.01052164, "auxiliary_loss_mlp": 0.01034552, "balance_loss_clip": 1.01241493, "balance_loss_mlp": 1.01712561, "epoch": 0.823448068540508, "flos": 20556719815680.0, "grad_norm": 1.9007416577707212, "language_loss": 0.82522988, "learning_rate": 3.1806742308239985e-07, "loss": 0.84609699, "num_input_tokens_seen": 295368225, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3515625, "step": 13696, "time_per_iteration": 2.348827362060547 }, { "auxiliary_loss_clip": 0.01007741, "auxiliary_loss_mlp": 0.01003216, "balance_loss_clip": 1.00107014, "balance_loss_mlp": 1.0009315, "epoch": 0.823508191793176, "flos": 67269947834880.0, "grad_norm": 0.7456124811532253, "language_loss": 0.63992459, "learning_rate": 3.178567221188393e-07, "loss": 0.66003418, "num_input_tokens_seen": 295430035, "router_z_loss_clip": 0.02148438, "router_z_loss_mlp": 0.06835938, "step": 13697, "time_per_iteration": 3.1126952171325684 }, { "auxiliary_loss_clip": 0.01048768, "auxiliary_loss_mlp": 0.01033674, "balance_loss_clip": 1.01265693, "balance_loss_mlp": 1.01534414, "epoch": 0.8235683150458439, "flos": 17927515658880.0, "grad_norm": 1.6683153295939304, "language_loss": 0.73709458, "learning_rate": 3.1764608494232037e-07, "loss": 0.75791895, "num_input_tokens_seen": 295447765, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.3359375, "step": 13698, "time_per_iteration": 3.6471986770629883 }, { "auxiliary_loss_clip": 0.01051791, "auxiliary_loss_mlp": 0.01034473, "balance_loss_clip": 1.00973666, "balance_loss_mlp": 1.01578915, "epoch": 0.823628438298512, "flos": 18915103687680.0, "grad_norm": 2.083305990538955, "language_loss": 0.73110628, "learning_rate": 3.174355115608305e-07, "loss": 0.75196898, "num_input_tokens_seen": 295464810, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.359375, "step": 13699, "time_per_iteration": 2.3676772117614746 }, { "auxiliary_loss_clip": 0.01050724, "auxiliary_loss_mlp": 0.01033229, "balance_loss_clip": 1.01172388, "balance_loss_mlp": 1.01573539, "epoch": 0.8236885615511799, "flos": 18695500035840.0, "grad_norm": 2.2577930345676887, "language_loss": 0.82995224, "learning_rate": 3.1722500198235526e-07, "loss": 0.85079175, "num_input_tokens_seen": 295482605, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34960938, "step": 13700, "time_per_iteration": 2.327467679977417 }, { "auxiliary_loss_clip": 0.01051805, "auxiliary_loss_mlp": 0.01035981, "balance_loss_clip": 1.0131526, "balance_loss_mlp": 1.01564407, "epoch": 0.8237486848038479, "flos": 23693910387840.0, "grad_norm": 1.8268670295827365, "language_loss": 0.73855007, "learning_rate": 3.170145562148763e-07, "loss": 0.75942791, "num_input_tokens_seen": 295503780, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36328125, "step": 13701, "time_per_iteration": 2.4037322998046875 }, { "auxiliary_loss_clip": 0.01053038, "auxiliary_loss_mlp": 0.01034959, "balance_loss_clip": 1.0098182, "balance_loss_mlp": 1.01631689, "epoch": 0.8238088080565159, "flos": 23440161559680.0, "grad_norm": 2.00875775776149, "language_loss": 0.70594144, "learning_rate": 3.1680417426637384e-07, "loss": 0.72682142, "num_input_tokens_seen": 295522035, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3671875, "step": 13702, "time_per_iteration": 2.368945837020874 }, { "auxiliary_loss_clip": 0.01052834, "auxiliary_loss_mlp": 0.01035992, "balance_loss_clip": 1.01155365, "balance_loss_mlp": 1.0167563, "epoch": 0.8238689313091838, "flos": 22745459859840.0, "grad_norm": 1.7779821416780852, "language_loss": 0.7594527, "learning_rate": 3.1659385614482603e-07, "loss": 0.78034103, "num_input_tokens_seen": 295541190, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36132812, "step": 13703, "time_per_iteration": 2.393434762954712 }, { "auxiliary_loss_clip": 0.01053456, "auxiliary_loss_mlp": 0.01040066, "balance_loss_clip": 1.0155921, "balance_loss_mlp": 1.01590419, "epoch": 0.8239290545618518, "flos": 25628901603840.0, "grad_norm": 2.266470965689381, "language_loss": 0.71195197, "learning_rate": 3.1638360185820755e-07, "loss": 0.73288715, "num_input_tokens_seen": 295558860, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.375, "step": 13704, "time_per_iteration": 2.3892483711242676 }, { "auxiliary_loss_clip": 0.010495, "auxiliary_loss_mlp": 0.01038807, "balance_loss_clip": 1.01515639, "balance_loss_mlp": 1.01472795, "epoch": 0.8239891778145197, "flos": 26025410448000.0, "grad_norm": 1.9029960182633554, "language_loss": 0.65494573, "learning_rate": 3.161734114144916e-07, "loss": 0.67582887, "num_input_tokens_seen": 295578155, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.34765625, "step": 13705, "time_per_iteration": 3.8251585960388184 }, { "auxiliary_loss_clip": 0.01052628, "auxiliary_loss_mlp": 0.01035112, "balance_loss_clip": 1.01119828, "balance_loss_mlp": 1.01596391, "epoch": 0.8240493010671878, "flos": 21832236760320.0, "grad_norm": 2.067869870464641, "language_loss": 0.70667964, "learning_rate": 3.1596328482164915e-07, "loss": 0.72755706, "num_input_tokens_seen": 295599170, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.3671875, "step": 13706, "time_per_iteration": 2.3849499225616455 }, { "auxiliary_loss_clip": 0.01051745, "auxiliary_loss_mlp": 0.01036064, "balance_loss_clip": 1.013641, "balance_loss_mlp": 1.01678371, "epoch": 0.8241094243198557, "flos": 18550924629120.0, "grad_norm": 1.6907490182608942, "language_loss": 0.70994508, "learning_rate": 3.157532220876475e-07, "loss": 0.73082322, "num_input_tokens_seen": 295617465, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34960938, "step": 13707, "time_per_iteration": 3.726036548614502 }, { "auxiliary_loss_clip": 0.0105212, "auxiliary_loss_mlp": 0.01036384, "balance_loss_clip": 1.01244688, "balance_loss_mlp": 1.01534212, "epoch": 0.8241695475725237, "flos": 25445991657600.0, "grad_norm": 1.8775499190859868, "language_loss": 0.80200076, "learning_rate": 3.1554322322045226e-07, "loss": 0.82288575, "num_input_tokens_seen": 295634960, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.3671875, "step": 13708, "time_per_iteration": 2.3908145427703857 }, { "auxiliary_loss_clip": 0.0105183, "auxiliary_loss_mlp": 0.01036161, "balance_loss_clip": 1.01318884, "balance_loss_mlp": 1.01571441, "epoch": 0.8242296708251916, "flos": 18988665655680.0, "grad_norm": 2.094011680408907, "language_loss": 0.69728827, "learning_rate": 3.1533328822802664e-07, "loss": 0.71816814, "num_input_tokens_seen": 295652725, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.36132812, "step": 13709, "time_per_iteration": 2.349421739578247 }, { "auxiliary_loss_clip": 0.01052518, "auxiliary_loss_mlp": 0.010359, "balance_loss_clip": 1.01273739, "balance_loss_mlp": 1.01680636, "epoch": 0.8242897940778596, "flos": 22599802200960.0, "grad_norm": 2.043366930458977, "language_loss": 0.83966368, "learning_rate": 3.151234171183319e-07, "loss": 0.86054784, "num_input_tokens_seen": 295671195, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.35742188, "step": 13710, "time_per_iteration": 2.384967803955078 }, { "auxiliary_loss_clip": 0.01050417, "auxiliary_loss_mlp": 0.01038897, "balance_loss_clip": 1.01618779, "balance_loss_mlp": 1.01473761, "epoch": 0.8243499173305275, "flos": 21467150006400.0, "grad_norm": 1.854981875320383, "language_loss": 0.79263496, "learning_rate": 3.149136098993257e-07, "loss": 0.81352806, "num_input_tokens_seen": 295689130, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35742188, "step": 13711, "time_per_iteration": 2.3450136184692383 }, { "auxiliary_loss_clip": 0.0105149, "auxiliary_loss_mlp": 0.01036184, "balance_loss_clip": 1.0132246, "balance_loss_mlp": 1.0157392, "epoch": 0.8244100405831956, "flos": 20009351520000.0, "grad_norm": 1.8966336712692795, "language_loss": 0.67359197, "learning_rate": 3.1470386657896473e-07, "loss": 0.69446874, "num_input_tokens_seen": 295706385, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35742188, "step": 13712, "time_per_iteration": 2.3466148376464844 }, { "auxiliary_loss_clip": 0.01052535, "auxiliary_loss_mlp": 0.01040053, "balance_loss_clip": 1.01517439, "balance_loss_mlp": 1.0165453, "epoch": 0.8244701638358635, "flos": 26429529968640.0, "grad_norm": 1.7683742884443596, "language_loss": 0.7514528, "learning_rate": 3.14494187165202e-07, "loss": 0.77237868, "num_input_tokens_seen": 295727925, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.359375, "step": 13713, "time_per_iteration": 2.3883330821990967 }, { "auxiliary_loss_clip": 0.01051023, "auxiliary_loss_mlp": 0.01032342, "balance_loss_clip": 1.00772572, "balance_loss_mlp": 1.01480937, "epoch": 0.8245302870885315, "flos": 17639028161280.0, "grad_norm": 1.9502625444840858, "language_loss": 0.82376063, "learning_rate": 3.1428457166598833e-07, "loss": 0.84459436, "num_input_tokens_seen": 295744420, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36132812, "step": 13714, "time_per_iteration": 2.3304924964904785 }, { "auxiliary_loss_clip": 0.01052617, "auxiliary_loss_mlp": 0.01040467, "balance_loss_clip": 1.01670885, "balance_loss_mlp": 1.01726341, "epoch": 0.8245904103411995, "flos": 26208425128320.0, "grad_norm": 1.6623445712396068, "language_loss": 0.67541242, "learning_rate": 3.1407502008927235e-07, "loss": 0.69634324, "num_input_tokens_seen": 295765105, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35351562, "step": 13715, "time_per_iteration": 2.431386947631836 }, { "auxiliary_loss_clip": 0.0105384, "auxiliary_loss_mlp": 0.01035002, "balance_loss_clip": 1.01191115, "balance_loss_mlp": 1.01662421, "epoch": 0.8246505335938674, "flos": 24203991484800.0, "grad_norm": 2.3056388832187213, "language_loss": 0.76429844, "learning_rate": 3.1386553244300086e-07, "loss": 0.78518689, "num_input_tokens_seen": 295784200, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.37109375, "step": 13716, "time_per_iteration": 2.3897364139556885 }, { "auxiliary_loss_clip": 0.01007645, "auxiliary_loss_mlp": 0.0100343, "balance_loss_clip": 1.00128424, "balance_loss_mlp": 1.00077212, "epoch": 0.8247106568465354, "flos": 67088434343040.0, "grad_norm": 0.7201942139237212, "language_loss": 0.59022033, "learning_rate": 3.136561087351175e-07, "loss": 0.61033112, "num_input_tokens_seen": 295846555, "router_z_loss_clip": 0.02148438, "router_z_loss_mlp": 0.06884766, "step": 13717, "time_per_iteration": 3.1317803859710693 }, { "auxiliary_loss_clip": 0.01050625, "auxiliary_loss_mlp": 0.01036374, "balance_loss_clip": 1.01489198, "balance_loss_mlp": 1.01497698, "epoch": 0.8247707800992033, "flos": 12567404954880.0, "grad_norm": 2.2083422575513496, "language_loss": 0.81297016, "learning_rate": 3.1344674897356373e-07, "loss": 0.83384007, "num_input_tokens_seen": 295863425, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.35546875, "step": 13718, "time_per_iteration": 2.352323532104492 }, { "auxiliary_loss_clip": 0.01049063, "auxiliary_loss_mlp": 0.01040082, "balance_loss_clip": 1.01844597, "balance_loss_mlp": 1.01490891, "epoch": 0.8248309033518714, "flos": 15922732901760.0, "grad_norm": 1.9954337661688828, "language_loss": 0.69619828, "learning_rate": 3.132374531662778e-07, "loss": 0.71708977, "num_input_tokens_seen": 295880925, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.33984375, "step": 13719, "time_per_iteration": 2.3277177810668945 }, { "auxiliary_loss_clip": 0.01052147, "auxiliary_loss_mlp": 0.01039859, "balance_loss_clip": 1.01382399, "balance_loss_mlp": 1.01532471, "epoch": 0.8248910266045393, "flos": 17563825359360.0, "grad_norm": 2.242367341032914, "language_loss": 0.70832318, "learning_rate": 3.13028221321197e-07, "loss": 0.72924328, "num_input_tokens_seen": 295898205, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3671875, "step": 13720, "time_per_iteration": 2.3641624450683594 }, { "auxiliary_loss_clip": 0.01053947, "auxiliary_loss_mlp": 0.01038022, "balance_loss_clip": 1.01304746, "balance_loss_mlp": 1.01688361, "epoch": 0.8249511498572073, "flos": 28618444569600.0, "grad_norm": 1.6906866637176536, "language_loss": 0.76969123, "learning_rate": 3.1281905344625467e-07, "loss": 0.79061091, "num_input_tokens_seen": 295918130, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37109375, "step": 13721, "time_per_iteration": 3.868593454360962 }, { "auxiliary_loss_clip": 0.01050118, "auxiliary_loss_mlp": 0.0103269, "balance_loss_clip": 1.01255548, "balance_loss_mlp": 1.0152241, "epoch": 0.8250112731098752, "flos": 25555409458560.0, "grad_norm": 1.8044308012485728, "language_loss": 0.78912723, "learning_rate": 3.1260994954938305e-07, "loss": 0.80995524, "num_input_tokens_seen": 295937760, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.34765625, "step": 13722, "time_per_iteration": 2.396998167037964 }, { "auxiliary_loss_clip": 0.01050425, "auxiliary_loss_mlp": 0.01032665, "balance_loss_clip": 1.01130223, "balance_loss_mlp": 1.01596189, "epoch": 0.8250713963625432, "flos": 27744917552640.0, "grad_norm": 1.7520334927874042, "language_loss": 0.64098966, "learning_rate": 3.1240090963851205e-07, "loss": 0.66182053, "num_input_tokens_seen": 295957585, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34375, "step": 13723, "time_per_iteration": 2.427154064178467 }, { "auxiliary_loss_clip": 0.01052913, "auxiliary_loss_mlp": 0.01037824, "balance_loss_clip": 1.01373231, "balance_loss_mlp": 1.0165993, "epoch": 0.8251315196152111, "flos": 21609700554240.0, "grad_norm": 1.4859289116893657, "language_loss": 0.75809765, "learning_rate": 3.121919337215666e-07, "loss": 0.77900505, "num_input_tokens_seen": 295977135, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36328125, "step": 13724, "time_per_iteration": 2.371713876724243 }, { "auxiliary_loss_clip": 0.01052832, "auxiliary_loss_mlp": 0.01041425, "balance_loss_clip": 1.01659405, "balance_loss_mlp": 1.01658201, "epoch": 0.8251916428678792, "flos": 28578259728000.0, "grad_norm": 2.3848005751551757, "language_loss": 0.64839172, "learning_rate": 3.1198302180647253e-07, "loss": 0.66933435, "num_input_tokens_seen": 295996265, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36328125, "step": 13725, "time_per_iteration": 2.416520118713379 }, { "auxiliary_loss_clip": 0.01050748, "auxiliary_loss_mlp": 0.01041174, "balance_loss_clip": 1.01833391, "balance_loss_mlp": 1.01587284, "epoch": 0.8252517661205471, "flos": 23074097287680.0, "grad_norm": 1.5642246146217766, "language_loss": 0.82790619, "learning_rate": 3.1177417390115125e-07, "loss": 0.8488254, "num_input_tokens_seen": 296014745, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34765625, "step": 13726, "time_per_iteration": 2.3837780952453613 }, { "auxiliary_loss_clip": 0.01047529, "auxiliary_loss_mlp": 0.0103571, "balance_loss_clip": 1.01565921, "balance_loss_mlp": 1.0140605, "epoch": 0.8253118893732151, "flos": 31758218582400.0, "grad_norm": 1.69345792485503, "language_loss": 0.71831417, "learning_rate": 3.1156539001352286e-07, "loss": 0.73914659, "num_input_tokens_seen": 296036960, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.33398438, "step": 13727, "time_per_iteration": 2.457618474960327 }, { "auxiliary_loss_clip": 0.0105435, "auxiliary_loss_mlp": 0.01039851, "balance_loss_clip": 1.0138638, "balance_loss_mlp": 1.01714301, "epoch": 0.8253720126258831, "flos": 18295430232960.0, "grad_norm": 1.7039133327494973, "language_loss": 0.63871086, "learning_rate": 3.113566701515036e-07, "loss": 0.65965283, "num_input_tokens_seen": 296056540, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37304688, "step": 13728, "time_per_iteration": 2.373444080352783 }, { "auxiliary_loss_clip": 0.01054819, "auxiliary_loss_mlp": 0.0103844, "balance_loss_clip": 1.01403749, "balance_loss_mlp": 1.01728225, "epoch": 0.825432135878551, "flos": 26796117911040.0, "grad_norm": 1.8318426646296377, "language_loss": 0.72683394, "learning_rate": 3.111480143230092e-07, "loss": 0.74776649, "num_input_tokens_seen": 296077950, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.375, "step": 13729, "time_per_iteration": 2.390575647354126 }, { "auxiliary_loss_clip": 0.01007064, "auxiliary_loss_mlp": 0.01002672, "balance_loss_clip": 1.0003233, "balance_loss_mlp": 1.00046849, "epoch": 0.825492259131219, "flos": 54216552487680.0, "grad_norm": 0.8471337922207444, "language_loss": 0.62750763, "learning_rate": 3.109394225359514e-07, "loss": 0.64760494, "num_input_tokens_seen": 296127060, "router_z_loss_clip": 0.0234375, "router_z_loss_mlp": 0.06591797, "step": 13730, "time_per_iteration": 2.8276736736297607 }, { "auxiliary_loss_clip": 0.01051951, "auxiliary_loss_mlp": 0.01037475, "balance_loss_clip": 1.01548135, "balance_loss_mlp": 1.01659226, "epoch": 0.825552382383887, "flos": 43754655162240.0, "grad_norm": 2.002134561137746, "language_loss": 0.65245491, "learning_rate": 3.1073089479823945e-07, "loss": 0.6733492, "num_input_tokens_seen": 296147775, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35351562, "step": 13731, "time_per_iteration": 2.546267509460449 }, { "auxiliary_loss_clip": 0.01053281, "auxiliary_loss_mlp": 0.01037294, "balance_loss_clip": 1.01453674, "balance_loss_mlp": 1.0155654, "epoch": 0.825612505636555, "flos": 12602038890240.0, "grad_norm": 2.0595488610429675, "language_loss": 0.70732892, "learning_rate": 3.105224311177812e-07, "loss": 0.72823465, "num_input_tokens_seen": 296163560, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.37695312, "step": 13732, "time_per_iteration": 2.3264107704162598 }, { "auxiliary_loss_clip": 0.01053072, "auxiliary_loss_mlp": 0.01042227, "balance_loss_clip": 1.0168829, "balance_loss_mlp": 1.01567078, "epoch": 0.8256726288892229, "flos": 17594863424640.0, "grad_norm": 3.241746381902063, "language_loss": 0.7422685, "learning_rate": 3.103140315024817e-07, "loss": 0.7632215, "num_input_tokens_seen": 296178730, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37304688, "step": 13733, "time_per_iteration": 2.3161416053771973 }, { "auxiliary_loss_clip": 0.01048847, "auxiliary_loss_mlp": 0.01032185, "balance_loss_clip": 1.01110864, "balance_loss_mlp": 1.01472259, "epoch": 0.8257327521418909, "flos": 23804654820480.0, "grad_norm": 1.4962230508326781, "language_loss": 0.82774031, "learning_rate": 3.1010569596024437e-07, "loss": 0.84855068, "num_input_tokens_seen": 296200175, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34179688, "step": 13734, "time_per_iteration": 2.4225738048553467 }, { "auxiliary_loss_clip": 0.01050733, "auxiliary_loss_mlp": 0.01030468, "balance_loss_clip": 1.00889063, "balance_loss_mlp": 1.01661229, "epoch": 0.8257928753945588, "flos": 19280120618880.0, "grad_norm": 1.8422566488163872, "language_loss": 0.84302306, "learning_rate": 3.098974244989676e-07, "loss": 0.8638351, "num_input_tokens_seen": 296219305, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.33984375, "step": 13735, "time_per_iteration": 2.333651304244995 }, { "auxiliary_loss_clip": 0.01051862, "auxiliary_loss_mlp": 0.01032221, "balance_loss_clip": 1.01177704, "balance_loss_mlp": 1.01654196, "epoch": 0.8258529986472268, "flos": 18477851420160.0, "grad_norm": 1.9368549063382565, "language_loss": 0.7217797, "learning_rate": 3.096892171265497e-07, "loss": 0.74262047, "num_input_tokens_seen": 296236945, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.35351562, "step": 13736, "time_per_iteration": 2.381186008453369 }, { "auxiliary_loss_clip": 0.01007357, "auxiliary_loss_mlp": 0.01003346, "balance_loss_clip": 1.00123656, "balance_loss_mlp": 1.00075221, "epoch": 0.8259131218998947, "flos": 62135133200640.0, "grad_norm": 0.8596043439707867, "language_loss": 0.68088019, "learning_rate": 3.0948107385088665e-07, "loss": 0.70098722, "num_input_tokens_seen": 296294685, "router_z_loss_clip": 0.02111816, "router_z_loss_mlp": 0.06591797, "step": 13737, "time_per_iteration": 3.0227253437042236 }, { "auxiliary_loss_clip": 0.01052281, "auxiliary_loss_mlp": 0.01034006, "balance_loss_clip": 1.01192832, "balance_loss_mlp": 1.01600826, "epoch": 0.8259732451525628, "flos": 22158081279360.0, "grad_norm": 1.8930576818336164, "language_loss": 0.70642078, "learning_rate": 3.0927299467987e-07, "loss": 0.72728372, "num_input_tokens_seen": 296314790, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.36328125, "step": 13738, "time_per_iteration": 3.616687536239624 }, { "auxiliary_loss_clip": 0.01055874, "auxiliary_loss_mlp": 0.01044219, "balance_loss_clip": 1.01733685, "balance_loss_mlp": 1.01789188, "epoch": 0.8260333684052307, "flos": 38360154902400.0, "grad_norm": 2.247222552464759, "language_loss": 0.64482176, "learning_rate": 3.090649796213911e-07, "loss": 0.66582274, "num_input_tokens_seen": 296335355, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.37890625, "step": 13739, "time_per_iteration": 2.507190704345703 }, { "auxiliary_loss_clip": 0.01007507, "auxiliary_loss_mlp": 0.01001849, "balance_loss_clip": 0.99967909, "balance_loss_mlp": 1.00071645, "epoch": 0.8260934916578987, "flos": 62182474871040.0, "grad_norm": 0.8194267873820062, "language_loss": 0.59452707, "learning_rate": 3.0885702868333853e-07, "loss": 0.61462063, "num_input_tokens_seen": 296399885, "router_z_loss_clip": 0.02172852, "router_z_loss_mlp": 0.06787109, "step": 13740, "time_per_iteration": 3.0841991901397705 }, { "auxiliary_loss_clip": 0.01054657, "auxiliary_loss_mlp": 0.01040965, "balance_loss_clip": 1.0150969, "balance_loss_mlp": 1.01659513, "epoch": 0.8261536149105667, "flos": 22564365304320.0, "grad_norm": 1.9198958775916841, "language_loss": 0.76910233, "learning_rate": 3.086491418735959e-07, "loss": 0.79005855, "num_input_tokens_seen": 296417660, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38085938, "step": 13741, "time_per_iteration": 2.42486572265625 }, { "auxiliary_loss_clip": 0.01052388, "auxiliary_loss_mlp": 0.01041871, "balance_loss_clip": 1.01780307, "balance_loss_mlp": 1.01635325, "epoch": 0.8262137381632346, "flos": 32524108277760.0, "grad_norm": 2.2226064025118664, "language_loss": 0.63181102, "learning_rate": 3.0844131920004726e-07, "loss": 0.65275359, "num_input_tokens_seen": 296438255, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.359375, "step": 13742, "time_per_iteration": 2.4346420764923096 }, { "auxiliary_loss_clip": 0.01054467, "auxiliary_loss_mlp": 0.01043108, "balance_loss_clip": 1.01561832, "balance_loss_mlp": 1.0166986, "epoch": 0.8262738614159026, "flos": 14135598760320.0, "grad_norm": 3.079206878354677, "language_loss": 0.69156039, "learning_rate": 3.0823356067057327e-07, "loss": 0.7125361, "num_input_tokens_seen": 296454485, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.37695312, "step": 13743, "time_per_iteration": 2.3485946655273438 }, { "auxiliary_loss_clip": 0.01052704, "auxiliary_loss_mlp": 0.01043268, "balance_loss_clip": 1.02092826, "balance_loss_mlp": 1.01695454, "epoch": 0.8263339846685706, "flos": 19824416714880.0, "grad_norm": 1.887763168095159, "language_loss": 0.67104936, "learning_rate": 3.0802586629305283e-07, "loss": 0.69200909, "num_input_tokens_seen": 296473740, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35742188, "step": 13744, "time_per_iteration": 3.775764226913452 }, { "auxiliary_loss_clip": 0.01052515, "auxiliary_loss_mlp": 0.01036024, "balance_loss_clip": 1.01308846, "balance_loss_mlp": 1.01660144, "epoch": 0.8263941079212386, "flos": 22744901278080.0, "grad_norm": 1.8519567134814667, "language_loss": 0.76621699, "learning_rate": 3.078182360753612e-07, "loss": 0.78710234, "num_input_tokens_seen": 296493355, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.359375, "step": 13745, "time_per_iteration": 2.3882153034210205 }, { "auxiliary_loss_clip": 0.01049569, "auxiliary_loss_mlp": 0.01038361, "balance_loss_clip": 1.01518714, "balance_loss_mlp": 1.01446176, "epoch": 0.8264542311739065, "flos": 20119607193600.0, "grad_norm": 1.980059763992647, "language_loss": 0.80219698, "learning_rate": 3.076106700253709e-07, "loss": 0.82307625, "num_input_tokens_seen": 296510520, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.3515625, "step": 13746, "time_per_iteration": 2.335047721862793 }, { "auxiliary_loss_clip": 0.01056986, "auxiliary_loss_mlp": 0.01044792, "balance_loss_clip": 1.01892424, "balance_loss_mlp": 1.01900578, "epoch": 0.8265143544265745, "flos": 16836200380800.0, "grad_norm": 2.1482314666378293, "language_loss": 0.70225751, "learning_rate": 3.0740316815095415e-07, "loss": 0.72327536, "num_input_tokens_seen": 296528265, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37890625, "step": 13747, "time_per_iteration": 3.7638440132141113 }, { "auxiliary_loss_clip": 0.01053447, "auxiliary_loss_mlp": 0.01037844, "balance_loss_clip": 1.01396644, "balance_loss_mlp": 1.01627421, "epoch": 0.8265744776792424, "flos": 22017485767680.0, "grad_norm": 1.9391881632777639, "language_loss": 0.76672113, "learning_rate": 3.0719573045997835e-07, "loss": 0.78763402, "num_input_tokens_seen": 296547810, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 13748, "time_per_iteration": 2.364699125289917 }, { "auxiliary_loss_clip": 0.01049228, "auxiliary_loss_mlp": 0.01036504, "balance_loss_clip": 1.01622629, "balance_loss_mlp": 1.01575422, "epoch": 0.8266346009319104, "flos": 19243845849600.0, "grad_norm": 2.664302121373129, "language_loss": 0.64361084, "learning_rate": 3.069883569603102e-07, "loss": 0.66446817, "num_input_tokens_seen": 296565940, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.33398438, "step": 13749, "time_per_iteration": 2.3361499309539795 }, { "auxiliary_loss_clip": 0.01049959, "auxiliary_loss_mlp": 0.01035977, "balance_loss_clip": 1.01420939, "balance_loss_mlp": 1.01536822, "epoch": 0.8266947241845783, "flos": 24165726768000.0, "grad_norm": 1.5699244249134474, "language_loss": 0.74999297, "learning_rate": 3.067810476598132e-07, "loss": 0.77085233, "num_input_tokens_seen": 296585090, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.34570312, "step": 13750, "time_per_iteration": 2.3706769943237305 }, { "auxiliary_loss_clip": 0.01051969, "auxiliary_loss_mlp": 0.0104187, "balance_loss_clip": 1.01786137, "balance_loss_mlp": 1.01605988, "epoch": 0.8267548474372464, "flos": 21104751427200.0, "grad_norm": 2.3474379514526564, "language_loss": 0.66507316, "learning_rate": 3.065738025663496e-07, "loss": 0.68601149, "num_input_tokens_seen": 296604950, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.359375, "step": 13751, "time_per_iteration": 2.3911654949188232 }, { "auxiliary_loss_clip": 0.01048849, "auxiliary_loss_mlp": 0.01036692, "balance_loss_clip": 1.01484108, "balance_loss_mlp": 1.01453328, "epoch": 0.8268149706899143, "flos": 39966718158720.0, "grad_norm": 1.51298149150531, "language_loss": 0.6147306, "learning_rate": 3.0636662168777607e-07, "loss": 0.63558602, "num_input_tokens_seen": 296627780, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34375, "step": 13752, "time_per_iteration": 2.5097508430480957 }, { "auxiliary_loss_clip": 0.01007159, "auxiliary_loss_mlp": 0.01002187, "balance_loss_clip": 1.00005269, "balance_loss_mlp": 1.00031424, "epoch": 0.8268750939425823, "flos": 65779611960960.0, "grad_norm": 0.7765846021345417, "language_loss": 0.57579887, "learning_rate": 3.0615950503194986e-07, "loss": 0.59589231, "num_input_tokens_seen": 296683850, "router_z_loss_clip": 0.0213623, "router_z_loss_mlp": 0.06835938, "step": 13753, "time_per_iteration": 3.0536856651306152 }, { "auxiliary_loss_clip": 0.01007313, "auxiliary_loss_mlp": 0.01007027, "balance_loss_clip": 1.00489342, "balance_loss_mlp": 1.00056815, "epoch": 0.8269352171952503, "flos": 52978846412160.0, "grad_norm": 0.7069682616430156, "language_loss": 0.5501436, "learning_rate": 3.0595245260672563e-07, "loss": 0.57028699, "num_input_tokens_seen": 296741420, "router_z_loss_clip": 0.0213623, "router_z_loss_mlp": 0.06738281, "step": 13754, "time_per_iteration": 3.107558250427246 }, { "auxiliary_loss_clip": 0.01049186, "auxiliary_loss_mlp": 0.01037146, "balance_loss_clip": 1.01640403, "balance_loss_mlp": 1.0145998, "epoch": 0.8269953404479182, "flos": 23075004983040.0, "grad_norm": 1.8921019108325405, "language_loss": 0.70836234, "learning_rate": 3.0574546441995354e-07, "loss": 0.7292257, "num_input_tokens_seen": 296759620, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.34570312, "step": 13755, "time_per_iteration": 2.3663318157196045 }, { "auxiliary_loss_clip": 0.01051061, "auxiliary_loss_mlp": 0.01039404, "balance_loss_clip": 1.01788652, "balance_loss_mlp": 1.01684356, "epoch": 0.8270554637005862, "flos": 14209125816960.0, "grad_norm": 2.043156823382538, "language_loss": 0.71967268, "learning_rate": 3.0553854047948324e-07, "loss": 0.74057728, "num_input_tokens_seen": 296777275, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34179688, "step": 13756, "time_per_iteration": 2.376335859298706 }, { "auxiliary_loss_clip": 0.01053294, "auxiliary_loss_mlp": 0.01038752, "balance_loss_clip": 1.01517296, "balance_loss_mlp": 1.01719117, "epoch": 0.8271155869532542, "flos": 21760978942080.0, "grad_norm": 1.7339633548618025, "language_loss": 0.73984134, "learning_rate": 3.053316807931623e-07, "loss": 0.76076186, "num_input_tokens_seen": 296796655, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36132812, "step": 13757, "time_per_iteration": 2.3743927478790283 }, { "auxiliary_loss_clip": 0.01053738, "auxiliary_loss_mlp": 0.01041493, "balance_loss_clip": 1.01526666, "balance_loss_mlp": 1.01643682, "epoch": 0.8271757102059222, "flos": 15119625830400.0, "grad_norm": 2.2490857398343986, "language_loss": 0.70114172, "learning_rate": 3.0512488536883283e-07, "loss": 0.72209406, "num_input_tokens_seen": 296813705, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37304688, "step": 13758, "time_per_iteration": 2.3407645225524902 }, { "auxiliary_loss_clip": 0.01049417, "auxiliary_loss_mlp": 0.01034589, "balance_loss_clip": 1.0121063, "balance_loss_mlp": 1.01472569, "epoch": 0.8272358334585901, "flos": 24132594021120.0, "grad_norm": 1.499913236473441, "language_loss": 0.70495522, "learning_rate": 3.0491815421433775e-07, "loss": 0.72579527, "num_input_tokens_seen": 296833985, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34765625, "step": 13759, "time_per_iteration": 2.3859896659851074 }, { "auxiliary_loss_clip": 0.01051509, "auxiliary_loss_mlp": 0.01037444, "balance_loss_clip": 1.0128634, "balance_loss_mlp": 1.01598716, "epoch": 0.8272959567112581, "flos": 18989678085120.0, "grad_norm": 1.6237615617864072, "language_loss": 0.71326077, "learning_rate": 3.047114873375161e-07, "loss": 0.73415029, "num_input_tokens_seen": 296850150, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.35546875, "step": 13760, "time_per_iteration": 3.735513210296631 }, { "auxiliary_loss_clip": 0.01050736, "auxiliary_loss_mlp": 0.01035363, "balance_loss_clip": 1.01408446, "balance_loss_mlp": 1.01649499, "epoch": 0.827356079963926, "flos": 20630561074560.0, "grad_norm": 1.5853894668209771, "language_loss": 0.78532076, "learning_rate": 3.0450488474620505e-07, "loss": 0.80618179, "num_input_tokens_seen": 296869585, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34375, "step": 13761, "time_per_iteration": 2.35186505317688 }, { "auxiliary_loss_clip": 0.01049634, "auxiliary_loss_mlp": 0.01040045, "balance_loss_clip": 1.01842046, "balance_loss_mlp": 1.01634729, "epoch": 0.827416203216594, "flos": 22415600534400.0, "grad_norm": 1.636737354445705, "language_loss": 0.71192634, "learning_rate": 3.042983464482387e-07, "loss": 0.73282313, "num_input_tokens_seen": 296887710, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.33203125, "step": 13762, "time_per_iteration": 2.391179323196411 }, { "auxiliary_loss_clip": 0.01050492, "auxiliary_loss_mlp": 0.01034153, "balance_loss_clip": 1.01218283, "balance_loss_mlp": 1.01551795, "epoch": 0.827476326469262, "flos": 19025184804480.0, "grad_norm": 1.895140834418085, "language_loss": 0.71432304, "learning_rate": 3.0409187245144853e-07, "loss": 0.73516947, "num_input_tokens_seen": 296906265, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34960938, "step": 13763, "time_per_iteration": 2.3426053524017334 }, { "auxiliary_loss_clip": 0.01006985, "auxiliary_loss_mlp": 0.0100321, "balance_loss_clip": 1.00089765, "balance_loss_mlp": 1.00036263, "epoch": 0.82753644972193, "flos": 68497180502400.0, "grad_norm": 0.8528146917432381, "language_loss": 0.65251702, "learning_rate": 3.038854627636651e-07, "loss": 0.67261899, "num_input_tokens_seen": 296971290, "router_z_loss_clip": 0.02307129, "router_z_loss_mlp": 0.06640625, "step": 13764, "time_per_iteration": 3.079747200012207 }, { "auxiliary_loss_clip": 0.01053139, "auxiliary_loss_mlp": 0.01039773, "balance_loss_clip": 1.01497769, "balance_loss_mlp": 1.01700807, "epoch": 0.8275965729745979, "flos": 18404429097600.0, "grad_norm": 2.083606920641129, "language_loss": 0.79720962, "learning_rate": 3.0367911739271423e-07, "loss": 0.81813872, "num_input_tokens_seen": 296989060, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36132812, "step": 13765, "time_per_iteration": 2.3299560546875 }, { "auxiliary_loss_clip": 0.01052633, "auxiliary_loss_mlp": 0.01042159, "balance_loss_clip": 1.01830554, "balance_loss_mlp": 1.01561797, "epoch": 0.8276566962272659, "flos": 28510807248000.0, "grad_norm": 1.6456613241945417, "language_loss": 0.63294125, "learning_rate": 3.034728363464214e-07, "loss": 0.65388918, "num_input_tokens_seen": 297011300, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36914062, "step": 13766, "time_per_iteration": 2.4275379180908203 }, { "auxiliary_loss_clip": 0.01051572, "auxiliary_loss_mlp": 0.010373, "balance_loss_clip": 1.01347017, "balance_loss_mlp": 1.01591229, "epoch": 0.8277168194799339, "flos": 20229199551360.0, "grad_norm": 1.6075299287805156, "language_loss": 0.83589661, "learning_rate": 3.03266619632609e-07, "loss": 0.85678536, "num_input_tokens_seen": 297030350, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35742188, "step": 13767, "time_per_iteration": 2.350020170211792 }, { "auxiliary_loss_clip": 0.01052686, "auxiliary_loss_mlp": 0.01042005, "balance_loss_clip": 1.01771057, "balance_loss_mlp": 1.01605737, "epoch": 0.8277769427326018, "flos": 28475335440000.0, "grad_norm": 2.1493476042476316, "language_loss": 0.69444752, "learning_rate": 3.030604672590964e-07, "loss": 0.71539438, "num_input_tokens_seen": 297049710, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.3671875, "step": 13768, "time_per_iteration": 2.441054582595825 }, { "auxiliary_loss_clip": 0.01049897, "auxiliary_loss_mlp": 0.01035984, "balance_loss_clip": 1.01337028, "balance_loss_mlp": 1.01531804, "epoch": 0.8278370659852698, "flos": 27196432093440.0, "grad_norm": 1.7636803453809866, "language_loss": 0.75820464, "learning_rate": 3.028543792337006e-07, "loss": 0.77906346, "num_input_tokens_seen": 297070510, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34570312, "step": 13769, "time_per_iteration": 2.3991925716400146 }, { "auxiliary_loss_clip": 0.01052602, "auxiliary_loss_mlp": 0.01041078, "balance_loss_clip": 1.01700974, "balance_loss_mlp": 1.01604247, "epoch": 0.8278971892379378, "flos": 37814601997440.0, "grad_norm": 1.8224599423572245, "language_loss": 0.75199759, "learning_rate": 3.0264835556423675e-07, "loss": 0.77293444, "num_input_tokens_seen": 297092585, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.3671875, "step": 13770, "time_per_iteration": 2.509031295776367 }, { "auxiliary_loss_clip": 0.01052885, "auxiliary_loss_mlp": 0.01040813, "balance_loss_clip": 1.01595819, "balance_loss_mlp": 1.01700926, "epoch": 0.8279573124906058, "flos": 22559198423040.0, "grad_norm": 1.610761780167085, "language_loss": 0.76836324, "learning_rate": 3.0244239625851785e-07, "loss": 0.7893002, "num_input_tokens_seen": 297110055, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.359375, "step": 13771, "time_per_iteration": 2.358564615249634 }, { "auxiliary_loss_clip": 0.01051689, "auxiliary_loss_mlp": 0.0103943, "balance_loss_clip": 1.01765084, "balance_loss_mlp": 1.01590943, "epoch": 0.8280174357432737, "flos": 36063149132160.0, "grad_norm": 1.4946167505200783, "language_loss": 0.73615336, "learning_rate": 3.0223650132435284e-07, "loss": 0.75706458, "num_input_tokens_seen": 297132170, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.359375, "step": 13772, "time_per_iteration": 2.4893031120300293 }, { "auxiliary_loss_clip": 0.01050463, "auxiliary_loss_mlp": 0.01037726, "balance_loss_clip": 1.01251292, "balance_loss_mlp": 1.01585746, "epoch": 0.8280775589959417, "flos": 22960106098560.0, "grad_norm": 2.3518812610166706, "language_loss": 0.75801677, "learning_rate": 3.0203067076955035e-07, "loss": 0.77889872, "num_input_tokens_seen": 297149515, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.34570312, "step": 13773, "time_per_iteration": 2.3506124019622803 }, { "auxiliary_loss_clip": 0.01052566, "auxiliary_loss_mlp": 0.0103221, "balance_loss_clip": 1.00860691, "balance_loss_mlp": 1.01683044, "epoch": 0.8281376822486096, "flos": 26062208887680.0, "grad_norm": 1.85577387804443, "language_loss": 0.76618958, "learning_rate": 3.01824904601915e-07, "loss": 0.78703737, "num_input_tokens_seen": 297170320, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.35742188, "step": 13774, "time_per_iteration": 2.411689281463623 }, { "auxiliary_loss_clip": 0.01054427, "auxiliary_loss_mlp": 0.01037829, "balance_loss_clip": 1.01319993, "balance_loss_mlp": 1.01660085, "epoch": 0.8281978055012776, "flos": 20666731109760.0, "grad_norm": 2.508567694578778, "language_loss": 0.76004612, "learning_rate": 3.01619202829249e-07, "loss": 0.78096873, "num_input_tokens_seen": 297189935, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37890625, "step": 13775, "time_per_iteration": 2.3425917625427246 }, { "auxiliary_loss_clip": 0.01054133, "auxiliary_loss_mlp": 0.01041811, "balance_loss_clip": 1.01475048, "balance_loss_mlp": 1.01612878, "epoch": 0.8282579287539455, "flos": 29313984142080.0, "grad_norm": 2.103369488104231, "language_loss": 0.74504805, "learning_rate": 3.01413565459353e-07, "loss": 0.76600754, "num_input_tokens_seen": 297210885, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.37890625, "step": 13776, "time_per_iteration": 2.447099208831787 }, { "auxiliary_loss_clip": 0.01052396, "auxiliary_loss_mlp": 0.01036191, "balance_loss_clip": 1.01158619, "balance_loss_mlp": 1.01567721, "epoch": 0.8283180520066136, "flos": 15705258842880.0, "grad_norm": 2.040335963238092, "language_loss": 0.78148699, "learning_rate": 3.0120799250002483e-07, "loss": 0.80237287, "num_input_tokens_seen": 297228500, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3671875, "step": 13777, "time_per_iteration": 3.5787482261657715 }, { "auxiliary_loss_clip": 0.01050825, "auxiliary_loss_mlp": 0.01029178, "balance_loss_clip": 1.0078392, "balance_loss_mlp": 1.01695979, "epoch": 0.8283781752592815, "flos": 24790287813120.0, "grad_norm": 1.5960846157181359, "language_loss": 0.83586824, "learning_rate": 3.010024839590604e-07, "loss": 0.85666823, "num_input_tokens_seen": 297249470, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.33789062, "step": 13778, "time_per_iteration": 2.4183297157287598 }, { "auxiliary_loss_clip": 0.01048916, "auxiliary_loss_mlp": 0.01035161, "balance_loss_clip": 1.01203406, "balance_loss_mlp": 1.01517332, "epoch": 0.8284382985119495, "flos": 18981997585920.0, "grad_norm": 1.7673606845096157, "language_loss": 0.75397754, "learning_rate": 3.0079703984425187e-07, "loss": 0.77481824, "num_input_tokens_seen": 297265970, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.33789062, "step": 13779, "time_per_iteration": 2.339005708694458 }, { "auxiliary_loss_clip": 0.01007046, "auxiliary_loss_mlp": 0.01004806, "balance_loss_clip": 1.00256479, "balance_loss_mlp": 1.00059843, "epoch": 0.8284984217646175, "flos": 61030898853120.0, "grad_norm": 0.9106768976356092, "language_loss": 0.56792045, "learning_rate": 3.0059166016338954e-07, "loss": 0.58803892, "num_input_tokens_seen": 297325525, "router_z_loss_clip": 0.02246094, "router_z_loss_mlp": 0.06445312, "step": 13780, "time_per_iteration": 3.031635046005249 }, { "auxiliary_loss_clip": 0.01051691, "auxiliary_loss_mlp": 0.01031141, "balance_loss_clip": 1.00748956, "balance_loss_mlp": 1.01591408, "epoch": 0.8285585450172854, "flos": 19713742104960.0, "grad_norm": 1.7676867696948724, "language_loss": 0.80314624, "learning_rate": 3.0038634492426205e-07, "loss": 0.82397455, "num_input_tokens_seen": 297345025, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35742188, "step": 13781, "time_per_iteration": 2.3589601516723633 }, { "auxiliary_loss_clip": 0.01053482, "auxiliary_loss_mlp": 0.01038521, "balance_loss_clip": 1.01246142, "balance_loss_mlp": 1.01680911, "epoch": 0.8286186682699535, "flos": 21687835910400.0, "grad_norm": 1.9901426666413942, "language_loss": 0.77369487, "learning_rate": 3.001810941346543e-07, "loss": 0.79461491, "num_input_tokens_seen": 297363570, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3671875, "step": 13782, "time_per_iteration": 2.3807458877563477 }, { "auxiliary_loss_clip": 0.01051252, "auxiliary_loss_mlp": 0.01040678, "balance_loss_clip": 1.01702642, "balance_loss_mlp": 1.01548147, "epoch": 0.8286787915226214, "flos": 25774384705920.0, "grad_norm": 1.8358969054842813, "language_loss": 0.77083892, "learning_rate": 2.9997590780234983e-07, "loss": 0.79175818, "num_input_tokens_seen": 297385385, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35742188, "step": 13783, "time_per_iteration": 3.881993055343628 }, { "auxiliary_loss_clip": 0.01051276, "auxiliary_loss_mlp": 0.01035903, "balance_loss_clip": 1.01194227, "balance_loss_mlp": 1.01525295, "epoch": 0.8287389147752894, "flos": 21287277348480.0, "grad_norm": 1.6725097106097901, "language_loss": 0.74693674, "learning_rate": 2.997707859351304e-07, "loss": 0.76780856, "num_input_tokens_seen": 297403950, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.359375, "step": 13784, "time_per_iteration": 2.3460171222686768 }, { "auxiliary_loss_clip": 0.01053507, "auxiliary_loss_mlp": 0.01041365, "balance_loss_clip": 1.01640248, "balance_loss_mlp": 1.01538658, "epoch": 0.8287990380279573, "flos": 33543537333120.0, "grad_norm": 1.4760948762816442, "language_loss": 0.70681024, "learning_rate": 2.99565728540772e-07, "loss": 0.72775894, "num_input_tokens_seen": 297424565, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38085938, "step": 13785, "time_per_iteration": 2.469475269317627 }, { "auxiliary_loss_clip": 0.01053004, "auxiliary_loss_mlp": 0.01032725, "balance_loss_clip": 1.00933588, "balance_loss_mlp": 1.01755333, "epoch": 0.8288591612806253, "flos": 22965238068480.0, "grad_norm": 1.3986330166977154, "language_loss": 0.69214553, "learning_rate": 2.993607356270516e-07, "loss": 0.71300286, "num_input_tokens_seen": 297445180, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35546875, "step": 13786, "time_per_iteration": 3.7534193992614746 }, { "auxiliary_loss_clip": 0.01054253, "auxiliary_loss_mlp": 0.01040675, "balance_loss_clip": 1.01497364, "balance_loss_mlp": 1.01677942, "epoch": 0.8289192845332932, "flos": 18587967448320.0, "grad_norm": 1.7469830628853085, "language_loss": 0.77954209, "learning_rate": 2.991558072017426e-07, "loss": 0.80049133, "num_input_tokens_seen": 297463790, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.375, "step": 13787, "time_per_iteration": 2.36616587638855 }, { "auxiliary_loss_clip": 0.01051251, "auxiliary_loss_mlp": 0.01035853, "balance_loss_clip": 1.01318002, "balance_loss_mlp": 1.0162673, "epoch": 0.8289794077859612, "flos": 15449520067200.0, "grad_norm": 1.9229834919189646, "language_loss": 0.81121737, "learning_rate": 2.989509432726163e-07, "loss": 0.83208847, "num_input_tokens_seen": 297480100, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 13788, "time_per_iteration": 2.3398399353027344 }, { "auxiliary_loss_clip": 0.0105177, "auxiliary_loss_mlp": 0.01037142, "balance_loss_clip": 1.01393235, "balance_loss_mlp": 1.01649249, "epoch": 0.8290395310386292, "flos": 28876557317760.0, "grad_norm": 1.5498830095731873, "language_loss": 0.72400296, "learning_rate": 2.9874614384744014e-07, "loss": 0.744892, "num_input_tokens_seen": 297499890, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35351562, "step": 13789, "time_per_iteration": 2.442476272583008 }, { "auxiliary_loss_clip": 0.01053047, "auxiliary_loss_mlp": 0.01041333, "balance_loss_clip": 1.01651359, "balance_loss_mlp": 1.01553667, "epoch": 0.8290996542912972, "flos": 36574766328960.0, "grad_norm": 1.7885158809985646, "language_loss": 0.69249332, "learning_rate": 2.985414089339813e-07, "loss": 0.71343714, "num_input_tokens_seen": 297521440, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.375, "step": 13790, "time_per_iteration": 2.493319272994995 }, { "auxiliary_loss_clip": 0.0105293, "auxiliary_loss_mlp": 0.01039518, "balance_loss_clip": 1.01291037, "balance_loss_mlp": 1.01577473, "epoch": 0.8291597775439651, "flos": 23621884519680.0, "grad_norm": 1.6634848212219613, "language_loss": 0.78915727, "learning_rate": 2.9833673854000265e-07, "loss": 0.81008178, "num_input_tokens_seen": 297539920, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37109375, "step": 13791, "time_per_iteration": 2.3961334228515625 }, { "auxiliary_loss_clip": 0.01051638, "auxiliary_loss_mlp": 0.01037842, "balance_loss_clip": 1.01421475, "balance_loss_mlp": 1.01665175, "epoch": 0.8292199007966331, "flos": 21396415858560.0, "grad_norm": 1.369797902189044, "language_loss": 0.70294362, "learning_rate": 2.981321326732651e-07, "loss": 0.72383839, "num_input_tokens_seen": 297560000, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.34960938, "step": 13792, "time_per_iteration": 2.3537349700927734 }, { "auxiliary_loss_clip": 0.01051992, "auxiliary_loss_mlp": 0.01040856, "balance_loss_clip": 1.01626265, "balance_loss_mlp": 1.0152595, "epoch": 0.829280024049301, "flos": 28766336555520.0, "grad_norm": 1.4751845445129725, "language_loss": 0.65817314, "learning_rate": 2.9792759134152736e-07, "loss": 0.67910171, "num_input_tokens_seen": 297579300, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3671875, "step": 13793, "time_per_iteration": 2.4260761737823486 }, { "auxiliary_loss_clip": 0.01054919, "auxiliary_loss_mlp": 0.01037478, "balance_loss_clip": 1.01248026, "balance_loss_mlp": 1.0163691, "epoch": 0.829340147301969, "flos": 19937046360960.0, "grad_norm": 2.0165776275288594, "language_loss": 0.67431343, "learning_rate": 2.977231145525461e-07, "loss": 0.6952374, "num_input_tokens_seen": 297598095, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.38476562, "step": 13794, "time_per_iteration": 2.3247358798980713 }, { "auxiliary_loss_clip": 0.01052529, "auxiliary_loss_mlp": 0.01042538, "balance_loss_clip": 1.0185051, "balance_loss_mlp": 1.01605558, "epoch": 0.829400270554637, "flos": 25227400435200.0, "grad_norm": 1.959689437603246, "language_loss": 0.67641664, "learning_rate": 2.975187023140757e-07, "loss": 0.69736737, "num_input_tokens_seen": 297615955, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36523438, "step": 13795, "time_per_iteration": 2.400827407836914 }, { "auxiliary_loss_clip": 0.01051073, "auxiliary_loss_mlp": 0.01036007, "balance_loss_clip": 1.01478744, "balance_loss_mlp": 1.01651311, "epoch": 0.829460393807305, "flos": 24462383523840.0, "grad_norm": 1.6723176521333354, "language_loss": 0.67424846, "learning_rate": 2.973143546338661e-07, "loss": 0.69511926, "num_input_tokens_seen": 297636285, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34570312, "step": 13796, "time_per_iteration": 2.387488842010498 }, { "auxiliary_loss_clip": 0.01049898, "auxiliary_loss_mlp": 0.01035016, "balance_loss_clip": 1.01278341, "balance_loss_mlp": 1.01523995, "epoch": 0.829520517059973, "flos": 15121580866560.0, "grad_norm": 1.761670827209745, "language_loss": 0.72874415, "learning_rate": 2.971100715196666e-07, "loss": 0.74959326, "num_input_tokens_seen": 297653315, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34765625, "step": 13797, "time_per_iteration": 2.4074742794036865 }, { "auxiliary_loss_clip": 0.01052442, "auxiliary_loss_mlp": 0.01034475, "balance_loss_clip": 1.01114583, "balance_loss_mlp": 1.0159719, "epoch": 0.8295806403126409, "flos": 21578906868480.0, "grad_norm": 2.155284808437615, "language_loss": 0.73711842, "learning_rate": 2.969058529792243e-07, "loss": 0.75798762, "num_input_tokens_seen": 297673480, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36523438, "step": 13798, "time_per_iteration": 2.375195026397705 }, { "auxiliary_loss_clip": 0.01049378, "auxiliary_loss_mlp": 0.01035636, "balance_loss_clip": 1.01377344, "balance_loss_mlp": 1.01542974, "epoch": 0.8296407635653089, "flos": 21725472222720.0, "grad_norm": 1.6734794937195392, "language_loss": 0.77761555, "learning_rate": 2.967016990202822e-07, "loss": 0.79846567, "num_input_tokens_seen": 297693250, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.33984375, "step": 13799, "time_per_iteration": 2.3752589225769043 }, { "auxiliary_loss_clip": 0.01051448, "auxiliary_loss_mlp": 0.01040198, "balance_loss_clip": 1.01659465, "balance_loss_mlp": 1.01627743, "epoch": 0.8297008868179768, "flos": 11180375527680.0, "grad_norm": 1.9447547702520052, "language_loss": 0.68761981, "learning_rate": 2.9649760965058245e-07, "loss": 0.70853627, "num_input_tokens_seen": 297710975, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3515625, "step": 13800, "time_per_iteration": 3.7445592880249023 }, { "auxiliary_loss_clip": 0.01055478, "auxiliary_loss_mlp": 0.01041593, "balance_loss_clip": 1.01402044, "balance_loss_mlp": 1.01746249, "epoch": 0.8297610100706448, "flos": 20663100328320.0, "grad_norm": 2.4376241082715464, "language_loss": 0.75459301, "learning_rate": 2.9629358487786515e-07, "loss": 0.77556372, "num_input_tokens_seen": 297730860, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.37890625, "step": 13801, "time_per_iteration": 2.400974750518799 }, { "auxiliary_loss_clip": 0.01052327, "auxiliary_loss_mlp": 0.01033393, "balance_loss_clip": 1.01113629, "balance_loss_mlp": 1.01645184, "epoch": 0.8298211333233128, "flos": 20375276146560.0, "grad_norm": 1.690723584642869, "language_loss": 0.74254447, "learning_rate": 2.9608962470986476e-07, "loss": 0.76340163, "num_input_tokens_seen": 297749765, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.359375, "step": 13802, "time_per_iteration": 2.342258930206299 }, { "auxiliary_loss_clip": 0.01052282, "auxiliary_loss_mlp": 0.01037321, "balance_loss_clip": 1.01537442, "balance_loss_mlp": 1.01589513, "epoch": 0.8298812565759808, "flos": 21507579227520.0, "grad_norm": 1.5904066130398413, "language_loss": 0.75603104, "learning_rate": 2.9588572915431644e-07, "loss": 0.77692705, "num_input_tokens_seen": 297770380, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.36328125, "step": 13803, "time_per_iteration": 2.401033878326416 }, { "auxiliary_loss_clip": 0.01053479, "auxiliary_loss_mlp": 0.01039045, "balance_loss_clip": 1.01641965, "balance_loss_mlp": 1.01751614, "epoch": 0.8299413798286487, "flos": 22817625373440.0, "grad_norm": 1.7379508259199545, "language_loss": 0.79947293, "learning_rate": 2.9568189821895215e-07, "loss": 0.82039821, "num_input_tokens_seen": 297789440, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.359375, "step": 13804, "time_per_iteration": 2.3658573627471924 }, { "auxiliary_loss_clip": 0.01050076, "auxiliary_loss_mlp": 0.01034829, "balance_loss_clip": 1.01337147, "balance_loss_mlp": 1.01557219, "epoch": 0.8300015030813167, "flos": 29677918821120.0, "grad_norm": 2.2819174987018718, "language_loss": 0.74462098, "learning_rate": 2.954781319115016e-07, "loss": 0.76547003, "num_input_tokens_seen": 297810425, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34570312, "step": 13805, "time_per_iteration": 2.468914031982422 }, { "auxiliary_loss_clip": 0.0105313, "auxiliary_loss_mlp": 0.01035576, "balance_loss_clip": 1.01155579, "balance_loss_mlp": 1.01642728, "epoch": 0.8300616263339846, "flos": 19718455138560.0, "grad_norm": 2.161691789369759, "language_loss": 0.78895795, "learning_rate": 2.952744302396906e-07, "loss": 0.80984503, "num_input_tokens_seen": 297827680, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 13806, "time_per_iteration": 2.3333864212036133 }, { "auxiliary_loss_clip": 0.01054055, "auxiliary_loss_mlp": 0.01039479, "balance_loss_clip": 1.01494598, "balance_loss_mlp": 1.01684356, "epoch": 0.8301217495866526, "flos": 19900911237120.0, "grad_norm": 1.7041629118466792, "language_loss": 0.65038013, "learning_rate": 2.950707932112444e-07, "loss": 0.67131543, "num_input_tokens_seen": 297848005, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.37304688, "step": 13807, "time_per_iteration": 2.3705432415008545 }, { "auxiliary_loss_clip": 0.01052349, "auxiliary_loss_mlp": 0.01036039, "balance_loss_clip": 1.01195908, "balance_loss_mlp": 1.01644373, "epoch": 0.8301818728393207, "flos": 19714859268480.0, "grad_norm": 1.9915210660419373, "language_loss": 0.73760939, "learning_rate": 2.948672208338847e-07, "loss": 0.7584933, "num_input_tokens_seen": 297866730, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.359375, "step": 13808, "time_per_iteration": 2.333934783935547 }, { "auxiliary_loss_clip": 0.01056791, "auxiliary_loss_mlp": 0.01051107, "balance_loss_clip": 1.02429652, "balance_loss_mlp": 1.01840341, "epoch": 0.8302419960919886, "flos": 28292390582400.0, "grad_norm": 2.0746180464506936, "language_loss": 0.68752772, "learning_rate": 2.9466371311533046e-07, "loss": 0.70860672, "num_input_tokens_seen": 297886390, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.3828125, "step": 13809, "time_per_iteration": 2.421452045440674 }, { "auxiliary_loss_clip": 0.01052968, "auxiliary_loss_mlp": 0.01033916, "balance_loss_clip": 1.0113852, "balance_loss_mlp": 1.01603651, "epoch": 0.8303021193446566, "flos": 18222461758080.0, "grad_norm": 7.044162913114388, "language_loss": 0.74933815, "learning_rate": 2.9446027006329896e-07, "loss": 0.77020699, "num_input_tokens_seen": 297905110, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.36914062, "step": 13810, "time_per_iteration": 2.3284552097320557 }, { "auxiliary_loss_clip": 0.01050156, "auxiliary_loss_mlp": 0.01036618, "balance_loss_clip": 1.01600695, "balance_loss_mlp": 1.01583242, "epoch": 0.8303622425973245, "flos": 23110337145600.0, "grad_norm": 1.6683660835991407, "language_loss": 0.82136279, "learning_rate": 2.94256891685505e-07, "loss": 0.84223056, "num_input_tokens_seen": 297925460, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.34375, "step": 13811, "time_per_iteration": 2.427122116088867 }, { "auxiliary_loss_clip": 0.01052627, "auxiliary_loss_mlp": 0.01042857, "balance_loss_clip": 1.0209825, "balance_loss_mlp": 1.01676893, "epoch": 0.8304223658499925, "flos": 19571854872960.0, "grad_norm": 2.7283289138936193, "language_loss": 0.74288625, "learning_rate": 2.9405357798966156e-07, "loss": 0.76384109, "num_input_tokens_seen": 297941760, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.359375, "step": 13812, "time_per_iteration": 2.3442749977111816 }, { "auxiliary_loss_clip": 0.01051352, "auxiliary_loss_mlp": 0.01033516, "balance_loss_clip": 1.01197445, "balance_loss_mlp": 1.01614738, "epoch": 0.8304824891026604, "flos": 24424956679680.0, "grad_norm": 1.5794534687098756, "language_loss": 0.79124367, "learning_rate": 2.9385032898347664e-07, "loss": 0.81209236, "num_input_tokens_seen": 297959745, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.3515625, "step": 13813, "time_per_iteration": 2.3870816230773926 }, { "auxiliary_loss_clip": 0.01051797, "auxiliary_loss_mlp": 0.01036944, "balance_loss_clip": 1.01181436, "balance_loss_mlp": 1.01487148, "epoch": 0.8305426123553284, "flos": 22380722219520.0, "grad_norm": 1.9436167787435077, "language_loss": 0.72395039, "learning_rate": 2.93647144674658e-07, "loss": 0.74483776, "num_input_tokens_seen": 297977665, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36914062, "step": 13814, "time_per_iteration": 2.3860552310943604 }, { "auxiliary_loss_clip": 0.01056453, "auxiliary_loss_mlp": 0.01050354, "balance_loss_clip": 1.02193475, "balance_loss_mlp": 1.01701093, "epoch": 0.8306027356079964, "flos": 14902675441920.0, "grad_norm": 2.364331436263779, "language_loss": 0.69425642, "learning_rate": 2.9344402507091116e-07, "loss": 0.71532452, "num_input_tokens_seen": 297993525, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.39453125, "step": 13815, "time_per_iteration": 2.33354115486145 }, { "auxiliary_loss_clip": 0.01053235, "auxiliary_loss_mlp": 0.0104153, "balance_loss_clip": 1.01657987, "balance_loss_mlp": 1.01715803, "epoch": 0.8306628588606644, "flos": 19643601450240.0, "grad_norm": 1.9617324547094606, "language_loss": 0.77129471, "learning_rate": 2.9324097017993745e-07, "loss": 0.79224241, "num_input_tokens_seen": 298012920, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.359375, "step": 13816, "time_per_iteration": 2.375720500946045 }, { "auxiliary_loss_clip": 0.01050975, "auxiliary_loss_mlp": 0.01037402, "balance_loss_clip": 1.01540756, "balance_loss_mlp": 1.01607001, "epoch": 0.8307229821133323, "flos": 24388577176320.0, "grad_norm": 1.6779692252605694, "language_loss": 0.82161421, "learning_rate": 2.930379800094371e-07, "loss": 0.84249794, "num_input_tokens_seen": 298033310, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34960938, "step": 13817, "time_per_iteration": 3.618485450744629 }, { "auxiliary_loss_clip": 0.01054679, "auxiliary_loss_mlp": 0.01040402, "balance_loss_clip": 1.01696527, "balance_loss_mlp": 1.01779377, "epoch": 0.8307831053660003, "flos": 20995857296640.0, "grad_norm": 2.5983065912203784, "language_loss": 0.78762937, "learning_rate": 2.9283505456710875e-07, "loss": 0.80858022, "num_input_tokens_seen": 298053530, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36914062, "step": 13818, "time_per_iteration": 2.3574862480163574 }, { "auxiliary_loss_clip": 0.01053033, "auxiliary_loss_mlp": 0.01043072, "balance_loss_clip": 1.01771569, "balance_loss_mlp": 1.01673877, "epoch": 0.8308432286186682, "flos": 21396241301760.0, "grad_norm": 2.0308241695081133, "language_loss": 0.8286863, "learning_rate": 2.926321938606453e-07, "loss": 0.84964734, "num_input_tokens_seen": 298069305, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36328125, "step": 13819, "time_per_iteration": 2.358039379119873 }, { "auxiliary_loss_clip": 0.01007601, "auxiliary_loss_mlp": 0.01002851, "balance_loss_clip": 1.00065732, "balance_loss_mlp": 1.00097549, "epoch": 0.8309033518713362, "flos": 62530978728960.0, "grad_norm": 0.7675731671404494, "language_loss": 0.56303877, "learning_rate": 2.924293978977399e-07, "loss": 0.58314323, "num_input_tokens_seen": 298125830, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.06640625, "step": 13820, "time_per_iteration": 3.0220210552215576 }, { "auxiliary_loss_clip": 0.01050728, "auxiliary_loss_mlp": 0.01032963, "balance_loss_clip": 1.01075411, "balance_loss_mlp": 1.01657045, "epoch": 0.8309634751240043, "flos": 16978262169600.0, "grad_norm": 2.0579865976482496, "language_loss": 0.68949676, "learning_rate": 2.922266666860831e-07, "loss": 0.7103337, "num_input_tokens_seen": 298142320, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.33984375, "step": 13821, "time_per_iteration": 2.3411002159118652 }, { "auxiliary_loss_clip": 0.0105212, "auxiliary_loss_mlp": 0.01043232, "balance_loss_clip": 1.01978326, "balance_loss_mlp": 1.01539207, "epoch": 0.8310235983766722, "flos": 22673364168960.0, "grad_norm": 1.835189032306545, "language_loss": 0.70818734, "learning_rate": 2.920240002333625e-07, "loss": 0.72914088, "num_input_tokens_seen": 298161845, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3671875, "step": 13822, "time_per_iteration": 2.3627798557281494 }, { "auxiliary_loss_clip": 0.0105088, "auxiliary_loss_mlp": 0.01036352, "balance_loss_clip": 1.013798, "balance_loss_mlp": 1.01651478, "epoch": 0.8310837216293402, "flos": 30810117168000.0, "grad_norm": 1.9505218808494906, "language_loss": 0.62915409, "learning_rate": 2.918213985472631e-07, "loss": 0.65002638, "num_input_tokens_seen": 298184165, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34375, "step": 13823, "time_per_iteration": 3.830002546310425 }, { "auxiliary_loss_clip": 0.0100734, "auxiliary_loss_mlp": 0.01003842, "balance_loss_clip": 1.00142229, "balance_loss_mlp": 1.00080514, "epoch": 0.8311438448820081, "flos": 71272531077120.0, "grad_norm": 0.8885623155215208, "language_loss": 0.6205194, "learning_rate": 2.916188616354669e-07, "loss": 0.64063126, "num_input_tokens_seen": 298251720, "router_z_loss_clip": 0.02416992, "router_z_loss_mlp": 0.06542969, "step": 13824, "time_per_iteration": 3.1623013019561768 }, { "auxiliary_loss_clip": 0.01051144, "auxiliary_loss_mlp": 0.01037439, "balance_loss_clip": 1.01558781, "balance_loss_mlp": 1.01580739, "epoch": 0.8312039681346761, "flos": 20886020559360.0, "grad_norm": 1.7853334377204213, "language_loss": 0.74869823, "learning_rate": 2.914163895056552e-07, "loss": 0.76958412, "num_input_tokens_seen": 298271910, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.35351562, "step": 13825, "time_per_iteration": 3.6964385509490967 }, { "auxiliary_loss_clip": 0.01052382, "auxiliary_loss_mlp": 0.01039103, "balance_loss_clip": 1.01477265, "balance_loss_mlp": 1.01544952, "epoch": 0.831264091387344, "flos": 17016631620480.0, "grad_norm": 3.4781028329438524, "language_loss": 0.81076765, "learning_rate": 2.9121398216550486e-07, "loss": 0.83168244, "num_input_tokens_seen": 298288105, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36914062, "step": 13826, "time_per_iteration": 2.370760440826416 }, { "auxiliary_loss_clip": 0.01050847, "auxiliary_loss_mlp": 0.01035018, "balance_loss_clip": 1.01247573, "balance_loss_mlp": 1.01509225, "epoch": 0.831324214640012, "flos": 24418602812160.0, "grad_norm": 1.7764532414796457, "language_loss": 0.68957734, "learning_rate": 2.910116396226914e-07, "loss": 0.71043593, "num_input_tokens_seen": 298307600, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35742188, "step": 13827, "time_per_iteration": 2.381718397140503 }, { "auxiliary_loss_clip": 0.01050679, "auxiliary_loss_mlp": 0.01033014, "balance_loss_clip": 1.01093602, "balance_loss_mlp": 1.01552093, "epoch": 0.83138433789268, "flos": 13544938512000.0, "grad_norm": 1.7334520836180642, "language_loss": 0.75202692, "learning_rate": 2.9080936188488834e-07, "loss": 0.77286386, "num_input_tokens_seen": 298323055, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3515625, "step": 13828, "time_per_iteration": 2.3453621864318848 }, { "auxiliary_loss_clip": 0.01050173, "auxiliary_loss_mlp": 0.01038468, "balance_loss_clip": 1.01625967, "balance_loss_mlp": 1.01453066, "epoch": 0.831444461145348, "flos": 44490693778560.0, "grad_norm": 1.6423490393980427, "language_loss": 0.68561882, "learning_rate": 2.906071489597657e-07, "loss": 0.70650518, "num_input_tokens_seen": 298346950, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.35546875, "step": 13829, "time_per_iteration": 2.545858860015869 }, { "auxiliary_loss_clip": 0.01052757, "auxiliary_loss_mlp": 0.01038189, "balance_loss_clip": 1.01475263, "balance_loss_mlp": 1.01605916, "epoch": 0.8315045843980159, "flos": 22704088032000.0, "grad_norm": 1.565011421612002, "language_loss": 0.83773082, "learning_rate": 2.9040500085499054e-07, "loss": 0.85864031, "num_input_tokens_seen": 298366315, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3671875, "step": 13830, "time_per_iteration": 2.3741729259490967 }, { "auxiliary_loss_clip": 0.01053022, "auxiliary_loss_mlp": 0.01040371, "balance_loss_clip": 1.01688671, "balance_loss_mlp": 1.01638186, "epoch": 0.8315647076506839, "flos": 16872544972800.0, "grad_norm": 2.3280243599180563, "language_loss": 0.75826573, "learning_rate": 2.9020291757822925e-07, "loss": 0.77919966, "num_input_tokens_seen": 298385185, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3671875, "step": 13831, "time_per_iteration": 2.3168656826019287 }, { "auxiliary_loss_clip": 0.01052326, "auxiliary_loss_mlp": 0.0104303, "balance_loss_clip": 1.01956987, "balance_loss_mlp": 1.01620054, "epoch": 0.8316248309033518, "flos": 13807869027840.0, "grad_norm": 1.6354801103887562, "language_loss": 0.72577822, "learning_rate": 2.9000089913714523e-07, "loss": 0.74673176, "num_input_tokens_seen": 298402335, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.359375, "step": 13832, "time_per_iteration": 2.3292856216430664 }, { "auxiliary_loss_clip": 0.01050285, "auxiliary_loss_mlp": 0.010387, "balance_loss_clip": 1.01532292, "balance_loss_mlp": 1.01483798, "epoch": 0.8316849541560198, "flos": 23511419377920.0, "grad_norm": 1.6040728004629479, "language_loss": 0.85757154, "learning_rate": 2.897989455393979e-07, "loss": 0.87846142, "num_input_tokens_seen": 298423370, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.35546875, "step": 13833, "time_per_iteration": 2.3700947761535645 }, { "auxiliary_loss_clip": 0.01053338, "auxiliary_loss_mlp": 0.01040427, "balance_loss_clip": 1.0164299, "balance_loss_mlp": 1.01593113, "epoch": 0.8317450774086879, "flos": 23770160530560.0, "grad_norm": 1.6929412126889882, "language_loss": 0.7674824, "learning_rate": 2.8959705679264625e-07, "loss": 0.78842002, "num_input_tokens_seen": 298444835, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.375, "step": 13834, "time_per_iteration": 2.419008493423462 }, { "auxiliary_loss_clip": 0.01048838, "auxiliary_loss_mlp": 0.01037864, "balance_loss_clip": 1.01514244, "balance_loss_mlp": 1.01421428, "epoch": 0.8318052006613558, "flos": 16214641712640.0, "grad_norm": 1.8474667138693, "language_loss": 0.80643141, "learning_rate": 2.893952329045459e-07, "loss": 0.82729846, "num_input_tokens_seen": 298461845, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.34570312, "step": 13835, "time_per_iteration": 2.323345184326172 }, { "auxiliary_loss_clip": 0.01054952, "auxiliary_loss_mlp": 0.01044186, "balance_loss_clip": 1.01816273, "balance_loss_mlp": 1.01688337, "epoch": 0.8318653239140238, "flos": 19973530598400.0, "grad_norm": 1.9494426247551537, "language_loss": 0.81866753, "learning_rate": 2.8919347388274905e-07, "loss": 0.83965892, "num_input_tokens_seen": 298479095, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.38085938, "step": 13836, "time_per_iteration": 2.341207981109619 }, { "auxiliary_loss_clip": 0.01050405, "auxiliary_loss_mlp": 0.01035872, "balance_loss_clip": 1.01354456, "balance_loss_mlp": 1.01550746, "epoch": 0.8319254471666917, "flos": 17703967023360.0, "grad_norm": 2.038987795361162, "language_loss": 0.78562623, "learning_rate": 2.8899177973490727e-07, "loss": 0.80648905, "num_input_tokens_seen": 298494475, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34960938, "step": 13837, "time_per_iteration": 2.351989984512329 }, { "auxiliary_loss_clip": 0.01053746, "auxiliary_loss_mlp": 0.01039346, "balance_loss_clip": 1.01332319, "balance_loss_mlp": 1.01655102, "epoch": 0.8319855704193597, "flos": 19535545192320.0, "grad_norm": 1.8343042227337962, "language_loss": 0.84872752, "learning_rate": 2.887901504686685e-07, "loss": 0.86965847, "num_input_tokens_seen": 298513185, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37109375, "step": 13838, "time_per_iteration": 2.4359796047210693 }, { "auxiliary_loss_clip": 0.01051163, "auxiliary_loss_mlp": 0.0103926, "balance_loss_clip": 1.01545429, "balance_loss_mlp": 1.01649654, "epoch": 0.8320456936720276, "flos": 21177021674880.0, "grad_norm": 3.764797506738201, "language_loss": 0.76096821, "learning_rate": 2.885885860916795e-07, "loss": 0.78187239, "num_input_tokens_seen": 298531885, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.34570312, "step": 13839, "time_per_iteration": 2.434671640396118 }, { "auxiliary_loss_clip": 0.0105339, "auxiliary_loss_mlp": 0.01039684, "balance_loss_clip": 1.01560366, "balance_loss_mlp": 1.01674056, "epoch": 0.8321058169246957, "flos": 33249603663360.0, "grad_norm": 1.649856715504996, "language_loss": 0.68742806, "learning_rate": 2.8838708661158253e-07, "loss": 0.70835882, "num_input_tokens_seen": 298554905, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 13840, "time_per_iteration": 3.8739395141601562 }, { "auxiliary_loss_clip": 0.01051178, "auxiliary_loss_mlp": 0.01039927, "balance_loss_clip": 1.01639557, "balance_loss_mlp": 1.01516032, "epoch": 0.8321659401773636, "flos": 14207310426240.0, "grad_norm": 2.067789430489718, "language_loss": 0.80408007, "learning_rate": 2.8818565203601843e-07, "loss": 0.82499111, "num_input_tokens_seen": 298571185, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.359375, "step": 13841, "time_per_iteration": 2.3843681812286377 }, { "auxiliary_loss_clip": 0.01050799, "auxiliary_loss_mlp": 0.01036499, "balance_loss_clip": 1.01246619, "balance_loss_mlp": 1.01606655, "epoch": 0.8322260634300316, "flos": 15157366876800.0, "grad_norm": 1.92226869039995, "language_loss": 0.69694698, "learning_rate": 2.879842823726262e-07, "loss": 0.71781993, "num_input_tokens_seen": 298588505, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.34765625, "step": 13842, "time_per_iteration": 2.352890968322754 }, { "auxiliary_loss_clip": 0.01052225, "auxiliary_loss_mlp": 0.01039895, "balance_loss_clip": 1.01587391, "balance_loss_mlp": 1.01698661, "epoch": 0.8322861866826995, "flos": 25299670682880.0, "grad_norm": 1.7047367426841706, "language_loss": 0.73605102, "learning_rate": 2.8778297762904124e-07, "loss": 0.75697219, "num_input_tokens_seen": 298609295, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3515625, "step": 13843, "time_per_iteration": 2.397564649581909 }, { "auxiliary_loss_clip": 0.01053669, "auxiliary_loss_mlp": 0.01036697, "balance_loss_clip": 1.01461959, "balance_loss_mlp": 1.01831079, "epoch": 0.8323463099353675, "flos": 17018412099840.0, "grad_norm": 1.9723552537437776, "language_loss": 0.79233873, "learning_rate": 2.875817378128975e-07, "loss": 0.81324244, "num_input_tokens_seen": 298625765, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35351562, "step": 13844, "time_per_iteration": 2.37977933883667 }, { "auxiliary_loss_clip": 0.01007312, "auxiliary_loss_mlp": 0.01002205, "balance_loss_clip": 0.99988037, "balance_loss_mlp": 1.00063097, "epoch": 0.8324064331880354, "flos": 55605222748800.0, "grad_norm": 0.7795706560298629, "language_loss": 0.55293953, "learning_rate": 2.8738056293182624e-07, "loss": 0.57303464, "num_input_tokens_seen": 298683005, "router_z_loss_clip": 0.02319336, "router_z_loss_mlp": 0.06689453, "step": 13845, "time_per_iteration": 2.9196736812591553 }, { "auxiliary_loss_clip": 0.01053505, "auxiliary_loss_mlp": 0.01044647, "balance_loss_clip": 1.01957726, "balance_loss_mlp": 1.0168097, "epoch": 0.8324665564407034, "flos": 26137481512320.0, "grad_norm": 1.6168982165684447, "language_loss": 0.76270741, "learning_rate": 2.871794529934555e-07, "loss": 0.7836889, "num_input_tokens_seen": 298703060, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3671875, "step": 13846, "time_per_iteration": 2.431917190551758 }, { "auxiliary_loss_clip": 0.01052726, "auxiliary_loss_mlp": 0.01039949, "balance_loss_clip": 1.01529646, "balance_loss_mlp": 1.01525569, "epoch": 0.8325266796933715, "flos": 22048244542080.0, "grad_norm": 1.6893832241671234, "language_loss": 0.79995537, "learning_rate": 2.8697840800541115e-07, "loss": 0.82088214, "num_input_tokens_seen": 298721765, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37304688, "step": 13847, "time_per_iteration": 2.378840446472168 }, { "auxiliary_loss_clip": 0.01052283, "auxiliary_loss_mlp": 0.01036246, "balance_loss_clip": 1.01390624, "balance_loss_mlp": 1.01725411, "epoch": 0.8325868029460394, "flos": 22815635425920.0, "grad_norm": 1.6532108236265493, "language_loss": 0.75626749, "learning_rate": 2.867774279753175e-07, "loss": 0.77715278, "num_input_tokens_seen": 298740825, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34960938, "step": 13848, "time_per_iteration": 2.3682680130004883 }, { "auxiliary_loss_clip": 0.01051893, "auxiliary_loss_mlp": 0.01033142, "balance_loss_clip": 1.0096699, "balance_loss_mlp": 1.01648879, "epoch": 0.8326469261987074, "flos": 14756563935360.0, "grad_norm": 1.9362789311221424, "language_loss": 0.65080839, "learning_rate": 2.8657651291079554e-07, "loss": 0.67165875, "num_input_tokens_seen": 298758515, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35351562, "step": 13849, "time_per_iteration": 2.3233935832977295 }, { "auxiliary_loss_clip": 0.01052941, "auxiliary_loss_mlp": 0.01035672, "balance_loss_clip": 1.0120213, "balance_loss_mlp": 1.01582527, "epoch": 0.8327070494513753, "flos": 22925123049600.0, "grad_norm": 2.078194133531837, "language_loss": 0.80933607, "learning_rate": 2.863756628194638e-07, "loss": 0.83022225, "num_input_tokens_seen": 298776375, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37109375, "step": 13850, "time_per_iteration": 2.376472234725952 }, { "auxiliary_loss_clip": 0.01048691, "auxiliary_loss_mlp": 0.01039336, "balance_loss_clip": 1.01755655, "balance_loss_mlp": 1.01545763, "epoch": 0.8327671727040433, "flos": 20664357137280.0, "grad_norm": 1.5449075535510972, "language_loss": 0.79077542, "learning_rate": 2.8617487770893877e-07, "loss": 0.81165564, "num_input_tokens_seen": 298795135, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.33203125, "step": 13851, "time_per_iteration": 2.362128973007202 }, { "auxiliary_loss_clip": 0.01007315, "auxiliary_loss_mlp": 0.01002828, "balance_loss_clip": 1.00083768, "balance_loss_mlp": 1.00072193, "epoch": 0.8328272959567112, "flos": 56057661457920.0, "grad_norm": 0.7632058527128924, "language_loss": 0.55857456, "learning_rate": 2.859741575868344e-07, "loss": 0.57867599, "num_input_tokens_seen": 298855475, "router_z_loss_clip": 0.01989746, "router_z_loss_mlp": 0.06591797, "step": 13852, "time_per_iteration": 2.9974281787872314 }, { "auxiliary_loss_clip": 0.01051257, "auxiliary_loss_mlp": 0.01036364, "balance_loss_clip": 1.01389313, "balance_loss_mlp": 1.0168339, "epoch": 0.8328874192093793, "flos": 32301816451200.0, "grad_norm": 1.616951403917932, "language_loss": 0.68711853, "learning_rate": 2.8577350246076125e-07, "loss": 0.7079947, "num_input_tokens_seen": 298875875, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34375, "step": 13853, "time_per_iteration": 2.475623369216919 }, { "auxiliary_loss_clip": 0.01051995, "auxiliary_loss_mlp": 0.01041526, "balance_loss_clip": 1.01881659, "balance_loss_mlp": 1.01660192, "epoch": 0.8329475424620472, "flos": 23511593934720.0, "grad_norm": 1.607626158603418, "language_loss": 0.79193658, "learning_rate": 2.855729123383286e-07, "loss": 0.81287181, "num_input_tokens_seen": 298895950, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35351562, "step": 13854, "time_per_iteration": 2.450927495956421 }, { "auxiliary_loss_clip": 0.01007551, "auxiliary_loss_mlp": 0.01002352, "balance_loss_clip": 1.0003854, "balance_loss_mlp": 1.00098419, "epoch": 0.8330076657147152, "flos": 67837392028800.0, "grad_norm": 0.7695029478139919, "language_loss": 0.58825493, "learning_rate": 2.8537238722714295e-07, "loss": 0.60835397, "num_input_tokens_seen": 298955770, "router_z_loss_clip": 0.01965332, "router_z_loss_mlp": 0.06542969, "step": 13855, "time_per_iteration": 2.8962531089782715 }, { "auxiliary_loss_clip": 0.01050309, "auxiliary_loss_mlp": 0.01035005, "balance_loss_clip": 1.01236701, "balance_loss_mlp": 1.01550174, "epoch": 0.8330677889673831, "flos": 22891711011840.0, "grad_norm": 1.718937411762079, "language_loss": 0.7298038, "learning_rate": 2.8517192713480853e-07, "loss": 0.75065696, "num_input_tokens_seen": 298976545, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34765625, "step": 13856, "time_per_iteration": 2.3992316722869873 }, { "auxiliary_loss_clip": 0.01052218, "auxiliary_loss_mlp": 0.01037628, "balance_loss_clip": 1.01415598, "balance_loss_mlp": 1.01574838, "epoch": 0.8331279122200511, "flos": 27343800408960.0, "grad_norm": 2.2507125941325574, "language_loss": 0.75970894, "learning_rate": 2.8497153206892677e-07, "loss": 0.7806074, "num_input_tokens_seen": 298996750, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36328125, "step": 13857, "time_per_iteration": 3.6044180393218994 }, { "auxiliary_loss_clip": 0.01048771, "auxiliary_loss_mlp": 0.01029425, "balance_loss_clip": 1.00970781, "balance_loss_mlp": 1.01580274, "epoch": 0.833188035472719, "flos": 19937151095040.0, "grad_norm": 1.518262910820934, "language_loss": 0.7417469, "learning_rate": 2.847712020370958e-07, "loss": 0.7625289, "num_input_tokens_seen": 299014895, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.328125, "step": 13858, "time_per_iteration": 2.363327741622925 }, { "auxiliary_loss_clip": 0.01053647, "auxiliary_loss_mlp": 0.01039735, "balance_loss_clip": 1.01323485, "balance_loss_mlp": 1.01567364, "epoch": 0.833248158725387, "flos": 15231696894720.0, "grad_norm": 1.7705479286423251, "language_loss": 0.74186277, "learning_rate": 2.8457093704691316e-07, "loss": 0.76279652, "num_input_tokens_seen": 299032855, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37890625, "step": 13859, "time_per_iteration": 2.3303451538085938 }, { "auxiliary_loss_clip": 0.01048607, "auxiliary_loss_mlp": 0.01031971, "balance_loss_clip": 1.01156259, "balance_loss_mlp": 1.01520658, "epoch": 0.8333082819780551, "flos": 24534374480640.0, "grad_norm": 2.036425015766899, "language_loss": 0.80098069, "learning_rate": 2.8437073710597205e-07, "loss": 0.82178652, "num_input_tokens_seen": 299052055, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.33398438, "step": 13860, "time_per_iteration": 2.4286410808563232 }, { "auxiliary_loss_clip": 0.01051917, "auxiliary_loss_mlp": 0.01037677, "balance_loss_clip": 1.01527762, "balance_loss_mlp": 1.0159533, "epoch": 0.833368405230723, "flos": 31466065392000.0, "grad_norm": 1.331054852985571, "language_loss": 0.83022416, "learning_rate": 2.841706022218644e-07, "loss": 0.85112011, "num_input_tokens_seen": 299075285, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.359375, "step": 13861, "time_per_iteration": 2.46478533744812 }, { "auxiliary_loss_clip": 0.01052528, "auxiliary_loss_mlp": 0.01037937, "balance_loss_clip": 1.01485777, "balance_loss_mlp": 1.01648378, "epoch": 0.833428528483391, "flos": 14901837569280.0, "grad_norm": 1.7670574805354997, "language_loss": 0.80256557, "learning_rate": 2.839705324021806e-07, "loss": 0.82347023, "num_input_tokens_seen": 299092520, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36132812, "step": 13862, "time_per_iteration": 3.7615230083465576 }, { "auxiliary_loss_clip": 0.01051769, "auxiliary_loss_mlp": 0.01039661, "balance_loss_clip": 1.01413846, "balance_loss_mlp": 1.01547027, "epoch": 0.8334886517360589, "flos": 22198754880000.0, "grad_norm": 1.9675933090824378, "language_loss": 0.76526237, "learning_rate": 2.83770527654505e-07, "loss": 0.78617662, "num_input_tokens_seen": 299109450, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.36328125, "step": 13863, "time_per_iteration": 2.371427536010742 }, { "auxiliary_loss_clip": 0.0105048, "auxiliary_loss_mlp": 0.01037181, "balance_loss_clip": 1.01437664, "balance_loss_mlp": 1.0165329, "epoch": 0.8335487749887269, "flos": 30371258977920.0, "grad_norm": 2.137240418573223, "language_loss": 0.76087868, "learning_rate": 2.835705879864232e-07, "loss": 0.78175527, "num_input_tokens_seen": 299129540, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.33984375, "step": 13864, "time_per_iteration": 2.4306232929229736 }, { "auxiliary_loss_clip": 0.01053348, "auxiliary_loss_mlp": 0.01042266, "balance_loss_clip": 1.01793516, "balance_loss_mlp": 1.01683247, "epoch": 0.8336088982413948, "flos": 24679997228160.0, "grad_norm": 2.1045293964084024, "language_loss": 0.70262861, "learning_rate": 2.833707134055168e-07, "loss": 0.72358477, "num_input_tokens_seen": 299148670, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36523438, "step": 13865, "time_per_iteration": 3.7431747913360596 }, { "auxiliary_loss_clip": 0.01053279, "auxiliary_loss_mlp": 0.01035954, "balance_loss_clip": 1.01255369, "balance_loss_mlp": 1.01772404, "epoch": 0.8336690214940629, "flos": 38175778679040.0, "grad_norm": 1.6168183426959257, "language_loss": 0.76737183, "learning_rate": 2.831709039193653e-07, "loss": 0.78826416, "num_input_tokens_seen": 299169330, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35546875, "step": 13866, "time_per_iteration": 2.500892162322998 }, { "auxiliary_loss_clip": 0.01007416, "auxiliary_loss_mlp": 0.01002361, "balance_loss_clip": 1.00021529, "balance_loss_mlp": 1.00087428, "epoch": 0.8337291447467308, "flos": 55562629023360.0, "grad_norm": 0.8749185653629723, "language_loss": 0.63279253, "learning_rate": 2.8297115953554465e-07, "loss": 0.65289026, "num_input_tokens_seen": 299220980, "router_z_loss_clip": 0.02148438, "router_z_loss_mlp": 0.06542969, "step": 13867, "time_per_iteration": 2.9033100605010986 }, { "auxiliary_loss_clip": 0.01050187, "auxiliary_loss_mlp": 0.01042786, "balance_loss_clip": 1.02086353, "balance_loss_mlp": 1.01578975, "epoch": 0.8337892679993988, "flos": 24132419464320.0, "grad_norm": 1.6520518313967918, "language_loss": 0.73296803, "learning_rate": 2.827714802616301e-07, "loss": 0.75389779, "num_input_tokens_seen": 299240130, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34375, "step": 13868, "time_per_iteration": 2.3808765411376953 }, { "auxiliary_loss_clip": 0.01054169, "auxiliary_loss_mlp": 0.01039363, "balance_loss_clip": 1.0141505, "balance_loss_mlp": 1.0176332, "epoch": 0.8338493912520667, "flos": 28182658579200.0, "grad_norm": 1.4795469785062092, "language_loss": 0.81028068, "learning_rate": 2.8257186610519325e-07, "loss": 0.83121598, "num_input_tokens_seen": 299260705, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36523438, "step": 13869, "time_per_iteration": 2.4742603302001953 }, { "auxiliary_loss_clip": 0.0105124, "auxiliary_loss_mlp": 0.01038653, "balance_loss_clip": 1.01483464, "balance_loss_mlp": 1.01496696, "epoch": 0.8339095145047347, "flos": 22157417963520.0, "grad_norm": 1.5557233387619653, "language_loss": 0.83645785, "learning_rate": 2.823723170738028e-07, "loss": 0.85735673, "num_input_tokens_seen": 299278925, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36328125, "step": 13870, "time_per_iteration": 2.3889806270599365 }, { "auxiliary_loss_clip": 0.01052185, "auxiliary_loss_mlp": 0.0103374, "balance_loss_clip": 1.01084054, "balance_loss_mlp": 1.01573837, "epoch": 0.8339696377574026, "flos": 17306271192960.0, "grad_norm": 2.414769402610643, "language_loss": 0.72599822, "learning_rate": 2.821728331750264e-07, "loss": 0.74685746, "num_input_tokens_seen": 299291580, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36523438, "step": 13871, "time_per_iteration": 2.3129358291625977 }, { "auxiliary_loss_clip": 0.01051389, "auxiliary_loss_mlp": 0.01036996, "balance_loss_clip": 1.01559842, "balance_loss_mlp": 1.01644945, "epoch": 0.8340297610100706, "flos": 20667289691520.0, "grad_norm": 1.6748976927576198, "language_loss": 0.69694996, "learning_rate": 2.8197341441642853e-07, "loss": 0.71783388, "num_input_tokens_seen": 299310385, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34960938, "step": 13872, "time_per_iteration": 2.3589656352996826 }, { "auxiliary_loss_clip": 0.01051273, "auxiliary_loss_mlp": 0.01034855, "balance_loss_clip": 1.01190734, "balance_loss_mlp": 1.01558411, "epoch": 0.8340898842627387, "flos": 20514579937920.0, "grad_norm": 2.0414003822902194, "language_loss": 0.74736977, "learning_rate": 2.817740608055712e-07, "loss": 0.76823103, "num_input_tokens_seen": 299327660, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35546875, "step": 13873, "time_per_iteration": 2.4185218811035156 }, { "auxiliary_loss_clip": 0.01051929, "auxiliary_loss_mlp": 0.01041247, "balance_loss_clip": 1.01515222, "balance_loss_mlp": 1.01560402, "epoch": 0.8341500075154066, "flos": 21425010128640.0, "grad_norm": 2.9835600588287754, "language_loss": 0.75932515, "learning_rate": 2.81574772350013e-07, "loss": 0.78025693, "num_input_tokens_seen": 299343685, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.36328125, "step": 13874, "time_per_iteration": 2.3684730529785156 }, { "auxiliary_loss_clip": 0.01049946, "auxiliary_loss_mlp": 0.01031804, "balance_loss_clip": 1.00992894, "balance_loss_mlp": 1.01582074, "epoch": 0.8342101307680746, "flos": 22089895660800.0, "grad_norm": 2.0597759068351715, "language_loss": 0.67370868, "learning_rate": 2.813755490573118e-07, "loss": 0.6945262, "num_input_tokens_seen": 299363305, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34179688, "step": 13875, "time_per_iteration": 2.385450839996338 }, { "auxiliary_loss_clip": 0.01051796, "auxiliary_loss_mlp": 0.01039265, "balance_loss_clip": 1.01667428, "balance_loss_mlp": 1.0168221, "epoch": 0.8342702540207425, "flos": 21870396743040.0, "grad_norm": 1.6217109127490692, "language_loss": 0.80670393, "learning_rate": 2.8117639093502243e-07, "loss": 0.82761455, "num_input_tokens_seen": 299382630, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34960938, "step": 13876, "time_per_iteration": 2.364272356033325 }, { "auxiliary_loss_clip": 0.01051406, "auxiliary_loss_mlp": 0.01035716, "balance_loss_clip": 1.01331711, "balance_loss_mlp": 1.01632881, "epoch": 0.8343303772734105, "flos": 22527392307840.0, "grad_norm": 1.9022099504369743, "language_loss": 0.88698155, "learning_rate": 2.8097729799069615e-07, "loss": 0.90785277, "num_input_tokens_seen": 299402385, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34960938, "step": 13877, "time_per_iteration": 2.461705207824707 }, { "auxiliary_loss_clip": 0.01050939, "auxiliary_loss_mlp": 0.01034652, "balance_loss_clip": 1.0138973, "balance_loss_mlp": 1.01583731, "epoch": 0.8343905005260784, "flos": 14938880388480.0, "grad_norm": 2.0013063118620558, "language_loss": 0.70331621, "learning_rate": 2.807782702318828e-07, "loss": 0.72417212, "num_input_tokens_seen": 299419820, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.3515625, "step": 13878, "time_per_iteration": 2.3225207328796387 }, { "auxiliary_loss_clip": 0.01049561, "auxiliary_loss_mlp": 0.01032897, "balance_loss_clip": 1.01093817, "balance_loss_mlp": 1.01554716, "epoch": 0.8344506237787465, "flos": 15011569572480.0, "grad_norm": 2.400869835659916, "language_loss": 0.80274403, "learning_rate": 2.805793076661309e-07, "loss": 0.82356864, "num_input_tokens_seen": 299436265, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.33984375, "step": 13879, "time_per_iteration": 3.6919219493865967 }, { "auxiliary_loss_clip": 0.01050877, "auxiliary_loss_mlp": 0.01036477, "balance_loss_clip": 1.01448274, "balance_loss_mlp": 1.01529372, "epoch": 0.8345107470314144, "flos": 17559601084800.0, "grad_norm": 2.4256216190546738, "language_loss": 0.84652543, "learning_rate": 2.803804103009828e-07, "loss": 0.86739898, "num_input_tokens_seen": 299451660, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35546875, "step": 13880, "time_per_iteration": 2.3245058059692383 }, { "auxiliary_loss_clip": 0.01053597, "auxiliary_loss_mlp": 0.01039953, "balance_loss_clip": 1.0172677, "balance_loss_mlp": 1.01646733, "epoch": 0.8345708702840824, "flos": 25186238075520.0, "grad_norm": 1.5331472946042062, "language_loss": 0.78905255, "learning_rate": 2.80181578143982e-07, "loss": 0.80998802, "num_input_tokens_seen": 299472070, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.37109375, "step": 13881, "time_per_iteration": 2.4143354892730713 }, { "auxiliary_loss_clip": 0.01048094, "auxiliary_loss_mlp": 0.01035096, "balance_loss_clip": 1.01524746, "balance_loss_mlp": 1.01501179, "epoch": 0.8346309935367503, "flos": 15082722656640.0, "grad_norm": 2.709410250725937, "language_loss": 0.79915601, "learning_rate": 2.7998281120266807e-07, "loss": 0.81998789, "num_input_tokens_seen": 299486725, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.33007812, "step": 13882, "time_per_iteration": 2.3307859897613525 }, { "auxiliary_loss_clip": 0.01051673, "auxiliary_loss_mlp": 0.01040785, "balance_loss_clip": 1.01712251, "balance_loss_mlp": 1.01580751, "epoch": 0.8346911167894183, "flos": 22929486969600.0, "grad_norm": 1.7886800399754492, "language_loss": 0.81691611, "learning_rate": 2.79784109484579e-07, "loss": 0.83784068, "num_input_tokens_seen": 299505435, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.359375, "step": 13883, "time_per_iteration": 2.3828089237213135 }, { "auxiliary_loss_clip": 0.01051721, "auxiliary_loss_mlp": 0.01037103, "balance_loss_clip": 1.01228333, "balance_loss_mlp": 1.01560628, "epoch": 0.8347512400420862, "flos": 20192017086720.0, "grad_norm": 2.9994701895846125, "language_loss": 0.7552464, "learning_rate": 2.795854729972482e-07, "loss": 0.77613461, "num_input_tokens_seen": 299523555, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36132812, "step": 13884, "time_per_iteration": 2.3497042655944824 }, { "auxiliary_loss_clip": 0.01056479, "auxiliary_loss_mlp": 0.01048041, "balance_loss_clip": 1.02101588, "balance_loss_mlp": 1.01739883, "epoch": 0.8348113632947542, "flos": 25953733693440.0, "grad_norm": 1.832345232783009, "language_loss": 0.71166587, "learning_rate": 2.7938690174820913e-07, "loss": 0.73271108, "num_input_tokens_seen": 299541660, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.390625, "step": 13885, "time_per_iteration": 2.5710909366607666 }, { "auxiliary_loss_clip": 0.01053321, "auxiliary_loss_mlp": 0.01033094, "balance_loss_clip": 1.01090932, "balance_loss_mlp": 1.01655674, "epoch": 0.8348714865474223, "flos": 34203116338560.0, "grad_norm": 1.6712098153287573, "language_loss": 0.71574938, "learning_rate": 2.791883957449912e-07, "loss": 0.73661351, "num_input_tokens_seen": 299562465, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3671875, "step": 13886, "time_per_iteration": 2.500521421432495 }, { "auxiliary_loss_clip": 0.01050317, "auxiliary_loss_mlp": 0.01035897, "balance_loss_clip": 1.01198351, "balance_loss_mlp": 1.01512194, "epoch": 0.8349316098000902, "flos": 24388961201280.0, "grad_norm": 1.5188370331110352, "language_loss": 0.7985267, "learning_rate": 2.7898995499512134e-07, "loss": 0.81938887, "num_input_tokens_seen": 299582700, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.3515625, "step": 13887, "time_per_iteration": 2.502025842666626 }, { "auxiliary_loss_clip": 0.01054284, "auxiliary_loss_mlp": 0.01036766, "balance_loss_clip": 1.01218545, "balance_loss_mlp": 1.01665366, "epoch": 0.8349917330527582, "flos": 23031817764480.0, "grad_norm": 2.34324001260461, "language_loss": 0.66526616, "learning_rate": 2.7879157950612467e-07, "loss": 0.68617666, "num_input_tokens_seen": 299600310, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.375, "step": 13888, "time_per_iteration": 2.361043930053711 }, { "auxiliary_loss_clip": 0.01053494, "auxiliary_loss_mlp": 0.01037398, "balance_loss_clip": 1.01340139, "balance_loss_mlp": 1.01623702, "epoch": 0.8350518563054261, "flos": 13625028904320.0, "grad_norm": 2.0966431601965523, "language_loss": 0.69187278, "learning_rate": 2.785932692855244e-07, "loss": 0.71278167, "num_input_tokens_seen": 299617025, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37109375, "step": 13889, "time_per_iteration": 2.4941892623901367 }, { "auxiliary_loss_clip": 0.0105091, "auxiliary_loss_mlp": 0.01035062, "balance_loss_clip": 1.01192319, "balance_loss_mlp": 1.01571953, "epoch": 0.8351119795580941, "flos": 21578732311680.0, "grad_norm": 1.8000607853089268, "language_loss": 0.69868827, "learning_rate": 2.783950243408399e-07, "loss": 0.71954799, "num_input_tokens_seen": 299633050, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.3515625, "step": 13890, "time_per_iteration": 2.413825511932373 }, { "auxiliary_loss_clip": 0.01052934, "auxiliary_loss_mlp": 0.01038765, "balance_loss_clip": 1.01468468, "balance_loss_mlp": 1.01626945, "epoch": 0.835172102810762, "flos": 20037526853760.0, "grad_norm": 2.301841993172882, "language_loss": 0.60690153, "learning_rate": 2.7819684467958817e-07, "loss": 0.62781852, "num_input_tokens_seen": 299646445, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.3671875, "step": 13891, "time_per_iteration": 2.4797375202178955 }, { "auxiliary_loss_clip": 0.01052639, "auxiliary_loss_mlp": 0.01037957, "balance_loss_clip": 1.01578426, "balance_loss_mlp": 1.01658893, "epoch": 0.8352322260634301, "flos": 25110616337280.0, "grad_norm": 1.591332594816603, "language_loss": 0.72557354, "learning_rate": 2.779987303092846e-07, "loss": 0.74647951, "num_input_tokens_seen": 299662665, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.36132812, "step": 13892, "time_per_iteration": 2.3765666484832764 }, { "auxiliary_loss_clip": 0.01049991, "auxiliary_loss_mlp": 0.01038988, "balance_loss_clip": 1.01558685, "balance_loss_mlp": 1.01572239, "epoch": 0.835292349316098, "flos": 24862592972160.0, "grad_norm": 1.7579176608943043, "language_loss": 0.67234182, "learning_rate": 2.7780068123744207e-07, "loss": 0.69323164, "num_input_tokens_seen": 299683585, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.34179688, "step": 13893, "time_per_iteration": 2.429600954055786 }, { "auxiliary_loss_clip": 0.01050239, "auxiliary_loss_mlp": 0.01033293, "balance_loss_clip": 1.01187074, "balance_loss_mlp": 1.01516163, "epoch": 0.835352472568766, "flos": 19864531733760.0, "grad_norm": 1.973268814451002, "language_loss": 0.79320294, "learning_rate": 2.7760269747156996e-07, "loss": 0.81403828, "num_input_tokens_seen": 299702680, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.3515625, "step": 13894, "time_per_iteration": 2.359849691390991 }, { "auxiliary_loss_clip": 0.01051103, "auxiliary_loss_mlp": 0.0103262, "balance_loss_clip": 1.01051879, "balance_loss_mlp": 1.01722026, "epoch": 0.8354125958214339, "flos": 22053655802880.0, "grad_norm": 1.6854776551188797, "language_loss": 0.73884588, "learning_rate": 2.7740477901917625e-07, "loss": 0.75968313, "num_input_tokens_seen": 299721050, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.33789062, "step": 13895, "time_per_iteration": 2.3930459022521973 }, { "auxiliary_loss_clip": 0.01051904, "auxiliary_loss_mlp": 0.01041491, "balance_loss_clip": 1.01382232, "balance_loss_mlp": 1.01565838, "epoch": 0.8354727190741019, "flos": 21396730060800.0, "grad_norm": 2.4505439661467463, "language_loss": 0.72847283, "learning_rate": 2.772069258877667e-07, "loss": 0.7494067, "num_input_tokens_seen": 299738255, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.36328125, "step": 13896, "time_per_iteration": 3.6465444564819336 }, { "auxiliary_loss_clip": 0.01050907, "auxiliary_loss_mlp": 0.01030931, "balance_loss_clip": 1.00905585, "balance_loss_mlp": 1.0159781, "epoch": 0.8355328423267698, "flos": 50839125649920.0, "grad_norm": 2.2807018633487703, "language_loss": 0.60111749, "learning_rate": 2.770091380848423e-07, "loss": 0.6219359, "num_input_tokens_seen": 299761315, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34960938, "step": 13897, "time_per_iteration": 2.63702130317688 }, { "auxiliary_loss_clip": 0.01007418, "auxiliary_loss_mlp": 0.01002538, "balance_loss_clip": 1.00041604, "balance_loss_mlp": 1.00089371, "epoch": 0.8355929655794379, "flos": 65547577998720.0, "grad_norm": 0.6947303130168239, "language_loss": 0.57659686, "learning_rate": 2.7681141561790423e-07, "loss": 0.59669644, "num_input_tokens_seen": 299828735, "router_z_loss_clip": 0.02124023, "router_z_loss_mlp": 0.06542969, "step": 13898, "time_per_iteration": 3.039914846420288 }, { "auxiliary_loss_clip": 0.01053838, "auxiliary_loss_mlp": 0.01040399, "balance_loss_clip": 1.01659274, "balance_loss_mlp": 1.01673758, "epoch": 0.8356530888321058, "flos": 19169550743040.0, "grad_norm": 2.803886389906241, "language_loss": 0.80977094, "learning_rate": 2.7661375849444967e-07, "loss": 0.83071333, "num_input_tokens_seen": 299848395, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 13899, "time_per_iteration": 2.3612303733825684 }, { "auxiliary_loss_clip": 0.01052108, "auxiliary_loss_mlp": 0.01037228, "balance_loss_clip": 1.01450694, "balance_loss_mlp": 1.01624608, "epoch": 0.8357132120847738, "flos": 44125013531520.0, "grad_norm": 2.027063614629441, "language_loss": 0.69904768, "learning_rate": 2.764161667219749e-07, "loss": 0.71994108, "num_input_tokens_seen": 299871665, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.359375, "step": 13900, "time_per_iteration": 2.5547778606414795 }, { "auxiliary_loss_clip": 0.01052547, "auxiliary_loss_mlp": 0.0103339, "balance_loss_clip": 1.0128262, "balance_loss_mlp": 1.01708972, "epoch": 0.8357733353374418, "flos": 24388542264960.0, "grad_norm": 1.399338992702125, "language_loss": 0.72139406, "learning_rate": 2.762186403079716e-07, "loss": 0.74225342, "num_input_tokens_seen": 299891960, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.35351562, "step": 13901, "time_per_iteration": 2.399590015411377 }, { "auxiliary_loss_clip": 0.01054779, "auxiliary_loss_mlp": 0.01044885, "balance_loss_clip": 1.01985109, "balance_loss_mlp": 1.01700246, "epoch": 0.8358334585901097, "flos": 20915452702080.0, "grad_norm": 2.1904254446554736, "language_loss": 0.80732137, "learning_rate": 2.7602117925992963e-07, "loss": 0.82831806, "num_input_tokens_seen": 299905070, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37695312, "step": 13902, "time_per_iteration": 3.7852110862731934 }, { "auxiliary_loss_clip": 0.01050087, "auxiliary_loss_mlp": 0.01033963, "balance_loss_clip": 1.01142097, "balance_loss_mlp": 1.01582122, "epoch": 0.8358935818427777, "flos": 19243182533760.0, "grad_norm": 1.4716145059181194, "language_loss": 0.63317645, "learning_rate": 2.758237835853379e-07, "loss": 0.65401697, "num_input_tokens_seen": 299925130, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34179688, "step": 13903, "time_per_iteration": 2.34626841545105 }, { "auxiliary_loss_clip": 0.01050999, "auxiliary_loss_mlp": 0.01033419, "balance_loss_clip": 1.0122354, "balance_loss_mlp": 1.01555622, "epoch": 0.8359537050954456, "flos": 24132908223360.0, "grad_norm": 1.8278020579347583, "language_loss": 0.75478101, "learning_rate": 2.7562645329168054e-07, "loss": 0.77562517, "num_input_tokens_seen": 299943845, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.35351562, "step": 13904, "time_per_iteration": 2.3883681297302246 }, { "auxiliary_loss_clip": 0.01049572, "auxiliary_loss_mlp": 0.0103708, "balance_loss_clip": 1.0133332, "balance_loss_mlp": 1.01484656, "epoch": 0.8360138283481137, "flos": 16179484106880.0, "grad_norm": 1.6858745803138315, "language_loss": 0.7345351, "learning_rate": 2.7542918838644104e-07, "loss": 0.75540161, "num_input_tokens_seen": 299961620, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.34765625, "step": 13905, "time_per_iteration": 3.725339412689209 }, { "auxiliary_loss_clip": 0.0105189, "auxiliary_loss_mlp": 0.01036128, "balance_loss_clip": 1.01382375, "balance_loss_mlp": 1.01712859, "epoch": 0.8360739516007816, "flos": 22197847184640.0, "grad_norm": 1.5090764797846925, "language_loss": 0.67245924, "learning_rate": 2.752319888771e-07, "loss": 0.69333947, "num_input_tokens_seen": 299982170, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34765625, "step": 13906, "time_per_iteration": 2.3862814903259277 }, { "auxiliary_loss_clip": 0.01052412, "auxiliary_loss_mlp": 0.01035042, "balance_loss_clip": 1.0133698, "balance_loss_mlp": 1.01675153, "epoch": 0.8361340748534496, "flos": 20922085860480.0, "grad_norm": 1.459778482270787, "language_loss": 0.74776208, "learning_rate": 2.7503485477113475e-07, "loss": 0.76863658, "num_input_tokens_seen": 300001330, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.35742188, "step": 13907, "time_per_iteration": 2.370638847351074 }, { "auxiliary_loss_clip": 0.01051446, "auxiliary_loss_mlp": 0.01045327, "balance_loss_clip": 1.0208298, "balance_loss_mlp": 1.0151664, "epoch": 0.8361941981061175, "flos": 26172499472640.0, "grad_norm": 2.0680055206534247, "language_loss": 0.76231468, "learning_rate": 2.7483778607602005e-07, "loss": 0.7832824, "num_input_tokens_seen": 300020645, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36328125, "step": 13908, "time_per_iteration": 2.5035171508789062 }, { "auxiliary_loss_clip": 0.01053181, "auxiliary_loss_mlp": 0.01039568, "balance_loss_clip": 1.01565421, "balance_loss_mlp": 1.01657963, "epoch": 0.8362543213587855, "flos": 24418393344000.0, "grad_norm": 3.8963499257245107, "language_loss": 0.72963989, "learning_rate": 2.7464078279922964e-07, "loss": 0.75056738, "num_input_tokens_seen": 300039945, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36523438, "step": 13909, "time_per_iteration": 2.404506206512451 }, { "auxiliary_loss_clip": 0.0105262, "auxiliary_loss_mlp": 0.01042904, "balance_loss_clip": 1.01857328, "balance_loss_mlp": 1.01569819, "epoch": 0.8363144446114534, "flos": 17201426780160.0, "grad_norm": 2.0349596519876885, "language_loss": 0.74864256, "learning_rate": 2.744438449482338e-07, "loss": 0.76959789, "num_input_tokens_seen": 300058260, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36914062, "step": 13910, "time_per_iteration": 2.360766887664795 }, { "auxiliary_loss_clip": 0.01052878, "auxiliary_loss_mlp": 0.01032987, "balance_loss_clip": 1.01135027, "balance_loss_mlp": 1.01672399, "epoch": 0.8363745678641215, "flos": 19278444873600.0, "grad_norm": 1.7741762943482722, "language_loss": 0.73987514, "learning_rate": 2.742469725305001e-07, "loss": 0.76073378, "num_input_tokens_seen": 300076720, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.36132812, "step": 13911, "time_per_iteration": 2.353569507598877 }, { "auxiliary_loss_clip": 0.01054392, "auxiliary_loss_mlp": 0.01042359, "balance_loss_clip": 1.01829064, "balance_loss_mlp": 1.01763332, "epoch": 0.8364346911167894, "flos": 11874064798080.0, "grad_norm": 1.9483510845309961, "language_loss": 0.79688179, "learning_rate": 2.740501655534946e-07, "loss": 0.81784928, "num_input_tokens_seen": 300092950, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 13912, "time_per_iteration": 2.360982656478882 }, { "auxiliary_loss_clip": 0.01051668, "auxiliary_loss_mlp": 0.01034405, "balance_loss_clip": 1.01256561, "balance_loss_mlp": 1.01559186, "epoch": 0.8364948143694574, "flos": 20224311960960.0, "grad_norm": 1.6414312137101574, "language_loss": 0.79737759, "learning_rate": 2.738534240246797e-07, "loss": 0.81823838, "num_input_tokens_seen": 300110950, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.36132812, "step": 13913, "time_per_iteration": 2.3668673038482666 }, { "auxiliary_loss_clip": 0.01051433, "auxiliary_loss_mlp": 0.01041591, "balance_loss_clip": 1.01720071, "balance_loss_mlp": 1.01536632, "epoch": 0.8365549376221254, "flos": 21611934881280.0, "grad_norm": 1.8384597020111357, "language_loss": 0.75360614, "learning_rate": 2.736567479515153e-07, "loss": 0.77453637, "num_input_tokens_seen": 300128705, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.359375, "step": 13914, "time_per_iteration": 2.4084463119506836 }, { "auxiliary_loss_clip": 0.0105122, "auxiliary_loss_mlp": 0.01038352, "balance_loss_clip": 1.01521349, "balance_loss_mlp": 1.01606119, "epoch": 0.8366150608747933, "flos": 23293107446400.0, "grad_norm": 1.54286108915256, "language_loss": 0.72427046, "learning_rate": 2.7346013734146025e-07, "loss": 0.74516612, "num_input_tokens_seen": 300148635, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.3515625, "step": 13915, "time_per_iteration": 2.3849143981933594 }, { "auxiliary_loss_clip": 0.01052102, "auxiliary_loss_mlp": 0.01038074, "balance_loss_clip": 1.01519728, "balance_loss_mlp": 1.01575112, "epoch": 0.8366751841274613, "flos": 15266784677760.0, "grad_norm": 1.8348290888952778, "language_loss": 0.73952836, "learning_rate": 2.7326359220197035e-07, "loss": 0.76043016, "num_input_tokens_seen": 300165490, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36328125, "step": 13916, "time_per_iteration": 2.36014461517334 }, { "auxiliary_loss_clip": 0.01053341, "auxiliary_loss_mlp": 0.01036491, "balance_loss_clip": 1.01262581, "balance_loss_mlp": 1.01666284, "epoch": 0.8367353073801292, "flos": 13224086317440.0, "grad_norm": 1.8285629309249507, "language_loss": 0.75942498, "learning_rate": 2.7306711254049755e-07, "loss": 0.78032333, "num_input_tokens_seen": 300182130, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3671875, "step": 13917, "time_per_iteration": 2.3376317024230957 }, { "auxiliary_loss_clip": 0.01048256, "auxiliary_loss_mlp": 0.01033062, "balance_loss_clip": 1.01192617, "balance_loss_mlp": 1.01576781, "epoch": 0.8367954306327973, "flos": 24203991484800.0, "grad_norm": 1.4400010493707238, "language_loss": 0.80111629, "learning_rate": 2.728706983644933e-07, "loss": 0.82192945, "num_input_tokens_seen": 300203050, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.32421875, "step": 13918, "time_per_iteration": 2.3873038291931152 }, { "auxiliary_loss_clip": 0.01051422, "auxiliary_loss_mlp": 0.01037113, "balance_loss_clip": 1.01453435, "balance_loss_mlp": 1.01594412, "epoch": 0.8368555538854652, "flos": 24533606430720.0, "grad_norm": 1.8376973891348236, "language_loss": 0.69317234, "learning_rate": 2.7267434968140457e-07, "loss": 0.71405768, "num_input_tokens_seen": 300224380, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35546875, "step": 13919, "time_per_iteration": 3.846771001815796 }, { "auxiliary_loss_clip": 0.01049931, "auxiliary_loss_mlp": 0.01034536, "balance_loss_clip": 1.01256537, "balance_loss_mlp": 1.01512003, "epoch": 0.8369156771381332, "flos": 20258526960000.0, "grad_norm": 1.7972019550509266, "language_loss": 0.75026494, "learning_rate": 2.7247806649867835e-07, "loss": 0.77110958, "num_input_tokens_seen": 300242915, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34765625, "step": 13920, "time_per_iteration": 2.3838131427764893 }, { "auxiliary_loss_clip": 0.01051708, "auxiliary_loss_mlp": 0.01039981, "balance_loss_clip": 1.01610303, "balance_loss_mlp": 1.01517367, "epoch": 0.8369758003908011, "flos": 21834471087360.0, "grad_norm": 1.6705689181276333, "language_loss": 0.70048988, "learning_rate": 2.722818488237566e-07, "loss": 0.7214067, "num_input_tokens_seen": 300261905, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36523438, "step": 13921, "time_per_iteration": 2.3906428813934326 }, { "auxiliary_loss_clip": 0.01052856, "auxiliary_loss_mlp": 0.01041671, "balance_loss_clip": 1.01904511, "balance_loss_mlp": 1.01623964, "epoch": 0.8370359236434691, "flos": 21718420128000.0, "grad_norm": 2.088615072506136, "language_loss": 0.86441517, "learning_rate": 2.720856966640801e-07, "loss": 0.88536042, "num_input_tokens_seen": 300281145, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36523438, "step": 13922, "time_per_iteration": 2.3818106651306152 }, { "auxiliary_loss_clip": 0.01050404, "auxiliary_loss_mlp": 0.01034769, "balance_loss_clip": 1.01381218, "balance_loss_mlp": 1.01610684, "epoch": 0.837096046896137, "flos": 23147763989760.0, "grad_norm": 1.6348824005361218, "language_loss": 0.72524655, "learning_rate": 2.71889610027088e-07, "loss": 0.74609828, "num_input_tokens_seen": 300301610, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.34179688, "step": 13923, "time_per_iteration": 2.3598484992980957 }, { "auxiliary_loss_clip": 0.01050152, "auxiliary_loss_mlp": 0.01035388, "balance_loss_clip": 1.01338243, "balance_loss_mlp": 1.01580787, "epoch": 0.8371561701488051, "flos": 24491885489280.0, "grad_norm": 1.842089625101855, "language_loss": 0.77452481, "learning_rate": 2.7169358892021433e-07, "loss": 0.79538023, "num_input_tokens_seen": 300319420, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34375, "step": 13924, "time_per_iteration": 2.404940366744995 }, { "auxiliary_loss_clip": 0.01051247, "auxiliary_loss_mlp": 0.01035027, "balance_loss_clip": 1.01380813, "balance_loss_mlp": 1.01670694, "epoch": 0.837216293401473, "flos": 29205404213760.0, "grad_norm": 1.5831476758720693, "language_loss": 0.66194391, "learning_rate": 2.7149763335089293e-07, "loss": 0.68280661, "num_input_tokens_seen": 300341325, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34570312, "step": 13925, "time_per_iteration": 2.419646978378296 }, { "auxiliary_loss_clip": 0.01052655, "auxiliary_loss_mlp": 0.01039801, "balance_loss_clip": 1.01506472, "balance_loss_mlp": 1.01616478, "epoch": 0.837276416654141, "flos": 25264094140800.0, "grad_norm": 1.579363095954496, "language_loss": 0.75717258, "learning_rate": 2.713017433265543e-07, "loss": 0.77809715, "num_input_tokens_seen": 300361620, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.36523438, "step": 13926, "time_per_iteration": 2.4070005416870117 }, { "auxiliary_loss_clip": 0.01052404, "auxiliary_loss_mlp": 0.01035281, "balance_loss_clip": 1.01266646, "balance_loss_mlp": 1.01707458, "epoch": 0.837336539906809, "flos": 13881151704960.0, "grad_norm": 1.7330251614017704, "language_loss": 0.72802353, "learning_rate": 2.711059188546274e-07, "loss": 0.74890035, "num_input_tokens_seen": 300378675, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35351562, "step": 13927, "time_per_iteration": 2.358887195587158 }, { "auxiliary_loss_clip": 0.01007112, "auxiliary_loss_mlp": 0.01003964, "balance_loss_clip": 1.00185394, "balance_loss_mlp": 1.00055981, "epoch": 0.8373966631594769, "flos": 68867050112640.0, "grad_norm": 0.7005500572806644, "language_loss": 0.58968961, "learning_rate": 2.7091015994253695e-07, "loss": 0.60980034, "num_input_tokens_seen": 300449740, "router_z_loss_clip": 0.02111816, "router_z_loss_mlp": 0.06542969, "step": 13928, "time_per_iteration": 3.165985584259033 }, { "auxiliary_loss_clip": 0.01053132, "auxiliary_loss_mlp": 0.01038535, "balance_loss_clip": 1.01459813, "balance_loss_mlp": 1.01753008, "epoch": 0.8374567864121449, "flos": 20447930419200.0, "grad_norm": 1.6964071671264327, "language_loss": 0.7000308, "learning_rate": 2.707144665977068e-07, "loss": 0.72094744, "num_input_tokens_seen": 300470000, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.35546875, "step": 13929, "time_per_iteration": 2.3797028064727783 }, { "auxiliary_loss_clip": 0.01053941, "auxiliary_loss_mlp": 0.01036233, "balance_loss_clip": 1.01186669, "balance_loss_mlp": 1.01651883, "epoch": 0.8375169096648128, "flos": 41902512336000.0, "grad_norm": 1.4712845365888085, "language_loss": 0.68311983, "learning_rate": 2.705188388275574e-07, "loss": 0.70402157, "num_input_tokens_seen": 300494975, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.375, "step": 13930, "time_per_iteration": 2.591395378112793 }, { "auxiliary_loss_clip": 0.01052159, "auxiliary_loss_mlp": 0.01033097, "balance_loss_clip": 1.01116252, "balance_loss_mlp": 1.01706719, "epoch": 0.8375770329174809, "flos": 20008374001920.0, "grad_norm": 1.7228012173533487, "language_loss": 0.72717243, "learning_rate": 2.703232766395067e-07, "loss": 0.748025, "num_input_tokens_seen": 300513175, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.3515625, "step": 13931, "time_per_iteration": 2.3748037815093994 }, { "auxiliary_loss_clip": 0.01049732, "auxiliary_loss_mlp": 0.01033868, "balance_loss_clip": 1.01177812, "balance_loss_mlp": 1.01494157, "epoch": 0.8376371561701488, "flos": 22782502679040.0, "grad_norm": 2.073179889007315, "language_loss": 0.72849226, "learning_rate": 2.701277800409705e-07, "loss": 0.74932826, "num_input_tokens_seen": 300533770, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34765625, "step": 13932, "time_per_iteration": 2.393982172012329 }, { "auxiliary_loss_clip": 0.01050707, "auxiliary_loss_mlp": 0.01034589, "balance_loss_clip": 1.01375103, "balance_loss_mlp": 1.01548719, "epoch": 0.8376972794228168, "flos": 23913339482880.0, "grad_norm": 2.6770667235520453, "language_loss": 0.68100238, "learning_rate": 2.699323490393628e-07, "loss": 0.70185536, "num_input_tokens_seen": 300552995, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.3515625, "step": 13933, "time_per_iteration": 2.4285342693328857 }, { "auxiliary_loss_clip": 0.01051725, "auxiliary_loss_mlp": 0.01039165, "balance_loss_clip": 1.01839852, "balance_loss_mlp": 1.01677752, "epoch": 0.8377574026754847, "flos": 13733888123520.0, "grad_norm": 2.001974996568261, "language_loss": 0.77769417, "learning_rate": 2.697369836420933e-07, "loss": 0.79860312, "num_input_tokens_seen": 300570275, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.34960938, "step": 13934, "time_per_iteration": 2.427338123321533 }, { "auxiliary_loss_clip": 0.01053315, "auxiliary_loss_mlp": 0.01039547, "balance_loss_clip": 1.01700425, "balance_loss_mlp": 1.01861596, "epoch": 0.8378175259281527, "flos": 21650304332160.0, "grad_norm": 1.4839032088989714, "language_loss": 0.78045487, "learning_rate": 2.6954168385657115e-07, "loss": 0.8013835, "num_input_tokens_seen": 300590875, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34765625, "step": 13935, "time_per_iteration": 2.3798108100891113 }, { "auxiliary_loss_clip": 0.01050684, "auxiliary_loss_mlp": 0.01034393, "balance_loss_clip": 1.01160049, "balance_loss_mlp": 1.01509666, "epoch": 0.8378776491808206, "flos": 15447949056000.0, "grad_norm": 2.617736194582649, "language_loss": 0.58006477, "learning_rate": 2.6934644969020135e-07, "loss": 0.60091555, "num_input_tokens_seen": 300607490, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35546875, "step": 13936, "time_per_iteration": 3.635401725769043 }, { "auxiliary_loss_clip": 0.01050669, "auxiliary_loss_mlp": 0.01037574, "balance_loss_clip": 1.01590133, "balance_loss_mlp": 1.01627624, "epoch": 0.8379377724334887, "flos": 14719521116160.0, "grad_norm": 1.9219422532484516, "language_loss": 0.90483451, "learning_rate": 2.691512811503882e-07, "loss": 0.92571694, "num_input_tokens_seen": 300623635, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34375, "step": 13937, "time_per_iteration": 2.3338940143585205 }, { "auxiliary_loss_clip": 0.01052501, "auxiliary_loss_mlp": 0.01034287, "balance_loss_clip": 1.01147008, "balance_loss_mlp": 1.01689625, "epoch": 0.8379978956861566, "flos": 24534095189760.0, "grad_norm": 1.8980343801774786, "language_loss": 0.82361597, "learning_rate": 2.689561782445313e-07, "loss": 0.84448385, "num_input_tokens_seen": 300643835, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35546875, "step": 13938, "time_per_iteration": 2.3976593017578125 }, { "auxiliary_loss_clip": 0.01053715, "auxiliary_loss_mlp": 0.01038083, "balance_loss_clip": 1.01395547, "balance_loss_mlp": 1.01655865, "epoch": 0.8380580189388246, "flos": 18951622836480.0, "grad_norm": 1.653898703961682, "language_loss": 0.71325374, "learning_rate": 2.6876114098002965e-07, "loss": 0.73417169, "num_input_tokens_seen": 300662500, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.37109375, "step": 13939, "time_per_iteration": 2.352163076400757 }, { "auxiliary_loss_clip": 0.01054696, "auxiliary_loss_mlp": 0.01039502, "balance_loss_clip": 1.01569617, "balance_loss_mlp": 1.01797056, "epoch": 0.8381181421914926, "flos": 26539122326400.0, "grad_norm": 1.6429745656731911, "language_loss": 0.7724036, "learning_rate": 2.6856616936428e-07, "loss": 0.79334563, "num_input_tokens_seen": 300681480, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3671875, "step": 13940, "time_per_iteration": 2.466586112976074 }, { "auxiliary_loss_clip": 0.01051471, "auxiliary_loss_mlp": 0.01038892, "balance_loss_clip": 1.01619411, "balance_loss_mlp": 1.01607919, "epoch": 0.8381782654441605, "flos": 23290454183040.0, "grad_norm": 1.5766175909662246, "language_loss": 0.77832818, "learning_rate": 2.6837126340467374e-07, "loss": 0.79923183, "num_input_tokens_seen": 300699165, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35351562, "step": 13941, "time_per_iteration": 2.4094760417938232 }, { "auxiliary_loss_clip": 0.01053959, "auxiliary_loss_mlp": 0.01039437, "balance_loss_clip": 1.01335359, "balance_loss_mlp": 1.01619208, "epoch": 0.8382383886968285, "flos": 26757643726080.0, "grad_norm": 2.012550227143333, "language_loss": 0.73967361, "learning_rate": 2.6817642310860276e-07, "loss": 0.7606076, "num_input_tokens_seen": 300714615, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37890625, "step": 13942, "time_per_iteration": 3.877239942550659 }, { "auxiliary_loss_clip": 0.01055951, "auxiliary_loss_mlp": 0.01044226, "balance_loss_clip": 1.01788116, "balance_loss_mlp": 1.01710129, "epoch": 0.8382985119494964, "flos": 26103336336000.0, "grad_norm": 1.482033853710732, "language_loss": 0.80640155, "learning_rate": 2.679816484834554e-07, "loss": 0.82740331, "num_input_tokens_seen": 300734860, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.38867188, "step": 13943, "time_per_iteration": 2.4358839988708496 }, { "auxiliary_loss_clip": 0.01050074, "auxiliary_loss_mlp": 0.0103519, "balance_loss_clip": 1.0132432, "balance_loss_mlp": 1.01574028, "epoch": 0.8383586352021645, "flos": 16434210453120.0, "grad_norm": 1.9023293530257637, "language_loss": 0.86309409, "learning_rate": 2.6778693953661766e-07, "loss": 0.88394672, "num_input_tokens_seen": 300752735, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34375, "step": 13944, "time_per_iteration": 2.346911668777466 }, { "auxiliary_loss_clip": 0.01007305, "auxiliary_loss_mlp": 0.01002841, "balance_loss_clip": 1.0006237, "balance_loss_mlp": 1.00067306, "epoch": 0.8384187584548324, "flos": 64192249952640.0, "grad_norm": 0.6506471623353044, "language_loss": 0.50248998, "learning_rate": 2.6759229627547263e-07, "loss": 0.52259147, "num_input_tokens_seen": 300820760, "router_z_loss_clip": 0.0222168, "router_z_loss_mlp": 0.06640625, "step": 13945, "time_per_iteration": 4.516470909118652 }, { "auxiliary_loss_clip": 0.01050295, "auxiliary_loss_mlp": 0.01035816, "balance_loss_clip": 1.01425123, "balance_loss_mlp": 1.01556253, "epoch": 0.8384788817075004, "flos": 22381804471680.0, "grad_norm": 3.491207694801384, "language_loss": 0.65808415, "learning_rate": 2.673977187074017e-07, "loss": 0.67894524, "num_input_tokens_seen": 300840025, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34765625, "step": 13946, "time_per_iteration": 2.367666006088257 }, { "auxiliary_loss_clip": 0.01052298, "auxiliary_loss_mlp": 0.01038127, "balance_loss_clip": 1.01380789, "balance_loss_mlp": 1.01575506, "epoch": 0.8385390049601683, "flos": 29495567456640.0, "grad_norm": 1.5422470779827908, "language_loss": 0.6819396, "learning_rate": 2.672032068397829e-07, "loss": 0.7028439, "num_input_tokens_seen": 300860380, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36523438, "step": 13947, "time_per_iteration": 2.4149975776672363 }, { "auxiliary_loss_clip": 0.01053567, "auxiliary_loss_mlp": 0.01038705, "balance_loss_clip": 1.01476812, "balance_loss_mlp": 1.01676881, "epoch": 0.8385991282128363, "flos": 32706424730880.0, "grad_norm": 1.6517484214611318, "language_loss": 0.71017009, "learning_rate": 2.6700876067999176e-07, "loss": 0.73109281, "num_input_tokens_seen": 300881895, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36914062, "step": 13948, "time_per_iteration": 2.4547441005706787 }, { "auxiliary_loss_clip": 0.01049324, "auxiliary_loss_mlp": 0.010325, "balance_loss_clip": 1.01216257, "balance_loss_mlp": 1.01540065, "epoch": 0.8386592514655042, "flos": 25440021815040.0, "grad_norm": 2.476518356447475, "language_loss": 0.85993695, "learning_rate": 2.6681438023540194e-07, "loss": 0.88075513, "num_input_tokens_seen": 300901575, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.33789062, "step": 13949, "time_per_iteration": 2.526679277420044 }, { "auxiliary_loss_clip": 0.01050791, "auxiliary_loss_mlp": 0.01034425, "balance_loss_clip": 1.01235962, "balance_loss_mlp": 1.01647615, "epoch": 0.8387193747181723, "flos": 22014867415680.0, "grad_norm": 1.8522828109691583, "language_loss": 0.71584839, "learning_rate": 2.66620065513385e-07, "loss": 0.73670053, "num_input_tokens_seen": 300919735, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34375, "step": 13950, "time_per_iteration": 2.3749654293060303 }, { "auxiliary_loss_clip": 0.01052396, "auxiliary_loss_mlp": 0.01037016, "balance_loss_clip": 1.01365066, "balance_loss_mlp": 1.01644874, "epoch": 0.8387794979708402, "flos": 18149248903680.0, "grad_norm": 2.397443794993565, "language_loss": 0.65751255, "learning_rate": 2.6642581652130913e-07, "loss": 0.67840666, "num_input_tokens_seen": 300939150, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 13951, "time_per_iteration": 2.3428561687469482 }, { "auxiliary_loss_clip": 0.01051282, "auxiliary_loss_mlp": 0.01034459, "balance_loss_clip": 1.01205921, "balance_loss_mlp": 1.01666784, "epoch": 0.8388396212235082, "flos": 25410031090560.0, "grad_norm": 1.4372239803501226, "language_loss": 0.71239537, "learning_rate": 2.662316332665393e-07, "loss": 0.73325276, "num_input_tokens_seen": 300959730, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34765625, "step": 13952, "time_per_iteration": 2.4094398021698 }, { "auxiliary_loss_clip": 0.01050466, "auxiliary_loss_mlp": 0.01031719, "balance_loss_clip": 1.01051128, "balance_loss_mlp": 1.01586211, "epoch": 0.8388997444761762, "flos": 22271967734400.0, "grad_norm": 1.8814199420152866, "language_loss": 0.73660219, "learning_rate": 2.6603751575643987e-07, "loss": 0.757424, "num_input_tokens_seen": 300976120, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34570312, "step": 13953, "time_per_iteration": 2.3582210540771484 }, { "auxiliary_loss_clip": 0.01049176, "auxiliary_loss_mlp": 0.01034578, "balance_loss_clip": 1.01343036, "balance_loss_mlp": 1.01539075, "epoch": 0.8389598677288441, "flos": 19572203986560.0, "grad_norm": 1.8235012722915565, "language_loss": 0.69339204, "learning_rate": 2.6584346399837176e-07, "loss": 0.71422958, "num_input_tokens_seen": 300995080, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.33789062, "step": 13954, "time_per_iteration": 2.344651937484741 }, { "auxiliary_loss_clip": 0.01052522, "auxiliary_loss_mlp": 0.01037621, "balance_loss_clip": 1.01649737, "balance_loss_mlp": 1.01758313, "epoch": 0.8390199909815121, "flos": 17383743233280.0, "grad_norm": 1.8825576347124364, "language_loss": 0.73999518, "learning_rate": 2.656494779996932e-07, "loss": 0.76089656, "num_input_tokens_seen": 301012920, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34960938, "step": 13955, "time_per_iteration": 2.3476345539093018 }, { "auxiliary_loss_clip": 0.01051564, "auxiliary_loss_mlp": 0.01033199, "balance_loss_clip": 1.01028657, "balance_loss_mlp": 1.01589549, "epoch": 0.83908011423418, "flos": 24638625400320.0, "grad_norm": 2.2772161076402666, "language_loss": 0.67923588, "learning_rate": 2.6545555776775995e-07, "loss": 0.70008355, "num_input_tokens_seen": 301028875, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35546875, "step": 13956, "time_per_iteration": 2.3679006099700928 }, { "auxiliary_loss_clip": 0.01052079, "auxiliary_loss_mlp": 0.01036584, "balance_loss_clip": 1.01227736, "balance_loss_mlp": 1.0154798, "epoch": 0.8391402374868481, "flos": 24717179692800.0, "grad_norm": 1.8896353761738685, "language_loss": 0.81010771, "learning_rate": 2.6526170330992667e-07, "loss": 0.83099425, "num_input_tokens_seen": 301050115, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3671875, "step": 13957, "time_per_iteration": 2.427332639694214 }, { "auxiliary_loss_clip": 0.01007397, "auxiliary_loss_mlp": 0.0100207, "balance_loss_clip": 0.99980515, "balance_loss_mlp": 1.00079751, "epoch": 0.839200360739516, "flos": 56868344294400.0, "grad_norm": 0.7558085797372138, "language_loss": 0.53481519, "learning_rate": 2.6506791463354283e-07, "loss": 0.55490983, "num_input_tokens_seen": 301114155, "router_z_loss_clip": 0.02270508, "router_z_loss_mlp": 0.06591797, "step": 13958, "time_per_iteration": 4.4519431591033936 }, { "auxiliary_loss_clip": 0.01051912, "auxiliary_loss_mlp": 0.01036072, "balance_loss_clip": 1.01182485, "balance_loss_mlp": 1.01596808, "epoch": 0.839260483992184, "flos": 18331809736320.0, "grad_norm": 1.8842618826754418, "language_loss": 0.74766445, "learning_rate": 2.648741917459574e-07, "loss": 0.7685442, "num_input_tokens_seen": 301133150, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.359375, "step": 13959, "time_per_iteration": 2.3306961059570312 }, { "auxiliary_loss_clip": 0.01048855, "auxiliary_loss_mlp": 0.01036607, "balance_loss_clip": 1.01468432, "balance_loss_mlp": 1.0150404, "epoch": 0.8393206072448519, "flos": 27086735001600.0, "grad_norm": 2.240977232916885, "language_loss": 0.56797683, "learning_rate": 2.646805346545169e-07, "loss": 0.58883148, "num_input_tokens_seen": 301153600, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.33984375, "step": 13960, "time_per_iteration": 2.401132345199585 }, { "auxiliary_loss_clip": 0.01007225, "auxiliary_loss_mlp": 0.01004853, "balance_loss_clip": 1.00250459, "balance_loss_mlp": 1.00059056, "epoch": 0.8393807304975199, "flos": 61518287566080.0, "grad_norm": 0.7671256473689121, "language_loss": 0.60729432, "learning_rate": 2.6448694336656397e-07, "loss": 0.62741512, "num_input_tokens_seen": 301214335, "router_z_loss_clip": 0.0234375, "router_z_loss_mlp": 0.06640625, "step": 13961, "time_per_iteration": 3.080437660217285 }, { "auxiliary_loss_clip": 0.01049715, "auxiliary_loss_mlp": 0.01033636, "balance_loss_clip": 1.01287007, "balance_loss_mlp": 1.01479673, "epoch": 0.8394408537501878, "flos": 14894191981440.0, "grad_norm": 2.1806492419214982, "language_loss": 0.68956649, "learning_rate": 2.642934178894405e-07, "loss": 0.71040004, "num_input_tokens_seen": 301228960, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.34960938, "step": 13962, "time_per_iteration": 2.315078020095825 }, { "auxiliary_loss_clip": 0.01052607, "auxiliary_loss_mlp": 0.01038684, "balance_loss_clip": 1.01534271, "balance_loss_mlp": 1.01585555, "epoch": 0.8395009770028559, "flos": 17411464719360.0, "grad_norm": 2.0349300100565864, "language_loss": 0.74746966, "learning_rate": 2.640999582304841e-07, "loss": 0.76838255, "num_input_tokens_seen": 301245875, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.3671875, "step": 13963, "time_per_iteration": 2.321296215057373 }, { "auxiliary_loss_clip": 0.01051002, "auxiliary_loss_mlp": 0.01036308, "balance_loss_clip": 1.01272798, "balance_loss_mlp": 1.01575625, "epoch": 0.8395611002555238, "flos": 27923603224320.0, "grad_norm": 1.6057179317828205, "language_loss": 0.76988971, "learning_rate": 2.6390656439703173e-07, "loss": 0.79076278, "num_input_tokens_seen": 301265550, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3515625, "step": 13964, "time_per_iteration": 2.4147701263427734 }, { "auxiliary_loss_clip": 0.01054303, "auxiliary_loss_mlp": 0.01038166, "balance_loss_clip": 1.01298892, "balance_loss_mlp": 1.01685345, "epoch": 0.8396212235081918, "flos": 11100354958080.0, "grad_norm": 2.152414332348708, "language_loss": 0.79776901, "learning_rate": 2.637132363964161e-07, "loss": 0.8186937, "num_input_tokens_seen": 301282035, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.375, "step": 13965, "time_per_iteration": 2.3288767337799072 }, { "auxiliary_loss_clip": 0.01049649, "auxiliary_loss_mlp": 0.01034034, "balance_loss_clip": 1.01271963, "balance_loss_mlp": 1.01537395, "epoch": 0.8396813467608598, "flos": 35734197502080.0, "grad_norm": 3.7693335838664157, "language_loss": 0.67569804, "learning_rate": 2.635199742359684e-07, "loss": 0.69653487, "num_input_tokens_seen": 301305210, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34179688, "step": 13966, "time_per_iteration": 2.511916399002075 }, { "auxiliary_loss_clip": 0.01051077, "auxiliary_loss_mlp": 0.01038773, "balance_loss_clip": 1.01602745, "balance_loss_mlp": 1.01564622, "epoch": 0.8397414700135277, "flos": 26175536760960.0, "grad_norm": 1.607886525975985, "language_loss": 0.76046568, "learning_rate": 2.633267779230177e-07, "loss": 0.7813642, "num_input_tokens_seen": 301324885, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35351562, "step": 13967, "time_per_iteration": 2.3921566009521484 }, { "auxiliary_loss_clip": 0.01051129, "auxiliary_loss_mlp": 0.01035406, "balance_loss_clip": 1.0126133, "balance_loss_mlp": 1.0162226, "epoch": 0.8398015932661957, "flos": 18332123938560.0, "grad_norm": 2.594141152868473, "language_loss": 0.84317046, "learning_rate": 2.6313364746488974e-07, "loss": 0.86403579, "num_input_tokens_seen": 301343070, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.34960938, "step": 13968, "time_per_iteration": 2.3446543216705322 }, { "auxiliary_loss_clip": 0.01053555, "auxiliary_loss_mlp": 0.01030879, "balance_loss_clip": 1.00933754, "balance_loss_mlp": 1.01778984, "epoch": 0.8398617165188637, "flos": 17378681086080.0, "grad_norm": 2.1356129542614943, "language_loss": 0.78725195, "learning_rate": 2.629405828689075e-07, "loss": 0.80809629, "num_input_tokens_seen": 301359280, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.35742188, "step": 13969, "time_per_iteration": 2.331275224685669 }, { "auxiliary_loss_clip": 0.01052724, "auxiliary_loss_mlp": 0.0104112, "balance_loss_clip": 1.01589561, "balance_loss_mlp": 1.01559711, "epoch": 0.8399218397715317, "flos": 22928579274240.0, "grad_norm": 1.9807432893774355, "language_loss": 0.78099203, "learning_rate": 2.627475841423923e-07, "loss": 0.80193049, "num_input_tokens_seen": 301376465, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37109375, "step": 13970, "time_per_iteration": 2.3704679012298584 }, { "auxiliary_loss_clip": 0.01052162, "auxiliary_loss_mlp": 0.01040974, "balance_loss_clip": 1.01795506, "balance_loss_mlp": 1.01609945, "epoch": 0.8399819630241996, "flos": 23148427305600.0, "grad_norm": 2.4319994938023246, "language_loss": 0.73200881, "learning_rate": 2.625546512926633e-07, "loss": 0.75294018, "num_input_tokens_seen": 301396000, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.359375, "step": 13971, "time_per_iteration": 2.3713085651397705 }, { "auxiliary_loss_clip": 0.01050794, "auxiliary_loss_mlp": 0.01035704, "balance_loss_clip": 1.01101613, "balance_loss_mlp": 1.01490045, "epoch": 0.8400420862768676, "flos": 16396539229440.0, "grad_norm": 1.8808912792576304, "language_loss": 0.78765976, "learning_rate": 2.623617843270358e-07, "loss": 0.80852473, "num_input_tokens_seen": 301413160, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.359375, "step": 13972, "time_per_iteration": 2.326504707336426 }, { "auxiliary_loss_clip": 0.01049432, "auxiliary_loss_mlp": 0.01034797, "balance_loss_clip": 1.01358938, "balance_loss_mlp": 1.01558626, "epoch": 0.8401022095295355, "flos": 21286439475840.0, "grad_norm": 1.345971531179824, "language_loss": 0.69233406, "learning_rate": 2.6216898325282333e-07, "loss": 0.71317631, "num_input_tokens_seen": 301433325, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.33789062, "step": 13973, "time_per_iteration": 2.3693132400512695 }, { "auxiliary_loss_clip": 0.01051313, "auxiliary_loss_mlp": 0.01031969, "balance_loss_clip": 1.01008201, "balance_loss_mlp": 1.01591873, "epoch": 0.8401623327822035, "flos": 17310355822080.0, "grad_norm": 1.823149804393061, "language_loss": 0.79006004, "learning_rate": 2.619762480773382e-07, "loss": 0.81089288, "num_input_tokens_seen": 301450265, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.35351562, "step": 13974, "time_per_iteration": 2.326477527618408 }, { "auxiliary_loss_clip": 0.01051888, "auxiliary_loss_mlp": 0.01040955, "balance_loss_clip": 1.01789951, "balance_loss_mlp": 1.01640081, "epoch": 0.8402224560348714, "flos": 22235588231040.0, "grad_norm": 1.805281144647976, "language_loss": 0.74015844, "learning_rate": 2.617835788078868e-07, "loss": 0.76108694, "num_input_tokens_seen": 301470760, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35546875, "step": 13975, "time_per_iteration": 2.384267568588257 }, { "auxiliary_loss_clip": 0.01050976, "auxiliary_loss_mlp": 0.01034984, "balance_loss_clip": 1.01265657, "balance_loss_mlp": 1.01558626, "epoch": 0.8402825792875395, "flos": 20228920260480.0, "grad_norm": 1.6725082261729158, "language_loss": 0.73107451, "learning_rate": 2.6159097545177645e-07, "loss": 0.75193405, "num_input_tokens_seen": 301489425, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35546875, "step": 13976, "time_per_iteration": 3.619525194168091 }, { "auxiliary_loss_clip": 0.01051005, "auxiliary_loss_mlp": 0.01035239, "balance_loss_clip": 1.01300633, "balance_loss_mlp": 1.01600814, "epoch": 0.8403427025402074, "flos": 23288987905920.0, "grad_norm": 1.7105858514969159, "language_loss": 0.73449337, "learning_rate": 2.61398438016311e-07, "loss": 0.75535583, "num_input_tokens_seen": 301508885, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34960938, "step": 13977, "time_per_iteration": 2.399258852005005 }, { "auxiliary_loss_clip": 0.01050422, "auxiliary_loss_mlp": 0.01039928, "balance_loss_clip": 1.01634812, "balance_loss_mlp": 1.01501095, "epoch": 0.8404028257928754, "flos": 32674094945280.0, "grad_norm": 1.4571553691463306, "language_loss": 0.69647002, "learning_rate": 2.6120596650879043e-07, "loss": 0.71737349, "num_input_tokens_seen": 301533780, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.35351562, "step": 13978, "time_per_iteration": 2.525038003921509 }, { "auxiliary_loss_clip": 0.01049508, "auxiliary_loss_mlp": 0.01033373, "balance_loss_clip": 1.01280904, "balance_loss_mlp": 1.0165478, "epoch": 0.8404629490455434, "flos": 16179588840960.0, "grad_norm": 2.4596481241122596, "language_loss": 0.78435111, "learning_rate": 2.610135609365145e-07, "loss": 0.80517989, "num_input_tokens_seen": 301551775, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.328125, "step": 13979, "time_per_iteration": 2.325166702270508 }, { "auxiliary_loss_clip": 0.01049565, "auxiliary_loss_mlp": 0.0103291, "balance_loss_clip": 1.01080906, "balance_loss_mlp": 1.01543832, "epoch": 0.8405230722982113, "flos": 15193571823360.0, "grad_norm": 1.9636038436655818, "language_loss": 0.79833853, "learning_rate": 2.60821221306778e-07, "loss": 0.81916332, "num_input_tokens_seen": 301570495, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34179688, "step": 13980, "time_per_iteration": 2.330409526824951 }, { "auxiliary_loss_clip": 0.01051569, "auxiliary_loss_mlp": 0.01035815, "balance_loss_clip": 1.01496506, "balance_loss_mlp": 1.0166409, "epoch": 0.8405831955508793, "flos": 27811357603200.0, "grad_norm": 1.534735359624246, "language_loss": 0.87362099, "learning_rate": 2.606289476268757e-07, "loss": 0.89449489, "num_input_tokens_seen": 301591705, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.34960938, "step": 13981, "time_per_iteration": 2.406942844390869 }, { "auxiliary_loss_clip": 0.01051235, "auxiliary_loss_mlp": 0.01035291, "balance_loss_clip": 1.01346421, "balance_loss_mlp": 1.01629519, "epoch": 0.8406433188035473, "flos": 23768310228480.0, "grad_norm": 2.3849868163410153, "language_loss": 0.69083053, "learning_rate": 2.6043673990409745e-07, "loss": 0.71169579, "num_input_tokens_seen": 301611670, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.34960938, "step": 13982, "time_per_iteration": 2.3748786449432373 }, { "auxiliary_loss_clip": 0.01050893, "auxiliary_loss_mlp": 0.01039551, "balance_loss_clip": 1.01572061, "balance_loss_mlp": 1.01568687, "epoch": 0.8407034420562153, "flos": 29204391784320.0, "grad_norm": 1.572498779580134, "language_loss": 0.69188666, "learning_rate": 2.602445981457324e-07, "loss": 0.71279109, "num_input_tokens_seen": 301632540, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3515625, "step": 13983, "time_per_iteration": 3.7749762535095215 }, { "auxiliary_loss_clip": 0.0105103, "auxiliary_loss_mlp": 0.01035679, "balance_loss_clip": 1.01175332, "balance_loss_mlp": 1.01506782, "epoch": 0.8407635653088832, "flos": 26358865643520.0, "grad_norm": 1.8294312722190453, "language_loss": 0.80307949, "learning_rate": 2.6005252235906684e-07, "loss": 0.8239466, "num_input_tokens_seen": 301651480, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.359375, "step": 13984, "time_per_iteration": 2.397649049758911 }, { "auxiliary_loss_clip": 0.01049913, "auxiliary_loss_mlp": 0.01033901, "balance_loss_clip": 1.01188278, "balance_loss_mlp": 1.01452982, "epoch": 0.8408236885615512, "flos": 21467778410880.0, "grad_norm": 1.9313389362117013, "language_loss": 0.61486447, "learning_rate": 2.598605125513842e-07, "loss": 0.63570261, "num_input_tokens_seen": 301670010, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35351562, "step": 13985, "time_per_iteration": 3.6453235149383545 }, { "auxiliary_loss_clip": 0.01053742, "auxiliary_loss_mlp": 0.01036855, "balance_loss_clip": 1.01331139, "balance_loss_mlp": 1.01682591, "epoch": 0.8408838118142191, "flos": 22962689539200.0, "grad_norm": 1.6924853364224823, "language_loss": 0.83136284, "learning_rate": 2.5966856872996467e-07, "loss": 0.85226882, "num_input_tokens_seen": 301689785, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36914062, "step": 13986, "time_per_iteration": 2.378918409347534 }, { "auxiliary_loss_clip": 0.01051533, "auxiliary_loss_mlp": 0.0103229, "balance_loss_clip": 1.01015306, "balance_loss_mlp": 1.01675296, "epoch": 0.8409439350668871, "flos": 26798736263040.0, "grad_norm": 1.4631967865728899, "language_loss": 0.66784656, "learning_rate": 2.5947669090208755e-07, "loss": 0.68868482, "num_input_tokens_seen": 301712225, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34765625, "step": 13987, "time_per_iteration": 2.4241740703582764 }, { "auxiliary_loss_clip": 0.0105131, "auxiliary_loss_mlp": 0.01036969, "balance_loss_clip": 1.01483214, "balance_loss_mlp": 1.01559663, "epoch": 0.841004058319555, "flos": 26577456865920.0, "grad_norm": 3.1035315059978816, "language_loss": 0.68497467, "learning_rate": 2.5928487907502906e-07, "loss": 0.70585746, "num_input_tokens_seen": 301730955, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.35546875, "step": 13988, "time_per_iteration": 2.402193546295166 }, { "auxiliary_loss_clip": 0.01054166, "auxiliary_loss_mlp": 0.01048398, "balance_loss_clip": 1.02202868, "balance_loss_mlp": 1.01680803, "epoch": 0.8410641815722231, "flos": 14500999716480.0, "grad_norm": 2.8102135920438465, "language_loss": 0.81864411, "learning_rate": 2.590931332560622e-07, "loss": 0.8396697, "num_input_tokens_seen": 301746930, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.37304688, "step": 13989, "time_per_iteration": 2.3401761054992676 }, { "auxiliary_loss_clip": 0.01053146, "auxiliary_loss_mlp": 0.01033549, "balance_loss_clip": 1.00990963, "balance_loss_mlp": 1.01626968, "epoch": 0.841124304824891, "flos": 29165463751680.0, "grad_norm": 1.7442394364719818, "language_loss": 0.76197797, "learning_rate": 2.5890145345245826e-07, "loss": 0.7828449, "num_input_tokens_seen": 301766945, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36914062, "step": 13990, "time_per_iteration": 2.4131994247436523 }, { "auxiliary_loss_clip": 0.01047002, "auxiliary_loss_mlp": 0.01038746, "balance_loss_clip": 1.01787281, "balance_loss_mlp": 1.01413083, "epoch": 0.841184428077559, "flos": 22411131880320.0, "grad_norm": 1.6136812814108434, "language_loss": 0.81506282, "learning_rate": 2.5870983967148597e-07, "loss": 0.83592027, "num_input_tokens_seen": 301785460, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.328125, "step": 13991, "time_per_iteration": 2.4583330154418945 }, { "auxiliary_loss_clip": 0.01051099, "auxiliary_loss_mlp": 0.01038056, "balance_loss_clip": 1.01528716, "balance_loss_mlp": 1.01581264, "epoch": 0.841244551330227, "flos": 22961781843840.0, "grad_norm": 2.507398066768113, "language_loss": 0.71970248, "learning_rate": 2.585182919204105e-07, "loss": 0.74059403, "num_input_tokens_seen": 301804180, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35351562, "step": 13992, "time_per_iteration": 2.3855385780334473 }, { "auxiliary_loss_clip": 0.01052422, "auxiliary_loss_mlp": 0.01033037, "balance_loss_clip": 1.01147151, "balance_loss_mlp": 1.01629472, "epoch": 0.8413046745828949, "flos": 21031678218240.0, "grad_norm": 1.6474027487389236, "language_loss": 0.77998984, "learning_rate": 2.583268102064959e-07, "loss": 0.80084437, "num_input_tokens_seen": 301823670, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.36132812, "step": 13993, "time_per_iteration": 2.3763575553894043 }, { "auxiliary_loss_clip": 0.01054165, "auxiliary_loss_mlp": 0.01043715, "balance_loss_clip": 1.01795411, "balance_loss_mlp": 1.01608825, "epoch": 0.841364797835563, "flos": 27050879168640.0, "grad_norm": 2.104335495844484, "language_loss": 0.74802995, "learning_rate": 2.5813539453700393e-07, "loss": 0.76900876, "num_input_tokens_seen": 301845890, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38085938, "step": 13994, "time_per_iteration": 2.4393832683563232 }, { "auxiliary_loss_clip": 0.0104999, "auxiliary_loss_mlp": 0.0103786, "balance_loss_clip": 1.01509154, "balance_loss_mlp": 1.01552773, "epoch": 0.8414249210882309, "flos": 17894697114240.0, "grad_norm": 1.6449099273634091, "language_loss": 0.60768735, "learning_rate": 2.5794404491919163e-07, "loss": 0.62856579, "num_input_tokens_seen": 301863985, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.34375, "step": 13995, "time_per_iteration": 2.349722146987915 }, { "auxiliary_loss_clip": 0.01050151, "auxiliary_loss_mlp": 0.01036778, "balance_loss_clip": 1.01416421, "balance_loss_mlp": 1.01578021, "epoch": 0.8414850443408989, "flos": 25440196371840.0, "grad_norm": 1.7817827831168362, "language_loss": 0.72804511, "learning_rate": 2.577527613603163e-07, "loss": 0.74891436, "num_input_tokens_seen": 301882765, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34375, "step": 13996, "time_per_iteration": 2.4049980640411377 }, { "auxiliary_loss_clip": 0.0105152, "auxiliary_loss_mlp": 0.01038968, "balance_loss_clip": 1.01573443, "balance_loss_mlp": 1.01564181, "epoch": 0.8415451675935668, "flos": 23218986896640.0, "grad_norm": 1.9951342006771153, "language_loss": 0.65454477, "learning_rate": 2.5756154386763017e-07, "loss": 0.67544967, "num_input_tokens_seen": 301902720, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.359375, "step": 13997, "time_per_iteration": 2.394443988800049 }, { "auxiliary_loss_clip": 0.01053917, "auxiliary_loss_mlp": 0.01046475, "balance_loss_clip": 1.02170336, "balance_loss_mlp": 1.0164175, "epoch": 0.8416052908462348, "flos": 18545653013760.0, "grad_norm": 1.9918549352968093, "language_loss": 0.82935274, "learning_rate": 2.5737039244838565e-07, "loss": 0.85035658, "num_input_tokens_seen": 301921245, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.375, "step": 13998, "time_per_iteration": 3.8182199001312256 }, { "auxiliary_loss_clip": 0.01052328, "auxiliary_loss_mlp": 0.01039984, "balance_loss_clip": 1.01550984, "balance_loss_mlp": 1.0158031, "epoch": 0.8416654140989027, "flos": 26103964740480.0, "grad_norm": 1.5062917570038723, "language_loss": 0.81282365, "learning_rate": 2.5717930710982984e-07, "loss": 0.83374679, "num_input_tokens_seen": 301942320, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36523438, "step": 13999, "time_per_iteration": 2.4215519428253174 }, { "auxiliary_loss_clip": 0.01053461, "auxiliary_loss_mlp": 0.01039488, "balance_loss_clip": 1.01397705, "balance_loss_mlp": 1.01592994, "epoch": 0.8417255373515707, "flos": 26432043586560.0, "grad_norm": 2.3347608760530476, "language_loss": 0.68133831, "learning_rate": 2.569882878592096e-07, "loss": 0.70226777, "num_input_tokens_seen": 301963110, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.375, "step": 14000, "time_per_iteration": 2.4092214107513428 }, { "auxiliary_loss_clip": 0.01054648, "auxiliary_loss_mlp": 0.01035148, "balance_loss_clip": 1.01144874, "balance_loss_mlp": 1.01704526, "epoch": 0.8417856606042387, "flos": 24716586199680.0, "grad_norm": 1.4101090972599177, "language_loss": 0.80305713, "learning_rate": 2.5679733470376885e-07, "loss": 0.82395512, "num_input_tokens_seen": 301984915, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.375, "step": 14001, "time_per_iteration": 2.411808490753174 }, { "auxiliary_loss_clip": 0.01051964, "auxiliary_loss_mlp": 0.01035821, "balance_loss_clip": 1.01344585, "balance_loss_mlp": 1.01588631, "epoch": 0.8418457838569067, "flos": 20849780701440.0, "grad_norm": 1.9134872990295089, "language_loss": 0.79620141, "learning_rate": 2.5660644765074703e-07, "loss": 0.81707925, "num_input_tokens_seen": 302004095, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.36132812, "step": 14002, "time_per_iteration": 2.3698573112487793 }, { "auxiliary_loss_clip": 0.01051147, "auxiliary_loss_mlp": 0.0103863, "balance_loss_clip": 1.01534832, "balance_loss_mlp": 1.01570463, "epoch": 0.8419059071095746, "flos": 28659292727040.0, "grad_norm": 1.4352666282898348, "language_loss": 0.78919876, "learning_rate": 2.5641562670738334e-07, "loss": 0.8100965, "num_input_tokens_seen": 302027250, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35351562, "step": 14003, "time_per_iteration": 2.434781789779663 }, { "auxiliary_loss_clip": 0.01051244, "auxiliary_loss_mlp": 0.01036507, "balance_loss_clip": 1.01391733, "balance_loss_mlp": 1.01618433, "epoch": 0.8419660303622426, "flos": 21652503747840.0, "grad_norm": 1.5299627182967663, "language_loss": 0.66478348, "learning_rate": 2.5622487188091436e-07, "loss": 0.68566102, "num_input_tokens_seen": 302046950, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 14004, "time_per_iteration": 2.3747096061706543 }, { "auxiliary_loss_clip": 0.01053005, "auxiliary_loss_mlp": 0.0104368, "balance_loss_clip": 1.01919484, "balance_loss_mlp": 1.01623344, "epoch": 0.8420261536149106, "flos": 25299949973760.0, "grad_norm": 1.9865701419292883, "language_loss": 0.77247357, "learning_rate": 2.560341831785724e-07, "loss": 0.79344046, "num_input_tokens_seen": 302065470, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.3671875, "step": 14005, "time_per_iteration": 2.3871724605560303 }, { "auxiliary_loss_clip": 0.0105211, "auxiliary_loss_mlp": 0.01037344, "balance_loss_clip": 1.01204753, "balance_loss_mlp": 1.01539683, "epoch": 0.8420862768675785, "flos": 18762603402240.0, "grad_norm": 4.059525841825488, "language_loss": 0.79214799, "learning_rate": 2.5584356060758906e-07, "loss": 0.81304252, "num_input_tokens_seen": 302083190, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3671875, "step": 14006, "time_per_iteration": 2.437835693359375 }, { "auxiliary_loss_clip": 0.01051606, "auxiliary_loss_mlp": 0.01037244, "balance_loss_clip": 1.01424885, "balance_loss_mlp": 1.01606596, "epoch": 0.8421464001202466, "flos": 18327201436800.0, "grad_norm": 1.714009812045215, "language_loss": 0.77780455, "learning_rate": 2.556530041751932e-07, "loss": 0.79869306, "num_input_tokens_seen": 302098820, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35546875, "step": 14007, "time_per_iteration": 2.3309378623962402 }, { "auxiliary_loss_clip": 0.01052246, "auxiliary_loss_mlp": 0.01038053, "balance_loss_clip": 1.01397288, "balance_loss_mlp": 1.01589251, "epoch": 0.8422065233729145, "flos": 31535926755840.0, "grad_norm": 2.0856938504438665, "language_loss": 0.6632266, "learning_rate": 2.554625138886102e-07, "loss": 0.68412954, "num_input_tokens_seen": 302117075, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36328125, "step": 14008, "time_per_iteration": 2.4392004013061523 }, { "auxiliary_loss_clip": 0.01007489, "auxiliary_loss_mlp": 0.01001778, "balance_loss_clip": 0.99969143, "balance_loss_mlp": 1.00076818, "epoch": 0.8422666466255825, "flos": 64295034595200.0, "grad_norm": 0.7159798722931919, "language_loss": 0.57031476, "learning_rate": 2.552720897550631e-07, "loss": 0.59040749, "num_input_tokens_seen": 302179735, "router_z_loss_clip": 0.02087402, "router_z_loss_mlp": 0.06738281, "step": 14009, "time_per_iteration": 3.0698609352111816 }, { "auxiliary_loss_clip": 0.01049571, "auxiliary_loss_mlp": 0.01031827, "balance_loss_clip": 1.01128721, "balance_loss_mlp": 1.01533425, "epoch": 0.8423267698782504, "flos": 24315573790080.0, "grad_norm": 1.2929447897587416, "language_loss": 0.78449094, "learning_rate": 2.5508173178177304e-07, "loss": 0.80530488, "num_input_tokens_seen": 302202055, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.34179688, "step": 14010, "time_per_iteration": 2.4197700023651123 }, { "auxiliary_loss_clip": 0.0105426, "auxiliary_loss_mlp": 0.01039696, "balance_loss_clip": 1.01413727, "balance_loss_mlp": 1.01730275, "epoch": 0.8423868931309184, "flos": 18295116030720.0, "grad_norm": 2.044519748142335, "language_loss": 0.73314679, "learning_rate": 2.548914399759592e-07, "loss": 0.75408638, "num_input_tokens_seen": 302221360, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.36914062, "step": 14011, "time_per_iteration": 2.347945213317871 }, { "auxiliary_loss_clip": 0.01050918, "auxiliary_loss_mlp": 0.01039157, "balance_loss_clip": 1.01618576, "balance_loss_mlp": 1.01485538, "epoch": 0.8424470163835863, "flos": 23549090601600.0, "grad_norm": 1.7641990752119325, "language_loss": 0.85652083, "learning_rate": 2.5470121434483636e-07, "loss": 0.87742162, "num_input_tokens_seen": 302240715, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.359375, "step": 14012, "time_per_iteration": 2.383077383041382 }, { "auxiliary_loss_clip": 0.01046507, "auxiliary_loss_mlp": 0.01033036, "balance_loss_clip": 1.01437962, "balance_loss_mlp": 1.01442051, "epoch": 0.8425071396362543, "flos": 23768345139840.0, "grad_norm": 1.6119150805098506, "language_loss": 0.68904543, "learning_rate": 2.5451105489561884e-07, "loss": 0.70984089, "num_input_tokens_seen": 302260950, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.3203125, "step": 14013, "time_per_iteration": 2.3757898807525635 }, { "auxiliary_loss_clip": 0.01055385, "auxiliary_loss_mlp": 0.01038854, "balance_loss_clip": 1.01459551, "balance_loss_mlp": 1.0167619, "epoch": 0.8425672628889223, "flos": 16178017829760.0, "grad_norm": 2.390081421197911, "language_loss": 0.79793096, "learning_rate": 2.5432096163551644e-07, "loss": 0.81887341, "num_input_tokens_seen": 302277500, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.38671875, "step": 14014, "time_per_iteration": 2.3513364791870117 }, { "auxiliary_loss_clip": 0.0104946, "auxiliary_loss_mlp": 0.0104151, "balance_loss_clip": 1.01958728, "balance_loss_mlp": 1.01438344, "epoch": 0.8426273861415903, "flos": 23148008369280.0, "grad_norm": 1.696297297601098, "language_loss": 0.68580532, "learning_rate": 2.5413093457173884e-07, "loss": 0.70671505, "num_input_tokens_seen": 302297930, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.3515625, "step": 14015, "time_per_iteration": 3.6770594120025635 }, { "auxiliary_loss_clip": 0.01050611, "auxiliary_loss_mlp": 0.01036832, "balance_loss_clip": 1.01369333, "balance_loss_mlp": 1.01578307, "epoch": 0.8426875093942582, "flos": 17456781530880.0, "grad_norm": 2.167017928447435, "language_loss": 0.77674979, "learning_rate": 2.5394097371149036e-07, "loss": 0.79762423, "num_input_tokens_seen": 302315735, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.34765625, "step": 14016, "time_per_iteration": 2.365633249282837 }, { "auxiliary_loss_clip": 0.01052472, "auxiliary_loss_mlp": 0.01041577, "balance_loss_clip": 1.01903415, "balance_loss_mlp": 1.01703167, "epoch": 0.8427476326469262, "flos": 19639691377920.0, "grad_norm": 1.7993442821109884, "language_loss": 0.80175537, "learning_rate": 2.5375107906197544e-07, "loss": 0.82269585, "num_input_tokens_seen": 302332790, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35546875, "step": 14017, "time_per_iteration": 2.353987455368042 }, { "auxiliary_loss_clip": 0.01051085, "auxiliary_loss_mlp": 0.01039254, "balance_loss_clip": 1.01571047, "balance_loss_mlp": 1.01608622, "epoch": 0.8428077558995941, "flos": 11940539760000.0, "grad_norm": 2.645239349595698, "language_loss": 0.64054632, "learning_rate": 2.5356125063039525e-07, "loss": 0.66144967, "num_input_tokens_seen": 302346490, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.3515625, "step": 14018, "time_per_iteration": 2.4242959022521973 }, { "auxiliary_loss_clip": 0.01051363, "auxiliary_loss_mlp": 0.010371, "balance_loss_clip": 1.01466513, "balance_loss_mlp": 1.01621222, "epoch": 0.8428678791522621, "flos": 10450970069760.0, "grad_norm": 1.9975851987268989, "language_loss": 0.80325818, "learning_rate": 2.5337148842394687e-07, "loss": 0.82414281, "num_input_tokens_seen": 302363235, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3515625, "step": 14019, "time_per_iteration": 2.333517074584961 }, { "auxiliary_loss_clip": 0.01052855, "auxiliary_loss_mlp": 0.01039304, "balance_loss_clip": 1.0157125, "balance_loss_mlp": 1.01672208, "epoch": 0.8429280024049302, "flos": 28765987441920.0, "grad_norm": 2.402750953628054, "language_loss": 0.79159433, "learning_rate": 2.531817924498265e-07, "loss": 0.81251591, "num_input_tokens_seen": 302383270, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36132812, "step": 14020, "time_per_iteration": 2.4432289600372314 }, { "auxiliary_loss_clip": 0.01051611, "auxiliary_loss_mlp": 0.01039601, "balance_loss_clip": 1.01469803, "balance_loss_mlp": 1.01545119, "epoch": 0.8429881256575981, "flos": 19536068862720.0, "grad_norm": 1.6238611106838678, "language_loss": 0.7269814, "learning_rate": 2.5299216271522805e-07, "loss": 0.74789351, "num_input_tokens_seen": 302401355, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.36132812, "step": 14021, "time_per_iteration": 2.36151385307312 }, { "auxiliary_loss_clip": 0.01052167, "auxiliary_loss_mlp": 0.01041132, "balance_loss_clip": 1.01724267, "balance_loss_mlp": 1.01569629, "epoch": 0.8430482489102661, "flos": 24789764142720.0, "grad_norm": 1.6281075840262558, "language_loss": 0.70830679, "learning_rate": 2.5280259922734125e-07, "loss": 0.72923976, "num_input_tokens_seen": 302419515, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36523438, "step": 14022, "time_per_iteration": 3.812649726867676 }, { "auxiliary_loss_clip": 0.01054641, "auxiliary_loss_mlp": 0.01042995, "balance_loss_clip": 1.01902199, "balance_loss_mlp": 1.01758647, "epoch": 0.843108372162934, "flos": 21543155769600.0, "grad_norm": 1.8067638791216571, "language_loss": 0.73345995, "learning_rate": 2.526131019933553e-07, "loss": 0.75443631, "num_input_tokens_seen": 302438280, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37109375, "step": 14023, "time_per_iteration": 2.366070508956909 }, { "auxiliary_loss_clip": 0.01050839, "auxiliary_loss_mlp": 0.01041429, "balance_loss_clip": 1.01780224, "balance_loss_mlp": 1.01552415, "epoch": 0.843168495415602, "flos": 24607622246400.0, "grad_norm": 1.507049688553594, "language_loss": 0.67486525, "learning_rate": 2.524236710204559e-07, "loss": 0.69578791, "num_input_tokens_seen": 302460860, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35351562, "step": 14024, "time_per_iteration": 3.7416255474090576 }, { "auxiliary_loss_clip": 0.01050502, "auxiliary_loss_mlp": 0.01039474, "balance_loss_clip": 1.01614499, "balance_loss_mlp": 1.01589036, "epoch": 0.8432286186682699, "flos": 15121825246080.0, "grad_norm": 1.8169778923759035, "language_loss": 0.82014298, "learning_rate": 2.522343063158261e-07, "loss": 0.84104282, "num_input_tokens_seen": 302476980, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.34570312, "step": 14025, "time_per_iteration": 2.3460795879364014 }, { "auxiliary_loss_clip": 0.01048841, "auxiliary_loss_mlp": 0.01029798, "balance_loss_clip": 1.01071274, "balance_loss_mlp": 1.01498461, "epoch": 0.843288741920938, "flos": 20300876305920.0, "grad_norm": 1.4186257156020363, "language_loss": 0.78257287, "learning_rate": 2.5204500788664606e-07, "loss": 0.80335921, "num_input_tokens_seen": 302496380, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.33789062, "step": 14026, "time_per_iteration": 2.3605825901031494 }, { "auxiliary_loss_clip": 0.01051832, "auxiliary_loss_mlp": 0.010406, "balance_loss_clip": 1.01752114, "balance_loss_mlp": 1.01694489, "epoch": 0.8433488651736059, "flos": 23330953226880.0, "grad_norm": 1.8770422417133452, "language_loss": 0.83049273, "learning_rate": 2.518557757400945e-07, "loss": 0.85141701, "num_input_tokens_seen": 302516845, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.34765625, "step": 14027, "time_per_iteration": 2.389634609222412 }, { "auxiliary_loss_clip": 0.01051038, "auxiliary_loss_mlp": 0.01037382, "balance_loss_clip": 1.01526833, "balance_loss_mlp": 1.01533043, "epoch": 0.8434089884262739, "flos": 39456532327680.0, "grad_norm": 1.4109709585345187, "language_loss": 0.57530349, "learning_rate": 2.5166660988334754e-07, "loss": 0.59618771, "num_input_tokens_seen": 302538865, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35742188, "step": 14028, "time_per_iteration": 2.516916036605835 }, { "auxiliary_loss_clip": 0.01049714, "auxiliary_loss_mlp": 0.01032768, "balance_loss_clip": 1.01160848, "balance_loss_mlp": 1.0154593, "epoch": 0.8434691116789418, "flos": 23767716735360.0, "grad_norm": 1.8749214619115566, "language_loss": 0.64570928, "learning_rate": 2.51477510323578e-07, "loss": 0.66653413, "num_input_tokens_seen": 302557970, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34179688, "step": 14029, "time_per_iteration": 2.379504919052124 }, { "auxiliary_loss_clip": 0.01047524, "auxiliary_loss_mlp": 0.01033364, "balance_loss_clip": 1.01380169, "balance_loss_mlp": 1.01462054, "epoch": 0.8435292349316098, "flos": 22670396703360.0, "grad_norm": 1.496541787549182, "language_loss": 0.76205403, "learning_rate": 2.51288477067956e-07, "loss": 0.7828629, "num_input_tokens_seen": 302578915, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.328125, "step": 14030, "time_per_iteration": 2.367100477218628 }, { "auxiliary_loss_clip": 0.01050699, "auxiliary_loss_mlp": 0.01034516, "balance_loss_clip": 1.01278377, "balance_loss_mlp": 1.01679754, "epoch": 0.8435893581842777, "flos": 18842623971840.0, "grad_norm": 1.8096201469772781, "language_loss": 0.84156358, "learning_rate": 2.510995101236502e-07, "loss": 0.86241567, "num_input_tokens_seen": 302596300, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.33984375, "step": 14031, "time_per_iteration": 2.341137170791626 }, { "auxiliary_loss_clip": 0.01050575, "auxiliary_loss_mlp": 0.01032297, "balance_loss_clip": 1.01067233, "balance_loss_mlp": 1.01567948, "epoch": 0.8436494814369457, "flos": 20703180435840.0, "grad_norm": 4.702719676775216, "language_loss": 0.81579912, "learning_rate": 2.509106094978266e-07, "loss": 0.83662784, "num_input_tokens_seen": 302614975, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.34765625, "step": 14032, "time_per_iteration": 2.34971022605896 }, { "auxiliary_loss_clip": 0.01051719, "auxiliary_loss_mlp": 0.01040398, "balance_loss_clip": 1.01507759, "balance_loss_mlp": 1.01573789, "epoch": 0.8437096046896138, "flos": 22673084878080.0, "grad_norm": 1.3871189266839887, "language_loss": 0.76244426, "learning_rate": 2.507217751976478e-07, "loss": 0.78336543, "num_input_tokens_seen": 302636415, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.359375, "step": 14033, "time_per_iteration": 2.3855786323547363 }, { "auxiliary_loss_clip": 0.01051534, "auxiliary_loss_mlp": 0.01037487, "balance_loss_clip": 1.01630354, "balance_loss_mlp": 1.01611876, "epoch": 0.8437697279422817, "flos": 16179204816000.0, "grad_norm": 1.8285747422904859, "language_loss": 0.84834003, "learning_rate": 2.505330072302743e-07, "loss": 0.86923021, "num_input_tokens_seen": 302653605, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.35546875, "step": 14034, "time_per_iteration": 2.3158233165740967 }, { "auxiliary_loss_clip": 0.01051711, "auxiliary_loss_mlp": 0.01033497, "balance_loss_clip": 1.0107398, "balance_loss_mlp": 1.01579595, "epoch": 0.8438298511949497, "flos": 28764625898880.0, "grad_norm": 1.4260534028255158, "language_loss": 0.78802985, "learning_rate": 2.503443056028656e-07, "loss": 0.80888188, "num_input_tokens_seen": 302673965, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.359375, "step": 14035, "time_per_iteration": 2.424017906188965 }, { "auxiliary_loss_clip": 0.01051052, "auxiliary_loss_mlp": 0.0103642, "balance_loss_clip": 1.01506948, "balance_loss_mlp": 1.0166378, "epoch": 0.8438899744476176, "flos": 33723025966080.0, "grad_norm": 1.352840024852956, "language_loss": 0.7283566, "learning_rate": 2.501556703225751e-07, "loss": 0.74923134, "num_input_tokens_seen": 302695560, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34375, "step": 14036, "time_per_iteration": 2.470640182495117 }, { "auxiliary_loss_clip": 0.01047853, "auxiliary_loss_mlp": 0.0103133, "balance_loss_clip": 1.01249504, "balance_loss_mlp": 1.01517355, "epoch": 0.8439500977002856, "flos": 25109848287360.0, "grad_norm": 1.6437226045276623, "language_loss": 0.71038139, "learning_rate": 2.49967101396557e-07, "loss": 0.73117316, "num_input_tokens_seen": 302713480, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.32617188, "step": 14037, "time_per_iteration": 2.389752149581909 }, { "auxiliary_loss_clip": 0.01050234, "auxiliary_loss_mlp": 0.01037326, "balance_loss_clip": 1.01531982, "balance_loss_mlp": 1.01549339, "epoch": 0.8440102209529535, "flos": 32849080012800.0, "grad_norm": 1.6435204291204595, "language_loss": 0.69656575, "learning_rate": 2.4977859883196227e-07, "loss": 0.71744132, "num_input_tokens_seen": 302736860, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34765625, "step": 14038, "time_per_iteration": 3.898721933364868 }, { "auxiliary_loss_clip": 0.01052736, "auxiliary_loss_mlp": 0.01035545, "balance_loss_clip": 1.01101184, "balance_loss_mlp": 1.0167129, "epoch": 0.8440703442056215, "flos": 23729137816320.0, "grad_norm": 1.6141012177492886, "language_loss": 0.7723158, "learning_rate": 2.49590162635938e-07, "loss": 0.79319859, "num_input_tokens_seen": 302757745, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.359375, "step": 14039, "time_per_iteration": 2.407320976257324 }, { "auxiliary_loss_clip": 0.01054199, "auxiliary_loss_mlp": 0.01038664, "balance_loss_clip": 1.01477432, "balance_loss_mlp": 1.016505, "epoch": 0.8441304674582895, "flos": 20192680402560.0, "grad_norm": 2.0645968212891224, "language_loss": 0.80195546, "learning_rate": 2.4940179281563046e-07, "loss": 0.82288414, "num_input_tokens_seen": 302774885, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37695312, "step": 14040, "time_per_iteration": 2.359320640563965 }, { "auxiliary_loss_clip": 0.01051751, "auxiliary_loss_mlp": 0.01043443, "balance_loss_clip": 1.01774192, "balance_loss_mlp": 1.01642156, "epoch": 0.8441905907109575, "flos": 20219145079680.0, "grad_norm": 2.121384663365544, "language_loss": 0.70167196, "learning_rate": 2.492134893781821e-07, "loss": 0.72262388, "num_input_tokens_seen": 302791035, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.35351562, "step": 14041, "time_per_iteration": 2.3709168434143066 }, { "auxiliary_loss_clip": 0.01052821, "auxiliary_loss_mlp": 0.01041196, "balance_loss_clip": 1.01801014, "balance_loss_mlp": 1.01663399, "epoch": 0.8442507139636254, "flos": 13515611103360.0, "grad_norm": 2.0010267224435525, "language_loss": 0.71359539, "learning_rate": 2.490252523307341e-07, "loss": 0.73453557, "num_input_tokens_seen": 302808650, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36132812, "step": 14042, "time_per_iteration": 2.382293939590454 }, { "auxiliary_loss_clip": 0.01049336, "auxiliary_loss_mlp": 0.01030786, "balance_loss_clip": 1.00981736, "balance_loss_mlp": 1.01549792, "epoch": 0.8443108372162934, "flos": 18219319735680.0, "grad_norm": 1.687025571893789, "language_loss": 0.76165164, "learning_rate": 2.4883708168042373e-07, "loss": 0.78245288, "num_input_tokens_seen": 302824605, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.33789062, "step": 14043, "time_per_iteration": 2.409672498703003 }, { "auxiliary_loss_clip": 0.01050292, "auxiliary_loss_mlp": 0.0103142, "balance_loss_clip": 1.01010585, "balance_loss_mlp": 1.01514864, "epoch": 0.8443709604689613, "flos": 16104246393600.0, "grad_norm": 2.6261614097163446, "language_loss": 0.7365073, "learning_rate": 2.486489774343865e-07, "loss": 0.75732446, "num_input_tokens_seen": 302840170, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.3515625, "step": 14044, "time_per_iteration": 2.348379135131836 }, { "auxiliary_loss_clip": 0.01048713, "auxiliary_loss_mlp": 0.0103487, "balance_loss_clip": 1.01375794, "balance_loss_mlp": 1.01476657, "epoch": 0.8444310837216293, "flos": 18511228546560.0, "grad_norm": 1.7724126077677926, "language_loss": 0.75467604, "learning_rate": 2.484609395997559e-07, "loss": 0.77551186, "num_input_tokens_seen": 302858320, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.33984375, "step": 14045, "time_per_iteration": 2.419152021408081 }, { "auxiliary_loss_clip": 0.01049734, "auxiliary_loss_mlp": 0.01037397, "balance_loss_clip": 1.01551032, "balance_loss_mlp": 1.01532507, "epoch": 0.8444912069742974, "flos": 14938950211200.0, "grad_norm": 3.117943971129822, "language_loss": 0.79594171, "learning_rate": 2.4827296818366216e-07, "loss": 0.81681299, "num_input_tokens_seen": 302875255, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34375, "step": 14046, "time_per_iteration": 2.330622673034668 }, { "auxiliary_loss_clip": 0.01052127, "auxiliary_loss_mlp": 0.01036857, "balance_loss_clip": 1.01398087, "balance_loss_mlp": 1.01645887, "epoch": 0.8445513302269653, "flos": 20119292991360.0, "grad_norm": 2.749696319041143, "language_loss": 0.79782689, "learning_rate": 2.4808506319323255e-07, "loss": 0.81871665, "num_input_tokens_seen": 302894690, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35546875, "step": 14047, "time_per_iteration": 2.3647823333740234 }, { "auxiliary_loss_clip": 0.01050564, "auxiliary_loss_mlp": 0.01040188, "balance_loss_clip": 1.01836109, "balance_loss_mlp": 1.0164063, "epoch": 0.8446114534796333, "flos": 31169722838400.0, "grad_norm": 1.741070088683045, "language_loss": 0.72971392, "learning_rate": 2.478972246355935e-07, "loss": 0.75062144, "num_input_tokens_seen": 302912405, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34179688, "step": 14048, "time_per_iteration": 2.431424617767334 }, { "auxiliary_loss_clip": 0.01052916, "auxiliary_loss_mlp": 0.01035516, "balance_loss_clip": 1.0120554, "balance_loss_mlp": 1.01669765, "epoch": 0.8446715767323012, "flos": 23947275191040.0, "grad_norm": 1.3538520502561788, "language_loss": 0.74179101, "learning_rate": 2.477094525178667e-07, "loss": 0.76267534, "num_input_tokens_seen": 302932525, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 14049, "time_per_iteration": 2.392207622528076 }, { "auxiliary_loss_clip": 0.01007967, "auxiliary_loss_mlp": 0.01005111, "balance_loss_clip": 1.00272703, "balance_loss_mlp": 1.00111318, "epoch": 0.8447316999849692, "flos": 67981653233280.0, "grad_norm": 0.816100666537498, "language_loss": 0.60741127, "learning_rate": 2.475217468471729e-07, "loss": 0.62754214, "num_input_tokens_seen": 302991285, "router_z_loss_clip": 0.02380371, "router_z_loss_mlp": 0.06835938, "step": 14050, "time_per_iteration": 2.97776198387146 }, { "auxiliary_loss_clip": 0.01051677, "auxiliary_loss_mlp": 0.01036133, "balance_loss_clip": 1.01185036, "balance_loss_mlp": 1.01603889, "epoch": 0.8447918232376371, "flos": 22417834861440.0, "grad_norm": 2.0819184109739184, "language_loss": 0.72993493, "learning_rate": 2.473341076306303e-07, "loss": 0.75081307, "num_input_tokens_seen": 303009515, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.35546875, "step": 14051, "time_per_iteration": 2.4369964599609375 }, { "auxiliary_loss_clip": 0.01050087, "auxiliary_loss_mlp": 0.01034721, "balance_loss_clip": 1.01245236, "balance_loss_mlp": 1.01570797, "epoch": 0.8448519464903052, "flos": 23693072515200.0, "grad_norm": 1.8961678058458085, "language_loss": 0.76257777, "learning_rate": 2.471465348753547e-07, "loss": 0.78342593, "num_input_tokens_seen": 303026905, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34375, "step": 14052, "time_per_iteration": 2.3618345260620117 }, { "auxiliary_loss_clip": 0.01048262, "auxiliary_loss_mlp": 0.01033249, "balance_loss_clip": 1.01273322, "balance_loss_mlp": 1.01602387, "epoch": 0.8449120697429731, "flos": 13735040198400.0, "grad_norm": 2.128196504283438, "language_loss": 0.75999457, "learning_rate": 2.469590285884575e-07, "loss": 0.78080964, "num_input_tokens_seen": 303045245, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.32226562, "step": 14053, "time_per_iteration": 2.3419103622436523 }, { "auxiliary_loss_clip": 0.01050084, "auxiliary_loss_mlp": 0.01033705, "balance_loss_clip": 1.01090002, "balance_loss_mlp": 1.01561642, "epoch": 0.8449721929956411, "flos": 20885741268480.0, "grad_norm": 1.7447893641752743, "language_loss": 0.75266963, "learning_rate": 2.467715887770494e-07, "loss": 0.77350748, "num_input_tokens_seen": 303065205, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.34570312, "step": 14054, "time_per_iteration": 3.615701198577881 }, { "auxiliary_loss_clip": 0.01055428, "auxiliary_loss_mlp": 0.01037567, "balance_loss_clip": 1.01370144, "balance_loss_mlp": 1.01825917, "epoch": 0.845032316248309, "flos": 33215598132480.0, "grad_norm": 1.5127575897453718, "language_loss": 0.79013205, "learning_rate": 2.4658421544823895e-07, "loss": 0.81106198, "num_input_tokens_seen": 303088250, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 14055, "time_per_iteration": 2.4808404445648193 }, { "auxiliary_loss_clip": 0.01048988, "auxiliary_loss_mlp": 0.01031423, "balance_loss_clip": 1.01051378, "balance_loss_mlp": 1.0147748, "epoch": 0.845092439500977, "flos": 23584143473280.0, "grad_norm": 1.615999552709702, "language_loss": 0.73982286, "learning_rate": 2.463969086091302e-07, "loss": 0.76062703, "num_input_tokens_seen": 303109280, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34375, "step": 14056, "time_per_iteration": 2.38860821723938 }, { "auxiliary_loss_clip": 0.01054955, "auxiliary_loss_mlp": 0.0103984, "balance_loss_clip": 1.01511621, "balance_loss_mlp": 1.01750314, "epoch": 0.8451525627536449, "flos": 13333085182080.0, "grad_norm": 2.4281441400675914, "language_loss": 0.69095731, "learning_rate": 2.4620966826682686e-07, "loss": 0.71190524, "num_input_tokens_seen": 303126075, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.375, "step": 14057, "time_per_iteration": 2.3328404426574707 }, { "auxiliary_loss_clip": 0.01052563, "auxiliary_loss_mlp": 0.01034843, "balance_loss_clip": 1.01125097, "balance_loss_mlp": 1.01605535, "epoch": 0.8452126860063129, "flos": 27816768864000.0, "grad_norm": 1.6792598115877024, "language_loss": 0.78910136, "learning_rate": 2.460224944284284e-07, "loss": 0.80997539, "num_input_tokens_seen": 303146920, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36523438, "step": 14058, "time_per_iteration": 2.400944709777832 }, { "auxiliary_loss_clip": 0.01053776, "auxiliary_loss_mlp": 0.01039177, "balance_loss_clip": 1.01628852, "balance_loss_mlp": 1.01712632, "epoch": 0.845272809258981, "flos": 27123498529920.0, "grad_norm": 1.7908678159045432, "language_loss": 0.70265949, "learning_rate": 2.45835387101033e-07, "loss": 0.72358906, "num_input_tokens_seen": 303167885, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3671875, "step": 14059, "time_per_iteration": 2.400639533996582 }, { "auxiliary_loss_clip": 0.01053856, "auxiliary_loss_mlp": 0.01041432, "balance_loss_clip": 1.01720881, "balance_loss_mlp": 1.01679623, "epoch": 0.8453329325116489, "flos": 18331600268160.0, "grad_norm": 3.1995526759682185, "language_loss": 0.58886796, "learning_rate": 2.4564834629173516e-07, "loss": 0.60982078, "num_input_tokens_seen": 303185000, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36914062, "step": 14060, "time_per_iteration": 2.322502613067627 }, { "auxiliary_loss_clip": 0.01053129, "auxiliary_loss_mlp": 0.01036517, "balance_loss_clip": 1.01144731, "balance_loss_mlp": 1.01579356, "epoch": 0.8453930557643169, "flos": 22674132218880.0, "grad_norm": 1.6670972418422754, "language_loss": 0.76666665, "learning_rate": 2.454613720076277e-07, "loss": 0.78756315, "num_input_tokens_seen": 303205210, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37304688, "step": 14061, "time_per_iteration": 2.3755078315734863 }, { "auxiliary_loss_clip": 0.01053817, "auxiliary_loss_mlp": 0.01034986, "balance_loss_clip": 1.01148963, "balance_loss_mlp": 1.01635253, "epoch": 0.8454531790169848, "flos": 22486299770880.0, "grad_norm": 2.4084213862985204, "language_loss": 0.72347617, "learning_rate": 2.452744642558013e-07, "loss": 0.74436414, "num_input_tokens_seen": 303224655, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.375, "step": 14062, "time_per_iteration": 3.788644313812256 }, { "auxiliary_loss_clip": 0.01007939, "auxiliary_loss_mlp": 0.0100111, "balance_loss_clip": 0.99891669, "balance_loss_mlp": 1.00098348, "epoch": 0.8455133022696528, "flos": 58274925949440.0, "grad_norm": 0.6353977904721891, "language_loss": 0.52700102, "learning_rate": 2.450876230433432e-07, "loss": 0.54709154, "num_input_tokens_seen": 303289645, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.06933594, "step": 14063, "time_per_iteration": 3.0691773891448975 }, { "auxiliary_loss_clip": 0.01048426, "auxiliary_loss_mlp": 0.01033226, "balance_loss_clip": 1.01399779, "balance_loss_mlp": 1.01603615, "epoch": 0.8455734255223207, "flos": 21360210912000.0, "grad_norm": 8.977404707849722, "language_loss": 0.83472002, "learning_rate": 2.449008483773378e-07, "loss": 0.85553658, "num_input_tokens_seen": 303308350, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.32421875, "step": 14064, "time_per_iteration": 3.687551975250244 }, { "auxiliary_loss_clip": 0.0105304, "auxiliary_loss_mlp": 0.0103877, "balance_loss_clip": 1.01352179, "balance_loss_mlp": 1.01597404, "epoch": 0.8456335487749888, "flos": 20448209710080.0, "grad_norm": 1.9641555357005338, "language_loss": 0.7329247, "learning_rate": 2.447141402648685e-07, "loss": 0.75384283, "num_input_tokens_seen": 303325230, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37109375, "step": 14065, "time_per_iteration": 2.3486921787261963 }, { "auxiliary_loss_clip": 0.0105013, "auxiliary_loss_mlp": 0.01034453, "balance_loss_clip": 1.0132339, "balance_loss_mlp": 1.01629472, "epoch": 0.8456936720276567, "flos": 28839619232640.0, "grad_norm": 1.463001037382905, "language_loss": 0.77889204, "learning_rate": 2.445274987130146e-07, "loss": 0.79973787, "num_input_tokens_seen": 303345810, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.33984375, "step": 14066, "time_per_iteration": 2.4267537593841553 }, { "auxiliary_loss_clip": 0.01050946, "auxiliary_loss_mlp": 0.01036203, "balance_loss_clip": 1.01338613, "balance_loss_mlp": 1.01628721, "epoch": 0.8457537952803247, "flos": 22671828069120.0, "grad_norm": 1.719343216206393, "language_loss": 0.70846236, "learning_rate": 2.4434092372885363e-07, "loss": 0.72933382, "num_input_tokens_seen": 303365140, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34765625, "step": 14067, "time_per_iteration": 2.3682994842529297 }, { "auxiliary_loss_clip": 0.01049972, "auxiliary_loss_mlp": 0.01032659, "balance_loss_clip": 1.01022351, "balance_loss_mlp": 1.01481557, "epoch": 0.8458139185329926, "flos": 33801510435840.0, "grad_norm": 1.7601486411776635, "language_loss": 0.72642308, "learning_rate": 2.4415441531946144e-07, "loss": 0.74724948, "num_input_tokens_seen": 303386150, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3515625, "step": 14068, "time_per_iteration": 2.4987051486968994 }, { "auxiliary_loss_clip": 0.01007671, "auxiliary_loss_mlp": 0.01002822, "balance_loss_clip": 1.00073624, "balance_loss_mlp": 1.00095153, "epoch": 0.8458740417856606, "flos": 70292274433920.0, "grad_norm": 0.6955618834500206, "language_loss": 0.60557467, "learning_rate": 2.4396797349190976e-07, "loss": 0.62567961, "num_input_tokens_seen": 303453770, "router_z_loss_clip": 0.02087402, "router_z_loss_mlp": 0.06738281, "step": 14069, "time_per_iteration": 3.1007158756256104 }, { "auxiliary_loss_clip": 0.01050997, "auxiliary_loss_mlp": 0.0103646, "balance_loss_clip": 1.01526475, "balance_loss_mlp": 1.01600564, "epoch": 0.8459341650383285, "flos": 24169706663040.0, "grad_norm": 1.7393564428343797, "language_loss": 0.75202239, "learning_rate": 2.4378159825326804e-07, "loss": 0.77289701, "num_input_tokens_seen": 303474520, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34960938, "step": 14070, "time_per_iteration": 2.395197629928589 }, { "auxiliary_loss_clip": 0.01052083, "auxiliary_loss_mlp": 0.01042935, "balance_loss_clip": 1.02045262, "balance_loss_mlp": 1.01710296, "epoch": 0.8459942882909965, "flos": 38179618928640.0, "grad_norm": 1.619984189947319, "language_loss": 0.67794824, "learning_rate": 2.435952896106039e-07, "loss": 0.69889843, "num_input_tokens_seen": 303497345, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34960938, "step": 14071, "time_per_iteration": 2.5020875930786133 }, { "auxiliary_loss_clip": 0.01007469, "auxiliary_loss_mlp": 0.01004224, "balance_loss_clip": 1.00197101, "balance_loss_mlp": 1.00076425, "epoch": 0.8460544115436646, "flos": 64115092114560.0, "grad_norm": 0.7356129684498104, "language_loss": 0.61095899, "learning_rate": 2.4340904757098313e-07, "loss": 0.63107592, "num_input_tokens_seen": 303554890, "router_z_loss_clip": 0.02258301, "router_z_loss_mlp": 0.06738281, "step": 14072, "time_per_iteration": 2.8788344860076904 }, { "auxiliary_loss_clip": 0.01052664, "auxiliary_loss_mlp": 0.01042236, "balance_loss_clip": 1.01628399, "balance_loss_mlp": 1.01536918, "epoch": 0.8461145347963325, "flos": 24169671751680.0, "grad_norm": 1.7463466683440252, "language_loss": 0.73720169, "learning_rate": 2.4322287214146664e-07, "loss": 0.7581507, "num_input_tokens_seen": 303574380, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37304688, "step": 14073, "time_per_iteration": 2.378504991531372 }, { "auxiliary_loss_clip": 0.01056524, "auxiliary_loss_mlp": 0.01044614, "balance_loss_clip": 1.01803005, "balance_loss_mlp": 1.01730585, "epoch": 0.8461746580490005, "flos": 34892441688960.0, "grad_norm": 2.8761451757139516, "language_loss": 0.78925747, "learning_rate": 2.430367633291155e-07, "loss": 0.81026888, "num_input_tokens_seen": 303594910, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.39257812, "step": 14074, "time_per_iteration": 2.4853456020355225 }, { "auxiliary_loss_clip": 0.01051543, "auxiliary_loss_mlp": 0.01039623, "balance_loss_clip": 1.01735437, "balance_loss_mlp": 1.01632857, "epoch": 0.8462347813016684, "flos": 25555828394880.0, "grad_norm": 2.256001541958331, "language_loss": 0.76470953, "learning_rate": 2.4285072114098583e-07, "loss": 0.78562117, "num_input_tokens_seen": 303613520, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3515625, "step": 14075, "time_per_iteration": 2.4008069038391113 }, { "auxiliary_loss_clip": 0.01050385, "auxiliary_loss_mlp": 0.01035436, "balance_loss_clip": 1.01294076, "balance_loss_mlp": 1.015522, "epoch": 0.8462949045543364, "flos": 21324250344960.0, "grad_norm": 2.2340895076231457, "language_loss": 0.74285448, "learning_rate": 2.4266474558413355e-07, "loss": 0.76371264, "num_input_tokens_seen": 303631225, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34765625, "step": 14076, "time_per_iteration": 2.3767597675323486 }, { "auxiliary_loss_clip": 0.01054103, "auxiliary_loss_mlp": 0.01048216, "balance_loss_clip": 1.02438629, "balance_loss_mlp": 1.01734567, "epoch": 0.8463550278070043, "flos": 22636356261120.0, "grad_norm": 1.9605213231215752, "language_loss": 0.78674448, "learning_rate": 2.4247883666560945e-07, "loss": 0.80776763, "num_input_tokens_seen": 303649175, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3671875, "step": 14077, "time_per_iteration": 3.848389148712158 }, { "auxiliary_loss_clip": 0.01053461, "auxiliary_loss_mlp": 0.01037937, "balance_loss_clip": 1.01489425, "balance_loss_mlp": 1.01686668, "epoch": 0.8464151510596724, "flos": 13004761956480.0, "grad_norm": 2.0313029651281456, "language_loss": 0.76352459, "learning_rate": 2.422929943924643e-07, "loss": 0.78443855, "num_input_tokens_seen": 303665915, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.3671875, "step": 14078, "time_per_iteration": 2.327629327774048 }, { "auxiliary_loss_clip": 0.01050859, "auxiliary_loss_mlp": 0.01031968, "balance_loss_clip": 1.00924659, "balance_loss_mlp": 1.01568925, "epoch": 0.8464752743123403, "flos": 15704036945280.0, "grad_norm": 7.987137618837253, "language_loss": 0.85962403, "learning_rate": 2.4210721877174565e-07, "loss": 0.88045228, "num_input_tokens_seen": 303679985, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.3515625, "step": 14079, "time_per_iteration": 2.405475616455078 }, { "auxiliary_loss_clip": 0.01056518, "auxiliary_loss_mlp": 0.01044146, "balance_loss_clip": 1.01772904, "balance_loss_mlp": 1.01726556, "epoch": 0.8465353975650083, "flos": 21652852861440.0, "grad_norm": 2.566866230367173, "language_loss": 0.59778905, "learning_rate": 2.419215098104965e-07, "loss": 0.61879563, "num_input_tokens_seen": 303698470, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.39257812, "step": 14080, "time_per_iteration": 2.4580578804016113 }, { "auxiliary_loss_clip": 0.01052471, "auxiliary_loss_mlp": 0.01038086, "balance_loss_clip": 1.01417255, "balance_loss_mlp": 1.01544249, "epoch": 0.8465955208176762, "flos": 18514649859840.0, "grad_norm": 2.0179232201050197, "language_loss": 0.68227386, "learning_rate": 2.4173586751576014e-07, "loss": 0.70317942, "num_input_tokens_seen": 303716415, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 14081, "time_per_iteration": 2.433373212814331 }, { "auxiliary_loss_clip": 0.01054265, "auxiliary_loss_mlp": 0.01040508, "balance_loss_clip": 1.01827526, "balance_loss_mlp": 1.01643491, "epoch": 0.8466556440703442, "flos": 24199592653440.0, "grad_norm": 1.6998601142191438, "language_loss": 0.74215543, "learning_rate": 2.41550291894576e-07, "loss": 0.76310313, "num_input_tokens_seen": 303734490, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.37890625, "step": 14082, "time_per_iteration": 2.449704647064209 }, { "auxiliary_loss_clip": 0.01051949, "auxiliary_loss_mlp": 0.010361, "balance_loss_clip": 1.01378405, "balance_loss_mlp": 1.01581597, "epoch": 0.8467157673230121, "flos": 20374857210240.0, "grad_norm": 1.793017148134033, "language_loss": 0.76784945, "learning_rate": 2.413647829539809e-07, "loss": 0.78872991, "num_input_tokens_seen": 303752310, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.36132812, "step": 14083, "time_per_iteration": 2.433077096939087 }, { "auxiliary_loss_clip": 0.0105485, "auxiliary_loss_mlp": 0.01041272, "balance_loss_clip": 1.01588023, "balance_loss_mlp": 1.01666236, "epoch": 0.8467758905756801, "flos": 28472437797120.0, "grad_norm": 2.023256167341085, "language_loss": 0.67092478, "learning_rate": 2.411793407010092e-07, "loss": 0.69188595, "num_input_tokens_seen": 303776065, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3828125, "step": 14084, "time_per_iteration": 2.4911766052246094 }, { "auxiliary_loss_clip": 0.0105209, "auxiliary_loss_mlp": 0.0103298, "balance_loss_clip": 1.01103377, "balance_loss_mlp": 1.01699305, "epoch": 0.8468360138283482, "flos": 11691748344960.0, "grad_norm": 2.3257229671657718, "language_loss": 0.71885228, "learning_rate": 2.409939651426938e-07, "loss": 0.73970294, "num_input_tokens_seen": 303793500, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.3515625, "step": 14085, "time_per_iteration": 2.400099754333496 }, { "auxiliary_loss_clip": 0.01051333, "auxiliary_loss_mlp": 0.01038307, "balance_loss_clip": 1.01447678, "balance_loss_mlp": 1.01523614, "epoch": 0.8468961370810161, "flos": 24606714551040.0, "grad_norm": 1.7064022716174612, "language_loss": 0.72022849, "learning_rate": 2.408086562860634e-07, "loss": 0.74112487, "num_input_tokens_seen": 303814835, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36132812, "step": 14086, "time_per_iteration": 2.4531853199005127 }, { "auxiliary_loss_clip": 0.01050348, "auxiliary_loss_mlp": 0.01040245, "balance_loss_clip": 1.01831043, "balance_loss_mlp": 1.01570225, "epoch": 0.8469562603336841, "flos": 19608792958080.0, "grad_norm": 2.5164785876138733, "language_loss": 0.76179343, "learning_rate": 2.4062341413814445e-07, "loss": 0.78269935, "num_input_tokens_seen": 303834505, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34765625, "step": 14087, "time_per_iteration": 2.3480405807495117 }, { "auxiliary_loss_clip": 0.01052208, "auxiliary_loss_mlp": 0.01032256, "balance_loss_clip": 1.0105598, "balance_loss_mlp": 1.01740801, "epoch": 0.847016383586352, "flos": 22637822538240.0, "grad_norm": 1.3672241044346496, "language_loss": 0.74484235, "learning_rate": 2.4043823870596227e-07, "loss": 0.76568699, "num_input_tokens_seen": 303855050, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34765625, "step": 14088, "time_per_iteration": 2.3802154064178467 }, { "auxiliary_loss_clip": 0.01053325, "auxiliary_loss_mlp": 0.01040602, "balance_loss_clip": 1.01685548, "balance_loss_mlp": 1.01634634, "epoch": 0.84707650683902, "flos": 20959093768320.0, "grad_norm": 2.3299484383038296, "language_loss": 0.73592508, "learning_rate": 2.402531299965387e-07, "loss": 0.75686437, "num_input_tokens_seen": 303875635, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.36914062, "step": 14089, "time_per_iteration": 2.4030070304870605 }, { "auxiliary_loss_clip": 0.01049911, "auxiliary_loss_mlp": 0.01033345, "balance_loss_clip": 1.01348436, "balance_loss_mlp": 1.01650548, "epoch": 0.8471366300916879, "flos": 24091990243200.0, "grad_norm": 1.4386294465822587, "language_loss": 0.79864252, "learning_rate": 2.400680880168928e-07, "loss": 0.81947505, "num_input_tokens_seen": 303896750, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.33398438, "step": 14090, "time_per_iteration": 2.42712140083313 }, { "auxiliary_loss_clip": 0.01052641, "auxiliary_loss_mlp": 0.01046825, "balance_loss_clip": 1.02170718, "balance_loss_mlp": 1.01569486, "epoch": 0.847196753344356, "flos": 18331914470400.0, "grad_norm": 2.2745768792216436, "language_loss": 0.78366488, "learning_rate": 2.3988311277404085e-07, "loss": 0.80465961, "num_input_tokens_seen": 303915435, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36914062, "step": 14091, "time_per_iteration": 2.3472237586975098 }, { "auxiliary_loss_clip": 0.01007474, "auxiliary_loss_mlp": 0.01004838, "balance_loss_clip": 1.00253725, "balance_loss_mlp": 1.00077438, "epoch": 0.8472568765970239, "flos": 49564584357120.0, "grad_norm": 0.8193754219782357, "language_loss": 0.59433031, "learning_rate": 2.396982042749982e-07, "loss": 0.61445343, "num_input_tokens_seen": 303977245, "router_z_loss_clip": 0.02294922, "router_z_loss_mlp": 0.06738281, "step": 14092, "time_per_iteration": 3.070391893386841 }, { "auxiliary_loss_clip": 0.01051227, "auxiliary_loss_mlp": 0.01038475, "balance_loss_clip": 1.01484764, "balance_loss_mlp": 1.01568723, "epoch": 0.8473169998496919, "flos": 19278130671360.0, "grad_norm": 1.6634163539024676, "language_loss": 0.70982414, "learning_rate": 2.395133625267756e-07, "loss": 0.73072118, "num_input_tokens_seen": 303996055, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35546875, "step": 14093, "time_per_iteration": 2.383718729019165 }, { "auxiliary_loss_clip": 0.01049075, "auxiliary_loss_mlp": 0.010292, "balance_loss_clip": 1.00806463, "balance_loss_mlp": 1.01512742, "epoch": 0.8473771231023598, "flos": 17674604703360.0, "grad_norm": 2.2366740438893022, "language_loss": 0.8475976, "learning_rate": 2.3932858753638263e-07, "loss": 0.86838037, "num_input_tokens_seen": 304012205, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.33984375, "step": 14094, "time_per_iteration": 3.6670734882354736 }, { "auxiliary_loss_clip": 0.01049546, "auxiliary_loss_mlp": 0.01039694, "balance_loss_clip": 1.01798582, "balance_loss_mlp": 1.01569068, "epoch": 0.8474372463550278, "flos": 26358551441280.0, "grad_norm": 1.666315957630569, "language_loss": 0.72060108, "learning_rate": 2.3914387931082626e-07, "loss": 0.74149346, "num_input_tokens_seen": 304033475, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.33789062, "step": 14095, "time_per_iteration": 2.440690279006958 }, { "auxiliary_loss_clip": 0.0104949, "auxiliary_loss_mlp": 0.01040062, "balance_loss_clip": 1.01735306, "balance_loss_mlp": 1.01485598, "epoch": 0.8474973696076957, "flos": 23400989147520.0, "grad_norm": 1.6613392594477725, "language_loss": 0.81536281, "learning_rate": 2.3895923785711105e-07, "loss": 0.83625835, "num_input_tokens_seen": 304051845, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34570312, "step": 14096, "time_per_iteration": 2.374547004699707 }, { "auxiliary_loss_clip": 0.01053673, "auxiliary_loss_mlp": 0.01045106, "balance_loss_clip": 1.02044177, "balance_loss_mlp": 1.01619625, "epoch": 0.8475574928603637, "flos": 25074690681600.0, "grad_norm": 1.8869473062754396, "language_loss": 0.78710103, "learning_rate": 2.387746631822374e-07, "loss": 0.80808878, "num_input_tokens_seen": 304069965, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.375, "step": 14097, "time_per_iteration": 2.4155218601226807 }, { "auxiliary_loss_clip": 0.01051318, "auxiliary_loss_mlp": 0.01036788, "balance_loss_clip": 1.0149368, "balance_loss_mlp": 1.01655912, "epoch": 0.8476176161130318, "flos": 19965885010560.0, "grad_norm": 1.6451445070223836, "language_loss": 0.81299186, "learning_rate": 2.385901552932048e-07, "loss": 0.83387297, "num_input_tokens_seen": 304086805, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34765625, "step": 14098, "time_per_iteration": 2.356947183609009 }, { "auxiliary_loss_clip": 0.01051399, "auxiliary_loss_mlp": 0.01036663, "balance_loss_clip": 1.01462078, "balance_loss_mlp": 1.01570296, "epoch": 0.8476777393656997, "flos": 21284833553280.0, "grad_norm": 1.8061116992736108, "language_loss": 0.72840947, "learning_rate": 2.3840571419701062e-07, "loss": 0.74929011, "num_input_tokens_seen": 304105865, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35546875, "step": 14099, "time_per_iteration": 2.3820905685424805 }, { "auxiliary_loss_clip": 0.01052109, "auxiliary_loss_mlp": 0.01040841, "balance_loss_clip": 1.01487708, "balance_loss_mlp": 1.01581228, "epoch": 0.8477378626183677, "flos": 29970176745600.0, "grad_norm": 1.847100518505733, "language_loss": 0.64791375, "learning_rate": 2.3822133990064787e-07, "loss": 0.66884321, "num_input_tokens_seen": 304128300, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.36328125, "step": 14100, "time_per_iteration": 2.4516537189483643 }, { "auxiliary_loss_clip": 0.01053033, "auxiliary_loss_mlp": 0.01042061, "balance_loss_clip": 1.018803, "balance_loss_mlp": 1.01615751, "epoch": 0.8477979858710356, "flos": 24236740206720.0, "grad_norm": 1.9374589707803758, "language_loss": 0.75035322, "learning_rate": 2.380370324111085e-07, "loss": 0.77130413, "num_input_tokens_seen": 304143695, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36914062, "step": 14101, "time_per_iteration": 2.3584036827087402 }, { "auxiliary_loss_clip": 0.01052469, "auxiliary_loss_mlp": 0.01033012, "balance_loss_clip": 1.01185274, "balance_loss_mlp": 1.01690578, "epoch": 0.8478581091237036, "flos": 25592487189120.0, "grad_norm": 1.7245593968836188, "language_loss": 0.72540778, "learning_rate": 2.3785279173538163e-07, "loss": 0.74626261, "num_input_tokens_seen": 304165800, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.35546875, "step": 14102, "time_per_iteration": 3.8572287559509277 }, { "auxiliary_loss_clip": 0.01053259, "auxiliary_loss_mlp": 0.01035629, "balance_loss_clip": 1.01107156, "balance_loss_mlp": 1.01675963, "epoch": 0.8479182323763715, "flos": 12056311428480.0, "grad_norm": 2.334240603147291, "language_loss": 0.83815849, "learning_rate": 2.3766861788045366e-07, "loss": 0.85904735, "num_input_tokens_seen": 304182910, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36523438, "step": 14103, "time_per_iteration": 3.780444383621216 }, { "auxiliary_loss_clip": 0.01049872, "auxiliary_loss_mlp": 0.01034857, "balance_loss_clip": 1.01289892, "balance_loss_mlp": 1.01536334, "epoch": 0.8479783556290396, "flos": 21432341514240.0, "grad_norm": 1.871872713532463, "language_loss": 0.79645598, "learning_rate": 2.374845108533079e-07, "loss": 0.8173033, "num_input_tokens_seen": 304200175, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34375, "step": 14104, "time_per_iteration": 2.3724448680877686 }, { "auxiliary_loss_clip": 0.01054972, "auxiliary_loss_mlp": 0.01041976, "balance_loss_clip": 1.01667976, "balance_loss_mlp": 1.01738477, "epoch": 0.8480384788817075, "flos": 19641716236800.0, "grad_norm": 1.7802614585535894, "language_loss": 0.79658109, "learning_rate": 2.3730047066092607e-07, "loss": 0.81755054, "num_input_tokens_seen": 304217775, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.375, "step": 14105, "time_per_iteration": 2.3625824451446533 }, { "auxiliary_loss_clip": 0.01055676, "auxiliary_loss_mlp": 0.01040123, "balance_loss_clip": 1.01419544, "balance_loss_mlp": 1.01697087, "epoch": 0.8480986021343755, "flos": 22488184984320.0, "grad_norm": 3.3192270708078317, "language_loss": 0.51168412, "learning_rate": 2.3711649731028749e-07, "loss": 0.53264213, "num_input_tokens_seen": 304235760, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.38671875, "step": 14106, "time_per_iteration": 2.3698086738586426 }, { "auxiliary_loss_clip": 0.01052066, "auxiliary_loss_mlp": 0.01041581, "balance_loss_clip": 1.01804852, "balance_loss_mlp": 1.01579762, "epoch": 0.8481587253870434, "flos": 22089476724480.0, "grad_norm": 1.7920148056216658, "language_loss": 0.7652117, "learning_rate": 2.3693259080836792e-07, "loss": 0.78614819, "num_input_tokens_seen": 304253985, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36328125, "step": 14107, "time_per_iteration": 2.366255521774292 }, { "auxiliary_loss_clip": 0.010527, "auxiliary_loss_mlp": 0.01036897, "balance_loss_clip": 1.01307917, "balance_loss_mlp": 1.01646996, "epoch": 0.8482188486397114, "flos": 33581313290880.0, "grad_norm": 1.608095364799378, "language_loss": 0.74699724, "learning_rate": 2.3674875116214087e-07, "loss": 0.7678932, "num_input_tokens_seen": 304276785, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36328125, "step": 14108, "time_per_iteration": 2.462435245513916 }, { "auxiliary_loss_clip": 0.0105004, "auxiliary_loss_mlp": 0.01037555, "balance_loss_clip": 1.01386774, "balance_loss_mlp": 1.0155077, "epoch": 0.8482789718923793, "flos": 20918455079040.0, "grad_norm": 2.0371417899081963, "language_loss": 0.74013305, "learning_rate": 2.3656497837857836e-07, "loss": 0.76100898, "num_input_tokens_seen": 304296310, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.34570312, "step": 14109, "time_per_iteration": 2.3611695766448975 }, { "auxiliary_loss_clip": 0.0105027, "auxiliary_loss_mlp": 0.01036517, "balance_loss_clip": 1.01379538, "balance_loss_mlp": 1.01545501, "epoch": 0.8483390951450474, "flos": 12895379066880.0, "grad_norm": 1.9604291675362098, "language_loss": 0.75628936, "learning_rate": 2.3638127246464811e-07, "loss": 0.77715719, "num_input_tokens_seen": 304311715, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.34765625, "step": 14110, "time_per_iteration": 2.3429877758026123 }, { "auxiliary_loss_clip": 0.01052281, "auxiliary_loss_mlp": 0.0104059, "balance_loss_clip": 1.01866698, "balance_loss_mlp": 1.01697755, "epoch": 0.8483992183977154, "flos": 25080485967360.0, "grad_norm": 1.6446367706017544, "language_loss": 0.77143693, "learning_rate": 2.3619763342731658e-07, "loss": 0.79236567, "num_input_tokens_seen": 304331910, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.35351562, "step": 14111, "time_per_iteration": 2.4031736850738525 }, { "auxiliary_loss_clip": 0.01051322, "auxiliary_loss_mlp": 0.01032669, "balance_loss_clip": 1.01160467, "balance_loss_mlp": 1.01711273, "epoch": 0.8484593416503833, "flos": 25556247331200.0, "grad_norm": 1.800503090833019, "language_loss": 0.68415046, "learning_rate": 2.3601406127354772e-07, "loss": 0.70499039, "num_input_tokens_seen": 304351405, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34179688, "step": 14112, "time_per_iteration": 2.399285316467285 }, { "auxiliary_loss_clip": 0.01052344, "auxiliary_loss_mlp": 0.01036128, "balance_loss_clip": 1.01295376, "balance_loss_mlp": 1.01567137, "epoch": 0.8485194649030513, "flos": 27197235054720.0, "grad_norm": 1.3492068549958418, "language_loss": 0.74451977, "learning_rate": 2.3583055601030312e-07, "loss": 0.76540446, "num_input_tokens_seen": 304372935, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.3671875, "step": 14113, "time_per_iteration": 2.422658681869507 }, { "auxiliary_loss_clip": 0.01051824, "auxiliary_loss_mlp": 0.01037704, "balance_loss_clip": 1.01392198, "balance_loss_mlp": 1.01642382, "epoch": 0.8485795881557192, "flos": 24204794446080.0, "grad_norm": 2.479367587742233, "language_loss": 0.66761798, "learning_rate": 2.3564711764454003e-07, "loss": 0.68851328, "num_input_tokens_seen": 304393070, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35351562, "step": 14114, "time_per_iteration": 2.3939766883850098 }, { "auxiliary_loss_clip": 0.01054277, "auxiliary_loss_mlp": 0.01044877, "balance_loss_clip": 1.02124965, "balance_loss_mlp": 1.01760805, "epoch": 0.8486397114083872, "flos": 21140607260160.0, "grad_norm": 1.6043460797270042, "language_loss": 0.80354118, "learning_rate": 2.3546374618321495e-07, "loss": 0.82453275, "num_input_tokens_seen": 304411195, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3671875, "step": 14115, "time_per_iteration": 2.371312379837036 }, { "auxiliary_loss_clip": 0.0105167, "auxiliary_loss_mlp": 0.01035789, "balance_loss_clip": 1.01323438, "balance_loss_mlp": 1.01592851, "epoch": 0.8486998346610551, "flos": 19973740066560.0, "grad_norm": 1.8042622614463173, "language_loss": 0.79720098, "learning_rate": 2.3528044163328187e-07, "loss": 0.81807554, "num_input_tokens_seen": 304429425, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35742188, "step": 14116, "time_per_iteration": 2.3490540981292725 }, { "auxiliary_loss_clip": 0.01052641, "auxiliary_loss_mlp": 0.01034272, "balance_loss_clip": 1.01032281, "balance_loss_mlp": 1.01612902, "epoch": 0.8487599579137232, "flos": 19791283968000.0, "grad_norm": 1.747321330865072, "language_loss": 0.69130528, "learning_rate": 2.3509720400169076e-07, "loss": 0.71217442, "num_input_tokens_seen": 304447460, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36523438, "step": 14117, "time_per_iteration": 3.7858188152313232 }, { "auxiliary_loss_clip": 0.01052896, "auxiliary_loss_mlp": 0.01036895, "balance_loss_clip": 1.01418531, "balance_loss_mlp": 1.01603711, "epoch": 0.8488200811663911, "flos": 26394826210560.0, "grad_norm": 2.017031225200575, "language_loss": 0.66186351, "learning_rate": 2.3491403329539096e-07, "loss": 0.68276143, "num_input_tokens_seen": 304468230, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3671875, "step": 14118, "time_per_iteration": 2.38550066947937 }, { "auxiliary_loss_clip": 0.01049682, "auxiliary_loss_mlp": 0.01034645, "balance_loss_clip": 1.01325917, "balance_loss_mlp": 1.01524353, "epoch": 0.8488802044190591, "flos": 16358449069440.0, "grad_norm": 2.3065924994543128, "language_loss": 0.74858397, "learning_rate": 2.3473092952132757e-07, "loss": 0.76942724, "num_input_tokens_seen": 304484860, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34375, "step": 14119, "time_per_iteration": 2.376495838165283 }, { "auxiliary_loss_clip": 0.01052891, "auxiliary_loss_mlp": 0.01036625, "balance_loss_clip": 1.00996971, "balance_loss_mlp": 1.01603651, "epoch": 0.848940327671727, "flos": 19207850371200.0, "grad_norm": 1.6296428787467565, "language_loss": 0.79114741, "learning_rate": 2.345478926864446e-07, "loss": 0.81204259, "num_input_tokens_seen": 304503575, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.36914062, "step": 14120, "time_per_iteration": 2.3576154708862305 }, { "auxiliary_loss_clip": 0.01053713, "auxiliary_loss_mlp": 0.01040084, "balance_loss_clip": 1.01627779, "balance_loss_mlp": 1.0173595, "epoch": 0.849000450924395, "flos": 21870117452160.0, "grad_norm": 2.531667868651732, "language_loss": 0.77032983, "learning_rate": 2.3436492279768227e-07, "loss": 0.79126781, "num_input_tokens_seen": 304525005, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36328125, "step": 14121, "time_per_iteration": 2.4227097034454346 }, { "auxiliary_loss_clip": 0.01007535, "auxiliary_loss_mlp": 0.01001477, "balance_loss_clip": 0.99929547, "balance_loss_mlp": 1.00086248, "epoch": 0.8490605741770629, "flos": 71162938719360.0, "grad_norm": 0.821426194701518, "language_loss": 0.60232222, "learning_rate": 2.3418201986197883e-07, "loss": 0.62241232, "num_input_tokens_seen": 304585220, "router_z_loss_clip": 0.02185059, "router_z_loss_mlp": 0.06689453, "step": 14122, "time_per_iteration": 3.0033016204833984 }, { "auxiliary_loss_clip": 0.01051925, "auxiliary_loss_mlp": 0.01035839, "balance_loss_clip": 1.01473927, "balance_loss_mlp": 1.01640642, "epoch": 0.849120697429731, "flos": 24972185329920.0, "grad_norm": 2.4193978744918083, "language_loss": 0.81728429, "learning_rate": 2.3399918388627048e-07, "loss": 0.83816195, "num_input_tokens_seen": 304604665, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.35546875, "step": 14123, "time_per_iteration": 2.405208110809326 }, { "auxiliary_loss_clip": 0.0105011, "auxiliary_loss_mlp": 0.01033361, "balance_loss_clip": 1.01042533, "balance_loss_mlp": 1.01540208, "epoch": 0.8491808206823989, "flos": 23031363916800.0, "grad_norm": 2.117695700851805, "language_loss": 0.84035283, "learning_rate": 2.3381641487749016e-07, "loss": 0.86118752, "num_input_tokens_seen": 304620600, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.34765625, "step": 14124, "time_per_iteration": 2.363466262817383 }, { "auxiliary_loss_clip": 0.01054703, "auxiliary_loss_mlp": 0.01038961, "balance_loss_clip": 1.01405811, "balance_loss_mlp": 1.01811481, "epoch": 0.8492409439350669, "flos": 23877413827200.0, "grad_norm": 1.9171578557979745, "language_loss": 0.73050684, "learning_rate": 2.3363371284256805e-07, "loss": 0.75144345, "num_input_tokens_seen": 304639540, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3671875, "step": 14125, "time_per_iteration": 2.388522148132324 }, { "auxiliary_loss_clip": 0.01054504, "auxiliary_loss_mlp": 0.0104154, "balance_loss_clip": 1.01540911, "balance_loss_mlp": 1.0166564, "epoch": 0.8493010671877349, "flos": 22418777468160.0, "grad_norm": 1.672999069299573, "language_loss": 0.74589038, "learning_rate": 2.3345107778843288e-07, "loss": 0.76685083, "num_input_tokens_seen": 304660595, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37890625, "step": 14126, "time_per_iteration": 2.3957440853118896 }, { "auxiliary_loss_clip": 0.01050841, "auxiliary_loss_mlp": 0.01035417, "balance_loss_clip": 1.0135417, "balance_loss_mlp": 1.01577485, "epoch": 0.8493611904404028, "flos": 17528493196800.0, "grad_norm": 1.5213625715771126, "language_loss": 0.69004464, "learning_rate": 2.3326850972200928e-07, "loss": 0.71090716, "num_input_tokens_seen": 304679580, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34960938, "step": 14127, "time_per_iteration": 2.343822717666626 }, { "auxiliary_loss_clip": 0.010532, "auxiliary_loss_mlp": 0.01036571, "balance_loss_clip": 1.01184702, "balance_loss_mlp": 1.01646745, "epoch": 0.8494213136930708, "flos": 19461948312960.0, "grad_norm": 1.7771054285299006, "language_loss": 0.70207798, "learning_rate": 2.330860086502211e-07, "loss": 0.72297573, "num_input_tokens_seen": 304698385, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.3671875, "step": 14128, "time_per_iteration": 2.346179485321045 }, { "auxiliary_loss_clip": 0.01051631, "auxiliary_loss_mlp": 0.01038011, "balance_loss_clip": 1.01458645, "balance_loss_mlp": 1.01684284, "epoch": 0.8494814369457387, "flos": 18769306383360.0, "grad_norm": 1.8346344772894638, "language_loss": 0.78852397, "learning_rate": 2.3290357457998855e-07, "loss": 0.80942041, "num_input_tokens_seen": 304715430, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.34765625, "step": 14129, "time_per_iteration": 2.349085569381714 }, { "auxiliary_loss_clip": 0.01053706, "auxiliary_loss_mlp": 0.01045361, "balance_loss_clip": 1.02156711, "balance_loss_mlp": 1.0173161, "epoch": 0.8495415601984068, "flos": 23330359733760.0, "grad_norm": 1.6786662694153613, "language_loss": 0.69216979, "learning_rate": 2.3272120751823031e-07, "loss": 0.71316046, "num_input_tokens_seen": 304734345, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36328125, "step": 14130, "time_per_iteration": 2.362420082092285 }, { "auxiliary_loss_clip": 0.01050823, "auxiliary_loss_mlp": 0.01038497, "balance_loss_clip": 1.01519108, "balance_loss_mlp": 1.01535738, "epoch": 0.8496016834510747, "flos": 26611706776320.0, "grad_norm": 2.447856244249683, "language_loss": 0.71736306, "learning_rate": 2.3253890747186e-07, "loss": 0.73825628, "num_input_tokens_seen": 304755030, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35546875, "step": 14131, "time_per_iteration": 2.414033889770508 }, { "auxiliary_loss_clip": 0.0105253, "auxiliary_loss_mlp": 0.01032063, "balance_loss_clip": 1.01053405, "balance_loss_mlp": 1.01595259, "epoch": 0.8496618067037427, "flos": 25479298961280.0, "grad_norm": 1.8941064976203343, "language_loss": 0.69420606, "learning_rate": 2.3235667444779162e-07, "loss": 0.71505201, "num_input_tokens_seen": 304774320, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.36523438, "step": 14132, "time_per_iteration": 2.399500846862793 }, { "auxiliary_loss_clip": 0.01049471, "auxiliary_loss_mlp": 0.01036307, "balance_loss_clip": 1.01440787, "balance_loss_mlp": 1.01480269, "epoch": 0.8497219299564106, "flos": 25373407207680.0, "grad_norm": 1.6363964853809982, "language_loss": 0.7070775, "learning_rate": 2.3217450845293564e-07, "loss": 0.72793531, "num_input_tokens_seen": 304795355, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34765625, "step": 14133, "time_per_iteration": 3.622816801071167 }, { "auxiliary_loss_clip": 0.01007711, "auxiliary_loss_mlp": 0.01006901, "balance_loss_clip": 1.00469553, "balance_loss_mlp": 1.00099468, "epoch": 0.8497820532090786, "flos": 67776642529920.0, "grad_norm": 0.729539990578915, "language_loss": 0.57700938, "learning_rate": 2.3199240949419918e-07, "loss": 0.59715551, "num_input_tokens_seen": 304863915, "router_z_loss_clip": 0.02209473, "router_z_loss_mlp": 0.06738281, "step": 14134, "time_per_iteration": 3.098874807357788 }, { "auxiliary_loss_clip": 0.01054562, "auxiliary_loss_mlp": 0.01035694, "balance_loss_clip": 1.01224542, "balance_loss_mlp": 1.01648867, "epoch": 0.8498421764617465, "flos": 23439428421120.0, "grad_norm": 1.9200511468499248, "language_loss": 0.80576718, "learning_rate": 2.3181037757848787e-07, "loss": 0.82666981, "num_input_tokens_seen": 304881555, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.38085938, "step": 14135, "time_per_iteration": 2.3692755699157715 }, { "auxiliary_loss_clip": 0.01054308, "auxiliary_loss_mlp": 0.01043063, "balance_loss_clip": 1.01669383, "balance_loss_mlp": 1.01689923, "epoch": 0.8499022997144146, "flos": 17711647522560.0, "grad_norm": 1.6950578932121916, "language_loss": 0.64396143, "learning_rate": 2.316284127127044e-07, "loss": 0.66493511, "num_input_tokens_seen": 304898760, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.37304688, "step": 14136, "time_per_iteration": 2.3670544624328613 }, { "auxiliary_loss_clip": 0.01054071, "auxiliary_loss_mlp": 0.01037962, "balance_loss_clip": 1.0132978, "balance_loss_mlp": 1.01660371, "epoch": 0.8499624229670825, "flos": 18587513600640.0, "grad_norm": 1.8205925686847422, "language_loss": 0.85296953, "learning_rate": 2.3144651490374835e-07, "loss": 0.87388992, "num_input_tokens_seen": 304915465, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.375, "step": 14137, "time_per_iteration": 2.335573434829712 }, { "auxiliary_loss_clip": 0.01050997, "auxiliary_loss_mlp": 0.01035713, "balance_loss_clip": 1.01429081, "balance_loss_mlp": 1.01666451, "epoch": 0.8500225462197505, "flos": 24344901198720.0, "grad_norm": 1.9824955795008088, "language_loss": 0.80070686, "learning_rate": 2.3126468415851773e-07, "loss": 0.82157391, "num_input_tokens_seen": 304933190, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34375, "step": 14138, "time_per_iteration": 2.413609027862549 }, { "auxiliary_loss_clip": 0.01053003, "auxiliary_loss_mlp": 0.01042411, "balance_loss_clip": 1.01852167, "balance_loss_mlp": 1.01648188, "epoch": 0.8500826694724185, "flos": 16544570860800.0, "grad_norm": 1.635701728427246, "language_loss": 0.65525252, "learning_rate": 2.310829204839073e-07, "loss": 0.67620671, "num_input_tokens_seen": 304951110, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36523438, "step": 14139, "time_per_iteration": 2.3462941646575928 }, { "auxiliary_loss_clip": 0.01052311, "auxiliary_loss_mlp": 0.01035639, "balance_loss_clip": 1.01333499, "balance_loss_mlp": 1.01606393, "epoch": 0.8501427927250864, "flos": 16288482971520.0, "grad_norm": 1.5247114762573557, "language_loss": 0.71252441, "learning_rate": 2.3090122388681043e-07, "loss": 0.73340392, "num_input_tokens_seen": 304969095, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.36328125, "step": 14140, "time_per_iteration": 2.326925039291382 }, { "auxiliary_loss_clip": 0.01054429, "auxiliary_loss_mlp": 0.01042133, "balance_loss_clip": 1.0183388, "balance_loss_mlp": 1.01626694, "epoch": 0.8502029159777544, "flos": 26686700110080.0, "grad_norm": 2.0572548245846525, "language_loss": 0.65554368, "learning_rate": 2.3071959437411648e-07, "loss": 0.67650926, "num_input_tokens_seen": 304989315, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3828125, "step": 14141, "time_per_iteration": 2.408808708190918 }, { "auxiliary_loss_clip": 0.01052918, "auxiliary_loss_mlp": 0.01040811, "balance_loss_clip": 1.01709986, "balance_loss_mlp": 1.01639485, "epoch": 0.8502630392304223, "flos": 35589307893120.0, "grad_norm": 1.4648052860458578, "language_loss": 0.72045428, "learning_rate": 2.3053803195271214e-07, "loss": 0.74139154, "num_input_tokens_seen": 305011020, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36523438, "step": 14142, "time_per_iteration": 3.8912203311920166 }, { "auxiliary_loss_clip": 0.01051341, "auxiliary_loss_mlp": 0.01036147, "balance_loss_clip": 1.01337802, "balance_loss_mlp": 1.01582778, "epoch": 0.8503231624830904, "flos": 21648488941440.0, "grad_norm": 1.6105185862922722, "language_loss": 0.6654985, "learning_rate": 2.3035653662948375e-07, "loss": 0.68637329, "num_input_tokens_seen": 305033550, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35546875, "step": 14143, "time_per_iteration": 3.8661394119262695 }, { "auxiliary_loss_clip": 0.01053511, "auxiliary_loss_mlp": 0.0104088, "balance_loss_clip": 1.01594162, "balance_loss_mlp": 1.01564789, "epoch": 0.8503832857357583, "flos": 22416403495680.0, "grad_norm": 2.912128314137782, "language_loss": 0.69358557, "learning_rate": 2.3017510841131216e-07, "loss": 0.71452951, "num_input_tokens_seen": 305052885, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37890625, "step": 14144, "time_per_iteration": 2.4366819858551025 }, { "auxiliary_loss_clip": 0.01051363, "auxiliary_loss_mlp": 0.01037564, "balance_loss_clip": 1.01508117, "balance_loss_mlp": 1.01607442, "epoch": 0.8504434089884263, "flos": 18696966312960.0, "grad_norm": 2.2569784082259905, "language_loss": 0.65932876, "learning_rate": 2.299937473050777e-07, "loss": 0.68021804, "num_input_tokens_seen": 305071995, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3515625, "step": 14145, "time_per_iteration": 2.344773292541504 }, { "auxiliary_loss_clip": 0.01051212, "auxiliary_loss_mlp": 0.01040651, "balance_loss_clip": 1.01864517, "balance_loss_mlp": 1.01641536, "epoch": 0.8505035322410942, "flos": 20007047370240.0, "grad_norm": 1.7956610289071568, "language_loss": 0.86403221, "learning_rate": 2.2981245331765842e-07, "loss": 0.88495082, "num_input_tokens_seen": 305090190, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34765625, "step": 14146, "time_per_iteration": 2.3681747913360596 }, { "auxiliary_loss_clip": 0.0105116, "auxiliary_loss_mlp": 0.01034691, "balance_loss_clip": 1.01255345, "balance_loss_mlp": 1.01570892, "epoch": 0.8505636554937622, "flos": 20811166871040.0, "grad_norm": 1.5793144901290441, "language_loss": 0.85149777, "learning_rate": 2.2963122645592814e-07, "loss": 0.8723563, "num_input_tokens_seen": 305109355, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35546875, "step": 14147, "time_per_iteration": 2.442284107208252 }, { "auxiliary_loss_clip": 0.0105441, "auxiliary_loss_mlp": 0.01038954, "balance_loss_clip": 1.01438546, "balance_loss_mlp": 1.01696873, "epoch": 0.8506237787464301, "flos": 14173549274880.0, "grad_norm": 2.576137685658371, "language_loss": 0.87368214, "learning_rate": 2.2945006672675894e-07, "loss": 0.89461571, "num_input_tokens_seen": 305124165, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.375, "step": 14148, "time_per_iteration": 2.333218574523926 }, { "auxiliary_loss_clip": 0.01051841, "auxiliary_loss_mlp": 0.01036872, "balance_loss_clip": 1.01396012, "balance_loss_mlp": 1.01682472, "epoch": 0.8506839019990982, "flos": 23257251613440.0, "grad_norm": 1.6531174876610206, "language_loss": 0.73258007, "learning_rate": 2.292689741370204e-07, "loss": 0.7534672, "num_input_tokens_seen": 305143940, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.34960938, "step": 14149, "time_per_iteration": 2.4016737937927246 }, { "auxiliary_loss_clip": 0.01052481, "auxiliary_loss_mlp": 0.01033505, "balance_loss_clip": 1.01137936, "balance_loss_mlp": 1.01597309, "epoch": 0.8507440252517661, "flos": 23658089466240.0, "grad_norm": 1.6304762550003746, "language_loss": 0.77982825, "learning_rate": 2.290879486935804e-07, "loss": 0.80068815, "num_input_tokens_seen": 305163505, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.36523438, "step": 14150, "time_per_iteration": 2.4086837768554688 }, { "auxiliary_loss_clip": 0.01051868, "auxiliary_loss_mlp": 0.01033856, "balance_loss_clip": 1.01225519, "balance_loss_mlp": 1.0171349, "epoch": 0.8508041485044341, "flos": 18660342430080.0, "grad_norm": 1.6866242654458588, "language_loss": 0.73500013, "learning_rate": 2.2890699040330231e-07, "loss": 0.75585735, "num_input_tokens_seen": 305182325, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.34765625, "step": 14151, "time_per_iteration": 2.3537256717681885 }, { "auxiliary_loss_clip": 0.01007602, "auxiliary_loss_mlp": 0.01005494, "balance_loss_clip": 1.00311029, "balance_loss_mlp": 1.0009892, "epoch": 0.8508642717571021, "flos": 52508217018240.0, "grad_norm": 0.8821073840116479, "language_loss": 0.596753, "learning_rate": 2.2872609927304909e-07, "loss": 0.61688399, "num_input_tokens_seen": 305230775, "router_z_loss_clip": 0.02380371, "router_z_loss_mlp": 0.06640625, "step": 14152, "time_per_iteration": 2.7637526988983154 }, { "auxiliary_loss_clip": 0.0100739, "auxiliary_loss_mlp": 0.0100524, "balance_loss_clip": 1.00301063, "balance_loss_mlp": 1.00079393, "epoch": 0.85092439500977, "flos": 69293898794880.0, "grad_norm": 0.7006780134306205, "language_loss": 0.61344445, "learning_rate": 2.285452753096797e-07, "loss": 0.63357079, "num_input_tokens_seen": 305296000, "router_z_loss_clip": 0.02233887, "router_z_loss_mlp": 0.06591797, "step": 14153, "time_per_iteration": 3.0684869289398193 }, { "auxiliary_loss_clip": 0.01053219, "auxiliary_loss_mlp": 0.01039539, "balance_loss_clip": 1.01574445, "balance_loss_mlp": 1.0168221, "epoch": 0.850984518262438, "flos": 24388577176320.0, "grad_norm": 1.6417033631873912, "language_loss": 0.81422424, "learning_rate": 2.2836451852005067e-07, "loss": 0.83515191, "num_input_tokens_seen": 305314705, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36328125, "step": 14154, "time_per_iteration": 2.3942973613739014 }, { "auxiliary_loss_clip": 0.01050208, "auxiliary_loss_mlp": 0.01034194, "balance_loss_clip": 1.01360703, "balance_loss_mlp": 1.01615906, "epoch": 0.851044641515106, "flos": 23293700939520.0, "grad_norm": 1.8020357223969257, "language_loss": 0.80829883, "learning_rate": 2.281838289110165e-07, "loss": 0.82914281, "num_input_tokens_seen": 305333870, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.33984375, "step": 14155, "time_per_iteration": 2.368971347808838 }, { "auxiliary_loss_clip": 0.01052398, "auxiliary_loss_mlp": 0.0103694, "balance_loss_clip": 1.01239514, "balance_loss_mlp": 1.01551819, "epoch": 0.851104764767774, "flos": 22049117326080.0, "grad_norm": 1.6434159914313629, "language_loss": 0.71089202, "learning_rate": 2.2800320648942904e-07, "loss": 0.73178542, "num_input_tokens_seen": 305352780, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.36914062, "step": 14156, "time_per_iteration": 2.358189344406128 }, { "auxiliary_loss_clip": 0.01049875, "auxiliary_loss_mlp": 0.01031969, "balance_loss_clip": 1.010023, "balance_loss_mlp": 1.01609707, "epoch": 0.8511648880204419, "flos": 20703669194880.0, "grad_norm": 1.9724311676902375, "language_loss": 0.74881947, "learning_rate": 2.278226512621386e-07, "loss": 0.76963794, "num_input_tokens_seen": 305371370, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.33789062, "step": 14157, "time_per_iteration": 3.800255060195923 }, { "auxiliary_loss_clip": 0.01049037, "auxiliary_loss_mlp": 0.01028391, "balance_loss_clip": 1.00849473, "balance_loss_mlp": 1.01575637, "epoch": 0.8512250112731099, "flos": 24023525333760.0, "grad_norm": 2.4215197620551954, "language_loss": 0.80403388, "learning_rate": 2.2764216323598995e-07, "loss": 0.82480818, "num_input_tokens_seen": 305387955, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.33203125, "step": 14158, "time_per_iteration": 2.360689163208008 }, { "auxiliary_loss_clip": 0.01051781, "auxiliary_loss_mlp": 0.0104078, "balance_loss_clip": 1.01684332, "balance_loss_mlp": 1.01567376, "epoch": 0.8512851345257778, "flos": 22014448479360.0, "grad_norm": 2.19580141813171, "language_loss": 0.79989564, "learning_rate": 2.27461742417828e-07, "loss": 0.82082123, "num_input_tokens_seen": 305406285, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36132812, "step": 14159, "time_per_iteration": 2.364351272583008 }, { "auxiliary_loss_clip": 0.01052732, "auxiliary_loss_mlp": 0.01038659, "balance_loss_clip": 1.01467443, "balance_loss_mlp": 1.01644111, "epoch": 0.8513452577784458, "flos": 14829322942080.0, "grad_norm": 1.7636351609607326, "language_loss": 0.7301544, "learning_rate": 2.2728138881449488e-07, "loss": 0.75106835, "num_input_tokens_seen": 305424500, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36328125, "step": 14160, "time_per_iteration": 2.338268756866455 }, { "auxiliary_loss_clip": 0.01055424, "auxiliary_loss_mlp": 0.01038252, "balance_loss_clip": 1.01340938, "balance_loss_mlp": 1.01682448, "epoch": 0.8514053810311137, "flos": 33034294108800.0, "grad_norm": 2.075687742517877, "language_loss": 0.7114774, "learning_rate": 2.2710110243282866e-07, "loss": 0.73241413, "num_input_tokens_seen": 305442990, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38476562, "step": 14161, "time_per_iteration": 2.4808170795440674 }, { "auxiliary_loss_clip": 0.01052045, "auxiliary_loss_mlp": 0.01038425, "balance_loss_clip": 1.01591825, "balance_loss_mlp": 1.01469469, "epoch": 0.8514655042837818, "flos": 27563194592640.0, "grad_norm": 2.533822471929798, "language_loss": 0.79851621, "learning_rate": 2.2692088327966653e-07, "loss": 0.81942087, "num_input_tokens_seen": 305463065, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.37304688, "step": 14162, "time_per_iteration": 2.429096221923828 }, { "auxiliary_loss_clip": 0.01051912, "auxiliary_loss_mlp": 0.01039011, "balance_loss_clip": 1.01642072, "balance_loss_mlp": 1.01623261, "epoch": 0.8515256275364497, "flos": 35554534312320.0, "grad_norm": 2.1460387174767206, "language_loss": 0.78203666, "learning_rate": 2.2674073136184235e-07, "loss": 0.80294585, "num_input_tokens_seen": 305489070, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35742188, "step": 14163, "time_per_iteration": 2.51216459274292 }, { "auxiliary_loss_clip": 0.01007139, "auxiliary_loss_mlp": 0.01002491, "balance_loss_clip": 1.00033295, "balance_loss_mlp": 1.00072277, "epoch": 0.8515857507891177, "flos": 70204154428800.0, "grad_norm": 0.7102808817500432, "language_loss": 0.55211878, "learning_rate": 2.2656064668618735e-07, "loss": 0.57221508, "num_input_tokens_seen": 305551490, "router_z_loss_clip": 0.02160645, "router_z_loss_mlp": 0.06445312, "step": 14164, "time_per_iteration": 3.0411384105682373 }, { "auxiliary_loss_clip": 0.01052082, "auxiliary_loss_mlp": 0.01037144, "balance_loss_clip": 1.01339793, "balance_loss_mlp": 1.01610196, "epoch": 0.8516458740417857, "flos": 22674167130240.0, "grad_norm": 1.7745609479401523, "language_loss": 0.73648071, "learning_rate": 2.2638062925953005e-07, "loss": 0.75737303, "num_input_tokens_seen": 305570535, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.359375, "step": 14165, "time_per_iteration": 2.383466958999634 }, { "auxiliary_loss_clip": 0.01049976, "auxiliary_loss_mlp": 0.01033795, "balance_loss_clip": 1.01015532, "balance_loss_mlp": 1.01491213, "epoch": 0.8517059972944536, "flos": 22746332643840.0, "grad_norm": 1.788726058580675, "language_loss": 0.68561596, "learning_rate": 2.26200679088697e-07, "loss": 0.70645368, "num_input_tokens_seen": 305590800, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3515625, "step": 14166, "time_per_iteration": 2.3798439502716064 }, { "auxiliary_loss_clip": 0.01051198, "auxiliary_loss_mlp": 0.01033815, "balance_loss_clip": 1.012012, "balance_loss_mlp": 1.01580656, "epoch": 0.8517661205471216, "flos": 21688080289920.0, "grad_norm": 1.6796441434707552, "language_loss": 0.74233019, "learning_rate": 2.260207961805125e-07, "loss": 0.76318032, "num_input_tokens_seen": 305609495, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.35351562, "step": 14167, "time_per_iteration": 2.3874104022979736 }, { "auxiliary_loss_clip": 0.01050995, "auxiliary_loss_mlp": 0.01037876, "balance_loss_clip": 1.01604879, "balance_loss_mlp": 1.01572227, "epoch": 0.8518262437997896, "flos": 25373651587200.0, "grad_norm": 1.7308488185765762, "language_loss": 0.81850827, "learning_rate": 2.258409805417969e-07, "loss": 0.83939695, "num_input_tokens_seen": 305629420, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.3515625, "step": 14168, "time_per_iteration": 2.3986122608184814 }, { "auxiliary_loss_clip": 0.01050781, "auxiliary_loss_mlp": 0.01033308, "balance_loss_clip": 1.01129031, "balance_loss_mlp": 1.01567149, "epoch": 0.8518863670524576, "flos": 27234173139840.0, "grad_norm": 1.976545804441052, "language_loss": 0.77593684, "learning_rate": 2.2566123217936893e-07, "loss": 0.79677773, "num_input_tokens_seen": 305649835, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3515625, "step": 14169, "time_per_iteration": 2.4107086658477783 }, { "auxiliary_loss_clip": 0.0105446, "auxiliary_loss_mlp": 0.01038121, "balance_loss_clip": 1.0149709, "balance_loss_mlp": 1.01685119, "epoch": 0.8519464903051255, "flos": 20958465363840.0, "grad_norm": 1.703114247569637, "language_loss": 0.65332794, "learning_rate": 2.254815511000452e-07, "loss": 0.67425376, "num_input_tokens_seen": 305668840, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.375, "step": 14170, "time_per_iteration": 2.3651392459869385 }, { "auxiliary_loss_clip": 0.01048708, "auxiliary_loss_mlp": 0.01032054, "balance_loss_clip": 1.0099647, "balance_loss_mlp": 1.01390374, "epoch": 0.8520066135577935, "flos": 18440773689600.0, "grad_norm": 2.309207034441405, "language_loss": 0.88022101, "learning_rate": 2.253019373106384e-07, "loss": 0.90102857, "num_input_tokens_seen": 305686955, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34765625, "step": 14171, "time_per_iteration": 2.327394485473633 }, { "auxiliary_loss_clip": 0.01051547, "auxiliary_loss_mlp": 0.01035921, "balance_loss_clip": 1.01342618, "balance_loss_mlp": 1.01633835, "epoch": 0.8520667368104614, "flos": 29129014425600.0, "grad_norm": 1.7492771333054395, "language_loss": 0.55889893, "learning_rate": 2.2512239081796003e-07, "loss": 0.5797736, "num_input_tokens_seen": 305706290, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3515625, "step": 14172, "time_per_iteration": 2.426170587539673 }, { "auxiliary_loss_clip": 0.01048841, "auxiliary_loss_mlp": 0.010348, "balance_loss_clip": 1.01454616, "balance_loss_mlp": 1.01508069, "epoch": 0.8521268600631294, "flos": 16033442423040.0, "grad_norm": 2.059231718298724, "language_loss": 0.70154548, "learning_rate": 2.2494291162881862e-07, "loss": 0.72238195, "num_input_tokens_seen": 305723835, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.33789062, "step": 14173, "time_per_iteration": 3.573348045349121 }, { "auxiliary_loss_clip": 0.01052634, "auxiliary_loss_mlp": 0.01039276, "balance_loss_clip": 1.01419461, "balance_loss_mlp": 1.01636744, "epoch": 0.8521869833157973, "flos": 22453795428480.0, "grad_norm": 3.761712718379248, "language_loss": 0.78329974, "learning_rate": 2.247634997500205e-07, "loss": 0.80421889, "num_input_tokens_seen": 305741655, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36328125, "step": 14174, "time_per_iteration": 2.359973430633545 }, { "auxiliary_loss_clip": 0.01053416, "auxiliary_loss_mlp": 0.010383, "balance_loss_clip": 1.01512611, "balance_loss_mlp": 1.01648259, "epoch": 0.8522471065684654, "flos": 24970893609600.0, "grad_norm": 1.6858293800662034, "language_loss": 0.83188844, "learning_rate": 2.245841551883676e-07, "loss": 0.8528055, "num_input_tokens_seen": 305761890, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36914062, "step": 14175, "time_per_iteration": 2.396933078765869 }, { "auxiliary_loss_clip": 0.01054853, "auxiliary_loss_mlp": 0.01038858, "balance_loss_clip": 1.0147177, "balance_loss_mlp": 1.01778173, "epoch": 0.8523072298211333, "flos": 17709692486400.0, "grad_norm": 2.2236129712727957, "language_loss": 0.67388165, "learning_rate": 2.2440487795066153e-07, "loss": 0.69481874, "num_input_tokens_seen": 305779190, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.37109375, "step": 14176, "time_per_iteration": 2.332918882369995 }, { "auxiliary_loss_clip": 0.01050417, "auxiliary_loss_mlp": 0.01034368, "balance_loss_clip": 1.01195645, "balance_loss_mlp": 1.01517475, "epoch": 0.8523673530738013, "flos": 25445049050880.0, "grad_norm": 1.7588681422119954, "language_loss": 0.79681885, "learning_rate": 2.2422566804370068e-07, "loss": 0.81766677, "num_input_tokens_seen": 305799870, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3515625, "step": 14177, "time_per_iteration": 2.405050039291382 }, { "auxiliary_loss_clip": 0.01052178, "auxiliary_loss_mlp": 0.01031531, "balance_loss_clip": 1.00931072, "balance_loss_mlp": 1.01614928, "epoch": 0.8524274763264693, "flos": 31428289434240.0, "grad_norm": 1.6597435101348865, "language_loss": 0.7447561, "learning_rate": 2.2404652547428026e-07, "loss": 0.76559317, "num_input_tokens_seen": 305819695, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.359375, "step": 14178, "time_per_iteration": 2.45031476020813 }, { "auxiliary_loss_clip": 0.01052666, "auxiliary_loss_mlp": 0.01042731, "balance_loss_clip": 1.01986694, "balance_loss_mlp": 1.01684499, "epoch": 0.8524875995791372, "flos": 17711682433920.0, "grad_norm": 1.8452012845144103, "language_loss": 0.76106292, "learning_rate": 2.238674502491935e-07, "loss": 0.78201693, "num_input_tokens_seen": 305837270, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35742188, "step": 14179, "time_per_iteration": 2.345099687576294 }, { "auxiliary_loss_clip": 0.01050773, "auxiliary_loss_mlp": 0.01034069, "balance_loss_clip": 1.01144278, "balance_loss_mlp": 1.01593208, "epoch": 0.8525477228318052, "flos": 21686299810560.0, "grad_norm": 2.2073564606593123, "language_loss": 0.83499324, "learning_rate": 2.2368844237523165e-07, "loss": 0.85584164, "num_input_tokens_seen": 305855250, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34765625, "step": 14180, "time_per_iteration": 2.3743889331817627 }, { "auxiliary_loss_clip": 0.01052194, "auxiliary_loss_mlp": 0.01037463, "balance_loss_clip": 1.0137527, "balance_loss_mlp": 1.01651311, "epoch": 0.8526078460844732, "flos": 24825899266560.0, "grad_norm": 6.209726723977838, "language_loss": 0.62721407, "learning_rate": 2.235095018591815e-07, "loss": 0.64811063, "num_input_tokens_seen": 305875660, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.35742188, "step": 14181, "time_per_iteration": 3.8730671405792236 }, { "auxiliary_loss_clip": 0.01051159, "auxiliary_loss_mlp": 0.0103763, "balance_loss_clip": 1.01569557, "balance_loss_mlp": 1.01621437, "epoch": 0.8526679693371412, "flos": 13515576192000.0, "grad_norm": 2.183304251135541, "language_loss": 0.7315644, "learning_rate": 2.2333062870782894e-07, "loss": 0.75245225, "num_input_tokens_seen": 305892415, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34960938, "step": 14182, "time_per_iteration": 2.3320682048797607 }, { "auxiliary_loss_clip": 0.01050865, "auxiliary_loss_mlp": 0.0104099, "balance_loss_clip": 1.01779175, "balance_loss_mlp": 1.01598775, "epoch": 0.8527280925898091, "flos": 23512955477760.0, "grad_norm": 1.4775660769841545, "language_loss": 0.71953827, "learning_rate": 2.2315182292795697e-07, "loss": 0.74045682, "num_input_tokens_seen": 305912665, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.34765625, "step": 14183, "time_per_iteration": 3.698413848876953 }, { "auxiliary_loss_clip": 0.01051843, "auxiliary_loss_mlp": 0.01036838, "balance_loss_clip": 1.01545179, "balance_loss_mlp": 1.01742125, "epoch": 0.8527882158424771, "flos": 20301993469440.0, "grad_norm": 1.8396985258261307, "language_loss": 0.73308444, "learning_rate": 2.2297308452634644e-07, "loss": 0.75397122, "num_input_tokens_seen": 305931515, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34375, "step": 14184, "time_per_iteration": 2.359746217727661 }, { "auxiliary_loss_clip": 0.01052098, "auxiliary_loss_mlp": 0.01033378, "balance_loss_clip": 1.01087117, "balance_loss_mlp": 1.01633596, "epoch": 0.852848339095145, "flos": 17201531514240.0, "grad_norm": 1.7483955149190165, "language_loss": 0.77317274, "learning_rate": 2.2279441350977457e-07, "loss": 0.79402745, "num_input_tokens_seen": 305949965, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35742188, "step": 14185, "time_per_iteration": 2.380070447921753 }, { "auxiliary_loss_clip": 0.01051634, "auxiliary_loss_mlp": 0.01038554, "balance_loss_clip": 1.01411653, "balance_loss_mlp": 1.01509595, "epoch": 0.852908462347813, "flos": 18368014682880.0, "grad_norm": 1.8519955994472161, "language_loss": 0.80296504, "learning_rate": 2.2261580988501637e-07, "loss": 0.82386696, "num_input_tokens_seen": 305967820, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36523438, "step": 14186, "time_per_iteration": 2.3651018142700195 }, { "auxiliary_loss_clip": 0.01051744, "auxiliary_loss_mlp": 0.01040343, "balance_loss_clip": 1.01464105, "balance_loss_mlp": 1.01510739, "epoch": 0.8529685856004809, "flos": 18623893104000.0, "grad_norm": 1.7753239710802815, "language_loss": 0.63306183, "learning_rate": 2.224372736588449e-07, "loss": 0.65398264, "num_input_tokens_seen": 305985505, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3671875, "step": 14187, "time_per_iteration": 2.3398501873016357 }, { "auxiliary_loss_clip": 0.01054024, "auxiliary_loss_mlp": 0.01035239, "balance_loss_clip": 1.00964499, "balance_loss_mlp": 1.01602697, "epoch": 0.853028708853149, "flos": 29606346800640.0, "grad_norm": 1.563610760347071, "language_loss": 0.777807, "learning_rate": 2.2225880483803005e-07, "loss": 0.79869962, "num_input_tokens_seen": 306005220, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38085938, "step": 14188, "time_per_iteration": 2.458784341812134 }, { "auxiliary_loss_clip": 0.01054452, "auxiliary_loss_mlp": 0.01042544, "balance_loss_clip": 1.0160917, "balance_loss_mlp": 1.01722765, "epoch": 0.8530888321058169, "flos": 26352127751040.0, "grad_norm": 1.5784033567624374, "language_loss": 0.79075825, "learning_rate": 2.2208040342933932e-07, "loss": 0.81172812, "num_input_tokens_seen": 306023785, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.37304688, "step": 14189, "time_per_iteration": 2.4401755332946777 }, { "auxiliary_loss_clip": 0.01051684, "auxiliary_loss_mlp": 0.01037209, "balance_loss_clip": 1.01327157, "balance_loss_mlp": 1.01562703, "epoch": 0.8531489553584849, "flos": 20520933805440.0, "grad_norm": 2.867577547316191, "language_loss": 0.81399238, "learning_rate": 2.2190206943953793e-07, "loss": 0.83488131, "num_input_tokens_seen": 306041600, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36132812, "step": 14190, "time_per_iteration": 2.3590621948242188 }, { "auxiliary_loss_clip": 0.01051313, "auxiliary_loss_mlp": 0.01040852, "balance_loss_clip": 1.01876283, "balance_loss_mlp": 1.01648748, "epoch": 0.8532090786111529, "flos": 20703250258560.0, "grad_norm": 1.8029502890416584, "language_loss": 0.77141535, "learning_rate": 2.2172380287538894e-07, "loss": 0.79233694, "num_input_tokens_seen": 306060345, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34765625, "step": 14191, "time_per_iteration": 2.3620171546936035 }, { "auxiliary_loss_clip": 0.01050508, "auxiliary_loss_mlp": 0.01035538, "balance_loss_clip": 1.01158845, "balance_loss_mlp": 1.01554465, "epoch": 0.8532692018638208, "flos": 19827872939520.0, "grad_norm": 1.8905590225072815, "language_loss": 0.69905359, "learning_rate": 2.2154560374365073e-07, "loss": 0.71991408, "num_input_tokens_seen": 306078285, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.34960938, "step": 14192, "time_per_iteration": 2.3388164043426514 }, { "auxiliary_loss_clip": 0.01054948, "auxiliary_loss_mlp": 0.01044151, "balance_loss_clip": 1.01713824, "balance_loss_mlp": 1.01632106, "epoch": 0.8533293251164888, "flos": 20995717651200.0, "grad_norm": 2.669462044371613, "language_loss": 0.64359683, "learning_rate": 2.2136747205108164e-07, "loss": 0.66458786, "num_input_tokens_seen": 306093760, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.38671875, "step": 14193, "time_per_iteration": 2.3587467670440674 }, { "auxiliary_loss_clip": 0.0105103, "auxiliary_loss_mlp": 0.01036318, "balance_loss_clip": 1.01456213, "balance_loss_mlp": 1.01539803, "epoch": 0.8533894483691568, "flos": 22418498177280.0, "grad_norm": 1.8496558471054128, "language_loss": 0.77399087, "learning_rate": 2.211894078044365e-07, "loss": 0.79486436, "num_input_tokens_seen": 306112595, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.35546875, "step": 14194, "time_per_iteration": 2.3582582473754883 }, { "auxiliary_loss_clip": 0.01052397, "auxiliary_loss_mlp": 0.01034563, "balance_loss_clip": 1.01187754, "balance_loss_mlp": 1.01606321, "epoch": 0.8534495716218248, "flos": 21615460928640.0, "grad_norm": 2.066792351149847, "language_loss": 0.70763743, "learning_rate": 2.2101141101046705e-07, "loss": 0.72850704, "num_input_tokens_seen": 306131800, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36328125, "step": 14195, "time_per_iteration": 2.3598520755767822 }, { "auxiliary_loss_clip": 0.01050898, "auxiliary_loss_mlp": 0.01034633, "balance_loss_clip": 1.01157784, "balance_loss_mlp": 1.0150435, "epoch": 0.8535096948744927, "flos": 22345180588800.0, "grad_norm": 1.9056908114137008, "language_loss": 0.86577547, "learning_rate": 2.2083348167592343e-07, "loss": 0.88663077, "num_input_tokens_seen": 306150590, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.359375, "step": 14196, "time_per_iteration": 3.8925931453704834 }, { "auxiliary_loss_clip": 0.0100731, "auxiliary_loss_mlp": 0.01003825, "balance_loss_clip": 1.00127351, "balance_loss_mlp": 1.00069582, "epoch": 0.8535698181271607, "flos": 52759906076160.0, "grad_norm": 0.7696621663842129, "language_loss": 0.55205905, "learning_rate": 2.2065561980755243e-07, "loss": 0.57217044, "num_input_tokens_seen": 306205850, "router_z_loss_clip": 0.0255127, "router_z_loss_mlp": 0.06640625, "step": 14197, "time_per_iteration": 2.9587671756744385 }, { "auxiliary_loss_clip": 0.01048838, "auxiliary_loss_mlp": 0.01035225, "balance_loss_clip": 1.01376748, "balance_loss_mlp": 1.01482749, "epoch": 0.8536299413798286, "flos": 19061878510080.0, "grad_norm": 1.5881670446152643, "language_loss": 0.82180727, "learning_rate": 2.2047782541209826e-07, "loss": 0.84264791, "num_input_tokens_seen": 306225220, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.33984375, "step": 14198, "time_per_iteration": 2.4055604934692383 }, { "auxiliary_loss_clip": 0.01050786, "auxiliary_loss_mlp": 0.01035052, "balance_loss_clip": 1.01317692, "balance_loss_mlp": 1.01573122, "epoch": 0.8536900646324966, "flos": 49342922801280.0, "grad_norm": 1.660344287652332, "language_loss": 0.69876492, "learning_rate": 2.203000984963035e-07, "loss": 0.71962333, "num_input_tokens_seen": 306249865, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.3515625, "step": 14199, "time_per_iteration": 2.597259283065796 }, { "auxiliary_loss_clip": 0.01049143, "auxiliary_loss_mlp": 0.01032887, "balance_loss_clip": 1.01245475, "balance_loss_mlp": 1.01466286, "epoch": 0.8537501878851645, "flos": 21761258232960.0, "grad_norm": 1.571579475047364, "language_loss": 0.87316275, "learning_rate": 2.201224390669072e-07, "loss": 0.89398307, "num_input_tokens_seen": 306270215, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.34375, "step": 14200, "time_per_iteration": 2.366915225982666 }, { "auxiliary_loss_clip": 0.01052058, "auxiliary_loss_mlp": 0.01034825, "balance_loss_clip": 1.0121392, "balance_loss_mlp": 1.01576877, "epoch": 0.8538103111378326, "flos": 22268197307520.0, "grad_norm": 1.786208890817532, "language_loss": 0.79266798, "learning_rate": 2.1994484713064666e-07, "loss": 0.81353676, "num_input_tokens_seen": 306288960, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36328125, "step": 14201, "time_per_iteration": 2.375302791595459 }, { "auxiliary_loss_clip": 0.01050845, "auxiliary_loss_mlp": 0.0103639, "balance_loss_clip": 1.01487279, "balance_loss_mlp": 1.01639652, "epoch": 0.8538704343905005, "flos": 20302866253440.0, "grad_norm": 1.6048446643939962, "language_loss": 0.6972723, "learning_rate": 2.19767322694256e-07, "loss": 0.71814466, "num_input_tokens_seen": 306308735, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.34570312, "step": 14202, "time_per_iteration": 2.3649210929870605 }, { "auxiliary_loss_clip": 0.01051001, "auxiliary_loss_mlp": 0.01034582, "balance_loss_clip": 1.00957155, "balance_loss_mlp": 1.01560009, "epoch": 0.8539305576431685, "flos": 24753978132480.0, "grad_norm": 1.5520969485652851, "language_loss": 0.81808656, "learning_rate": 2.195898657644666e-07, "loss": 0.83894241, "num_input_tokens_seen": 306329015, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.35351562, "step": 14203, "time_per_iteration": 2.4086856842041016 }, { "auxiliary_loss_clip": 0.01053794, "auxiliary_loss_mlp": 0.01038167, "balance_loss_clip": 1.01344347, "balance_loss_mlp": 1.01701748, "epoch": 0.8539906808958365, "flos": 26686420819200.0, "grad_norm": 2.25833859985307, "language_loss": 0.67175484, "learning_rate": 2.1941247634800808e-07, "loss": 0.6926744, "num_input_tokens_seen": 306349085, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3671875, "step": 14204, "time_per_iteration": 2.397388219833374 }, { "auxiliary_loss_clip": 0.01051077, "auxiliary_loss_mlp": 0.01033434, "balance_loss_clip": 1.01046205, "balance_loss_mlp": 1.01507998, "epoch": 0.8540508041485044, "flos": 13364821474560.0, "grad_norm": 2.0632766775772726, "language_loss": 0.61908352, "learning_rate": 2.1923515445160667e-07, "loss": 0.6399287, "num_input_tokens_seen": 306365385, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.359375, "step": 14205, "time_per_iteration": 2.3288393020629883 }, { "auxiliary_loss_clip": 0.01052071, "auxiliary_loss_mlp": 0.01035169, "balance_loss_clip": 1.01213765, "balance_loss_mlp": 1.01591587, "epoch": 0.8541109274011724, "flos": 32779497939840.0, "grad_norm": 2.158883787216666, "language_loss": 0.73175228, "learning_rate": 2.1905790008198655e-07, "loss": 0.75262469, "num_input_tokens_seen": 306384585, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36132812, "step": 14206, "time_per_iteration": 2.4556050300598145 }, { "auxiliary_loss_clip": 0.01052875, "auxiliary_loss_mlp": 0.01036789, "balance_loss_clip": 1.01397264, "balance_loss_mlp": 1.01686835, "epoch": 0.8541710506538404, "flos": 17638329934080.0, "grad_norm": 2.4879431028472667, "language_loss": 0.77912146, "learning_rate": 2.1888071324586987e-07, "loss": 0.80001813, "num_input_tokens_seen": 306401565, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.359375, "step": 14207, "time_per_iteration": 2.332979679107666 }, { "auxiliary_loss_clip": 0.01052472, "auxiliary_loss_mlp": 0.01040257, "balance_loss_clip": 1.01504457, "balance_loss_mlp": 1.01637459, "epoch": 0.8542311739065084, "flos": 20262122830080.0, "grad_norm": 1.6525265142816643, "language_loss": 0.85677254, "learning_rate": 2.1870359394997485e-07, "loss": 0.87769979, "num_input_tokens_seen": 306419995, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36132812, "step": 14208, "time_per_iteration": 2.420612335205078 }, { "auxiliary_loss_clip": 0.010537, "auxiliary_loss_mlp": 0.01039353, "balance_loss_clip": 1.01622677, "balance_loss_mlp": 1.0175786, "epoch": 0.8542912971591763, "flos": 17784685820160.0, "grad_norm": 1.5302790424408783, "language_loss": 0.67651868, "learning_rate": 2.1852654220101785e-07, "loss": 0.69744921, "num_input_tokens_seen": 306439240, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.36132812, "step": 14209, "time_per_iteration": 2.387730360031128 }, { "auxiliary_loss_clip": 0.01052756, "auxiliary_loss_mlp": 0.01036507, "balance_loss_clip": 1.01286829, "balance_loss_mlp": 1.01628327, "epoch": 0.8543514204118443, "flos": 26978294718720.0, "grad_norm": 2.0974091085793907, "language_loss": 0.71548915, "learning_rate": 2.1834955800571287e-07, "loss": 0.73638177, "num_input_tokens_seen": 306458425, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36523438, "step": 14210, "time_per_iteration": 2.424652338027954 }, { "auxiliary_loss_clip": 0.01051718, "auxiliary_loss_mlp": 0.01033604, "balance_loss_clip": 1.01138353, "balance_loss_mlp": 1.01580715, "epoch": 0.8544115436645122, "flos": 24023455511040.0, "grad_norm": 1.53735490380864, "language_loss": 0.71192479, "learning_rate": 2.1817264137077141e-07, "loss": 0.73277795, "num_input_tokens_seen": 306477210, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.359375, "step": 14211, "time_per_iteration": 2.438556432723999 }, { "auxiliary_loss_clip": 0.01052, "auxiliary_loss_mlp": 0.01038303, "balance_loss_clip": 1.01490211, "balance_loss_mlp": 1.01606941, "epoch": 0.8544716669171802, "flos": 16617050576640.0, "grad_norm": 2.124177690237166, "language_loss": 0.82404345, "learning_rate": 2.1799579230290166e-07, "loss": 0.8449465, "num_input_tokens_seen": 306495820, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.359375, "step": 14212, "time_per_iteration": 2.372735023498535 }, { "auxiliary_loss_clip": 0.01052454, "auxiliary_loss_mlp": 0.01039684, "balance_loss_clip": 1.01443493, "balance_loss_mlp": 1.01552439, "epoch": 0.8545317901698481, "flos": 40004179914240.0, "grad_norm": 1.8247932034805714, "language_loss": 0.67364275, "learning_rate": 2.178190108088105e-07, "loss": 0.69456416, "num_input_tokens_seen": 306516420, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36914062, "step": 14213, "time_per_iteration": 3.740384578704834 }, { "auxiliary_loss_clip": 0.01050114, "auxiliary_loss_mlp": 0.01033963, "balance_loss_clip": 1.01269627, "balance_loss_mlp": 1.01519704, "epoch": 0.8545919134225162, "flos": 19901469818880.0, "grad_norm": 2.9897743998709374, "language_loss": 0.78679276, "learning_rate": 2.1764229689520098e-07, "loss": 0.80763352, "num_input_tokens_seen": 306534785, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34960938, "step": 14214, "time_per_iteration": 2.3919677734375 }, { "auxiliary_loss_clip": 0.01054205, "auxiliary_loss_mlp": 0.01038972, "balance_loss_clip": 1.01340199, "balance_loss_mlp": 1.01628804, "epoch": 0.8546520366751841, "flos": 18951971950080.0, "grad_norm": 2.130046224009511, "language_loss": 0.6902495, "learning_rate": 2.1746565056877397e-07, "loss": 0.71118128, "num_input_tokens_seen": 306552440, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37890625, "step": 14215, "time_per_iteration": 2.3381853103637695 }, { "auxiliary_loss_clip": 0.01051757, "auxiliary_loss_mlp": 0.01034754, "balance_loss_clip": 1.01253343, "balance_loss_mlp": 1.01607323, "epoch": 0.8547121599278521, "flos": 35620136490240.0, "grad_norm": 4.324591868684738, "language_loss": 0.63581395, "learning_rate": 2.172890718362279e-07, "loss": 0.65667903, "num_input_tokens_seen": 306573600, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.35742188, "step": 14216, "time_per_iteration": 2.4723329544067383 }, { "auxiliary_loss_clip": 0.01053618, "auxiliary_loss_mlp": 0.01037795, "balance_loss_clip": 1.01296389, "balance_loss_mlp": 1.01671433, "epoch": 0.8547722831805201, "flos": 16909099032960.0, "grad_norm": 1.9909534802835311, "language_loss": 0.66755551, "learning_rate": 2.17112560704259e-07, "loss": 0.68846965, "num_input_tokens_seen": 306592840, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3671875, "step": 14217, "time_per_iteration": 2.357816457748413 }, { "auxiliary_loss_clip": 0.01050066, "auxiliary_loss_mlp": 0.01037309, "balance_loss_clip": 1.0143733, "balance_loss_mlp": 1.01618361, "epoch": 0.854832406433188, "flos": 23001512837760.0, "grad_norm": 1.5198784206054226, "language_loss": 0.66223073, "learning_rate": 2.1693611717956072e-07, "loss": 0.68310452, "num_input_tokens_seen": 306613210, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.33984375, "step": 14218, "time_per_iteration": 2.3877785205841064 }, { "auxiliary_loss_clip": 0.01051886, "auxiliary_loss_mlp": 0.0103175, "balance_loss_clip": 1.00925565, "balance_loss_mlp": 1.01528692, "epoch": 0.854892529685856, "flos": 20411550915840.0, "grad_norm": 2.3259191433876545, "language_loss": 0.71423805, "learning_rate": 2.167597412688238e-07, "loss": 0.7350744, "num_input_tokens_seen": 306631620, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.36523438, "step": 14219, "time_per_iteration": 2.356041431427002 }, { "auxiliary_loss_clip": 0.01052875, "auxiliary_loss_mlp": 0.01041779, "balance_loss_clip": 1.01639986, "balance_loss_mlp": 1.01532984, "epoch": 0.854952652938524, "flos": 16397796038400.0, "grad_norm": 2.522881955866012, "language_loss": 0.69621986, "learning_rate": 2.1658343297873549e-07, "loss": 0.71716642, "num_input_tokens_seen": 306646695, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.375, "step": 14220, "time_per_iteration": 2.3231122493743896 }, { "auxiliary_loss_clip": 0.01050349, "auxiliary_loss_mlp": 0.01034578, "balance_loss_clip": 1.01254845, "balance_loss_mlp": 1.01571393, "epoch": 0.855012776191192, "flos": 21177615168000.0, "grad_norm": 2.4250903324870943, "language_loss": 0.73312223, "learning_rate": 2.164071923159827e-07, "loss": 0.75397146, "num_input_tokens_seen": 306665465, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34765625, "step": 14221, "time_per_iteration": 3.7627100944519043 }, { "auxiliary_loss_clip": 0.01051569, "auxiliary_loss_mlp": 0.01039597, "balance_loss_clip": 1.01489711, "balance_loss_mlp": 1.01512456, "epoch": 0.8550728994438599, "flos": 26139785662080.0, "grad_norm": 1.9670247942293195, "language_loss": 0.61314988, "learning_rate": 2.1623101928724763e-07, "loss": 0.63406157, "num_input_tokens_seen": 306685950, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.36328125, "step": 14222, "time_per_iteration": 2.423718214035034 }, { "auxiliary_loss_clip": 0.0105083, "auxiliary_loss_mlp": 0.010337, "balance_loss_clip": 1.01232576, "balance_loss_mlp": 1.01593256, "epoch": 0.8551330226965279, "flos": 22785609790080.0, "grad_norm": 1.511916637479822, "language_loss": 0.8490172, "learning_rate": 2.1605491389921093e-07, "loss": 0.86986244, "num_input_tokens_seen": 306705740, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34960938, "step": 14223, "time_per_iteration": 3.7547054290771484 }, { "auxiliary_loss_clip": 0.01050431, "auxiliary_loss_mlp": 0.01030759, "balance_loss_clip": 1.00906277, "balance_loss_mlp": 1.01581049, "epoch": 0.8551931459491958, "flos": 22417939595520.0, "grad_norm": 1.6480598158879873, "language_loss": 0.75182533, "learning_rate": 2.158788761585515e-07, "loss": 0.77263725, "num_input_tokens_seen": 306725065, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34570312, "step": 14224, "time_per_iteration": 2.382056474685669 }, { "auxiliary_loss_clip": 0.01050507, "auxiliary_loss_mlp": 0.01037945, "balance_loss_clip": 1.01441288, "balance_loss_mlp": 1.01537418, "epoch": 0.8552532692018638, "flos": 19572169075200.0, "grad_norm": 9.458703346235927, "language_loss": 0.75876081, "learning_rate": 2.1570290607194307e-07, "loss": 0.77964532, "num_input_tokens_seen": 306743630, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3515625, "step": 14225, "time_per_iteration": 2.4040286540985107 }, { "auxiliary_loss_clip": 0.01052107, "auxiliary_loss_mlp": 0.01035958, "balance_loss_clip": 1.01441658, "balance_loss_mlp": 1.01683223, "epoch": 0.8553133924545318, "flos": 26431554827520.0, "grad_norm": 1.9218871709224337, "language_loss": 0.78189874, "learning_rate": 2.1552700364605925e-07, "loss": 0.80277944, "num_input_tokens_seen": 306763105, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.3515625, "step": 14226, "time_per_iteration": 2.4230284690856934 }, { "auxiliary_loss_clip": 0.01053126, "auxiliary_loss_mlp": 0.01037203, "balance_loss_clip": 1.01188314, "balance_loss_mlp": 1.01575351, "epoch": 0.8553735157071998, "flos": 16361521269120.0, "grad_norm": 2.2183632225212047, "language_loss": 0.56626618, "learning_rate": 2.153511688875702e-07, "loss": 0.58716953, "num_input_tokens_seen": 306779875, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.375, "step": 14227, "time_per_iteration": 2.378129482269287 }, { "auxiliary_loss_clip": 0.0105169, "auxiliary_loss_mlp": 0.01039063, "balance_loss_clip": 1.01587653, "balance_loss_mlp": 1.01668382, "epoch": 0.8554336389598677, "flos": 20886264938880.0, "grad_norm": 2.2175202841180606, "language_loss": 0.67226744, "learning_rate": 2.151754018031442e-07, "loss": 0.69317502, "num_input_tokens_seen": 306800015, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3515625, "step": 14228, "time_per_iteration": 2.3727986812591553 }, { "auxiliary_loss_clip": 0.01052052, "auxiliary_loss_mlp": 0.01045281, "balance_loss_clip": 1.02069962, "balance_loss_mlp": 1.01532936, "epoch": 0.8554937622125357, "flos": 21283751301120.0, "grad_norm": 2.2674112597620417, "language_loss": 0.75739515, "learning_rate": 2.1499970239944542e-07, "loss": 0.77836847, "num_input_tokens_seen": 306814160, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3671875, "step": 14229, "time_per_iteration": 2.3637821674346924 }, { "auxiliary_loss_clip": 0.01051418, "auxiliary_loss_mlp": 0.01034778, "balance_loss_clip": 1.01253295, "balance_loss_mlp": 1.01584828, "epoch": 0.8555538854652037, "flos": 22412249043840.0, "grad_norm": 1.803547560593422, "language_loss": 0.73233318, "learning_rate": 2.1482407068313724e-07, "loss": 0.75319517, "num_input_tokens_seen": 306833310, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35546875, "step": 14230, "time_per_iteration": 2.3771896362304688 }, { "auxiliary_loss_clip": 0.01051859, "auxiliary_loss_mlp": 0.01037799, "balance_loss_clip": 1.01408863, "balance_loss_mlp": 1.01615, "epoch": 0.8556140087178716, "flos": 20192680402560.0, "grad_norm": 2.0178630894043863, "language_loss": 0.8413595, "learning_rate": 2.1464850666087897e-07, "loss": 0.86225611, "num_input_tokens_seen": 306851345, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35742188, "step": 14231, "time_per_iteration": 2.3776204586029053 }, { "auxiliary_loss_clip": 0.010547, "auxiliary_loss_mlp": 0.01037284, "balance_loss_clip": 1.01220202, "balance_loss_mlp": 1.01779008, "epoch": 0.8556741319705397, "flos": 22637019576960.0, "grad_norm": 1.9655391492153602, "language_loss": 0.68818259, "learning_rate": 2.1447301033932796e-07, "loss": 0.70910239, "num_input_tokens_seen": 306871040, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36914062, "step": 14232, "time_per_iteration": 2.375871181488037 }, { "auxiliary_loss_clip": 0.01052115, "auxiliary_loss_mlp": 0.0103535, "balance_loss_clip": 1.01123428, "balance_loss_mlp": 1.01577032, "epoch": 0.8557342552232076, "flos": 23548217817600.0, "grad_norm": 1.4676649118817033, "language_loss": 0.68283737, "learning_rate": 2.1429758172513955e-07, "loss": 0.70371199, "num_input_tokens_seen": 306891625, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36328125, "step": 14233, "time_per_iteration": 2.3809025287628174 }, { "auxiliary_loss_clip": 0.01050678, "auxiliary_loss_mlp": 0.01039657, "balance_loss_clip": 1.01644695, "balance_loss_mlp": 1.01551259, "epoch": 0.8557943784758756, "flos": 19608862780800.0, "grad_norm": 1.9298030289160908, "language_loss": 0.77886283, "learning_rate": 2.1412222082496556e-07, "loss": 0.79976618, "num_input_tokens_seen": 306910020, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3515625, "step": 14234, "time_per_iteration": 2.380378484725952 }, { "auxiliary_loss_clip": 0.01007508, "auxiliary_loss_mlp": 0.01001938, "balance_loss_clip": 0.99985152, "balance_loss_mlp": 1.00088191, "epoch": 0.8558545017285435, "flos": 70638753432960.0, "grad_norm": 0.7569250946905336, "language_loss": 0.58087528, "learning_rate": 2.1394692764545684e-07, "loss": 0.60096973, "num_input_tokens_seen": 306969505, "router_z_loss_clip": 0.02087402, "router_z_loss_mlp": 0.06640625, "step": 14235, "time_per_iteration": 2.980517864227295 }, { "auxiliary_loss_clip": 0.01007466, "auxiliary_loss_mlp": 0.01003114, "balance_loss_clip": 1.00103974, "balance_loss_mlp": 1.00082135, "epoch": 0.8559146249812115, "flos": 56646715783680.0, "grad_norm": 0.7785273872823135, "language_loss": 0.5666039, "learning_rate": 2.1377170219325858e-07, "loss": 0.58670974, "num_input_tokens_seen": 307027710, "router_z_loss_clip": 0.02075195, "router_z_loss_mlp": 0.06640625, "step": 14236, "time_per_iteration": 4.343386888504028 }, { "auxiliary_loss_clip": 0.01051548, "auxiliary_loss_mlp": 0.01037398, "balance_loss_clip": 1.01405668, "balance_loss_mlp": 1.01606917, "epoch": 0.8559747482338794, "flos": 22887277269120.0, "grad_norm": 1.884092665027232, "language_loss": 0.71141207, "learning_rate": 2.1359654447501673e-07, "loss": 0.73230153, "num_input_tokens_seen": 307045515, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.35546875, "step": 14237, "time_per_iteration": 2.412303924560547 }, { "auxiliary_loss_clip": 0.01049734, "auxiliary_loss_mlp": 0.01030518, "balance_loss_clip": 1.00846481, "balance_loss_mlp": 1.01466119, "epoch": 0.8560348714865474, "flos": 22600814630400.0, "grad_norm": 2.184706797758299, "language_loss": 0.64232361, "learning_rate": 2.1342145449737314e-07, "loss": 0.66312623, "num_input_tokens_seen": 307064470, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34960938, "step": 14238, "time_per_iteration": 2.363118886947632 }, { "auxiliary_loss_clip": 0.0104832, "auxiliary_loss_mlp": 0.01035094, "balance_loss_clip": 1.01511478, "balance_loss_mlp": 1.01544166, "epoch": 0.8560949947392154, "flos": 17930483124480.0, "grad_norm": 1.4307236124273763, "language_loss": 0.70268726, "learning_rate": 2.1324643226696648e-07, "loss": 0.72352147, "num_input_tokens_seen": 307083900, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.328125, "step": 14239, "time_per_iteration": 2.3594250679016113 }, { "auxiliary_loss_clip": 0.01053543, "auxiliary_loss_mlp": 0.01039064, "balance_loss_clip": 1.01596141, "balance_loss_mlp": 1.0159142, "epoch": 0.8561551179918834, "flos": 31024972874880.0, "grad_norm": 2.079543085468315, "language_loss": 0.68040776, "learning_rate": 2.1307147779043455e-07, "loss": 0.70133388, "num_input_tokens_seen": 307104590, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.37695312, "step": 14240, "time_per_iteration": 2.432443857192993 }, { "auxiliary_loss_clip": 0.01052478, "auxiliary_loss_mlp": 0.01037258, "balance_loss_clip": 1.01309466, "balance_loss_mlp": 1.01621294, "epoch": 0.8562152412445513, "flos": 30663866016000.0, "grad_norm": 1.5933112816632753, "language_loss": 0.63404679, "learning_rate": 2.1289659107441182e-07, "loss": 0.65494418, "num_input_tokens_seen": 307125580, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36328125, "step": 14241, "time_per_iteration": 2.4248340129852295 }, { "auxiliary_loss_clip": 0.01053192, "auxiliary_loss_mlp": 0.01037964, "balance_loss_clip": 1.01279914, "balance_loss_mlp": 1.0159682, "epoch": 0.8562753644972193, "flos": 31574819877120.0, "grad_norm": 1.9067559980636262, "language_loss": 0.74761885, "learning_rate": 2.1272177212552855e-07, "loss": 0.76853043, "num_input_tokens_seen": 307147625, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37109375, "step": 14242, "time_per_iteration": 2.4423553943634033 }, { "auxiliary_loss_clip": 0.01055461, "auxiliary_loss_mlp": 0.0104386, "balance_loss_clip": 1.01860011, "balance_loss_mlp": 1.01687384, "epoch": 0.8563354877498872, "flos": 26212439934720.0, "grad_norm": 2.0904098676968768, "language_loss": 0.77858227, "learning_rate": 2.1254702095041498e-07, "loss": 0.79957545, "num_input_tokens_seen": 307164665, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38671875, "step": 14243, "time_per_iteration": 2.3952407836914062 }, { "auxiliary_loss_clip": 0.01051913, "auxiliary_loss_mlp": 0.01036784, "balance_loss_clip": 1.01240563, "balance_loss_mlp": 1.01547074, "epoch": 0.8563956110025552, "flos": 24133187514240.0, "grad_norm": 1.8238690772012056, "language_loss": 0.69196421, "learning_rate": 2.123723375556974e-07, "loss": 0.71285117, "num_input_tokens_seen": 307182530, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36328125, "step": 14244, "time_per_iteration": 2.3828656673431396 }, { "auxiliary_loss_clip": 0.01007498, "auxiliary_loss_mlp": 0.01004, "balance_loss_clip": 1.00199735, "balance_loss_mlp": 1.0008502, "epoch": 0.8564557342552233, "flos": 56269095851520.0, "grad_norm": 0.7550752114065635, "language_loss": 0.58548033, "learning_rate": 2.1219772194800046e-07, "loss": 0.60559529, "num_input_tokens_seen": 307241240, "router_z_loss_clip": 0.02001953, "router_z_loss_mlp": 0.06640625, "step": 14245, "time_per_iteration": 2.9039578437805176 }, { "auxiliary_loss_clip": 0.01054329, "auxiliary_loss_mlp": 0.01036043, "balance_loss_clip": 1.01141405, "balance_loss_mlp": 1.01694345, "epoch": 0.8565158575078912, "flos": 23439498243840.0, "grad_norm": 1.7162919033430326, "language_loss": 0.7848593, "learning_rate": 2.1202317413394488e-07, "loss": 0.80576301, "num_input_tokens_seen": 307261485, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.375, "step": 14246, "time_per_iteration": 2.3933281898498535 }, { "auxiliary_loss_clip": 0.01050706, "auxiliary_loss_mlp": 0.01036444, "balance_loss_clip": 1.01322174, "balance_loss_mlp": 1.01480865, "epoch": 0.8565759807605592, "flos": 20374892121600.0, "grad_norm": 2.0939872339994974, "language_loss": 0.82936156, "learning_rate": 2.1184869412014938e-07, "loss": 0.85023308, "num_input_tokens_seen": 307279160, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.359375, "step": 14247, "time_per_iteration": 2.5168938636779785 }, { "auxiliary_loss_clip": 0.01053194, "auxiliary_loss_mlp": 0.01037162, "balance_loss_clip": 1.01377308, "balance_loss_mlp": 1.01631308, "epoch": 0.8566361040132271, "flos": 18806104823040.0, "grad_norm": 2.3658429764996107, "language_loss": 0.78642792, "learning_rate": 2.1167428191323112e-07, "loss": 0.8073315, "num_input_tokens_seen": 307297920, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36914062, "step": 14248, "time_per_iteration": 2.343745231628418 }, { "auxiliary_loss_clip": 0.01050366, "auxiliary_loss_mlp": 0.0103505, "balance_loss_clip": 1.01192319, "balance_loss_mlp": 1.01528597, "epoch": 0.8566962272658951, "flos": 24534199923840.0, "grad_norm": 1.6366104947513447, "language_loss": 0.78706264, "learning_rate": 2.1149993751980278e-07, "loss": 0.80791682, "num_input_tokens_seen": 307318320, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.3515625, "step": 14249, "time_per_iteration": 2.388091802597046 }, { "auxiliary_loss_clip": 0.01048937, "auxiliary_loss_mlp": 0.01042257, "balance_loss_clip": 1.02015543, "balance_loss_mlp": 1.01502645, "epoch": 0.856756350518563, "flos": 23177580157440.0, "grad_norm": 1.714982138587563, "language_loss": 0.78929508, "learning_rate": 2.1132566094647597e-07, "loss": 0.81020701, "num_input_tokens_seen": 307336720, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.33984375, "step": 14250, "time_per_iteration": 2.3757753372192383 }, { "auxiliary_loss_clip": 0.01049549, "auxiliary_loss_mlp": 0.01037359, "balance_loss_clip": 1.01590157, "balance_loss_mlp": 1.01633346, "epoch": 0.856816473771231, "flos": 20807675735040.0, "grad_norm": 1.6969024760921536, "language_loss": 0.80752194, "learning_rate": 2.1115145219985942e-07, "loss": 0.82839102, "num_input_tokens_seen": 307354120, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.33203125, "step": 14251, "time_per_iteration": 2.3598287105560303 }, { "auxiliary_loss_clip": 0.0104949, "auxiliary_loss_mlp": 0.0103883, "balance_loss_clip": 1.01567948, "balance_loss_mlp": 1.01543725, "epoch": 0.856876597023899, "flos": 20227174692480.0, "grad_norm": 2.111820165092905, "language_loss": 0.6268791, "learning_rate": 2.1097731128656005e-07, "loss": 0.6477623, "num_input_tokens_seen": 307373165, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.33984375, "step": 14252, "time_per_iteration": 3.5945284366607666 }, { "auxiliary_loss_clip": 0.01053918, "auxiliary_loss_mlp": 0.01041712, "balance_loss_clip": 1.01612961, "balance_loss_mlp": 1.01718855, "epoch": 0.856936720276567, "flos": 18295150942080.0, "grad_norm": 1.8122178497914783, "language_loss": 0.71517867, "learning_rate": 2.1080323821317924e-07, "loss": 0.73613495, "num_input_tokens_seen": 307391000, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.3671875, "step": 14253, "time_per_iteration": 2.351261854171753 }, { "auxiliary_loss_clip": 0.01007522, "auxiliary_loss_mlp": 0.01003026, "balance_loss_clip": 1.00087988, "balance_loss_mlp": 1.00096035, "epoch": 0.8569968435292349, "flos": 69875202798720.0, "grad_norm": 0.7986697264614281, "language_loss": 0.59286714, "learning_rate": 2.1062923298631907e-07, "loss": 0.61297262, "num_input_tokens_seen": 307452865, "router_z_loss_clip": 0.02148438, "router_z_loss_mlp": 0.06542969, "step": 14254, "time_per_iteration": 3.0478641986846924 }, { "auxiliary_loss_clip": 0.0105131, "auxiliary_loss_mlp": 0.01036198, "balance_loss_clip": 1.0128206, "balance_loss_mlp": 1.01564097, "epoch": 0.8570569667819029, "flos": 25847388092160.0, "grad_norm": 2.0315172060200353, "language_loss": 0.8204416, "learning_rate": 2.1045529561257825e-07, "loss": 0.84131664, "num_input_tokens_seen": 307471940, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.35742188, "step": 14255, "time_per_iteration": 2.411677360534668 }, { "auxiliary_loss_clip": 0.01050346, "auxiliary_loss_mlp": 0.01034163, "balance_loss_clip": 1.01249051, "balance_loss_mlp": 1.01563919, "epoch": 0.8571170900345708, "flos": 23256029715840.0, "grad_norm": 2.1891324917075554, "language_loss": 0.67867517, "learning_rate": 2.1028142609855126e-07, "loss": 0.69952023, "num_input_tokens_seen": 307488745, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34765625, "step": 14256, "time_per_iteration": 2.3813443183898926 }, { "auxiliary_loss_clip": 0.01051572, "auxiliary_loss_mlp": 0.01034637, "balance_loss_clip": 1.01218951, "balance_loss_mlp": 1.01620877, "epoch": 0.8571772132872388, "flos": 18916639787520.0, "grad_norm": 1.575102866038727, "language_loss": 0.70956153, "learning_rate": 2.1010762445083218e-07, "loss": 0.73042357, "num_input_tokens_seen": 307506855, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35351562, "step": 14257, "time_per_iteration": 2.363162040710449 }, { "auxiliary_loss_clip": 0.01050804, "auxiliary_loss_mlp": 0.0103628, "balance_loss_clip": 1.01386905, "balance_loss_mlp": 1.01609731, "epoch": 0.8572373365399069, "flos": 33248870524800.0, "grad_norm": 2.055132774878838, "language_loss": 0.78328735, "learning_rate": 2.0993389067601197e-07, "loss": 0.80415815, "num_input_tokens_seen": 307526115, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34765625, "step": 14258, "time_per_iteration": 2.458268404006958 }, { "auxiliary_loss_clip": 0.01049797, "auxiliary_loss_mlp": 0.01037751, "balance_loss_clip": 1.01451755, "balance_loss_mlp": 1.01563358, "epoch": 0.8572974597925748, "flos": 23326519484160.0, "grad_norm": 1.5571999058495358, "language_loss": 0.68618089, "learning_rate": 2.0976022478067735e-07, "loss": 0.7070564, "num_input_tokens_seen": 307545230, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.34179688, "step": 14259, "time_per_iteration": 2.4487814903259277 }, { "auxiliary_loss_clip": 0.01051607, "auxiliary_loss_mlp": 0.0103968, "balance_loss_clip": 1.0168159, "balance_loss_mlp": 1.01633787, "epoch": 0.8573575830452428, "flos": 24534688682880.0, "grad_norm": 1.8429491871247183, "language_loss": 0.78116131, "learning_rate": 2.0958662677141437e-07, "loss": 0.80207419, "num_input_tokens_seen": 307564900, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3515625, "step": 14260, "time_per_iteration": 3.8206326961517334 }, { "auxiliary_loss_clip": 0.01052416, "auxiliary_loss_mlp": 0.01035979, "balance_loss_clip": 1.01208949, "balance_loss_mlp": 1.01595271, "epoch": 0.8574177062979107, "flos": 24164400136320.0, "grad_norm": 1.9160344093529227, "language_loss": 0.75255936, "learning_rate": 2.09413096654806e-07, "loss": 0.77344334, "num_input_tokens_seen": 307583500, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36523438, "step": 14261, "time_per_iteration": 2.3912618160247803 }, { "auxiliary_loss_clip": 0.01054041, "auxiliary_loss_mlp": 0.01041511, "balance_loss_clip": 1.01658463, "balance_loss_mlp": 1.01685858, "epoch": 0.8574778295505787, "flos": 17929784897280.0, "grad_norm": 1.746663444508514, "language_loss": 0.79363006, "learning_rate": 2.0923963443743276e-07, "loss": 0.81458557, "num_input_tokens_seen": 307601430, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.37109375, "step": 14262, "time_per_iteration": 3.7280969619750977 }, { "auxiliary_loss_clip": 0.01050114, "auxiliary_loss_mlp": 0.0103529, "balance_loss_clip": 1.01371336, "balance_loss_mlp": 1.01598251, "epoch": 0.8575379528032466, "flos": 21579605095680.0, "grad_norm": 1.6254130578689368, "language_loss": 0.6898821, "learning_rate": 2.0906624012587203e-07, "loss": 0.7107361, "num_input_tokens_seen": 307621495, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.34179688, "step": 14263, "time_per_iteration": 2.383579969406128 }, { "auxiliary_loss_clip": 0.01052156, "auxiliary_loss_mlp": 0.01034952, "balance_loss_clip": 1.01108658, "balance_loss_mlp": 1.01592088, "epoch": 0.8575980760559146, "flos": 21760525094400.0, "grad_norm": 1.639666032765686, "language_loss": 0.80461317, "learning_rate": 2.088929137266986e-07, "loss": 0.82548422, "num_input_tokens_seen": 307640840, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36328125, "step": 14264, "time_per_iteration": 2.414418935775757 }, { "auxiliary_loss_clip": 0.01052036, "auxiliary_loss_mlp": 0.01035202, "balance_loss_clip": 1.01412594, "balance_loss_mlp": 1.01745415, "epoch": 0.8576581993085826, "flos": 34385013855360.0, "grad_norm": 1.2575650837592116, "language_loss": 0.70033234, "learning_rate": 2.0871965524648582e-07, "loss": 0.72120476, "num_input_tokens_seen": 307663820, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34570312, "step": 14265, "time_per_iteration": 2.4751060009002686 }, { "auxiliary_loss_clip": 0.01049131, "auxiliary_loss_mlp": 0.01031992, "balance_loss_clip": 1.00975907, "balance_loss_mlp": 1.01568902, "epoch": 0.8577183225612506, "flos": 23221360869120.0, "grad_norm": 1.6958172045399718, "language_loss": 0.67114872, "learning_rate": 2.085464646918027e-07, "loss": 0.69195998, "num_input_tokens_seen": 307682385, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.33398438, "step": 14266, "time_per_iteration": 2.3920364379882812 }, { "auxiliary_loss_clip": 0.01049978, "auxiliary_loss_mlp": 0.01036207, "balance_loss_clip": 1.01427221, "balance_loss_mlp": 1.01572132, "epoch": 0.8577784458139185, "flos": 28802890615680.0, "grad_norm": 1.7363513264435786, "language_loss": 0.75919098, "learning_rate": 2.0837334206921731e-07, "loss": 0.78005278, "num_input_tokens_seen": 307704680, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34375, "step": 14267, "time_per_iteration": 2.432713508605957 }, { "auxiliary_loss_clip": 0.01050346, "auxiliary_loss_mlp": 0.01036168, "balance_loss_clip": 1.0148654, "balance_loss_mlp": 1.01635575, "epoch": 0.8578385690665865, "flos": 19754555351040.0, "grad_norm": 1.664977856879449, "language_loss": 0.88182515, "learning_rate": 2.082002873852946e-07, "loss": 0.90269029, "num_input_tokens_seen": 307723245, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.33984375, "step": 14268, "time_per_iteration": 2.3639976978302 }, { "auxiliary_loss_clip": 0.01052205, "auxiliary_loss_mlp": 0.01042628, "balance_loss_clip": 1.01886964, "balance_loss_mlp": 1.0164845, "epoch": 0.8578986923192544, "flos": 20703040790400.0, "grad_norm": 2.086719359556617, "language_loss": 0.74296671, "learning_rate": 2.0802730064659667e-07, "loss": 0.76391506, "num_input_tokens_seen": 307742510, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.35742188, "step": 14269, "time_per_iteration": 2.363553285598755 }, { "auxiliary_loss_clip": 0.01053532, "auxiliary_loss_mlp": 0.01035794, "balance_loss_clip": 1.01163042, "balance_loss_mlp": 1.0172832, "epoch": 0.8579588155719224, "flos": 36100226862720.0, "grad_norm": 1.617598010344271, "language_loss": 0.66815686, "learning_rate": 2.0785438185968252e-07, "loss": 0.68905008, "num_input_tokens_seen": 307766030, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36328125, "step": 14270, "time_per_iteration": 2.5219831466674805 }, { "auxiliary_loss_clip": 0.0104933, "auxiliary_loss_mlp": 0.01031842, "balance_loss_clip": 1.0100863, "balance_loss_mlp": 1.01510286, "epoch": 0.8580189388245905, "flos": 22852468776960.0, "grad_norm": 1.7479383806637516, "language_loss": 0.74351501, "learning_rate": 2.0768153103110997e-07, "loss": 0.76432675, "num_input_tokens_seen": 307785800, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.34179688, "step": 14271, "time_per_iteration": 2.395087242126465 }, { "auxiliary_loss_clip": 0.01007498, "auxiliary_loss_mlp": 0.01001966, "balance_loss_clip": 0.99990374, "balance_loss_mlp": 1.00103784, "epoch": 0.8580790620772584, "flos": 69638385980160.0, "grad_norm": 0.804146333283602, "language_loss": 0.59781289, "learning_rate": 2.0750874816743358e-07, "loss": 0.61790752, "num_input_tokens_seen": 307850995, "router_z_loss_clip": 0.02062988, "router_z_loss_mlp": 0.06445312, "step": 14272, "time_per_iteration": 3.103320837020874 }, { "auxiliary_loss_clip": 0.01054445, "auxiliary_loss_mlp": 0.01039394, "balance_loss_clip": 1.01459885, "balance_loss_mlp": 1.01628637, "epoch": 0.8581391853299264, "flos": 13333399384320.0, "grad_norm": 1.7911138025259181, "language_loss": 0.75680757, "learning_rate": 2.0733603327520499e-07, "loss": 0.77774596, "num_input_tokens_seen": 307868585, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38085938, "step": 14273, "time_per_iteration": 2.3307783603668213 }, { "auxiliary_loss_clip": 0.01050441, "auxiliary_loss_mlp": 0.01036704, "balance_loss_clip": 1.01486456, "balance_loss_mlp": 1.01533103, "epoch": 0.8581993085825943, "flos": 19644648791040.0, "grad_norm": 1.9080315461119712, "language_loss": 0.82863688, "learning_rate": 2.0716338636097385e-07, "loss": 0.84950829, "num_input_tokens_seen": 307886820, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.3515625, "step": 14274, "time_per_iteration": 2.365957498550415 }, { "auxiliary_loss_clip": 0.01007366, "auxiliary_loss_mlp": 0.01004797, "balance_loss_clip": 1.00274622, "balance_loss_mlp": 1.00067723, "epoch": 0.8582594318352623, "flos": 55822452428160.0, "grad_norm": 0.7965737159428715, "language_loss": 0.60912901, "learning_rate": 2.0699080743128672e-07, "loss": 0.62925065, "num_input_tokens_seen": 307944020, "router_z_loss_clip": 0.02050781, "router_z_loss_mlp": 0.06689453, "step": 14275, "time_per_iteration": 3.077043294906616 }, { "auxiliary_loss_clip": 0.01051016, "auxiliary_loss_mlp": 0.0103583, "balance_loss_clip": 1.01172614, "balance_loss_mlp": 1.01506591, "epoch": 0.8583195550879302, "flos": 24278426236800.0, "grad_norm": 2.003225086837967, "language_loss": 0.60258901, "learning_rate": 2.0681829649268768e-07, "loss": 0.62345749, "num_input_tokens_seen": 307961055, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.359375, "step": 14276, "time_per_iteration": 3.8302817344665527 }, { "auxiliary_loss_clip": 0.01050378, "auxiliary_loss_mlp": 0.01039978, "balance_loss_clip": 1.0180434, "balance_loss_mlp": 1.01547384, "epoch": 0.8583796783405983, "flos": 13443271032960.0, "grad_norm": 11.323914245953544, "language_loss": 0.7723316, "learning_rate": 2.0664585355171838e-07, "loss": 0.79323518, "num_input_tokens_seen": 307978690, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34960938, "step": 14277, "time_per_iteration": 2.3553404808044434 }, { "auxiliary_loss_clip": 0.01051879, "auxiliary_loss_mlp": 0.0103739, "balance_loss_clip": 1.01429915, "balance_loss_mlp": 1.01612473, "epoch": 0.8584398015932662, "flos": 16179344461440.0, "grad_norm": 1.4691060314085298, "language_loss": 0.84632963, "learning_rate": 2.0647347861491803e-07, "loss": 0.86722231, "num_input_tokens_seen": 307995870, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35742188, "step": 14278, "time_per_iteration": 2.3435792922973633 }, { "auxiliary_loss_clip": 0.01052869, "auxiliary_loss_mlp": 0.01041406, "balance_loss_clip": 1.01689637, "balance_loss_mlp": 1.01601231, "epoch": 0.8584999248459342, "flos": 17449659613440.0, "grad_norm": 2.60581225637081, "language_loss": 0.75925934, "learning_rate": 2.0630117168882366e-07, "loss": 0.78020209, "num_input_tokens_seen": 308013645, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36914062, "step": 14279, "time_per_iteration": 2.3565292358398438 }, { "auxiliary_loss_clip": 0.01051778, "auxiliary_loss_mlp": 0.01038766, "balance_loss_clip": 1.01630712, "balance_loss_mlp": 1.01689458, "epoch": 0.8585600480986021, "flos": 23439882268800.0, "grad_norm": 2.837020086690907, "language_loss": 0.68103063, "learning_rate": 2.0612893277996845e-07, "loss": 0.70193607, "num_input_tokens_seen": 308032490, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34765625, "step": 14280, "time_per_iteration": 2.4041364192962646 }, { "auxiliary_loss_clip": 0.01048989, "auxiliary_loss_mlp": 0.0103434, "balance_loss_clip": 1.01393104, "balance_loss_mlp": 1.01488876, "epoch": 0.8586201713512701, "flos": 19936906715520.0, "grad_norm": 2.0851398902811606, "language_loss": 0.6362251, "learning_rate": 2.0595676189488343e-07, "loss": 0.65705836, "num_input_tokens_seen": 308052110, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.34179688, "step": 14281, "time_per_iteration": 2.375554323196411 }, { "auxiliary_loss_clip": 0.01051034, "auxiliary_loss_mlp": 0.01034661, "balance_loss_clip": 1.011904, "balance_loss_mlp": 1.01541352, "epoch": 0.858680294603938, "flos": 15303862408320.0, "grad_norm": 1.6953979716714958, "language_loss": 0.74497187, "learning_rate": 2.0578465904009845e-07, "loss": 0.76582885, "num_input_tokens_seen": 308070660, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35546875, "step": 14282, "time_per_iteration": 2.3513193130493164 }, { "auxiliary_loss_clip": 0.01048934, "auxiliary_loss_mlp": 0.01031159, "balance_loss_clip": 1.00961757, "balance_loss_mlp": 1.01412809, "epoch": 0.858740417856606, "flos": 22710127697280.0, "grad_norm": 1.94571691345977, "language_loss": 0.7696082, "learning_rate": 2.0561262422213832e-07, "loss": 0.79040915, "num_input_tokens_seen": 308089520, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34765625, "step": 14283, "time_per_iteration": 2.366105794906616 }, { "auxiliary_loss_clip": 0.01051815, "auxiliary_loss_mlp": 0.01037887, "balance_loss_clip": 1.01476073, "balance_loss_mlp": 1.01581943, "epoch": 0.8588005411092741, "flos": 34052990025600.0, "grad_norm": 2.0381824159204456, "language_loss": 0.60741448, "learning_rate": 2.0544065744752736e-07, "loss": 0.62831151, "num_input_tokens_seen": 308111545, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.359375, "step": 14284, "time_per_iteration": 2.4761359691619873 }, { "auxiliary_loss_clip": 0.01049281, "auxiliary_loss_mlp": 0.01037527, "balance_loss_clip": 1.01702309, "balance_loss_mlp": 1.01636159, "epoch": 0.858860664361942, "flos": 28912308416640.0, "grad_norm": 1.7844210426601728, "language_loss": 0.76096022, "learning_rate": 2.0526875872278749e-07, "loss": 0.78182828, "num_input_tokens_seen": 308129690, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.328125, "step": 14285, "time_per_iteration": 2.431771755218506 }, { "auxiliary_loss_clip": 0.01053461, "auxiliary_loss_mlp": 0.01031368, "balance_loss_clip": 1.00869477, "balance_loss_mlp": 1.01679206, "epoch": 0.85892078761461, "flos": 19791493436160.0, "grad_norm": 1.7883562408957088, "language_loss": 0.75329101, "learning_rate": 2.0509692805443524e-07, "loss": 0.77413929, "num_input_tokens_seen": 308147410, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3671875, "step": 14286, "time_per_iteration": 2.374467134475708 }, { "auxiliary_loss_clip": 0.01007215, "auxiliary_loss_mlp": 0.01001952, "balance_loss_clip": 0.99990201, "balance_loss_mlp": 1.00078678, "epoch": 0.8589809108672779, "flos": 67103483005440.0, "grad_norm": 0.7660428542429637, "language_loss": 0.49564639, "learning_rate": 2.0492516544898718e-07, "loss": 0.51573813, "num_input_tokens_seen": 308204875, "router_z_loss_clip": 0.02050781, "router_z_loss_mlp": 0.06445312, "step": 14287, "time_per_iteration": 2.9578208923339844 }, { "auxiliary_loss_clip": 0.01051981, "auxiliary_loss_mlp": 0.01037385, "balance_loss_clip": 1.01498604, "balance_loss_mlp": 1.01636612, "epoch": 0.8590410341199459, "flos": 29714961640320.0, "grad_norm": 4.74191911870384, "language_loss": 0.79904681, "learning_rate": 2.0475347091295704e-07, "loss": 0.81994045, "num_input_tokens_seen": 308225690, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35546875, "step": 14288, "time_per_iteration": 2.431166887283325 }, { "auxiliary_loss_clip": 0.01052707, "auxiliary_loss_mlp": 0.01036362, "balance_loss_clip": 1.01335466, "balance_loss_mlp": 1.01607442, "epoch": 0.8591011573726138, "flos": 23986307957760.0, "grad_norm": 1.9208755598081817, "language_loss": 0.81716287, "learning_rate": 2.045818444528553e-07, "loss": 0.83805358, "num_input_tokens_seen": 308245255, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.3671875, "step": 14289, "time_per_iteration": 2.3694446086883545 }, { "auxiliary_loss_clip": 0.0105105, "auxiliary_loss_mlp": 0.01037522, "balance_loss_clip": 1.01515818, "balance_loss_mlp": 1.01574528, "epoch": 0.8591612806252819, "flos": 14427786862080.0, "grad_norm": 1.7271681425759704, "language_loss": 0.66202283, "learning_rate": 2.0441028607518973e-07, "loss": 0.68290854, "num_input_tokens_seen": 308261755, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35351562, "step": 14290, "time_per_iteration": 2.3477470874786377 }, { "auxiliary_loss_clip": 0.01051823, "auxiliary_loss_mlp": 0.01036626, "balance_loss_clip": 1.01260567, "balance_loss_mlp": 1.01622641, "epoch": 0.8592214038779498, "flos": 31575797395200.0, "grad_norm": 1.7649327256791993, "language_loss": 0.55867732, "learning_rate": 2.0423879578646642e-07, "loss": 0.57956183, "num_input_tokens_seen": 308285145, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.35546875, "step": 14291, "time_per_iteration": 2.4352805614471436 }, { "auxiliary_loss_clip": 0.01052389, "auxiliary_loss_mlp": 0.01035145, "balance_loss_clip": 1.01274562, "balance_loss_mlp": 1.0161953, "epoch": 0.8592815271306178, "flos": 17456327683200.0, "grad_norm": 2.0065295702718875, "language_loss": 0.7327435, "learning_rate": 2.0406737359318792e-07, "loss": 0.75361878, "num_input_tokens_seen": 308304130, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.36132812, "step": 14292, "time_per_iteration": 3.5918936729431152 }, { "auxiliary_loss_clip": 0.01050944, "auxiliary_loss_mlp": 0.01034523, "balance_loss_clip": 1.01217127, "balance_loss_mlp": 1.01599348, "epoch": 0.8593416503832857, "flos": 25410170736000.0, "grad_norm": 1.4564133341793501, "language_loss": 0.72554511, "learning_rate": 2.038960195018542e-07, "loss": 0.74639976, "num_input_tokens_seen": 308324670, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34960938, "step": 14293, "time_per_iteration": 2.39924955368042 }, { "auxiliary_loss_clip": 0.01049444, "auxiliary_loss_mlp": 0.01035552, "balance_loss_clip": 1.01459527, "balance_loss_mlp": 1.01551938, "epoch": 0.8594017736359537, "flos": 20995578005760.0, "grad_norm": 2.4168352524979326, "language_loss": 0.70295858, "learning_rate": 2.0372473351896358e-07, "loss": 0.72380853, "num_input_tokens_seen": 308344215, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.33984375, "step": 14294, "time_per_iteration": 2.408008337020874 }, { "auxiliary_loss_clip": 0.01049265, "auxiliary_loss_mlp": 0.01034739, "balance_loss_clip": 1.01335311, "balance_loss_mlp": 1.01453781, "epoch": 0.8594618968886216, "flos": 22089965483520.0, "grad_norm": 1.9259815318842464, "language_loss": 0.78439629, "learning_rate": 2.0355351565101087e-07, "loss": 0.80523634, "num_input_tokens_seen": 308360520, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34765625, "step": 14295, "time_per_iteration": 2.3979897499084473 }, { "auxiliary_loss_clip": 0.01054854, "auxiliary_loss_mlp": 0.01041268, "balance_loss_clip": 1.01468468, "balance_loss_mlp": 1.01707149, "epoch": 0.8595220201412896, "flos": 11655438664320.0, "grad_norm": 3.8886276433957514, "language_loss": 0.70455551, "learning_rate": 2.0338236590448975e-07, "loss": 0.72551674, "num_input_tokens_seen": 308376865, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37890625, "step": 14296, "time_per_iteration": 2.399430274963379 }, { "auxiliary_loss_clip": 0.01051278, "auxiliary_loss_mlp": 0.01037634, "balance_loss_clip": 1.01461434, "balance_loss_mlp": 1.01663315, "epoch": 0.8595821433939577, "flos": 25039358519040.0, "grad_norm": 2.904456616152101, "language_loss": 0.80465829, "learning_rate": 2.0321128428588842e-07, "loss": 0.82554746, "num_input_tokens_seen": 308395870, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.34570312, "step": 14297, "time_per_iteration": 2.4327590465545654 }, { "auxiliary_loss_clip": 0.01050095, "auxiliary_loss_mlp": 0.01031614, "balance_loss_clip": 1.01043022, "balance_loss_mlp": 1.01557171, "epoch": 0.8596422666466256, "flos": 28510283577600.0, "grad_norm": 1.5413836096475462, "language_loss": 0.68538052, "learning_rate": 2.030402708016954e-07, "loss": 0.70619762, "num_input_tokens_seen": 308417250, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34375, "step": 14298, "time_per_iteration": 2.489471673965454 }, { "auxiliary_loss_clip": 0.01049544, "auxiliary_loss_mlp": 0.01040998, "balance_loss_clip": 1.01901555, "balance_loss_mlp": 1.0156616, "epoch": 0.8597023898992936, "flos": 13588300287360.0, "grad_norm": 2.0945718458667484, "language_loss": 0.69365275, "learning_rate": 2.0286932545839576e-07, "loss": 0.71455818, "num_input_tokens_seen": 308434565, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.33984375, "step": 14299, "time_per_iteration": 2.3402888774871826 }, { "auxiliary_loss_clip": 0.01053073, "auxiliary_loss_mlp": 0.01043404, "balance_loss_clip": 1.01963401, "balance_loss_mlp": 1.01679659, "epoch": 0.8597625131519615, "flos": 32299617035520.0, "grad_norm": 2.253863799311946, "language_loss": 0.72622991, "learning_rate": 2.0269844826247096e-07, "loss": 0.74719465, "num_input_tokens_seen": 308450040, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36328125, "step": 14300, "time_per_iteration": 2.4405646324157715 }, { "auxiliary_loss_clip": 0.01050988, "auxiliary_loss_mlp": 0.01034156, "balance_loss_clip": 1.01119673, "balance_loss_mlp": 1.01567006, "epoch": 0.8598226364046295, "flos": 28729119179520.0, "grad_norm": 1.5785591915022912, "language_loss": 0.70384169, "learning_rate": 2.0252763922040116e-07, "loss": 0.72469306, "num_input_tokens_seen": 308470545, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.3515625, "step": 14301, "time_per_iteration": 3.804576873779297 }, { "auxiliary_loss_clip": 0.01051304, "auxiliary_loss_mlp": 0.01035694, "balance_loss_clip": 1.01425982, "balance_loss_mlp": 1.0167371, "epoch": 0.8598827596572974, "flos": 21870745856640.0, "grad_norm": 1.8275376379621842, "language_loss": 0.75553656, "learning_rate": 2.023568983386641e-07, "loss": 0.77640659, "num_input_tokens_seen": 308490020, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34570312, "step": 14302, "time_per_iteration": 3.6765034198760986 }, { "auxiliary_loss_clip": 0.01048582, "auxiliary_loss_mlp": 0.01033707, "balance_loss_clip": 1.01341796, "balance_loss_mlp": 1.01561189, "epoch": 0.8599428829099655, "flos": 23766215546880.0, "grad_norm": 1.7524840453271138, "language_loss": 0.84753084, "learning_rate": 2.02186225623733e-07, "loss": 0.86835378, "num_input_tokens_seen": 308509065, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.328125, "step": 14303, "time_per_iteration": 2.381174325942993 }, { "auxiliary_loss_clip": 0.01051905, "auxiliary_loss_mlp": 0.01040456, "balance_loss_clip": 1.01858091, "balance_loss_mlp": 1.01564586, "epoch": 0.8600030061626334, "flos": 16211953537920.0, "grad_norm": 2.011442267863653, "language_loss": 0.77156448, "learning_rate": 2.0201562108208025e-07, "loss": 0.79248804, "num_input_tokens_seen": 308524725, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.36328125, "step": 14304, "time_per_iteration": 2.4121642112731934 }, { "auxiliary_loss_clip": 0.01050593, "auxiliary_loss_mlp": 0.01034447, "balance_loss_clip": 1.01174915, "balance_loss_mlp": 1.01584327, "epoch": 0.8600631294153014, "flos": 15668460403200.0, "grad_norm": 2.173074361289434, "language_loss": 0.54854405, "learning_rate": 2.0184508472017537e-07, "loss": 0.56939447, "num_input_tokens_seen": 308543525, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34765625, "step": 14305, "time_per_iteration": 2.3727962970733643 }, { "auxiliary_loss_clip": 0.01050595, "auxiliary_loss_mlp": 0.01036726, "balance_loss_clip": 1.01297987, "balance_loss_mlp": 1.01570785, "epoch": 0.8601232526679693, "flos": 17492148604800.0, "grad_norm": 2.233975617450759, "language_loss": 0.84506172, "learning_rate": 2.0167461654448558e-07, "loss": 0.86593491, "num_input_tokens_seen": 308557995, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.34960938, "step": 14306, "time_per_iteration": 2.3420698642730713 }, { "auxiliary_loss_clip": 0.01048712, "auxiliary_loss_mlp": 0.01036934, "balance_loss_clip": 1.01669276, "balance_loss_mlp": 1.01491821, "epoch": 0.8601833759206373, "flos": 26984543852160.0, "grad_norm": 1.4220146544779362, "language_loss": 0.72079504, "learning_rate": 2.01504216561474e-07, "loss": 0.74165154, "num_input_tokens_seen": 308582750, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.33789062, "step": 14307, "time_per_iteration": 2.481304883956909 }, { "auxiliary_loss_clip": 0.01052464, "auxiliary_loss_mlp": 0.01040093, "balance_loss_clip": 1.0151664, "balance_loss_mlp": 1.01587772, "epoch": 0.8602434991733052, "flos": 25228552510080.0, "grad_norm": 2.014513141332092, "language_loss": 0.65132952, "learning_rate": 2.0133388477760316e-07, "loss": 0.67225504, "num_input_tokens_seen": 308603770, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.36523438, "step": 14308, "time_per_iteration": 2.40107798576355 }, { "auxiliary_loss_clip": 0.01007695, "auxiliary_loss_mlp": 0.01001662, "balance_loss_clip": 0.99955243, "balance_loss_mlp": 1.0010612, "epoch": 0.8603036224259732, "flos": 71011974533760.0, "grad_norm": 0.6259472051918687, "language_loss": 0.48565644, "learning_rate": 2.0116362119933172e-07, "loss": 0.50575, "num_input_tokens_seen": 308667735, "router_z_loss_clip": 0.02111816, "router_z_loss_mlp": 0.06640625, "step": 14309, "time_per_iteration": 3.0910515785217285 }, { "auxiliary_loss_clip": 0.01052397, "auxiliary_loss_mlp": 0.01034938, "balance_loss_clip": 1.01051188, "balance_loss_mlp": 1.01576686, "epoch": 0.8603637456786413, "flos": 20299654408320.0, "grad_norm": 2.7904357798905934, "language_loss": 0.67898214, "learning_rate": 2.0099342583311563e-07, "loss": 0.69985545, "num_input_tokens_seen": 308686300, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.3671875, "step": 14310, "time_per_iteration": 2.375838279724121 }, { "auxiliary_loss_clip": 0.01051833, "auxiliary_loss_mlp": 0.01038609, "balance_loss_clip": 1.01669812, "balance_loss_mlp": 1.01603186, "epoch": 0.8604238689313092, "flos": 21834750378240.0, "grad_norm": 2.699264454757523, "language_loss": 0.79315019, "learning_rate": 2.0082329868540905e-07, "loss": 0.81405473, "num_input_tokens_seen": 308705825, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.359375, "step": 14311, "time_per_iteration": 2.430875778198242 }, { "auxiliary_loss_clip": 0.01051185, "auxiliary_loss_mlp": 0.0103923, "balance_loss_clip": 1.01717627, "balance_loss_mlp": 1.01589739, "epoch": 0.8604839921839772, "flos": 18003242131200.0, "grad_norm": 2.1299861413331134, "language_loss": 0.7267797, "learning_rate": 2.006532397626639e-07, "loss": 0.74768376, "num_input_tokens_seen": 308723340, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3515625, "step": 14312, "time_per_iteration": 2.3423750400543213 }, { "auxiliary_loss_clip": 0.01049377, "auxiliary_loss_mlp": 0.0104078, "balance_loss_clip": 1.01834512, "balance_loss_mlp": 1.01483762, "epoch": 0.8605441154366451, "flos": 16251265595520.0, "grad_norm": 2.2950885733746356, "language_loss": 0.78492045, "learning_rate": 2.0048324907132797e-07, "loss": 0.80582201, "num_input_tokens_seen": 308741280, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34570312, "step": 14313, "time_per_iteration": 2.3421685695648193 }, { "auxiliary_loss_clip": 0.01050213, "auxiliary_loss_mlp": 0.01036635, "balance_loss_clip": 1.01271021, "balance_loss_mlp": 1.01531577, "epoch": 0.8606042386893131, "flos": 32265786061440.0, "grad_norm": 1.4580167708436145, "language_loss": 0.73356831, "learning_rate": 2.003133266178474e-07, "loss": 0.75443673, "num_input_tokens_seen": 308762875, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.34960938, "step": 14314, "time_per_iteration": 2.474968194961548 }, { "auxiliary_loss_clip": 0.01050585, "auxiliary_loss_mlp": 0.01040146, "balance_loss_clip": 1.01729321, "balance_loss_mlp": 1.01553988, "epoch": 0.860664361941981, "flos": 20228745703680.0, "grad_norm": 2.125162981915688, "language_loss": 0.69836718, "learning_rate": 2.001434724086657e-07, "loss": 0.71927446, "num_input_tokens_seen": 308780315, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34960938, "step": 14315, "time_per_iteration": 2.3753459453582764 }, { "auxiliary_loss_clip": 0.01050815, "auxiliary_loss_mlp": 0.01037699, "balance_loss_clip": 1.01575255, "balance_loss_mlp": 1.0160141, "epoch": 0.8607244851946491, "flos": 25190462350080.0, "grad_norm": 2.002087728913297, "language_loss": 0.72825456, "learning_rate": 1.9997368645022418e-07, "loss": 0.74913967, "num_input_tokens_seen": 308799435, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34765625, "step": 14316, "time_per_iteration": 3.892972946166992 }, { "auxiliary_loss_clip": 0.0105297, "auxiliary_loss_mlp": 0.01037857, "balance_loss_clip": 1.01515937, "balance_loss_mlp": 1.01706851, "epoch": 0.860784608447317, "flos": 20481132988800.0, "grad_norm": 2.885796569819384, "language_loss": 0.83959895, "learning_rate": 1.9980396874896056e-07, "loss": 0.86050719, "num_input_tokens_seen": 308817730, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.359375, "step": 14317, "time_per_iteration": 2.376652956008911 }, { "auxiliary_loss_clip": 0.01050511, "auxiliary_loss_mlp": 0.01034665, "balance_loss_clip": 1.01268303, "balance_loss_mlp": 1.01684248, "epoch": 0.860844731699985, "flos": 50474178541440.0, "grad_norm": 1.5623948753763375, "language_loss": 0.67926365, "learning_rate": 1.996343193113108e-07, "loss": 0.70011544, "num_input_tokens_seen": 308841735, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3359375, "step": 14318, "time_per_iteration": 2.624621868133545 }, { "auxiliary_loss_clip": 0.01049271, "auxiliary_loss_mlp": 0.01036499, "balance_loss_clip": 1.01605439, "balance_loss_mlp": 1.01540899, "epoch": 0.8609048549526529, "flos": 41171151841920.0, "grad_norm": 1.6503342385204527, "language_loss": 0.72015297, "learning_rate": 1.9946473814370911e-07, "loss": 0.74101067, "num_input_tokens_seen": 308865050, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.33984375, "step": 14319, "time_per_iteration": 2.5291357040405273 }, { "auxiliary_loss_clip": 0.0105344, "auxiliary_loss_mlp": 0.01037629, "balance_loss_clip": 1.01532483, "balance_loss_mlp": 1.01774752, "epoch": 0.8609649782053209, "flos": 23950068099840.0, "grad_norm": 1.6908877784155507, "language_loss": 0.6833666, "learning_rate": 1.992952252525839e-07, "loss": 0.70427728, "num_input_tokens_seen": 308885375, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35742188, "step": 14320, "time_per_iteration": 2.38474178314209 }, { "auxiliary_loss_clip": 0.01052042, "auxiliary_loss_mlp": 0.01036694, "balance_loss_clip": 1.01183844, "balance_loss_mlp": 1.01461947, "epoch": 0.8610251014579888, "flos": 23111454309120.0, "grad_norm": 1.9278673034584484, "language_loss": 0.80917871, "learning_rate": 1.9912578064436446e-07, "loss": 0.83006614, "num_input_tokens_seen": 308904700, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.375, "step": 14321, "time_per_iteration": 2.3748888969421387 }, { "auxiliary_loss_clip": 0.0104944, "auxiliary_loss_mlp": 0.01035955, "balance_loss_clip": 1.01200628, "balance_loss_mlp": 1.01502228, "epoch": 0.8610852247106568, "flos": 19425813189120.0, "grad_norm": 1.8475049782445703, "language_loss": 0.72163641, "learning_rate": 1.9895640432547567e-07, "loss": 0.74249041, "num_input_tokens_seen": 308922985, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.34375, "step": 14322, "time_per_iteration": 2.3470282554626465 }, { "auxiliary_loss_clip": 0.01053422, "auxiliary_loss_mlp": 0.01044485, "balance_loss_clip": 1.01860404, "balance_loss_mlp": 1.01651335, "epoch": 0.8611453479633249, "flos": 19311228506880.0, "grad_norm": 2.6296418918061364, "language_loss": 0.57540858, "learning_rate": 1.9878709630234102e-07, "loss": 0.59638762, "num_input_tokens_seen": 308940765, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.36914062, "step": 14323, "time_per_iteration": 2.388249158859253 }, { "auxiliary_loss_clip": 0.0104977, "auxiliary_loss_mlp": 0.01034597, "balance_loss_clip": 1.01290119, "balance_loss_mlp": 1.01524818, "epoch": 0.8612054712159928, "flos": 23252678225280.0, "grad_norm": 1.9455941518702309, "language_loss": 0.7661097, "learning_rate": 1.986178565813801e-07, "loss": 0.78695339, "num_input_tokens_seen": 308960110, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34570312, "step": 14324, "time_per_iteration": 2.4001431465148926 }, { "auxiliary_loss_clip": 0.01051992, "auxiliary_loss_mlp": 0.01036889, "balance_loss_clip": 1.01267719, "balance_loss_mlp": 1.01645923, "epoch": 0.8612655944686608, "flos": 16027682048640.0, "grad_norm": 2.029742945289351, "language_loss": 0.6790992, "learning_rate": 1.9844868516901036e-07, "loss": 0.69998801, "num_input_tokens_seen": 308976665, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.35546875, "step": 14325, "time_per_iteration": 2.3445866107940674 }, { "auxiliary_loss_clip": 0.0105256, "auxiliary_loss_mlp": 0.01042515, "balance_loss_clip": 1.01681304, "balance_loss_mlp": 1.01618719, "epoch": 0.8613257177213287, "flos": 22491606297600.0, "grad_norm": 1.6746860454760584, "language_loss": 0.66470534, "learning_rate": 1.982795820716472e-07, "loss": 0.68565607, "num_input_tokens_seen": 308997015, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.36523438, "step": 14326, "time_per_iteration": 2.3663687705993652 }, { "auxiliary_loss_clip": 0.01050923, "auxiliary_loss_mlp": 0.01034878, "balance_loss_clip": 1.01146555, "balance_loss_mlp": 1.01570451, "epoch": 0.8613858409739967, "flos": 17237108056320.0, "grad_norm": 1.9942290564813254, "language_loss": 0.86150301, "learning_rate": 1.9811054729570253e-07, "loss": 0.88236105, "num_input_tokens_seen": 309015250, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3515625, "step": 14327, "time_per_iteration": 2.3654677867889404 }, { "auxiliary_loss_clip": 0.01050941, "auxiliary_loss_mlp": 0.0103676, "balance_loss_clip": 1.01384795, "balance_loss_mlp": 1.01534045, "epoch": 0.8614459642266646, "flos": 22819999345920.0, "grad_norm": 2.366396813292019, "language_loss": 0.76209134, "learning_rate": 1.9794158084758661e-07, "loss": 0.78296828, "num_input_tokens_seen": 309034140, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35546875, "step": 14328, "time_per_iteration": 2.5113308429718018 }, { "auxiliary_loss_clip": 0.01050349, "auxiliary_loss_mlp": 0.01035164, "balance_loss_clip": 1.0143615, "balance_loss_mlp": 1.01539612, "epoch": 0.8615060874793327, "flos": 26503126848000.0, "grad_norm": 1.6427311277253742, "language_loss": 0.80718976, "learning_rate": 1.9777268273370673e-07, "loss": 0.82804489, "num_input_tokens_seen": 309055075, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.34960938, "step": 14329, "time_per_iteration": 2.42568039894104 }, { "auxiliary_loss_clip": 0.01050812, "auxiliary_loss_mlp": 0.0103431, "balance_loss_clip": 1.01137412, "balance_loss_mlp": 1.01572895, "epoch": 0.8615662107320006, "flos": 24059869925760.0, "grad_norm": 2.499847368144723, "language_loss": 0.78202534, "learning_rate": 1.9760385296046757e-07, "loss": 0.80287659, "num_input_tokens_seen": 309074650, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.3515625, "step": 14330, "time_per_iteration": 2.4003398418426514 }, { "auxiliary_loss_clip": 0.01051292, "auxiliary_loss_mlp": 0.01038156, "balance_loss_clip": 1.01494622, "balance_loss_mlp": 1.015553, "epoch": 0.8616263339846686, "flos": 24164051022720.0, "grad_norm": 1.9712306196427594, "language_loss": 0.65905011, "learning_rate": 1.974350915342702e-07, "loss": 0.67994457, "num_input_tokens_seen": 309094385, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35742188, "step": 14331, "time_per_iteration": 3.643087387084961 }, { "auxiliary_loss_clip": 0.01051346, "auxiliary_loss_mlp": 0.01038609, "balance_loss_clip": 1.01648355, "balance_loss_mlp": 1.01591659, "epoch": 0.8616864572373365, "flos": 21723307718400.0, "grad_norm": 1.6294641329704256, "language_loss": 0.76799405, "learning_rate": 1.9726639846151506e-07, "loss": 0.78889364, "num_input_tokens_seen": 309111815, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35351562, "step": 14332, "time_per_iteration": 2.4314746856689453 }, { "auxiliary_loss_clip": 0.01052422, "auxiliary_loss_mlp": 0.01037019, "balance_loss_clip": 1.0117228, "balance_loss_mlp": 1.01593924, "epoch": 0.8617465804900045, "flos": 23765587142400.0, "grad_norm": 2.250019182161868, "language_loss": 0.67943347, "learning_rate": 1.9709777374859904e-07, "loss": 0.70032787, "num_input_tokens_seen": 309131385, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36523438, "step": 14333, "time_per_iteration": 2.4071218967437744 }, { "auxiliary_loss_clip": 0.01055634, "auxiliary_loss_mlp": 0.01039882, "balance_loss_clip": 1.01410937, "balance_loss_mlp": 1.01748788, "epoch": 0.8618067037426724, "flos": 37702496021760.0, "grad_norm": 1.8526800354880604, "language_loss": 0.63645357, "learning_rate": 1.969292174019157e-07, "loss": 0.65740877, "num_input_tokens_seen": 309155020, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3828125, "step": 14334, "time_per_iteration": 2.5634424686431885 }, { "auxiliary_loss_clip": 0.01054261, "auxiliary_loss_mlp": 0.01043255, "balance_loss_clip": 1.02014005, "balance_loss_mlp": 1.01693225, "epoch": 0.8618668269953405, "flos": 21469942915200.0, "grad_norm": 1.9024144078805274, "language_loss": 0.69899261, "learning_rate": 1.967607294278577e-07, "loss": 0.71996778, "num_input_tokens_seen": 309172865, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.37304688, "step": 14335, "time_per_iteration": 2.4713752269744873 }, { "auxiliary_loss_clip": 0.01051627, "auxiliary_loss_mlp": 0.01033201, "balance_loss_clip": 1.01010966, "balance_loss_mlp": 1.01590121, "epoch": 0.8619269502480085, "flos": 22231713070080.0, "grad_norm": 1.4017113721890082, "language_loss": 0.83432913, "learning_rate": 1.965923098328135e-07, "loss": 0.85517734, "num_input_tokens_seen": 309193575, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35546875, "step": 14336, "time_per_iteration": 2.4397692680358887 }, { "auxiliary_loss_clip": 0.0105277, "auxiliary_loss_mlp": 0.0103856, "balance_loss_clip": 1.01524234, "balance_loss_mlp": 1.01596212, "epoch": 0.8619870735006764, "flos": 22709534204160.0, "grad_norm": 1.7051428021440802, "language_loss": 0.68666488, "learning_rate": 1.9642395862316907e-07, "loss": 0.70757812, "num_input_tokens_seen": 309212680, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.3671875, "step": 14337, "time_per_iteration": 2.3845314979553223 }, { "auxiliary_loss_clip": 0.01050431, "auxiliary_loss_mlp": 0.01034372, "balance_loss_clip": 1.01124537, "balance_loss_mlp": 1.01548409, "epoch": 0.8620471967533444, "flos": 37518887848320.0, "grad_norm": 2.333968655212592, "language_loss": 0.67523372, "learning_rate": 1.962556758053089e-07, "loss": 0.6960817, "num_input_tokens_seen": 309234485, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.34960938, "step": 14338, "time_per_iteration": 2.5261096954345703 }, { "auxiliary_loss_clip": 0.01051701, "auxiliary_loss_mlp": 0.01033512, "balance_loss_clip": 1.01160169, "balance_loss_mlp": 1.01659822, "epoch": 0.8621073200060123, "flos": 19681447230720.0, "grad_norm": 1.8923093599751015, "language_loss": 0.63474262, "learning_rate": 1.9608746138561448e-07, "loss": 0.65559471, "num_input_tokens_seen": 309253630, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.3515625, "step": 14339, "time_per_iteration": 2.382248878479004 }, { "auxiliary_loss_clip": 0.0105041, "auxiliary_loss_mlp": 0.01041433, "balance_loss_clip": 1.01979649, "balance_loss_mlp": 1.01513827, "epoch": 0.8621674432586803, "flos": 14536017676800.0, "grad_norm": 1.9824973763259925, "language_loss": 0.64125097, "learning_rate": 1.9591931537046458e-07, "loss": 0.66216934, "num_input_tokens_seen": 309270950, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.35351562, "step": 14340, "time_per_iteration": 3.781794309616089 }, { "auxiliary_loss_clip": 0.0104745, "auxiliary_loss_mlp": 0.01030302, "balance_loss_clip": 1.00977385, "balance_loss_mlp": 1.01478195, "epoch": 0.8622275665113482, "flos": 20739071180160.0, "grad_norm": 2.0654071128468656, "language_loss": 0.80803549, "learning_rate": 1.9575123776623493e-07, "loss": 0.82881296, "num_input_tokens_seen": 309288780, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.32617188, "step": 14341, "time_per_iteration": 3.6583330631256104 }, { "auxiliary_loss_clip": 0.01051132, "auxiliary_loss_mlp": 0.01036326, "balance_loss_clip": 1.01499987, "balance_loss_mlp": 1.01590526, "epoch": 0.8622876897640163, "flos": 24714805720320.0, "grad_norm": 1.878808157870589, "language_loss": 0.75433564, "learning_rate": 1.9558322857929887e-07, "loss": 0.77521026, "num_input_tokens_seen": 309310875, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.3515625, "step": 14342, "time_per_iteration": 2.4565579891204834 }, { "auxiliary_loss_clip": 0.01053242, "auxiliary_loss_mlp": 0.01036083, "balance_loss_clip": 1.01262236, "balance_loss_mlp": 1.01669288, "epoch": 0.8623478130166842, "flos": 17456397505920.0, "grad_norm": 1.8404107092150621, "language_loss": 0.69993186, "learning_rate": 1.95415287816028e-07, "loss": 0.72082508, "num_input_tokens_seen": 309329900, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3671875, "step": 14343, "time_per_iteration": 2.370647430419922 }, { "auxiliary_loss_clip": 0.01051841, "auxiliary_loss_mlp": 0.01043298, "balance_loss_clip": 1.01840687, "balance_loss_mlp": 1.01561022, "epoch": 0.8624079362693522, "flos": 18108330923520.0, "grad_norm": 1.7591341628445278, "language_loss": 0.68916768, "learning_rate": 1.9524741548278967e-07, "loss": 0.71011907, "num_input_tokens_seen": 309347870, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.36132812, "step": 14344, "time_per_iteration": 2.345428943634033 }, { "auxiliary_loss_clip": 0.01052533, "auxiliary_loss_mlp": 0.01042553, "balance_loss_clip": 1.0191046, "balance_loss_mlp": 1.01591182, "epoch": 0.8624680595220201, "flos": 30665087913600.0, "grad_norm": 1.4525951709770133, "language_loss": 0.82339704, "learning_rate": 1.9507961158595054e-07, "loss": 0.84434795, "num_input_tokens_seen": 309371695, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3671875, "step": 14345, "time_per_iteration": 2.4513206481933594 }, { "auxiliary_loss_clip": 0.01054163, "auxiliary_loss_mlp": 0.01038606, "balance_loss_clip": 1.01420414, "balance_loss_mlp": 1.01691103, "epoch": 0.8625281827746881, "flos": 37997058096000.0, "grad_norm": 2.3635798710389566, "language_loss": 0.52574247, "learning_rate": 1.9491187613187355e-07, "loss": 0.5466702, "num_input_tokens_seen": 309394645, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37304688, "step": 14346, "time_per_iteration": 2.516200304031372 }, { "auxiliary_loss_clip": 0.01051196, "auxiliary_loss_mlp": 0.01035019, "balance_loss_clip": 1.01213098, "balance_loss_mlp": 1.01593232, "epoch": 0.862588306027356, "flos": 26248540147200.0, "grad_norm": 1.6003013885391215, "language_loss": 0.76417136, "learning_rate": 1.9474420912691913e-07, "loss": 0.78503352, "num_input_tokens_seen": 309413170, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3515625, "step": 14347, "time_per_iteration": 2.394943952560425 }, { "auxiliary_loss_clip": 0.01051796, "auxiliary_loss_mlp": 0.01037688, "balance_loss_clip": 1.01248717, "balance_loss_mlp": 1.01602435, "epoch": 0.862648429280024, "flos": 25877797752960.0, "grad_norm": 2.010399028019852, "language_loss": 0.82088959, "learning_rate": 1.945766105774449e-07, "loss": 0.84178442, "num_input_tokens_seen": 309431315, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.35742188, "step": 14348, "time_per_iteration": 2.41615891456604 }, { "auxiliary_loss_clip": 0.01048798, "auxiliary_loss_mlp": 0.01035573, "balance_loss_clip": 1.01391292, "balance_loss_mlp": 1.01460934, "epoch": 0.862708552532692, "flos": 37814881288320.0, "grad_norm": 4.126265416725106, "language_loss": 0.67220789, "learning_rate": 1.9440908048980665e-07, "loss": 0.69305158, "num_input_tokens_seen": 309453020, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34179688, "step": 14349, "time_per_iteration": 2.5754201412200928 }, { "auxiliary_loss_clip": 0.01051086, "auxiliary_loss_mlp": 0.01040874, "balance_loss_clip": 1.01619744, "balance_loss_mlp": 1.01603961, "epoch": 0.86276867578536, "flos": 19090996450560.0, "grad_norm": 2.6717548638374966, "language_loss": 0.71867704, "learning_rate": 1.942416188703573e-07, "loss": 0.73959661, "num_input_tokens_seen": 309469780, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.34960938, "step": 14350, "time_per_iteration": 2.3782923221588135 }, { "auxiliary_loss_clip": 0.01051203, "auxiliary_loss_mlp": 0.01035074, "balance_loss_clip": 1.01139951, "balance_loss_mlp": 1.01553881, "epoch": 0.862828799038028, "flos": 22163178337920.0, "grad_norm": 1.7674119769877688, "language_loss": 0.78347218, "learning_rate": 1.9407422572544618e-07, "loss": 0.80433488, "num_input_tokens_seen": 309489610, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35742188, "step": 14351, "time_per_iteration": 2.4134812355041504 }, { "auxiliary_loss_clip": 0.01052545, "auxiliary_loss_mlp": 0.01034553, "balance_loss_clip": 1.01271379, "balance_loss_mlp": 1.01665211, "epoch": 0.8628889222906959, "flos": 23144552144640.0, "grad_norm": 2.1450596668276676, "language_loss": 0.86003006, "learning_rate": 1.9390690106142204e-07, "loss": 0.8809011, "num_input_tokens_seen": 309508295, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.359375, "step": 14352, "time_per_iteration": 2.3906478881835938 }, { "auxiliary_loss_clip": 0.0100733, "auxiliary_loss_mlp": 0.01001909, "balance_loss_clip": 0.99990678, "balance_loss_mlp": 1.00068617, "epoch": 0.8629490455433639, "flos": 57814455761280.0, "grad_norm": 0.7956931311048218, "language_loss": 0.62031937, "learning_rate": 1.9373964488462913e-07, "loss": 0.64041179, "num_input_tokens_seen": 309567960, "router_z_loss_clip": 0.02001953, "router_z_loss_mlp": 0.06640625, "step": 14353, "time_per_iteration": 3.022263288497925 }, { "auxiliary_loss_clip": 0.01049525, "auxiliary_loss_mlp": 0.01032638, "balance_loss_clip": 1.01125169, "balance_loss_mlp": 1.01534081, "epoch": 0.8630091687960318, "flos": 15918892652160.0, "grad_norm": 1.8322733634112012, "language_loss": 0.82273853, "learning_rate": 1.9357245720140948e-07, "loss": 0.84356016, "num_input_tokens_seen": 309586050, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34179688, "step": 14354, "time_per_iteration": 2.376633405685425 }, { "auxiliary_loss_clip": 0.01051105, "auxiliary_loss_mlp": 0.01033576, "balance_loss_clip": 1.01025856, "balance_loss_mlp": 1.01586366, "epoch": 0.8630692920486999, "flos": 17960892785280.0, "grad_norm": 1.9655131639202645, "language_loss": 0.87276268, "learning_rate": 1.934053380181031e-07, "loss": 0.89360946, "num_input_tokens_seen": 309602910, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.3515625, "step": 14355, "time_per_iteration": 3.8565993309020996 }, { "auxiliary_loss_clip": 0.01050629, "auxiliary_loss_mlp": 0.0103233, "balance_loss_clip": 1.00962079, "balance_loss_mlp": 1.01504719, "epoch": 0.8631294153013678, "flos": 22454074719360.0, "grad_norm": 2.065636535395998, "language_loss": 0.60051262, "learning_rate": 1.9323828734104763e-07, "loss": 0.62134224, "num_input_tokens_seen": 309621175, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35546875, "step": 14356, "time_per_iteration": 2.37774658203125 }, { "auxiliary_loss_clip": 0.01052833, "auxiliary_loss_mlp": 0.0103945, "balance_loss_clip": 1.0146668, "balance_loss_mlp": 1.0161097, "epoch": 0.8631895385540358, "flos": 16836060735360.0, "grad_norm": 12.54296107964176, "language_loss": 0.77781641, "learning_rate": 1.9307130517657756e-07, "loss": 0.79873925, "num_input_tokens_seen": 309639395, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3671875, "step": 14357, "time_per_iteration": 2.333808422088623 }, { "auxiliary_loss_clip": 0.01053088, "auxiliary_loss_mlp": 0.01036536, "balance_loss_clip": 1.01299238, "balance_loss_mlp": 1.01665878, "epoch": 0.8632496618067037, "flos": 18696233174400.0, "grad_norm": 4.554164092518857, "language_loss": 0.78753829, "learning_rate": 1.9290439153102468e-07, "loss": 0.80843455, "num_input_tokens_seen": 309657265, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36328125, "step": 14358, "time_per_iteration": 2.3766281604766846 }, { "auxiliary_loss_clip": 0.01051743, "auxiliary_loss_mlp": 0.01042143, "balance_loss_clip": 1.01708496, "balance_loss_mlp": 1.01627469, "epoch": 0.8633097850593717, "flos": 24278775350400.0, "grad_norm": 1.5046146057602634, "language_loss": 0.75798953, "learning_rate": 1.9273754641071816e-07, "loss": 0.7789284, "num_input_tokens_seen": 309678610, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.35546875, "step": 14359, "time_per_iteration": 2.398514747619629 }, { "auxiliary_loss_clip": 0.01048632, "auxiliary_loss_mlp": 0.01032201, "balance_loss_clip": 1.01039791, "balance_loss_mlp": 1.0143218, "epoch": 0.8633699083120396, "flos": 21177510433920.0, "grad_norm": 2.187515548298292, "language_loss": 0.71698326, "learning_rate": 1.9257076982198517e-07, "loss": 0.7377916, "num_input_tokens_seen": 309697710, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34375, "step": 14360, "time_per_iteration": 2.351353645324707 }, { "auxiliary_loss_clip": 0.0105326, "auxiliary_loss_mlp": 0.01041185, "balance_loss_clip": 1.0175097, "balance_loss_mlp": 1.01674414, "epoch": 0.8634300315647077, "flos": 19243880760960.0, "grad_norm": 1.9331844430707534, "language_loss": 0.77378923, "learning_rate": 1.9240406177114953e-07, "loss": 0.79473364, "num_input_tokens_seen": 309715985, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36523438, "step": 14361, "time_per_iteration": 2.3651018142700195 }, { "auxiliary_loss_clip": 0.01007342, "auxiliary_loss_mlp": 0.01002054, "balance_loss_clip": 0.99994451, "balance_loss_mlp": 1.00092745, "epoch": 0.8634901548173756, "flos": 66192494232960.0, "grad_norm": 0.9617329510203322, "language_loss": 0.58919775, "learning_rate": 1.922374222645329e-07, "loss": 0.60929173, "num_input_tokens_seen": 309779930, "router_z_loss_clip": 0.02111816, "router_z_loss_mlp": 0.06445312, "step": 14362, "time_per_iteration": 3.033932685852051 }, { "auxiliary_loss_clip": 0.01052205, "auxiliary_loss_mlp": 0.01044418, "balance_loss_clip": 1.01748848, "balance_loss_mlp": 1.01498532, "epoch": 0.8635502780700436, "flos": 24788402599680.0, "grad_norm": 1.841984012020704, "language_loss": 0.81514573, "learning_rate": 1.9207085130845524e-07, "loss": 0.8361119, "num_input_tokens_seen": 309800580, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.37109375, "step": 14363, "time_per_iteration": 2.396483898162842 }, { "auxiliary_loss_clip": 0.01053613, "auxiliary_loss_mlp": 0.01041976, "balance_loss_clip": 1.01542783, "balance_loss_mlp": 1.01639414, "epoch": 0.8636104013227116, "flos": 25188856427520.0, "grad_norm": 2.842748680840053, "language_loss": 0.74851406, "learning_rate": 1.9190434890923112e-07, "loss": 0.76946998, "num_input_tokens_seen": 309821725, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37109375, "step": 14364, "time_per_iteration": 2.403127670288086 }, { "auxiliary_loss_clip": 0.01051714, "auxiliary_loss_mlp": 0.0103469, "balance_loss_clip": 1.01245761, "balance_loss_mlp": 1.01549816, "epoch": 0.8636705245753795, "flos": 23877309093120.0, "grad_norm": 1.6259453615953252, "language_loss": 0.72654533, "learning_rate": 1.917379150731755e-07, "loss": 0.74740934, "num_input_tokens_seen": 309841565, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.36132812, "step": 14365, "time_per_iteration": 2.373814821243286 }, { "auxiliary_loss_clip": 0.01055015, "auxiliary_loss_mlp": 0.0104197, "balance_loss_clip": 1.01601839, "balance_loss_mlp": 1.01703584, "epoch": 0.8637306478280475, "flos": 23109394538880.0, "grad_norm": 2.07534741671618, "language_loss": 0.72905439, "learning_rate": 1.915715498065993e-07, "loss": 0.7500242, "num_input_tokens_seen": 309858635, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37890625, "step": 14366, "time_per_iteration": 2.386399030685425 }, { "auxiliary_loss_clip": 0.01050302, "auxiliary_loss_mlp": 0.01036958, "balance_loss_clip": 1.01652527, "balance_loss_mlp": 1.01628542, "epoch": 0.8637907710807154, "flos": 21905763816960.0, "grad_norm": 1.5370760280974591, "language_loss": 0.82754475, "learning_rate": 1.9140525311581146e-07, "loss": 0.8484174, "num_input_tokens_seen": 309877885, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.33984375, "step": 14367, "time_per_iteration": 2.362192392349243 }, { "auxiliary_loss_clip": 0.01052496, "auxiliary_loss_mlp": 0.01039319, "balance_loss_clip": 1.01479793, "balance_loss_mlp": 1.01601171, "epoch": 0.8638508943333835, "flos": 23579570085120.0, "grad_norm": 2.09377123851104, "language_loss": 0.62257791, "learning_rate": 1.9123902500711743e-07, "loss": 0.64349604, "num_input_tokens_seen": 309893140, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.36523438, "step": 14368, "time_per_iteration": 2.3991174697875977 }, { "auxiliary_loss_clip": 0.01051586, "auxiliary_loss_mlp": 0.01033891, "balance_loss_clip": 1.01126552, "balance_loss_mlp": 1.01682699, "epoch": 0.8639110175860514, "flos": 25774663996800.0, "grad_norm": 1.961123152056968, "language_loss": 0.7722922, "learning_rate": 1.91072865486821e-07, "loss": 0.79314697, "num_input_tokens_seen": 309914175, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34765625, "step": 14369, "time_per_iteration": 2.4198153018951416 }, { "auxiliary_loss_clip": 0.01053645, "auxiliary_loss_mlp": 0.01043179, "balance_loss_clip": 1.01813269, "balance_loss_mlp": 1.01624858, "epoch": 0.8639711408387194, "flos": 23368275336960.0, "grad_norm": 1.8161075868717163, "language_loss": 0.6493926, "learning_rate": 1.9090677456122294e-07, "loss": 0.67036092, "num_input_tokens_seen": 309932395, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.375, "step": 14370, "time_per_iteration": 2.4210610389709473 }, { "auxiliary_loss_clip": 0.01051526, "auxiliary_loss_mlp": 0.01039004, "balance_loss_clip": 1.01470923, "balance_loss_mlp": 1.01619887, "epoch": 0.8640312640913873, "flos": 22126135518720.0, "grad_norm": 1.6874168667824085, "language_loss": 0.67241448, "learning_rate": 1.907407522366209e-07, "loss": 0.6933198, "num_input_tokens_seen": 309951720, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.35351562, "step": 14371, "time_per_iteration": 3.682610511779785 }, { "auxiliary_loss_clip": 0.01007367, "auxiliary_loss_mlp": 0.01002545, "balance_loss_clip": 1.00050676, "balance_loss_mlp": 1.00080836, "epoch": 0.8640913873440553, "flos": 57569192259840.0, "grad_norm": 0.8595969799600731, "language_loss": 0.56965387, "learning_rate": 1.905747985193107e-07, "loss": 0.58975291, "num_input_tokens_seen": 310006120, "router_z_loss_clip": 0.02038574, "router_z_loss_mlp": 0.06542969, "step": 14372, "time_per_iteration": 2.939011812210083 }, { "auxiliary_loss_clip": 0.01050444, "auxiliary_loss_mlp": 0.01037329, "balance_loss_clip": 1.01483476, "balance_loss_mlp": 1.01576161, "epoch": 0.8641515105967232, "flos": 23986307957760.0, "grad_norm": 1.7173762939578163, "language_loss": 0.80282629, "learning_rate": 1.9040891341558597e-07, "loss": 0.823704, "num_input_tokens_seen": 310026740, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34765625, "step": 14373, "time_per_iteration": 2.41999888420105 }, { "auxiliary_loss_clip": 0.01050385, "auxiliary_loss_mlp": 0.01035349, "balance_loss_clip": 1.01310408, "balance_loss_mlp": 1.0148499, "epoch": 0.8642116338493913, "flos": 19061738864640.0, "grad_norm": 2.4164510511952635, "language_loss": 0.64601362, "learning_rate": 1.9024309693173656e-07, "loss": 0.66687101, "num_input_tokens_seen": 310044135, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35546875, "step": 14374, "time_per_iteration": 2.3702383041381836 }, { "auxiliary_loss_clip": 0.01050734, "auxiliary_loss_mlp": 0.01039649, "balance_loss_clip": 1.01782155, "balance_loss_mlp": 1.01619589, "epoch": 0.8642717571020592, "flos": 18253325266560.0, "grad_norm": 1.698429218989576, "language_loss": 0.78087282, "learning_rate": 1.9007734907404993e-07, "loss": 0.80177665, "num_input_tokens_seen": 310061560, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34570312, "step": 14375, "time_per_iteration": 2.361356019973755 }, { "auxiliary_loss_clip": 0.01052118, "auxiliary_loss_mlp": 0.01037111, "balance_loss_clip": 1.01300704, "balance_loss_mlp": 1.01605022, "epoch": 0.8643318803547272, "flos": 57661224203520.0, "grad_norm": 1.6261428268643123, "language_loss": 0.61944938, "learning_rate": 1.899116698488117e-07, "loss": 0.64034164, "num_input_tokens_seen": 310087310, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36132812, "step": 14376, "time_per_iteration": 2.7170045375823975 }, { "auxiliary_loss_clip": 0.01050528, "auxiliary_loss_mlp": 0.01036613, "balance_loss_clip": 1.01449978, "balance_loss_mlp": 1.0156827, "epoch": 0.8643920036073952, "flos": 19608513667200.0, "grad_norm": 1.4370469823802037, "language_loss": 0.67760539, "learning_rate": 1.8974605926230457e-07, "loss": 0.69847685, "num_input_tokens_seen": 310106260, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34765625, "step": 14377, "time_per_iteration": 2.403916120529175 }, { "auxiliary_loss_clip": 0.01051711, "auxiliary_loss_mlp": 0.01036142, "balance_loss_clip": 1.01263356, "balance_loss_mlp": 1.01576114, "epoch": 0.8644521268600631, "flos": 20849291942400.0, "grad_norm": 1.6009919638063392, "language_loss": 0.718279, "learning_rate": 1.8958051732080804e-07, "loss": 0.7391575, "num_input_tokens_seen": 310125305, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.359375, "step": 14378, "time_per_iteration": 2.3924341201782227 }, { "auxiliary_loss_clip": 0.01007542, "auxiliary_loss_mlp": 0.01003609, "balance_loss_clip": 1.00135636, "balance_loss_mlp": 1.00093508, "epoch": 0.8645122501127311, "flos": 66716016203520.0, "grad_norm": 0.8038415321168, "language_loss": 0.60323489, "learning_rate": 1.894150440305995e-07, "loss": 0.62334645, "num_input_tokens_seen": 310189270, "router_z_loss_clip": 0.02258301, "router_z_loss_mlp": 0.06640625, "step": 14379, "time_per_iteration": 3.0264432430267334 }, { "auxiliary_loss_clip": 0.01051591, "auxiliary_loss_mlp": 0.01037305, "balance_loss_clip": 1.015275, "balance_loss_mlp": 1.01614761, "epoch": 0.864572373365399, "flos": 21688918162560.0, "grad_norm": 1.707260192037767, "language_loss": 0.75568128, "learning_rate": 1.8924963939795478e-07, "loss": 0.7765702, "num_input_tokens_seen": 310208395, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35351562, "step": 14380, "time_per_iteration": 3.7975189685821533 }, { "auxiliary_loss_clip": 0.01053147, "auxiliary_loss_mlp": 0.01041017, "balance_loss_clip": 1.01871324, "balance_loss_mlp": 1.01673937, "epoch": 0.8646324966180671, "flos": 20265369586560.0, "grad_norm": 3.7721776289491857, "language_loss": 0.76114023, "learning_rate": 1.8908430342914473e-07, "loss": 0.78208184, "num_input_tokens_seen": 310227415, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.36328125, "step": 14381, "time_per_iteration": 3.6527624130249023 }, { "auxiliary_loss_clip": 0.01050319, "auxiliary_loss_mlp": 0.01036139, "balance_loss_clip": 1.01490808, "balance_loss_mlp": 1.01609433, "epoch": 0.864692619870735, "flos": 11945427350400.0, "grad_norm": 2.226706374907213, "language_loss": 0.85747313, "learning_rate": 1.8891903613043892e-07, "loss": 0.87833768, "num_input_tokens_seen": 310242625, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34375, "step": 14382, "time_per_iteration": 2.354745864868164 }, { "auxiliary_loss_clip": 0.01053107, "auxiliary_loss_mlp": 0.01039356, "balance_loss_clip": 1.01526403, "balance_loss_mlp": 1.01624107, "epoch": 0.864752743123403, "flos": 21469628712960.0, "grad_norm": 1.6667954444799133, "language_loss": 0.77066052, "learning_rate": 1.8875383750810504e-07, "loss": 0.79158521, "num_input_tokens_seen": 310260585, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.3671875, "step": 14383, "time_per_iteration": 2.3565421104431152 }, { "auxiliary_loss_clip": 0.01051655, "auxiliary_loss_mlp": 0.01036633, "balance_loss_clip": 1.01342332, "balance_loss_mlp": 1.0164783, "epoch": 0.8648128663760709, "flos": 19529191324800.0, "grad_norm": 1.869589342949668, "language_loss": 0.86133265, "learning_rate": 1.8858870756840738e-07, "loss": 0.8822155, "num_input_tokens_seen": 310277210, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3515625, "step": 14384, "time_per_iteration": 2.389897584915161 }, { "auxiliary_loss_clip": 0.01050808, "auxiliary_loss_mlp": 0.01036707, "balance_loss_clip": 1.01470077, "balance_loss_mlp": 1.01608753, "epoch": 0.8648729896287389, "flos": 21286893323520.0, "grad_norm": 1.7868591558117337, "language_loss": 0.81718117, "learning_rate": 1.884236463176072e-07, "loss": 0.83805633, "num_input_tokens_seen": 310296610, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34765625, "step": 14385, "time_per_iteration": 2.359178304672241 }, { "auxiliary_loss_clip": 0.01054765, "auxiliary_loss_mlp": 0.01037603, "balance_loss_clip": 1.0143218, "balance_loss_mlp": 1.01774347, "epoch": 0.8649331128814068, "flos": 24603432883200.0, "grad_norm": 2.4094531132280257, "language_loss": 0.73892832, "learning_rate": 1.8825865376196437e-07, "loss": 0.75985199, "num_input_tokens_seen": 310316830, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37109375, "step": 14386, "time_per_iteration": 2.426994800567627 }, { "auxiliary_loss_clip": 0.0105088, "auxiliary_loss_mlp": 0.01037373, "balance_loss_clip": 1.0151403, "balance_loss_mlp": 1.01556253, "epoch": 0.8649932361340749, "flos": 15376900705920.0, "grad_norm": 2.068668149610385, "language_loss": 0.83156645, "learning_rate": 1.8809372990773476e-07, "loss": 0.852449, "num_input_tokens_seen": 310334355, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35351562, "step": 14387, "time_per_iteration": 2.3287112712860107 }, { "auxiliary_loss_clip": 0.01048785, "auxiliary_loss_mlp": 0.01036989, "balance_loss_clip": 1.01487589, "balance_loss_mlp": 1.01562548, "epoch": 0.8650533593867428, "flos": 19900213009920.0, "grad_norm": 2.0065247593543645, "language_loss": 0.69954193, "learning_rate": 1.8792887476117224e-07, "loss": 0.72039968, "num_input_tokens_seen": 310352900, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.33203125, "step": 14388, "time_per_iteration": 2.349789619445801 }, { "auxiliary_loss_clip": 0.01049068, "auxiliary_loss_mlp": 0.01033099, "balance_loss_clip": 1.01295233, "balance_loss_mlp": 1.01549351, "epoch": 0.8651134826394108, "flos": 25625829404160.0, "grad_norm": 2.162655054502081, "language_loss": 0.91206992, "learning_rate": 1.877640883285283e-07, "loss": 0.93289161, "num_input_tokens_seen": 310372855, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.3359375, "step": 14389, "time_per_iteration": 2.4075024127960205 }, { "auxiliary_loss_clip": 0.01050492, "auxiliary_loss_mlp": 0.01034189, "balance_loss_clip": 1.0123502, "balance_loss_mlp": 1.01581693, "epoch": 0.8651736058920788, "flos": 18733520373120.0, "grad_norm": 1.4856391960708895, "language_loss": 0.72011459, "learning_rate": 1.8759937061605212e-07, "loss": 0.74096143, "num_input_tokens_seen": 310391595, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34570312, "step": 14390, "time_per_iteration": 2.369751453399658 }, { "auxiliary_loss_clip": 0.01052255, "auxiliary_loss_mlp": 0.01043781, "balance_loss_clip": 1.02074957, "balance_loss_mlp": 1.01562285, "epoch": 0.8652337291447467, "flos": 20775729974400.0, "grad_norm": 1.730005687610173, "language_loss": 0.82822776, "learning_rate": 1.8743472162998941e-07, "loss": 0.84918809, "num_input_tokens_seen": 310410090, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.3671875, "step": 14391, "time_per_iteration": 2.3752269744873047 }, { "auxiliary_loss_clip": 0.01007018, "auxiliary_loss_mlp": 0.01003983, "balance_loss_clip": 1.00182509, "balance_loss_mlp": 1.00064504, "epoch": 0.8652938523974147, "flos": 64224719383680.0, "grad_norm": 0.8628401988850583, "language_loss": 0.68091345, "learning_rate": 1.8727014137658337e-07, "loss": 0.70102346, "num_input_tokens_seen": 310470055, "router_z_loss_clip": 0.02160645, "router_z_loss_mlp": 0.06347656, "step": 14392, "time_per_iteration": 2.9022059440612793 }, { "auxiliary_loss_clip": 0.01053554, "auxiliary_loss_mlp": 0.01036345, "balance_loss_clip": 1.01194334, "balance_loss_mlp": 1.01569998, "epoch": 0.8653539756500827, "flos": 18039621634560.0, "grad_norm": 2.071500595190967, "language_loss": 0.76634955, "learning_rate": 1.8710562986207523e-07, "loss": 0.78724855, "num_input_tokens_seen": 310487665, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37890625, "step": 14393, "time_per_iteration": 2.3607017993927 }, { "auxiliary_loss_clip": 0.01051799, "auxiliary_loss_mlp": 0.01041451, "balance_loss_clip": 1.01790738, "balance_loss_mlp": 1.01525664, "epoch": 0.8654140989027507, "flos": 17381508906240.0, "grad_norm": 1.8099869682145857, "language_loss": 0.76135468, "learning_rate": 1.8694118709270357e-07, "loss": 0.78228718, "num_input_tokens_seen": 310506130, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36523438, "step": 14394, "time_per_iteration": 3.7599117755889893 }, { "auxiliary_loss_clip": 0.01053142, "auxiliary_loss_mlp": 0.01037205, "balance_loss_clip": 1.01269603, "balance_loss_mlp": 1.01582539, "epoch": 0.8654742221554186, "flos": 53282941153920.0, "grad_norm": 2.060777797731929, "language_loss": 0.67130232, "learning_rate": 1.867768130747036e-07, "loss": 0.69220579, "num_input_tokens_seen": 310532445, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.37304688, "step": 14395, "time_per_iteration": 2.67030668258667 }, { "auxiliary_loss_clip": 0.01052287, "auxiliary_loss_mlp": 0.01033914, "balance_loss_clip": 1.01103795, "balance_loss_mlp": 1.01616418, "epoch": 0.8655343454080866, "flos": 23913583862400.0, "grad_norm": 1.82963079221799, "language_loss": 0.68918121, "learning_rate": 1.8661250781430838e-07, "loss": 0.71004319, "num_input_tokens_seen": 310552300, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36132812, "step": 14396, "time_per_iteration": 2.3892412185668945 }, { "auxiliary_loss_clip": 0.01053584, "auxiliary_loss_mlp": 0.01038838, "balance_loss_clip": 1.01550865, "balance_loss_mlp": 1.01651955, "epoch": 0.8655944686607545, "flos": 24096074872320.0, "grad_norm": 2.495892751588844, "language_loss": 0.70968735, "learning_rate": 1.8644827131774954e-07, "loss": 0.73061156, "num_input_tokens_seen": 310572710, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.37109375, "step": 14397, "time_per_iteration": 2.380506753921509 }, { "auxiliary_loss_clip": 0.01050429, "auxiliary_loss_mlp": 0.01040395, "balance_loss_clip": 1.01767373, "balance_loss_mlp": 1.01532412, "epoch": 0.8656545919134225, "flos": 23111593954560.0, "grad_norm": 1.6536543215745072, "language_loss": 0.6430977, "learning_rate": 1.86284103591253e-07, "loss": 0.66400599, "num_input_tokens_seen": 310592460, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.3515625, "step": 14398, "time_per_iteration": 2.3721985816955566 }, { "auxiliary_loss_clip": 0.01051443, "auxiliary_loss_mlp": 0.01035472, "balance_loss_clip": 1.0132513, "balance_loss_mlp": 1.01669979, "epoch": 0.8657147151660904, "flos": 21140711994240.0, "grad_norm": 2.119871165372035, "language_loss": 0.77818179, "learning_rate": 1.8612000464104517e-07, "loss": 0.79905093, "num_input_tokens_seen": 310609375, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34765625, "step": 14399, "time_per_iteration": 2.361816644668579 }, { "auxiliary_loss_clip": 0.01050696, "auxiliary_loss_mlp": 0.0103529, "balance_loss_clip": 1.01341522, "balance_loss_mlp": 1.01528633, "epoch": 0.8657748384187585, "flos": 16288517882880.0, "grad_norm": 1.8588538403020105, "language_loss": 0.94026971, "learning_rate": 1.8595597447334855e-07, "loss": 0.96112955, "num_input_tokens_seen": 310627405, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.35351562, "step": 14400, "time_per_iteration": 2.341435670852661 }, { "auxiliary_loss_clip": 0.01053545, "auxiliary_loss_mlp": 0.01039816, "balance_loss_clip": 1.01785803, "balance_loss_mlp": 1.01708591, "epoch": 0.8658349616714264, "flos": 30842656421760.0, "grad_norm": 3.068645084377434, "language_loss": 0.68208051, "learning_rate": 1.8579201309438353e-07, "loss": 0.70301414, "num_input_tokens_seen": 310649945, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.36523438, "step": 14401, "time_per_iteration": 2.447643280029297 }, { "auxiliary_loss_clip": 0.01052571, "auxiliary_loss_mlp": 0.01036425, "balance_loss_clip": 1.01203465, "balance_loss_mlp": 1.01624489, "epoch": 0.8658950849240944, "flos": 18951867216000.0, "grad_norm": 7.024235610908877, "language_loss": 0.7531473, "learning_rate": 1.8562812051036714e-07, "loss": 0.77403724, "num_input_tokens_seen": 310668285, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36328125, "step": 14402, "time_per_iteration": 2.4042270183563232 }, { "auxiliary_loss_clip": 0.01050233, "auxiliary_loss_mlp": 0.01037114, "balance_loss_clip": 1.01482224, "balance_loss_mlp": 1.01544046, "epoch": 0.8659552081767624, "flos": 23363317923840.0, "grad_norm": 1.705171018002368, "language_loss": 0.76789141, "learning_rate": 1.8546429672751397e-07, "loss": 0.78876483, "num_input_tokens_seen": 310687015, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34765625, "step": 14403, "time_per_iteration": 2.377317190170288 }, { "auxiliary_loss_clip": 0.01053144, "auxiliary_loss_mlp": 0.01034424, "balance_loss_clip": 1.00961649, "balance_loss_mlp": 1.0163691, "epoch": 0.8660153314294303, "flos": 23840859767040.0, "grad_norm": 1.794015014646919, "language_loss": 0.7434516, "learning_rate": 1.853005417520368e-07, "loss": 0.76432729, "num_input_tokens_seen": 310707580, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3671875, "step": 14404, "time_per_iteration": 2.4152848720550537 }, { "auxiliary_loss_clip": 0.01051017, "auxiliary_loss_mlp": 0.01034593, "balance_loss_clip": 1.01186001, "balance_loss_mlp": 1.01630807, "epoch": 0.8660754546820983, "flos": 23111349575040.0, "grad_norm": 1.6507702660645553, "language_loss": 0.72616911, "learning_rate": 1.851368555901447e-07, "loss": 0.74702525, "num_input_tokens_seen": 310727300, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34765625, "step": 14405, "time_per_iteration": 2.360077381134033 }, { "auxiliary_loss_clip": 0.01052882, "auxiliary_loss_mlp": 0.01041312, "balance_loss_clip": 1.01730371, "balance_loss_mlp": 1.01667666, "epoch": 0.8661355779347663, "flos": 14391128067840.0, "grad_norm": 2.2623503851460907, "language_loss": 0.67973387, "learning_rate": 1.8497323824804467e-07, "loss": 0.70067585, "num_input_tokens_seen": 310744935, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36132812, "step": 14406, "time_per_iteration": 2.3535852432250977 }, { "auxiliary_loss_clip": 0.0105086, "auxiliary_loss_mlp": 0.01032283, "balance_loss_clip": 1.01113546, "balance_loss_mlp": 1.01559234, "epoch": 0.8661957011874343, "flos": 21869105022720.0, "grad_norm": 1.7634541150599348, "language_loss": 0.84436381, "learning_rate": 1.8480968973194177e-07, "loss": 0.86519527, "num_input_tokens_seen": 310765085, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.35351562, "step": 14407, "time_per_iteration": 2.367490768432617 }, { "auxiliary_loss_clip": 0.01051043, "auxiliary_loss_mlp": 0.01040121, "balance_loss_clip": 1.01682734, "balance_loss_mlp": 1.01588809, "epoch": 0.8662558244401022, "flos": 21834087062400.0, "grad_norm": 1.8713919160069212, "language_loss": 0.71585619, "learning_rate": 1.8464621004803748e-07, "loss": 0.73676789, "num_input_tokens_seen": 310783260, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3515625, "step": 14408, "time_per_iteration": 2.379157066345215 }, { "auxiliary_loss_clip": 0.01048091, "auxiliary_loss_mlp": 0.01037957, "balance_loss_clip": 1.01684475, "balance_loss_mlp": 1.01480699, "epoch": 0.8663159476927702, "flos": 17383149740160.0, "grad_norm": 4.607523568653544, "language_loss": 0.78163075, "learning_rate": 1.844827992025304e-07, "loss": 0.80249131, "num_input_tokens_seen": 310801970, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.33203125, "step": 14409, "time_per_iteration": 2.3353817462921143 }, { "auxiliary_loss_clip": 0.0105469, "auxiliary_loss_mlp": 0.01036189, "balance_loss_clip": 1.01198912, "balance_loss_mlp": 1.01688671, "epoch": 0.8663760709454381, "flos": 22746611934720.0, "grad_norm": 1.7686131202826405, "language_loss": 0.7830891, "learning_rate": 1.8431945720161757e-07, "loss": 0.80399787, "num_input_tokens_seen": 310822070, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37890625, "step": 14410, "time_per_iteration": 2.3880598545074463 }, { "auxiliary_loss_clip": 0.01051383, "auxiliary_loss_mlp": 0.01041748, "balance_loss_clip": 1.01840699, "balance_loss_mlp": 1.01571, "epoch": 0.8664361941981061, "flos": 17376097645440.0, "grad_norm": 3.058786885765909, "language_loss": 0.78603214, "learning_rate": 1.8415618405149315e-07, "loss": 0.80696344, "num_input_tokens_seen": 310838355, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.35546875, "step": 14411, "time_per_iteration": 3.53472638130188 }, { "auxiliary_loss_clip": 0.01049385, "auxiliary_loss_mlp": 0.01037373, "balance_loss_clip": 1.01666641, "balance_loss_mlp": 1.01530814, "epoch": 0.866496317450774, "flos": 16033512245760.0, "grad_norm": 1.6339132344309752, "language_loss": 0.74860048, "learning_rate": 1.8399297975834794e-07, "loss": 0.76946801, "num_input_tokens_seen": 310856055, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.34179688, "step": 14412, "time_per_iteration": 2.3892323970794678 }, { "auxiliary_loss_clip": 0.01047502, "auxiliary_loss_mlp": 0.01029304, "balance_loss_clip": 1.01021862, "balance_loss_mlp": 1.0145421, "epoch": 0.8665564407034421, "flos": 20813750311680.0, "grad_norm": 1.6685640436507865, "language_loss": 0.70674813, "learning_rate": 1.83829844328371e-07, "loss": 0.72751617, "num_input_tokens_seen": 310876695, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.33007812, "step": 14413, "time_per_iteration": 2.367943286895752 }, { "auxiliary_loss_clip": 0.01051068, "auxiliary_loss_mlp": 0.01039873, "balance_loss_clip": 1.01747358, "balance_loss_mlp": 1.01612329, "epoch": 0.86661656395611, "flos": 15814257707520.0, "grad_norm": 2.022487475375983, "language_loss": 0.63735592, "learning_rate": 1.8366677776774874e-07, "loss": 0.65826535, "num_input_tokens_seen": 310893880, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34960938, "step": 14414, "time_per_iteration": 2.4305784702301025 }, { "auxiliary_loss_clip": 0.01052466, "auxiliary_loss_mlp": 0.01039151, "balance_loss_clip": 1.01721621, "balance_loss_mlp": 1.01660895, "epoch": 0.866676687208778, "flos": 23035867482240.0, "grad_norm": 1.7742663599636839, "language_loss": 0.64442754, "learning_rate": 1.8350378008266377e-07, "loss": 0.66534376, "num_input_tokens_seen": 310914145, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.359375, "step": 14415, "time_per_iteration": 2.392693519592285 }, { "auxiliary_loss_clip": 0.01007711, "auxiliary_loss_mlp": 0.01002493, "balance_loss_clip": 1.00047863, "balance_loss_mlp": 1.00120735, "epoch": 0.866736810461446, "flos": 63798778396800.0, "grad_norm": 0.7952035184378787, "language_loss": 0.60459179, "learning_rate": 1.8334085127929754e-07, "loss": 0.62469381, "num_input_tokens_seen": 310972825, "router_z_loss_clip": 0.0201416, "router_z_loss_mlp": 0.06542969, "step": 14416, "time_per_iteration": 3.1082167625427246 }, { "auxiliary_loss_clip": 0.01053715, "auxiliary_loss_mlp": 0.01038531, "balance_loss_clip": 1.0128895, "balance_loss_mlp": 1.01580429, "epoch": 0.8667969337141139, "flos": 20448314444160.0, "grad_norm": 1.6381203733142533, "language_loss": 0.76190567, "learning_rate": 1.831779913638285e-07, "loss": 0.78282809, "num_input_tokens_seen": 310992050, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37890625, "step": 14417, "time_per_iteration": 2.3641412258148193 }, { "auxiliary_loss_clip": 0.01050511, "auxiliary_loss_mlp": 0.01036029, "balance_loss_clip": 1.01327181, "balance_loss_mlp": 1.01556015, "epoch": 0.866857056966782, "flos": 21652608481920.0, "grad_norm": 1.8006581663878356, "language_loss": 0.76299393, "learning_rate": 1.830152003424319e-07, "loss": 0.78385937, "num_input_tokens_seen": 311011105, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.34960938, "step": 14418, "time_per_iteration": 2.4161219596862793 }, { "auxiliary_loss_clip": 0.01049106, "auxiliary_loss_mlp": 0.01038762, "balance_loss_clip": 1.01703048, "balance_loss_mlp": 1.0142715, "epoch": 0.8669171802194499, "flos": 22851840372480.0, "grad_norm": 1.4525876246499663, "language_loss": 0.68668652, "learning_rate": 1.8285247822128126e-07, "loss": 0.70756519, "num_input_tokens_seen": 311032080, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34765625, "step": 14419, "time_per_iteration": 3.8454902172088623 }, { "auxiliary_loss_clip": 0.01051181, "auxiliary_loss_mlp": 0.01035973, "balance_loss_clip": 1.01372814, "balance_loss_mlp": 1.01570344, "epoch": 0.8669773034721179, "flos": 18733171259520.0, "grad_norm": 1.658256954937257, "language_loss": 0.79460526, "learning_rate": 1.826898250065465e-07, "loss": 0.81547678, "num_input_tokens_seen": 311049735, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35546875, "step": 14420, "time_per_iteration": 2.363663911819458 }, { "auxiliary_loss_clip": 0.01050323, "auxiliary_loss_mlp": 0.01038006, "balance_loss_clip": 1.01623821, "balance_loss_mlp": 1.01631415, "epoch": 0.8670374267247858, "flos": 18915033864960.0, "grad_norm": 1.4537718602653396, "language_loss": 0.83897996, "learning_rate": 1.8252724070439586e-07, "loss": 0.85986316, "num_input_tokens_seen": 311067675, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.33984375, "step": 14421, "time_per_iteration": 3.7353038787841797 }, { "auxiliary_loss_clip": 0.01007271, "auxiliary_loss_mlp": 0.01003496, "balance_loss_clip": 1.0013268, "balance_loss_mlp": 1.00086844, "epoch": 0.8670975499774538, "flos": 48811227840000.0, "grad_norm": 0.7863018082135329, "language_loss": 0.49235007, "learning_rate": 1.823647253209941e-07, "loss": 0.51245773, "num_input_tokens_seen": 311126605, "router_z_loss_clip": 0.02172852, "router_z_loss_mlp": 0.06396484, "step": 14422, "time_per_iteration": 3.04862642288208 }, { "auxiliary_loss_clip": 0.01050754, "auxiliary_loss_mlp": 0.01038854, "balance_loss_clip": 1.01716971, "balance_loss_mlp": 1.01593828, "epoch": 0.8671576732301217, "flos": 26135072628480.0, "grad_norm": 2.1788932053276024, "language_loss": 0.74472761, "learning_rate": 1.8220227886250417e-07, "loss": 0.76562369, "num_input_tokens_seen": 311147325, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34765625, "step": 14423, "time_per_iteration": 2.4050352573394775 }, { "auxiliary_loss_clip": 0.01048005, "auxiliary_loss_mlp": 0.01030166, "balance_loss_clip": 1.01052046, "balance_loss_mlp": 1.0144403, "epoch": 0.8672177964827897, "flos": 18366513494400.0, "grad_norm": 1.58939400368069, "language_loss": 0.77634406, "learning_rate": 1.8203990133508684e-07, "loss": 0.79712582, "num_input_tokens_seen": 311165385, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.3359375, "step": 14424, "time_per_iteration": 2.3670294284820557 }, { "auxiliary_loss_clip": 0.01048521, "auxiliary_loss_mlp": 0.01036068, "balance_loss_clip": 1.01490867, "balance_loss_mlp": 1.01528001, "epoch": 0.8672779197354576, "flos": 28544184374400.0, "grad_norm": 1.7764334806323139, "language_loss": 0.7202909, "learning_rate": 1.8187759274489767e-07, "loss": 0.74113679, "num_input_tokens_seen": 311185860, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.33203125, "step": 14425, "time_per_iteration": 2.439204216003418 }, { "auxiliary_loss_clip": 0.0105399, "auxiliary_loss_mlp": 0.01041607, "balance_loss_clip": 1.01708591, "balance_loss_mlp": 1.01685369, "epoch": 0.8673380429881257, "flos": 22381385535360.0, "grad_norm": 1.4706355324244202, "language_loss": 0.6903196, "learning_rate": 1.817153530980926e-07, "loss": 0.71127558, "num_input_tokens_seen": 311205810, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.37109375, "step": 14426, "time_per_iteration": 2.4000675678253174 }, { "auxiliary_loss_clip": 0.01053567, "auxiliary_loss_mlp": 0.01036282, "balance_loss_clip": 1.01230907, "balance_loss_mlp": 1.01638472, "epoch": 0.8673981662407936, "flos": 20995368537600.0, "grad_norm": 1.720471478332221, "language_loss": 0.71173418, "learning_rate": 1.815531824008234e-07, "loss": 0.73263264, "num_input_tokens_seen": 311226080, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37304688, "step": 14427, "time_per_iteration": 2.3671512603759766 }, { "auxiliary_loss_clip": 0.01052061, "auxiliary_loss_mlp": 0.01034807, "balance_loss_clip": 1.01237226, "balance_loss_mlp": 1.01729631, "epoch": 0.8674582894934616, "flos": 24425619995520.0, "grad_norm": 1.5745276244735862, "language_loss": 0.6892997, "learning_rate": 1.8139108065924004e-07, "loss": 0.71016836, "num_input_tokens_seen": 311246380, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34765625, "step": 14428, "time_per_iteration": 2.4152557849884033 }, { "auxiliary_loss_clip": 0.01051072, "auxiliary_loss_mlp": 0.01036357, "balance_loss_clip": 1.01563859, "balance_loss_mlp": 1.01599622, "epoch": 0.8675184127461296, "flos": 20736557562240.0, "grad_norm": 2.7347542750204332, "language_loss": 0.71543837, "learning_rate": 1.812290478794889e-07, "loss": 0.73631263, "num_input_tokens_seen": 311266465, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.3515625, "step": 14429, "time_per_iteration": 2.38466477394104 }, { "auxiliary_loss_clip": 0.01051669, "auxiliary_loss_mlp": 0.01035094, "balance_loss_clip": 1.01168108, "balance_loss_mlp": 1.01558828, "epoch": 0.8675785359987975, "flos": 19134637516800.0, "grad_norm": 2.0855148343500876, "language_loss": 0.68227589, "learning_rate": 1.810670840677151e-07, "loss": 0.70314348, "num_input_tokens_seen": 311285075, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36132812, "step": 14430, "time_per_iteration": 2.348679542541504 }, { "auxiliary_loss_clip": 0.01053568, "auxiliary_loss_mlp": 0.01041265, "balance_loss_clip": 1.01724386, "balance_loss_mlp": 1.01619172, "epoch": 0.8676386592514655, "flos": 22709569115520.0, "grad_norm": 1.8197723765073757, "language_loss": 0.70474076, "learning_rate": 1.8090518923005948e-07, "loss": 0.72568905, "num_input_tokens_seen": 311303230, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37304688, "step": 14431, "time_per_iteration": 2.3773789405822754 }, { "auxiliary_loss_clip": 0.01052522, "auxiliary_loss_mlp": 0.01035273, "balance_loss_clip": 1.01276672, "balance_loss_mlp": 1.01659822, "epoch": 0.8676987825041335, "flos": 14208986171520.0, "grad_norm": 2.851282654389925, "language_loss": 0.64794946, "learning_rate": 1.8074336337266116e-07, "loss": 0.66882741, "num_input_tokens_seen": 311318070, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.359375, "step": 14432, "time_per_iteration": 2.3374879360198975 }, { "auxiliary_loss_clip": 0.01052054, "auxiliary_loss_mlp": 0.01036839, "balance_loss_clip": 1.01378429, "balance_loss_mlp": 1.01602423, "epoch": 0.8677589057568015, "flos": 13589068337280.0, "grad_norm": 2.072126781277754, "language_loss": 0.79673481, "learning_rate": 1.8058160650165656e-07, "loss": 0.81762373, "num_input_tokens_seen": 311334885, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.359375, "step": 14433, "time_per_iteration": 2.392008066177368 }, { "auxiliary_loss_clip": 0.010079, "auxiliary_loss_mlp": 0.01002924, "balance_loss_clip": 1.00090969, "balance_loss_mlp": 1.00143969, "epoch": 0.8678190290094694, "flos": 68930383052160.0, "grad_norm": 0.7064654112680087, "language_loss": 0.58629304, "learning_rate": 1.804199186231805e-07, "loss": 0.60640132, "num_input_tokens_seen": 311399780, "router_z_loss_clip": 0.0201416, "router_z_loss_mlp": 0.06445312, "step": 14434, "time_per_iteration": 4.567859649658203 }, { "auxiliary_loss_clip": 0.01049316, "auxiliary_loss_mlp": 0.01032011, "balance_loss_clip": 1.01193571, "balance_loss_mlp": 1.01519513, "epoch": 0.8678791522621374, "flos": 32556472974720.0, "grad_norm": 1.7541212996513071, "language_loss": 0.80602515, "learning_rate": 1.802582997433628e-07, "loss": 0.82683849, "num_input_tokens_seen": 311419610, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.33984375, "step": 14435, "time_per_iteration": 2.4611613750457764 }, { "auxiliary_loss_clip": 0.01050317, "auxiliary_loss_mlp": 0.0103664, "balance_loss_clip": 1.01205921, "balance_loss_mlp": 1.01459384, "epoch": 0.8679392755148053, "flos": 35041206458880.0, "grad_norm": 1.9115427732680266, "language_loss": 0.63658369, "learning_rate": 1.8009674986833322e-07, "loss": 0.6574533, "num_input_tokens_seen": 311440045, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.35742188, "step": 14436, "time_per_iteration": 2.4748575687408447 }, { "auxiliary_loss_clip": 0.01051616, "auxiliary_loss_mlp": 0.01037028, "balance_loss_clip": 1.01267338, "balance_loss_mlp": 1.01567459, "epoch": 0.8679993987674733, "flos": 18551483210880.0, "grad_norm": 1.9427905691954463, "language_loss": 0.71655893, "learning_rate": 1.7993526900421706e-07, "loss": 0.73744535, "num_input_tokens_seen": 311456660, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.359375, "step": 14437, "time_per_iteration": 2.3624138832092285 }, { "auxiliary_loss_clip": 0.01051034, "auxiliary_loss_mlp": 0.01034534, "balance_loss_clip": 1.01196766, "balance_loss_mlp": 1.01595938, "epoch": 0.8680595220201412, "flos": 27453148387200.0, "grad_norm": 1.9693064498905066, "language_loss": 0.82057613, "learning_rate": 1.797738571571381e-07, "loss": 0.8414318, "num_input_tokens_seen": 311475460, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34960938, "step": 14438, "time_per_iteration": 2.412100076675415 }, { "auxiliary_loss_clip": 0.0104801, "auxiliary_loss_mlp": 0.0103382, "balance_loss_clip": 1.012362, "balance_loss_mlp": 1.01486444, "epoch": 0.8681196452728093, "flos": 19207780548480.0, "grad_norm": 1.8655223814872073, "language_loss": 0.68529022, "learning_rate": 1.7961251433321656e-07, "loss": 0.70610851, "num_input_tokens_seen": 311494575, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.33203125, "step": 14439, "time_per_iteration": 2.405155897140503 }, { "auxiliary_loss_clip": 0.01049645, "auxiliary_loss_mlp": 0.01037607, "balance_loss_clip": 1.01780617, "balance_loss_mlp": 1.0155679, "epoch": 0.8681797685254772, "flos": 37558933044480.0, "grad_norm": 1.579482142597671, "language_loss": 0.65052712, "learning_rate": 1.7945124053857085e-07, "loss": 0.67139959, "num_input_tokens_seen": 311515805, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.33984375, "step": 14440, "time_per_iteration": 2.5525436401367188 }, { "auxiliary_loss_clip": 0.01048789, "auxiliary_loss_mlp": 0.01034634, "balance_loss_clip": 1.01345026, "balance_loss_mlp": 1.01561809, "epoch": 0.8682398917781452, "flos": 23288952994560.0, "grad_norm": 1.5864778673107185, "language_loss": 0.66462296, "learning_rate": 1.7929003577931722e-07, "loss": 0.68545723, "num_input_tokens_seen": 311536000, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.33203125, "step": 14441, "time_per_iteration": 2.414846181869507 }, { "auxiliary_loss_clip": 0.01048139, "auxiliary_loss_mlp": 0.01030708, "balance_loss_clip": 1.01035953, "balance_loss_mlp": 1.01475143, "epoch": 0.8683000150308132, "flos": 21871688463360.0, "grad_norm": 1.5129986953260706, "language_loss": 0.66906655, "learning_rate": 1.7912890006156722e-07, "loss": 0.6898551, "num_input_tokens_seen": 311556220, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.33398438, "step": 14442, "time_per_iteration": 2.3914287090301514 }, { "auxiliary_loss_clip": 0.01052985, "auxiliary_loss_mlp": 0.01034555, "balance_loss_clip": 1.00887752, "balance_loss_mlp": 1.01600635, "epoch": 0.8683601382834811, "flos": 14646343173120.0, "grad_norm": 1.795210209118752, "language_loss": 0.73153412, "learning_rate": 1.7896783339143195e-07, "loss": 0.75240958, "num_input_tokens_seen": 311572530, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37109375, "step": 14443, "time_per_iteration": 2.375741481781006 }, { "auxiliary_loss_clip": 0.01051288, "auxiliary_loss_mlp": 0.01037773, "balance_loss_clip": 1.01468205, "balance_loss_mlp": 1.01554728, "epoch": 0.8684202615361492, "flos": 26358691086720.0, "grad_norm": 1.7248452203181228, "language_loss": 0.84083539, "learning_rate": 1.7880683577501877e-07, "loss": 0.86172599, "num_input_tokens_seen": 311591105, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35546875, "step": 14444, "time_per_iteration": 2.432027578353882 }, { "auxiliary_loss_clip": 0.01052421, "auxiliary_loss_mlp": 0.01036279, "balance_loss_clip": 1.01354527, "balance_loss_mlp": 1.01605988, "epoch": 0.8684803847888171, "flos": 20702970967680.0, "grad_norm": 1.9117899248915144, "language_loss": 0.78204179, "learning_rate": 1.7864590721843342e-07, "loss": 0.80292881, "num_input_tokens_seen": 311608350, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.36328125, "step": 14445, "time_per_iteration": 2.380584478378296 }, { "auxiliary_loss_clip": 0.01051294, "auxiliary_loss_mlp": 0.01034518, "balance_loss_clip": 1.01223826, "balance_loss_mlp": 1.0161109, "epoch": 0.8685405080414851, "flos": 22637019576960.0, "grad_norm": 2.0183425071375516, "language_loss": 0.68651801, "learning_rate": 1.7848504772777728e-07, "loss": 0.70737618, "num_input_tokens_seen": 311626380, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3515625, "step": 14446, "time_per_iteration": 2.3673148155212402 }, { "auxiliary_loss_clip": 0.01050737, "auxiliary_loss_mlp": 0.01033658, "balance_loss_clip": 1.01185453, "balance_loss_mlp": 1.01601982, "epoch": 0.868600631294153, "flos": 24821046587520.0, "grad_norm": 3.5811218468262975, "language_loss": 0.83564538, "learning_rate": 1.7832425730915102e-07, "loss": 0.8564893, "num_input_tokens_seen": 311644345, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.34765625, "step": 14447, "time_per_iteration": 2.401456594467163 }, { "auxiliary_loss_clip": 0.010511, "auxiliary_loss_mlp": 0.01031383, "balance_loss_clip": 1.00959206, "balance_loss_mlp": 1.01524067, "epoch": 0.868660754546821, "flos": 25112955398400.0, "grad_norm": 1.627511418538142, "language_loss": 0.74921435, "learning_rate": 1.781635359686515e-07, "loss": 0.7700392, "num_input_tokens_seen": 311663340, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.359375, "step": 14448, "time_per_iteration": 2.3894741535186768 }, { "auxiliary_loss_clip": 0.01052124, "auxiliary_loss_mlp": 0.01038251, "balance_loss_clip": 1.01475525, "balance_loss_mlp": 1.01640427, "epoch": 0.8687208777994889, "flos": 12676997312640.0, "grad_norm": 1.837328683434655, "language_loss": 0.81810379, "learning_rate": 1.7800288371237303e-07, "loss": 0.8390075, "num_input_tokens_seen": 311679860, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.35742188, "step": 14449, "time_per_iteration": 2.342402458190918 }, { "auxiliary_loss_clip": 0.01007504, "auxiliary_loss_mlp": 0.01002702, "balance_loss_clip": 1.00055635, "balance_loss_mlp": 1.00111389, "epoch": 0.8687810010521569, "flos": 65613948226560.0, "grad_norm": 0.8060752102195506, "language_loss": 0.60662937, "learning_rate": 1.7784230054640758e-07, "loss": 0.6267314, "num_input_tokens_seen": 311738135, "router_z_loss_clip": 0.02148438, "router_z_loss_mlp": 0.06396484, "step": 14450, "time_per_iteration": 4.334201335906982 }, { "auxiliary_loss_clip": 0.01052669, "auxiliary_loss_mlp": 0.01042182, "balance_loss_clip": 1.01851881, "balance_loss_mlp": 1.01640177, "epoch": 0.8688411243048249, "flos": 24242849694720.0, "grad_norm": 1.665231092104808, "language_loss": 0.77154803, "learning_rate": 1.7768178647684517e-07, "loss": 0.79249656, "num_input_tokens_seen": 311756975, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36328125, "step": 14451, "time_per_iteration": 2.3821632862091064 }, { "auxiliary_loss_clip": 0.01050451, "auxiliary_loss_mlp": 0.01037571, "balance_loss_clip": 1.01443207, "balance_loss_mlp": 1.01563513, "epoch": 0.8689012475574929, "flos": 18220890746880.0, "grad_norm": 2.3118632568996795, "language_loss": 0.73174465, "learning_rate": 1.7752134150977205e-07, "loss": 0.75262487, "num_input_tokens_seen": 311771830, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.34765625, "step": 14452, "time_per_iteration": 2.322871208190918 }, { "auxiliary_loss_clip": 0.01052745, "auxiliary_loss_mlp": 0.01037576, "balance_loss_clip": 1.01336479, "balance_loss_mlp": 1.0163703, "epoch": 0.8689613708101608, "flos": 19645696131840.0, "grad_norm": 3.2578683686783396, "language_loss": 0.73253429, "learning_rate": 1.7736096565127201e-07, "loss": 0.75343752, "num_input_tokens_seen": 311790130, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36328125, "step": 14453, "time_per_iteration": 2.3865013122558594 }, { "auxiliary_loss_clip": 0.01051441, "auxiliary_loss_mlp": 0.01037171, "balance_loss_clip": 1.0145694, "balance_loss_mlp": 1.01679325, "epoch": 0.8690214940628288, "flos": 11727953291520.0, "grad_norm": 2.113175952742342, "language_loss": 0.75225008, "learning_rate": 1.7720065890742664e-07, "loss": 0.77313614, "num_input_tokens_seen": 311808360, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34765625, "step": 14454, "time_per_iteration": 2.3500101566314697 }, { "auxiliary_loss_clip": 0.01051902, "auxiliary_loss_mlp": 0.01034433, "balance_loss_clip": 1.0109725, "balance_loss_mlp": 1.01703608, "epoch": 0.8690816173154968, "flos": 34934930680320.0, "grad_norm": 1.7912316901110124, "language_loss": 0.60556537, "learning_rate": 1.7704042128431552e-07, "loss": 0.62642872, "num_input_tokens_seen": 311831325, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.34765625, "step": 14455, "time_per_iteration": 2.507681131362915 }, { "auxiliary_loss_clip": 0.01051523, "auxiliary_loss_mlp": 0.01034173, "balance_loss_clip": 1.01244092, "balance_loss_mlp": 1.01581407, "epoch": 0.8691417405681647, "flos": 11614136659200.0, "grad_norm": 2.450685008244239, "language_loss": 0.81517994, "learning_rate": 1.7688025278801378e-07, "loss": 0.83603692, "num_input_tokens_seen": 311848090, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.35742188, "step": 14456, "time_per_iteration": 2.4141736030578613 }, { "auxiliary_loss_clip": 0.01054681, "auxiliary_loss_mlp": 0.01038807, "balance_loss_clip": 1.01280785, "balance_loss_mlp": 1.01712883, "epoch": 0.8692018638208328, "flos": 24606889107840.0, "grad_norm": 2.0119713695527803, "language_loss": 0.76657146, "learning_rate": 1.7672015342459568e-07, "loss": 0.7875064, "num_input_tokens_seen": 311867855, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.375, "step": 14457, "time_per_iteration": 2.406972646713257 }, { "auxiliary_loss_clip": 0.01049226, "auxiliary_loss_mlp": 0.01037545, "balance_loss_clip": 1.01735115, "balance_loss_mlp": 1.01541305, "epoch": 0.8692619870735007, "flos": 25993918535040.0, "grad_norm": 1.6818097507220557, "language_loss": 0.79564953, "learning_rate": 1.765601232001328e-07, "loss": 0.81651723, "num_input_tokens_seen": 311888675, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.33789062, "step": 14458, "time_per_iteration": 2.412250280380249 }, { "auxiliary_loss_clip": 0.01050666, "auxiliary_loss_mlp": 0.01039094, "balance_loss_clip": 1.01395321, "balance_loss_mlp": 1.01532936, "epoch": 0.8693221103261687, "flos": 18040808620800.0, "grad_norm": 1.790909351944964, "language_loss": 0.71755838, "learning_rate": 1.7640016212069187e-07, "loss": 0.73845601, "num_input_tokens_seen": 311907310, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.35351562, "step": 14459, "time_per_iteration": 3.7746572494506836 }, { "auxiliary_loss_clip": 0.01047852, "auxiliary_loss_mlp": 0.01034194, "balance_loss_clip": 1.01564503, "balance_loss_mlp": 1.0154531, "epoch": 0.8693822335788366, "flos": 27491063990400.0, "grad_norm": 1.3897437694234784, "language_loss": 0.74882376, "learning_rate": 1.762402701923398e-07, "loss": 0.76964426, "num_input_tokens_seen": 311929635, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.32421875, "step": 14460, "time_per_iteration": 3.8905177116394043 }, { "auxiliary_loss_clip": 0.01053225, "auxiliary_loss_mlp": 0.01042052, "balance_loss_clip": 1.01596904, "balance_loss_mlp": 1.01631093, "epoch": 0.8694423568315046, "flos": 24096563631360.0, "grad_norm": 1.9965178057843482, "language_loss": 0.66299862, "learning_rate": 1.7608044742113947e-07, "loss": 0.68395138, "num_input_tokens_seen": 311948800, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.36914062, "step": 14461, "time_per_iteration": 2.4059970378875732 }, { "auxiliary_loss_clip": 0.01052054, "auxiliary_loss_mlp": 0.01039752, "balance_loss_clip": 1.01539755, "balance_loss_mlp": 1.0157187, "epoch": 0.8695024800841725, "flos": 18361346613120.0, "grad_norm": 1.968614735224122, "language_loss": 0.83271682, "learning_rate": 1.7592069381315123e-07, "loss": 0.85363489, "num_input_tokens_seen": 311964090, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36328125, "step": 14462, "time_per_iteration": 2.35707950592041 }, { "auxiliary_loss_clip": 0.01051052, "auxiliary_loss_mlp": 0.01035852, "balance_loss_clip": 1.01272547, "balance_loss_mlp": 1.01529264, "epoch": 0.8695626033368405, "flos": 14026879186560.0, "grad_norm": 1.6442196783916103, "language_loss": 0.65778005, "learning_rate": 1.757610093744335e-07, "loss": 0.67864907, "num_input_tokens_seen": 311981460, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35742188, "step": 14463, "time_per_iteration": 2.347097396850586 }, { "auxiliary_loss_clip": 0.01055588, "auxiliary_loss_mlp": 0.01042197, "balance_loss_clip": 1.01670969, "balance_loss_mlp": 1.01767278, "epoch": 0.8696227265895085, "flos": 16835921089920.0, "grad_norm": 2.4967587124308617, "language_loss": 0.67551196, "learning_rate": 1.7560139411104058e-07, "loss": 0.69648981, "num_input_tokens_seen": 312000115, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37890625, "step": 14464, "time_per_iteration": 2.349292039871216 }, { "auxiliary_loss_clip": 0.01054044, "auxiliary_loss_mlp": 0.01044797, "balance_loss_clip": 1.01975131, "balance_loss_mlp": 1.0164063, "epoch": 0.8696828498421765, "flos": 21797986849920.0, "grad_norm": 2.0684708220169044, "language_loss": 0.63398784, "learning_rate": 1.7544184802902607e-07, "loss": 0.65497619, "num_input_tokens_seen": 312020770, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.375, "step": 14465, "time_per_iteration": 2.3800735473632812 }, { "auxiliary_loss_clip": 0.01047876, "auxiliary_loss_mlp": 0.01033819, "balance_loss_clip": 1.01356494, "balance_loss_mlp": 1.01501691, "epoch": 0.8697429730948444, "flos": 22893666048000.0, "grad_norm": 1.4265899809777687, "language_loss": 0.85213065, "learning_rate": 1.7528237113443934e-07, "loss": 0.87294763, "num_input_tokens_seen": 312041870, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.328125, "step": 14466, "time_per_iteration": 2.396258592605591 }, { "auxiliary_loss_clip": 0.01054522, "auxiliary_loss_mlp": 0.01045284, "balance_loss_clip": 1.0182476, "balance_loss_mlp": 1.01688814, "epoch": 0.8698030963475124, "flos": 24716306908800.0, "grad_norm": 2.989678666768689, "language_loss": 0.63806611, "learning_rate": 1.7512296343332779e-07, "loss": 0.65906417, "num_input_tokens_seen": 312058210, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.37695312, "step": 14467, "time_per_iteration": 2.3986077308654785 }, { "auxiliary_loss_clip": 0.01048712, "auxiliary_loss_mlp": 0.01033648, "balance_loss_clip": 1.0133462, "balance_loss_mlp": 1.01577568, "epoch": 0.8698632196001803, "flos": 28440876061440.0, "grad_norm": 1.3958473326239322, "language_loss": 0.69412422, "learning_rate": 1.7496362493173655e-07, "loss": 0.71494776, "num_input_tokens_seen": 312082665, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.33007812, "step": 14468, "time_per_iteration": 2.476921319961548 }, { "auxiliary_loss_clip": 0.01048431, "auxiliary_loss_mlp": 0.01041413, "balance_loss_clip": 1.01958597, "balance_loss_mlp": 1.0143137, "epoch": 0.8699233428528483, "flos": 27635220460800.0, "grad_norm": 1.5886291915105746, "language_loss": 0.7165466, "learning_rate": 1.7480435563570773e-07, "loss": 0.737445, "num_input_tokens_seen": 312101960, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34179688, "step": 14469, "time_per_iteration": 2.4104373455047607 }, { "auxiliary_loss_clip": 0.01048863, "auxiliary_loss_mlp": 0.01031926, "balance_loss_clip": 1.01146984, "balance_loss_mlp": 1.01585186, "epoch": 0.8699834661055164, "flos": 20044683682560.0, "grad_norm": 2.058547498557045, "language_loss": 0.84875453, "learning_rate": 1.7464515555128024e-07, "loss": 0.86956239, "num_input_tokens_seen": 312117125, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.328125, "step": 14470, "time_per_iteration": 2.3831186294555664 }, { "auxiliary_loss_clip": 0.01052815, "auxiliary_loss_mlp": 0.01032775, "balance_loss_clip": 1.01122212, "balance_loss_mlp": 1.01701951, "epoch": 0.8700435893581843, "flos": 23731651434240.0, "grad_norm": 1.6053008113663312, "language_loss": 0.73962528, "learning_rate": 1.7448602468449148e-07, "loss": 0.76048118, "num_input_tokens_seen": 312135775, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.35742188, "step": 14471, "time_per_iteration": 2.3714077472686768 }, { "auxiliary_loss_clip": 0.01050203, "auxiliary_loss_mlp": 0.01031544, "balance_loss_clip": 1.01129031, "balance_loss_mlp": 1.01627195, "epoch": 0.8701037126108523, "flos": 23547345033600.0, "grad_norm": 1.5205760103295891, "language_loss": 0.79933476, "learning_rate": 1.7432696304137573e-07, "loss": 0.82015228, "num_input_tokens_seen": 312156070, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.33984375, "step": 14472, "time_per_iteration": 2.422560691833496 }, { "auxiliary_loss_clip": 0.01050686, "auxiliary_loss_mlp": 0.01030085, "balance_loss_clip": 1.00781655, "balance_loss_mlp": 1.01540375, "epoch": 0.8701638358635202, "flos": 18842449415040.0, "grad_norm": 2.308038611795501, "language_loss": 0.73918849, "learning_rate": 1.741679706279644e-07, "loss": 0.75999618, "num_input_tokens_seen": 312174380, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35351562, "step": 14473, "time_per_iteration": 2.337561845779419 }, { "auxiliary_loss_clip": 0.0105229, "auxiliary_loss_mlp": 0.01038532, "balance_loss_clip": 1.01612055, "balance_loss_mlp": 1.01635146, "epoch": 0.8702239591161882, "flos": 27927094360320.0, "grad_norm": 1.4834471818587378, "language_loss": 0.73025572, "learning_rate": 1.7400904745028644e-07, "loss": 0.75116396, "num_input_tokens_seen": 312195130, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.359375, "step": 14474, "time_per_iteration": 3.856506109237671 }, { "auxiliary_loss_clip": 0.01051024, "auxiliary_loss_mlp": 0.01039389, "balance_loss_clip": 1.01518977, "balance_loss_mlp": 1.01552248, "epoch": 0.8702840823688561, "flos": 17233163072640.0, "grad_norm": 1.870445442957608, "language_loss": 0.69389832, "learning_rate": 1.7385019351436925e-07, "loss": 0.71480238, "num_input_tokens_seen": 312212300, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.35546875, "step": 14475, "time_per_iteration": 2.3503222465515137 }, { "auxiliary_loss_clip": 0.01050375, "auxiliary_loss_mlp": 0.01036786, "balance_loss_clip": 1.01314712, "balance_loss_mlp": 1.01456714, "epoch": 0.8703442056215241, "flos": 19426546327680.0, "grad_norm": 1.7174655084413035, "language_loss": 0.78662527, "learning_rate": 1.736914088262349e-07, "loss": 0.80749685, "num_input_tokens_seen": 312231735, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35742188, "step": 14476, "time_per_iteration": 2.4334428310394287 }, { "auxiliary_loss_clip": 0.01050059, "auxiliary_loss_mlp": 0.01033409, "balance_loss_clip": 1.01166487, "balance_loss_mlp": 1.01559615, "epoch": 0.8704043288741921, "flos": 22272735784320.0, "grad_norm": 1.4319934102124585, "language_loss": 0.72982097, "learning_rate": 1.7353269339190525e-07, "loss": 0.75065565, "num_input_tokens_seen": 312253060, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34375, "step": 14477, "time_per_iteration": 2.4092743396759033 }, { "auxiliary_loss_clip": 0.01050825, "auxiliary_loss_mlp": 0.01037009, "balance_loss_clip": 1.01382291, "balance_loss_mlp": 1.01538324, "epoch": 0.8704644521268601, "flos": 16647948996480.0, "grad_norm": 2.05573675346676, "language_loss": 0.6060279, "learning_rate": 1.7337404721739946e-07, "loss": 0.62690622, "num_input_tokens_seen": 312269460, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.35351562, "step": 14478, "time_per_iteration": 2.360492706298828 }, { "auxiliary_loss_clip": 0.01051745, "auxiliary_loss_mlp": 0.01041353, "balance_loss_clip": 1.02069426, "balance_loss_mlp": 1.01816034, "epoch": 0.870524575379528, "flos": 24279054641280.0, "grad_norm": 1.6339579079715436, "language_loss": 0.73044217, "learning_rate": 1.732154703087323e-07, "loss": 0.75137311, "num_input_tokens_seen": 312289830, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.3359375, "step": 14479, "time_per_iteration": 2.394862651824951 }, { "auxiliary_loss_clip": 0.01050443, "auxiliary_loss_mlp": 0.01038188, "balance_loss_clip": 1.0138334, "balance_loss_mlp": 1.015728, "epoch": 0.870584698632196, "flos": 28767383896320.0, "grad_norm": 1.5170201079056684, "language_loss": 0.72456378, "learning_rate": 1.7305696267191805e-07, "loss": 0.74545008, "num_input_tokens_seen": 312311320, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.34765625, "step": 14480, "time_per_iteration": 2.482680082321167 }, { "auxiliary_loss_clip": 0.01052145, "auxiliary_loss_mlp": 0.01040195, "balance_loss_clip": 1.01733124, "balance_loss_mlp": 1.01638448, "epoch": 0.8706448218848639, "flos": 32448346894080.0, "grad_norm": 2.5486233998507024, "language_loss": 0.71230781, "learning_rate": 1.728985243129666e-07, "loss": 0.73323119, "num_input_tokens_seen": 312332095, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35742188, "step": 14481, "time_per_iteration": 2.445005416870117 }, { "auxiliary_loss_clip": 0.01050126, "auxiliary_loss_mlp": 0.01034617, "balance_loss_clip": 1.0129329, "balance_loss_mlp": 1.01566386, "epoch": 0.8707049451375319, "flos": 22746891225600.0, "grad_norm": 1.6071471212135056, "language_loss": 0.77790958, "learning_rate": 1.7274015523788643e-07, "loss": 0.79875696, "num_input_tokens_seen": 312351225, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34375, "step": 14482, "time_per_iteration": 2.3997795581817627 }, { "auxiliary_loss_clip": 0.0105143, "auxiliary_loss_mlp": 0.01034731, "balance_loss_clip": 1.01248693, "balance_loss_mlp": 1.01622081, "epoch": 0.8707650683902, "flos": 15851056147200.0, "grad_norm": 1.677502106281115, "language_loss": 0.77432203, "learning_rate": 1.7258185545268234e-07, "loss": 0.79518366, "num_input_tokens_seen": 312369730, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3515625, "step": 14483, "time_per_iteration": 2.341221570968628 }, { "auxiliary_loss_clip": 0.01055598, "auxiliary_loss_mlp": 0.01041658, "balance_loss_clip": 1.01627827, "balance_loss_mlp": 1.017133, "epoch": 0.8708251916428679, "flos": 16467308288640.0, "grad_norm": 2.2441600747011923, "language_loss": 0.62732434, "learning_rate": 1.7242362496335749e-07, "loss": 0.64829683, "num_input_tokens_seen": 312386780, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38476562, "step": 14484, "time_per_iteration": 2.4026832580566406 }, { "auxiliary_loss_clip": 0.01050857, "auxiliary_loss_mlp": 0.0103996, "balance_loss_clip": 1.01679778, "balance_loss_mlp": 1.01587522, "epoch": 0.8708853148955359, "flos": 15376935617280.0, "grad_norm": 1.7876971821430463, "language_loss": 0.69705451, "learning_rate": 1.7226546377591222e-07, "loss": 0.71796262, "num_input_tokens_seen": 312404875, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.34960938, "step": 14485, "time_per_iteration": 2.343864679336548 }, { "auxiliary_loss_clip": 0.01050309, "auxiliary_loss_mlp": 0.01036713, "balance_loss_clip": 1.01436138, "balance_loss_mlp": 1.01544642, "epoch": 0.8709454381482038, "flos": 30550119206400.0, "grad_norm": 1.8368487178134811, "language_loss": 0.64403749, "learning_rate": 1.7210737189634373e-07, "loss": 0.66490769, "num_input_tokens_seen": 312425280, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34765625, "step": 14486, "time_per_iteration": 2.4628937244415283 }, { "auxiliary_loss_clip": 0.0105227, "auxiliary_loss_mlp": 0.01039272, "balance_loss_clip": 1.01523948, "balance_loss_mlp": 1.01526737, "epoch": 0.8710055614008718, "flos": 22600325871360.0, "grad_norm": 5.221181795989689, "language_loss": 0.63132209, "learning_rate": 1.7194934933064653e-07, "loss": 0.65223753, "num_input_tokens_seen": 312443835, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37109375, "step": 14487, "time_per_iteration": 2.3619534969329834 }, { "auxiliary_loss_clip": 0.01049032, "auxiliary_loss_mlp": 0.01033058, "balance_loss_clip": 1.01243448, "balance_loss_mlp": 1.01481378, "epoch": 0.8710656846535397, "flos": 18442135232640.0, "grad_norm": 2.052719560672066, "language_loss": 0.69268131, "learning_rate": 1.7179139608481318e-07, "loss": 0.71350223, "num_input_tokens_seen": 312460830, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.34375, "step": 14488, "time_per_iteration": 2.3762807846069336 }, { "auxiliary_loss_clip": 0.01051225, "auxiliary_loss_mlp": 0.01035347, "balance_loss_clip": 1.01322162, "balance_loss_mlp": 1.01574314, "epoch": 0.8711258079062077, "flos": 16503059387520.0, "grad_norm": 1.9807963629322713, "language_loss": 0.86101049, "learning_rate": 1.716335121648338e-07, "loss": 0.88187623, "num_input_tokens_seen": 312477575, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.35546875, "step": 14489, "time_per_iteration": 2.3453447818756104 }, { "auxiliary_loss_clip": 0.01055431, "auxiliary_loss_mlp": 0.01039864, "balance_loss_clip": 1.01506817, "balance_loss_mlp": 1.01692426, "epoch": 0.8711859311588757, "flos": 15662595294720.0, "grad_norm": 2.3086008732504024, "language_loss": 0.77794933, "learning_rate": 1.7147569757669445e-07, "loss": 0.79890233, "num_input_tokens_seen": 312492140, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38476562, "step": 14490, "time_per_iteration": 3.625624418258667 }, { "auxiliary_loss_clip": 0.01055086, "auxiliary_loss_mlp": 0.01037138, "balance_loss_clip": 1.01172256, "balance_loss_mlp": 1.01745367, "epoch": 0.8712460544115437, "flos": 15556703541120.0, "grad_norm": 2.007114664812903, "language_loss": 0.77498358, "learning_rate": 1.7131795232638012e-07, "loss": 0.79590589, "num_input_tokens_seen": 312508400, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37695312, "step": 14491, "time_per_iteration": 2.3316125869750977 }, { "auxiliary_loss_clip": 0.01052161, "auxiliary_loss_mlp": 0.01031483, "balance_loss_clip": 1.00988197, "balance_loss_mlp": 1.01814306, "epoch": 0.8713061776642116, "flos": 16762638412800.0, "grad_norm": 1.6867502573859026, "language_loss": 0.67660707, "learning_rate": 1.711602764198723e-07, "loss": 0.69744349, "num_input_tokens_seen": 312525915, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.33984375, "step": 14492, "time_per_iteration": 2.3669166564941406 }, { "auxiliary_loss_clip": 0.01050428, "auxiliary_loss_mlp": 0.01035942, "balance_loss_clip": 1.01426959, "balance_loss_mlp": 1.01611364, "epoch": 0.8713663009168796, "flos": 24278740439040.0, "grad_norm": 1.820461415491572, "language_loss": 0.70943332, "learning_rate": 1.7100266986314992e-07, "loss": 0.73029709, "num_input_tokens_seen": 312544735, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34375, "step": 14493, "time_per_iteration": 2.3752059936523438 }, { "auxiliary_loss_clip": 0.01054156, "auxiliary_loss_mlp": 0.01037813, "balance_loss_clip": 1.01300573, "balance_loss_mlp": 1.0179106, "epoch": 0.8714264241695475, "flos": 23794739994240.0, "grad_norm": 2.1534882901387653, "language_loss": 0.90536761, "learning_rate": 1.7084513266218936e-07, "loss": 0.92628729, "num_input_tokens_seen": 312557910, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36328125, "step": 14494, "time_per_iteration": 2.393378257751465 }, { "auxiliary_loss_clip": 0.01049794, "auxiliary_loss_mlp": 0.01031789, "balance_loss_clip": 1.01191652, "balance_loss_mlp": 1.0162375, "epoch": 0.8714865474222155, "flos": 37996429691520.0, "grad_norm": 1.8555462780225984, "language_loss": 0.6101315, "learning_rate": 1.7068766482296514e-07, "loss": 0.63094735, "num_input_tokens_seen": 312580360, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.3359375, "step": 14495, "time_per_iteration": 2.5097391605377197 }, { "auxiliary_loss_clip": 0.01051739, "auxiliary_loss_mlp": 0.01039252, "balance_loss_clip": 1.01685309, "balance_loss_mlp": 1.01620626, "epoch": 0.8715466706748836, "flos": 22454598389760.0, "grad_norm": 2.095277928906892, "language_loss": 0.8206116, "learning_rate": 1.7053026635144762e-07, "loss": 0.8415215, "num_input_tokens_seen": 312597550, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35546875, "step": 14496, "time_per_iteration": 2.3912484645843506 }, { "auxiliary_loss_clip": 0.01051751, "auxiliary_loss_mlp": 0.01040043, "balance_loss_clip": 1.01629639, "balance_loss_mlp": 1.01602113, "epoch": 0.8716067939275515, "flos": 21214064494080.0, "grad_norm": 1.9998890409542844, "language_loss": 0.79575229, "learning_rate": 1.7037293725360624e-07, "loss": 0.81667024, "num_input_tokens_seen": 312616435, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.35742188, "step": 14497, "time_per_iteration": 2.3593032360076904 }, { "auxiliary_loss_clip": 0.01053479, "auxiliary_loss_mlp": 0.01036271, "balance_loss_clip": 1.01292944, "balance_loss_mlp": 1.01671469, "epoch": 0.8716669171802195, "flos": 22996764892800.0, "grad_norm": 2.0306023453279702, "language_loss": 0.69066668, "learning_rate": 1.70215677535406e-07, "loss": 0.71156418, "num_input_tokens_seen": 312632770, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.3671875, "step": 14498, "time_per_iteration": 3.860302686691284 }, { "auxiliary_loss_clip": 0.01049559, "auxiliary_loss_mlp": 0.01035212, "balance_loss_clip": 1.01301527, "balance_loss_mlp": 1.01414561, "epoch": 0.8717270404328874, "flos": 29782902879360.0, "grad_norm": 1.5401694129570487, "language_loss": 0.57816005, "learning_rate": 1.700584872028108e-07, "loss": 0.59900773, "num_input_tokens_seen": 312651900, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35546875, "step": 14499, "time_per_iteration": 2.420132875442505 }, { "auxiliary_loss_clip": 0.01051102, "auxiliary_loss_mlp": 0.01035865, "balance_loss_clip": 1.01134348, "balance_loss_mlp": 1.01504695, "epoch": 0.8717871636855554, "flos": 22017031920000.0, "grad_norm": 2.0625017157527727, "language_loss": 0.81390321, "learning_rate": 1.6990136626178097e-07, "loss": 0.83477283, "num_input_tokens_seen": 312671380, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.359375, "step": 14500, "time_per_iteration": 3.6935768127441406 }, { "auxiliary_loss_clip": 0.0105087, "auxiliary_loss_mlp": 0.01034085, "balance_loss_clip": 1.01198328, "balance_loss_mlp": 1.01636529, "epoch": 0.8718472869382233, "flos": 16653325345920.0, "grad_norm": 1.9855667166709514, "language_loss": 0.74026549, "learning_rate": 1.6974431471827466e-07, "loss": 0.76111507, "num_input_tokens_seen": 312689215, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34375, "step": 14501, "time_per_iteration": 2.4110333919525146 }, { "auxiliary_loss_clip": 0.01055428, "auxiliary_loss_mlp": 0.01040539, "balance_loss_clip": 1.01618457, "balance_loss_mlp": 1.01769507, "epoch": 0.8719074101908914, "flos": 19494452655360.0, "grad_norm": 1.9311537528659417, "language_loss": 0.65361047, "learning_rate": 1.695873325782482e-07, "loss": 0.67457014, "num_input_tokens_seen": 312706400, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.37695312, "step": 14502, "time_per_iteration": 2.352646827697754 }, { "auxiliary_loss_clip": 0.01051681, "auxiliary_loss_mlp": 0.01039112, "balance_loss_clip": 1.01454318, "balance_loss_mlp": 1.01553571, "epoch": 0.8719675334435593, "flos": 33069556448640.0, "grad_norm": 1.7127206568203082, "language_loss": 0.69185269, "learning_rate": 1.6943041984765262e-07, "loss": 0.71276063, "num_input_tokens_seen": 312727985, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36132812, "step": 14503, "time_per_iteration": 2.4910473823547363 }, { "auxiliary_loss_clip": 0.01052784, "auxiliary_loss_mlp": 0.01038438, "balance_loss_clip": 1.01627719, "balance_loss_mlp": 1.01734006, "epoch": 0.8720276566962273, "flos": 13625412929280.0, "grad_norm": 4.592120890562074, "language_loss": 0.70193446, "learning_rate": 1.6927357653243912e-07, "loss": 0.72284669, "num_input_tokens_seen": 312745025, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35351562, "step": 14504, "time_per_iteration": 2.365865468978882 }, { "auxiliary_loss_clip": 0.01052583, "auxiliary_loss_mlp": 0.01034395, "balance_loss_clip": 1.01123297, "balance_loss_mlp": 1.01667786, "epoch": 0.8720877799488952, "flos": 23513025300480.0, "grad_norm": 1.7856525028519383, "language_loss": 0.71443051, "learning_rate": 1.691168026385552e-07, "loss": 0.7353003, "num_input_tokens_seen": 312764170, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.359375, "step": 14505, "time_per_iteration": 2.3962557315826416 }, { "auxiliary_loss_clip": 0.01049628, "auxiliary_loss_mlp": 0.01033784, "balance_loss_clip": 1.01190889, "balance_loss_mlp": 1.01549053, "epoch": 0.8721479032015632, "flos": 20813086995840.0, "grad_norm": 1.5191908036521902, "language_loss": 0.79223692, "learning_rate": 1.6896009817194545e-07, "loss": 0.81307101, "num_input_tokens_seen": 312783830, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34179688, "step": 14506, "time_per_iteration": 2.366384983062744 }, { "auxiliary_loss_clip": 0.01052852, "auxiliary_loss_mlp": 0.01040002, "balance_loss_clip": 1.0151825, "balance_loss_mlp": 1.01578021, "epoch": 0.8722080264542311, "flos": 19462646540160.0, "grad_norm": 2.3520222184498745, "language_loss": 0.74663454, "learning_rate": 1.6880346313855221e-07, "loss": 0.76756305, "num_input_tokens_seen": 312802015, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37109375, "step": 14507, "time_per_iteration": 2.398972511291504 }, { "auxiliary_loss_clip": 0.01054852, "auxiliary_loss_mlp": 0.01039936, "balance_loss_clip": 1.01535535, "balance_loss_mlp": 1.01757598, "epoch": 0.8722681497068991, "flos": 21760804385280.0, "grad_norm": 1.9474351656384616, "language_loss": 0.7391237, "learning_rate": 1.686468975443156e-07, "loss": 0.76007164, "num_input_tokens_seen": 312820650, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37304688, "step": 14508, "time_per_iteration": 2.3806002140045166 }, { "auxiliary_loss_clip": 0.01053356, "auxiliary_loss_mlp": 0.01040464, "balance_loss_clip": 1.01640725, "balance_loss_mlp": 1.01679063, "epoch": 0.8723282729595672, "flos": 28875859090560.0, "grad_norm": 1.7050479592082792, "language_loss": 0.69515884, "learning_rate": 1.6849040139517202e-07, "loss": 0.716097, "num_input_tokens_seen": 312841310, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36523438, "step": 14509, "time_per_iteration": 2.46449613571167 }, { "auxiliary_loss_clip": 0.010516, "auxiliary_loss_mlp": 0.01039152, "balance_loss_clip": 1.01707482, "balance_loss_mlp": 1.01544428, "epoch": 0.8723883962122351, "flos": 26467934330880.0, "grad_norm": 1.7507779816856157, "language_loss": 0.59780121, "learning_rate": 1.683339746970558e-07, "loss": 0.61870873, "num_input_tokens_seen": 312862100, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.36132812, "step": 14510, "time_per_iteration": 2.4332053661346436 }, { "auxiliary_loss_clip": 0.01055904, "auxiliary_loss_mlp": 0.01041211, "balance_loss_clip": 1.01461506, "balance_loss_mlp": 1.01713979, "epoch": 0.8724485194649031, "flos": 20520445046400.0, "grad_norm": 2.252989554385775, "language_loss": 0.69177431, "learning_rate": 1.6817761745589865e-07, "loss": 0.71274543, "num_input_tokens_seen": 312880220, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.38671875, "step": 14511, "time_per_iteration": 2.38542103767395 }, { "auxiliary_loss_clip": 0.01052701, "auxiliary_loss_mlp": 0.01039107, "balance_loss_clip": 1.01513374, "balance_loss_mlp": 1.01591074, "epoch": 0.872508642717571, "flos": 24352197672960.0, "grad_norm": 1.587789810817385, "language_loss": 0.83077025, "learning_rate": 1.6802132967763027e-07, "loss": 0.85168827, "num_input_tokens_seen": 312900765, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36914062, "step": 14512, "time_per_iteration": 2.4233531951904297 }, { "auxiliary_loss_clip": 0.01007307, "auxiliary_loss_mlp": 0.01002751, "balance_loss_clip": 1.00066495, "balance_loss_mlp": 1.00077724, "epoch": 0.872568765970239, "flos": 61407159108480.0, "grad_norm": 0.7886194555220868, "language_loss": 0.58663827, "learning_rate": 1.6786511136817617e-07, "loss": 0.60673887, "num_input_tokens_seen": 312955840, "router_z_loss_clip": 0.02087402, "router_z_loss_mlp": 0.06542969, "step": 14513, "time_per_iteration": 4.3166656494140625 }, { "auxiliary_loss_clip": 0.01051115, "auxiliary_loss_mlp": 0.01036796, "balance_loss_clip": 1.01428914, "balance_loss_mlp": 1.01566899, "epoch": 0.8726288892229069, "flos": 22597044203520.0, "grad_norm": 1.6527009067617198, "language_loss": 0.7720412, "learning_rate": 1.6770896253346112e-07, "loss": 0.79292029, "num_input_tokens_seen": 312973565, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35351562, "step": 14514, "time_per_iteration": 2.3793115615844727 }, { "auxiliary_loss_clip": 0.010541, "auxiliary_loss_mlp": 0.01036526, "balance_loss_clip": 1.01347113, "balance_loss_mlp": 1.01719427, "epoch": 0.872689012475575, "flos": 25884011975040.0, "grad_norm": 2.1200130228847054, "language_loss": 0.66979003, "learning_rate": 1.675528831794055e-07, "loss": 0.69069624, "num_input_tokens_seen": 312994660, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36914062, "step": 14515, "time_per_iteration": 2.4392335414886475 }, { "auxiliary_loss_clip": 0.01052655, "auxiliary_loss_mlp": 0.01038016, "balance_loss_clip": 1.01385236, "balance_loss_mlp": 1.01684189, "epoch": 0.8727491357282429, "flos": 21505659102720.0, "grad_norm": 5.866820584193284, "language_loss": 0.79754555, "learning_rate": 1.6739687331192842e-07, "loss": 0.81845224, "num_input_tokens_seen": 313009860, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.359375, "step": 14516, "time_per_iteration": 2.3702218532562256 }, { "auxiliary_loss_clip": 0.01054421, "auxiliary_loss_mlp": 0.01040559, "balance_loss_clip": 1.01558459, "balance_loss_mlp": 1.01682353, "epoch": 0.8728092589809109, "flos": 19206523739520.0, "grad_norm": 2.5694117224712216, "language_loss": 0.73533773, "learning_rate": 1.672409329369453e-07, "loss": 0.75628752, "num_input_tokens_seen": 313027025, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.375, "step": 14517, "time_per_iteration": 2.375077486038208 }, { "auxiliary_loss_clip": 0.0104948, "auxiliary_loss_mlp": 0.01030735, "balance_loss_clip": 1.01011229, "balance_loss_mlp": 1.01501429, "epoch": 0.8728693822335788, "flos": 20594251393920.0, "grad_norm": 1.9441985151446917, "language_loss": 0.74341333, "learning_rate": 1.6708506206036966e-07, "loss": 0.76421553, "num_input_tokens_seen": 313046830, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.34570312, "step": 14518, "time_per_iteration": 2.359325647354126 }, { "auxiliary_loss_clip": 0.01050169, "auxiliary_loss_mlp": 0.01038051, "balance_loss_clip": 1.01712942, "balance_loss_mlp": 1.01608574, "epoch": 0.8729295054862468, "flos": 21727462170240.0, "grad_norm": 1.689296842198873, "language_loss": 0.74494874, "learning_rate": 1.6692926068811275e-07, "loss": 0.76583087, "num_input_tokens_seen": 313067715, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.33984375, "step": 14519, "time_per_iteration": 2.412550449371338 }, { "auxiliary_loss_clip": 0.01053817, "auxiliary_loss_mlp": 0.01034441, "balance_loss_clip": 1.0095855, "balance_loss_mlp": 1.01669669, "epoch": 0.8729896287389147, "flos": 17672544933120.0, "grad_norm": 2.4443099882717734, "language_loss": 0.77425474, "learning_rate": 1.6677352882608142e-07, "loss": 0.79513735, "num_input_tokens_seen": 313082305, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37109375, "step": 14520, "time_per_iteration": 2.3367722034454346 }, { "auxiliary_loss_clip": 0.01053092, "auxiliary_loss_mlp": 0.01038121, "balance_loss_clip": 1.01436305, "balance_loss_mlp": 1.01602495, "epoch": 0.8730497519915827, "flos": 24570649249920.0, "grad_norm": 1.9674378611307657, "language_loss": 0.83253276, "learning_rate": 1.666178664801816e-07, "loss": 0.85344487, "num_input_tokens_seen": 313101190, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.37109375, "step": 14521, "time_per_iteration": 2.4174301624298096 }, { "auxiliary_loss_clip": 0.01054234, "auxiliary_loss_mlp": 0.01039287, "balance_loss_clip": 1.01457453, "balance_loss_mlp": 1.01745558, "epoch": 0.8731098752442508, "flos": 13442887008000.0, "grad_norm": 2.961688995869925, "language_loss": 0.77622581, "learning_rate": 1.6646227365631616e-07, "loss": 0.79716098, "num_input_tokens_seen": 313118965, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.3671875, "step": 14522, "time_per_iteration": 2.340801954269409 }, { "auxiliary_loss_clip": 0.01050229, "auxiliary_loss_mlp": 0.01035323, "balance_loss_clip": 1.01354361, "balance_loss_mlp": 1.01572609, "epoch": 0.8731699984969187, "flos": 23473399040640.0, "grad_norm": 1.8871823987824776, "language_loss": 0.76788521, "learning_rate": 1.66306750360385e-07, "loss": 0.78874075, "num_input_tokens_seen": 313139280, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.34570312, "step": 14523, "time_per_iteration": 2.472548484802246 }, { "auxiliary_loss_clip": 0.01049082, "auxiliary_loss_mlp": 0.01034835, "balance_loss_clip": 1.01279354, "balance_loss_mlp": 1.01532483, "epoch": 0.8732301217495867, "flos": 17711682433920.0, "grad_norm": 2.089044532158217, "language_loss": 0.8006084, "learning_rate": 1.6615129659828542e-07, "loss": 0.82144761, "num_input_tokens_seen": 313156655, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3359375, "step": 14524, "time_per_iteration": 2.3702359199523926 }, { "auxiliary_loss_clip": 0.01049882, "auxiliary_loss_mlp": 0.0103634, "balance_loss_clip": 1.01522851, "balance_loss_mlp": 1.01564193, "epoch": 0.8732902450022546, "flos": 22053271777920.0, "grad_norm": 1.953926729326464, "language_loss": 0.78962189, "learning_rate": 1.6599591237591272e-07, "loss": 0.81048411, "num_input_tokens_seen": 313174050, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34179688, "step": 14525, "time_per_iteration": 2.389815092086792 }, { "auxiliary_loss_clip": 0.01053026, "auxiliary_loss_mlp": 0.01035341, "balance_loss_clip": 1.01226199, "balance_loss_mlp": 1.01678085, "epoch": 0.8733503682549226, "flos": 22271897911680.0, "grad_norm": 1.7026300731526534, "language_loss": 0.69832736, "learning_rate": 1.6584059769915902e-07, "loss": 0.71921098, "num_input_tokens_seen": 313192765, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36328125, "step": 14526, "time_per_iteration": 2.426753520965576 }, { "auxiliary_loss_clip": 0.01053958, "auxiliary_loss_mlp": 0.01047398, "balance_loss_clip": 1.02117229, "balance_loss_mlp": 1.01672864, "epoch": 0.8734104915075905, "flos": 23363317923840.0, "grad_norm": 1.7820131665151953, "language_loss": 0.6197983, "learning_rate": 1.6568535257391326e-07, "loss": 0.64081186, "num_input_tokens_seen": 313210925, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37109375, "step": 14527, "time_per_iteration": 2.406776189804077 }, { "auxiliary_loss_clip": 0.01057224, "auxiliary_loss_mlp": 0.01043716, "balance_loss_clip": 1.01392543, "balance_loss_mlp": 1.01766706, "epoch": 0.8734706147602586, "flos": 17711333320320.0, "grad_norm": 2.204870710376478, "language_loss": 0.66951531, "learning_rate": 1.6553017700606265e-07, "loss": 0.6905247, "num_input_tokens_seen": 313228250, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.39648438, "step": 14528, "time_per_iteration": 2.339491844177246 }, { "auxiliary_loss_clip": 0.01050713, "auxiliary_loss_mlp": 0.01034613, "balance_loss_clip": 1.0131669, "balance_loss_mlp": 1.0158658, "epoch": 0.8735307380129265, "flos": 22048419098880.0, "grad_norm": 2.0907935662216826, "language_loss": 0.90990615, "learning_rate": 1.6537507100149205e-07, "loss": 0.93075943, "num_input_tokens_seen": 313247880, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34765625, "step": 14529, "time_per_iteration": 2.4254465103149414 }, { "auxiliary_loss_clip": 0.01052493, "auxiliary_loss_mlp": 0.01040928, "balance_loss_clip": 1.01663339, "balance_loss_mlp": 1.01721466, "epoch": 0.8735908612655945, "flos": 25337237172480.0, "grad_norm": 1.642666456580712, "language_loss": 0.85604131, "learning_rate": 1.6522003456608258e-07, "loss": 0.87697554, "num_input_tokens_seen": 313266790, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.3515625, "step": 14530, "time_per_iteration": 3.6752138137817383 }, { "auxiliary_loss_clip": 0.01053145, "auxiliary_loss_mlp": 0.0104114, "balance_loss_clip": 1.018013, "balance_loss_mlp": 1.01692295, "epoch": 0.8736509845182624, "flos": 21539909013120.0, "grad_norm": 1.473058567954134, "language_loss": 0.74442333, "learning_rate": 1.650650677057128e-07, "loss": 0.76536614, "num_input_tokens_seen": 313286805, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.36328125, "step": 14531, "time_per_iteration": 2.401484489440918 }, { "auxiliary_loss_clip": 0.01049361, "auxiliary_loss_mlp": 0.01035666, "balance_loss_clip": 1.01321852, "balance_loss_mlp": 1.01527548, "epoch": 0.8737111077709304, "flos": 22016159136000.0, "grad_norm": 1.8921286767174363, "language_loss": 0.62653184, "learning_rate": 1.6491017042625966e-07, "loss": 0.64738208, "num_input_tokens_seen": 313305415, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.33984375, "step": 14532, "time_per_iteration": 2.3815221786499023 }, { "auxiliary_loss_clip": 0.01007108, "auxiliary_loss_mlp": 0.01003071, "balance_loss_clip": 1.00111592, "balance_loss_mlp": 1.00057578, "epoch": 0.8737712310235983, "flos": 70062965424000.0, "grad_norm": 0.8174100637871623, "language_loss": 0.58716989, "learning_rate": 1.6475534273359704e-07, "loss": 0.60727167, "num_input_tokens_seen": 313369940, "router_z_loss_clip": 0.01953125, "router_z_loss_mlp": 0.06542969, "step": 14533, "time_per_iteration": 3.1141157150268555 }, { "auxiliary_loss_clip": 0.01050982, "auxiliary_loss_mlp": 0.01040712, "balance_loss_clip": 1.0191946, "balance_loss_mlp": 1.01593113, "epoch": 0.8738313542762663, "flos": 28657058400000.0, "grad_norm": 1.4832637211699515, "language_loss": 0.77388275, "learning_rate": 1.646005846335954e-07, "loss": 0.79479969, "num_input_tokens_seen": 313390965, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34960938, "step": 14534, "time_per_iteration": 2.424967050552368 }, { "auxiliary_loss_clip": 0.01051047, "auxiliary_loss_mlp": 0.01033904, "balance_loss_clip": 1.01146924, "balance_loss_mlp": 1.01546597, "epoch": 0.8738914775289344, "flos": 22345285322880.0, "grad_norm": 1.719036952047885, "language_loss": 0.76095951, "learning_rate": 1.6444589613212357e-07, "loss": 0.78180903, "num_input_tokens_seen": 313409680, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35546875, "step": 14535, "time_per_iteration": 2.428898572921753 }, { "auxiliary_loss_clip": 0.01050641, "auxiliary_loss_mlp": 0.01038639, "balance_loss_clip": 1.01356959, "balance_loss_mlp": 1.01499295, "epoch": 0.8739516007816023, "flos": 31758288405120.0, "grad_norm": 1.820469728259462, "language_loss": 0.75391686, "learning_rate": 1.64291277235048e-07, "loss": 0.77480972, "num_input_tokens_seen": 313431335, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.35546875, "step": 14536, "time_per_iteration": 2.4703774452209473 }, { "auxiliary_loss_clip": 0.0105226, "auxiliary_loss_mlp": 0.01035047, "balance_loss_clip": 1.01318383, "balance_loss_mlp": 1.01588941, "epoch": 0.8740117240342703, "flos": 21210747914880.0, "grad_norm": 1.838014669475169, "language_loss": 0.65413988, "learning_rate": 1.641367279482304e-07, "loss": 0.67501289, "num_input_tokens_seen": 313449225, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.36328125, "step": 14537, "time_per_iteration": 2.3703253269195557 }, { "auxiliary_loss_clip": 0.01050386, "auxiliary_loss_mlp": 0.01033203, "balance_loss_clip": 1.01122117, "balance_loss_mlp": 1.01532269, "epoch": 0.8740718472869382, "flos": 25185644582400.0, "grad_norm": 1.7210764117688369, "language_loss": 0.5859949, "learning_rate": 1.6398224827753216e-07, "loss": 0.60683084, "num_input_tokens_seen": 313467715, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.3515625, "step": 14538, "time_per_iteration": 3.8798182010650635 }, { "auxiliary_loss_clip": 0.01051181, "auxiliary_loss_mlp": 0.01035605, "balance_loss_clip": 1.01377785, "balance_loss_mlp": 1.01682007, "epoch": 0.8741319705396062, "flos": 19499898827520.0, "grad_norm": 1.769523144263107, "language_loss": 0.69233698, "learning_rate": 1.6382783822881142e-07, "loss": 0.71320486, "num_input_tokens_seen": 313486805, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34375, "step": 14539, "time_per_iteration": 3.725053548812866 }, { "auxiliary_loss_clip": 0.01051779, "auxiliary_loss_mlp": 0.01037761, "balance_loss_clip": 1.01329947, "balance_loss_mlp": 1.01462531, "epoch": 0.8741920937922741, "flos": 14100022218240.0, "grad_norm": 2.0232703006988486, "language_loss": 0.75708842, "learning_rate": 1.6367349780792262e-07, "loss": 0.77798378, "num_input_tokens_seen": 313504880, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.37109375, "step": 14540, "time_per_iteration": 2.3647208213806152 }, { "auxiliary_loss_clip": 0.01052665, "auxiliary_loss_mlp": 0.01036965, "balance_loss_clip": 1.0131824, "balance_loss_mlp": 1.01579523, "epoch": 0.8742522170449422, "flos": 27708607872000.0, "grad_norm": 1.725496990558571, "language_loss": 0.79850203, "learning_rate": 1.635192270207193e-07, "loss": 0.81939828, "num_input_tokens_seen": 313524995, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36914062, "step": 14541, "time_per_iteration": 2.411863088607788 }, { "auxiliary_loss_clip": 0.01055318, "auxiliary_loss_mlp": 0.01040705, "balance_loss_clip": 1.01450276, "balance_loss_mlp": 1.01687515, "epoch": 0.8743123402976101, "flos": 21141514955520.0, "grad_norm": 2.0477198441044173, "language_loss": 0.67706835, "learning_rate": 1.6336502587305035e-07, "loss": 0.69802856, "num_input_tokens_seen": 313541740, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38476562, "step": 14542, "time_per_iteration": 2.4290144443511963 }, { "auxiliary_loss_clip": 0.01007257, "auxiliary_loss_mlp": 0.01005621, "balance_loss_clip": 1.00362992, "balance_loss_mlp": 1.00086927, "epoch": 0.8743724635502781, "flos": 60866982552960.0, "grad_norm": 0.7809254347538565, "language_loss": 0.54495132, "learning_rate": 1.632108943707642e-07, "loss": 0.56508017, "num_input_tokens_seen": 313593445, "router_z_loss_clip": 0.01989746, "router_z_loss_mlp": 0.06396484, "step": 14543, "time_per_iteration": 2.82061767578125 }, { "auxiliary_loss_clip": 0.01052569, "auxiliary_loss_mlp": 0.01042899, "balance_loss_clip": 1.01854479, "balance_loss_mlp": 1.0155344, "epoch": 0.874432586802946, "flos": 28108084181760.0, "grad_norm": 1.807680649800455, "language_loss": 0.70575893, "learning_rate": 1.6305683251970458e-07, "loss": 0.72671366, "num_input_tokens_seen": 313615640, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37109375, "step": 14544, "time_per_iteration": 2.444471597671509 }, { "auxiliary_loss_clip": 0.01049393, "auxiliary_loss_mlp": 0.01031311, "balance_loss_clip": 1.01183271, "balance_loss_mlp": 1.01587033, "epoch": 0.874492710055614, "flos": 23549160424320.0, "grad_norm": 1.5150320853556156, "language_loss": 0.76764488, "learning_rate": 1.62902840325714e-07, "loss": 0.78845191, "num_input_tokens_seen": 313635550, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.33398438, "step": 14545, "time_per_iteration": 2.3829143047332764 }, { "auxiliary_loss_clip": 0.01051985, "auxiliary_loss_mlp": 0.01043729, "balance_loss_clip": 1.01855242, "balance_loss_mlp": 1.01539457, "epoch": 0.8745528333082819, "flos": 40914086434560.0, "grad_norm": 1.7039030296454296, "language_loss": 0.66989863, "learning_rate": 1.6274891779463217e-07, "loss": 0.6908558, "num_input_tokens_seen": 313659275, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3671875, "step": 14546, "time_per_iteration": 2.5625557899475098 }, { "auxiliary_loss_clip": 0.01051674, "auxiliary_loss_mlp": 0.01037286, "balance_loss_clip": 1.01356375, "balance_loss_mlp": 1.01612723, "epoch": 0.87461295656095, "flos": 23621779785600.0, "grad_norm": 1.810372117369066, "language_loss": 0.73926306, "learning_rate": 1.6259506493229536e-07, "loss": 0.76015264, "num_input_tokens_seen": 313680595, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.35546875, "step": 14547, "time_per_iteration": 2.3816723823547363 }, { "auxiliary_loss_clip": 0.0105583, "auxiliary_loss_mlp": 0.01043332, "balance_loss_clip": 1.0163312, "balance_loss_mlp": 1.01665401, "epoch": 0.874673079813618, "flos": 38792763959040.0, "grad_norm": 3.0371995697866874, "language_loss": 0.70599997, "learning_rate": 1.6244128174453752e-07, "loss": 0.72699159, "num_input_tokens_seen": 313699730, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.39257812, "step": 14548, "time_per_iteration": 2.509713649749756 }, { "auxiliary_loss_clip": 0.01054339, "auxiliary_loss_mlp": 0.01042837, "balance_loss_clip": 1.01860237, "balance_loss_mlp": 1.01703501, "epoch": 0.8747332030662859, "flos": 23695027551360.0, "grad_norm": 2.1337145397838, "language_loss": 0.72025084, "learning_rate": 1.6228756823719093e-07, "loss": 0.74122262, "num_input_tokens_seen": 313720090, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37304688, "step": 14549, "time_per_iteration": 2.3755037784576416 }, { "auxiliary_loss_clip": 0.01054215, "auxiliary_loss_mlp": 0.01039374, "balance_loss_clip": 1.0137676, "balance_loss_mlp": 1.01620555, "epoch": 0.8747933263189539, "flos": 24461301271680.0, "grad_norm": 2.3213538990769496, "language_loss": 0.85580772, "learning_rate": 1.6213392441608352e-07, "loss": 0.87674356, "num_input_tokens_seen": 313736795, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37890625, "step": 14550, "time_per_iteration": 2.4042165279388428 }, { "auxiliary_loss_clip": 0.01053971, "auxiliary_loss_mlp": 0.01042492, "balance_loss_clip": 1.01949668, "balance_loss_mlp": 1.01746249, "epoch": 0.8748534495716218, "flos": 13808287964160.0, "grad_norm": 2.097664514765953, "language_loss": 0.7280342, "learning_rate": 1.6198035028704183e-07, "loss": 0.74899888, "num_input_tokens_seen": 313754820, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36523438, "step": 14551, "time_per_iteration": 2.371159791946411 }, { "auxiliary_loss_clip": 0.01050929, "auxiliary_loss_mlp": 0.01043003, "balance_loss_clip": 1.0198524, "balance_loss_mlp": 1.01522851, "epoch": 0.8749135728242898, "flos": 29860793856000.0, "grad_norm": 1.8083656889802748, "language_loss": 0.64748687, "learning_rate": 1.6182684585588934e-07, "loss": 0.66842616, "num_input_tokens_seen": 313775830, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.35546875, "step": 14552, "time_per_iteration": 2.4948740005493164 }, { "auxiliary_loss_clip": 0.01053529, "auxiliary_loss_mlp": 0.01037945, "balance_loss_clip": 1.01229072, "balance_loss_mlp": 1.01654243, "epoch": 0.8749736960769577, "flos": 24132349641600.0, "grad_norm": 1.7323296978630636, "language_loss": 0.8056643, "learning_rate": 1.616734111284479e-07, "loss": 0.82657909, "num_input_tokens_seen": 313795745, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.36914062, "step": 14553, "time_per_iteration": 3.8931126594543457 }, { "auxiliary_loss_clip": 0.01051794, "auxiliary_loss_mlp": 0.01038953, "balance_loss_clip": 1.01649439, "balance_loss_mlp": 1.01537132, "epoch": 0.8750338193296258, "flos": 17201566425600.0, "grad_norm": 2.0588183092556793, "language_loss": 0.70751387, "learning_rate": 1.6152004611053416e-07, "loss": 0.72842133, "num_input_tokens_seen": 313813895, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.36523438, "step": 14554, "time_per_iteration": 2.3445024490356445 }, { "auxiliary_loss_clip": 0.01051617, "auxiliary_loss_mlp": 0.0103322, "balance_loss_clip": 1.01027226, "balance_loss_mlp": 1.0163368, "epoch": 0.8750939425822937, "flos": 23732070370560.0, "grad_norm": 1.76698966551834, "language_loss": 0.84692323, "learning_rate": 1.6136675080796457e-07, "loss": 0.86777163, "num_input_tokens_seen": 313834225, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35351562, "step": 14555, "time_per_iteration": 2.421133518218994 }, { "auxiliary_loss_clip": 0.01052183, "auxiliary_loss_mlp": 0.01039679, "balance_loss_clip": 1.01572943, "balance_loss_mlp": 1.01623774, "epoch": 0.8751540658349617, "flos": 26540483869440.0, "grad_norm": 1.5760057221753692, "language_loss": 0.71861625, "learning_rate": 1.6121352522655252e-07, "loss": 0.73953485, "num_input_tokens_seen": 313854430, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.359375, "step": 14556, "time_per_iteration": 2.419344425201416 }, { "auxiliary_loss_clip": 0.0105301, "auxiliary_loss_mlp": 0.01037926, "balance_loss_clip": 1.01330972, "balance_loss_mlp": 1.01612353, "epoch": 0.8752141890876296, "flos": 19385907638400.0, "grad_norm": 1.7303926539880536, "language_loss": 0.78168011, "learning_rate": 1.6106036937210732e-07, "loss": 0.80258942, "num_input_tokens_seen": 313871600, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36914062, "step": 14557, "time_per_iteration": 2.3595898151397705 }, { "auxiliary_loss_clip": 0.01052922, "auxiliary_loss_mlp": 0.01042743, "balance_loss_clip": 1.01849544, "balance_loss_mlp": 1.01725745, "epoch": 0.8752743123402976, "flos": 25373232650880.0, "grad_norm": 1.9822413690921272, "language_loss": 0.83609843, "learning_rate": 1.6090728325043767e-07, "loss": 0.85705507, "num_input_tokens_seen": 313891570, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.35546875, "step": 14558, "time_per_iteration": 2.4252378940582275 }, { "auxiliary_loss_clip": 0.01007222, "auxiliary_loss_mlp": 0.01003125, "balance_loss_clip": 1.00102711, "balance_loss_mlp": 1.00060964, "epoch": 0.8753344355929655, "flos": 59949535178880.0, "grad_norm": 0.8076429301721307, "language_loss": 0.56120491, "learning_rate": 1.6075426686734784e-07, "loss": 0.58130836, "num_input_tokens_seen": 313951290, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.06640625, "step": 14559, "time_per_iteration": 3.0158135890960693 }, { "auxiliary_loss_clip": 0.01051633, "auxiliary_loss_mlp": 0.0104322, "balance_loss_clip": 1.02016449, "balance_loss_mlp": 1.01657104, "epoch": 0.8753945588456336, "flos": 17893684684800.0, "grad_norm": 1.6997829775980096, "language_loss": 0.67151904, "learning_rate": 1.606013202286407e-07, "loss": 0.69246757, "num_input_tokens_seen": 313968645, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.3515625, "step": 14560, "time_per_iteration": 2.3558578491210938 }, { "auxiliary_loss_clip": 0.01050842, "auxiliary_loss_mlp": 0.01034023, "balance_loss_clip": 1.01133728, "balance_loss_mlp": 1.01563036, "epoch": 0.8754546820983016, "flos": 30913704771840.0, "grad_norm": 2.2355422896787727, "language_loss": 0.79957747, "learning_rate": 1.6044844334011541e-07, "loss": 0.82042611, "num_input_tokens_seen": 313987580, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.3515625, "step": 14561, "time_per_iteration": 2.4576961994171143 }, { "auxiliary_loss_clip": 0.0105279, "auxiliary_loss_mlp": 0.01036352, "balance_loss_clip": 1.01135337, "balance_loss_mlp": 1.01576674, "epoch": 0.8755148053509695, "flos": 20630037404160.0, "grad_norm": 1.9554554275711065, "language_loss": 0.78238583, "learning_rate": 1.6029563620756982e-07, "loss": 0.80327725, "num_input_tokens_seen": 314004460, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37109375, "step": 14562, "time_per_iteration": 2.389127731323242 }, { "auxiliary_loss_clip": 0.01048043, "auxiliary_loss_mlp": 0.01032521, "balance_loss_clip": 1.01201653, "balance_loss_mlp": 1.01499462, "epoch": 0.8755749286036375, "flos": 34968307806720.0, "grad_norm": 1.7356531363076508, "language_loss": 0.72123349, "learning_rate": 1.601428988367981e-07, "loss": 0.74203908, "num_input_tokens_seen": 314026855, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.33007812, "step": 14563, "time_per_iteration": 2.5335230827331543 }, { "auxiliary_loss_clip": 0.01053368, "auxiliary_loss_mlp": 0.01040682, "balance_loss_clip": 1.01631546, "balance_loss_mlp": 1.01715851, "epoch": 0.8756350518563054, "flos": 18185488761600.0, "grad_norm": 2.165037869642114, "language_loss": 0.66895819, "learning_rate": 1.5999023123359235e-07, "loss": 0.68989861, "num_input_tokens_seen": 314042830, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36132812, "step": 14564, "time_per_iteration": 2.386122703552246 }, { "auxiliary_loss_clip": 0.01051184, "auxiliary_loss_mlp": 0.01042026, "balance_loss_clip": 1.01866114, "balance_loss_mlp": 1.01511157, "epoch": 0.8756951751089734, "flos": 20082983310720.0, "grad_norm": 1.7031422890689045, "language_loss": 0.72178292, "learning_rate": 1.598376334037408e-07, "loss": 0.742715, "num_input_tokens_seen": 314062225, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36132812, "step": 14565, "time_per_iteration": 2.366636037826538 }, { "auxiliary_loss_clip": 0.01055865, "auxiliary_loss_mlp": 0.01039609, "balance_loss_clip": 1.01493299, "balance_loss_mlp": 1.01725125, "epoch": 0.8757552983616413, "flos": 27524057091840.0, "grad_norm": 1.5352819349930902, "language_loss": 0.78520077, "learning_rate": 1.5968510535303102e-07, "loss": 0.8061555, "num_input_tokens_seen": 314082325, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.38671875, "step": 14566, "time_per_iteration": 2.417046546936035 }, { "auxiliary_loss_clip": 0.01054976, "auxiliary_loss_mlp": 0.01039649, "balance_loss_clip": 1.0155803, "balance_loss_mlp": 1.01835096, "epoch": 0.8758154216143094, "flos": 18072160888320.0, "grad_norm": 1.6332418398382826, "language_loss": 0.72486234, "learning_rate": 1.5953264708724624e-07, "loss": 0.74580854, "num_input_tokens_seen": 314100310, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 14567, "time_per_iteration": 2.344057083129883 }, { "auxiliary_loss_clip": 0.01049945, "auxiliary_loss_mlp": 0.01035691, "balance_loss_clip": 1.01331544, "balance_loss_mlp": 1.01509142, "epoch": 0.8758755448669773, "flos": 25044525400320.0, "grad_norm": 1.7431235141523838, "language_loss": 0.74950659, "learning_rate": 1.5938025861216776e-07, "loss": 0.77036291, "num_input_tokens_seen": 314121330, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34765625, "step": 14568, "time_per_iteration": 2.394897937774658 }, { "auxiliary_loss_clip": 0.01051578, "auxiliary_loss_mlp": 0.01035769, "balance_loss_clip": 1.01282096, "balance_loss_mlp": 1.01653695, "epoch": 0.8759356681196453, "flos": 22855715533440.0, "grad_norm": 2.5902977471189135, "language_loss": 0.87785709, "learning_rate": 1.5922793993357475e-07, "loss": 0.89873052, "num_input_tokens_seen": 314139875, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.3515625, "step": 14569, "time_per_iteration": 3.623642683029175 }, { "auxiliary_loss_clip": 0.01051271, "auxiliary_loss_mlp": 0.01035515, "balance_loss_clip": 1.01290107, "balance_loss_mlp": 1.01533735, "epoch": 0.8759957913723132, "flos": 21031468750080.0, "grad_norm": 1.7201117260667607, "language_loss": 0.74533105, "learning_rate": 1.5907569105724284e-07, "loss": 0.76619887, "num_input_tokens_seen": 314157850, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.359375, "step": 14570, "time_per_iteration": 2.3672828674316406 }, { "auxiliary_loss_clip": 0.01052354, "auxiliary_loss_mlp": 0.01036225, "balance_loss_clip": 1.0128119, "balance_loss_mlp": 1.0162636, "epoch": 0.8760559146249812, "flos": 20009456254080.0, "grad_norm": 1.947766536055009, "language_loss": 0.69155496, "learning_rate": 1.5892351198894472e-07, "loss": 0.71244079, "num_input_tokens_seen": 314176720, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36132812, "step": 14571, "time_per_iteration": 2.374133348464966 }, { "auxiliary_loss_clip": 0.01049984, "auxiliary_loss_mlp": 0.01037749, "balance_loss_clip": 1.01545644, "balance_loss_mlp": 1.01541233, "epoch": 0.8761160378776491, "flos": 19973146573440.0, "grad_norm": 1.9653663737465632, "language_loss": 0.63232189, "learning_rate": 1.5877140273445156e-07, "loss": 0.6531992, "num_input_tokens_seen": 314196645, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34570312, "step": 14572, "time_per_iteration": 2.367926597595215 }, { "auxiliary_loss_clip": 0.01049446, "auxiliary_loss_mlp": 0.01035101, "balance_loss_clip": 1.01360726, "balance_loss_mlp": 1.01555097, "epoch": 0.8761761611303172, "flos": 28803134995200.0, "grad_norm": 1.7277527700903688, "language_loss": 0.74779129, "learning_rate": 1.5861936329953162e-07, "loss": 0.76863676, "num_input_tokens_seen": 314217430, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.33984375, "step": 14573, "time_per_iteration": 2.4180846214294434 }, { "auxiliary_loss_clip": 0.01050296, "auxiliary_loss_mlp": 0.01034386, "balance_loss_clip": 1.01247549, "balance_loss_mlp": 1.01591909, "epoch": 0.8762362843829851, "flos": 18331530445440.0, "grad_norm": 1.8716373587714052, "language_loss": 0.74406058, "learning_rate": 1.5846739368994966e-07, "loss": 0.76490736, "num_input_tokens_seen": 314235310, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34375, "step": 14574, "time_per_iteration": 2.369725465774536 }, { "auxiliary_loss_clip": 0.01049162, "auxiliary_loss_mlp": 0.01031728, "balance_loss_clip": 1.01046062, "balance_loss_mlp": 1.01485276, "epoch": 0.8762964076356531, "flos": 15778227317760.0, "grad_norm": 1.6169473327908486, "language_loss": 0.76850677, "learning_rate": 1.5831549391146903e-07, "loss": 0.78931558, "num_input_tokens_seen": 314252355, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34375, "step": 14575, "time_per_iteration": 2.344646453857422 }, { "auxiliary_loss_clip": 0.01052949, "auxiliary_loss_mlp": 0.01033532, "balance_loss_clip": 1.01201415, "balance_loss_mlp": 1.0168047, "epoch": 0.8763565308883211, "flos": 33175518024960.0, "grad_norm": 1.7028904320680918, "language_loss": 0.67382884, "learning_rate": 1.5816366396984916e-07, "loss": 0.69469362, "num_input_tokens_seen": 314272755, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.36132812, "step": 14576, "time_per_iteration": 2.5170536041259766 }, { "auxiliary_loss_clip": 0.01049276, "auxiliary_loss_mlp": 0.01033645, "balance_loss_clip": 1.01248515, "balance_loss_mlp": 1.01465869, "epoch": 0.876416654140989, "flos": 15887191271040.0, "grad_norm": 1.9655599365030259, "language_loss": 0.67828703, "learning_rate": 1.5801190387084806e-07, "loss": 0.69911623, "num_input_tokens_seen": 314291365, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34570312, "step": 14577, "time_per_iteration": 2.3722751140594482 }, { "auxiliary_loss_clip": 0.01052683, "auxiliary_loss_mlp": 0.01037729, "balance_loss_clip": 1.01357675, "balance_loss_mlp": 1.01667774, "epoch": 0.876476777393657, "flos": 25884046886400.0, "grad_norm": 2.170936043630389, "language_loss": 0.71898293, "learning_rate": 1.5786021362021962e-07, "loss": 0.739887, "num_input_tokens_seen": 314310075, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.359375, "step": 14578, "time_per_iteration": 3.8376705646514893 }, { "auxiliary_loss_clip": 0.01052057, "auxiliary_loss_mlp": 0.01039819, "balance_loss_clip": 1.01671648, "balance_loss_mlp": 1.01577091, "epoch": 0.876536900646325, "flos": 13588823957760.0, "grad_norm": 1.9216888572285167, "language_loss": 0.72142768, "learning_rate": 1.5770859322371676e-07, "loss": 0.74234641, "num_input_tokens_seen": 314325695, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36328125, "step": 14579, "time_per_iteration": 3.6995441913604736 }, { "auxiliary_loss_clip": 0.01049241, "auxiliary_loss_mlp": 0.01034531, "balance_loss_clip": 1.01312149, "balance_loss_mlp": 1.01583362, "epoch": 0.876597023898993, "flos": 12202527669120.0, "grad_norm": 1.685802114630485, "language_loss": 0.71417069, "learning_rate": 1.5755704268708912e-07, "loss": 0.73500842, "num_input_tokens_seen": 314343605, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.33398438, "step": 14580, "time_per_iteration": 2.352154016494751 }, { "auxiliary_loss_clip": 0.0105103, "auxiliary_loss_mlp": 0.01031559, "balance_loss_clip": 1.01079249, "balance_loss_mlp": 1.01612902, "epoch": 0.8766571471516609, "flos": 25335631249920.0, "grad_norm": 1.6660201199610032, "language_loss": 0.66678977, "learning_rate": 1.5740556201608256e-07, "loss": 0.68761563, "num_input_tokens_seen": 314364275, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.34765625, "step": 14581, "time_per_iteration": 2.4073681831359863 }, { "auxiliary_loss_clip": 0.01049701, "auxiliary_loss_mlp": 0.01037667, "balance_loss_clip": 1.01605403, "balance_loss_mlp": 1.01598465, "epoch": 0.8767172704043289, "flos": 30112098888960.0, "grad_norm": 1.4715494736555645, "language_loss": 0.74186057, "learning_rate": 1.572541512164416e-07, "loss": 0.76273423, "num_input_tokens_seen": 314385140, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.33789062, "step": 14582, "time_per_iteration": 2.464460849761963 }, { "auxiliary_loss_clip": 0.01051707, "auxiliary_loss_mlp": 0.01036721, "balance_loss_clip": 1.01441669, "balance_loss_mlp": 1.01591849, "epoch": 0.8767773936569968, "flos": 19280155530240.0, "grad_norm": 2.0496277002457437, "language_loss": 0.67323959, "learning_rate": 1.5710281029390826e-07, "loss": 0.69412392, "num_input_tokens_seen": 314403715, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.359375, "step": 14583, "time_per_iteration": 2.38077449798584 }, { "auxiliary_loss_clip": 0.01051337, "auxiliary_loss_mlp": 0.01031876, "balance_loss_clip": 1.01031148, "balance_loss_mlp": 1.01583374, "epoch": 0.8768375169096648, "flos": 21246289545600.0, "grad_norm": 1.7764908910091652, "language_loss": 0.79944104, "learning_rate": 1.5695153925422067e-07, "loss": 0.82027316, "num_input_tokens_seen": 314421880, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.35546875, "step": 14584, "time_per_iteration": 2.3708746433258057 }, { "auxiliary_loss_clip": 0.0105214, "auxiliary_loss_mlp": 0.01035323, "balance_loss_clip": 1.01174378, "balance_loss_mlp": 1.01572585, "epoch": 0.8768976401623327, "flos": 23294399166720.0, "grad_norm": 1.6828564864289728, "language_loss": 0.73508257, "learning_rate": 1.5680033810311555e-07, "loss": 0.75595725, "num_input_tokens_seen": 314441585, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36328125, "step": 14585, "time_per_iteration": 2.3895106315612793 }, { "auxiliary_loss_clip": 0.010523, "auxiliary_loss_mlp": 0.01038925, "balance_loss_clip": 1.01499963, "balance_loss_mlp": 1.01579821, "epoch": 0.8769577634150008, "flos": 21360176000640.0, "grad_norm": 1.8822710455399656, "language_loss": 0.754857, "learning_rate": 1.5664920684632654e-07, "loss": 0.77576923, "num_input_tokens_seen": 314459020, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36523438, "step": 14586, "time_per_iteration": 2.358318567276001 }, { "auxiliary_loss_clip": 0.01051044, "auxiliary_loss_mlp": 0.01036827, "balance_loss_clip": 1.01385522, "balance_loss_mlp": 1.01572907, "epoch": 0.8770178866676687, "flos": 23512920566400.0, "grad_norm": 2.3911271177366746, "language_loss": 0.79884398, "learning_rate": 1.564981454895844e-07, "loss": 0.81972265, "num_input_tokens_seen": 314478935, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.3515625, "step": 14587, "time_per_iteration": 2.40177845954895 }, { "auxiliary_loss_clip": 0.0105083, "auxiliary_loss_mlp": 0.01039767, "balance_loss_clip": 1.01500762, "balance_loss_mlp": 1.01550531, "epoch": 0.8770780099203367, "flos": 19718036202240.0, "grad_norm": 1.5918394132006, "language_loss": 0.74539125, "learning_rate": 1.5634715403861697e-07, "loss": 0.76629722, "num_input_tokens_seen": 314497635, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.35351562, "step": 14588, "time_per_iteration": 2.3632571697235107 }, { "auxiliary_loss_clip": 0.01047629, "auxiliary_loss_mlp": 0.01032243, "balance_loss_clip": 1.01175046, "balance_loss_mlp": 1.0140022, "epoch": 0.8771381331730047, "flos": 21394879758720.0, "grad_norm": 1.7363272881171794, "language_loss": 0.67724192, "learning_rate": 1.5619623249915016e-07, "loss": 0.69804066, "num_input_tokens_seen": 314515445, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.3359375, "step": 14589, "time_per_iteration": 2.36016583442688 }, { "auxiliary_loss_clip": 0.01052465, "auxiliary_loss_mlp": 0.01040754, "balance_loss_clip": 1.01787758, "balance_loss_mlp": 1.01647973, "epoch": 0.8771982564256726, "flos": 20260866021120.0, "grad_norm": 2.2065080097352814, "language_loss": 0.72204363, "learning_rate": 1.5604538087690732e-07, "loss": 0.74297589, "num_input_tokens_seen": 314533040, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.359375, "step": 14590, "time_per_iteration": 2.357405662536621 }, { "auxiliary_loss_clip": 0.01055103, "auxiliary_loss_mlp": 0.01040892, "balance_loss_clip": 1.01455855, "balance_loss_mlp": 1.01622939, "epoch": 0.8772583796783406, "flos": 12488711016960.0, "grad_norm": 2.3692051203805744, "language_loss": 0.7565518, "learning_rate": 1.558945991776086e-07, "loss": 0.77751178, "num_input_tokens_seen": 314548280, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.38867188, "step": 14591, "time_per_iteration": 2.3590657711029053 }, { "auxiliary_loss_clip": 0.01048385, "auxiliary_loss_mlp": 0.01031781, "balance_loss_clip": 1.01121712, "balance_loss_mlp": 1.0157299, "epoch": 0.8773185029310085, "flos": 15920289106560.0, "grad_norm": 1.7135776101866866, "language_loss": 0.81528795, "learning_rate": 1.5574388740697096e-07, "loss": 0.83608961, "num_input_tokens_seen": 314565345, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.32617188, "step": 14592, "time_per_iteration": 2.4044902324676514 }, { "auxiliary_loss_clip": 0.01049295, "auxiliary_loss_mlp": 0.01033066, "balance_loss_clip": 1.01290798, "balance_loss_mlp": 1.01628733, "epoch": 0.8773786261836766, "flos": 21503529509760.0, "grad_norm": 1.633880048787702, "language_loss": 0.83850247, "learning_rate": 1.5559324557071052e-07, "loss": 0.859326, "num_input_tokens_seen": 314584190, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.33007812, "step": 14593, "time_per_iteration": 3.8078904151916504 }, { "auxiliary_loss_clip": 0.01049461, "auxiliary_loss_mlp": 0.01029116, "balance_loss_clip": 1.00801611, "balance_loss_mlp": 1.01557827, "epoch": 0.8774387494363445, "flos": 26760262078080.0, "grad_norm": 1.4837288056804365, "language_loss": 0.76988798, "learning_rate": 1.5544267367453845e-07, "loss": 0.79067373, "num_input_tokens_seen": 314605625, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.33789062, "step": 14594, "time_per_iteration": 2.413724422454834 }, { "auxiliary_loss_clip": 0.01052604, "auxiliary_loss_mlp": 0.01032851, "balance_loss_clip": 1.00937819, "balance_loss_mlp": 1.01580286, "epoch": 0.8774988726890125, "flos": 18477851420160.0, "grad_norm": 2.1321347766519962, "language_loss": 0.792732, "learning_rate": 1.552921717241651e-07, "loss": 0.81358647, "num_input_tokens_seen": 314622630, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3671875, "step": 14595, "time_per_iteration": 2.3622894287109375 }, { "auxiliary_loss_clip": 0.01052856, "auxiliary_loss_mlp": 0.01035669, "balance_loss_clip": 1.01253033, "balance_loss_mlp": 1.01693964, "epoch": 0.8775589959416804, "flos": 24425201059200.0, "grad_norm": 1.517037972196789, "language_loss": 0.7116797, "learning_rate": 1.5514173972529743e-07, "loss": 0.73256505, "num_input_tokens_seen": 314642460, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.359375, "step": 14596, "time_per_iteration": 2.3878815174102783 }, { "auxiliary_loss_clip": 0.0105085, "auxiliary_loss_mlp": 0.01036682, "balance_loss_clip": 1.01314998, "balance_loss_mlp": 1.01563644, "epoch": 0.8776191191943484, "flos": 23439044396160.0, "grad_norm": 2.3188813631273018, "language_loss": 0.86506951, "learning_rate": 1.5499137768364067e-07, "loss": 0.88594484, "num_input_tokens_seen": 314659875, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.3515625, "step": 14597, "time_per_iteration": 2.3991200923919678 }, { "auxiliary_loss_clip": 0.01051171, "auxiliary_loss_mlp": 0.01037431, "balance_loss_clip": 1.01627123, "balance_loss_mlp": 1.01639056, "epoch": 0.8776792424470163, "flos": 26829739416960.0, "grad_norm": 1.9458280397770233, "language_loss": 0.73625499, "learning_rate": 1.5484108560489494e-07, "loss": 0.75714099, "num_input_tokens_seen": 314680260, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34765625, "step": 14598, "time_per_iteration": 2.4084115028381348 }, { "auxiliary_loss_clip": 0.01053178, "auxiliary_loss_mlp": 0.0103697, "balance_loss_clip": 1.01416504, "balance_loss_mlp": 1.017066, "epoch": 0.8777393656996844, "flos": 15625447741440.0, "grad_norm": 2.1006960461239808, "language_loss": 0.78876412, "learning_rate": 1.5469086349476036e-07, "loss": 0.80966568, "num_input_tokens_seen": 314696260, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36132812, "step": 14599, "time_per_iteration": 2.375121593475342 }, { "auxiliary_loss_clip": 0.01051647, "auxiliary_loss_mlp": 0.0104021, "balance_loss_clip": 1.01707137, "balance_loss_mlp": 1.01592469, "epoch": 0.8777994889523523, "flos": 18879003475200.0, "grad_norm": 2.422671686022171, "language_loss": 0.69270754, "learning_rate": 1.545407113589332e-07, "loss": 0.71362603, "num_input_tokens_seen": 314714215, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35742188, "step": 14600, "time_per_iteration": 2.372790575027466 }, { "auxiliary_loss_clip": 0.01051432, "auxiliary_loss_mlp": 0.01041611, "balance_loss_clip": 1.01735187, "balance_loss_mlp": 1.01517367, "epoch": 0.8778596122050203, "flos": 48824916825600.0, "grad_norm": 1.8053161208507504, "language_loss": 0.70565987, "learning_rate": 1.543906292031072e-07, "loss": 0.72659028, "num_input_tokens_seen": 314735700, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36328125, "step": 14601, "time_per_iteration": 2.618393898010254 }, { "auxiliary_loss_clip": 0.01054621, "auxiliary_loss_mlp": 0.01041207, "balance_loss_clip": 1.01945114, "balance_loss_mlp": 1.01710284, "epoch": 0.8779197354576883, "flos": 25659171619200.0, "grad_norm": 1.8624732162812827, "language_loss": 0.74313915, "learning_rate": 1.542406170329733e-07, "loss": 0.76409745, "num_input_tokens_seen": 314753335, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.375, "step": 14602, "time_per_iteration": 2.4199438095092773 }, { "auxiliary_loss_clip": 0.01049891, "auxiliary_loss_mlp": 0.01036403, "balance_loss_clip": 1.01468301, "balance_loss_mlp": 1.01506102, "epoch": 0.8779798587103562, "flos": 18842239946880.0, "grad_norm": 2.0405340969556467, "language_loss": 0.71652102, "learning_rate": 1.5409067485422056e-07, "loss": 0.73738396, "num_input_tokens_seen": 314770800, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.34765625, "step": 14603, "time_per_iteration": 2.3925294876098633 }, { "auxiliary_loss_clip": 0.01007536, "auxiliary_loss_mlp": 0.0100387, "balance_loss_clip": 1.00192666, "balance_loss_mlp": 1.00087619, "epoch": 0.8780399819630242, "flos": 68609635591680.0, "grad_norm": 0.7491218798712806, "language_loss": 0.54221612, "learning_rate": 1.539408026725344e-07, "loss": 0.56233019, "num_input_tokens_seen": 314837275, "router_z_loss_clip": 0.01940918, "router_z_loss_mlp": 0.06640625, "step": 14604, "time_per_iteration": 3.0479793548583984 }, { "auxiliary_loss_clip": 0.01007717, "auxiliary_loss_mlp": 0.01003251, "balance_loss_clip": 1.00109363, "balance_loss_mlp": 1.0009892, "epoch": 0.8781001052156922, "flos": 65731290906240.0, "grad_norm": 0.7025825843673941, "language_loss": 0.59413135, "learning_rate": 1.537910004935976e-07, "loss": 0.61424106, "num_input_tokens_seen": 314902220, "router_z_loss_clip": 0.02160645, "router_z_loss_mlp": 0.06738281, "step": 14605, "time_per_iteration": 3.040858745574951 }, { "auxiliary_loss_clip": 0.01053769, "auxiliary_loss_mlp": 0.01036719, "balance_loss_clip": 1.01281714, "balance_loss_mlp": 1.0168134, "epoch": 0.8781602284683602, "flos": 22048698389760.0, "grad_norm": 1.513932823924889, "language_loss": 0.85803336, "learning_rate": 1.536412683230912e-07, "loss": 0.87893832, "num_input_tokens_seen": 314921645, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37109375, "step": 14606, "time_per_iteration": 2.443516969680786 }, { "auxiliary_loss_clip": 0.01053705, "auxiliary_loss_mlp": 0.01038621, "balance_loss_clip": 1.01419508, "balance_loss_mlp": 1.01663423, "epoch": 0.8782203517210281, "flos": 17562079791360.0, "grad_norm": 1.8482310612406647, "language_loss": 0.72071898, "learning_rate": 1.534916061666931e-07, "loss": 0.74164224, "num_input_tokens_seen": 314939390, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37109375, "step": 14607, "time_per_iteration": 2.399864912033081 }, { "auxiliary_loss_clip": 0.01050318, "auxiliary_loss_mlp": 0.01043482, "balance_loss_clip": 1.02281141, "balance_loss_mlp": 1.01595402, "epoch": 0.8782804749736961, "flos": 25519239423360.0, "grad_norm": 2.0546494897413425, "language_loss": 0.72776079, "learning_rate": 1.533420140300785e-07, "loss": 0.74869883, "num_input_tokens_seen": 314959205, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.34375, "step": 14608, "time_per_iteration": 3.667001485824585 }, { "auxiliary_loss_clip": 0.0105293, "auxiliary_loss_mlp": 0.01034716, "balance_loss_clip": 1.01193523, "balance_loss_mlp": 1.0158608, "epoch": 0.878340598226364, "flos": 21797672647680.0, "grad_norm": 2.2716431163487405, "language_loss": 0.88559294, "learning_rate": 1.5319249191891936e-07, "loss": 0.90646946, "num_input_tokens_seen": 314977485, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.37109375, "step": 14609, "time_per_iteration": 2.354722738265991 }, { "auxiliary_loss_clip": 0.01051517, "auxiliary_loss_mlp": 0.01036465, "balance_loss_clip": 1.01340985, "balance_loss_mlp": 1.01593447, "epoch": 0.878400721479032, "flos": 21101434848000.0, "grad_norm": 1.6374785333437896, "language_loss": 0.71098799, "learning_rate": 1.5304303983888643e-07, "loss": 0.73186779, "num_input_tokens_seen": 314997830, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35546875, "step": 14610, "time_per_iteration": 2.3782598972320557 }, { "auxiliary_loss_clip": 0.01049874, "auxiliary_loss_mlp": 0.01036963, "balance_loss_clip": 1.01496923, "balance_loss_mlp": 1.01583982, "epoch": 0.8784608447316999, "flos": 20922469885440.0, "grad_norm": 1.9184077921712457, "language_loss": 0.82493496, "learning_rate": 1.5289365779564612e-07, "loss": 0.84580338, "num_input_tokens_seen": 315016480, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.33984375, "step": 14611, "time_per_iteration": 2.3657169342041016 }, { "auxiliary_loss_clip": 0.0105198, "auxiliary_loss_mlp": 0.0103978, "balance_loss_clip": 1.01630735, "balance_loss_mlp": 1.01609111, "epoch": 0.878520967984368, "flos": 23329556772480.0, "grad_norm": 1.6502364819181696, "language_loss": 0.77763224, "learning_rate": 1.5274434579486338e-07, "loss": 0.79854977, "num_input_tokens_seen": 315036135, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.359375, "step": 14612, "time_per_iteration": 2.371793508529663 }, { "auxiliary_loss_clip": 0.01051041, "auxiliary_loss_mlp": 0.01038898, "balance_loss_clip": 1.01566362, "balance_loss_mlp": 1.01591372, "epoch": 0.8785810912370359, "flos": 25517842968960.0, "grad_norm": 1.4762804016886035, "language_loss": 0.73168856, "learning_rate": 1.525951038422002e-07, "loss": 0.75258791, "num_input_tokens_seen": 315057995, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3515625, "step": 14613, "time_per_iteration": 2.4040138721466064 }, { "auxiliary_loss_clip": 0.01007744, "auxiliary_loss_mlp": 0.01005047, "balance_loss_clip": 1.00310385, "balance_loss_mlp": 1.00104463, "epoch": 0.8786412144897039, "flos": 61838371710720.0, "grad_norm": 2.2419735321524175, "language_loss": 0.6466831, "learning_rate": 1.5244593194331667e-07, "loss": 0.66681099, "num_input_tokens_seen": 315104010, "router_z_loss_clip": 0.01940918, "router_z_loss_mlp": 0.06738281, "step": 14614, "time_per_iteration": 2.764796018600464 }, { "auxiliary_loss_clip": 0.01007598, "auxiliary_loss_mlp": 0.0100237, "balance_loss_clip": 1.00014091, "balance_loss_mlp": 1.00079381, "epoch": 0.8787013377423719, "flos": 70985684413440.0, "grad_norm": 1.0612186889936714, "language_loss": 0.5874182, "learning_rate": 1.5229683010386762e-07, "loss": 0.60751796, "num_input_tokens_seen": 315174550, "router_z_loss_clip": 0.02233887, "router_z_loss_mlp": 0.06835938, "step": 14615, "time_per_iteration": 3.0901243686676025 }, { "auxiliary_loss_clip": 0.01049503, "auxiliary_loss_mlp": 0.01035677, "balance_loss_clip": 1.01376653, "balance_loss_mlp": 1.0147258, "epoch": 0.8787614609950398, "flos": 17346456034560.0, "grad_norm": 2.010199554561567, "language_loss": 0.74217439, "learning_rate": 1.5214779832950807e-07, "loss": 0.76302618, "num_input_tokens_seen": 315191825, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34765625, "step": 14616, "time_per_iteration": 2.3388102054595947 }, { "auxiliary_loss_clip": 0.01007132, "auxiliary_loss_mlp": 0.01002786, "balance_loss_clip": 1.00064027, "balance_loss_mlp": 1.00061274, "epoch": 0.8788215842477078, "flos": 72507967914240.0, "grad_norm": 0.8107854779223704, "language_loss": 0.57991934, "learning_rate": 1.5199883662588953e-07, "loss": 0.60001856, "num_input_tokens_seen": 315255075, "router_z_loss_clip": 0.02148438, "router_z_loss_mlp": 0.06542969, "step": 14617, "time_per_iteration": 3.139739513397217 }, { "auxiliary_loss_clip": 0.01049279, "auxiliary_loss_mlp": 0.01033067, "balance_loss_clip": 1.01162148, "balance_loss_mlp": 1.01525807, "epoch": 0.8788817075003758, "flos": 24826283291520.0, "grad_norm": 1.7692779425853984, "language_loss": 0.84165281, "learning_rate": 1.5184994499865987e-07, "loss": 0.86247623, "num_input_tokens_seen": 315273995, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.33984375, "step": 14618, "time_per_iteration": 5.254082202911377 }, { "auxiliary_loss_clip": 0.01048742, "auxiliary_loss_mlp": 0.01038928, "balance_loss_clip": 1.01688647, "balance_loss_mlp": 1.01550448, "epoch": 0.8789418307530438, "flos": 22637647981440.0, "grad_norm": 1.843018274050604, "language_loss": 0.70210838, "learning_rate": 1.5170112345346598e-07, "loss": 0.72298503, "num_input_tokens_seen": 315294485, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.33203125, "step": 14619, "time_per_iteration": 2.4150447845458984 }, { "auxiliary_loss_clip": 0.01052496, "auxiliary_loss_mlp": 0.01037105, "balance_loss_clip": 1.01383507, "balance_loss_mlp": 1.01588058, "epoch": 0.8790019540057117, "flos": 19784895189120.0, "grad_norm": 1.7025136876897844, "language_loss": 0.78141689, "learning_rate": 1.5155237199595016e-07, "loss": 0.80231285, "num_input_tokens_seen": 315310420, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3671875, "step": 14620, "time_per_iteration": 2.364327907562256 }, { "auxiliary_loss_clip": 0.01052591, "auxiliary_loss_mlp": 0.01039655, "balance_loss_clip": 1.01484752, "balance_loss_mlp": 1.01605034, "epoch": 0.8790620772583797, "flos": 20228745703680.0, "grad_norm": 1.6534031541445695, "language_loss": 0.80833733, "learning_rate": 1.514036906317542e-07, "loss": 0.82925975, "num_input_tokens_seen": 315330110, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36523438, "step": 14621, "time_per_iteration": 2.3962936401367188 }, { "auxiliary_loss_clip": 0.01053267, "auxiliary_loss_mlp": 0.01039325, "balance_loss_clip": 1.01547146, "balance_loss_mlp": 1.0158186, "epoch": 0.8791222005110476, "flos": 24129731289600.0, "grad_norm": 1.725273771530116, "language_loss": 0.67544448, "learning_rate": 1.5125507936651506e-07, "loss": 0.69637042, "num_input_tokens_seen": 315350080, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.375, "step": 14622, "time_per_iteration": 2.4168307781219482 }, { "auxiliary_loss_clip": 0.01050569, "auxiliary_loss_mlp": 0.01036207, "balance_loss_clip": 1.01407015, "balance_loss_mlp": 1.0158633, "epoch": 0.8791823237637156, "flos": 21613191690240.0, "grad_norm": 1.958513423376429, "language_loss": 0.73783863, "learning_rate": 1.511065382058687e-07, "loss": 0.75870633, "num_input_tokens_seen": 315366360, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34765625, "step": 14623, "time_per_iteration": 2.3885605335235596 }, { "auxiliary_loss_clip": 0.01048686, "auxiliary_loss_mlp": 0.01035408, "balance_loss_clip": 1.01507115, "balance_loss_mlp": 1.0140264, "epoch": 0.8792424470163835, "flos": 24242256201600.0, "grad_norm": 1.6136285316277603, "language_loss": 0.79784828, "learning_rate": 1.5095806715544801e-07, "loss": 0.81868923, "num_input_tokens_seen": 315385890, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.34570312, "step": 14624, "time_per_iteration": 2.384242057800293 }, { "auxiliary_loss_clip": 0.01051661, "auxiliary_loss_mlp": 0.01038171, "balance_loss_clip": 1.01341164, "balance_loss_mlp": 1.01544905, "epoch": 0.8793025702690516, "flos": 24892234583040.0, "grad_norm": 2.42789489661739, "language_loss": 0.81030399, "learning_rate": 1.5080966622088265e-07, "loss": 0.83120227, "num_input_tokens_seen": 315403400, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36328125, "step": 14625, "time_per_iteration": 2.3979785442352295 }, { "auxiliary_loss_clip": 0.01049851, "auxiliary_loss_mlp": 0.01036691, "balance_loss_clip": 1.01371908, "balance_loss_mlp": 1.0157392, "epoch": 0.8793626935217195, "flos": 25371975841920.0, "grad_norm": 1.497574969020073, "language_loss": 0.74497068, "learning_rate": 1.5066133540779967e-07, "loss": 0.76583612, "num_input_tokens_seen": 315423670, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.34179688, "step": 14626, "time_per_iteration": 2.4881114959716797 }, { "auxiliary_loss_clip": 0.01052014, "auxiliary_loss_mlp": 0.01043998, "balance_loss_clip": 1.01890397, "balance_loss_mlp": 1.01526713, "epoch": 0.8794228167743875, "flos": 34676573552640.0, "grad_norm": 1.7821006235267698, "language_loss": 0.72075039, "learning_rate": 1.505130747218246e-07, "loss": 0.74171054, "num_input_tokens_seen": 315446265, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3671875, "step": 14627, "time_per_iteration": 2.51969313621521 }, { "auxiliary_loss_clip": 0.01051765, "auxiliary_loss_mlp": 0.01036811, "balance_loss_clip": 1.01341021, "balance_loss_mlp": 1.0161643, "epoch": 0.8794829400270555, "flos": 19462995653760.0, "grad_norm": 1.6852743405212474, "language_loss": 0.7307049, "learning_rate": 1.5036488416857873e-07, "loss": 0.75159073, "num_input_tokens_seen": 315464655, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35546875, "step": 14628, "time_per_iteration": 2.382138967514038 }, { "auxiliary_loss_clip": 0.01052383, "auxiliary_loss_mlp": 0.0103826, "balance_loss_clip": 1.01440644, "balance_loss_mlp": 1.01642942, "epoch": 0.8795430632797234, "flos": 15230509908480.0, "grad_norm": 3.1925889429096133, "language_loss": 0.70181817, "learning_rate": 1.5021676375368175e-07, "loss": 0.72272456, "num_input_tokens_seen": 315481090, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.359375, "step": 14629, "time_per_iteration": 2.326552629470825 }, { "auxiliary_loss_clip": 0.01048011, "auxiliary_loss_mlp": 0.0103304, "balance_loss_clip": 1.01375222, "balance_loss_mlp": 1.01472783, "epoch": 0.8796031865323914, "flos": 27743521098240.0, "grad_norm": 1.477212248349002, "language_loss": 0.69722068, "learning_rate": 1.5006871348275053e-07, "loss": 0.71803117, "num_input_tokens_seen": 315502010, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.33203125, "step": 14630, "time_per_iteration": 2.431384563446045 }, { "auxiliary_loss_clip": 0.01049563, "auxiliary_loss_mlp": 0.01043413, "balance_loss_clip": 1.01959467, "balance_loss_mlp": 1.01495743, "epoch": 0.8796633097850594, "flos": 31284063141120.0, "grad_norm": 1.4617566611865997, "language_loss": 0.74890769, "learning_rate": 1.499207333613999e-07, "loss": 0.76983744, "num_input_tokens_seen": 315523040, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.34570312, "step": 14631, "time_per_iteration": 2.4536116123199463 }, { "auxiliary_loss_clip": 0.01049435, "auxiliary_loss_mlp": 0.0103657, "balance_loss_clip": 1.01407504, "balance_loss_mlp": 1.01510763, "epoch": 0.8797234330377274, "flos": 24242011822080.0, "grad_norm": 1.9741286991475018, "language_loss": 0.7006917, "learning_rate": 1.4977282339523954e-07, "loss": 0.72155178, "num_input_tokens_seen": 315541865, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34375, "step": 14632, "time_per_iteration": 3.8044238090515137 }, { "auxiliary_loss_clip": 0.01051942, "auxiliary_loss_mlp": 0.01036475, "balance_loss_clip": 1.01493359, "balance_loss_mlp": 1.01691949, "epoch": 0.8797835562903953, "flos": 24166355172480.0, "grad_norm": 1.9216362738889932, "language_loss": 0.65643287, "learning_rate": 1.4962498358987929e-07, "loss": 0.67731702, "num_input_tokens_seen": 315561470, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.3515625, "step": 14633, "time_per_iteration": 2.3925583362579346 }, { "auxiliary_loss_clip": 0.01051569, "auxiliary_loss_mlp": 0.01038081, "balance_loss_clip": 1.01481128, "balance_loss_mlp": 1.01610279, "epoch": 0.8798436795430633, "flos": 19283576843520.0, "grad_norm": 1.5115313559840258, "language_loss": 0.85194314, "learning_rate": 1.4947721395092528e-07, "loss": 0.87283969, "num_input_tokens_seen": 315583140, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35546875, "step": 14634, "time_per_iteration": 2.411674737930298 }, { "auxiliary_loss_clip": 0.01052075, "auxiliary_loss_mlp": 0.01044987, "balance_loss_clip": 1.01971459, "balance_loss_mlp": 1.0155766, "epoch": 0.8799038027957312, "flos": 28178259747840.0, "grad_norm": 1.7468707314417613, "language_loss": 0.80740869, "learning_rate": 1.4932951448398056e-07, "loss": 0.82837933, "num_input_tokens_seen": 315601935, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36523438, "step": 14635, "time_per_iteration": 2.4323837757110596 }, { "auxiliary_loss_clip": 0.01050131, "auxiliary_loss_mlp": 0.01033928, "balance_loss_clip": 1.01192188, "balance_loss_mlp": 1.01516342, "epoch": 0.8799639260483992, "flos": 24643547902080.0, "grad_norm": 1.676121815361769, "language_loss": 0.65953028, "learning_rate": 1.4918188519464648e-07, "loss": 0.68037093, "num_input_tokens_seen": 315619995, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34960938, "step": 14636, "time_per_iteration": 2.3979039192199707 }, { "auxiliary_loss_clip": 0.01052724, "auxiliary_loss_mlp": 0.0103812, "balance_loss_clip": 1.01338375, "balance_loss_mlp": 1.01608467, "epoch": 0.8800240493010671, "flos": 22199383284480.0, "grad_norm": 2.718790377766819, "language_loss": 0.71030664, "learning_rate": 1.4903432608852074e-07, "loss": 0.73121512, "num_input_tokens_seen": 315637895, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3671875, "step": 14637, "time_per_iteration": 2.3874804973602295 }, { "auxiliary_loss_clip": 0.01052782, "auxiliary_loss_mlp": 0.01037423, "balance_loss_clip": 1.01643085, "balance_loss_mlp": 1.01729512, "epoch": 0.8800841725537352, "flos": 14245226029440.0, "grad_norm": 1.9078911352764052, "language_loss": 0.67874491, "learning_rate": 1.4888683717119843e-07, "loss": 0.69964701, "num_input_tokens_seen": 315655520, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.35546875, "step": 14638, "time_per_iteration": 2.3331735134124756 }, { "auxiliary_loss_clip": 0.01050775, "auxiliary_loss_mlp": 0.01035477, "balance_loss_clip": 1.01314902, "balance_loss_mlp": 1.01499021, "epoch": 0.8801442958064031, "flos": 37414252903680.0, "grad_norm": 1.7899782009784433, "language_loss": 0.59170234, "learning_rate": 1.4873941844827286e-07, "loss": 0.61256486, "num_input_tokens_seen": 315678955, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35742188, "step": 14639, "time_per_iteration": 2.516568183898926 }, { "auxiliary_loss_clip": 0.01052193, "auxiliary_loss_mlp": 0.01035974, "balance_loss_clip": 1.01326501, "balance_loss_mlp": 1.01601803, "epoch": 0.8802044190590711, "flos": 25046131322880.0, "grad_norm": 1.8971500601668443, "language_loss": 0.74952304, "learning_rate": 1.4859206992533402e-07, "loss": 0.7704047, "num_input_tokens_seen": 315700360, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36132812, "step": 14640, "time_per_iteration": 2.4171130657196045 }, { "auxiliary_loss_clip": 0.01053827, "auxiliary_loss_mlp": 0.0103965, "balance_loss_clip": 1.0149262, "balance_loss_mlp": 1.0169301, "epoch": 0.8802645423117391, "flos": 24132733666560.0, "grad_norm": 1.9186908299080092, "language_loss": 0.70992094, "learning_rate": 1.4844479160796985e-07, "loss": 0.7308557, "num_input_tokens_seen": 315719270, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.36914062, "step": 14641, "time_per_iteration": 2.4263501167297363 }, { "auxiliary_loss_clip": 0.01051999, "auxiliary_loss_mlp": 0.01039125, "balance_loss_clip": 1.01556897, "balance_loss_mlp": 1.01586628, "epoch": 0.880324665564407, "flos": 17930238744960.0, "grad_norm": 2.0801872974386564, "language_loss": 0.86634523, "learning_rate": 1.4829758350176457e-07, "loss": 0.88725644, "num_input_tokens_seen": 315737425, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36132812, "step": 14642, "time_per_iteration": 2.3609280586242676 }, { "auxiliary_loss_clip": 0.01052302, "auxiliary_loss_mlp": 0.01039803, "balance_loss_clip": 1.01623487, "balance_loss_mlp": 1.01678514, "epoch": 0.880384788817075, "flos": 21286579121280.0, "grad_norm": 1.6321026760155928, "language_loss": 0.79960918, "learning_rate": 1.4815044561230038e-07, "loss": 0.8205303, "num_input_tokens_seen": 315755725, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.35546875, "step": 14643, "time_per_iteration": 2.3718738555908203 }, { "auxiliary_loss_clip": 0.01047577, "auxiliary_loss_mlp": 0.01031469, "balance_loss_clip": 1.01024973, "balance_loss_mlp": 1.01444125, "epoch": 0.880444912069743, "flos": 12457672951680.0, "grad_norm": 1.5451584873944582, "language_loss": 0.73984587, "learning_rate": 1.4800337794515705e-07, "loss": 0.76063633, "num_input_tokens_seen": 315773835, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.33007812, "step": 14644, "time_per_iteration": 2.375429391860962 }, { "auxiliary_loss_clip": 0.01052189, "auxiliary_loss_mlp": 0.01036179, "balance_loss_clip": 1.01327872, "balance_loss_mlp": 1.01619124, "epoch": 0.880505035322411, "flos": 13625098727040.0, "grad_norm": 1.9761924518983958, "language_loss": 0.80151892, "learning_rate": 1.47856380505911e-07, "loss": 0.8224026, "num_input_tokens_seen": 315790615, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.359375, "step": 14645, "time_per_iteration": 2.3687925338745117 }, { "auxiliary_loss_clip": 0.01049847, "auxiliary_loss_mlp": 0.01040288, "balance_loss_clip": 1.01714981, "balance_loss_mlp": 1.01551723, "epoch": 0.8805651585750789, "flos": 23182013900160.0, "grad_norm": 1.976120855431325, "language_loss": 0.65226549, "learning_rate": 1.477094533001364e-07, "loss": 0.67316681, "num_input_tokens_seen": 315811010, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.34375, "step": 14646, "time_per_iteration": 2.4060981273651123 }, { "auxiliary_loss_clip": 0.0105507, "auxiliary_loss_mlp": 0.01041295, "balance_loss_clip": 1.01517665, "balance_loss_mlp": 1.0168494, "epoch": 0.8806252818277469, "flos": 14902116860160.0, "grad_norm": 2.400269466136488, "language_loss": 0.79461151, "learning_rate": 1.475625963334055e-07, "loss": 0.81557518, "num_input_tokens_seen": 315828130, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3828125, "step": 14647, "time_per_iteration": 2.3389861583709717 }, { "auxiliary_loss_clip": 0.01049218, "auxiliary_loss_mlp": 0.01031902, "balance_loss_clip": 1.01163614, "balance_loss_mlp": 1.0157311, "epoch": 0.8806854050804148, "flos": 17638225200000.0, "grad_norm": 2.0125180049269926, "language_loss": 0.76866156, "learning_rate": 1.4741580961128652e-07, "loss": 0.7894727, "num_input_tokens_seen": 315844900, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.33398438, "step": 14648, "time_per_iteration": 3.6478703022003174 }, { "auxiliary_loss_clip": 0.01049628, "auxiliary_loss_mlp": 0.01034231, "balance_loss_clip": 1.0126543, "balance_loss_mlp": 1.01422679, "epoch": 0.8807455283330828, "flos": 25331372064000.0, "grad_norm": 1.6080929328129512, "language_loss": 0.6663149, "learning_rate": 1.4726909313934522e-07, "loss": 0.68715352, "num_input_tokens_seen": 315863745, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.35351562, "step": 14649, "time_per_iteration": 2.41279673576355 }, { "auxiliary_loss_clip": 0.01052058, "auxiliary_loss_mlp": 0.01040806, "balance_loss_clip": 1.01683319, "balance_loss_mlp": 1.01696026, "epoch": 0.8808056515857507, "flos": 25263989406720.0, "grad_norm": 1.3329406428238044, "language_loss": 0.6316244, "learning_rate": 1.4712244692314578e-07, "loss": 0.65255308, "num_input_tokens_seen": 315885765, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3515625, "step": 14650, "time_per_iteration": 2.417815923690796 }, { "auxiliary_loss_clip": 0.01050257, "auxiliary_loss_mlp": 0.01034051, "balance_loss_clip": 1.01258087, "balance_loss_mlp": 1.01643729, "epoch": 0.8808657748384188, "flos": 26577631422720.0, "grad_norm": 1.4319901573395113, "language_loss": 0.73330861, "learning_rate": 1.4697587096824914e-07, "loss": 0.7541517, "num_input_tokens_seen": 315907340, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.33789062, "step": 14651, "time_per_iteration": 2.4618141651153564 }, { "auxiliary_loss_clip": 0.01053462, "auxiliary_loss_mlp": 0.01038074, "balance_loss_clip": 1.01385057, "balance_loss_mlp": 1.01649153, "epoch": 0.8809258980910867, "flos": 18660237696000.0, "grad_norm": 1.871973372133509, "language_loss": 0.7245481, "learning_rate": 1.4682936528021284e-07, "loss": 0.74546343, "num_input_tokens_seen": 315924935, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37109375, "step": 14652, "time_per_iteration": 2.3723065853118896 }, { "auxiliary_loss_clip": 0.01049086, "auxiliary_loss_mlp": 0.01036199, "balance_loss_clip": 1.01368034, "balance_loss_mlp": 1.01459944, "epoch": 0.8809860213437547, "flos": 19791249056640.0, "grad_norm": 6.689570025707998, "language_loss": 0.76491225, "learning_rate": 1.4668292986459286e-07, "loss": 0.78576511, "num_input_tokens_seen": 315943165, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34375, "step": 14653, "time_per_iteration": 2.3501734733581543 }, { "auxiliary_loss_clip": 0.01053274, "auxiliary_loss_mlp": 0.01041586, "balance_loss_clip": 1.01880479, "balance_loss_mlp": 1.01666164, "epoch": 0.8810461445964227, "flos": 17893510128000.0, "grad_norm": 1.8188093949543576, "language_loss": 0.73121566, "learning_rate": 1.465365647269421e-07, "loss": 0.75216424, "num_input_tokens_seen": 315961340, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3671875, "step": 14654, "time_per_iteration": 2.3689918518066406 }, { "auxiliary_loss_clip": 0.01051011, "auxiliary_loss_mlp": 0.01038361, "balance_loss_clip": 1.01485336, "balance_loss_mlp": 1.01657224, "epoch": 0.8811062678490906, "flos": 29162775576960.0, "grad_norm": 2.0620574642171587, "language_loss": 0.72510076, "learning_rate": 1.4639026987281012e-07, "loss": 0.74599451, "num_input_tokens_seen": 315981335, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.34375, "step": 14655, "time_per_iteration": 2.4395909309387207 }, { "auxiliary_loss_clip": 0.01050533, "auxiliary_loss_mlp": 0.01035765, "balance_loss_clip": 1.01305604, "balance_loss_mlp": 1.0154618, "epoch": 0.8811663911017587, "flos": 20337814391040.0, "grad_norm": 1.8744210317203864, "language_loss": 0.82878172, "learning_rate": 1.462440453077449e-07, "loss": 0.84964466, "num_input_tokens_seen": 316001325, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 14656, "time_per_iteration": 2.3804893493652344 }, { "auxiliary_loss_clip": 0.01051011, "auxiliary_loss_mlp": 0.01035212, "balance_loss_clip": 1.01362276, "balance_loss_mlp": 1.01614571, "epoch": 0.8812265143544266, "flos": 25884500734080.0, "grad_norm": 2.1344815841140194, "language_loss": 0.69965732, "learning_rate": 1.460978910372914e-07, "loss": 0.72051948, "num_input_tokens_seen": 316022540, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.34765625, "step": 14657, "time_per_iteration": 3.837470531463623 }, { "auxiliary_loss_clip": 0.01053081, "auxiliary_loss_mlp": 0.0103802, "balance_loss_clip": 1.01483345, "balance_loss_mlp": 1.0172956, "epoch": 0.8812866376070946, "flos": 27194372323200.0, "grad_norm": 1.8517487216035111, "language_loss": 0.84944594, "learning_rate": 1.4595180706699207e-07, "loss": 0.87035704, "num_input_tokens_seen": 316037735, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.359375, "step": 14658, "time_per_iteration": 3.8206231594085693 }, { "auxiliary_loss_clip": 0.01057772, "auxiliary_loss_mlp": 0.01039731, "balance_loss_clip": 1.0142796, "balance_loss_mlp": 1.01898694, "epoch": 0.8813467608597625, "flos": 23806016363520.0, "grad_norm": 2.0338500213524653, "language_loss": 0.78058231, "learning_rate": 1.4580579340238554e-07, "loss": 0.80155736, "num_input_tokens_seen": 316058105, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38671875, "step": 14659, "time_per_iteration": 2.394940137863159 }, { "auxiliary_loss_clip": 0.01051061, "auxiliary_loss_mlp": 0.01035136, "balance_loss_clip": 1.01250982, "balance_loss_mlp": 1.01579213, "epoch": 0.8814068841124305, "flos": 21104402313600.0, "grad_norm": 1.9921804425670369, "language_loss": 0.61636269, "learning_rate": 1.4565985004900894e-07, "loss": 0.63722467, "num_input_tokens_seen": 316074415, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.3515625, "step": 14660, "time_per_iteration": 2.3532655239105225 }, { "auxiliary_loss_clip": 0.0105099, "auxiliary_loss_mlp": 0.01036304, "balance_loss_clip": 1.01180625, "balance_loss_mlp": 1.01545763, "epoch": 0.8814670073650984, "flos": 24715853061120.0, "grad_norm": 1.743900398888105, "language_loss": 0.78071612, "learning_rate": 1.455139770123972e-07, "loss": 0.80158907, "num_input_tokens_seen": 316094405, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.35546875, "step": 14661, "time_per_iteration": 2.403336763381958 }, { "auxiliary_loss_clip": 0.01053497, "auxiliary_loss_mlp": 0.0103833, "balance_loss_clip": 1.01564491, "balance_loss_mlp": 1.01727676, "epoch": 0.8815271306177664, "flos": 22965168245760.0, "grad_norm": 2.428569271424006, "language_loss": 0.77548337, "learning_rate": 1.45368174298081e-07, "loss": 0.79640162, "num_input_tokens_seen": 316113390, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36132812, "step": 14662, "time_per_iteration": 2.3714115619659424 }, { "auxiliary_loss_clip": 0.01050505, "auxiliary_loss_mlp": 0.01034591, "balance_loss_clip": 1.01412249, "balance_loss_mlp": 1.01601386, "epoch": 0.8815872538704344, "flos": 19459155404160.0, "grad_norm": 2.2776404795246683, "language_loss": 0.75055641, "learning_rate": 1.4522244191158929e-07, "loss": 0.77140737, "num_input_tokens_seen": 316131085, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.34570312, "step": 14663, "time_per_iteration": 2.3559348583221436 }, { "auxiliary_loss_clip": 0.01050208, "auxiliary_loss_mlp": 0.01042705, "balance_loss_clip": 1.01980519, "balance_loss_mlp": 1.01623964, "epoch": 0.8816473771231024, "flos": 32155355831040.0, "grad_norm": 1.5845917911179896, "language_loss": 0.70784259, "learning_rate": 1.450767798584489e-07, "loss": 0.72877175, "num_input_tokens_seen": 316151440, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.33984375, "step": 14664, "time_per_iteration": 2.455859422683716 }, { "auxiliary_loss_clip": 0.01050418, "auxiliary_loss_mlp": 0.01036751, "balance_loss_clip": 1.013291, "balance_loss_mlp": 1.01588047, "epoch": 0.8817075003757703, "flos": 19681272673920.0, "grad_norm": 1.539708869698651, "language_loss": 0.81569707, "learning_rate": 1.449311881441828e-07, "loss": 0.83656877, "num_input_tokens_seen": 316170750, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.34570312, "step": 14665, "time_per_iteration": 2.368464469909668 }, { "auxiliary_loss_clip": 0.01052585, "auxiliary_loss_mlp": 0.01038663, "balance_loss_clip": 1.01624012, "balance_loss_mlp": 1.01632977, "epoch": 0.8817676236284383, "flos": 15667727264640.0, "grad_norm": 2.0474168163263826, "language_loss": 0.59539372, "learning_rate": 1.447856667743117e-07, "loss": 0.61630619, "num_input_tokens_seen": 316187265, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.36328125, "step": 14666, "time_per_iteration": 2.357415199279785 }, { "auxiliary_loss_clip": 0.01052752, "auxiliary_loss_mlp": 0.01042369, "balance_loss_clip": 1.01678634, "balance_loss_mlp": 1.01666522, "epoch": 0.8818277468811063, "flos": 17894208355200.0, "grad_norm": 1.9218399007873008, "language_loss": 0.85442472, "learning_rate": 1.4464021575435403e-07, "loss": 0.87537593, "num_input_tokens_seen": 316206555, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.36132812, "step": 14667, "time_per_iteration": 2.394066333770752 }, { "auxiliary_loss_clip": 0.01050194, "auxiliary_loss_mlp": 0.01037666, "balance_loss_clip": 1.0150398, "balance_loss_mlp": 1.01538372, "epoch": 0.8818878701337742, "flos": 18769515851520.0, "grad_norm": 2.145806635719087, "language_loss": 0.63057584, "learning_rate": 1.4449483508982563e-07, "loss": 0.65145445, "num_input_tokens_seen": 316225210, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34765625, "step": 14668, "time_per_iteration": 2.362997055053711 }, { "auxiliary_loss_clip": 0.0104937, "auxiliary_loss_mlp": 0.01031362, "balance_loss_clip": 1.01071525, "balance_loss_mlp": 1.01548839, "epoch": 0.8819479933864423, "flos": 17711333320320.0, "grad_norm": 2.1400383437071757, "language_loss": 0.5858646, "learning_rate": 1.4434952478623918e-07, "loss": 0.60667193, "num_input_tokens_seen": 316242685, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.33984375, "step": 14669, "time_per_iteration": 2.3936922550201416 }, { "auxiliary_loss_clip": 0.01050351, "auxiliary_loss_mlp": 0.01035402, "balance_loss_clip": 1.01417065, "balance_loss_mlp": 1.01585484, "epoch": 0.8820081166391102, "flos": 11727953291520.0, "grad_norm": 2.8966416637383867, "language_loss": 0.72972214, "learning_rate": 1.442042848491043e-07, "loss": 0.75057971, "num_input_tokens_seen": 316260935, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34375, "step": 14670, "time_per_iteration": 2.3401994705200195 }, { "auxiliary_loss_clip": 0.0105006, "auxiliary_loss_mlp": 0.01035473, "balance_loss_clip": 1.01188159, "balance_loss_mlp": 1.01508272, "epoch": 0.8820682398917782, "flos": 27489143865600.0, "grad_norm": 2.2853226442424344, "language_loss": 0.74422979, "learning_rate": 1.44059115283929e-07, "loss": 0.76508516, "num_input_tokens_seen": 316281190, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.34960938, "step": 14671, "time_per_iteration": 2.410701036453247 }, { "auxiliary_loss_clip": 0.01053555, "auxiliary_loss_mlp": 0.01037162, "balance_loss_clip": 1.01065016, "balance_loss_mlp": 1.0156486, "epoch": 0.8821283631444461, "flos": 16872894086400.0, "grad_norm": 2.0566855698135673, "language_loss": 0.85772175, "learning_rate": 1.43914016096218e-07, "loss": 0.87862897, "num_input_tokens_seen": 316297115, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37890625, "step": 14672, "time_per_iteration": 3.7538070678710938 }, { "auxiliary_loss_clip": 0.01049391, "auxiliary_loss_mlp": 0.01031541, "balance_loss_clip": 1.00861669, "balance_loss_mlp": 1.01528728, "epoch": 0.8821884863971141, "flos": 24279787779840.0, "grad_norm": 1.5034026200337745, "language_loss": 0.73692447, "learning_rate": 1.4376898729147336e-07, "loss": 0.75773382, "num_input_tokens_seen": 316318235, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.33984375, "step": 14673, "time_per_iteration": 2.3869614601135254 }, { "auxiliary_loss_clip": 0.01007123, "auxiliary_loss_mlp": 0.01003396, "balance_loss_clip": 1.0013932, "balance_loss_mlp": 1.00062084, "epoch": 0.882248609649782, "flos": 59428734428160.0, "grad_norm": 0.8114662331786209, "language_loss": 0.49533847, "learning_rate": 1.4362402887519487e-07, "loss": 0.51544362, "num_input_tokens_seen": 316384705, "router_z_loss_clip": 0.02001953, "router_z_loss_mlp": 0.06494141, "step": 14674, "time_per_iteration": 3.1172256469726562 }, { "auxiliary_loss_clip": 0.01052695, "auxiliary_loss_mlp": 0.01040995, "balance_loss_clip": 1.01649714, "balance_loss_mlp": 1.01632392, "epoch": 0.88230873290245, "flos": 19936767070080.0, "grad_norm": 1.8987643759859028, "language_loss": 0.77921367, "learning_rate": 1.4347914085287971e-07, "loss": 0.80015063, "num_input_tokens_seen": 316401165, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.36328125, "step": 14675, "time_per_iteration": 2.3450896739959717 }, { "auxiliary_loss_clip": 0.01049249, "auxiliary_loss_mlp": 0.01035826, "balance_loss_clip": 1.01197219, "balance_loss_mlp": 1.01452637, "epoch": 0.882368856155118, "flos": 16361835471360.0, "grad_norm": 1.8177934753532914, "language_loss": 0.8091265, "learning_rate": 1.4333432323002105e-07, "loss": 0.82997721, "num_input_tokens_seen": 316418780, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.34765625, "step": 14676, "time_per_iteration": 2.3451733589172363 }, { "auxiliary_loss_clip": 0.01007444, "auxiliary_loss_mlp": 0.01002793, "balance_loss_clip": 1.00059938, "balance_loss_mlp": 1.00081897, "epoch": 0.882428979407786, "flos": 70590711669120.0, "grad_norm": 0.6968071521777633, "language_loss": 0.549335, "learning_rate": 1.431895760121109e-07, "loss": 0.56943738, "num_input_tokens_seen": 316482030, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.06640625, "step": 14677, "time_per_iteration": 3.1387436389923096 }, { "auxiliary_loss_clip": 0.01049464, "auxiliary_loss_mlp": 0.01035844, "balance_loss_clip": 1.01230037, "balance_loss_mlp": 1.0149684, "epoch": 0.8824891026604539, "flos": 18149318726400.0, "grad_norm": 2.214177729229667, "language_loss": 0.66934329, "learning_rate": 1.4304489920463847e-07, "loss": 0.69019634, "num_input_tokens_seen": 316499175, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.34570312, "step": 14678, "time_per_iteration": 2.3951282501220703 }, { "auxiliary_loss_clip": 0.0105312, "auxiliary_loss_mlp": 0.01033976, "balance_loss_clip": 1.01032472, "balance_loss_mlp": 1.01602066, "epoch": 0.8825492259131219, "flos": 27231554787840.0, "grad_norm": 1.9129401106567923, "language_loss": 0.72074437, "learning_rate": 1.4290029281308936e-07, "loss": 0.7416153, "num_input_tokens_seen": 316519495, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37109375, "step": 14679, "time_per_iteration": 2.416346549987793 }, { "auxiliary_loss_clip": 0.01050591, "auxiliary_loss_mlp": 0.0103441, "balance_loss_clip": 1.01294005, "balance_loss_mlp": 1.0165031, "epoch": 0.8826093491657898, "flos": 22273294366080.0, "grad_norm": 1.6948903633377934, "language_loss": 0.64779937, "learning_rate": 1.4275575684294694e-07, "loss": 0.66864944, "num_input_tokens_seen": 316538180, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.33984375, "step": 14680, "time_per_iteration": 2.384524345397949 }, { "auxiliary_loss_clip": 0.01049866, "auxiliary_loss_mlp": 0.01033083, "balance_loss_clip": 1.01093423, "balance_loss_mlp": 1.01552081, "epoch": 0.8826694724184578, "flos": 14204028758400.0, "grad_norm": 2.399927093588753, "language_loss": 0.78623712, "learning_rate": 1.4261129129969328e-07, "loss": 0.80706662, "num_input_tokens_seen": 316551750, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34375, "step": 14681, "time_per_iteration": 2.311941146850586 }, { "auxiliary_loss_clip": 0.0105255, "auxiliary_loss_mlp": 0.01035676, "balance_loss_clip": 1.01128626, "balance_loss_mlp": 1.01663375, "epoch": 0.8827295956711259, "flos": 20630630897280.0, "grad_norm": 2.238339378226848, "language_loss": 0.74864483, "learning_rate": 1.424668961888047e-07, "loss": 0.76952714, "num_input_tokens_seen": 316570680, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.359375, "step": 14682, "time_per_iteration": 2.3667376041412354 }, { "auxiliary_loss_clip": 0.01053956, "auxiliary_loss_mlp": 0.01037041, "balance_loss_clip": 1.01178026, "balance_loss_mlp": 1.01694298, "epoch": 0.8827897189237938, "flos": 18512136241920.0, "grad_norm": 1.7613403498478073, "language_loss": 0.76048768, "learning_rate": 1.4232257151575765e-07, "loss": 0.78139758, "num_input_tokens_seen": 316588635, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37109375, "step": 14683, "time_per_iteration": 2.3584680557250977 }, { "auxiliary_loss_clip": 0.01051962, "auxiliary_loss_mlp": 0.0103117, "balance_loss_clip": 1.0094142, "balance_loss_mlp": 1.01613927, "epoch": 0.8828498421764618, "flos": 22746297732480.0, "grad_norm": 1.7964198103156148, "language_loss": 0.66651493, "learning_rate": 1.4217831728602492e-07, "loss": 0.68734628, "num_input_tokens_seen": 316607550, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.359375, "step": 14684, "time_per_iteration": 2.3624069690704346 }, { "auxiliary_loss_clip": 0.01049837, "auxiliary_loss_mlp": 0.01037875, "balance_loss_clip": 1.01442611, "balance_loss_mlp": 1.01543307, "epoch": 0.8829099654291297, "flos": 15011499749760.0, "grad_norm": 2.176683169484656, "language_loss": 0.70344186, "learning_rate": 1.4203413350507677e-07, "loss": 0.72431898, "num_input_tokens_seen": 316624460, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.34375, "step": 14685, "time_per_iteration": 2.3525054454803467 }, { "auxiliary_loss_clip": 0.01052362, "auxiliary_loss_mlp": 0.01036375, "balance_loss_clip": 1.01353455, "balance_loss_mlp": 1.01612818, "epoch": 0.8829700886817977, "flos": 16719800307840.0, "grad_norm": 1.9047767798172004, "language_loss": 0.75565302, "learning_rate": 1.418900201783806e-07, "loss": 0.77654034, "num_input_tokens_seen": 316640765, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36328125, "step": 14686, "time_per_iteration": 2.3587822914123535 }, { "auxiliary_loss_clip": 0.01049162, "auxiliary_loss_mlp": 0.01031194, "balance_loss_clip": 1.01012969, "balance_loss_mlp": 1.01504493, "epoch": 0.8830302119344656, "flos": 15262490580480.0, "grad_norm": 1.787811810046663, "language_loss": 0.64587873, "learning_rate": 1.417459773114007e-07, "loss": 0.6666823, "num_input_tokens_seen": 316656120, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34179688, "step": 14687, "time_per_iteration": 3.609645366668701 }, { "auxiliary_loss_clip": 0.01051997, "auxiliary_loss_mlp": 0.01034549, "balance_loss_clip": 1.01169693, "balance_loss_mlp": 1.01634121, "epoch": 0.8830903351871336, "flos": 28616594267520.0, "grad_norm": 1.7653816116382128, "language_loss": 0.69407117, "learning_rate": 1.4160200490959984e-07, "loss": 0.71493661, "num_input_tokens_seen": 316676095, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35546875, "step": 14688, "time_per_iteration": 2.426697254180908 }, { "auxiliary_loss_clip": 0.01049821, "auxiliary_loss_mlp": 0.01030581, "balance_loss_clip": 1.00864613, "balance_loss_mlp": 1.01560903, "epoch": 0.8831504584398016, "flos": 28000377037440.0, "grad_norm": 1.6703653959792895, "language_loss": 0.68168277, "learning_rate": 1.4145810297843697e-07, "loss": 0.70248675, "num_input_tokens_seen": 316696235, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34179688, "step": 14689, "time_per_iteration": 2.442763328552246 }, { "auxiliary_loss_clip": 0.01053745, "auxiliary_loss_mlp": 0.01037656, "balance_loss_clip": 1.01518536, "balance_loss_mlp": 1.01823211, "epoch": 0.8832105816924696, "flos": 26578399472640.0, "grad_norm": 1.3949429825611912, "language_loss": 0.75300866, "learning_rate": 1.4131427152336905e-07, "loss": 0.77392268, "num_input_tokens_seen": 316719680, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35546875, "step": 14690, "time_per_iteration": 2.5216026306152344 }, { "auxiliary_loss_clip": 0.0105328, "auxiliary_loss_mlp": 0.01039728, "balance_loss_clip": 1.01534998, "balance_loss_mlp": 1.01686525, "epoch": 0.8832707049451375, "flos": 24897645843840.0, "grad_norm": 1.4484521300837756, "language_loss": 0.74007756, "learning_rate": 1.4117051054985018e-07, "loss": 0.76100761, "num_input_tokens_seen": 316739830, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36328125, "step": 14691, "time_per_iteration": 2.417863607406616 }, { "auxiliary_loss_clip": 0.01054594, "auxiliary_loss_mlp": 0.01036329, "balance_loss_clip": 1.0104841, "balance_loss_mlp": 1.01661837, "epoch": 0.8833308281978055, "flos": 15450218294400.0, "grad_norm": 1.9498668780143102, "language_loss": 0.53616554, "learning_rate": 1.4102682006333243e-07, "loss": 0.55707479, "num_input_tokens_seen": 316758105, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37890625, "step": 14692, "time_per_iteration": 2.3652570247650146 }, { "auxiliary_loss_clip": 0.01052606, "auxiliary_loss_mlp": 0.01033273, "balance_loss_clip": 1.01020598, "balance_loss_mlp": 1.01648331, "epoch": 0.8833909514504734, "flos": 20300527192320.0, "grad_norm": 3.702451205214102, "language_loss": 0.61470395, "learning_rate": 1.4088320006926346e-07, "loss": 0.63556272, "num_input_tokens_seen": 316777455, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36132812, "step": 14693, "time_per_iteration": 2.35591459274292 }, { "auxiliary_loss_clip": 0.01049797, "auxiliary_loss_mlp": 0.0103004, "balance_loss_clip": 1.00978625, "balance_loss_mlp": 1.01666665, "epoch": 0.8834510747031414, "flos": 20373041819520.0, "grad_norm": 1.4888641858912761, "language_loss": 0.76384771, "learning_rate": 1.407396505730898e-07, "loss": 0.78464603, "num_input_tokens_seen": 316796300, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.33203125, "step": 14694, "time_per_iteration": 2.3568222522735596 }, { "auxiliary_loss_clip": 0.01052312, "auxiliary_loss_mlp": 0.01035224, "balance_loss_clip": 1.01148927, "balance_loss_mlp": 1.01519537, "epoch": 0.8835111979558095, "flos": 29750747650560.0, "grad_norm": 1.8730629576668674, "language_loss": 0.73703134, "learning_rate": 1.4059617158025527e-07, "loss": 0.75790668, "num_input_tokens_seen": 316819090, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.37109375, "step": 14695, "time_per_iteration": 2.423276424407959 }, { "auxiliary_loss_clip": 0.01047532, "auxiliary_loss_mlp": 0.01029117, "balance_loss_clip": 1.00857687, "balance_loss_mlp": 1.01509774, "epoch": 0.8835713212084774, "flos": 24133396982400.0, "grad_norm": 1.6818178710568183, "language_loss": 0.81305408, "learning_rate": 1.404527630961998e-07, "loss": 0.83382058, "num_input_tokens_seen": 316839250, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.32421875, "step": 14696, "time_per_iteration": 2.375077486038208 }, { "auxiliary_loss_clip": 0.01051062, "auxiliary_loss_mlp": 0.01038977, "balance_loss_clip": 1.01529026, "balance_loss_mlp": 1.01603413, "epoch": 0.8836314444611454, "flos": 27671111205120.0, "grad_norm": 1.5379292139354235, "language_loss": 0.76208556, "learning_rate": 1.4030942512636236e-07, "loss": 0.78298599, "num_input_tokens_seen": 316861315, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.34960938, "step": 14697, "time_per_iteration": 3.811378240585327 }, { "auxiliary_loss_clip": 0.01051016, "auxiliary_loss_mlp": 0.01033767, "balance_loss_clip": 1.01105762, "balance_loss_mlp": 1.01660299, "epoch": 0.8836915677138133, "flos": 16836025824000.0, "grad_norm": 2.0535885235468787, "language_loss": 0.73154414, "learning_rate": 1.401661576761779e-07, "loss": 0.75239205, "num_input_tokens_seen": 316879325, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34375, "step": 14698, "time_per_iteration": 3.6180996894836426 }, { "auxiliary_loss_clip": 0.0100735, "auxiliary_loss_mlp": 0.01003856, "balance_loss_clip": 1.00167501, "balance_loss_mlp": 1.00084519, "epoch": 0.8837516909664813, "flos": 69306920732160.0, "grad_norm": 0.7989470629199014, "language_loss": 0.53810358, "learning_rate": 1.4002296075107856e-07, "loss": 0.55821562, "num_input_tokens_seen": 316936425, "router_z_loss_clip": 0.02185059, "router_z_loss_mlp": 0.06494141, "step": 14699, "time_per_iteration": 3.01292085647583 }, { "auxiliary_loss_clip": 0.01053815, "auxiliary_loss_mlp": 0.01034425, "balance_loss_clip": 1.01017833, "balance_loss_mlp": 1.01712084, "epoch": 0.8838118142191492, "flos": 21323656851840.0, "grad_norm": 1.9141900783580514, "language_loss": 0.78101885, "learning_rate": 1.3987983435649508e-07, "loss": 0.80190128, "num_input_tokens_seen": 316956360, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3671875, "step": 14700, "time_per_iteration": 2.365626573562622 }, { "auxiliary_loss_clip": 0.01049466, "auxiliary_loss_mlp": 0.01031688, "balance_loss_clip": 1.01040888, "balance_loss_mlp": 1.01542211, "epoch": 0.8838719374718172, "flos": 21469489067520.0, "grad_norm": 2.359922764632413, "language_loss": 0.74135756, "learning_rate": 1.3973677849785494e-07, "loss": 0.76216912, "num_input_tokens_seen": 316975295, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.33984375, "step": 14701, "time_per_iteration": 2.3747668266296387 }, { "auxiliary_loss_clip": 0.01053228, "auxiliary_loss_mlp": 0.01042608, "balance_loss_clip": 1.01855171, "balance_loss_mlp": 1.01667762, "epoch": 0.8839320607244852, "flos": 26467724862720.0, "grad_norm": 3.2824409452121195, "language_loss": 0.72016478, "learning_rate": 1.3959379318058262e-07, "loss": 0.74112308, "num_input_tokens_seen": 316994520, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 14702, "time_per_iteration": 2.44448184967041 }, { "auxiliary_loss_clip": 0.0105161, "auxiliary_loss_mlp": 0.01038655, "balance_loss_clip": 1.01390672, "balance_loss_mlp": 1.01631165, "epoch": 0.8839921839771532, "flos": 45220553084160.0, "grad_norm": 1.5725787898043782, "language_loss": 0.72624266, "learning_rate": 1.3945087841010006e-07, "loss": 0.7471453, "num_input_tokens_seen": 317018095, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3515625, "step": 14703, "time_per_iteration": 2.5852670669555664 }, { "auxiliary_loss_clip": 0.01050646, "auxiliary_loss_mlp": 0.01033965, "balance_loss_clip": 1.01414037, "balance_loss_mlp": 1.0164783, "epoch": 0.8840523072298211, "flos": 20005965118080.0, "grad_norm": 1.6828188912172477, "language_loss": 0.67782617, "learning_rate": 1.3930803419182645e-07, "loss": 0.69867229, "num_input_tokens_seen": 317035755, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.34179688, "step": 14704, "time_per_iteration": 2.3586885929107666 }, { "auxiliary_loss_clip": 0.01048882, "auxiliary_loss_mlp": 0.01033918, "balance_loss_clip": 1.01342618, "balance_loss_mlp": 1.01516843, "epoch": 0.8841124304824891, "flos": 24424851945600.0, "grad_norm": 1.6599694063504173, "language_loss": 0.71256185, "learning_rate": 1.3916526053117905e-07, "loss": 0.73338985, "num_input_tokens_seen": 317055765, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.3359375, "step": 14705, "time_per_iteration": 2.4272680282592773 }, { "auxiliary_loss_clip": 0.0105052, "auxiliary_loss_mlp": 0.01035498, "balance_loss_clip": 1.01505423, "balance_loss_mlp": 1.01620805, "epoch": 0.884172553735157, "flos": 31283295091200.0, "grad_norm": 1.588152852573069, "language_loss": 0.71838397, "learning_rate": 1.3902255743357104e-07, "loss": 0.7392441, "num_input_tokens_seen": 317077955, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.34375, "step": 14706, "time_per_iteration": 2.4467341899871826 }, { "auxiliary_loss_clip": 0.01049402, "auxiliary_loss_mlp": 0.01033219, "balance_loss_clip": 1.01127243, "balance_loss_mlp": 1.01464963, "epoch": 0.884232676987825, "flos": 21390271459200.0, "grad_norm": 2.4598819558046205, "language_loss": 0.75650251, "learning_rate": 1.3887992490441413e-07, "loss": 0.77732873, "num_input_tokens_seen": 317095825, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34765625, "step": 14707, "time_per_iteration": 2.356515407562256 }, { "auxiliary_loss_clip": 0.01007273, "auxiliary_loss_mlp": 0.01001697, "balance_loss_clip": 0.99965864, "balance_loss_mlp": 1.0007714, "epoch": 0.8842928002404931, "flos": 57908582386560.0, "grad_norm": 0.795656255042947, "language_loss": 0.60505176, "learning_rate": 1.387373629491173e-07, "loss": 0.62514138, "num_input_tokens_seen": 317152875, "router_z_loss_clip": 0.02038574, "router_z_loss_mlp": 0.06494141, "step": 14708, "time_per_iteration": 2.85077166557312 }, { "auxiliary_loss_clip": 0.0104803, "auxiliary_loss_mlp": 0.01037045, "balance_loss_clip": 1.01656508, "balance_loss_mlp": 1.01502299, "epoch": 0.884352923493161, "flos": 41462292602880.0, "grad_norm": 1.6374904675139312, "language_loss": 0.67917585, "learning_rate": 1.3859487157308625e-07, "loss": 0.70002663, "num_input_tokens_seen": 317176725, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.33007812, "step": 14709, "time_per_iteration": 2.5372962951660156 }, { "auxiliary_loss_clip": 0.01053454, "auxiliary_loss_mlp": 0.01041869, "balance_loss_clip": 1.01723993, "balance_loss_mlp": 1.01567674, "epoch": 0.884413046745829, "flos": 46539327070080.0, "grad_norm": 1.5333441710305382, "language_loss": 0.63940382, "learning_rate": 1.3845245078172373e-07, "loss": 0.660357, "num_input_tokens_seen": 317206880, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37695312, "step": 14710, "time_per_iteration": 2.621408224105835 }, { "auxiliary_loss_clip": 0.01049688, "auxiliary_loss_mlp": 0.01030648, "balance_loss_clip": 1.00985754, "balance_loss_mlp": 1.01560938, "epoch": 0.8844731699984969, "flos": 19134323314560.0, "grad_norm": 2.3491182491360996, "language_loss": 0.64527941, "learning_rate": 1.38310100580431e-07, "loss": 0.6660828, "num_input_tokens_seen": 317224135, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.33984375, "step": 14711, "time_per_iteration": 3.8813204765319824 }, { "auxiliary_loss_clip": 0.01052795, "auxiliary_loss_mlp": 0.01034869, "balance_loss_clip": 1.01097965, "balance_loss_mlp": 1.01516509, "epoch": 0.8845332932511649, "flos": 23259451029120.0, "grad_norm": 2.186700473621865, "language_loss": 0.76560366, "learning_rate": 1.38167820974606e-07, "loss": 0.78648031, "num_input_tokens_seen": 317244505, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37695312, "step": 14712, "time_per_iteration": 2.3994243144989014 }, { "auxiliary_loss_clip": 0.01050583, "auxiliary_loss_mlp": 0.01032001, "balance_loss_clip": 1.01016128, "balance_loss_mlp": 1.0152328, "epoch": 0.8845934165038328, "flos": 17563685713920.0, "grad_norm": 2.158509219939818, "language_loss": 0.8210606, "learning_rate": 1.3802561196964368e-07, "loss": 0.84188646, "num_input_tokens_seen": 317257830, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.35351562, "step": 14713, "time_per_iteration": 2.3216567039489746 }, { "auxiliary_loss_clip": 0.0105115, "auxiliary_loss_mlp": 0.01035435, "balance_loss_clip": 1.01248717, "balance_loss_mlp": 1.01582003, "epoch": 0.8846535397565009, "flos": 27484640300160.0, "grad_norm": 1.4340263077516178, "language_loss": 0.56665593, "learning_rate": 1.3788347357093688e-07, "loss": 0.58752179, "num_input_tokens_seen": 317278430, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.3515625, "step": 14714, "time_per_iteration": 2.4429399967193604 }, { "auxiliary_loss_clip": 0.01050037, "auxiliary_loss_mlp": 0.01035855, "balance_loss_clip": 1.01343179, "balance_loss_mlp": 1.01549053, "epoch": 0.8847136630091688, "flos": 28760331801600.0, "grad_norm": 2.6361253651312553, "language_loss": 0.75395221, "learning_rate": 1.377414057838755e-07, "loss": 0.77481115, "num_input_tokens_seen": 317295970, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34570312, "step": 14715, "time_per_iteration": 2.4262616634368896 }, { "auxiliary_loss_clip": 0.01050731, "auxiliary_loss_mlp": 0.01037837, "balance_loss_clip": 1.01589096, "balance_loss_mlp": 1.01564837, "epoch": 0.8847737862618368, "flos": 23475807924480.0, "grad_norm": 1.5739359227451473, "language_loss": 0.75882447, "learning_rate": 1.375994086138461e-07, "loss": 0.77971023, "num_input_tokens_seen": 317316185, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.3515625, "step": 14716, "time_per_iteration": 2.377725124359131 }, { "auxiliary_loss_clip": 0.0105159, "auxiliary_loss_mlp": 0.01036231, "balance_loss_clip": 1.01428473, "balance_loss_mlp": 1.01720953, "epoch": 0.8848339095145047, "flos": 18659888582400.0, "grad_norm": 2.0499895720712336, "language_loss": 0.72050154, "learning_rate": 1.3745748206623397e-07, "loss": 0.74137974, "num_input_tokens_seen": 317333275, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34375, "step": 14717, "time_per_iteration": 2.3739850521087646 }, { "auxiliary_loss_clip": 0.01048642, "auxiliary_loss_mlp": 0.01034066, "balance_loss_clip": 1.01408613, "balance_loss_mlp": 1.01594162, "epoch": 0.8848940327671727, "flos": 32268928083840.0, "grad_norm": 1.9122782425007487, "language_loss": 0.7586118, "learning_rate": 1.373156261464208e-07, "loss": 0.77943885, "num_input_tokens_seen": 317351245, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.328125, "step": 14718, "time_per_iteration": 2.440760850906372 }, { "auxiliary_loss_clip": 0.01053039, "auxiliary_loss_mlp": 0.01035952, "balance_loss_clip": 1.01301634, "balance_loss_mlp": 1.01659489, "epoch": 0.8849541560198406, "flos": 24020767336320.0, "grad_norm": 1.4719794843831806, "language_loss": 0.79551363, "learning_rate": 1.3717384085978602e-07, "loss": 0.81640351, "num_input_tokens_seen": 317370740, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.36328125, "step": 14719, "time_per_iteration": 2.448371410369873 }, { "auxiliary_loss_clip": 0.01051962, "auxiliary_loss_mlp": 0.01034074, "balance_loss_clip": 1.01079273, "balance_loss_mlp": 1.01623249, "epoch": 0.8850142792725086, "flos": 16872126036480.0, "grad_norm": 1.6746753510534922, "language_loss": 0.72914875, "learning_rate": 1.3703212621170579e-07, "loss": 0.75000918, "num_input_tokens_seen": 317388370, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35742188, "step": 14720, "time_per_iteration": 2.321080446243286 }, { "auxiliary_loss_clip": 0.01052599, "auxiliary_loss_mlp": 0.01038792, "balance_loss_clip": 1.01577258, "balance_loss_mlp": 1.01611102, "epoch": 0.8850744025251767, "flos": 24022931840640.0, "grad_norm": 1.9529949028842144, "language_loss": 0.83635426, "learning_rate": 1.3689048220755383e-07, "loss": 0.85726821, "num_input_tokens_seen": 317407390, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36523438, "step": 14721, "time_per_iteration": 2.387066602706909 }, { "auxiliary_loss_clip": 0.01050375, "auxiliary_loss_mlp": 0.01034073, "balance_loss_clip": 1.00934911, "balance_loss_mlp": 1.01481175, "epoch": 0.8851345257778446, "flos": 47953868515200.0, "grad_norm": 1.9481566897519562, "language_loss": 0.63804162, "learning_rate": 1.3674890885270186e-07, "loss": 0.65888608, "num_input_tokens_seen": 317430825, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.35546875, "step": 14722, "time_per_iteration": 2.5906805992126465 }, { "auxiliary_loss_clip": 0.01052908, "auxiliary_loss_mlp": 0.01034638, "balance_loss_clip": 1.01145184, "balance_loss_mlp": 1.01658392, "epoch": 0.8851946490305126, "flos": 36609539909760.0, "grad_norm": 1.979313157385034, "language_loss": 0.70039159, "learning_rate": 1.3660740615251754e-07, "loss": 0.7212671, "num_input_tokens_seen": 317451905, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.36328125, "step": 14723, "time_per_iteration": 2.50056529045105 }, { "auxiliary_loss_clip": 0.01052215, "auxiliary_loss_mlp": 0.01032693, "balance_loss_clip": 1.01124787, "balance_loss_mlp": 1.01702714, "epoch": 0.8852547722831805, "flos": 21543155769600.0, "grad_norm": 2.243513411189543, "language_loss": 0.7863248, "learning_rate": 1.3646597411236703e-07, "loss": 0.80717397, "num_input_tokens_seen": 317470030, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.3515625, "step": 14724, "time_per_iteration": 2.374720335006714 }, { "auxiliary_loss_clip": 0.01007401, "auxiliary_loss_mlp": 0.01004621, "balance_loss_clip": 1.00277317, "balance_loss_mlp": 1.00095606, "epoch": 0.8853148955358485, "flos": 63056141533440.0, "grad_norm": 0.7989246139628524, "language_loss": 0.59033334, "learning_rate": 1.363246127376143e-07, "loss": 0.61045361, "num_input_tokens_seen": 317527460, "router_z_loss_clip": 0.01843262, "router_z_loss_mlp": 0.06445312, "step": 14725, "time_per_iteration": 2.890690803527832 }, { "auxiliary_loss_clip": 0.01053474, "auxiliary_loss_mlp": 0.01039917, "balance_loss_clip": 1.01595581, "balance_loss_mlp": 1.01566041, "epoch": 0.8853750187885164, "flos": 18148864878720.0, "grad_norm": 2.0890331679198177, "language_loss": 0.70723808, "learning_rate": 1.3618332203361837e-07, "loss": 0.72817194, "num_input_tokens_seen": 317544070, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37890625, "step": 14726, "time_per_iteration": 2.3660340309143066 }, { "auxiliary_loss_clip": 0.01050444, "auxiliary_loss_mlp": 0.01044051, "balance_loss_clip": 1.02171135, "balance_loss_mlp": 1.01638317, "epoch": 0.8854351420411845, "flos": 39568882682880.0, "grad_norm": 1.9561295117410014, "language_loss": 0.70124018, "learning_rate": 1.3604210200573785e-07, "loss": 0.72218513, "num_input_tokens_seen": 317570275, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.33984375, "step": 14727, "time_per_iteration": 3.797562599182129 }, { "auxiliary_loss_clip": 0.01052469, "auxiliary_loss_mlp": 0.01036089, "balance_loss_clip": 1.01334381, "balance_loss_mlp": 1.0171572, "epoch": 0.8854952652938524, "flos": 23768170583040.0, "grad_norm": 1.8164049728761797, "language_loss": 0.71159363, "learning_rate": 1.3590095265932733e-07, "loss": 0.73247921, "num_input_tokens_seen": 317590160, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35351562, "step": 14728, "time_per_iteration": 2.408064842224121 }, { "auxiliary_loss_clip": 0.01050462, "auxiliary_loss_mlp": 0.01038344, "balance_loss_clip": 1.01621866, "balance_loss_mlp": 1.01482725, "epoch": 0.8855553885465204, "flos": 18289495301760.0, "grad_norm": 2.741858823085044, "language_loss": 0.67745548, "learning_rate": 1.3575987399973987e-07, "loss": 0.69834352, "num_input_tokens_seen": 317608340, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35546875, "step": 14729, "time_per_iteration": 2.4033734798431396 }, { "auxiliary_loss_clip": 0.0105042, "auxiliary_loss_mlp": 0.01037747, "balance_loss_clip": 1.01636076, "balance_loss_mlp": 1.01600695, "epoch": 0.8856155117991883, "flos": 36865907089920.0, "grad_norm": 1.5516827242790097, "language_loss": 0.6382696, "learning_rate": 1.3561886603232453e-07, "loss": 0.6591512, "num_input_tokens_seen": 317629910, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34570312, "step": 14730, "time_per_iteration": 2.5201025009155273 }, { "auxiliary_loss_clip": 0.01048155, "auxiliary_loss_mlp": 0.01034131, "balance_loss_clip": 1.01282787, "balance_loss_mlp": 1.0146184, "epoch": 0.8856756350518563, "flos": 22162794312960.0, "grad_norm": 1.4653830231208116, "language_loss": 0.80333817, "learning_rate": 1.3547792876242904e-07, "loss": 0.82416111, "num_input_tokens_seen": 317650265, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.33398438, "step": 14731, "time_per_iteration": 2.4182229042053223 }, { "auxiliary_loss_clip": 0.01052241, "auxiliary_loss_mlp": 0.01039499, "balance_loss_clip": 1.01616931, "balance_loss_mlp": 1.01631618, "epoch": 0.8857357583045242, "flos": 20739909052800.0, "grad_norm": 4.395653692647698, "language_loss": 0.8436166, "learning_rate": 1.3533706219539708e-07, "loss": 0.86453402, "num_input_tokens_seen": 317669045, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 14732, "time_per_iteration": 2.397047996520996 }, { "auxiliary_loss_clip": 0.01007368, "auxiliary_loss_mlp": 0.01003941, "balance_loss_clip": 1.00180757, "balance_loss_mlp": 1.00076258, "epoch": 0.8857958815571922, "flos": 69888748406400.0, "grad_norm": 0.9096958495326123, "language_loss": 0.6009661, "learning_rate": 1.3519626633657045e-07, "loss": 0.62107921, "num_input_tokens_seen": 317728065, "router_z_loss_clip": 0.0213623, "router_z_loss_mlp": 0.06640625, "step": 14733, "time_per_iteration": 3.0329487323760986 }, { "auxiliary_loss_clip": 0.01052529, "auxiliary_loss_mlp": 0.01042485, "balance_loss_clip": 1.01996636, "balance_loss_mlp": 1.01688588, "epoch": 0.8858560048098603, "flos": 15121057196160.0, "grad_norm": 2.092835623421941, "language_loss": 0.67863953, "learning_rate": 1.3505554119128838e-07, "loss": 0.69958973, "num_input_tokens_seen": 317746120, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35742188, "step": 14734, "time_per_iteration": 2.4193387031555176 }, { "auxiliary_loss_clip": 0.01051231, "auxiliary_loss_mlp": 0.01034603, "balance_loss_clip": 1.01223874, "balance_loss_mlp": 1.01647568, "epoch": 0.8859161280625282, "flos": 16610277772800.0, "grad_norm": 1.9607925285537038, "language_loss": 0.76260614, "learning_rate": 1.3491488676488682e-07, "loss": 0.78346455, "num_input_tokens_seen": 317762280, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34765625, "step": 14735, "time_per_iteration": 2.3928422927856445 }, { "auxiliary_loss_clip": 0.01051038, "auxiliary_loss_mlp": 0.01038788, "balance_loss_clip": 1.01626885, "balance_loss_mlp": 1.01537526, "epoch": 0.8859762513151962, "flos": 18693579911040.0, "grad_norm": 1.7051663776893324, "language_loss": 0.71580184, "learning_rate": 1.3477430306270066e-07, "loss": 0.73670012, "num_input_tokens_seen": 317780615, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35742188, "step": 14736, "time_per_iteration": 2.359177827835083 }, { "auxiliary_loss_clip": 0.0105326, "auxiliary_loss_mlp": 0.01033036, "balance_loss_clip": 1.01058936, "balance_loss_mlp": 1.01679039, "epoch": 0.8860363745678641, "flos": 19535859394560.0, "grad_norm": 1.7685525722404087, "language_loss": 0.85681939, "learning_rate": 1.3463379009005892e-07, "loss": 0.87768233, "num_input_tokens_seen": 317798830, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.36328125, "step": 14737, "time_per_iteration": 3.7893126010894775 }, { "auxiliary_loss_clip": 0.01053817, "auxiliary_loss_mlp": 0.01042916, "balance_loss_clip": 1.01852572, "balance_loss_mlp": 1.01583982, "epoch": 0.8860964978205321, "flos": 35953452040320.0, "grad_norm": 2.0431015404724406, "language_loss": 0.69739789, "learning_rate": 1.3449334785229093e-07, "loss": 0.71836519, "num_input_tokens_seen": 317819235, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37890625, "step": 14738, "time_per_iteration": 3.8634164333343506 }, { "auxiliary_loss_clip": 0.010536, "auxiliary_loss_mlp": 0.01041016, "balance_loss_clip": 1.01598167, "balance_loss_mlp": 1.01619601, "epoch": 0.8861566210732, "flos": 21211585787520.0, "grad_norm": 1.6946612579399456, "language_loss": 0.75922263, "learning_rate": 1.343529763547222e-07, "loss": 0.78016877, "num_input_tokens_seen": 317836785, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37304688, "step": 14739, "time_per_iteration": 2.3862528800964355 }, { "auxiliary_loss_clip": 0.01049536, "auxiliary_loss_mlp": 0.01035611, "balance_loss_clip": 1.01403451, "balance_loss_mlp": 1.01528418, "epoch": 0.886216744325868, "flos": 14608253013120.0, "grad_norm": 1.9359119899609276, "language_loss": 0.88140976, "learning_rate": 1.3421267560267559e-07, "loss": 0.90226126, "num_input_tokens_seen": 317854225, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.34179688, "step": 14740, "time_per_iteration": 2.3734538555145264 }, { "auxiliary_loss_clip": 0.01052029, "auxiliary_loss_mlp": 0.01037886, "balance_loss_clip": 1.01405609, "balance_loss_mlp": 1.01648736, "epoch": 0.886276867578536, "flos": 26650425340800.0, "grad_norm": 1.987199339541, "language_loss": 0.64086175, "learning_rate": 1.34072445601471e-07, "loss": 0.66176087, "num_input_tokens_seen": 317874865, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35546875, "step": 14741, "time_per_iteration": 2.481691360473633 }, { "auxiliary_loss_clip": 0.01051026, "auxiliary_loss_mlp": 0.01033189, "balance_loss_clip": 1.01090848, "balance_loss_mlp": 1.01556277, "epoch": 0.886336990831204, "flos": 16763127171840.0, "grad_norm": 1.73085471409354, "language_loss": 0.73896277, "learning_rate": 1.3393228635642717e-07, "loss": 0.75980484, "num_input_tokens_seen": 317892830, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35546875, "step": 14742, "time_per_iteration": 2.363487958908081 }, { "auxiliary_loss_clip": 0.0105034, "auxiliary_loss_mlp": 0.0103277, "balance_loss_clip": 1.01181316, "balance_loss_mlp": 1.01550126, "epoch": 0.8863971140838719, "flos": 25264094140800.0, "grad_norm": 1.9694307494184315, "language_loss": 0.60638952, "learning_rate": 1.3379219787285733e-07, "loss": 0.62722057, "num_input_tokens_seen": 317911780, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.34960938, "step": 14743, "time_per_iteration": 2.420274257659912 }, { "auxiliary_loss_clip": 0.01052608, "auxiliary_loss_mlp": 0.01040359, "balance_loss_clip": 1.01637459, "balance_loss_mlp": 1.01574945, "epoch": 0.8864572373365399, "flos": 23403188563200.0, "grad_norm": 1.6898113977038467, "language_loss": 0.61042082, "learning_rate": 1.3365218015607437e-07, "loss": 0.63135052, "num_input_tokens_seen": 317932855, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 14744, "time_per_iteration": 2.397066831588745 }, { "auxiliary_loss_clip": 0.01052659, "auxiliary_loss_mlp": 0.01036658, "balance_loss_clip": 1.01392484, "balance_loss_mlp": 1.01627779, "epoch": 0.8865173605892078, "flos": 18547852429440.0, "grad_norm": 1.9789426535969046, "language_loss": 0.77245796, "learning_rate": 1.3351223321138762e-07, "loss": 0.79335105, "num_input_tokens_seen": 317952090, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.36328125, "step": 14745, "time_per_iteration": 2.437195301055908 }, { "auxiliary_loss_clip": 0.0105159, "auxiliary_loss_mlp": 0.01036434, "balance_loss_clip": 1.01316404, "balance_loss_mlp": 1.01646519, "epoch": 0.8865774838418758, "flos": 19024870602240.0, "grad_norm": 1.6584406010776997, "language_loss": 0.78799057, "learning_rate": 1.3337235704410454e-07, "loss": 0.80887079, "num_input_tokens_seen": 317970370, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3515625, "step": 14746, "time_per_iteration": 2.377253532409668 }, { "auxiliary_loss_clip": 0.0105304, "auxiliary_loss_mlp": 0.01038827, "balance_loss_clip": 1.0156287, "balance_loss_mlp": 1.01566684, "epoch": 0.8866376070945439, "flos": 22162096085760.0, "grad_norm": 2.075396867836792, "language_loss": 0.77899182, "learning_rate": 1.3323255165952873e-07, "loss": 0.79991055, "num_input_tokens_seen": 317989125, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37304688, "step": 14747, "time_per_iteration": 2.407752513885498 }, { "auxiliary_loss_clip": 0.01049117, "auxiliary_loss_mlp": 0.01038714, "balance_loss_clip": 1.01722097, "balance_loss_mlp": 1.01566017, "epoch": 0.8866977303472118, "flos": 20703215347200.0, "grad_norm": 1.826140113927889, "language_loss": 0.83613455, "learning_rate": 1.3309281706296127e-07, "loss": 0.85701287, "num_input_tokens_seen": 318007820, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.33398438, "step": 14748, "time_per_iteration": 2.3926098346710205 }, { "auxiliary_loss_clip": 0.01051464, "auxiliary_loss_mlp": 0.01038789, "balance_loss_clip": 1.01603162, "balance_loss_mlp": 1.01532435, "epoch": 0.8867578535998798, "flos": 48792726685440.0, "grad_norm": 1.9137885497676963, "language_loss": 0.78548574, "learning_rate": 1.3295315325970148e-07, "loss": 0.80638826, "num_input_tokens_seen": 318030435, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.36132812, "step": 14749, "time_per_iteration": 2.617220878601074 }, { "auxiliary_loss_clip": 0.01053265, "auxiliary_loss_mlp": 0.01038769, "balance_loss_clip": 1.01458168, "balance_loss_mlp": 1.01593399, "epoch": 0.8868179768525477, "flos": 21104262668160.0, "grad_norm": 1.837622621768525, "language_loss": 0.71329427, "learning_rate": 1.328135602550451e-07, "loss": 0.7342146, "num_input_tokens_seen": 318049465, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37304688, "step": 14750, "time_per_iteration": 2.419569730758667 }, { "auxiliary_loss_clip": 0.01050963, "auxiliary_loss_mlp": 0.01035488, "balance_loss_clip": 1.01352942, "balance_loss_mlp": 1.01599884, "epoch": 0.8868781001052157, "flos": 21829967521920.0, "grad_norm": 2.059354886972471, "language_loss": 0.60393858, "learning_rate": 1.3267403805428546e-07, "loss": 0.62480307, "num_input_tokens_seen": 318067760, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34960938, "step": 14751, "time_per_iteration": 3.86102557182312 }, { "auxiliary_loss_clip": 0.01051463, "auxiliary_loss_mlp": 0.01041304, "balance_loss_clip": 1.01663971, "balance_loss_mlp": 1.01568723, "epoch": 0.8869382233578836, "flos": 13515576192000.0, "grad_norm": 4.517423045563117, "language_loss": 0.83219039, "learning_rate": 1.3253458666271344e-07, "loss": 0.85311806, "num_input_tokens_seen": 318082785, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.35742188, "step": 14752, "time_per_iteration": 2.3574717044830322 }, { "auxiliary_loss_clip": 0.01052169, "auxiliary_loss_mlp": 0.01039064, "balance_loss_clip": 1.01562786, "balance_loss_mlp": 1.01575184, "epoch": 0.8869983466105517, "flos": 22704053120640.0, "grad_norm": 1.8931821346306634, "language_loss": 0.80657399, "learning_rate": 1.3239520608561793e-07, "loss": 0.82748634, "num_input_tokens_seen": 318101925, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 14753, "time_per_iteration": 2.4221737384796143 }, { "auxiliary_loss_clip": 0.01050975, "auxiliary_loss_mlp": 0.01032281, "balance_loss_clip": 1.01032281, "balance_loss_mlp": 1.01622844, "epoch": 0.8870584698632196, "flos": 15339857886720.0, "grad_norm": 1.7974820393194133, "language_loss": 0.66230422, "learning_rate": 1.3225589632828248e-07, "loss": 0.68313682, "num_input_tokens_seen": 318119945, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34765625, "step": 14754, "time_per_iteration": 2.386648416519165 }, { "auxiliary_loss_clip": 0.01052492, "auxiliary_loss_mlp": 0.01039906, "balance_loss_clip": 1.01563501, "balance_loss_mlp": 1.01616526, "epoch": 0.8871185931158876, "flos": 26614394951040.0, "grad_norm": 1.99764462649781, "language_loss": 0.7563355, "learning_rate": 1.3211665739599065e-07, "loss": 0.77725947, "num_input_tokens_seen": 318139685, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36328125, "step": 14755, "time_per_iteration": 2.4014854431152344 }, { "auxiliary_loss_clip": 0.01053283, "auxiliary_loss_mlp": 0.01040081, "balance_loss_clip": 1.01613152, "balance_loss_mlp": 1.01636887, "epoch": 0.8871787163685555, "flos": 21797951938560.0, "grad_norm": 1.4774004448743587, "language_loss": 0.78767437, "learning_rate": 1.3197748929402262e-07, "loss": 0.80860794, "num_input_tokens_seen": 318160375, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36914062, "step": 14756, "time_per_iteration": 2.4362778663635254 }, { "auxiliary_loss_clip": 0.01052271, "auxiliary_loss_mlp": 0.01038557, "balance_loss_clip": 1.01498914, "balance_loss_mlp": 1.01693153, "epoch": 0.8872388396212235, "flos": 14902081948800.0, "grad_norm": 1.965636163090672, "language_loss": 0.77675962, "learning_rate": 1.3183839202765535e-07, "loss": 0.79766792, "num_input_tokens_seen": 318177995, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.35351562, "step": 14757, "time_per_iteration": 2.379281520843506 }, { "auxiliary_loss_clip": 0.0104894, "auxiliary_loss_mlp": 0.01031827, "balance_loss_clip": 1.01148963, "balance_loss_mlp": 1.01568437, "epoch": 0.8872989628738914, "flos": 26430961334400.0, "grad_norm": 2.112806029794963, "language_loss": 0.69003123, "learning_rate": 1.316993656021632e-07, "loss": 0.71083891, "num_input_tokens_seen": 318197030, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.33203125, "step": 14758, "time_per_iteration": 2.5208089351654053 }, { "auxiliary_loss_clip": 0.01051905, "auxiliary_loss_mlp": 0.01036977, "balance_loss_clip": 1.01354051, "balance_loss_mlp": 1.01623714, "epoch": 0.8873590861265594, "flos": 48140723445120.0, "grad_norm": 2.144936148702777, "language_loss": 0.70096582, "learning_rate": 1.3156041002281915e-07, "loss": 0.72185463, "num_input_tokens_seen": 318221780, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35546875, "step": 14759, "time_per_iteration": 2.681598663330078 }, { "auxiliary_loss_clip": 0.01050447, "auxiliary_loss_mlp": 0.01034969, "balance_loss_clip": 1.01253402, "balance_loss_mlp": 1.01530719, "epoch": 0.8874192093792275, "flos": 18331984293120.0, "grad_norm": 1.8033045556908833, "language_loss": 0.754673, "learning_rate": 1.3142152529489092e-07, "loss": 0.77552712, "num_input_tokens_seen": 318239710, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3515625, "step": 14760, "time_per_iteration": 2.4065093994140625 }, { "auxiliary_loss_clip": 0.01052082, "auxiliary_loss_mlp": 0.01045206, "balance_loss_clip": 1.02114928, "balance_loss_mlp": 1.01609313, "epoch": 0.8874793326318954, "flos": 17893265748480.0, "grad_norm": 2.262152577854511, "language_loss": 0.77920634, "learning_rate": 1.3128271142364565e-07, "loss": 0.80017924, "num_input_tokens_seen": 318257425, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.359375, "step": 14761, "time_per_iteration": 2.3795619010925293 }, { "auxiliary_loss_clip": 0.0105301, "auxiliary_loss_mlp": 0.01041939, "balance_loss_clip": 1.01881206, "balance_loss_mlp": 1.01619816, "epoch": 0.8875394558845634, "flos": 31100908815360.0, "grad_norm": 1.6834610228874083, "language_loss": 0.62578404, "learning_rate": 1.3114396841434717e-07, "loss": 0.64673352, "num_input_tokens_seen": 318278485, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.3671875, "step": 14762, "time_per_iteration": 2.497728109359741 }, { "auxiliary_loss_clip": 0.01051459, "auxiliary_loss_mlp": 0.01045682, "balance_loss_clip": 1.02073121, "balance_loss_mlp": 1.01524389, "epoch": 0.8875995791372313, "flos": 21140991285120.0, "grad_norm": 3.3397768794389915, "language_loss": 0.65686208, "learning_rate": 1.3100529627225697e-07, "loss": 0.67783344, "num_input_tokens_seen": 318297560, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36132812, "step": 14763, "time_per_iteration": 2.3707215785980225 }, { "auxiliary_loss_clip": 0.01052006, "auxiliary_loss_mlp": 0.01040872, "balance_loss_clip": 1.0162549, "balance_loss_mlp": 1.01583886, "epoch": 0.8876597023898993, "flos": 17454233001600.0, "grad_norm": 2.1742792726391076, "language_loss": 0.71997464, "learning_rate": 1.3086669500263335e-07, "loss": 0.74090338, "num_input_tokens_seen": 318313060, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36132812, "step": 14764, "time_per_iteration": 2.3449761867523193 }, { "auxiliary_loss_clip": 0.01052846, "auxiliary_loss_mlp": 0.01037564, "balance_loss_clip": 1.01360285, "balance_loss_mlp": 1.01532328, "epoch": 0.8877198256425672, "flos": 22706915852160.0, "grad_norm": 2.1996305781841947, "language_loss": 0.67721736, "learning_rate": 1.3072816461073166e-07, "loss": 0.69812143, "num_input_tokens_seen": 318332030, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.375, "step": 14765, "time_per_iteration": 2.45389986038208 }, { "auxiliary_loss_clip": 0.01048485, "auxiliary_loss_mlp": 0.01034363, "balance_loss_clip": 1.01403773, "balance_loss_mlp": 1.01534534, "epoch": 0.8877799488952353, "flos": 24533955544320.0, "grad_norm": 1.5272913023761607, "language_loss": 0.77066612, "learning_rate": 1.3058970510180568e-07, "loss": 0.79149461, "num_input_tokens_seen": 318351090, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.33203125, "step": 14766, "time_per_iteration": 3.679873466491699 }, { "auxiliary_loss_clip": 0.01050068, "auxiliary_loss_mlp": 0.01036788, "balance_loss_clip": 1.01459157, "balance_loss_mlp": 1.01591074, "epoch": 0.8878400721479032, "flos": 20958151161600.0, "grad_norm": 4.558702523987892, "language_loss": 0.74464536, "learning_rate": 1.3045131648110496e-07, "loss": 0.76551402, "num_input_tokens_seen": 318372000, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34179688, "step": 14767, "time_per_iteration": 2.393247365951538 }, { "auxiliary_loss_clip": 0.01048318, "auxiliary_loss_mlp": 0.01031417, "balance_loss_clip": 1.01009035, "balance_loss_mlp": 1.01479852, "epoch": 0.8879001954005712, "flos": 25294259422080.0, "grad_norm": 1.8852588770781378, "language_loss": 0.71725512, "learning_rate": 1.303129987538778e-07, "loss": 0.73805249, "num_input_tokens_seen": 318391530, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.3359375, "step": 14768, "time_per_iteration": 2.454174041748047 }, { "auxiliary_loss_clip": 0.01050301, "auxiliary_loss_mlp": 0.01036167, "balance_loss_clip": 1.01419675, "balance_loss_mlp": 1.01566052, "epoch": 0.8879603186532391, "flos": 23184213315840.0, "grad_norm": 1.9259597198050724, "language_loss": 0.7154426, "learning_rate": 1.3017475192536932e-07, "loss": 0.73630726, "num_input_tokens_seen": 318410690, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34570312, "step": 14769, "time_per_iteration": 2.3738369941711426 }, { "auxiliary_loss_clip": 0.01050298, "auxiliary_loss_mlp": 0.01038597, "balance_loss_clip": 1.01734161, "balance_loss_mlp": 1.01657081, "epoch": 0.8880204419059071, "flos": 13654775249280.0, "grad_norm": 1.8785305647633457, "language_loss": 0.67695844, "learning_rate": 1.3003657600082174e-07, "loss": 0.69784737, "num_input_tokens_seen": 318427380, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.3359375, "step": 14770, "time_per_iteration": 2.3863718509674072 }, { "auxiliary_loss_clip": 0.01049258, "auxiliary_loss_mlp": 0.01032601, "balance_loss_clip": 1.01101255, "balance_loss_mlp": 1.01618695, "epoch": 0.888080565158575, "flos": 20630805454080.0, "grad_norm": 2.095149126672128, "language_loss": 0.66787046, "learning_rate": 1.2989847098547424e-07, "loss": 0.68868905, "num_input_tokens_seen": 318448530, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.33007812, "step": 14771, "time_per_iteration": 2.4183762073516846 }, { "auxiliary_loss_clip": 0.01050898, "auxiliary_loss_mlp": 0.01034096, "balance_loss_clip": 1.01338935, "balance_loss_mlp": 1.01586962, "epoch": 0.888140688411243, "flos": 28618793683200.0, "grad_norm": 1.4485260225649235, "language_loss": 0.83245158, "learning_rate": 1.2976043688456396e-07, "loss": 0.85330147, "num_input_tokens_seen": 318468655, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.34960938, "step": 14772, "time_per_iteration": 2.4486985206604004 }, { "auxiliary_loss_clip": 0.01047735, "auxiliary_loss_mlp": 0.01030332, "balance_loss_clip": 1.01019776, "balance_loss_mlp": 1.01461923, "epoch": 0.8882008116639111, "flos": 25519064866560.0, "grad_norm": 1.515875771520094, "language_loss": 0.77382445, "learning_rate": 1.296224737033258e-07, "loss": 0.79460508, "num_input_tokens_seen": 318488740, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.33007812, "step": 14773, "time_per_iteration": 2.404783010482788 }, { "auxiliary_loss_clip": 0.01048162, "auxiliary_loss_mlp": 0.01035285, "balance_loss_clip": 1.01320779, "balance_loss_mlp": 1.01437831, "epoch": 0.888260934916579, "flos": 27672437836800.0, "grad_norm": 1.5774279565386107, "language_loss": 0.76007712, "learning_rate": 1.294845814469907e-07, "loss": 0.78091168, "num_input_tokens_seen": 318508810, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.33789062, "step": 14774, "time_per_iteration": 2.449115753173828 }, { "auxiliary_loss_clip": 0.01051996, "auxiliary_loss_mlp": 0.01037251, "balance_loss_clip": 1.013659, "balance_loss_mlp": 1.01580012, "epoch": 0.888321058169247, "flos": 21610154401920.0, "grad_norm": 2.680319345499952, "language_loss": 0.7298516, "learning_rate": 1.2934676012078783e-07, "loss": 0.75074404, "num_input_tokens_seen": 318526860, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36132812, "step": 14775, "time_per_iteration": 2.367400884628296 }, { "auxiliary_loss_clip": 0.01049651, "auxiliary_loss_mlp": 0.01032653, "balance_loss_clip": 1.01124287, "balance_loss_mlp": 1.01521349, "epoch": 0.8883811814219149, "flos": 18148166651520.0, "grad_norm": 1.690030931142148, "language_loss": 0.80999327, "learning_rate": 1.292090097299432e-07, "loss": 0.83081627, "num_input_tokens_seen": 318545180, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34375, "step": 14776, "time_per_iteration": 2.355612277984619 }, { "auxiliary_loss_clip": 0.01054291, "auxiliary_loss_mlp": 0.0103764, "balance_loss_clip": 1.01272488, "balance_loss_mlp": 1.01652992, "epoch": 0.8884413046745829, "flos": 28323533381760.0, "grad_norm": 1.981990844084084, "language_loss": 0.70823574, "learning_rate": 1.290713302796802e-07, "loss": 0.72915506, "num_input_tokens_seen": 318564350, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.37695312, "step": 14777, "time_per_iteration": 5.131423711776733 }, { "auxiliary_loss_clip": 0.01049743, "auxiliary_loss_mlp": 0.01039119, "balance_loss_clip": 1.01646948, "balance_loss_mlp": 1.015607, "epoch": 0.8885014279272508, "flos": 15157855635840.0, "grad_norm": 1.7185832991799852, "language_loss": 0.71828926, "learning_rate": 1.2893372177522e-07, "loss": 0.73917788, "num_input_tokens_seen": 318582275, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34179688, "step": 14778, "time_per_iteration": 2.3240807056427 }, { "auxiliary_loss_clip": 0.01049859, "auxiliary_loss_mlp": 0.01035443, "balance_loss_clip": 1.01321054, "balance_loss_mlp": 1.0151149, "epoch": 0.8885615511799189, "flos": 19098572215680.0, "grad_norm": 1.7804618223691089, "language_loss": 0.78462696, "learning_rate": 1.287961842217804e-07, "loss": 0.80548, "num_input_tokens_seen": 318601230, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34765625, "step": 14779, "time_per_iteration": 2.3631489276885986 }, { "auxiliary_loss_clip": 0.01007713, "auxiliary_loss_mlp": 0.01008249, "balance_loss_clip": 1.00618684, "balance_loss_mlp": 1.001297, "epoch": 0.8886216744325868, "flos": 51184206328320.0, "grad_norm": 0.8820590879232184, "language_loss": 0.56879252, "learning_rate": 1.2865871762457747e-07, "loss": 0.58895212, "num_input_tokens_seen": 318645595, "router_z_loss_clip": 0.02062988, "router_z_loss_mlp": 0.06445312, "step": 14780, "time_per_iteration": 2.8037467002868652 }, { "auxiliary_loss_clip": 0.0100723, "auxiliary_loss_mlp": 0.01002461, "balance_loss_clip": 1.00021946, "balance_loss_mlp": 1.00087976, "epoch": 0.8886817976852548, "flos": 61609549593600.0, "grad_norm": 0.7866438008065298, "language_loss": 0.62488997, "learning_rate": 1.2852132198882326e-07, "loss": 0.64498687, "num_input_tokens_seen": 318707850, "router_z_loss_clip": 0.02246094, "router_z_loss_mlp": 0.06347656, "step": 14781, "time_per_iteration": 3.091899871826172 }, { "auxiliary_loss_clip": 0.01007503, "auxiliary_loss_mlp": 0.01004209, "balance_loss_clip": 1.00201523, "balance_loss_mlp": 1.00105512, "epoch": 0.8887419209379227, "flos": 60644027410560.0, "grad_norm": 0.7955781249151259, "language_loss": 0.58187819, "learning_rate": 1.2838399731972805e-07, "loss": 0.60199523, "num_input_tokens_seen": 318764915, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.06445312, "step": 14782, "time_per_iteration": 2.9037725925445557 }, { "auxiliary_loss_clip": 0.01049141, "auxiliary_loss_mlp": 0.01032778, "balance_loss_clip": 1.01235759, "balance_loss_mlp": 1.01505232, "epoch": 0.8888020441905907, "flos": 29204566341120.0, "grad_norm": 3.4175268029236574, "language_loss": 0.66846347, "learning_rate": 1.2824674362249922e-07, "loss": 0.68928266, "num_input_tokens_seen": 318785660, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.33984375, "step": 14783, "time_per_iteration": 2.4568326473236084 }, { "auxiliary_loss_clip": 0.0105186, "auxiliary_loss_mlp": 0.01033614, "balance_loss_clip": 1.01142955, "balance_loss_mlp": 1.01521647, "epoch": 0.8888621674432586, "flos": 22161642238080.0, "grad_norm": 1.5553596959163478, "language_loss": 0.78983331, "learning_rate": 1.281095609023415e-07, "loss": 0.81068802, "num_input_tokens_seen": 318806080, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.36523438, "step": 14784, "time_per_iteration": 2.4145240783691406 }, { "auxiliary_loss_clip": 0.01051273, "auxiliary_loss_mlp": 0.01035051, "balance_loss_clip": 1.01197219, "balance_loss_mlp": 1.01549733, "epoch": 0.8889222906959267, "flos": 27671599964160.0, "grad_norm": 2.59034721555454, "language_loss": 0.61158818, "learning_rate": 1.279724491644565e-07, "loss": 0.63245142, "num_input_tokens_seen": 318826445, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35742188, "step": 14785, "time_per_iteration": 2.4406211376190186 }, { "auxiliary_loss_clip": 0.01051594, "auxiliary_loss_mlp": 0.01037112, "balance_loss_clip": 1.01424789, "balance_loss_mlp": 1.01633835, "epoch": 0.8889824139485947, "flos": 14167893634560.0, "grad_norm": 1.8074647057983573, "language_loss": 0.65965152, "learning_rate": 1.278354084140445e-07, "loss": 0.68053854, "num_input_tokens_seen": 318843915, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3515625, "step": 14786, "time_per_iteration": 2.4042420387268066 }, { "auxiliary_loss_clip": 0.01054009, "auxiliary_loss_mlp": 0.01036942, "balance_loss_clip": 1.01151502, "balance_loss_mlp": 1.01637793, "epoch": 0.8890425372012626, "flos": 12852366405120.0, "grad_norm": 2.377474598809219, "language_loss": 0.86342156, "learning_rate": 1.276984386563009e-07, "loss": 0.88433105, "num_input_tokens_seen": 318859670, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.375, "step": 14787, "time_per_iteration": 2.3607263565063477 }, { "auxiliary_loss_clip": 0.01050108, "auxiliary_loss_mlp": 0.01032663, "balance_loss_clip": 1.01008439, "balance_loss_mlp": 1.01599741, "epoch": 0.8891026604539306, "flos": 21688219935360.0, "grad_norm": 2.3406044648916864, "language_loss": 0.72038424, "learning_rate": 1.2756153989642027e-07, "loss": 0.74121201, "num_input_tokens_seen": 318877855, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.33984375, "step": 14788, "time_per_iteration": 2.3660459518432617 }, { "auxiliary_loss_clip": 0.0104856, "auxiliary_loss_mlp": 0.01032209, "balance_loss_clip": 1.01004815, "balance_loss_mlp": 1.01619482, "epoch": 0.8891627837065985, "flos": 21870361831680.0, "grad_norm": 2.1497605820627235, "language_loss": 0.70429915, "learning_rate": 1.274247121395935e-07, "loss": 0.72510684, "num_input_tokens_seen": 318896045, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.32421875, "step": 14789, "time_per_iteration": 2.378129720687866 }, { "auxiliary_loss_clip": 0.01049992, "auxiliary_loss_mlp": 0.0103542, "balance_loss_clip": 1.01403379, "balance_loss_mlp": 1.01523244, "epoch": 0.8892229069592665, "flos": 21579151248000.0, "grad_norm": 1.4732683594268048, "language_loss": 0.71496874, "learning_rate": 1.2728795539100956e-07, "loss": 0.73582286, "num_input_tokens_seen": 318915515, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34765625, "step": 14790, "time_per_iteration": 3.8277101516723633 }, { "auxiliary_loss_clip": 0.01053107, "auxiliary_loss_mlp": 0.01036802, "balance_loss_clip": 1.01229262, "balance_loss_mlp": 1.01688516, "epoch": 0.8892830302119344, "flos": 23074865337600.0, "grad_norm": 1.6863984435486632, "language_loss": 0.73735559, "learning_rate": 1.2715126965585387e-07, "loss": 0.75825471, "num_input_tokens_seen": 318934305, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36328125, "step": 14791, "time_per_iteration": 2.366764545440674 }, { "auxiliary_loss_clip": 0.01049966, "auxiliary_loss_mlp": 0.0103871, "balance_loss_clip": 1.01656103, "balance_loss_mlp": 1.01624537, "epoch": 0.8893431534646025, "flos": 23071129822080.0, "grad_norm": 1.6272401953017068, "language_loss": 0.7520504, "learning_rate": 1.2701465493931008e-07, "loss": 0.77293718, "num_input_tokens_seen": 318953880, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.33789062, "step": 14792, "time_per_iteration": 2.4044735431671143 }, { "auxiliary_loss_clip": 0.01053977, "auxiliary_loss_mlp": 0.01040136, "balance_loss_clip": 1.01570964, "balance_loss_mlp": 1.01686168, "epoch": 0.8894032767172704, "flos": 22453900162560.0, "grad_norm": 1.9892672163664173, "language_loss": 0.67928368, "learning_rate": 1.2687811124655801e-07, "loss": 0.70022476, "num_input_tokens_seen": 318971395, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37109375, "step": 14793, "time_per_iteration": 2.3843812942504883 }, { "auxiliary_loss_clip": 0.01052246, "auxiliary_loss_mlp": 0.01037764, "balance_loss_clip": 1.01436281, "balance_loss_mlp": 1.01587629, "epoch": 0.8894633999699384, "flos": 25337062615680.0, "grad_norm": 1.544871638271269, "language_loss": 0.72641563, "learning_rate": 1.2674163858277552e-07, "loss": 0.7473157, "num_input_tokens_seen": 318990580, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 14794, "time_per_iteration": 2.433540105819702 }, { "auxiliary_loss_clip": 0.01055015, "auxiliary_loss_mlp": 0.01037953, "balance_loss_clip": 1.01316965, "balance_loss_mlp": 1.01777434, "epoch": 0.8895235232226063, "flos": 20993099299200.0, "grad_norm": 1.618307003249504, "language_loss": 0.76069713, "learning_rate": 1.2660523695313785e-07, "loss": 0.78162682, "num_input_tokens_seen": 319010040, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37304688, "step": 14795, "time_per_iteration": 2.375746488571167 }, { "auxiliary_loss_clip": 0.01007995, "auxiliary_loss_mlp": 0.01002499, "balance_loss_clip": 1.00011444, "balance_loss_mlp": 1.00140536, "epoch": 0.8895836464752743, "flos": 69729754608000.0, "grad_norm": 0.7651769198396768, "language_loss": 0.5611679, "learning_rate": 1.2646890636281727e-07, "loss": 0.58127278, "num_input_tokens_seen": 319063860, "router_z_loss_clip": 0.02380371, "router_z_loss_mlp": 0.06591797, "step": 14796, "time_per_iteration": 2.9150376319885254 }, { "auxiliary_loss_clip": 0.01053131, "auxiliary_loss_mlp": 0.01033879, "balance_loss_clip": 1.0092628, "balance_loss_mlp": 1.01688278, "epoch": 0.8896437697279422, "flos": 23220697553280.0, "grad_norm": 2.425223446016281, "language_loss": 0.70581806, "learning_rate": 1.263326468169843e-07, "loss": 0.72668815, "num_input_tokens_seen": 319082335, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36328125, "step": 14797, "time_per_iteration": 2.380681276321411 }, { "auxiliary_loss_clip": 0.01007021, "auxiliary_loss_mlp": 0.01002791, "balance_loss_clip": 1.0006336, "balance_loss_mlp": 1.00060475, "epoch": 0.8897038929806103, "flos": 70749532776960.0, "grad_norm": 0.7649326163082742, "language_loss": 0.58066183, "learning_rate": 1.2619645832080417e-07, "loss": 0.60075992, "num_input_tokens_seen": 319147075, "router_z_loss_clip": 0.02160645, "router_z_loss_mlp": 0.06396484, "step": 14798, "time_per_iteration": 3.0925769805908203 }, { "auxiliary_loss_clip": 0.01051292, "auxiliary_loss_mlp": 0.01037836, "balance_loss_clip": 1.01561499, "balance_loss_mlp": 1.01582026, "epoch": 0.8897640162332782, "flos": 19244090229120.0, "grad_norm": 1.5288634003414678, "language_loss": 0.80042708, "learning_rate": 1.2606034087944251e-07, "loss": 0.82131839, "num_input_tokens_seen": 319166630, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35546875, "step": 14799, "time_per_iteration": 2.3624792098999023 }, { "auxiliary_loss_clip": 0.01007309, "auxiliary_loss_mlp": 0.01003132, "balance_loss_clip": 1.00085545, "balance_loss_mlp": 1.00095177, "epoch": 0.8898241394859462, "flos": 41353606074240.0, "grad_norm": 0.897152418930086, "language_loss": 0.58214629, "learning_rate": 1.2592429449806053e-07, "loss": 0.6022507, "num_input_tokens_seen": 319221865, "router_z_loss_clip": 0.02282715, "router_z_loss_mlp": 0.06347656, "step": 14800, "time_per_iteration": 2.9968645572662354 }, { "auxiliary_loss_clip": 0.01050908, "auxiliary_loss_mlp": 0.01032737, "balance_loss_clip": 1.01147056, "balance_loss_mlp": 1.01610756, "epoch": 0.8898842627386142, "flos": 18985383987840.0, "grad_norm": 1.4546437636246483, "language_loss": 0.67665118, "learning_rate": 1.2578831918181698e-07, "loss": 0.69748765, "num_input_tokens_seen": 319240710, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34765625, "step": 14801, "time_per_iteration": 2.3665432929992676 }, { "auxiliary_loss_clip": 0.01052583, "auxiliary_loss_mlp": 0.01042191, "balance_loss_clip": 1.0164535, "balance_loss_mlp": 1.01628172, "epoch": 0.8899443859912821, "flos": 13216545463680.0, "grad_norm": 2.404747042440108, "language_loss": 0.77291739, "learning_rate": 1.256524149358682e-07, "loss": 0.7938652, "num_input_tokens_seen": 319256495, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.36328125, "step": 14802, "time_per_iteration": 2.3704888820648193 }, { "auxiliary_loss_clip": 0.01053053, "auxiliary_loss_mlp": 0.01036811, "balance_loss_clip": 1.01437616, "balance_loss_mlp": 1.01747477, "epoch": 0.8900045092439501, "flos": 22673573637120.0, "grad_norm": 1.7041265086199322, "language_loss": 0.73510641, "learning_rate": 1.2551658176536805e-07, "loss": 0.75600505, "num_input_tokens_seen": 319273620, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35546875, "step": 14803, "time_per_iteration": 2.413618326187134 }, { "auxiliary_loss_clip": 0.01050523, "auxiliary_loss_mlp": 0.01038325, "balance_loss_clip": 1.01497197, "balance_loss_mlp": 1.01557899, "epoch": 0.890064632496618, "flos": 21140572348800.0, "grad_norm": 2.098513660000098, "language_loss": 0.73455548, "learning_rate": 1.2538081967546664e-07, "loss": 0.75544393, "num_input_tokens_seen": 319291720, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.34960938, "step": 14804, "time_per_iteration": 2.385745048522949 }, { "auxiliary_loss_clip": 0.01051503, "auxiliary_loss_mlp": 0.01038188, "balance_loss_clip": 1.0145843, "balance_loss_mlp": 1.01593518, "epoch": 0.8901247557492861, "flos": 23396136468480.0, "grad_norm": 1.9303981503520953, "language_loss": 0.82411623, "learning_rate": 1.252451286713123e-07, "loss": 0.8450132, "num_input_tokens_seen": 319310380, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35546875, "step": 14805, "time_per_iteration": 2.397770643234253 }, { "auxiliary_loss_clip": 0.0105326, "auxiliary_loss_mlp": 0.01037865, "balance_loss_clip": 1.01354587, "balance_loss_mlp": 1.01615953, "epoch": 0.890184879001954, "flos": 29168291571840.0, "grad_norm": 1.9865125433152264, "language_loss": 0.6901961, "learning_rate": 1.251095087580505e-07, "loss": 0.71110737, "num_input_tokens_seen": 319331765, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.37109375, "step": 14806, "time_per_iteration": 3.641875743865967 }, { "auxiliary_loss_clip": 0.01050682, "auxiliary_loss_mlp": 0.01037599, "balance_loss_clip": 1.01350689, "balance_loss_mlp": 1.01547277, "epoch": 0.890245002254622, "flos": 14426983900800.0, "grad_norm": 1.8028826758602752, "language_loss": 0.68688166, "learning_rate": 1.2497395994082438e-07, "loss": 0.70776451, "num_input_tokens_seen": 319349135, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.3515625, "step": 14807, "time_per_iteration": 2.349215030670166 }, { "auxiliary_loss_clip": 0.01050523, "auxiliary_loss_mlp": 0.0103018, "balance_loss_clip": 1.00927067, "balance_loss_mlp": 1.01573682, "epoch": 0.8903051255072899, "flos": 22381106244480.0, "grad_norm": 1.7297845186649055, "language_loss": 0.75641936, "learning_rate": 1.248384822247732e-07, "loss": 0.77722633, "num_input_tokens_seen": 319368410, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34765625, "step": 14808, "time_per_iteration": 2.39782977104187 }, { "auxiliary_loss_clip": 0.01051277, "auxiliary_loss_mlp": 0.01046327, "balance_loss_clip": 1.02477419, "balance_loss_mlp": 1.0153954, "epoch": 0.8903652487599579, "flos": 20776323467520.0, "grad_norm": 1.867037083597602, "language_loss": 0.82792574, "learning_rate": 1.2470307561503513e-07, "loss": 0.84890181, "num_input_tokens_seen": 319387535, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.359375, "step": 14809, "time_per_iteration": 2.3776838779449463 }, { "auxiliary_loss_clip": 0.01051733, "auxiliary_loss_mlp": 0.01034102, "balance_loss_clip": 1.01144075, "balance_loss_mlp": 1.01648307, "epoch": 0.8904253720126258, "flos": 24423385668480.0, "grad_norm": 1.682411020551403, "language_loss": 0.69539607, "learning_rate": 1.2456774011674442e-07, "loss": 0.71625441, "num_input_tokens_seen": 319407210, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 14810, "time_per_iteration": 2.4153411388397217 }, { "auxiliary_loss_clip": 0.01053412, "auxiliary_loss_mlp": 0.01037039, "balance_loss_clip": 1.01300669, "balance_loss_mlp": 1.01612043, "epoch": 0.8904854952652939, "flos": 19462856008320.0, "grad_norm": 1.8888440844050092, "language_loss": 0.71727234, "learning_rate": 1.2443247573503257e-07, "loss": 0.73817682, "num_input_tokens_seen": 319425340, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.375, "step": 14811, "time_per_iteration": 2.361935615539551 }, { "auxiliary_loss_clip": 0.01051305, "auxiliary_loss_mlp": 0.0103419, "balance_loss_clip": 1.011814, "balance_loss_mlp": 1.01564109, "epoch": 0.8905456185179618, "flos": 50798975719680.0, "grad_norm": 1.8431598131958493, "language_loss": 0.66688633, "learning_rate": 1.2429728247502924e-07, "loss": 0.68774128, "num_input_tokens_seen": 319448150, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35742188, "step": 14812, "time_per_iteration": 2.6266841888427734 }, { "auxiliary_loss_clip": 0.0105101, "auxiliary_loss_mlp": 0.01039772, "balance_loss_clip": 1.01787329, "balance_loss_mlp": 1.016096, "epoch": 0.8906057417706298, "flos": 17783917770240.0, "grad_norm": 1.913007960483859, "language_loss": 0.6963203, "learning_rate": 1.24162160341861e-07, "loss": 0.71722817, "num_input_tokens_seen": 319466115, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34765625, "step": 14813, "time_per_iteration": 2.333292245864868 }, { "auxiliary_loss_clip": 0.01054906, "auxiliary_loss_mlp": 0.01040406, "balance_loss_clip": 1.01516926, "balance_loss_mlp": 1.01625705, "epoch": 0.8906658650232978, "flos": 21943784154240.0, "grad_norm": 1.7552626828301867, "language_loss": 0.76554704, "learning_rate": 1.2402710934065198e-07, "loss": 0.78650022, "num_input_tokens_seen": 319485255, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38671875, "step": 14814, "time_per_iteration": 2.375821590423584 }, { "auxiliary_loss_clip": 0.01053447, "auxiliary_loss_mlp": 0.0103972, "balance_loss_clip": 1.01510298, "balance_loss_mlp": 1.01610875, "epoch": 0.8907259882759657, "flos": 21286753678080.0, "grad_norm": 1.930595412168764, "language_loss": 0.75528151, "learning_rate": 1.2389212947652229e-07, "loss": 0.77621317, "num_input_tokens_seen": 319501800, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37304688, "step": 14815, "time_per_iteration": 2.3721117973327637 }, { "auxiliary_loss_clip": 0.0104874, "auxiliary_loss_mlp": 0.01032935, "balance_loss_clip": 1.01190615, "balance_loss_mlp": 1.01527357, "epoch": 0.8907861115286337, "flos": 20119397725440.0, "grad_norm": 1.8758049164226078, "language_loss": 0.76662827, "learning_rate": 1.237572207545914e-07, "loss": 0.78744495, "num_input_tokens_seen": 319520415, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.3359375, "step": 14816, "time_per_iteration": 3.7810137271881104 }, { "auxiliary_loss_clip": 0.01051629, "auxiliary_loss_mlp": 0.01038641, "balance_loss_clip": 1.01494265, "balance_loss_mlp": 1.01556253, "epoch": 0.8908462347813016, "flos": 20083122956160.0, "grad_norm": 2.093226320481197, "language_loss": 0.78321278, "learning_rate": 1.2362238317997476e-07, "loss": 0.80411541, "num_input_tokens_seen": 319538410, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.359375, "step": 14817, "time_per_iteration": 3.724142551422119 }, { "auxiliary_loss_clip": 0.01007061, "auxiliary_loss_mlp": 0.01002756, "balance_loss_clip": 1.0007894, "balance_loss_mlp": 1.00064349, "epoch": 0.8909063580339697, "flos": 65500480707840.0, "grad_norm": 0.7653813251518286, "language_loss": 0.5654977, "learning_rate": 1.2348761675778517e-07, "loss": 0.58559591, "num_input_tokens_seen": 319602565, "router_z_loss_clip": 0.01965332, "router_z_loss_mlp": 0.06396484, "step": 14818, "time_per_iteration": 3.050084114074707 }, { "auxiliary_loss_clip": 0.01050766, "auxiliary_loss_mlp": 0.01036072, "balance_loss_clip": 1.01486468, "balance_loss_mlp": 1.01597953, "epoch": 0.8909664812866376, "flos": 29861736462720.0, "grad_norm": 1.7601078222839204, "language_loss": 0.65451109, "learning_rate": 1.2335292149313325e-07, "loss": 0.6753794, "num_input_tokens_seen": 319624645, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34765625, "step": 14819, "time_per_iteration": 2.438056707382202 }, { "auxiliary_loss_clip": 0.01051583, "auxiliary_loss_mlp": 0.01037454, "balance_loss_clip": 1.01447034, "balance_loss_mlp": 1.01574457, "epoch": 0.8910266045393056, "flos": 25445956746240.0, "grad_norm": 1.7672090581460367, "language_loss": 0.79758418, "learning_rate": 1.2321829739112731e-07, "loss": 0.81847459, "num_input_tokens_seen": 319644040, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.359375, "step": 14820, "time_per_iteration": 2.395019054412842 }, { "auxiliary_loss_clip": 0.01051323, "auxiliary_loss_mlp": 0.01035885, "balance_loss_clip": 1.01408219, "balance_loss_mlp": 1.0157733, "epoch": 0.8910867277919735, "flos": 24497960065920.0, "grad_norm": 1.7991922740093875, "language_loss": 0.77500254, "learning_rate": 1.2308374445687087e-07, "loss": 0.79587466, "num_input_tokens_seen": 319663930, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.35546875, "step": 14821, "time_per_iteration": 2.4189209938049316 }, { "auxiliary_loss_clip": 0.0100718, "auxiliary_loss_mlp": 0.01002397, "balance_loss_clip": 1.00019181, "balance_loss_mlp": 1.0006727, "epoch": 0.8911468510446415, "flos": 60685085036160.0, "grad_norm": 0.8057418577401025, "language_loss": 0.59398824, "learning_rate": 1.2294926269546712e-07, "loss": 0.61408401, "num_input_tokens_seen": 319721245, "router_z_loss_clip": 0.02209473, "router_z_loss_mlp": 0.06494141, "step": 14822, "time_per_iteration": 2.9050939083099365 }, { "auxiliary_loss_clip": 0.01050869, "auxiliary_loss_mlp": 0.01033715, "balance_loss_clip": 1.01217341, "balance_loss_mlp": 1.01496363, "epoch": 0.8912069742973094, "flos": 25336329477120.0, "grad_norm": 2.157988280535342, "language_loss": 0.69957972, "learning_rate": 1.2281485211201515e-07, "loss": 0.72042555, "num_input_tokens_seen": 319741200, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.359375, "step": 14823, "time_per_iteration": 2.399238109588623 }, { "auxiliary_loss_clip": 0.0105084, "auxiliary_loss_mlp": 0.01042345, "balance_loss_clip": 1.01913476, "balance_loss_mlp": 1.0162909, "epoch": 0.8912670975499775, "flos": 18222531580800.0, "grad_norm": 1.5192741174416835, "language_loss": 0.70008218, "learning_rate": 1.2268051271161262e-07, "loss": 0.72101402, "num_input_tokens_seen": 319759265, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.34375, "step": 14824, "time_per_iteration": 2.340325355529785 }, { "auxiliary_loss_clip": 0.01053697, "auxiliary_loss_mlp": 0.01040202, "balance_loss_clip": 1.01624072, "balance_loss_mlp": 1.01677942, "epoch": 0.8913272208026454, "flos": 26503301404800.0, "grad_norm": 1.9066080746696903, "language_loss": 0.71257818, "learning_rate": 1.2254624449935303e-07, "loss": 0.73351717, "num_input_tokens_seen": 319777560, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36914062, "step": 14825, "time_per_iteration": 2.4077060222625732 }, { "auxiliary_loss_clip": 0.01051736, "auxiliary_loss_mlp": 0.01038324, "balance_loss_clip": 1.01572204, "balance_loss_mlp": 1.01655555, "epoch": 0.8913873440553134, "flos": 18801461612160.0, "grad_norm": 1.7807751593300052, "language_loss": 0.72414076, "learning_rate": 1.2241204748032786e-07, "loss": 0.74504137, "num_input_tokens_seen": 319794125, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.3515625, "step": 14826, "time_per_iteration": 2.349210739135742 }, { "auxiliary_loss_clip": 0.0104995, "auxiliary_loss_mlp": 0.0103274, "balance_loss_clip": 1.00998247, "balance_loss_mlp": 1.01577604, "epoch": 0.8914474673079814, "flos": 20883890966400.0, "grad_norm": 2.1159058810326594, "language_loss": 0.76709688, "learning_rate": 1.2227792165962615e-07, "loss": 0.78792381, "num_input_tokens_seen": 319810310, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34179688, "step": 14827, "time_per_iteration": 2.361990451812744 }, { "auxiliary_loss_clip": 0.01052306, "auxiliary_loss_mlp": 0.01037731, "balance_loss_clip": 1.01433063, "balance_loss_mlp": 1.0163734, "epoch": 0.8915075905606493, "flos": 20951587825920.0, "grad_norm": 1.7226516453612053, "language_loss": 0.79597795, "learning_rate": 1.221438670423336e-07, "loss": 0.81687832, "num_input_tokens_seen": 319828505, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.359375, "step": 14828, "time_per_iteration": 2.3604185581207275 }, { "auxiliary_loss_clip": 0.01049461, "auxiliary_loss_mlp": 0.01037613, "balance_loss_clip": 1.01528525, "balance_loss_mlp": 1.01507998, "epoch": 0.8915677138133173, "flos": 23075179539840.0, "grad_norm": 1.6219881018422477, "language_loss": 0.75917113, "learning_rate": 1.2200988363353392e-07, "loss": 0.78004187, "num_input_tokens_seen": 319848680, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34375, "step": 14829, "time_per_iteration": 2.4182090759277344 }, { "auxiliary_loss_clip": 0.01050804, "auxiliary_loss_mlp": 0.01037417, "balance_loss_clip": 1.01529217, "balance_loss_mlp": 1.01494634, "epoch": 0.8916278370659853, "flos": 23439149130240.0, "grad_norm": 1.6045890988892912, "language_loss": 0.85306025, "learning_rate": 1.2187597143830773e-07, "loss": 0.87394238, "num_input_tokens_seen": 319868835, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.359375, "step": 14830, "time_per_iteration": 3.851357936859131 }, { "auxiliary_loss_clip": 0.01048312, "auxiliary_loss_mlp": 0.01034961, "balance_loss_clip": 1.01368177, "balance_loss_mlp": 1.01490557, "epoch": 0.8916879603186533, "flos": 25159179905280.0, "grad_norm": 1.4609527648802036, "language_loss": 0.76009005, "learning_rate": 1.2174213046173299e-07, "loss": 0.78092277, "num_input_tokens_seen": 319891585, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.33398438, "step": 14831, "time_per_iteration": 2.4423270225524902 }, { "auxiliary_loss_clip": 0.01050776, "auxiliary_loss_mlp": 0.01034612, "balance_loss_clip": 1.01248693, "balance_loss_mlp": 1.01514781, "epoch": 0.8917480835713212, "flos": 20228815526400.0, "grad_norm": 1.8041159341666464, "language_loss": 0.74326825, "learning_rate": 1.216083607088847e-07, "loss": 0.76412219, "num_input_tokens_seen": 319910315, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.35546875, "step": 14832, "time_per_iteration": 2.355379104614258 }, { "auxiliary_loss_clip": 0.01050955, "auxiliary_loss_mlp": 0.01032727, "balance_loss_clip": 1.0108757, "balance_loss_mlp": 1.01472044, "epoch": 0.8918082068239892, "flos": 26100787806720.0, "grad_norm": 2.0005695687082827, "language_loss": 0.6809386, "learning_rate": 1.214746621848355e-07, "loss": 0.70177543, "num_input_tokens_seen": 319932275, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.36328125, "step": 14833, "time_per_iteration": 2.44321346282959 }, { "auxiliary_loss_clip": 0.01055507, "auxiliary_loss_mlp": 0.01039471, "balance_loss_clip": 1.01319671, "balance_loss_mlp": 1.01688218, "epoch": 0.8918683300766571, "flos": 24830158452480.0, "grad_norm": 1.9759516784930489, "language_loss": 0.75258577, "learning_rate": 1.2134103489465575e-07, "loss": 0.77353561, "num_input_tokens_seen": 319955335, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38671875, "step": 14834, "time_per_iteration": 2.454921245574951 }, { "auxiliary_loss_clip": 0.01050036, "auxiliary_loss_mlp": 0.01041297, "balance_loss_clip": 1.01870692, "balance_loss_mlp": 1.01515317, "epoch": 0.8919284533293251, "flos": 22304192785920.0, "grad_norm": 1.8434126733695482, "language_loss": 0.8028841, "learning_rate": 1.2120747884341188e-07, "loss": 0.82379735, "num_input_tokens_seen": 319973990, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34960938, "step": 14835, "time_per_iteration": 2.3849573135375977 }, { "auxiliary_loss_clip": 0.01048931, "auxiliary_loss_mlp": 0.01032174, "balance_loss_clip": 1.01182485, "balance_loss_mlp": 1.01494336, "epoch": 0.891988576581993, "flos": 30372201584640.0, "grad_norm": 1.31271614792042, "language_loss": 0.75358534, "learning_rate": 1.210739940361689e-07, "loss": 0.7743963, "num_input_tokens_seen": 319995555, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.33984375, "step": 14836, "time_per_iteration": 2.4360358715057373 }, { "auxiliary_loss_clip": 0.01051151, "auxiliary_loss_mlp": 0.01039965, "balance_loss_clip": 1.01724374, "balance_loss_mlp": 1.01620698, "epoch": 0.8920486998346611, "flos": 15552234887040.0, "grad_norm": 3.0379453071869857, "language_loss": 0.69573683, "learning_rate": 1.2094058047798838e-07, "loss": 0.71664798, "num_input_tokens_seen": 320012385, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34960938, "step": 14837, "time_per_iteration": 2.3673036098480225 }, { "auxiliary_loss_clip": 0.01052796, "auxiliary_loss_mlp": 0.01040699, "balance_loss_clip": 1.0172627, "balance_loss_mlp": 1.01556599, "epoch": 0.892108823087329, "flos": 21213924848640.0, "grad_norm": 1.8960288765815292, "language_loss": 0.68573689, "learning_rate": 1.2080723817392913e-07, "loss": 0.70667183, "num_input_tokens_seen": 320032390, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.37304688, "step": 14838, "time_per_iteration": 2.3757171630859375 }, { "auxiliary_loss_clip": 0.01052031, "auxiliary_loss_mlp": 0.01039206, "balance_loss_clip": 1.01641309, "balance_loss_mlp": 1.01611972, "epoch": 0.892168946339997, "flos": 21977964241920.0, "grad_norm": 1.9506570729956223, "language_loss": 0.76815999, "learning_rate": 1.2067396712904777e-07, "loss": 0.78907233, "num_input_tokens_seen": 320052885, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.359375, "step": 14839, "time_per_iteration": 2.394498109817505 }, { "auxiliary_loss_clip": 0.01007086, "auxiliary_loss_mlp": 0.01002901, "balance_loss_clip": 1.00080252, "balance_loss_mlp": 1.00069702, "epoch": 0.892229069592665, "flos": 67472025984000.0, "grad_norm": 0.6856539739649039, "language_loss": 0.49558571, "learning_rate": 1.205407673483978e-07, "loss": 0.51568556, "num_input_tokens_seen": 320113685, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.06396484, "step": 14840, "time_per_iteration": 2.976696729660034 }, { "auxiliary_loss_clip": 0.01054505, "auxiliary_loss_mlp": 0.01041835, "balance_loss_clip": 1.01541853, "balance_loss_mlp": 1.0165602, "epoch": 0.8922891928453329, "flos": 19458666645120.0, "grad_norm": 2.364006681994037, "language_loss": 0.65899092, "learning_rate": 1.2040763883703074e-07, "loss": 0.67995429, "num_input_tokens_seen": 320130810, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.37890625, "step": 14841, "time_per_iteration": 2.3947315216064453 }, { "auxiliary_loss_clip": 0.01048322, "auxiliary_loss_mlp": 0.01038593, "balance_loss_clip": 1.01680136, "balance_loss_mlp": 1.01553035, "epoch": 0.8923493160980009, "flos": 23366285389440.0, "grad_norm": 1.6233230426512433, "language_loss": 0.69275796, "learning_rate": 1.2027458159999438e-07, "loss": 0.7136271, "num_input_tokens_seen": 320152170, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.328125, "step": 14842, "time_per_iteration": 2.4083244800567627 }, { "auxiliary_loss_clip": 0.01050709, "auxiliary_loss_mlp": 0.01037478, "balance_loss_clip": 1.01622331, "balance_loss_mlp": 1.01604009, "epoch": 0.8924094393506689, "flos": 26175850963200.0, "grad_norm": 1.8091688648881643, "language_loss": 0.81451178, "learning_rate": 1.2014159564233373e-07, "loss": 0.83539367, "num_input_tokens_seen": 320172360, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34765625, "step": 14843, "time_per_iteration": 2.4307589530944824 }, { "auxiliary_loss_clip": 0.01051939, "auxiliary_loss_mlp": 0.01037097, "balance_loss_clip": 1.01139545, "balance_loss_mlp": 1.01602364, "epoch": 0.8924695626033369, "flos": 22017415944960.0, "grad_norm": 1.7244105222734682, "language_loss": 0.69486403, "learning_rate": 1.2000868096909257e-07, "loss": 0.71575439, "num_input_tokens_seen": 320192130, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.359375, "step": 14844, "time_per_iteration": 2.3709352016448975 }, { "auxiliary_loss_clip": 0.01052165, "auxiliary_loss_mlp": 0.01036763, "balance_loss_clip": 1.01474524, "balance_loss_mlp": 1.0165503, "epoch": 0.8925296858560048, "flos": 14793048172800.0, "grad_norm": 4.42305541046238, "language_loss": 0.92664987, "learning_rate": 1.1987583758531038e-07, "loss": 0.94753921, "num_input_tokens_seen": 320207760, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35546875, "step": 14845, "time_per_iteration": 2.365661859512329 }, { "auxiliary_loss_clip": 0.01049437, "auxiliary_loss_mlp": 0.01033986, "balance_loss_clip": 1.01311231, "balance_loss_mlp": 1.01509356, "epoch": 0.8925898091086728, "flos": 22345529702400.0, "grad_norm": 2.3884934870669436, "language_loss": 0.73384041, "learning_rate": 1.1974306549602476e-07, "loss": 0.75467461, "num_input_tokens_seen": 320225325, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34375, "step": 14846, "time_per_iteration": 3.7307493686676025 }, { "auxiliary_loss_clip": 0.01052221, "auxiliary_loss_mlp": 0.01036458, "balance_loss_clip": 1.0131402, "balance_loss_mlp": 1.0160265, "epoch": 0.8926499323613407, "flos": 45804580174080.0, "grad_norm": 1.6652193887878242, "language_loss": 0.58007586, "learning_rate": 1.1961036470627094e-07, "loss": 0.6009627, "num_input_tokens_seen": 320247645, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36328125, "step": 14847, "time_per_iteration": 2.5949532985687256 }, { "auxiliary_loss_clip": 0.01051661, "auxiliary_loss_mlp": 0.01038394, "balance_loss_clip": 1.01655507, "balance_loss_mlp": 1.01553416, "epoch": 0.8927100556140087, "flos": 22125960961920.0, "grad_norm": 2.1367234469815592, "language_loss": 0.78104532, "learning_rate": 1.1947773522108052e-07, "loss": 0.80194587, "num_input_tokens_seen": 320266005, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.359375, "step": 14848, "time_per_iteration": 2.3722219467163086 }, { "auxiliary_loss_clip": 0.01050672, "auxiliary_loss_mlp": 0.01034757, "balance_loss_clip": 1.01223886, "balance_loss_mlp": 1.01609766, "epoch": 0.8927701788666766, "flos": 28328874819840.0, "grad_norm": 3.05689587058186, "language_loss": 0.70047039, "learning_rate": 1.1934517704548251e-07, "loss": 0.72132468, "num_input_tokens_seen": 320285555, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34570312, "step": 14849, "time_per_iteration": 2.4361462593078613 }, { "auxiliary_loss_clip": 0.01052132, "auxiliary_loss_mlp": 0.01041707, "balance_loss_clip": 1.01910448, "balance_loss_mlp": 1.01673102, "epoch": 0.8928303021193447, "flos": 25293980131200.0, "grad_norm": 2.161323663643168, "language_loss": 0.81212139, "learning_rate": 1.1921269018450364e-07, "loss": 0.83305979, "num_input_tokens_seen": 320305395, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35351562, "step": 14850, "time_per_iteration": 2.4341299533843994 }, { "auxiliary_loss_clip": 0.0104961, "auxiliary_loss_mlp": 0.01038179, "balance_loss_clip": 1.01572013, "balance_loss_mlp": 1.01556706, "epoch": 0.8928904253720126, "flos": 22235623142400.0, "grad_norm": 1.5395075069989408, "language_loss": 0.76271588, "learning_rate": 1.1908027464316872e-07, "loss": 0.78359377, "num_input_tokens_seen": 320324220, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.33984375, "step": 14851, "time_per_iteration": 2.3653006553649902 }, { "auxiliary_loss_clip": 0.0104922, "auxiliary_loss_mlp": 0.01037056, "balance_loss_clip": 1.01419163, "balance_loss_mlp": 1.01531935, "epoch": 0.8929505486246806, "flos": 27091064010240.0, "grad_norm": 4.843097906590092, "language_loss": 0.79422939, "learning_rate": 1.1894793042649775e-07, "loss": 0.81509215, "num_input_tokens_seen": 320347195, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.33789062, "step": 14852, "time_per_iteration": 2.4754979610443115 }, { "auxiliary_loss_clip": 0.01049915, "auxiliary_loss_mlp": 0.01034085, "balance_loss_clip": 1.01439118, "balance_loss_mlp": 1.01651001, "epoch": 0.8930106718773486, "flos": 23038241454720.0, "grad_norm": 1.4157552946801468, "language_loss": 0.69985759, "learning_rate": 1.1881565753951006e-07, "loss": 0.72069752, "num_input_tokens_seen": 320366850, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.33398438, "step": 14853, "time_per_iteration": 2.40934157371521 }, { "auxiliary_loss_clip": 0.0105199, "auxiliary_loss_mlp": 0.01034802, "balance_loss_clip": 1.01092386, "balance_loss_mlp": 1.01638603, "epoch": 0.8930707951300165, "flos": 35625198637440.0, "grad_norm": 1.6617548273687268, "language_loss": 0.68832666, "learning_rate": 1.1868345598722118e-07, "loss": 0.70919454, "num_input_tokens_seen": 320388895, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35546875, "step": 14854, "time_per_iteration": 2.5686371326446533 }, { "auxiliary_loss_clip": 0.01049586, "auxiliary_loss_mlp": 0.01033522, "balance_loss_clip": 1.01331615, "balance_loss_mlp": 1.01558971, "epoch": 0.8931309183826845, "flos": 23038765125120.0, "grad_norm": 1.4611403327704033, "language_loss": 0.75849593, "learning_rate": 1.1855132577464399e-07, "loss": 0.77932698, "num_input_tokens_seen": 320408520, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.33984375, "step": 14855, "time_per_iteration": 2.386436939239502 }, { "auxiliary_loss_clip": 0.0104978, "auxiliary_loss_mlp": 0.01037866, "balance_loss_clip": 1.01554978, "balance_loss_mlp": 1.01490414, "epoch": 0.8931910416353525, "flos": 26503441050240.0, "grad_norm": 1.8815796286748594, "language_loss": 0.65241063, "learning_rate": 1.1841926690678893e-07, "loss": 0.67328715, "num_input_tokens_seen": 320427400, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34765625, "step": 14856, "time_per_iteration": 3.8436403274536133 }, { "auxiliary_loss_clip": 0.01051013, "auxiliary_loss_mlp": 0.01035151, "balance_loss_clip": 1.01333547, "balance_loss_mlp": 1.01614642, "epoch": 0.8932511648880205, "flos": 24972429709440.0, "grad_norm": 1.7686468559502548, "language_loss": 0.67592967, "learning_rate": 1.1828727938866378e-07, "loss": 0.69679129, "num_input_tokens_seen": 320447570, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34765625, "step": 14857, "time_per_iteration": 3.7630090713500977 }, { "auxiliary_loss_clip": 0.01052192, "auxiliary_loss_mlp": 0.01041385, "balance_loss_clip": 1.0177815, "balance_loss_mlp": 1.01644111, "epoch": 0.8933112881406884, "flos": 24459625526400.0, "grad_norm": 2.5745242543170246, "language_loss": 0.76602131, "learning_rate": 1.1815536322527408e-07, "loss": 0.78695714, "num_input_tokens_seen": 320464405, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35742188, "step": 14858, "time_per_iteration": 2.4342620372772217 }, { "auxiliary_loss_clip": 0.01050327, "auxiliary_loss_mlp": 0.01033579, "balance_loss_clip": 1.01047635, "balance_loss_mlp": 1.01556945, "epoch": 0.8933714113933564, "flos": 28291832000640.0, "grad_norm": 1.5943165359000278, "language_loss": 0.7055527, "learning_rate": 1.1802351842162139e-07, "loss": 0.72639167, "num_input_tokens_seen": 320485525, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.34765625, "step": 14859, "time_per_iteration": 2.558547019958496 }, { "auxiliary_loss_clip": 0.0104713, "auxiliary_loss_mlp": 0.01032735, "balance_loss_clip": 1.0133996, "balance_loss_mlp": 1.01487541, "epoch": 0.8934315346460243, "flos": 21433772880000.0, "grad_norm": 1.8166937381377912, "language_loss": 0.7648983, "learning_rate": 1.1789174498270526e-07, "loss": 0.78569686, "num_input_tokens_seen": 320506725, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.32421875, "step": 14860, "time_per_iteration": 2.41283917427063 }, { "auxiliary_loss_clip": 0.01053441, "auxiliary_loss_mlp": 0.01039679, "balance_loss_clip": 1.01557446, "balance_loss_mlp": 1.0170486, "epoch": 0.8934916578986923, "flos": 23768449873920.0, "grad_norm": 1.7940440570536653, "language_loss": 0.5912354, "learning_rate": 1.1776004291352303e-07, "loss": 0.61216664, "num_input_tokens_seen": 320525425, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36328125, "step": 14861, "time_per_iteration": 2.3816370964050293 }, { "auxiliary_loss_clip": 0.01048758, "auxiliary_loss_mlp": 0.01036417, "balance_loss_clip": 1.01494741, "balance_loss_mlp": 1.01441264, "epoch": 0.8935517811513602, "flos": 18915173510400.0, "grad_norm": 1.8436424259842377, "language_loss": 0.64547014, "learning_rate": 1.176284122190685e-07, "loss": 0.66632187, "num_input_tokens_seen": 320543010, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34375, "step": 14862, "time_per_iteration": 2.3634543418884277 }, { "auxiliary_loss_clip": 0.01049466, "auxiliary_loss_mlp": 0.01037101, "balance_loss_clip": 1.01497626, "balance_loss_mlp": 1.01551104, "epoch": 0.8936119044040283, "flos": 24060219039360.0, "grad_norm": 1.6675190657866936, "language_loss": 0.7917698, "learning_rate": 1.1749685290433298e-07, "loss": 0.81263542, "num_input_tokens_seen": 320562180, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.33984375, "step": 14863, "time_per_iteration": 2.39382004737854 }, { "auxiliary_loss_clip": 0.01048595, "auxiliary_loss_mlp": 0.01035639, "balance_loss_clip": 1.01497972, "balance_loss_mlp": 1.0147177, "epoch": 0.8936720276566962, "flos": 21323028447360.0, "grad_norm": 1.8049602549685346, "language_loss": 0.71942008, "learning_rate": 1.1736536497430627e-07, "loss": 0.74026239, "num_input_tokens_seen": 320580395, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.33984375, "step": 14864, "time_per_iteration": 2.37341046333313 }, { "auxiliary_loss_clip": 0.01054097, "auxiliary_loss_mlp": 0.01042879, "balance_loss_clip": 1.0194782, "balance_loss_mlp": 1.01658177, "epoch": 0.8937321509093642, "flos": 18405127324800.0, "grad_norm": 2.052172941113189, "language_loss": 0.77792966, "learning_rate": 1.1723394843397283e-07, "loss": 0.79889941, "num_input_tokens_seen": 320599505, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.375, "step": 14865, "time_per_iteration": 2.3459110260009766 }, { "auxiliary_loss_clip": 0.01049055, "auxiliary_loss_mlp": 0.01034974, "balance_loss_clip": 1.01500595, "balance_loss_mlp": 1.01516318, "epoch": 0.8937922741620322, "flos": 22053655802880.0, "grad_norm": 1.5636695727912613, "language_loss": 0.728338, "learning_rate": 1.1710260328831668e-07, "loss": 0.74917829, "num_input_tokens_seen": 320619825, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.33984375, "step": 14866, "time_per_iteration": 2.42179536819458 }, { "auxiliary_loss_clip": 0.01053365, "auxiliary_loss_mlp": 0.01039297, "balance_loss_clip": 1.0146327, "balance_loss_mlp": 1.0164324, "epoch": 0.8938523974147001, "flos": 25663256248320.0, "grad_norm": 1.9238564542844627, "language_loss": 0.84980315, "learning_rate": 1.1697132954231869e-07, "loss": 0.87072974, "num_input_tokens_seen": 320638515, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36914062, "step": 14867, "time_per_iteration": 2.3887078762054443 }, { "auxiliary_loss_clip": 0.01050782, "auxiliary_loss_mlp": 0.01035193, "balance_loss_clip": 1.01399803, "balance_loss_mlp": 1.01521313, "epoch": 0.8939125206673681, "flos": 25741566161280.0, "grad_norm": 1.6311325385807318, "language_loss": 0.81606942, "learning_rate": 1.168401272009567e-07, "loss": 0.83692914, "num_input_tokens_seen": 320659430, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.35546875, "step": 14868, "time_per_iteration": 2.445683240890503 }, { "auxiliary_loss_clip": 0.01051987, "auxiliary_loss_mlp": 0.01042979, "balance_loss_clip": 1.01981664, "balance_loss_mlp": 1.0159502, "epoch": 0.8939726439200361, "flos": 27343276738560.0, "grad_norm": 1.7472753569003732, "language_loss": 0.78087652, "learning_rate": 1.167089962692056e-07, "loss": 0.80182618, "num_input_tokens_seen": 320679295, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.359375, "step": 14869, "time_per_iteration": 3.870619773864746 }, { "auxiliary_loss_clip": 0.01050143, "auxiliary_loss_mlp": 0.01028966, "balance_loss_clip": 1.00813973, "balance_loss_mlp": 1.01540112, "epoch": 0.8940327671727041, "flos": 20337814391040.0, "grad_norm": 1.629715546094554, "language_loss": 0.66370142, "learning_rate": 1.1657793675203853e-07, "loss": 0.68449247, "num_input_tokens_seen": 320697535, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.34765625, "step": 14870, "time_per_iteration": 2.356036901473999 }, { "auxiliary_loss_clip": 0.01007202, "auxiliary_loss_mlp": 0.01005391, "balance_loss_clip": 1.00344801, "balance_loss_mlp": 1.00076461, "epoch": 0.894092890425372, "flos": 58408015518720.0, "grad_norm": 0.800322759913264, "language_loss": 0.55945396, "learning_rate": 1.1644694865442461e-07, "loss": 0.57957995, "num_input_tokens_seen": 320758635, "router_z_loss_clip": 0.01940918, "router_z_loss_mlp": 0.06445312, "step": 14871, "time_per_iteration": 3.0734031200408936 }, { "auxiliary_loss_clip": 0.01050125, "auxiliary_loss_mlp": 0.01036696, "balance_loss_clip": 1.0156672, "balance_loss_mlp": 1.01646757, "epoch": 0.89415301367804, "flos": 19828606078080.0, "grad_norm": 1.7949662647018405, "language_loss": 0.77935767, "learning_rate": 1.16316031981331e-07, "loss": 0.80022585, "num_input_tokens_seen": 320777175, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.3359375, "step": 14872, "time_per_iteration": 2.420847177505493 }, { "auxiliary_loss_clip": 0.01046851, "auxiliary_loss_mlp": 0.01032763, "balance_loss_clip": 1.01291442, "balance_loss_mlp": 1.01451278, "epoch": 0.8942131369307079, "flos": 25774594174080.0, "grad_norm": 1.6718826344661868, "language_loss": 0.67990822, "learning_rate": 1.1618518673772215e-07, "loss": 0.70070434, "num_input_tokens_seen": 320797670, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.32421875, "step": 14873, "time_per_iteration": 2.424628734588623 }, { "auxiliary_loss_clip": 0.01049096, "auxiliary_loss_mlp": 0.01034742, "balance_loss_clip": 1.01342726, "balance_loss_mlp": 1.01523137, "epoch": 0.8942732601833759, "flos": 23147903635200.0, "grad_norm": 1.6180229024717137, "language_loss": 0.60626721, "learning_rate": 1.1605441292856033e-07, "loss": 0.62710565, "num_input_tokens_seen": 320817410, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.33789062, "step": 14874, "time_per_iteration": 2.379300594329834 }, { "auxiliary_loss_clip": 0.01053543, "auxiliary_loss_mlp": 0.01038705, "balance_loss_clip": 1.01581693, "balance_loss_mlp": 1.01715636, "epoch": 0.8943333834360438, "flos": 27854300442240.0, "grad_norm": 1.912092342744865, "language_loss": 0.77353221, "learning_rate": 1.1592371055880356e-07, "loss": 0.79445469, "num_input_tokens_seen": 320836745, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36328125, "step": 14875, "time_per_iteration": 2.433391809463501 }, { "auxiliary_loss_clip": 0.01053762, "auxiliary_loss_mlp": 0.01045729, "balance_loss_clip": 1.01922858, "balance_loss_mlp": 1.01584637, "epoch": 0.8943935066887119, "flos": 22162864135680.0, "grad_norm": 1.855332862834475, "language_loss": 0.78396523, "learning_rate": 1.1579307963340857e-07, "loss": 0.80496019, "num_input_tokens_seen": 320853305, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37890625, "step": 14876, "time_per_iteration": 2.399177312850952 }, { "auxiliary_loss_clip": 0.01050012, "auxiliary_loss_mlp": 0.01031158, "balance_loss_clip": 1.01018882, "balance_loss_mlp": 1.01546669, "epoch": 0.8944536299413798, "flos": 21469000308480.0, "grad_norm": 1.6770990792964833, "language_loss": 0.7972399, "learning_rate": 1.156625201573287e-07, "loss": 0.81805158, "num_input_tokens_seen": 320872885, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34570312, "step": 14877, "time_per_iteration": 2.383795738220215 }, { "auxiliary_loss_clip": 0.0105043, "auxiliary_loss_mlp": 0.010351, "balance_loss_clip": 1.01267624, "balance_loss_mlp": 1.01588964, "epoch": 0.8945137531940478, "flos": 17747817557760.0, "grad_norm": 2.305642842773134, "language_loss": 0.76421052, "learning_rate": 1.155320321355151e-07, "loss": 0.78506583, "num_input_tokens_seen": 320889755, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34570312, "step": 14878, "time_per_iteration": 2.3365564346313477 }, { "auxiliary_loss_clip": 0.01050698, "auxiliary_loss_mlp": 0.01035916, "balance_loss_clip": 1.01343298, "balance_loss_mlp": 1.01549721, "epoch": 0.8945738764467158, "flos": 21141200753280.0, "grad_norm": 1.7712033576593, "language_loss": 0.77257764, "learning_rate": 1.1540161557291539e-07, "loss": 0.7934438, "num_input_tokens_seen": 320907860, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3515625, "step": 14879, "time_per_iteration": 2.393078088760376 }, { "auxiliary_loss_clip": 0.01052155, "auxiliary_loss_mlp": 0.01034779, "balance_loss_clip": 1.01235616, "balance_loss_mlp": 1.01615703, "epoch": 0.8946339996993837, "flos": 14902116860160.0, "grad_norm": 1.9076792557482565, "language_loss": 0.76041067, "learning_rate": 1.1527127047447538e-07, "loss": 0.78128004, "num_input_tokens_seen": 320925825, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.359375, "step": 14880, "time_per_iteration": 2.3671092987060547 }, { "auxiliary_loss_clip": 0.01050783, "auxiliary_loss_mlp": 0.010355, "balance_loss_clip": 1.01316023, "balance_loss_mlp": 1.01548636, "epoch": 0.8946941229520518, "flos": 27380913050880.0, "grad_norm": 1.8168109603602274, "language_loss": 0.84126222, "learning_rate": 1.1514099684513822e-07, "loss": 0.86212504, "num_input_tokens_seen": 320946165, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35351562, "step": 14881, "time_per_iteration": 2.437927007675171 }, { "auxiliary_loss_clip": 0.01049666, "auxiliary_loss_mlp": 0.01034272, "balance_loss_clip": 1.01284957, "balance_loss_mlp": 1.01545405, "epoch": 0.8947542462047197, "flos": 31794912288000.0, "grad_norm": 1.5486726370160948, "language_loss": 0.67834449, "learning_rate": 1.1501079468984287e-07, "loss": 0.69918382, "num_input_tokens_seen": 320969330, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34179688, "step": 14882, "time_per_iteration": 2.4777891635894775 }, { "auxiliary_loss_clip": 0.0105358, "auxiliary_loss_mlp": 0.01041526, "balance_loss_clip": 1.01535988, "balance_loss_mlp": 1.01601171, "epoch": 0.8948143694573877, "flos": 20882634157440.0, "grad_norm": 2.8837520252788424, "language_loss": 0.77034658, "learning_rate": 1.1488066401352691e-07, "loss": 0.79129761, "num_input_tokens_seen": 320985055, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.375, "step": 14883, "time_per_iteration": 2.3403522968292236 }, { "auxiliary_loss_clip": 0.01049713, "auxiliary_loss_mlp": 0.01039527, "balance_loss_clip": 1.01839161, "balance_loss_mlp": 1.01566184, "epoch": 0.8948744927100556, "flos": 28214429783040.0, "grad_norm": 1.5326318713516347, "language_loss": 0.72989583, "learning_rate": 1.147506048211253e-07, "loss": 0.75078821, "num_input_tokens_seen": 321004720, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.33984375, "step": 14884, "time_per_iteration": 2.441246747970581 }, { "auxiliary_loss_clip": 0.01048983, "auxiliary_loss_mlp": 0.01031806, "balance_loss_clip": 1.01125407, "balance_loss_mlp": 1.0147717, "epoch": 0.8949346159627236, "flos": 21901749010560.0, "grad_norm": 1.6431452866740375, "language_loss": 0.77318865, "learning_rate": 1.1462061711756987e-07, "loss": 0.79399657, "num_input_tokens_seen": 321022350, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.34179688, "step": 14885, "time_per_iteration": 3.6117777824401855 }, { "auxiliary_loss_clip": 0.01050511, "auxiliary_loss_mlp": 0.01037198, "balance_loss_clip": 1.01382101, "balance_loss_mlp": 1.01426768, "epoch": 0.8949947392153915, "flos": 21358116230400.0, "grad_norm": 2.1325965054203078, "language_loss": 0.82853413, "learning_rate": 1.1449070090778911e-07, "loss": 0.84941119, "num_input_tokens_seen": 321040450, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36328125, "step": 14886, "time_per_iteration": 2.3813095092773438 }, { "auxiliary_loss_clip": 0.01050745, "auxiliary_loss_mlp": 0.01037813, "balance_loss_clip": 1.01658154, "balance_loss_mlp": 1.01598704, "epoch": 0.8950548624680595, "flos": 52443454579200.0, "grad_norm": 1.4708031377994466, "language_loss": 0.64703596, "learning_rate": 1.1436085619671043e-07, "loss": 0.66792148, "num_input_tokens_seen": 321063970, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34765625, "step": 14887, "time_per_iteration": 2.6677768230438232 }, { "auxiliary_loss_clip": 0.01052932, "auxiliary_loss_mlp": 0.01044163, "balance_loss_clip": 1.01923633, "balance_loss_mlp": 1.01624417, "epoch": 0.8951149857207275, "flos": 20120270509440.0, "grad_norm": 2.0096663346377177, "language_loss": 0.6189748, "learning_rate": 1.1423108298925698e-07, "loss": 0.63994581, "num_input_tokens_seen": 321083840, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3671875, "step": 14888, "time_per_iteration": 2.366886854171753 }, { "auxiliary_loss_clip": 0.01052677, "auxiliary_loss_mlp": 0.01031533, "balance_loss_clip": 1.01038527, "balance_loss_mlp": 1.01622319, "epoch": 0.8951751089733955, "flos": 29861317526400.0, "grad_norm": 5.351501224210998, "language_loss": 0.71170735, "learning_rate": 1.1410138129034952e-07, "loss": 0.73254943, "num_input_tokens_seen": 321104165, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.36523438, "step": 14889, "time_per_iteration": 2.440108299255371 }, { "auxiliary_loss_clip": 0.01053239, "auxiliary_loss_mlp": 0.01042004, "balance_loss_clip": 1.01656508, "balance_loss_mlp": 1.01639569, "epoch": 0.8952352322260634, "flos": 15262036732800.0, "grad_norm": 2.4822836599644225, "language_loss": 0.73111838, "learning_rate": 1.1397175110490676e-07, "loss": 0.75207078, "num_input_tokens_seen": 321117290, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36914062, "step": 14890, "time_per_iteration": 2.3153316974639893 }, { "auxiliary_loss_clip": 0.01050816, "auxiliary_loss_mlp": 0.01032589, "balance_loss_clip": 1.00971317, "balance_loss_mlp": 1.01551521, "epoch": 0.8952953554787314, "flos": 26797095429120.0, "grad_norm": 1.5308592940142571, "language_loss": 0.76785731, "learning_rate": 1.1384219243784454e-07, "loss": 0.7886914, "num_input_tokens_seen": 321137115, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35351562, "step": 14891, "time_per_iteration": 2.421882152557373 }, { "auxiliary_loss_clip": 0.01051884, "auxiliary_loss_mlp": 0.01033394, "balance_loss_clip": 1.01020813, "balance_loss_mlp": 1.01535416, "epoch": 0.8953554787313994, "flos": 14136331898880.0, "grad_norm": 1.7993306790696701, "language_loss": 0.77655447, "learning_rate": 1.1371270529407517e-07, "loss": 0.79740727, "num_input_tokens_seen": 321154490, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36523438, "step": 14892, "time_per_iteration": 2.3328733444213867 }, { "auxiliary_loss_clip": 0.01050942, "auxiliary_loss_mlp": 0.01035431, "balance_loss_clip": 1.01338959, "balance_loss_mlp": 1.01561952, "epoch": 0.8954156019840673, "flos": 25702114458240.0, "grad_norm": 1.2762769056415926, "language_loss": 0.82328975, "learning_rate": 1.1358328967850895e-07, "loss": 0.84415346, "num_input_tokens_seen": 321175625, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3515625, "step": 14893, "time_per_iteration": 2.443012237548828 }, { "auxiliary_loss_clip": 0.01048357, "auxiliary_loss_mlp": 0.01035011, "balance_loss_clip": 1.01459026, "balance_loss_mlp": 1.01501882, "epoch": 0.8954757252367354, "flos": 21906915891840.0, "grad_norm": 1.6395153263410058, "language_loss": 0.75543582, "learning_rate": 1.1345394559605348e-07, "loss": 0.77626944, "num_input_tokens_seen": 321193895, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.33398438, "step": 14894, "time_per_iteration": 2.38569712638855 }, { "auxiliary_loss_clip": 0.01052984, "auxiliary_loss_mlp": 0.01041675, "balance_loss_clip": 1.01717782, "balance_loss_mlp": 1.01659513, "epoch": 0.8955358484894033, "flos": 12969534528000.0, "grad_norm": 2.153770844264711, "language_loss": 0.68327785, "learning_rate": 1.1332467305161352e-07, "loss": 0.70422447, "num_input_tokens_seen": 321211610, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.36328125, "step": 14895, "time_per_iteration": 3.7975454330444336 }, { "auxiliary_loss_clip": 0.01054446, "auxiliary_loss_mlp": 0.01039442, "balance_loss_clip": 1.01481295, "balance_loss_mlp": 1.01683092, "epoch": 0.8955959717420713, "flos": 17273033712000.0, "grad_norm": 1.693988328474845, "language_loss": 0.67511004, "learning_rate": 1.1319547205009094e-07, "loss": 0.69604897, "num_input_tokens_seen": 321229805, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.375, "step": 14896, "time_per_iteration": 3.721370220184326 }, { "auxiliary_loss_clip": 0.01050763, "auxiliary_loss_mlp": 0.01034629, "balance_loss_clip": 1.01307631, "balance_loss_mlp": 1.01543355, "epoch": 0.8956560949947392, "flos": 14792978350080.0, "grad_norm": 1.72764249851666, "language_loss": 0.76612449, "learning_rate": 1.1306634259638492e-07, "loss": 0.78697842, "num_input_tokens_seen": 321247165, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.35351562, "step": 14897, "time_per_iteration": 2.3594586849212646 }, { "auxiliary_loss_clip": 0.01007246, "auxiliary_loss_mlp": 0.01003289, "balance_loss_clip": 1.00107145, "balance_loss_mlp": 1.00086248, "epoch": 0.8957162182474072, "flos": 63604661904000.0, "grad_norm": 0.7467298203132786, "language_loss": 0.55416936, "learning_rate": 1.129372846953931e-07, "loss": 0.57427466, "num_input_tokens_seen": 321308425, "router_z_loss_clip": 0.0222168, "router_z_loss_mlp": 0.06396484, "step": 14898, "time_per_iteration": 3.07427978515625 }, { "auxiliary_loss_clip": 0.01051918, "auxiliary_loss_mlp": 0.0103715, "balance_loss_clip": 1.01413059, "balance_loss_mlp": 1.01599193, "epoch": 0.8957763415000751, "flos": 25008669567360.0, "grad_norm": 1.5921317808078892, "language_loss": 0.71856821, "learning_rate": 1.12808298352008e-07, "loss": 0.73945892, "num_input_tokens_seen": 321329295, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.359375, "step": 14899, "time_per_iteration": 2.4115021228790283 }, { "auxiliary_loss_clip": 0.01051737, "auxiliary_loss_mlp": 0.0103857, "balance_loss_clip": 1.01487148, "balance_loss_mlp": 1.01618242, "epoch": 0.8958364647527431, "flos": 19827593648640.0, "grad_norm": 1.7385056279148108, "language_loss": 0.74539518, "learning_rate": 1.1267938357112106e-07, "loss": 0.76629817, "num_input_tokens_seen": 321347580, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.35546875, "step": 14900, "time_per_iteration": 2.3962490558624268 }, { "auxiliary_loss_clip": 0.01007272, "auxiliary_loss_mlp": 0.01002463, "balance_loss_clip": 1.00055552, "balance_loss_mlp": 1.00079656, "epoch": 0.895896588005411, "flos": 65534102213760.0, "grad_norm": 0.7747427620410309, "language_loss": 0.61916459, "learning_rate": 1.1255054035762124e-07, "loss": 0.63926184, "num_input_tokens_seen": 321407820, "router_z_loss_clip": 0.01904297, "router_z_loss_mlp": 0.06494141, "step": 14901, "time_per_iteration": 3.017338991165161 }, { "auxiliary_loss_clip": 0.01049571, "auxiliary_loss_mlp": 0.01038436, "balance_loss_clip": 1.01610839, "balance_loss_mlp": 1.01444566, "epoch": 0.8959567112580791, "flos": 25589903748480.0, "grad_norm": 1.7682574213499602, "language_loss": 0.71641767, "learning_rate": 1.1242176871639441e-07, "loss": 0.73729777, "num_input_tokens_seen": 321426745, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3515625, "step": 14902, "time_per_iteration": 2.4123973846435547 }, { "auxiliary_loss_clip": 0.0104826, "auxiliary_loss_mlp": 0.0103355, "balance_loss_clip": 1.01157975, "balance_loss_mlp": 1.01450551, "epoch": 0.896016834510747, "flos": 24200779639680.0, "grad_norm": 1.9030319450004336, "language_loss": 0.79045039, "learning_rate": 1.1229306865232313e-07, "loss": 0.81126845, "num_input_tokens_seen": 321446165, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.33789062, "step": 14903, "time_per_iteration": 2.3912999629974365 }, { "auxiliary_loss_clip": 0.01054546, "auxiliary_loss_mlp": 0.0104068, "balance_loss_clip": 1.01562238, "balance_loss_mlp": 1.01734209, "epoch": 0.896076957763415, "flos": 23074830426240.0, "grad_norm": 1.6710328682266278, "language_loss": 0.73746455, "learning_rate": 1.121644401702877e-07, "loss": 0.75841677, "num_input_tokens_seen": 321465285, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37109375, "step": 14904, "time_per_iteration": 2.4287121295928955 }, { "auxiliary_loss_clip": 0.01051581, "auxiliary_loss_mlp": 0.01033957, "balance_loss_clip": 1.01089025, "balance_loss_mlp": 1.01550245, "epoch": 0.8961370810160829, "flos": 22235518408320.0, "grad_norm": 1.8512715429452302, "language_loss": 0.76073432, "learning_rate": 1.12035883275166e-07, "loss": 0.78158975, "num_input_tokens_seen": 321483670, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36132812, "step": 14905, "time_per_iteration": 2.365452527999878 }, { "auxiliary_loss_clip": 0.01050247, "auxiliary_loss_mlp": 0.01036668, "balance_loss_clip": 1.01420903, "balance_loss_mlp": 1.01592159, "epoch": 0.8961972042687509, "flos": 23071304378880.0, "grad_norm": 1.6113117048297951, "language_loss": 0.78029549, "learning_rate": 1.1190739797183279e-07, "loss": 0.80116463, "num_input_tokens_seen": 321501190, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34179688, "step": 14906, "time_per_iteration": 2.4077351093292236 }, { "auxiliary_loss_clip": 0.0105304, "auxiliary_loss_mlp": 0.01039554, "balance_loss_clip": 1.01899004, "balance_loss_mlp": 1.01803231, "epoch": 0.896257327521419, "flos": 18184930179840.0, "grad_norm": 1.5567097323443335, "language_loss": 0.75146401, "learning_rate": 1.1177898426515996e-07, "loss": 0.77239001, "num_input_tokens_seen": 321518540, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.34960938, "step": 14907, "time_per_iteration": 2.3558757305145264 }, { "auxiliary_loss_clip": 0.01051067, "auxiliary_loss_mlp": 0.01047359, "balance_loss_clip": 1.02364802, "balance_loss_mlp": 1.01639163, "epoch": 0.8963174507740869, "flos": 17894487646080.0, "grad_norm": 1.647002550382285, "language_loss": 0.834548, "learning_rate": 1.1165064216001785e-07, "loss": 0.85553223, "num_input_tokens_seen": 321536555, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.34765625, "step": 14908, "time_per_iteration": 3.8091914653778076 }, { "auxiliary_loss_clip": 0.01053066, "auxiliary_loss_mlp": 0.01041182, "balance_loss_clip": 1.01584983, "balance_loss_mlp": 1.01608658, "epoch": 0.8963775740267549, "flos": 21031224370560.0, "grad_norm": 1.607222181074257, "language_loss": 0.71699393, "learning_rate": 1.1152237166127232e-07, "loss": 0.73793644, "num_input_tokens_seen": 321557655, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36914062, "step": 14909, "time_per_iteration": 2.410095691680908 }, { "auxiliary_loss_clip": 0.01051675, "auxiliary_loss_mlp": 0.01040286, "balance_loss_clip": 1.01688528, "balance_loss_mlp": 1.01584005, "epoch": 0.8964376972794228, "flos": 23178662409600.0, "grad_norm": 1.7379097705517068, "language_loss": 0.73602635, "learning_rate": 1.113941727737877e-07, "loss": 0.75694597, "num_input_tokens_seen": 321576160, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.359375, "step": 14910, "time_per_iteration": 2.4110944271087646 }, { "auxiliary_loss_clip": 0.01050207, "auxiliary_loss_mlp": 0.01035509, "balance_loss_clip": 1.0132525, "balance_loss_mlp": 1.01462257, "epoch": 0.8964978205320908, "flos": 24971836216320.0, "grad_norm": 2.656775552875627, "language_loss": 0.64188641, "learning_rate": 1.1126604550242502e-07, "loss": 0.66274357, "num_input_tokens_seen": 321596205, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35546875, "step": 14911, "time_per_iteration": 2.3925018310546875 }, { "auxiliary_loss_clip": 0.01051542, "auxiliary_loss_mlp": 0.01036905, "balance_loss_clip": 1.01445758, "balance_loss_mlp": 1.01627517, "epoch": 0.8965579437847587, "flos": 19171017020160.0, "grad_norm": 1.792018470863097, "language_loss": 0.76117301, "learning_rate": 1.111379898520437e-07, "loss": 0.78205746, "num_input_tokens_seen": 321614800, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35351562, "step": 14912, "time_per_iteration": 2.374375343322754 }, { "auxiliary_loss_clip": 0.01050144, "auxiliary_loss_mlp": 0.0103463, "balance_loss_clip": 1.01090765, "balance_loss_mlp": 1.01497805, "epoch": 0.8966180670374267, "flos": 24275633328000.0, "grad_norm": 1.822262085863558, "language_loss": 0.83166128, "learning_rate": 1.1101000582749876e-07, "loss": 0.85250902, "num_input_tokens_seen": 321633445, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.3515625, "step": 14913, "time_per_iteration": 2.39888596534729 }, { "auxiliary_loss_clip": 0.01052328, "auxiliary_loss_mlp": 0.01047077, "balance_loss_clip": 1.02086329, "balance_loss_mlp": 1.01581502, "epoch": 0.8966781902900947, "flos": 13552339720320.0, "grad_norm": 2.324459160698388, "language_loss": 0.62771755, "learning_rate": 1.1088209343364407e-07, "loss": 0.64871162, "num_input_tokens_seen": 321650890, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3671875, "step": 14914, "time_per_iteration": 2.377101182937622 }, { "auxiliary_loss_clip": 0.01007422, "auxiliary_loss_mlp": 0.01001358, "balance_loss_clip": 0.99923593, "balance_loss_mlp": 1.00106573, "epoch": 0.8967383135427627, "flos": 65062949149440.0, "grad_norm": 0.7423087575752129, "language_loss": 0.55170262, "learning_rate": 1.1075425267532956e-07, "loss": 0.57179046, "num_input_tokens_seen": 321710960, "router_z_loss_clip": 0.02124023, "router_z_loss_mlp": 0.06347656, "step": 14915, "time_per_iteration": 3.0207266807556152 }, { "auxiliary_loss_clip": 0.01048639, "auxiliary_loss_mlp": 0.01034237, "balance_loss_clip": 1.01405478, "balance_loss_mlp": 1.01540613, "epoch": 0.8967984367954306, "flos": 29711819617920.0, "grad_norm": 1.4636715657799353, "language_loss": 0.72063142, "learning_rate": 1.1062648355740289e-07, "loss": 0.7414602, "num_input_tokens_seen": 321733290, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.33203125, "step": 14916, "time_per_iteration": 2.464329242706299 }, { "auxiliary_loss_clip": 0.01051501, "auxiliary_loss_mlp": 0.01034771, "balance_loss_clip": 1.01220512, "balance_loss_mlp": 1.01658988, "epoch": 0.8968585600480986, "flos": 25701311496960.0, "grad_norm": 1.5899278302571878, "language_loss": 0.78345811, "learning_rate": 1.1049878608470931e-07, "loss": 0.80432081, "num_input_tokens_seen": 321753120, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34960938, "step": 14917, "time_per_iteration": 2.415252447128296 }, { "auxiliary_loss_clip": 0.0105421, "auxiliary_loss_mlp": 0.0104615, "balance_loss_clip": 1.01992416, "balance_loss_mlp": 1.01678526, "epoch": 0.8969186833007665, "flos": 30043389600000.0, "grad_norm": 2.352300416065489, "language_loss": 0.69692636, "learning_rate": 1.1037116026209137e-07, "loss": 0.71792996, "num_input_tokens_seen": 321772840, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37304688, "step": 14918, "time_per_iteration": 2.458951950073242 }, { "auxiliary_loss_clip": 0.01050588, "auxiliary_loss_mlp": 0.01038002, "balance_loss_clip": 1.01661563, "balance_loss_mlp": 1.01506186, "epoch": 0.8969788065534345, "flos": 22817101703040.0, "grad_norm": 1.836198909314225, "language_loss": 0.84787893, "learning_rate": 1.102436060943881e-07, "loss": 0.86876482, "num_input_tokens_seen": 321791020, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.35546875, "step": 14919, "time_per_iteration": 2.4026193618774414 }, { "auxiliary_loss_clip": 0.01052331, "auxiliary_loss_mlp": 0.01035628, "balance_loss_clip": 1.01133358, "balance_loss_mlp": 1.01585066, "epoch": 0.8970389298061026, "flos": 13260640377600.0, "grad_norm": 2.081612636702083, "language_loss": 0.74426544, "learning_rate": 1.1011612358643696e-07, "loss": 0.76514506, "num_input_tokens_seen": 321810075, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36523438, "step": 14920, "time_per_iteration": 2.3832204341888428 }, { "auxiliary_loss_clip": 0.01051676, "auxiliary_loss_mlp": 0.01036164, "balance_loss_clip": 1.01136804, "balance_loss_mlp": 1.01599586, "epoch": 0.8970990530587705, "flos": 10265406860160.0, "grad_norm": 2.3758720078494227, "language_loss": 0.9174735, "learning_rate": 1.0998871274307164e-07, "loss": 0.93835193, "num_input_tokens_seen": 321822635, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.35742188, "step": 14921, "time_per_iteration": 2.3324625492095947 }, { "auxiliary_loss_clip": 0.01052756, "auxiliary_loss_mlp": 0.01035174, "balance_loss_clip": 1.01124895, "balance_loss_mlp": 1.01590562, "epoch": 0.8971591763114385, "flos": 20301679267200.0, "grad_norm": 1.8697688024333523, "language_loss": 0.74941194, "learning_rate": 1.0986137356912384e-07, "loss": 0.77029121, "num_input_tokens_seen": 321841130, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.3671875, "step": 14922, "time_per_iteration": 2.38751220703125 }, { "auxiliary_loss_clip": 0.01051093, "auxiliary_loss_mlp": 0.01037092, "balance_loss_clip": 1.01420355, "balance_loss_mlp": 1.0159421, "epoch": 0.8972192995641064, "flos": 23255959893120.0, "grad_norm": 1.822642768230374, "language_loss": 0.71662772, "learning_rate": 1.097341060694219e-07, "loss": 0.73750961, "num_input_tokens_seen": 321859855, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3515625, "step": 14923, "time_per_iteration": 2.387789726257324 }, { "auxiliary_loss_clip": 0.01051935, "auxiliary_loss_mlp": 0.01034228, "balance_loss_clip": 1.00995719, "balance_loss_mlp": 1.01584339, "epoch": 0.8972794228167744, "flos": 18368608176000.0, "grad_norm": 2.5426642020377597, "language_loss": 0.72294468, "learning_rate": 1.0960691024879221e-07, "loss": 0.7438063, "num_input_tokens_seen": 321877990, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36132812, "step": 14924, "time_per_iteration": 3.5978314876556396 }, { "auxiliary_loss_clip": 0.01050154, "auxiliary_loss_mlp": 0.0103844, "balance_loss_clip": 1.01760197, "balance_loss_mlp": 1.01573396, "epoch": 0.8973395460694423, "flos": 23950905972480.0, "grad_norm": 1.344889882450706, "language_loss": 0.7329042, "learning_rate": 1.0947978611205844e-07, "loss": 0.75379014, "num_input_tokens_seen": 321898120, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.34375, "step": 14925, "time_per_iteration": 2.5300843715667725 }, { "auxiliary_loss_clip": 0.01054495, "auxiliary_loss_mlp": 0.01036778, "balance_loss_clip": 1.01306772, "balance_loss_mlp": 1.01749253, "epoch": 0.8973996693221103, "flos": 24969741534720.0, "grad_norm": 1.6239381845369998, "language_loss": 0.83642715, "learning_rate": 1.0935273366404008e-07, "loss": 0.85733986, "num_input_tokens_seen": 321918140, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37109375, "step": 14926, "time_per_iteration": 2.413839101791382 }, { "auxiliary_loss_clip": 0.01049514, "auxiliary_loss_mlp": 0.01036667, "balance_loss_clip": 1.014732, "balance_loss_mlp": 1.0150435, "epoch": 0.8974597925747783, "flos": 25737760823040.0, "grad_norm": 1.5140630076043755, "language_loss": 0.79529804, "learning_rate": 1.092257529095555e-07, "loss": 0.81615978, "num_input_tokens_seen": 321938580, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34570312, "step": 14927, "time_per_iteration": 2.4088706970214844 }, { "auxiliary_loss_clip": 0.01050663, "auxiliary_loss_mlp": 0.01034437, "balance_loss_clip": 1.01296735, "balance_loss_mlp": 1.01553071, "epoch": 0.8975199158274463, "flos": 38070375684480.0, "grad_norm": 2.048459350713193, "language_loss": 0.67110544, "learning_rate": 1.0909884385341994e-07, "loss": 0.6919564, "num_input_tokens_seen": 321961135, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.3515625, "step": 14928, "time_per_iteration": 2.516594171524048 }, { "auxiliary_loss_clip": 0.01052606, "auxiliary_loss_mlp": 0.01039431, "balance_loss_clip": 1.01376545, "balance_loss_mlp": 1.01620865, "epoch": 0.8975800390801142, "flos": 25410484938240.0, "grad_norm": 3.0601042888359484, "language_loss": 0.72473526, "learning_rate": 1.0897200650044602e-07, "loss": 0.74565566, "num_input_tokens_seen": 321980945, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.36328125, "step": 14929, "time_per_iteration": 2.4213781356811523 }, { "auxiliary_loss_clip": 0.01052851, "auxiliary_loss_mlp": 0.01038436, "balance_loss_clip": 1.01637006, "balance_loss_mlp": 1.01721525, "epoch": 0.8976401623327822, "flos": 21758604969600.0, "grad_norm": 2.9723918884752076, "language_loss": 0.69092166, "learning_rate": 1.0884524085544256e-07, "loss": 0.71183449, "num_input_tokens_seen": 322000350, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35546875, "step": 14930, "time_per_iteration": 2.3944034576416016 }, { "auxiliary_loss_clip": 0.01048284, "auxiliary_loss_mlp": 0.01038179, "balance_loss_clip": 1.01598191, "balance_loss_mlp": 1.01402068, "epoch": 0.8977002855854501, "flos": 13844457999360.0, "grad_norm": 1.9731137859716865, "language_loss": 0.76278591, "learning_rate": 1.0871854692321769e-07, "loss": 0.78365052, "num_input_tokens_seen": 322018980, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34179688, "step": 14931, "time_per_iteration": 2.345442771911621 }, { "auxiliary_loss_clip": 0.01050487, "auxiliary_loss_mlp": 0.01030661, "balance_loss_clip": 1.009323, "balance_loss_mlp": 1.01621962, "epoch": 0.8977604088381181, "flos": 19426511416320.0, "grad_norm": 1.849944007058618, "language_loss": 0.64554191, "learning_rate": 1.0859192470857492e-07, "loss": 0.66635334, "num_input_tokens_seen": 322037675, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34375, "step": 14932, "time_per_iteration": 2.378781318664551 }, { "auxiliary_loss_clip": 0.0104812, "auxiliary_loss_mlp": 0.0102936, "balance_loss_clip": 1.00903511, "balance_loss_mlp": 1.01489425, "epoch": 0.8978205320907862, "flos": 22741130851200.0, "grad_norm": 1.9942351478130844, "language_loss": 0.72741997, "learning_rate": 1.0846537421631552e-07, "loss": 0.74819481, "num_input_tokens_seen": 322055130, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.33203125, "step": 14933, "time_per_iteration": 2.38289737701416 }, { "auxiliary_loss_clip": 0.01051791, "auxiliary_loss_mlp": 0.01039003, "balance_loss_clip": 1.0164609, "balance_loss_mlp": 1.01613188, "epoch": 0.8978806553434541, "flos": 21359477773440.0, "grad_norm": 1.3938046570237874, "language_loss": 0.75340277, "learning_rate": 1.0833889545123898e-07, "loss": 0.77431071, "num_input_tokens_seen": 322074850, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35546875, "step": 14934, "time_per_iteration": 2.4023547172546387 }, { "auxiliary_loss_clip": 0.01051018, "auxiliary_loss_mlp": 0.01040618, "balance_loss_clip": 1.01757479, "balance_loss_mlp": 1.01608062, "epoch": 0.8979407785961221, "flos": 20923098289920.0, "grad_norm": 1.6240325149366295, "language_loss": 0.62137556, "learning_rate": 1.0821248841814123e-07, "loss": 0.6422919, "num_input_tokens_seen": 322093315, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.34960938, "step": 14935, "time_per_iteration": 3.7851572036743164 }, { "auxiliary_loss_clip": 0.01049508, "auxiliary_loss_mlp": 0.01035231, "balance_loss_clip": 1.01211643, "balance_loss_mlp": 1.01537526, "epoch": 0.89800090184879, "flos": 25227749548800.0, "grad_norm": 2.4039495523314747, "language_loss": 0.78287327, "learning_rate": 1.0808615312181512e-07, "loss": 0.80372065, "num_input_tokens_seen": 322112555, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.34179688, "step": 14936, "time_per_iteration": 3.8326992988586426 }, { "auxiliary_loss_clip": 0.01052156, "auxiliary_loss_mlp": 0.01033673, "balance_loss_clip": 1.01130927, "balance_loss_mlp": 1.01683784, "epoch": 0.898061025101458, "flos": 22561642218240.0, "grad_norm": 1.6030532497773822, "language_loss": 0.75020897, "learning_rate": 1.0795988956705193e-07, "loss": 0.77106726, "num_input_tokens_seen": 322130440, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35351562, "step": 14937, "time_per_iteration": 2.406010389328003 }, { "auxiliary_loss_clip": 0.01007309, "auxiliary_loss_mlp": 0.01002336, "balance_loss_clip": 1.00054801, "balance_loss_mlp": 1.00103116, "epoch": 0.8981211483541259, "flos": 56189843331840.0, "grad_norm": 0.9098764271575009, "language_loss": 0.63613236, "learning_rate": 1.0783369775863915e-07, "loss": 0.65622878, "num_input_tokens_seen": 322187295, "router_z_loss_clip": 0.01782227, "router_z_loss_mlp": 0.06298828, "step": 14938, "time_per_iteration": 2.9196455478668213 }, { "auxiliary_loss_clip": 0.01050698, "auxiliary_loss_mlp": 0.01031254, "balance_loss_clip": 1.01023757, "balance_loss_mlp": 1.01614356, "epoch": 0.898181271606794, "flos": 16391965841280.0, "grad_norm": 2.2618900503761643, "language_loss": 0.81114024, "learning_rate": 1.0770757770136251e-07, "loss": 0.83195972, "num_input_tokens_seen": 322202965, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.34570312, "step": 14939, "time_per_iteration": 2.369658946990967 }, { "auxiliary_loss_clip": 0.01007664, "auxiliary_loss_mlp": 0.01003141, "balance_loss_clip": 1.00113869, "balance_loss_mlp": 1.00118291, "epoch": 0.8982413948594619, "flos": 63436694019840.0, "grad_norm": 0.7169113701967231, "language_loss": 0.52947342, "learning_rate": 1.0758152940000375e-07, "loss": 0.54958147, "num_input_tokens_seen": 322269490, "router_z_loss_clip": 0.02001953, "router_z_loss_mlp": 0.06494141, "step": 14940, "time_per_iteration": 3.1660187244415283 }, { "auxiliary_loss_clip": 0.01051229, "auxiliary_loss_mlp": 0.01040568, "balance_loss_clip": 1.01623738, "balance_loss_mlp": 1.01611114, "epoch": 0.8983015181121299, "flos": 21834261619200.0, "grad_norm": 2.0616024069619168, "language_loss": 0.79089344, "learning_rate": 1.0745555285934327e-07, "loss": 0.81181133, "num_input_tokens_seen": 322288060, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.3515625, "step": 14941, "time_per_iteration": 2.4127047061920166 }, { "auxiliary_loss_clip": 0.01052144, "auxiliary_loss_mlp": 0.01034869, "balance_loss_clip": 1.01266074, "balance_loss_mlp": 1.01630378, "epoch": 0.8983616413647978, "flos": 28948687920000.0, "grad_norm": 2.525730881317025, "language_loss": 0.73958069, "learning_rate": 1.0732964808415834e-07, "loss": 0.76045084, "num_input_tokens_seen": 322307930, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.359375, "step": 14942, "time_per_iteration": 2.4382858276367188 }, { "auxiliary_loss_clip": 0.0105242, "auxiliary_loss_mlp": 0.01040774, "balance_loss_clip": 1.01807678, "balance_loss_mlp": 1.01629639, "epoch": 0.8984217646174658, "flos": 17784127238400.0, "grad_norm": 2.2793270866070037, "language_loss": 0.81660247, "learning_rate": 1.0720381507922205e-07, "loss": 0.83753443, "num_input_tokens_seen": 322326155, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36132812, "step": 14943, "time_per_iteration": 2.3747622966766357 }, { "auxiliary_loss_clip": 0.01053311, "auxiliary_loss_mlp": 0.01041032, "balance_loss_clip": 1.01645064, "balance_loss_mlp": 1.01639247, "epoch": 0.8984818878701337, "flos": 23403398031360.0, "grad_norm": 1.4724150320221203, "language_loss": 0.72212583, "learning_rate": 1.0707805384930701e-07, "loss": 0.74306917, "num_input_tokens_seen": 322345850, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36914062, "step": 14944, "time_per_iteration": 2.3862924575805664 }, { "auxiliary_loss_clip": 0.01055455, "auxiliary_loss_mlp": 0.01039911, "balance_loss_clip": 1.01595032, "balance_loss_mlp": 1.01809752, "epoch": 0.8985420111228017, "flos": 22344971120640.0, "grad_norm": 2.03705519192667, "language_loss": 0.77594727, "learning_rate": 1.0695236439918187e-07, "loss": 0.79690093, "num_input_tokens_seen": 322364715, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37304688, "step": 14945, "time_per_iteration": 2.388627529144287 }, { "auxiliary_loss_clip": 0.010551, "auxiliary_loss_mlp": 0.0103926, "balance_loss_clip": 1.01308179, "balance_loss_mlp": 1.01651609, "epoch": 0.8986021343754698, "flos": 21391842470400.0, "grad_norm": 2.006542898845663, "language_loss": 0.74621558, "learning_rate": 1.0682674673361302e-07, "loss": 0.76715916, "num_input_tokens_seen": 322383570, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.38671875, "step": 14946, "time_per_iteration": 2.3833260536193848 }, { "auxiliary_loss_clip": 0.01050539, "auxiliary_loss_mlp": 0.01035594, "balance_loss_clip": 1.01284885, "balance_loss_mlp": 1.0157361, "epoch": 0.8986622576281377, "flos": 21324285256320.0, "grad_norm": 2.439889551593908, "language_loss": 0.6509093, "learning_rate": 1.0670120085736334e-07, "loss": 0.67177063, "num_input_tokens_seen": 322401375, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.34765625, "step": 14947, "time_per_iteration": 2.392840623855591 }, { "auxiliary_loss_clip": 0.01051802, "auxiliary_loss_mlp": 0.01036048, "balance_loss_clip": 1.01486456, "balance_loss_mlp": 1.016451, "epoch": 0.8987223808808057, "flos": 23987145830400.0, "grad_norm": 1.5921533114050854, "language_loss": 0.70678592, "learning_rate": 1.0657572677519411e-07, "loss": 0.72766441, "num_input_tokens_seen": 322421890, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.35351562, "step": 14948, "time_per_iteration": 3.8085434436798096 }, { "auxiliary_loss_clip": 0.01051254, "auxiliary_loss_mlp": 0.01036947, "balance_loss_clip": 1.01385617, "balance_loss_mlp": 1.01553619, "epoch": 0.8987825041334736, "flos": 41499335422080.0, "grad_norm": 1.781191535069777, "language_loss": 0.7572571, "learning_rate": 1.0645032449186309e-07, "loss": 0.77813917, "num_input_tokens_seen": 322445730, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35742188, "step": 14949, "time_per_iteration": 2.5446434020996094 }, { "auxiliary_loss_clip": 0.01051631, "auxiliary_loss_mlp": 0.01042055, "balance_loss_clip": 1.01572132, "balance_loss_mlp": 1.01577568, "epoch": 0.8988426273861416, "flos": 27563020035840.0, "grad_norm": 2.4348267718544365, "language_loss": 0.76392531, "learning_rate": 1.0632499401212513e-07, "loss": 0.78486216, "num_input_tokens_seen": 322464595, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.359375, "step": 14950, "time_per_iteration": 2.4635443687438965 }, { "auxiliary_loss_clip": 0.01051196, "auxiliary_loss_mlp": 0.01032756, "balance_loss_clip": 1.0129317, "balance_loss_mlp": 1.01643538, "epoch": 0.8989027506388095, "flos": 17091694776960.0, "grad_norm": 1.645194109234987, "language_loss": 0.67752457, "learning_rate": 1.0619973534073334e-07, "loss": 0.69836414, "num_input_tokens_seen": 322483305, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.34765625, "step": 14951, "time_per_iteration": 2.362877607345581 }, { "auxiliary_loss_clip": 0.01053806, "auxiliary_loss_mlp": 0.01039431, "balance_loss_clip": 1.01576805, "balance_loss_mlp": 1.01595044, "epoch": 0.8989628738914776, "flos": 20554171286400.0, "grad_norm": 1.9398627828402029, "language_loss": 0.75155205, "learning_rate": 1.0607454848243769e-07, "loss": 0.77248442, "num_input_tokens_seen": 322501905, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37890625, "step": 14952, "time_per_iteration": 2.3772268295288086 }, { "auxiliary_loss_clip": 0.01050727, "auxiliary_loss_mlp": 0.01035794, "balance_loss_clip": 1.0132041, "balance_loss_mlp": 1.01585436, "epoch": 0.8990229971441455, "flos": 16250218254720.0, "grad_norm": 2.135275354364461, "language_loss": 0.58027595, "learning_rate": 1.0594943344198481e-07, "loss": 0.60114115, "num_input_tokens_seen": 322518135, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34765625, "step": 14953, "time_per_iteration": 2.3818588256835938 }, { "auxiliary_loss_clip": 0.01050796, "auxiliary_loss_mlp": 0.01042613, "balance_loss_clip": 1.01972508, "balance_loss_mlp": 1.01532292, "epoch": 0.8990831203968135, "flos": 21980233480320.0, "grad_norm": 1.910669273643848, "language_loss": 0.83547455, "learning_rate": 1.0582439022411915e-07, "loss": 0.85640866, "num_input_tokens_seen": 322537905, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35546875, "step": 14954, "time_per_iteration": 2.413179397583008 }, { "auxiliary_loss_clip": 0.01050464, "auxiliary_loss_mlp": 0.01036546, "balance_loss_clip": 1.01419401, "balance_loss_mlp": 1.01575589, "epoch": 0.8991432436494814, "flos": 27446131203840.0, "grad_norm": 1.929985671989952, "language_loss": 0.61545658, "learning_rate": 1.0569941883358224e-07, "loss": 0.63632667, "num_input_tokens_seen": 322557945, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34765625, "step": 14955, "time_per_iteration": 2.469252347946167 }, { "auxiliary_loss_clip": 0.01049337, "auxiliary_loss_mlp": 0.01035588, "balance_loss_clip": 1.0142138, "balance_loss_mlp": 1.01535225, "epoch": 0.8992033669021494, "flos": 21578767223040.0, "grad_norm": 2.451157314835947, "language_loss": 0.5651381, "learning_rate": 1.0557451927511341e-07, "loss": 0.58598733, "num_input_tokens_seen": 322575765, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.33984375, "step": 14956, "time_per_iteration": 2.3690924644470215 }, { "auxiliary_loss_clip": 0.01050374, "auxiliary_loss_mlp": 0.01036278, "balance_loss_clip": 1.01342547, "balance_loss_mlp": 1.01563931, "epoch": 0.8992634901548173, "flos": 28582972761600.0, "grad_norm": 1.9192830385260282, "language_loss": 0.8114742, "learning_rate": 1.0544969155344863e-07, "loss": 0.83234072, "num_input_tokens_seen": 322595665, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34765625, "step": 14957, "time_per_iteration": 2.4296562671661377 }, { "auxiliary_loss_clip": 0.01053283, "auxiliary_loss_mlp": 0.01038348, "balance_loss_clip": 1.01453018, "balance_loss_mlp": 1.01642048, "epoch": 0.8993236134074853, "flos": 19866347124480.0, "grad_norm": 1.582182770765239, "language_loss": 0.79643983, "learning_rate": 1.0532493567332123e-07, "loss": 0.81735611, "num_input_tokens_seen": 322614755, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36914062, "step": 14958, "time_per_iteration": 2.3776373863220215 }, { "auxiliary_loss_clip": 0.01052151, "auxiliary_loss_mlp": 0.01033196, "balance_loss_clip": 1.01241732, "balance_loss_mlp": 1.01742506, "epoch": 0.8993837366601534, "flos": 19389643153920.0, "grad_norm": 1.7122394836914494, "language_loss": 0.75946271, "learning_rate": 1.0520025163946277e-07, "loss": 0.78031611, "num_input_tokens_seen": 322633425, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.34765625, "step": 14959, "time_per_iteration": 2.3777871131896973 }, { "auxiliary_loss_clip": 0.01049846, "auxiliary_loss_mlp": 0.01038502, "balance_loss_clip": 1.01692522, "balance_loss_mlp": 1.01572418, "epoch": 0.8994438599128213, "flos": 18550750072320.0, "grad_norm": 2.0879824995068628, "language_loss": 0.70002806, "learning_rate": 1.0507563945660015e-07, "loss": 0.72091156, "num_input_tokens_seen": 322652065, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.33984375, "step": 14960, "time_per_iteration": 2.3423211574554443 }, { "auxiliary_loss_clip": 0.01051254, "auxiliary_loss_mlp": 0.01033875, "balance_loss_clip": 1.01154721, "balance_loss_mlp": 1.01622081, "epoch": 0.8995039831654893, "flos": 24426388045440.0, "grad_norm": 1.61849451171463, "language_loss": 0.66495299, "learning_rate": 1.049510991294591e-07, "loss": 0.68580425, "num_input_tokens_seen": 322673275, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3515625, "step": 14961, "time_per_iteration": 2.435886859893799 }, { "auxiliary_loss_clip": 0.01049697, "auxiliary_loss_mlp": 0.0103259, "balance_loss_clip": 1.0111562, "balance_loss_mlp": 1.01522911, "epoch": 0.8995641064181572, "flos": 21250269440640.0, "grad_norm": 1.5873022694003038, "language_loss": 0.83700585, "learning_rate": 1.0482663066276254e-07, "loss": 0.85782874, "num_input_tokens_seen": 322693375, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34375, "step": 14962, "time_per_iteration": 2.387817859649658 }, { "auxiliary_loss_clip": 0.01054143, "auxiliary_loss_mlp": 0.01039536, "balance_loss_clip": 1.01167727, "balance_loss_mlp": 1.01721537, "epoch": 0.8996242296708252, "flos": 23512536541440.0, "grad_norm": 2.260017434165118, "language_loss": 0.76958007, "learning_rate": 1.047022340612298e-07, "loss": 0.79051685, "num_input_tokens_seen": 322712615, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.36914062, "step": 14963, "time_per_iteration": 2.4422810077667236 }, { "auxiliary_loss_clip": 0.01007505, "auxiliary_loss_mlp": 0.01005989, "balance_loss_clip": 1.00401044, "balance_loss_mlp": 1.00110185, "epoch": 0.8996843529234931, "flos": 62400123486720.0, "grad_norm": 0.7834123166137601, "language_loss": 0.57645494, "learning_rate": 1.0457790932957867e-07, "loss": 0.59658986, "num_input_tokens_seen": 322766855, "router_z_loss_clip": 0.01977539, "router_z_loss_mlp": 0.06396484, "step": 14964, "time_per_iteration": 4.143456935882568 }, { "auxiliary_loss_clip": 0.01055042, "auxiliary_loss_mlp": 0.01040641, "balance_loss_clip": 1.01480806, "balance_loss_mlp": 1.01733744, "epoch": 0.8997444761761612, "flos": 24235867422720.0, "grad_norm": 2.7084635992428168, "language_loss": 0.69389927, "learning_rate": 1.0445365647252269e-07, "loss": 0.71485609, "num_input_tokens_seen": 322781130, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.375, "step": 14965, "time_per_iteration": 2.371042013168335 }, { "auxiliary_loss_clip": 0.01053591, "auxiliary_loss_mlp": 0.01039692, "balance_loss_clip": 1.01811528, "balance_loss_mlp": 1.01699984, "epoch": 0.8998045994288291, "flos": 21360036355200.0, "grad_norm": 2.0934087210725116, "language_loss": 0.72811615, "learning_rate": 1.0432947549477433e-07, "loss": 0.74904889, "num_input_tokens_seen": 322800310, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.36523438, "step": 14966, "time_per_iteration": 2.391310691833496 }, { "auxiliary_loss_clip": 0.010534, "auxiliary_loss_mlp": 0.01038714, "balance_loss_clip": 1.01419246, "balance_loss_mlp": 1.01733398, "epoch": 0.8998647226814971, "flos": 28984892866560.0, "grad_norm": 1.661887708072744, "language_loss": 0.74639052, "learning_rate": 1.0420536640104205e-07, "loss": 0.76731169, "num_input_tokens_seen": 322820955, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.36132812, "step": 14967, "time_per_iteration": 2.4342055320739746 }, { "auxiliary_loss_clip": 0.01050442, "auxiliary_loss_mlp": 0.01031769, "balance_loss_clip": 1.0089761, "balance_loss_mlp": 1.01524508, "epoch": 0.899924845934165, "flos": 13625063815680.0, "grad_norm": 2.1109447262525385, "language_loss": 0.73194277, "learning_rate": 1.040813291960323e-07, "loss": 0.75276482, "num_input_tokens_seen": 322838780, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3515625, "step": 14968, "time_per_iteration": 2.367720127105713 }, { "auxiliary_loss_clip": 0.01051135, "auxiliary_loss_mlp": 0.0103686, "balance_loss_clip": 1.0157479, "balance_loss_mlp": 1.0158534, "epoch": 0.899984969186833, "flos": 20881691550720.0, "grad_norm": 1.7397159097506445, "language_loss": 0.71853191, "learning_rate": 1.0395736388444864e-07, "loss": 0.73941189, "num_input_tokens_seen": 322856710, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.35351562, "step": 14969, "time_per_iteration": 2.3686838150024414 }, { "auxiliary_loss_clip": 0.01053297, "auxiliary_loss_mlp": 0.01039009, "balance_loss_clip": 1.01507187, "balance_loss_mlp": 1.01735163, "epoch": 0.9000450924395009, "flos": 20920794140160.0, "grad_norm": 1.7884294444333897, "language_loss": 0.77104056, "learning_rate": 1.0383347047099201e-07, "loss": 0.79196364, "num_input_tokens_seen": 322876070, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.359375, "step": 14970, "time_per_iteration": 2.379412889480591 }, { "auxiliary_loss_clip": 0.01051797, "auxiliary_loss_mlp": 0.0104064, "balance_loss_clip": 1.01882458, "balance_loss_mlp": 1.01554763, "epoch": 0.900105215692169, "flos": 17164104670080.0, "grad_norm": 1.8430240125246662, "language_loss": 0.73927224, "learning_rate": 1.0370964896035972e-07, "loss": 0.76019663, "num_input_tokens_seen": 322895095, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.36328125, "step": 14971, "time_per_iteration": 2.3629794120788574 }, { "auxiliary_loss_clip": 0.01051146, "auxiliary_loss_mlp": 0.01035112, "balance_loss_clip": 1.01093602, "balance_loss_mlp": 1.01574969, "epoch": 0.900165338944837, "flos": 19931076518400.0, "grad_norm": 2.2954249901035384, "language_loss": 0.82939601, "learning_rate": 1.035858993572476e-07, "loss": 0.85025859, "num_input_tokens_seen": 322911845, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.35351562, "step": 14972, "time_per_iteration": 2.347663640975952 }, { "auxiliary_loss_clip": 0.0105271, "auxiliary_loss_mlp": 0.01039299, "balance_loss_clip": 1.01543355, "balance_loss_mlp": 1.01604748, "epoch": 0.9002254621975049, "flos": 16106166518400.0, "grad_norm": 2.0012384755721135, "language_loss": 0.82860744, "learning_rate": 1.0346222166634855e-07, "loss": 0.84952754, "num_input_tokens_seen": 322928170, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.3671875, "step": 14973, "time_per_iteration": 2.3555779457092285 }, { "auxiliary_loss_clip": 0.01052145, "auxiliary_loss_mlp": 0.01037417, "balance_loss_clip": 1.01415873, "balance_loss_mlp": 1.01700425, "epoch": 0.9002855854501729, "flos": 28474846680960.0, "grad_norm": 1.7986528908218993, "language_loss": 0.5881601, "learning_rate": 1.0333861589235193e-07, "loss": 0.6090557, "num_input_tokens_seen": 322948165, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3515625, "step": 14974, "time_per_iteration": 2.42008376121521 }, { "auxiliary_loss_clip": 0.01052929, "auxiliary_loss_mlp": 0.0103729, "balance_loss_clip": 1.01406848, "balance_loss_mlp": 1.01701641, "epoch": 0.9003457087028408, "flos": 25629111072000.0, "grad_norm": 1.6279588594428829, "language_loss": 0.64286929, "learning_rate": 1.0321508203994489e-07, "loss": 0.66377151, "num_input_tokens_seen": 322968880, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.359375, "step": 14975, "time_per_iteration": 5.183246374130249 }, { "auxiliary_loss_clip": 0.01052896, "auxiliary_loss_mlp": 0.0103946, "balance_loss_clip": 1.01603532, "balance_loss_mlp": 1.01589739, "epoch": 0.9004058319555088, "flos": 24388262974080.0, "grad_norm": 1.5709603003272041, "language_loss": 0.73836386, "learning_rate": 1.0309162011381257e-07, "loss": 0.75928742, "num_input_tokens_seen": 322989395, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.37109375, "step": 14976, "time_per_iteration": 2.409421682357788 }, { "auxiliary_loss_clip": 0.01051726, "auxiliary_loss_mlp": 0.01041406, "balance_loss_clip": 1.01800513, "balance_loss_mlp": 1.01641512, "epoch": 0.9004659552081767, "flos": 29058070809600.0, "grad_norm": 1.9078653939737877, "language_loss": 0.70985556, "learning_rate": 1.0296823011863565e-07, "loss": 0.73078686, "num_input_tokens_seen": 323009060, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35351562, "step": 14977, "time_per_iteration": 2.428184747695923 }, { "auxiliary_loss_clip": 0.01052696, "auxiliary_loss_mlp": 0.01037387, "balance_loss_clip": 1.01241255, "balance_loss_mlp": 1.01554286, "epoch": 0.9005260784608448, "flos": 16762917703680.0, "grad_norm": 2.135211995319159, "language_loss": 0.68265879, "learning_rate": 1.0284491205909351e-07, "loss": 0.70355964, "num_input_tokens_seen": 323027530, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37109375, "step": 14978, "time_per_iteration": 2.359135866165161 }, { "auxiliary_loss_clip": 0.0105582, "auxiliary_loss_mlp": 0.01045073, "balance_loss_clip": 1.01900148, "balance_loss_mlp": 1.01761985, "epoch": 0.9005862017135127, "flos": 20374961944320.0, "grad_norm": 6.013381974961672, "language_loss": 0.80486274, "learning_rate": 1.0272166593986286e-07, "loss": 0.82587171, "num_input_tokens_seen": 323045370, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3828125, "step": 14979, "time_per_iteration": 2.359754800796509 }, { "auxiliary_loss_clip": 0.01007983, "auxiliary_loss_mlp": 0.01002387, "balance_loss_clip": 1.00034833, "balance_loss_mlp": 1.00132608, "epoch": 0.9006463249661807, "flos": 67577114776320.0, "grad_norm": 0.7245280421778713, "language_loss": 0.53899711, "learning_rate": 1.0259849176561642e-07, "loss": 0.55910081, "num_input_tokens_seen": 323105660, "router_z_loss_clip": 0.02038574, "router_z_loss_mlp": 0.06640625, "step": 14980, "time_per_iteration": 3.057779312133789 }, { "auxiliary_loss_clip": 0.01053606, "auxiliary_loss_mlp": 0.01041609, "balance_loss_clip": 1.01774335, "balance_loss_mlp": 1.01649356, "epoch": 0.9007064482188486, "flos": 28292076380160.0, "grad_norm": 1.725214818972455, "language_loss": 0.83419812, "learning_rate": 1.0247538954102553e-07, "loss": 0.85515028, "num_input_tokens_seen": 323126365, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 14981, "time_per_iteration": 2.412033796310425 }, { "auxiliary_loss_clip": 0.01050715, "auxiliary_loss_mlp": 0.0103479, "balance_loss_clip": 1.01095998, "balance_loss_mlp": 1.01536357, "epoch": 0.9007665714715166, "flos": 21615251460480.0, "grad_norm": 1.5332168605099683, "language_loss": 0.82007635, "learning_rate": 1.0235235927075758e-07, "loss": 0.84093142, "num_input_tokens_seen": 323145655, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35351562, "step": 14982, "time_per_iteration": 2.3825197219848633 }, { "auxiliary_loss_clip": 0.01049474, "auxiliary_loss_mlp": 0.010422, "balance_loss_clip": 1.02039623, "balance_loss_mlp": 1.01595032, "epoch": 0.9008266947241845, "flos": 26540658426240.0, "grad_norm": 1.7730057233868943, "language_loss": 0.7324748, "learning_rate": 1.0222940095947885e-07, "loss": 0.75339156, "num_input_tokens_seen": 323164540, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.3359375, "step": 14983, "time_per_iteration": 2.428668737411499 }, { "auxiliary_loss_clip": 0.01050846, "auxiliary_loss_mlp": 0.01032673, "balance_loss_clip": 1.01166868, "balance_loss_mlp": 1.01683772, "epoch": 0.9008868179768525, "flos": 23109464361600.0, "grad_norm": 1.3337988057115415, "language_loss": 0.75941443, "learning_rate": 1.0210651461185115e-07, "loss": 0.7802496, "num_input_tokens_seen": 323186960, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.33984375, "step": 14984, "time_per_iteration": 2.429661512374878 }, { "auxiliary_loss_clip": 0.01049813, "auxiliary_loss_mlp": 0.01037793, "balance_loss_clip": 1.01560795, "balance_loss_mlp": 1.01546824, "epoch": 0.9009469412295206, "flos": 19059853651200.0, "grad_norm": 2.092608885873575, "language_loss": 0.71092248, "learning_rate": 1.0198370023253456e-07, "loss": 0.73179853, "num_input_tokens_seen": 323206135, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34375, "step": 14985, "time_per_iteration": 2.37290620803833 }, { "auxiliary_loss_clip": 0.01052092, "auxiliary_loss_mlp": 0.01031717, "balance_loss_clip": 1.01024759, "balance_loss_mlp": 1.01527977, "epoch": 0.9010070644821885, "flos": 23221151400960.0, "grad_norm": 2.292884473473629, "language_loss": 0.7159552, "learning_rate": 1.0186095782618643e-07, "loss": 0.73679328, "num_input_tokens_seen": 323225980, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.3671875, "step": 14986, "time_per_iteration": 2.385183095932007 }, { "auxiliary_loss_clip": 0.01051797, "auxiliary_loss_mlp": 0.01040511, "balance_loss_clip": 1.01675248, "balance_loss_mlp": 1.01589131, "epoch": 0.9010671877348565, "flos": 17383847967360.0, "grad_norm": 1.7315819700580697, "language_loss": 0.77379215, "learning_rate": 1.0173828739746104e-07, "loss": 0.79471517, "num_input_tokens_seen": 323243700, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.359375, "step": 14987, "time_per_iteration": 2.354480504989624 }, { "auxiliary_loss_clip": 0.01051435, "auxiliary_loss_mlp": 0.01037128, "balance_loss_clip": 1.01370323, "balance_loss_mlp": 1.01610994, "epoch": 0.9011273109875244, "flos": 21907090448640.0, "grad_norm": 2.0141861638651415, "language_loss": 0.75144511, "learning_rate": 1.0161568895100981e-07, "loss": 0.77233076, "num_input_tokens_seen": 323261535, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35351562, "step": 14988, "time_per_iteration": 3.813774347305298 }, { "auxiliary_loss_clip": 0.01054688, "auxiliary_loss_mlp": 0.01039345, "balance_loss_clip": 1.01293993, "balance_loss_mlp": 1.01779079, "epoch": 0.9011874342401924, "flos": 24059695368960.0, "grad_norm": 2.325564068962338, "language_loss": 0.70492053, "learning_rate": 1.0149316249148188e-07, "loss": 0.72586083, "num_input_tokens_seen": 323281855, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.36914062, "step": 14989, "time_per_iteration": 2.4426138401031494 }, { "auxiliary_loss_clip": 0.0105247, "auxiliary_loss_mlp": 0.01033941, "balance_loss_clip": 1.01096964, "balance_loss_mlp": 1.01669765, "epoch": 0.9012475574928603, "flos": 16757995201920.0, "grad_norm": 1.9485138157820747, "language_loss": 0.80976784, "learning_rate": 1.0137070802352376e-07, "loss": 0.83063197, "num_input_tokens_seen": 323299505, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35742188, "step": 14990, "time_per_iteration": 2.3571393489837646 }, { "auxiliary_loss_clip": 0.01054177, "auxiliary_loss_mlp": 0.01038546, "balance_loss_clip": 1.01390505, "balance_loss_mlp": 1.01627719, "epoch": 0.9013076807455284, "flos": 19970179107840.0, "grad_norm": 3.200810007364867, "language_loss": 0.79065984, "learning_rate": 1.0124832555177842e-07, "loss": 0.8115871, "num_input_tokens_seen": 323318365, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37890625, "step": 14991, "time_per_iteration": 2.417475938796997 }, { "auxiliary_loss_clip": 0.01007436, "auxiliary_loss_mlp": 0.01001704, "balance_loss_clip": 0.99972475, "balance_loss_mlp": 1.00107765, "epoch": 0.9013678039981963, "flos": 65176975249920.0, "grad_norm": 0.7760474460414648, "language_loss": 0.60323453, "learning_rate": 1.0112601508088726e-07, "loss": 0.62332594, "num_input_tokens_seen": 323371835, "router_z_loss_clip": 0.01977539, "router_z_loss_mlp": 0.06347656, "step": 14992, "time_per_iteration": 2.99843430519104 }, { "auxiliary_loss_clip": 0.01050659, "auxiliary_loss_mlp": 0.01036166, "balance_loss_clip": 1.01299214, "balance_loss_mlp": 1.01558828, "epoch": 0.9014279272508643, "flos": 20520200666880.0, "grad_norm": 1.8106218087939918, "language_loss": 0.83915126, "learning_rate": 1.0100377661548764e-07, "loss": 0.86001951, "num_input_tokens_seen": 323388495, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3515625, "step": 14993, "time_per_iteration": 2.397394895553589 }, { "auxiliary_loss_clip": 0.01051226, "auxiliary_loss_mlp": 0.01035196, "balance_loss_clip": 1.01153326, "balance_loss_mlp": 1.01561332, "epoch": 0.9014880505035322, "flos": 17308156406400.0, "grad_norm": 1.8798194760138012, "language_loss": 0.74774289, "learning_rate": 1.0088161016021502e-07, "loss": 0.76860714, "num_input_tokens_seen": 323405280, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35546875, "step": 14994, "time_per_iteration": 2.359956979751587 }, { "auxiliary_loss_clip": 0.01049268, "auxiliary_loss_mlp": 0.0103649, "balance_loss_clip": 1.01575971, "balance_loss_mlp": 1.01503444, "epoch": 0.9015481737562002, "flos": 28401598915200.0, "grad_norm": 1.8157113121104376, "language_loss": 0.65301371, "learning_rate": 1.0075951571970187e-07, "loss": 0.67387128, "num_input_tokens_seen": 323425310, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.34375, "step": 14995, "time_per_iteration": 2.4588539600372314 }, { "auxiliary_loss_clip": 0.0105111, "auxiliary_loss_mlp": 0.01039816, "balance_loss_clip": 1.01612902, "balance_loss_mlp": 1.01496804, "epoch": 0.9016082970088681, "flos": 29751376055040.0, "grad_norm": 1.6908080593759034, "language_loss": 0.68207765, "learning_rate": 1.0063749329857873e-07, "loss": 0.70298696, "num_input_tokens_seen": 323447805, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36132812, "step": 14996, "time_per_iteration": 2.4428491592407227 }, { "auxiliary_loss_clip": 0.01048706, "auxiliary_loss_mlp": 0.01033665, "balance_loss_clip": 1.01232696, "balance_loss_mlp": 1.01507366, "epoch": 0.9016684202615362, "flos": 23512117605120.0, "grad_norm": 1.8268472314489996, "language_loss": 0.67126834, "learning_rate": 1.0051554290147168e-07, "loss": 0.69209206, "num_input_tokens_seen": 323467150, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.3359375, "step": 14997, "time_per_iteration": 2.4170167446136475 }, { "auxiliary_loss_clip": 0.0105094, "auxiliary_loss_mlp": 0.01033238, "balance_loss_clip": 1.01087403, "balance_loss_mlp": 1.01565516, "epoch": 0.9017285435142042, "flos": 16978401815040.0, "grad_norm": 1.7759333677244078, "language_loss": 0.7839765, "learning_rate": 1.0039366453300613e-07, "loss": 0.80481827, "num_input_tokens_seen": 323484250, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3515625, "step": 14998, "time_per_iteration": 2.3681607246398926 }, { "auxiliary_loss_clip": 0.01052701, "auxiliary_loss_mlp": 0.0103883, "balance_loss_clip": 1.01464236, "balance_loss_mlp": 1.01654065, "epoch": 0.9017886667668721, "flos": 21392401052160.0, "grad_norm": 1.6911029672543323, "language_loss": 0.75853956, "learning_rate": 1.0027185819780281e-07, "loss": 0.77945483, "num_input_tokens_seen": 323502910, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36132812, "step": 14999, "time_per_iteration": 2.3810958862304688 }, { "auxiliary_loss_clip": 0.01051058, "auxiliary_loss_mlp": 0.01037731, "balance_loss_clip": 1.01347232, "balance_loss_mlp": 1.01587367, "epoch": 0.9018487900195401, "flos": 20995508183040.0, "grad_norm": 1.9517952414468158, "language_loss": 0.76644188, "learning_rate": 1.0015012390048117e-07, "loss": 0.78732973, "num_input_tokens_seen": 323521820, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3515625, "step": 15000, "time_per_iteration": 2.391608238220215 }, { "auxiliary_loss_clip": 0.01050462, "auxiliary_loss_mlp": 0.01031373, "balance_loss_clip": 1.00877094, "balance_loss_mlp": 1.01581681, "epoch": 0.901908913272208, "flos": 53356503121920.0, "grad_norm": 2.8663410385562913, "language_loss": 0.81709802, "learning_rate": 1.0002846164565704e-07, "loss": 0.83791637, "num_input_tokens_seen": 323543200, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34765625, "step": 15001, "time_per_iteration": 2.6566247940063477 }, { "auxiliary_loss_clip": 0.0105002, "auxiliary_loss_mlp": 0.01030507, "balance_loss_clip": 1.01052737, "balance_loss_mlp": 1.01615834, "epoch": 0.901969036524876, "flos": 22088778497280.0, "grad_norm": 1.4052938642882236, "language_loss": 0.78998697, "learning_rate": 9.990687143794407e-08, "loss": 0.81079233, "num_input_tokens_seen": 323563075, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.33984375, "step": 15002, "time_per_iteration": 2.3991551399230957 }, { "auxiliary_loss_clip": 0.01051953, "auxiliary_loss_mlp": 0.01035107, "balance_loss_clip": 1.01230192, "balance_loss_mlp": 1.01652622, "epoch": 0.9020291597775439, "flos": 23834086963200.0, "grad_norm": 2.770052620205701, "language_loss": 0.68741709, "learning_rate": 9.978535328195347e-08, "loss": 0.70828772, "num_input_tokens_seen": 323579065, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35546875, "step": 15003, "time_per_iteration": 3.716336250305176 }, { "auxiliary_loss_clip": 0.01053068, "auxiliary_loss_mlp": 0.01036269, "balance_loss_clip": 1.01209319, "balance_loss_mlp": 1.01662791, "epoch": 0.902089283030212, "flos": 18325211489280.0, "grad_norm": 1.8995512798942527, "language_loss": 0.86979282, "learning_rate": 9.9663907182292e-08, "loss": 0.89068621, "num_input_tokens_seen": 323594835, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36328125, "step": 15004, "time_per_iteration": 2.3642349243164062 }, { "auxiliary_loss_clip": 0.0105162, "auxiliary_loss_mlp": 0.01039829, "balance_loss_clip": 1.01704788, "balance_loss_mlp": 1.01606667, "epoch": 0.9021494062828799, "flos": 24169217904000.0, "grad_norm": 2.3091997467723004, "language_loss": 0.73508811, "learning_rate": 9.954253314356575e-08, "loss": 0.75600266, "num_input_tokens_seen": 323611475, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35546875, "step": 15005, "time_per_iteration": 2.406956911087036 }, { "auxiliary_loss_clip": 0.01051284, "auxiliary_loss_mlp": 0.01040814, "balance_loss_clip": 1.01685286, "balance_loss_mlp": 1.01528573, "epoch": 0.9022095295355479, "flos": 21615775130880.0, "grad_norm": 1.9508833116161668, "language_loss": 0.72676343, "learning_rate": 9.942123117037748e-08, "loss": 0.74768442, "num_input_tokens_seen": 323629730, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.359375, "step": 15006, "time_per_iteration": 2.3939785957336426 }, { "auxiliary_loss_clip": 0.0105262, "auxiliary_loss_mlp": 0.01036675, "balance_loss_clip": 1.01438296, "balance_loss_mlp": 1.01610792, "epoch": 0.9022696527882158, "flos": 18725455848960.0, "grad_norm": 1.9916306320954533, "language_loss": 0.8559652, "learning_rate": 9.930000126732618e-08, "loss": 0.87685817, "num_input_tokens_seen": 323646000, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.36523438, "step": 15007, "time_per_iteration": 2.3554325103759766 }, { "auxiliary_loss_clip": 0.0105011, "auxiliary_loss_mlp": 0.01033275, "balance_loss_clip": 1.01095879, "balance_loss_mlp": 1.01654315, "epoch": 0.9023297760408838, "flos": 26759982787200.0, "grad_norm": 1.54843759899046, "language_loss": 0.79450673, "learning_rate": 9.917884343900928e-08, "loss": 0.81534058, "num_input_tokens_seen": 323667250, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3359375, "step": 15008, "time_per_iteration": 2.4071714878082275 }, { "auxiliary_loss_clip": 0.01050235, "auxiliary_loss_mlp": 0.01035299, "balance_loss_clip": 1.01332867, "balance_loss_mlp": 1.01637256, "epoch": 0.9023898992935517, "flos": 20521492387200.0, "grad_norm": 1.71959046376413, "language_loss": 0.74527127, "learning_rate": 9.905775769002156e-08, "loss": 0.76612663, "num_input_tokens_seen": 323687150, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.33984375, "step": 15009, "time_per_iteration": 2.3788228034973145 }, { "auxiliary_loss_clip": 0.01050922, "auxiliary_loss_mlp": 0.01035349, "balance_loss_clip": 1.01242483, "balance_loss_mlp": 1.01579547, "epoch": 0.9024500225462198, "flos": 17455699278720.0, "grad_norm": 1.7724144213564452, "language_loss": 0.74492258, "learning_rate": 9.893674402495399e-08, "loss": 0.76578534, "num_input_tokens_seen": 323703660, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.3515625, "step": 15010, "time_per_iteration": 2.3566486835479736 }, { "auxiliary_loss_clip": 0.01050957, "auxiliary_loss_mlp": 0.01036734, "balance_loss_clip": 1.01117563, "balance_loss_mlp": 1.01465726, "epoch": 0.9025101457988878, "flos": 20812563325440.0, "grad_norm": 1.9031977484913598, "language_loss": 0.76408154, "learning_rate": 9.881580244839538e-08, "loss": 0.78495848, "num_input_tokens_seen": 323722060, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.36328125, "step": 15011, "time_per_iteration": 2.3883235454559326 }, { "auxiliary_loss_clip": 0.01053698, "auxiliary_loss_mlp": 0.0104295, "balance_loss_clip": 1.01808286, "balance_loss_mlp": 1.0157665, "epoch": 0.9025702690515557, "flos": 19025359361280.0, "grad_norm": 2.1770034700634917, "language_loss": 0.74383104, "learning_rate": 9.869493296493204e-08, "loss": 0.76479757, "num_input_tokens_seen": 323740645, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37890625, "step": 15012, "time_per_iteration": 2.3655524253845215 }, { "auxiliary_loss_clip": 0.01052583, "auxiliary_loss_mlp": 0.01035956, "balance_loss_clip": 1.01478457, "balance_loss_mlp": 1.01700544, "epoch": 0.9026303923042237, "flos": 19681796344320.0, "grad_norm": 1.698880116206448, "language_loss": 0.70181191, "learning_rate": 9.857413557914763e-08, "loss": 0.72269726, "num_input_tokens_seen": 323758905, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.35546875, "step": 15013, "time_per_iteration": 2.376260757446289 }, { "auxiliary_loss_clip": 0.01048519, "auxiliary_loss_mlp": 0.01036328, "balance_loss_clip": 1.01497746, "balance_loss_mlp": 1.01544189, "epoch": 0.9026905155568916, "flos": 24606958930560.0, "grad_norm": 1.427717531059623, "language_loss": 0.73779076, "learning_rate": 9.845341029562249e-08, "loss": 0.75863922, "num_input_tokens_seen": 323780595, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.33007812, "step": 15014, "time_per_iteration": 3.8816635608673096 }, { "auxiliary_loss_clip": 0.01050209, "auxiliary_loss_mlp": 0.0103403, "balance_loss_clip": 1.01077271, "balance_loss_mlp": 1.01480484, "epoch": 0.9027506388095596, "flos": 20520759248640.0, "grad_norm": 1.860487741399592, "language_loss": 0.73024958, "learning_rate": 9.833275711893474e-08, "loss": 0.75109196, "num_input_tokens_seen": 323798160, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35351562, "step": 15015, "time_per_iteration": 3.698607921600342 }, { "auxiliary_loss_clip": 0.01052068, "auxiliary_loss_mlp": 0.01037867, "balance_loss_clip": 1.01651633, "balance_loss_mlp": 1.01630497, "epoch": 0.9028107620622275, "flos": 22783375463040.0, "grad_norm": 1.811972082476494, "language_loss": 0.70252675, "learning_rate": 9.821217605365895e-08, "loss": 0.7234261, "num_input_tokens_seen": 323816810, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.35742188, "step": 15016, "time_per_iteration": 2.3719584941864014 }, { "auxiliary_loss_clip": 0.01050779, "auxiliary_loss_mlp": 0.01032007, "balance_loss_clip": 1.01182461, "balance_loss_mlp": 1.01671278, "epoch": 0.9028708853148956, "flos": 25409367774720.0, "grad_norm": 1.660263225613082, "language_loss": 0.71542913, "learning_rate": 9.809166710436855e-08, "loss": 0.73625702, "num_input_tokens_seen": 323836900, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.33984375, "step": 15017, "time_per_iteration": 2.4338834285736084 }, { "auxiliary_loss_clip": 0.01050448, "auxiliary_loss_mlp": 0.0104089, "balance_loss_clip": 1.01905072, "balance_loss_mlp": 1.01607239, "epoch": 0.9029310085675635, "flos": 21870257097600.0, "grad_norm": 1.8098640209285484, "language_loss": 0.70274669, "learning_rate": 9.797123027563237e-08, "loss": 0.72366011, "num_input_tokens_seen": 323855325, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34375, "step": 15018, "time_per_iteration": 2.380964517593384 }, { "auxiliary_loss_clip": 0.01050908, "auxiliary_loss_mlp": 0.01037847, "balance_loss_clip": 1.01439905, "balance_loss_mlp": 1.01544189, "epoch": 0.9029911318202315, "flos": 26213173073280.0, "grad_norm": 1.6575315998300169, "language_loss": 0.70012128, "learning_rate": 9.785086557201782e-08, "loss": 0.72100884, "num_input_tokens_seen": 323875650, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35546875, "step": 15019, "time_per_iteration": 2.420362710952759 }, { "auxiliary_loss_clip": 0.01049263, "auxiliary_loss_mlp": 0.01039523, "balance_loss_clip": 1.0186255, "balance_loss_mlp": 1.0154829, "epoch": 0.9030512550728994, "flos": 15960439036800.0, "grad_norm": 1.7399849799204408, "language_loss": 0.73422062, "learning_rate": 9.773057299808951e-08, "loss": 0.75510848, "num_input_tokens_seen": 323892920, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.3359375, "step": 15020, "time_per_iteration": 2.339611530303955 }, { "auxiliary_loss_clip": 0.01052559, "auxiliary_loss_mlp": 0.01038505, "balance_loss_clip": 1.0156405, "balance_loss_mlp": 1.01610541, "epoch": 0.9031113783255674, "flos": 23986482514560.0, "grad_norm": 1.4757315254643746, "language_loss": 0.74849725, "learning_rate": 9.7610352558408e-08, "loss": 0.76940787, "num_input_tokens_seen": 323913835, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36523438, "step": 15021, "time_per_iteration": 2.4290621280670166 }, { "auxiliary_loss_clip": 0.01054006, "auxiliary_loss_mlp": 0.01042672, "balance_loss_clip": 1.01596951, "balance_loss_mlp": 1.01659894, "epoch": 0.9031715015782353, "flos": 22236111901440.0, "grad_norm": 2.2872791492004754, "language_loss": 0.74369162, "learning_rate": 9.749020425753251e-08, "loss": 0.76465833, "num_input_tokens_seen": 323933440, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.375, "step": 15022, "time_per_iteration": 2.3950164318084717 }, { "auxiliary_loss_clip": 0.01048566, "auxiliary_loss_mlp": 0.01032861, "balance_loss_clip": 1.01241636, "balance_loss_mlp": 1.01560092, "epoch": 0.9032316248309034, "flos": 26321962469760.0, "grad_norm": 1.5536870713820645, "language_loss": 0.73944461, "learning_rate": 9.737012810001943e-08, "loss": 0.76025891, "num_input_tokens_seen": 323954090, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.33007812, "step": 15023, "time_per_iteration": 2.4184203147888184 }, { "auxiliary_loss_clip": 0.01050318, "auxiliary_loss_mlp": 0.01037929, "balance_loss_clip": 1.01721048, "balance_loss_mlp": 1.01620567, "epoch": 0.9032917480835713, "flos": 22635623122560.0, "grad_norm": 1.6906985553699183, "language_loss": 0.83885241, "learning_rate": 9.725012409042155e-08, "loss": 0.85973489, "num_input_tokens_seen": 323974040, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.34179688, "step": 15024, "time_per_iteration": 2.4038708209991455 }, { "auxiliary_loss_clip": 0.01051398, "auxiliary_loss_mlp": 0.01032585, "balance_loss_clip": 1.01110303, "balance_loss_mlp": 1.01594269, "epoch": 0.9033518713362393, "flos": 23877623295360.0, "grad_norm": 1.4722755443858184, "language_loss": 0.70766866, "learning_rate": 9.713019223328966e-08, "loss": 0.72850847, "num_input_tokens_seen": 323996125, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.35546875, "step": 15025, "time_per_iteration": 2.4378652572631836 }, { "auxiliary_loss_clip": 0.01048882, "auxiliary_loss_mlp": 0.01036791, "balance_loss_clip": 1.01480842, "balance_loss_mlp": 1.01447988, "epoch": 0.9034119945889073, "flos": 26904104346240.0, "grad_norm": 1.6816604202423076, "language_loss": 0.77952576, "learning_rate": 9.70103325331717e-08, "loss": 0.80038249, "num_input_tokens_seen": 324017645, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34375, "step": 15026, "time_per_iteration": 2.4539074897766113 }, { "auxiliary_loss_clip": 0.0105104, "auxiliary_loss_mlp": 0.01034908, "balance_loss_clip": 1.01288998, "balance_loss_mlp": 1.01608253, "epoch": 0.9034721178415752, "flos": 20849117385600.0, "grad_norm": 1.8805552105351868, "language_loss": 0.69643378, "learning_rate": 9.68905449946129e-08, "loss": 0.71729326, "num_input_tokens_seen": 324036875, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34960938, "step": 15027, "time_per_iteration": 3.8217170238494873 }, { "auxiliary_loss_clip": 0.01049468, "auxiliary_loss_mlp": 0.01035488, "balance_loss_clip": 1.01301754, "balance_loss_mlp": 1.0165695, "epoch": 0.9035322410942432, "flos": 22233284081280.0, "grad_norm": 1.509952892440184, "language_loss": 0.76072389, "learning_rate": 9.677082962215477e-08, "loss": 0.78157341, "num_input_tokens_seen": 324057045, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.328125, "step": 15028, "time_per_iteration": 2.4096908569335938 }, { "auxiliary_loss_clip": 0.01051263, "auxiliary_loss_mlp": 0.01039617, "balance_loss_clip": 1.01629972, "balance_loss_mlp": 1.01622856, "epoch": 0.9035923643469111, "flos": 25922171957760.0, "grad_norm": 1.5962815872142295, "language_loss": 0.70081532, "learning_rate": 9.665118642033765e-08, "loss": 0.72172415, "num_input_tokens_seen": 324079735, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3515625, "step": 15029, "time_per_iteration": 2.447610378265381 }, { "auxiliary_loss_clip": 0.01052981, "auxiliary_loss_mlp": 0.01038931, "balance_loss_clip": 1.01415968, "balance_loss_mlp": 1.01580667, "epoch": 0.9036524875995792, "flos": 20338756997760.0, "grad_norm": 2.468780289047271, "language_loss": 0.75103283, "learning_rate": 9.653161539369858e-08, "loss": 0.77195203, "num_input_tokens_seen": 324097785, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37109375, "step": 15030, "time_per_iteration": 2.3669309616088867 }, { "auxiliary_loss_clip": 0.01052317, "auxiliary_loss_mlp": 0.01038723, "balance_loss_clip": 1.01362967, "balance_loss_mlp": 1.01579094, "epoch": 0.9037126108522471, "flos": 40113039133440.0, "grad_norm": 2.0554752459526466, "language_loss": 0.69224691, "learning_rate": 9.641211654677151e-08, "loss": 0.7131573, "num_input_tokens_seen": 324121625, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36523438, "step": 15031, "time_per_iteration": 2.53265380859375 }, { "auxiliary_loss_clip": 0.01049925, "auxiliary_loss_mlp": 0.0103173, "balance_loss_clip": 1.01084423, "balance_loss_mlp": 1.01569736, "epoch": 0.9037727341049151, "flos": 23330883404160.0, "grad_norm": 1.5495134266370991, "language_loss": 0.77896374, "learning_rate": 9.629268988408723e-08, "loss": 0.79978031, "num_input_tokens_seen": 324142535, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34375, "step": 15032, "time_per_iteration": 2.4004437923431396 }, { "auxiliary_loss_clip": 0.01051144, "auxiliary_loss_mlp": 0.0103782, "balance_loss_clip": 1.01519442, "balance_loss_mlp": 1.01613653, "epoch": 0.903832857357583, "flos": 12821852010240.0, "grad_norm": 1.994326742553331, "language_loss": 0.75743032, "learning_rate": 9.617333541017502e-08, "loss": 0.77831995, "num_input_tokens_seen": 324159610, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 15033, "time_per_iteration": 2.355372667312622 }, { "auxiliary_loss_clip": 0.01052192, "auxiliary_loss_mlp": 0.0103655, "balance_loss_clip": 1.01440144, "balance_loss_mlp": 1.01693165, "epoch": 0.903892980610251, "flos": 25701835167360.0, "grad_norm": 1.593568676809785, "language_loss": 0.74832702, "learning_rate": 9.605405312956105e-08, "loss": 0.76921451, "num_input_tokens_seen": 324182510, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3515625, "step": 15034, "time_per_iteration": 2.43583083152771 }, { "auxiliary_loss_clip": 0.01050708, "auxiliary_loss_mlp": 0.01040544, "balance_loss_clip": 1.0174768, "balance_loss_mlp": 1.01573575, "epoch": 0.9039531038629189, "flos": 14683211435520.0, "grad_norm": 1.9088404296549337, "language_loss": 0.6545068, "learning_rate": 9.593484304676791e-08, "loss": 0.67541933, "num_input_tokens_seen": 324200555, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.34960938, "step": 15035, "time_per_iteration": 2.3851022720336914 }, { "auxiliary_loss_clip": 0.01051071, "auxiliary_loss_mlp": 0.0103698, "balance_loss_clip": 1.01267362, "balance_loss_mlp": 1.015674, "epoch": 0.904013227115587, "flos": 24023769713280.0, "grad_norm": 5.028998722219779, "language_loss": 0.64136469, "learning_rate": 9.581570516631643e-08, "loss": 0.66224515, "num_input_tokens_seen": 324220255, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.35351562, "step": 15036, "time_per_iteration": 2.387105703353882 }, { "auxiliary_loss_clip": 0.01048599, "auxiliary_loss_mlp": 0.01032604, "balance_loss_clip": 1.01194549, "balance_loss_mlp": 1.01508522, "epoch": 0.9040733503682549, "flos": 22855366419840.0, "grad_norm": 1.7966164362945753, "language_loss": 0.83081138, "learning_rate": 9.569663949272455e-08, "loss": 0.85162342, "num_input_tokens_seen": 324237855, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.33398438, "step": 15037, "time_per_iteration": 2.381335973739624 }, { "auxiliary_loss_clip": 0.01052639, "auxiliary_loss_mlp": 0.01039064, "balance_loss_clip": 1.01470971, "balance_loss_mlp": 1.01623249, "epoch": 0.9041334736209229, "flos": 19973914623360.0, "grad_norm": 3.6043206392504956, "language_loss": 0.68708056, "learning_rate": 9.557764603050667e-08, "loss": 0.70799762, "num_input_tokens_seen": 324257050, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36328125, "step": 15038, "time_per_iteration": 2.361074686050415 }, { "auxiliary_loss_clip": 0.01052138, "auxiliary_loss_mlp": 0.01042129, "balance_loss_clip": 1.01784635, "balance_loss_mlp": 1.01568735, "epoch": 0.9041935968735909, "flos": 17529575448960.0, "grad_norm": 1.958335196365414, "language_loss": 0.77189374, "learning_rate": 9.545872478417494e-08, "loss": 0.79283643, "num_input_tokens_seen": 324275510, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36523438, "step": 15039, "time_per_iteration": 2.358555316925049 }, { "auxiliary_loss_clip": 0.01050756, "auxiliary_loss_mlp": 0.01034465, "balance_loss_clip": 1.01257849, "balance_loss_mlp": 1.01551986, "epoch": 0.9042537201262588, "flos": 22779151188480.0, "grad_norm": 1.609717570326033, "language_loss": 0.70997131, "learning_rate": 9.533987575823977e-08, "loss": 0.73082352, "num_input_tokens_seen": 324295150, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.3515625, "step": 15040, "time_per_iteration": 2.383469581604004 }, { "auxiliary_loss_clip": 0.01048331, "auxiliary_loss_mlp": 0.01035119, "balance_loss_clip": 1.01277912, "balance_loss_mlp": 1.01446342, "epoch": 0.9043138433789268, "flos": 20594356128000.0, "grad_norm": 1.6974305526461582, "language_loss": 0.69299841, "learning_rate": 9.522109895720709e-08, "loss": 0.71383291, "num_input_tokens_seen": 324313855, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.33984375, "step": 15041, "time_per_iteration": 2.3906495571136475 }, { "auxiliary_loss_clip": 0.01050623, "auxiliary_loss_mlp": 0.01038189, "balance_loss_clip": 1.01480031, "balance_loss_mlp": 1.01550722, "epoch": 0.9043739666315948, "flos": 32961604924800.0, "grad_norm": 1.5532698556749072, "language_loss": 0.58364296, "learning_rate": 9.510239438558155e-08, "loss": 0.60453105, "num_input_tokens_seen": 324338465, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3515625, "step": 15042, "time_per_iteration": 3.8006675243377686 }, { "auxiliary_loss_clip": 0.01007478, "auxiliary_loss_mlp": 0.01003809, "balance_loss_clip": 1.00183046, "balance_loss_mlp": 1.00088644, "epoch": 0.9044340898842628, "flos": 67293061021440.0, "grad_norm": 0.78650599157204, "language_loss": 0.5701077, "learning_rate": 9.498376204786351e-08, "loss": 0.59022057, "num_input_tokens_seen": 324398740, "router_z_loss_clip": 0.01977539, "router_z_loss_mlp": 0.06591797, "step": 15043, "time_per_iteration": 2.989306926727295 }, { "auxiliary_loss_clip": 0.01052446, "auxiliary_loss_mlp": 0.01036835, "balance_loss_clip": 1.01195586, "balance_loss_mlp": 1.01566982, "epoch": 0.9044942131369307, "flos": 17712171192960.0, "grad_norm": 1.6403142859375783, "language_loss": 0.70930278, "learning_rate": 9.486520194855274e-08, "loss": 0.73019558, "num_input_tokens_seen": 324417335, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3671875, "step": 15044, "time_per_iteration": 2.368720054626465 }, { "auxiliary_loss_clip": 0.01052464, "auxiliary_loss_mlp": 0.01040122, "balance_loss_clip": 1.01486111, "balance_loss_mlp": 1.01602292, "epoch": 0.9045543363895987, "flos": 17819633957760.0, "grad_norm": 2.1132294337830406, "language_loss": 0.70856953, "learning_rate": 9.474671409214407e-08, "loss": 0.72949541, "num_input_tokens_seen": 324433240, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36328125, "step": 15045, "time_per_iteration": 2.3572447299957275 }, { "auxiliary_loss_clip": 0.01055857, "auxiliary_loss_mlp": 0.01043117, "balance_loss_clip": 1.01747537, "balance_loss_mlp": 1.0177263, "epoch": 0.9046144596422666, "flos": 21871618640640.0, "grad_norm": 1.8890897985016673, "language_loss": 0.66188782, "learning_rate": 9.462829848313081e-08, "loss": 0.68287754, "num_input_tokens_seen": 324452675, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.3828125, "step": 15046, "time_per_iteration": 2.408220052719116 }, { "auxiliary_loss_clip": 0.01051613, "auxiliary_loss_mlp": 0.01038588, "balance_loss_clip": 1.01605725, "balance_loss_mlp": 1.0152036, "epoch": 0.9046745828949346, "flos": 17671776883200.0, "grad_norm": 1.947826110234466, "language_loss": 0.63388276, "learning_rate": 9.450995512600379e-08, "loss": 0.6547848, "num_input_tokens_seen": 324467865, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.36328125, "step": 15047, "time_per_iteration": 2.326202869415283 }, { "auxiliary_loss_clip": 0.01051493, "auxiliary_loss_mlp": 0.01033752, "balance_loss_clip": 1.01114964, "balance_loss_mlp": 1.0161922, "epoch": 0.9047347061476025, "flos": 25701381319680.0, "grad_norm": 3.5907244419912363, "language_loss": 0.71480751, "learning_rate": 9.439168402525032e-08, "loss": 0.73565996, "num_input_tokens_seen": 324490430, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35351562, "step": 15048, "time_per_iteration": 2.427826404571533 }, { "auxiliary_loss_clip": 0.01051379, "auxiliary_loss_mlp": 0.01039327, "balance_loss_clip": 1.01487672, "balance_loss_mlp": 1.01517594, "epoch": 0.9047948294002706, "flos": 15157262142720.0, "grad_norm": 2.1624912491406025, "language_loss": 0.76115394, "learning_rate": 9.427348518535483e-08, "loss": 0.78206098, "num_input_tokens_seen": 324506620, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36132812, "step": 15049, "time_per_iteration": 2.38037109375 }, { "auxiliary_loss_clip": 0.01048985, "auxiliary_loss_mlp": 0.01033564, "balance_loss_clip": 1.01242769, "balance_loss_mlp": 1.01559532, "epoch": 0.9048549526529385, "flos": 21871199704320.0, "grad_norm": 1.8443898199392317, "language_loss": 0.76894581, "learning_rate": 9.415535861079993e-08, "loss": 0.78977132, "num_input_tokens_seen": 324525505, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.33398438, "step": 15050, "time_per_iteration": 2.394037961959839 }, { "auxiliary_loss_clip": 0.01052502, "auxiliary_loss_mlp": 0.01036645, "balance_loss_clip": 1.01472187, "balance_loss_mlp": 1.01653755, "epoch": 0.9049150759056065, "flos": 23545599465600.0, "grad_norm": 1.633753504857696, "language_loss": 0.8283543, "learning_rate": 9.403730430606472e-08, "loss": 0.84924579, "num_input_tokens_seen": 324544415, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.359375, "step": 15051, "time_per_iteration": 2.388979434967041 }, { "auxiliary_loss_clip": 0.01050715, "auxiliary_loss_mlp": 0.01035737, "balance_loss_clip": 1.01479149, "balance_loss_mlp": 1.01586437, "epoch": 0.9049751991582745, "flos": 19644893170560.0, "grad_norm": 2.0132181455467135, "language_loss": 0.90541101, "learning_rate": 9.391932227562582e-08, "loss": 0.92627549, "num_input_tokens_seen": 324562555, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34765625, "step": 15052, "time_per_iteration": 2.3580541610717773 }, { "auxiliary_loss_clip": 0.01054921, "auxiliary_loss_mlp": 0.01044073, "balance_loss_clip": 1.02030277, "balance_loss_mlp": 1.01760721, "epoch": 0.9050353224109424, "flos": 15595317371520.0, "grad_norm": 2.0167369452926747, "language_loss": 0.78122228, "learning_rate": 9.380141252395724e-08, "loss": 0.80221224, "num_input_tokens_seen": 324580865, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37304688, "step": 15053, "time_per_iteration": 2.364488124847412 }, { "auxiliary_loss_clip": 0.01050399, "auxiliary_loss_mlp": 0.01032199, "balance_loss_clip": 1.01094389, "balance_loss_mlp": 1.01590133, "epoch": 0.9050954456636104, "flos": 28182344376960.0, "grad_norm": 2.36758160023645, "language_loss": 0.74296385, "learning_rate": 9.368357505553049e-08, "loss": 0.76378977, "num_input_tokens_seen": 324600665, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34375, "step": 15054, "time_per_iteration": 3.830526351928711 }, { "auxiliary_loss_clip": 0.01051359, "auxiliary_loss_mlp": 0.01035906, "balance_loss_clip": 1.01455593, "balance_loss_mlp": 1.01606274, "epoch": 0.9051555689162784, "flos": 25730638905600.0, "grad_norm": 1.6494788135442955, "language_loss": 0.84166622, "learning_rate": 9.356580987481333e-08, "loss": 0.86253887, "num_input_tokens_seen": 324618145, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.35351562, "step": 15055, "time_per_iteration": 3.7054131031036377 }, { "auxiliary_loss_clip": 0.01050897, "auxiliary_loss_mlp": 0.01039443, "balance_loss_clip": 1.0175916, "balance_loss_mlp": 1.0163573, "epoch": 0.9052156921689464, "flos": 23256169361280.0, "grad_norm": 1.711353792575477, "language_loss": 0.85895443, "learning_rate": 9.344811698627176e-08, "loss": 0.87985778, "num_input_tokens_seen": 324638165, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34570312, "step": 15056, "time_per_iteration": 2.3972790241241455 }, { "auxiliary_loss_clip": 0.01050345, "auxiliary_loss_mlp": 0.01034017, "balance_loss_clip": 1.01261926, "balance_loss_mlp": 1.01604986, "epoch": 0.9052758154216143, "flos": 29563159582080.0, "grad_norm": 1.8294652794422117, "language_loss": 0.73609209, "learning_rate": 9.333049639436863e-08, "loss": 0.75693572, "num_input_tokens_seen": 324658560, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34375, "step": 15057, "time_per_iteration": 2.471463203430176 }, { "auxiliary_loss_clip": 0.01049643, "auxiliary_loss_mlp": 0.01032716, "balance_loss_clip": 1.01063836, "balance_loss_mlp": 1.01493275, "epoch": 0.9053359386742823, "flos": 22126589366400.0, "grad_norm": 4.933131153732209, "language_loss": 0.82291132, "learning_rate": 9.321294810356418e-08, "loss": 0.84373492, "num_input_tokens_seen": 324679185, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34765625, "step": 15058, "time_per_iteration": 2.401698350906372 }, { "auxiliary_loss_clip": 0.01007132, "auxiliary_loss_mlp": 0.01004767, "balance_loss_clip": 1.00274038, "balance_loss_mlp": 1.00074363, "epoch": 0.9053960619269502, "flos": 67086409484160.0, "grad_norm": 0.6745160721413789, "language_loss": 0.51416177, "learning_rate": 9.309547211831592e-08, "loss": 0.53428078, "num_input_tokens_seen": 324744830, "router_z_loss_clip": 0.02026367, "router_z_loss_mlp": 0.06347656, "step": 15059, "time_per_iteration": 3.1173038482666016 }, { "auxiliary_loss_clip": 0.01051667, "auxiliary_loss_mlp": 0.01039166, "balance_loss_clip": 1.01472783, "balance_loss_mlp": 1.01619172, "epoch": 0.9054561851796182, "flos": 15814502087040.0, "grad_norm": 1.7071141941421892, "language_loss": 0.68593585, "learning_rate": 9.297806844307831e-08, "loss": 0.70684421, "num_input_tokens_seen": 324762905, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.35546875, "step": 15060, "time_per_iteration": 2.3875298500061035 }, { "auxiliary_loss_clip": 0.0105366, "auxiliary_loss_mlp": 0.01036792, "balance_loss_clip": 1.01336741, "balance_loss_mlp": 1.01699054, "epoch": 0.9055163084322861, "flos": 17566024775040.0, "grad_norm": 1.925187037826481, "language_loss": 0.65475637, "learning_rate": 9.286073708230357e-08, "loss": 0.67566091, "num_input_tokens_seen": 324781905, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3671875, "step": 15061, "time_per_iteration": 2.403048276901245 }, { "auxiliary_loss_clip": 0.01053012, "auxiliary_loss_mlp": 0.0103967, "balance_loss_clip": 1.01561356, "balance_loss_mlp": 1.01659751, "epoch": 0.9055764316849542, "flos": 17638609224960.0, "grad_norm": 1.771882182196225, "language_loss": 0.72359645, "learning_rate": 9.274347804044058e-08, "loss": 0.74452335, "num_input_tokens_seen": 324799260, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36328125, "step": 15062, "time_per_iteration": 2.392286777496338 }, { "auxiliary_loss_clip": 0.01050158, "auxiliary_loss_mlp": 0.0104368, "balance_loss_clip": 1.02131581, "balance_loss_mlp": 1.01544499, "epoch": 0.9056365549376221, "flos": 20119816661760.0, "grad_norm": 1.5465106105507815, "language_loss": 0.71571904, "learning_rate": 9.2626291321936e-08, "loss": 0.73665738, "num_input_tokens_seen": 324817800, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34765625, "step": 15063, "time_per_iteration": 2.382380247116089 }, { "auxiliary_loss_clip": 0.01049227, "auxiliary_loss_mlp": 0.0103385, "balance_loss_clip": 1.01168871, "balance_loss_mlp": 1.01493716, "epoch": 0.9056966781902901, "flos": 27597584148480.0, "grad_norm": 1.6540208077640417, "language_loss": 0.73584652, "learning_rate": 9.250917693123406e-08, "loss": 0.75667727, "num_input_tokens_seen": 324838445, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34375, "step": 15064, "time_per_iteration": 2.4322423934936523 }, { "auxiliary_loss_clip": 0.01050568, "auxiliary_loss_mlp": 0.01034793, "balance_loss_clip": 1.01213145, "balance_loss_mlp": 1.01483679, "epoch": 0.9057568014429581, "flos": 25918960112640.0, "grad_norm": 2.5746970093739168, "language_loss": 0.71437275, "learning_rate": 9.23921348727752e-08, "loss": 0.73522639, "num_input_tokens_seen": 324859895, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35546875, "step": 15065, "time_per_iteration": 2.39947772026062 }, { "auxiliary_loss_clip": 0.01051536, "auxiliary_loss_mlp": 0.01038455, "balance_loss_clip": 1.01579309, "balance_loss_mlp": 1.01646876, "epoch": 0.905816924695626, "flos": 22929242590080.0, "grad_norm": 1.7819924703161123, "language_loss": 0.64160269, "learning_rate": 9.227516515099743e-08, "loss": 0.66250259, "num_input_tokens_seen": 324879580, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34960938, "step": 15066, "time_per_iteration": 3.9104833602905273 }, { "auxiliary_loss_clip": 0.01054888, "auxiliary_loss_mlp": 0.01038804, "balance_loss_clip": 1.01135015, "balance_loss_mlp": 1.0160929, "epoch": 0.905877047948294, "flos": 22156510268160.0, "grad_norm": 1.8455575926783088, "language_loss": 0.81439412, "learning_rate": 9.215826777033675e-08, "loss": 0.83533108, "num_input_tokens_seen": 324898950, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.38867188, "step": 15067, "time_per_iteration": 2.395298719406128 }, { "auxiliary_loss_clip": 0.01052065, "auxiliary_loss_mlp": 0.01036863, "balance_loss_clip": 1.01142406, "balance_loss_mlp": 1.01586938, "epoch": 0.905937171200962, "flos": 15303897319680.0, "grad_norm": 1.5870875889522804, "language_loss": 0.71375227, "learning_rate": 9.204144273522563e-08, "loss": 0.73464155, "num_input_tokens_seen": 324917455, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36132812, "step": 15068, "time_per_iteration": 2.3575825691223145 }, { "auxiliary_loss_clip": 0.01049715, "auxiliary_loss_mlp": 0.01034204, "balance_loss_clip": 1.01186419, "balance_loss_mlp": 1.01497817, "epoch": 0.90599729445363, "flos": 19461983224320.0, "grad_norm": 1.9423216839563149, "language_loss": 0.87029517, "learning_rate": 9.19246900500943e-08, "loss": 0.89113438, "num_input_tokens_seen": 324934495, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34765625, "step": 15069, "time_per_iteration": 2.3593192100524902 }, { "auxiliary_loss_clip": 0.01054196, "auxiliary_loss_mlp": 0.01042273, "balance_loss_clip": 1.01717997, "balance_loss_mlp": 1.01613247, "epoch": 0.9060574177062979, "flos": 23731825991040.0, "grad_norm": 1.92445406522355, "language_loss": 0.59954369, "learning_rate": 9.180800971936987e-08, "loss": 0.62050843, "num_input_tokens_seen": 324953230, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.38085938, "step": 15070, "time_per_iteration": 2.3853113651275635 }, { "auxiliary_loss_clip": 0.01052823, "auxiliary_loss_mlp": 0.0103518, "balance_loss_clip": 1.00903738, "balance_loss_mlp": 1.01635206, "epoch": 0.9061175409589659, "flos": 17310181265280.0, "grad_norm": 1.990371652200866, "language_loss": 0.82297492, "learning_rate": 9.169140174747724e-08, "loss": 0.8438549, "num_input_tokens_seen": 324969880, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.36523438, "step": 15071, "time_per_iteration": 2.337263822555542 }, { "auxiliary_loss_clip": 0.01055152, "auxiliary_loss_mlp": 0.01042163, "balance_loss_clip": 1.0163784, "balance_loss_mlp": 1.01629782, "epoch": 0.9061776642116338, "flos": 17777668636800.0, "grad_norm": 1.8765695336364918, "language_loss": 0.62900305, "learning_rate": 9.157486613883758e-08, "loss": 0.64997613, "num_input_tokens_seen": 324987005, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38867188, "step": 15072, "time_per_iteration": 2.3437602519989014 }, { "auxiliary_loss_clip": 0.01050633, "auxiliary_loss_mlp": 0.01043568, "balance_loss_clip": 1.01859331, "balance_loss_mlp": 1.01552248, "epoch": 0.9062377874643018, "flos": 42776039352960.0, "grad_norm": 2.0982128117301215, "language_loss": 0.74455678, "learning_rate": 9.145840289787021e-08, "loss": 0.76549876, "num_input_tokens_seen": 325010700, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3515625, "step": 15073, "time_per_iteration": 2.5954792499542236 }, { "auxiliary_loss_clip": 0.01049934, "auxiliary_loss_mlp": 0.01032523, "balance_loss_clip": 1.01181602, "balance_loss_mlp": 1.01576352, "epoch": 0.9062979107169697, "flos": 16360718307840.0, "grad_norm": 1.9721110932895534, "language_loss": 0.82780552, "learning_rate": 9.134201202899161e-08, "loss": 0.84863013, "num_input_tokens_seen": 325028760, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.34375, "step": 15074, "time_per_iteration": 2.3595244884490967 }, { "auxiliary_loss_clip": 0.0100718, "auxiliary_loss_mlp": 0.01002609, "balance_loss_clip": 1.00055826, "balance_loss_mlp": 1.00064027, "epoch": 0.9063580339696378, "flos": 69310272222720.0, "grad_norm": 0.766739562669384, "language_loss": 0.52392209, "learning_rate": 9.122569353661513e-08, "loss": 0.54402006, "num_input_tokens_seen": 325093545, "router_z_loss_clip": 0.02050781, "router_z_loss_mlp": 0.06542969, "step": 15075, "time_per_iteration": 3.0974953174591064 }, { "auxiliary_loss_clip": 0.0100709, "auxiliary_loss_mlp": 0.01002254, "balance_loss_clip": 1.00007272, "balance_loss_mlp": 1.00050139, "epoch": 0.9064181572223057, "flos": 58791640204800.0, "grad_norm": 0.7298869990728761, "language_loss": 0.62248528, "learning_rate": 9.11094474251517e-08, "loss": 0.64257872, "num_input_tokens_seen": 325152295, "router_z_loss_clip": 0.02185059, "router_z_loss_mlp": 0.06591797, "step": 15076, "time_per_iteration": 2.9394147396087646 }, { "auxiliary_loss_clip": 0.01050708, "auxiliary_loss_mlp": 0.01037777, "balance_loss_clip": 1.01450753, "balance_loss_mlp": 1.01529968, "epoch": 0.9064782804749737, "flos": 21761607346560.0, "grad_norm": 1.7391437454406076, "language_loss": 0.83034539, "learning_rate": 9.09932736990091e-08, "loss": 0.85123014, "num_input_tokens_seen": 325169705, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35351562, "step": 15077, "time_per_iteration": 2.3835256099700928 }, { "auxiliary_loss_clip": 0.01048245, "auxiliary_loss_mlp": 0.01032509, "balance_loss_clip": 1.01177895, "balance_loss_mlp": 1.0145278, "epoch": 0.9065384037276417, "flos": 21396311124480.0, "grad_norm": 1.5505647438791716, "language_loss": 0.85246086, "learning_rate": 9.08771723625934e-08, "loss": 0.87326837, "num_input_tokens_seen": 325189175, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.33789062, "step": 15078, "time_per_iteration": 2.378180980682373 }, { "auxiliary_loss_clip": 0.0104929, "auxiliary_loss_mlp": 0.01039004, "balance_loss_clip": 1.01584172, "balance_loss_mlp": 1.01559019, "epoch": 0.9065985269803096, "flos": 38282298837120.0, "grad_norm": 1.5437044229509698, "language_loss": 0.66722679, "learning_rate": 9.076114342030617e-08, "loss": 0.68810976, "num_input_tokens_seen": 325211020, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.33789062, "step": 15079, "time_per_iteration": 2.549582004547119 }, { "auxiliary_loss_clip": 0.01050635, "auxiliary_loss_mlp": 0.01036099, "balance_loss_clip": 1.01405692, "balance_loss_mlp": 1.01543498, "epoch": 0.9066586502329776, "flos": 44816922322560.0, "grad_norm": 1.4484849132229312, "language_loss": 0.71272981, "learning_rate": 9.064518687654765e-08, "loss": 0.73359716, "num_input_tokens_seen": 325236970, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3515625, "step": 15080, "time_per_iteration": 2.5912885665893555 }, { "auxiliary_loss_clip": 0.0105395, "auxiliary_loss_mlp": 0.01042073, "balance_loss_clip": 1.01694369, "balance_loss_mlp": 1.01767802, "epoch": 0.9067187734856456, "flos": 18623020320000.0, "grad_norm": 2.195968336407109, "language_loss": 0.72021878, "learning_rate": 9.052930273571547e-08, "loss": 0.74117899, "num_input_tokens_seen": 325252670, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36328125, "step": 15081, "time_per_iteration": 2.3888871669769287 }, { "auxiliary_loss_clip": 0.0105084, "auxiliary_loss_mlp": 0.01038531, "balance_loss_clip": 1.01545238, "balance_loss_mlp": 1.01576591, "epoch": 0.9067788967383136, "flos": 22746472289280.0, "grad_norm": 2.0386660268943784, "language_loss": 0.7510519, "learning_rate": 9.04134910022032e-08, "loss": 0.77194566, "num_input_tokens_seen": 325273860, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.3515625, "step": 15082, "time_per_iteration": 3.6623363494873047 }, { "auxiliary_loss_clip": 0.01050306, "auxiliary_loss_mlp": 0.0103864, "balance_loss_clip": 1.01724195, "balance_loss_mlp": 1.01602054, "epoch": 0.9068390199909815, "flos": 27669610016640.0, "grad_norm": 1.723431884258522, "language_loss": 0.79344159, "learning_rate": 9.029775168040266e-08, "loss": 0.81433105, "num_input_tokens_seen": 325294140, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34375, "step": 15083, "time_per_iteration": 2.4238500595092773 }, { "auxiliary_loss_clip": 0.01048815, "auxiliary_loss_mlp": 0.01033735, "balance_loss_clip": 1.01411307, "balance_loss_mlp": 1.01566529, "epoch": 0.9068991432436495, "flos": 24242849694720.0, "grad_norm": 1.5648634737053604, "language_loss": 0.69420564, "learning_rate": 9.01820847747028e-08, "loss": 0.71503115, "num_input_tokens_seen": 325313130, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.33203125, "step": 15084, "time_per_iteration": 2.390556573867798 }, { "auxiliary_loss_clip": 0.01052229, "auxiliary_loss_mlp": 0.01042132, "balance_loss_clip": 1.01837385, "balance_loss_mlp": 1.01630282, "epoch": 0.9069592664963174, "flos": 28032183152640.0, "grad_norm": 2.502446390723708, "language_loss": 0.67972487, "learning_rate": 9.006649028948965e-08, "loss": 0.70066845, "num_input_tokens_seen": 325334880, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.359375, "step": 15085, "time_per_iteration": 2.464702606201172 }, { "auxiliary_loss_clip": 0.01007813, "auxiliary_loss_mlp": 0.01005012, "balance_loss_clip": 1.00285399, "balance_loss_mlp": 1.0011065, "epoch": 0.9070193897489854, "flos": 68775404060160.0, "grad_norm": 0.7779539763661253, "language_loss": 0.61293036, "learning_rate": 8.995096822914638e-08, "loss": 0.63305861, "num_input_tokens_seen": 325394175, "router_z_loss_clip": 0.02160645, "router_z_loss_mlp": 0.06738281, "step": 15086, "time_per_iteration": 3.0223729610443115 }, { "auxiliary_loss_clip": 0.01050617, "auxiliary_loss_mlp": 0.01038654, "balance_loss_clip": 1.01506233, "balance_loss_mlp": 1.01514626, "epoch": 0.9070795130016533, "flos": 23440475761920.0, "grad_norm": 1.483012463809674, "language_loss": 0.73327291, "learning_rate": 8.983551859805416e-08, "loss": 0.75416565, "num_input_tokens_seen": 325415020, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35351562, "step": 15087, "time_per_iteration": 2.4166066646575928 }, { "auxiliary_loss_clip": 0.01051434, "auxiliary_loss_mlp": 0.01037777, "balance_loss_clip": 1.01554465, "balance_loss_mlp": 1.01593423, "epoch": 0.9071396362543214, "flos": 18915417889920.0, "grad_norm": 2.1510288249933907, "language_loss": 0.77817595, "learning_rate": 8.972014140059058e-08, "loss": 0.79906797, "num_input_tokens_seen": 325433595, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35546875, "step": 15088, "time_per_iteration": 2.3813536167144775 }, { "auxiliary_loss_clip": 0.01048757, "auxiliary_loss_mlp": 0.01037044, "balance_loss_clip": 1.01408398, "balance_loss_mlp": 1.0149287, "epoch": 0.9071997595069893, "flos": 25227470257920.0, "grad_norm": 2.0879784874212346, "language_loss": 0.73960835, "learning_rate": 8.960483664113038e-08, "loss": 0.76046634, "num_input_tokens_seen": 325451605, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.33789062, "step": 15089, "time_per_iteration": 2.405083179473877 }, { "auxiliary_loss_clip": 0.01048388, "auxiliary_loss_mlp": 0.01034766, "balance_loss_clip": 1.01357067, "balance_loss_mlp": 1.01501203, "epoch": 0.9072598827596573, "flos": 24345634337280.0, "grad_norm": 1.9217342797439527, "language_loss": 0.76313996, "learning_rate": 8.948960432404628e-08, "loss": 0.78397155, "num_input_tokens_seen": 325470645, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.33398438, "step": 15090, "time_per_iteration": 2.3929076194763184 }, { "auxiliary_loss_clip": 0.01053538, "auxiliary_loss_mlp": 0.01035975, "balance_loss_clip": 1.0095222, "balance_loss_mlp": 1.01634836, "epoch": 0.9073200060123253, "flos": 22673852928000.0, "grad_norm": 2.819057224547379, "language_loss": 0.78742826, "learning_rate": 8.93744444537079e-08, "loss": 0.80832338, "num_input_tokens_seen": 325488070, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37304688, "step": 15091, "time_per_iteration": 2.4856503009796143 }, { "auxiliary_loss_clip": 0.01048358, "auxiliary_loss_mlp": 0.01035175, "balance_loss_clip": 1.01490974, "balance_loss_mlp": 1.01597047, "epoch": 0.9073801292649932, "flos": 23694364235520.0, "grad_norm": 1.7239642381023876, "language_loss": 0.86387908, "learning_rate": 8.925935703448217e-08, "loss": 0.88471448, "num_input_tokens_seen": 325509285, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.32421875, "step": 15092, "time_per_iteration": 2.4006941318511963 }, { "auxiliary_loss_clip": 0.01050392, "auxiliary_loss_mlp": 0.01037927, "balance_loss_clip": 1.01578927, "balance_loss_mlp": 1.01624763, "epoch": 0.9074402525176612, "flos": 25374210168960.0, "grad_norm": 1.4991782335568298, "language_loss": 0.79615426, "learning_rate": 8.914434207073296e-08, "loss": 0.81703746, "num_input_tokens_seen": 325529360, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34179688, "step": 15093, "time_per_iteration": 3.8159642219543457 }, { "auxiliary_loss_clip": 0.0100721, "auxiliary_loss_mlp": 0.01003153, "balance_loss_clip": 1.00095975, "balance_loss_mlp": 1.00053358, "epoch": 0.9075003757703292, "flos": 67645998622080.0, "grad_norm": 0.7373730473670456, "language_loss": 0.57135832, "learning_rate": 8.902939956682188e-08, "loss": 0.5914619, "num_input_tokens_seen": 325583565, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.06689453, "step": 15094, "time_per_iteration": 4.389244318008423 }, { "auxiliary_loss_clip": 0.01052649, "auxiliary_loss_mlp": 0.01042437, "balance_loss_clip": 1.01632988, "balance_loss_mlp": 1.01594257, "epoch": 0.9075604990229972, "flos": 22452608442240.0, "grad_norm": 1.9554738733298196, "language_loss": 0.72562981, "learning_rate": 8.891452952710742e-08, "loss": 0.74658072, "num_input_tokens_seen": 325603690, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3671875, "step": 15095, "time_per_iteration": 2.3751907348632812 }, { "auxiliary_loss_clip": 0.01052571, "auxiliary_loss_mlp": 0.01035071, "balance_loss_clip": 1.01214647, "balance_loss_mlp": 1.01669753, "epoch": 0.9076206222756651, "flos": 19535649926400.0, "grad_norm": 1.7392709836890108, "language_loss": 0.75233132, "learning_rate": 8.879973195594526e-08, "loss": 0.77320766, "num_input_tokens_seen": 325622255, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.359375, "step": 15096, "time_per_iteration": 2.3783934116363525 }, { "auxiliary_loss_clip": 0.0105207, "auxiliary_loss_mlp": 0.01041885, "balance_loss_clip": 1.01745868, "balance_loss_mlp": 1.01620317, "epoch": 0.9076807455283331, "flos": 30115624936320.0, "grad_norm": 2.2682570021621657, "language_loss": 0.57862979, "learning_rate": 8.868500685768898e-08, "loss": 0.59956938, "num_input_tokens_seen": 325640165, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.359375, "step": 15097, "time_per_iteration": 2.4446113109588623 }, { "auxiliary_loss_clip": 0.01049215, "auxiliary_loss_mlp": 0.01035018, "balance_loss_clip": 1.01352441, "balance_loss_mlp": 1.01484299, "epoch": 0.907740868781001, "flos": 18696547376640.0, "grad_norm": 1.5831040943619765, "language_loss": 0.80106419, "learning_rate": 8.857035423668935e-08, "loss": 0.82190651, "num_input_tokens_seen": 325659455, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34375, "step": 15098, "time_per_iteration": 2.353168487548828 }, { "auxiliary_loss_clip": 0.0105226, "auxiliary_loss_mlp": 0.01039134, "balance_loss_clip": 1.01525688, "balance_loss_mlp": 1.01549828, "epoch": 0.907800992033669, "flos": 22637682892800.0, "grad_norm": 1.7335278312022695, "language_loss": 0.6724056, "learning_rate": 8.845577409729266e-08, "loss": 0.69331956, "num_input_tokens_seen": 325678095, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3671875, "step": 15099, "time_per_iteration": 2.3713717460632324 }, { "auxiliary_loss_clip": 0.01053799, "auxiliary_loss_mlp": 0.01037878, "balance_loss_clip": 1.01346409, "balance_loss_mlp": 1.01686585, "epoch": 0.907861115286337, "flos": 21286614032640.0, "grad_norm": 2.526494742452283, "language_loss": 0.71223438, "learning_rate": 8.834126644384477e-08, "loss": 0.73315114, "num_input_tokens_seen": 325695825, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36914062, "step": 15100, "time_per_iteration": 2.3790347576141357 }, { "auxiliary_loss_clip": 0.01007062, "auxiliary_loss_mlp": 0.01002655, "balance_loss_clip": 1.00060499, "balance_loss_mlp": 1.00051665, "epoch": 0.907921238539005, "flos": 69736108475520.0, "grad_norm": 0.6266908712204965, "language_loss": 0.53446585, "learning_rate": 8.822683128068775e-08, "loss": 0.55456305, "num_input_tokens_seen": 325764515, "router_z_loss_clip": 0.02050781, "router_z_loss_mlp": 0.06542969, "step": 15101, "time_per_iteration": 3.0830421447753906 }, { "auxiliary_loss_clip": 0.01050876, "auxiliary_loss_mlp": 0.01036241, "balance_loss_clip": 1.01360345, "balance_loss_mlp": 1.01555777, "epoch": 0.9079813617916729, "flos": 23476261772160.0, "grad_norm": 1.841481246993074, "language_loss": 0.69419622, "learning_rate": 8.811246861216081e-08, "loss": 0.71506739, "num_input_tokens_seen": 325783235, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35351562, "step": 15102, "time_per_iteration": 2.4149646759033203 }, { "auxiliary_loss_clip": 0.01050648, "auxiliary_loss_mlp": 0.01034909, "balance_loss_clip": 1.01254535, "balance_loss_mlp": 1.01632667, "epoch": 0.9080414850443409, "flos": 22928823653760.0, "grad_norm": 1.8256161639597182, "language_loss": 0.79907817, "learning_rate": 8.799817844260049e-08, "loss": 0.81993377, "num_input_tokens_seen": 325800195, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34375, "step": 15103, "time_per_iteration": 2.375267267227173 }, { "auxiliary_loss_clip": 0.01051383, "auxiliary_loss_mlp": 0.01031016, "balance_loss_clip": 1.00884271, "balance_loss_mlp": 1.01522481, "epoch": 0.9081016082970089, "flos": 26175885874560.0, "grad_norm": 2.3439553139288223, "language_loss": 0.72949314, "learning_rate": 8.78839607763413e-08, "loss": 0.75031716, "num_input_tokens_seen": 325820215, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.36328125, "step": 15104, "time_per_iteration": 2.4130516052246094 }, { "auxiliary_loss_clip": 0.01049418, "auxiliary_loss_mlp": 0.01032552, "balance_loss_clip": 1.01205969, "balance_loss_mlp": 1.01549709, "epoch": 0.9081617315496768, "flos": 24461021980800.0, "grad_norm": 1.7389292906329683, "language_loss": 0.7827028, "learning_rate": 8.77698156177138e-08, "loss": 0.80352253, "num_input_tokens_seen": 325838415, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.33984375, "step": 15105, "time_per_iteration": 2.39373779296875 }, { "auxiliary_loss_clip": 0.01052233, "auxiliary_loss_mlp": 0.01038479, "balance_loss_clip": 1.01516199, "balance_loss_mlp": 1.01534963, "epoch": 0.9082218548023449, "flos": 24745913608320.0, "grad_norm": 1.9704301923492076, "language_loss": 0.74913359, "learning_rate": 8.765574297104628e-08, "loss": 0.77004075, "num_input_tokens_seen": 325855580, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36914062, "step": 15106, "time_per_iteration": 3.8610799312591553 }, { "auxiliary_loss_clip": 0.01051672, "auxiliary_loss_mlp": 0.01040735, "balance_loss_clip": 1.01706004, "balance_loss_mlp": 1.01607203, "epoch": 0.9082819780550128, "flos": 24420278557440.0, "grad_norm": 1.9964458730031656, "language_loss": 0.8107537, "learning_rate": 8.754174284066462e-08, "loss": 0.83167779, "num_input_tokens_seen": 325874890, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35546875, "step": 15107, "time_per_iteration": 2.4689202308654785 }, { "auxiliary_loss_clip": 0.01007398, "auxiliary_loss_mlp": 0.01001703, "balance_loss_clip": 0.99959326, "balance_loss_mlp": 1.00088382, "epoch": 0.9083421013076808, "flos": 59606233113600.0, "grad_norm": 0.8081757828764939, "language_loss": 0.59638691, "learning_rate": 8.742781523089205e-08, "loss": 0.61647791, "num_input_tokens_seen": 325935835, "router_z_loss_clip": 0.02111816, "router_z_loss_mlp": 0.06542969, "step": 15108, "time_per_iteration": 2.9910144805908203 }, { "auxiliary_loss_clip": 0.01051774, "auxiliary_loss_mlp": 0.01032261, "balance_loss_clip": 1.01070809, "balance_loss_mlp": 1.01529789, "epoch": 0.9084022245603487, "flos": 33618809957760.0, "grad_norm": 1.6284268502527683, "language_loss": 0.75199223, "learning_rate": 8.73139601460482e-08, "loss": 0.77283263, "num_input_tokens_seen": 325958035, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.36523438, "step": 15109, "time_per_iteration": 2.4879775047302246 }, { "auxiliary_loss_clip": 0.01049595, "auxiliary_loss_mlp": 0.01032282, "balance_loss_clip": 1.01155186, "balance_loss_mlp": 1.01515746, "epoch": 0.9084623478130167, "flos": 24970579407360.0, "grad_norm": 1.9353387512796933, "language_loss": 0.72834414, "learning_rate": 8.720017759045073e-08, "loss": 0.74916291, "num_input_tokens_seen": 325979870, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.34375, "step": 15110, "time_per_iteration": 2.409074306488037 }, { "auxiliary_loss_clip": 0.01049264, "auxiliary_loss_mlp": 0.01032134, "balance_loss_clip": 1.01213074, "balance_loss_mlp": 1.01533818, "epoch": 0.9085224710656846, "flos": 31460619219840.0, "grad_norm": 1.904317585394856, "language_loss": 0.70277834, "learning_rate": 8.708646756841421e-08, "loss": 0.72359234, "num_input_tokens_seen": 325998245, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.33984375, "step": 15111, "time_per_iteration": 2.504939317703247 }, { "auxiliary_loss_clip": 0.01007403, "auxiliary_loss_mlp": 0.0100267, "balance_loss_clip": 1.00060797, "balance_loss_mlp": 1.00082445, "epoch": 0.9085825943183526, "flos": 64913940000000.0, "grad_norm": 0.6966348490962464, "language_loss": 0.51867938, "learning_rate": 8.697283008425026e-08, "loss": 0.53878009, "num_input_tokens_seen": 326061770, "router_z_loss_clip": 0.02062988, "router_z_loss_mlp": 0.06591797, "step": 15112, "time_per_iteration": 3.1745262145996094 }, { "auxiliary_loss_clip": 0.01051399, "auxiliary_loss_mlp": 0.01036628, "balance_loss_clip": 1.01347733, "balance_loss_mlp": 1.01539934, "epoch": 0.9086427175710206, "flos": 18952216329600.0, "grad_norm": 1.7585975158096268, "language_loss": 0.71713924, "learning_rate": 8.685926514226837e-08, "loss": 0.73801953, "num_input_tokens_seen": 326080945, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.359375, "step": 15113, "time_per_iteration": 2.4937188625335693 }, { "auxiliary_loss_clip": 0.01052041, "auxiliary_loss_mlp": 0.01031821, "balance_loss_clip": 1.01008952, "balance_loss_mlp": 1.01677155, "epoch": 0.9087028408236886, "flos": 34013643056640.0, "grad_norm": 4.091196724395817, "language_loss": 0.8000735, "learning_rate": 8.674577274677508e-08, "loss": 0.82091224, "num_input_tokens_seen": 326100630, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.35351562, "step": 15114, "time_per_iteration": 2.550168991088867 }, { "auxiliary_loss_clip": 0.01053948, "auxiliary_loss_mlp": 0.0103682, "balance_loss_clip": 1.01091564, "balance_loss_mlp": 1.01726079, "epoch": 0.9087629640763565, "flos": 21943504863360.0, "grad_norm": 1.9589559650189217, "language_loss": 0.72183037, "learning_rate": 8.663235290207405e-08, "loss": 0.74273801, "num_input_tokens_seen": 326120145, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.3671875, "step": 15115, "time_per_iteration": 2.382282257080078 }, { "auxiliary_loss_clip": 0.01054311, "auxiliary_loss_mlp": 0.01036368, "balance_loss_clip": 1.01215672, "balance_loss_mlp": 1.01716256, "epoch": 0.9088230873290245, "flos": 21761816814720.0, "grad_norm": 2.400897865998849, "language_loss": 0.66502732, "learning_rate": 8.651900561246561e-08, "loss": 0.68593413, "num_input_tokens_seen": 326140715, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37109375, "step": 15116, "time_per_iteration": 2.374147891998291 }, { "auxiliary_loss_clip": 0.01050182, "auxiliary_loss_mlp": 0.01041524, "balance_loss_clip": 1.01714599, "balance_loss_mlp": 1.01608133, "epoch": 0.9088832105816925, "flos": 21540258126720.0, "grad_norm": 1.89593917986977, "language_loss": 0.70644617, "learning_rate": 8.640573088224812e-08, "loss": 0.72736323, "num_input_tokens_seen": 326159130, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.34179688, "step": 15117, "time_per_iteration": 2.3851237297058105 }, { "auxiliary_loss_clip": 0.01051535, "auxiliary_loss_mlp": 0.01034772, "balance_loss_clip": 1.0130161, "balance_loss_mlp": 1.01674342, "epoch": 0.9089433338343604, "flos": 25995454634880.0, "grad_norm": 1.590992920036761, "language_loss": 0.75742626, "learning_rate": 8.629252871571745e-08, "loss": 0.77828932, "num_input_tokens_seen": 326181375, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.34765625, "step": 15118, "time_per_iteration": 2.4108362197875977 }, { "auxiliary_loss_clip": 0.01054116, "auxiliary_loss_mlp": 0.0103747, "balance_loss_clip": 1.01097012, "balance_loss_mlp": 1.01623249, "epoch": 0.9090034570870285, "flos": 21177370788480.0, "grad_norm": 1.9344194008819082, "language_loss": 0.74750936, "learning_rate": 8.617939911716554e-08, "loss": 0.76842523, "num_input_tokens_seen": 326199740, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37890625, "step": 15119, "time_per_iteration": 2.402151107788086 }, { "auxiliary_loss_clip": 0.01054619, "auxiliary_loss_mlp": 0.01039256, "balance_loss_clip": 1.01409137, "balance_loss_mlp": 1.01669323, "epoch": 0.9090635803396964, "flos": 16140940099200.0, "grad_norm": 2.66823603451073, "language_loss": 0.7219981, "learning_rate": 8.60663420908827e-08, "loss": 0.74293685, "num_input_tokens_seen": 326214350, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37890625, "step": 15120, "time_per_iteration": 2.358015298843384 }, { "auxiliary_loss_clip": 0.01052429, "auxiliary_loss_mlp": 0.01036334, "balance_loss_clip": 1.0127666, "balance_loss_mlp": 1.01639223, "epoch": 0.9091237035923644, "flos": 20590585701120.0, "grad_norm": 1.9763142740486987, "language_loss": 0.67467105, "learning_rate": 8.595335764115596e-08, "loss": 0.69555867, "num_input_tokens_seen": 326234580, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.359375, "step": 15121, "time_per_iteration": 2.4375100135803223 }, { "auxiliary_loss_clip": 0.01053056, "auxiliary_loss_mlp": 0.01050436, "balance_loss_clip": 1.02543736, "balance_loss_mlp": 1.01607347, "epoch": 0.9091838268450323, "flos": 52225840874880.0, "grad_norm": 1.9540074985116573, "language_loss": 0.71468949, "learning_rate": 8.58404457722699e-08, "loss": 0.73572445, "num_input_tokens_seen": 326259080, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36914062, "step": 15122, "time_per_iteration": 3.898468494415283 }, { "auxiliary_loss_clip": 0.01050014, "auxiliary_loss_mlp": 0.0104021, "balance_loss_clip": 1.01775134, "balance_loss_mlp": 1.01507652, "epoch": 0.9092439500977003, "flos": 20558535206400.0, "grad_norm": 1.3075823038540777, "language_loss": 0.75197941, "learning_rate": 8.572760648850575e-08, "loss": 0.77288163, "num_input_tokens_seen": 326280175, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34960938, "step": 15123, "time_per_iteration": 2.3963544368743896 }, { "auxiliary_loss_clip": 0.01050494, "auxiliary_loss_mlp": 0.01037809, "balance_loss_clip": 1.01434863, "balance_loss_mlp": 1.01596045, "epoch": 0.9093040733503682, "flos": 28616699001600.0, "grad_norm": 2.2086977231363316, "language_loss": 0.76450837, "learning_rate": 8.561483979414253e-08, "loss": 0.78539133, "num_input_tokens_seen": 326297990, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.34570312, "step": 15124, "time_per_iteration": 2.4285025596618652 }, { "auxiliary_loss_clip": 0.01051126, "auxiliary_loss_mlp": 0.01035301, "balance_loss_clip": 1.01244843, "balance_loss_mlp": 1.01587653, "epoch": 0.9093641966030362, "flos": 23439079307520.0, "grad_norm": 2.277560924770952, "language_loss": 0.74033177, "learning_rate": 8.55021456934566e-08, "loss": 0.76119602, "num_input_tokens_seen": 326316735, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35351562, "step": 15125, "time_per_iteration": 2.3974802494049072 }, { "auxiliary_loss_clip": 0.01049545, "auxiliary_loss_mlp": 0.01036691, "balance_loss_clip": 1.01437557, "balance_loss_mlp": 1.01580083, "epoch": 0.9094243198557042, "flos": 16799262295680.0, "grad_norm": 1.5564219523144494, "language_loss": 0.79704851, "learning_rate": 8.538952419072143e-08, "loss": 0.81791091, "num_input_tokens_seen": 326334370, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.33789062, "step": 15126, "time_per_iteration": 2.3447511196136475 }, { "auxiliary_loss_clip": 0.01050326, "auxiliary_loss_mlp": 0.01038161, "balance_loss_clip": 1.01467657, "balance_loss_mlp": 1.01529455, "epoch": 0.9094844431083722, "flos": 24272316748800.0, "grad_norm": 1.6360266289532175, "language_loss": 0.759094, "learning_rate": 8.527697529020694e-08, "loss": 0.77997887, "num_input_tokens_seen": 326353435, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.34960938, "step": 15127, "time_per_iteration": 2.4177520275115967 }, { "auxiliary_loss_clip": 0.01050907, "auxiliary_loss_mlp": 0.01037092, "balance_loss_clip": 1.01420379, "balance_loss_mlp": 1.01465201, "epoch": 0.9095445663610401, "flos": 21943574686080.0, "grad_norm": 2.087739206623208, "language_loss": 0.63932955, "learning_rate": 8.516449899618173e-08, "loss": 0.66020954, "num_input_tokens_seen": 326371810, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.36328125, "step": 15128, "time_per_iteration": 2.3760385513305664 }, { "auxiliary_loss_clip": 0.01049457, "auxiliary_loss_mlp": 0.01036948, "balance_loss_clip": 1.01537108, "balance_loss_mlp": 1.01513934, "epoch": 0.9096046896137081, "flos": 19791807638400.0, "grad_norm": 1.5280348802564403, "language_loss": 0.77613556, "learning_rate": 8.505209531291013e-08, "loss": 0.79699957, "num_input_tokens_seen": 326391380, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.34375, "step": 15129, "time_per_iteration": 2.4115517139434814 }, { "auxiliary_loss_clip": 0.01051768, "auxiliary_loss_mlp": 0.01034571, "balance_loss_clip": 1.01159954, "balance_loss_mlp": 1.01605344, "epoch": 0.909664812866376, "flos": 22636984665600.0, "grad_norm": 2.150948679465896, "language_loss": 0.84293622, "learning_rate": 8.49397642446552e-08, "loss": 0.86379963, "num_input_tokens_seen": 326408800, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35742188, "step": 15130, "time_per_iteration": 2.3717751502990723 }, { "auxiliary_loss_clip": 0.01053313, "auxiliary_loss_mlp": 0.01036003, "balance_loss_clip": 1.01262617, "balance_loss_mlp": 1.01695919, "epoch": 0.909724936119044, "flos": 39850771933440.0, "grad_norm": 2.026457158063785, "language_loss": 0.76110959, "learning_rate": 8.482750579567644e-08, "loss": 0.78200281, "num_input_tokens_seen": 326431565, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36328125, "step": 15131, "time_per_iteration": 2.521912097930908 }, { "auxiliary_loss_clip": 0.01053523, "auxiliary_loss_mlp": 0.01039047, "balance_loss_clip": 1.01648033, "balance_loss_mlp": 1.01758432, "epoch": 0.9097850593717121, "flos": 35070394222080.0, "grad_norm": 1.7878945259328018, "language_loss": 0.60277736, "learning_rate": 8.471531997023085e-08, "loss": 0.62370312, "num_input_tokens_seen": 326451715, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.359375, "step": 15132, "time_per_iteration": 2.4953842163085938 }, { "auxiliary_loss_clip": 0.01052599, "auxiliary_loss_mlp": 0.01036731, "balance_loss_clip": 1.01446319, "balance_loss_mlp": 1.01680982, "epoch": 0.90984518262438, "flos": 23366355212160.0, "grad_norm": 1.5128296312448373, "language_loss": 0.83333117, "learning_rate": 8.460320677257193e-08, "loss": 0.8542245, "num_input_tokens_seen": 326470855, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35742188, "step": 15133, "time_per_iteration": 5.278148412704468 }, { "auxiliary_loss_clip": 0.01050863, "auxiliary_loss_mlp": 0.01033047, "balance_loss_clip": 1.00955129, "balance_loss_mlp": 1.01497197, "epoch": 0.909905305877048, "flos": 27522171878400.0, "grad_norm": 1.7008559574402897, "language_loss": 0.74556816, "learning_rate": 8.449116620695118e-08, "loss": 0.76640731, "num_input_tokens_seen": 326490480, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.359375, "step": 15134, "time_per_iteration": 2.425355911254883 }, { "auxiliary_loss_clip": 0.01054305, "auxiliary_loss_mlp": 0.01044568, "balance_loss_clip": 1.01911664, "balance_loss_mlp": 1.01631021, "epoch": 0.9099654291297159, "flos": 24346856234880.0, "grad_norm": 1.5474451047086473, "language_loss": 0.73785543, "learning_rate": 8.437919827761786e-08, "loss": 0.75884414, "num_input_tokens_seen": 326509445, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37890625, "step": 15135, "time_per_iteration": 2.392383575439453 }, { "auxiliary_loss_clip": 0.01051413, "auxiliary_loss_mlp": 0.01034154, "balance_loss_clip": 1.01211262, "balance_loss_mlp": 1.01662838, "epoch": 0.9100255523823839, "flos": 21214169228160.0, "grad_norm": 5.851622807173836, "language_loss": 0.70939839, "learning_rate": 8.426730298881702e-08, "loss": 0.73025405, "num_input_tokens_seen": 326528380, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34765625, "step": 15136, "time_per_iteration": 2.3700170516967773 }, { "auxiliary_loss_clip": 0.01007405, "auxiliary_loss_mlp": 0.01003918, "balance_loss_clip": 1.00165284, "balance_loss_mlp": 1.00082278, "epoch": 0.9100856756350518, "flos": 46049773852800.0, "grad_norm": 0.8130697080179206, "language_loss": 0.59352648, "learning_rate": 8.415548034479214e-08, "loss": 0.61363971, "num_input_tokens_seen": 326576940, "router_z_loss_clip": 0.02270508, "router_z_loss_mlp": 0.06542969, "step": 15137, "time_per_iteration": 2.838423013687134 }, { "auxiliary_loss_clip": 0.01050953, "auxiliary_loss_mlp": 0.01036293, "balance_loss_clip": 1.01432252, "balance_loss_mlp": 1.01651621, "epoch": 0.9101457988877198, "flos": 20228885349120.0, "grad_norm": 1.638694349653523, "language_loss": 0.83351803, "learning_rate": 8.40437303497834e-08, "loss": 0.8543905, "num_input_tokens_seen": 326596100, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34375, "step": 15138, "time_per_iteration": 2.3761496543884277 }, { "auxiliary_loss_clip": 0.01050716, "auxiliary_loss_mlp": 0.01033839, "balance_loss_clip": 1.0117135, "balance_loss_mlp": 1.01632202, "epoch": 0.9102059221403878, "flos": 26613941103360.0, "grad_norm": 1.5759700373381627, "language_loss": 0.81780219, "learning_rate": 8.39320530080283e-08, "loss": 0.83864772, "num_input_tokens_seen": 326615700, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34375, "step": 15139, "time_per_iteration": 2.40073299407959 }, { "auxiliary_loss_clip": 0.0105248, "auxiliary_loss_mlp": 0.01038338, "balance_loss_clip": 1.01505637, "balance_loss_mlp": 1.01638365, "epoch": 0.9102660453930558, "flos": 21907474473600.0, "grad_norm": 4.954914010426568, "language_loss": 0.78147817, "learning_rate": 8.382044832376167e-08, "loss": 0.8023864, "num_input_tokens_seen": 326635905, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.359375, "step": 15140, "time_per_iteration": 2.396038293838501 }, { "auxiliary_loss_clip": 0.01050806, "auxiliary_loss_mlp": 0.01038611, "balance_loss_clip": 1.01588929, "balance_loss_mlp": 1.01537251, "epoch": 0.9103261686457237, "flos": 36175115462400.0, "grad_norm": 1.8951737025454354, "language_loss": 0.67241007, "learning_rate": 8.370891630121569e-08, "loss": 0.69330424, "num_input_tokens_seen": 326661855, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35351562, "step": 15141, "time_per_iteration": 2.541255235671997 }, { "auxiliary_loss_clip": 0.01052637, "auxiliary_loss_mlp": 0.01040618, "balance_loss_clip": 1.01578653, "balance_loss_mlp": 1.01646638, "epoch": 0.9103862918983917, "flos": 23877413827200.0, "grad_norm": 2.3500419409852555, "language_loss": 0.75877553, "learning_rate": 8.359745694462005e-08, "loss": 0.77970809, "num_input_tokens_seen": 326679320, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36132812, "step": 15142, "time_per_iteration": 2.3918938636779785 }, { "auxiliary_loss_clip": 0.01049129, "auxiliary_loss_mlp": 0.01038835, "balance_loss_clip": 1.01792622, "balance_loss_mlp": 1.0146445, "epoch": 0.9104464151510596, "flos": 14938636008960.0, "grad_norm": 1.7404042846548633, "language_loss": 0.66151887, "learning_rate": 8.348607025820076e-08, "loss": 0.6823985, "num_input_tokens_seen": 326698110, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34375, "step": 15143, "time_per_iteration": 2.3613274097442627 }, { "auxiliary_loss_clip": 0.01053026, "auxiliary_loss_mlp": 0.01040434, "balance_loss_clip": 1.01567459, "balance_loss_mlp": 1.01651812, "epoch": 0.9105065384037276, "flos": 33654421411200.0, "grad_norm": 1.9509932650818327, "language_loss": 0.62228274, "learning_rate": 8.337475624618152e-08, "loss": 0.64321738, "num_input_tokens_seen": 326718370, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36523438, "step": 15144, "time_per_iteration": 2.5000205039978027 }, { "auxiliary_loss_clip": 0.01048841, "auxiliary_loss_mlp": 0.01032582, "balance_loss_clip": 1.01049256, "balance_loss_mlp": 1.0155139, "epoch": 0.9105666616563957, "flos": 24315538878720.0, "grad_norm": 1.752132722670743, "language_loss": 0.7174207, "learning_rate": 8.326351491278382e-08, "loss": 0.73823488, "num_input_tokens_seen": 326738445, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.33398438, "step": 15145, "time_per_iteration": 3.8284499645233154 }, { "auxiliary_loss_clip": 0.01048844, "auxiliary_loss_mlp": 0.01034988, "balance_loss_clip": 1.01341128, "balance_loss_mlp": 1.01502657, "epoch": 0.9106267849090636, "flos": 29970386213760.0, "grad_norm": 1.6115306752377747, "language_loss": 0.71770406, "learning_rate": 8.315234626222545e-08, "loss": 0.73854232, "num_input_tokens_seen": 326758855, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.33789062, "step": 15146, "time_per_iteration": 2.4413132667541504 }, { "auxiliary_loss_clip": 0.01050374, "auxiliary_loss_mlp": 0.01037138, "balance_loss_clip": 1.01489401, "balance_loss_mlp": 1.01531005, "epoch": 0.9106869081617316, "flos": 25336573856640.0, "grad_norm": 1.8361916159179228, "language_loss": 0.7374891, "learning_rate": 8.304125029872233e-08, "loss": 0.75836426, "num_input_tokens_seen": 326777140, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3515625, "step": 15147, "time_per_iteration": 2.4238028526306152 }, { "auxiliary_loss_clip": 0.01052386, "auxiliary_loss_mlp": 0.010336, "balance_loss_clip": 1.00981784, "balance_loss_mlp": 1.01617026, "epoch": 0.9107470314143995, "flos": 18186047343360.0, "grad_norm": 1.8725833452706726, "language_loss": 0.81288815, "learning_rate": 8.293022702648711e-08, "loss": 0.83374798, "num_input_tokens_seen": 326794070, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36132812, "step": 15148, "time_per_iteration": 2.3576431274414062 }, { "auxiliary_loss_clip": 0.01052166, "auxiliary_loss_mlp": 0.01039323, "balance_loss_clip": 1.01684046, "balance_loss_mlp": 1.01583433, "epoch": 0.9108071546670675, "flos": 23549684094720.0, "grad_norm": 1.9110768849653725, "language_loss": 0.69078678, "learning_rate": 8.281927644972996e-08, "loss": 0.71170169, "num_input_tokens_seen": 326814695, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.36328125, "step": 15149, "time_per_iteration": 2.5805552005767822 }, { "auxiliary_loss_clip": 0.01050692, "auxiliary_loss_mlp": 0.01036726, "balance_loss_clip": 1.01444602, "balance_loss_mlp": 1.01615763, "epoch": 0.9108672779197354, "flos": 25629111072000.0, "grad_norm": 1.9590518326713922, "language_loss": 0.64335132, "learning_rate": 8.270839857265776e-08, "loss": 0.66422546, "num_input_tokens_seen": 326835295, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34570312, "step": 15150, "time_per_iteration": 2.411139965057373 }, { "auxiliary_loss_clip": 0.01051167, "auxiliary_loss_mlp": 0.01036537, "balance_loss_clip": 1.01463878, "balance_loss_mlp": 1.0161283, "epoch": 0.9109274011724035, "flos": 22338198316800.0, "grad_norm": 1.8368262737891778, "language_loss": 0.73909366, "learning_rate": 8.259759339947514e-08, "loss": 0.75997072, "num_input_tokens_seen": 326853350, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.3515625, "step": 15151, "time_per_iteration": 2.3840997219085693 }, { "auxiliary_loss_clip": 0.01050637, "auxiliary_loss_mlp": 0.01033996, "balance_loss_clip": 1.01196659, "balance_loss_mlp": 1.01558912, "epoch": 0.9109875244250714, "flos": 26686979400960.0, "grad_norm": 1.6122835233176094, "language_loss": 0.65755415, "learning_rate": 8.248686093438429e-08, "loss": 0.67840046, "num_input_tokens_seen": 326873425, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3515625, "step": 15152, "time_per_iteration": 2.4152402877807617 }, { "auxiliary_loss_clip": 0.01051372, "auxiliary_loss_mlp": 0.01032872, "balance_loss_clip": 1.01050889, "balance_loss_mlp": 1.01612914, "epoch": 0.9110476476777394, "flos": 22928998210560.0, "grad_norm": 1.685315895501896, "language_loss": 0.7419796, "learning_rate": 8.23762011815834e-08, "loss": 0.76282203, "num_input_tokens_seen": 326893455, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35351562, "step": 15153, "time_per_iteration": 2.393449544906616 }, { "auxiliary_loss_clip": 0.01053226, "auxiliary_loss_mlp": 0.01039402, "balance_loss_clip": 1.01589358, "balance_loss_mlp": 1.01697063, "epoch": 0.9111077709304073, "flos": 13472214416640.0, "grad_norm": 1.915172448914857, "language_loss": 0.73182607, "learning_rate": 8.226561414526956e-08, "loss": 0.75275242, "num_input_tokens_seen": 326910210, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36328125, "step": 15154, "time_per_iteration": 2.372143507003784 }, { "auxiliary_loss_clip": 0.01051764, "auxiliary_loss_mlp": 0.0104158, "balance_loss_clip": 1.0199194, "balance_loss_mlp": 1.01671505, "epoch": 0.9111678941830753, "flos": 20849501410560.0, "grad_norm": 1.7159807784218863, "language_loss": 0.8331542, "learning_rate": 8.215509982963564e-08, "loss": 0.85408759, "num_input_tokens_seen": 326929350, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.3515625, "step": 15155, "time_per_iteration": 2.428544759750366 }, { "auxiliary_loss_clip": 0.01050805, "auxiliary_loss_mlp": 0.01033369, "balance_loss_clip": 1.0108031, "balance_loss_mlp": 1.01610923, "epoch": 0.9112280174357432, "flos": 19681237762560.0, "grad_norm": 1.4983943006546294, "language_loss": 0.60541642, "learning_rate": 8.204465823887252e-08, "loss": 0.62625813, "num_input_tokens_seen": 326949060, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34765625, "step": 15156, "time_per_iteration": 2.380964517593384 }, { "auxiliary_loss_clip": 0.0105298, "auxiliary_loss_mlp": 0.01036528, "balance_loss_clip": 1.01204205, "balance_loss_mlp": 1.01558805, "epoch": 0.9112881406884112, "flos": 25445991657600.0, "grad_norm": 2.890945081823685, "language_loss": 0.75291383, "learning_rate": 8.193428937716796e-08, "loss": 0.77380884, "num_input_tokens_seen": 326968950, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.375, "step": 15157, "time_per_iteration": 2.438042640686035 }, { "auxiliary_loss_clip": 0.01053039, "auxiliary_loss_mlp": 0.01033929, "balance_loss_clip": 1.01300812, "balance_loss_mlp": 1.01667082, "epoch": 0.9113482639410793, "flos": 33065751110400.0, "grad_norm": 2.172238793268555, "language_loss": 0.60491359, "learning_rate": 8.182399324870747e-08, "loss": 0.62578321, "num_input_tokens_seen": 326989455, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.36328125, "step": 15158, "time_per_iteration": 2.483268976211548 }, { "auxiliary_loss_clip": 0.01050692, "auxiliary_loss_mlp": 0.01040227, "balance_loss_clip": 1.01723194, "balance_loss_mlp": 1.01582694, "epoch": 0.9114083871937472, "flos": 21834505998720.0, "grad_norm": 1.5856096911597755, "language_loss": 0.68724072, "learning_rate": 8.171376985767375e-08, "loss": 0.70814991, "num_input_tokens_seen": 327009640, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.34960938, "step": 15159, "time_per_iteration": 2.391744375228882 }, { "auxiliary_loss_clip": 0.01050391, "auxiliary_loss_mlp": 0.01035819, "balance_loss_clip": 1.01429021, "balance_loss_mlp": 1.01529956, "epoch": 0.9114685104464152, "flos": 27087782342400.0, "grad_norm": 1.9932664159770659, "language_loss": 0.79159427, "learning_rate": 8.160361920824588e-08, "loss": 0.81245637, "num_input_tokens_seen": 327027690, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.34960938, "step": 15160, "time_per_iteration": 2.3996379375457764 }, { "auxiliary_loss_clip": 0.01052352, "auxiliary_loss_mlp": 0.0103869, "balance_loss_clip": 1.01227283, "balance_loss_mlp": 1.01647377, "epoch": 0.9115286336990831, "flos": 17966094577920.0, "grad_norm": 1.7115918854809238, "language_loss": 0.70172203, "learning_rate": 8.149354130460073e-08, "loss": 0.72263247, "num_input_tokens_seen": 327045915, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.359375, "step": 15161, "time_per_iteration": 2.3599627017974854 }, { "auxiliary_loss_clip": 0.01052325, "auxiliary_loss_mlp": 0.01038818, "balance_loss_clip": 1.01384413, "balance_loss_mlp": 1.0169313, "epoch": 0.9115887569517511, "flos": 22928753831040.0, "grad_norm": 1.8999933672753313, "language_loss": 0.77870959, "learning_rate": 8.138353615091321e-08, "loss": 0.79962105, "num_input_tokens_seen": 327066355, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.35546875, "step": 15162, "time_per_iteration": 3.670523166656494 }, { "auxiliary_loss_clip": 0.01052142, "auxiliary_loss_mlp": 0.01038213, "balance_loss_clip": 1.01525283, "balance_loss_mlp": 1.01655364, "epoch": 0.911648880204419, "flos": 23987285475840.0, "grad_norm": 1.9189470796069488, "language_loss": 0.67682475, "learning_rate": 8.127360375135395e-08, "loss": 0.69772828, "num_input_tokens_seen": 327086735, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35546875, "step": 15163, "time_per_iteration": 2.4003565311431885 }, { "auxiliary_loss_clip": 0.01054711, "auxiliary_loss_mlp": 0.01038892, "balance_loss_clip": 1.01404929, "balance_loss_mlp": 1.01678014, "epoch": 0.911709003457087, "flos": 17054372666880.0, "grad_norm": 3.454002876937307, "language_loss": 0.72563648, "learning_rate": 8.116374411009186e-08, "loss": 0.74657255, "num_input_tokens_seen": 327104035, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37890625, "step": 15164, "time_per_iteration": 2.3680005073547363 }, { "auxiliary_loss_clip": 0.01050407, "auxiliary_loss_mlp": 0.01036873, "balance_loss_clip": 1.01501, "balance_loss_mlp": 1.01659155, "epoch": 0.911769126709755, "flos": 21652259368320.0, "grad_norm": 1.5449890068754966, "language_loss": 0.76891905, "learning_rate": 8.105395723129315e-08, "loss": 0.78979182, "num_input_tokens_seen": 327124370, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.33789062, "step": 15165, "time_per_iteration": 2.4615161418914795 }, { "auxiliary_loss_clip": 0.01051632, "auxiliary_loss_mlp": 0.0103718, "balance_loss_clip": 1.01464963, "balance_loss_mlp": 1.01600647, "epoch": 0.911829249962423, "flos": 24789170649600.0, "grad_norm": 2.3311829899030463, "language_loss": 0.74049878, "learning_rate": 8.094424311912074e-08, "loss": 0.76138687, "num_input_tokens_seen": 327140915, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35546875, "step": 15166, "time_per_iteration": 2.4209272861480713 }, { "auxiliary_loss_clip": 0.01052728, "auxiliary_loss_mlp": 0.01041276, "balance_loss_clip": 1.01612246, "balance_loss_mlp": 1.01642561, "epoch": 0.9118893732150909, "flos": 20958360629760.0, "grad_norm": 1.7403104786953025, "language_loss": 0.73935509, "learning_rate": 8.083460177773482e-08, "loss": 0.76029515, "num_input_tokens_seen": 327158940, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36328125, "step": 15167, "time_per_iteration": 2.36311411857605 }, { "auxiliary_loss_clip": 0.01007478, "auxiliary_loss_mlp": 0.0100183, "balance_loss_clip": 0.99963641, "balance_loss_mlp": 1.00093222, "epoch": 0.9119494964677589, "flos": 67913991285120.0, "grad_norm": 0.770600437944389, "language_loss": 0.65663201, "learning_rate": 8.072503321129298e-08, "loss": 0.67672509, "num_input_tokens_seen": 327217450, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.06542969, "step": 15168, "time_per_iteration": 3.013223171234131 }, { "auxiliary_loss_clip": 0.01052148, "auxiliary_loss_mlp": 0.01035292, "balance_loss_clip": 1.0130831, "balance_loss_mlp": 1.01645923, "epoch": 0.9120096197204268, "flos": 18550540604160.0, "grad_norm": 2.15358930308531, "language_loss": 0.78987038, "learning_rate": 8.061553742395033e-08, "loss": 0.81074476, "num_input_tokens_seen": 327233905, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35742188, "step": 15169, "time_per_iteration": 2.38236403465271 }, { "auxiliary_loss_clip": 0.01050956, "auxiliary_loss_mlp": 0.0104124, "balance_loss_clip": 1.01775599, "balance_loss_mlp": 1.01615334, "epoch": 0.9120697429730948, "flos": 19024730956800.0, "grad_norm": 2.0548979037775634, "language_loss": 0.83201832, "learning_rate": 8.05061144198591e-08, "loss": 0.85294026, "num_input_tokens_seen": 327252430, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.34765625, "step": 15170, "time_per_iteration": 2.3677990436553955 }, { "auxiliary_loss_clip": 0.01052483, "auxiliary_loss_mlp": 0.01043335, "balance_loss_clip": 1.01989865, "balance_loss_mlp": 1.01663983, "epoch": 0.9121298662257629, "flos": 17162778038400.0, "grad_norm": 2.268632909555113, "language_loss": 0.7827363, "learning_rate": 8.039676420316799e-08, "loss": 0.80369455, "num_input_tokens_seen": 327269215, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.359375, "step": 15171, "time_per_iteration": 2.346559762954712 }, { "auxiliary_loss_clip": 0.01050302, "auxiliary_loss_mlp": 0.01034927, "balance_loss_clip": 1.01274228, "balance_loss_mlp": 1.0153954, "epoch": 0.9121899894784308, "flos": 19681691610240.0, "grad_norm": 1.3129667978107689, "language_loss": 0.67763567, "learning_rate": 8.02874867780241e-08, "loss": 0.69848788, "num_input_tokens_seen": 327290320, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34765625, "step": 15172, "time_per_iteration": 3.779686450958252 }, { "auxiliary_loss_clip": 0.01053092, "auxiliary_loss_mlp": 0.01035817, "balance_loss_clip": 1.01166534, "balance_loss_mlp": 1.016837, "epoch": 0.9122501127310988, "flos": 22234680535680.0, "grad_norm": 1.6872854046019412, "language_loss": 0.76255083, "learning_rate": 8.017828214857103e-08, "loss": 0.78343993, "num_input_tokens_seen": 327310150, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36328125, "step": 15173, "time_per_iteration": 3.7590222358703613 }, { "auxiliary_loss_clip": 0.01055235, "auxiliary_loss_mlp": 0.01037156, "balance_loss_clip": 1.01065612, "balance_loss_mlp": 1.01679862, "epoch": 0.9123102359837667, "flos": 15956319496320.0, "grad_norm": 5.680307255330199, "language_loss": 0.6747005, "learning_rate": 8.00691503189499e-08, "loss": 0.69562447, "num_input_tokens_seen": 327326660, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.3828125, "step": 15174, "time_per_iteration": 2.33768630027771 }, { "auxiliary_loss_clip": 0.01053821, "auxiliary_loss_mlp": 0.0103779, "balance_loss_clip": 1.01413846, "balance_loss_mlp": 1.01693344, "epoch": 0.9123703592364347, "flos": 25154606517120.0, "grad_norm": 2.120826259244081, "language_loss": 0.75703007, "learning_rate": 7.996009129329894e-08, "loss": 0.77794623, "num_input_tokens_seen": 327346700, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3671875, "step": 15175, "time_per_iteration": 2.4098305702209473 }, { "auxiliary_loss_clip": 0.01007743, "auxiliary_loss_mlp": 0.01002533, "balance_loss_clip": 1.00036335, "balance_loss_mlp": 1.00110388, "epoch": 0.9124304824891026, "flos": 60798482732160.0, "grad_norm": 0.9714962190808286, "language_loss": 0.58595693, "learning_rate": 7.985110507575421e-08, "loss": 0.60605967, "num_input_tokens_seen": 327403050, "router_z_loss_clip": 0.02172852, "router_z_loss_mlp": 0.06640625, "step": 15176, "time_per_iteration": 3.0823464393615723 }, { "auxiliary_loss_clip": 0.01052868, "auxiliary_loss_mlp": 0.01038448, "balance_loss_clip": 1.01515472, "balance_loss_mlp": 1.01671171, "epoch": 0.9124906057417707, "flos": 18149947130880.0, "grad_norm": 1.9024045963577905, "language_loss": 0.6660167, "learning_rate": 7.97421916704475e-08, "loss": 0.68692982, "num_input_tokens_seen": 327422225, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36132812, "step": 15177, "time_per_iteration": 2.3582041263580322 }, { "auxiliary_loss_clip": 0.01051218, "auxiliary_loss_mlp": 0.01036298, "balance_loss_clip": 1.01567459, "balance_loss_mlp": 1.0160085, "epoch": 0.9125507289944386, "flos": 11686127616000.0, "grad_norm": 2.072631003055997, "language_loss": 0.82370782, "learning_rate": 7.963335108150926e-08, "loss": 0.84458297, "num_input_tokens_seen": 327437025, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.3515625, "step": 15178, "time_per_iteration": 2.335771083831787 }, { "auxiliary_loss_clip": 0.01051896, "auxiliary_loss_mlp": 0.01036768, "balance_loss_clip": 1.01464248, "balance_loss_mlp": 1.01661062, "epoch": 0.9126108522471066, "flos": 17747852469120.0, "grad_norm": 2.437532408813133, "language_loss": 0.80170488, "learning_rate": 7.952458331306711e-08, "loss": 0.82259154, "num_input_tokens_seen": 327453915, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3515625, "step": 15179, "time_per_iteration": 2.360410213470459 }, { "auxiliary_loss_clip": 0.01049871, "auxiliary_loss_mlp": 0.01038198, "balance_loss_clip": 1.01557231, "balance_loss_mlp": 1.01540947, "epoch": 0.9126709754997745, "flos": 27634522233600.0, "grad_norm": 1.6111438795304511, "language_loss": 0.68785906, "learning_rate": 7.941588836924507e-08, "loss": 0.7087397, "num_input_tokens_seen": 327474415, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34570312, "step": 15180, "time_per_iteration": 2.4125351905822754 }, { "auxiliary_loss_clip": 0.01049377, "auxiliary_loss_mlp": 0.01030123, "balance_loss_clip": 1.00866485, "balance_loss_mlp": 1.01473534, "epoch": 0.9127310987524425, "flos": 15924059533440.0, "grad_norm": 1.6327314023138948, "language_loss": 0.76227164, "learning_rate": 7.930726625416495e-08, "loss": 0.78306663, "num_input_tokens_seen": 327492750, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34570312, "step": 15181, "time_per_iteration": 2.371516466140747 }, { "auxiliary_loss_clip": 0.01053823, "auxiliary_loss_mlp": 0.01035511, "balance_loss_clip": 1.01231337, "balance_loss_mlp": 1.01684618, "epoch": 0.9127912220051104, "flos": 21535998940800.0, "grad_norm": 2.0867148906733304, "language_loss": 0.75588834, "learning_rate": 7.919871697194614e-08, "loss": 0.77678168, "num_input_tokens_seen": 327509470, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36914062, "step": 15182, "time_per_iteration": 2.3774423599243164 }, { "auxiliary_loss_clip": 0.01052619, "auxiliary_loss_mlp": 0.01041862, "balance_loss_clip": 1.01722169, "balance_loss_mlp": 1.01571989, "epoch": 0.9128513452577784, "flos": 24062348632320.0, "grad_norm": 1.585485205354692, "language_loss": 0.76992965, "learning_rate": 7.909024052670421e-08, "loss": 0.79087448, "num_input_tokens_seen": 327530520, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36914062, "step": 15183, "time_per_iteration": 2.4083359241485596 }, { "auxiliary_loss_clip": 0.01052591, "auxiliary_loss_mlp": 0.01038384, "balance_loss_clip": 1.0150075, "balance_loss_mlp": 1.01598811, "epoch": 0.9129114685104465, "flos": 16215549408000.0, "grad_norm": 2.4961495135799683, "language_loss": 0.77035403, "learning_rate": 7.898183692255256e-08, "loss": 0.7912637, "num_input_tokens_seen": 327546960, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36523438, "step": 15184, "time_per_iteration": 3.7914140224456787 }, { "auxiliary_loss_clip": 0.01051189, "auxiliary_loss_mlp": 0.01036312, "balance_loss_clip": 1.01348376, "balance_loss_mlp": 1.01563799, "epoch": 0.9129715917631144, "flos": 19383533665920.0, "grad_norm": 1.6801316906135204, "language_loss": 0.75811076, "learning_rate": 7.887350616360233e-08, "loss": 0.7789858, "num_input_tokens_seen": 327564830, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35546875, "step": 15185, "time_per_iteration": 2.3593344688415527 }, { "auxiliary_loss_clip": 0.01050945, "auxiliary_loss_mlp": 0.01037409, "balance_loss_clip": 1.01506889, "balance_loss_mlp": 1.01619816, "epoch": 0.9130317150157824, "flos": 20589538360320.0, "grad_norm": 1.9150249217978865, "language_loss": 0.70266652, "learning_rate": 7.876524825396158e-08, "loss": 0.72355008, "num_input_tokens_seen": 327583675, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34765625, "step": 15186, "time_per_iteration": 2.378145694732666 }, { "auxiliary_loss_clip": 0.01054528, "auxiliary_loss_mlp": 0.01036944, "balance_loss_clip": 1.01167214, "balance_loss_mlp": 1.01639903, "epoch": 0.9130918382684503, "flos": 20188316482560.0, "grad_norm": 1.884954953161776, "language_loss": 0.78709406, "learning_rate": 7.865706319773502e-08, "loss": 0.80800873, "num_input_tokens_seen": 327602280, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3828125, "step": 15187, "time_per_iteration": 2.369201898574829 }, { "auxiliary_loss_clip": 0.01052188, "auxiliary_loss_mlp": 0.01037147, "balance_loss_clip": 1.01523638, "balance_loss_mlp": 1.01607633, "epoch": 0.9131519615211183, "flos": 25555688749440.0, "grad_norm": 2.149920333985983, "language_loss": 0.67276782, "learning_rate": 7.854895099902515e-08, "loss": 0.69366121, "num_input_tokens_seen": 327623515, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.36132812, "step": 15188, "time_per_iteration": 2.4424679279327393 }, { "auxiliary_loss_clip": 0.01049902, "auxiliary_loss_mlp": 0.01035337, "balance_loss_clip": 1.01292562, "balance_loss_mlp": 1.01496696, "epoch": 0.9132120847737862, "flos": 17930587858560.0, "grad_norm": 2.4533419266181644, "language_loss": 0.77551329, "learning_rate": 7.844091166193157e-08, "loss": 0.79636562, "num_input_tokens_seen": 327642875, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34765625, "step": 15189, "time_per_iteration": 2.3504080772399902 }, { "auxiliary_loss_clip": 0.01049254, "auxiliary_loss_mlp": 0.01036841, "balance_loss_clip": 1.01546693, "balance_loss_mlp": 1.01560271, "epoch": 0.9132722080264543, "flos": 20046603807360.0, "grad_norm": 1.8411083033954903, "language_loss": 0.76709616, "learning_rate": 7.8332945190551e-08, "loss": 0.78795713, "num_input_tokens_seen": 327662450, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.3359375, "step": 15190, "time_per_iteration": 2.368091583251953 }, { "auxiliary_loss_clip": 0.01007135, "auxiliary_loss_mlp": 0.01002809, "balance_loss_clip": 1.00072265, "balance_loss_mlp": 1.00053132, "epoch": 0.9133323312791222, "flos": 70436361081600.0, "grad_norm": 0.7198672397282799, "language_loss": 0.57463574, "learning_rate": 7.822505158897797e-08, "loss": 0.59473515, "num_input_tokens_seen": 327723845, "router_z_loss_clip": 0.02087402, "router_z_loss_mlp": 0.06640625, "step": 15191, "time_per_iteration": 3.077641487121582 }, { "auxiliary_loss_clip": 0.01052443, "auxiliary_loss_mlp": 0.01037148, "balance_loss_clip": 1.01366425, "balance_loss_mlp": 1.0161252, "epoch": 0.9133924545317902, "flos": 25482615540480.0, "grad_norm": 2.029064650429031, "language_loss": 0.75264323, "learning_rate": 7.81172308613034e-08, "loss": 0.77353913, "num_input_tokens_seen": 327742590, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 15192, "time_per_iteration": 2.4041147232055664 }, { "auxiliary_loss_clip": 0.0105045, "auxiliary_loss_mlp": 0.01034089, "balance_loss_clip": 1.01177287, "balance_loss_mlp": 1.01568532, "epoch": 0.9134525777844581, "flos": 39929151669120.0, "grad_norm": 2.020019268129828, "language_loss": 0.70341885, "learning_rate": 7.800948301161647e-08, "loss": 0.72426426, "num_input_tokens_seen": 327764350, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34765625, "step": 15193, "time_per_iteration": 2.52298903465271 }, { "auxiliary_loss_clip": 0.01050117, "auxiliary_loss_mlp": 0.01033929, "balance_loss_clip": 1.01270962, "balance_loss_mlp": 1.01600301, "epoch": 0.9135127010371261, "flos": 20885671445760.0, "grad_norm": 1.564126717186315, "language_loss": 0.74198246, "learning_rate": 7.790180804400215e-08, "loss": 0.76282299, "num_input_tokens_seen": 327783120, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.33984375, "step": 15194, "time_per_iteration": 2.3826138973236084 }, { "auxiliary_loss_clip": 0.01051937, "auxiliary_loss_mlp": 0.01035979, "balance_loss_clip": 1.01207721, "balance_loss_mlp": 1.01544845, "epoch": 0.913572824289794, "flos": 20812214211840.0, "grad_norm": 2.924128941778097, "language_loss": 0.63117397, "learning_rate": 7.779420596254383e-08, "loss": 0.65205312, "num_input_tokens_seen": 327801960, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36328125, "step": 15195, "time_per_iteration": 2.375117063522339 }, { "auxiliary_loss_clip": 0.01053345, "auxiliary_loss_mlp": 0.01038205, "balance_loss_clip": 1.01517391, "balance_loss_mlp": 1.01732695, "epoch": 0.913632947542462, "flos": 25702079546880.0, "grad_norm": 1.9893008058260677, "language_loss": 0.72085899, "learning_rate": 7.768667677132201e-08, "loss": 0.74177444, "num_input_tokens_seen": 327823795, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.359375, "step": 15196, "time_per_iteration": 2.4274652004241943 }, { "auxiliary_loss_clip": 0.01051555, "auxiliary_loss_mlp": 0.01033618, "balance_loss_clip": 1.01284027, "balance_loss_mlp": 1.01653194, "epoch": 0.9136930707951301, "flos": 26285024384640.0, "grad_norm": 1.7181158208166285, "language_loss": 0.72008133, "learning_rate": 7.757922047441411e-08, "loss": 0.74093306, "num_input_tokens_seen": 327845175, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.34960938, "step": 15197, "time_per_iteration": 2.4101450443267822 }, { "auxiliary_loss_clip": 0.01052076, "auxiliary_loss_mlp": 0.01035226, "balance_loss_clip": 1.01174212, "balance_loss_mlp": 1.01566434, "epoch": 0.913753194047798, "flos": 22090768444800.0, "grad_norm": 1.7938382298687943, "language_loss": 0.79273593, "learning_rate": 7.747183707589489e-08, "loss": 0.81360888, "num_input_tokens_seen": 327863150, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 15198, "time_per_iteration": 2.3840551376342773 }, { "auxiliary_loss_clip": 0.0105007, "auxiliary_loss_mlp": 0.0104055, "balance_loss_clip": 1.0183053, "balance_loss_mlp": 1.01494634, "epoch": 0.913813317300466, "flos": 23586063598080.0, "grad_norm": 1.365875570503282, "language_loss": 0.68353367, "learning_rate": 7.736452657983616e-08, "loss": 0.70443988, "num_input_tokens_seen": 327883445, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3515625, "step": 15199, "time_per_iteration": 2.4282310009002686 }, { "auxiliary_loss_clip": 0.01050759, "auxiliary_loss_mlp": 0.01043257, "balance_loss_clip": 1.02054739, "balance_loss_mlp": 1.01576662, "epoch": 0.9138734405531339, "flos": 28875195774720.0, "grad_norm": 1.6045577719599164, "language_loss": 0.68226153, "learning_rate": 7.725728899030714e-08, "loss": 0.70320165, "num_input_tokens_seen": 327905745, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.3515625, "step": 15200, "time_per_iteration": 2.4239394664764404 }, { "auxiliary_loss_clip": 0.01050624, "auxiliary_loss_mlp": 0.01036924, "balance_loss_clip": 1.01489472, "balance_loss_mlp": 1.01640856, "epoch": 0.9139335638058019, "flos": 22819964434560.0, "grad_norm": 1.5719883871579239, "language_loss": 0.72594863, "learning_rate": 7.715012431137435e-08, "loss": 0.74682415, "num_input_tokens_seen": 327925435, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34179688, "step": 15201, "time_per_iteration": 3.627408742904663 }, { "auxiliary_loss_clip": 0.01049404, "auxiliary_loss_mlp": 0.01030476, "balance_loss_clip": 1.0098412, "balance_loss_mlp": 1.01489699, "epoch": 0.9139936870584698, "flos": 18003207219840.0, "grad_norm": 1.8958109549469233, "language_loss": 0.71828485, "learning_rate": 7.704303254710165e-08, "loss": 0.73908365, "num_input_tokens_seen": 327944145, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.34570312, "step": 15202, "time_per_iteration": 2.42482852935791 }, { "auxiliary_loss_clip": 0.01050204, "auxiliary_loss_mlp": 0.01034243, "balance_loss_clip": 1.01159358, "balance_loss_mlp": 1.0150454, "epoch": 0.9140538103111379, "flos": 15812896164480.0, "grad_norm": 2.131529127815075, "language_loss": 0.6789434, "learning_rate": 7.693601370155001e-08, "loss": 0.69978786, "num_input_tokens_seen": 327960565, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 15203, "time_per_iteration": 2.3323166370391846 }, { "auxiliary_loss_clip": 0.01051064, "auxiliary_loss_mlp": 0.01036708, "balance_loss_clip": 1.01229429, "balance_loss_mlp": 1.01593053, "epoch": 0.9141139335638058, "flos": 23985923932800.0, "grad_norm": 1.5934040871832942, "language_loss": 0.69665039, "learning_rate": 7.682906777877751e-08, "loss": 0.7175281, "num_input_tokens_seen": 327981180, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.3515625, "step": 15204, "time_per_iteration": 2.416658878326416 }, { "auxiliary_loss_clip": 0.01050976, "auxiliary_loss_mlp": 0.01037249, "balance_loss_clip": 1.01333535, "balance_loss_mlp": 1.01486599, "epoch": 0.9141740568164738, "flos": 24023280954240.0, "grad_norm": 2.9163116236593307, "language_loss": 0.60821784, "learning_rate": 7.672219478283915e-08, "loss": 0.62910008, "num_input_tokens_seen": 328001500, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36132812, "step": 15205, "time_per_iteration": 2.3857386112213135 }, { "auxiliary_loss_clip": 0.01049477, "auxiliary_loss_mlp": 0.01034281, "balance_loss_clip": 1.0114764, "balance_loss_mlp": 1.01602638, "epoch": 0.9142341800691417, "flos": 27017013283200.0, "grad_norm": 1.6268422028815976, "language_loss": 0.82466835, "learning_rate": 7.661539471778811e-08, "loss": 0.84550595, "num_input_tokens_seen": 328023025, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.33398438, "step": 15206, "time_per_iteration": 2.428603410720825 }, { "auxiliary_loss_clip": 0.01051204, "auxiliary_loss_mlp": 0.01033587, "balance_loss_clip": 1.01071072, "balance_loss_mlp": 1.01506901, "epoch": 0.9142943033218097, "flos": 20411446181760.0, "grad_norm": 2.1544804636293877, "language_loss": 0.74901009, "learning_rate": 7.650866758767382e-08, "loss": 0.769858, "num_input_tokens_seen": 328041410, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36132812, "step": 15207, "time_per_iteration": 2.377981185913086 }, { "auxiliary_loss_clip": 0.01050884, "auxiliary_loss_mlp": 0.01040649, "balance_loss_clip": 1.01591325, "balance_loss_mlp": 1.0152483, "epoch": 0.9143544265744776, "flos": 19754310971520.0, "grad_norm": 2.804695002610808, "language_loss": 0.74172431, "learning_rate": 7.640201339654373e-08, "loss": 0.76263964, "num_input_tokens_seen": 328060495, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.35546875, "step": 15208, "time_per_iteration": 2.5837960243225098 }, { "auxiliary_loss_clip": 0.01049276, "auxiliary_loss_mlp": 0.01031911, "balance_loss_clip": 1.01138341, "balance_loss_mlp": 1.01498711, "epoch": 0.9144145498271457, "flos": 17164488695040.0, "grad_norm": 2.2342058106230245, "language_loss": 0.8734349, "learning_rate": 7.629543214844237e-08, "loss": 0.89424682, "num_input_tokens_seen": 328076905, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.34375, "step": 15209, "time_per_iteration": 2.397447109222412 }, { "auxiliary_loss_clip": 0.01050908, "auxiliary_loss_mlp": 0.01037313, "balance_loss_clip": 1.01589072, "balance_loss_mlp": 1.01581681, "epoch": 0.9144746730798137, "flos": 23725123009920.0, "grad_norm": 1.6114912720708439, "language_loss": 0.76500767, "learning_rate": 7.618892384741093e-08, "loss": 0.78588992, "num_input_tokens_seen": 328096960, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.3515625, "step": 15210, "time_per_iteration": 2.4308536052703857 }, { "auxiliary_loss_clip": 0.01050506, "auxiliary_loss_mlp": 0.01035738, "balance_loss_clip": 1.01306462, "balance_loss_mlp": 1.01527405, "epoch": 0.9145347963324816, "flos": 25846689864960.0, "grad_norm": 1.9211612723485298, "language_loss": 0.78979707, "learning_rate": 7.6082488497488e-08, "loss": 0.81065953, "num_input_tokens_seen": 328115445, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 15211, "time_per_iteration": 2.386380434036255 }, { "auxiliary_loss_clip": 0.01053148, "auxiliary_loss_mlp": 0.01039222, "balance_loss_clip": 1.01646519, "balance_loss_mlp": 1.01746535, "epoch": 0.9145949195851496, "flos": 19241820990720.0, "grad_norm": 1.668432992085951, "language_loss": 0.84114712, "learning_rate": 7.597612610270986e-08, "loss": 0.8620708, "num_input_tokens_seen": 328133965, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35742188, "step": 15212, "time_per_iteration": 5.1538825035095215 }, { "auxiliary_loss_clip": 0.01050141, "auxiliary_loss_mlp": 0.01033711, "balance_loss_clip": 1.01174104, "balance_loss_mlp": 1.01577914, "epoch": 0.9146550428378175, "flos": 18295395321600.0, "grad_norm": 1.9530934591538154, "language_loss": 0.84795952, "learning_rate": 7.586983666711022e-08, "loss": 0.86879802, "num_input_tokens_seen": 328151520, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34375, "step": 15213, "time_per_iteration": 2.386683225631714 }, { "auxiliary_loss_clip": 0.01051177, "auxiliary_loss_mlp": 0.01036743, "balance_loss_clip": 1.01428366, "balance_loss_mlp": 1.0159049, "epoch": 0.9147151660904855, "flos": 20083227690240.0, "grad_norm": 1.832691632277064, "language_loss": 0.72772276, "learning_rate": 7.576362019471894e-08, "loss": 0.74860203, "num_input_tokens_seen": 328171275, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35351562, "step": 15214, "time_per_iteration": 2.3498005867004395 }, { "auxiliary_loss_clip": 0.01054464, "auxiliary_loss_mlp": 0.01040127, "balance_loss_clip": 1.01593983, "balance_loss_mlp": 1.01735473, "epoch": 0.9147752893431534, "flos": 24387983683200.0, "grad_norm": 1.8572626958974865, "language_loss": 0.64050257, "learning_rate": 7.565747668956413e-08, "loss": 0.66144848, "num_input_tokens_seen": 328192115, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.37109375, "step": 15215, "time_per_iteration": 2.390673875808716 }, { "auxiliary_loss_clip": 0.01056333, "auxiliary_loss_mlp": 0.01041242, "balance_loss_clip": 1.01688719, "balance_loss_mlp": 1.01797485, "epoch": 0.9148354125958215, "flos": 18149423460480.0, "grad_norm": 2.1065076185557503, "language_loss": 0.7841273, "learning_rate": 7.555140615567058e-08, "loss": 0.80510306, "num_input_tokens_seen": 328208990, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.3828125, "step": 15216, "time_per_iteration": 2.35434627532959 }, { "auxiliary_loss_clip": 0.01051497, "auxiliary_loss_mlp": 0.01044434, "balance_loss_clip": 1.01798153, "balance_loss_mlp": 1.01592231, "epoch": 0.9148955358484894, "flos": 23366425034880.0, "grad_norm": 2.2740874913123807, "language_loss": 0.69148338, "learning_rate": 7.544540859706062e-08, "loss": 0.71244276, "num_input_tokens_seen": 328227840, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.35546875, "step": 15217, "time_per_iteration": 2.3872904777526855 }, { "auxiliary_loss_clip": 0.01050788, "auxiliary_loss_mlp": 0.01034194, "balance_loss_clip": 1.01315355, "balance_loss_mlp": 1.016114, "epoch": 0.9149556591011574, "flos": 18075547290240.0, "grad_norm": 1.8591707201129317, "language_loss": 0.81166273, "learning_rate": 7.533948401775347e-08, "loss": 0.8325125, "num_input_tokens_seen": 328246250, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34765625, "step": 15218, "time_per_iteration": 2.357994318008423 }, { "auxiliary_loss_clip": 0.01006657, "auxiliary_loss_mlp": 0.01002761, "balance_loss_clip": 1.00063956, "balance_loss_mlp": 1.00031471, "epoch": 0.9150157823538253, "flos": 54583733923200.0, "grad_norm": 0.8456353491287902, "language_loss": 0.5937959, "learning_rate": 7.523363242176595e-08, "loss": 0.61389005, "num_input_tokens_seen": 328303625, "router_z_loss_clip": 0.02124023, "router_z_loss_mlp": 0.06347656, "step": 15219, "time_per_iteration": 2.9859468936920166 }, { "auxiliary_loss_clip": 0.01049527, "auxiliary_loss_mlp": 0.01037671, "balance_loss_clip": 1.01516461, "balance_loss_mlp": 1.01515436, "epoch": 0.9150759056064933, "flos": 17892183496320.0, "grad_norm": 1.6954069829374863, "language_loss": 0.79407543, "learning_rate": 7.512785381311216e-08, "loss": 0.81494743, "num_input_tokens_seen": 328322135, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34375, "step": 15220, "time_per_iteration": 2.3560376167297363 }, { "auxiliary_loss_clip": 0.01052588, "auxiliary_loss_mlp": 0.01035932, "balance_loss_clip": 1.01142287, "balance_loss_mlp": 1.01484036, "epoch": 0.9151360288591612, "flos": 18072649647360.0, "grad_norm": 1.8189813266972759, "language_loss": 0.66901416, "learning_rate": 7.50221481958031e-08, "loss": 0.68989933, "num_input_tokens_seen": 328340750, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.37890625, "step": 15221, "time_per_iteration": 2.337916612625122 }, { "auxiliary_loss_clip": 0.01051563, "auxiliary_loss_mlp": 0.01034797, "balance_loss_clip": 1.01229, "balance_loss_mlp": 1.01568484, "epoch": 0.9151961521118293, "flos": 19353508030080.0, "grad_norm": 1.7991319986858936, "language_loss": 0.86449575, "learning_rate": 7.491651557384692e-08, "loss": 0.88535929, "num_input_tokens_seen": 328359995, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.359375, "step": 15222, "time_per_iteration": 2.3781981468200684 }, { "auxiliary_loss_clip": 0.01007402, "auxiliary_loss_mlp": 0.0100438, "balance_loss_clip": 1.00242484, "balance_loss_mlp": 1.00101304, "epoch": 0.9152562753644973, "flos": 72143195362560.0, "grad_norm": 0.7258642535300226, "language_loss": 0.49734455, "learning_rate": 7.481095595124953e-08, "loss": 0.51746237, "num_input_tokens_seen": 328426865, "router_z_loss_clip": 0.01953125, "router_z_loss_mlp": 0.06347656, "step": 15223, "time_per_iteration": 3.0696029663085938 }, { "auxiliary_loss_clip": 0.01054618, "auxiliary_loss_mlp": 0.01039849, "balance_loss_clip": 1.01646042, "balance_loss_mlp": 1.01729918, "epoch": 0.9153163986171652, "flos": 20775974353920.0, "grad_norm": 1.826383762200019, "language_loss": 0.73337221, "learning_rate": 7.470546933201349e-08, "loss": 0.75431681, "num_input_tokens_seen": 328445970, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.37304688, "step": 15224, "time_per_iteration": 3.815645456314087 }, { "auxiliary_loss_clip": 0.01052067, "auxiliary_loss_mlp": 0.01035291, "balance_loss_clip": 1.01234317, "balance_loss_mlp": 1.01651692, "epoch": 0.9153765218698332, "flos": 23038974593280.0, "grad_norm": 3.3809896540328848, "language_loss": 0.83071446, "learning_rate": 7.460005572013895e-08, "loss": 0.85158807, "num_input_tokens_seen": 328464585, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35546875, "step": 15225, "time_per_iteration": 2.4253525733947754 }, { "auxiliary_loss_clip": 0.01050531, "auxiliary_loss_mlp": 0.01034234, "balance_loss_clip": 1.01159596, "balance_loss_mlp": 1.0151, "epoch": 0.9154366451225011, "flos": 28989501166080.0, "grad_norm": 1.3491568782583614, "language_loss": 0.7190702, "learning_rate": 7.44947151196238e-08, "loss": 0.73991787, "num_input_tokens_seen": 328490155, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35546875, "step": 15226, "time_per_iteration": 2.4518628120422363 }, { "auxiliary_loss_clip": 0.01052552, "auxiliary_loss_mlp": 0.01036446, "balance_loss_clip": 1.01254487, "balance_loss_mlp": 1.01615882, "epoch": 0.9154967683751691, "flos": 22308417060480.0, "grad_norm": 1.8657266410321558, "language_loss": 0.76493901, "learning_rate": 7.43894475344613e-08, "loss": 0.78582895, "num_input_tokens_seen": 328508275, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36328125, "step": 15227, "time_per_iteration": 2.4098002910614014 }, { "auxiliary_loss_clip": 0.01050591, "auxiliary_loss_mlp": 0.01031889, "balance_loss_clip": 1.01100397, "balance_loss_mlp": 1.01583457, "epoch": 0.915556891627837, "flos": 24570335047680.0, "grad_norm": 1.421226217442011, "language_loss": 0.75421226, "learning_rate": 7.428425296864404e-08, "loss": 0.77503705, "num_input_tokens_seen": 328529425, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34765625, "step": 15228, "time_per_iteration": 2.426741123199463 }, { "auxiliary_loss_clip": 0.01049237, "auxiliary_loss_mlp": 0.01040621, "balance_loss_clip": 1.01894891, "balance_loss_mlp": 1.01476216, "epoch": 0.9156170148805051, "flos": 22163562362880.0, "grad_norm": 1.5871193047487895, "language_loss": 0.72332346, "learning_rate": 7.417913142616106e-08, "loss": 0.74422204, "num_input_tokens_seen": 328550200, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34375, "step": 15229, "time_per_iteration": 2.380918502807617 }, { "auxiliary_loss_clip": 0.01052488, "auxiliary_loss_mlp": 0.01045351, "balance_loss_clip": 1.02143717, "balance_loss_mlp": 1.01664889, "epoch": 0.915677138133173, "flos": 20919676976640.0, "grad_norm": 1.5957277977597093, "language_loss": 0.83790684, "learning_rate": 7.407408291099848e-08, "loss": 0.85888523, "num_input_tokens_seen": 328568540, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.359375, "step": 15230, "time_per_iteration": 2.380831480026245 }, { "auxiliary_loss_clip": 0.01049861, "auxiliary_loss_mlp": 0.01037244, "balance_loss_clip": 1.01603651, "balance_loss_mlp": 1.01547742, "epoch": 0.915737261385841, "flos": 24344202971520.0, "grad_norm": 1.9596635675126788, "language_loss": 0.84972274, "learning_rate": 7.396910742713957e-08, "loss": 0.87059379, "num_input_tokens_seen": 328587300, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34375, "step": 15231, "time_per_iteration": 2.421139717102051 }, { "auxiliary_loss_clip": 0.01049364, "auxiliary_loss_mlp": 0.01031898, "balance_loss_clip": 1.01040506, "balance_loss_mlp": 1.0153749, "epoch": 0.9157973846385089, "flos": 26760157344000.0, "grad_norm": 1.6235288418079081, "language_loss": 0.73563945, "learning_rate": 7.386420497856516e-08, "loss": 0.75645214, "num_input_tokens_seen": 328610055, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.33984375, "step": 15232, "time_per_iteration": 2.420163631439209 }, { "auxiliary_loss_clip": 0.01051726, "auxiliary_loss_mlp": 0.01036894, "balance_loss_clip": 1.01327884, "balance_loss_mlp": 1.01540422, "epoch": 0.9158575078911769, "flos": 18477746686080.0, "grad_norm": 2.34912453097201, "language_loss": 0.686607, "learning_rate": 7.375937556925338e-08, "loss": 0.70749319, "num_input_tokens_seen": 328626815, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36328125, "step": 15233, "time_per_iteration": 2.3632662296295166 }, { "auxiliary_loss_clip": 0.01051381, "auxiliary_loss_mlp": 0.01039779, "balance_loss_clip": 1.01736748, "balance_loss_mlp": 1.01618958, "epoch": 0.9159176311438448, "flos": 21797847204480.0, "grad_norm": 9.56671876820394, "language_loss": 0.70367324, "learning_rate": 7.365461920317861e-08, "loss": 0.72458494, "num_input_tokens_seen": 328643995, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3515625, "step": 15234, "time_per_iteration": 2.3705618381500244 }, { "auxiliary_loss_clip": 0.01053195, "auxiliary_loss_mlp": 0.01040999, "balance_loss_clip": 1.01662087, "balance_loss_mlp": 1.01671362, "epoch": 0.9159777543965129, "flos": 24782781870720.0, "grad_norm": 1.6492032911816235, "language_loss": 0.88812053, "learning_rate": 7.354993588431391e-08, "loss": 0.9090625, "num_input_tokens_seen": 328659565, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36523438, "step": 15235, "time_per_iteration": 2.4041740894317627 }, { "auxiliary_loss_clip": 0.01051611, "auxiliary_loss_mlp": 0.0103775, "balance_loss_clip": 1.01433778, "balance_loss_mlp": 1.01566744, "epoch": 0.9160378776491809, "flos": 26867585197440.0, "grad_norm": 1.8099074499446117, "language_loss": 0.78750122, "learning_rate": 7.344532561662853e-08, "loss": 0.80839479, "num_input_tokens_seen": 328679045, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.359375, "step": 15236, "time_per_iteration": 2.4411048889160156 }, { "auxiliary_loss_clip": 0.01007534, "auxiliary_loss_mlp": 0.01003718, "balance_loss_clip": 1.00156009, "balance_loss_mlp": 1.00109458, "epoch": 0.9160980009018488, "flos": 70574827000320.0, "grad_norm": 0.6775751180957379, "language_loss": 0.62298858, "learning_rate": 7.334078840409019e-08, "loss": 0.6431011, "num_input_tokens_seen": 328744565, "router_z_loss_clip": 0.02160645, "router_z_loss_mlp": 0.06445312, "step": 15237, "time_per_iteration": 2.9446444511413574 }, { "auxiliary_loss_clip": 0.01051716, "auxiliary_loss_mlp": 0.01033816, "balance_loss_clip": 1.01066566, "balance_loss_mlp": 1.01535881, "epoch": 0.9161581241545168, "flos": 16288413148800.0, "grad_norm": 2.1698597783131746, "language_loss": 0.75677109, "learning_rate": 7.323632425066151e-08, "loss": 0.7776264, "num_input_tokens_seen": 328762455, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36328125, "step": 15238, "time_per_iteration": 2.394984483718872 }, { "auxiliary_loss_clip": 0.01052287, "auxiliary_loss_mlp": 0.01037394, "balance_loss_clip": 1.01320624, "balance_loss_mlp": 1.01576185, "epoch": 0.9162182474071847, "flos": 18437282553600.0, "grad_norm": 1.6658473238841707, "language_loss": 0.75590789, "learning_rate": 7.313193316030464e-08, "loss": 0.77680469, "num_input_tokens_seen": 328780320, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36523438, "step": 15239, "time_per_iteration": 2.3507447242736816 }, { "auxiliary_loss_clip": 0.01051963, "auxiliary_loss_mlp": 0.01040387, "balance_loss_clip": 1.01726031, "balance_loss_mlp": 1.01564741, "epoch": 0.9162783706598527, "flos": 19166373809280.0, "grad_norm": 1.9588296428558092, "language_loss": 0.64756405, "learning_rate": 7.302761513697819e-08, "loss": 0.66848755, "num_input_tokens_seen": 328797570, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.36328125, "step": 15240, "time_per_iteration": 2.4080207347869873 }, { "auxiliary_loss_clip": 0.01050789, "auxiliary_loss_mlp": 0.0103382, "balance_loss_clip": 1.01163507, "balance_loss_mlp": 1.01689959, "epoch": 0.9163384939125206, "flos": 20411934940800.0, "grad_norm": 2.416902241058621, "language_loss": 0.7738328, "learning_rate": 7.292337018463746e-08, "loss": 0.79467893, "num_input_tokens_seen": 328814075, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.33984375, "step": 15241, "time_per_iteration": 3.6248972415924072 }, { "auxiliary_loss_clip": 0.01056223, "auxiliary_loss_mlp": 0.01041032, "balance_loss_clip": 1.01368499, "balance_loss_mlp": 1.01687098, "epoch": 0.9163986171651887, "flos": 19644893170560.0, "grad_norm": 2.7320828903678223, "language_loss": 0.69351453, "learning_rate": 7.281919830723549e-08, "loss": 0.71448708, "num_input_tokens_seen": 328831990, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.39453125, "step": 15242, "time_per_iteration": 2.4296491146087646 }, { "auxiliary_loss_clip": 0.01052794, "auxiliary_loss_mlp": 0.01037738, "balance_loss_clip": 1.01345515, "balance_loss_mlp": 1.01592207, "epoch": 0.9164587404178566, "flos": 12822236035200.0, "grad_norm": 1.9571624116420188, "language_loss": 0.82278931, "learning_rate": 7.271509950872334e-08, "loss": 0.84369469, "num_input_tokens_seen": 328849105, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36914062, "step": 15243, "time_per_iteration": 2.3497073650360107 }, { "auxiliary_loss_clip": 0.01052462, "auxiliary_loss_mlp": 0.01040226, "balance_loss_clip": 1.01560926, "balance_loss_mlp": 1.01554632, "epoch": 0.9165188636705246, "flos": 22308312326400.0, "grad_norm": 1.983480697879043, "language_loss": 0.82979286, "learning_rate": 7.261107379304721e-08, "loss": 0.85071975, "num_input_tokens_seen": 328866810, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36914062, "step": 15244, "time_per_iteration": 2.3992433547973633 }, { "auxiliary_loss_clip": 0.01054707, "auxiliary_loss_mlp": 0.01042457, "balance_loss_clip": 1.01655269, "balance_loss_mlp": 1.01656973, "epoch": 0.9165789869231925, "flos": 18222357024000.0, "grad_norm": 2.3146243430817988, "language_loss": 0.74770367, "learning_rate": 7.250712116415214e-08, "loss": 0.76867533, "num_input_tokens_seen": 328885325, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.3828125, "step": 15245, "time_per_iteration": 2.389514446258545 }, { "auxiliary_loss_clip": 0.01051012, "auxiliary_loss_mlp": 0.01032722, "balance_loss_clip": 1.01047707, "balance_loss_mlp": 1.01604795, "epoch": 0.9166391101758605, "flos": 13690910373120.0, "grad_norm": 1.7886818904393458, "language_loss": 0.75658083, "learning_rate": 7.240324162598033e-08, "loss": 0.7774182, "num_input_tokens_seen": 328902655, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34960938, "step": 15246, "time_per_iteration": 2.3617959022521973 }, { "auxiliary_loss_clip": 0.01051738, "auxiliary_loss_mlp": 0.01037902, "balance_loss_clip": 1.01314235, "balance_loss_mlp": 1.01686502, "epoch": 0.9166992334285284, "flos": 17345862541440.0, "grad_norm": 4.110411356430959, "language_loss": 0.76816154, "learning_rate": 7.229943518247106e-08, "loss": 0.78905797, "num_input_tokens_seen": 328918440, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.34765625, "step": 15247, "time_per_iteration": 2.350221872329712 }, { "auxiliary_loss_clip": 0.01052731, "auxiliary_loss_mlp": 0.01038264, "balance_loss_clip": 1.01517284, "balance_loss_mlp": 1.01673031, "epoch": 0.9167593566811965, "flos": 23730045511680.0, "grad_norm": 3.2974786381732875, "language_loss": 0.77358812, "learning_rate": 7.219570183756052e-08, "loss": 0.79449809, "num_input_tokens_seen": 328938055, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.359375, "step": 15248, "time_per_iteration": 2.421276807785034 }, { "auxiliary_loss_clip": 0.01052699, "auxiliary_loss_mlp": 0.01035474, "balance_loss_clip": 1.01071393, "balance_loss_mlp": 1.01629686, "epoch": 0.9168194799338644, "flos": 27816978332160.0, "grad_norm": 4.805014955445688, "language_loss": 0.74477547, "learning_rate": 7.209204159518178e-08, "loss": 0.76565719, "num_input_tokens_seen": 328957895, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36328125, "step": 15249, "time_per_iteration": 2.42830753326416 }, { "auxiliary_loss_clip": 0.01050866, "auxiliary_loss_mlp": 0.01037057, "balance_loss_clip": 1.01366854, "balance_loss_mlp": 1.01569724, "epoch": 0.9168796031865324, "flos": 21716709471360.0, "grad_norm": 2.38580984320093, "language_loss": 0.77715647, "learning_rate": 7.198845445926616e-08, "loss": 0.79803574, "num_input_tokens_seen": 328971365, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3515625, "step": 15250, "time_per_iteration": 2.358096122741699 }, { "auxiliary_loss_clip": 0.01051102, "auxiliary_loss_mlp": 0.01036464, "balance_loss_clip": 1.01181138, "balance_loss_mlp": 1.01518011, "epoch": 0.9169397264392004, "flos": 23403293297280.0, "grad_norm": 1.617364535357278, "language_loss": 0.76886821, "learning_rate": 7.188494043374138e-08, "loss": 0.78974384, "num_input_tokens_seen": 328990830, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.359375, "step": 15251, "time_per_iteration": 2.3793838024139404 }, { "auxiliary_loss_clip": 0.01052283, "auxiliary_loss_mlp": 0.01039576, "balance_loss_clip": 1.01478028, "balance_loss_mlp": 1.01561737, "epoch": 0.9169998496918683, "flos": 23949858631680.0, "grad_norm": 2.15419184980716, "language_loss": 0.82203096, "learning_rate": 7.178149952253298e-08, "loss": 0.84294951, "num_input_tokens_seen": 329008345, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3671875, "step": 15252, "time_per_iteration": 5.0567896366119385 }, { "auxiliary_loss_clip": 0.01050968, "auxiliary_loss_mlp": 0.01037371, "balance_loss_clip": 1.01441097, "balance_loss_mlp": 1.01545238, "epoch": 0.9170599729445363, "flos": 18331495534080.0, "grad_norm": 1.6743827170322876, "language_loss": 0.78220487, "learning_rate": 7.167813172956316e-08, "loss": 0.80308819, "num_input_tokens_seen": 329027440, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35546875, "step": 15253, "time_per_iteration": 2.3547821044921875 }, { "auxiliary_loss_clip": 0.0105337, "auxiliary_loss_mlp": 0.01037778, "balance_loss_clip": 1.01399589, "balance_loss_mlp": 1.01674914, "epoch": 0.9171200961972042, "flos": 22673748193920.0, "grad_norm": 1.8556419328893934, "language_loss": 0.74057031, "learning_rate": 7.157483705875256e-08, "loss": 0.76148188, "num_input_tokens_seen": 329046445, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36523438, "step": 15254, "time_per_iteration": 2.387122392654419 }, { "auxiliary_loss_clip": 0.01051319, "auxiliary_loss_mlp": 0.01034931, "balance_loss_clip": 1.01302075, "balance_loss_mlp": 1.01662636, "epoch": 0.9171802194498723, "flos": 26718226934400.0, "grad_norm": 1.5562298795790226, "language_loss": 0.80451298, "learning_rate": 7.14716155140167e-08, "loss": 0.82537544, "num_input_tokens_seen": 329065555, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34765625, "step": 15255, "time_per_iteration": 2.456204652786255 }, { "auxiliary_loss_clip": 0.01052281, "auxiliary_loss_mlp": 0.0104107, "balance_loss_clip": 1.01738334, "balance_loss_mlp": 1.0159657, "epoch": 0.9172403427025402, "flos": 37887710117760.0, "grad_norm": 2.112400669519704, "language_loss": 0.69488901, "learning_rate": 7.136846709927047e-08, "loss": 0.71582258, "num_input_tokens_seen": 329087515, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36328125, "step": 15256, "time_per_iteration": 2.514353036880493 }, { "auxiliary_loss_clip": 0.01050618, "auxiliary_loss_mlp": 0.01034937, "balance_loss_clip": 1.01268053, "balance_loss_mlp": 1.01577115, "epoch": 0.9173004659552082, "flos": 17054233021440.0, "grad_norm": 2.0625480365165347, "language_loss": 0.8482269, "learning_rate": 7.126539181842561e-08, "loss": 0.86908245, "num_input_tokens_seen": 329106820, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34765625, "step": 15257, "time_per_iteration": 2.358159065246582 }, { "auxiliary_loss_clip": 0.01049995, "auxiliary_loss_mlp": 0.01036067, "balance_loss_clip": 1.01596844, "balance_loss_mlp": 1.01624894, "epoch": 0.9173605892078761, "flos": 22200465536640.0, "grad_norm": 2.7633046242052486, "language_loss": 0.78248632, "learning_rate": 7.116238967539012e-08, "loss": 0.80334693, "num_input_tokens_seen": 329126515, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.33789062, "step": 15258, "time_per_iteration": 2.4609692096710205 }, { "auxiliary_loss_clip": 0.01052505, "auxiliary_loss_mlp": 0.01038175, "balance_loss_clip": 1.01436925, "balance_loss_mlp": 1.01708698, "epoch": 0.9174207124605441, "flos": 16506864725760.0, "grad_norm": 1.8132575453831952, "language_loss": 0.79936719, "learning_rate": 7.105946067406999e-08, "loss": 0.820274, "num_input_tokens_seen": 329142660, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.35546875, "step": 15259, "time_per_iteration": 2.435352087020874 }, { "auxiliary_loss_clip": 0.01050345, "auxiliary_loss_mlp": 0.01035377, "balance_loss_clip": 1.01418209, "balance_loss_mlp": 1.01537561, "epoch": 0.917480835713212, "flos": 24534444303360.0, "grad_norm": 1.8361949403427937, "language_loss": 0.7680226, "learning_rate": 7.095660481836895e-08, "loss": 0.78887987, "num_input_tokens_seen": 329162575, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34960938, "step": 15260, "time_per_iteration": 2.4690897464752197 }, { "auxiliary_loss_clip": 0.01049274, "auxiliary_loss_mlp": 0.01034533, "balance_loss_clip": 1.01364779, "balance_loss_mlp": 1.01529634, "epoch": 0.9175409589658801, "flos": 20879841248640.0, "grad_norm": 2.1492694629395843, "language_loss": 0.62133217, "learning_rate": 7.085382211218637e-08, "loss": 0.64217025, "num_input_tokens_seen": 329182090, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.33984375, "step": 15261, "time_per_iteration": 2.377065896987915 }, { "auxiliary_loss_clip": 0.01049834, "auxiliary_loss_mlp": 0.01034286, "balance_loss_clip": 1.01230359, "balance_loss_mlp": 1.01484394, "epoch": 0.917601082218548, "flos": 14275356399360.0, "grad_norm": 1.7926567536483071, "language_loss": 0.75138152, "learning_rate": 7.075111255942002e-08, "loss": 0.77222276, "num_input_tokens_seen": 329196535, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34960938, "step": 15262, "time_per_iteration": 2.337038278579712 }, { "auxiliary_loss_clip": 0.01053107, "auxiliary_loss_mlp": 0.01040831, "balance_loss_clip": 1.01650047, "balance_loss_mlp": 1.01602292, "epoch": 0.917661205471216, "flos": 19098223102080.0, "grad_norm": 1.7918400136884522, "language_loss": 0.79257345, "learning_rate": 7.064847616396496e-08, "loss": 0.8135128, "num_input_tokens_seen": 329215135, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.37109375, "step": 15263, "time_per_iteration": 2.3666024208068848 }, { "auxiliary_loss_clip": 0.0105357, "auxiliary_loss_mlp": 0.01034868, "balance_loss_clip": 1.011181, "balance_loss_mlp": 1.01638472, "epoch": 0.917721328723884, "flos": 21105484565760.0, "grad_norm": 2.5690240537562947, "language_loss": 0.77016926, "learning_rate": 7.054591292971324e-08, "loss": 0.79105365, "num_input_tokens_seen": 329235150, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37109375, "step": 15264, "time_per_iteration": 3.7960450649261475 }, { "auxiliary_loss_clip": 0.01051117, "auxiliary_loss_mlp": 0.01037939, "balance_loss_clip": 1.01404905, "balance_loss_mlp": 1.01524067, "epoch": 0.9177814519765519, "flos": 21942178231680.0, "grad_norm": 1.634163708861722, "language_loss": 0.83884037, "learning_rate": 7.044342286055394e-08, "loss": 0.8597309, "num_input_tokens_seen": 329254365, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.359375, "step": 15265, "time_per_iteration": 2.350616455078125 }, { "auxiliary_loss_clip": 0.01054383, "auxiliary_loss_mlp": 0.01043754, "balance_loss_clip": 1.01885152, "balance_loss_mlp": 1.01686275, "epoch": 0.9178415752292199, "flos": 24204864268800.0, "grad_norm": 1.7260164746603792, "language_loss": 0.73903036, "learning_rate": 7.034100596037306e-08, "loss": 0.76001179, "num_input_tokens_seen": 329274385, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.375, "step": 15266, "time_per_iteration": 2.392383575439453 }, { "auxiliary_loss_clip": 0.01051576, "auxiliary_loss_mlp": 0.01037125, "balance_loss_clip": 1.01467764, "balance_loss_mlp": 1.01566005, "epoch": 0.9179016984818879, "flos": 20041192546560.0, "grad_norm": 1.8065032559076106, "language_loss": 0.78267169, "learning_rate": 7.023866223305486e-08, "loss": 0.80355871, "num_input_tokens_seen": 329292160, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.359375, "step": 15267, "time_per_iteration": 2.365565299987793 }, { "auxiliary_loss_clip": 0.0100708, "auxiliary_loss_mlp": 0.01001765, "balance_loss_clip": 0.99983358, "balance_loss_mlp": 1.00066292, "epoch": 0.9179618217345559, "flos": 65552291032320.0, "grad_norm": 0.7383519271513952, "language_loss": 0.56457776, "learning_rate": 7.013639168247975e-08, "loss": 0.58466619, "num_input_tokens_seen": 329351870, "router_z_loss_clip": 0.01928711, "router_z_loss_mlp": 0.06396484, "step": 15268, "time_per_iteration": 3.0208370685577393 }, { "auxiliary_loss_clip": 0.01052799, "auxiliary_loss_mlp": 0.01038102, "balance_loss_clip": 1.01335454, "balance_loss_mlp": 1.0158658, "epoch": 0.9180219449872238, "flos": 21323552117760.0, "grad_norm": 1.7405659648040406, "language_loss": 0.78532946, "learning_rate": 7.0034194312526e-08, "loss": 0.80623841, "num_input_tokens_seen": 329370930, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36914062, "step": 15269, "time_per_iteration": 2.3636474609375 }, { "auxiliary_loss_clip": 0.01051704, "auxiliary_loss_mlp": 0.01035336, "balance_loss_clip": 1.01299644, "balance_loss_mlp": 1.01574564, "epoch": 0.9180820682398918, "flos": 41058487284480.0, "grad_norm": 1.5870044711452655, "language_loss": 0.7324419, "learning_rate": 6.993207012706936e-08, "loss": 0.75331229, "num_input_tokens_seen": 329391275, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.359375, "step": 15270, "time_per_iteration": 2.5401723384857178 }, { "auxiliary_loss_clip": 0.01048887, "auxiliary_loss_mlp": 0.01037478, "balance_loss_clip": 1.01451802, "balance_loss_mlp": 1.01451027, "epoch": 0.9181421914925597, "flos": 28071704678400.0, "grad_norm": 1.5757264107320872, "language_loss": 0.8024317, "learning_rate": 6.98300191299821e-08, "loss": 0.82329535, "num_input_tokens_seen": 329412775, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.34375, "step": 15271, "time_per_iteration": 2.4134488105773926 }, { "auxiliary_loss_clip": 0.01052152, "auxiliary_loss_mlp": 0.01034066, "balance_loss_clip": 1.01043904, "balance_loss_mlp": 1.01573801, "epoch": 0.9182023147452277, "flos": 29168117015040.0, "grad_norm": 1.9990973914925392, "language_loss": 0.73252815, "learning_rate": 6.972804132513355e-08, "loss": 0.75339031, "num_input_tokens_seen": 329432440, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36328125, "step": 15272, "time_per_iteration": 2.4453237056732178 }, { "auxiliary_loss_clip": 0.01052511, "auxiliary_loss_mlp": 0.01038032, "balance_loss_clip": 1.01610947, "balance_loss_mlp": 1.01613569, "epoch": 0.9182624379978956, "flos": 24059695368960.0, "grad_norm": 1.92891695593503, "language_loss": 0.74088061, "learning_rate": 6.962613671639105e-08, "loss": 0.76178604, "num_input_tokens_seen": 329450605, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.36328125, "step": 15273, "time_per_iteration": 2.378659725189209 }, { "auxiliary_loss_clip": 0.01047062, "auxiliary_loss_mlp": 0.01033478, "balance_loss_clip": 1.01308155, "balance_loss_mlp": 1.01418996, "epoch": 0.9183225612505637, "flos": 23292444130560.0, "grad_norm": 1.5794345015356184, "language_loss": 0.74840057, "learning_rate": 6.952430530761933e-08, "loss": 0.76920599, "num_input_tokens_seen": 329470550, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.328125, "step": 15274, "time_per_iteration": 2.3989408016204834 }, { "auxiliary_loss_clip": 0.01053043, "auxiliary_loss_mlp": 0.01038786, "balance_loss_clip": 1.01712608, "balance_loss_mlp": 1.01682711, "epoch": 0.9183826845032316, "flos": 19608234376320.0, "grad_norm": 1.5936112647414051, "language_loss": 0.70140064, "learning_rate": 6.942254710267902e-08, "loss": 0.72231889, "num_input_tokens_seen": 329489765, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.36132812, "step": 15275, "time_per_iteration": 2.3592021465301514 }, { "auxiliary_loss_clip": 0.01051346, "auxiliary_loss_mlp": 0.01035212, "balance_loss_clip": 1.01225209, "balance_loss_mlp": 1.01600087, "epoch": 0.9184428077558996, "flos": 18478060888320.0, "grad_norm": 1.830073675741051, "language_loss": 0.73375267, "learning_rate": 6.932086210542953e-08, "loss": 0.75461829, "num_input_tokens_seen": 329507040, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35351562, "step": 15276, "time_per_iteration": 2.4170949459075928 }, { "auxiliary_loss_clip": 0.01051312, "auxiliary_loss_mlp": 0.01037683, "balance_loss_clip": 1.0154984, "balance_loss_mlp": 1.01572561, "epoch": 0.9185029310085676, "flos": 20739978875520.0, "grad_norm": 1.6356857540517489, "language_loss": 0.74622548, "learning_rate": 6.921925031972642e-08, "loss": 0.76711535, "num_input_tokens_seen": 329525540, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.35546875, "step": 15277, "time_per_iteration": 2.3617658615112305 }, { "auxiliary_loss_clip": 0.0100726, "auxiliary_loss_mlp": 0.01001969, "balance_loss_clip": 0.9999308, "balance_loss_mlp": 1.00092649, "epoch": 0.9185630542612355, "flos": 68205830273280.0, "grad_norm": 0.7275460290626877, "language_loss": 0.5924778, "learning_rate": 6.91177117494226e-08, "loss": 0.61257005, "num_input_tokens_seen": 329592905, "router_z_loss_clip": 0.02038574, "router_z_loss_mlp": 0.06347656, "step": 15278, "time_per_iteration": 3.1182339191436768 }, { "auxiliary_loss_clip": 0.01048221, "auxiliary_loss_mlp": 0.01032895, "balance_loss_clip": 1.01216459, "balance_loss_mlp": 1.01399267, "epoch": 0.9186231775139035, "flos": 12238662792960.0, "grad_norm": 1.6309028821447207, "language_loss": 0.65563726, "learning_rate": 6.901624639836879e-08, "loss": 0.67644846, "num_input_tokens_seen": 329610150, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.34179688, "step": 15279, "time_per_iteration": 2.3303134441375732 }, { "auxiliary_loss_clip": 0.01007034, "auxiliary_loss_mlp": 0.01002254, "balance_loss_clip": 1.00012052, "balance_loss_mlp": 1.00072289, "epoch": 0.9186833007665715, "flos": 63935987506560.0, "grad_norm": 0.8433206327216874, "language_loss": 0.6018424, "learning_rate": 6.891485427041211e-08, "loss": 0.62193531, "num_input_tokens_seen": 329673650, "router_z_loss_clip": 0.0213623, "router_z_loss_mlp": 0.06298828, "step": 15280, "time_per_iteration": 3.025491952896118 }, { "auxiliary_loss_clip": 0.01052778, "auxiliary_loss_mlp": 0.01041543, "balance_loss_clip": 1.01681876, "balance_loss_mlp": 1.01607466, "epoch": 0.9187434240192395, "flos": 19973670243840.0, "grad_norm": 2.108827596187499, "language_loss": 0.71289408, "learning_rate": 6.881353536939815e-08, "loss": 0.73383725, "num_input_tokens_seen": 329692520, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.3671875, "step": 15281, "time_per_iteration": 3.613032817840576 }, { "auxiliary_loss_clip": 0.01052689, "auxiliary_loss_mlp": 0.01038583, "balance_loss_clip": 1.01381183, "balance_loss_mlp": 1.01629186, "epoch": 0.9188035472719074, "flos": 25226667296640.0, "grad_norm": 1.5974526872980797, "language_loss": 0.85265988, "learning_rate": 6.871228969916831e-08, "loss": 0.87357259, "num_input_tokens_seen": 329713750, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36328125, "step": 15282, "time_per_iteration": 2.41317081451416 }, { "auxiliary_loss_clip": 0.01050066, "auxiliary_loss_mlp": 0.01039975, "balance_loss_clip": 1.01589489, "balance_loss_mlp": 1.01554477, "epoch": 0.9188636705245754, "flos": 18404568743040.0, "grad_norm": 1.9219534366074067, "language_loss": 0.61284769, "learning_rate": 6.861111726356194e-08, "loss": 0.63374805, "num_input_tokens_seen": 329730960, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.34570312, "step": 15283, "time_per_iteration": 2.3454651832580566 }, { "auxiliary_loss_clip": 0.01053449, "auxiliary_loss_mlp": 0.01037749, "balance_loss_clip": 1.01439619, "balance_loss_mlp": 1.01696181, "epoch": 0.9189237937772433, "flos": 23767996026240.0, "grad_norm": 1.6135398165589792, "language_loss": 0.66462094, "learning_rate": 6.851001806641554e-08, "loss": 0.68553293, "num_input_tokens_seen": 329750975, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36523438, "step": 15284, "time_per_iteration": 2.3801820278167725 }, { "auxiliary_loss_clip": 0.01050614, "auxiliary_loss_mlp": 0.01039409, "balance_loss_clip": 1.01635361, "balance_loss_mlp": 1.01484609, "epoch": 0.9189839170299113, "flos": 21213575735040.0, "grad_norm": 1.763723098132223, "language_loss": 0.74865544, "learning_rate": 6.840899211156292e-08, "loss": 0.76955557, "num_input_tokens_seen": 329769645, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35742188, "step": 15285, "time_per_iteration": 2.3547353744506836 }, { "auxiliary_loss_clip": 0.01050291, "auxiliary_loss_mlp": 0.01034874, "balance_loss_clip": 1.01135445, "balance_loss_mlp": 1.01528859, "epoch": 0.9190440402825792, "flos": 16726433466240.0, "grad_norm": 1.974189813925432, "language_loss": 0.73408329, "learning_rate": 6.830803940283458e-08, "loss": 0.75493503, "num_input_tokens_seen": 329788185, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.34960938, "step": 15286, "time_per_iteration": 2.342820644378662 }, { "auxiliary_loss_clip": 0.01052646, "auxiliary_loss_mlp": 0.01037109, "balance_loss_clip": 1.01267147, "balance_loss_mlp": 1.01683235, "epoch": 0.9191041635352473, "flos": 23440056825600.0, "grad_norm": 2.0070472797051098, "language_loss": 0.74270666, "learning_rate": 6.820715994405945e-08, "loss": 0.76360416, "num_input_tokens_seen": 329806780, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.359375, "step": 15287, "time_per_iteration": 2.372950315475464 }, { "auxiliary_loss_clip": 0.01052645, "auxiliary_loss_mlp": 0.0103932, "balance_loss_clip": 1.01472759, "balance_loss_mlp": 1.01659977, "epoch": 0.9191642867879152, "flos": 18806523759360.0, "grad_norm": 2.0237227907227604, "language_loss": 0.66972691, "learning_rate": 6.810635373906226e-08, "loss": 0.69064653, "num_input_tokens_seen": 329826350, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36132812, "step": 15288, "time_per_iteration": 2.40139102935791 }, { "auxiliary_loss_clip": 0.01052598, "auxiliary_loss_mlp": 0.01039518, "balance_loss_clip": 1.01521134, "balance_loss_mlp": 1.01697397, "epoch": 0.9192244100405832, "flos": 32159580105600.0, "grad_norm": 1.879502953299506, "language_loss": 0.72312105, "learning_rate": 6.800562079166549e-08, "loss": 0.74404216, "num_input_tokens_seen": 329846160, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.35546875, "step": 15289, "time_per_iteration": 2.4893312454223633 }, { "auxiliary_loss_clip": 0.01053745, "auxiliary_loss_mlp": 0.01042594, "balance_loss_clip": 1.01720238, "balance_loss_mlp": 1.01656973, "epoch": 0.9192845332932512, "flos": 16356878058240.0, "grad_norm": 1.873494844662835, "language_loss": 0.75443017, "learning_rate": 6.790496110568921e-08, "loss": 0.77539355, "num_input_tokens_seen": 329862020, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37109375, "step": 15290, "time_per_iteration": 2.4691734313964844 }, { "auxiliary_loss_clip": 0.0104936, "auxiliary_loss_mlp": 0.01033965, "balance_loss_clip": 1.01285267, "balance_loss_mlp": 1.01502323, "epoch": 0.9193446565459191, "flos": 26613277787520.0, "grad_norm": 2.465253955568107, "language_loss": 0.73246622, "learning_rate": 6.78043746849506e-08, "loss": 0.75329947, "num_input_tokens_seen": 329880185, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34375, "step": 15291, "time_per_iteration": 2.559516429901123 }, { "auxiliary_loss_clip": 0.01051007, "auxiliary_loss_mlp": 0.0103756, "balance_loss_clip": 1.01502991, "balance_loss_mlp": 1.01576185, "epoch": 0.9194047797985871, "flos": 22491082627200.0, "grad_norm": 1.5509596894912097, "language_loss": 0.7150085, "learning_rate": 6.770386153326346e-08, "loss": 0.7358942, "num_input_tokens_seen": 329900255, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.3515625, "step": 15292, "time_per_iteration": 5.320828914642334 }, { "auxiliary_loss_clip": 0.01051571, "auxiliary_loss_mlp": 0.0104019, "balance_loss_clip": 1.01729012, "balance_loss_mlp": 1.0156914, "epoch": 0.9194649030512551, "flos": 25077727969920.0, "grad_norm": 1.8413530852057134, "language_loss": 0.74215144, "learning_rate": 6.760342165443988e-08, "loss": 0.76306903, "num_input_tokens_seen": 329919095, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.359375, "step": 15293, "time_per_iteration": 2.3922340869903564 }, { "auxiliary_loss_clip": 0.01050881, "auxiliary_loss_mlp": 0.01036394, "balance_loss_clip": 1.0143398, "balance_loss_mlp": 1.01596856, "epoch": 0.9195250263039231, "flos": 11910339567360.0, "grad_norm": 1.8723844564887773, "language_loss": 0.79759997, "learning_rate": 6.750305505228837e-08, "loss": 0.81847274, "num_input_tokens_seen": 329936505, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34960938, "step": 15294, "time_per_iteration": 2.3481392860412598 }, { "auxiliary_loss_clip": 0.0105429, "auxiliary_loss_mlp": 0.01040287, "balance_loss_clip": 1.01601624, "balance_loss_mlp": 1.01672316, "epoch": 0.919585149556591, "flos": 21833109544320.0, "grad_norm": 1.723259489123444, "language_loss": 0.78771496, "learning_rate": 6.74027617306141e-08, "loss": 0.80866075, "num_input_tokens_seen": 329956795, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.375, "step": 15295, "time_per_iteration": 2.4275336265563965 }, { "auxiliary_loss_clip": 0.0104948, "auxiliary_loss_mlp": 0.01031293, "balance_loss_clip": 1.00987089, "balance_loss_mlp": 1.0157311, "epoch": 0.919645272809259, "flos": 28182798224640.0, "grad_norm": 3.60942058032205, "language_loss": 0.72662544, "learning_rate": 6.730254169322114e-08, "loss": 0.74743319, "num_input_tokens_seen": 329977195, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.33789062, "step": 15296, "time_per_iteration": 2.4143970012664795 }, { "auxiliary_loss_clip": 0.01051782, "auxiliary_loss_mlp": 0.01044655, "balance_loss_clip": 1.02093291, "balance_loss_mlp": 1.01674604, "epoch": 0.9197053960619269, "flos": 18331844647680.0, "grad_norm": 1.8299357898403117, "language_loss": 0.75976562, "learning_rate": 6.720239494390912e-08, "loss": 0.78073001, "num_input_tokens_seen": 329992095, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3515625, "step": 15297, "time_per_iteration": 2.3450939655303955 }, { "auxiliary_loss_clip": 0.01051488, "auxiliary_loss_mlp": 0.0103714, "balance_loss_clip": 1.01403725, "balance_loss_mlp": 1.01561511, "epoch": 0.9197655193145949, "flos": 28182204731520.0, "grad_norm": 1.63356895723524, "language_loss": 0.75385404, "learning_rate": 6.710232148647676e-08, "loss": 0.77474028, "num_input_tokens_seen": 330011490, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.359375, "step": 15298, "time_per_iteration": 2.4090311527252197 }, { "auxiliary_loss_clip": 0.01050957, "auxiliary_loss_mlp": 0.01035208, "balance_loss_clip": 1.01201034, "balance_loss_mlp": 1.01586318, "epoch": 0.9198256425672628, "flos": 17305503143040.0, "grad_norm": 6.216534683858197, "language_loss": 0.80912948, "learning_rate": 6.70023213247175e-08, "loss": 0.8299911, "num_input_tokens_seen": 330027885, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3515625, "step": 15299, "time_per_iteration": 2.3465356826782227 }, { "auxiliary_loss_clip": 0.01052068, "auxiliary_loss_mlp": 0.01032132, "balance_loss_clip": 1.01099634, "balance_loss_mlp": 1.01646221, "epoch": 0.9198857658199309, "flos": 17857549560960.0, "grad_norm": 1.9703583735498467, "language_loss": 0.64639342, "learning_rate": 6.690239446242385e-08, "loss": 0.66723543, "num_input_tokens_seen": 330046230, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.35546875, "step": 15300, "time_per_iteration": 2.3372607231140137 }, { "auxiliary_loss_clip": 0.01049307, "auxiliary_loss_mlp": 0.01034347, "balance_loss_clip": 1.01534486, "balance_loss_mlp": 1.01638234, "epoch": 0.9199458890725988, "flos": 22126449720960.0, "grad_norm": 1.797686180890295, "language_loss": 0.70885837, "learning_rate": 6.680254090338545e-08, "loss": 0.7296949, "num_input_tokens_seen": 330065535, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.33007812, "step": 15301, "time_per_iteration": 2.374493360519409 }, { "auxiliary_loss_clip": 0.01052035, "auxiliary_loss_mlp": 0.01040872, "balance_loss_clip": 1.01673269, "balance_loss_mlp": 1.01566672, "epoch": 0.9200060123252668, "flos": 16033128220800.0, "grad_norm": 1.6682718127751661, "language_loss": 0.72404087, "learning_rate": 6.670276065138814e-08, "loss": 0.74496996, "num_input_tokens_seen": 330082920, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36328125, "step": 15302, "time_per_iteration": 2.335052490234375 }, { "auxiliary_loss_clip": 0.01052219, "auxiliary_loss_mlp": 0.01033255, "balance_loss_clip": 1.01068878, "balance_loss_mlp": 1.01590323, "epoch": 0.9200661355779348, "flos": 26863465656960.0, "grad_norm": 2.6262876880974155, "language_loss": 0.77786958, "learning_rate": 6.660305371021579e-08, "loss": 0.79872429, "num_input_tokens_seen": 330101165, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.36328125, "step": 15303, "time_per_iteration": 2.4144067764282227 }, { "auxiliary_loss_clip": 0.01050995, "auxiliary_loss_mlp": 0.0103773, "balance_loss_clip": 1.01564014, "balance_loss_mlp": 1.01609623, "epoch": 0.9201262588306027, "flos": 12785926354560.0, "grad_norm": 2.6962140865571143, "language_loss": 0.89245665, "learning_rate": 6.650342008365006e-08, "loss": 0.91334391, "num_input_tokens_seen": 330118775, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34960938, "step": 15304, "time_per_iteration": 3.746840000152588 }, { "auxiliary_loss_clip": 0.01054015, "auxiliary_loss_mlp": 0.01043682, "balance_loss_clip": 1.0175271, "balance_loss_mlp": 1.01649559, "epoch": 0.9201863820832707, "flos": 20630561074560.0, "grad_norm": 1.9571833720204719, "language_loss": 0.79490423, "learning_rate": 6.64038597754677e-08, "loss": 0.81588125, "num_input_tokens_seen": 330135570, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.375, "step": 15305, "time_per_iteration": 2.3608803749084473 }, { "auxiliary_loss_clip": 0.01051459, "auxiliary_loss_mlp": 0.01036735, "balance_loss_clip": 1.014395, "balance_loss_mlp": 1.01576447, "epoch": 0.9202465053359387, "flos": 26394616742400.0, "grad_norm": 1.9166731704319537, "language_loss": 0.82485986, "learning_rate": 6.630437278944501e-08, "loss": 0.84574175, "num_input_tokens_seen": 330152840, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35546875, "step": 15306, "time_per_iteration": 2.395981788635254 }, { "auxiliary_loss_clip": 0.01049118, "auxiliary_loss_mlp": 0.01035221, "balance_loss_clip": 1.01438355, "balance_loss_mlp": 1.01557326, "epoch": 0.9203066285886067, "flos": 10487419395840.0, "grad_norm": 1.8900045578673272, "language_loss": 0.72907501, "learning_rate": 6.62049591293541e-08, "loss": 0.7499184, "num_input_tokens_seen": 330168605, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.3359375, "step": 15307, "time_per_iteration": 2.35368013381958 }, { "auxiliary_loss_clip": 0.01053224, "auxiliary_loss_mlp": 0.010373, "balance_loss_clip": 1.01242089, "balance_loss_mlp": 1.01614857, "epoch": 0.9203667518412746, "flos": 19389712976640.0, "grad_norm": 2.258704994485348, "language_loss": 0.80049157, "learning_rate": 6.610561879896526e-08, "loss": 0.82139683, "num_input_tokens_seen": 330186160, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.37109375, "step": 15308, "time_per_iteration": 2.3615097999572754 }, { "auxiliary_loss_clip": 0.01050987, "auxiliary_loss_mlp": 0.01033818, "balance_loss_clip": 1.0113709, "balance_loss_mlp": 1.01553202, "epoch": 0.9204268750939426, "flos": 15924059533440.0, "grad_norm": 1.8194231728877157, "language_loss": 0.79118979, "learning_rate": 6.600635180204484e-08, "loss": 0.81203789, "num_input_tokens_seen": 330201780, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35546875, "step": 15309, "time_per_iteration": 2.3475453853607178 }, { "auxiliary_loss_clip": 0.01050483, "auxiliary_loss_mlp": 0.01035343, "balance_loss_clip": 1.01240683, "balance_loss_mlp": 1.01572657, "epoch": 0.9204869983466105, "flos": 16470834336000.0, "grad_norm": 1.799228500385183, "language_loss": 0.67624885, "learning_rate": 6.590715814235781e-08, "loss": 0.69710708, "num_input_tokens_seen": 330219165, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34765625, "step": 15310, "time_per_iteration": 2.417417526245117 }, { "auxiliary_loss_clip": 0.01051132, "auxiliary_loss_mlp": 0.01035058, "balance_loss_clip": 1.01299286, "balance_loss_mlp": 1.01477921, "epoch": 0.9205471215992785, "flos": 21538268179200.0, "grad_norm": 1.751596950252876, "language_loss": 0.66627264, "learning_rate": 6.580803782366495e-08, "loss": 0.6871345, "num_input_tokens_seen": 330238975, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.36328125, "step": 15311, "time_per_iteration": 2.3843047618865967 }, { "auxiliary_loss_clip": 0.01051535, "auxiliary_loss_mlp": 0.01036969, "balance_loss_clip": 1.01274586, "balance_loss_mlp": 1.01554275, "epoch": 0.9206072448519464, "flos": 25004829317760.0, "grad_norm": 1.7512596315776432, "language_loss": 0.77436137, "learning_rate": 6.570899084972503e-08, "loss": 0.79524648, "num_input_tokens_seen": 330259755, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.359375, "step": 15312, "time_per_iteration": 2.4000730514526367 }, { "auxiliary_loss_clip": 0.01050621, "auxiliary_loss_mlp": 0.01038351, "balance_loss_clip": 1.01610637, "balance_loss_mlp": 1.01571012, "epoch": 0.9206673681046145, "flos": 20521597121280.0, "grad_norm": 1.580020815520702, "language_loss": 0.79669106, "learning_rate": 6.561001722429394e-08, "loss": 0.81758082, "num_input_tokens_seen": 330277660, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34960938, "step": 15313, "time_per_iteration": 2.4180285930633545 }, { "auxiliary_loss_clip": 0.01053312, "auxiliary_loss_mlp": 0.01042107, "balance_loss_clip": 1.0183481, "balance_loss_mlp": 1.01601386, "epoch": 0.9207274913572824, "flos": 20882494512000.0, "grad_norm": 1.8765613728212753, "language_loss": 0.79558635, "learning_rate": 6.55111169511251e-08, "loss": 0.8165406, "num_input_tokens_seen": 330295455, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.37304688, "step": 15314, "time_per_iteration": 2.3652477264404297 }, { "auxiliary_loss_clip": 0.01055573, "auxiliary_loss_mlp": 0.01046459, "balance_loss_clip": 1.01802778, "balance_loss_mlp": 1.01750255, "epoch": 0.9207876146099504, "flos": 22707230054400.0, "grad_norm": 1.8485933282579343, "language_loss": 0.81111407, "learning_rate": 6.541229003396864e-08, "loss": 0.83213437, "num_input_tokens_seen": 330315310, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.38085938, "step": 15315, "time_per_iteration": 2.3768415451049805 }, { "auxiliary_loss_clip": 0.01054981, "auxiliary_loss_mlp": 0.01040961, "balance_loss_clip": 1.0164752, "balance_loss_mlp": 1.01621199, "epoch": 0.9208477378626184, "flos": 18506585335680.0, "grad_norm": 1.7669144757812616, "language_loss": 0.77411795, "learning_rate": 6.531353647657156e-08, "loss": 0.79507732, "num_input_tokens_seen": 330333260, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.38671875, "step": 15316, "time_per_iteration": 2.3411781787872314 }, { "auxiliary_loss_clip": 0.01051114, "auxiliary_loss_mlp": 0.01043049, "balance_loss_clip": 1.0196836, "balance_loss_mlp": 1.01509333, "epoch": 0.9209078611152863, "flos": 22998615194880.0, "grad_norm": 1.6804548106026098, "language_loss": 0.70384544, "learning_rate": 6.521485628267931e-08, "loss": 0.72478706, "num_input_tokens_seen": 330352465, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 15317, "time_per_iteration": 2.41685152053833 }, { "auxiliary_loss_clip": 0.0105159, "auxiliary_loss_mlp": 0.01032008, "balance_loss_clip": 1.0092274, "balance_loss_mlp": 1.01577187, "epoch": 0.9209679843679544, "flos": 24060358684800.0, "grad_norm": 1.6749085059370903, "language_loss": 0.84575999, "learning_rate": 6.511624945603378e-08, "loss": 0.86659598, "num_input_tokens_seen": 330372685, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.359375, "step": 15318, "time_per_iteration": 2.406773567199707 }, { "auxiliary_loss_clip": 0.01052526, "auxiliary_loss_mlp": 0.01037779, "balance_loss_clip": 1.0140202, "balance_loss_mlp": 1.01710606, "epoch": 0.9210281076206223, "flos": 13552514277120.0, "grad_norm": 1.8869178629036363, "language_loss": 0.87136042, "learning_rate": 6.501771600037354e-08, "loss": 0.89226353, "num_input_tokens_seen": 330388860, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35351562, "step": 15319, "time_per_iteration": 2.345916986465454 }, { "auxiliary_loss_clip": 0.01007147, "auxiliary_loss_mlp": 0.01002411, "balance_loss_clip": 1.00048029, "balance_loss_mlp": 1.00067174, "epoch": 0.9210882308732903, "flos": 71422622478720.0, "grad_norm": 0.7704901841558655, "language_loss": 0.56232899, "learning_rate": 6.491925591943559e-08, "loss": 0.58242458, "num_input_tokens_seen": 330448735, "router_z_loss_clip": 0.01928711, "router_z_loss_mlp": 0.06445312, "step": 15320, "time_per_iteration": 3.0600101947784424 }, { "auxiliary_loss_clip": 0.01054342, "auxiliary_loss_mlp": 0.01046369, "balance_loss_clip": 1.02070367, "balance_loss_mlp": 1.01670051, "epoch": 0.9211483541259582, "flos": 18508295992320.0, "grad_norm": 2.2140353245947004, "language_loss": 0.64959502, "learning_rate": 6.482086921695384e-08, "loss": 0.6706022, "num_input_tokens_seen": 330465600, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37695312, "step": 15321, "time_per_iteration": 3.603483200073242 }, { "auxiliary_loss_clip": 0.0104752, "auxiliary_loss_mlp": 0.01035613, "balance_loss_clip": 1.01529944, "balance_loss_mlp": 1.0154016, "epoch": 0.9212084773786262, "flos": 23257111968000.0, "grad_norm": 1.4774290887337989, "language_loss": 0.72067875, "learning_rate": 6.47225558966582e-08, "loss": 0.74151009, "num_input_tokens_seen": 330485770, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.3203125, "step": 15322, "time_per_iteration": 2.4050114154815674 }, { "auxiliary_loss_clip": 0.01050485, "auxiliary_loss_mlp": 0.01031363, "balance_loss_clip": 1.01050174, "balance_loss_mlp": 1.01570344, "epoch": 0.9212686006312941, "flos": 16288587705600.0, "grad_norm": 1.739707953114904, "language_loss": 0.70902526, "learning_rate": 6.462431596227725e-08, "loss": 0.72984368, "num_input_tokens_seen": 330504255, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34765625, "step": 15323, "time_per_iteration": 2.3520314693450928 }, { "auxiliary_loss_clip": 0.0105355, "auxiliary_loss_mlp": 0.01043594, "balance_loss_clip": 1.01761806, "balance_loss_mlp": 1.01601577, "epoch": 0.9213287238839621, "flos": 19784930100480.0, "grad_norm": 2.038797405342826, "language_loss": 0.75984907, "learning_rate": 6.452614941753597e-08, "loss": 0.78082049, "num_input_tokens_seen": 330520705, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.375, "step": 15324, "time_per_iteration": 2.3670294284820557 }, { "auxiliary_loss_clip": 0.01051971, "auxiliary_loss_mlp": 0.01042906, "balance_loss_clip": 1.01833653, "balance_loss_mlp": 1.01594031, "epoch": 0.92138884713663, "flos": 21029408979840.0, "grad_norm": 2.4698261283053657, "language_loss": 0.71974444, "learning_rate": 6.442805626615744e-08, "loss": 0.74069321, "num_input_tokens_seen": 330539245, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.359375, "step": 15325, "time_per_iteration": 2.415095567703247 }, { "auxiliary_loss_clip": 0.01050823, "auxiliary_loss_mlp": 0.01039424, "balance_loss_clip": 1.01664305, "balance_loss_mlp": 1.0156951, "epoch": 0.9214489703892981, "flos": 28585940227200.0, "grad_norm": 1.5678292866922723, "language_loss": 0.78743559, "learning_rate": 6.433003651186109e-08, "loss": 0.80833805, "num_input_tokens_seen": 330561815, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3515625, "step": 15326, "time_per_iteration": 2.4377479553222656 }, { "auxiliary_loss_clip": 0.01053409, "auxiliary_loss_mlp": 0.01038348, "balance_loss_clip": 1.01435101, "balance_loss_mlp": 1.01699889, "epoch": 0.921509093641966, "flos": 16360578662400.0, "grad_norm": 2.355235823209294, "language_loss": 0.71830773, "learning_rate": 6.42320901583635e-08, "loss": 0.73922533, "num_input_tokens_seen": 330579760, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36328125, "step": 15327, "time_per_iteration": 2.3529062271118164 }, { "auxiliary_loss_clip": 0.01054902, "auxiliary_loss_mlp": 0.01041623, "balance_loss_clip": 1.01613569, "balance_loss_mlp": 1.01750219, "epoch": 0.921569216894634, "flos": 26829704505600.0, "grad_norm": 1.9251572698192434, "language_loss": 0.7833643, "learning_rate": 6.413421720937906e-08, "loss": 0.80432951, "num_input_tokens_seen": 330598545, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.375, "step": 15328, "time_per_iteration": 2.382625102996826 }, { "auxiliary_loss_clip": 0.01051252, "auxiliary_loss_mlp": 0.01035611, "balance_loss_clip": 1.01443934, "balance_loss_mlp": 1.0165441, "epoch": 0.921629340147302, "flos": 24643966838400.0, "grad_norm": 2.307508830439436, "language_loss": 0.72290993, "learning_rate": 6.4036417668619e-08, "loss": 0.74377859, "num_input_tokens_seen": 330616700, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34765625, "step": 15329, "time_per_iteration": 2.416104555130005 }, { "auxiliary_loss_clip": 0.01050805, "auxiliary_loss_mlp": 0.01030919, "balance_loss_clip": 1.01011729, "balance_loss_mlp": 1.01536655, "epoch": 0.9216894633999699, "flos": 15085585388160.0, "grad_norm": 1.823204177940141, "language_loss": 0.87753189, "learning_rate": 6.393869153979192e-08, "loss": 0.89834911, "num_input_tokens_seen": 330633355, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.35546875, "step": 15330, "time_per_iteration": 2.3190059661865234 }, { "auxiliary_loss_clip": 0.01050666, "auxiliary_loss_mlp": 0.01035056, "balance_loss_clip": 1.01219189, "balance_loss_mlp": 1.01522934, "epoch": 0.921749586652638, "flos": 19203626096640.0, "grad_norm": 2.1657977143042833, "language_loss": 0.77177596, "learning_rate": 6.384103882660397e-08, "loss": 0.79263318, "num_input_tokens_seen": 330651470, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35351562, "step": 15331, "time_per_iteration": 3.762830972671509 }, { "auxiliary_loss_clip": 0.01051127, "auxiliary_loss_mlp": 0.01032296, "balance_loss_clip": 1.00995636, "balance_loss_mlp": 1.01557922, "epoch": 0.9218097099053059, "flos": 20521387653120.0, "grad_norm": 2.4153717349945483, "language_loss": 0.76285362, "learning_rate": 6.374345953275794e-08, "loss": 0.78368789, "num_input_tokens_seen": 330669170, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35546875, "step": 15332, "time_per_iteration": 3.705519437789917 }, { "auxiliary_loss_clip": 0.01050054, "auxiliary_loss_mlp": 0.01038614, "balance_loss_clip": 1.01651239, "balance_loss_mlp": 1.0148015, "epoch": 0.9218698331579739, "flos": 17347643020800.0, "grad_norm": 1.9057010793697213, "language_loss": 0.76043284, "learning_rate": 6.364595366195358e-08, "loss": 0.7813195, "num_input_tokens_seen": 330686635, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3515625, "step": 15333, "time_per_iteration": 2.340244770050049 }, { "auxiliary_loss_clip": 0.01006952, "auxiliary_loss_mlp": 0.01004247, "balance_loss_clip": 1.00208902, "balance_loss_mlp": 1.00061607, "epoch": 0.9219299564106418, "flos": 61955435099520.0, "grad_norm": 0.8130744726073179, "language_loss": 0.52974391, "learning_rate": 6.354852121788879e-08, "loss": 0.54985595, "num_input_tokens_seen": 330749160, "router_z_loss_clip": 0.02160645, "router_z_loss_mlp": 0.06347656, "step": 15334, "time_per_iteration": 2.9934167861938477 }, { "auxiliary_loss_clip": 0.0105083, "auxiliary_loss_mlp": 0.0103607, "balance_loss_clip": 1.01457632, "balance_loss_mlp": 1.01662803, "epoch": 0.9219900796633098, "flos": 15700964745600.0, "grad_norm": 2.14993344254625, "language_loss": 0.62770289, "learning_rate": 6.345116220425839e-08, "loss": 0.64857185, "num_input_tokens_seen": 330766840, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34179688, "step": 15335, "time_per_iteration": 2.34134840965271 }, { "auxiliary_loss_clip": 0.01050812, "auxiliary_loss_mlp": 0.010342, "balance_loss_clip": 1.01176453, "balance_loss_mlp": 1.01587188, "epoch": 0.9220502029159777, "flos": 24931616463360.0, "grad_norm": 1.6089924792913122, "language_loss": 0.72550017, "learning_rate": 6.335387662475366e-08, "loss": 0.74635029, "num_input_tokens_seen": 330785585, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34960938, "step": 15336, "time_per_iteration": 2.43109130859375 }, { "auxiliary_loss_clip": 0.01048206, "auxiliary_loss_mlp": 0.01036553, "balance_loss_clip": 1.01602566, "balance_loss_mlp": 1.01452994, "epoch": 0.9221103261686457, "flos": 15666365721600.0, "grad_norm": 1.894267554039299, "language_loss": 0.72664177, "learning_rate": 6.325666448306433e-08, "loss": 0.74748939, "num_input_tokens_seen": 330800750, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.3359375, "step": 15337, "time_per_iteration": 2.3358404636383057 }, { "auxiliary_loss_clip": 0.010077, "auxiliary_loss_mlp": 0.01004081, "balance_loss_clip": 1.00212562, "balance_loss_mlp": 1.00128675, "epoch": 0.9221704494213137, "flos": 67512909052800.0, "grad_norm": 0.8863206124434034, "language_loss": 0.65510052, "learning_rate": 6.31595257828763e-08, "loss": 0.67521834, "num_input_tokens_seen": 330863640, "router_z_loss_clip": 0.01953125, "router_z_loss_mlp": 0.06445312, "step": 15338, "time_per_iteration": 3.009782314300537 }, { "auxiliary_loss_clip": 0.01052988, "auxiliary_loss_mlp": 0.01035931, "balance_loss_clip": 1.01354349, "balance_loss_mlp": 1.01715112, "epoch": 0.9222305726739817, "flos": 30225636230400.0, "grad_norm": 2.788664674333504, "language_loss": 0.68557107, "learning_rate": 6.306246052787289e-08, "loss": 0.70646036, "num_input_tokens_seen": 330884675, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35742188, "step": 15339, "time_per_iteration": 2.433680534362793 }, { "auxiliary_loss_clip": 0.01051591, "auxiliary_loss_mlp": 0.01038916, "balance_loss_clip": 1.01456118, "balance_loss_mlp": 1.0155251, "epoch": 0.9222906959266496, "flos": 25336050186240.0, "grad_norm": 1.8912619281863274, "language_loss": 0.73080462, "learning_rate": 6.296546872173513e-08, "loss": 0.7517097, "num_input_tokens_seen": 330904125, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36132812, "step": 15340, "time_per_iteration": 2.4865005016326904 }, { "auxiliary_loss_clip": 0.01050116, "auxiliary_loss_mlp": 0.01037594, "balance_loss_clip": 1.01457489, "balance_loss_mlp": 1.01568377, "epoch": 0.9223508191793176, "flos": 27598631489280.0, "grad_norm": 1.5021259533812021, "language_loss": 0.71880603, "learning_rate": 6.286855036814098e-08, "loss": 0.73968315, "num_input_tokens_seen": 330925140, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.34375, "step": 15341, "time_per_iteration": 2.434110641479492 }, { "auxiliary_loss_clip": 0.01048367, "auxiliary_loss_mlp": 0.01031107, "balance_loss_clip": 1.01080585, "balance_loss_mlp": 1.01560366, "epoch": 0.9224109424319856, "flos": 27306373564800.0, "grad_norm": 1.6337418967647617, "language_loss": 0.68549705, "learning_rate": 6.277170547076571e-08, "loss": 0.70629179, "num_input_tokens_seen": 330946625, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.328125, "step": 15342, "time_per_iteration": 2.4492218494415283 }, { "auxiliary_loss_clip": 0.01050931, "auxiliary_loss_mlp": 0.01041634, "balance_loss_clip": 1.01940107, "balance_loss_mlp": 1.01535106, "epoch": 0.9224710656846535, "flos": 48206674736640.0, "grad_norm": 2.3248456395558996, "language_loss": 0.70408154, "learning_rate": 6.26749340332815e-08, "loss": 0.72500724, "num_input_tokens_seen": 330967795, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35546875, "step": 15343, "time_per_iteration": 4.058551549911499 }, { "auxiliary_loss_clip": 0.01007308, "auxiliary_loss_mlp": 0.01003675, "balance_loss_clip": 1.00168431, "balance_loss_mlp": 1.00106239, "epoch": 0.9225311889373216, "flos": 66718564732800.0, "grad_norm": 0.7236229551090327, "language_loss": 0.5205493, "learning_rate": 6.257823605935786e-08, "loss": 0.54065919, "num_input_tokens_seen": 331040850, "router_z_loss_clip": 0.01989746, "router_z_loss_mlp": 0.0625, "step": 15344, "time_per_iteration": 3.1956093311309814 }, { "auxiliary_loss_clip": 0.01047681, "auxiliary_loss_mlp": 0.01032506, "balance_loss_clip": 1.01343286, "balance_loss_mlp": 1.01513958, "epoch": 0.9225913121899895, "flos": 22270257077760.0, "grad_norm": 1.7010264266061874, "language_loss": 0.71914268, "learning_rate": 6.248161155266162e-08, "loss": 0.73994452, "num_input_tokens_seen": 331060595, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.32617188, "step": 15345, "time_per_iteration": 2.37715482711792 }, { "auxiliary_loss_clip": 0.01051053, "auxiliary_loss_mlp": 0.01050231, "balance_loss_clip": 1.02797461, "balance_loss_mlp": 1.01593041, "epoch": 0.9226514354426575, "flos": 20081726501760.0, "grad_norm": 1.7065162245263463, "language_loss": 0.7838828, "learning_rate": 6.238506051685677e-08, "loss": 0.80489564, "num_input_tokens_seen": 331080195, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3515625, "step": 15346, "time_per_iteration": 2.371446371078491 }, { "auxiliary_loss_clip": 0.01054883, "auxiliary_loss_mlp": 0.01042539, "balance_loss_clip": 1.01627755, "balance_loss_mlp": 1.01697135, "epoch": 0.9227115586953254, "flos": 16069926660480.0, "grad_norm": 1.813688646271003, "language_loss": 0.77464461, "learning_rate": 6.228858295560457e-08, "loss": 0.79561889, "num_input_tokens_seen": 331097645, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37890625, "step": 15347, "time_per_iteration": 2.3573999404907227 }, { "auxiliary_loss_clip": 0.01048913, "auxiliary_loss_mlp": 0.01036216, "balance_loss_clip": 1.01505637, "balance_loss_mlp": 1.01600027, "epoch": 0.9227716819479934, "flos": 20445067687680.0, "grad_norm": 1.4786006510632959, "language_loss": 0.77499282, "learning_rate": 6.219217887256367e-08, "loss": 0.79584414, "num_input_tokens_seen": 331116830, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.328125, "step": 15348, "time_per_iteration": 2.3677122592926025 }, { "auxiliary_loss_clip": 0.01052219, "auxiliary_loss_mlp": 0.01036504, "balance_loss_clip": 1.01272202, "balance_loss_mlp": 1.01529455, "epoch": 0.9228318052006613, "flos": 25006295594880.0, "grad_norm": 2.0562381314431564, "language_loss": 0.69263792, "learning_rate": 6.209584827138959e-08, "loss": 0.71352518, "num_input_tokens_seen": 331137235, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.36914062, "step": 15349, "time_per_iteration": 2.396678924560547 }, { "auxiliary_loss_clip": 0.01051825, "auxiliary_loss_mlp": 0.01038942, "balance_loss_clip": 1.01588726, "balance_loss_mlp": 1.01526797, "epoch": 0.9228919284533293, "flos": 12676438730880.0, "grad_norm": 2.446393579527446, "language_loss": 0.88381404, "learning_rate": 6.199959115573495e-08, "loss": 0.90472174, "num_input_tokens_seen": 331153155, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36523438, "step": 15350, "time_per_iteration": 2.362734794616699 }, { "auxiliary_loss_clip": 0.0100732, "auxiliary_loss_mlp": 0.01004325, "balance_loss_clip": 1.00239372, "balance_loss_mlp": 1.0009141, "epoch": 0.9229520517059973, "flos": 69983014677120.0, "grad_norm": 0.778199143101153, "language_loss": 0.60496324, "learning_rate": 6.190340752924994e-08, "loss": 0.62507969, "num_input_tokens_seen": 331214895, "router_z_loss_clip": 0.01928711, "router_z_loss_mlp": 0.06396484, "step": 15351, "time_per_iteration": 2.9814682006835938 }, { "auxiliary_loss_clip": 0.01051959, "auxiliary_loss_mlp": 0.01031425, "balance_loss_clip": 1.00894189, "balance_loss_mlp": 1.01529026, "epoch": 0.9230121749586653, "flos": 14792943438720.0, "grad_norm": 1.8404807836767416, "language_loss": 0.79415345, "learning_rate": 6.180729739558233e-08, "loss": 0.8149873, "num_input_tokens_seen": 331232185, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3671875, "step": 15352, "time_per_iteration": 2.350602865219116 }, { "auxiliary_loss_clip": 0.01054986, "auxiliary_loss_mlp": 0.01044165, "balance_loss_clip": 1.02028728, "balance_loss_mlp": 1.01688743, "epoch": 0.9230722982113332, "flos": 22966075941120.0, "grad_norm": 1.9921791678696208, "language_loss": 0.60964429, "learning_rate": 6.171126075837585e-08, "loss": 0.63063586, "num_input_tokens_seen": 331251065, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.38085938, "step": 15353, "time_per_iteration": 2.3971407413482666 }, { "auxiliary_loss_clip": 0.01050803, "auxiliary_loss_mlp": 0.01033566, "balance_loss_clip": 1.01163197, "balance_loss_mlp": 1.01671386, "epoch": 0.9231324214640012, "flos": 18550470781440.0, "grad_norm": 1.6928846540972822, "language_loss": 0.75669795, "learning_rate": 6.161529762127293e-08, "loss": 0.77754164, "num_input_tokens_seen": 331269110, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.33984375, "step": 15354, "time_per_iteration": 2.360952615737915 }, { "auxiliary_loss_clip": 0.01052666, "auxiliary_loss_mlp": 0.01043387, "balance_loss_clip": 1.01642191, "balance_loss_mlp": 1.01540279, "epoch": 0.9231925447166691, "flos": 22081866048000.0, "grad_norm": 3.672586130919069, "language_loss": 0.66598517, "learning_rate": 6.1519407987912e-08, "loss": 0.68694574, "num_input_tokens_seen": 331286555, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.37304688, "step": 15355, "time_per_iteration": 2.3724162578582764 }, { "auxiliary_loss_clip": 0.01051481, "auxiliary_loss_mlp": 0.0103786, "balance_loss_clip": 1.01546073, "balance_loss_mlp": 1.01700783, "epoch": 0.9232526679693371, "flos": 26539960199040.0, "grad_norm": 1.942898379941915, "language_loss": 0.75023985, "learning_rate": 6.142359186192947e-08, "loss": 0.77113324, "num_input_tokens_seen": 331307660, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34375, "step": 15356, "time_per_iteration": 2.421790838241577 }, { "auxiliary_loss_clip": 0.01053025, "auxiliary_loss_mlp": 0.01037044, "balance_loss_clip": 1.0127486, "balance_loss_mlp": 1.01644623, "epoch": 0.9233127912220052, "flos": 14755795885440.0, "grad_norm": 2.067266046126509, "language_loss": 0.62477243, "learning_rate": 6.132784924695844e-08, "loss": 0.64567316, "num_input_tokens_seen": 331324885, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36523438, "step": 15357, "time_per_iteration": 2.3439671993255615 }, { "auxiliary_loss_clip": 0.0105299, "auxiliary_loss_mlp": 0.01040735, "balance_loss_clip": 1.01466393, "balance_loss_mlp": 1.01514912, "epoch": 0.9233729144746731, "flos": 25260707738880.0, "grad_norm": 1.4335072994641604, "language_loss": 0.70439249, "learning_rate": 6.123218014662956e-08, "loss": 0.72532976, "num_input_tokens_seen": 331345885, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37890625, "step": 15358, "time_per_iteration": 2.4120168685913086 }, { "auxiliary_loss_clip": 0.01052315, "auxiliary_loss_mlp": 0.0103411, "balance_loss_clip": 1.01199722, "balance_loss_mlp": 1.01645374, "epoch": 0.9234330377273411, "flos": 27848749536000.0, "grad_norm": 4.701293079685919, "language_loss": 0.74734831, "learning_rate": 6.113658456457104e-08, "loss": 0.76821256, "num_input_tokens_seen": 331364320, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.359375, "step": 15359, "time_per_iteration": 2.4119837284088135 }, { "auxiliary_loss_clip": 0.01053301, "auxiliary_loss_mlp": 0.01040653, "balance_loss_clip": 1.01766956, "balance_loss_mlp": 1.01688063, "epoch": 0.923493160980009, "flos": 24607203310080.0, "grad_norm": 2.312420887571466, "language_loss": 0.66212904, "learning_rate": 6.104106250440732e-08, "loss": 0.68306863, "num_input_tokens_seen": 331384135, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.36328125, "step": 15360, "time_per_iteration": 3.6597657203674316 }, { "auxiliary_loss_clip": 0.01006535, "auxiliary_loss_mlp": 0.01001545, "balance_loss_clip": 0.99969691, "balance_loss_mlp": 1.00033927, "epoch": 0.923553284232677, "flos": 67697459832960.0, "grad_norm": 0.7672192305093569, "language_loss": 0.55288333, "learning_rate": 6.094561396976083e-08, "loss": 0.57296419, "num_input_tokens_seen": 331440645, "router_z_loss_clip": 0.01843262, "router_z_loss_mlp": 0.06201172, "step": 15361, "time_per_iteration": 2.966583251953125 }, { "auxiliary_loss_clip": 0.01052803, "auxiliary_loss_mlp": 0.01032063, "balance_loss_clip": 1.00910294, "balance_loss_mlp": 1.01583314, "epoch": 0.9236134074853449, "flos": 18806244468480.0, "grad_norm": 1.7203801604272766, "language_loss": 0.70901269, "learning_rate": 6.085023896425112e-08, "loss": 0.72986132, "num_input_tokens_seen": 331459580, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36914062, "step": 15362, "time_per_iteration": 2.3445169925689697 }, { "auxiliary_loss_clip": 0.01054248, "auxiliary_loss_mlp": 0.01040804, "balance_loss_clip": 1.01454234, "balance_loss_mlp": 1.01693535, "epoch": 0.923673530738013, "flos": 27781122499200.0, "grad_norm": 1.4217139723969487, "language_loss": 0.76357347, "learning_rate": 6.075493749149463e-08, "loss": 0.78452402, "num_input_tokens_seen": 331481560, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37304688, "step": 15363, "time_per_iteration": 2.4201204776763916 }, { "auxiliary_loss_clip": 0.01050694, "auxiliary_loss_mlp": 0.0103607, "balance_loss_clip": 1.01429009, "balance_loss_mlp": 1.01519775, "epoch": 0.9237336539906809, "flos": 26795908442880.0, "grad_norm": 3.375166383724164, "language_loss": 0.84050894, "learning_rate": 6.065970955510514e-08, "loss": 0.86137658, "num_input_tokens_seen": 331499090, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.35546875, "step": 15364, "time_per_iteration": 2.390108346939087 }, { "auxiliary_loss_clip": 0.0105025, "auxiliary_loss_mlp": 0.01032744, "balance_loss_clip": 1.01244211, "balance_loss_mlp": 1.01607978, "epoch": 0.9237937772433489, "flos": 23586552357120.0, "grad_norm": 1.4191354794287796, "language_loss": 0.68537056, "learning_rate": 6.056455515869419e-08, "loss": 0.70620048, "num_input_tokens_seen": 331519420, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.34179688, "step": 15365, "time_per_iteration": 2.3936142921447754 }, { "auxiliary_loss_clip": 0.0105189, "auxiliary_loss_mlp": 0.0103634, "balance_loss_clip": 1.01359439, "balance_loss_mlp": 1.01588392, "epoch": 0.9238539004960168, "flos": 26139366725760.0, "grad_norm": 1.8667158439718592, "language_loss": 0.64006162, "learning_rate": 6.046947430586913e-08, "loss": 0.66094398, "num_input_tokens_seen": 331538720, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.359375, "step": 15366, "time_per_iteration": 2.4121859073638916 }, { "auxiliary_loss_clip": 0.01051608, "auxiliary_loss_mlp": 0.01037716, "balance_loss_clip": 1.01407695, "balance_loss_mlp": 1.01622248, "epoch": 0.9239140237486848, "flos": 21066975469440.0, "grad_norm": 1.436527583184416, "language_loss": 0.7575044, "learning_rate": 6.037446700023619e-08, "loss": 0.77839768, "num_input_tokens_seen": 331558505, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35351562, "step": 15367, "time_per_iteration": 2.370427131652832 }, { "auxiliary_loss_clip": 0.01049678, "auxiliary_loss_mlp": 0.01031369, "balance_loss_clip": 1.01048374, "balance_loss_mlp": 1.01602423, "epoch": 0.9239741470013527, "flos": 24606784373760.0, "grad_norm": 5.659362194928722, "language_loss": 0.65844148, "learning_rate": 6.027953324539759e-08, "loss": 0.67925197, "num_input_tokens_seen": 331578440, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.3359375, "step": 15368, "time_per_iteration": 2.385650157928467 }, { "auxiliary_loss_clip": 0.01054669, "auxiliary_loss_mlp": 0.01036475, "balance_loss_clip": 1.01262116, "balance_loss_mlp": 1.01659548, "epoch": 0.9240342702540207, "flos": 24717074958720.0, "grad_norm": 1.9885917727497393, "language_loss": 0.76530981, "learning_rate": 6.018467304495401e-08, "loss": 0.78622127, "num_input_tokens_seen": 331598945, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.38085938, "step": 15369, "time_per_iteration": 2.430224895477295 }, { "auxiliary_loss_clip": 0.0105561, "auxiliary_loss_mlp": 0.01048351, "balance_loss_clip": 1.02162457, "balance_loss_mlp": 1.01739526, "epoch": 0.9240943935066888, "flos": 20848942828800.0, "grad_norm": 1.8007990684839255, "language_loss": 0.77750194, "learning_rate": 6.008988640250145e-08, "loss": 0.79854155, "num_input_tokens_seen": 331616700, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.3828125, "step": 15370, "time_per_iteration": 3.796916961669922 }, { "auxiliary_loss_clip": 0.01052338, "auxiliary_loss_mlp": 0.01044009, "balance_loss_clip": 1.02164507, "balance_loss_mlp": 1.01709318, "epoch": 0.9241545167593567, "flos": 24461161626240.0, "grad_norm": 2.116223279410189, "language_loss": 0.67919368, "learning_rate": 5.999517332163528e-08, "loss": 0.70015717, "num_input_tokens_seen": 331635625, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3515625, "step": 15371, "time_per_iteration": 3.719233751296997 }, { "auxiliary_loss_clip": 0.01007398, "auxiliary_loss_mlp": 0.01002414, "balance_loss_clip": 1.00036323, "balance_loss_mlp": 1.00106251, "epoch": 0.9242146400120247, "flos": 61823951452800.0, "grad_norm": 0.7266312487718359, "language_loss": 0.5779953, "learning_rate": 5.99005338059464e-08, "loss": 0.59809339, "num_input_tokens_seen": 331698595, "router_z_loss_clip": 0.02050781, "router_z_loss_mlp": 0.06347656, "step": 15372, "time_per_iteration": 2.9654109477996826 }, { "auxiliary_loss_clip": 0.0105065, "auxiliary_loss_mlp": 0.01036966, "balance_loss_clip": 1.01565123, "balance_loss_mlp": 1.01654744, "epoch": 0.9242747632646926, "flos": 22047476492160.0, "grad_norm": 2.228563726568278, "language_loss": 0.71583098, "learning_rate": 5.98059678590237e-08, "loss": 0.73670709, "num_input_tokens_seen": 331717975, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.33984375, "step": 15373, "time_per_iteration": 2.3642055988311768 }, { "auxiliary_loss_clip": 0.01051492, "auxiliary_loss_mlp": 0.0104179, "balance_loss_clip": 1.01821065, "balance_loss_mlp": 1.01554477, "epoch": 0.9243348865173606, "flos": 18477362661120.0, "grad_norm": 2.311385518942066, "language_loss": 0.76251829, "learning_rate": 5.971147548445299e-08, "loss": 0.78345108, "num_input_tokens_seen": 331737220, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.359375, "step": 15374, "time_per_iteration": 2.384127378463745 }, { "auxiliary_loss_clip": 0.01052451, "auxiliary_loss_mlp": 0.01036464, "balance_loss_clip": 1.01348066, "balance_loss_mlp": 1.01661873, "epoch": 0.9243950097700285, "flos": 23257635638400.0, "grad_norm": 1.8588280215934128, "language_loss": 0.66339254, "learning_rate": 5.961705668581784e-08, "loss": 0.68428171, "num_input_tokens_seen": 331757300, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.359375, "step": 15375, "time_per_iteration": 2.3879010677337646 }, { "auxiliary_loss_clip": 0.01052167, "auxiliary_loss_mlp": 0.01035119, "balance_loss_clip": 1.01307702, "balance_loss_mlp": 1.0171628, "epoch": 0.9244551330226966, "flos": 29747884919040.0, "grad_norm": 1.8533380822029064, "language_loss": 0.67733037, "learning_rate": 5.952271146669829e-08, "loss": 0.69820321, "num_input_tokens_seen": 331776995, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34765625, "step": 15376, "time_per_iteration": 2.501324415206909 }, { "auxiliary_loss_clip": 0.01007069, "auxiliary_loss_mlp": 0.01001488, "balance_loss_clip": 0.99975967, "balance_loss_mlp": 1.00057745, "epoch": 0.9245152562753645, "flos": 68861569029120.0, "grad_norm": 0.6508527699562395, "language_loss": 0.61250848, "learning_rate": 5.94284398306717e-08, "loss": 0.63259411, "num_input_tokens_seen": 331845015, "router_z_loss_clip": 0.01733398, "router_z_loss_mlp": 0.06494141, "step": 15377, "time_per_iteration": 3.0804834365844727 }, { "auxiliary_loss_clip": 0.01051927, "auxiliary_loss_mlp": 0.01038005, "balance_loss_clip": 1.01499808, "balance_loss_mlp": 1.01598859, "epoch": 0.9245753795280325, "flos": 21578208641280.0, "grad_norm": 1.9839719171774826, "language_loss": 0.75093937, "learning_rate": 5.933424178131341e-08, "loss": 0.77183867, "num_input_tokens_seen": 331862795, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.359375, "step": 15378, "time_per_iteration": 2.4066317081451416 }, { "auxiliary_loss_clip": 0.01052139, "auxiliary_loss_mlp": 0.01039573, "balance_loss_clip": 1.01551628, "balance_loss_mlp": 1.01612437, "epoch": 0.9246355027807004, "flos": 34494641124480.0, "grad_norm": 2.048759237344714, "language_loss": 0.63258094, "learning_rate": 5.924011732219503e-08, "loss": 0.65349805, "num_input_tokens_seen": 331882535, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.359375, "step": 15379, "time_per_iteration": 2.5653374195098877 }, { "auxiliary_loss_clip": 0.01051106, "auxiliary_loss_mlp": 0.01040038, "balance_loss_clip": 1.01742435, "balance_loss_mlp": 1.01613283, "epoch": 0.9246956260333684, "flos": 15953142562560.0, "grad_norm": 2.142279046918483, "language_loss": 0.85817474, "learning_rate": 5.914606645688591e-08, "loss": 0.8790862, "num_input_tokens_seen": 331899335, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34960938, "step": 15380, "time_per_iteration": 2.368302345275879 }, { "auxiliary_loss_clip": 0.01053058, "auxiliary_loss_mlp": 0.010415, "balance_loss_clip": 1.01577497, "balance_loss_mlp": 1.0154469, "epoch": 0.9247557492860363, "flos": 23367227996160.0, "grad_norm": 1.6205207070230894, "language_loss": 0.74485242, "learning_rate": 5.905208918895233e-08, "loss": 0.76579797, "num_input_tokens_seen": 331919030, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37695312, "step": 15381, "time_per_iteration": 2.3691210746765137 }, { "auxiliary_loss_clip": 0.01052675, "auxiliary_loss_mlp": 0.01036314, "balance_loss_clip": 1.01263928, "balance_loss_mlp": 1.01701248, "epoch": 0.9248158725387043, "flos": 23038730213760.0, "grad_norm": 1.8799708907307389, "language_loss": 0.79396981, "learning_rate": 5.8958185521958524e-08, "loss": 0.81485969, "num_input_tokens_seen": 331936465, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35742188, "step": 15382, "time_per_iteration": 2.3785715103149414 }, { "auxiliary_loss_clip": 0.01051708, "auxiliary_loss_mlp": 0.010366, "balance_loss_clip": 1.01311588, "balance_loss_mlp": 1.01570034, "epoch": 0.9248759957913724, "flos": 22521492288000.0, "grad_norm": 1.6668081396769785, "language_loss": 0.75908482, "learning_rate": 5.886435545946455e-08, "loss": 0.77996784, "num_input_tokens_seen": 331954625, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.359375, "step": 15383, "time_per_iteration": 3.8243911266326904 }, { "auxiliary_loss_clip": 0.01049249, "auxiliary_loss_mlp": 0.01034589, "balance_loss_clip": 1.01226127, "balance_loss_mlp": 1.01464558, "epoch": 0.9249361190440403, "flos": 25446096391680.0, "grad_norm": 1.6934675354556872, "language_loss": 0.76377821, "learning_rate": 5.8770599005028456e-08, "loss": 0.78461659, "num_input_tokens_seen": 331975865, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34570312, "step": 15384, "time_per_iteration": 2.408388137817383 }, { "auxiliary_loss_clip": 0.01049118, "auxiliary_loss_mlp": 0.01031931, "balance_loss_clip": 1.01041365, "balance_loss_mlp": 1.01541591, "epoch": 0.9249962422967083, "flos": 12378001495680.0, "grad_norm": 1.9464963292939774, "language_loss": 0.67370623, "learning_rate": 5.8676916162206045e-08, "loss": 0.69451678, "num_input_tokens_seen": 331992760, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.33789062, "step": 15385, "time_per_iteration": 2.355088710784912 }, { "auxiliary_loss_clip": 0.01049912, "auxiliary_loss_mlp": 0.01038158, "balance_loss_clip": 1.01550841, "balance_loss_mlp": 1.01502538, "epoch": 0.9250563655493762, "flos": 22928334894720.0, "grad_norm": 1.8035753415961924, "language_loss": 0.80839694, "learning_rate": 5.85833069345496e-08, "loss": 0.82927763, "num_input_tokens_seen": 332011890, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34960938, "step": 15386, "time_per_iteration": 2.3712542057037354 }, { "auxiliary_loss_clip": 0.01051051, "auxiliary_loss_mlp": 0.01040964, "balance_loss_clip": 1.01888633, "balance_loss_mlp": 1.0165894, "epoch": 0.9251164888020442, "flos": 18477676863360.0, "grad_norm": 3.367762621418299, "language_loss": 0.77017665, "learning_rate": 5.8489771325608504e-08, "loss": 0.79109681, "num_input_tokens_seen": 332029485, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34375, "step": 15387, "time_per_iteration": 2.3631856441497803 }, { "auxiliary_loss_clip": 0.01048639, "auxiliary_loss_mlp": 0.01032547, "balance_loss_clip": 1.01219797, "balance_loss_mlp": 1.01483572, "epoch": 0.9251766120547121, "flos": 33035655651840.0, "grad_norm": 1.2868056173122322, "language_loss": 0.70752704, "learning_rate": 5.839630933893014e-08, "loss": 0.72833896, "num_input_tokens_seen": 332052970, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.33789062, "step": 15388, "time_per_iteration": 2.5433900356292725 }, { "auxiliary_loss_clip": 0.01053419, "auxiliary_loss_mlp": 0.01036651, "balance_loss_clip": 1.0134176, "balance_loss_mlp": 1.01689887, "epoch": 0.9252367353073802, "flos": 24386796696960.0, "grad_norm": 1.741957707054808, "language_loss": 0.82879961, "learning_rate": 5.8302920978058115e-08, "loss": 0.84970033, "num_input_tokens_seen": 332070395, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36523438, "step": 15389, "time_per_iteration": 2.392090082168579 }, { "auxiliary_loss_clip": 0.01054892, "auxiliary_loss_mlp": 0.01041737, "balance_loss_clip": 1.01635766, "balance_loss_mlp": 1.01716936, "epoch": 0.9252968585600481, "flos": 18915836826240.0, "grad_norm": 1.6914358741267246, "language_loss": 0.80171055, "learning_rate": 5.820960624653381e-08, "loss": 0.82267684, "num_input_tokens_seen": 332090185, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37890625, "step": 15390, "time_per_iteration": 2.4385111331939697 }, { "auxiliary_loss_clip": 0.01052658, "auxiliary_loss_mlp": 0.01038086, "balance_loss_clip": 1.01450682, "balance_loss_mlp": 1.01627254, "epoch": 0.9253569818127161, "flos": 21724285236480.0, "grad_norm": 1.8153392875505523, "language_loss": 0.77316666, "learning_rate": 5.811636514789597e-08, "loss": 0.79407406, "num_input_tokens_seen": 332109050, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36328125, "step": 15391, "time_per_iteration": 2.5704450607299805 }, { "auxiliary_loss_clip": 0.0105109, "auxiliary_loss_mlp": 0.01035386, "balance_loss_clip": 1.0104351, "balance_loss_mlp": 1.0149945, "epoch": 0.925417105065384, "flos": 34238937260160.0, "grad_norm": 2.890257705646562, "language_loss": 0.54029715, "learning_rate": 5.80231976856802e-08, "loss": 0.56116194, "num_input_tokens_seen": 332131180, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.359375, "step": 15392, "time_per_iteration": 2.5047607421875 }, { "auxiliary_loss_clip": 0.01050254, "auxiliary_loss_mlp": 0.01033572, "balance_loss_clip": 1.01173282, "balance_loss_mlp": 1.01487362, "epoch": 0.925477228318052, "flos": 25958307081600.0, "grad_norm": 1.851322463352318, "language_loss": 0.77963686, "learning_rate": 5.7930103863419454e-08, "loss": 0.80047506, "num_input_tokens_seen": 332149555, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.35351562, "step": 15393, "time_per_iteration": 2.4171814918518066 }, { "auxiliary_loss_clip": 0.01049875, "auxiliary_loss_mlp": 0.01040208, "balance_loss_clip": 1.01822543, "balance_loss_mlp": 1.01510215, "epoch": 0.9255373515707199, "flos": 11837440915200.0, "grad_norm": 1.7486199821786865, "language_loss": 0.70659971, "learning_rate": 5.783708368464357e-08, "loss": 0.72750056, "num_input_tokens_seen": 332165830, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34765625, "step": 15394, "time_per_iteration": 2.350252389907837 }, { "auxiliary_loss_clip": 0.01052049, "auxiliary_loss_mlp": 0.01041327, "balance_loss_clip": 1.01796246, "balance_loss_mlp": 1.01707721, "epoch": 0.925597474823388, "flos": 21433249209600.0, "grad_norm": 5.38686037721917, "language_loss": 0.7392469, "learning_rate": 5.7744137152879956e-08, "loss": 0.76018071, "num_input_tokens_seen": 332185130, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3515625, "step": 15395, "time_per_iteration": 2.392467498779297 }, { "auxiliary_loss_clip": 0.01048765, "auxiliary_loss_mlp": 0.01032191, "balance_loss_clip": 1.01081681, "balance_loss_mlp": 1.01443768, "epoch": 0.925657598076056, "flos": 22856448672000.0, "grad_norm": 2.2880539839914182, "language_loss": 0.73003727, "learning_rate": 5.7651264271653785e-08, "loss": 0.75084686, "num_input_tokens_seen": 332203695, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34375, "step": 15396, "time_per_iteration": 2.382709264755249 }, { "auxiliary_loss_clip": 0.01051195, "auxiliary_loss_mlp": 0.01038829, "balance_loss_clip": 1.01600039, "balance_loss_mlp": 1.0159514, "epoch": 0.9257177213287239, "flos": 25702812685440.0, "grad_norm": 1.7709636529205373, "language_loss": 0.88119841, "learning_rate": 5.755846504448603e-08, "loss": 0.90209866, "num_input_tokens_seen": 332224850, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3515625, "step": 15397, "time_per_iteration": 2.395958662033081 }, { "auxiliary_loss_clip": 0.01006897, "auxiliary_loss_mlp": 0.01002267, "balance_loss_clip": 1.00027657, "balance_loss_mlp": 1.00058401, "epoch": 0.9257778445813919, "flos": 59589929508480.0, "grad_norm": 0.8069971643774221, "language_loss": 0.55266517, "learning_rate": 5.746573947489586e-08, "loss": 0.57275683, "num_input_tokens_seen": 332278085, "router_z_loss_clip": 0.01989746, "router_z_loss_mlp": 0.06347656, "step": 15398, "time_per_iteration": 2.882063865661621 }, { "auxiliary_loss_clip": 0.01056693, "auxiliary_loss_mlp": 0.01037475, "balance_loss_clip": 1.01146388, "balance_loss_mlp": 1.01711762, "epoch": 0.9258379678340598, "flos": 27708188935680.0, "grad_norm": 1.8838102242928636, "language_loss": 0.77997482, "learning_rate": 5.7373087566400025e-08, "loss": 0.80091643, "num_input_tokens_seen": 332297875, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.39453125, "step": 15399, "time_per_iteration": 2.399207592010498 }, { "auxiliary_loss_clip": 0.01049569, "auxiliary_loss_mlp": 0.01034969, "balance_loss_clip": 1.01354742, "balance_loss_mlp": 1.01541042, "epoch": 0.9258980910867278, "flos": 24862383504000.0, "grad_norm": 1.438230453056525, "language_loss": 0.79200292, "learning_rate": 5.7280509322510826e-08, "loss": 0.81284833, "num_input_tokens_seen": 332318500, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34179688, "step": 15400, "time_per_iteration": 3.648439407348633 }, { "auxiliary_loss_clip": 0.01007185, "auxiliary_loss_mlp": 0.01003014, "balance_loss_clip": 1.0010829, "balance_loss_mlp": 1.00075936, "epoch": 0.9259582143393957, "flos": 63131414158080.0, "grad_norm": 0.7146864517387549, "language_loss": 0.5129751, "learning_rate": 5.718800474673946e-08, "loss": 0.53307712, "num_input_tokens_seen": 332381980, "router_z_loss_clip": 0.01928711, "router_z_loss_mlp": 0.06445312, "step": 15401, "time_per_iteration": 2.9916491508483887 }, { "auxiliary_loss_clip": 0.01049646, "auxiliary_loss_mlp": 0.01037087, "balance_loss_clip": 1.01633239, "balance_loss_mlp": 1.01626635, "epoch": 0.9260183375920638, "flos": 24126170330880.0, "grad_norm": 2.118591962538782, "language_loss": 0.83353126, "learning_rate": 5.709557384259378e-08, "loss": 0.85439849, "num_input_tokens_seen": 332399510, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.33398438, "step": 15402, "time_per_iteration": 2.405545711517334 }, { "auxiliary_loss_clip": 0.0100681, "auxiliary_loss_mlp": 0.01005714, "balance_loss_clip": 1.00355637, "balance_loss_mlp": 1.00042844, "epoch": 0.9260784608447317, "flos": 63039207657600.0, "grad_norm": 0.7436769847149317, "language_loss": 0.51193005, "learning_rate": 5.700321661357876e-08, "loss": 0.53205532, "num_input_tokens_seen": 332459130, "router_z_loss_clip": 0.02160645, "router_z_loss_mlp": 0.06396484, "step": 15403, "time_per_iteration": 3.1323418617248535 }, { "auxiliary_loss_clip": 0.01006666, "auxiliary_loss_mlp": 0.01003017, "balance_loss_clip": 1.00119352, "balance_loss_mlp": 1.00041938, "epoch": 0.9261385840973997, "flos": 70582367854080.0, "grad_norm": 0.6865232341886973, "language_loss": 0.58810252, "learning_rate": 5.69109330631965e-08, "loss": 0.60819936, "num_input_tokens_seen": 332526555, "router_z_loss_clip": 0.01818848, "router_z_loss_mlp": 0.0625, "step": 15404, "time_per_iteration": 3.0514886379241943 }, { "auxiliary_loss_clip": 0.01052362, "auxiliary_loss_mlp": 0.010357, "balance_loss_clip": 1.0112859, "balance_loss_mlp": 1.01547813, "epoch": 0.9261987073500676, "flos": 20228885349120.0, "grad_norm": 2.692854339325013, "language_loss": 0.73773986, "learning_rate": 5.681872319494596e-08, "loss": 0.75862044, "num_input_tokens_seen": 332544005, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36914062, "step": 15405, "time_per_iteration": 2.371661424636841 }, { "auxiliary_loss_clip": 0.01054052, "auxiliary_loss_mlp": 0.01047379, "balance_loss_clip": 1.02294123, "balance_loss_mlp": 1.01711726, "epoch": 0.9262588306027356, "flos": 20953263571200.0, "grad_norm": 1.8054808332045595, "language_loss": 0.69289929, "learning_rate": 5.672658701232458e-08, "loss": 0.71391356, "num_input_tokens_seen": 332563070, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.36914062, "step": 15406, "time_per_iteration": 2.37551212310791 }, { "auxiliary_loss_clip": 0.01051057, "auxiliary_loss_mlp": 0.01041671, "balance_loss_clip": 1.01755476, "balance_loss_mlp": 1.01511145, "epoch": 0.9263189538554035, "flos": 22157732165760.0, "grad_norm": 2.610657567832657, "language_loss": 0.77398837, "learning_rate": 5.663452451882555e-08, "loss": 0.79491568, "num_input_tokens_seen": 332579620, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.359375, "step": 15407, "time_per_iteration": 2.3745970726013184 }, { "auxiliary_loss_clip": 0.0105263, "auxiliary_loss_mlp": 0.0104233, "balance_loss_clip": 1.01649714, "balance_loss_mlp": 1.01500833, "epoch": 0.9263790771080715, "flos": 18186221900160.0, "grad_norm": 2.3106702256213913, "language_loss": 0.73375511, "learning_rate": 5.6542535717940096e-08, "loss": 0.75470471, "num_input_tokens_seen": 332597795, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37695312, "step": 15408, "time_per_iteration": 2.3496766090393066 }, { "auxiliary_loss_clip": 0.0105013, "auxiliary_loss_mlp": 0.01034251, "balance_loss_clip": 1.01317441, "balance_loss_mlp": 1.0157671, "epoch": 0.9264392003607396, "flos": 48176718923520.0, "grad_norm": 1.7205268593334944, "language_loss": 0.69789755, "learning_rate": 5.645062061315675e-08, "loss": 0.71874136, "num_input_tokens_seen": 332620375, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34375, "step": 15409, "time_per_iteration": 2.6112823486328125 }, { "auxiliary_loss_clip": 0.01053583, "auxiliary_loss_mlp": 0.01039482, "balance_loss_clip": 1.01286268, "balance_loss_mlp": 1.01664913, "epoch": 0.9264993236134075, "flos": 26388437431680.0, "grad_norm": 3.9331721856542403, "language_loss": 0.76717454, "learning_rate": 5.6358779207960506e-08, "loss": 0.78810519, "num_input_tokens_seen": 332639510, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.36914062, "step": 15410, "time_per_iteration": 5.315945625305176 }, { "auxiliary_loss_clip": 0.01052609, "auxiliary_loss_mlp": 0.01035395, "balance_loss_clip": 1.01167285, "balance_loss_mlp": 1.01622713, "epoch": 0.9265594468660755, "flos": 20919118394880.0, "grad_norm": 1.598637739146034, "language_loss": 0.82640588, "learning_rate": 5.6267011505833905e-08, "loss": 0.84728587, "num_input_tokens_seen": 332658350, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.36328125, "step": 15411, "time_per_iteration": 2.383453607559204 }, { "auxiliary_loss_clip": 0.01053525, "auxiliary_loss_mlp": 0.01035597, "balance_loss_clip": 1.01320982, "balance_loss_mlp": 1.01812458, "epoch": 0.9266195701187434, "flos": 17524199099520.0, "grad_norm": 1.7979583270701456, "language_loss": 0.762218, "learning_rate": 5.617531751025728e-08, "loss": 0.78310925, "num_input_tokens_seen": 332676715, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35351562, "step": 15412, "time_per_iteration": 2.374799966812134 }, { "auxiliary_loss_clip": 0.01051711, "auxiliary_loss_mlp": 0.01036328, "balance_loss_clip": 1.01329708, "balance_loss_mlp": 1.01530325, "epoch": 0.9266796933714114, "flos": 33687449424000.0, "grad_norm": 1.622815615234456, "language_loss": 0.67649156, "learning_rate": 5.6083697224707406e-08, "loss": 0.69737196, "num_input_tokens_seen": 332701470, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36328125, "step": 15413, "time_per_iteration": 2.483412265777588 }, { "auxiliary_loss_clip": 0.01052695, "auxiliary_loss_mlp": 0.0103893, "balance_loss_clip": 1.01519489, "balance_loss_mlp": 1.01579916, "epoch": 0.9267398166240793, "flos": 18915522624000.0, "grad_norm": 1.6259275129236803, "language_loss": 0.76539111, "learning_rate": 5.5992150652658167e-08, "loss": 0.78630739, "num_input_tokens_seen": 332719060, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.3671875, "step": 15414, "time_per_iteration": 2.3618295192718506 }, { "auxiliary_loss_clip": 0.01052381, "auxiliary_loss_mlp": 0.0103172, "balance_loss_clip": 1.00990438, "balance_loss_mlp": 1.01657212, "epoch": 0.9267999398767474, "flos": 20478095700480.0, "grad_norm": 2.2922434583233007, "language_loss": 0.82255268, "learning_rate": 5.59006777975819e-08, "loss": 0.84339368, "num_input_tokens_seen": 332736345, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.35742188, "step": 15415, "time_per_iteration": 2.347046375274658 }, { "auxiliary_loss_clip": 0.01052282, "auxiliary_loss_mlp": 0.01036731, "balance_loss_clip": 1.01394975, "balance_loss_mlp": 1.01634526, "epoch": 0.9268600631294153, "flos": 24788228042880.0, "grad_norm": 1.575483710711171, "language_loss": 0.55063933, "learning_rate": 5.580927866294671e-08, "loss": 0.57152951, "num_input_tokens_seen": 332756270, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.359375, "step": 15416, "time_per_iteration": 2.416163921356201 }, { "auxiliary_loss_clip": 0.01050656, "auxiliary_loss_mlp": 0.01034204, "balance_loss_clip": 1.01168585, "balance_loss_mlp": 1.01586938, "epoch": 0.9269201863820833, "flos": 18696198263040.0, "grad_norm": 1.9862017910436238, "language_loss": 0.72507811, "learning_rate": 5.571795325221807e-08, "loss": 0.74592674, "num_input_tokens_seen": 332775185, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34765625, "step": 15417, "time_per_iteration": 2.3417580127716064 }, { "auxiliary_loss_clip": 0.0105163, "auxiliary_loss_mlp": 0.01035967, "balance_loss_clip": 1.0132103, "balance_loss_mlp": 1.01630807, "epoch": 0.9269803096347512, "flos": 20922923733120.0, "grad_norm": 2.0874330332415933, "language_loss": 0.76749027, "learning_rate": 5.5626701568859624e-08, "loss": 0.7883662, "num_input_tokens_seen": 332794320, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35351562, "step": 15418, "time_per_iteration": 2.3812272548675537 }, { "auxiliary_loss_clip": 0.01050027, "auxiliary_loss_mlp": 0.01034036, "balance_loss_clip": 1.01098108, "balance_loss_mlp": 1.01563239, "epoch": 0.9270404328874192, "flos": 28001424378240.0, "grad_norm": 1.4662870990666435, "language_loss": 0.77172709, "learning_rate": 5.553552361633174e-08, "loss": 0.79256773, "num_input_tokens_seen": 332818095, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.34375, "step": 15419, "time_per_iteration": 2.447937488555908 }, { "auxiliary_loss_clip": 0.01047821, "auxiliary_loss_mlp": 0.01038446, "balance_loss_clip": 1.01884818, "balance_loss_mlp": 1.01493824, "epoch": 0.9271005561400871, "flos": 25888550451840.0, "grad_norm": 1.6392138078250358, "language_loss": 0.76819646, "learning_rate": 5.5444419398091636e-08, "loss": 0.78905916, "num_input_tokens_seen": 332839860, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.328125, "step": 15420, "time_per_iteration": 2.450503349304199 }, { "auxiliary_loss_clip": 0.01053285, "auxiliary_loss_mlp": 0.0103646, "balance_loss_clip": 1.01290464, "balance_loss_mlp": 1.01628911, "epoch": 0.9271606793927551, "flos": 27052659648000.0, "grad_norm": 1.613134989320184, "language_loss": 0.77285206, "learning_rate": 5.535338891759389e-08, "loss": 0.79374957, "num_input_tokens_seen": 332861155, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36914062, "step": 15421, "time_per_iteration": 2.4176900386810303 }, { "auxiliary_loss_clip": 0.01052022, "auxiliary_loss_mlp": 0.01033951, "balance_loss_clip": 1.01200438, "balance_loss_mlp": 1.01577652, "epoch": 0.9272208026454232, "flos": 26208774241920.0, "grad_norm": 2.21728234999961, "language_loss": 0.73905486, "learning_rate": 5.526243217829041e-08, "loss": 0.75991458, "num_input_tokens_seen": 332881110, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.36328125, "step": 15422, "time_per_iteration": 3.849534273147583 }, { "auxiliary_loss_clip": 0.01052487, "auxiliary_loss_mlp": 0.01043074, "balance_loss_clip": 1.01906562, "balance_loss_mlp": 1.01595449, "epoch": 0.9272809258980911, "flos": 12457638040320.0, "grad_norm": 2.1981632777477604, "language_loss": 0.79276204, "learning_rate": 5.517154918363065e-08, "loss": 0.8137176, "num_input_tokens_seen": 332899350, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36523438, "step": 15423, "time_per_iteration": 2.3573899269104004 }, { "auxiliary_loss_clip": 0.01051631, "auxiliary_loss_mlp": 0.01040397, "balance_loss_clip": 1.01644802, "balance_loss_mlp": 1.01537919, "epoch": 0.9273410491507591, "flos": 22855785356160.0, "grad_norm": 1.732772081427573, "language_loss": 0.76249588, "learning_rate": 5.508073993706053e-08, "loss": 0.78341615, "num_input_tokens_seen": 332918105, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36328125, "step": 15424, "time_per_iteration": 2.3975846767425537 }, { "auxiliary_loss_clip": 0.01006741, "auxiliary_loss_mlp": 0.01002494, "balance_loss_clip": 1.0004437, "balance_loss_mlp": 1.00037718, "epoch": 0.927401172403427, "flos": 47662621153920.0, "grad_norm": 0.7776832360495015, "language_loss": 0.60772258, "learning_rate": 5.499000444202351e-08, "loss": 0.62781495, "num_input_tokens_seen": 332969490, "router_z_loss_clip": 0.02050781, "router_z_loss_mlp": 0.06347656, "step": 15425, "time_per_iteration": 2.868772506713867 }, { "auxiliary_loss_clip": 0.01052059, "auxiliary_loss_mlp": 0.01039988, "balance_loss_clip": 1.01656365, "balance_loss_mlp": 1.01575065, "epoch": 0.927461295656095, "flos": 29971049529600.0, "grad_norm": 1.400017919067707, "language_loss": 0.71744585, "learning_rate": 5.489934270196106e-08, "loss": 0.73836625, "num_input_tokens_seen": 332988805, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 15426, "time_per_iteration": 2.455390691757202 }, { "auxiliary_loss_clip": 0.01051395, "auxiliary_loss_mlp": 0.01032504, "balance_loss_clip": 1.01120079, "balance_loss_mlp": 1.01630795, "epoch": 0.9275214189087629, "flos": 20374403362560.0, "grad_norm": 1.9270698068457937, "language_loss": 0.83923584, "learning_rate": 5.480875472030977e-08, "loss": 0.86007482, "num_input_tokens_seen": 333007960, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.3515625, "step": 15427, "time_per_iteration": 2.353458881378174 }, { "auxiliary_loss_clip": 0.01051556, "auxiliary_loss_mlp": 0.0103485, "balance_loss_clip": 1.01169932, "balance_loss_mlp": 1.01622701, "epoch": 0.927581542161431, "flos": 22382083762560.0, "grad_norm": 1.4449250408100502, "language_loss": 0.77626657, "learning_rate": 5.471824050050555e-08, "loss": 0.7971307, "num_input_tokens_seen": 333026035, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.35351562, "step": 15428, "time_per_iteration": 2.3825225830078125 }, { "auxiliary_loss_clip": 0.01050374, "auxiliary_loss_mlp": 0.01034312, "balance_loss_clip": 1.01087594, "balance_loss_mlp": 1.01513839, "epoch": 0.9276416654140989, "flos": 23951289997440.0, "grad_norm": 1.936789431694983, "language_loss": 0.75383145, "learning_rate": 5.4627800045980555e-08, "loss": 0.77467829, "num_input_tokens_seen": 333045590, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3515625, "step": 15429, "time_per_iteration": 2.3745245933532715 }, { "auxiliary_loss_clip": 0.01049217, "auxiliary_loss_mlp": 0.01033171, "balance_loss_clip": 1.01180911, "balance_loss_mlp": 1.01507974, "epoch": 0.9277017886667669, "flos": 13916867892480.0, "grad_norm": 1.9235720600984634, "language_loss": 0.75799656, "learning_rate": 5.45374333601647e-08, "loss": 0.7788204, "num_input_tokens_seen": 333063355, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34179688, "step": 15430, "time_per_iteration": 2.3502449989318848 }, { "auxiliary_loss_clip": 0.01050917, "auxiliary_loss_mlp": 0.01035766, "balance_loss_clip": 1.01139927, "balance_loss_mlp": 1.01530695, "epoch": 0.9277619119194348, "flos": 35664929631360.0, "grad_norm": 1.4470820441503982, "language_loss": 0.77265114, "learning_rate": 5.444714044648391e-08, "loss": 0.79351801, "num_input_tokens_seen": 333088045, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.35546875, "step": 15431, "time_per_iteration": 2.498563766479492 }, { "auxiliary_loss_clip": 0.01051247, "auxiliary_loss_mlp": 0.01032799, "balance_loss_clip": 1.01184237, "balance_loss_mlp": 1.0165565, "epoch": 0.9278220351721028, "flos": 23840126628480.0, "grad_norm": 1.6577370778793363, "language_loss": 0.72058022, "learning_rate": 5.4356921308363e-08, "loss": 0.74142075, "num_input_tokens_seen": 333108005, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.34765625, "step": 15432, "time_per_iteration": 2.4000725746154785 }, { "auxiliary_loss_clip": 0.0105214, "auxiliary_loss_mlp": 0.01037562, "balance_loss_clip": 1.01463842, "balance_loss_mlp": 1.01602364, "epoch": 0.9278821584247707, "flos": 15227332974720.0, "grad_norm": 2.3157342044341527, "language_loss": 0.84305406, "learning_rate": 5.4266775949222354e-08, "loss": 0.86395109, "num_input_tokens_seen": 333124335, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.36132812, "step": 15433, "time_per_iteration": 2.3293073177337646 }, { "auxiliary_loss_clip": 0.01049308, "auxiliary_loss_mlp": 0.01032208, "balance_loss_clip": 1.01110744, "balance_loss_mlp": 1.01581335, "epoch": 0.9279422816774388, "flos": 24680241607680.0, "grad_norm": 1.8396776033261326, "language_loss": 0.67429137, "learning_rate": 5.417670437248056e-08, "loss": 0.69510651, "num_input_tokens_seen": 333143995, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.33398438, "step": 15434, "time_per_iteration": 2.426029682159424 }, { "auxiliary_loss_clip": 0.01047835, "auxiliary_loss_mlp": 0.01032572, "balance_loss_clip": 1.01221085, "balance_loss_mlp": 1.01455641, "epoch": 0.9280024049301068, "flos": 19168259022720.0, "grad_norm": 1.691954205271849, "language_loss": 0.69450068, "learning_rate": 5.40867065815529e-08, "loss": 0.71530473, "num_input_tokens_seen": 333162805, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.33203125, "step": 15435, "time_per_iteration": 2.3532016277313232 }, { "auxiliary_loss_clip": 0.01052078, "auxiliary_loss_mlp": 0.01034977, "balance_loss_clip": 1.01176727, "balance_loss_mlp": 1.01618838, "epoch": 0.9280625281827747, "flos": 11393101641600.0, "grad_norm": 1.8372593274515352, "language_loss": 0.7405771, "learning_rate": 5.399678257985263e-08, "loss": 0.76144767, "num_input_tokens_seen": 333175770, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.359375, "step": 15436, "time_per_iteration": 2.3422205448150635 }, { "auxiliary_loss_clip": 0.01051998, "auxiliary_loss_mlp": 0.01033549, "balance_loss_clip": 1.00977874, "balance_loss_mlp": 1.01671624, "epoch": 0.9281226514354427, "flos": 24784597261440.0, "grad_norm": 3.0381304843181396, "language_loss": 0.67702329, "learning_rate": 5.390693237078925e-08, "loss": 0.69787878, "num_input_tokens_seen": 333194775, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.35351562, "step": 15437, "time_per_iteration": 2.427947521209717 }, { "auxiliary_loss_clip": 0.01052774, "auxiliary_loss_mlp": 0.01039249, "balance_loss_clip": 1.01272464, "balance_loss_mlp": 1.01633668, "epoch": 0.9281827746881106, "flos": 15082303720320.0, "grad_norm": 2.251650347489714, "language_loss": 0.72843397, "learning_rate": 5.3817155957770254e-08, "loss": 0.74935418, "num_input_tokens_seen": 333208920, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.36328125, "step": 15438, "time_per_iteration": 2.3529088497161865 }, { "auxiliary_loss_clip": 0.01051463, "auxiliary_loss_mlp": 0.01033336, "balance_loss_clip": 1.01072168, "balance_loss_mlp": 1.0161643, "epoch": 0.9282428979407786, "flos": 24133885741440.0, "grad_norm": 2.5923800753187884, "language_loss": 0.65576768, "learning_rate": 5.3727453344199366e-08, "loss": 0.67661572, "num_input_tokens_seen": 333229350, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35351562, "step": 15439, "time_per_iteration": 3.6783390045166016 }, { "auxiliary_loss_clip": 0.01050968, "auxiliary_loss_mlp": 0.01034334, "balance_loss_clip": 1.0114336, "balance_loss_mlp": 1.01653361, "epoch": 0.9283030211934465, "flos": 24822163751040.0, "grad_norm": 1.7290096187845199, "language_loss": 0.71320701, "learning_rate": 5.363782453347876e-08, "loss": 0.73406005, "num_input_tokens_seen": 333246125, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34375, "step": 15440, "time_per_iteration": 2.4173314571380615 }, { "auxiliary_loss_clip": 0.01052674, "auxiliary_loss_mlp": 0.01037855, "balance_loss_clip": 1.01304746, "balance_loss_mlp": 1.01614499, "epoch": 0.9283631444461146, "flos": 23980093735680.0, "grad_norm": 1.6792720650071553, "language_loss": 0.77824235, "learning_rate": 5.354826952900682e-08, "loss": 0.79914767, "num_input_tokens_seen": 333263685, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36523438, "step": 15441, "time_per_iteration": 2.407503128051758 }, { "auxiliary_loss_clip": 0.01047569, "auxiliary_loss_mlp": 0.01033855, "balance_loss_clip": 1.01356542, "balance_loss_mlp": 1.01489997, "epoch": 0.9284232676987825, "flos": 22783410374400.0, "grad_norm": 1.8422604609753301, "language_loss": 0.64768559, "learning_rate": 5.345878833417949e-08, "loss": 0.66849983, "num_input_tokens_seen": 333282435, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.32617188, "step": 15442, "time_per_iteration": 2.394751787185669 }, { "auxiliary_loss_clip": 0.01053161, "auxiliary_loss_mlp": 0.01040687, "balance_loss_clip": 1.01833582, "balance_loss_mlp": 1.01589394, "epoch": 0.9284833909514505, "flos": 19499479891200.0, "grad_norm": 1.9244006890329577, "language_loss": 0.81939209, "learning_rate": 5.3369380952390295e-08, "loss": 0.8403306, "num_input_tokens_seen": 333300400, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.37109375, "step": 15443, "time_per_iteration": 2.3707332611083984 }, { "auxiliary_loss_clip": 0.01053042, "auxiliary_loss_mlp": 0.01035557, "balance_loss_clip": 1.01221609, "balance_loss_mlp": 1.0162189, "epoch": 0.9285435142041184, "flos": 23184841720320.0, "grad_norm": 2.4742626385671653, "language_loss": 0.66415119, "learning_rate": 5.328004738702896e-08, "loss": 0.68503714, "num_input_tokens_seen": 333318980, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.3671875, "step": 15444, "time_per_iteration": 2.395737648010254 }, { "auxiliary_loss_clip": 0.0105266, "auxiliary_loss_mlp": 0.01036955, "balance_loss_clip": 1.0143168, "balance_loss_mlp": 1.0159744, "epoch": 0.9286036374567864, "flos": 17674569792000.0, "grad_norm": 1.9684637082017338, "language_loss": 0.7422958, "learning_rate": 5.3190787641483215e-08, "loss": 0.76319194, "num_input_tokens_seen": 333334135, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.3671875, "step": 15445, "time_per_iteration": 2.3202152252197266 }, { "auxiliary_loss_clip": 0.01051036, "auxiliary_loss_mlp": 0.01039169, "balance_loss_clip": 1.01607811, "balance_loss_mlp": 1.01639783, "epoch": 0.9286637607094543, "flos": 20885636534400.0, "grad_norm": 2.167659046385479, "language_loss": 0.72436911, "learning_rate": 5.3101601719138135e-08, "loss": 0.74527121, "num_input_tokens_seen": 333353325, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.34765625, "step": 15446, "time_per_iteration": 2.3781802654266357 }, { "auxiliary_loss_clip": 0.01054213, "auxiliary_loss_mlp": 0.01038261, "balance_loss_clip": 1.0133462, "balance_loss_mlp": 1.01685739, "epoch": 0.9287238839621224, "flos": 19025010247680.0, "grad_norm": 1.72471336507201, "language_loss": 0.70301461, "learning_rate": 5.301248962337523e-08, "loss": 0.7239393, "num_input_tokens_seen": 333371110, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.375, "step": 15447, "time_per_iteration": 2.3611490726470947 }, { "auxiliary_loss_clip": 0.01048519, "auxiliary_loss_mlp": 0.01032006, "balance_loss_clip": 1.01128793, "balance_loss_mlp": 1.0158093, "epoch": 0.9287840072147904, "flos": 20556021588480.0, "grad_norm": 1.6207484922561115, "language_loss": 0.73703748, "learning_rate": 5.292345135757403e-08, "loss": 0.75784272, "num_input_tokens_seen": 333391420, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.328125, "step": 15448, "time_per_iteration": 2.3943426609039307 }, { "auxiliary_loss_clip": 0.01050945, "auxiliary_loss_mlp": 0.01038403, "balance_loss_clip": 1.01471615, "balance_loss_mlp": 1.01533377, "epoch": 0.9288441304674583, "flos": 21249780681600.0, "grad_norm": 1.5729282866943668, "language_loss": 0.7602458, "learning_rate": 5.283448692511072e-08, "loss": 0.78113925, "num_input_tokens_seen": 333410365, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.35546875, "step": 15449, "time_per_iteration": 2.37914776802063 }, { "auxiliary_loss_clip": 0.0105247, "auxiliary_loss_mlp": 0.0103616, "balance_loss_clip": 1.0116384, "balance_loss_mlp": 1.01628864, "epoch": 0.9289042537201263, "flos": 27668702321280.0, "grad_norm": 2.012790129958376, "language_loss": 0.68962002, "learning_rate": 5.27455963293586e-08, "loss": 0.71050632, "num_input_tokens_seen": 333430000, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.36328125, "step": 15450, "time_per_iteration": 5.244949102401733 }, { "auxiliary_loss_clip": 0.01052047, "auxiliary_loss_mlp": 0.01036313, "balance_loss_clip": 1.01250684, "balance_loss_mlp": 1.01602376, "epoch": 0.9289643769727942, "flos": 19316744501760.0, "grad_norm": 1.9220194862768503, "language_loss": 0.73279798, "learning_rate": 5.265677957368875e-08, "loss": 0.75368154, "num_input_tokens_seen": 333445800, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.359375, "step": 15451, "time_per_iteration": 2.3459155559539795 }, { "auxiliary_loss_clip": 0.01053247, "auxiliary_loss_mlp": 0.01041477, "balance_loss_clip": 1.01883924, "balance_loss_mlp": 1.01703644, "epoch": 0.9290245002254622, "flos": 14057358670080.0, "grad_norm": 2.308863464216243, "language_loss": 0.74242461, "learning_rate": 5.25680366614687e-08, "loss": 0.76337183, "num_input_tokens_seen": 333461550, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36328125, "step": 15452, "time_per_iteration": 2.3372955322265625 }, { "auxiliary_loss_clip": 0.01052553, "auxiliary_loss_mlp": 0.01040233, "balance_loss_clip": 1.01599765, "balance_loss_mlp": 1.0175612, "epoch": 0.9290846234781301, "flos": 20046115048320.0, "grad_norm": 1.8377458751352682, "language_loss": 0.74658811, "learning_rate": 5.2479367596064196e-08, "loss": 0.76751596, "num_input_tokens_seen": 333478835, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.34960938, "step": 15453, "time_per_iteration": 2.3649237155914307 }, { "auxiliary_loss_clip": 0.01006861, "auxiliary_loss_mlp": 0.0100231, "balance_loss_clip": 1.00021207, "balance_loss_mlp": 1.00043845, "epoch": 0.9291447467307982, "flos": 61224668098560.0, "grad_norm": 0.8204203078681165, "language_loss": 0.6079185, "learning_rate": 5.2390772380837226e-08, "loss": 0.62801015, "num_input_tokens_seen": 333535250, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.06445312, "step": 15454, "time_per_iteration": 2.9322316646575928 }, { "auxiliary_loss_clip": 0.0105192, "auxiliary_loss_mlp": 0.01035528, "balance_loss_clip": 1.01224661, "balance_loss_mlp": 1.01567888, "epoch": 0.9292048699834661, "flos": 20552425718400.0, "grad_norm": 1.8884284224132608, "language_loss": 0.6967715, "learning_rate": 5.230225101914709e-08, "loss": 0.71764594, "num_input_tokens_seen": 333553805, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36132812, "step": 15455, "time_per_iteration": 2.3759891986846924 }, { "auxiliary_loss_clip": 0.01053089, "auxiliary_loss_mlp": 0.01038442, "balance_loss_clip": 1.01463628, "balance_loss_mlp": 1.0171392, "epoch": 0.9292649932361341, "flos": 23622512924160.0, "grad_norm": 2.1821020734168965, "language_loss": 0.65686917, "learning_rate": 5.22138035143509e-08, "loss": 0.67778456, "num_input_tokens_seen": 333572800, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.359375, "step": 15456, "time_per_iteration": 2.381910562515259 }, { "auxiliary_loss_clip": 0.01049633, "auxiliary_loss_mlp": 0.01034313, "balance_loss_clip": 1.01147258, "balance_loss_mlp": 1.015504, "epoch": 0.929325116488802, "flos": 15009125777280.0, "grad_norm": 1.754484817731398, "language_loss": 0.69709206, "learning_rate": 5.2125429869802615e-08, "loss": 0.71793151, "num_input_tokens_seen": 333588520, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34179688, "step": 15457, "time_per_iteration": 2.3438079357147217 }, { "auxiliary_loss_clip": 0.01051809, "auxiliary_loss_mlp": 0.01037149, "balance_loss_clip": 1.01422501, "balance_loss_mlp": 1.01548874, "epoch": 0.92938523974147, "flos": 17966408780160.0, "grad_norm": 2.053932063715907, "language_loss": 0.81433213, "learning_rate": 5.203713008885291e-08, "loss": 0.83522177, "num_input_tokens_seen": 333603435, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.36328125, "step": 15458, "time_per_iteration": 2.3426055908203125 }, { "auxiliary_loss_clip": 0.01050946, "auxiliary_loss_mlp": 0.01039047, "balance_loss_clip": 1.01631343, "balance_loss_mlp": 1.01505303, "epoch": 0.9294453629941379, "flos": 23001931774080.0, "grad_norm": 1.6218502594545223, "language_loss": 0.73112679, "learning_rate": 5.194890417485065e-08, "loss": 0.75202674, "num_input_tokens_seen": 333623305, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.359375, "step": 15459, "time_per_iteration": 2.3789749145507812 }, { "auxiliary_loss_clip": 0.01053102, "auxiliary_loss_mlp": 0.01039881, "balance_loss_clip": 1.01593137, "balance_loss_mlp": 1.01700044, "epoch": 0.929505486246806, "flos": 17054302844160.0, "grad_norm": 2.3491222574455177, "language_loss": 0.60239172, "learning_rate": 5.1860752131141384e-08, "loss": 0.62332153, "num_input_tokens_seen": 333641205, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36132812, "step": 15460, "time_per_iteration": 2.3468258380889893 }, { "auxiliary_loss_clip": 0.0105212, "auxiliary_loss_mlp": 0.01042159, "balance_loss_clip": 1.01887751, "balance_loss_mlp": 1.01622748, "epoch": 0.9295656094994739, "flos": 27339296843520.0, "grad_norm": 1.7414471158311102, "language_loss": 0.81421041, "learning_rate": 5.177267396106733e-08, "loss": 0.8351531, "num_input_tokens_seen": 333659615, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 15461, "time_per_iteration": 2.398174524307251 }, { "auxiliary_loss_clip": 0.01050217, "auxiliary_loss_mlp": 0.01034194, "balance_loss_clip": 1.01376188, "balance_loss_mlp": 1.01548851, "epoch": 0.9296257327521419, "flos": 21469873092480.0, "grad_norm": 1.771857487705492, "language_loss": 0.78811288, "learning_rate": 5.168466966796869e-08, "loss": 0.80895698, "num_input_tokens_seen": 333678985, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.34765625, "step": 15462, "time_per_iteration": 3.812796115875244 }, { "auxiliary_loss_clip": 0.01051213, "auxiliary_loss_mlp": 0.01042714, "balance_loss_clip": 1.01995707, "balance_loss_mlp": 1.0152148, "epoch": 0.9296858560048099, "flos": 16361730737280.0, "grad_norm": 1.9239114206061534, "language_loss": 0.63584721, "learning_rate": 5.159673925518282e-08, "loss": 0.65678644, "num_input_tokens_seen": 333696410, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.359375, "step": 15463, "time_per_iteration": 2.3739259243011475 }, { "auxiliary_loss_clip": 0.01050156, "auxiliary_loss_mlp": 0.01033754, "balance_loss_clip": 1.01322591, "balance_loss_mlp": 1.01522708, "epoch": 0.9297459792574778, "flos": 29857407454080.0, "grad_norm": 2.2071802281479695, "language_loss": 0.72058141, "learning_rate": 5.15088827260437e-08, "loss": 0.74142051, "num_input_tokens_seen": 333716615, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.34960938, "step": 15464, "time_per_iteration": 2.450702428817749 }, { "auxiliary_loss_clip": 0.01050896, "auxiliary_loss_mlp": 0.01032073, "balance_loss_clip": 1.00942338, "balance_loss_mlp": 1.01548052, "epoch": 0.9298061025101458, "flos": 15923919888000.0, "grad_norm": 1.8907641416665046, "language_loss": 0.78243697, "learning_rate": 5.1421100083883115e-08, "loss": 0.80326664, "num_input_tokens_seen": 333732800, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35351562, "step": 15465, "time_per_iteration": 2.4018173217773438 }, { "auxiliary_loss_clip": 0.01006766, "auxiliary_loss_mlp": 0.01003059, "balance_loss_clip": 1.00102043, "balance_loss_mlp": 1.00034046, "epoch": 0.9298662257628137, "flos": 64093410293760.0, "grad_norm": 0.6994327217301708, "language_loss": 0.56484145, "learning_rate": 5.133339133202952e-08, "loss": 0.58493966, "num_input_tokens_seen": 333799300, "router_z_loss_clip": 0.02038574, "router_z_loss_mlp": 0.06445312, "step": 15466, "time_per_iteration": 3.115100145339966 }, { "auxiliary_loss_clip": 0.0105174, "auxiliary_loss_mlp": 0.01042029, "balance_loss_clip": 1.01706612, "balance_loss_mlp": 1.01554751, "epoch": 0.9299263490154818, "flos": 24279054641280.0, "grad_norm": 1.4324275263939712, "language_loss": 0.7357024, "learning_rate": 5.1245756473809355e-08, "loss": 0.75664008, "num_input_tokens_seen": 333820360, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36132812, "step": 15467, "time_per_iteration": 2.4122750759124756 }, { "auxiliary_loss_clip": 0.01052888, "auxiliary_loss_mlp": 0.0103995, "balance_loss_clip": 1.01555967, "balance_loss_mlp": 1.0161891, "epoch": 0.9299864722681497, "flos": 23293247091840.0, "grad_norm": 1.686358431062476, "language_loss": 0.72654349, "learning_rate": 5.1158195512545076e-08, "loss": 0.74747187, "num_input_tokens_seen": 333840415, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.3671875, "step": 15468, "time_per_iteration": 2.3808610439300537 }, { "auxiliary_loss_clip": 0.01052182, "auxiliary_loss_mlp": 0.01040139, "balance_loss_clip": 1.01506948, "balance_loss_mlp": 1.01532805, "epoch": 0.9300465955208177, "flos": 21394949581440.0, "grad_norm": 1.647491070198712, "language_loss": 0.75796723, "learning_rate": 5.107070845155737e-08, "loss": 0.77889043, "num_input_tokens_seen": 333859910, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3671875, "step": 15469, "time_per_iteration": 2.3883211612701416 }, { "auxiliary_loss_clip": 0.01050792, "auxiliary_loss_mlp": 0.01038265, "balance_loss_clip": 1.01491213, "balance_loss_mlp": 1.01561153, "epoch": 0.9301067187734856, "flos": 24570300136320.0, "grad_norm": 1.9342002794579205, "language_loss": 0.77367383, "learning_rate": 5.098329529416379e-08, "loss": 0.79456437, "num_input_tokens_seen": 333880495, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.3515625, "step": 15470, "time_per_iteration": 2.506520986557007 }, { "auxiliary_loss_clip": 0.01051244, "auxiliary_loss_mlp": 0.01035197, "balance_loss_clip": 1.01247561, "balance_loss_mlp": 1.01566839, "epoch": 0.9301668420261536, "flos": 22195961971200.0, "grad_norm": 1.570254779238394, "language_loss": 0.75279051, "learning_rate": 5.089595604367902e-08, "loss": 0.77365494, "num_input_tokens_seen": 333897640, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35546875, "step": 15471, "time_per_iteration": 2.3790082931518555 }, { "auxiliary_loss_clip": 0.0105168, "auxiliary_loss_mlp": 0.01036745, "balance_loss_clip": 1.01423883, "balance_loss_mlp": 1.01570725, "epoch": 0.9302269652788215, "flos": 17746700394240.0, "grad_norm": 2.1505279292796176, "language_loss": 0.70745623, "learning_rate": 5.080869070341487e-08, "loss": 0.72834051, "num_input_tokens_seen": 333913670, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.359375, "step": 15472, "time_per_iteration": 2.3207974433898926 }, { "auxiliary_loss_clip": 0.01048029, "auxiliary_loss_mlp": 0.01033548, "balance_loss_clip": 1.01176882, "balance_loss_mlp": 1.01496065, "epoch": 0.9302870885314896, "flos": 19389782799360.0, "grad_norm": 2.000038301420985, "language_loss": 0.90017295, "learning_rate": 5.0721499276680233e-08, "loss": 0.92098874, "num_input_tokens_seen": 333934105, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.33007812, "step": 15473, "time_per_iteration": 2.375072717666626 }, { "auxiliary_loss_clip": 0.01054582, "auxiliary_loss_mlp": 0.01044674, "balance_loss_clip": 1.01915145, "balance_loss_mlp": 1.01710236, "epoch": 0.9303472117841575, "flos": 21759268285440.0, "grad_norm": 2.3435289317648103, "language_loss": 0.66395676, "learning_rate": 5.063438176678203e-08, "loss": 0.68494934, "num_input_tokens_seen": 333953635, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.375, "step": 15474, "time_per_iteration": 2.37300705909729 }, { "auxiliary_loss_clip": 0.01051597, "auxiliary_loss_mlp": 0.01036614, "balance_loss_clip": 1.01398849, "balance_loss_mlp": 1.01606774, "epoch": 0.9304073350368255, "flos": 19608723135360.0, "grad_norm": 1.7745935611833767, "language_loss": 0.75614184, "learning_rate": 5.054733817702339e-08, "loss": 0.77702391, "num_input_tokens_seen": 333971825, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35546875, "step": 15475, "time_per_iteration": 2.3689987659454346 }, { "auxiliary_loss_clip": 0.010506, "auxiliary_loss_mlp": 0.01037715, "balance_loss_clip": 1.01448107, "balance_loss_mlp": 1.01575041, "epoch": 0.9304674582894935, "flos": 30440387203200.0, "grad_norm": 1.8091250871278772, "language_loss": 0.68238896, "learning_rate": 5.0460368510704786e-08, "loss": 0.7032721, "num_input_tokens_seen": 333990120, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.34765625, "step": 15476, "time_per_iteration": 2.4600582122802734 }, { "auxiliary_loss_clip": 0.01051062, "auxiliary_loss_mlp": 0.01039781, "balance_loss_clip": 1.015486, "balance_loss_mlp": 1.01528168, "epoch": 0.9305275815421614, "flos": 17784720731520.0, "grad_norm": 5.591629258092099, "language_loss": 0.70238221, "learning_rate": 5.0373472771124914e-08, "loss": 0.72329056, "num_input_tokens_seen": 334007970, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.35742188, "step": 15477, "time_per_iteration": 2.4136195182800293 }, { "auxiliary_loss_clip": 0.01050831, "auxiliary_loss_mlp": 0.01036417, "balance_loss_clip": 1.0145539, "balance_loss_mlp": 1.0162127, "epoch": 0.9305877047948294, "flos": 25297366533120.0, "grad_norm": 1.8099178812081842, "language_loss": 0.59518814, "learning_rate": 5.0286650961578027e-08, "loss": 0.61606061, "num_input_tokens_seen": 334027120, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34570312, "step": 15478, "time_per_iteration": 2.442432165145874 }, { "auxiliary_loss_clip": 0.01055394, "auxiliary_loss_mlp": 0.01038353, "balance_loss_clip": 1.0130446, "balance_loss_mlp": 1.01712441, "epoch": 0.9306478280474973, "flos": 16976446778880.0, "grad_norm": 1.907385694348522, "language_loss": 0.80931556, "learning_rate": 5.01999030853566e-08, "loss": 0.83025301, "num_input_tokens_seen": 334042785, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3828125, "step": 15479, "time_per_iteration": 3.6370887756347656 }, { "auxiliary_loss_clip": 0.01050879, "auxiliary_loss_mlp": 0.01037639, "balance_loss_clip": 1.01452422, "balance_loss_mlp": 1.01534796, "epoch": 0.9307079513001654, "flos": 35661892343040.0, "grad_norm": 1.7077334036133218, "language_loss": 0.6996218, "learning_rate": 5.0113229145750445e-08, "loss": 0.72050703, "num_input_tokens_seen": 334063480, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.35546875, "step": 15480, "time_per_iteration": 2.5033771991729736 }, { "auxiliary_loss_clip": 0.01052789, "auxiliary_loss_mlp": 0.01039021, "balance_loss_clip": 1.01558399, "balance_loss_mlp": 1.01650691, "epoch": 0.9307680745528333, "flos": 19207152144000.0, "grad_norm": 1.5716652693733677, "language_loss": 0.68526614, "learning_rate": 5.002662914604583e-08, "loss": 0.70618427, "num_input_tokens_seen": 334082005, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36132812, "step": 15481, "time_per_iteration": 2.3965609073638916 }, { "auxiliary_loss_clip": 0.01048885, "auxiliary_loss_mlp": 0.01033577, "balance_loss_clip": 1.01231027, "balance_loss_mlp": 1.01491022, "epoch": 0.9308281978055013, "flos": 19061634130560.0, "grad_norm": 3.0315172047708465, "language_loss": 0.75629687, "learning_rate": 4.994010308952701e-08, "loss": 0.77712148, "num_input_tokens_seen": 334101375, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.33984375, "step": 15482, "time_per_iteration": 2.3873937129974365 }, { "auxiliary_loss_clip": 0.01050605, "auxiliary_loss_mlp": 0.01036513, "balance_loss_clip": 1.0153656, "balance_loss_mlp": 1.01605952, "epoch": 0.9308883210581692, "flos": 20520514869120.0, "grad_norm": 3.7453203508151405, "language_loss": 0.81255436, "learning_rate": 4.985365097947469e-08, "loss": 0.83342552, "num_input_tokens_seen": 334119460, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34570312, "step": 15483, "time_per_iteration": 2.3916823863983154 }, { "auxiliary_loss_clip": 0.01052425, "auxiliary_loss_mlp": 0.01042977, "balance_loss_clip": 1.02107823, "balance_loss_mlp": 1.01626444, "epoch": 0.9309484443108372, "flos": 13000712238720.0, "grad_norm": 1.8868584985550634, "language_loss": 0.75895619, "learning_rate": 4.976727281916782e-08, "loss": 0.77991021, "num_input_tokens_seen": 334136065, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.36328125, "step": 15484, "time_per_iteration": 2.3744494915008545 }, { "auxiliary_loss_clip": 0.01054065, "auxiliary_loss_mlp": 0.0103904, "balance_loss_clip": 1.01506734, "balance_loss_mlp": 1.01716113, "epoch": 0.9310085675635051, "flos": 12566951107200.0, "grad_norm": 2.10416124334466, "language_loss": 0.77949762, "learning_rate": 4.968096861188087e-08, "loss": 0.80042869, "num_input_tokens_seen": 334153690, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36914062, "step": 15485, "time_per_iteration": 2.3867828845977783 }, { "auxiliary_loss_clip": 0.01052783, "auxiliary_loss_mlp": 0.01037863, "balance_loss_clip": 1.01279306, "balance_loss_mlp": 1.01599753, "epoch": 0.9310686908161732, "flos": 23476436328960.0, "grad_norm": 1.873990051427053, "language_loss": 0.7936722, "learning_rate": 4.959473836088723e-08, "loss": 0.81457865, "num_input_tokens_seen": 334171880, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3671875, "step": 15486, "time_per_iteration": 2.4315803050994873 }, { "auxiliary_loss_clip": 0.01053486, "auxiliary_loss_mlp": 0.01041154, "balance_loss_clip": 1.01652527, "balance_loss_mlp": 1.01726902, "epoch": 0.9311288140688411, "flos": 24169148081280.0, "grad_norm": 1.9010512856590633, "language_loss": 0.78689325, "learning_rate": 4.950858206945674e-08, "loss": 0.80783963, "num_input_tokens_seen": 334190005, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36328125, "step": 15487, "time_per_iteration": 2.393742561340332 }, { "auxiliary_loss_clip": 0.01050485, "auxiliary_loss_mlp": 0.01031435, "balance_loss_clip": 1.00867748, "balance_loss_mlp": 1.01531672, "epoch": 0.9311889373215091, "flos": 35588749311360.0, "grad_norm": 2.2632890331940114, "language_loss": 0.6908502, "learning_rate": 4.942249974085633e-08, "loss": 0.71166945, "num_input_tokens_seen": 334209545, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.3515625, "step": 15488, "time_per_iteration": 2.4751522541046143 }, { "auxiliary_loss_clip": 0.0104843, "auxiliary_loss_mlp": 0.01034495, "balance_loss_clip": 1.01263237, "balance_loss_mlp": 1.0155654, "epoch": 0.9312490605741771, "flos": 20229478842240.0, "grad_norm": 1.880765877155819, "language_loss": 0.76407802, "learning_rate": 4.933649137834983e-08, "loss": 0.78490722, "num_input_tokens_seen": 334228900, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.328125, "step": 15489, "time_per_iteration": 2.4068539142608643 }, { "auxiliary_loss_clip": 0.01052996, "auxiliary_loss_mlp": 0.01035533, "balance_loss_clip": 1.01196575, "balance_loss_mlp": 1.01618361, "epoch": 0.931309183826845, "flos": 13949826082560.0, "grad_norm": 2.1868058784293267, "language_loss": 0.82385576, "learning_rate": 4.925055698519931e-08, "loss": 0.84474099, "num_input_tokens_seen": 334245500, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3671875, "step": 15490, "time_per_iteration": 5.259932041168213 }, { "auxiliary_loss_clip": 0.010526, "auxiliary_loss_mlp": 0.01036724, "balance_loss_clip": 1.0122385, "balance_loss_mlp": 1.01607871, "epoch": 0.931369307079513, "flos": 20155707406080.0, "grad_norm": 1.5225258937874817, "language_loss": 0.73487014, "learning_rate": 4.9164696564663264e-08, "loss": 0.75576341, "num_input_tokens_seen": 334264370, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.36523438, "step": 15491, "time_per_iteration": 2.374633312225342 }, { "auxiliary_loss_clip": 0.01048525, "auxiliary_loss_mlp": 0.01033733, "balance_loss_clip": 1.01320481, "balance_loss_mlp": 1.01456475, "epoch": 0.931429430332181, "flos": 25337376817920.0, "grad_norm": 2.0594830599494025, "language_loss": 0.75429034, "learning_rate": 4.9078910119997096e-08, "loss": 0.77511287, "num_input_tokens_seen": 334283905, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.33984375, "step": 15492, "time_per_iteration": 2.4147138595581055 }, { "auxiliary_loss_clip": 0.01006885, "auxiliary_loss_mlp": 0.01002831, "balance_loss_clip": 1.00079286, "balance_loss_mlp": 1.00066495, "epoch": 0.931489553584849, "flos": 71223024902400.0, "grad_norm": 0.7108712739598405, "language_loss": 0.5348987, "learning_rate": 4.899319765445442e-08, "loss": 0.55499583, "num_input_tokens_seen": 334339925, "router_z_loss_clip": 0.02038574, "router_z_loss_mlp": 0.06201172, "step": 15493, "time_per_iteration": 2.886294364929199 }, { "auxiliary_loss_clip": 0.01049918, "auxiliary_loss_mlp": 0.01035538, "balance_loss_clip": 1.01337683, "balance_loss_mlp": 1.01562214, "epoch": 0.9315496768375169, "flos": 14642886948480.0, "grad_norm": 1.5901055419351053, "language_loss": 0.7196362, "learning_rate": 4.890755917128531e-08, "loss": 0.74049079, "num_input_tokens_seen": 334357225, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34375, "step": 15494, "time_per_iteration": 2.3536510467529297 }, { "auxiliary_loss_clip": 0.01053187, "auxiliary_loss_mlp": 0.01034725, "balance_loss_clip": 1.01169348, "balance_loss_mlp": 1.01631606, "epoch": 0.9316098000901849, "flos": 28328665351680.0, "grad_norm": 1.6859262488960933, "language_loss": 0.69579858, "learning_rate": 4.882199467373671e-08, "loss": 0.71667773, "num_input_tokens_seen": 334375945, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.3671875, "step": 15495, "time_per_iteration": 2.420713186264038 }, { "auxiliary_loss_clip": 0.01049679, "auxiliary_loss_mlp": 0.01036724, "balance_loss_clip": 1.01442039, "balance_loss_mlp": 1.015172, "epoch": 0.9316699233428528, "flos": 28511400741120.0, "grad_norm": 1.8577740677898036, "language_loss": 0.6315192, "learning_rate": 4.8736504165053815e-08, "loss": 0.65238327, "num_input_tokens_seen": 334395310, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34375, "step": 15496, "time_per_iteration": 2.437195301055908 }, { "auxiliary_loss_clip": 0.01049935, "auxiliary_loss_mlp": 0.01040626, "balance_loss_clip": 1.01679564, "balance_loss_mlp": 1.01528525, "epoch": 0.9317300465955208, "flos": 33691987900800.0, "grad_norm": 1.5189830896959533, "language_loss": 0.7789076, "learning_rate": 4.865108764847825e-08, "loss": 0.79981327, "num_input_tokens_seen": 334416965, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.34765625, "step": 15497, "time_per_iteration": 2.4789657592773438 }, { "auxiliary_loss_clip": 0.01054178, "auxiliary_loss_mlp": 0.01041875, "balance_loss_clip": 1.01723409, "balance_loss_mlp": 1.0168128, "epoch": 0.9317901698481887, "flos": 23657146859520.0, "grad_norm": 1.6003078524519532, "language_loss": 0.67085141, "learning_rate": 4.856574512724898e-08, "loss": 0.69181192, "num_input_tokens_seen": 334435620, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.375, "step": 15498, "time_per_iteration": 2.402937650680542 }, { "auxiliary_loss_clip": 0.01053238, "auxiliary_loss_mlp": 0.01041822, "balance_loss_clip": 1.01739573, "balance_loss_mlp": 1.01651382, "epoch": 0.9318502931008568, "flos": 20958954122880.0, "grad_norm": 1.9022369695638224, "language_loss": 0.80394447, "learning_rate": 4.8480476604602305e-08, "loss": 0.82489502, "num_input_tokens_seen": 334456210, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.3671875, "step": 15499, "time_per_iteration": 2.389277935028076 }, { "auxiliary_loss_clip": 0.01051307, "auxiliary_loss_mlp": 0.01038775, "balance_loss_clip": 1.01440883, "balance_loss_mlp": 1.01699257, "epoch": 0.9319104163535247, "flos": 23439917180160.0, "grad_norm": 1.6057817272385164, "language_loss": 0.77962089, "learning_rate": 4.8395282083771196e-08, "loss": 0.80052167, "num_input_tokens_seen": 334475485, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.34179688, "step": 15500, "time_per_iteration": 2.416773796081543 }, { "auxiliary_loss_clip": 0.01050187, "auxiliary_loss_mlp": 0.01032889, "balance_loss_clip": 1.01170552, "balance_loss_mlp": 1.01541066, "epoch": 0.9319705396061927, "flos": 22346297752320.0, "grad_norm": 1.7899731142937378, "language_loss": 0.73341179, "learning_rate": 4.8310161567987064e-08, "loss": 0.75424248, "num_input_tokens_seen": 334494740, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34765625, "step": 15501, "time_per_iteration": 2.3810460567474365 }, { "auxiliary_loss_clip": 0.01053233, "auxiliary_loss_mlp": 0.01039375, "balance_loss_clip": 1.01606965, "balance_loss_mlp": 1.01618791, "epoch": 0.9320306628588607, "flos": 20992575628800.0, "grad_norm": 1.63791757904497, "language_loss": 0.67448437, "learning_rate": 4.822511506047666e-08, "loss": 0.69541049, "num_input_tokens_seen": 334511910, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.37109375, "step": 15502, "time_per_iteration": 3.805577278137207 }, { "auxiliary_loss_clip": 0.01051287, "auxiliary_loss_mlp": 0.01036089, "balance_loss_clip": 1.01323652, "balance_loss_mlp": 1.01548219, "epoch": 0.9320907861115286, "flos": 24537062655360.0, "grad_norm": 1.4721106437727456, "language_loss": 0.66344607, "learning_rate": 4.814014256446586e-08, "loss": 0.68431985, "num_input_tokens_seen": 334533150, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.359375, "step": 15503, "time_per_iteration": 2.470339298248291 }, { "auxiliary_loss_clip": 0.0105336, "auxiliary_loss_mlp": 0.01039158, "balance_loss_clip": 1.01432657, "balance_loss_mlp": 1.01605141, "epoch": 0.9321509093641966, "flos": 19784580986880.0, "grad_norm": 1.665075675259837, "language_loss": 0.75832868, "learning_rate": 4.805524408317652e-08, "loss": 0.77925384, "num_input_tokens_seen": 334550940, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37304688, "step": 15504, "time_per_iteration": 2.3862807750701904 }, { "auxiliary_loss_clip": 0.01051967, "auxiliary_loss_mlp": 0.01033374, "balance_loss_clip": 1.00923431, "balance_loss_mlp": 1.01606655, "epoch": 0.9322110326168646, "flos": 24971522014080.0, "grad_norm": 2.4589652644103985, "language_loss": 0.72537422, "learning_rate": 4.797041961982762e-08, "loss": 0.74622762, "num_input_tokens_seen": 334570935, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.359375, "step": 15505, "time_per_iteration": 2.4808223247528076 }, { "auxiliary_loss_clip": 0.01051096, "auxiliary_loss_mlp": 0.0103755, "balance_loss_clip": 1.01462579, "balance_loss_mlp": 1.01584625, "epoch": 0.9322711558695326, "flos": 16142720578560.0, "grad_norm": 1.963995058944008, "language_loss": 0.76887888, "learning_rate": 4.788566917763614e-08, "loss": 0.7897653, "num_input_tokens_seen": 334589315, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3515625, "step": 15506, "time_per_iteration": 2.362354040145874 }, { "auxiliary_loss_clip": 0.01050141, "auxiliary_loss_mlp": 0.0103224, "balance_loss_clip": 1.01147377, "balance_loss_mlp": 1.01569867, "epoch": 0.9323312791222005, "flos": 23731302320640.0, "grad_norm": 2.004634049680931, "language_loss": 0.84173101, "learning_rate": 4.780099275981597e-08, "loss": 0.86255479, "num_input_tokens_seen": 334608990, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.34375, "step": 15507, "time_per_iteration": 2.3755786418914795 }, { "auxiliary_loss_clip": 0.01053227, "auxiliary_loss_mlp": 0.01038803, "balance_loss_clip": 1.01523519, "balance_loss_mlp": 1.0168798, "epoch": 0.9323914023748685, "flos": 20776847137920.0, "grad_norm": 1.4955228351924654, "language_loss": 0.68712223, "learning_rate": 4.771639036957742e-08, "loss": 0.7080425, "num_input_tokens_seen": 334628655, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36328125, "step": 15508, "time_per_iteration": 2.37910795211792 }, { "auxiliary_loss_clip": 0.01051354, "auxiliary_loss_mlp": 0.0103717, "balance_loss_clip": 1.01294637, "balance_loss_mlp": 1.01580012, "epoch": 0.9324515256275364, "flos": 23914037710080.0, "grad_norm": 1.7157071134316018, "language_loss": 0.73326474, "learning_rate": 4.7631862010129033e-08, "loss": 0.75414997, "num_input_tokens_seen": 334648295, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.35546875, "step": 15509, "time_per_iteration": 2.3757073879241943 }, { "auxiliary_loss_clip": 0.01050919, "auxiliary_loss_mlp": 0.01040381, "balance_loss_clip": 1.01674223, "balance_loss_mlp": 1.01554048, "epoch": 0.9325116488802044, "flos": 18004219649280.0, "grad_norm": 1.9666653871366206, "language_loss": 0.75361347, "learning_rate": 4.754740768467624e-08, "loss": 0.77452648, "num_input_tokens_seen": 334666280, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35351562, "step": 15510, "time_per_iteration": 2.3623502254486084 }, { "auxiliary_loss_clip": 0.01052284, "auxiliary_loss_mlp": 0.01038115, "balance_loss_clip": 1.01568007, "balance_loss_mlp": 1.01618016, "epoch": 0.9325717721328723, "flos": 29020364674560.0, "grad_norm": 1.5050547531539282, "language_loss": 0.71341455, "learning_rate": 4.746302739642161e-08, "loss": 0.73431861, "num_input_tokens_seen": 334688830, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.36132812, "step": 15511, "time_per_iteration": 2.424105405807495 }, { "auxiliary_loss_clip": 0.01052704, "auxiliary_loss_mlp": 0.01040443, "balance_loss_clip": 1.01680422, "balance_loss_mlp": 1.01672244, "epoch": 0.9326318953855404, "flos": 21645451653120.0, "grad_norm": 1.760351069032224, "language_loss": 0.79069889, "learning_rate": 4.737872114856412e-08, "loss": 0.81163031, "num_input_tokens_seen": 334705205, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.359375, "step": 15512, "time_per_iteration": 2.366345167160034 }, { "auxiliary_loss_clip": 0.01050272, "auxiliary_loss_mlp": 0.01035096, "balance_loss_clip": 1.01208866, "balance_loss_mlp": 1.01513243, "epoch": 0.9326920186382083, "flos": 26064582860160.0, "grad_norm": 1.4886806138162374, "language_loss": 0.80672336, "learning_rate": 4.7294488944301436e-08, "loss": 0.82757711, "num_input_tokens_seen": 334723830, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.3515625, "step": 15513, "time_per_iteration": 2.39361572265625 }, { "auxiliary_loss_clip": 0.01055304, "auxiliary_loss_mlp": 0.01037249, "balance_loss_clip": 1.01254892, "balance_loss_mlp": 1.01711917, "epoch": 0.9327521418908763, "flos": 12056311428480.0, "grad_norm": 2.9338870594714495, "language_loss": 0.82046336, "learning_rate": 4.721033078682768e-08, "loss": 0.84138888, "num_input_tokens_seen": 334740825, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.3828125, "step": 15514, "time_per_iteration": 2.3663597106933594 }, { "auxiliary_loss_clip": 0.01049804, "auxiliary_loss_mlp": 0.01040334, "balance_loss_clip": 1.01844764, "balance_loss_mlp": 1.01550579, "epoch": 0.9328122651435443, "flos": 43832755607040.0, "grad_norm": 1.8656055489022059, "language_loss": 0.72386193, "learning_rate": 4.7126246679333626e-08, "loss": 0.74476331, "num_input_tokens_seen": 334765825, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34179688, "step": 15515, "time_per_iteration": 2.567047119140625 }, { "auxiliary_loss_clip": 0.01054262, "auxiliary_loss_mlp": 0.01038949, "balance_loss_clip": 1.01415396, "balance_loss_mlp": 1.01671958, "epoch": 0.9328723883962122, "flos": 15194060582400.0, "grad_norm": 2.6484929458080817, "language_loss": 0.82226193, "learning_rate": 4.704223662500806e-08, "loss": 0.84319407, "num_input_tokens_seen": 334782680, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.375, "step": 15516, "time_per_iteration": 2.3804430961608887 }, { "auxiliary_loss_clip": 0.01052946, "auxiliary_loss_mlp": 0.01037064, "balance_loss_clip": 1.01398492, "balance_loss_mlp": 1.01698196, "epoch": 0.9329325116488802, "flos": 20260866021120.0, "grad_norm": 1.7154006003846705, "language_loss": 0.81991786, "learning_rate": 4.695830062703643e-08, "loss": 0.84081793, "num_input_tokens_seen": 334800160, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.359375, "step": 15517, "time_per_iteration": 2.362855911254883 }, { "auxiliary_loss_clip": 0.01052782, "auxiliary_loss_mlp": 0.01041317, "balance_loss_clip": 1.01591372, "balance_loss_mlp": 1.01624691, "epoch": 0.9329926349015482, "flos": 13114179757440.0, "grad_norm": 2.251732142920275, "language_loss": 0.7544601, "learning_rate": 4.687443868860219e-08, "loss": 0.77540112, "num_input_tokens_seen": 334815840, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36523438, "step": 15518, "time_per_iteration": 2.3753061294555664 }, { "auxiliary_loss_clip": 0.01052642, "auxiliary_loss_mlp": 0.01045483, "balance_loss_clip": 1.02022207, "balance_loss_mlp": 1.01628697, "epoch": 0.9330527581542162, "flos": 23039114238720.0, "grad_norm": 1.9775999176521728, "language_loss": 0.77191019, "learning_rate": 4.679065081288458e-08, "loss": 0.79289138, "num_input_tokens_seen": 334834735, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36328125, "step": 15519, "time_per_iteration": 3.9107143878936768 }, { "auxiliary_loss_clip": 0.01051291, "auxiliary_loss_mlp": 0.01040079, "balance_loss_clip": 1.01643968, "balance_loss_mlp": 1.01552701, "epoch": 0.9331128814068841, "flos": 15558728400000.0, "grad_norm": 2.150252313294838, "language_loss": 0.84228456, "learning_rate": 4.6706937003061275e-08, "loss": 0.86319828, "num_input_tokens_seen": 334853490, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35742188, "step": 15520, "time_per_iteration": 2.5652806758880615 }, { "auxiliary_loss_clip": 0.01050801, "auxiliary_loss_mlp": 0.01036308, "balance_loss_clip": 1.01269233, "balance_loss_mlp": 1.01583517, "epoch": 0.9331730046595521, "flos": 22270710925440.0, "grad_norm": 1.5882863882652158, "language_loss": 0.77148998, "learning_rate": 4.6623297262306846e-08, "loss": 0.79236102, "num_input_tokens_seen": 334873675, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.34960938, "step": 15521, "time_per_iteration": 2.447887420654297 }, { "auxiliary_loss_clip": 0.01051841, "auxiliary_loss_mlp": 0.01036153, "balance_loss_clip": 1.0138731, "balance_loss_mlp": 1.01583099, "epoch": 0.93323312791222, "flos": 15776761040640.0, "grad_norm": 1.7817591789809348, "language_loss": 0.78855592, "learning_rate": 4.6539731593792545e-08, "loss": 0.80943584, "num_input_tokens_seen": 334890970, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.359375, "step": 15522, "time_per_iteration": 2.436081647872925 }, { "auxiliary_loss_clip": 0.01050919, "auxiliary_loss_mlp": 0.01032773, "balance_loss_clip": 1.00999236, "balance_loss_mlp": 1.01530552, "epoch": 0.933293251164888, "flos": 22010084559360.0, "grad_norm": 5.938100882867834, "language_loss": 0.64308828, "learning_rate": 4.6456240000687373e-08, "loss": 0.66392517, "num_input_tokens_seen": 334906635, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35546875, "step": 15523, "time_per_iteration": 2.364863872528076 }, { "auxiliary_loss_clip": 0.01051446, "auxiliary_loss_mlp": 0.01040434, "balance_loss_clip": 1.01796281, "balance_loss_mlp": 1.01626945, "epoch": 0.933353374417556, "flos": 26030158392960.0, "grad_norm": 1.643196156131243, "language_loss": 0.69006878, "learning_rate": 4.63728224861577e-08, "loss": 0.71098763, "num_input_tokens_seen": 334926230, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3515625, "step": 15524, "time_per_iteration": 2.4553122520446777 }, { "auxiliary_loss_clip": 0.01052232, "auxiliary_loss_mlp": 0.01041241, "balance_loss_clip": 1.01722002, "balance_loss_mlp": 1.01624274, "epoch": 0.933413497670224, "flos": 24898937564160.0, "grad_norm": 1.4956553818234406, "language_loss": 0.74851155, "learning_rate": 4.628947905336589e-08, "loss": 0.76944625, "num_input_tokens_seen": 334946680, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.359375, "step": 15525, "time_per_iteration": 2.4126312732696533 }, { "auxiliary_loss_clip": 0.01049915, "auxiliary_loss_mlp": 0.01045844, "balance_loss_clip": 1.02069092, "balance_loss_mlp": 1.01444101, "epoch": 0.9334736209228919, "flos": 23687765988480.0, "grad_norm": 1.8530707204079593, "language_loss": 0.85078549, "learning_rate": 4.6206209705473175e-08, "loss": 0.87174308, "num_input_tokens_seen": 334964785, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.35546875, "step": 15526, "time_per_iteration": 2.406351089477539 }, { "auxiliary_loss_clip": 0.01051717, "auxiliary_loss_mlp": 0.01037453, "balance_loss_clip": 1.01451683, "balance_loss_mlp": 1.0160259, "epoch": 0.9335337441755599, "flos": 15376446858240.0, "grad_norm": 2.062333979843498, "language_loss": 0.70179081, "learning_rate": 4.61230144456366e-08, "loss": 0.72268248, "num_input_tokens_seen": 334982400, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35742188, "step": 15527, "time_per_iteration": 2.3730995655059814 }, { "auxiliary_loss_clip": 0.01053478, "auxiliary_loss_mlp": 0.01039053, "balance_loss_clip": 1.01373315, "balance_loss_mlp": 1.01641512, "epoch": 0.9335938674282279, "flos": 16105817404800.0, "grad_norm": 2.016365469937635, "language_loss": 0.66432559, "learning_rate": 4.603989327701141e-08, "loss": 0.68525088, "num_input_tokens_seen": 334999685, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37109375, "step": 15528, "time_per_iteration": 2.3671157360076904 }, { "auxiliary_loss_clip": 0.01053836, "auxiliary_loss_mlp": 0.01042168, "balance_loss_clip": 1.01651406, "balance_loss_mlp": 1.01645744, "epoch": 0.9336539906808958, "flos": 18951902127360.0, "grad_norm": 1.7734269730704326, "language_loss": 0.76493645, "learning_rate": 4.5956846202748867e-08, "loss": 0.78589648, "num_input_tokens_seen": 335019160, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.375, "step": 15529, "time_per_iteration": 3.7705845832824707 }, { "auxiliary_loss_clip": 0.01049846, "auxiliary_loss_mlp": 0.01039817, "balance_loss_clip": 1.01732254, "balance_loss_mlp": 1.01533389, "epoch": 0.9337141139335638, "flos": 18108261100800.0, "grad_norm": 1.6845630981912458, "language_loss": 0.63670456, "learning_rate": 4.5873873225998674e-08, "loss": 0.65760124, "num_input_tokens_seen": 335037350, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34570312, "step": 15530, "time_per_iteration": 3.7735142707824707 }, { "auxiliary_loss_clip": 0.01048721, "auxiliary_loss_mlp": 0.01031831, "balance_loss_clip": 1.01114821, "balance_loss_mlp": 1.01446831, "epoch": 0.9337742371862318, "flos": 17344815200640.0, "grad_norm": 1.7219876857610978, "language_loss": 0.73910606, "learning_rate": 4.5790974349907194e-08, "loss": 0.75991154, "num_input_tokens_seen": 335056060, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.34179688, "step": 15531, "time_per_iteration": 2.4998130798339844 }, { "auxiliary_loss_clip": 0.01049873, "auxiliary_loss_mlp": 0.01033994, "balance_loss_clip": 1.01146293, "balance_loss_mlp": 1.01518452, "epoch": 0.9338343604388998, "flos": 29057721696000.0, "grad_norm": 2.1656148153810313, "language_loss": 0.71994406, "learning_rate": 4.5708149577617925e-08, "loss": 0.74078274, "num_input_tokens_seen": 335075410, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34765625, "step": 15532, "time_per_iteration": 2.437512159347534 }, { "auxiliary_loss_clip": 0.01052776, "auxiliary_loss_mlp": 0.01034482, "balance_loss_clip": 1.01227343, "balance_loss_mlp": 1.01672912, "epoch": 0.9338944836915677, "flos": 18659923493760.0, "grad_norm": 1.6990174671551157, "language_loss": 0.73429704, "learning_rate": 4.5625398912271016e-08, "loss": 0.75516969, "num_input_tokens_seen": 335095190, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.359375, "step": 15533, "time_per_iteration": 2.4290270805358887 }, { "auxiliary_loss_clip": 0.01050666, "auxiliary_loss_mlp": 0.01032224, "balance_loss_clip": 1.01073074, "balance_loss_mlp": 1.01620579, "epoch": 0.9339546069442357, "flos": 16616806197120.0, "grad_norm": 1.7199163351046083, "language_loss": 0.80961418, "learning_rate": 4.554272235700507e-08, "loss": 0.83044302, "num_input_tokens_seen": 335113825, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34375, "step": 15534, "time_per_iteration": 2.4293696880340576 }, { "auxiliary_loss_clip": 0.01047448, "auxiliary_loss_mlp": 0.01030225, "balance_loss_clip": 1.01037621, "balance_loss_mlp": 1.01508141, "epoch": 0.9340147301969036, "flos": 23692060085760.0, "grad_norm": 1.8014677556288161, "language_loss": 0.75052774, "learning_rate": 4.546011991495513e-08, "loss": 0.77130443, "num_input_tokens_seen": 335136425, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.32421875, "step": 15535, "time_per_iteration": 2.4058406352996826 }, { "auxiliary_loss_clip": 0.0105211, "auxiliary_loss_mlp": 0.01033175, "balance_loss_clip": 1.0099535, "balance_loss_mlp": 1.01596856, "epoch": 0.9340748534495716, "flos": 28653287973120.0, "grad_norm": 2.3912791878297925, "language_loss": 0.79233843, "learning_rate": 4.537759158925292e-08, "loss": 0.81319129, "num_input_tokens_seen": 335157925, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36132812, "step": 15536, "time_per_iteration": 2.4327948093414307 }, { "auxiliary_loss_clip": 0.01050537, "auxiliary_loss_mlp": 0.01033654, "balance_loss_clip": 1.01129007, "balance_loss_mlp": 1.01534402, "epoch": 0.9341349767022396, "flos": 24898483716480.0, "grad_norm": 1.4934101780873912, "language_loss": 0.81487447, "learning_rate": 4.5295137383028593e-08, "loss": 0.83571637, "num_input_tokens_seen": 335177840, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3515625, "step": 15537, "time_per_iteration": 2.4371771812438965 }, { "auxiliary_loss_clip": 0.01052126, "auxiliary_loss_mlp": 0.01037225, "balance_loss_clip": 1.01306152, "balance_loss_mlp": 1.015715, "epoch": 0.9341950999549076, "flos": 29058245366400.0, "grad_norm": 1.6105032699440676, "language_loss": 0.79115015, "learning_rate": 4.5212757299408764e-08, "loss": 0.81204367, "num_input_tokens_seen": 335199470, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36328125, "step": 15538, "time_per_iteration": 2.501819133758545 }, { "auxiliary_loss_clip": 0.01049591, "auxiliary_loss_mlp": 0.01034001, "balance_loss_clip": 1.01273382, "balance_loss_mlp": 1.01521754, "epoch": 0.9342552232075755, "flos": 23585923952640.0, "grad_norm": 1.4921908511811355, "language_loss": 0.7388128, "learning_rate": 4.513045134151672e-08, "loss": 0.75964868, "num_input_tokens_seen": 335218885, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34375, "step": 15539, "time_per_iteration": 2.4543988704681396 }, { "auxiliary_loss_clip": 0.01050255, "auxiliary_loss_mlp": 0.01035492, "balance_loss_clip": 1.01390338, "balance_loss_mlp": 1.01543021, "epoch": 0.9343153464602435, "flos": 36719900317440.0, "grad_norm": 1.4775618498312006, "language_loss": 0.65639466, "learning_rate": 4.504821951247373e-08, "loss": 0.67725217, "num_input_tokens_seen": 335239485, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.34765625, "step": 15540, "time_per_iteration": 2.5121264457702637 }, { "auxiliary_loss_clip": 0.01050641, "auxiliary_loss_mlp": 0.01036295, "balance_loss_clip": 1.01430106, "balance_loss_mlp": 1.01535273, "epoch": 0.9343754697129115, "flos": 22235413674240.0, "grad_norm": 1.5801377283995806, "language_loss": 0.7680003, "learning_rate": 4.496606181539864e-08, "loss": 0.78886968, "num_input_tokens_seen": 335258355, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35351562, "step": 15541, "time_per_iteration": 3.891711950302124 }, { "auxiliary_loss_clip": 0.0105253, "auxiliary_loss_mlp": 0.01040511, "balance_loss_clip": 1.01551294, "balance_loss_mlp": 1.01737189, "epoch": 0.9344355929655794, "flos": 29709201265920.0, "grad_norm": 1.9209625870424736, "language_loss": 0.68309653, "learning_rate": 4.4883978253406066e-08, "loss": 0.70402694, "num_input_tokens_seen": 335276835, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3515625, "step": 15542, "time_per_iteration": 2.4518649578094482 }, { "auxiliary_loss_clip": 0.0105323, "auxiliary_loss_mlp": 0.01034451, "balance_loss_clip": 1.01118135, "balance_loss_mlp": 1.01696873, "epoch": 0.9344957162182475, "flos": 18879387500160.0, "grad_norm": 1.6890937794815168, "language_loss": 0.70507216, "learning_rate": 4.480196882960907e-08, "loss": 0.72594893, "num_input_tokens_seen": 335296220, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36328125, "step": 15543, "time_per_iteration": 2.378181219100952 }, { "auxiliary_loss_clip": 0.01051993, "auxiliary_loss_mlp": 0.01040133, "balance_loss_clip": 1.01396608, "balance_loss_mlp": 1.01545167, "epoch": 0.9345558394709154, "flos": 27416524504320.0, "grad_norm": 1.9760616671413913, "language_loss": 0.7127353, "learning_rate": 4.4720033547117394e-08, "loss": 0.73365653, "num_input_tokens_seen": 335316335, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.36523438, "step": 15544, "time_per_iteration": 2.481067419052124 }, { "auxiliary_loss_clip": 0.01052823, "auxiliary_loss_mlp": 0.01037069, "balance_loss_clip": 1.01244056, "balance_loss_mlp": 1.01634419, "epoch": 0.9346159627235834, "flos": 20740223255040.0, "grad_norm": 1.6791138003123842, "language_loss": 0.78262335, "learning_rate": 4.463817240903789e-08, "loss": 0.80352223, "num_input_tokens_seen": 335335545, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36523438, "step": 15545, "time_per_iteration": 2.4423398971557617 }, { "auxiliary_loss_clip": 0.01053562, "auxiliary_loss_mlp": 0.01035899, "balance_loss_clip": 1.013237, "balance_loss_mlp": 1.01636732, "epoch": 0.9346760859762513, "flos": 21068162455680.0, "grad_norm": 3.102317163373488, "language_loss": 0.7023989, "learning_rate": 4.455638541847495e-08, "loss": 0.72329354, "num_input_tokens_seen": 335355350, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.37109375, "step": 15546, "time_per_iteration": 2.398581027984619 }, { "auxiliary_loss_clip": 0.01049172, "auxiliary_loss_mlp": 0.01032943, "balance_loss_clip": 1.01028168, "balance_loss_mlp": 1.01535106, "epoch": 0.9347362092289193, "flos": 29203658645760.0, "grad_norm": 1.7656543840902614, "language_loss": 0.8303746, "learning_rate": 4.447467257852966e-08, "loss": 0.85119581, "num_input_tokens_seen": 335375160, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.33789062, "step": 15547, "time_per_iteration": 2.501020669937134 }, { "auxiliary_loss_clip": 0.01049563, "auxiliary_loss_mlp": 0.01036341, "balance_loss_clip": 1.01484776, "balance_loss_mlp": 1.01490784, "epoch": 0.9347963324815872, "flos": 19426336859520.0, "grad_norm": 1.753886332321221, "language_loss": 0.8488313, "learning_rate": 4.439303389230087e-08, "loss": 0.86969036, "num_input_tokens_seen": 335394080, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34765625, "step": 15548, "time_per_iteration": 2.361800193786621 }, { "auxiliary_loss_clip": 0.01053141, "auxiliary_loss_mlp": 0.0104096, "balance_loss_clip": 1.01574767, "balance_loss_mlp": 1.01556945, "epoch": 0.9348564557342552, "flos": 36900401379840.0, "grad_norm": 1.8807040185281136, "language_loss": 0.66739476, "learning_rate": 4.4311469362884326e-08, "loss": 0.68833578, "num_input_tokens_seen": 335414230, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37695312, "step": 15549, "time_per_iteration": 2.5184123516082764 }, { "auxiliary_loss_clip": 0.0105302, "auxiliary_loss_mlp": 0.01044158, "balance_loss_clip": 1.01917195, "balance_loss_mlp": 1.01626611, "epoch": 0.9349165789869232, "flos": 21689022896640.0, "grad_norm": 1.7685635089134109, "language_loss": 0.81532145, "learning_rate": 4.4229978993372665e-08, "loss": 0.83629322, "num_input_tokens_seen": 335432890, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.3671875, "step": 15550, "time_per_iteration": 2.3723294734954834 }, { "auxiliary_loss_clip": 0.01051615, "auxiliary_loss_mlp": 0.01036049, "balance_loss_clip": 1.01310122, "balance_loss_mlp": 1.01650167, "epoch": 0.9349767022395912, "flos": 18843042908160.0, "grad_norm": 1.7073711170403898, "language_loss": 0.76802647, "learning_rate": 4.4148562786856524e-08, "loss": 0.78890312, "num_input_tokens_seen": 335452085, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.3515625, "step": 15551, "time_per_iteration": 2.3926734924316406 }, { "auxiliary_loss_clip": 0.01049472, "auxiliary_loss_mlp": 0.01028901, "balance_loss_clip": 1.00932717, "balance_loss_mlp": 1.01652658, "epoch": 0.9350368254922591, "flos": 24972255152640.0, "grad_norm": 1.7169125501637814, "language_loss": 0.74450666, "learning_rate": 4.406722074642255e-08, "loss": 0.76529038, "num_input_tokens_seen": 335472130, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.33007812, "step": 15552, "time_per_iteration": 2.413543701171875 }, { "auxiliary_loss_clip": 0.01051346, "auxiliary_loss_mlp": 0.01034294, "balance_loss_clip": 1.01151335, "balance_loss_mlp": 1.01579607, "epoch": 0.9350969487449271, "flos": 23069628633600.0, "grad_norm": 1.7447277076997598, "language_loss": 0.78058887, "learning_rate": 4.3985952875155386e-08, "loss": 0.80144525, "num_input_tokens_seen": 335489970, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35546875, "step": 15553, "time_per_iteration": 2.4090209007263184 }, { "auxiliary_loss_clip": 0.01053715, "auxiliary_loss_mlp": 0.01040906, "balance_loss_clip": 1.01675439, "balance_loss_mlp": 1.01676154, "epoch": 0.9351570719975951, "flos": 18624172394880.0, "grad_norm": 2.0901306220400495, "language_loss": 0.78745627, "learning_rate": 4.390475917613723e-08, "loss": 0.80840248, "num_input_tokens_seen": 335509125, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36914062, "step": 15554, "time_per_iteration": 2.353708028793335 }, { "auxiliary_loss_clip": 0.01048126, "auxiliary_loss_mlp": 0.01036492, "balance_loss_clip": 1.01665556, "balance_loss_mlp": 1.01482177, "epoch": 0.935217195250263, "flos": 15887435650560.0, "grad_norm": 1.5482913580332578, "language_loss": 0.70198011, "learning_rate": 4.382363965244695e-08, "loss": 0.72282624, "num_input_tokens_seen": 335525620, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.33398438, "step": 15555, "time_per_iteration": 2.336634874343872 }, { "auxiliary_loss_clip": 0.01050893, "auxiliary_loss_mlp": 0.0103896, "balance_loss_clip": 1.01568997, "balance_loss_mlp": 1.0154748, "epoch": 0.935277318502931, "flos": 24389135758080.0, "grad_norm": 1.5727893435886218, "language_loss": 0.76900572, "learning_rate": 4.374259430715965e-08, "loss": 0.78990424, "num_input_tokens_seen": 335547565, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35546875, "step": 15556, "time_per_iteration": 2.4207804203033447 }, { "auxiliary_loss_clip": 0.01050389, "auxiliary_loss_mlp": 0.01036058, "balance_loss_clip": 1.01425505, "balance_loss_mlp": 1.01481462, "epoch": 0.935337441755599, "flos": 27599015514240.0, "grad_norm": 1.5368566585951444, "language_loss": 0.737553, "learning_rate": 4.366162314334953e-08, "loss": 0.75841749, "num_input_tokens_seen": 335570285, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.35546875, "step": 15557, "time_per_iteration": 2.441277027130127 }, { "auxiliary_loss_clip": 0.01052197, "auxiliary_loss_mlp": 0.01039355, "balance_loss_clip": 1.01544201, "balance_loss_mlp": 1.01558065, "epoch": 0.935397565008267, "flos": 20481901038720.0, "grad_norm": 1.584859323079968, "language_loss": 0.63808179, "learning_rate": 4.358072616408681e-08, "loss": 0.6589973, "num_input_tokens_seen": 335588600, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.3671875, "step": 15558, "time_per_iteration": 2.3788363933563232 }, { "auxiliary_loss_clip": 0.01050305, "auxiliary_loss_mlp": 0.01035548, "balance_loss_clip": 1.01269555, "balance_loss_mlp": 1.01587427, "epoch": 0.9354576882609349, "flos": 23653411344000.0, "grad_norm": 2.0974987036033528, "language_loss": 0.74405056, "learning_rate": 4.34999033724388e-08, "loss": 0.76490915, "num_input_tokens_seen": 335606235, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34570312, "step": 15559, "time_per_iteration": 3.709498405456543 }, { "auxiliary_loss_clip": 0.01049851, "auxiliary_loss_mlp": 0.01032231, "balance_loss_clip": 1.01129746, "balance_loss_mlp": 1.01559949, "epoch": 0.9355178115136029, "flos": 36683416080000.0, "grad_norm": 1.7827282120971069, "language_loss": 0.65058064, "learning_rate": 4.341915477147062e-08, "loss": 0.6714015, "num_input_tokens_seen": 335628240, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34179688, "step": 15560, "time_per_iteration": 2.5275275707244873 }, { "auxiliary_loss_clip": 0.01057068, "auxiliary_loss_mlp": 0.01042155, "balance_loss_clip": 1.01533294, "balance_loss_mlp": 1.0170716, "epoch": 0.9355779347662708, "flos": 14459662800000.0, "grad_norm": 2.1712758872841054, "language_loss": 0.65523863, "learning_rate": 4.3338480364244034e-08, "loss": 0.67623091, "num_input_tokens_seen": 335643755, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.40039062, "step": 15561, "time_per_iteration": 2.3613178730010986 }, { "auxiliary_loss_clip": 0.01052856, "auxiliary_loss_mlp": 0.0103573, "balance_loss_clip": 1.01241302, "balance_loss_mlp": 1.01741934, "epoch": 0.9356380580189388, "flos": 23184841720320.0, "grad_norm": 1.6979583288386324, "language_loss": 0.76780564, "learning_rate": 4.325788015381859e-08, "loss": 0.78869152, "num_input_tokens_seen": 335665160, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.35546875, "step": 15562, "time_per_iteration": 2.3847169876098633 }, { "auxiliary_loss_clip": 0.01006906, "auxiliary_loss_mlp": 0.01003834, "balance_loss_clip": 1.00164044, "balance_loss_mlp": 1.00076199, "epoch": 0.9356981812716068, "flos": 67468465025280.0, "grad_norm": 0.9468820190188354, "language_loss": 0.62479985, "learning_rate": 4.31773541432503e-08, "loss": 0.64490724, "num_input_tokens_seen": 335715240, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.06152344, "step": 15563, "time_per_iteration": 2.921511173248291 }, { "auxiliary_loss_clip": 0.01049837, "auxiliary_loss_mlp": 0.01038366, "balance_loss_clip": 1.01560855, "balance_loss_mlp": 1.01568794, "epoch": 0.9357583045242748, "flos": 24680451075840.0, "grad_norm": 1.5634408105583546, "language_loss": 0.79206073, "learning_rate": 4.3096902335592714e-08, "loss": 0.81294274, "num_input_tokens_seen": 335734970, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.34179688, "step": 15564, "time_per_iteration": 2.3827171325683594 }, { "auxiliary_loss_clip": 0.01053187, "auxiliary_loss_mlp": 0.01036926, "balance_loss_clip": 1.0129416, "balance_loss_mlp": 1.01621163, "epoch": 0.9358184277769427, "flos": 19462716362880.0, "grad_norm": 1.7866624577755008, "language_loss": 0.79249668, "learning_rate": 4.301652473389694e-08, "loss": 0.81339782, "num_input_tokens_seen": 335753435, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37109375, "step": 15565, "time_per_iteration": 2.405205488204956 }, { "auxiliary_loss_clip": 0.01049243, "auxiliary_loss_mlp": 0.01030599, "balance_loss_clip": 1.01121569, "balance_loss_mlp": 1.01543212, "epoch": 0.9358785510296107, "flos": 18915976471680.0, "grad_norm": 1.8874216853412216, "language_loss": 0.73369557, "learning_rate": 4.2936221341210774e-08, "loss": 0.75449395, "num_input_tokens_seen": 335772105, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.33984375, "step": 15566, "time_per_iteration": 2.419931650161743 }, { "auxiliary_loss_clip": 0.01051649, "auxiliary_loss_mlp": 0.01036014, "balance_loss_clip": 1.01345944, "balance_loss_mlp": 1.01531863, "epoch": 0.9359386742822787, "flos": 23440126648320.0, "grad_norm": 2.776499698610212, "language_loss": 0.68979996, "learning_rate": 4.285599216057889e-08, "loss": 0.71067655, "num_input_tokens_seen": 335789125, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.36328125, "step": 15567, "time_per_iteration": 2.4416613578796387 }, { "auxiliary_loss_clip": 0.01051533, "auxiliary_loss_mlp": 0.01039935, "balance_loss_clip": 1.01692784, "balance_loss_mlp": 1.01603258, "epoch": 0.9359987975349466, "flos": 32742699500160.0, "grad_norm": 1.9974722752151701, "language_loss": 0.63864136, "learning_rate": 4.277583719504418e-08, "loss": 0.65955603, "num_input_tokens_seen": 335810995, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35546875, "step": 15568, "time_per_iteration": 3.8917109966278076 }, { "auxiliary_loss_clip": 0.01050358, "auxiliary_loss_mlp": 0.01041073, "balance_loss_clip": 1.01732647, "balance_loss_mlp": 1.01456785, "epoch": 0.9360589207876147, "flos": 22818567980160.0, "grad_norm": 1.5864017510804298, "language_loss": 0.79737961, "learning_rate": 4.269575644764556e-08, "loss": 0.81829387, "num_input_tokens_seen": 335830580, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.35742188, "step": 15569, "time_per_iteration": 3.7031047344207764 }, { "auxiliary_loss_clip": 0.01052084, "auxiliary_loss_mlp": 0.0104172, "balance_loss_clip": 1.01812875, "balance_loss_mlp": 1.0160985, "epoch": 0.9361190440402826, "flos": 20884240080000.0, "grad_norm": 2.251479837732253, "language_loss": 0.703655, "learning_rate": 4.261574992142014e-08, "loss": 0.72459304, "num_input_tokens_seen": 335846515, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36132812, "step": 15570, "time_per_iteration": 2.429901123046875 }, { "auxiliary_loss_clip": 0.01052927, "auxiliary_loss_mlp": 0.0103947, "balance_loss_clip": 1.01596177, "balance_loss_mlp": 1.0164851, "epoch": 0.9361791672929506, "flos": 19316814324480.0, "grad_norm": 2.1003143530186668, "language_loss": 0.80161887, "learning_rate": 4.2535817619401726e-08, "loss": 0.82254291, "num_input_tokens_seen": 335863350, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36328125, "step": 15571, "time_per_iteration": 2.431450128555298 }, { "auxiliary_loss_clip": 0.01051405, "auxiliary_loss_mlp": 0.01038207, "balance_loss_clip": 1.01629663, "balance_loss_mlp": 1.01616037, "epoch": 0.9362392905456185, "flos": 15157297054080.0, "grad_norm": 2.7293620941017362, "language_loss": 0.77614141, "learning_rate": 4.2455959544621224e-08, "loss": 0.79703748, "num_input_tokens_seen": 335880510, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.3515625, "step": 15572, "time_per_iteration": 2.3560757637023926 }, { "auxiliary_loss_clip": 0.01052315, "auxiliary_loss_mlp": 0.01045964, "balance_loss_clip": 1.02382636, "balance_loss_mlp": 1.01695943, "epoch": 0.9362994137982865, "flos": 22084938247680.0, "grad_norm": 1.9255784168349535, "language_loss": 0.78612989, "learning_rate": 4.237617570010688e-08, "loss": 0.80711269, "num_input_tokens_seen": 335899440, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.35351562, "step": 15573, "time_per_iteration": 2.3574860095977783 }, { "auxiliary_loss_clip": 0.01049686, "auxiliary_loss_mlp": 0.01035727, "balance_loss_clip": 1.0151751, "balance_loss_mlp": 1.01617265, "epoch": 0.9363595370509544, "flos": 23511174998400.0, "grad_norm": 1.5973179538976112, "language_loss": 0.74797916, "learning_rate": 4.2296466088884044e-08, "loss": 0.76883328, "num_input_tokens_seen": 335919540, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.3359375, "step": 15574, "time_per_iteration": 2.4065310955047607 }, { "auxiliary_loss_clip": 0.01049791, "auxiliary_loss_mlp": 0.01039048, "balance_loss_clip": 1.01587391, "balance_loss_mlp": 1.01477861, "epoch": 0.9364196603036224, "flos": 27122311543680.0, "grad_norm": 2.8972055986516367, "language_loss": 0.69574404, "learning_rate": 4.221683071397564e-08, "loss": 0.71663249, "num_input_tokens_seen": 335939665, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.34960938, "step": 15575, "time_per_iteration": 2.4790120124816895 }, { "auxiliary_loss_clip": 0.01048439, "auxiliary_loss_mlp": 0.01036883, "balance_loss_clip": 1.01552081, "balance_loss_mlp": 1.01499379, "epoch": 0.9364797835562904, "flos": 18478060888320.0, "grad_norm": 1.4745489527685491, "language_loss": 0.66078651, "learning_rate": 4.2137269578401026e-08, "loss": 0.68163979, "num_input_tokens_seen": 335958580, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.33398438, "step": 15576, "time_per_iteration": 2.3426690101623535 }, { "auxiliary_loss_clip": 0.01051606, "auxiliary_loss_mlp": 0.0103529, "balance_loss_clip": 1.01222277, "balance_loss_mlp": 1.01492107, "epoch": 0.9365399068089584, "flos": 13004727045120.0, "grad_norm": 2.271328244474399, "language_loss": 0.77750683, "learning_rate": 4.2057782685177566e-08, "loss": 0.79837579, "num_input_tokens_seen": 335974965, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.3671875, "step": 15577, "time_per_iteration": 2.313136100769043 }, { "auxiliary_loss_clip": 0.01051105, "auxiliary_loss_mlp": 0.01038094, "balance_loss_clip": 1.01358438, "balance_loss_mlp": 1.0153296, "epoch": 0.9366000300616263, "flos": 25665246195840.0, "grad_norm": 1.998717747670019, "language_loss": 0.53937387, "learning_rate": 4.1978370037318855e-08, "loss": 0.5602659, "num_input_tokens_seen": 335996575, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.35742188, "step": 15578, "time_per_iteration": 2.426535129547119 }, { "auxiliary_loss_clip": 0.01049605, "auxiliary_loss_mlp": 0.01034809, "balance_loss_clip": 1.01449549, "balance_loss_mlp": 1.01579392, "epoch": 0.9366601533142943, "flos": 21432306602880.0, "grad_norm": 1.6289305271890389, "language_loss": 0.70872164, "learning_rate": 4.189903163783692e-08, "loss": 0.72956586, "num_input_tokens_seen": 336017265, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.33789062, "step": 15579, "time_per_iteration": 2.3681886196136475 }, { "auxiliary_loss_clip": 0.01050336, "auxiliary_loss_mlp": 0.01032004, "balance_loss_clip": 1.00929463, "balance_loss_mlp": 1.0149951, "epoch": 0.9367202765669622, "flos": 24092199711360.0, "grad_norm": 1.8961419215694968, "language_loss": 0.77220267, "learning_rate": 4.181976748973959e-08, "loss": 0.79302609, "num_input_tokens_seen": 336035905, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35351562, "step": 15580, "time_per_iteration": 2.399958372116089 }, { "auxiliary_loss_clip": 0.01054158, "auxiliary_loss_mlp": 0.01038272, "balance_loss_clip": 1.01470482, "balance_loss_mlp": 1.01689887, "epoch": 0.9367803998196302, "flos": 20887731216000.0, "grad_norm": 1.951086176792541, "language_loss": 0.67388529, "learning_rate": 4.1740577596033114e-08, "loss": 0.69480962, "num_input_tokens_seen": 336055585, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.37304688, "step": 15581, "time_per_iteration": 3.831249237060547 }, { "auxiliary_loss_clip": 0.01052717, "auxiliary_loss_mlp": 0.01035262, "balance_loss_clip": 1.01207578, "balance_loss_mlp": 1.01710343, "epoch": 0.9368405230722983, "flos": 22563283052160.0, "grad_norm": 1.7416279329174358, "language_loss": 0.77385497, "learning_rate": 4.166146195972042e-08, "loss": 0.79473484, "num_input_tokens_seen": 336076695, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.35546875, "step": 15582, "time_per_iteration": 2.4045424461364746 }, { "auxiliary_loss_clip": 0.01049623, "auxiliary_loss_mlp": 0.01033036, "balance_loss_clip": 1.01225781, "balance_loss_mlp": 1.01547658, "epoch": 0.9369006463249662, "flos": 18879212943360.0, "grad_norm": 1.6580390865097596, "language_loss": 0.75148284, "learning_rate": 4.1582420583800905e-08, "loss": 0.77230942, "num_input_tokens_seen": 336094740, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.34179688, "step": 15583, "time_per_iteration": 2.3852062225341797 }, { "auxiliary_loss_clip": 0.01054603, "auxiliary_loss_mlp": 0.01040805, "balance_loss_clip": 1.01589012, "balance_loss_mlp": 1.01714718, "epoch": 0.9369607695776342, "flos": 26431310448000.0, "grad_norm": 2.0264821660251675, "language_loss": 0.85793573, "learning_rate": 4.1503453471272376e-08, "loss": 0.8788898, "num_input_tokens_seen": 336113985, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.375, "step": 15584, "time_per_iteration": 2.533287286758423 }, { "auxiliary_loss_clip": 0.01053269, "auxiliary_loss_mlp": 0.01038356, "balance_loss_clip": 1.01395404, "balance_loss_mlp": 1.01596427, "epoch": 0.9370208928303021, "flos": 39565775571840.0, "grad_norm": 1.5614612290891945, "language_loss": 0.73460281, "learning_rate": 4.1424560625129334e-08, "loss": 0.75551909, "num_input_tokens_seen": 336136395, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37304688, "step": 15585, "time_per_iteration": 2.550903558731079 }, { "auxiliary_loss_clip": 0.01048621, "auxiliary_loss_mlp": 0.01032349, "balance_loss_clip": 1.01091552, "balance_loss_mlp": 1.0146122, "epoch": 0.9370810160829701, "flos": 22962096046080.0, "grad_norm": 1.8616704486834752, "language_loss": 0.8097899, "learning_rate": 4.134574204836316e-08, "loss": 0.83059967, "num_input_tokens_seen": 336156345, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.33984375, "step": 15586, "time_per_iteration": 2.4045028686523438 }, { "auxiliary_loss_clip": 0.01050301, "auxiliary_loss_mlp": 0.01039201, "balance_loss_clip": 1.01580071, "balance_loss_mlp": 1.01468492, "epoch": 0.937141139335638, "flos": 23073957642240.0, "grad_norm": 1.5837977022722, "language_loss": 0.77501911, "learning_rate": 4.126699774396258e-08, "loss": 0.79591417, "num_input_tokens_seen": 336176760, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.35546875, "step": 15587, "time_per_iteration": 2.41715931892395 }, { "auxiliary_loss_clip": 0.01053775, "auxiliary_loss_mlp": 0.01044213, "balance_loss_clip": 1.01917934, "balance_loss_mlp": 1.01686704, "epoch": 0.937201262588306, "flos": 16355900540160.0, "grad_norm": 1.8899401678699659, "language_loss": 0.88800901, "learning_rate": 4.118832771491387e-08, "loss": 0.90898889, "num_input_tokens_seen": 336193285, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36914062, "step": 15588, "time_per_iteration": 2.3943769931793213 }, { "auxiliary_loss_clip": 0.01050048, "auxiliary_loss_mlp": 0.01030161, "balance_loss_clip": 1.01046813, "balance_loss_mlp": 1.01632404, "epoch": 0.937261385840974, "flos": 20193029516160.0, "grad_norm": 1.8024561798003838, "language_loss": 0.79297262, "learning_rate": 4.11097319642002e-08, "loss": 0.8137747, "num_input_tokens_seen": 336211425, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.33789062, "step": 15589, "time_per_iteration": 2.3895111083984375 }, { "auxiliary_loss_clip": 0.01049883, "auxiliary_loss_mlp": 0.01032643, "balance_loss_clip": 1.00967169, "balance_loss_mlp": 1.01571584, "epoch": 0.937321509093642, "flos": 18294976385280.0, "grad_norm": 1.8515327319542423, "language_loss": 0.78707862, "learning_rate": 4.103121049480163e-08, "loss": 0.80790389, "num_input_tokens_seen": 336230205, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.34179688, "step": 15590, "time_per_iteration": 2.3717401027679443 }, { "auxiliary_loss_clip": 0.01052546, "auxiliary_loss_mlp": 0.010395, "balance_loss_clip": 1.01434672, "balance_loss_mlp": 1.01539898, "epoch": 0.9373816323463099, "flos": 25883488304640.0, "grad_norm": 2.2184565379397836, "language_loss": 0.72379827, "learning_rate": 4.095276330969577e-08, "loss": 0.74471879, "num_input_tokens_seen": 336252440, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37109375, "step": 15591, "time_per_iteration": 2.400038957595825 }, { "auxiliary_loss_clip": 0.01054472, "auxiliary_loss_mlp": 0.01043542, "balance_loss_clip": 1.0170176, "balance_loss_mlp": 1.01644588, "epoch": 0.9374417555989779, "flos": 27197619079680.0, "grad_norm": 2.0746597792349135, "language_loss": 0.55443549, "learning_rate": 4.0874390411857804e-08, "loss": 0.57541555, "num_input_tokens_seen": 336273845, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37890625, "step": 15592, "time_per_iteration": 2.419501304626465 }, { "auxiliary_loss_clip": 0.01050203, "auxiliary_loss_mlp": 0.01035966, "balance_loss_clip": 1.01466358, "balance_loss_mlp": 1.01537585, "epoch": 0.9375018788516458, "flos": 23620173863040.0, "grad_norm": 2.11019556028233, "language_loss": 0.68094552, "learning_rate": 4.0796091804259136e-08, "loss": 0.70180714, "num_input_tokens_seen": 336292790, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34765625, "step": 15593, "time_per_iteration": 2.3870997428894043 }, { "auxiliary_loss_clip": 0.01052142, "auxiliary_loss_mlp": 0.01033689, "balance_loss_clip": 1.01170659, "balance_loss_mlp": 1.01678848, "epoch": 0.9375620021043138, "flos": 22677553532160.0, "grad_norm": 1.513755228932916, "language_loss": 0.7463029, "learning_rate": 4.0717867489868715e-08, "loss": 0.76716125, "num_input_tokens_seen": 336312600, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35351562, "step": 15594, "time_per_iteration": 2.4158999919891357 }, { "auxiliary_loss_clip": 0.01048534, "auxiliary_loss_mlp": 0.01031928, "balance_loss_clip": 1.01109004, "balance_loss_mlp": 1.01495469, "epoch": 0.9376221253569819, "flos": 27559109963520.0, "grad_norm": 1.5810671433110417, "language_loss": 0.74595338, "learning_rate": 4.063971747165351e-08, "loss": 0.76675802, "num_input_tokens_seen": 336332770, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.3359375, "step": 15595, "time_per_iteration": 2.395942211151123 }, { "auxiliary_loss_clip": 0.01052412, "auxiliary_loss_mlp": 0.0104005, "balance_loss_clip": 1.01644695, "balance_loss_mlp": 1.01574457, "epoch": 0.9376822486096498, "flos": 24128858505600.0, "grad_norm": 1.7864084192890564, "language_loss": 0.77138156, "learning_rate": 4.056164175257626e-08, "loss": 0.79230618, "num_input_tokens_seen": 336351445, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3671875, "step": 15596, "time_per_iteration": 2.3851065635681152 }, { "auxiliary_loss_clip": 0.01051647, "auxiliary_loss_mlp": 0.01034825, "balance_loss_clip": 1.01274729, "balance_loss_mlp": 1.01596093, "epoch": 0.9377423718623178, "flos": 22782921615360.0, "grad_norm": 1.6596710233968117, "language_loss": 0.79474044, "learning_rate": 4.0483640335597926e-08, "loss": 0.81560516, "num_input_tokens_seen": 336368690, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35546875, "step": 15597, "time_per_iteration": 2.361423969268799 }, { "auxiliary_loss_clip": 0.0105341, "auxiliary_loss_mlp": 0.01038034, "balance_loss_clip": 1.01387048, "balance_loss_mlp": 1.01632965, "epoch": 0.9378024951149857, "flos": 19167979731840.0, "grad_norm": 1.7393717774963604, "language_loss": 0.82109153, "learning_rate": 4.0405713223676363e-08, "loss": 0.84200603, "num_input_tokens_seen": 336388165, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.37109375, "step": 15598, "time_per_iteration": 2.4837722778320312 }, { "auxiliary_loss_clip": 0.01055479, "auxiliary_loss_mlp": 0.01041007, "balance_loss_clip": 1.01660454, "balance_loss_mlp": 1.01678848, "epoch": 0.9378626183676537, "flos": 23504611662720.0, "grad_norm": 1.781353264866782, "language_loss": 0.64918667, "learning_rate": 4.0327860419766994e-08, "loss": 0.67015153, "num_input_tokens_seen": 336406475, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.38671875, "step": 15599, "time_per_iteration": 3.8161299228668213 }, { "auxiliary_loss_clip": 0.01052001, "auxiliary_loss_mlp": 0.01034071, "balance_loss_clip": 1.01025343, "balance_loss_mlp": 1.01577878, "epoch": 0.9379227416203216, "flos": 18404673477120.0, "grad_norm": 1.6985585116493827, "language_loss": 0.73912603, "learning_rate": 4.0250081926821e-08, "loss": 0.75998676, "num_input_tokens_seen": 336424690, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36132812, "step": 15600, "time_per_iteration": 2.3504292964935303 }, { "auxiliary_loss_clip": 0.01050348, "auxiliary_loss_mlp": 0.01035503, "balance_loss_clip": 1.01332998, "balance_loss_mlp": 1.01586628, "epoch": 0.9379828648729897, "flos": 17820890766720.0, "grad_norm": 1.7431432996334848, "language_loss": 0.70345283, "learning_rate": 4.0172377747788474e-08, "loss": 0.72431135, "num_input_tokens_seen": 336443055, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34570312, "step": 15601, "time_per_iteration": 2.337918758392334 }, { "auxiliary_loss_clip": 0.01007252, "auxiliary_loss_mlp": 0.01002116, "balance_loss_clip": 1.00014901, "balance_loss_mlp": 1.0009501, "epoch": 0.9380429881256576, "flos": 68021070024960.0, "grad_norm": 0.745403438262729, "language_loss": 0.58097732, "learning_rate": 4.009474788561573e-08, "loss": 0.601071, "num_input_tokens_seen": 336510190, "router_z_loss_clip": 0.01965332, "router_z_loss_mlp": 0.06298828, "step": 15602, "time_per_iteration": 3.1973063945770264 }, { "auxiliary_loss_clip": 0.01051202, "auxiliary_loss_mlp": 0.01036041, "balance_loss_clip": 1.01324844, "balance_loss_mlp": 1.01503873, "epoch": 0.9381031113783256, "flos": 20775939442560.0, "grad_norm": 1.8366003076466202, "language_loss": 0.73026192, "learning_rate": 4.001719234324663e-08, "loss": 0.7511344, "num_input_tokens_seen": 336529250, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.36132812, "step": 15603, "time_per_iteration": 2.3533098697662354 }, { "auxiliary_loss_clip": 0.01047823, "auxiliary_loss_mlp": 0.01037693, "balance_loss_clip": 1.01699793, "balance_loss_mlp": 1.01526999, "epoch": 0.9381632346309935, "flos": 19024102552320.0, "grad_norm": 1.6617385875961301, "language_loss": 0.77022874, "learning_rate": 3.993971112362171e-08, "loss": 0.79108393, "num_input_tokens_seen": 336548530, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.32617188, "step": 15604, "time_per_iteration": 2.38677978515625 }, { "auxiliary_loss_clip": 0.01051474, "auxiliary_loss_mlp": 0.01034389, "balance_loss_clip": 1.01017809, "balance_loss_mlp": 1.01586866, "epoch": 0.9382233578836615, "flos": 23512711098240.0, "grad_norm": 2.5094016145645064, "language_loss": 0.65928507, "learning_rate": 3.9862304229679734e-08, "loss": 0.68014371, "num_input_tokens_seen": 336568510, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.35546875, "step": 15605, "time_per_iteration": 2.384951591491699 }, { "auxiliary_loss_clip": 0.01053159, "auxiliary_loss_mlp": 0.01041183, "balance_loss_clip": 1.01650691, "balance_loss_mlp": 1.01605725, "epoch": 0.9382834811363294, "flos": 43066272418560.0, "grad_norm": 1.5887821012328822, "language_loss": 0.6840356, "learning_rate": 3.9784971664355683e-08, "loss": 0.70497894, "num_input_tokens_seen": 336592020, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37109375, "step": 15606, "time_per_iteration": 2.569531202316284 }, { "auxiliary_loss_clip": 0.0104853, "auxiliary_loss_mlp": 0.01029766, "balance_loss_clip": 1.00893962, "balance_loss_mlp": 1.0146997, "epoch": 0.9383436043889974, "flos": 16435292705280.0, "grad_norm": 1.7581833613091775, "language_loss": 0.7855317, "learning_rate": 3.970771343058166e-08, "loss": 0.80631459, "num_input_tokens_seen": 336610010, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.33789062, "step": 15607, "time_per_iteration": 2.3436005115509033 }, { "auxiliary_loss_clip": 0.0105219, "auxiliary_loss_mlp": 0.01034313, "balance_loss_clip": 1.01285481, "balance_loss_mlp": 1.01583648, "epoch": 0.9384037276416655, "flos": 20739559939200.0, "grad_norm": 1.7385882696859578, "language_loss": 0.83887893, "learning_rate": 3.963052953128776e-08, "loss": 0.85974401, "num_input_tokens_seen": 336628520, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.36328125, "step": 15608, "time_per_iteration": 3.8705523014068604 }, { "auxiliary_loss_clip": 0.01051808, "auxiliary_loss_mlp": 0.01038887, "balance_loss_clip": 1.01586795, "balance_loss_mlp": 1.01657891, "epoch": 0.9384638508943334, "flos": 19061669041920.0, "grad_norm": 1.7194690411885467, "language_loss": 0.70325053, "learning_rate": 3.9553419969400536e-08, "loss": 0.72415751, "num_input_tokens_seen": 336647365, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.3515625, "step": 15609, "time_per_iteration": 2.373213052749634 }, { "auxiliary_loss_clip": 0.01054589, "auxiliary_loss_mlp": 0.01030688, "balance_loss_clip": 1.00731099, "balance_loss_mlp": 1.01669431, "epoch": 0.9385239741470014, "flos": 23403747144960.0, "grad_norm": 1.8771172444693556, "language_loss": 0.76327538, "learning_rate": 3.9476384747844316e-08, "loss": 0.78412819, "num_input_tokens_seen": 336667165, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.37890625, "step": 15610, "time_per_iteration": 2.3753364086151123 }, { "auxiliary_loss_clip": 0.01051717, "auxiliary_loss_mlp": 0.01034734, "balance_loss_clip": 1.01290715, "balance_loss_mlp": 1.01558685, "epoch": 0.9385840973996693, "flos": 12823876869120.0, "grad_norm": 2.3283247290559217, "language_loss": 0.76196337, "learning_rate": 3.939942386953987e-08, "loss": 0.78282791, "num_input_tokens_seen": 336684130, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.36132812, "step": 15611, "time_per_iteration": 2.355422258377075 }, { "auxiliary_loss_clip": 0.01050453, "auxiliary_loss_mlp": 0.01035035, "balance_loss_clip": 1.01375628, "balance_loss_mlp": 1.01603007, "epoch": 0.9386442206523373, "flos": 15486074127360.0, "grad_norm": 1.884335815512752, "language_loss": 0.66472042, "learning_rate": 3.9322537337405756e-08, "loss": 0.68557525, "num_input_tokens_seen": 336701520, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34375, "step": 15612, "time_per_iteration": 2.366431951522827 }, { "auxiliary_loss_clip": 0.01049728, "auxiliary_loss_mlp": 0.01038588, "balance_loss_clip": 1.01573551, "balance_loss_mlp": 1.01526213, "epoch": 0.9387043439050052, "flos": 21177754813440.0, "grad_norm": 2.4585437693066, "language_loss": 0.58972031, "learning_rate": 3.924572515435742e-08, "loss": 0.61060345, "num_input_tokens_seen": 336720675, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34375, "step": 15613, "time_per_iteration": 2.353928565979004 }, { "auxiliary_loss_clip": 0.0105163, "auxiliary_loss_mlp": 0.01036564, "balance_loss_clip": 1.01394999, "balance_loss_mlp": 1.01600778, "epoch": 0.9387644671576733, "flos": 27667166221440.0, "grad_norm": 2.4397508487518955, "language_loss": 0.71952766, "learning_rate": 3.916898732330764e-08, "loss": 0.74040961, "num_input_tokens_seen": 336741005, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35546875, "step": 15614, "time_per_iteration": 2.4126012325286865 }, { "auxiliary_loss_clip": 0.01055202, "auxiliary_loss_mlp": 0.01038188, "balance_loss_clip": 1.01222432, "balance_loss_mlp": 1.01763582, "epoch": 0.9388245904103412, "flos": 18835536965760.0, "grad_norm": 4.5926146757850965, "language_loss": 0.82168424, "learning_rate": 3.9092323847166544e-08, "loss": 0.84261811, "num_input_tokens_seen": 336757990, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.375, "step": 15615, "time_per_iteration": 2.3452248573303223 }, { "auxiliary_loss_clip": 0.01049954, "auxiliary_loss_mlp": 0.01036706, "balance_loss_clip": 1.01466441, "balance_loss_mlp": 1.01573849, "epoch": 0.9388847136630092, "flos": 25482650451840.0, "grad_norm": 1.5671611850862652, "language_loss": 0.72773451, "learning_rate": 3.901573472884134e-08, "loss": 0.74860108, "num_input_tokens_seen": 336777705, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34179688, "step": 15616, "time_per_iteration": 2.389707326889038 }, { "auxiliary_loss_clip": 0.01052034, "auxiliary_loss_mlp": 0.01035845, "balance_loss_clip": 1.01335001, "balance_loss_mlp": 1.01559722, "epoch": 0.9389448369156771, "flos": 18733974220800.0, "grad_norm": 1.6666321402409459, "language_loss": 0.67212701, "learning_rate": 3.89392199712355e-08, "loss": 0.6930058, "num_input_tokens_seen": 336798275, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.36328125, "step": 15617, "time_per_iteration": 2.3517935276031494 }, { "auxiliary_loss_clip": 0.01052735, "auxiliary_loss_mlp": 0.01041976, "balance_loss_clip": 1.01619065, "balance_loss_mlp": 1.01641524, "epoch": 0.9390049601683451, "flos": 21716988762240.0, "grad_norm": 2.12089900316259, "language_loss": 0.74135602, "learning_rate": 3.886277957725092e-08, "loss": 0.76230311, "num_input_tokens_seen": 336813835, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.36328125, "step": 15618, "time_per_iteration": 2.356299638748169 }, { "auxiliary_loss_clip": 0.01054512, "auxiliary_loss_mlp": 0.01038091, "balance_loss_clip": 1.01274705, "balance_loss_mlp": 1.01646602, "epoch": 0.939065083421013, "flos": 19390201735680.0, "grad_norm": 2.529720748924497, "language_loss": 0.71133423, "learning_rate": 3.878641354978662e-08, "loss": 0.73226029, "num_input_tokens_seen": 336832210, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38085938, "step": 15619, "time_per_iteration": 2.361297130584717 }, { "auxiliary_loss_clip": 0.01052408, "auxiliary_loss_mlp": 0.01040835, "balance_loss_clip": 1.01588428, "balance_loss_mlp": 1.01592553, "epoch": 0.939125206673681, "flos": 24680346341760.0, "grad_norm": 1.827646240630847, "language_loss": 0.78627193, "learning_rate": 3.8710121891737834e-08, "loss": 0.80720437, "num_input_tokens_seen": 336851380, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.36328125, "step": 15620, "time_per_iteration": 2.411228895187378 }, { "auxiliary_loss_clip": 0.01048773, "auxiliary_loss_mlp": 0.0103748, "balance_loss_clip": 1.0159272, "balance_loss_mlp": 1.01462555, "epoch": 0.9391853299263491, "flos": 16325037031680.0, "grad_norm": 1.9328035895493718, "language_loss": 0.74847609, "learning_rate": 3.8633904605998025e-08, "loss": 0.76933861, "num_input_tokens_seen": 336868525, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.33984375, "step": 15621, "time_per_iteration": 3.7611842155456543 }, { "auxiliary_loss_clip": 0.01054903, "auxiliary_loss_mlp": 0.01040538, "balance_loss_clip": 1.01577878, "balance_loss_mlp": 1.01684976, "epoch": 0.939245453179017, "flos": 11654984816640.0, "grad_norm": 2.2239452061147644, "language_loss": 0.68314373, "learning_rate": 3.855776169545688e-08, "loss": 0.70409817, "num_input_tokens_seen": 336886200, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.38085938, "step": 15622, "time_per_iteration": 2.3479156494140625 }, { "auxiliary_loss_clip": 0.01049851, "auxiliary_loss_mlp": 0.01039837, "balance_loss_clip": 1.01704431, "balance_loss_mlp": 1.01540875, "epoch": 0.939305576431685, "flos": 23147589432960.0, "grad_norm": 1.547412555081906, "language_loss": 0.72117376, "learning_rate": 3.848169316300209e-08, "loss": 0.74207062, "num_input_tokens_seen": 336905815, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34375, "step": 15623, "time_per_iteration": 2.3616764545440674 }, { "auxiliary_loss_clip": 0.01055158, "auxiliary_loss_mlp": 0.01039366, "balance_loss_clip": 1.01387942, "balance_loss_mlp": 1.01698554, "epoch": 0.9393656996843529, "flos": 33286506837120.0, "grad_norm": 1.7947239603423966, "language_loss": 0.73918319, "learning_rate": 3.84056990115178e-08, "loss": 0.76012844, "num_input_tokens_seen": 336928460, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38085938, "step": 15624, "time_per_iteration": 2.537196636199951 }, { "auxiliary_loss_clip": 0.01050962, "auxiliary_loss_mlp": 0.01034306, "balance_loss_clip": 1.01170444, "balance_loss_mlp": 1.01600659, "epoch": 0.9394258229370209, "flos": 21688359580800.0, "grad_norm": 1.8667039152642837, "language_loss": 0.90371323, "learning_rate": 3.832977924388614e-08, "loss": 0.92456597, "num_input_tokens_seen": 336948320, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34960938, "step": 15625, "time_per_iteration": 2.38071870803833 }, { "auxiliary_loss_clip": 0.01050524, "auxiliary_loss_mlp": 0.01037537, "balance_loss_clip": 1.01495862, "balance_loss_mlp": 1.01536763, "epoch": 0.9394859461896888, "flos": 23873189552640.0, "grad_norm": 1.726840184102843, "language_loss": 0.85184729, "learning_rate": 3.825393386298592e-08, "loss": 0.87272787, "num_input_tokens_seen": 336967670, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 15626, "time_per_iteration": 2.4309463500976562 }, { "auxiliary_loss_clip": 0.01007084, "auxiliary_loss_mlp": 0.01002278, "balance_loss_clip": 1.00039434, "balance_loss_mlp": 1.00091851, "epoch": 0.9395460694423569, "flos": 61562975973120.0, "grad_norm": 0.7766354257421524, "language_loss": 0.56118798, "learning_rate": 3.8178162871693284e-08, "loss": 0.58128154, "num_input_tokens_seen": 337028395, "router_z_loss_clip": 0.01879883, "router_z_loss_mlp": 0.06152344, "step": 15627, "time_per_iteration": 2.995896816253662 }, { "auxiliary_loss_clip": 0.01049938, "auxiliary_loss_mlp": 0.01035255, "balance_loss_clip": 1.01432228, "balance_loss_mlp": 1.01594329, "epoch": 0.9396061926950248, "flos": 20994670310400.0, "grad_norm": 2.6657700825245874, "language_loss": 0.71020567, "learning_rate": 3.810246627288105e-08, "loss": 0.73105758, "num_input_tokens_seen": 337048150, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.33984375, "step": 15628, "time_per_iteration": 2.4003913402557373 }, { "auxiliary_loss_clip": 0.01050343, "auxiliary_loss_mlp": 0.01038376, "balance_loss_clip": 1.01623869, "balance_loss_mlp": 1.01540077, "epoch": 0.9396663159476928, "flos": 27486630247680.0, "grad_norm": 1.5218562726447324, "language_loss": 0.76275909, "learning_rate": 3.8026844069420025e-08, "loss": 0.78364629, "num_input_tokens_seen": 337069315, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34765625, "step": 15629, "time_per_iteration": 2.4130239486694336 }, { "auxiliary_loss_clip": 0.0104716, "auxiliary_loss_mlp": 0.01039898, "balance_loss_clip": 1.01724887, "balance_loss_mlp": 1.01443696, "epoch": 0.9397264392003607, "flos": 19426441593600.0, "grad_norm": 1.8050609228224286, "language_loss": 0.74520159, "learning_rate": 3.795129626417748e-08, "loss": 0.76607221, "num_input_tokens_seen": 337087765, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.328125, "step": 15630, "time_per_iteration": 2.3671884536743164 }, { "auxiliary_loss_clip": 0.0104937, "auxiliary_loss_mlp": 0.01034219, "balance_loss_clip": 1.01370311, "balance_loss_mlp": 1.01601601, "epoch": 0.9397865624530287, "flos": 18003835624320.0, "grad_norm": 2.1230261040379355, "language_loss": 0.70099765, "learning_rate": 3.787582286001845e-08, "loss": 0.72183359, "num_input_tokens_seen": 337106265, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.33398438, "step": 15631, "time_per_iteration": 2.341913938522339 }, { "auxiliary_loss_clip": 0.01049859, "auxiliary_loss_mlp": 0.01033816, "balance_loss_clip": 1.01213157, "balance_loss_mlp": 1.0155158, "epoch": 0.9398466857056966, "flos": 22563527431680.0, "grad_norm": 2.943020981027574, "language_loss": 0.75766695, "learning_rate": 3.7800423859805086e-08, "loss": 0.77850372, "num_input_tokens_seen": 337126090, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34375, "step": 15632, "time_per_iteration": 2.4089062213897705 }, { "auxiliary_loss_clip": 0.0105361, "auxiliary_loss_mlp": 0.01040654, "balance_loss_clip": 1.01556051, "balance_loss_mlp": 1.01674008, "epoch": 0.9399068089583646, "flos": 24534514126080.0, "grad_norm": 1.5282250589363653, "language_loss": 0.75462735, "learning_rate": 3.772509926639622e-08, "loss": 0.77556998, "num_input_tokens_seen": 337145655, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36914062, "step": 15633, "time_per_iteration": 2.4162654876708984 }, { "auxiliary_loss_clip": 0.01053285, "auxiliary_loss_mlp": 0.01040393, "balance_loss_clip": 1.01703966, "balance_loss_mlp": 1.01650786, "epoch": 0.9399669322110327, "flos": 25629145983360.0, "grad_norm": 1.8188463854194714, "language_loss": 0.73983693, "learning_rate": 3.764984908264823e-08, "loss": 0.76077366, "num_input_tokens_seen": 337164805, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.3671875, "step": 15634, "time_per_iteration": 2.428926706314087 }, { "auxiliary_loss_clip": 0.0105199, "auxiliary_loss_mlp": 0.01040952, "balance_loss_clip": 1.01712179, "balance_loss_mlp": 1.01544857, "epoch": 0.9400270554637006, "flos": 17088517843200.0, "grad_norm": 3.5849532096124332, "language_loss": 0.70200169, "learning_rate": 3.75746733114144e-08, "loss": 0.72293115, "num_input_tokens_seen": 337182280, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36523438, "step": 15635, "time_per_iteration": 2.3613803386688232 }, { "auxiliary_loss_clip": 0.01049839, "auxiliary_loss_mlp": 0.01035926, "balance_loss_clip": 1.01445651, "balance_loss_mlp": 1.0155822, "epoch": 0.9400871787163686, "flos": 22054004916480.0, "grad_norm": 1.6583025740230928, "language_loss": 0.74934673, "learning_rate": 3.7499571955545985e-08, "loss": 0.77020442, "num_input_tokens_seen": 337203495, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34179688, "step": 15636, "time_per_iteration": 2.414755344390869 }, { "auxiliary_loss_clip": 0.01052523, "auxiliary_loss_mlp": 0.01038196, "balance_loss_clip": 1.01508129, "balance_loss_mlp": 1.01648653, "epoch": 0.9401473019690365, "flos": 16981823128320.0, "grad_norm": 2.6993919560570356, "language_loss": 0.84831071, "learning_rate": 3.7424545017890054e-08, "loss": 0.86921787, "num_input_tokens_seen": 337220435, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.359375, "step": 15637, "time_per_iteration": 2.3242437839508057 }, { "auxiliary_loss_clip": 0.01051117, "auxiliary_loss_mlp": 0.01033958, "balance_loss_clip": 1.01124883, "balance_loss_mlp": 1.0155251, "epoch": 0.9402074252217045, "flos": 19680958471680.0, "grad_norm": 2.1122304911285528, "language_loss": 0.69991219, "learning_rate": 3.7349592501292325e-08, "loss": 0.72076297, "num_input_tokens_seen": 337238095, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35546875, "step": 15638, "time_per_iteration": 3.7071399688720703 }, { "auxiliary_loss_clip": 0.01049102, "auxiliary_loss_mlp": 0.01038243, "balance_loss_clip": 1.01746535, "balance_loss_mlp": 1.01575935, "epoch": 0.9402675484743724, "flos": 24753140259840.0, "grad_norm": 1.6515562797776364, "language_loss": 0.85843271, "learning_rate": 3.727471440859498e-08, "loss": 0.8793062, "num_input_tokens_seen": 337256645, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.33398438, "step": 15639, "time_per_iteration": 2.449061632156372 }, { "auxiliary_loss_clip": 0.01050394, "auxiliary_loss_mlp": 0.01034198, "balance_loss_clip": 1.01253772, "balance_loss_mlp": 1.01496971, "epoch": 0.9403276717270405, "flos": 25557399406080.0, "grad_norm": 1.4716663322677248, "language_loss": 0.78826511, "learning_rate": 3.719991074263662e-08, "loss": 0.80911106, "num_input_tokens_seen": 337278360, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.35351562, "step": 15640, "time_per_iteration": 2.4462640285491943 }, { "auxiliary_loss_clip": 0.01052352, "auxiliary_loss_mlp": 0.01036399, "balance_loss_clip": 1.01279557, "balance_loss_mlp": 1.01564741, "epoch": 0.9403877949797084, "flos": 26688585323520.0, "grad_norm": 1.5373280305275707, "language_loss": 0.7471444, "learning_rate": 3.7125181506254544e-08, "loss": 0.76803195, "num_input_tokens_seen": 337302480, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3671875, "step": 15641, "time_per_iteration": 2.4562628269195557 }, { "auxiliary_loss_clip": 0.0105437, "auxiliary_loss_mlp": 0.01045137, "balance_loss_clip": 1.01980555, "balance_loss_mlp": 1.0158093, "epoch": 0.9404479182323764, "flos": 15010801522560.0, "grad_norm": 2.052496067904438, "language_loss": 0.83522308, "learning_rate": 3.7050526702282256e-08, "loss": 0.8562181, "num_input_tokens_seen": 337316600, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38671875, "step": 15642, "time_per_iteration": 2.3289942741394043 }, { "auxiliary_loss_clip": 0.01049849, "auxiliary_loss_mlp": 0.01030764, "balance_loss_clip": 1.0084244, "balance_loss_mlp": 1.01527965, "epoch": 0.9405080414850443, "flos": 24972394798080.0, "grad_norm": 1.815302472596836, "language_loss": 0.69430149, "learning_rate": 3.697594633355084e-08, "loss": 0.71510756, "num_input_tokens_seen": 337336895, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34570312, "step": 15643, "time_per_iteration": 2.3960869312286377 }, { "auxiliary_loss_clip": 0.01053862, "auxiliary_loss_mlp": 0.0103979, "balance_loss_clip": 1.0140053, "balance_loss_mlp": 1.01636064, "epoch": 0.9405681647377123, "flos": 20843845770240.0, "grad_norm": 1.758146964285999, "language_loss": 0.77498543, "learning_rate": 3.6901440402888226e-08, "loss": 0.79592198, "num_input_tokens_seen": 337355105, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.375, "step": 15644, "time_per_iteration": 2.3765246868133545 }, { "auxiliary_loss_clip": 0.01049296, "auxiliary_loss_mlp": 0.0103382, "balance_loss_clip": 1.01252961, "balance_loss_mlp": 1.01558161, "epoch": 0.9406282879903802, "flos": 23804445352320.0, "grad_norm": 1.5520697880724856, "language_loss": 0.68292248, "learning_rate": 3.682700891311974e-08, "loss": 0.70375365, "num_input_tokens_seen": 337374905, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.3359375, "step": 15645, "time_per_iteration": 2.3781869411468506 }, { "auxiliary_loss_clip": 0.01048767, "auxiliary_loss_mlp": 0.010388, "balance_loss_clip": 1.01603127, "balance_loss_mlp": 1.01489544, "epoch": 0.9406884112430483, "flos": 27673659734400.0, "grad_norm": 1.457525108360432, "language_loss": 0.70921159, "learning_rate": 3.6752651867067774e-08, "loss": 0.73008716, "num_input_tokens_seen": 337397130, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.33984375, "step": 15646, "time_per_iteration": 2.451057195663452 }, { "auxiliary_loss_clip": 0.01049848, "auxiliary_loss_mlp": 0.0103723, "balance_loss_clip": 1.01521206, "balance_loss_mlp": 1.01590347, "epoch": 0.9407485344957163, "flos": 23073957642240.0, "grad_norm": 1.5293283113259832, "language_loss": 0.74949479, "learning_rate": 3.667836926755208e-08, "loss": 0.7703656, "num_input_tokens_seen": 337418660, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.33984375, "step": 15647, "time_per_iteration": 2.3760108947753906 }, { "auxiliary_loss_clip": 0.01006811, "auxiliary_loss_mlp": 0.01002289, "balance_loss_clip": 1.00028658, "balance_loss_mlp": 1.00059032, "epoch": 0.9408086577483842, "flos": 71010682813440.0, "grad_norm": 0.8918114597104883, "language_loss": 0.63546491, "learning_rate": 3.660416111738907e-08, "loss": 0.6555559, "num_input_tokens_seen": 337478055, "router_z_loss_clip": 0.02001953, "router_z_loss_mlp": 0.06201172, "step": 15648, "time_per_iteration": 5.926705598831177 }, { "auxiliary_loss_clip": 0.01049273, "auxiliary_loss_mlp": 0.0103077, "balance_loss_clip": 1.0107789, "balance_loss_mlp": 1.01572967, "epoch": 0.9408687810010522, "flos": 23729870954880.0, "grad_norm": 1.4922902923425476, "language_loss": 0.66908103, "learning_rate": 3.653002741939337e-08, "loss": 0.68988144, "num_input_tokens_seen": 337499405, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.3359375, "step": 15649, "time_per_iteration": 2.4111316204071045 }, { "auxiliary_loss_clip": 0.01051639, "auxiliary_loss_mlp": 0.01038946, "balance_loss_clip": 1.01616514, "balance_loss_mlp": 1.01548266, "epoch": 0.9409289042537201, "flos": 18368328885120.0, "grad_norm": 1.9097197622058568, "language_loss": 0.78417587, "learning_rate": 3.645596817637586e-08, "loss": 0.80508173, "num_input_tokens_seen": 337517195, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36132812, "step": 15650, "time_per_iteration": 2.3677749633789062 }, { "auxiliary_loss_clip": 0.0105252, "auxiliary_loss_mlp": 0.01042859, "balance_loss_clip": 1.01982808, "balance_loss_mlp": 1.01682079, "epoch": 0.9409890275063881, "flos": 23877204359040.0, "grad_norm": 2.1846560446648167, "language_loss": 0.75485516, "learning_rate": 3.638198339114451e-08, "loss": 0.77580899, "num_input_tokens_seen": 337535245, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35742188, "step": 15651, "time_per_iteration": 2.4164466857910156 }, { "auxiliary_loss_clip": 0.01051233, "auxiliary_loss_mlp": 0.01038098, "balance_loss_clip": 1.01565099, "balance_loss_mlp": 1.01557755, "epoch": 0.941049150759056, "flos": 16544151924480.0, "grad_norm": 1.948662377684444, "language_loss": 0.73065972, "learning_rate": 3.630807306650507e-08, "loss": 0.751553, "num_input_tokens_seen": 337553040, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35742188, "step": 15652, "time_per_iteration": 2.3763630390167236 }, { "auxiliary_loss_clip": 0.01055023, "auxiliary_loss_mlp": 0.01040285, "balance_loss_clip": 1.01537049, "balance_loss_mlp": 1.01655626, "epoch": 0.9411092740117241, "flos": 25117249495680.0, "grad_norm": 1.7394381525351137, "language_loss": 0.67121506, "learning_rate": 3.6234237205260645e-08, "loss": 0.69216812, "num_input_tokens_seen": 337574580, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.38476562, "step": 15653, "time_per_iteration": 2.4335546493530273 }, { "auxiliary_loss_clip": 0.01051008, "auxiliary_loss_mlp": 0.01036122, "balance_loss_clip": 1.01342392, "balance_loss_mlp": 1.01579666, "epoch": 0.941169397264392, "flos": 21141200753280.0, "grad_norm": 1.976442858887634, "language_loss": 0.78803641, "learning_rate": 3.6160475810210536e-08, "loss": 0.80890775, "num_input_tokens_seen": 337593010, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35351562, "step": 15654, "time_per_iteration": 2.3758723735809326 }, { "auxiliary_loss_clip": 0.01052959, "auxiliary_loss_mlp": 0.01035906, "balance_loss_clip": 1.01341128, "balance_loss_mlp": 1.01576376, "epoch": 0.94122952051706, "flos": 38507383572480.0, "grad_norm": 1.4673606203937788, "language_loss": 0.70814204, "learning_rate": 3.6086788884152065e-08, "loss": 0.72903073, "num_input_tokens_seen": 337616170, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.37109375, "step": 15655, "time_per_iteration": 2.5172908306121826 }, { "auxiliary_loss_clip": 0.01051158, "auxiliary_loss_mlp": 0.01040979, "balance_loss_clip": 1.01624298, "balance_loss_mlp": 1.01568627, "epoch": 0.9412896437697279, "flos": 18368224151040.0, "grad_norm": 1.7879057157047091, "language_loss": 0.73356426, "learning_rate": 3.601317642987944e-08, "loss": 0.75448567, "num_input_tokens_seen": 337635215, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.35546875, "step": 15656, "time_per_iteration": 2.3691539764404297 }, { "auxiliary_loss_clip": 0.01049888, "auxiliary_loss_mlp": 0.01035626, "balance_loss_clip": 1.01427591, "balance_loss_mlp": 1.01588106, "epoch": 0.9413497670223959, "flos": 25883767595520.0, "grad_norm": 1.8691132579598406, "language_loss": 0.79492581, "learning_rate": 3.593963845018377e-08, "loss": 0.81578088, "num_input_tokens_seen": 337654195, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.33984375, "step": 15657, "time_per_iteration": 2.4129810333251953 }, { "auxiliary_loss_clip": 0.0105052, "auxiliary_loss_mlp": 0.01037909, "balance_loss_clip": 1.0159148, "balance_loss_mlp": 1.0150702, "epoch": 0.9414098902750638, "flos": 16617364778880.0, "grad_norm": 1.902449610708662, "language_loss": 0.85148299, "learning_rate": 3.586617494785371e-08, "loss": 0.87236726, "num_input_tokens_seen": 337671810, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35546875, "step": 15658, "time_per_iteration": 2.3510165214538574 }, { "auxiliary_loss_clip": 0.01054305, "auxiliary_loss_mlp": 0.01038494, "balance_loss_clip": 1.01241136, "balance_loss_mlp": 1.016559, "epoch": 0.9414700135277319, "flos": 18624032749440.0, "grad_norm": 1.7775656004559017, "language_loss": 0.72003543, "learning_rate": 3.5792785925675254e-08, "loss": 0.7409634, "num_input_tokens_seen": 337689410, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37890625, "step": 15659, "time_per_iteration": 2.377020835876465 }, { "auxiliary_loss_clip": 0.0104931, "auxiliary_loss_mlp": 0.01038843, "balance_loss_clip": 1.01646733, "balance_loss_mlp": 1.01525259, "epoch": 0.9415301367803999, "flos": 26279124364800.0, "grad_norm": 1.598641710190227, "language_loss": 0.801265, "learning_rate": 3.571947138643172e-08, "loss": 0.82214659, "num_input_tokens_seen": 337709950, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.33984375, "step": 15660, "time_per_iteration": 3.8955111503601074 }, { "auxiliary_loss_clip": 0.01048704, "auxiliary_loss_mlp": 0.01033993, "balance_loss_clip": 1.01259542, "balance_loss_mlp": 1.01517761, "epoch": 0.9415902600330678, "flos": 23260184167680.0, "grad_norm": 1.4392962721244393, "language_loss": 0.68352115, "learning_rate": 3.564623133290201e-08, "loss": 0.70434821, "num_input_tokens_seen": 337731320, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.3359375, "step": 15661, "time_per_iteration": 2.4307644367218018 }, { "auxiliary_loss_clip": 0.01051345, "auxiliary_loss_mlp": 0.01035847, "balance_loss_clip": 1.01357913, "balance_loss_mlp": 1.01589549, "epoch": 0.9416503832857358, "flos": 14718299218560.0, "grad_norm": 2.112032573062945, "language_loss": 0.6746403, "learning_rate": 3.557306576786434e-08, "loss": 0.69551229, "num_input_tokens_seen": 337747720, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35546875, "step": 15662, "time_per_iteration": 2.3466787338256836 }, { "auxiliary_loss_clip": 0.01007344, "auxiliary_loss_mlp": 0.01002646, "balance_loss_clip": 1.0006671, "balance_loss_mlp": 1.00096118, "epoch": 0.9417105065384037, "flos": 70309592334720.0, "grad_norm": 0.766958982859779, "language_loss": 0.59388822, "learning_rate": 3.5499974694092935e-08, "loss": 0.61398816, "num_input_tokens_seen": 337806930, "router_z_loss_clip": 0.01977539, "router_z_loss_mlp": 0.06347656, "step": 15663, "time_per_iteration": 3.0594472885131836 }, { "auxiliary_loss_clip": 0.01055156, "auxiliary_loss_mlp": 0.01039283, "balance_loss_clip": 1.01370049, "balance_loss_mlp": 1.01660132, "epoch": 0.9417706297910717, "flos": 34056481161600.0, "grad_norm": 1.7615910425832437, "language_loss": 0.68008703, "learning_rate": 3.542695811435914e-08, "loss": 0.70103145, "num_input_tokens_seen": 337828100, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38476562, "step": 15664, "time_per_iteration": 2.4801340103149414 }, { "auxiliary_loss_clip": 0.01052926, "auxiliary_loss_mlp": 0.01036655, "balance_loss_clip": 1.0136354, "balance_loss_mlp": 1.01700747, "epoch": 0.9418307530437396, "flos": 16470694690560.0, "grad_norm": 2.222533073935684, "language_loss": 0.74297833, "learning_rate": 3.535401603143207e-08, "loss": 0.76387417, "num_input_tokens_seen": 337844805, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.359375, "step": 15665, "time_per_iteration": 2.370307207107544 }, { "auxiliary_loss_clip": 0.01049788, "auxiliary_loss_mlp": 0.0103685, "balance_loss_clip": 1.01548719, "balance_loss_mlp": 1.01631618, "epoch": 0.9418908762964077, "flos": 11252785420800.0, "grad_norm": 2.95733737200462, "language_loss": 0.64406824, "learning_rate": 3.528114844807773e-08, "loss": 0.66493464, "num_input_tokens_seen": 337860490, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.33398438, "step": 15666, "time_per_iteration": 2.344013214111328 }, { "auxiliary_loss_clip": 0.01051705, "auxiliary_loss_mlp": 0.01036317, "balance_loss_clip": 1.0135361, "balance_loss_mlp": 1.01601267, "epoch": 0.9419509995490756, "flos": 18437945869440.0, "grad_norm": 1.7151042841992807, "language_loss": 0.79952621, "learning_rate": 3.520835536705902e-08, "loss": 0.82040638, "num_input_tokens_seen": 337878360, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35742188, "step": 15667, "time_per_iteration": 2.3620805740356445 }, { "auxiliary_loss_clip": 0.01049746, "auxiliary_loss_mlp": 0.01033159, "balance_loss_clip": 1.01246452, "balance_loss_mlp": 1.0151757, "epoch": 0.9420111228017436, "flos": 20736976498560.0, "grad_norm": 1.678955583754132, "language_loss": 0.76715958, "learning_rate": 3.5135636791136404e-08, "loss": 0.7879886, "num_input_tokens_seen": 337895635, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.34570312, "step": 15668, "time_per_iteration": 2.3686819076538086 }, { "auxiliary_loss_clip": 0.01052544, "auxiliary_loss_mlp": 0.01039064, "balance_loss_clip": 1.01409018, "balance_loss_mlp": 1.01543689, "epoch": 0.9420712460544115, "flos": 21140886551040.0, "grad_norm": 4.611883980605143, "language_loss": 0.60261154, "learning_rate": 3.506299272306723e-08, "loss": 0.62352771, "num_input_tokens_seen": 337913940, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37109375, "step": 15669, "time_per_iteration": 2.3777499198913574 }, { "auxiliary_loss_clip": 0.01048029, "auxiliary_loss_mlp": 0.01030451, "balance_loss_clip": 1.01060319, "balance_loss_mlp": 1.01487875, "epoch": 0.9421313693070795, "flos": 15850846679040.0, "grad_norm": 1.4650272705966911, "language_loss": 0.78346336, "learning_rate": 3.4990423165606406e-08, "loss": 0.80424809, "num_input_tokens_seen": 337932015, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.33203125, "step": 15670, "time_per_iteration": 2.3679869174957275 }, { "auxiliary_loss_clip": 0.01050984, "auxiliary_loss_mlp": 0.01035949, "balance_loss_clip": 1.01413357, "balance_loss_mlp": 1.01619506, "epoch": 0.9421914925597474, "flos": 32414550831360.0, "grad_norm": 2.1348573545554252, "language_loss": 0.66222078, "learning_rate": 3.491792812150574e-08, "loss": 0.68309009, "num_input_tokens_seen": 337953345, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.34765625, "step": 15671, "time_per_iteration": 2.45574951171875 }, { "auxiliary_loss_clip": 0.01051075, "auxiliary_loss_mlp": 0.01036603, "balance_loss_clip": 1.01297569, "balance_loss_mlp": 1.01554179, "epoch": 0.9422516158124155, "flos": 19717512531840.0, "grad_norm": 1.686994194948593, "language_loss": 0.80618894, "learning_rate": 3.48455075935139e-08, "loss": 0.82706571, "num_input_tokens_seen": 337973685, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35546875, "step": 15672, "time_per_iteration": 2.4345691204071045 }, { "auxiliary_loss_clip": 0.01054495, "auxiliary_loss_mlp": 0.01039649, "balance_loss_clip": 1.01406705, "balance_loss_mlp": 1.01584673, "epoch": 0.9423117390650835, "flos": 16252347847680.0, "grad_norm": 2.957850114322184, "language_loss": 0.74753499, "learning_rate": 3.47731615843776e-08, "loss": 0.76847643, "num_input_tokens_seen": 337989175, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38671875, "step": 15673, "time_per_iteration": 2.3533878326416016 }, { "auxiliary_loss_clip": 0.01050621, "auxiliary_loss_mlp": 0.0103338, "balance_loss_clip": 1.00982451, "balance_loss_mlp": 1.0150578, "epoch": 0.9423718623177514, "flos": 31795191578880.0, "grad_norm": 1.4225851448468043, "language_loss": 0.7104491, "learning_rate": 3.470089009683974e-08, "loss": 0.73128909, "num_input_tokens_seen": 338011800, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.35546875, "step": 15674, "time_per_iteration": 2.4670565128326416 }, { "auxiliary_loss_clip": 0.01050825, "auxiliary_loss_mlp": 0.01030507, "balance_loss_clip": 1.00898933, "balance_loss_mlp": 1.01511526, "epoch": 0.9424319855704194, "flos": 23330673936000.0, "grad_norm": 1.8188527376441446, "language_loss": 0.81617695, "learning_rate": 3.462869313364125e-08, "loss": 0.83699024, "num_input_tokens_seen": 338032120, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.35742188, "step": 15675, "time_per_iteration": 2.4221954345703125 }, { "auxiliary_loss_clip": 0.01051244, "auxiliary_loss_mlp": 0.01037967, "balance_loss_clip": 1.01633072, "balance_loss_mlp": 1.01587164, "epoch": 0.9424921088230873, "flos": 20776567847040.0, "grad_norm": 1.592897648999295, "language_loss": 0.63371134, "learning_rate": 3.4556570697519494e-08, "loss": 0.65460342, "num_input_tokens_seen": 338051880, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.35351562, "step": 15676, "time_per_iteration": 2.3891806602478027 }, { "auxiliary_loss_clip": 0.0105139, "auxiliary_loss_mlp": 0.01037082, "balance_loss_clip": 1.01234555, "balance_loss_mlp": 1.01531136, "epoch": 0.9425522320757553, "flos": 19025638652160.0, "grad_norm": 1.7811726629502391, "language_loss": 0.6793493, "learning_rate": 3.448452279120984e-08, "loss": 0.700234, "num_input_tokens_seen": 338069665, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.359375, "step": 15677, "time_per_iteration": 2.360773801803589 }, { "auxiliary_loss_clip": 0.0105241, "auxiliary_loss_mlp": 0.01035389, "balance_loss_clip": 1.01072443, "balance_loss_mlp": 1.01515222, "epoch": 0.9426123553284232, "flos": 25154187580800.0, "grad_norm": 1.7110955319939203, "language_loss": 0.65850306, "learning_rate": 3.441254941744387e-08, "loss": 0.67938113, "num_input_tokens_seen": 338090490, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.37109375, "step": 15678, "time_per_iteration": 3.680584192276001 }, { "auxiliary_loss_clip": 0.01051204, "auxiliary_loss_mlp": 0.01034525, "balance_loss_clip": 1.01131546, "balance_loss_mlp": 1.01634359, "epoch": 0.9426724785810913, "flos": 21178278483840.0, "grad_norm": 1.865312280566845, "language_loss": 0.75618827, "learning_rate": 3.434065057895097e-08, "loss": 0.77704561, "num_input_tokens_seen": 338109825, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.34765625, "step": 15679, "time_per_iteration": 2.3730101585388184 }, { "auxiliary_loss_clip": 0.01053749, "auxiliary_loss_mlp": 0.01036582, "balance_loss_clip": 1.01333642, "balance_loss_mlp": 1.01644516, "epoch": 0.9427326018337592, "flos": 14756040264960.0, "grad_norm": 2.1160773064386933, "language_loss": 0.78637087, "learning_rate": 3.426882627845762e-08, "loss": 0.8072741, "num_input_tokens_seen": 338125790, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.37304688, "step": 15680, "time_per_iteration": 2.351349115371704 }, { "auxiliary_loss_clip": 0.01050805, "auxiliary_loss_mlp": 0.0103768, "balance_loss_clip": 1.01395762, "balance_loss_mlp": 1.01575518, "epoch": 0.9427927250864272, "flos": 20922574619520.0, "grad_norm": 1.9274873690265357, "language_loss": 0.76353133, "learning_rate": 3.419707651868742e-08, "loss": 0.7844162, "num_input_tokens_seen": 338145610, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.34960938, "step": 15681, "time_per_iteration": 2.358640193939209 }, { "auxiliary_loss_clip": 0.01050909, "auxiliary_loss_mlp": 0.01039376, "balance_loss_clip": 1.01546264, "balance_loss_mlp": 1.01551008, "epoch": 0.9428528483390951, "flos": 19751587885440.0, "grad_norm": 2.7010765932653196, "language_loss": 0.67387354, "learning_rate": 3.412540130236086e-08, "loss": 0.69477642, "num_input_tokens_seen": 338165960, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.35351562, "step": 15682, "time_per_iteration": 2.3876149654388428 }, { "auxiliary_loss_clip": 0.01050769, "auxiliary_loss_mlp": 0.01037974, "balance_loss_clip": 1.01330996, "balance_loss_mlp": 1.01536489, "epoch": 0.9429129715917631, "flos": 24533850810240.0, "grad_norm": 6.4492329066291765, "language_loss": 0.77862942, "learning_rate": 3.405380063219665e-08, "loss": 0.79951686, "num_input_tokens_seen": 338187215, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.35351562, "step": 15683, "time_per_iteration": 2.392002582550049 }, { "auxiliary_loss_clip": 0.01054369, "auxiliary_loss_mlp": 0.01044228, "balance_loss_clip": 1.01803744, "balance_loss_mlp": 1.01616371, "epoch": 0.942973094844431, "flos": 17959077394560.0, "grad_norm": 2.5979596334955066, "language_loss": 0.76860291, "learning_rate": 3.398227451090885e-08, "loss": 0.78958887, "num_input_tokens_seen": 338201825, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.3828125, "step": 15684, "time_per_iteration": 2.338413953781128 }, { "auxiliary_loss_clip": 0.01048615, "auxiliary_loss_mlp": 0.01029085, "balance_loss_clip": 1.0097611, "balance_loss_mlp": 1.01498389, "epoch": 0.9430332180970991, "flos": 26136573816960.0, "grad_norm": 1.513125249916925, "language_loss": 0.78003323, "learning_rate": 3.391082294121017e-08, "loss": 0.80081022, "num_input_tokens_seen": 338220865, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.3359375, "step": 15685, "time_per_iteration": 2.408259391784668 }, { "auxiliary_loss_clip": 0.01049306, "auxiliary_loss_mlp": 0.01036191, "balance_loss_clip": 1.01672387, "balance_loss_mlp": 1.01500905, "epoch": 0.943093341349767, "flos": 23950242656640.0, "grad_norm": 1.833602543248329, "language_loss": 0.76786196, "learning_rate": 3.383944592581023e-08, "loss": 0.78871697, "num_input_tokens_seen": 338240160, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.34375, "step": 15686, "time_per_iteration": 2.3895561695098877 }, { "auxiliary_loss_clip": 0.01052849, "auxiliary_loss_mlp": 0.01042002, "balance_loss_clip": 1.01787424, "balance_loss_mlp": 1.01557934, "epoch": 0.943153464602435, "flos": 17967421209600.0, "grad_norm": 1.7689822182921808, "language_loss": 0.81614923, "learning_rate": 3.376814346741575e-08, "loss": 0.83709776, "num_input_tokens_seen": 338259305, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.37304688, "step": 15687, "time_per_iteration": 2.354069471359253 }, { "auxiliary_loss_clip": 0.01053648, "auxiliary_loss_mlp": 0.01038616, "balance_loss_clip": 1.0126996, "balance_loss_mlp": 1.01639271, "epoch": 0.943213587855103, "flos": 14500650602880.0, "grad_norm": 2.0626068598587595, "language_loss": 0.77690166, "learning_rate": 3.369691556873011e-08, "loss": 0.79782426, "num_input_tokens_seen": 338274950, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37109375, "step": 15688, "time_per_iteration": 5.177797794342041 }, { "auxiliary_loss_clip": 0.01047647, "auxiliary_loss_mlp": 0.01030198, "balance_loss_clip": 1.00877619, "balance_loss_mlp": 1.01445019, "epoch": 0.9432737111077709, "flos": 28985137246080.0, "grad_norm": 1.5647809383099627, "language_loss": 0.69460744, "learning_rate": 3.3625762232454504e-08, "loss": 0.71538591, "num_input_tokens_seen": 338295585, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.33203125, "step": 15689, "time_per_iteration": 2.430363893508911 }, { "auxiliary_loss_clip": 0.01049501, "auxiliary_loss_mlp": 0.01034677, "balance_loss_clip": 1.01503158, "balance_loss_mlp": 1.01619172, "epoch": 0.9433338343604389, "flos": 21608199365760.0, "grad_norm": 1.7651282329268918, "language_loss": 0.81571043, "learning_rate": 3.35546834612872e-08, "loss": 0.83655214, "num_input_tokens_seen": 338314555, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.33203125, "step": 15690, "time_per_iteration": 2.3944578170776367 }, { "auxiliary_loss_clip": 0.01049292, "auxiliary_loss_mlp": 0.01036993, "balance_loss_clip": 1.01541638, "balance_loss_mlp": 1.01492918, "epoch": 0.9433939576131068, "flos": 33180894374400.0, "grad_norm": 1.8153269164989358, "language_loss": 0.61306536, "learning_rate": 3.348367925792317e-08, "loss": 0.63392818, "num_input_tokens_seen": 338336260, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.34375, "step": 15691, "time_per_iteration": 2.452047824859619 }, { "auxiliary_loss_clip": 0.01052879, "auxiliary_loss_mlp": 0.01038988, "balance_loss_clip": 1.01712489, "balance_loss_mlp": 1.01677012, "epoch": 0.9434540808657749, "flos": 20485322352000.0, "grad_norm": 1.5130579082667968, "language_loss": 0.67191529, "learning_rate": 3.341274962505514e-08, "loss": 0.69283402, "num_input_tokens_seen": 338354680, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.36132812, "step": 15692, "time_per_iteration": 2.38089919090271 }, { "auxiliary_loss_clip": 0.01052184, "auxiliary_loss_mlp": 0.01036872, "balance_loss_clip": 1.01270819, "balance_loss_mlp": 1.01661599, "epoch": 0.9435142041184428, "flos": 21541898960640.0, "grad_norm": 2.093818173948926, "language_loss": 0.75675499, "learning_rate": 3.334189456537251e-08, "loss": 0.77764553, "num_input_tokens_seen": 338372490, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.35546875, "step": 15693, "time_per_iteration": 2.36436128616333 }, { "auxiliary_loss_clip": 0.01051585, "auxiliary_loss_mlp": 0.01035046, "balance_loss_clip": 1.01151359, "balance_loss_mlp": 1.01612198, "epoch": 0.9435743273711108, "flos": 25007936428800.0, "grad_norm": 1.6107327433275025, "language_loss": 0.73632777, "learning_rate": 3.327111408156291e-08, "loss": 0.75719404, "num_input_tokens_seen": 338390870, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.35351562, "step": 15694, "time_per_iteration": 2.40400767326355 }, { "auxiliary_loss_clip": 0.01006817, "auxiliary_loss_mlp": 0.01002314, "balance_loss_clip": 0.99997795, "balance_loss_mlp": 1.00051272, "epoch": 0.9436344506237787, "flos": 60155172420480.0, "grad_norm": 0.6997556146662767, "language_loss": 0.50689393, "learning_rate": 3.3200408176309316e-08, "loss": 0.52698529, "num_input_tokens_seen": 338453075, "router_z_loss_clip": 0.02331543, "router_z_loss_mlp": 0.06298828, "step": 15695, "time_per_iteration": 3.0540308952331543 }, { "auxiliary_loss_clip": 0.01047909, "auxiliary_loss_mlp": 0.01033625, "balance_loss_clip": 1.01387215, "balance_loss_mlp": 1.01517987, "epoch": 0.9436945738764467, "flos": 22236146812800.0, "grad_norm": 1.6622802311362894, "language_loss": 0.66174775, "learning_rate": 3.312977685229335e-08, "loss": 0.68256307, "num_input_tokens_seen": 338471770, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.328125, "step": 15696, "time_per_iteration": 2.4006612300872803 }, { "auxiliary_loss_clip": 0.01051873, "auxiliary_loss_mlp": 0.01032506, "balance_loss_clip": 1.01107192, "balance_loss_mlp": 1.01652789, "epoch": 0.9437546971291146, "flos": 25044036641280.0, "grad_norm": 1.7213576577931613, "language_loss": 0.67571533, "learning_rate": 3.305922011219353e-08, "loss": 0.69655919, "num_input_tokens_seen": 338492190, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.35351562, "step": 15697, "time_per_iteration": 2.407270908355713 }, { "auxiliary_loss_clip": 0.01007196, "auxiliary_loss_mlp": 0.0100259, "balance_loss_clip": 1.00050378, "balance_loss_mlp": 1.00077844, "epoch": 0.9438148203817827, "flos": 56787660408960.0, "grad_norm": 0.8506090074101591, "language_loss": 0.63308966, "learning_rate": 3.298873795868506e-08, "loss": 0.65318751, "num_input_tokens_seen": 338552560, "router_z_loss_clip": 0.02087402, "router_z_loss_mlp": 0.06445312, "step": 15698, "time_per_iteration": 2.9413440227508545 }, { "auxiliary_loss_clip": 0.01054237, "auxiliary_loss_mlp": 0.01042373, "balance_loss_clip": 1.01774395, "balance_loss_mlp": 1.01646614, "epoch": 0.9438749436344506, "flos": 22345285322880.0, "grad_norm": 1.7582276677596744, "language_loss": 0.70387596, "learning_rate": 3.291833039444092e-08, "loss": 0.72484207, "num_input_tokens_seen": 338571770, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37890625, "step": 15699, "time_per_iteration": 2.395869493484497 }, { "auxiliary_loss_clip": 0.0104836, "auxiliary_loss_mlp": 0.01035021, "balance_loss_clip": 1.01330101, "balance_loss_mlp": 1.01444578, "epoch": 0.9439350668871186, "flos": 13369953444480.0, "grad_norm": 3.2685026257879897, "language_loss": 0.77191794, "learning_rate": 3.2847997422130734e-08, "loss": 0.79275179, "num_input_tokens_seen": 338587310, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.33984375, "step": 15700, "time_per_iteration": 3.8112454414367676 }, { "auxiliary_loss_clip": 0.0105067, "auxiliary_loss_mlp": 0.0103516, "balance_loss_clip": 1.01487017, "balance_loss_mlp": 1.01568592, "epoch": 0.9439951901397866, "flos": 17784371617920.0, "grad_norm": 1.538161794301533, "language_loss": 0.71721721, "learning_rate": 3.2777739044421495e-08, "loss": 0.73807549, "num_input_tokens_seen": 338606235, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.34960938, "step": 15701, "time_per_iteration": 2.369601011276245 }, { "auxiliary_loss_clip": 0.01054447, "auxiliary_loss_mlp": 0.01036548, "balance_loss_clip": 1.01269388, "balance_loss_mlp": 1.01606381, "epoch": 0.9440553133924545, "flos": 18878584538880.0, "grad_norm": 1.9926690784734284, "language_loss": 0.78800839, "learning_rate": 3.2707555263977505e-08, "loss": 0.80891836, "num_input_tokens_seen": 338624090, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3828125, "step": 15702, "time_per_iteration": 2.3600189685821533 }, { "auxiliary_loss_clip": 0.01052109, "auxiliary_loss_mlp": 0.01037032, "balance_loss_clip": 1.01435804, "balance_loss_mlp": 1.01584864, "epoch": 0.9441154366451225, "flos": 19571959607040.0, "grad_norm": 1.7766818579195918, "language_loss": 0.67802179, "learning_rate": 3.2637446083460194e-08, "loss": 0.69891316, "num_input_tokens_seen": 338643695, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36328125, "step": 15703, "time_per_iteration": 2.3622708320617676 }, { "auxiliary_loss_clip": 0.01052455, "auxiliary_loss_mlp": 0.0103906, "balance_loss_clip": 1.01492, "balance_loss_mlp": 1.01562691, "epoch": 0.9441755598977905, "flos": 30293821848960.0, "grad_norm": 1.8644875813867392, "language_loss": 0.74283695, "learning_rate": 3.256741150552833e-08, "loss": 0.7637521, "num_input_tokens_seen": 338664725, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3671875, "step": 15704, "time_per_iteration": 2.4171149730682373 }, { "auxiliary_loss_clip": 0.0105057, "auxiliary_loss_mlp": 0.0103747, "balance_loss_clip": 1.01472533, "balance_loss_mlp": 1.01542127, "epoch": 0.9442356831504585, "flos": 20666835843840.0, "grad_norm": 1.7897907650679277, "language_loss": 0.75566, "learning_rate": 3.2497451532837336e-08, "loss": 0.7765404, "num_input_tokens_seen": 338683990, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 15705, "time_per_iteration": 2.3887364864349365 }, { "auxiliary_loss_clip": 0.01050424, "auxiliary_loss_mlp": 0.01037213, "balance_loss_clip": 1.01545739, "balance_loss_mlp": 1.01553643, "epoch": 0.9442958064031264, "flos": 16106341075200.0, "grad_norm": 1.9089173177989442, "language_loss": 0.7800808, "learning_rate": 3.2427566168039986e-08, "loss": 0.8009572, "num_input_tokens_seen": 338702025, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.34765625, "step": 15706, "time_per_iteration": 2.3486855030059814 }, { "auxiliary_loss_clip": 0.01048838, "auxiliary_loss_mlp": 0.01032765, "balance_loss_clip": 1.01181996, "balance_loss_mlp": 1.01579976, "epoch": 0.9443559296557944, "flos": 20446394319360.0, "grad_norm": 1.4893752781675078, "language_loss": 0.69977427, "learning_rate": 3.23577554137866e-08, "loss": 0.72059029, "num_input_tokens_seen": 338720920, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.33007812, "step": 15707, "time_per_iteration": 2.3724913597106934 }, { "auxiliary_loss_clip": 0.01046779, "auxiliary_loss_mlp": 0.01035346, "balance_loss_clip": 1.01529455, "balance_loss_mlp": 1.01394308, "epoch": 0.9444160529084623, "flos": 21609979845120.0, "grad_norm": 2.0935765545694203, "language_loss": 0.70179212, "learning_rate": 3.22880192727244e-08, "loss": 0.72261339, "num_input_tokens_seen": 338739590, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.328125, "step": 15708, "time_per_iteration": 2.3791630268096924 }, { "auxiliary_loss_clip": 0.01048398, "auxiliary_loss_mlp": 0.01033756, "balance_loss_clip": 1.0125246, "balance_loss_mlp": 1.0153892, "epoch": 0.9444761761611303, "flos": 18440808600960.0, "grad_norm": 2.6958870654620233, "language_loss": 0.71644783, "learning_rate": 3.221835774749748e-08, "loss": 0.7372694, "num_input_tokens_seen": 338757240, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.33007812, "step": 15709, "time_per_iteration": 2.35504412651062 }, { "auxiliary_loss_clip": 0.01051729, "auxiliary_loss_mlp": 0.01034962, "balance_loss_clip": 1.01274133, "balance_loss_mlp": 1.01697791, "epoch": 0.9445362994137982, "flos": 20956161214080.0, "grad_norm": 2.1278301484370585, "language_loss": 0.86089122, "learning_rate": 3.214877084074774e-08, "loss": 0.88175809, "num_input_tokens_seen": 338773750, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34765625, "step": 15710, "time_per_iteration": 2.35038685798645 }, { "auxiliary_loss_clip": 0.01054751, "auxiliary_loss_mlp": 0.01039843, "balance_loss_clip": 1.0151906, "balance_loss_mlp": 1.01669383, "epoch": 0.9445964226664663, "flos": 20302237848960.0, "grad_norm": 1.5593689613397428, "language_loss": 0.72561204, "learning_rate": 3.2079258555113956e-08, "loss": 0.74655801, "num_input_tokens_seen": 338792115, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.38085938, "step": 15711, "time_per_iteration": 2.368347644805908 }, { "auxiliary_loss_clip": 0.01054099, "auxiliary_loss_mlp": 0.01035492, "balance_loss_clip": 1.01296139, "balance_loss_mlp": 1.01839876, "epoch": 0.9446565459191342, "flos": 26394826210560.0, "grad_norm": 1.7496243806983809, "language_loss": 0.70933217, "learning_rate": 3.200982089323179e-08, "loss": 0.73022807, "num_input_tokens_seen": 338812480, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35742188, "step": 15712, "time_per_iteration": 2.4209465980529785 }, { "auxiliary_loss_clip": 0.01053939, "auxiliary_loss_mlp": 0.01038729, "balance_loss_clip": 1.01383829, "balance_loss_mlp": 1.0175941, "epoch": 0.9447166691718022, "flos": 16543837722240.0, "grad_norm": 2.429246153867456, "language_loss": 0.71340889, "learning_rate": 3.1940457857734246e-08, "loss": 0.73433554, "num_input_tokens_seen": 338829105, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36328125, "step": 15713, "time_per_iteration": 2.3387863636016846 }, { "auxiliary_loss_clip": 0.01050294, "auxiliary_loss_mlp": 0.01035236, "balance_loss_clip": 1.01237118, "balance_loss_mlp": 1.01571119, "epoch": 0.9447767924244702, "flos": 29163473804160.0, "grad_norm": 1.5251877969228511, "language_loss": 0.77621722, "learning_rate": 3.187116945125212e-08, "loss": 0.79707259, "num_input_tokens_seen": 338850670, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34570312, "step": 15714, "time_per_iteration": 2.4268229007720947 }, { "auxiliary_loss_clip": 0.01051506, "auxiliary_loss_mlp": 0.01039961, "balance_loss_clip": 1.01620245, "balance_loss_mlp": 1.0155549, "epoch": 0.9448369156771381, "flos": 19274080953600.0, "grad_norm": 1.948182136061874, "language_loss": 0.68240595, "learning_rate": 3.1801955676412194e-08, "loss": 0.70332062, "num_input_tokens_seen": 338867795, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.359375, "step": 15715, "time_per_iteration": 2.340466260910034 }, { "auxiliary_loss_clip": 0.01051631, "auxiliary_loss_mlp": 0.01041643, "balance_loss_clip": 1.0182662, "balance_loss_mlp": 1.0156678, "epoch": 0.9448970389298061, "flos": 23840056805760.0, "grad_norm": 1.9675959950097006, "language_loss": 0.76481324, "learning_rate": 3.173281653583948e-08, "loss": 0.78574598, "num_input_tokens_seen": 338887205, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 15716, "time_per_iteration": 2.421613931655884 }, { "auxiliary_loss_clip": 0.01052931, "auxiliary_loss_mlp": 0.01037962, "balance_loss_clip": 1.01407206, "balance_loss_mlp": 1.01658416, "epoch": 0.944957162182474, "flos": 22381176067200.0, "grad_norm": 1.7591417026581748, "language_loss": 0.63288689, "learning_rate": 3.166375203215565e-08, "loss": 0.65379584, "num_input_tokens_seen": 338906130, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36328125, "step": 15717, "time_per_iteration": 3.7301392555236816 }, { "auxiliary_loss_clip": 0.01051169, "auxiliary_loss_mlp": 0.0103949, "balance_loss_clip": 1.01769888, "balance_loss_mlp": 1.01597261, "epoch": 0.9450172854351421, "flos": 17382940272000.0, "grad_norm": 1.5609500568491539, "language_loss": 0.79615843, "learning_rate": 3.1594762167979514e-08, "loss": 0.81706494, "num_input_tokens_seen": 338923045, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.3515625, "step": 15718, "time_per_iteration": 2.338719606399536 }, { "auxiliary_loss_clip": 0.01007053, "auxiliary_loss_mlp": 0.010024, "balance_loss_clip": 1.00036168, "balance_loss_mlp": 1.00079274, "epoch": 0.94507740868781, "flos": 68462895680640.0, "grad_norm": 0.7020023369103464, "language_loss": 0.57897186, "learning_rate": 3.152584694592719e-08, "loss": 0.5990665, "num_input_tokens_seen": 338987545, "router_z_loss_clip": 0.02038574, "router_z_loss_mlp": 0.0625, "step": 15719, "time_per_iteration": 3.036783218383789 }, { "auxiliary_loss_clip": 0.01052792, "auxiliary_loss_mlp": 0.01037762, "balance_loss_clip": 1.01394355, "balance_loss_mlp": 1.01689458, "epoch": 0.945137531940478, "flos": 21141410221440.0, "grad_norm": 1.808888907378482, "language_loss": 0.77094722, "learning_rate": 3.145700636861193e-08, "loss": 0.79185271, "num_input_tokens_seen": 339007830, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.359375, "step": 15720, "time_per_iteration": 2.409707546234131 }, { "auxiliary_loss_clip": 0.01050407, "auxiliary_loss_mlp": 0.01031574, "balance_loss_clip": 1.0109508, "balance_loss_mlp": 1.01543927, "epoch": 0.9451976551931459, "flos": 24532803469440.0, "grad_norm": 1.8868901117773662, "language_loss": 0.73210549, "learning_rate": 3.138824043864452e-08, "loss": 0.75292528, "num_input_tokens_seen": 339028980, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.34960938, "step": 15721, "time_per_iteration": 2.4238338470458984 }, { "auxiliary_loss_clip": 0.01051869, "auxiliary_loss_mlp": 0.01041294, "balance_loss_clip": 1.01854873, "balance_loss_mlp": 1.01627827, "epoch": 0.9452577784458139, "flos": 23439463332480.0, "grad_norm": 1.7299126540210101, "language_loss": 0.86341059, "learning_rate": 3.131954915863244e-08, "loss": 0.88434231, "num_input_tokens_seen": 339047950, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35546875, "step": 15722, "time_per_iteration": 2.388282060623169 }, { "auxiliary_loss_clip": 0.01006695, "auxiliary_loss_mlp": 0.01001614, "balance_loss_clip": 0.99962324, "balance_loss_mlp": 1.00048327, "epoch": 0.9453179016984818, "flos": 52014509349120.0, "grad_norm": 0.8969533625200669, "language_loss": 0.64559221, "learning_rate": 3.125093253118005e-08, "loss": 0.66567528, "num_input_tokens_seen": 339104535, "router_z_loss_clip": 0.01989746, "router_z_loss_mlp": 0.06201172, "step": 15723, "time_per_iteration": 2.966252565383911 }, { "auxiliary_loss_clip": 0.01054788, "auxiliary_loss_mlp": 0.01038093, "balance_loss_clip": 1.01330924, "balance_loss_mlp": 1.01765704, "epoch": 0.9453780249511499, "flos": 13472354062080.0, "grad_norm": 1.9908249120846178, "language_loss": 0.74402249, "learning_rate": 3.1182390558889715e-08, "loss": 0.76495135, "num_input_tokens_seen": 339122050, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37109375, "step": 15724, "time_per_iteration": 2.353783369064331 }, { "auxiliary_loss_clip": 0.0105029, "auxiliary_loss_mlp": 0.01035819, "balance_loss_clip": 1.01427817, "balance_loss_mlp": 1.01510239, "epoch": 0.9454381482038178, "flos": 23257391258880.0, "grad_norm": 2.6483226999533422, "language_loss": 0.86021805, "learning_rate": 3.111392324436024e-08, "loss": 0.88107908, "num_input_tokens_seen": 339138940, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.3515625, "step": 15725, "time_per_iteration": 2.413386344909668 }, { "auxiliary_loss_clip": 0.01051607, "auxiliary_loss_mlp": 0.01030306, "balance_loss_clip": 1.0082643, "balance_loss_mlp": 1.01587057, "epoch": 0.9454982714564858, "flos": 19495709464320.0, "grad_norm": 2.393209428691187, "language_loss": 0.72035265, "learning_rate": 3.104553059018822e-08, "loss": 0.74117178, "num_input_tokens_seen": 339158245, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35742188, "step": 15726, "time_per_iteration": 2.3790717124938965 }, { "auxiliary_loss_clip": 0.01053726, "auxiliary_loss_mlp": 0.01039573, "balance_loss_clip": 1.01641083, "balance_loss_mlp": 1.01768899, "epoch": 0.9455583947091538, "flos": 23257007233920.0, "grad_norm": 1.8265232387882848, "language_loss": 0.62331235, "learning_rate": 3.097721259896735e-08, "loss": 0.64424533, "num_input_tokens_seen": 339178200, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.359375, "step": 15727, "time_per_iteration": 3.783712863922119 }, { "auxiliary_loss_clip": 0.01048854, "auxiliary_loss_mlp": 0.01037163, "balance_loss_clip": 1.0161705, "balance_loss_mlp": 1.01505744, "epoch": 0.9456185179618217, "flos": 17672160908160.0, "grad_norm": 2.0313236856982266, "language_loss": 0.83083779, "learning_rate": 3.0908969273287566e-08, "loss": 0.85169792, "num_input_tokens_seen": 339193950, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.33789062, "step": 15728, "time_per_iteration": 2.349702835083008 }, { "auxiliary_loss_clip": 0.01006989, "auxiliary_loss_mlp": 0.01004247, "balance_loss_clip": 1.00225627, "balance_loss_mlp": 1.00068545, "epoch": 0.9456786412144897, "flos": 61412046698880.0, "grad_norm": 0.7274375207276249, "language_loss": 0.59050035, "learning_rate": 3.08408006157368e-08, "loss": 0.61061275, "num_input_tokens_seen": 339252330, "router_z_loss_clip": 0.01989746, "router_z_loss_mlp": 0.06298828, "step": 15729, "time_per_iteration": 2.9529356956481934 }, { "auxiliary_loss_clip": 0.01049143, "auxiliary_loss_mlp": 0.01037093, "balance_loss_clip": 1.01321554, "balance_loss_mlp": 1.01421511, "epoch": 0.9457387644671577, "flos": 18587373955200.0, "grad_norm": 2.1071409430480292, "language_loss": 0.77911067, "learning_rate": 3.077270662890052e-08, "loss": 0.79997301, "num_input_tokens_seen": 339270325, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.34960938, "step": 15730, "time_per_iteration": 2.3394429683685303 }, { "auxiliary_loss_clip": 0.01050876, "auxiliary_loss_mlp": 0.01038493, "balance_loss_clip": 1.01347136, "balance_loss_mlp": 1.0153923, "epoch": 0.9457988877198257, "flos": 21107404690560.0, "grad_norm": 1.573660215432561, "language_loss": 0.63896251, "learning_rate": 3.070468731536047e-08, "loss": 0.6598562, "num_input_tokens_seen": 339291980, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.35546875, "step": 15731, "time_per_iteration": 2.3992385864257812 }, { "auxiliary_loss_clip": 0.01053233, "auxiliary_loss_mlp": 0.0103945, "balance_loss_clip": 1.0147264, "balance_loss_mlp": 1.01643026, "epoch": 0.9458590109724936, "flos": 26687153957760.0, "grad_norm": 1.7031635192331982, "language_loss": 0.64893425, "learning_rate": 3.063674267769589e-08, "loss": 0.66986114, "num_input_tokens_seen": 339311795, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3671875, "step": 15732, "time_per_iteration": 2.4150149822235107 }, { "auxiliary_loss_clip": 0.01052734, "auxiliary_loss_mlp": 0.01039815, "balance_loss_clip": 1.01410115, "balance_loss_mlp": 1.01579356, "epoch": 0.9459191342251616, "flos": 18660586809600.0, "grad_norm": 1.8639573177589215, "language_loss": 0.85138643, "learning_rate": 3.056887271848363e-08, "loss": 0.87231195, "num_input_tokens_seen": 339327745, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.36914062, "step": 15733, "time_per_iteration": 2.3545610904693604 }, { "auxiliary_loss_clip": 0.01050259, "auxiliary_loss_mlp": 0.01033396, "balance_loss_clip": 1.01372635, "balance_loss_mlp": 1.01625395, "epoch": 0.9459792574778295, "flos": 23397498011520.0, "grad_norm": 1.633419121414046, "language_loss": 0.73101872, "learning_rate": 3.0501077440297173e-08, "loss": 0.75185525, "num_input_tokens_seen": 339346445, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.33984375, "step": 15734, "time_per_iteration": 2.3664653301239014 }, { "auxiliary_loss_clip": 0.01047515, "auxiliary_loss_mlp": 0.01033172, "balance_loss_clip": 1.01427782, "balance_loss_mlp": 1.01481378, "epoch": 0.9460393807304975, "flos": 24391719198720.0, "grad_norm": 1.8496724832755427, "language_loss": 0.87283492, "learning_rate": 3.043335684570692e-08, "loss": 0.89364183, "num_input_tokens_seen": 339367945, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.328125, "step": 15735, "time_per_iteration": 2.4172842502593994 }, { "auxiliary_loss_clip": 0.01051713, "auxiliary_loss_mlp": 0.01034762, "balance_loss_clip": 1.0124222, "balance_loss_mlp": 1.01610923, "epoch": 0.9460995039831654, "flos": 21938477627520.0, "grad_norm": 1.8817584183315856, "language_loss": 0.68113804, "learning_rate": 3.036571093728102e-08, "loss": 0.70200282, "num_input_tokens_seen": 339386060, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35546875, "step": 15736, "time_per_iteration": 2.3443260192871094 }, { "auxiliary_loss_clip": 0.01007251, "auxiliary_loss_mlp": 0.01004608, "balance_loss_clip": 1.00260508, "balance_loss_mlp": 1.00084352, "epoch": 0.9461596272358335, "flos": 70319088224640.0, "grad_norm": 3.345535156247148, "language_loss": 0.65383285, "learning_rate": 3.029813971758499e-08, "loss": 0.67395145, "num_input_tokens_seen": 339446695, "router_z_loss_clip": 0.02001953, "router_z_loss_mlp": 0.06445312, "step": 15737, "time_per_iteration": 3.0105857849121094 }, { "auxiliary_loss_clip": 0.01006807, "auxiliary_loss_mlp": 0.01003425, "balance_loss_clip": 1.00149429, "balance_loss_mlp": 1.00062609, "epoch": 0.9462197504885014, "flos": 58587711753600.0, "grad_norm": 0.8032576440295021, "language_loss": 0.58887529, "learning_rate": 3.0230643189181225e-08, "loss": 0.60897762, "num_input_tokens_seen": 339510080, "router_z_loss_clip": 0.01928711, "router_z_loss_mlp": 0.06201172, "step": 15738, "time_per_iteration": 3.0291783809661865 }, { "auxiliary_loss_clip": 0.01049333, "auxiliary_loss_mlp": 0.01037285, "balance_loss_clip": 1.01648343, "balance_loss_mlp": 1.0156709, "epoch": 0.9462798737411694, "flos": 23432830174080.0, "grad_norm": 1.7100755897649076, "language_loss": 0.72265196, "learning_rate": 3.016322135462834e-08, "loss": 0.74351811, "num_input_tokens_seen": 339529335, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.3359375, "step": 15739, "time_per_iteration": 3.7920031547546387 }, { "auxiliary_loss_clip": 0.01051422, "auxiliary_loss_mlp": 0.01040557, "balance_loss_clip": 1.01459348, "balance_loss_mlp": 1.0161258, "epoch": 0.9463399969938374, "flos": 25044909425280.0, "grad_norm": 2.311977274133704, "language_loss": 0.65706706, "learning_rate": 3.009587421648363e-08, "loss": 0.67798686, "num_input_tokens_seen": 339548820, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.35351562, "step": 15740, "time_per_iteration": 2.4056015014648438 }, { "auxiliary_loss_clip": 0.01050302, "auxiliary_loss_mlp": 0.01033321, "balance_loss_clip": 1.01148176, "balance_loss_mlp": 1.01619697, "epoch": 0.9464001202465053, "flos": 24351464534400.0, "grad_norm": 1.7601092745326523, "language_loss": 0.67770946, "learning_rate": 3.0028601777301045e-08, "loss": 0.69854569, "num_input_tokens_seen": 339566775, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34179688, "step": 15741, "time_per_iteration": 2.4002838134765625 }, { "auxiliary_loss_clip": 0.01051897, "auxiliary_loss_mlp": 0.01035998, "balance_loss_clip": 1.01295495, "balance_loss_mlp": 1.01601577, "epoch": 0.9464602434991733, "flos": 17164488695040.0, "grad_norm": 1.8318941130114585, "language_loss": 0.76413423, "learning_rate": 2.9961404039630987e-08, "loss": 0.7850132, "num_input_tokens_seen": 339581905, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.359375, "step": 15742, "time_per_iteration": 2.3537046909332275 }, { "auxiliary_loss_clip": 0.01049763, "auxiliary_loss_mlp": 0.01031722, "balance_loss_clip": 1.01157522, "balance_loss_mlp": 1.01582313, "epoch": 0.9465203667518413, "flos": 19937081272320.0, "grad_norm": 1.701882020607607, "language_loss": 0.73186368, "learning_rate": 2.989428100602187e-08, "loss": 0.75267851, "num_input_tokens_seen": 339599870, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.33984375, "step": 15743, "time_per_iteration": 2.356045961380005 }, { "auxiliary_loss_clip": 0.01052126, "auxiliary_loss_mlp": 0.01036346, "balance_loss_clip": 1.01178908, "balance_loss_mlp": 1.01549971, "epoch": 0.9465804900045093, "flos": 20119292991360.0, "grad_norm": 1.5473158208878013, "language_loss": 0.81053233, "learning_rate": 2.982723267901943e-08, "loss": 0.83141708, "num_input_tokens_seen": 339620250, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3671875, "step": 15744, "time_per_iteration": 2.3845858573913574 }, { "auxiliary_loss_clip": 0.01053372, "auxiliary_loss_mlp": 0.01040665, "balance_loss_clip": 1.01684713, "balance_loss_mlp": 1.01688266, "epoch": 0.9466406132571772, "flos": 23910581485440.0, "grad_norm": 1.6986603999036616, "language_loss": 0.79273266, "learning_rate": 2.9760259061165417e-08, "loss": 0.81367308, "num_input_tokens_seen": 339639900, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36328125, "step": 15745, "time_per_iteration": 2.3632452487945557 }, { "auxiliary_loss_clip": 0.01052721, "auxiliary_loss_mlp": 0.01045591, "balance_loss_clip": 1.02128458, "balance_loss_mlp": 1.01596212, "epoch": 0.9467007365098452, "flos": 19932333327360.0, "grad_norm": 1.54415685289877, "language_loss": 0.70964062, "learning_rate": 2.9693360155000014e-08, "loss": 0.73062372, "num_input_tokens_seen": 339658970, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.3671875, "step": 15746, "time_per_iteration": 2.3761560916900635 }, { "auxiliary_loss_clip": 0.0105129, "auxiliary_loss_mlp": 0.01036959, "balance_loss_clip": 1.0136658, "balance_loss_mlp": 1.01568663, "epoch": 0.9467608597625131, "flos": 19309692407040.0, "grad_norm": 2.1451746300962546, "language_loss": 0.57907569, "learning_rate": 2.962653596305964e-08, "loss": 0.59995824, "num_input_tokens_seen": 339675600, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35546875, "step": 15747, "time_per_iteration": 2.3392863273620605 }, { "auxiliary_loss_clip": 0.01006986, "auxiliary_loss_mlp": 0.01002343, "balance_loss_clip": 1.00037611, "balance_loss_mlp": 1.00056291, "epoch": 0.9468209830151811, "flos": 69626865231360.0, "grad_norm": 0.6611829450844628, "language_loss": 0.53334635, "learning_rate": 2.955978648787871e-08, "loss": 0.55343962, "num_input_tokens_seen": 339744505, "router_z_loss_clip": 0.01965332, "router_z_loss_mlp": 0.06445312, "step": 15748, "time_per_iteration": 3.1864516735076904 }, { "auxiliary_loss_clip": 0.01053096, "auxiliary_loss_mlp": 0.01039837, "balance_loss_clip": 1.01681805, "balance_loss_mlp": 1.01724553, "epoch": 0.946881106267849, "flos": 27015407360640.0, "grad_norm": 1.645128205066435, "language_loss": 0.6795038, "learning_rate": 2.9493111731988096e-08, "loss": 0.70043314, "num_input_tokens_seen": 339765810, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35742188, "step": 15749, "time_per_iteration": 2.413214921951294 }, { "auxiliary_loss_clip": 0.01053437, "auxiliary_loss_mlp": 0.01039913, "balance_loss_clip": 1.01417553, "balance_loss_mlp": 1.01645637, "epoch": 0.9469412295205171, "flos": 20189154355200.0, "grad_norm": 2.032378175357304, "language_loss": 0.7736485, "learning_rate": 2.942651169791621e-08, "loss": 0.79458201, "num_input_tokens_seen": 339784125, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.36914062, "step": 15750, "time_per_iteration": 2.366022825241089 }, { "auxiliary_loss_clip": 0.01049806, "auxiliary_loss_mlp": 0.01037373, "balance_loss_clip": 1.01484275, "balance_loss_mlp": 1.01507854, "epoch": 0.947001352773185, "flos": 21323831408640.0, "grad_norm": 1.6304530114950968, "language_loss": 0.68132639, "learning_rate": 2.9359986388188372e-08, "loss": 0.70219821, "num_input_tokens_seen": 339803450, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34765625, "step": 15751, "time_per_iteration": 2.359426975250244 }, { "auxiliary_loss_clip": 0.01051278, "auxiliary_loss_mlp": 0.01035592, "balance_loss_clip": 1.01390803, "balance_loss_mlp": 1.01581454, "epoch": 0.947061476025853, "flos": 21942981192960.0, "grad_norm": 1.6053485425333964, "language_loss": 0.65992463, "learning_rate": 2.929353580532723e-08, "loss": 0.68079329, "num_input_tokens_seen": 339823215, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.35546875, "step": 15752, "time_per_iteration": 2.389608144760132 }, { "auxiliary_loss_clip": 0.01051341, "auxiliary_loss_mlp": 0.010439, "balance_loss_clip": 1.02047551, "balance_loss_mlp": 1.01596475, "epoch": 0.947121599278521, "flos": 21393727683840.0, "grad_norm": 1.649038898526814, "language_loss": 0.72704154, "learning_rate": 2.9227159951852764e-08, "loss": 0.74799395, "num_input_tokens_seen": 339842230, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35351562, "step": 15753, "time_per_iteration": 2.3667352199554443 }, { "auxiliary_loss_clip": 0.01053558, "auxiliary_loss_mlp": 0.0104111, "balance_loss_clip": 1.01375103, "balance_loss_mlp": 1.01605284, "epoch": 0.9471817225311889, "flos": 23074620958080.0, "grad_norm": 3.2016480156782885, "language_loss": 0.72156245, "learning_rate": 2.9160858830281855e-08, "loss": 0.74250913, "num_input_tokens_seen": 339861640, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.375, "step": 15754, "time_per_iteration": 2.3784995079040527 }, { "auxiliary_loss_clip": 0.01052847, "auxiliary_loss_mlp": 0.01037226, "balance_loss_clip": 1.01383758, "balance_loss_mlp": 1.01563597, "epoch": 0.947241845783857, "flos": 11909955542400.0, "grad_norm": 2.1257624029497078, "language_loss": 0.80202901, "learning_rate": 2.9094632443129153e-08, "loss": 0.82292974, "num_input_tokens_seen": 339878210, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.37109375, "step": 15755, "time_per_iteration": 2.322429656982422 }, { "auxiliary_loss_clip": 0.01056105, "auxiliary_loss_mlp": 0.0104093, "balance_loss_clip": 1.01423943, "balance_loss_mlp": 1.01645529, "epoch": 0.9473019690365249, "flos": 20739629761920.0, "grad_norm": 2.4589946703026953, "language_loss": 0.76531327, "learning_rate": 2.9028480792904876e-08, "loss": 0.78628355, "num_input_tokens_seen": 339894255, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.39648438, "step": 15756, "time_per_iteration": 2.4107859134674072 }, { "auxiliary_loss_clip": 0.01052551, "auxiliary_loss_mlp": 0.01034706, "balance_loss_clip": 1.01306951, "balance_loss_mlp": 1.01684213, "epoch": 0.9473620922891929, "flos": 17638923427200.0, "grad_norm": 2.0544760427845774, "language_loss": 0.76171911, "learning_rate": 2.8962403882118347e-08, "loss": 0.78259164, "num_input_tokens_seen": 339912425, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.35742188, "step": 15757, "time_per_iteration": 3.5786876678466797 }, { "auxiliary_loss_clip": 0.01053403, "auxiliary_loss_mlp": 0.01038356, "balance_loss_clip": 1.01428819, "balance_loss_mlp": 1.01627028, "epoch": 0.9474222155418608, "flos": 23548881133440.0, "grad_norm": 2.4125512415762604, "language_loss": 0.80745155, "learning_rate": 2.889640171327512e-08, "loss": 0.82836914, "num_input_tokens_seen": 339929635, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37109375, "step": 15758, "time_per_iteration": 2.39438533782959 }, { "auxiliary_loss_clip": 0.01049845, "auxiliary_loss_mlp": 0.01036933, "balance_loss_clip": 1.0154866, "balance_loss_mlp": 1.01568794, "epoch": 0.9474823387945288, "flos": 27088515480960.0, "grad_norm": 1.436839291673015, "language_loss": 0.72752243, "learning_rate": 2.8830474288877638e-08, "loss": 0.7483902, "num_input_tokens_seen": 339951200, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34179688, "step": 15759, "time_per_iteration": 2.39927077293396 }, { "auxiliary_loss_clip": 0.01048563, "auxiliary_loss_mlp": 0.01033289, "balance_loss_clip": 1.01410866, "balance_loss_mlp": 1.01566064, "epoch": 0.9475424620471967, "flos": 22965412625280.0, "grad_norm": 1.4276200640636403, "language_loss": 0.76750857, "learning_rate": 2.8764621611426344e-08, "loss": 0.7883271, "num_input_tokens_seen": 339971820, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.328125, "step": 15760, "time_per_iteration": 2.4052064418792725 }, { "auxiliary_loss_clip": 0.01051921, "auxiliary_loss_mlp": 0.01033266, "balance_loss_clip": 1.01254773, "balance_loss_mlp": 1.0167259, "epoch": 0.9476025852998647, "flos": 20046638718720.0, "grad_norm": 1.831280554667467, "language_loss": 0.7392031, "learning_rate": 2.8698843683418128e-08, "loss": 0.76005495, "num_input_tokens_seen": 339989420, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.3515625, "step": 15761, "time_per_iteration": 2.3365721702575684 }, { "auxiliary_loss_clip": 0.01051774, "auxiliary_loss_mlp": 0.01038277, "balance_loss_clip": 1.01637864, "balance_loss_mlp": 1.01695716, "epoch": 0.9476627085525327, "flos": 14974596576000.0, "grad_norm": 2.0373545266415047, "language_loss": 0.73546183, "learning_rate": 2.863314050734722e-08, "loss": 0.75636244, "num_input_tokens_seen": 340006690, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34765625, "step": 15762, "time_per_iteration": 2.3458595275878906 }, { "auxiliary_loss_clip": 0.01052827, "auxiliary_loss_mlp": 0.01038691, "balance_loss_clip": 1.01399112, "balance_loss_mlp": 1.01517081, "epoch": 0.9477228318052007, "flos": 18696791756160.0, "grad_norm": 1.8549977343844324, "language_loss": 0.68747044, "learning_rate": 2.856751208570518e-08, "loss": 0.70838565, "num_input_tokens_seen": 340025480, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.375, "step": 15763, "time_per_iteration": 2.343414783477783 }, { "auxiliary_loss_clip": 0.01051421, "auxiliary_loss_mlp": 0.01035751, "balance_loss_clip": 1.01375699, "balance_loss_mlp": 1.01553822, "epoch": 0.9477829550578686, "flos": 23874027425280.0, "grad_norm": 1.7249302035119354, "language_loss": 0.71828848, "learning_rate": 2.8501958420980466e-08, "loss": 0.73916018, "num_input_tokens_seen": 340043785, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.359375, "step": 15764, "time_per_iteration": 2.377281665802002 }, { "auxiliary_loss_clip": 0.01049942, "auxiliary_loss_mlp": 0.0103233, "balance_loss_clip": 1.01291132, "balance_loss_mlp": 1.01787353, "epoch": 0.9478430783105366, "flos": 22561851686400.0, "grad_norm": 1.8978506820146028, "language_loss": 0.72125614, "learning_rate": 2.8436479515659306e-08, "loss": 0.7420789, "num_input_tokens_seen": 340064360, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.3203125, "step": 15765, "time_per_iteration": 2.3711142539978027 }, { "auxiliary_loss_clip": 0.01006984, "auxiliary_loss_mlp": 0.01002558, "balance_loss_clip": 1.00059092, "balance_loss_mlp": 1.00059164, "epoch": 0.9479032015632046, "flos": 60855182513280.0, "grad_norm": 0.812818388173387, "language_loss": 0.59277248, "learning_rate": 2.8371075372224384e-08, "loss": 0.61286789, "num_input_tokens_seen": 340114425, "router_z_loss_clip": 0.01965332, "router_z_loss_mlp": 0.06396484, "step": 15766, "time_per_iteration": 2.7828454971313477 }, { "auxiliary_loss_clip": 0.01051696, "auxiliary_loss_mlp": 0.01037329, "balance_loss_clip": 1.01502442, "balance_loss_mlp": 1.01631093, "epoch": 0.9479633248158725, "flos": 14683001967360.0, "grad_norm": 1.9345226492966823, "language_loss": 0.76072192, "learning_rate": 2.8305745993155938e-08, "loss": 0.78161216, "num_input_tokens_seen": 340132200, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35351562, "step": 15767, "time_per_iteration": 5.166979074478149 }, { "auxiliary_loss_clip": 0.01053683, "auxiliary_loss_mlp": 0.01039058, "balance_loss_clip": 1.01594377, "balance_loss_mlp": 1.01641369, "epoch": 0.9480234480685406, "flos": 20332996623360.0, "grad_norm": 2.439916371343903, "language_loss": 0.744923, "learning_rate": 2.8240491380931096e-08, "loss": 0.76585042, "num_input_tokens_seen": 340149175, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.37304688, "step": 15768, "time_per_iteration": 2.331920862197876 }, { "auxiliary_loss_clip": 0.01006985, "auxiliary_loss_mlp": 0.01002668, "balance_loss_clip": 1.00055826, "balance_loss_mlp": 1.00066161, "epoch": 0.9480835713212085, "flos": 70289516436480.0, "grad_norm": 0.7373441220008005, "language_loss": 0.55396569, "learning_rate": 2.8175311538024326e-08, "loss": 0.57406223, "num_input_tokens_seen": 340208155, "router_z_loss_clip": 0.02111816, "router_z_loss_mlp": 0.06347656, "step": 15769, "time_per_iteration": 3.0290675163269043 }, { "auxiliary_loss_clip": 0.01050376, "auxiliary_loss_mlp": 0.01034893, "balance_loss_clip": 1.01279151, "balance_loss_mlp": 1.01531935, "epoch": 0.9481436945738765, "flos": 25448505275520.0, "grad_norm": 1.3188865045926519, "language_loss": 0.7798844, "learning_rate": 2.8110206466907428e-08, "loss": 0.80073708, "num_input_tokens_seen": 340229275, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34960938, "step": 15770, "time_per_iteration": 2.4093711376190186 }, { "auxiliary_loss_clip": 0.01053756, "auxiliary_loss_mlp": 0.01039875, "balance_loss_clip": 1.0146389, "balance_loss_mlp": 1.01718915, "epoch": 0.9482038178265444, "flos": 26978678743680.0, "grad_norm": 1.8548990544039272, "language_loss": 0.80541909, "learning_rate": 2.8045176170049313e-08, "loss": 0.8263554, "num_input_tokens_seen": 340248920, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36523438, "step": 15771, "time_per_iteration": 2.410914659500122 }, { "auxiliary_loss_clip": 0.01051825, "auxiliary_loss_mlp": 0.01040182, "balance_loss_clip": 1.01775885, "balance_loss_mlp": 1.0162394, "epoch": 0.9482639410792124, "flos": 17784476352000.0, "grad_norm": 1.91729281179819, "language_loss": 0.71073043, "learning_rate": 2.7980220649915566e-08, "loss": 0.73165053, "num_input_tokens_seen": 340266775, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35546875, "step": 15772, "time_per_iteration": 2.3288848400115967 }, { "auxiliary_loss_clip": 0.01051808, "auxiliary_loss_mlp": 0.01032399, "balance_loss_clip": 1.01109636, "balance_loss_mlp": 1.01643836, "epoch": 0.9483240643318803, "flos": 20995612917120.0, "grad_norm": 1.5189649135938619, "language_loss": 0.74679327, "learning_rate": 2.7915339908969327e-08, "loss": 0.76763541, "num_input_tokens_seen": 340285295, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.35351562, "step": 15773, "time_per_iteration": 2.3830127716064453 }, { "auxiliary_loss_clip": 0.01051614, "auxiliary_loss_mlp": 0.01045216, "balance_loss_clip": 1.020051, "balance_loss_mlp": 1.015378, "epoch": 0.9483841875845483, "flos": 20082285083520.0, "grad_norm": 2.1608065594412027, "language_loss": 0.63782197, "learning_rate": 2.7850533949671072e-08, "loss": 0.65879023, "num_input_tokens_seen": 340304265, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36328125, "step": 15774, "time_per_iteration": 2.342543601989746 }, { "auxiliary_loss_clip": 0.01051416, "auxiliary_loss_mlp": 0.01036435, "balance_loss_clip": 1.01305795, "balance_loss_mlp": 1.01532567, "epoch": 0.9484443108372163, "flos": 20812598236800.0, "grad_norm": 1.836315133660154, "language_loss": 0.6033777, "learning_rate": 2.7785802774478396e-08, "loss": 0.62425613, "num_input_tokens_seen": 340323690, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36132812, "step": 15775, "time_per_iteration": 2.3637495040893555 }, { "auxiliary_loss_clip": 0.01052092, "auxiliary_loss_mlp": 0.01040117, "balance_loss_clip": 1.01695418, "balance_loss_mlp": 1.01555753, "epoch": 0.9485044340898843, "flos": 36427712215680.0, "grad_norm": 1.5475583933942398, "language_loss": 0.63015133, "learning_rate": 2.772114638584555e-08, "loss": 0.6510734, "num_input_tokens_seen": 340345830, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.36523438, "step": 15776, "time_per_iteration": 2.479140281677246 }, { "auxiliary_loss_clip": 0.01051124, "auxiliary_loss_mlp": 0.01038304, "balance_loss_clip": 1.01497483, "balance_loss_mlp": 1.01573062, "epoch": 0.9485645573425522, "flos": 22601408123520.0, "grad_norm": 1.713684047541088, "language_loss": 0.74484086, "learning_rate": 2.765656478622458e-08, "loss": 0.76573515, "num_input_tokens_seen": 340365910, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.35351562, "step": 15777, "time_per_iteration": 2.39367938041687 }, { "auxiliary_loss_clip": 0.01056242, "auxiliary_loss_mlp": 0.01040765, "balance_loss_clip": 1.01395464, "balance_loss_mlp": 1.01673436, "epoch": 0.9486246805952202, "flos": 22016682806400.0, "grad_norm": 2.366852912526445, "language_loss": 0.73864162, "learning_rate": 2.759205797806441e-08, "loss": 0.75961173, "num_input_tokens_seen": 340383935, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.39453125, "step": 15778, "time_per_iteration": 2.4085848331451416 }, { "auxiliary_loss_clip": 0.01048108, "auxiliary_loss_mlp": 0.01031314, "balance_loss_clip": 1.01165605, "balance_loss_mlp": 1.01622701, "epoch": 0.9486848038478882, "flos": 16507737509760.0, "grad_norm": 1.874035045407014, "language_loss": 0.70931852, "learning_rate": 2.7527625963810865e-08, "loss": 0.73011273, "num_input_tokens_seen": 340402760, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.31835938, "step": 15779, "time_per_iteration": 3.8160581588745117 }, { "auxiliary_loss_clip": 0.01051313, "auxiliary_loss_mlp": 0.01040589, "balance_loss_clip": 1.01617515, "balance_loss_mlp": 1.01591027, "epoch": 0.9487449271005561, "flos": 19243392001920.0, "grad_norm": 2.1594258132610964, "language_loss": 0.79951775, "learning_rate": 2.7463268745907542e-08, "loss": 0.82043672, "num_input_tokens_seen": 340422105, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.35546875, "step": 15780, "time_per_iteration": 2.405877113342285 }, { "auxiliary_loss_clip": 0.01052869, "auxiliary_loss_mlp": 0.01032763, "balance_loss_clip": 1.01052999, "balance_loss_mlp": 1.01689065, "epoch": 0.9488050503532242, "flos": 21761607346560.0, "grad_norm": 1.824461358554288, "language_loss": 0.66746843, "learning_rate": 2.7398986326794494e-08, "loss": 0.68832469, "num_input_tokens_seen": 340441160, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.359375, "step": 15781, "time_per_iteration": 2.396348476409912 }, { "auxiliary_loss_clip": 0.01050663, "auxiliary_loss_mlp": 0.01039287, "balance_loss_clip": 1.01505208, "balance_loss_mlp": 1.01588678, "epoch": 0.9488651736058921, "flos": 18367944860160.0, "grad_norm": 1.9328750435883062, "language_loss": 0.80601978, "learning_rate": 2.733477870890999e-08, "loss": 0.82691932, "num_input_tokens_seen": 340458200, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.34765625, "step": 15782, "time_per_iteration": 2.347132682800293 }, { "auxiliary_loss_clip": 0.01006951, "auxiliary_loss_mlp": 0.01003248, "balance_loss_clip": 1.00120986, "balance_loss_mlp": 1.0007534, "epoch": 0.9489252968585601, "flos": 70080211635840.0, "grad_norm": 0.7177658861799657, "language_loss": 0.59844077, "learning_rate": 2.7270645894688082e-08, "loss": 0.61854273, "num_input_tokens_seen": 340526420, "router_z_loss_clip": 0.02038574, "router_z_loss_mlp": 0.06201172, "step": 15783, "time_per_iteration": 3.133420705795288 }, { "auxiliary_loss_clip": 0.0105305, "auxiliary_loss_mlp": 0.01041193, "balance_loss_clip": 1.01663589, "balance_loss_mlp": 1.01687169, "epoch": 0.948985420111228, "flos": 27854195708160.0, "grad_norm": 1.6353976976184954, "language_loss": 0.74750841, "learning_rate": 2.720658788656105e-08, "loss": 0.76845086, "num_input_tokens_seen": 340546325, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36132812, "step": 15784, "time_per_iteration": 2.414520263671875 }, { "auxiliary_loss_clip": 0.01051914, "auxiliary_loss_mlp": 0.01038284, "balance_loss_clip": 1.01249862, "balance_loss_mlp": 1.01556623, "epoch": 0.949045543363896, "flos": 24314910474240.0, "grad_norm": 1.9744259501028116, "language_loss": 0.7099762, "learning_rate": 2.714260468695806e-08, "loss": 0.73087811, "num_input_tokens_seen": 340565145, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.36328125, "step": 15785, "time_per_iteration": 2.407527446746826 }, { "auxiliary_loss_clip": 0.01051385, "auxiliary_loss_mlp": 0.01034676, "balance_loss_clip": 1.01070333, "balance_loss_mlp": 1.01540065, "epoch": 0.9491056666165639, "flos": 24240580456320.0, "grad_norm": 1.4429142743868508, "language_loss": 0.77154326, "learning_rate": 2.707869629830495e-08, "loss": 0.79240382, "num_input_tokens_seen": 340585465, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.359375, "step": 15786, "time_per_iteration": 2.378446578979492 }, { "auxiliary_loss_clip": 0.01051486, "auxiliary_loss_mlp": 0.01040213, "balance_loss_clip": 1.01831484, "balance_loss_mlp": 1.01598048, "epoch": 0.949165789869232, "flos": 24530220028800.0, "grad_norm": 1.7446507128857964, "language_loss": 0.7962755, "learning_rate": 2.7014862723025335e-08, "loss": 0.81719244, "num_input_tokens_seen": 340606010, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.35546875, "step": 15787, "time_per_iteration": 2.4200310707092285 }, { "auxiliary_loss_clip": 0.01051004, "auxiliary_loss_mlp": 0.01037099, "balance_loss_clip": 1.0144614, "balance_loss_mlp": 1.01679325, "epoch": 0.9492259131218999, "flos": 22234296510720.0, "grad_norm": 1.4995088344008733, "language_loss": 0.77073526, "learning_rate": 2.6951103963540388e-08, "loss": 0.79161632, "num_input_tokens_seen": 340626135, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34179688, "step": 15788, "time_per_iteration": 2.369018793106079 }, { "auxiliary_loss_clip": 0.01051984, "auxiliary_loss_mlp": 0.01040484, "balance_loss_clip": 1.01674962, "balance_loss_mlp": 1.0156194, "epoch": 0.9492860363745679, "flos": 22965203157120.0, "grad_norm": 1.8210914559542963, "language_loss": 0.7247588, "learning_rate": 2.6887420022266848e-08, "loss": 0.74568349, "num_input_tokens_seen": 340644870, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.36328125, "step": 15789, "time_per_iteration": 2.4013288021087646 }, { "auxiliary_loss_clip": 0.01050087, "auxiliary_loss_mlp": 0.01037536, "balance_loss_clip": 1.01257348, "balance_loss_mlp": 1.0156306, "epoch": 0.9493461596272358, "flos": 18369271491840.0, "grad_norm": 1.9197003646407504, "language_loss": 0.74717146, "learning_rate": 2.682381090161989e-08, "loss": 0.76804769, "num_input_tokens_seen": 340663695, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.34375, "step": 15790, "time_per_iteration": 2.3436598777770996 }, { "auxiliary_loss_clip": 0.01052926, "auxiliary_loss_mlp": 0.01041326, "balance_loss_clip": 1.0160892, "balance_loss_mlp": 1.015625, "epoch": 0.9494062828799038, "flos": 20010678151680.0, "grad_norm": 2.0222143829363985, "language_loss": 0.79554653, "learning_rate": 2.6760276604012033e-08, "loss": 0.81648904, "num_input_tokens_seen": 340682970, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37304688, "step": 15791, "time_per_iteration": 2.3818650245666504 }, { "auxiliary_loss_clip": 0.01054235, "auxiliary_loss_mlp": 0.01038781, "balance_loss_clip": 1.01416469, "balance_loss_mlp": 1.01602292, "epoch": 0.9494664061325718, "flos": 27227539981440.0, "grad_norm": 1.979923922052732, "language_loss": 0.75001168, "learning_rate": 2.6696817131852234e-08, "loss": 0.77094185, "num_input_tokens_seen": 340702275, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3828125, "step": 15792, "time_per_iteration": 2.4420721530914307 }, { "auxiliary_loss_clip": 0.01051171, "auxiliary_loss_mlp": 0.01034608, "balance_loss_clip": 1.01235151, "balance_loss_mlp": 1.01564395, "epoch": 0.9495265293852397, "flos": 18368817644160.0, "grad_norm": 2.9060022726019583, "language_loss": 0.79395521, "learning_rate": 2.663343248754679e-08, "loss": 0.81481302, "num_input_tokens_seen": 340719060, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35546875, "step": 15793, "time_per_iteration": 2.3762784004211426 }, { "auxiliary_loss_clip": 0.01050113, "auxiliary_loss_mlp": 0.01039827, "balance_loss_clip": 1.01739216, "balance_loss_mlp": 1.01555562, "epoch": 0.9495866526379078, "flos": 23074655869440.0, "grad_norm": 1.6289336463375503, "language_loss": 0.78482169, "learning_rate": 2.6570122673499562e-08, "loss": 0.8057211, "num_input_tokens_seen": 340737815, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34570312, "step": 15794, "time_per_iteration": 2.3643834590911865 }, { "auxiliary_loss_clip": 0.01053192, "auxiliary_loss_mlp": 0.01040069, "balance_loss_clip": 1.01452279, "balance_loss_mlp": 1.01622009, "epoch": 0.9496467758905757, "flos": 17528947044480.0, "grad_norm": 1.8333361013214984, "language_loss": 0.61554813, "learning_rate": 2.650688769211107e-08, "loss": 0.63648069, "num_input_tokens_seen": 340756035, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37109375, "step": 15795, "time_per_iteration": 2.3806416988372803 }, { "auxiliary_loss_clip": 0.01050474, "auxiliary_loss_mlp": 0.01033702, "balance_loss_clip": 1.01111174, "balance_loss_mlp": 1.01591456, "epoch": 0.9497068991432437, "flos": 24132768577920.0, "grad_norm": 1.8133118896862435, "language_loss": 0.80635405, "learning_rate": 2.644372754577895e-08, "loss": 0.82719582, "num_input_tokens_seen": 340775620, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34570312, "step": 15796, "time_per_iteration": 3.691746234893799 }, { "auxiliary_loss_clip": 0.01052817, "auxiliary_loss_mlp": 0.0104038, "balance_loss_clip": 1.01614499, "balance_loss_mlp": 1.01675487, "epoch": 0.9497670223959116, "flos": 20302272760320.0, "grad_norm": 1.85903592389378, "language_loss": 0.76504666, "learning_rate": 2.6380642236898398e-08, "loss": 0.78597862, "num_input_tokens_seen": 340794510, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.359375, "step": 15797, "time_per_iteration": 2.4051671028137207 }, { "auxiliary_loss_clip": 0.01053172, "auxiliary_loss_mlp": 0.01042131, "balance_loss_clip": 1.01940966, "balance_loss_mlp": 1.01714063, "epoch": 0.9498271456485796, "flos": 13698067201920.0, "grad_norm": 2.0128197947402846, "language_loss": 0.66863424, "learning_rate": 2.6317631767861727e-08, "loss": 0.68958724, "num_input_tokens_seen": 340812955, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.359375, "step": 15798, "time_per_iteration": 2.3455209732055664 }, { "auxiliary_loss_clip": 0.01053339, "auxiliary_loss_mlp": 0.01038318, "balance_loss_clip": 1.01612127, "balance_loss_mlp": 1.01687527, "epoch": 0.9498872689012475, "flos": 20812947350400.0, "grad_norm": 2.658792667723561, "language_loss": 0.77899957, "learning_rate": 2.6254696141058575e-08, "loss": 0.79991609, "num_input_tokens_seen": 340829200, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.36523438, "step": 15799, "time_per_iteration": 2.431056261062622 }, { "auxiliary_loss_clip": 0.0105031, "auxiliary_loss_mlp": 0.01033655, "balance_loss_clip": 1.01260281, "balance_loss_mlp": 1.01577735, "epoch": 0.9499473921539155, "flos": 21031643306880.0, "grad_norm": 1.6386593781683665, "language_loss": 0.72319591, "learning_rate": 2.6191835358874814e-08, "loss": 0.7440356, "num_input_tokens_seen": 340848035, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34570312, "step": 15800, "time_per_iteration": 2.358853816986084 }, { "auxiliary_loss_clip": 0.01050057, "auxiliary_loss_mlp": 0.01034242, "balance_loss_clip": 1.0119381, "balance_loss_mlp": 1.01505017, "epoch": 0.9500075154065835, "flos": 20997567953280.0, "grad_norm": 1.741097787848468, "language_loss": 0.7292456, "learning_rate": 2.6129049423694315e-08, "loss": 0.75008857, "num_input_tokens_seen": 340870025, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34960938, "step": 15801, "time_per_iteration": 2.4055848121643066 }, { "auxiliary_loss_clip": 0.01052263, "auxiliary_loss_mlp": 0.0103372, "balance_loss_clip": 1.01132059, "balance_loss_mlp": 1.01650167, "epoch": 0.9500676386592515, "flos": 25120705720320.0, "grad_norm": 1.5642727877451794, "language_loss": 0.82085252, "learning_rate": 2.6066338337898508e-08, "loss": 0.8417123, "num_input_tokens_seen": 340892290, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35742188, "step": 15802, "time_per_iteration": 2.410964250564575 }, { "auxiliary_loss_clip": 0.01053156, "auxiliary_loss_mlp": 0.01041371, "balance_loss_clip": 1.01771951, "balance_loss_mlp": 1.01748276, "epoch": 0.9501277619119194, "flos": 27522486080640.0, "grad_norm": 1.8014040037056822, "language_loss": 0.700629, "learning_rate": 2.60037021038646e-08, "loss": 0.72157431, "num_input_tokens_seen": 340912260, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35546875, "step": 15803, "time_per_iteration": 2.4151597023010254 }, { "auxiliary_loss_clip": 0.01050508, "auxiliary_loss_mlp": 0.01035658, "balance_loss_clip": 1.01274633, "balance_loss_mlp": 1.01532125, "epoch": 0.9501878851645874, "flos": 20812912439040.0, "grad_norm": 1.60386733006486, "language_loss": 0.76881194, "learning_rate": 2.5941140723968247e-08, "loss": 0.78967363, "num_input_tokens_seen": 340928930, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3515625, "step": 15804, "time_per_iteration": 2.353442430496216 }, { "auxiliary_loss_clip": 0.01053483, "auxiliary_loss_mlp": 0.01034755, "balance_loss_clip": 1.01130688, "balance_loss_mlp": 1.01724744, "epoch": 0.9502480084172553, "flos": 18368398707840.0, "grad_norm": 1.7102479561068251, "language_loss": 0.74626267, "learning_rate": 2.5878654200581775e-08, "loss": 0.76714504, "num_input_tokens_seen": 340946615, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 15805, "time_per_iteration": 2.3542933464050293 }, { "auxiliary_loss_clip": 0.01053945, "auxiliary_loss_mlp": 0.01044383, "balance_loss_clip": 1.01875293, "balance_loss_mlp": 1.01737309, "epoch": 0.9503081316699233, "flos": 23548497108480.0, "grad_norm": 1.41784939736946, "language_loss": 0.81602895, "learning_rate": 2.5816242536074618e-08, "loss": 0.83701217, "num_input_tokens_seen": 340967545, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.36523438, "step": 15806, "time_per_iteration": 3.779156446456909 }, { "auxiliary_loss_clip": 0.01053679, "auxiliary_loss_mlp": 0.01035991, "balance_loss_clip": 1.01321054, "balance_loss_mlp": 1.01606154, "epoch": 0.9503682549225914, "flos": 18039481989120.0, "grad_norm": 2.006503519273143, "language_loss": 0.83730197, "learning_rate": 2.5753905732813108e-08, "loss": 0.85819864, "num_input_tokens_seen": 340984955, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.375, "step": 15807, "time_per_iteration": 3.8365230560302734 }, { "auxiliary_loss_clip": 0.01049858, "auxiliary_loss_mlp": 0.01031757, "balance_loss_clip": 1.01125288, "balance_loss_mlp": 1.01537561, "epoch": 0.9504283781752593, "flos": 25884919670400.0, "grad_norm": 1.8949183599474633, "language_loss": 0.73129475, "learning_rate": 2.5691643793161355e-08, "loss": 0.7521109, "num_input_tokens_seen": 341007300, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.34375, "step": 15808, "time_per_iteration": 2.409778356552124 }, { "auxiliary_loss_clip": 0.01051121, "auxiliary_loss_mlp": 0.01037303, "balance_loss_clip": 1.01445079, "balance_loss_mlp": 1.01613343, "epoch": 0.9504885014279273, "flos": 22123028407680.0, "grad_norm": 1.5375428449142265, "language_loss": 0.70733684, "learning_rate": 2.562945671948058e-08, "loss": 0.72822106, "num_input_tokens_seen": 341026695, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34960938, "step": 15809, "time_per_iteration": 2.367258071899414 }, { "auxiliary_loss_clip": 0.0105088, "auxiliary_loss_mlp": 0.01031856, "balance_loss_clip": 1.01018357, "balance_loss_mlp": 1.01569915, "epoch": 0.9505486246805952, "flos": 21614902346880.0, "grad_norm": 1.5464392002674792, "language_loss": 0.76276183, "learning_rate": 2.5567344514128452e-08, "loss": 0.78358924, "num_input_tokens_seen": 341047080, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.3515625, "step": 15810, "time_per_iteration": 2.3793888092041016 }, { "auxiliary_loss_clip": 0.01051733, "auxiliary_loss_mlp": 0.01046664, "balance_loss_clip": 1.02085519, "balance_loss_mlp": 1.01553297, "epoch": 0.9506087479332632, "flos": 22527147928320.0, "grad_norm": 1.389929406750289, "language_loss": 0.80940664, "learning_rate": 2.5505307179460643e-08, "loss": 0.83039057, "num_input_tokens_seen": 341067310, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.36328125, "step": 15811, "time_per_iteration": 2.3715226650238037 }, { "auxiliary_loss_clip": 0.0105169, "auxiliary_loss_mlp": 0.01037332, "balance_loss_clip": 1.01501656, "balance_loss_mlp": 1.01580882, "epoch": 0.9506688711859311, "flos": 27526116862080.0, "grad_norm": 2.080237516477039, "language_loss": 0.713368, "learning_rate": 2.5443344717829495e-08, "loss": 0.73425829, "num_input_tokens_seen": 341085110, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.359375, "step": 15812, "time_per_iteration": 2.421966791152954 }, { "auxiliary_loss_clip": 0.01052612, "auxiliary_loss_mlp": 0.0104096, "balance_loss_clip": 1.01526988, "balance_loss_mlp": 1.0161047, "epoch": 0.9507289944385992, "flos": 19864741201920.0, "grad_norm": 4.107636292967543, "language_loss": 0.66446549, "learning_rate": 2.538145713158446e-08, "loss": 0.6854012, "num_input_tokens_seen": 341103190, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.36523438, "step": 15813, "time_per_iteration": 2.3678653240203857 }, { "auxiliary_loss_clip": 0.01053066, "auxiliary_loss_mlp": 0.01042999, "balance_loss_clip": 1.01883543, "balance_loss_mlp": 1.01618659, "epoch": 0.9507891176912671, "flos": 25192068272640.0, "grad_norm": 1.3700365067867208, "language_loss": 0.71002227, "learning_rate": 2.5319644423072327e-08, "loss": 0.73098296, "num_input_tokens_seen": 341125695, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36914062, "step": 15814, "time_per_iteration": 2.414485454559326 }, { "auxiliary_loss_clip": 0.01051045, "auxiliary_loss_mlp": 0.01035849, "balance_loss_clip": 1.01535678, "balance_loss_mlp": 1.01659274, "epoch": 0.9508492409439351, "flos": 24898413893760.0, "grad_norm": 2.187335771931001, "language_loss": 0.64592427, "learning_rate": 2.5257906594637445e-08, "loss": 0.66679323, "num_input_tokens_seen": 341143930, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.34375, "step": 15815, "time_per_iteration": 2.4007251262664795 }, { "auxiliary_loss_clip": 0.01050195, "auxiliary_loss_mlp": 0.01032323, "balance_loss_clip": 1.00998282, "balance_loss_mlp": 1.01477575, "epoch": 0.950909364196603, "flos": 29782937790720.0, "grad_norm": 2.733884323610963, "language_loss": 0.59094274, "learning_rate": 2.519624364862061e-08, "loss": 0.61176795, "num_input_tokens_seen": 341164280, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35351562, "step": 15816, "time_per_iteration": 2.435983180999756 }, { "auxiliary_loss_clip": 0.01050976, "auxiliary_loss_mlp": 0.0104226, "balance_loss_clip": 1.02038479, "balance_loss_mlp": 1.01588655, "epoch": 0.950969487449271, "flos": 24716621111040.0, "grad_norm": 1.602794943518448, "language_loss": 0.74385679, "learning_rate": 2.513465558735994e-08, "loss": 0.76478922, "num_input_tokens_seen": 341183670, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.3515625, "step": 15817, "time_per_iteration": 2.4036643505096436 }, { "auxiliary_loss_clip": 0.01053335, "auxiliary_loss_mlp": 0.01042289, "balance_loss_clip": 1.01448905, "balance_loss_mlp": 1.01636279, "epoch": 0.9510296107019389, "flos": 13698311581440.0, "grad_norm": 1.689703042979272, "language_loss": 0.61688828, "learning_rate": 2.5073142413190918e-08, "loss": 0.63784456, "num_input_tokens_seen": 341201900, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.36914062, "step": 15818, "time_per_iteration": 2.342528820037842 }, { "auxiliary_loss_clip": 0.01052762, "auxiliary_loss_mlp": 0.01038299, "balance_loss_clip": 1.01464748, "balance_loss_mlp": 1.01618862, "epoch": 0.9510897339546069, "flos": 17310879492480.0, "grad_norm": 1.7897669373789604, "language_loss": 0.69764018, "learning_rate": 2.5011704128446552e-08, "loss": 0.71855074, "num_input_tokens_seen": 341218340, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36523438, "step": 15819, "time_per_iteration": 3.8512356281280518 }, { "auxiliary_loss_clip": 0.01054041, "auxiliary_loss_mlp": 0.01036053, "balance_loss_clip": 1.0114603, "balance_loss_mlp": 1.01720726, "epoch": 0.951149857207275, "flos": 14793048172800.0, "grad_norm": 1.6010985429242954, "language_loss": 0.75245774, "learning_rate": 2.49503407354561e-08, "loss": 0.7733587, "num_input_tokens_seen": 341235885, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3671875, "step": 15820, "time_per_iteration": 2.3487792015075684 }, { "auxiliary_loss_clip": 0.01053228, "auxiliary_loss_mlp": 0.01042734, "balance_loss_clip": 1.02006054, "balance_loss_mlp": 1.01717699, "epoch": 0.9512099804599429, "flos": 19390131912960.0, "grad_norm": 10.470473736383695, "language_loss": 0.79290527, "learning_rate": 2.4889052236546804e-08, "loss": 0.81386489, "num_input_tokens_seen": 341255280, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36132812, "step": 15821, "time_per_iteration": 2.352457046508789 }, { "auxiliary_loss_clip": 0.01051558, "auxiliary_loss_mlp": 0.01033119, "balance_loss_clip": 1.00825179, "balance_loss_mlp": 1.01621509, "epoch": 0.9512701037126109, "flos": 36756384554880.0, "grad_norm": 2.4159442541923624, "language_loss": 0.71920443, "learning_rate": 2.4827838634042586e-08, "loss": 0.74005121, "num_input_tokens_seen": 341279055, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.35546875, "step": 15822, "time_per_iteration": 2.4984288215637207 }, { "auxiliary_loss_clip": 0.01051611, "auxiliary_loss_mlp": 0.01038288, "balance_loss_clip": 1.01691365, "balance_loss_mlp": 1.0165062, "epoch": 0.9513302269652788, "flos": 22637159222400.0, "grad_norm": 1.568934040627301, "language_loss": 0.67450416, "learning_rate": 2.47666999302647e-08, "loss": 0.6954031, "num_input_tokens_seen": 341298560, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.3515625, "step": 15823, "time_per_iteration": 2.361361026763916 }, { "auxiliary_loss_clip": 0.01049632, "auxiliary_loss_mlp": 0.01038449, "balance_loss_clip": 1.0179565, "balance_loss_mlp": 1.01669717, "epoch": 0.9513903502179468, "flos": 22892129948160.0, "grad_norm": 1.9296844376432272, "language_loss": 0.78269511, "learning_rate": 2.4705636127531292e-08, "loss": 0.80357593, "num_input_tokens_seen": 341316650, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.33007812, "step": 15824, "time_per_iteration": 2.361347198486328 }, { "auxiliary_loss_clip": 0.01054125, "auxiliary_loss_mlp": 0.01036686, "balance_loss_clip": 1.01050806, "balance_loss_mlp": 1.01672745, "epoch": 0.9514504734706147, "flos": 27927373651200.0, "grad_norm": 2.4146700649466246, "language_loss": 0.74586093, "learning_rate": 2.4644647228158065e-08, "loss": 0.76676905, "num_input_tokens_seen": 341336185, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.375, "step": 15825, "time_per_iteration": 2.4211010932922363 }, { "auxiliary_loss_clip": 0.01006731, "auxiliary_loss_mlp": 0.01003997, "balance_loss_clip": 1.00182772, "balance_loss_mlp": 1.00045443, "epoch": 0.9515105967232828, "flos": 67363480967040.0, "grad_norm": 0.8149926611730905, "language_loss": 0.53477883, "learning_rate": 2.458373323445806e-08, "loss": 0.5548861, "num_input_tokens_seen": 341395795, "router_z_loss_clip": 0.02172852, "router_z_loss_mlp": 0.0625, "step": 15826, "time_per_iteration": 2.928389310836792 }, { "auxiliary_loss_clip": 0.01050741, "auxiliary_loss_mlp": 0.01043903, "balance_loss_clip": 1.02028763, "balance_loss_mlp": 1.01530159, "epoch": 0.9515707199759507, "flos": 25845398144640.0, "grad_norm": 2.0156449637533647, "language_loss": 0.7388829, "learning_rate": 2.452289414874076e-08, "loss": 0.7598294, "num_input_tokens_seen": 341415675, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35351562, "step": 15827, "time_per_iteration": 2.421349048614502 }, { "auxiliary_loss_clip": 0.01051122, "auxiliary_loss_mlp": 0.01036826, "balance_loss_clip": 1.01299596, "balance_loss_mlp": 1.01551127, "epoch": 0.9516308432286187, "flos": 21828082308480.0, "grad_norm": 1.7447525695548816, "language_loss": 0.75603145, "learning_rate": 2.4462129973313207e-08, "loss": 0.77691096, "num_input_tokens_seen": 341432990, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35546875, "step": 15828, "time_per_iteration": 2.3925609588623047 }, { "auxiliary_loss_clip": 0.01050457, "auxiliary_loss_mlp": 0.0103651, "balance_loss_clip": 1.01660228, "balance_loss_mlp": 1.01656032, "epoch": 0.9516909664812866, "flos": 27268423050240.0, "grad_norm": 1.4952419255400673, "language_loss": 0.73961169, "learning_rate": 2.440144071047978e-08, "loss": 0.76048136, "num_input_tokens_seen": 341454100, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.33984375, "step": 15829, "time_per_iteration": 2.4239468574523926 }, { "auxiliary_loss_clip": 0.0105122, "auxiliary_loss_mlp": 0.01036465, "balance_loss_clip": 1.01386285, "balance_loss_mlp": 1.01599991, "epoch": 0.9517510897339546, "flos": 21214273962240.0, "grad_norm": 1.7126526302928866, "language_loss": 0.62630898, "learning_rate": 2.4340826362541533e-08, "loss": 0.6471858, "num_input_tokens_seen": 341472955, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 15830, "time_per_iteration": 2.3966004848480225 }, { "auxiliary_loss_clip": 0.01053347, "auxiliary_loss_mlp": 0.0104297, "balance_loss_clip": 1.01823401, "balance_loss_mlp": 1.01652503, "epoch": 0.9518112129866225, "flos": 18732996702720.0, "grad_norm": 2.311221961994566, "language_loss": 0.74203545, "learning_rate": 2.428028693179729e-08, "loss": 0.76299858, "num_input_tokens_seen": 341490165, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.3671875, "step": 15831, "time_per_iteration": 2.3690803050994873 }, { "auxiliary_loss_clip": 0.0104869, "auxiliary_loss_mlp": 0.01031563, "balance_loss_clip": 1.01158357, "balance_loss_mlp": 1.01523876, "epoch": 0.9518713362392905, "flos": 16762743146880.0, "grad_norm": 1.7297189926986796, "language_loss": 0.6618067, "learning_rate": 2.4219822420542545e-08, "loss": 0.68260926, "num_input_tokens_seen": 341508055, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.3359375, "step": 15832, "time_per_iteration": 2.351290464401245 }, { "auxiliary_loss_clip": 0.01052217, "auxiliary_loss_mlp": 0.01036728, "balance_loss_clip": 1.01436472, "balance_loss_mlp": 1.01769352, "epoch": 0.9519314594919586, "flos": 15229776769920.0, "grad_norm": 1.8320567973791722, "language_loss": 0.79156792, "learning_rate": 2.4159432831070135e-08, "loss": 0.81245732, "num_input_tokens_seen": 341526155, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34570312, "step": 15833, "time_per_iteration": 2.3685672283172607 }, { "auxiliary_loss_clip": 0.01050846, "auxiliary_loss_mlp": 0.0103798, "balance_loss_clip": 1.01517487, "balance_loss_mlp": 1.01620221, "epoch": 0.9519915827446265, "flos": 19351971930240.0, "grad_norm": 2.2140118028917724, "language_loss": 0.75770491, "learning_rate": 2.4099118165670007e-08, "loss": 0.77859318, "num_input_tokens_seen": 341540450, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.34765625, "step": 15834, "time_per_iteration": 2.3333375453948975 }, { "auxiliary_loss_clip": 0.01055711, "auxiliary_loss_mlp": 0.01041048, "balance_loss_clip": 1.01565611, "balance_loss_mlp": 1.01755822, "epoch": 0.9520517059972945, "flos": 22265404398720.0, "grad_norm": 2.449352609370315, "language_loss": 0.76886183, "learning_rate": 2.4038878426629216e-08, "loss": 0.78982943, "num_input_tokens_seen": 341557865, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3828125, "step": 15835, "time_per_iteration": 2.3772284984588623 }, { "auxiliary_loss_clip": 0.01052477, "auxiliary_loss_mlp": 0.0104368, "balance_loss_clip": 1.01876569, "balance_loss_mlp": 1.01587021, "epoch": 0.9521118292499624, "flos": 14861547993600.0, "grad_norm": 1.9487383823420492, "language_loss": 0.67290878, "learning_rate": 2.397871361623238e-08, "loss": 0.69387043, "num_input_tokens_seen": 341573890, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.36523438, "step": 15836, "time_per_iteration": 3.6042189598083496 }, { "auxiliary_loss_clip": 0.01049524, "auxiliary_loss_mlp": 0.01033903, "balance_loss_clip": 1.01203966, "balance_loss_mlp": 1.01539505, "epoch": 0.9521719525026304, "flos": 23507823507840.0, "grad_norm": 1.6634554968979125, "language_loss": 0.71155626, "learning_rate": 2.391862373676057e-08, "loss": 0.73239052, "num_input_tokens_seen": 341593770, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34179688, "step": 15837, "time_per_iteration": 2.3780572414398193 }, { "auxiliary_loss_clip": 0.01052756, "auxiliary_loss_mlp": 0.0103827, "balance_loss_clip": 1.01445174, "balance_loss_mlp": 1.0157752, "epoch": 0.9522320757552983, "flos": 19714021395840.0, "grad_norm": 2.283966890035632, "language_loss": 0.74443007, "learning_rate": 2.3858608790492617e-08, "loss": 0.76534033, "num_input_tokens_seen": 341612065, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36914062, "step": 15838, "time_per_iteration": 2.357729911804199 }, { "auxiliary_loss_clip": 0.0105227, "auxiliary_loss_mlp": 0.01037976, "balance_loss_clip": 1.01407444, "balance_loss_mlp": 1.0158546, "epoch": 0.9522921990079664, "flos": 25920112187520.0, "grad_norm": 1.7633834837664393, "language_loss": 0.78806812, "learning_rate": 2.379866877970449e-08, "loss": 0.80897057, "num_input_tokens_seen": 341631365, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36328125, "step": 15839, "time_per_iteration": 2.4131290912628174 }, { "auxiliary_loss_clip": 0.01052377, "auxiliary_loss_mlp": 0.01038785, "balance_loss_clip": 1.016397, "balance_loss_mlp": 1.01626158, "epoch": 0.9523523222606343, "flos": 19207117232640.0, "grad_norm": 1.6684109871244057, "language_loss": 0.81014609, "learning_rate": 2.3738803706668585e-08, "loss": 0.83105773, "num_input_tokens_seen": 341650300, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.36132812, "step": 15840, "time_per_iteration": 2.3694798946380615 }, { "auxiliary_loss_clip": 0.01048504, "auxiliary_loss_mlp": 0.01030682, "balance_loss_clip": 1.01082134, "balance_loss_mlp": 1.01561677, "epoch": 0.9524124455133023, "flos": 20920270469760.0, "grad_norm": 1.918041691759174, "language_loss": 0.74608397, "learning_rate": 2.3679013573655314e-08, "loss": 0.76687586, "num_input_tokens_seen": 341667680, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.328125, "step": 15841, "time_per_iteration": 2.4067153930664062 }, { "auxiliary_loss_clip": 0.01046933, "auxiliary_loss_mlp": 0.01034894, "balance_loss_clip": 1.01398456, "balance_loss_mlp": 1.01415849, "epoch": 0.9524725687659702, "flos": 18842554149120.0, "grad_norm": 1.7775436571592924, "language_loss": 0.79999256, "learning_rate": 2.3619298382931972e-08, "loss": 0.82081079, "num_input_tokens_seen": 341685760, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.328125, "step": 15842, "time_per_iteration": 2.3869404792785645 }, { "auxiliary_loss_clip": 0.01051934, "auxiliary_loss_mlp": 0.01034142, "balance_loss_clip": 1.01174259, "balance_loss_mlp": 1.01716566, "epoch": 0.9525326920186382, "flos": 22673573637120.0, "grad_norm": 1.6348633690873502, "language_loss": 0.73576659, "learning_rate": 2.3559658136762973e-08, "loss": 0.75662738, "num_input_tokens_seen": 341705300, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34765625, "step": 15843, "time_per_iteration": 2.3867440223693848 }, { "auxiliary_loss_clip": 0.01052376, "auxiliary_loss_mlp": 0.01032378, "balance_loss_clip": 1.00728488, "balance_loss_mlp": 1.01637542, "epoch": 0.9525928152713061, "flos": 22085671386240.0, "grad_norm": 1.7341239183037538, "language_loss": 0.79630184, "learning_rate": 2.3500092837409612e-08, "loss": 0.81714934, "num_input_tokens_seen": 341724565, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.359375, "step": 15844, "time_per_iteration": 2.4296345710754395 }, { "auxiliary_loss_clip": 0.01054904, "auxiliary_loss_mlp": 0.01041732, "balance_loss_clip": 1.01525569, "balance_loss_mlp": 1.01689696, "epoch": 0.9526529385239741, "flos": 20703669194880.0, "grad_norm": 2.1980339555288304, "language_loss": 0.71574736, "learning_rate": 2.3440602487130977e-08, "loss": 0.73671365, "num_input_tokens_seen": 341743605, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37890625, "step": 15845, "time_per_iteration": 2.3789660930633545 }, { "auxiliary_loss_clip": 0.0105134, "auxiliary_loss_mlp": 0.0103694, "balance_loss_clip": 1.01340818, "balance_loss_mlp": 1.01513517, "epoch": 0.9527130617766422, "flos": 23366913793920.0, "grad_norm": 1.4963006906385692, "language_loss": 0.76256394, "learning_rate": 2.338118708818282e-08, "loss": 0.78344679, "num_input_tokens_seen": 341763475, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36328125, "step": 15846, "time_per_iteration": 3.7917609214782715 }, { "auxiliary_loss_clip": 0.01051112, "auxiliary_loss_mlp": 0.01036702, "balance_loss_clip": 1.01392066, "balance_loss_mlp": 1.01583636, "epoch": 0.9527731850293101, "flos": 18985034874240.0, "grad_norm": 1.7818698103887793, "language_loss": 0.79906321, "learning_rate": 2.3321846642817998e-08, "loss": 0.8199414, "num_input_tokens_seen": 341781265, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35351562, "step": 15847, "time_per_iteration": 3.664309501647949 }, { "auxiliary_loss_clip": 0.01049506, "auxiliary_loss_mlp": 0.01039582, "balance_loss_clip": 1.0183028, "balance_loss_mlp": 1.01571226, "epoch": 0.9528333082819781, "flos": 19317023792640.0, "grad_norm": 1.6472156535618263, "language_loss": 0.78919411, "learning_rate": 2.326258115328672e-08, "loss": 0.81008506, "num_input_tokens_seen": 341798825, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.33984375, "step": 15848, "time_per_iteration": 2.370346784591675 }, { "auxiliary_loss_clip": 0.01054869, "auxiliary_loss_mlp": 0.01042346, "balance_loss_clip": 1.01789594, "balance_loss_mlp": 1.01667953, "epoch": 0.952893431534646, "flos": 23950207745280.0, "grad_norm": 1.6719529551263836, "language_loss": 0.7348671, "learning_rate": 2.320339062183674e-08, "loss": 0.75583923, "num_input_tokens_seen": 341819480, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.38085938, "step": 15849, "time_per_iteration": 2.3756625652313232 }, { "auxiliary_loss_clip": 0.01056851, "auxiliary_loss_mlp": 0.01041592, "balance_loss_clip": 1.01568818, "balance_loss_mlp": 1.01810658, "epoch": 0.952953554787314, "flos": 21029548625280.0, "grad_norm": 1.700642233437923, "language_loss": 0.75860387, "learning_rate": 2.314427505071226e-08, "loss": 0.77958834, "num_input_tokens_seen": 341838035, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38671875, "step": 15850, "time_per_iteration": 2.380350351333618 }, { "auxiliary_loss_clip": 0.01050927, "auxiliary_loss_mlp": 0.01038833, "balance_loss_clip": 1.0159452, "balance_loss_mlp": 1.01546395, "epoch": 0.9530136780399819, "flos": 22381769560320.0, "grad_norm": 2.0573234674398813, "language_loss": 0.7430864, "learning_rate": 2.308523444215482e-08, "loss": 0.76398402, "num_input_tokens_seen": 341855895, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35546875, "step": 15851, "time_per_iteration": 2.3749444484710693 }, { "auxiliary_loss_clip": 0.01050695, "auxiliary_loss_mlp": 0.01033576, "balance_loss_clip": 1.0106523, "balance_loss_mlp": 1.01563227, "epoch": 0.95307380129265, "flos": 22158639861120.0, "grad_norm": 1.7663350672242917, "language_loss": 0.80397546, "learning_rate": 2.3026268798403525e-08, "loss": 0.82481819, "num_input_tokens_seen": 341875240, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34960938, "step": 15852, "time_per_iteration": 2.4308059215545654 }, { "auxiliary_loss_clip": 0.01052324, "auxiliary_loss_mlp": 0.0103886, "balance_loss_clip": 1.01565027, "balance_loss_mlp": 1.01668894, "epoch": 0.9531339245453179, "flos": 44020937168640.0, "grad_norm": 2.105120781690506, "language_loss": 0.60533524, "learning_rate": 2.2967378121694138e-08, "loss": 0.62624705, "num_input_tokens_seen": 341901020, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35546875, "step": 15853, "time_per_iteration": 2.608398199081421 }, { "auxiliary_loss_clip": 0.01048171, "auxiliary_loss_mlp": 0.01034355, "balance_loss_clip": 1.01290965, "balance_loss_mlp": 1.01466072, "epoch": 0.9531940477979859, "flos": 20265648877440.0, "grad_norm": 2.2540896926893272, "language_loss": 0.73172218, "learning_rate": 2.290856241425998e-08, "loss": 0.75254744, "num_input_tokens_seen": 341919365, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.3359375, "step": 15854, "time_per_iteration": 2.3858566284179688 }, { "auxiliary_loss_clip": 0.01051467, "auxiliary_loss_mlp": 0.01032728, "balance_loss_clip": 1.00931501, "balance_loss_mlp": 1.01512969, "epoch": 0.9532541710506538, "flos": 25334618820480.0, "grad_norm": 2.09699223134901, "language_loss": 0.69564271, "learning_rate": 2.284982167833127e-08, "loss": 0.71648461, "num_input_tokens_seen": 341939985, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 15855, "time_per_iteration": 2.390559673309326 }, { "auxiliary_loss_clip": 0.01051303, "auxiliary_loss_mlp": 0.01037365, "balance_loss_clip": 1.01539469, "balance_loss_mlp": 1.0155952, "epoch": 0.9533142943033218, "flos": 26468073976320.0, "grad_norm": 1.6867898340582466, "language_loss": 0.78068388, "learning_rate": 2.279115591613556e-08, "loss": 0.80157053, "num_input_tokens_seen": 341959255, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35742188, "step": 15856, "time_per_iteration": 2.4246580600738525 }, { "auxiliary_loss_clip": 0.01050282, "auxiliary_loss_mlp": 0.01038129, "balance_loss_clip": 1.01631379, "balance_loss_mlp": 1.0150317, "epoch": 0.9533744175559897, "flos": 23655890050560.0, "grad_norm": 1.6115890183929678, "language_loss": 0.78910303, "learning_rate": 2.2732565129897075e-08, "loss": 0.80998719, "num_input_tokens_seen": 341977205, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.3515625, "step": 15857, "time_per_iteration": 2.404137134552002 }, { "auxiliary_loss_clip": 0.01007056, "auxiliary_loss_mlp": 0.010019, "balance_loss_clip": 0.9997896, "balance_loss_mlp": 1.00078034, "epoch": 0.9534345408086577, "flos": 61049019715200.0, "grad_norm": 0.7071440029564339, "language_loss": 0.62705445, "learning_rate": 2.267404932183803e-08, "loss": 0.64714402, "num_input_tokens_seen": 342038545, "router_z_loss_clip": 0.02111816, "router_z_loss_mlp": 0.0625, "step": 15858, "time_per_iteration": 2.9848430156707764 }, { "auxiliary_loss_clip": 0.01051617, "auxiliary_loss_mlp": 0.010389, "balance_loss_clip": 1.01619053, "balance_loss_mlp": 1.01629174, "epoch": 0.9534946640613258, "flos": 18950715141120.0, "grad_norm": 1.4766298835809097, "language_loss": 0.57757145, "learning_rate": 2.2615608494177097e-08, "loss": 0.59847665, "num_input_tokens_seen": 342058195, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35351562, "step": 15859, "time_per_iteration": 3.788269281387329 }, { "auxiliary_loss_clip": 0.01049415, "auxiliary_loss_mlp": 0.01033806, "balance_loss_clip": 1.01302826, "balance_loss_mlp": 1.01534092, "epoch": 0.9535547873139937, "flos": 16653360257280.0, "grad_norm": 2.1585969325443366, "language_loss": 0.82529777, "learning_rate": 2.2557242649130504e-08, "loss": 0.84612995, "num_input_tokens_seen": 342075025, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.34179688, "step": 15860, "time_per_iteration": 2.335883617401123 }, { "auxiliary_loss_clip": 0.01050509, "auxiliary_loss_mlp": 0.01034031, "balance_loss_clip": 1.01234651, "balance_loss_mlp": 1.01495755, "epoch": 0.9536149105666617, "flos": 20666731109760.0, "grad_norm": 1.7768978343944564, "language_loss": 0.67905796, "learning_rate": 2.249895178891159e-08, "loss": 0.69990337, "num_input_tokens_seen": 342094595, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.35546875, "step": 15861, "time_per_iteration": 2.4390482902526855 }, { "auxiliary_loss_clip": 0.01052147, "auxiliary_loss_mlp": 0.01038488, "balance_loss_clip": 1.01465797, "balance_loss_mlp": 1.01635587, "epoch": 0.9536750338193296, "flos": 30699198178560.0, "grad_norm": 1.7814212858081262, "language_loss": 0.66205788, "learning_rate": 2.244073591573037e-08, "loss": 0.68296427, "num_input_tokens_seen": 342115970, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.359375, "step": 15862, "time_per_iteration": 2.4228644371032715 }, { "auxiliary_loss_clip": 0.01049802, "auxiliary_loss_mlp": 0.01037153, "balance_loss_clip": 1.01601672, "balance_loss_mlp": 1.01562989, "epoch": 0.9537351570719976, "flos": 20404638466560.0, "grad_norm": 1.5509038057676798, "language_loss": 0.68582076, "learning_rate": 2.238259503179485e-08, "loss": 0.70669031, "num_input_tokens_seen": 342134080, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34179688, "step": 15863, "time_per_iteration": 2.375710964202881 }, { "auxiliary_loss_clip": 0.01050876, "auxiliary_loss_mlp": 0.010363, "balance_loss_clip": 1.01386476, "balance_loss_mlp": 1.01590776, "epoch": 0.9537952803246655, "flos": 29928106690560.0, "grad_norm": 1.9378125784158517, "language_loss": 0.79251802, "learning_rate": 2.2324529139309267e-08, "loss": 0.81338972, "num_input_tokens_seen": 342154725, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34960938, "step": 15864, "time_per_iteration": 2.429142475128174 }, { "auxiliary_loss_clip": 0.01051006, "auxiliary_loss_mlp": 0.01029785, "balance_loss_clip": 1.00838685, "balance_loss_mlp": 1.01665926, "epoch": 0.9538554035773336, "flos": 20520375223680.0, "grad_norm": 3.627429013368778, "language_loss": 0.60524046, "learning_rate": 2.226653824047586e-08, "loss": 0.62604839, "num_input_tokens_seen": 342172275, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34375, "step": 15865, "time_per_iteration": 2.367243528366089 }, { "auxiliary_loss_clip": 0.01051119, "auxiliary_loss_mlp": 0.01032932, "balance_loss_clip": 1.00982928, "balance_loss_mlp": 1.01503944, "epoch": 0.9539155268300015, "flos": 18405511349760.0, "grad_norm": 1.8743952159203336, "language_loss": 0.70323241, "learning_rate": 2.2208622337493765e-08, "loss": 0.72407293, "num_input_tokens_seen": 342190880, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.36132812, "step": 15866, "time_per_iteration": 2.3556034564971924 }, { "auxiliary_loss_clip": 0.01051253, "auxiliary_loss_mlp": 0.01034126, "balance_loss_clip": 1.01076066, "balance_loss_mlp": 1.01553202, "epoch": 0.9539756500826695, "flos": 26212090821120.0, "grad_norm": 2.236255204699346, "language_loss": 0.86048293, "learning_rate": 2.215078143255855e-08, "loss": 0.88133669, "num_input_tokens_seen": 342208165, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.35546875, "step": 15867, "time_per_iteration": 2.497847557067871 }, { "auxiliary_loss_clip": 0.01007209, "auxiliary_loss_mlp": 0.01004126, "balance_loss_clip": 1.00211143, "balance_loss_mlp": 1.00093174, "epoch": 0.9540357733353374, "flos": 68285501729280.0, "grad_norm": 0.827242637067941, "language_loss": 0.61838388, "learning_rate": 2.2093015527864024e-08, "loss": 0.63849723, "num_input_tokens_seen": 342277110, "router_z_loss_clip": 0.0201416, "router_z_loss_mlp": 0.0625, "step": 15868, "time_per_iteration": 3.088291883468628 }, { "auxiliary_loss_clip": 0.01050596, "auxiliary_loss_mlp": 0.01035674, "balance_loss_clip": 1.01156998, "balance_loss_mlp": 1.01523328, "epoch": 0.9540958965880054, "flos": 21287207525760.0, "grad_norm": 1.8164930112341908, "language_loss": 0.60558134, "learning_rate": 2.2035324625600425e-08, "loss": 0.62644404, "num_input_tokens_seen": 342294695, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.35351562, "step": 15869, "time_per_iteration": 2.3758113384246826 }, { "auxiliary_loss_clip": 0.01050131, "auxiliary_loss_mlp": 0.01038878, "balance_loss_clip": 1.01788545, "balance_loss_mlp": 1.01558471, "epoch": 0.9541560198406733, "flos": 19750505633280.0, "grad_norm": 4.109882110151423, "language_loss": 0.72463799, "learning_rate": 2.197770872795579e-08, "loss": 0.7455281, "num_input_tokens_seen": 342314970, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.34765625, "step": 15870, "time_per_iteration": 2.381094455718994 }, { "auxiliary_loss_clip": 0.01050168, "auxiliary_loss_mlp": 0.01037463, "balance_loss_clip": 1.01499224, "balance_loss_mlp": 1.01566541, "epoch": 0.9542161430933414, "flos": 24714526429440.0, "grad_norm": 1.9469304657613002, "language_loss": 0.77361041, "learning_rate": 2.1920167837114368e-08, "loss": 0.79448676, "num_input_tokens_seen": 342334255, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34570312, "step": 15871, "time_per_iteration": 2.430687427520752 }, { "auxiliary_loss_clip": 0.01051785, "auxiliary_loss_mlp": 0.0103865, "balance_loss_clip": 1.01430714, "balance_loss_mlp": 1.01586151, "epoch": 0.9542762663460094, "flos": 31064494400640.0, "grad_norm": 1.6688153019976901, "language_loss": 0.5975728, "learning_rate": 2.1862701955258634e-08, "loss": 0.61847717, "num_input_tokens_seen": 342354730, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.359375, "step": 15872, "time_per_iteration": 2.4542057514190674 }, { "auxiliary_loss_clip": 0.01053551, "auxiliary_loss_mlp": 0.01039659, "balance_loss_clip": 1.01431513, "balance_loss_mlp": 1.01605821, "epoch": 0.9543363895986773, "flos": 20775695063040.0, "grad_norm": 2.2779787489912864, "language_loss": 0.75900149, "learning_rate": 2.1805311084567514e-08, "loss": 0.77993363, "num_input_tokens_seen": 342374565, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.375, "step": 15873, "time_per_iteration": 2.3682265281677246 }, { "auxiliary_loss_clip": 0.01053063, "auxiliary_loss_mlp": 0.01043599, "balance_loss_clip": 1.01911318, "balance_loss_mlp": 1.01632762, "epoch": 0.9543965128513453, "flos": 24461580562560.0, "grad_norm": 2.091703211191758, "language_loss": 0.62973905, "learning_rate": 2.1747995227217265e-08, "loss": 0.65070564, "num_input_tokens_seen": 342394590, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.3671875, "step": 15874, "time_per_iteration": 2.4145166873931885 }, { "auxiliary_loss_clip": 0.01051383, "auxiliary_loss_mlp": 0.010384, "balance_loss_clip": 1.01629865, "balance_loss_mlp": 1.01620173, "epoch": 0.9544566361040132, "flos": 15260814835200.0, "grad_norm": 1.872339062188966, "language_loss": 0.90211391, "learning_rate": 2.169075438538104e-08, "loss": 0.92301178, "num_input_tokens_seen": 342410445, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3515625, "step": 15875, "time_per_iteration": 3.57841420173645 }, { "auxiliary_loss_clip": 0.01053631, "auxiliary_loss_mlp": 0.01039147, "balance_loss_clip": 1.01503134, "balance_loss_mlp": 1.01596415, "epoch": 0.9545167593566812, "flos": 25917668392320.0, "grad_norm": 1.640787690358709, "language_loss": 0.68467742, "learning_rate": 2.1633588561229765e-08, "loss": 0.70560515, "num_input_tokens_seen": 342430970, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.37695312, "step": 15876, "time_per_iteration": 2.382108449935913 }, { "auxiliary_loss_clip": 0.01052565, "auxiliary_loss_mlp": 0.01036638, "balance_loss_clip": 1.01304626, "balance_loss_mlp": 1.01559424, "epoch": 0.9545768826093491, "flos": 25627051301760.0, "grad_norm": 1.974313649921627, "language_loss": 0.70749456, "learning_rate": 2.1576497756931267e-08, "loss": 0.72838658, "num_input_tokens_seen": 342449505, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36914062, "step": 15877, "time_per_iteration": 2.4322316646575928 }, { "auxiliary_loss_clip": 0.01052651, "auxiliary_loss_mlp": 0.01038416, "balance_loss_clip": 1.01319194, "balance_loss_mlp": 1.01628089, "epoch": 0.9546370058620172, "flos": 22490419311360.0, "grad_norm": 1.6528747113632685, "language_loss": 0.71948212, "learning_rate": 2.1519481974650035e-08, "loss": 0.7403928, "num_input_tokens_seen": 342470390, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36328125, "step": 15878, "time_per_iteration": 2.359132766723633 }, { "auxiliary_loss_clip": 0.010509, "auxiliary_loss_mlp": 0.01032648, "balance_loss_clip": 1.0099628, "balance_loss_mlp": 1.0163722, "epoch": 0.9546971291146851, "flos": 24608390296320.0, "grad_norm": 1.3400905069472893, "language_loss": 0.69370234, "learning_rate": 2.1462541216548335e-08, "loss": 0.71453786, "num_input_tokens_seen": 342492560, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34570312, "step": 15879, "time_per_iteration": 2.413098096847534 }, { "auxiliary_loss_clip": 0.01049995, "auxiliary_loss_mlp": 0.0103985, "balance_loss_clip": 1.01723576, "balance_loss_mlp": 1.01540184, "epoch": 0.9547572523673531, "flos": 28656499818240.0, "grad_norm": 1.9153211130218188, "language_loss": 0.86591649, "learning_rate": 2.1405675484785334e-08, "loss": 0.88681495, "num_input_tokens_seen": 342512315, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34570312, "step": 15880, "time_per_iteration": 2.412125825881958 }, { "auxiliary_loss_clip": 0.01052488, "auxiliary_loss_mlp": 0.01036503, "balance_loss_clip": 1.01181471, "balance_loss_mlp": 1.01627576, "epoch": 0.954817375620021, "flos": 33801196233600.0, "grad_norm": 1.7438157496885023, "language_loss": 0.73170936, "learning_rate": 2.134888478151753e-08, "loss": 0.75259924, "num_input_tokens_seen": 342533060, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.36328125, "step": 15881, "time_per_iteration": 2.4881432056427 }, { "auxiliary_loss_clip": 0.01050988, "auxiliary_loss_mlp": 0.01036792, "balance_loss_clip": 1.01355815, "balance_loss_mlp": 1.01630282, "epoch": 0.954877498872689, "flos": 14427367925760.0, "grad_norm": 1.9005336511055235, "language_loss": 0.7247647, "learning_rate": 2.1292169108898083e-08, "loss": 0.74564254, "num_input_tokens_seen": 342550830, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.34765625, "step": 15882, "time_per_iteration": 2.3470170497894287 }, { "auxiliary_loss_clip": 0.01051172, "auxiliary_loss_mlp": 0.0103918, "balance_loss_clip": 1.01663792, "balance_loss_mlp": 1.0159651, "epoch": 0.9549376221253569, "flos": 59267333612160.0, "grad_norm": 1.7395896759439178, "language_loss": 0.67185152, "learning_rate": 2.1235528469078168e-08, "loss": 0.69275498, "num_input_tokens_seen": 342575070, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.3515625, "step": 15883, "time_per_iteration": 2.714733123779297 }, { "auxiliary_loss_clip": 0.01053042, "auxiliary_loss_mlp": 0.01035647, "balance_loss_clip": 1.01124454, "balance_loss_mlp": 1.01611388, "epoch": 0.954997745378025, "flos": 17273452648320.0, "grad_norm": 2.269980442516136, "language_loss": 0.79954946, "learning_rate": 2.1178962864205175e-08, "loss": 0.82043636, "num_input_tokens_seen": 342592215, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36914062, "step": 15884, "time_per_iteration": 2.31618332862854 }, { "auxiliary_loss_clip": 0.01052871, "auxiliary_loss_mlp": 0.01038452, "balance_loss_clip": 1.0142169, "balance_loss_mlp": 1.01632714, "epoch": 0.955057868630693, "flos": 13005530006400.0, "grad_norm": 1.740780334758101, "language_loss": 0.78332984, "learning_rate": 2.1122472296424054e-08, "loss": 0.80424309, "num_input_tokens_seen": 342610030, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36523438, "step": 15885, "time_per_iteration": 3.740532398223877 }, { "auxiliary_loss_clip": 0.01052777, "auxiliary_loss_mlp": 0.01038245, "balance_loss_clip": 1.01513064, "balance_loss_mlp": 1.01587021, "epoch": 0.9551179918833609, "flos": 22636600640640.0, "grad_norm": 2.732244072917136, "language_loss": 0.71443319, "learning_rate": 2.1066056767877317e-08, "loss": 0.73534334, "num_input_tokens_seen": 342626475, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36914062, "step": 15886, "time_per_iteration": 3.740164041519165 }, { "auxiliary_loss_clip": 0.01055447, "auxiliary_loss_mlp": 0.01042263, "balance_loss_clip": 1.01505983, "balance_loss_mlp": 1.01716113, "epoch": 0.9551781151360289, "flos": 21541759315200.0, "grad_norm": 5.2915517038807005, "language_loss": 0.74242061, "learning_rate": 2.1009716280703916e-08, "loss": 0.76339769, "num_input_tokens_seen": 342646645, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.3828125, "step": 15887, "time_per_iteration": 2.363351583480835 }, { "auxiliary_loss_clip": 0.01050116, "auxiliary_loss_mlp": 0.01039078, "balance_loss_clip": 1.01685786, "balance_loss_mlp": 1.01554072, "epoch": 0.9552382383886968, "flos": 20701260311040.0, "grad_norm": 1.8998533847813919, "language_loss": 0.57885987, "learning_rate": 2.0953450837040364e-08, "loss": 0.59975183, "num_input_tokens_seen": 342663615, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34570312, "step": 15888, "time_per_iteration": 2.383162498474121 }, { "auxiliary_loss_clip": 0.01006771, "auxiliary_loss_mlp": 0.01002216, "balance_loss_clip": 1.00033236, "balance_loss_mlp": 1.00059724, "epoch": 0.9552983616413648, "flos": 67766448412800.0, "grad_norm": 0.7135347869678657, "language_loss": 0.57983088, "learning_rate": 2.0897260439020514e-08, "loss": 0.59992075, "num_input_tokens_seen": 342728275, "router_z_loss_clip": 0.01879883, "router_z_loss_mlp": 0.06176758, "step": 15889, "time_per_iteration": 3.0720791816711426 }, { "auxiliary_loss_clip": 0.01050742, "auxiliary_loss_mlp": 0.01033973, "balance_loss_clip": 1.011621, "balance_loss_mlp": 1.01476526, "epoch": 0.9553584848940327, "flos": 21578906868480.0, "grad_norm": 1.3655724251462567, "language_loss": 0.68234438, "learning_rate": 2.084114508877466e-08, "loss": 0.70319152, "num_input_tokens_seen": 342748860, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.359375, "step": 15890, "time_per_iteration": 2.4154703617095947 }, { "auxiliary_loss_clip": 0.01051472, "auxiliary_loss_mlp": 0.01039797, "balance_loss_clip": 1.01715899, "balance_loss_mlp": 1.01634371, "epoch": 0.9554186081467008, "flos": 24206993861760.0, "grad_norm": 1.7197643165885403, "language_loss": 0.74944818, "learning_rate": 2.0785104788430874e-08, "loss": 0.77036089, "num_input_tokens_seen": 342769705, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 15891, "time_per_iteration": 2.3969712257385254 }, { "auxiliary_loss_clip": 0.01048941, "auxiliary_loss_mlp": 0.01035967, "balance_loss_clip": 1.01639247, "balance_loss_mlp": 1.01502037, "epoch": 0.9554787313993687, "flos": 16250672102400.0, "grad_norm": 1.754978039042566, "language_loss": 0.78418338, "learning_rate": 2.072913954011435e-08, "loss": 0.80503249, "num_input_tokens_seen": 342787000, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.33984375, "step": 15892, "time_per_iteration": 2.3675899505615234 }, { "auxiliary_loss_clip": 0.01050566, "auxiliary_loss_mlp": 0.01036469, "balance_loss_clip": 1.01384306, "balance_loss_mlp": 1.01579142, "epoch": 0.9555388546520367, "flos": 23403014006400.0, "grad_norm": 1.508050145875867, "language_loss": 0.71407998, "learning_rate": 2.0673249345947386e-08, "loss": 0.73495042, "num_input_tokens_seen": 342807795, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34765625, "step": 15893, "time_per_iteration": 2.412060022354126 }, { "auxiliary_loss_clip": 0.01052385, "auxiliary_loss_mlp": 0.01034936, "balance_loss_clip": 1.01179743, "balance_loss_mlp": 1.01741362, "epoch": 0.9555989779047046, "flos": 14793152906880.0, "grad_norm": 1.78658891002791, "language_loss": 0.66864103, "learning_rate": 2.0617434208048955e-08, "loss": 0.68951416, "num_input_tokens_seen": 342825490, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.34960938, "step": 15894, "time_per_iteration": 2.416110038757324 }, { "auxiliary_loss_clip": 0.01051455, "auxiliary_loss_mlp": 0.01037156, "balance_loss_clip": 1.01320648, "balance_loss_mlp": 1.01555932, "epoch": 0.9556591011573726, "flos": 22235658053760.0, "grad_norm": 1.6616463448796988, "language_loss": 0.82594383, "learning_rate": 2.056169412853581e-08, "loss": 0.84682989, "num_input_tokens_seen": 342844965, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.359375, "step": 15895, "time_per_iteration": 2.421842575073242 }, { "auxiliary_loss_clip": 0.01050108, "auxiliary_loss_mlp": 0.01035361, "balance_loss_clip": 1.01219833, "balance_loss_mlp": 1.014727, "epoch": 0.9557192244100405, "flos": 27854056062720.0, "grad_norm": 1.4569598703773616, "language_loss": 0.72959042, "learning_rate": 2.0506029109521593e-08, "loss": 0.75044513, "num_input_tokens_seen": 342865915, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.35351562, "step": 15896, "time_per_iteration": 2.454200267791748 }, { "auxiliary_loss_clip": 0.01050851, "auxiliary_loss_mlp": 0.01036253, "balance_loss_clip": 1.01547527, "balance_loss_mlp": 1.01580548, "epoch": 0.9557793476627086, "flos": 17601845696640.0, "grad_norm": 3.3124609656397728, "language_loss": 0.80606771, "learning_rate": 2.045043915311706e-08, "loss": 0.82693875, "num_input_tokens_seen": 342884000, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.34960938, "step": 15897, "time_per_iteration": 2.360569715499878 }, { "auxiliary_loss_clip": 0.0105087, "auxiliary_loss_mlp": 0.01038863, "balance_loss_clip": 1.01461589, "balance_loss_mlp": 1.01547599, "epoch": 0.9558394709153766, "flos": 23874446361600.0, "grad_norm": 1.9297708239050082, "language_loss": 0.73269582, "learning_rate": 2.03949242614303e-08, "loss": 0.75359315, "num_input_tokens_seen": 342903095, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.35546875, "step": 15898, "time_per_iteration": 3.8735599517822266 }, { "auxiliary_loss_clip": 0.01007142, "auxiliary_loss_mlp": 0.01002435, "balance_loss_clip": 1.0003252, "balance_loss_mlp": 1.00077939, "epoch": 0.9558995941680445, "flos": 53679168840960.0, "grad_norm": 0.8772295032498897, "language_loss": 0.52358651, "learning_rate": 2.033948443656652e-08, "loss": 0.54368234, "num_input_tokens_seen": 342958155, "router_z_loss_clip": 0.02111816, "router_z_loss_mlp": 0.06347656, "step": 15899, "time_per_iteration": 2.938342809677124 }, { "auxiliary_loss_clip": 0.01053535, "auxiliary_loss_mlp": 0.01038158, "balance_loss_clip": 1.01321983, "balance_loss_mlp": 1.0155921, "epoch": 0.9559597174207125, "flos": 13763843936640.0, "grad_norm": 2.6856510945573433, "language_loss": 0.68965912, "learning_rate": 2.028411968062782e-08, "loss": 0.71057606, "num_input_tokens_seen": 342972500, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37890625, "step": 15900, "time_per_iteration": 2.3671762943267822 }, { "auxiliary_loss_clip": 0.01053072, "auxiliary_loss_mlp": 0.01037558, "balance_loss_clip": 1.01358485, "balance_loss_mlp": 1.01708114, "epoch": 0.9560198406733804, "flos": 19935370615680.0, "grad_norm": 1.985604744309505, "language_loss": 0.83391601, "learning_rate": 2.0228829995713627e-08, "loss": 0.85482228, "num_input_tokens_seen": 342989035, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.359375, "step": 15901, "time_per_iteration": 2.369601011276245 }, { "auxiliary_loss_clip": 0.01006973, "auxiliary_loss_mlp": 0.01002165, "balance_loss_clip": 1.00006747, "balance_loss_mlp": 1.00059128, "epoch": 0.9560799639260484, "flos": 57286744427520.0, "grad_norm": 0.7068364703246808, "language_loss": 0.5440585, "learning_rate": 2.0173615383920485e-08, "loss": 0.56414992, "num_input_tokens_seen": 343051675, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.06347656, "step": 15902, "time_per_iteration": 3.09403395652771 }, { "auxiliary_loss_clip": 0.01047413, "auxiliary_loss_mlp": 0.01036106, "balance_loss_clip": 1.01604271, "balance_loss_mlp": 1.01498163, "epoch": 0.9561400871787163, "flos": 18916151028480.0, "grad_norm": 1.7899687690076742, "language_loss": 0.86226821, "learning_rate": 2.01184758473425e-08, "loss": 0.88310337, "num_input_tokens_seen": 343068895, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.32421875, "step": 15903, "time_per_iteration": 2.3524932861328125 }, { "auxiliary_loss_clip": 0.01050646, "auxiliary_loss_mlp": 0.01032076, "balance_loss_clip": 1.01141727, "balance_loss_mlp": 1.01626682, "epoch": 0.9562002104313844, "flos": 18037666598400.0, "grad_norm": 1.7770060462807518, "language_loss": 0.81366104, "learning_rate": 2.0063411388070217e-08, "loss": 0.83448821, "num_input_tokens_seen": 343087115, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.34375, "step": 15904, "time_per_iteration": 2.348975419998169 }, { "auxiliary_loss_clip": 0.01053555, "auxiliary_loss_mlp": 0.01043021, "balance_loss_clip": 1.016747, "balance_loss_mlp": 1.01714206, "epoch": 0.9562603336840523, "flos": 24716516376960.0, "grad_norm": 3.290622354023181, "language_loss": 0.61336327, "learning_rate": 2.0008422008191972e-08, "loss": 0.63432908, "num_input_tokens_seen": 343105575, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.36328125, "step": 15905, "time_per_iteration": 2.383650064468384 }, { "auxiliary_loss_clip": 0.0105063, "auxiliary_loss_mlp": 0.01035275, "balance_loss_clip": 1.01349592, "balance_loss_mlp": 1.01562405, "epoch": 0.9563204569367203, "flos": 21176183802240.0, "grad_norm": 1.930597514132045, "language_loss": 0.71148258, "learning_rate": 1.995350770979254e-08, "loss": 0.73234165, "num_input_tokens_seen": 343123025, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.34960938, "step": 15906, "time_per_iteration": 2.3577215671539307 }, { "auxiliary_loss_clip": 0.01053736, "auxiliary_loss_mlp": 0.01042612, "balance_loss_clip": 1.01797128, "balance_loss_mlp": 1.01717734, "epoch": 0.9563805801893882, "flos": 20228710792320.0, "grad_norm": 1.660225473608005, "language_loss": 0.72437561, "learning_rate": 1.9898668494954473e-08, "loss": 0.74533904, "num_input_tokens_seen": 343141625, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36523438, "step": 15907, "time_per_iteration": 2.3680317401885986 }, { "auxiliary_loss_clip": 0.0105002, "auxiliary_loss_mlp": 0.01037134, "balance_loss_clip": 1.01450849, "balance_loss_mlp": 1.01557803, "epoch": 0.9564407034420562, "flos": 25409821622400.0, "grad_norm": 1.8339795724962484, "language_loss": 0.72196662, "learning_rate": 1.9843904365757447e-08, "loss": 0.74283814, "num_input_tokens_seen": 343161300, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34570312, "step": 15908, "time_per_iteration": 2.430176258087158 }, { "auxiliary_loss_clip": 0.01051539, "auxiliary_loss_mlp": 0.01036865, "balance_loss_clip": 1.01411939, "balance_loss_mlp": 1.01660371, "epoch": 0.9565008266947241, "flos": 18622915585920.0, "grad_norm": 1.7920929162231434, "language_loss": 0.83762777, "learning_rate": 1.978921532427802e-08, "loss": 0.85851181, "num_input_tokens_seen": 343177815, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.34960938, "step": 15909, "time_per_iteration": 2.332280397415161 }, { "auxiliary_loss_clip": 0.01050104, "auxiliary_loss_mlp": 0.01037557, "balance_loss_clip": 1.01499104, "balance_loss_mlp": 1.01504803, "epoch": 0.9565609499473922, "flos": 24861021960960.0, "grad_norm": 1.835490870738085, "language_loss": 0.68661487, "learning_rate": 1.9734601372590086e-08, "loss": 0.70749146, "num_input_tokens_seen": 343198140, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.3515625, "step": 15910, "time_per_iteration": 2.413201093673706 }, { "auxiliary_loss_clip": 0.01053175, "auxiliary_loss_mlp": 0.01039625, "balance_loss_clip": 1.01497245, "balance_loss_mlp": 1.01637292, "epoch": 0.9566210732000601, "flos": 21797393356800.0, "grad_norm": 1.6790425612849469, "language_loss": 0.75496143, "learning_rate": 1.968006251276444e-08, "loss": 0.77588946, "num_input_tokens_seen": 343218280, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3671875, "step": 15911, "time_per_iteration": 2.383683443069458 }, { "auxiliary_loss_clip": 0.01051193, "auxiliary_loss_mlp": 0.01031599, "balance_loss_clip": 1.00830543, "balance_loss_mlp": 1.01583552, "epoch": 0.9566811964527281, "flos": 18696617199360.0, "grad_norm": 2.2272979818333396, "language_loss": 0.70965379, "learning_rate": 1.9625598746869198e-08, "loss": 0.73048174, "num_input_tokens_seen": 343236850, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35351562, "step": 15912, "time_per_iteration": 2.3850502967834473 }, { "auxiliary_loss_clip": 0.01053419, "auxiliary_loss_mlp": 0.01042662, "balance_loss_clip": 1.01786613, "balance_loss_mlp": 1.01678073, "epoch": 0.9567413197053961, "flos": 12999944188800.0, "grad_norm": 3.1665744236526407, "language_loss": 0.72576618, "learning_rate": 1.95712100769696e-08, "loss": 0.74672699, "num_input_tokens_seen": 343253065, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3671875, "step": 15913, "time_per_iteration": 2.334707260131836 }, { "auxiliary_loss_clip": 0.01051966, "auxiliary_loss_mlp": 0.01031529, "balance_loss_clip": 1.01021397, "balance_loss_mlp": 1.01667702, "epoch": 0.956801442958064, "flos": 19718245670400.0, "grad_norm": 1.8521800438790648, "language_loss": 0.74427569, "learning_rate": 1.9516896505128444e-08, "loss": 0.76511061, "num_input_tokens_seen": 343270330, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.35351562, "step": 15914, "time_per_iteration": 2.463934898376465 }, { "auxiliary_loss_clip": 0.01050996, "auxiliary_loss_mlp": 0.01033155, "balance_loss_clip": 1.01070821, "balance_loss_mlp": 1.01619768, "epoch": 0.956861566210732, "flos": 18221868264960.0, "grad_norm": 1.4411332682011917, "language_loss": 0.68151999, "learning_rate": 1.9462658033404965e-08, "loss": 0.70236146, "num_input_tokens_seen": 343289625, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34765625, "step": 15915, "time_per_iteration": 3.6021080017089844 }, { "auxiliary_loss_clip": 0.01050737, "auxiliary_loss_mlp": 0.01038276, "balance_loss_clip": 1.01531625, "balance_loss_mlp": 1.01572609, "epoch": 0.9569216894634, "flos": 22195961971200.0, "grad_norm": 1.849582597038872, "language_loss": 0.6504885, "learning_rate": 1.9408494663855967e-08, "loss": 0.67137861, "num_input_tokens_seen": 343309200, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.34960938, "step": 15916, "time_per_iteration": 2.39572811126709 }, { "auxiliary_loss_clip": 0.01047291, "auxiliary_loss_mlp": 0.01032084, "balance_loss_clip": 1.01304674, "balance_loss_mlp": 1.01505637, "epoch": 0.956981812716068, "flos": 21688219935360.0, "grad_norm": 2.0580137084885535, "language_loss": 0.81759721, "learning_rate": 1.935440639853536e-08, "loss": 0.83839095, "num_input_tokens_seen": 343326270, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.32226562, "step": 15917, "time_per_iteration": 2.3666906356811523 }, { "auxiliary_loss_clip": 0.01051644, "auxiliary_loss_mlp": 0.01037491, "balance_loss_clip": 1.0148772, "balance_loss_mlp": 1.01652825, "epoch": 0.9570419359687359, "flos": 13990045835520.0, "grad_norm": 2.920233987316388, "language_loss": 0.74378479, "learning_rate": 1.9300393239494172e-08, "loss": 0.76467609, "num_input_tokens_seen": 343344430, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 15918, "time_per_iteration": 2.3647429943084717 }, { "auxiliary_loss_clip": 0.01007138, "auxiliary_loss_mlp": 0.01004545, "balance_loss_clip": 1.00261354, "balance_loss_mlp": 1.0008328, "epoch": 0.9571020592214039, "flos": 65193174120960.0, "grad_norm": 0.6294961240596305, "language_loss": 0.53108186, "learning_rate": 1.924645518878032e-08, "loss": 0.55119872, "num_input_tokens_seen": 343416155, "router_z_loss_clip": 0.01928711, "router_z_loss_mlp": 0.06298828, "step": 15919, "time_per_iteration": 3.116783618927002 }, { "auxiliary_loss_clip": 0.0105434, "auxiliary_loss_mlp": 0.01040144, "balance_loss_clip": 1.01638556, "balance_loss_mlp": 1.01752639, "epoch": 0.9571621824740718, "flos": 17383114828800.0, "grad_norm": 2.693770931813411, "language_loss": 0.7764731, "learning_rate": 1.919259224843972e-08, "loss": 0.797418, "num_input_tokens_seen": 343431715, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.36914062, "step": 15920, "time_per_iteration": 2.365687608718872 }, { "auxiliary_loss_clip": 0.01053216, "auxiliary_loss_mlp": 0.0103623, "balance_loss_clip": 1.01218522, "balance_loss_mlp": 1.01653087, "epoch": 0.9572223057267398, "flos": 14537309397120.0, "grad_norm": 1.888686197659497, "language_loss": 0.80151856, "learning_rate": 1.9138804420514298e-08, "loss": 0.82241309, "num_input_tokens_seen": 343450425, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36523438, "step": 15921, "time_per_iteration": 2.3548638820648193 }, { "auxiliary_loss_clip": 0.01053835, "auxiliary_loss_mlp": 0.0103698, "balance_loss_clip": 1.01142108, "balance_loss_mlp": 1.01647007, "epoch": 0.9572824289794077, "flos": 33946400044800.0, "grad_norm": 1.9294571009496784, "language_loss": 0.52243096, "learning_rate": 1.9085091707044197e-08, "loss": 0.54333913, "num_input_tokens_seen": 343470445, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37304688, "step": 15922, "time_per_iteration": 2.4977126121520996 }, { "auxiliary_loss_clip": 0.0105315, "auxiliary_loss_mlp": 0.01039538, "balance_loss_clip": 1.01556504, "balance_loss_mlp": 1.01613259, "epoch": 0.9573425522320758, "flos": 18693998847360.0, "grad_norm": 2.081881025938299, "language_loss": 0.84550434, "learning_rate": 1.903145411006557e-08, "loss": 0.86643124, "num_input_tokens_seen": 343485200, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37109375, "step": 15923, "time_per_iteration": 2.3350837230682373 }, { "auxiliary_loss_clip": 0.01049759, "auxiliary_loss_mlp": 0.0103555, "balance_loss_clip": 1.01330519, "balance_loss_mlp": 1.01547003, "epoch": 0.9574026754847437, "flos": 28509096591360.0, "grad_norm": 1.6304001788483007, "language_loss": 0.76547694, "learning_rate": 1.8977891631613008e-08, "loss": 0.78633004, "num_input_tokens_seen": 343505080, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34375, "step": 15924, "time_per_iteration": 2.421373128890991 }, { "auxiliary_loss_clip": 0.01051638, "auxiliary_loss_mlp": 0.0103935, "balance_loss_clip": 1.01627111, "balance_loss_mlp": 1.01517534, "epoch": 0.9574627987374117, "flos": 24351255066240.0, "grad_norm": 2.1533653983512173, "language_loss": 0.86841249, "learning_rate": 1.892440427371711e-08, "loss": 0.8893224, "num_input_tokens_seen": 343523995, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36328125, "step": 15925, "time_per_iteration": 3.9438793659210205 }, { "auxiliary_loss_clip": 0.01055051, "auxiliary_loss_mlp": 0.01038953, "balance_loss_clip": 1.01426458, "balance_loss_mlp": 1.01644588, "epoch": 0.9575229219900797, "flos": 23509638898560.0, "grad_norm": 1.7783878069482557, "language_loss": 0.76711237, "learning_rate": 1.8870992038406474e-08, "loss": 0.78805244, "num_input_tokens_seen": 343542015, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.38671875, "step": 15926, "time_per_iteration": 3.7876646518707275 }, { "auxiliary_loss_clip": 0.01051966, "auxiliary_loss_mlp": 0.01034324, "balance_loss_clip": 1.01465511, "balance_loss_mlp": 1.01705217, "epoch": 0.9575830452427476, "flos": 22673713282560.0, "grad_norm": 1.5737730451521168, "language_loss": 0.78740114, "learning_rate": 1.8817654927706373e-08, "loss": 0.80826402, "num_input_tokens_seen": 343561680, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.34765625, "step": 15927, "time_per_iteration": 2.3834445476531982 }, { "auxiliary_loss_clip": 0.01052637, "auxiliary_loss_mlp": 0.01039926, "balance_loss_clip": 1.01404595, "balance_loss_mlp": 1.01573634, "epoch": 0.9576431684954156, "flos": 30483853712640.0, "grad_norm": 1.7859286553252403, "language_loss": 0.70555919, "learning_rate": 1.8764392943639183e-08, "loss": 0.72648478, "num_input_tokens_seen": 343585290, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.36914062, "step": 15928, "time_per_iteration": 2.4634289741516113 }, { "auxiliary_loss_clip": 0.01052144, "auxiliary_loss_mlp": 0.01035877, "balance_loss_clip": 1.01154673, "balance_loss_mlp": 1.01676917, "epoch": 0.9577032917480836, "flos": 21686369633280.0, "grad_norm": 1.6838387529054268, "language_loss": 0.83342946, "learning_rate": 1.871120608822485e-08, "loss": 0.85430968, "num_input_tokens_seen": 343604045, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.35546875, "step": 15929, "time_per_iteration": 2.368654489517212 }, { "auxiliary_loss_clip": 0.01055361, "auxiliary_loss_mlp": 0.01041252, "balance_loss_clip": 1.01652849, "balance_loss_mlp": 1.01708698, "epoch": 0.9577634150007516, "flos": 29021865863040.0, "grad_norm": 1.6004715597043562, "language_loss": 0.72961676, "learning_rate": 1.8658094363480202e-08, "loss": 0.75058293, "num_input_tokens_seen": 343626595, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.3828125, "step": 15930, "time_per_iteration": 2.4118032455444336 }, { "auxiliary_loss_clip": 0.01050395, "auxiliary_loss_mlp": 0.01035765, "balance_loss_clip": 1.01435447, "balance_loss_mlp": 1.01579487, "epoch": 0.9578235382534195, "flos": 19281202871040.0, "grad_norm": 1.568409989377175, "language_loss": 0.63862371, "learning_rate": 1.8605057771419185e-08, "loss": 0.65948528, "num_input_tokens_seen": 343646195, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34570312, "step": 15931, "time_per_iteration": 2.364921808242798 }, { "auxiliary_loss_clip": 0.01049849, "auxiliary_loss_mlp": 0.0103294, "balance_loss_clip": 1.01120806, "balance_loss_mlp": 1.01584554, "epoch": 0.9578836615060875, "flos": 13698416315520.0, "grad_norm": 2.4273404826043192, "language_loss": 0.70954621, "learning_rate": 1.8552096314052633e-08, "loss": 0.73037404, "num_input_tokens_seen": 343663665, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.33984375, "step": 15932, "time_per_iteration": 2.333998203277588 }, { "auxiliary_loss_clip": 0.01054839, "auxiliary_loss_mlp": 0.01052303, "balance_loss_clip": 1.02663684, "balance_loss_mlp": 1.01668859, "epoch": 0.9579437847587554, "flos": 17053604616960.0, "grad_norm": 1.8148009488218682, "language_loss": 0.76566553, "learning_rate": 1.849920999338961e-08, "loss": 0.78673697, "num_input_tokens_seen": 343682145, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.38085938, "step": 15933, "time_per_iteration": 2.353330612182617 }, { "auxiliary_loss_clip": 0.01006971, "auxiliary_loss_mlp": 0.01003413, "balance_loss_clip": 1.00131488, "balance_loss_mlp": 1.00058162, "epoch": 0.9580039080114234, "flos": 60568056558720.0, "grad_norm": 0.7049301204979743, "language_loss": 0.57366931, "learning_rate": 1.8446398811434948e-08, "loss": 0.59377325, "num_input_tokens_seen": 343744685, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.06396484, "step": 15934, "time_per_iteration": 3.1124107837677 }, { "auxiliary_loss_clip": 0.01007134, "auxiliary_loss_mlp": 0.01003703, "balance_loss_clip": 1.00161636, "balance_loss_mlp": 1.00074077, "epoch": 0.9580640312640913, "flos": 66232120492800.0, "grad_norm": 0.9177390137783973, "language_loss": 0.66021359, "learning_rate": 1.8393662770191277e-08, "loss": 0.68032193, "num_input_tokens_seen": 343801835, "router_z_loss_clip": 0.02087402, "router_z_loss_mlp": 0.06396484, "step": 15935, "time_per_iteration": 2.957770586013794 }, { "auxiliary_loss_clip": 0.0100736, "auxiliary_loss_mlp": 0.01006069, "balance_loss_clip": 1.00407791, "balance_loss_mlp": 1.00093269, "epoch": 0.9581241545167594, "flos": 62214979213440.0, "grad_norm": 0.7920324677844052, "language_loss": 0.57112312, "learning_rate": 1.8341001871658546e-08, "loss": 0.59125733, "num_input_tokens_seen": 343861515, "router_z_loss_clip": 0.01989746, "router_z_loss_mlp": 0.06445312, "step": 15936, "time_per_iteration": 3.054417848587036 }, { "auxiliary_loss_clip": 0.0105218, "auxiliary_loss_mlp": 0.01039895, "balance_loss_clip": 1.01608849, "balance_loss_mlp": 1.01592326, "epoch": 0.9581842777694273, "flos": 23766983596800.0, "grad_norm": 1.904979880042566, "language_loss": 0.78604603, "learning_rate": 1.8288416117833825e-08, "loss": 0.80696678, "num_input_tokens_seen": 343881240, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36328125, "step": 15937, "time_per_iteration": 3.957155227661133 }, { "auxiliary_loss_clip": 0.01051572, "auxiliary_loss_mlp": 0.01040613, "balance_loss_clip": 1.01561546, "balance_loss_mlp": 1.01545262, "epoch": 0.9582444010220953, "flos": 21212074546560.0, "grad_norm": 2.090757908501192, "language_loss": 0.68886894, "learning_rate": 1.8235905510710636e-08, "loss": 0.70979077, "num_input_tokens_seen": 343900885, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.359375, "step": 15938, "time_per_iteration": 2.3920342922210693 }, { "auxiliary_loss_clip": 0.01051042, "auxiliary_loss_mlp": 0.0103458, "balance_loss_clip": 1.01233506, "balance_loss_mlp": 1.01504302, "epoch": 0.9583045242747633, "flos": 23804026416000.0, "grad_norm": 2.4568010263776046, "language_loss": 0.66985846, "learning_rate": 1.8183470052280712e-08, "loss": 0.6907146, "num_input_tokens_seen": 343918460, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.359375, "step": 15939, "time_per_iteration": 2.4049181938171387 }, { "auxiliary_loss_clip": 0.01050584, "auxiliary_loss_mlp": 0.01038602, "balance_loss_clip": 1.01638138, "balance_loss_mlp": 1.01546407, "epoch": 0.9583646475274312, "flos": 24130394605440.0, "grad_norm": 1.5703089510582406, "language_loss": 0.74659812, "learning_rate": 1.8131109744532025e-08, "loss": 0.76748997, "num_input_tokens_seen": 343938030, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3515625, "step": 15940, "time_per_iteration": 2.452683210372925 }, { "auxiliary_loss_clip": 0.0105125, "auxiliary_loss_mlp": 0.01037302, "balance_loss_clip": 1.01160002, "balance_loss_mlp": 1.01574183, "epoch": 0.9584247707800992, "flos": 20885601623040.0, "grad_norm": 1.6876320732495553, "language_loss": 0.73961294, "learning_rate": 1.8078824589450535e-08, "loss": 0.7604984, "num_input_tokens_seen": 343956635, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.35546875, "step": 15941, "time_per_iteration": 2.3460428714752197 }, { "auxiliary_loss_clip": 0.01050393, "auxiliary_loss_mlp": 0.01036356, "balance_loss_clip": 1.01375437, "balance_loss_mlp": 1.01579976, "epoch": 0.9584848940327672, "flos": 26066398250880.0, "grad_norm": 1.658207708548181, "language_loss": 0.7314378, "learning_rate": 1.8026614589018442e-08, "loss": 0.75230521, "num_input_tokens_seen": 343976625, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34765625, "step": 15942, "time_per_iteration": 2.428291082382202 }, { "auxiliary_loss_clip": 0.01052062, "auxiliary_loss_mlp": 0.01042453, "balance_loss_clip": 1.0185039, "balance_loss_mlp": 1.01551652, "epoch": 0.9585450172854352, "flos": 34491638747520.0, "grad_norm": 1.6117292182477234, "language_loss": 0.72885191, "learning_rate": 1.797447974521571e-08, "loss": 0.74979699, "num_input_tokens_seen": 343997790, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36523438, "step": 15943, "time_per_iteration": 2.4837348461151123 }, { "auxiliary_loss_clip": 0.01052959, "auxiliary_loss_mlp": 0.01039472, "balance_loss_clip": 1.0155108, "balance_loss_mlp": 1.01549745, "epoch": 0.9586051405381031, "flos": 23109673829760.0, "grad_norm": 1.7088793273147398, "language_loss": 0.69376874, "learning_rate": 1.792242006001965e-08, "loss": 0.71469307, "num_input_tokens_seen": 344016935, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.375, "step": 15944, "time_per_iteration": 2.4655656814575195 }, { "auxiliary_loss_clip": 0.01051063, "auxiliary_loss_mlp": 0.01039708, "balance_loss_clip": 1.01640248, "balance_loss_mlp": 1.01521587, "epoch": 0.9586652637907711, "flos": 19603137317760.0, "grad_norm": 1.9724868568027907, "language_loss": 0.67324072, "learning_rate": 1.7870435535403795e-08, "loss": 0.69414842, "num_input_tokens_seen": 344035590, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 15945, "time_per_iteration": 2.3966352939605713 }, { "auxiliary_loss_clip": 0.01006967, "auxiliary_loss_mlp": 0.01001957, "balance_loss_clip": 0.99985874, "balance_loss_mlp": 1.00052619, "epoch": 0.958725387043439, "flos": 72069912685440.0, "grad_norm": 0.7461275071718755, "language_loss": 0.6191324, "learning_rate": 1.7818526173339678e-08, "loss": 0.63922161, "num_input_tokens_seen": 344100845, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.06445312, "step": 15946, "time_per_iteration": 3.1014492511749268 }, { "auxiliary_loss_clip": 0.01050497, "auxiliary_loss_mlp": 0.01036082, "balance_loss_clip": 1.0148623, "balance_loss_mlp": 1.01594818, "epoch": 0.958785510296107, "flos": 28910702494080.0, "grad_norm": 1.7485730872887275, "language_loss": 0.7618624, "learning_rate": 1.7766691975795723e-08, "loss": 0.7827282, "num_input_tokens_seen": 344121780, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34570312, "step": 15947, "time_per_iteration": 2.423203945159912 }, { "auxiliary_loss_clip": 0.01049966, "auxiliary_loss_mlp": 0.01032395, "balance_loss_clip": 1.01067472, "balance_loss_mlp": 1.01532507, "epoch": 0.958845633548775, "flos": 18476245497600.0, "grad_norm": 2.168199441327553, "language_loss": 0.70417219, "learning_rate": 1.771493294473747e-08, "loss": 0.72499579, "num_input_tokens_seen": 344140150, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34765625, "step": 15948, "time_per_iteration": 2.436776876449585 }, { "auxiliary_loss_clip": 0.01050702, "auxiliary_loss_mlp": 0.01039958, "balance_loss_clip": 1.01679587, "balance_loss_mlp": 1.01601624, "epoch": 0.958905756801443, "flos": 24205771964160.0, "grad_norm": 2.026471604495809, "language_loss": 0.79469323, "learning_rate": 1.7663249082127574e-08, "loss": 0.8155998, "num_input_tokens_seen": 344158200, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.34765625, "step": 15949, "time_per_iteration": 2.4446756839752197 }, { "auxiliary_loss_clip": 0.0105316, "auxiliary_loss_mlp": 0.01041645, "balance_loss_clip": 1.01692128, "balance_loss_mlp": 1.01724553, "epoch": 0.9589658800541109, "flos": 25006819265280.0, "grad_norm": 1.9971695290664462, "language_loss": 0.69398749, "learning_rate": 1.761164038992602e-08, "loss": 0.71493554, "num_input_tokens_seen": 344174720, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.359375, "step": 15950, "time_per_iteration": 2.411465644836426 }, { "auxiliary_loss_clip": 0.01050533, "auxiliary_loss_mlp": 0.01034541, "balance_loss_clip": 1.01484787, "balance_loss_mlp": 1.0153451, "epoch": 0.9590260033067789, "flos": 23513409325440.0, "grad_norm": 1.7868211951469781, "language_loss": 0.86944973, "learning_rate": 1.7560106870089687e-08, "loss": 0.89030051, "num_input_tokens_seen": 344192580, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.3515625, "step": 15951, "time_per_iteration": 2.4062821865081787 }, { "auxiliary_loss_clip": 0.0105461, "auxiliary_loss_mlp": 0.0104086, "balance_loss_clip": 1.01578999, "balance_loss_mlp": 1.01729059, "epoch": 0.9590861265594469, "flos": 25519169600640.0, "grad_norm": 2.3563562045716773, "language_loss": 0.81316125, "learning_rate": 1.7508648524572568e-08, "loss": 0.83411592, "num_input_tokens_seen": 344210345, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.375, "step": 15952, "time_per_iteration": 2.4176366329193115 }, { "auxiliary_loss_clip": 0.01052199, "auxiliary_loss_mlp": 0.01038091, "balance_loss_clip": 1.0133791, "balance_loss_mlp": 1.01638043, "epoch": 0.9591462498121148, "flos": 21178243572480.0, "grad_norm": 1.6970389467113942, "language_loss": 0.70143539, "learning_rate": 1.7457265355326434e-08, "loss": 0.72233826, "num_input_tokens_seen": 344229540, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.35742188, "step": 15953, "time_per_iteration": 2.3623769283294678 }, { "auxiliary_loss_clip": 0.01053496, "auxiliary_loss_mlp": 0.0104353, "balance_loss_clip": 1.01816213, "balance_loss_mlp": 1.0164423, "epoch": 0.9592063730647828, "flos": 21722050909440.0, "grad_norm": 2.9526635702808113, "language_loss": 0.59713745, "learning_rate": 1.7405957364299285e-08, "loss": 0.61810774, "num_input_tokens_seen": 344247830, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37109375, "step": 15954, "time_per_iteration": 2.372628927230835 }, { "auxiliary_loss_clip": 0.0105205, "auxiliary_loss_mlp": 0.01036171, "balance_loss_clip": 1.01260376, "balance_loss_mlp": 1.01585698, "epoch": 0.9592664963174508, "flos": 29890295821440.0, "grad_norm": 2.412637640972294, "language_loss": 0.74248457, "learning_rate": 1.7354724553437117e-08, "loss": 0.76336676, "num_input_tokens_seen": 344267760, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36132812, "step": 15955, "time_per_iteration": 3.7577335834503174 }, { "auxiliary_loss_clip": 0.01051604, "auxiliary_loss_mlp": 0.01038115, "balance_loss_clip": 1.01451159, "balance_loss_mlp": 1.01549995, "epoch": 0.9593266195701188, "flos": 17998773477120.0, "grad_norm": 1.7924351932755835, "language_loss": 0.63233501, "learning_rate": 1.7303566924682378e-08, "loss": 0.65323222, "num_input_tokens_seen": 344284905, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36132812, "step": 15956, "time_per_iteration": 2.35461688041687 }, { "auxiliary_loss_clip": 0.01052179, "auxiliary_loss_mlp": 0.01034892, "balance_loss_clip": 1.01150346, "balance_loss_mlp": 1.01649356, "epoch": 0.9593867428227867, "flos": 18837422179200.0, "grad_norm": 2.552329354762303, "language_loss": 0.61143756, "learning_rate": 1.725248447997507e-08, "loss": 0.6323083, "num_input_tokens_seen": 344302025, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35546875, "step": 15957, "time_per_iteration": 2.319568157196045 }, { "auxiliary_loss_clip": 0.01052039, "auxiliary_loss_mlp": 0.01042529, "balance_loss_clip": 1.01824629, "balance_loss_mlp": 1.01590097, "epoch": 0.9594468660754547, "flos": 29565847756800.0, "grad_norm": 1.929686725357339, "language_loss": 0.75101221, "learning_rate": 1.7201477221252314e-08, "loss": 0.77195787, "num_input_tokens_seen": 344321935, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36132812, "step": 15958, "time_per_iteration": 2.4654414653778076 }, { "auxiliary_loss_clip": 0.01051059, "auxiliary_loss_mlp": 0.01032649, "balance_loss_clip": 1.01029754, "balance_loss_mlp": 1.01536083, "epoch": 0.9595069893281226, "flos": 20702237829120.0, "grad_norm": 1.7612367540372689, "language_loss": 0.75364256, "learning_rate": 1.7150545150448116e-08, "loss": 0.77447963, "num_input_tokens_seen": 344340405, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35742188, "step": 15959, "time_per_iteration": 2.3680059909820557 }, { "auxiliary_loss_clip": 0.01052616, "auxiliary_loss_mlp": 0.01035692, "balance_loss_clip": 1.01156378, "balance_loss_mlp": 1.01603055, "epoch": 0.9595671125807906, "flos": 22452573530880.0, "grad_norm": 2.0743246643747812, "language_loss": 0.66044897, "learning_rate": 1.7099688269493816e-08, "loss": 0.68133211, "num_input_tokens_seen": 344359925, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36523438, "step": 15960, "time_per_iteration": 2.3895580768585205 }, { "auxiliary_loss_clip": 0.01049824, "auxiliary_loss_mlp": 0.01036172, "balance_loss_clip": 1.01314116, "balance_loss_mlp": 1.0156914, "epoch": 0.9596272358334585, "flos": 23914072621440.0, "grad_norm": 1.6142745323316712, "language_loss": 0.79132092, "learning_rate": 1.7048906580318544e-08, "loss": 0.81218088, "num_input_tokens_seen": 344379100, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.34179688, "step": 15961, "time_per_iteration": 2.393799066543579 }, { "auxiliary_loss_clip": 0.01049245, "auxiliary_loss_mlp": 0.01031956, "balance_loss_clip": 1.01061714, "balance_loss_mlp": 1.01521397, "epoch": 0.9596873590861266, "flos": 17671672149120.0, "grad_norm": 1.7608118932645207, "language_loss": 0.76524007, "learning_rate": 1.699820008484698e-08, "loss": 0.78605199, "num_input_tokens_seen": 344396895, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.33984375, "step": 15962, "time_per_iteration": 2.372875928878784 }, { "auxiliary_loss_clip": 0.01052752, "auxiliary_loss_mlp": 0.01043685, "balance_loss_clip": 1.01810241, "balance_loss_mlp": 1.01592052, "epoch": 0.9597474823387945, "flos": 25807447630080.0, "grad_norm": 2.1363531846559987, "language_loss": 0.73042309, "learning_rate": 1.6947568785002698e-08, "loss": 0.75138736, "num_input_tokens_seen": 344415115, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.3671875, "step": 15963, "time_per_iteration": 2.4120633602142334 }, { "auxiliary_loss_clip": 0.01048917, "auxiliary_loss_mlp": 0.01038561, "balance_loss_clip": 1.01636434, "balance_loss_mlp": 1.01514459, "epoch": 0.9598076055914625, "flos": 23767402533120.0, "grad_norm": 1.4476882599768301, "language_loss": 0.74559075, "learning_rate": 1.689701268270527e-08, "loss": 0.76646554, "num_input_tokens_seen": 344435185, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.33789062, "step": 15964, "time_per_iteration": 2.4225552082061768 }, { "auxiliary_loss_clip": 0.01006988, "auxiliary_loss_mlp": 0.01003498, "balance_loss_clip": 1.00157881, "balance_loss_mlp": 1.00050068, "epoch": 0.9598677288441305, "flos": 56513383701120.0, "grad_norm": 0.869836305975847, "language_loss": 0.5764432, "learning_rate": 1.684653177987161e-08, "loss": 0.59654808, "num_input_tokens_seen": 344488950, "router_z_loss_clip": 0.01916504, "router_z_loss_mlp": 0.06445312, "step": 15965, "time_per_iteration": 5.703410625457764 }, { "auxiliary_loss_clip": 0.01051412, "auxiliary_loss_mlp": 0.01033641, "balance_loss_clip": 1.0126003, "balance_loss_mlp": 1.015607, "epoch": 0.9599278520967984, "flos": 22996520513280.0, "grad_norm": 1.6885341250110466, "language_loss": 0.79604876, "learning_rate": 1.6796126078416627e-08, "loss": 0.81689924, "num_input_tokens_seen": 344506740, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.359375, "step": 15966, "time_per_iteration": 2.369694709777832 }, { "auxiliary_loss_clip": 0.01049367, "auxiliary_loss_mlp": 0.01032231, "balance_loss_clip": 1.0097723, "balance_loss_mlp": 1.01490414, "epoch": 0.9599879753494664, "flos": 23038555656960.0, "grad_norm": 1.594428318537607, "language_loss": 0.80313581, "learning_rate": 1.674579558025102e-08, "loss": 0.82395178, "num_input_tokens_seen": 344526670, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34570312, "step": 15967, "time_per_iteration": 2.3785345554351807 }, { "auxiliary_loss_clip": 0.01053268, "auxiliary_loss_mlp": 0.01036667, "balance_loss_clip": 1.01256251, "balance_loss_mlp": 1.01647484, "epoch": 0.9600480986021344, "flos": 16391546904960.0, "grad_norm": 2.119692171994962, "language_loss": 0.81605113, "learning_rate": 1.669554028728348e-08, "loss": 0.83695048, "num_input_tokens_seen": 344541995, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.3671875, "step": 15968, "time_per_iteration": 2.322484016418457 }, { "auxiliary_loss_clip": 0.01056192, "auxiliary_loss_mlp": 0.01045829, "balance_loss_clip": 1.01671791, "balance_loss_mlp": 1.017488, "epoch": 0.9601082218548024, "flos": 24275388948480.0, "grad_norm": 4.0975063788178385, "language_loss": 0.69170868, "learning_rate": 1.6645360201420044e-08, "loss": 0.71272898, "num_input_tokens_seen": 344559980, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.38671875, "step": 15969, "time_per_iteration": 2.3945462703704834 }, { "auxiliary_loss_clip": 0.01049316, "auxiliary_loss_mlp": 0.01037539, "balance_loss_clip": 1.01504469, "balance_loss_mlp": 1.01488876, "epoch": 0.9601683451074703, "flos": 19608967514880.0, "grad_norm": 3.019085633373059, "language_loss": 0.82089639, "learning_rate": 1.6595255324563186e-08, "loss": 0.84176493, "num_input_tokens_seen": 344577765, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34375, "step": 15970, "time_per_iteration": 2.369249105453491 }, { "auxiliary_loss_clip": 0.01050357, "auxiliary_loss_mlp": 0.01037161, "balance_loss_clip": 1.01378393, "balance_loss_mlp": 1.0161171, "epoch": 0.9602284683601383, "flos": 26649901670400.0, "grad_norm": 1.516527873753173, "language_loss": 0.78257477, "learning_rate": 1.654522565861316e-08, "loss": 0.80344999, "num_input_tokens_seen": 344597650, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.34179688, "step": 15971, "time_per_iteration": 2.4472031593322754 }, { "auxiliary_loss_clip": 0.01053519, "auxiliary_loss_mlp": 0.01036622, "balance_loss_clip": 1.01198149, "balance_loss_mlp": 1.0167737, "epoch": 0.9602885916128062, "flos": 15553352050560.0, "grad_norm": 2.153835417099332, "language_loss": 0.6855886, "learning_rate": 1.64952712054669e-08, "loss": 0.70648998, "num_input_tokens_seen": 344613580, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3671875, "step": 15972, "time_per_iteration": 2.346890926361084 }, { "auxiliary_loss_clip": 0.01050275, "auxiliary_loss_mlp": 0.01033811, "balance_loss_clip": 1.01110125, "balance_loss_mlp": 1.01544368, "epoch": 0.9603487148654742, "flos": 16501593110400.0, "grad_norm": 2.0494287060031238, "language_loss": 0.76786613, "learning_rate": 1.644539196701844e-08, "loss": 0.78870702, "num_input_tokens_seen": 344626910, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34765625, "step": 15973, "time_per_iteration": 2.330226182937622 }, { "auxiliary_loss_clip": 0.0105283, "auxiliary_loss_mlp": 0.01037794, "balance_loss_clip": 1.01596713, "balance_loss_mlp": 1.01708269, "epoch": 0.9604088381181421, "flos": 20844439263360.0, "grad_norm": 1.6948545178165422, "language_loss": 0.69432819, "learning_rate": 1.639558794515983e-08, "loss": 0.71523446, "num_input_tokens_seen": 344644330, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.35742188, "step": 15974, "time_per_iteration": 2.3609750270843506 }, { "auxiliary_loss_clip": 0.01051535, "auxiliary_loss_mlp": 0.01037365, "balance_loss_clip": 1.01405931, "balance_loss_mlp": 1.01526833, "epoch": 0.9604689613708102, "flos": 19682075635200.0, "grad_norm": 1.6907352343888187, "language_loss": 0.69477212, "learning_rate": 1.6345859141779105e-08, "loss": 0.71566111, "num_input_tokens_seen": 344663910, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36328125, "step": 15975, "time_per_iteration": 2.412550687789917 }, { "auxiliary_loss_clip": 0.01050493, "auxiliary_loss_mlp": 0.01030761, "balance_loss_clip": 1.00985169, "balance_loss_mlp": 1.01603985, "epoch": 0.9605290846234781, "flos": 24096423985920.0, "grad_norm": 2.120382154347477, "language_loss": 0.57783139, "learning_rate": 1.6296205558762322e-08, "loss": 0.5986439, "num_input_tokens_seen": 344682320, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34375, "step": 15976, "time_per_iteration": 2.391308307647705 }, { "auxiliary_loss_clip": 0.01050667, "auxiliary_loss_mlp": 0.0103392, "balance_loss_clip": 1.01224804, "balance_loss_mlp": 1.01512885, "epoch": 0.9605892078761461, "flos": 27121438759680.0, "grad_norm": 1.9646584074681954, "language_loss": 0.69094861, "learning_rate": 1.624662719799219e-08, "loss": 0.7117945, "num_input_tokens_seen": 344701355, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.35546875, "step": 15977, "time_per_iteration": 3.8302674293518066 }, { "auxiliary_loss_clip": 0.01051767, "auxiliary_loss_mlp": 0.01042408, "balance_loss_clip": 1.0195322, "balance_loss_mlp": 1.01643729, "epoch": 0.9606493311288141, "flos": 14136052608000.0, "grad_norm": 3.0811973166756212, "language_loss": 0.83131748, "learning_rate": 1.6197124061348766e-08, "loss": 0.85225928, "num_input_tokens_seen": 344717980, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35351562, "step": 15978, "time_per_iteration": 2.4253768920898438 }, { "auxiliary_loss_clip": 0.01053842, "auxiliary_loss_mlp": 0.01045657, "balance_loss_clip": 1.02015853, "balance_loss_mlp": 1.01634765, "epoch": 0.960709454381482, "flos": 15812477228160.0, "grad_norm": 2.141447003186882, "language_loss": 0.83917761, "learning_rate": 1.614769615070921e-08, "loss": 0.86017257, "num_input_tokens_seen": 344733480, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.375, "step": 15979, "time_per_iteration": 2.3394477367401123 }, { "auxiliary_loss_clip": 0.01051659, "auxiliary_loss_mlp": 0.01037326, "balance_loss_clip": 1.01471233, "balance_loss_mlp": 1.0155946, "epoch": 0.96076957763415, "flos": 22564295481600.0, "grad_norm": 1.6380020480426722, "language_loss": 0.80350435, "learning_rate": 1.6098343467947805e-08, "loss": 0.82439417, "num_input_tokens_seen": 344752130, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.359375, "step": 15980, "time_per_iteration": 2.371248483657837 }, { "auxiliary_loss_clip": 0.01052359, "auxiliary_loss_mlp": 0.01033728, "balance_loss_clip": 1.01023149, "balance_loss_mlp": 1.0157671, "epoch": 0.960829700886818, "flos": 24680101962240.0, "grad_norm": 1.8129794790741613, "language_loss": 0.6898396, "learning_rate": 1.6049066014935942e-08, "loss": 0.71070045, "num_input_tokens_seen": 344771195, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3671875, "step": 15981, "time_per_iteration": 2.421250104904175 }, { "auxiliary_loss_clip": 0.01049998, "auxiliary_loss_mlp": 0.0103177, "balance_loss_clip": 1.01093197, "balance_loss_mlp": 1.01539731, "epoch": 0.960889824139486, "flos": 26541007539840.0, "grad_norm": 1.4124070843841663, "language_loss": 0.70551586, "learning_rate": 1.5999863793542344e-08, "loss": 0.72633356, "num_input_tokens_seen": 344793150, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34570312, "step": 15982, "time_per_iteration": 2.4169585704803467 }, { "auxiliary_loss_clip": 0.01007031, "auxiliary_loss_mlp": 0.01003281, "balance_loss_clip": 1.00104034, "balance_loss_mlp": 1.00065887, "epoch": 0.9609499473921539, "flos": 71111826622080.0, "grad_norm": 0.6810248858391218, "language_loss": 0.53341031, "learning_rate": 1.595073680563286e-08, "loss": 0.55351341, "num_input_tokens_seen": 344852855, "router_z_loss_clip": 0.02246094, "router_z_loss_mlp": 0.06347656, "step": 15983, "time_per_iteration": 3.120314598083496 }, { "auxiliary_loss_clip": 0.01050478, "auxiliary_loss_mlp": 0.01037097, "balance_loss_clip": 1.01489997, "balance_loss_mlp": 1.01558781, "epoch": 0.9610100706448219, "flos": 20551587845760.0, "grad_norm": 1.931127767024186, "language_loss": 0.69094241, "learning_rate": 1.5901685053070212e-08, "loss": 0.71181816, "num_input_tokens_seen": 344869830, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34960938, "step": 15984, "time_per_iteration": 2.3552684783935547 }, { "auxiliary_loss_clip": 0.01050555, "auxiliary_loss_mlp": 0.01035683, "balance_loss_clip": 1.01442778, "balance_loss_mlp": 1.01644397, "epoch": 0.9610701938974898, "flos": 14063328512640.0, "grad_norm": 2.0570162471749938, "language_loss": 0.68679589, "learning_rate": 1.5852708537714477e-08, "loss": 0.70765829, "num_input_tokens_seen": 344888905, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.33984375, "step": 15985, "time_per_iteration": 2.3653926849365234 }, { "auxiliary_loss_clip": 0.01050884, "auxiliary_loss_mlp": 0.01036555, "balance_loss_clip": 1.0151453, "balance_loss_mlp": 1.01556051, "epoch": 0.9611303171501578, "flos": 20228955171840.0, "grad_norm": 2.032793942845237, "language_loss": 0.7983613, "learning_rate": 1.580380726142283e-08, "loss": 0.81923568, "num_input_tokens_seen": 344907160, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.3515625, "step": 15986, "time_per_iteration": 2.3483502864837646 }, { "auxiliary_loss_clip": 0.01052236, "auxiliary_loss_mlp": 0.01043393, "balance_loss_clip": 1.01871681, "balance_loss_mlp": 1.01617587, "epoch": 0.9611904404028258, "flos": 20950261194240.0, "grad_norm": 2.3177499323643684, "language_loss": 0.65122116, "learning_rate": 1.5754981226049792e-08, "loss": 0.67217743, "num_input_tokens_seen": 344922400, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.359375, "step": 15987, "time_per_iteration": 2.3649208545684814 }, { "auxiliary_loss_clip": 0.01049717, "auxiliary_loss_mlp": 0.01036065, "balance_loss_clip": 1.01594257, "balance_loss_mlp": 1.01570582, "epoch": 0.9612505636554938, "flos": 24826562582400.0, "grad_norm": 2.340572917743407, "language_loss": 0.6806004, "learning_rate": 1.5706230433446544e-08, "loss": 0.70145828, "num_input_tokens_seen": 344941910, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.33984375, "step": 15988, "time_per_iteration": 2.4407198429107666 }, { "auxiliary_loss_clip": 0.01050894, "auxiliary_loss_mlp": 0.01041913, "balance_loss_clip": 1.02006233, "balance_loss_mlp": 1.01587963, "epoch": 0.9613106869081617, "flos": 17164034847360.0, "grad_norm": 2.991099221301518, "language_loss": 0.75382102, "learning_rate": 1.5657554885462055e-08, "loss": 0.77474916, "num_input_tokens_seen": 344960020, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.3515625, "step": 15989, "time_per_iteration": 2.361701250076294 }, { "auxiliary_loss_clip": 0.01006674, "auxiliary_loss_mlp": 0.01001742, "balance_loss_clip": 0.99957198, "balance_loss_mlp": 1.0002985, "epoch": 0.9613708101608297, "flos": 61560741646080.0, "grad_norm": 0.8241164784891363, "language_loss": 0.6313653, "learning_rate": 1.5608954583941737e-08, "loss": 0.6514495, "num_input_tokens_seen": 345018290, "router_z_loss_clip": 0.02172852, "router_z_loss_mlp": 0.06347656, "step": 15990, "time_per_iteration": 2.904916286468506 }, { "auxiliary_loss_clip": 0.01051041, "auxiliary_loss_mlp": 0.01037795, "balance_loss_clip": 1.01472831, "balance_loss_mlp": 1.01543188, "epoch": 0.9614309334134977, "flos": 27416664149760.0, "grad_norm": 1.9128738380460268, "language_loss": 0.7902748, "learning_rate": 1.5560429530729003e-08, "loss": 0.81116319, "num_input_tokens_seen": 345040235, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35546875, "step": 15991, "time_per_iteration": 2.428565263748169 }, { "auxiliary_loss_clip": 0.01055494, "auxiliary_loss_mlp": 0.01043882, "balance_loss_clip": 1.01953924, "balance_loss_mlp": 1.01680899, "epoch": 0.9614910566661656, "flos": 22818079221120.0, "grad_norm": 2.4417071326628474, "language_loss": 0.86077636, "learning_rate": 1.5511979727663493e-08, "loss": 0.88177013, "num_input_tokens_seen": 345054540, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.38671875, "step": 15992, "time_per_iteration": 2.3566465377807617 }, { "auxiliary_loss_clip": 0.01050924, "auxiliary_loss_mlp": 0.01037384, "balance_loss_clip": 1.01333976, "balance_loss_mlp": 1.01589024, "epoch": 0.9615511799188337, "flos": 20666766021120.0, "grad_norm": 2.076943338727949, "language_loss": 0.73948586, "learning_rate": 1.5463605176582406e-08, "loss": 0.76036882, "num_input_tokens_seen": 345074035, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3515625, "step": 15993, "time_per_iteration": 2.3684961795806885 }, { "auxiliary_loss_clip": 0.01051809, "auxiliary_loss_mlp": 0.01042023, "balance_loss_clip": 1.01797891, "balance_loss_mlp": 1.01562655, "epoch": 0.9616113031715016, "flos": 33147726716160.0, "grad_norm": 1.423823370090843, "language_loss": 0.69024044, "learning_rate": 1.5415305879320716e-08, "loss": 0.71117878, "num_input_tokens_seen": 345099270, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36132812, "step": 15994, "time_per_iteration": 2.47804856300354 }, { "auxiliary_loss_clip": 0.01050684, "auxiliary_loss_mlp": 0.01034442, "balance_loss_clip": 1.01161385, "balance_loss_mlp": 1.01608562, "epoch": 0.9616714264241696, "flos": 25008634656000.0, "grad_norm": 1.683853377432611, "language_loss": 0.85164827, "learning_rate": 1.5367081837709183e-08, "loss": 0.87249953, "num_input_tokens_seen": 345116975, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34570312, "step": 15995, "time_per_iteration": 3.776531219482422 }, { "auxiliary_loss_clip": 0.01052789, "auxiliary_loss_mlp": 0.01041795, "balance_loss_clip": 1.01698804, "balance_loss_mlp": 1.01602054, "epoch": 0.9617315496768375, "flos": 13546788814080.0, "grad_norm": 1.7917655461924809, "language_loss": 0.77197307, "learning_rate": 1.5318933053576788e-08, "loss": 0.79291892, "num_input_tokens_seen": 345133645, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3671875, "step": 15996, "time_per_iteration": 2.3838090896606445 }, { "auxiliary_loss_clip": 0.01051374, "auxiliary_loss_mlp": 0.01043199, "balance_loss_clip": 1.01771224, "balance_loss_mlp": 1.01606226, "epoch": 0.9617916729295055, "flos": 11253728027520.0, "grad_norm": 2.3047691136766546, "language_loss": 0.78710508, "learning_rate": 1.52708595287494e-08, "loss": 0.80805087, "num_input_tokens_seen": 345150740, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.35351562, "step": 15997, "time_per_iteration": 2.3416249752044678 }, { "auxiliary_loss_clip": 0.01048546, "auxiliary_loss_mlp": 0.01027751, "balance_loss_clip": 1.0075686, "balance_loss_mlp": 1.01572919, "epoch": 0.9618517961821734, "flos": 22818637802880.0, "grad_norm": 1.6720711531424477, "language_loss": 0.67591751, "learning_rate": 1.522286126505001e-08, "loss": 0.69668055, "num_input_tokens_seen": 345170365, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.328125, "step": 15998, "time_per_iteration": 2.3645706176757812 }, { "auxiliary_loss_clip": 0.01050649, "auxiliary_loss_mlp": 0.01037646, "balance_loss_clip": 1.01479328, "balance_loss_mlp": 1.0156579, "epoch": 0.9619119194348414, "flos": 16616422172160.0, "grad_norm": 1.530060499041349, "language_loss": 0.73690301, "learning_rate": 1.5174938264298498e-08, "loss": 0.75778598, "num_input_tokens_seen": 345188930, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34960938, "step": 15999, "time_per_iteration": 2.3484597206115723 }, { "auxiliary_loss_clip": 0.0104899, "auxiliary_loss_mlp": 0.01034187, "balance_loss_clip": 1.0121212, "balance_loss_mlp": 1.01534975, "epoch": 0.9619720426875094, "flos": 24533990455680.0, "grad_norm": 2.8654942682468505, "language_loss": 0.66509032, "learning_rate": 1.5127090528312514e-08, "loss": 0.68592203, "num_input_tokens_seen": 345209615, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3359375, "step": 16000, "time_per_iteration": 2.395911693572998 }, { "auxiliary_loss_clip": 0.01050933, "auxiliary_loss_mlp": 0.01035263, "balance_loss_clip": 1.01260185, "balance_loss_mlp": 1.01547718, "epoch": 0.9620321659401774, "flos": 20631154567680.0, "grad_norm": 1.5610776269498494, "language_loss": 0.76016855, "learning_rate": 1.5079318058905723e-08, "loss": 0.78103054, "num_input_tokens_seen": 345229175, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35546875, "step": 16001, "time_per_iteration": 2.376692056655884 }, { "auxiliary_loss_clip": 0.01051213, "auxiliary_loss_mlp": 0.01035712, "balance_loss_clip": 1.0118705, "balance_loss_mlp": 1.01516843, "epoch": 0.9620922891928453, "flos": 18514300746240.0, "grad_norm": 1.5260259247163617, "language_loss": 0.69484949, "learning_rate": 1.5031620857890447e-08, "loss": 0.71571875, "num_input_tokens_seen": 345247815, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.359375, "step": 16002, "time_per_iteration": 2.362133741378784 }, { "auxiliary_loss_clip": 0.01050777, "auxiliary_loss_mlp": 0.01039062, "balance_loss_clip": 1.01583958, "balance_loss_mlp": 1.01628304, "epoch": 0.9621524124455133, "flos": 28766127087360.0, "grad_norm": 1.2952124351001784, "language_loss": 0.65549731, "learning_rate": 1.4983998927074804e-08, "loss": 0.67639565, "num_input_tokens_seen": 345269935, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.34570312, "step": 16003, "time_per_iteration": 2.442814350128174 }, { "auxiliary_loss_clip": 0.01052273, "auxiliary_loss_mlp": 0.01043937, "balance_loss_clip": 1.02004755, "balance_loss_mlp": 1.01668859, "epoch": 0.9622125356981813, "flos": 19097873988480.0, "grad_norm": 1.7696233137136212, "language_loss": 0.76585895, "learning_rate": 1.493645226826512e-08, "loss": 0.78682101, "num_input_tokens_seen": 345288310, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35546875, "step": 16004, "time_per_iteration": 3.855914831161499 }, { "auxiliary_loss_clip": 0.01049639, "auxiliary_loss_mlp": 0.01034729, "balance_loss_clip": 1.01252079, "balance_loss_mlp": 1.01556885, "epoch": 0.9622726589508492, "flos": 20301784001280.0, "grad_norm": 1.8782947453947194, "language_loss": 0.8063792, "learning_rate": 1.4888980883263958e-08, "loss": 0.82722294, "num_input_tokens_seen": 345306615, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.33984375, "step": 16005, "time_per_iteration": 3.6531317234039307 }, { "auxiliary_loss_clip": 0.01050286, "auxiliary_loss_mlp": 0.01037772, "balance_loss_clip": 1.01520586, "balance_loss_mlp": 1.01555324, "epoch": 0.9623327822035173, "flos": 54927699304320.0, "grad_norm": 3.4277770886657244, "language_loss": 0.68012023, "learning_rate": 1.4841584773871652e-08, "loss": 0.70100081, "num_input_tokens_seen": 345331935, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34765625, "step": 16006, "time_per_iteration": 2.671592950820923 }, { "auxiliary_loss_clip": 0.01048217, "auxiliary_loss_mlp": 0.01036776, "balance_loss_clip": 1.01639092, "balance_loss_mlp": 1.0158757, "epoch": 0.9623929054561852, "flos": 21758046387840.0, "grad_norm": 1.5966904462722127, "language_loss": 0.78948289, "learning_rate": 1.479426394188521e-08, "loss": 0.81033283, "num_input_tokens_seen": 345351510, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.32421875, "step": 16007, "time_per_iteration": 2.3723080158233643 }, { "auxiliary_loss_clip": 0.01052347, "auxiliary_loss_mlp": 0.01040064, "balance_loss_clip": 1.01677012, "balance_loss_mlp": 1.01636708, "epoch": 0.9624530287088532, "flos": 17930587858560.0, "grad_norm": 2.277327722848451, "language_loss": 0.69445264, "learning_rate": 1.4747018389099198e-08, "loss": 0.71537673, "num_input_tokens_seen": 345367750, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.359375, "step": 16008, "time_per_iteration": 2.356457233428955 }, { "auxiliary_loss_clip": 0.01054318, "auxiliary_loss_mlp": 0.01038418, "balance_loss_clip": 1.01375413, "balance_loss_mlp": 1.01777267, "epoch": 0.9625131519615211, "flos": 23252748048000.0, "grad_norm": 2.5727544000575415, "language_loss": 0.74813676, "learning_rate": 1.469984811730529e-08, "loss": 0.76906413, "num_input_tokens_seen": 345384790, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36523438, "step": 16009, "time_per_iteration": 2.4390571117401123 }, { "auxiliary_loss_clip": 0.01050153, "auxiliary_loss_mlp": 0.01034865, "balance_loss_clip": 1.0132525, "balance_loss_mlp": 1.0150224, "epoch": 0.9625732752141891, "flos": 18915627358080.0, "grad_norm": 1.7231996586273772, "language_loss": 0.76032519, "learning_rate": 1.4652753128292061e-08, "loss": 0.78117537, "num_input_tokens_seen": 345403390, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.3515625, "step": 16010, "time_per_iteration": 2.3641135692596436 }, { "auxiliary_loss_clip": 0.01055513, "auxiliary_loss_mlp": 0.01044585, "balance_loss_clip": 1.01605785, "balance_loss_mlp": 1.01746249, "epoch": 0.962633398466857, "flos": 16251998734080.0, "grad_norm": 1.735356902913667, "language_loss": 0.71053302, "learning_rate": 1.4605733423845635e-08, "loss": 0.731534, "num_input_tokens_seen": 345418685, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.37890625, "step": 16011, "time_per_iteration": 2.3207857608795166 }, { "auxiliary_loss_clip": 0.01050486, "auxiliary_loss_mlp": 0.01035909, "balance_loss_clip": 1.01527405, "balance_loss_mlp": 1.0165664, "epoch": 0.962693521719525, "flos": 54195466026240.0, "grad_norm": 2.89073854199058, "language_loss": 0.69931227, "learning_rate": 1.4558789005748585e-08, "loss": 0.72017622, "num_input_tokens_seen": 345442380, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.33984375, "step": 16012, "time_per_iteration": 2.685346841812134 }, { "auxiliary_loss_clip": 0.01056065, "auxiliary_loss_mlp": 0.01043354, "balance_loss_clip": 1.01650786, "balance_loss_mlp": 1.01756883, "epoch": 0.962753644972193, "flos": 33104504586240.0, "grad_norm": 1.8620476249568032, "language_loss": 0.73530102, "learning_rate": 1.4511919875781264e-08, "loss": 0.7562952, "num_input_tokens_seen": 345463815, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.38476562, "step": 16013, "time_per_iteration": 2.494292736053467 }, { "auxiliary_loss_clip": 0.01051349, "auxiliary_loss_mlp": 0.01038484, "balance_loss_clip": 1.01281786, "balance_loss_mlp": 1.01601541, "epoch": 0.962813768224861, "flos": 42229020170880.0, "grad_norm": 2.8548739377325516, "language_loss": 0.65165585, "learning_rate": 1.4465126035720698e-08, "loss": 0.67255419, "num_input_tokens_seen": 345484525, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.35351562, "step": 16014, "time_per_iteration": 2.596019744873047 }, { "auxiliary_loss_clip": 0.01049492, "auxiliary_loss_mlp": 0.010379, "balance_loss_clip": 1.01598954, "balance_loss_mlp": 1.01598716, "epoch": 0.9628738914775289, "flos": 43943011280640.0, "grad_norm": 1.5518636576123002, "language_loss": 0.73462719, "learning_rate": 1.4418407487341688e-08, "loss": 0.75550115, "num_input_tokens_seen": 345508295, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.3359375, "step": 16015, "time_per_iteration": 2.588960886001587 }, { "auxiliary_loss_clip": 0.01051686, "auxiliary_loss_mlp": 0.01041333, "balance_loss_clip": 1.01830149, "balance_loss_mlp": 1.01549733, "epoch": 0.9629340147301969, "flos": 15595282460160.0, "grad_norm": 1.921790073642791, "language_loss": 0.78790063, "learning_rate": 1.4371764232415707e-08, "loss": 0.80883086, "num_input_tokens_seen": 345525155, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36328125, "step": 16016, "time_per_iteration": 3.7822258472442627 }, { "auxiliary_loss_clip": 0.01007553, "auxiliary_loss_mlp": 0.0100249, "balance_loss_clip": 1.00051105, "balance_loss_mlp": 1.00122738, "epoch": 0.9629941379828649, "flos": 62947875807360.0, "grad_norm": 0.8143764327654579, "language_loss": 0.63251495, "learning_rate": 1.4325196272711337e-08, "loss": 0.65261543, "num_input_tokens_seen": 345578905, "router_z_loss_clip": 0.01977539, "router_z_loss_mlp": 0.06347656, "step": 16017, "time_per_iteration": 2.949894428253174 }, { "auxiliary_loss_clip": 0.01053599, "auxiliary_loss_mlp": 0.01034226, "balance_loss_clip": 1.01212478, "balance_loss_mlp": 1.01678205, "epoch": 0.9630542612355328, "flos": 29897801763840.0, "grad_norm": 2.096234912483926, "language_loss": 0.67619795, "learning_rate": 1.4278703609994502e-08, "loss": 0.6970762, "num_input_tokens_seen": 345598965, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3671875, "step": 16018, "time_per_iteration": 2.4287383556365967 }, { "auxiliary_loss_clip": 0.01051831, "auxiliary_loss_mlp": 0.01038353, "balance_loss_clip": 1.01379585, "balance_loss_mlp": 1.01621151, "epoch": 0.9631143844882009, "flos": 17893614862080.0, "grad_norm": 2.128348342985032, "language_loss": 0.80235201, "learning_rate": 1.4232286246028457e-08, "loss": 0.82325387, "num_input_tokens_seen": 345617945, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.35742188, "step": 16019, "time_per_iteration": 2.354036808013916 }, { "auxiliary_loss_clip": 0.01049211, "auxiliary_loss_mlp": 0.01031536, "balance_loss_clip": 1.01141322, "balance_loss_mlp": 1.01519477, "epoch": 0.9631745077408688, "flos": 26138005182720.0, "grad_norm": 1.5100201127152957, "language_loss": 0.72283399, "learning_rate": 1.4185944182572907e-08, "loss": 0.7436415, "num_input_tokens_seen": 345637920, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.33984375, "step": 16020, "time_per_iteration": 2.3910152912139893 }, { "auxiliary_loss_clip": 0.01051698, "auxiliary_loss_mlp": 0.01033034, "balance_loss_clip": 1.01069403, "balance_loss_mlp": 1.01636624, "epoch": 0.9632346309935368, "flos": 24972464620800.0, "grad_norm": 1.7591189458215046, "language_loss": 0.78095269, "learning_rate": 1.4139677421385331e-08, "loss": 0.80179995, "num_input_tokens_seen": 345656195, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35351562, "step": 16021, "time_per_iteration": 2.403712034225464 }, { "auxiliary_loss_clip": 0.01055369, "auxiliary_loss_mlp": 0.01049203, "balance_loss_clip": 1.02056909, "balance_loss_mlp": 1.01644444, "epoch": 0.9632947542462047, "flos": 23616263790720.0, "grad_norm": 1.8300683505847224, "language_loss": 0.66139209, "learning_rate": 1.4093485964220331e-08, "loss": 0.6824379, "num_input_tokens_seen": 345676700, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 0.38867188, "step": 16022, "time_per_iteration": 2.384716272354126 }, { "auxiliary_loss_clip": 0.01049787, "auxiliary_loss_mlp": 0.01037062, "balance_loss_clip": 1.01509154, "balance_loss_mlp": 1.01562238, "epoch": 0.9633548774988727, "flos": 26394407274240.0, "grad_norm": 1.9620299552442224, "language_loss": 0.73913723, "learning_rate": 1.4047369812829168e-08, "loss": 0.76000571, "num_input_tokens_seen": 345696725, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34179688, "step": 16023, "time_per_iteration": 2.4288840293884277 }, { "auxiliary_loss_clip": 0.01051119, "auxiliary_loss_mlp": 0.01034356, "balance_loss_clip": 1.01157522, "balance_loss_mlp": 1.01570904, "epoch": 0.9634150007515406, "flos": 23766634483200.0, "grad_norm": 1.4817392733395718, "language_loss": 0.82273638, "learning_rate": 1.4001328968960891e-08, "loss": 0.84359121, "num_input_tokens_seen": 345716245, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35351562, "step": 16024, "time_per_iteration": 2.370471239089966 }, { "auxiliary_loss_clip": 0.01054888, "auxiliary_loss_mlp": 0.01041747, "balance_loss_clip": 1.01715457, "balance_loss_mlp": 1.01774609, "epoch": 0.9634751240042086, "flos": 24134165032320.0, "grad_norm": 1.5911684270175708, "language_loss": 0.81675315, "learning_rate": 1.3955363434361212e-08, "loss": 0.83771944, "num_input_tokens_seen": 345739060, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37109375, "step": 16025, "time_per_iteration": 2.4052963256835938 }, { "auxiliary_loss_clip": 0.01052042, "auxiliary_loss_mlp": 0.01036426, "balance_loss_clip": 1.01246524, "balance_loss_mlp": 1.0159291, "epoch": 0.9635352472568766, "flos": 24348043221120.0, "grad_norm": 1.8271223705506954, "language_loss": 0.77671736, "learning_rate": 1.3909473210773181e-08, "loss": 0.79760206, "num_input_tokens_seen": 345758325, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36132812, "step": 16026, "time_per_iteration": 2.426281213760376 }, { "auxiliary_loss_clip": 0.01050911, "auxiliary_loss_mlp": 0.01039552, "balance_loss_clip": 1.01581693, "balance_loss_mlp": 1.01556015, "epoch": 0.9635953705095446, "flos": 23983724517120.0, "grad_norm": 1.6243477824394057, "language_loss": 0.63965183, "learning_rate": 1.3863658299936965e-08, "loss": 0.66055644, "num_input_tokens_seen": 345778530, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.35546875, "step": 16027, "time_per_iteration": 2.379905939102173 }, { "auxiliary_loss_clip": 0.01053696, "auxiliary_loss_mlp": 0.01041597, "balance_loss_clip": 1.01777899, "balance_loss_mlp": 1.01666403, "epoch": 0.9636554937622125, "flos": 19827419091840.0, "grad_norm": 1.776974235071988, "language_loss": 0.88229984, "learning_rate": 1.3817918703589837e-08, "loss": 0.90325272, "num_input_tokens_seen": 345796535, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 16028, "time_per_iteration": 2.3717849254608154 }, { "auxiliary_loss_clip": 0.01007345, "auxiliary_loss_mlp": 0.01003455, "balance_loss_clip": 1.00150049, "balance_loss_mlp": 1.00090837, "epoch": 0.9637156170148805, "flos": 67432329901440.0, "grad_norm": 0.6823006023065247, "language_loss": 0.53216922, "learning_rate": 1.3772254423466412e-08, "loss": 0.55227721, "num_input_tokens_seen": 345859700, "router_z_loss_clip": 0.01953125, "router_z_loss_mlp": 0.06445312, "step": 16029, "time_per_iteration": 2.9912073612213135 }, { "auxiliary_loss_clip": 0.01052691, "auxiliary_loss_mlp": 0.01037388, "balance_loss_clip": 1.01473856, "balance_loss_mlp": 1.01644135, "epoch": 0.9637757402675484, "flos": 20299933699200.0, "grad_norm": 1.512703744425463, "language_loss": 0.7481575, "learning_rate": 1.372666546129797e-08, "loss": 0.76905823, "num_input_tokens_seen": 345878760, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36328125, "step": 16030, "time_per_iteration": 2.3820552825927734 }, { "auxiliary_loss_clip": 0.01049642, "auxiliary_loss_mlp": 0.01030929, "balance_loss_clip": 1.00984061, "balance_loss_mlp": 1.01558781, "epoch": 0.9638358635202164, "flos": 27233335267200.0, "grad_norm": 2.0130671872605452, "language_loss": 0.66715676, "learning_rate": 1.3681151818813575e-08, "loss": 0.68796247, "num_input_tokens_seen": 345900445, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.33984375, "step": 16031, "time_per_iteration": 2.413482189178467 }, { "auxiliary_loss_clip": 0.01007201, "auxiliary_loss_mlp": 0.01002031, "balance_loss_clip": 1.00010026, "balance_loss_mlp": 1.00088799, "epoch": 0.9638959867728845, "flos": 70285536541440.0, "grad_norm": 0.8313907646854893, "language_loss": 0.60793722, "learning_rate": 1.3635713497738955e-08, "loss": 0.62802947, "num_input_tokens_seen": 345961020, "router_z_loss_clip": 0.01928711, "router_z_loss_mlp": 0.06347656, "step": 16032, "time_per_iteration": 3.0752902030944824 }, { "auxiliary_loss_clip": 0.01048219, "auxiliary_loss_mlp": 0.01036886, "balance_loss_clip": 1.01608431, "balance_loss_mlp": 1.01477051, "epoch": 0.9639561100255524, "flos": 25406435220480.0, "grad_norm": 1.8437130517845532, "language_loss": 0.67502177, "learning_rate": 1.3590350499796954e-08, "loss": 0.69587278, "num_input_tokens_seen": 345980210, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.33398438, "step": 16033, "time_per_iteration": 2.4047117233276367 }, { "auxiliary_loss_clip": 0.0105127, "auxiliary_loss_mlp": 0.01038412, "balance_loss_clip": 1.01673937, "balance_loss_mlp": 1.01643705, "epoch": 0.9640162332782204, "flos": 18112904311680.0, "grad_norm": 1.5722055759385611, "language_loss": 0.6659829, "learning_rate": 1.3545062826707976e-08, "loss": 0.68687969, "num_input_tokens_seen": 345998280, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34765625, "step": 16034, "time_per_iteration": 2.345611810684204 }, { "auxiliary_loss_clip": 0.01051632, "auxiliary_loss_mlp": 0.01042874, "balance_loss_clip": 1.01846015, "balance_loss_mlp": 1.01590931, "epoch": 0.9640763565308883, "flos": 23439184041600.0, "grad_norm": 2.201121793371913, "language_loss": 0.74668491, "learning_rate": 1.3499850480189313e-08, "loss": 0.76762998, "num_input_tokens_seen": 346015545, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.35742188, "step": 16035, "time_per_iteration": 3.656064510345459 }, { "auxiliary_loss_clip": 0.01051497, "auxiliary_loss_mlp": 0.01036806, "balance_loss_clip": 1.01459718, "balance_loss_mlp": 1.01669717, "epoch": 0.9641364797835563, "flos": 22418253797760.0, "grad_norm": 1.8823427411561207, "language_loss": 0.84114087, "learning_rate": 1.3454713461955591e-08, "loss": 0.86202395, "num_input_tokens_seen": 346034055, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34765625, "step": 16036, "time_per_iteration": 2.3741323947906494 }, { "auxiliary_loss_clip": 0.01051012, "auxiliary_loss_mlp": 0.01033359, "balance_loss_clip": 1.01044691, "balance_loss_mlp": 1.01571178, "epoch": 0.9641966030362242, "flos": 30621202467840.0, "grad_norm": 1.757069841405239, "language_loss": 0.71011567, "learning_rate": 1.340965177371789e-08, "loss": 0.7309593, "num_input_tokens_seen": 346054130, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35351562, "step": 16037, "time_per_iteration": 2.4310102462768555 }, { "auxiliary_loss_clip": 0.0105111, "auxiliary_loss_mlp": 0.01031403, "balance_loss_clip": 1.00888431, "balance_loss_mlp": 1.01588678, "epoch": 0.9642567262888923, "flos": 20952251141760.0, "grad_norm": 1.633194008876519, "language_loss": 0.63964403, "learning_rate": 1.3364665417185506e-08, "loss": 0.66046917, "num_input_tokens_seen": 346072990, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.3515625, "step": 16038, "time_per_iteration": 2.3781497478485107 }, { "auxiliary_loss_clip": 0.01052532, "auxiliary_loss_mlp": 0.01039713, "balance_loss_clip": 1.01599014, "balance_loss_mlp": 1.01608419, "epoch": 0.9643168495415602, "flos": 22638276385920.0, "grad_norm": 1.540291062371269, "language_loss": 0.71683729, "learning_rate": 1.3319754394064187e-08, "loss": 0.73775971, "num_input_tokens_seen": 346093745, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36328125, "step": 16039, "time_per_iteration": 2.43440580368042 }, { "auxiliary_loss_clip": 0.01051811, "auxiliary_loss_mlp": 0.01037023, "balance_loss_clip": 1.01200128, "balance_loss_mlp": 1.01584184, "epoch": 0.9643769727942282, "flos": 20265229941120.0, "grad_norm": 1.9839222301543957, "language_loss": 0.74496377, "learning_rate": 1.327491870605657e-08, "loss": 0.76585209, "num_input_tokens_seen": 346110115, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.359375, "step": 16040, "time_per_iteration": 2.3355650901794434 }, { "auxiliary_loss_clip": 0.01054155, "auxiliary_loss_mlp": 0.01039391, "balance_loss_clip": 1.01539433, "balance_loss_mlp": 1.01674306, "epoch": 0.9644370960468961, "flos": 13880977148160.0, "grad_norm": 2.1546593784901273, "language_loss": 0.74001008, "learning_rate": 1.3230158354863296e-08, "loss": 0.76094556, "num_input_tokens_seen": 346127165, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.375, "step": 16041, "time_per_iteration": 2.357393741607666 }, { "auxiliary_loss_clip": 0.01048625, "auxiliary_loss_mlp": 0.01033329, "balance_loss_clip": 1.01270628, "balance_loss_mlp": 1.01528037, "epoch": 0.9644972192995641, "flos": 17237247701760.0, "grad_norm": 1.8825179655395745, "language_loss": 0.73261982, "learning_rate": 1.3185473342181674e-08, "loss": 0.75343937, "num_input_tokens_seen": 346145950, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.33203125, "step": 16042, "time_per_iteration": 2.3535566329956055 }, { "auxiliary_loss_clip": 0.01053299, "auxiliary_loss_mlp": 0.0103768, "balance_loss_clip": 1.01548338, "balance_loss_mlp": 1.01620901, "epoch": 0.964557342552232, "flos": 23839253844480.0, "grad_norm": 1.7285551473736833, "language_loss": 0.81874263, "learning_rate": 1.3140863669705683e-08, "loss": 0.83965242, "num_input_tokens_seen": 346165005, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.37109375, "step": 16043, "time_per_iteration": 2.393782377243042 }, { "auxiliary_loss_clip": 0.01050575, "auxiliary_loss_mlp": 0.01033861, "balance_loss_clip": 1.01270103, "balance_loss_mlp": 1.01605856, "epoch": 0.9646174658049, "flos": 21652049900160.0, "grad_norm": 1.5512683126598588, "language_loss": 0.7280798, "learning_rate": 1.3096329339127522e-08, "loss": 0.74892414, "num_input_tokens_seen": 346185095, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34570312, "step": 16044, "time_per_iteration": 3.762526512145996 }, { "auxiliary_loss_clip": 0.01048567, "auxiliary_loss_mlp": 0.01029907, "balance_loss_clip": 1.00790071, "balance_loss_mlp": 1.01514912, "epoch": 0.9646775890575681, "flos": 17128597950720.0, "grad_norm": 1.7590764712143856, "language_loss": 0.7147361, "learning_rate": 1.3051870352135397e-08, "loss": 0.73552084, "num_input_tokens_seen": 346202580, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.33398438, "step": 16045, "time_per_iteration": 3.6612355709075928 }, { "auxiliary_loss_clip": 0.01051988, "auxiliary_loss_mlp": 0.01040149, "balance_loss_clip": 1.01488805, "balance_loss_mlp": 1.01596606, "epoch": 0.964737712310236, "flos": 13004901601920.0, "grad_norm": 2.0231926437475085, "language_loss": 0.76914036, "learning_rate": 1.3007486710415737e-08, "loss": 0.79006171, "num_input_tokens_seen": 346219395, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.359375, "step": 16046, "time_per_iteration": 2.4055087566375732 }, { "auxiliary_loss_clip": 0.01053829, "auxiliary_loss_mlp": 0.01042929, "balance_loss_clip": 1.01727474, "balance_loss_mlp": 1.01655316, "epoch": 0.964797835562904, "flos": 24278112034560.0, "grad_norm": 1.5048305763435106, "language_loss": 0.63306832, "learning_rate": 1.2963178415651199e-08, "loss": 0.65403593, "num_input_tokens_seen": 346239715, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37304688, "step": 16047, "time_per_iteration": 2.379857301712036 }, { "auxiliary_loss_clip": 0.0105367, "auxiliary_loss_mlp": 0.01038581, "balance_loss_clip": 1.01497722, "balance_loss_mlp": 1.01733494, "epoch": 0.9648579588155719, "flos": 20521632032640.0, "grad_norm": 1.75590866193559, "language_loss": 0.69794703, "learning_rate": 1.2918945469521992e-08, "loss": 0.71886957, "num_input_tokens_seen": 346258500, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36328125, "step": 16048, "time_per_iteration": 2.3765525817871094 }, { "auxiliary_loss_clip": 0.01053299, "auxiliary_loss_mlp": 0.0104015, "balance_loss_clip": 1.01559305, "balance_loss_mlp": 1.01587236, "epoch": 0.9649180820682399, "flos": 32153645174400.0, "grad_norm": 1.840631966533396, "language_loss": 0.64546549, "learning_rate": 1.2874787873705662e-08, "loss": 0.66639996, "num_input_tokens_seen": 346279110, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.375, "step": 16049, "time_per_iteration": 2.446866035461426 }, { "auxiliary_loss_clip": 0.01053086, "auxiliary_loss_mlp": 0.01033687, "balance_loss_clip": 1.01098967, "balance_loss_mlp": 1.01746321, "epoch": 0.9649782053209078, "flos": 20521527298560.0, "grad_norm": 2.1310071579536527, "language_loss": 0.71648538, "learning_rate": 1.2830705629876427e-08, "loss": 0.73735309, "num_input_tokens_seen": 346297860, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35546875, "step": 16050, "time_per_iteration": 2.384705066680908 }, { "auxiliary_loss_clip": 0.01053592, "auxiliary_loss_mlp": 0.01039275, "balance_loss_clip": 1.0137403, "balance_loss_mlp": 1.01533866, "epoch": 0.9650383285735759, "flos": 43066342241280.0, "grad_norm": 1.7392375202158004, "language_loss": 0.70926976, "learning_rate": 1.278669873970606e-08, "loss": 0.73019844, "num_input_tokens_seen": 346319860, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.3828125, "step": 16051, "time_per_iteration": 2.5842931270599365 }, { "auxiliary_loss_clip": 0.01007038, "auxiliary_loss_mlp": 0.01003749, "balance_loss_clip": 1.00186563, "balance_loss_mlp": 1.0006038, "epoch": 0.9650984518262438, "flos": 61745117869440.0, "grad_norm": 1.0114743959708241, "language_loss": 0.59229559, "learning_rate": 1.2742767204863004e-08, "loss": 0.61240345, "num_input_tokens_seen": 346379025, "router_z_loss_clip": 0.01879883, "router_z_loss_mlp": 0.06445312, "step": 16052, "time_per_iteration": 3.048064708709717 }, { "auxiliary_loss_clip": 0.01048864, "auxiliary_loss_mlp": 0.01029241, "balance_loss_clip": 1.0075326, "balance_loss_mlp": 1.01469183, "epoch": 0.9651585750789118, "flos": 29788034849280.0, "grad_norm": 1.6254031523227979, "language_loss": 0.7575264, "learning_rate": 1.2698911027013482e-08, "loss": 0.77830744, "num_input_tokens_seen": 346402250, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34179688, "step": 16053, "time_per_iteration": 2.4541800022125244 }, { "auxiliary_loss_clip": 0.01052269, "auxiliary_loss_mlp": 0.01034977, "balance_loss_clip": 1.01183808, "balance_loss_mlp": 1.01632297, "epoch": 0.9652186983315797, "flos": 16872126036480.0, "grad_norm": 1.9655886793416415, "language_loss": 0.69826424, "learning_rate": 1.2655130207820386e-08, "loss": 0.71913671, "num_input_tokens_seen": 346419555, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.359375, "step": 16054, "time_per_iteration": 2.35073184967041 }, { "auxiliary_loss_clip": 0.01051716, "auxiliary_loss_mlp": 0.01041391, "balance_loss_clip": 1.02014792, "balance_loss_mlp": 1.01631272, "epoch": 0.9652788215842477, "flos": 31648416756480.0, "grad_norm": 1.4650773630131453, "language_loss": 0.63039398, "learning_rate": 1.2611424748943944e-08, "loss": 0.65132505, "num_input_tokens_seen": 346441245, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.35546875, "step": 16055, "time_per_iteration": 2.4677865505218506 }, { "auxiliary_loss_clip": 0.01049691, "auxiliary_loss_mlp": 0.010319, "balance_loss_clip": 1.01083601, "balance_loss_mlp": 1.01641965, "epoch": 0.9653389448369156, "flos": 24753105348480.0, "grad_norm": 2.198092535053469, "language_loss": 0.7817114, "learning_rate": 1.2567794652041719e-08, "loss": 0.80252731, "num_input_tokens_seen": 346460065, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.33203125, "step": 16056, "time_per_iteration": 3.8349571228027344 }, { "auxiliary_loss_clip": 0.01050601, "auxiliary_loss_mlp": 0.01035348, "balance_loss_clip": 1.01235271, "balance_loss_mlp": 1.01506138, "epoch": 0.9653990680895836, "flos": 20296372740480.0, "grad_norm": 1.5200186853521038, "language_loss": 0.72559613, "learning_rate": 1.2524239918767498e-08, "loss": 0.74645567, "num_input_tokens_seen": 346478005, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35546875, "step": 16057, "time_per_iteration": 2.3470540046691895 }, { "auxiliary_loss_clip": 0.01050082, "auxiliary_loss_mlp": 0.0103902, "balance_loss_clip": 1.01684713, "balance_loss_mlp": 1.01565528, "epoch": 0.9654591913422517, "flos": 22527671598720.0, "grad_norm": 1.8385092180488218, "language_loss": 0.72378552, "learning_rate": 1.2480760550773295e-08, "loss": 0.74467659, "num_input_tokens_seen": 346497575, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34375, "step": 16058, "time_per_iteration": 2.3821005821228027 }, { "auxiliary_loss_clip": 0.01050908, "auxiliary_loss_mlp": 0.01034618, "balance_loss_clip": 1.01207519, "balance_loss_mlp": 1.0157119, "epoch": 0.9655193145949196, "flos": 26761728355200.0, "grad_norm": 1.399011642231527, "language_loss": 0.74907136, "learning_rate": 1.2437356549708011e-08, "loss": 0.76992655, "num_input_tokens_seen": 346520000, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.3515625, "step": 16059, "time_per_iteration": 2.4197094440460205 }, { "auxiliary_loss_clip": 0.01052731, "auxiliary_loss_mlp": 0.01040154, "balance_loss_clip": 1.01696825, "balance_loss_mlp": 1.01567423, "epoch": 0.9655794378475876, "flos": 41969196766080.0, "grad_norm": 1.6966623859332075, "language_loss": 0.74133748, "learning_rate": 1.239402791721722e-08, "loss": 0.7622664, "num_input_tokens_seen": 346541605, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.37109375, "step": 16060, "time_per_iteration": 2.555171251296997 }, { "auxiliary_loss_clip": 0.01048996, "auxiliary_loss_mlp": 0.01036974, "balance_loss_clip": 1.01666069, "balance_loss_mlp": 1.01586652, "epoch": 0.9656395611002555, "flos": 27708188935680.0, "grad_norm": 1.550753055344959, "language_loss": 0.77632302, "learning_rate": 1.2350774654944273e-08, "loss": 0.79718274, "num_input_tokens_seen": 346560955, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.33203125, "step": 16061, "time_per_iteration": 2.422240972518921 }, { "auxiliary_loss_clip": 0.01007594, "auxiliary_loss_mlp": 0.01001934, "balance_loss_clip": 0.99990708, "balance_loss_mlp": 1.00117517, "epoch": 0.9656996843529235, "flos": 68965331189760.0, "grad_norm": 0.7226771919139883, "language_loss": 0.64287436, "learning_rate": 1.2307596764528749e-08, "loss": 0.66296965, "num_input_tokens_seen": 346621615, "router_z_loss_clip": 0.02026367, "router_z_loss_mlp": 0.06445312, "step": 16062, "time_per_iteration": 3.054680347442627 }, { "auxiliary_loss_clip": 0.01047314, "auxiliary_loss_mlp": 0.01034687, "balance_loss_clip": 1.01368272, "balance_loss_mlp": 1.01432729, "epoch": 0.9657598076055914, "flos": 20630281783680.0, "grad_norm": 3.1477779613252572, "language_loss": 0.94945234, "learning_rate": 1.226449424760867e-08, "loss": 0.97027236, "num_input_tokens_seen": 346637460, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.33007812, "step": 16063, "time_per_iteration": 2.352570056915283 }, { "auxiliary_loss_clip": 0.01052064, "auxiliary_loss_mlp": 0.01035952, "balance_loss_clip": 1.01225352, "balance_loss_mlp": 1.01627839, "epoch": 0.9658199308582595, "flos": 20447546394240.0, "grad_norm": 1.7583367649097892, "language_loss": 0.83459657, "learning_rate": 1.2221467105818062e-08, "loss": 0.85547674, "num_input_tokens_seen": 346655625, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35742188, "step": 16064, "time_per_iteration": 2.386924982070923 }, { "auxiliary_loss_clip": 0.01052331, "auxiliary_loss_mlp": 0.01034347, "balance_loss_clip": 1.01222122, "balance_loss_mlp": 1.01759255, "epoch": 0.9658800541109274, "flos": 24716865490560.0, "grad_norm": 1.5521294427621328, "language_loss": 0.84755284, "learning_rate": 1.2178515340788731e-08, "loss": 0.86841959, "num_input_tokens_seen": 346675220, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34765625, "step": 16065, "time_per_iteration": 2.3992273807525635 }, { "auxiliary_loss_clip": 0.01050511, "auxiliary_loss_mlp": 0.01036618, "balance_loss_clip": 1.01421905, "balance_loss_mlp": 1.01529014, "epoch": 0.9659401773635954, "flos": 21609141972480.0, "grad_norm": 2.4897704004852548, "language_loss": 0.67904586, "learning_rate": 1.2135638954149151e-08, "loss": 0.69991714, "num_input_tokens_seen": 346694710, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.3515625, "step": 16066, "time_per_iteration": 2.396071195602417 }, { "auxiliary_loss_clip": 0.01050322, "auxiliary_loss_mlp": 0.01034375, "balance_loss_clip": 1.01245284, "balance_loss_mlp": 1.01549292, "epoch": 0.9660003006162633, "flos": 20300212990080.0, "grad_norm": 1.8265606872692268, "language_loss": 0.82990187, "learning_rate": 1.209283794752558e-08, "loss": 0.85074878, "num_input_tokens_seen": 346712645, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34765625, "step": 16067, "time_per_iteration": 2.3703489303588867 }, { "auxiliary_loss_clip": 0.01050158, "auxiliary_loss_mlp": 0.01032987, "balance_loss_clip": 1.01008701, "balance_loss_mlp": 1.01539946, "epoch": 0.9660604238689313, "flos": 24460812512640.0, "grad_norm": 1.9040689180244306, "language_loss": 0.70806754, "learning_rate": 1.2050112322540496e-08, "loss": 0.72889894, "num_input_tokens_seen": 346732375, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34765625, "step": 16068, "time_per_iteration": 2.4282138347625732 }, { "auxiliary_loss_clip": 0.01046489, "auxiliary_loss_mlp": 0.01028518, "balance_loss_clip": 1.00981414, "balance_loss_mlp": 1.01469755, "epoch": 0.9661205471215992, "flos": 19863030545280.0, "grad_norm": 1.8175565054097025, "language_loss": 0.68479562, "learning_rate": 1.20074620808146e-08, "loss": 0.70554572, "num_input_tokens_seen": 346750430, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.31835938, "step": 16069, "time_per_iteration": 2.369292736053467 }, { "auxiliary_loss_clip": 0.01050525, "auxiliary_loss_mlp": 0.01034136, "balance_loss_clip": 1.01280999, "balance_loss_mlp": 1.01540565, "epoch": 0.9661806703742672, "flos": 20556859461120.0, "grad_norm": 2.2091311201680073, "language_loss": 0.89868468, "learning_rate": 1.1964887223964826e-08, "loss": 0.91953123, "num_input_tokens_seen": 346768455, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.3515625, "step": 16070, "time_per_iteration": 2.3826332092285156 }, { "auxiliary_loss_clip": 0.01054125, "auxiliary_loss_mlp": 0.01043689, "balance_loss_clip": 1.01861978, "balance_loss_mlp": 1.0182817, "epoch": 0.9662407936269353, "flos": 21429967541760.0, "grad_norm": 1.7780917273529135, "language_loss": 0.78577352, "learning_rate": 1.1922387753605878e-08, "loss": 0.80675173, "num_input_tokens_seen": 346786530, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.35742188, "step": 16071, "time_per_iteration": 2.377195119857788 }, { "auxiliary_loss_clip": 0.01050785, "auxiliary_loss_mlp": 0.01045104, "balance_loss_clip": 1.01525402, "balance_loss_mlp": 1.01601005, "epoch": 0.9663009168796032, "flos": 14902012126080.0, "grad_norm": 2.713744059565762, "language_loss": 0.66328382, "learning_rate": 1.1879963671349137e-08, "loss": 0.68424273, "num_input_tokens_seen": 346804635, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 0.34765625, "step": 16072, "time_per_iteration": 2.353283405303955 }, { "auxiliary_loss_clip": 0.01051907, "auxiliary_loss_mlp": 0.01038177, "balance_loss_clip": 1.01596892, "balance_loss_mlp": 1.01599228, "epoch": 0.9663610401322712, "flos": 24309883238400.0, "grad_norm": 1.6084159205770305, "language_loss": 0.78418231, "learning_rate": 1.1837614978803534e-08, "loss": 0.80508316, "num_input_tokens_seen": 346823070, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.359375, "step": 16073, "time_per_iteration": 2.3780970573425293 }, { "auxiliary_loss_clip": 0.01053711, "auxiliary_loss_mlp": 0.01036765, "balance_loss_clip": 1.0130899, "balance_loss_mlp": 1.01692629, "epoch": 0.9664211633849391, "flos": 17636933479680.0, "grad_norm": 2.238024998352046, "language_loss": 0.76859093, "learning_rate": 1.1795341677574677e-08, "loss": 0.78949571, "num_input_tokens_seen": 346841180, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3671875, "step": 16074, "time_per_iteration": 2.375025749206543 }, { "auxiliary_loss_clip": 0.01052333, "auxiliary_loss_mlp": 0.01040555, "balance_loss_clip": 1.01611662, "balance_loss_mlp": 1.0155549, "epoch": 0.9664812866376071, "flos": 29788104672000.0, "grad_norm": 1.502088676803031, "language_loss": 0.76369214, "learning_rate": 1.1753143769265728e-08, "loss": 0.784621, "num_input_tokens_seen": 346864250, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.3671875, "step": 16075, "time_per_iteration": 3.6689798831939697 }, { "auxiliary_loss_clip": 0.01052629, "auxiliary_loss_mlp": 0.01039939, "balance_loss_clip": 1.01672935, "balance_loss_mlp": 1.0170691, "epoch": 0.966541409890275, "flos": 14281465887360.0, "grad_norm": 3.3117223884037257, "language_loss": 0.79492158, "learning_rate": 1.171102125547696e-08, "loss": 0.81584728, "num_input_tokens_seen": 346881955, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35546875, "step": 16076, "time_per_iteration": 2.3756070137023926 }, { "auxiliary_loss_clip": 0.01052837, "auxiliary_loss_mlp": 0.01045178, "balance_loss_clip": 1.02042997, "balance_loss_mlp": 1.01663959, "epoch": 0.9666015331429431, "flos": 19859539409280.0, "grad_norm": 1.7562952581655573, "language_loss": 0.72777057, "learning_rate": 1.166897413780532e-08, "loss": 0.74875069, "num_input_tokens_seen": 346900445, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.36328125, "step": 16077, "time_per_iteration": 2.3515450954437256 }, { "auxiliary_loss_clip": 0.01050174, "auxiliary_loss_mlp": 0.01034835, "balance_loss_clip": 1.01195884, "balance_loss_mlp": 1.01516652, "epoch": 0.966661656395611, "flos": 27124720427520.0, "grad_norm": 1.669460946959248, "language_loss": 0.5999918, "learning_rate": 1.1627002417845533e-08, "loss": 0.62084186, "num_input_tokens_seen": 346920135, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34960938, "step": 16078, "time_per_iteration": 2.417071580886841 }, { "auxiliary_loss_clip": 0.01051986, "auxiliary_loss_mlp": 0.01043044, "balance_loss_clip": 1.01860666, "balance_loss_mlp": 1.01535034, "epoch": 0.966721779648279, "flos": 21507125379840.0, "grad_norm": 2.0126344063170105, "language_loss": 0.741157, "learning_rate": 1.158510609718899e-08, "loss": 0.76210737, "num_input_tokens_seen": 346940450, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.3671875, "step": 16079, "time_per_iteration": 2.370870590209961 }, { "auxiliary_loss_clip": 0.01049481, "auxiliary_loss_mlp": 0.01033625, "balance_loss_clip": 1.0117383, "balance_loss_mlp": 1.01554215, "epoch": 0.9667819029009469, "flos": 23876994890880.0, "grad_norm": 2.3234999900659634, "language_loss": 0.72863531, "learning_rate": 1.1543285177424644e-08, "loss": 0.74946642, "num_input_tokens_seen": 346960935, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.33984375, "step": 16080, "time_per_iteration": 2.425915002822876 }, { "auxiliary_loss_clip": 0.01050318, "auxiliary_loss_mlp": 0.01034936, "balance_loss_clip": 1.01326394, "balance_loss_mlp": 1.01523316, "epoch": 0.9668420261536149, "flos": 21506147861760.0, "grad_norm": 1.8334696050096901, "language_loss": 0.75770211, "learning_rate": 1.1501539660138115e-08, "loss": 0.77855462, "num_input_tokens_seen": 346980100, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.3515625, "step": 16081, "time_per_iteration": 2.3596010208129883 }, { "auxiliary_loss_clip": 0.01049686, "auxiliary_loss_mlp": 0.01032818, "balance_loss_clip": 1.01097894, "balance_loss_mlp": 1.01445818, "epoch": 0.9669021494062828, "flos": 26686071705600.0, "grad_norm": 1.6910526854710999, "language_loss": 0.68616599, "learning_rate": 1.145986954691236e-08, "loss": 0.70699102, "num_input_tokens_seen": 347001250, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.3515625, "step": 16082, "time_per_iteration": 2.4299964904785156 }, { "auxiliary_loss_clip": 0.0105175, "auxiliary_loss_mlp": 0.01033865, "balance_loss_clip": 1.01112008, "balance_loss_mlp": 1.01714206, "epoch": 0.9669622726589508, "flos": 29823751036800.0, "grad_norm": 1.4783035108407874, "language_loss": 0.77265406, "learning_rate": 1.141827483932789e-08, "loss": 0.7935102, "num_input_tokens_seen": 347022975, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.34570312, "step": 16083, "time_per_iteration": 3.845435380935669 }, { "auxiliary_loss_clip": 0.01052465, "auxiliary_loss_mlp": 0.01039132, "balance_loss_clip": 1.01725769, "balance_loss_mlp": 1.01612091, "epoch": 0.9670223959116189, "flos": 22921597002240.0, "grad_norm": 1.9640091713468955, "language_loss": 0.80318689, "learning_rate": 1.1376755538961669e-08, "loss": 0.82410288, "num_input_tokens_seen": 347038780, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.36328125, "step": 16084, "time_per_iteration": 3.868178129196167 }, { "auxiliary_loss_clip": 0.01052596, "auxiliary_loss_mlp": 0.01035746, "balance_loss_clip": 1.01199925, "balance_loss_mlp": 1.01576781, "epoch": 0.9670825191642868, "flos": 18623509079040.0, "grad_norm": 2.4821422164740206, "language_loss": 0.7075085, "learning_rate": 1.1335311647387991e-08, "loss": 0.72839195, "num_input_tokens_seen": 347056705, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.36914062, "step": 16085, "time_per_iteration": 2.360260248184204 }, { "auxiliary_loss_clip": 0.01055189, "auxiliary_loss_mlp": 0.0103863, "balance_loss_clip": 1.01348829, "balance_loss_mlp": 1.01694584, "epoch": 0.9671426424169548, "flos": 24496807991040.0, "grad_norm": 1.9547572969234808, "language_loss": 0.70051897, "learning_rate": 1.1293943166178709e-08, "loss": 0.72145712, "num_input_tokens_seen": 347075710, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3828125, "step": 16086, "time_per_iteration": 2.3938238620758057 }, { "auxiliary_loss_clip": 0.01050817, "auxiliary_loss_mlp": 0.01039056, "balance_loss_clip": 1.01721704, "balance_loss_mlp": 1.01656139, "epoch": 0.9672027656696227, "flos": 20370283822080.0, "grad_norm": 8.866218432697908, "language_loss": 0.791125, "learning_rate": 1.125265009690235e-08, "loss": 0.81202376, "num_input_tokens_seen": 347092325, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.34179688, "step": 16087, "time_per_iteration": 2.376634120941162 }, { "auxiliary_loss_clip": 0.01049719, "auxiliary_loss_mlp": 0.01034707, "balance_loss_clip": 1.01236701, "balance_loss_mlp": 1.01572192, "epoch": 0.9672628889222907, "flos": 18879178032000.0, "grad_norm": 1.9423671171785017, "language_loss": 0.73270673, "learning_rate": 1.1211432441124769e-08, "loss": 0.75355101, "num_input_tokens_seen": 347110595, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.33984375, "step": 16088, "time_per_iteration": 2.3510990142822266 }, { "auxiliary_loss_clip": 0.01049123, "auxiliary_loss_mlp": 0.01030638, "balance_loss_clip": 1.0093472, "balance_loss_mlp": 1.01529431, "epoch": 0.9673230121749586, "flos": 28693961573760.0, "grad_norm": 1.801341740391533, "language_loss": 0.71765542, "learning_rate": 1.117029020040916e-08, "loss": 0.73845309, "num_input_tokens_seen": 347131625, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.33789062, "step": 16089, "time_per_iteration": 2.4353060722351074 }, { "auxiliary_loss_clip": 0.01053138, "auxiliary_loss_mlp": 0.01038245, "balance_loss_clip": 1.01483238, "balance_loss_mlp": 1.0167805, "epoch": 0.9673831354276267, "flos": 20483437138560.0, "grad_norm": 8.766553250990611, "language_loss": 0.76837862, "learning_rate": 1.1129223376315167e-08, "loss": 0.78929251, "num_input_tokens_seen": 347147910, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.36328125, "step": 16090, "time_per_iteration": 2.375403642654419 }, { "auxiliary_loss_clip": 0.01054011, "auxiliary_loss_mlp": 0.01037949, "balance_loss_clip": 1.01402402, "balance_loss_mlp": 1.01607621, "epoch": 0.9674432586802946, "flos": 26796327379200.0, "grad_norm": 1.7329725719005327, "language_loss": 0.69959539, "learning_rate": 1.1088231970400653e-08, "loss": 0.72051501, "num_input_tokens_seen": 347168805, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.37890625, "step": 16091, "time_per_iteration": 2.4481067657470703 }, { "auxiliary_loss_clip": 0.01050303, "auxiliary_loss_mlp": 0.01042261, "balance_loss_clip": 1.01882422, "balance_loss_mlp": 1.01560974, "epoch": 0.9675033819329626, "flos": 22309813514880.0, "grad_norm": 1.751207181322269, "language_loss": 0.7746352, "learning_rate": 1.1047315984219484e-08, "loss": 0.79556084, "num_input_tokens_seen": 347189455, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.34765625, "step": 16092, "time_per_iteration": 2.398439884185791 }, { "auxiliary_loss_clip": 0.01051331, "auxiliary_loss_mlp": 0.01033416, "balance_loss_clip": 1.01239979, "balance_loss_mlp": 1.01679778, "epoch": 0.9675635051856305, "flos": 12674937542400.0, "grad_norm": 1.7338812311684877, "language_loss": 0.77624905, "learning_rate": 1.1006475419323313e-08, "loss": 0.79709649, "num_input_tokens_seen": 347206030, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.34570312, "step": 16093, "time_per_iteration": 2.3603732585906982 }, { "auxiliary_loss_clip": 0.01051144, "auxiliary_loss_mlp": 0.01036932, "balance_loss_clip": 1.0131619, "balance_loss_mlp": 1.015908, "epoch": 0.9676236284382985, "flos": 24607308044160.0, "grad_norm": 1.6232732914625876, "language_loss": 0.70292461, "learning_rate": 1.096571027726112e-08, "loss": 0.72380531, "num_input_tokens_seen": 347226250, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3515625, "step": 16094, "time_per_iteration": 2.4123642444610596 }, { "auxiliary_loss_clip": 0.01052977, "auxiliary_loss_mlp": 0.01032591, "balance_loss_clip": 1.00972712, "balance_loss_mlp": 1.01621914, "epoch": 0.9676837516909664, "flos": 23366041009920.0, "grad_norm": 1.4725094179332505, "language_loss": 0.76970899, "learning_rate": 1.0925020559578557e-08, "loss": 0.79056466, "num_input_tokens_seen": 347247350, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.3671875, "step": 16095, "time_per_iteration": 2.392596960067749 }, { "auxiliary_loss_clip": 0.01054089, "auxiliary_loss_mlp": 0.01038643, "balance_loss_clip": 1.01428807, "balance_loss_mlp": 1.01695085, "epoch": 0.9677438749436345, "flos": 20485811111040.0, "grad_norm": 2.5971059770137193, "language_loss": 0.72513145, "learning_rate": 1.0884406267818392e-08, "loss": 0.74605876, "num_input_tokens_seen": 347266870, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37109375, "step": 16096, "time_per_iteration": 3.8083598613739014 }, { "auxiliary_loss_clip": 0.01052905, "auxiliary_loss_mlp": 0.01036852, "balance_loss_clip": 1.01306963, "balance_loss_mlp": 1.01641178, "epoch": 0.9678039981963025, "flos": 47554147825920.0, "grad_norm": 1.9611397659488752, "language_loss": 0.7317754, "learning_rate": 1.0843867403520946e-08, "loss": 0.75267303, "num_input_tokens_seen": 347290120, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.36523438, "step": 16097, "time_per_iteration": 2.6593122482299805 }, { "auxiliary_loss_clip": 0.01050673, "auxiliary_loss_mlp": 0.01036435, "balance_loss_clip": 1.01354647, "balance_loss_mlp": 1.01544416, "epoch": 0.9678641214489704, "flos": 25039463253120.0, "grad_norm": 1.5444669084516687, "language_loss": 0.7905584, "learning_rate": 1.0803403968223434e-08, "loss": 0.8114295, "num_input_tokens_seen": 347308785, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3515625, "step": 16098, "time_per_iteration": 2.405832529067993 }, { "auxiliary_loss_clip": 0.01050099, "auxiliary_loss_mlp": 0.01035501, "balance_loss_clip": 1.01427019, "balance_loss_mlp": 1.01516223, "epoch": 0.9679242447016384, "flos": 19240040511360.0, "grad_norm": 1.9411390966098805, "language_loss": 0.91722441, "learning_rate": 1.0763015963459965e-08, "loss": 0.93808043, "num_input_tokens_seen": 347326375, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34960938, "step": 16099, "time_per_iteration": 2.4160869121551514 }, { "auxiliary_loss_clip": 0.01052689, "auxiliary_loss_mlp": 0.01036425, "balance_loss_clip": 1.011832, "balance_loss_mlp": 1.01542008, "epoch": 0.9679843679543063, "flos": 33253304267520.0, "grad_norm": 2.1948511427926136, "language_loss": 0.67206645, "learning_rate": 1.0722703390762643e-08, "loss": 0.69295764, "num_input_tokens_seen": 347348250, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37304688, "step": 16100, "time_per_iteration": 2.4849400520324707 }, { "auxiliary_loss_clip": 0.01051716, "auxiliary_loss_mlp": 0.01037478, "balance_loss_clip": 1.01559091, "balance_loss_mlp": 1.0158267, "epoch": 0.9680444912069743, "flos": 22782537590400.0, "grad_norm": 1.6905209388930482, "language_loss": 0.74275255, "learning_rate": 1.0682466251659584e-08, "loss": 0.76364452, "num_input_tokens_seen": 347367400, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.359375, "step": 16101, "time_per_iteration": 2.405630111694336 }, { "auxiliary_loss_clip": 0.01051557, "auxiliary_loss_mlp": 0.01035987, "balance_loss_clip": 1.0116322, "balance_loss_mlp": 1.01616979, "epoch": 0.9681046144596422, "flos": 24022966752000.0, "grad_norm": 1.595957555999901, "language_loss": 0.74549711, "learning_rate": 1.0642304547676672e-08, "loss": 0.7663725, "num_input_tokens_seen": 347387600, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.35351562, "step": 16102, "time_per_iteration": 2.381563901901245 }, { "auxiliary_loss_clip": 0.0105364, "auxiliary_loss_mlp": 0.01037771, "balance_loss_clip": 1.01216507, "balance_loss_mlp": 1.0170784, "epoch": 0.9681647377123103, "flos": 23439882268800.0, "grad_norm": 1.8087453083679341, "language_loss": 0.7827543, "learning_rate": 1.0602218280337139e-08, "loss": 0.80366838, "num_input_tokens_seen": 347406915, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.36523438, "step": 16103, "time_per_iteration": 2.4016358852386475 }, { "auxiliary_loss_clip": 0.01050817, "auxiliary_loss_mlp": 0.0103458, "balance_loss_clip": 1.01319385, "balance_loss_mlp": 1.01588869, "epoch": 0.9682248609649782, "flos": 22673957662080.0, "grad_norm": 1.5498623055625385, "language_loss": 0.81435454, "learning_rate": 1.0562207451160655e-08, "loss": 0.83520854, "num_input_tokens_seen": 347425140, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34960938, "step": 16104, "time_per_iteration": 2.3659982681274414 }, { "auxiliary_loss_clip": 0.01047904, "auxiliary_loss_mlp": 0.01030653, "balance_loss_clip": 1.01043534, "balance_loss_mlp": 1.01467633, "epoch": 0.9682849842176462, "flos": 24427191006720.0, "grad_norm": 1.5134426534371945, "language_loss": 0.78478014, "learning_rate": 1.0522272061664672e-08, "loss": 0.80556571, "num_input_tokens_seen": 347446350, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.33203125, "step": 16105, "time_per_iteration": 2.418720245361328 }, { "auxiliary_loss_clip": 0.01006695, "auxiliary_loss_mlp": 0.01002117, "balance_loss_clip": 1.00020969, "balance_loss_mlp": 1.00052822, "epoch": 0.9683451074703141, "flos": 59992093992960.0, "grad_norm": 0.8186738148492448, "language_loss": 0.56804675, "learning_rate": 1.0482412113363536e-08, "loss": 0.58813483, "num_input_tokens_seen": 347510135, "router_z_loss_clip": 0.01904297, "router_z_loss_mlp": 0.06152344, "step": 16106, "time_per_iteration": 3.0642759799957275 }, { "auxiliary_loss_clip": 0.01006872, "auxiliary_loss_mlp": 0.01002002, "balance_loss_clip": 0.99988025, "balance_loss_mlp": 1.00069344, "epoch": 0.9684052307229821, "flos": 52693361291520.0, "grad_norm": 0.8840205396416722, "language_loss": 0.61826897, "learning_rate": 1.0442627607768707e-08, "loss": 0.63835776, "num_input_tokens_seen": 347562505, "router_z_loss_clip": 0.02124023, "router_z_loss_mlp": 0.06176758, "step": 16107, "time_per_iteration": 2.947842597961426 }, { "auxiliary_loss_clip": 0.01050739, "auxiliary_loss_mlp": 0.01043276, "balance_loss_clip": 1.01923215, "balance_loss_mlp": 1.01540971, "epoch": 0.96846535397565, "flos": 22782747058560.0, "grad_norm": 2.272632826689127, "language_loss": 0.75265068, "learning_rate": 1.040291854638875e-08, "loss": 0.7735908, "num_input_tokens_seen": 347579150, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3515625, "step": 16108, "time_per_iteration": 2.4018356800079346 }, { "auxiliary_loss_clip": 0.01053028, "auxiliary_loss_mlp": 0.01042003, "balance_loss_clip": 1.01711178, "balance_loss_mlp": 1.01648057, "epoch": 0.968525477228318, "flos": 23323063259520.0, "grad_norm": 2.142570279997239, "language_loss": 0.58724678, "learning_rate": 1.0363284930729576e-08, "loss": 0.60819709, "num_input_tokens_seen": 347596705, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36523438, "step": 16109, "time_per_iteration": 2.422017812728882 }, { "auxiliary_loss_clip": 0.01006836, "auxiliary_loss_mlp": 0.01001568, "balance_loss_clip": 0.9995774, "balance_loss_mlp": 1.0006156, "epoch": 0.9685856004809861, "flos": 67879496995200.0, "grad_norm": 0.67232148830515, "language_loss": 0.54346168, "learning_rate": 1.0323726762294205e-08, "loss": 0.5635457, "num_input_tokens_seen": 347661870, "router_z_loss_clip": 0.01989746, "router_z_loss_mlp": 0.06225586, "step": 16110, "time_per_iteration": 3.005385637283325 }, { "auxiliary_loss_clip": 0.01054627, "auxiliary_loss_mlp": 0.01042535, "balance_loss_clip": 1.01559377, "balance_loss_mlp": 1.01668239, "epoch": 0.968645723733654, "flos": 33946504778880.0, "grad_norm": 1.866502823363403, "language_loss": 0.6365602, "learning_rate": 1.0284244042582325e-08, "loss": 0.65753186, "num_input_tokens_seen": 347684295, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.37890625, "step": 16111, "time_per_iteration": 2.4785172939300537 }, { "auxiliary_loss_clip": 0.01048246, "auxiliary_loss_mlp": 0.01033085, "balance_loss_clip": 1.01162708, "balance_loss_mlp": 1.01478171, "epoch": 0.968705846986322, "flos": 18550505692800.0, "grad_norm": 1.9128776343736997, "language_loss": 0.75264025, "learning_rate": 1.024483677309118e-08, "loss": 0.77345359, "num_input_tokens_seen": 347702585, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.33398438, "step": 16112, "time_per_iteration": 2.3386454582214355 }, { "auxiliary_loss_clip": 0.01049344, "auxiliary_loss_mlp": 0.01032537, "balance_loss_clip": 1.01105523, "balance_loss_mlp": 1.01489007, "epoch": 0.9687659702389899, "flos": 17419948179840.0, "grad_norm": 1.7900726865168421, "language_loss": 0.68051887, "learning_rate": 1.020550495531558e-08, "loss": 0.7013377, "num_input_tokens_seen": 347721810, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34375, "step": 16113, "time_per_iteration": 2.376335382461548 }, { "auxiliary_loss_clip": 0.01006947, "auxiliary_loss_mlp": 0.01004109, "balance_loss_clip": 1.00232065, "balance_loss_mlp": 1.00062847, "epoch": 0.9688260934916579, "flos": 62043834395520.0, "grad_norm": 0.6953946158264256, "language_loss": 0.56659311, "learning_rate": 1.0166248590746329e-08, "loss": 0.58670366, "num_input_tokens_seen": 347782330, "router_z_loss_clip": 0.01782227, "router_z_loss_mlp": 0.06347656, "step": 16114, "time_per_iteration": 4.464321851730347 }, { "auxiliary_loss_clip": 0.01050759, "auxiliary_loss_mlp": 0.01039412, "balance_loss_clip": 1.01611805, "balance_loss_mlp": 1.01565456, "epoch": 0.9688862167443258, "flos": 15075880030080.0, "grad_norm": 2.264123125287711, "language_loss": 0.83493125, "learning_rate": 1.0127067680872458e-08, "loss": 0.85583299, "num_input_tokens_seen": 347794835, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3515625, "step": 16115, "time_per_iteration": 2.3177521228790283 }, { "auxiliary_loss_clip": 0.01048639, "auxiliary_loss_mlp": 0.01034, "balance_loss_clip": 1.01267314, "balance_loss_mlp": 1.01591897, "epoch": 0.9689463399969939, "flos": 19937186006400.0, "grad_norm": 1.5715789022818367, "language_loss": 0.72941732, "learning_rate": 1.0087962227179448e-08, "loss": 0.75024366, "num_input_tokens_seen": 347814320, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.328125, "step": 16116, "time_per_iteration": 2.3699638843536377 }, { "auxiliary_loss_clip": 0.01053209, "auxiliary_loss_mlp": 0.01036796, "balance_loss_clip": 1.01386034, "balance_loss_mlp": 1.01635742, "epoch": 0.9690064632496618, "flos": 19572029429760.0, "grad_norm": 3.412943354918438, "language_loss": 0.76608336, "learning_rate": 1.0048932231150553e-08, "loss": 0.78698337, "num_input_tokens_seen": 347832125, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.3671875, "step": 16117, "time_per_iteration": 2.350480556488037 }, { "auxiliary_loss_clip": 0.01052248, "auxiliary_loss_mlp": 0.01037589, "balance_loss_clip": 1.01474833, "balance_loss_mlp": 1.0155921, "epoch": 0.9690665865023298, "flos": 21871199704320.0, "grad_norm": 2.090434030978091, "language_loss": 0.78962916, "learning_rate": 1.000997769426548e-08, "loss": 0.81052756, "num_input_tokens_seen": 347850765, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3671875, "step": 16118, "time_per_iteration": 2.3771841526031494 }, { "auxiliary_loss_clip": 0.01053667, "auxiliary_loss_mlp": 0.0103976, "balance_loss_clip": 1.01709867, "balance_loss_mlp": 1.01672792, "epoch": 0.9691267097549977, "flos": 20993483324160.0, "grad_norm": 1.6951414269843572, "language_loss": 0.79164743, "learning_rate": 9.971098618001272e-09, "loss": 0.81258178, "num_input_tokens_seen": 347870125, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36914062, "step": 16119, "time_per_iteration": 2.365143299102783 }, { "auxiliary_loss_clip": 0.01049132, "auxiliary_loss_mlp": 0.01033235, "balance_loss_clip": 1.01351762, "balance_loss_mlp": 1.01527607, "epoch": 0.9691868330076657, "flos": 24278007300480.0, "grad_norm": 1.3946402113673064, "language_loss": 0.76491714, "learning_rate": 9.932295003832747e-09, "loss": 0.78574085, "num_input_tokens_seen": 347890615, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.33789062, "step": 16120, "time_per_iteration": 2.396498680114746 }, { "auxiliary_loss_clip": 0.01050063, "auxiliary_loss_mlp": 0.01036456, "balance_loss_clip": 1.01336515, "balance_loss_mlp": 1.01560175, "epoch": 0.9692469562603336, "flos": 17674744348800.0, "grad_norm": 1.8071665960932686, "language_loss": 0.70849288, "learning_rate": 9.89356685323095e-09, "loss": 0.72935808, "num_input_tokens_seen": 347908685, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.34375, "step": 16121, "time_per_iteration": 2.3384037017822266 }, { "auxiliary_loss_clip": 0.01051415, "auxiliary_loss_mlp": 0.01034334, "balance_loss_clip": 1.01076639, "balance_loss_mlp": 1.01618576, "epoch": 0.9693070795130017, "flos": 26833160730240.0, "grad_norm": 1.697101535128724, "language_loss": 0.70129514, "learning_rate": 9.854914167664486e-09, "loss": 0.72215271, "num_input_tokens_seen": 347926385, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.3515625, "step": 16122, "time_per_iteration": 3.795444965362549 }, { "auxiliary_loss_clip": 0.01051468, "auxiliary_loss_mlp": 0.01033149, "balance_loss_clip": 1.0117985, "balance_loss_mlp": 1.01622355, "epoch": 0.9693672027656697, "flos": 18076315340160.0, "grad_norm": 1.9201952275766911, "language_loss": 0.77532089, "learning_rate": 9.81633694859907e-09, "loss": 0.79616702, "num_input_tokens_seen": 347945290, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.3515625, "step": 16123, "time_per_iteration": 2.3360681533813477 }, { "auxiliary_loss_clip": 0.0105213, "auxiliary_loss_mlp": 0.01037597, "balance_loss_clip": 1.01455438, "balance_loss_mlp": 1.01573086, "epoch": 0.9694273260183376, "flos": 21761921548800.0, "grad_norm": 1.4006675433721731, "language_loss": 0.75026745, "learning_rate": 9.777835197497753e-09, "loss": 0.77116477, "num_input_tokens_seen": 347966330, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36328125, "step": 16124, "time_per_iteration": 2.401848554611206 }, { "auxiliary_loss_clip": 0.01052808, "auxiliary_loss_mlp": 0.0104215, "balance_loss_clip": 1.01896358, "balance_loss_mlp": 1.01614594, "epoch": 0.9694874492710056, "flos": 24424956679680.0, "grad_norm": 2.0781645291887823, "language_loss": 0.75684053, "learning_rate": 9.739408915820258e-09, "loss": 0.77779019, "num_input_tokens_seen": 347982590, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.3671875, "step": 16125, "time_per_iteration": 3.9059674739837646 }, { "auxiliary_loss_clip": 0.01006926, "auxiliary_loss_mlp": 0.01002081, "balance_loss_clip": 0.99997139, "balance_loss_mlp": 1.00054717, "epoch": 0.9695475725236735, "flos": 67647674367360.0, "grad_norm": 0.9176618998559712, "language_loss": 0.61681956, "learning_rate": 9.70105810502364e-09, "loss": 0.6369096, "num_input_tokens_seen": 348043310, "router_z_loss_clip": 0.02111816, "router_z_loss_mlp": 0.06396484, "step": 16126, "time_per_iteration": 2.9746172428131104 }, { "auxiliary_loss_clip": 0.01050473, "auxiliary_loss_mlp": 0.01038559, "balance_loss_clip": 1.01737559, "balance_loss_mlp": 1.0160681, "epoch": 0.9696076957763415, "flos": 19128493117440.0, "grad_norm": 2.345511807899787, "language_loss": 0.757061, "learning_rate": 9.662782766562738e-09, "loss": 0.7779513, "num_input_tokens_seen": 348062200, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34375, "step": 16127, "time_per_iteration": 2.369414806365967 }, { "auxiliary_loss_clip": 0.01052321, "auxiliary_loss_mlp": 0.01040299, "balance_loss_clip": 1.01552701, "balance_loss_mlp": 1.01533747, "epoch": 0.9696678190290094, "flos": 15485934481920.0, "grad_norm": 1.672816788576485, "language_loss": 0.70155412, "learning_rate": 9.62458290188839e-09, "loss": 0.7224803, "num_input_tokens_seen": 348080685, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37109375, "step": 16128, "time_per_iteration": 2.3544678688049316 }, { "auxiliary_loss_clip": 0.0105246, "auxiliary_loss_mlp": 0.01041524, "balance_loss_clip": 1.0175159, "balance_loss_mlp": 1.01684117, "epoch": 0.9697279422816775, "flos": 36207270691200.0, "grad_norm": 1.5857381621694402, "language_loss": 0.65888321, "learning_rate": 9.586458512449213e-09, "loss": 0.67982304, "num_input_tokens_seen": 348102500, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.35546875, "step": 16129, "time_per_iteration": 2.49839186668396 }, { "auxiliary_loss_clip": 0.01053275, "auxiliary_loss_mlp": 0.01037583, "balance_loss_clip": 1.01387262, "balance_loss_mlp": 1.01610339, "epoch": 0.9697880655343454, "flos": 25482825008640.0, "grad_norm": 1.8286364545465266, "language_loss": 0.64336139, "learning_rate": 9.548409599691166e-09, "loss": 0.66426992, "num_input_tokens_seen": 348122515, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37109375, "step": 16130, "time_per_iteration": 2.401433229446411 }, { "auxiliary_loss_clip": 0.01053804, "auxiliary_loss_mlp": 0.01033692, "balance_loss_clip": 1.00954056, "balance_loss_mlp": 1.0168736, "epoch": 0.9698481887870134, "flos": 15332840703360.0, "grad_norm": 3.4301606142630106, "language_loss": 0.70327669, "learning_rate": 9.510436165056867e-09, "loss": 0.72415161, "num_input_tokens_seen": 348138775, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36914062, "step": 16131, "time_per_iteration": 2.346112012863159 }, { "auxiliary_loss_clip": 0.0105341, "auxiliary_loss_mlp": 0.01039659, "balance_loss_clip": 1.01536393, "balance_loss_mlp": 1.01602435, "epoch": 0.9699083120396813, "flos": 21981141175680.0, "grad_norm": 1.8956598443797406, "language_loss": 0.77784586, "learning_rate": 9.472538209986058e-09, "loss": 0.79877651, "num_input_tokens_seen": 348157115, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37304688, "step": 16132, "time_per_iteration": 2.398916721343994 }, { "auxiliary_loss_clip": 0.01052926, "auxiliary_loss_mlp": 0.01040629, "balance_loss_clip": 1.01702571, "balance_loss_mlp": 1.01662874, "epoch": 0.9699684352923493, "flos": 15663293521920.0, "grad_norm": 2.5959859054557097, "language_loss": 0.79790437, "learning_rate": 9.434715735916477e-09, "loss": 0.81883991, "num_input_tokens_seen": 348173035, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36328125, "step": 16133, "time_per_iteration": 2.3285725116729736 }, { "auxiliary_loss_clip": 0.01049987, "auxiliary_loss_mlp": 0.01032698, "balance_loss_clip": 1.01113248, "balance_loss_mlp": 1.01573324, "epoch": 0.9700285585450172, "flos": 21907299916800.0, "grad_norm": 1.5877818432649977, "language_loss": 0.65287036, "learning_rate": 9.396968744281863e-09, "loss": 0.67369723, "num_input_tokens_seen": 348192960, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.34375, "step": 16134, "time_per_iteration": 2.39306640625 }, { "auxiliary_loss_clip": 0.01051587, "auxiliary_loss_mlp": 0.01037657, "balance_loss_clip": 1.01450682, "balance_loss_mlp": 1.01614177, "epoch": 0.9700886817976853, "flos": 23913758419200.0, "grad_norm": 2.229991025923408, "language_loss": 0.81354457, "learning_rate": 9.359297236513519e-09, "loss": 0.83443701, "num_input_tokens_seen": 348212805, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.35351562, "step": 16135, "time_per_iteration": 3.8164432048797607 }, { "auxiliary_loss_clip": 0.01053743, "auxiliary_loss_mlp": 0.01043995, "balance_loss_clip": 1.01738751, "balance_loss_mlp": 1.01636052, "epoch": 0.9701488050503532, "flos": 25446654973440.0, "grad_norm": 2.7544422009255816, "language_loss": 0.74125993, "learning_rate": 9.321701214040079e-09, "loss": 0.76223731, "num_input_tokens_seen": 348232900, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.37304688, "step": 16136, "time_per_iteration": 2.3956527709960938 }, { "auxiliary_loss_clip": 0.01051004, "auxiliary_loss_mlp": 0.01036438, "balance_loss_clip": 1.0152545, "balance_loss_mlp": 1.01591992, "epoch": 0.9702089283030212, "flos": 20589922385280.0, "grad_norm": 1.5432292279810815, "language_loss": 0.77085161, "learning_rate": 9.28418067828729e-09, "loss": 0.79172599, "num_input_tokens_seen": 348253065, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.3515625, "step": 16137, "time_per_iteration": 2.3935258388519287 }, { "auxiliary_loss_clip": 0.01007122, "auxiliary_loss_mlp": 0.01001517, "balance_loss_clip": 0.99956167, "balance_loss_mlp": 1.00080061, "epoch": 0.9702690515556892, "flos": 70648109677440.0, "grad_norm": 0.7413621738084187, "language_loss": 0.54991925, "learning_rate": 9.246735630678015e-09, "loss": 0.57000566, "num_input_tokens_seen": 348316075, "router_z_loss_clip": 0.01953125, "router_z_loss_mlp": 0.06347656, "step": 16138, "time_per_iteration": 3.1011555194854736 }, { "auxiliary_loss_clip": 0.01051403, "auxiliary_loss_mlp": 0.01035609, "balance_loss_clip": 1.01330459, "balance_loss_mlp": 1.01543951, "epoch": 0.9703291748083571, "flos": 35879994806400.0, "grad_norm": 1.7578206922818729, "language_loss": 0.72241372, "learning_rate": 9.209366072632007e-09, "loss": 0.74328387, "num_input_tokens_seen": 348337605, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.359375, "step": 16139, "time_per_iteration": 2.506899356842041 }, { "auxiliary_loss_clip": 0.01053948, "auxiliary_loss_mlp": 0.01041183, "balance_loss_clip": 1.0170548, "balance_loss_mlp": 1.01764393, "epoch": 0.9703892980610251, "flos": 24315329410560.0, "grad_norm": 1.597683626736072, "language_loss": 0.73195803, "learning_rate": 9.172072005566134e-09, "loss": 0.7529093, "num_input_tokens_seen": 348359430, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36328125, "step": 16140, "time_per_iteration": 2.4049429893493652 }, { "auxiliary_loss_clip": 0.01055162, "auxiliary_loss_mlp": 0.0104548, "balance_loss_clip": 1.02004075, "balance_loss_mlp": 1.01661134, "epoch": 0.970449421313693, "flos": 18002788283520.0, "grad_norm": 2.214500367311487, "language_loss": 0.69595134, "learning_rate": 9.13485343089504e-09, "loss": 0.71695781, "num_input_tokens_seen": 348377890, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38476562, "step": 16141, "time_per_iteration": 2.3558924198150635 }, { "auxiliary_loss_clip": 0.0105023, "auxiliary_loss_mlp": 0.01034453, "balance_loss_clip": 1.01210129, "balance_loss_mlp": 1.0160253, "epoch": 0.9705095445663611, "flos": 25336818236160.0, "grad_norm": 1.8919819184238322, "language_loss": 0.69370198, "learning_rate": 9.097710350029597e-09, "loss": 0.71454883, "num_input_tokens_seen": 348396550, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34179688, "step": 16142, "time_per_iteration": 2.3896596431732178 }, { "auxiliary_loss_clip": 0.01050916, "auxiliary_loss_mlp": 0.01036637, "balance_loss_clip": 1.01403487, "balance_loss_mlp": 1.01574326, "epoch": 0.970569667819029, "flos": 26832392680320.0, "grad_norm": 1.9074397670213503, "language_loss": 0.5668214, "learning_rate": 9.060642764378457e-09, "loss": 0.58769691, "num_input_tokens_seen": 348417120, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.3515625, "step": 16143, "time_per_iteration": 2.409198045730591 }, { "auxiliary_loss_clip": 0.01052264, "auxiliary_loss_mlp": 0.01038981, "balance_loss_clip": 1.01633096, "balance_loss_mlp": 1.01652098, "epoch": 0.970629791071697, "flos": 25847457914880.0, "grad_norm": 2.255471583039371, "language_loss": 0.69186002, "learning_rate": 9.023650675347382e-09, "loss": 0.71277249, "num_input_tokens_seen": 348437750, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35742188, "step": 16144, "time_per_iteration": 2.4054372310638428 }, { "auxiliary_loss_clip": 0.01049663, "auxiliary_loss_mlp": 0.01040141, "balance_loss_clip": 1.01863611, "balance_loss_mlp": 1.01549971, "epoch": 0.9706899143243649, "flos": 36538666116480.0, "grad_norm": 1.6402509363437472, "language_loss": 0.72733116, "learning_rate": 8.986734084339253e-09, "loss": 0.74822915, "num_input_tokens_seen": 348460935, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34179688, "step": 16145, "time_per_iteration": 2.517343282699585 }, { "auxiliary_loss_clip": 0.01052384, "auxiliary_loss_mlp": 0.01040877, "balance_loss_clip": 1.01775038, "balance_loss_mlp": 1.01561809, "epoch": 0.9707500375770329, "flos": 12267641088000.0, "grad_norm": 2.873061359689676, "language_loss": 0.80198228, "learning_rate": 8.949892992753395e-09, "loss": 0.8229149, "num_input_tokens_seen": 348474480, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.3671875, "step": 16146, "time_per_iteration": 2.3292410373687744 }, { "auxiliary_loss_clip": 0.0100665, "auxiliary_loss_mlp": 0.01002509, "balance_loss_clip": 1.00057745, "balance_loss_mlp": 1.00038028, "epoch": 0.9708101608297008, "flos": 60849455184000.0, "grad_norm": 0.7652088937700031, "language_loss": 0.54750621, "learning_rate": 8.91312740198713e-09, "loss": 0.56759787, "num_input_tokens_seen": 348541220, "router_z_loss_clip": 0.01928711, "router_z_loss_mlp": 0.0625, "step": 16147, "time_per_iteration": 3.0869803428649902 }, { "auxiliary_loss_clip": 0.01052763, "auxiliary_loss_mlp": 0.01038834, "balance_loss_clip": 1.01518285, "balance_loss_mlp": 1.01580977, "epoch": 0.9708702840823689, "flos": 27123568352640.0, "grad_norm": 2.4046142711012086, "language_loss": 0.61785817, "learning_rate": 8.876437313434682e-09, "loss": 0.63877416, "num_input_tokens_seen": 348559230, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36914062, "step": 16148, "time_per_iteration": 2.4354472160339355 }, { "auxiliary_loss_clip": 0.0104984, "auxiliary_loss_mlp": 0.01036016, "balance_loss_clip": 1.01370049, "balance_loss_mlp": 1.01576948, "epoch": 0.9709304073350368, "flos": 20776637669760.0, "grad_norm": 1.661857388144361, "language_loss": 0.75210702, "learning_rate": 8.839822728487155e-09, "loss": 0.77296561, "num_input_tokens_seen": 348577850, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.33984375, "step": 16149, "time_per_iteration": 2.407698154449463 }, { "auxiliary_loss_clip": 0.01051635, "auxiliary_loss_mlp": 0.01038689, "balance_loss_clip": 1.01475215, "balance_loss_mlp": 1.01536775, "epoch": 0.9709905305877048, "flos": 41933026730880.0, "grad_norm": 2.411848504536087, "language_loss": 0.76338774, "learning_rate": 8.803283648533222e-09, "loss": 0.78429091, "num_input_tokens_seen": 348598345, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36328125, "step": 16150, "time_per_iteration": 2.5546629428863525 }, { "auxiliary_loss_clip": 0.01057406, "auxiliary_loss_mlp": 0.0104263, "balance_loss_clip": 1.01530743, "balance_loss_mlp": 1.01875305, "epoch": 0.9710506538403728, "flos": 17164628340480.0, "grad_norm": 2.2277580575847207, "language_loss": 0.75050211, "learning_rate": 8.766820074958214e-09, "loss": 0.77150249, "num_input_tokens_seen": 348616300, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.38671875, "step": 16151, "time_per_iteration": 2.380120038986206 }, { "auxiliary_loss_clip": 0.0105115, "auxiliary_loss_mlp": 0.01035478, "balance_loss_clip": 1.01233971, "balance_loss_mlp": 1.01672053, "epoch": 0.9711107770930407, "flos": 21172553020800.0, "grad_norm": 2.3161722621981995, "language_loss": 0.75693154, "learning_rate": 8.730432009145027e-09, "loss": 0.77779782, "num_input_tokens_seen": 348633845, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.34375, "step": 16152, "time_per_iteration": 2.365614652633667 }, { "auxiliary_loss_clip": 0.01051112, "auxiliary_loss_mlp": 0.01035604, "balance_loss_clip": 1.01197624, "balance_loss_mlp": 1.01573753, "epoch": 0.9711709003457087, "flos": 22235972256000.0, "grad_norm": 2.3392128573934445, "language_loss": 0.68480581, "learning_rate": 8.694119452473448e-09, "loss": 0.70567298, "num_input_tokens_seen": 348653070, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35546875, "step": 16153, "time_per_iteration": 3.717405080795288 }, { "auxiliary_loss_clip": 0.01050878, "auxiliary_loss_mlp": 0.01032991, "balance_loss_clip": 1.0101862, "balance_loss_mlp": 1.01589191, "epoch": 0.9712310235983767, "flos": 26212649402880.0, "grad_norm": 1.6547607325200764, "language_loss": 0.71603197, "learning_rate": 8.65788240632037e-09, "loss": 0.73687065, "num_input_tokens_seen": 348672145, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34960938, "step": 16154, "time_per_iteration": 2.407580614089966 }, { "auxiliary_loss_clip": 0.01054342, "auxiliary_loss_mlp": 0.01042046, "balance_loss_clip": 1.01586747, "balance_loss_mlp": 1.01810873, "epoch": 0.9712911468510447, "flos": 20667115134720.0, "grad_norm": 2.447867004065475, "language_loss": 0.81967646, "learning_rate": 8.621720872059812e-09, "loss": 0.84064037, "num_input_tokens_seen": 348690615, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.36328125, "step": 16155, "time_per_iteration": 2.3718838691711426 }, { "auxiliary_loss_clip": 0.01052553, "auxiliary_loss_mlp": 0.01040113, "balance_loss_clip": 1.01480484, "balance_loss_mlp": 1.01614833, "epoch": 0.9713512701037126, "flos": 13552060429440.0, "grad_norm": 2.118557384170636, "language_loss": 0.69541401, "learning_rate": 8.58563485106334e-09, "loss": 0.71634066, "num_input_tokens_seen": 348708665, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36328125, "step": 16156, "time_per_iteration": 2.3299098014831543 }, { "auxiliary_loss_clip": 0.0105233, "auxiliary_loss_mlp": 0.0103358, "balance_loss_clip": 1.01011992, "balance_loss_mlp": 1.01493609, "epoch": 0.9714113933563806, "flos": 25847562648960.0, "grad_norm": 2.9007005351514095, "language_loss": 0.93096399, "learning_rate": 8.54962434469919e-09, "loss": 0.95182312, "num_input_tokens_seen": 348726105, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.375, "step": 16157, "time_per_iteration": 2.3956804275512695 }, { "auxiliary_loss_clip": 0.01053478, "auxiliary_loss_mlp": 0.01036439, "balance_loss_clip": 1.01442099, "balance_loss_mlp": 1.01664543, "epoch": 0.9714715166090485, "flos": 12742459845120.0, "grad_norm": 1.862485926334487, "language_loss": 0.7415154, "learning_rate": 8.513689354332721e-09, "loss": 0.76241463, "num_input_tokens_seen": 348743360, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3671875, "step": 16158, "time_per_iteration": 2.34171462059021 }, { "auxiliary_loss_clip": 0.01051277, "auxiliary_loss_mlp": 0.01040061, "balance_loss_clip": 1.0177567, "balance_loss_mlp": 1.01665223, "epoch": 0.9715316398617165, "flos": 18404568743040.0, "grad_norm": 2.2325049220735442, "language_loss": 0.61718512, "learning_rate": 8.477829881326836e-09, "loss": 0.63809848, "num_input_tokens_seen": 348759045, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34765625, "step": 16159, "time_per_iteration": 2.3459653854370117 }, { "auxiliary_loss_clip": 0.01048611, "auxiliary_loss_mlp": 0.010334, "balance_loss_clip": 1.01356387, "balance_loss_mlp": 1.01565135, "epoch": 0.9715917631143844, "flos": 28912517884800.0, "grad_norm": 1.648462906946207, "language_loss": 0.79768419, "learning_rate": 8.44204592704112e-09, "loss": 0.81850433, "num_input_tokens_seen": 348779910, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.33007812, "step": 16160, "time_per_iteration": 2.424766778945923 }, { "auxiliary_loss_clip": 0.01006963, "auxiliary_loss_mlp": 0.0100411, "balance_loss_clip": 1.00228584, "balance_loss_mlp": 1.00081182, "epoch": 0.9716518863670525, "flos": 65936580900480.0, "grad_norm": 0.7798448724162302, "language_loss": 0.54399252, "learning_rate": 8.406337492832704e-09, "loss": 0.56410325, "num_input_tokens_seen": 348838995, "router_z_loss_clip": 0.01818848, "router_z_loss_mlp": 0.06152344, "step": 16161, "time_per_iteration": 4.519224405288696 }, { "auxiliary_loss_clip": 0.01049667, "auxiliary_loss_mlp": 0.01037202, "balance_loss_clip": 1.01564908, "balance_loss_mlp": 1.01598775, "epoch": 0.9717120096197204, "flos": 17711787168000.0, "grad_norm": 1.6877084677405325, "language_loss": 0.72703922, "learning_rate": 8.3707045800554e-09, "loss": 0.74790794, "num_input_tokens_seen": 348858090, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.3359375, "step": 16162, "time_per_iteration": 2.3794913291931152 }, { "auxiliary_loss_clip": 0.01049512, "auxiliary_loss_mlp": 0.01038097, "balance_loss_clip": 1.01356411, "balance_loss_mlp": 1.01546884, "epoch": 0.9717721328723884, "flos": 24462243878400.0, "grad_norm": 1.6324985568051364, "language_loss": 0.7980094, "learning_rate": 8.335147190060787e-09, "loss": 0.81888545, "num_input_tokens_seen": 348877885, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.33984375, "step": 16163, "time_per_iteration": 2.4006128311157227 }, { "auxiliary_loss_clip": 0.01051838, "auxiliary_loss_mlp": 0.01035097, "balance_loss_clip": 1.01164818, "balance_loss_mlp": 1.01653624, "epoch": 0.9718322561250564, "flos": 20775450683520.0, "grad_norm": 1.6341594270217512, "language_loss": 0.73977244, "learning_rate": 8.299665324196903e-09, "loss": 0.76064175, "num_input_tokens_seen": 348897720, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35351562, "step": 16164, "time_per_iteration": 3.7066211700439453 }, { "auxiliary_loss_clip": 0.01051238, "auxiliary_loss_mlp": 0.01042725, "balance_loss_clip": 1.0185492, "balance_loss_mlp": 1.01574624, "epoch": 0.9718923793777243, "flos": 19024905513600.0, "grad_norm": 2.148674946162132, "language_loss": 0.84720397, "learning_rate": 8.264258983809114e-09, "loss": 0.86814356, "num_input_tokens_seen": 348915410, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.35546875, "step": 16165, "time_per_iteration": 2.3458411693573 }, { "auxiliary_loss_clip": 0.01050212, "auxiliary_loss_mlp": 0.01033308, "balance_loss_clip": 1.01334023, "balance_loss_mlp": 1.0156908, "epoch": 0.9719525026303923, "flos": 21870641122560.0, "grad_norm": 1.4822582080081317, "language_loss": 0.80025721, "learning_rate": 8.228928170240345e-09, "loss": 0.82109243, "num_input_tokens_seen": 348934335, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.34570312, "step": 16166, "time_per_iteration": 2.374640941619873 }, { "auxiliary_loss_clip": 0.01050489, "auxiliary_loss_mlp": 0.01037047, "balance_loss_clip": 1.01623273, "balance_loss_mlp": 1.01558137, "epoch": 0.9720126258830603, "flos": 14428415266560.0, "grad_norm": 1.8876310531438771, "language_loss": 0.72100997, "learning_rate": 8.193672884830195e-09, "loss": 0.7418853, "num_input_tokens_seen": 348952405, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.34765625, "step": 16167, "time_per_iteration": 2.3485209941864014 }, { "auxiliary_loss_clip": 0.01052018, "auxiliary_loss_mlp": 0.01037737, "balance_loss_clip": 1.01563537, "balance_loss_mlp": 1.01711226, "epoch": 0.9720727491357283, "flos": 26249936601600.0, "grad_norm": 1.7512259462599866, "language_loss": 0.76867938, "learning_rate": 8.158493128915812e-09, "loss": 0.78957689, "num_input_tokens_seen": 348973580, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34765625, "step": 16168, "time_per_iteration": 2.4379918575286865 }, { "auxiliary_loss_clip": 0.0105273, "auxiliary_loss_mlp": 0.01041721, "balance_loss_clip": 1.01749754, "balance_loss_mlp": 1.01621985, "epoch": 0.9721328723883962, "flos": 22673015055360.0, "grad_norm": 2.2990461689838972, "language_loss": 0.74745542, "learning_rate": 8.123388903830797e-09, "loss": 0.76839995, "num_input_tokens_seen": 348992035, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36523438, "step": 16169, "time_per_iteration": 2.3691699504852295 }, { "auxiliary_loss_clip": 0.01053755, "auxiliary_loss_mlp": 0.0103669, "balance_loss_clip": 1.01221657, "balance_loss_mlp": 1.01664925, "epoch": 0.9721929956410642, "flos": 28072926576000.0, "grad_norm": 1.8534871323007815, "language_loss": 0.58696675, "learning_rate": 8.088360210906309e-09, "loss": 0.60787117, "num_input_tokens_seen": 349013160, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.37109375, "step": 16170, "time_per_iteration": 2.441551446914673 }, { "auxiliary_loss_clip": 0.01052144, "auxiliary_loss_mlp": 0.01040698, "balance_loss_clip": 1.01617682, "balance_loss_mlp": 1.01578546, "epoch": 0.9722531188937321, "flos": 20995193980800.0, "grad_norm": 1.800828649495178, "language_loss": 0.72098637, "learning_rate": 8.053407051471062e-09, "loss": 0.74191481, "num_input_tokens_seen": 349033485, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.36328125, "step": 16171, "time_per_iteration": 2.3832414150238037 }, { "auxiliary_loss_clip": 0.01051383, "auxiliary_loss_mlp": 0.01042894, "balance_loss_clip": 1.01857519, "balance_loss_mlp": 1.0159725, "epoch": 0.9723132421464001, "flos": 16069682280960.0, "grad_norm": 1.8064136121888357, "language_loss": 0.69329232, "learning_rate": 8.018529426850218e-09, "loss": 0.71423507, "num_input_tokens_seen": 349051705, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.35546875, "step": 16172, "time_per_iteration": 2.358764410018921 }, { "auxiliary_loss_clip": 0.01049608, "auxiliary_loss_mlp": 0.01032061, "balance_loss_clip": 1.01082993, "balance_loss_mlp": 1.01530886, "epoch": 0.972373365399068, "flos": 27744533527680.0, "grad_norm": 1.714706018047328, "language_loss": 0.86992097, "learning_rate": 7.983727338366274e-09, "loss": 0.89073765, "num_input_tokens_seen": 349070825, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34375, "step": 16173, "time_per_iteration": 2.410062074661255 }, { "auxiliary_loss_clip": 0.01054753, "auxiliary_loss_mlp": 0.0104089, "balance_loss_clip": 1.01547432, "balance_loss_mlp": 1.01615453, "epoch": 0.9724334886517361, "flos": 23001827040000.0, "grad_norm": 2.6187017012202167, "language_loss": 0.65290916, "learning_rate": 7.949000787339289e-09, "loss": 0.67386556, "num_input_tokens_seen": 349089730, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.38671875, "step": 16174, "time_per_iteration": 2.453516960144043 }, { "auxiliary_loss_clip": 0.01049616, "auxiliary_loss_mlp": 0.01035393, "balance_loss_clip": 1.01269555, "balance_loss_mlp": 1.01566195, "epoch": 0.972493611904404, "flos": 25445886923520.0, "grad_norm": 1.4986065621822637, "language_loss": 0.79103863, "learning_rate": 7.914349775085538e-09, "loss": 0.81188869, "num_input_tokens_seen": 349111315, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.33984375, "step": 16175, "time_per_iteration": 2.428025484085083 }, { "auxiliary_loss_clip": 0.01051329, "auxiliary_loss_mlp": 0.01038362, "balance_loss_clip": 1.01456785, "balance_loss_mlp": 1.01612461, "epoch": 0.972553735157072, "flos": 16982102419200.0, "grad_norm": 3.6715683919531346, "language_loss": 0.58757693, "learning_rate": 7.879774302919307e-09, "loss": 0.60847384, "num_input_tokens_seen": 349129495, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.3515625, "step": 16176, "time_per_iteration": 3.7630529403686523 }, { "auxiliary_loss_clip": 0.01052519, "auxiliary_loss_mlp": 0.01036288, "balance_loss_clip": 1.01329219, "balance_loss_mlp": 1.01663339, "epoch": 0.97261385840974, "flos": 26103441070080.0, "grad_norm": 2.6996245155512097, "language_loss": 0.73221099, "learning_rate": 7.845274372151545e-09, "loss": 0.75309908, "num_input_tokens_seen": 349148850, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.359375, "step": 16177, "time_per_iteration": 2.4367587566375732 }, { "auxiliary_loss_clip": 0.01051208, "auxiliary_loss_mlp": 0.01039981, "balance_loss_clip": 1.0172236, "balance_loss_mlp": 1.01535988, "epoch": 0.9726739816624079, "flos": 25446689884800.0, "grad_norm": 1.723086761001516, "language_loss": 0.69110298, "learning_rate": 7.810849984090984e-09, "loss": 0.71201485, "num_input_tokens_seen": 349167620, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35742188, "step": 16178, "time_per_iteration": 2.395634174346924 }, { "auxiliary_loss_clip": 0.01050845, "auxiliary_loss_mlp": 0.01039445, "balance_loss_clip": 1.01504242, "balance_loss_mlp": 1.01483214, "epoch": 0.972734104915076, "flos": 29013871161600.0, "grad_norm": 1.962119092450322, "language_loss": 0.68698204, "learning_rate": 7.776501140042358e-09, "loss": 0.70788491, "num_input_tokens_seen": 349185845, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.359375, "step": 16179, "time_per_iteration": 2.4130783081054688 }, { "auxiliary_loss_clip": 0.0105059, "auxiliary_loss_mlp": 0.01035704, "balance_loss_clip": 1.01342416, "balance_loss_mlp": 1.01625824, "epoch": 0.9727942281677439, "flos": 23436600600960.0, "grad_norm": 2.346562990958224, "language_loss": 0.78120065, "learning_rate": 7.742227841308624e-09, "loss": 0.80206358, "num_input_tokens_seen": 349204525, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34375, "step": 16180, "time_per_iteration": 2.3911776542663574 }, { "auxiliary_loss_clip": 0.01052961, "auxiliary_loss_mlp": 0.01034659, "balance_loss_clip": 1.00989962, "balance_loss_mlp": 1.01569223, "epoch": 0.9728543514204119, "flos": 31723200622080.0, "grad_norm": 1.8934816540408783, "language_loss": 0.77390093, "learning_rate": 7.708030089189188e-09, "loss": 0.79477715, "num_input_tokens_seen": 349228075, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37109375, "step": 16181, "time_per_iteration": 2.4553840160369873 }, { "auxiliary_loss_clip": 0.01050393, "auxiliary_loss_mlp": 0.01036164, "balance_loss_clip": 1.01477814, "balance_loss_mlp": 1.01540804, "epoch": 0.9729144746730798, "flos": 16288448060160.0, "grad_norm": 1.4529704504412109, "language_loss": 0.64404643, "learning_rate": 7.67390788498079e-09, "loss": 0.66491199, "num_input_tokens_seen": 349246990, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34960938, "step": 16182, "time_per_iteration": 2.3624930381774902 }, { "auxiliary_loss_clip": 0.01054228, "auxiliary_loss_mlp": 0.01039121, "balance_loss_clip": 1.0148977, "balance_loss_mlp": 1.01749313, "epoch": 0.9729745979257478, "flos": 25040370948480.0, "grad_norm": 1.851159641089124, "language_loss": 0.64062685, "learning_rate": 7.639861229977507e-09, "loss": 0.6615603, "num_input_tokens_seen": 349265890, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3671875, "step": 16183, "time_per_iteration": 2.3939785957336426 }, { "auxiliary_loss_clip": 0.01049728, "auxiliary_loss_mlp": 0.01038514, "balance_loss_clip": 1.01623392, "balance_loss_mlp": 1.01549077, "epoch": 0.9730347211784157, "flos": 22637613070080.0, "grad_norm": 1.6074545949767371, "language_loss": 0.7949146, "learning_rate": 7.605890125470527e-09, "loss": 0.81579709, "num_input_tokens_seen": 349285275, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34375, "step": 16184, "time_per_iteration": 2.3971481323242188 }, { "auxiliary_loss_clip": 0.01050436, "auxiliary_loss_mlp": 0.01035425, "balance_loss_clip": 1.01428974, "balance_loss_mlp": 1.01566172, "epoch": 0.9730948444310837, "flos": 10997849606400.0, "grad_norm": 2.9480884572587502, "language_loss": 0.80870569, "learning_rate": 7.571994572747709e-09, "loss": 0.82956433, "num_input_tokens_seen": 349301515, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34765625, "step": 16185, "time_per_iteration": 2.3304996490478516 }, { "auxiliary_loss_clip": 0.01051949, "auxiliary_loss_mlp": 0.0103812, "balance_loss_clip": 1.01501679, "balance_loss_mlp": 1.01589656, "epoch": 0.9731549676837516, "flos": 16798564068480.0, "grad_norm": 1.7432419504165366, "language_loss": 0.79116321, "learning_rate": 7.538174573094469e-09, "loss": 0.81206387, "num_input_tokens_seen": 349319590, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.36132812, "step": 16186, "time_per_iteration": 2.356036901473999 }, { "auxiliary_loss_clip": 0.01049042, "auxiliary_loss_mlp": 0.01037139, "balance_loss_clip": 1.01493073, "balance_loss_mlp": 1.01494479, "epoch": 0.9732150909364197, "flos": 21140711994240.0, "grad_norm": 1.4758802842171408, "language_loss": 0.65937459, "learning_rate": 7.504430127793337e-09, "loss": 0.68023646, "num_input_tokens_seen": 349339230, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.33984375, "step": 16187, "time_per_iteration": 2.399038791656494 }, { "auxiliary_loss_clip": 0.01049723, "auxiliary_loss_mlp": 0.01036391, "balance_loss_clip": 1.01319325, "balance_loss_mlp": 1.01494956, "epoch": 0.9732752141890876, "flos": 33725993431680.0, "grad_norm": 2.1063996919487904, "language_loss": 0.81314504, "learning_rate": 7.47076123812418e-09, "loss": 0.83400619, "num_input_tokens_seen": 349361155, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.34765625, "step": 16188, "time_per_iteration": 2.4891457557678223 }, { "auxiliary_loss_clip": 0.01049726, "auxiliary_loss_mlp": 0.0103704, "balance_loss_clip": 1.01704848, "balance_loss_mlp": 1.01590788, "epoch": 0.9733353374417556, "flos": 23403258385920.0, "grad_norm": 2.1914613033401396, "language_loss": 0.79027653, "learning_rate": 7.437167905363084e-09, "loss": 0.81114417, "num_input_tokens_seen": 349379335, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.33789062, "step": 16189, "time_per_iteration": 2.3915352821350098 }, { "auxiliary_loss_clip": 0.0105038, "auxiliary_loss_mlp": 0.01033306, "balance_loss_clip": 1.01064456, "balance_loss_mlp": 1.01558042, "epoch": 0.9733954606944236, "flos": 39164239491840.0, "grad_norm": 1.9151820174112748, "language_loss": 0.52202719, "learning_rate": 7.403650130784367e-09, "loss": 0.54286408, "num_input_tokens_seen": 349401575, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34765625, "step": 16190, "time_per_iteration": 2.5662381649017334 }, { "auxiliary_loss_clip": 0.01050982, "auxiliary_loss_mlp": 0.01035509, "balance_loss_clip": 1.01185834, "balance_loss_mlp": 1.01588714, "epoch": 0.9734555839470915, "flos": 21980722239360.0, "grad_norm": 2.2680067010341, "language_loss": 0.81707144, "learning_rate": 7.3702079156590105e-09, "loss": 0.83793634, "num_input_tokens_seen": 349420650, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.3515625, "step": 16191, "time_per_iteration": 2.369935989379883 }, { "auxiliary_loss_clip": 0.01050095, "auxiliary_loss_mlp": 0.01036756, "balance_loss_clip": 1.01482165, "balance_loss_mlp": 1.01559186, "epoch": 0.9735157071997596, "flos": 16574771053440.0, "grad_norm": 1.677298901386076, "language_loss": 0.83594191, "learning_rate": 7.336841261255111e-09, "loss": 0.85681045, "num_input_tokens_seen": 349436830, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34570312, "step": 16192, "time_per_iteration": 2.3512990474700928 }, { "auxiliary_loss_clip": 0.01052017, "auxiliary_loss_mlp": 0.01032972, "balance_loss_clip": 1.01157403, "balance_loss_mlp": 1.01663947, "epoch": 0.9735758304524275, "flos": 20222391836160.0, "grad_norm": 1.8771505551021115, "language_loss": 0.75771618, "learning_rate": 7.303550168837658e-09, "loss": 0.778566, "num_input_tokens_seen": 349454325, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.35546875, "step": 16193, "time_per_iteration": 3.60482120513916 }, { "auxiliary_loss_clip": 0.0104972, "auxiliary_loss_mlp": 0.0103741, "balance_loss_clip": 1.01677442, "balance_loss_mlp": 1.01553476, "epoch": 0.9736359537050955, "flos": 23652398914560.0, "grad_norm": 1.6325566201344517, "language_loss": 0.85763252, "learning_rate": 7.270334639669417e-09, "loss": 0.8785038, "num_input_tokens_seen": 349470230, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.34179688, "step": 16194, "time_per_iteration": 2.395846366882324 }, { "auxiliary_loss_clip": 0.01050753, "auxiliary_loss_mlp": 0.01034901, "balance_loss_clip": 1.01189423, "balance_loss_mlp": 1.01620603, "epoch": 0.9736960769577634, "flos": 15559286981760.0, "grad_norm": 1.5618432861107077, "language_loss": 0.76734579, "learning_rate": 7.237194675009828e-09, "loss": 0.7882024, "num_input_tokens_seen": 349486250, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.34570312, "step": 16195, "time_per_iteration": 2.3392210006713867 }, { "auxiliary_loss_clip": 0.01007502, "auxiliary_loss_mlp": 0.01002392, "balance_loss_clip": 1.00031805, "balance_loss_mlp": 1.00105977, "epoch": 0.9737562002104314, "flos": 65348783383680.0, "grad_norm": 0.7105143616565754, "language_loss": 0.52495837, "learning_rate": 7.204130276115439e-09, "loss": 0.5450573, "num_input_tokens_seen": 349545865, "router_z_loss_clip": 0.02075195, "router_z_loss_mlp": 0.06445312, "step": 16196, "time_per_iteration": 2.983163833618164 }, { "auxiliary_loss_clip": 0.01049361, "auxiliary_loss_mlp": 0.01036359, "balance_loss_clip": 1.01398373, "balance_loss_mlp": 1.01480007, "epoch": 0.9738163234630993, "flos": 27195559309440.0, "grad_norm": 1.8840060786221002, "language_loss": 0.7685082, "learning_rate": 7.171141444240136e-09, "loss": 0.78936541, "num_input_tokens_seen": 349566080, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34375, "step": 16197, "time_per_iteration": 2.4125523567199707 }, { "auxiliary_loss_clip": 0.01054312, "auxiliary_loss_mlp": 0.01035453, "balance_loss_clip": 1.01158714, "balance_loss_mlp": 1.01704621, "epoch": 0.9738764467157673, "flos": 21068302101120.0, "grad_norm": 2.075231776285573, "language_loss": 0.68784046, "learning_rate": 7.13822818063492e-09, "loss": 0.70873809, "num_input_tokens_seen": 349585665, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37304688, "step": 16198, "time_per_iteration": 2.3904638290405273 }, { "auxiliary_loss_clip": 0.01052549, "auxiliary_loss_mlp": 0.01035854, "balance_loss_clip": 1.01033163, "balance_loss_mlp": 1.01581955, "epoch": 0.9739365699684353, "flos": 21360141089280.0, "grad_norm": 1.8872430358537475, "language_loss": 0.79702455, "learning_rate": 7.10539048654768e-09, "loss": 0.81790859, "num_input_tokens_seen": 349605125, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.3671875, "step": 16199, "time_per_iteration": 2.459584951400757 }, { "auxiliary_loss_clip": 0.01051322, "auxiliary_loss_mlp": 0.01033792, "balance_loss_clip": 1.01042724, "balance_loss_mlp": 1.01571774, "epoch": 0.9739966932211033, "flos": 21901155517440.0, "grad_norm": 1.8227214887819847, "language_loss": 0.80409551, "learning_rate": 7.072628363223865e-09, "loss": 0.82494664, "num_input_tokens_seen": 349623360, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.35546875, "step": 16200, "time_per_iteration": 2.376957893371582 }, { "auxiliary_loss_clip": 0.01054686, "auxiliary_loss_mlp": 0.01041181, "balance_loss_clip": 1.01557481, "balance_loss_mlp": 1.01652122, "epoch": 0.9740568164737712, "flos": 24826178557440.0, "grad_norm": 2.333146056883385, "language_loss": 0.70364141, "learning_rate": 7.039941811905592e-09, "loss": 0.72460002, "num_input_tokens_seen": 349644390, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38085938, "step": 16201, "time_per_iteration": 3.872481107711792 }, { "auxiliary_loss_clip": 0.01052471, "auxiliary_loss_mlp": 0.01037851, "balance_loss_clip": 1.0143311, "balance_loss_mlp": 1.01665306, "epoch": 0.9741169397264392, "flos": 23622443101440.0, "grad_norm": 1.583599394866779, "language_loss": 0.73521233, "learning_rate": 7.0073308338325364e-09, "loss": 0.75611556, "num_input_tokens_seen": 349663200, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.359375, "step": 16202, "time_per_iteration": 2.393620252609253 }, { "auxiliary_loss_clip": 0.01052723, "auxiliary_loss_mlp": 0.01034671, "balance_loss_clip": 1.01227129, "balance_loss_mlp": 1.01632154, "epoch": 0.9741770629791072, "flos": 18840215088000.0, "grad_norm": 1.7389256299574858, "language_loss": 0.74189705, "learning_rate": 6.974795430241265e-09, "loss": 0.76277101, "num_input_tokens_seen": 349681975, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.36328125, "step": 16203, "time_per_iteration": 2.380190849304199 }, { "auxiliary_loss_clip": 0.01051239, "auxiliary_loss_mlp": 0.01043064, "balance_loss_clip": 1.01833963, "balance_loss_mlp": 1.01525939, "epoch": 0.9742371862317751, "flos": 22345145677440.0, "grad_norm": 1.8863522968256838, "language_loss": 0.78701407, "learning_rate": 6.942335602365235e-09, "loss": 0.80795717, "num_input_tokens_seen": 349701185, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.359375, "step": 16204, "time_per_iteration": 3.6815130710601807 }, { "auxiliary_loss_clip": 0.01053232, "auxiliary_loss_mlp": 0.01042479, "balance_loss_clip": 1.01810038, "balance_loss_mlp": 1.0164305, "epoch": 0.9742973094844432, "flos": 21761677169280.0, "grad_norm": 2.387298720403749, "language_loss": 0.81632292, "learning_rate": 6.909951351435905e-09, "loss": 0.83728004, "num_input_tokens_seen": 349720360, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.3671875, "step": 16205, "time_per_iteration": 2.4056804180145264 }, { "auxiliary_loss_clip": 0.01051655, "auxiliary_loss_mlp": 0.010341, "balance_loss_clip": 1.01247501, "balance_loss_mlp": 1.016078, "epoch": 0.9743574327371111, "flos": 26247562629120.0, "grad_norm": 1.7005137301601418, "language_loss": 0.74873126, "learning_rate": 6.87764267868074e-09, "loss": 0.76958883, "num_input_tokens_seen": 349741040, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.35546875, "step": 16206, "time_per_iteration": 2.404078483581543 }, { "auxiliary_loss_clip": 0.01051649, "auxiliary_loss_mlp": 0.01037636, "balance_loss_clip": 1.01337683, "balance_loss_mlp": 1.01527238, "epoch": 0.9744175559897791, "flos": 12348499530240.0, "grad_norm": 2.3184214631889537, "language_loss": 0.85256523, "learning_rate": 6.8454095853252015e-09, "loss": 0.87345803, "num_input_tokens_seen": 349758895, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36328125, "step": 16207, "time_per_iteration": 2.356764316558838 }, { "auxiliary_loss_clip": 0.01050808, "auxiliary_loss_mlp": 0.01039995, "balance_loss_clip": 1.01808453, "balance_loss_mlp": 1.01601696, "epoch": 0.974477679242447, "flos": 28396711324800.0, "grad_norm": 1.5947055134547456, "language_loss": 0.71413767, "learning_rate": 6.813252072591425e-09, "loss": 0.73504567, "num_input_tokens_seen": 349779740, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34765625, "step": 16208, "time_per_iteration": 2.4173760414123535 }, { "auxiliary_loss_clip": 0.0104725, "auxiliary_loss_mlp": 0.01029336, "balance_loss_clip": 1.00980973, "balance_loss_mlp": 1.01486027, "epoch": 0.974537802495115, "flos": 17784406529280.0, "grad_norm": 1.7918250435749647, "language_loss": 0.77899724, "learning_rate": 6.781170141698878e-09, "loss": 0.79976308, "num_input_tokens_seen": 349796820, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.32421875, "step": 16209, "time_per_iteration": 2.3587779998779297 }, { "auxiliary_loss_clip": 0.01051231, "auxiliary_loss_mlp": 0.01034037, "balance_loss_clip": 1.01182866, "balance_loss_mlp": 1.01537454, "epoch": 0.9745979257477829, "flos": 23841173969280.0, "grad_norm": 2.0289641531249427, "language_loss": 0.80138087, "learning_rate": 6.749163793864144e-09, "loss": 0.82223356, "num_input_tokens_seen": 349816550, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.359375, "step": 16210, "time_per_iteration": 2.384678602218628 }, { "auxiliary_loss_clip": 0.01051502, "auxiliary_loss_mlp": 0.01038484, "balance_loss_clip": 1.01617992, "balance_loss_mlp": 1.01635242, "epoch": 0.9746580490004509, "flos": 27014953512960.0, "grad_norm": 4.570296830381062, "language_loss": 0.79392266, "learning_rate": 6.7172330303009176e-09, "loss": 0.81482255, "num_input_tokens_seen": 349834350, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3515625, "step": 16211, "time_per_iteration": 2.4344282150268555 }, { "auxiliary_loss_clip": 0.01054071, "auxiliary_loss_mlp": 0.01037257, "balance_loss_clip": 1.0129981, "balance_loss_mlp": 1.01661289, "epoch": 0.9747181722531189, "flos": 19791004677120.0, "grad_norm": 3.4666580672594107, "language_loss": 0.79683435, "learning_rate": 6.685377852219787e-09, "loss": 0.81774771, "num_input_tokens_seen": 349853460, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.375, "step": 16212, "time_per_iteration": 2.360055923461914 }, { "auxiliary_loss_clip": 0.01049617, "auxiliary_loss_mlp": 0.01038779, "balance_loss_clip": 1.01813173, "balance_loss_mlp": 1.01496208, "epoch": 0.9747782955057869, "flos": 31429266952320.0, "grad_norm": 1.5527170834139825, "language_loss": 0.81620514, "learning_rate": 6.653598260829118e-09, "loss": 0.83708912, "num_input_tokens_seen": 349874830, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.34570312, "step": 16213, "time_per_iteration": 2.446025848388672 }, { "auxiliary_loss_clip": 0.01051541, "auxiliary_loss_mlp": 0.01034552, "balance_loss_clip": 1.01186705, "balance_loss_mlp": 1.01567721, "epoch": 0.9748384187584548, "flos": 15960369214080.0, "grad_norm": 1.7876817766041477, "language_loss": 0.68034029, "learning_rate": 6.6218942573335044e-09, "loss": 0.7012012, "num_input_tokens_seen": 349893690, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.359375, "step": 16214, "time_per_iteration": 2.347456216812134 }, { "auxiliary_loss_clip": 0.01053293, "auxiliary_loss_mlp": 0.01035152, "balance_loss_clip": 1.01040387, "balance_loss_mlp": 1.0170877, "epoch": 0.9748985420111228, "flos": 20557662422400.0, "grad_norm": 2.783331139475341, "language_loss": 0.75472248, "learning_rate": 6.5902658429355386e-09, "loss": 0.77560699, "num_input_tokens_seen": 349912480, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36328125, "step": 16215, "time_per_iteration": 3.827699661254883 }, { "auxiliary_loss_clip": 0.0105044, "auxiliary_loss_mlp": 0.01034641, "balance_loss_clip": 1.01281404, "balance_loss_mlp": 1.01485527, "epoch": 0.9749586652637908, "flos": 36720109785600.0, "grad_norm": 1.7628413465880899, "language_loss": 0.68541753, "learning_rate": 6.558713018834483e-09, "loss": 0.70626831, "num_input_tokens_seen": 349932470, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.35546875, "step": 16216, "time_per_iteration": 2.4815425872802734 }, { "auxiliary_loss_clip": 0.01054112, "auxiliary_loss_mlp": 0.01036027, "balance_loss_clip": 1.01193523, "balance_loss_mlp": 1.0172174, "epoch": 0.9750187885164587, "flos": 10998443099520.0, "grad_norm": 1.8190230855563996, "language_loss": 0.73264134, "learning_rate": 6.527235786226937e-09, "loss": 0.75354266, "num_input_tokens_seen": 349949060, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36914062, "step": 16217, "time_per_iteration": 2.3734238147735596 }, { "auxiliary_loss_clip": 0.01051189, "auxiliary_loss_mlp": 0.01035153, "balance_loss_clip": 1.01170433, "balance_loss_mlp": 1.01558006, "epoch": 0.9750789117691268, "flos": 25738319404800.0, "grad_norm": 1.549035784580979, "language_loss": 0.7936213, "learning_rate": 6.495834146306167e-09, "loss": 0.81448478, "num_input_tokens_seen": 349968010, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35546875, "step": 16218, "time_per_iteration": 2.3923707008361816 }, { "auxiliary_loss_clip": 0.01050836, "auxiliary_loss_mlp": 0.01034389, "balance_loss_clip": 1.01284814, "balance_loss_mlp": 1.01591182, "epoch": 0.9751390350217947, "flos": 13333120093440.0, "grad_norm": 2.0019568554250786, "language_loss": 0.78253645, "learning_rate": 6.464508100263222e-09, "loss": 0.80338871, "num_input_tokens_seen": 349985270, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34960938, "step": 16219, "time_per_iteration": 2.3584911823272705 }, { "auxiliary_loss_clip": 0.01052651, "auxiliary_loss_mlp": 0.01038349, "balance_loss_clip": 1.01548433, "balance_loss_mlp": 1.01613903, "epoch": 0.9751991582744627, "flos": 22819510586880.0, "grad_norm": 1.6178250847642992, "language_loss": 0.82534528, "learning_rate": 6.433257649285817e-09, "loss": 0.84625524, "num_input_tokens_seen": 350003935, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36523438, "step": 16220, "time_per_iteration": 2.3694510459899902 }, { "auxiliary_loss_clip": 0.01049878, "auxiliary_loss_mlp": 0.01038433, "balance_loss_clip": 1.01689208, "balance_loss_mlp": 1.01461542, "epoch": 0.9752592815271306, "flos": 19645172461440.0, "grad_norm": 2.0405355070788946, "language_loss": 0.76253647, "learning_rate": 6.402082794559227e-09, "loss": 0.78341961, "num_input_tokens_seen": 350023595, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.35351562, "step": 16221, "time_per_iteration": 2.3701202869415283 }, { "auxiliary_loss_clip": 0.01050159, "auxiliary_loss_mlp": 0.01034066, "balance_loss_clip": 1.0109992, "balance_loss_mlp": 1.01578784, "epoch": 0.9753194047797986, "flos": 26689178816640.0, "grad_norm": 1.6448522916296298, "language_loss": 0.67757177, "learning_rate": 6.370983537265395e-09, "loss": 0.69841403, "num_input_tokens_seen": 350045920, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.34375, "step": 16222, "time_per_iteration": 2.4214653968811035 }, { "auxiliary_loss_clip": 0.0105087, "auxiliary_loss_mlp": 0.01035999, "balance_loss_clip": 1.01356363, "balance_loss_mlp": 1.01563275, "epoch": 0.9753795280324665, "flos": 23220662641920.0, "grad_norm": 1.7912132424316922, "language_loss": 0.89669716, "learning_rate": 6.3399598785836004e-09, "loss": 0.91756582, "num_input_tokens_seen": 350063925, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.3515625, "step": 16223, "time_per_iteration": 2.4053471088409424 }, { "auxiliary_loss_clip": 0.01051198, "auxiliary_loss_mlp": 0.01037884, "balance_loss_clip": 1.01469827, "balance_loss_mlp": 1.01624584, "epoch": 0.9754396512851345, "flos": 19462821096960.0, "grad_norm": 1.7807941164052492, "language_loss": 0.75998187, "learning_rate": 6.309011819690457e-09, "loss": 0.7808727, "num_input_tokens_seen": 350080900, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.34960938, "step": 16224, "time_per_iteration": 2.341597080230713 }, { "auxiliary_loss_clip": 0.01006429, "auxiliary_loss_mlp": 0.0100252, "balance_loss_clip": 1.00043356, "balance_loss_mlp": 1.00022578, "epoch": 0.9754997745378025, "flos": 68456053054080.0, "grad_norm": 0.8160232345922002, "language_loss": 0.59252238, "learning_rate": 6.278139361759249e-09, "loss": 0.61261189, "num_input_tokens_seen": 350144550, "router_z_loss_clip": 0.02087402, "router_z_loss_mlp": 0.06201172, "step": 16225, "time_per_iteration": 2.9823837280273438 }, { "auxiliary_loss_clip": 0.01050673, "auxiliary_loss_mlp": 0.01040259, "balance_loss_clip": 1.01926637, "balance_loss_mlp": 1.01618183, "epoch": 0.9755598977904705, "flos": 26393988337920.0, "grad_norm": 1.8311766101190035, "language_loss": 0.69350362, "learning_rate": 6.247342505960818e-09, "loss": 0.71441299, "num_input_tokens_seen": 350164050, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.34570312, "step": 16226, "time_per_iteration": 2.4117002487182617 }, { "auxiliary_loss_clip": 0.01052966, "auxiliary_loss_mlp": 0.01042636, "balance_loss_clip": 1.02024865, "balance_loss_mlp": 1.01640797, "epoch": 0.9756200210431384, "flos": 16616736374400.0, "grad_norm": 1.6178648217701492, "language_loss": 0.83631498, "learning_rate": 6.216621253462894e-09, "loss": 0.85727102, "num_input_tokens_seen": 350181350, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.36523438, "step": 16227, "time_per_iteration": 2.366856575012207 }, { "auxiliary_loss_clip": 0.01049799, "auxiliary_loss_mlp": 0.01032199, "balance_loss_clip": 1.00996602, "balance_loss_mlp": 1.01520216, "epoch": 0.9756801442958064, "flos": 23621919431040.0, "grad_norm": 1.7705016363024901, "language_loss": 0.78587818, "learning_rate": 6.185975605430549e-09, "loss": 0.80669808, "num_input_tokens_seen": 350199765, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34570312, "step": 16228, "time_per_iteration": 2.385324001312256 }, { "auxiliary_loss_clip": 0.01007015, "auxiliary_loss_mlp": 0.01003684, "balance_loss_clip": 1.0017767, "balance_loss_mlp": 1.00066829, "epoch": 0.9757402675484744, "flos": 61623027244800.0, "grad_norm": 0.8510095176503469, "language_loss": 0.55889618, "learning_rate": 6.155405563025962e-09, "loss": 0.57900321, "num_input_tokens_seen": 350256420, "router_z_loss_clip": 0.01904297, "router_z_loss_mlp": 0.06347656, "step": 16229, "time_per_iteration": 2.988348960876465 }, { "auxiliary_loss_clip": 0.01052077, "auxiliary_loss_mlp": 0.01040415, "balance_loss_clip": 1.01712108, "balance_loss_mlp": 1.01616251, "epoch": 0.9758003908011423, "flos": 24057879978240.0, "grad_norm": 1.6668275266788202, "language_loss": 0.75956839, "learning_rate": 6.124911127407984e-09, "loss": 0.78049326, "num_input_tokens_seen": 350276270, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.359375, "step": 16230, "time_per_iteration": 2.4021689891815186 }, { "auxiliary_loss_clip": 0.01048109, "auxiliary_loss_mlp": 0.0102979, "balance_loss_clip": 1.00886846, "balance_loss_mlp": 1.01521313, "epoch": 0.9758605140538104, "flos": 17492148604800.0, "grad_norm": 1.7388834417032808, "language_loss": 0.73215628, "learning_rate": 6.094492299733245e-09, "loss": 0.75293529, "num_input_tokens_seen": 350295000, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.328125, "step": 16231, "time_per_iteration": 2.3673012256622314 }, { "auxiliary_loss_clip": 0.01053425, "auxiliary_loss_mlp": 0.01041943, "balance_loss_clip": 1.01663542, "balance_loss_mlp": 1.01614225, "epoch": 0.9759206373064783, "flos": 24825794532480.0, "grad_norm": 1.8235620278520732, "language_loss": 0.77730596, "learning_rate": 6.064149081155267e-09, "loss": 0.79825962, "num_input_tokens_seen": 350314980, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37304688, "step": 16232, "time_per_iteration": 2.3963732719421387 }, { "auxiliary_loss_clip": 0.01006644, "auxiliary_loss_mlp": 0.01003715, "balance_loss_clip": 1.0016644, "balance_loss_mlp": 1.00032806, "epoch": 0.9759807605591463, "flos": 68158104577920.0, "grad_norm": 0.7401191002746551, "language_loss": 0.53879398, "learning_rate": 6.033881472824465e-09, "loss": 0.55889755, "num_input_tokens_seen": 350371985, "router_z_loss_clip": 0.02050781, "router_z_loss_mlp": 0.06298828, "step": 16233, "time_per_iteration": 4.178099870681763 }, { "auxiliary_loss_clip": 0.01051493, "auxiliary_loss_mlp": 0.01040113, "balance_loss_clip": 1.01690292, "balance_loss_mlp": 1.01547194, "epoch": 0.9760408838118142, "flos": 18988246719360.0, "grad_norm": 1.6634579810189019, "language_loss": 0.72442842, "learning_rate": 6.003689475888807e-09, "loss": 0.74534452, "num_input_tokens_seen": 350390590, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.359375, "step": 16234, "time_per_iteration": 2.356457471847534 }, { "auxiliary_loss_clip": 0.01052112, "auxiliary_loss_mlp": 0.01037837, "balance_loss_clip": 1.01425767, "balance_loss_mlp": 1.01588154, "epoch": 0.9761010070644822, "flos": 17125106814720.0, "grad_norm": 2.392312083596412, "language_loss": 0.80336225, "learning_rate": 5.973573091493156e-09, "loss": 0.82426178, "num_input_tokens_seen": 350403770, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36328125, "step": 16235, "time_per_iteration": 2.359485149383545 }, { "auxiliary_loss_clip": 0.0105141, "auxiliary_loss_mlp": 0.0103898, "balance_loss_clip": 1.01503062, "balance_loss_mlp": 1.01589191, "epoch": 0.9761611303171501, "flos": 22051561121280.0, "grad_norm": 2.979687788859341, "language_loss": 0.78097498, "learning_rate": 5.943532320779265e-09, "loss": 0.80187887, "num_input_tokens_seen": 350421870, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.35546875, "step": 16236, "time_per_iteration": 2.367854356765747 }, { "auxiliary_loss_clip": 0.01051082, "auxiliary_loss_mlp": 0.01031807, "balance_loss_clip": 1.01081371, "balance_loss_mlp": 1.01577806, "epoch": 0.9762212535698181, "flos": 21756580110720.0, "grad_norm": 1.670529251176396, "language_loss": 0.76093996, "learning_rate": 5.913567164886446e-09, "loss": 0.7817688, "num_input_tokens_seen": 350440025, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.3515625, "step": 16237, "time_per_iteration": 2.3767683506011963 }, { "auxiliary_loss_clip": 0.01050481, "auxiliary_loss_mlp": 0.01038883, "balance_loss_clip": 1.01644826, "balance_loss_mlp": 1.01510918, "epoch": 0.9762813768224861, "flos": 25920426389760.0, "grad_norm": 3.586992358348562, "language_loss": 0.74114096, "learning_rate": 5.8836776249509e-09, "loss": 0.76203465, "num_input_tokens_seen": 350459435, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35351562, "step": 16238, "time_per_iteration": 2.404482841491699 }, { "auxiliary_loss_clip": 0.01050684, "auxiliary_loss_mlp": 0.01038626, "balance_loss_clip": 1.01518929, "balance_loss_mlp": 1.01557684, "epoch": 0.9763415000751541, "flos": 24050967528960.0, "grad_norm": 2.148630454851276, "language_loss": 0.84540528, "learning_rate": 5.8538637021063875e-09, "loss": 0.86629832, "num_input_tokens_seen": 350472655, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.3515625, "step": 16239, "time_per_iteration": 2.3649039268493652 }, { "auxiliary_loss_clip": 0.010518, "auxiliary_loss_mlp": 0.01037635, "balance_loss_clip": 1.01319683, "balance_loss_mlp": 1.0161562, "epoch": 0.976401623327822, "flos": 17017748784000.0, "grad_norm": 2.5703687252853444, "language_loss": 0.61810488, "learning_rate": 5.824125397483115e-09, "loss": 0.63899922, "num_input_tokens_seen": 350488160, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.35546875, "step": 16240, "time_per_iteration": 3.7469937801361084 }, { "auxiliary_loss_clip": 0.01051732, "auxiliary_loss_mlp": 0.01036818, "balance_loss_clip": 1.01487207, "balance_loss_mlp": 1.01629782, "epoch": 0.97646174658049, "flos": 16106026872960.0, "grad_norm": 2.1433431566762424, "language_loss": 0.84030306, "learning_rate": 5.7944627122088474e-09, "loss": 0.86118859, "num_input_tokens_seen": 350506065, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35351562, "step": 16241, "time_per_iteration": 2.3535537719726562 }, { "auxiliary_loss_clip": 0.01052288, "auxiliary_loss_mlp": 0.0103789, "balance_loss_clip": 1.01597977, "balance_loss_mlp": 1.01741338, "epoch": 0.9765218698331579, "flos": 21251630983680.0, "grad_norm": 1.7464228752804831, "language_loss": 0.83729768, "learning_rate": 5.764875647408463e-09, "loss": 0.85819948, "num_input_tokens_seen": 350524495, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34765625, "step": 16242, "time_per_iteration": 2.3788530826568604 }, { "auxiliary_loss_clip": 0.01050742, "auxiliary_loss_mlp": 0.01037928, "balance_loss_clip": 1.01363301, "balance_loss_mlp": 1.01494384, "epoch": 0.9765819930858259, "flos": 18587059752960.0, "grad_norm": 1.8129862370198815, "language_loss": 0.77020705, "learning_rate": 5.7353642042037294e-09, "loss": 0.79109371, "num_input_tokens_seen": 350544185, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.35742188, "step": 16243, "time_per_iteration": 3.8230788707733154 }, { "auxiliary_loss_clip": 0.01051729, "auxiliary_loss_mlp": 0.01043321, "balance_loss_clip": 1.01789403, "balance_loss_mlp": 1.01602066, "epoch": 0.976642116338494, "flos": 20265788522880.0, "grad_norm": 1.691520732168161, "language_loss": 0.70978725, "learning_rate": 5.705928383713754e-09, "loss": 0.73073769, "num_input_tokens_seen": 350562675, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.35742188, "step": 16244, "time_per_iteration": 2.391317844390869 }, { "auxiliary_loss_clip": 0.01052722, "auxiliary_loss_mlp": 0.01038152, "balance_loss_clip": 1.0134995, "balance_loss_mlp": 1.01669192, "epoch": 0.9767022395911619, "flos": 25549474527360.0, "grad_norm": 2.8266185094813228, "language_loss": 0.85087144, "learning_rate": 5.676568187055197e-09, "loss": 0.87178022, "num_input_tokens_seen": 350581535, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.359375, "step": 16245, "time_per_iteration": 2.3948748111724854 }, { "auxiliary_loss_clip": 0.01050279, "auxiliary_loss_mlp": 0.01035418, "balance_loss_clip": 1.01410353, "balance_loss_mlp": 1.01552403, "epoch": 0.9767623628438299, "flos": 21761572435200.0, "grad_norm": 1.3178895202551233, "language_loss": 0.79237139, "learning_rate": 5.647283615340726e-09, "loss": 0.81322837, "num_input_tokens_seen": 350601615, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34765625, "step": 16246, "time_per_iteration": 2.3962326049804688 }, { "auxiliary_loss_clip": 0.01047024, "auxiliary_loss_mlp": 0.01032182, "balance_loss_clip": 1.01285791, "balance_loss_mlp": 1.01480174, "epoch": 0.9768224860964978, "flos": 15850218274560.0, "grad_norm": 1.4278027624585417, "language_loss": 0.74900198, "learning_rate": 5.6180746696812275e-09, "loss": 0.76979405, "num_input_tokens_seen": 350619580, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.32226562, "step": 16247, "time_per_iteration": 2.35406494140625 }, { "auxiliary_loss_clip": 0.01051911, "auxiliary_loss_mlp": 0.01036582, "balance_loss_clip": 1.01333606, "balance_loss_mlp": 1.01637161, "epoch": 0.9768826093491658, "flos": 25150102951680.0, "grad_norm": 1.9200062458131466, "language_loss": 0.81004345, "learning_rate": 5.58894135118404e-09, "loss": 0.83092844, "num_input_tokens_seen": 350640015, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35546875, "step": 16248, "time_per_iteration": 2.4182803630828857 }, { "auxiliary_loss_clip": 0.01055015, "auxiliary_loss_mlp": 0.01043695, "balance_loss_clip": 1.0178144, "balance_loss_mlp": 1.01715374, "epoch": 0.9769427326018337, "flos": 22965901384320.0, "grad_norm": 1.8421816659177177, "language_loss": 0.80745262, "learning_rate": 5.559883660954278e-09, "loss": 0.82843971, "num_input_tokens_seen": 350659155, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.37890625, "step": 16249, "time_per_iteration": 2.4294703006744385 }, { "auxiliary_loss_clip": 0.01050378, "auxiliary_loss_mlp": 0.01033495, "balance_loss_clip": 1.01086915, "balance_loss_mlp": 1.01608932, "epoch": 0.9770028558545018, "flos": 15263503009920.0, "grad_norm": 1.9218418797520893, "language_loss": 0.68108678, "learning_rate": 5.530901600093507e-09, "loss": 0.70192546, "num_input_tokens_seen": 350676615, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34375, "step": 16250, "time_per_iteration": 2.3627490997314453 }, { "auxiliary_loss_clip": 0.0100755, "auxiliary_loss_mlp": 0.01004302, "balance_loss_clip": 1.0022155, "balance_loss_mlp": 1.00130868, "epoch": 0.9770629791071697, "flos": 71447027385600.0, "grad_norm": 0.7782993644230678, "language_loss": 0.60003209, "learning_rate": 5.501995169700846e-09, "loss": 0.62015069, "num_input_tokens_seen": 350736805, "router_z_loss_clip": 0.02087402, "router_z_loss_mlp": 0.0625, "step": 16251, "time_per_iteration": 3.0625874996185303 }, { "auxiliary_loss_clip": 0.01051318, "auxiliary_loss_mlp": 0.01039147, "balance_loss_clip": 1.01692677, "balance_loss_mlp": 1.01599717, "epoch": 0.9771231023598377, "flos": 22411934841600.0, "grad_norm": 3.2244275120166255, "language_loss": 0.79379612, "learning_rate": 5.473164370872307e-09, "loss": 0.81470078, "num_input_tokens_seen": 350753600, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35351562, "step": 16252, "time_per_iteration": 2.406430959701538 }, { "auxiliary_loss_clip": 0.01050615, "auxiliary_loss_mlp": 0.01034741, "balance_loss_clip": 1.01255584, "balance_loss_mlp": 1.01541758, "epoch": 0.9771832256125056, "flos": 19024940424960.0, "grad_norm": 5.6127220475732935, "language_loss": 0.65967035, "learning_rate": 5.444409204701461e-09, "loss": 0.68052393, "num_input_tokens_seen": 350771225, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3515625, "step": 16253, "time_per_iteration": 2.372288465499878 }, { "auxiliary_loss_clip": 0.01054102, "auxiliary_loss_mlp": 0.01043511, "balance_loss_clip": 1.01825047, "balance_loss_mlp": 1.01719069, "epoch": 0.9772433488651736, "flos": 17821170057600.0, "grad_norm": 2.6595575531809477, "language_loss": 0.77415287, "learning_rate": 5.415729672278324e-09, "loss": 0.79512894, "num_input_tokens_seen": 350789100, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36914062, "step": 16254, "time_per_iteration": 2.366090774536133 }, { "auxiliary_loss_clip": 0.01053112, "auxiliary_loss_mlp": 0.01040046, "balance_loss_clip": 1.01577449, "balance_loss_mlp": 1.01609945, "epoch": 0.9773034721178415, "flos": 37628375472000.0, "grad_norm": 1.7525651975728227, "language_loss": 0.64966416, "learning_rate": 5.387125774690471e-09, "loss": 0.67059582, "num_input_tokens_seen": 350811085, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.37109375, "step": 16255, "time_per_iteration": 4.062563419342041 }, { "auxiliary_loss_clip": 0.01054601, "auxiliary_loss_mlp": 0.01044052, "balance_loss_clip": 1.01602554, "balance_loss_mlp": 1.01690197, "epoch": 0.9773635953705095, "flos": 20301434887680.0, "grad_norm": 1.6598318259970841, "language_loss": 0.76730871, "learning_rate": 5.358597513023033e-09, "loss": 0.78829527, "num_input_tokens_seen": 350831065, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.37695312, "step": 16256, "time_per_iteration": 2.3865115642547607 }, { "auxiliary_loss_clip": 0.01051787, "auxiliary_loss_mlp": 0.01039688, "balance_loss_clip": 1.01318753, "balance_loss_mlp": 1.01762807, "epoch": 0.9774237186231776, "flos": 22308172680960.0, "grad_norm": 2.1551566595895006, "language_loss": 0.78750259, "learning_rate": 5.330144888357369e-09, "loss": 0.80841732, "num_input_tokens_seen": 350849675, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.34179688, "step": 16257, "time_per_iteration": 2.406813621520996 }, { "auxiliary_loss_clip": 0.01050183, "auxiliary_loss_mlp": 0.01041223, "balance_loss_clip": 1.01692867, "balance_loss_mlp": 1.01547432, "epoch": 0.9774838418758455, "flos": 24203816928000.0, "grad_norm": 1.619907992349388, "language_loss": 0.76701504, "learning_rate": 5.301767901772391e-09, "loss": 0.78792918, "num_input_tokens_seen": 350868955, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.34765625, "step": 16258, "time_per_iteration": 2.4205052852630615 }, { "auxiliary_loss_clip": 0.01007159, "auxiliary_loss_mlp": 0.01001934, "balance_loss_clip": 0.99995559, "balance_loss_mlp": 1.00080156, "epoch": 0.9775439651285135, "flos": 66353547801600.0, "grad_norm": 0.6972547420526777, "language_loss": 0.59889776, "learning_rate": 5.273466554344353e-09, "loss": 0.61898869, "num_input_tokens_seen": 350935110, "router_z_loss_clip": 0.01977539, "router_z_loss_mlp": 0.06347656, "step": 16259, "time_per_iteration": 3.1085686683654785 }, { "auxiliary_loss_clip": 0.01053666, "auxiliary_loss_mlp": 0.01038958, "balance_loss_clip": 1.01447225, "balance_loss_mlp": 1.01645267, "epoch": 0.9776040883811814, "flos": 22600465516800.0, "grad_norm": 1.6646427519766314, "language_loss": 0.74797744, "learning_rate": 5.2452408471461705e-09, "loss": 0.76890361, "num_input_tokens_seen": 350953220, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.37304688, "step": 16260, "time_per_iteration": 2.3954949378967285 }, { "auxiliary_loss_clip": 0.01051541, "auxiliary_loss_mlp": 0.0104317, "balance_loss_clip": 1.01882732, "balance_loss_mlp": 1.01601219, "epoch": 0.9776642116338494, "flos": 18441192625920.0, "grad_norm": 2.0406287847816644, "language_loss": 0.80482519, "learning_rate": 5.2170907812485456e-09, "loss": 0.82577229, "num_input_tokens_seen": 350971915, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.35546875, "step": 16261, "time_per_iteration": 2.350165605545044 }, { "auxiliary_loss_clip": 0.01053002, "auxiliary_loss_mlp": 0.01031525, "balance_loss_clip": 1.00792146, "balance_loss_mlp": 1.01659012, "epoch": 0.9777243348865173, "flos": 22637333779200.0, "grad_norm": 2.3314653914763173, "language_loss": 0.74713063, "learning_rate": 5.189016357718845e-09, "loss": 0.76797587, "num_input_tokens_seen": 350990470, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36328125, "step": 16262, "time_per_iteration": 2.3669631481170654 }, { "auxiliary_loss_clip": 0.01052011, "auxiliary_loss_mlp": 0.01037434, "balance_loss_clip": 1.01139879, "balance_loss_mlp": 1.01585507, "epoch": 0.9777844581391854, "flos": 31320058619520.0, "grad_norm": 1.950560921663072, "language_loss": 0.71727556, "learning_rate": 5.16101757762133e-09, "loss": 0.73816997, "num_input_tokens_seen": 351010755, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.36132812, "step": 16263, "time_per_iteration": 2.4454596042633057 }, { "auxiliary_loss_clip": 0.01052445, "auxiliary_loss_mlp": 0.01038484, "balance_loss_clip": 1.01458287, "balance_loss_mlp": 1.01634741, "epoch": 0.9778445813918533, "flos": 23037997075200.0, "grad_norm": 1.7419167589450297, "language_loss": 0.67526233, "learning_rate": 5.133094442018038e-09, "loss": 0.69617164, "num_input_tokens_seen": 351029965, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36132812, "step": 16264, "time_per_iteration": 2.3725388050079346 }, { "auxiliary_loss_clip": 0.01054636, "auxiliary_loss_mlp": 0.01041257, "balance_loss_clip": 1.01383853, "balance_loss_mlp": 1.01696754, "epoch": 0.9779047046445213, "flos": 17565396370560.0, "grad_norm": 1.9874003721337024, "language_loss": 0.74033892, "learning_rate": 5.105246951967679e-09, "loss": 0.76129782, "num_input_tokens_seen": 351046205, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.37695312, "step": 16265, "time_per_iteration": 2.346679210662842 }, { "auxiliary_loss_clip": 0.01050456, "auxiliary_loss_mlp": 0.01033422, "balance_loss_clip": 1.01055741, "balance_loss_mlp": 1.01591182, "epoch": 0.9779648278971892, "flos": 20740083609600.0, "grad_norm": 1.7111586800492788, "language_loss": 0.69486511, "learning_rate": 5.077475108526297e-09, "loss": 0.71570385, "num_input_tokens_seen": 351065390, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34375, "step": 16266, "time_per_iteration": 2.3781681060791016 }, { "auxiliary_loss_clip": 0.01050017, "auxiliary_loss_mlp": 0.01034882, "balance_loss_clip": 1.01397276, "balance_loss_mlp": 1.0171473, "epoch": 0.9780249511498572, "flos": 21026057489280.0, "grad_norm": 1.7321289426482234, "language_loss": 0.87310529, "learning_rate": 5.049778912747049e-09, "loss": 0.89395428, "num_input_tokens_seen": 351084355, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.328125, "step": 16267, "time_per_iteration": 2.395775318145752 }, { "auxiliary_loss_clip": 0.01052844, "auxiliary_loss_mlp": 0.01033893, "balance_loss_clip": 1.00988472, "balance_loss_mlp": 1.01608896, "epoch": 0.9780850744025251, "flos": 30772899792000.0, "grad_norm": 2.143302800586871, "language_loss": 0.71657932, "learning_rate": 5.022158365679985e-09, "loss": 0.73744673, "num_input_tokens_seen": 351105870, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 16268, "time_per_iteration": 2.4571402072906494 }, { "auxiliary_loss_clip": 0.01051789, "auxiliary_loss_mlp": 0.01031335, "balance_loss_clip": 1.00972235, "balance_loss_mlp": 1.01662111, "epoch": 0.9781451976551931, "flos": 20301958558080.0, "grad_norm": 1.8098205082854755, "language_loss": 0.74560487, "learning_rate": 4.994613468372711e-09, "loss": 0.7664361, "num_input_tokens_seen": 351124760, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.3515625, "step": 16269, "time_per_iteration": 2.355454444885254 }, { "auxiliary_loss_clip": 0.0105223, "auxiliary_loss_mlp": 0.01037482, "balance_loss_clip": 1.01352096, "balance_loss_mlp": 1.01620686, "epoch": 0.9782053209078612, "flos": 24315329410560.0, "grad_norm": 2.635316342544413, "language_loss": 0.71931982, "learning_rate": 4.967144221869501e-09, "loss": 0.74021691, "num_input_tokens_seen": 351142820, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.359375, "step": 16270, "time_per_iteration": 2.3895928859710693 }, { "auxiliary_loss_clip": 0.01051778, "auxiliary_loss_mlp": 0.01039647, "balance_loss_clip": 1.01661611, "balance_loss_mlp": 1.01523995, "epoch": 0.9782654441605291, "flos": 32488042976640.0, "grad_norm": 1.7755135564018276, "language_loss": 0.65417469, "learning_rate": 4.939750627212191e-09, "loss": 0.67508894, "num_input_tokens_seen": 351164805, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36523438, "step": 16271, "time_per_iteration": 2.468729257583618 }, { "auxiliary_loss_clip": 0.0104924, "auxiliary_loss_mlp": 0.01032902, "balance_loss_clip": 1.01124215, "balance_loss_mlp": 1.01540339, "epoch": 0.9783255674131971, "flos": 26977875782400.0, "grad_norm": 1.4054463684249654, "language_loss": 0.71214747, "learning_rate": 4.912432685439505e-09, "loss": 0.73296887, "num_input_tokens_seen": 351187005, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.33984375, "step": 16272, "time_per_iteration": 2.4224414825439453 }, { "auxiliary_loss_clip": 0.01054319, "auxiliary_loss_mlp": 0.01040644, "balance_loss_clip": 1.01588404, "balance_loss_mlp": 1.01703978, "epoch": 0.978385690665865, "flos": 23111140106880.0, "grad_norm": 1.7526729271866786, "language_loss": 0.67784607, "learning_rate": 4.88519039758728e-09, "loss": 0.69879568, "num_input_tokens_seen": 351208450, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.37109375, "step": 16273, "time_per_iteration": 3.6333563327789307 }, { "auxiliary_loss_clip": 0.01051837, "auxiliary_loss_mlp": 0.01035229, "balance_loss_clip": 1.01164889, "balance_loss_mlp": 1.0161531, "epoch": 0.978445813918533, "flos": 25408529902080.0, "grad_norm": 1.6228678277336, "language_loss": 0.74738228, "learning_rate": 4.85802376468869e-09, "loss": 0.76825297, "num_input_tokens_seen": 351229585, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35742188, "step": 16274, "time_per_iteration": 2.419017791748047 }, { "auxiliary_loss_clip": 0.01052383, "auxiliary_loss_mlp": 0.01037699, "balance_loss_clip": 1.01482296, "balance_loss_mlp": 1.01695609, "epoch": 0.9785059371712009, "flos": 23549090601600.0, "grad_norm": 1.5785907059284108, "language_loss": 0.77968282, "learning_rate": 4.830932787773579e-09, "loss": 0.8005836, "num_input_tokens_seen": 351249525, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35546875, "step": 16275, "time_per_iteration": 2.4130842685699463 }, { "auxiliary_loss_clip": 0.01052644, "auxiliary_loss_mlp": 0.0104089, "balance_loss_clip": 1.016083, "balance_loss_mlp": 1.01606548, "epoch": 0.978566060423869, "flos": 34349716604160.0, "grad_norm": 1.6159180691657673, "language_loss": 0.71950769, "learning_rate": 4.803917467869567e-09, "loss": 0.74044311, "num_input_tokens_seen": 351272530, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.36523438, "step": 16276, "time_per_iteration": 2.491328477859497 }, { "auxiliary_loss_clip": 0.01047986, "auxiliary_loss_mlp": 0.01034098, "balance_loss_clip": 1.01341522, "balance_loss_mlp": 1.01454401, "epoch": 0.9786261836765369, "flos": 11618081642880.0, "grad_norm": 1.8159449400936531, "language_loss": 0.86534059, "learning_rate": 4.776977806000726e-09, "loss": 0.88616145, "num_input_tokens_seen": 351288530, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.3359375, "step": 16277, "time_per_iteration": 2.335324764251709 }, { "auxiliary_loss_clip": 0.01049428, "auxiliary_loss_mlp": 0.01034702, "balance_loss_clip": 1.01221955, "balance_loss_mlp": 1.01536107, "epoch": 0.9786863069292049, "flos": 17419983091200.0, "grad_norm": 1.8116618765733885, "language_loss": 0.71823227, "learning_rate": 4.7501138031891264e-09, "loss": 0.73907351, "num_input_tokens_seen": 351305890, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.33984375, "step": 16278, "time_per_iteration": 2.359335422515869 }, { "auxiliary_loss_clip": 0.01050676, "auxiliary_loss_mlp": 0.01036643, "balance_loss_clip": 1.01329029, "balance_loss_mlp": 1.01500022, "epoch": 0.9787464301818728, "flos": 20844020327040.0, "grad_norm": 1.7729552304562761, "language_loss": 0.8480013, "learning_rate": 4.723325460453065e-09, "loss": 0.86887449, "num_input_tokens_seen": 351325010, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.35742188, "step": 16279, "time_per_iteration": 2.379136800765991 }, { "auxiliary_loss_clip": 0.0105216, "auxiliary_loss_mlp": 0.01040859, "balance_loss_clip": 1.01681471, "balance_loss_mlp": 1.01583052, "epoch": 0.9788065534345408, "flos": 18221972999040.0, "grad_norm": 2.360107763605047, "language_loss": 0.80136001, "learning_rate": 4.696612778808395e-09, "loss": 0.82229024, "num_input_tokens_seen": 351343060, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36328125, "step": 16280, "time_per_iteration": 3.80787992477417 }, { "auxiliary_loss_clip": 0.01049972, "auxiliary_loss_mlp": 0.01034213, "balance_loss_clip": 1.01242185, "balance_loss_mlp": 1.01585007, "epoch": 0.9788666766872087, "flos": 21577196211840.0, "grad_norm": 1.5633876356811685, "language_loss": 0.80212593, "learning_rate": 4.669975759268085e-09, "loss": 0.82296777, "num_input_tokens_seen": 351363260, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.34179688, "step": 16281, "time_per_iteration": 2.4323813915252686 }, { "auxiliary_loss_clip": 0.01052168, "auxiliary_loss_mlp": 0.0103518, "balance_loss_clip": 1.0114336, "balance_loss_mlp": 1.01598072, "epoch": 0.9789267999398767, "flos": 24899321589120.0, "grad_norm": 1.801068673361125, "language_loss": 0.81043297, "learning_rate": 4.643414402842216e-09, "loss": 0.83130646, "num_input_tokens_seen": 351382610, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36132812, "step": 16282, "time_per_iteration": 2.4071526527404785 }, { "auxiliary_loss_clip": 0.01051048, "auxiliary_loss_mlp": 0.01042801, "balance_loss_clip": 1.0197103, "balance_loss_mlp": 1.01585007, "epoch": 0.9789869231925448, "flos": 19572099252480.0, "grad_norm": 2.2465563921005476, "language_loss": 0.84447289, "learning_rate": 4.616928710538204e-09, "loss": 0.8654114, "num_input_tokens_seen": 351401075, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.3515625, "step": 16283, "time_per_iteration": 3.738436460494995 }, { "auxiliary_loss_clip": 0.01051413, "auxiliary_loss_mlp": 0.01033788, "balance_loss_clip": 1.01119757, "balance_loss_mlp": 1.01588917, "epoch": 0.9790470464452127, "flos": 16795806071040.0, "grad_norm": 1.742790294408586, "language_loss": 0.73318106, "learning_rate": 4.590518683360134e-09, "loss": 0.75403303, "num_input_tokens_seen": 351419275, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35546875, "step": 16284, "time_per_iteration": 2.349609851837158 }, { "auxiliary_loss_clip": 0.01049571, "auxiliary_loss_mlp": 0.01036663, "balance_loss_clip": 1.01459718, "balance_loss_mlp": 1.01593137, "epoch": 0.9791071696978807, "flos": 18368363796480.0, "grad_norm": 1.669754642408173, "language_loss": 0.652264, "learning_rate": 4.56418432230965e-09, "loss": 0.67312634, "num_input_tokens_seen": 351437375, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3359375, "step": 16285, "time_per_iteration": 2.347547769546509 }, { "auxiliary_loss_clip": 0.01051575, "auxiliary_loss_mlp": 0.01041039, "balance_loss_clip": 1.01776958, "balance_loss_mlp": 1.01555085, "epoch": 0.9791672929505486, "flos": 24169182992640.0, "grad_norm": 1.5994928084959203, "language_loss": 0.71672165, "learning_rate": 4.537925628385286e-09, "loss": 0.73764777, "num_input_tokens_seen": 351457810, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.359375, "step": 16286, "time_per_iteration": 2.380171775817871 }, { "auxiliary_loss_clip": 0.01050032, "auxiliary_loss_mlp": 0.01034871, "balance_loss_clip": 1.01218545, "balance_loss_mlp": 1.01542842, "epoch": 0.9792274162032166, "flos": 24352930811520.0, "grad_norm": 1.36836579799763, "language_loss": 0.59567952, "learning_rate": 4.511742602582691e-09, "loss": 0.61652851, "num_input_tokens_seen": 351478825, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34570312, "step": 16287, "time_per_iteration": 2.4127933979034424 }, { "auxiliary_loss_clip": 0.01052118, "auxiliary_loss_mlp": 0.01034226, "balance_loss_clip": 1.01099169, "balance_loss_mlp": 1.01647866, "epoch": 0.9792875394558845, "flos": 26394581831040.0, "grad_norm": 1.9872444633135307, "language_loss": 0.82398105, "learning_rate": 4.485635245894626e-09, "loss": 0.8448444, "num_input_tokens_seen": 351498785, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35546875, "step": 16288, "time_per_iteration": 2.4053053855895996 }, { "auxiliary_loss_clip": 0.01051781, "auxiliary_loss_mlp": 0.01035528, "balance_loss_clip": 1.01176965, "balance_loss_mlp": 1.01523495, "epoch": 0.9793476627085526, "flos": 28146732923520.0, "grad_norm": 1.4894667424566832, "language_loss": 0.72917533, "learning_rate": 4.459603559311631e-09, "loss": 0.7500484, "num_input_tokens_seen": 351520235, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36523438, "step": 16289, "time_per_iteration": 2.435150623321533 }, { "auxiliary_loss_clip": 0.01050878, "auxiliary_loss_mlp": 0.01039128, "balance_loss_clip": 1.01566732, "balance_loss_mlp": 1.01630354, "epoch": 0.9794077859612205, "flos": 16762987526400.0, "grad_norm": 2.6668417755022964, "language_loss": 0.76404607, "learning_rate": 4.43364754382003e-09, "loss": 0.78494614, "num_input_tokens_seen": 351538900, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.34570312, "step": 16290, "time_per_iteration": 2.3375704288482666 }, { "auxiliary_loss_clip": 0.01052494, "auxiliary_loss_mlp": 0.01038592, "balance_loss_clip": 1.01545405, "balance_loss_mlp": 1.01542783, "epoch": 0.9794679092138885, "flos": 19279841328000.0, "grad_norm": 1.5387307888874768, "language_loss": 0.68080485, "learning_rate": 4.4077672004048105e-09, "loss": 0.70171571, "num_input_tokens_seen": 351558715, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.37109375, "step": 16291, "time_per_iteration": 2.3815112113952637 }, { "auxiliary_loss_clip": 0.0105395, "auxiliary_loss_mlp": 0.01035126, "balance_loss_clip": 1.01040256, "balance_loss_mlp": 1.01636255, "epoch": 0.9795280324665564, "flos": 32155390742400.0, "grad_norm": 2.2290795879219054, "language_loss": 0.62703609, "learning_rate": 4.3819625300467456e-09, "loss": 0.64792681, "num_input_tokens_seen": 351578450, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.375, "step": 16292, "time_per_iteration": 2.437255620956421 }, { "auxiliary_loss_clip": 0.01051094, "auxiliary_loss_mlp": 0.01038036, "balance_loss_clip": 1.01476693, "balance_loss_mlp": 1.01568174, "epoch": 0.9795881557192244, "flos": 19059993296640.0, "grad_norm": 1.6038583088690632, "language_loss": 0.74417138, "learning_rate": 4.356233533724829e-09, "loss": 0.76506269, "num_input_tokens_seen": 351597195, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35351562, "step": 16293, "time_per_iteration": 2.370506525039673 }, { "auxiliary_loss_clip": 0.01052429, "auxiliary_loss_mlp": 0.0103541, "balance_loss_clip": 1.01264143, "balance_loss_mlp": 1.0156126, "epoch": 0.9796482789718923, "flos": 28328665351680.0, "grad_norm": 2.1385163730804613, "language_loss": 0.84564602, "learning_rate": 4.330580212414503e-09, "loss": 0.86652446, "num_input_tokens_seen": 351617460, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.3671875, "step": 16294, "time_per_iteration": 3.8501548767089844 }, { "auxiliary_loss_clip": 0.01048888, "auxiliary_loss_mlp": 0.01034362, "balance_loss_clip": 1.01240373, "balance_loss_mlp": 1.01577413, "epoch": 0.9797084022245603, "flos": 17967002273280.0, "grad_norm": 2.081096309108261, "language_loss": 0.73562968, "learning_rate": 4.305002567088767e-09, "loss": 0.75646216, "num_input_tokens_seen": 351635900, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.33203125, "step": 16295, "time_per_iteration": 2.368640661239624 }, { "auxiliary_loss_clip": 0.010541, "auxiliary_loss_mlp": 0.01041585, "balance_loss_clip": 1.0173614, "balance_loss_mlp": 1.01682496, "epoch": 0.9797685254772284, "flos": 20265683788800.0, "grad_norm": 1.903534382346113, "language_loss": 0.81335032, "learning_rate": 4.2795005987170674e-09, "loss": 0.83430719, "num_input_tokens_seen": 351655400, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.37304688, "step": 16296, "time_per_iteration": 2.369436025619507 }, { "auxiliary_loss_clip": 0.01051945, "auxiliary_loss_mlp": 0.0103836, "balance_loss_clip": 1.01581728, "balance_loss_mlp": 1.01654851, "epoch": 0.9798286487298963, "flos": 26905919736960.0, "grad_norm": 1.9038925107441067, "language_loss": 0.76223636, "learning_rate": 4.254074308266853e-09, "loss": 0.78313941, "num_input_tokens_seen": 351675505, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35351562, "step": 16297, "time_per_iteration": 2.426706075668335 }, { "auxiliary_loss_clip": 0.01053771, "auxiliary_loss_mlp": 0.01045316, "balance_loss_clip": 1.02179551, "balance_loss_mlp": 1.01663387, "epoch": 0.9798887719825643, "flos": 27159878033280.0, "grad_norm": 1.576993630612611, "language_loss": 0.78714371, "learning_rate": 4.228723696702019e-09, "loss": 0.80813456, "num_input_tokens_seen": 351697920, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.37109375, "step": 16298, "time_per_iteration": 2.43342661857605 }, { "auxiliary_loss_clip": 0.01047826, "auxiliary_loss_mlp": 0.01034942, "balance_loss_clip": 1.01325762, "balance_loss_mlp": 1.01481533, "epoch": 0.9799488952352322, "flos": 20667359514240.0, "grad_norm": 1.5043192394048588, "language_loss": 0.73935461, "learning_rate": 4.203448764984019e-09, "loss": 0.76018232, "num_input_tokens_seen": 351717615, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.33007812, "step": 16299, "time_per_iteration": 2.372545003890991 }, { "auxiliary_loss_clip": 0.01052235, "auxiliary_loss_mlp": 0.0104037, "balance_loss_clip": 1.01729071, "balance_loss_mlp": 1.01578879, "epoch": 0.9800090184879002, "flos": 21980128746240.0, "grad_norm": 1.9358624560935058, "language_loss": 0.90422457, "learning_rate": 4.178249514071419e-09, "loss": 0.92515063, "num_input_tokens_seen": 351735260, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36523438, "step": 16300, "time_per_iteration": 2.3983559608459473 }, { "auxiliary_loss_clip": 0.01052413, "auxiliary_loss_mlp": 0.01035369, "balance_loss_clip": 1.01268387, "balance_loss_mlp": 1.01551521, "epoch": 0.9800691417405681, "flos": 21287347171200.0, "grad_norm": 2.128496337870732, "language_loss": 0.79383481, "learning_rate": 4.1531259449194555e-09, "loss": 0.81471264, "num_input_tokens_seen": 351755800, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36914062, "step": 16301, "time_per_iteration": 2.3979315757751465 }, { "auxiliary_loss_clip": 0.0105047, "auxiliary_loss_mlp": 0.01036636, "balance_loss_clip": 1.0110414, "balance_loss_mlp": 1.01460326, "epoch": 0.9801292649932362, "flos": 18438958298880.0, "grad_norm": 1.9545706333687962, "language_loss": 0.76325989, "learning_rate": 4.128078058480921e-09, "loss": 0.78413093, "num_input_tokens_seen": 351774790, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.359375, "step": 16302, "time_per_iteration": 2.3920507431030273 }, { "auxiliary_loss_clip": 0.01050627, "auxiliary_loss_mlp": 0.01040017, "balance_loss_clip": 1.01674771, "balance_loss_mlp": 1.01532149, "epoch": 0.9801893882459041, "flos": 25045188716160.0, "grad_norm": 1.7627906205551664, "language_loss": 0.80796307, "learning_rate": 4.103105855705724e-09, "loss": 0.82886952, "num_input_tokens_seen": 351792855, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35351562, "step": 16303, "time_per_iteration": 2.418084144592285 }, { "auxiliary_loss_clip": 0.01053973, "auxiliary_loss_mlp": 0.01037262, "balance_loss_clip": 1.01252615, "balance_loss_mlp": 1.01634848, "epoch": 0.9802495114985721, "flos": 18510600142080.0, "grad_norm": 1.8643372158584641, "language_loss": 0.8410511, "learning_rate": 4.078209337540883e-09, "loss": 0.86196345, "num_input_tokens_seen": 351811450, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.375, "step": 16304, "time_per_iteration": 2.364396572113037 }, { "auxiliary_loss_clip": 0.01047321, "auxiliary_loss_mlp": 0.0103254, "balance_loss_clip": 1.01345515, "balance_loss_mlp": 1.01457667, "epoch": 0.98030963475124, "flos": 21468790840320.0, "grad_norm": 1.9962320865016747, "language_loss": 0.71741819, "learning_rate": 4.053388504930089e-09, "loss": 0.73821688, "num_input_tokens_seen": 351831960, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.328125, "step": 16305, "time_per_iteration": 2.413853645324707 }, { "auxiliary_loss_clip": 0.01052406, "auxiliary_loss_mlp": 0.01041729, "balance_loss_clip": 1.01866174, "balance_loss_mlp": 1.01642942, "epoch": 0.980369758003908, "flos": 20411236713600.0, "grad_norm": 1.8516876462046858, "language_loss": 0.73463619, "learning_rate": 4.028643358815032e-09, "loss": 0.75557756, "num_input_tokens_seen": 351851585, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.359375, "step": 16306, "time_per_iteration": 2.3753771781921387 }, { "auxiliary_loss_clip": 0.01049483, "auxiliary_loss_mlp": 0.01036907, "balance_loss_clip": 1.01496041, "balance_loss_mlp": 1.01566863, "epoch": 0.9804298812565759, "flos": 23398196238720.0, "grad_norm": 2.0379638759101057, "language_loss": 0.74983633, "learning_rate": 4.00397390013385e-09, "loss": 0.77070022, "num_input_tokens_seen": 351871085, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.33789062, "step": 16307, "time_per_iteration": 2.4346423149108887 }, { "auxiliary_loss_clip": 0.01047408, "auxiliary_loss_mlp": 0.01028797, "balance_loss_clip": 1.0102483, "balance_loss_mlp": 1.01538658, "epoch": 0.980490004509244, "flos": 23291606257920.0, "grad_norm": 1.6155750704564833, "language_loss": 0.75853771, "learning_rate": 3.979380129822018e-09, "loss": 0.77929974, "num_input_tokens_seen": 351891775, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.3203125, "step": 16308, "time_per_iteration": 2.41501784324646 }, { "auxiliary_loss_clip": 0.01006726, "auxiliary_loss_mlp": 0.01003774, "balance_loss_clip": 1.00189042, "balance_loss_mlp": 1.0003922, "epoch": 0.980550127761912, "flos": 56048200479360.0, "grad_norm": 0.7610824099977884, "language_loss": 0.57824188, "learning_rate": 3.954862048811902e-09, "loss": 0.59834689, "num_input_tokens_seen": 351946770, "router_z_loss_clip": 0.01879883, "router_z_loss_mlp": 0.06347656, "step": 16309, "time_per_iteration": 2.909175395965576 }, { "auxiliary_loss_clip": 0.01052031, "auxiliary_loss_mlp": 0.01042467, "balance_loss_clip": 1.01869702, "balance_loss_mlp": 1.01564407, "epoch": 0.9806102510145799, "flos": 25332244848000.0, "grad_norm": 1.828554166883765, "language_loss": 0.67729443, "learning_rate": 3.930419658033646e-09, "loss": 0.69823939, "num_input_tokens_seen": 351966155, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36328125, "step": 16310, "time_per_iteration": 2.418511390686035 }, { "auxiliary_loss_clip": 0.0100654, "auxiliary_loss_mlp": 0.01006595, "balance_loss_clip": 1.00460434, "balance_loss_mlp": 1.00031316, "epoch": 0.9806703742672479, "flos": 67271064998400.0, "grad_norm": 0.8269730407302652, "language_loss": 0.54613614, "learning_rate": 3.906052958413841e-09, "loss": 0.56626749, "num_input_tokens_seen": 352031655, "router_z_loss_clip": 0.01989746, "router_z_loss_mlp": 0.0625, "step": 16311, "time_per_iteration": 3.1059730052948 }, { "auxiliary_loss_clip": 0.01051054, "auxiliary_loss_mlp": 0.01036167, "balance_loss_clip": 1.01365995, "balance_loss_mlp": 1.01642013, "epoch": 0.9807304975199158, "flos": 25227784460160.0, "grad_norm": 2.191380492024427, "language_loss": 0.80553001, "learning_rate": 3.881761950876638e-09, "loss": 0.82640225, "num_input_tokens_seen": 352051920, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34570312, "step": 16312, "time_per_iteration": 3.6441431045532227 }, { "auxiliary_loss_clip": 0.01050211, "auxiliary_loss_mlp": 0.01035889, "balance_loss_clip": 1.01486015, "balance_loss_mlp": 1.01615763, "epoch": 0.9807906207725838, "flos": 17455454899200.0, "grad_norm": 1.86700925168628, "language_loss": 0.64590198, "learning_rate": 3.8575466363430785e-09, "loss": 0.66676295, "num_input_tokens_seen": 352069315, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.33984375, "step": 16313, "time_per_iteration": 2.3372106552124023 }, { "auxiliary_loss_clip": 0.01052855, "auxiliary_loss_mlp": 0.01040022, "balance_loss_clip": 1.01612067, "balance_loss_mlp": 1.01648092, "epoch": 0.9808507440252517, "flos": 21031364016000.0, "grad_norm": 1.8742319919235488, "language_loss": 0.74606657, "learning_rate": 3.833407015731316e-09, "loss": 0.76699531, "num_input_tokens_seen": 352089480, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36328125, "step": 16314, "time_per_iteration": 2.4005391597747803 }, { "auxiliary_loss_clip": 0.01006612, "auxiliary_loss_mlp": 0.01003886, "balance_loss_clip": 1.00188375, "balance_loss_mlp": 1.00040197, "epoch": 0.9809108672779198, "flos": 64041355589760.0, "grad_norm": 0.6868386528969249, "language_loss": 0.51750958, "learning_rate": 3.80934308995684e-09, "loss": 0.53761458, "num_input_tokens_seen": 352150000, "router_z_loss_clip": 0.02001953, "router_z_loss_mlp": 0.06201172, "step": 16315, "time_per_iteration": 3.021420955657959 }, { "auxiliary_loss_clip": 0.01048719, "auxiliary_loss_mlp": 0.01036376, "balance_loss_clip": 1.01419151, "balance_loss_mlp": 1.01465309, "epoch": 0.9809709905305877, "flos": 22779605036160.0, "grad_norm": 1.2819991976477334, "language_loss": 0.70196968, "learning_rate": 3.785354859932033e-09, "loss": 0.72282064, "num_input_tokens_seen": 352170990, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34179688, "step": 16316, "time_per_iteration": 2.4467756748199463 }, { "auxiliary_loss_clip": 0.01052262, "auxiliary_loss_mlp": 0.01039078, "balance_loss_clip": 1.01670289, "balance_loss_mlp": 1.01617038, "epoch": 0.9810311137832557, "flos": 37011913862400.0, "grad_norm": 1.8679204417348145, "language_loss": 0.56294227, "learning_rate": 3.76144232656661e-09, "loss": 0.58385563, "num_input_tokens_seen": 352195335, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.36132812, "step": 16317, "time_per_iteration": 2.4888687133789062 }, { "auxiliary_loss_clip": 0.0105089, "auxiliary_loss_mlp": 0.01037406, "balance_loss_clip": 1.01532817, "balance_loss_mlp": 1.01575983, "epoch": 0.9810912370359236, "flos": 18915313155840.0, "grad_norm": 1.5739996012222122, "language_loss": 0.74428546, "learning_rate": 3.737605490767404e-09, "loss": 0.76516843, "num_input_tokens_seen": 352214170, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3515625, "step": 16318, "time_per_iteration": 2.408968925476074 }, { "auxiliary_loss_clip": 0.01049467, "auxiliary_loss_mlp": 0.01034535, "balance_loss_clip": 1.01258898, "balance_loss_mlp": 1.01564896, "epoch": 0.9811513602885916, "flos": 18440634044160.0, "grad_norm": 2.112073027753667, "language_loss": 0.83150452, "learning_rate": 3.7138443534383555e-09, "loss": 0.85234457, "num_input_tokens_seen": 352231470, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.33789062, "step": 16319, "time_per_iteration": 2.354438066482544 }, { "auxiliary_loss_clip": 0.01006576, "auxiliary_loss_mlp": 0.01001918, "balance_loss_clip": 1.00009441, "balance_loss_mlp": 1.00030351, "epoch": 0.9812114835412595, "flos": 68055669048960.0, "grad_norm": 0.7186234152472349, "language_loss": 0.53690332, "learning_rate": 3.6901589154803014e-09, "loss": 0.55698824, "num_input_tokens_seen": 352291770, "router_z_loss_clip": 0.01818848, "router_z_loss_mlp": 0.0625, "step": 16320, "time_per_iteration": 4.356598377227783 }, { "auxiliary_loss_clip": 0.01051474, "auxiliary_loss_mlp": 0.01038612, "balance_loss_clip": 1.01528263, "balance_loss_mlp": 1.01536965, "epoch": 0.9812716067939276, "flos": 25371871107840.0, "grad_norm": 1.697664362211634, "language_loss": 0.74585223, "learning_rate": 3.6665491777914116e-09, "loss": 0.76675308, "num_input_tokens_seen": 352310735, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 16321, "time_per_iteration": 2.404913902282715 }, { "auxiliary_loss_clip": 0.01051909, "auxiliary_loss_mlp": 0.01035697, "balance_loss_clip": 1.01398873, "balance_loss_mlp": 1.01790237, "epoch": 0.9813317300465956, "flos": 22855715533440.0, "grad_norm": 1.794032487660216, "language_loss": 0.80031633, "learning_rate": 3.6430151412669698e-09, "loss": 0.82119238, "num_input_tokens_seen": 352329545, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.33984375, "step": 16322, "time_per_iteration": 2.4366705417633057 }, { "auxiliary_loss_clip": 0.0105036, "auxiliary_loss_mlp": 0.01034748, "balance_loss_clip": 1.01282549, "balance_loss_mlp": 1.01555061, "epoch": 0.9813918532992635, "flos": 23585819218560.0, "grad_norm": 1.8611670388025916, "language_loss": 0.81730074, "learning_rate": 3.619556806799595e-09, "loss": 0.83815181, "num_input_tokens_seen": 352352080, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34765625, "step": 16323, "time_per_iteration": 2.421527147293091 }, { "auxiliary_loss_clip": 0.01052486, "auxiliary_loss_mlp": 0.01037605, "balance_loss_clip": 1.01383495, "balance_loss_mlp": 1.01604724, "epoch": 0.9814519765519315, "flos": 19605406556160.0, "grad_norm": 2.20197819573889, "language_loss": 0.85819775, "learning_rate": 3.596174175278799e-09, "loss": 0.87909865, "num_input_tokens_seen": 352366455, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36328125, "step": 16324, "time_per_iteration": 3.7363414764404297 }, { "auxiliary_loss_clip": 0.01051588, "auxiliary_loss_mlp": 0.01033749, "balance_loss_clip": 1.01130199, "balance_loss_mlp": 1.01570761, "epoch": 0.9815120998045994, "flos": 33944933767680.0, "grad_norm": 1.6607739776269461, "language_loss": 0.75312394, "learning_rate": 3.5728672475909827e-09, "loss": 0.77397728, "num_input_tokens_seen": 352386090, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.359375, "step": 16325, "time_per_iteration": 2.4911468029022217 }, { "auxiliary_loss_clip": 0.01049488, "auxiliary_loss_mlp": 0.01040686, "balance_loss_clip": 1.02024114, "balance_loss_mlp": 1.01593375, "epoch": 0.9815722230572674, "flos": 20848977740160.0, "grad_norm": 1.661901723457258, "language_loss": 0.77186984, "learning_rate": 3.5496360246201063e-09, "loss": 0.79277158, "num_input_tokens_seen": 352404000, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.3359375, "step": 16326, "time_per_iteration": 2.3968594074249268 }, { "auxiliary_loss_clip": 0.01051957, "auxiliary_loss_mlp": 0.01040682, "balance_loss_clip": 1.01720965, "balance_loss_mlp": 1.01639569, "epoch": 0.9816323463099353, "flos": 22893386757120.0, "grad_norm": 1.8822983097358847, "language_loss": 0.68451631, "learning_rate": 3.5264805072470205e-09, "loss": 0.70544267, "num_input_tokens_seen": 352423540, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35546875, "step": 16327, "time_per_iteration": 2.384472608566284 }, { "auxiliary_loss_clip": 0.01055862, "auxiliary_loss_mlp": 0.01042867, "balance_loss_clip": 1.01625907, "balance_loss_mlp": 1.01776254, "epoch": 0.9816924695626034, "flos": 31538126171520.0, "grad_norm": 1.4197552973288343, "language_loss": 0.74265397, "learning_rate": 3.5034006963501337e-09, "loss": 0.76364124, "num_input_tokens_seen": 352445530, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.38085938, "step": 16328, "time_per_iteration": 2.467512845993042 }, { "auxiliary_loss_clip": 0.01056171, "auxiliary_loss_mlp": 0.01041732, "balance_loss_clip": 1.01617372, "balance_loss_mlp": 1.0168891, "epoch": 0.9817525928152713, "flos": 21505833659520.0, "grad_norm": 2.0093214443929313, "language_loss": 0.82760537, "learning_rate": 3.4803965928040802e-09, "loss": 0.84858441, "num_input_tokens_seen": 352466325, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.390625, "step": 16329, "time_per_iteration": 2.4182605743408203 }, { "auxiliary_loss_clip": 0.01053431, "auxiliary_loss_mlp": 0.01037052, "balance_loss_clip": 1.01250672, "balance_loss_mlp": 1.01616454, "epoch": 0.9818127160679393, "flos": 25549509438720.0, "grad_norm": 1.901969208113868, "language_loss": 0.77092868, "learning_rate": 3.4574681974817168e-09, "loss": 0.79183346, "num_input_tokens_seen": 352485505, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.37304688, "step": 16330, "time_per_iteration": 2.462303638458252 }, { "auxiliary_loss_clip": 0.01056039, "auxiliary_loss_mlp": 0.01044888, "balance_loss_clip": 1.01555049, "balance_loss_mlp": 1.0166446, "epoch": 0.9818728393206072, "flos": 28802401856640.0, "grad_norm": 2.267107463902833, "language_loss": 0.68386292, "learning_rate": 3.434615511252126e-09, "loss": 0.70487213, "num_input_tokens_seen": 352505360, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 0.39453125, "step": 16331, "time_per_iteration": 2.457507371902466 }, { "auxiliary_loss_clip": 0.0105052, "auxiliary_loss_mlp": 0.01035297, "balance_loss_clip": 1.01300478, "balance_loss_mlp": 1.01530433, "epoch": 0.9819329625732752, "flos": 23221116489600.0, "grad_norm": 1.6782099954030898, "language_loss": 0.74765062, "learning_rate": 3.411838534981948e-09, "loss": 0.76850879, "num_input_tokens_seen": 352524035, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3515625, "step": 16332, "time_per_iteration": 2.4048938751220703 }, { "auxiliary_loss_clip": 0.01051252, "auxiliary_loss_mlp": 0.01031945, "balance_loss_clip": 1.01140499, "balance_loss_mlp": 1.01644313, "epoch": 0.9819930858259431, "flos": 17529470714880.0, "grad_norm": 1.623199087691056, "language_loss": 0.77490056, "learning_rate": 3.389137269534936e-09, "loss": 0.7957325, "num_input_tokens_seen": 352543210, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.34765625, "step": 16333, "time_per_iteration": 2.3599488735198975 }, { "auxiliary_loss_clip": 0.01049926, "auxiliary_loss_mlp": 0.01036444, "balance_loss_clip": 1.01477146, "balance_loss_mlp": 1.01555395, "epoch": 0.9820532090786112, "flos": 12529140238080.0, "grad_norm": 2.128241156479475, "language_loss": 0.74847984, "learning_rate": 3.366511715771958e-09, "loss": 0.7693435, "num_input_tokens_seen": 352559770, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34375, "step": 16334, "time_per_iteration": 3.781158685684204 }, { "auxiliary_loss_clip": 0.01052799, "auxiliary_loss_mlp": 0.01042669, "balance_loss_clip": 1.01750374, "balance_loss_mlp": 1.01616991, "epoch": 0.9821133323312792, "flos": 18839272481280.0, "grad_norm": 1.8091069030298181, "language_loss": 0.7942704, "learning_rate": 3.3439618745509934e-09, "loss": 0.81522512, "num_input_tokens_seen": 352577690, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3671875, "step": 16335, "time_per_iteration": 2.37719988822937 }, { "auxiliary_loss_clip": 0.01053932, "auxiliary_loss_mlp": 0.01048432, "balance_loss_clip": 1.02219439, "balance_loss_mlp": 1.01653314, "epoch": 0.9821734555839471, "flos": 34822580325120.0, "grad_norm": 2.425592967701404, "language_loss": 0.659935, "learning_rate": 3.3214877467271362e-09, "loss": 0.68095863, "num_input_tokens_seen": 352598850, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.375, "step": 16336, "time_per_iteration": 2.5023622512817383 }, { "auxiliary_loss_clip": 0.01054364, "auxiliary_loss_mlp": 0.0104499, "balance_loss_clip": 1.01877618, "balance_loss_mlp": 1.0168879, "epoch": 0.9822335788366151, "flos": 17127236407680.0, "grad_norm": 1.8065807538027445, "language_loss": 0.74699605, "learning_rate": 3.299089333152372e-09, "loss": 0.76798958, "num_input_tokens_seen": 352616130, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.375, "step": 16337, "time_per_iteration": 2.4047181606292725 }, { "auxiliary_loss_clip": 0.01051256, "auxiliary_loss_mlp": 0.01035728, "balance_loss_clip": 1.01244664, "balance_loss_mlp": 1.01498878, "epoch": 0.982293702089283, "flos": 20812214211840.0, "grad_norm": 1.6305318392751946, "language_loss": 0.74286151, "learning_rate": 3.2767666346764645e-09, "loss": 0.76373136, "num_input_tokens_seen": 352636885, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36132812, "step": 16338, "time_per_iteration": 2.4317257404327393 }, { "auxiliary_loss_clip": 0.01049906, "auxiliary_loss_mlp": 0.01035609, "balance_loss_clip": 1.01102781, "balance_loss_mlp": 1.01463807, "epoch": 0.982353825341951, "flos": 24679683025920.0, "grad_norm": 1.628800820731604, "language_loss": 0.82452738, "learning_rate": 3.2545196521454045e-09, "loss": 0.84538251, "num_input_tokens_seen": 352657905, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.35351562, "step": 16339, "time_per_iteration": 2.4313089847564697 }, { "auxiliary_loss_clip": 0.01049615, "auxiliary_loss_mlp": 0.01037315, "balance_loss_clip": 1.01598871, "balance_loss_mlp": 1.01561904, "epoch": 0.982413948594619, "flos": 20849431587840.0, "grad_norm": 1.9065594793568583, "language_loss": 0.63436842, "learning_rate": 3.232348386403405e-09, "loss": 0.65523779, "num_input_tokens_seen": 352676320, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.33984375, "step": 16340, "time_per_iteration": 2.4013330936431885 }, { "auxiliary_loss_clip": 0.0105317, "auxiliary_loss_mlp": 0.01037225, "balance_loss_clip": 1.01270425, "balance_loss_mlp": 1.0162518, "epoch": 0.982474071847287, "flos": 15376481769600.0, "grad_norm": 2.0251203735115415, "language_loss": 0.86770135, "learning_rate": 3.2102528382904613e-09, "loss": 0.88860536, "num_input_tokens_seen": 352692665, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.36914062, "step": 16341, "time_per_iteration": 2.36991024017334 }, { "auxiliary_loss_clip": 0.01048899, "auxiliary_loss_mlp": 0.01033115, "balance_loss_clip": 1.01275373, "balance_loss_mlp": 1.01584196, "epoch": 0.9825341950999549, "flos": 23773442198400.0, "grad_norm": 1.943545147746063, "language_loss": 0.67592835, "learning_rate": 3.188233008645014e-09, "loss": 0.6967485, "num_input_tokens_seen": 352716130, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.33007812, "step": 16342, "time_per_iteration": 2.457547426223755 }, { "auxiliary_loss_clip": 0.01052202, "auxiliary_loss_mlp": 0.01036312, "balance_loss_clip": 1.01356721, "balance_loss_mlp": 1.01610136, "epoch": 0.9825943183526229, "flos": 22745215480320.0, "grad_norm": 1.528696695250381, "language_loss": 0.77857774, "learning_rate": 3.16628889830195e-09, "loss": 0.79946291, "num_input_tokens_seen": 352734705, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36132812, "step": 16343, "time_per_iteration": 2.409417152404785 }, { "auxiliary_loss_clip": 0.01051048, "auxiliary_loss_mlp": 0.01035729, "balance_loss_clip": 1.01381874, "balance_loss_mlp": 1.01587558, "epoch": 0.9826544416052908, "flos": 27708712606080.0, "grad_norm": 1.5934468345004658, "language_loss": 0.75953948, "learning_rate": 3.1444205080932707e-09, "loss": 0.78040731, "num_input_tokens_seen": 352756225, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.3515625, "step": 16344, "time_per_iteration": 2.4772963523864746 }, { "auxiliary_loss_clip": 0.01051949, "auxiliary_loss_mlp": 0.01042672, "balance_loss_clip": 1.01972413, "balance_loss_mlp": 1.01660037, "epoch": 0.9827145648579588, "flos": 26940483849600.0, "grad_norm": 2.098973843192174, "language_loss": 0.6844663, "learning_rate": 3.122627838848313e-09, "loss": 0.70541251, "num_input_tokens_seen": 352776210, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35351562, "step": 16345, "time_per_iteration": 2.463862180709839 }, { "auxiliary_loss_clip": 0.01048697, "auxiliary_loss_mlp": 0.01032742, "balance_loss_clip": 1.0129652, "balance_loss_mlp": 1.01544106, "epoch": 0.9827746881106267, "flos": 21865613886720.0, "grad_norm": 1.4070859122924622, "language_loss": 0.79759431, "learning_rate": 3.1009108913933045e-09, "loss": 0.81840873, "num_input_tokens_seen": 352795455, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.33203125, "step": 16346, "time_per_iteration": 2.371816396713257 }, { "auxiliary_loss_clip": 0.01053324, "auxiliary_loss_mlp": 0.01044732, "balance_loss_clip": 1.02081871, "balance_loss_mlp": 1.01584303, "epoch": 0.9828348113632948, "flos": 20849710878720.0, "grad_norm": 2.010777699848973, "language_loss": 0.76546705, "learning_rate": 3.079269666552031e-09, "loss": 0.78644758, "num_input_tokens_seen": 352812895, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.375, "step": 16347, "time_per_iteration": 2.388453960418701 }, { "auxiliary_loss_clip": 0.01049945, "auxiliary_loss_mlp": 0.0104172, "balance_loss_clip": 1.01997566, "balance_loss_mlp": 1.01553833, "epoch": 0.9828949346159628, "flos": 34567784156160.0, "grad_norm": 1.5649975228841562, "language_loss": 0.67696249, "learning_rate": 3.0577041651449474e-09, "loss": 0.69787908, "num_input_tokens_seen": 352835470, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34375, "step": 16348, "time_per_iteration": 2.4815850257873535 }, { "auxiliary_loss_clip": 0.01051285, "auxiliary_loss_mlp": 0.01036634, "balance_loss_clip": 1.01287544, "balance_loss_mlp": 1.01590323, "epoch": 0.9829550578686307, "flos": 24456448592640.0, "grad_norm": 1.7787945419996758, "language_loss": 0.70110482, "learning_rate": 3.0362143879898437e-09, "loss": 0.72198403, "num_input_tokens_seen": 352854295, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35351562, "step": 16349, "time_per_iteration": 2.401388645172119 }, { "auxiliary_loss_clip": 0.01047332, "auxiliary_loss_mlp": 0.01031449, "balance_loss_clip": 1.01076579, "balance_loss_mlp": 1.01488185, "epoch": 0.9830151811212987, "flos": 16909133944320.0, "grad_norm": 1.8976733584348868, "language_loss": 0.76967633, "learning_rate": 3.0148003359014018e-09, "loss": 0.79046416, "num_input_tokens_seen": 352869695, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.32421875, "step": 16350, "time_per_iteration": 2.318695545196533 }, { "auxiliary_loss_clip": 0.01052541, "auxiliary_loss_mlp": 0.01036651, "balance_loss_clip": 1.01292872, "balance_loss_mlp": 1.01647234, "epoch": 0.9830753043739666, "flos": 21287242437120.0, "grad_norm": 1.9837902232221079, "language_loss": 0.85340846, "learning_rate": 2.9934620096920826e-09, "loss": 0.87430036, "num_input_tokens_seen": 352887430, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.359375, "step": 16351, "time_per_iteration": 2.428532600402832 }, { "auxiliary_loss_clip": 0.01051862, "auxiliary_loss_mlp": 0.01033266, "balance_loss_clip": 1.01072359, "balance_loss_mlp": 1.01640344, "epoch": 0.9831354276266346, "flos": 31722153281280.0, "grad_norm": 1.7998351366716587, "language_loss": 0.69635701, "learning_rate": 2.972199410170795e-09, "loss": 0.71720827, "num_input_tokens_seen": 352907555, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35351562, "step": 16352, "time_per_iteration": 3.73988938331604 }, { "auxiliary_loss_clip": 0.01049963, "auxiliary_loss_mlp": 0.01041145, "balance_loss_clip": 1.01881731, "balance_loss_mlp": 1.01599264, "epoch": 0.9831955508793025, "flos": 21617904723840.0, "grad_norm": 1.3999214156838855, "language_loss": 0.66915941, "learning_rate": 2.951012538143782e-09, "loss": 0.69007051, "num_input_tokens_seen": 352928670, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.33984375, "step": 16353, "time_per_iteration": 2.402040481567383 }, { "auxiliary_loss_clip": 0.01048279, "auxiliary_loss_mlp": 0.01037541, "balance_loss_clip": 1.01596367, "balance_loss_mlp": 1.014691, "epoch": 0.9832556741319706, "flos": 22967053459200.0, "grad_norm": 1.587405341781174, "language_loss": 0.75480109, "learning_rate": 2.9299013944144025e-09, "loss": 0.77565926, "num_input_tokens_seen": 352948345, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.3359375, "step": 16354, "time_per_iteration": 2.3968896865844727 }, { "auxiliary_loss_clip": 0.01051115, "auxiliary_loss_mlp": 0.01036232, "balance_loss_clip": 1.01446426, "balance_loss_mlp": 1.01566374, "epoch": 0.9833157973846385, "flos": 21322853890560.0, "grad_norm": 1.9923948189419034, "language_loss": 0.78895617, "learning_rate": 2.9088659797835702e-09, "loss": 0.80982959, "num_input_tokens_seen": 352967250, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.35546875, "step": 16355, "time_per_iteration": 2.390308141708374 }, { "auxiliary_loss_clip": 0.01051042, "auxiliary_loss_mlp": 0.01036099, "balance_loss_clip": 1.01402175, "balance_loss_mlp": 1.01607752, "epoch": 0.9833759206373065, "flos": 21067673696640.0, "grad_norm": 5.691180774739733, "language_loss": 0.74567938, "learning_rate": 2.8879062950484256e-09, "loss": 0.76655078, "num_input_tokens_seen": 352984725, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34960938, "step": 16356, "time_per_iteration": 2.362691879272461 }, { "auxiliary_loss_clip": 0.01050198, "auxiliary_loss_mlp": 0.01037672, "balance_loss_clip": 1.01431906, "balance_loss_mlp": 1.01598334, "epoch": 0.9834360438899744, "flos": 18696337908480.0, "grad_norm": 1.6630162489642106, "language_loss": 0.77205515, "learning_rate": 2.8670223410041104e-09, "loss": 0.79293394, "num_input_tokens_seen": 353003480, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.34179688, "step": 16357, "time_per_iteration": 2.366004467010498 }, { "auxiliary_loss_clip": 0.01050562, "auxiliary_loss_mlp": 0.01033607, "balance_loss_clip": 1.01125503, "balance_loss_mlp": 1.01585782, "epoch": 0.9834961671426424, "flos": 21104192845440.0, "grad_norm": 2.7101708347111306, "language_loss": 0.81360042, "learning_rate": 2.846214118442436e-09, "loss": 0.83444208, "num_input_tokens_seen": 353021425, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34765625, "step": 16358, "time_per_iteration": 2.3793528079986572 }, { "auxiliary_loss_clip": 0.01049981, "auxiliary_loss_mlp": 0.01033538, "balance_loss_clip": 1.01194942, "balance_loss_mlp": 1.01493049, "epoch": 0.9835562903953103, "flos": 26686420819200.0, "grad_norm": 2.246732026108116, "language_loss": 0.69060469, "learning_rate": 2.8254816281523263e-09, "loss": 0.71143985, "num_input_tokens_seen": 353039870, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.3515625, "step": 16359, "time_per_iteration": 3.789628505706787 }, { "auxiliary_loss_clip": 0.01049272, "auxiliary_loss_mlp": 0.01032491, "balance_loss_clip": 1.01056838, "balance_loss_mlp": 1.01567852, "epoch": 0.9836164136479784, "flos": 22089092699520.0, "grad_norm": 1.5189103504553005, "language_loss": 0.70524323, "learning_rate": 2.804824870920264e-09, "loss": 0.72606087, "num_input_tokens_seen": 353059750, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.3359375, "step": 16360, "time_per_iteration": 2.368866443634033 }, { "auxiliary_loss_clip": 0.01052786, "auxiliary_loss_mlp": 0.01042753, "balance_loss_clip": 1.02009153, "balance_loss_mlp": 1.01654315, "epoch": 0.9836765369006463, "flos": 23877274181760.0, "grad_norm": 2.1360631863796304, "language_loss": 0.85343736, "learning_rate": 2.7842438475293996e-09, "loss": 0.87439275, "num_input_tokens_seen": 353079940, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36328125, "step": 16361, "time_per_iteration": 2.395916700363159 }, { "auxiliary_loss_clip": 0.01051072, "auxiliary_loss_mlp": 0.01036195, "balance_loss_clip": 1.01395011, "balance_loss_mlp": 1.01601064, "epoch": 0.9837366601533143, "flos": 25843931867520.0, "grad_norm": 1.7913811570868579, "language_loss": 0.76969475, "learning_rate": 2.76373855876022e-09, "loss": 0.7905674, "num_input_tokens_seen": 353099990, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3515625, "step": 16362, "time_per_iteration": 2.3901352882385254 }, { "auxiliary_loss_clip": 0.01051585, "auxiliary_loss_mlp": 0.01035513, "balance_loss_clip": 1.01342356, "balance_loss_mlp": 1.01649702, "epoch": 0.9837967834059823, "flos": 21357033978240.0, "grad_norm": 1.6598741600626963, "language_loss": 0.72194862, "learning_rate": 2.7433090053901043e-09, "loss": 0.74281961, "num_input_tokens_seen": 353118710, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3515625, "step": 16363, "time_per_iteration": 2.387979745864868 }, { "auxiliary_loss_clip": 0.01050433, "auxiliary_loss_mlp": 0.01034716, "balance_loss_clip": 1.01429558, "balance_loss_mlp": 1.01623356, "epoch": 0.9838569066586502, "flos": 18514789505280.0, "grad_norm": 1.6462241306423049, "language_loss": 0.6392808, "learning_rate": 2.7229551881937653e-09, "loss": 0.66013223, "num_input_tokens_seen": 353136415, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.34179688, "step": 16364, "time_per_iteration": 3.7885820865631104 }, { "auxiliary_loss_clip": 0.01052301, "auxiliary_loss_mlp": 0.01041543, "balance_loss_clip": 1.01985943, "balance_loss_mlp": 1.01662302, "epoch": 0.9839170299113182, "flos": 22451386544640.0, "grad_norm": 1.6526162809597875, "language_loss": 0.75870448, "learning_rate": 2.702677107943252e-09, "loss": 0.77964294, "num_input_tokens_seen": 353154650, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.35546875, "step": 16365, "time_per_iteration": 2.3752334117889404 }, { "auxiliary_loss_clip": 0.01049171, "auxiliary_loss_mlp": 0.01035501, "balance_loss_clip": 1.01313686, "balance_loss_mlp": 1.01480484, "epoch": 0.9839771531639862, "flos": 27891063970560.0, "grad_norm": 1.925007329654407, "language_loss": 0.77271682, "learning_rate": 2.6824747654072832e-09, "loss": 0.79356349, "num_input_tokens_seen": 353174065, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34375, "step": 16366, "time_per_iteration": 2.4409008026123047 }, { "auxiliary_loss_clip": 0.0104982, "auxiliary_loss_mlp": 0.01032192, "balance_loss_clip": 1.01020932, "balance_loss_mlp": 1.01564491, "epoch": 0.9840372764166542, "flos": 28213417353600.0, "grad_norm": 1.5772827560837581, "language_loss": 0.77588344, "learning_rate": 2.662348161352357e-09, "loss": 0.79670358, "num_input_tokens_seen": 353193560, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34179688, "step": 16367, "time_per_iteration": 2.4201433658599854 }, { "auxiliary_loss_clip": 0.01051186, "auxiliary_loss_mlp": 0.01034337, "balance_loss_clip": 1.01090002, "balance_loss_mlp": 1.01638412, "epoch": 0.9840973996693221, "flos": 23402874360960.0, "grad_norm": 1.4342141464422853, "language_loss": 0.61606389, "learning_rate": 2.642297296540974e-09, "loss": 0.63691914, "num_input_tokens_seen": 353213525, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.34765625, "step": 16368, "time_per_iteration": 2.3948118686676025 }, { "auxiliary_loss_clip": 0.01047682, "auxiliary_loss_mlp": 0.01035254, "balance_loss_clip": 1.01494074, "balance_loss_mlp": 1.01528692, "epoch": 0.9841575229219901, "flos": 21394844847360.0, "grad_norm": 1.5397183860523675, "language_loss": 0.66948199, "learning_rate": 2.6223221717340816e-09, "loss": 0.69031137, "num_input_tokens_seen": 353234000, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.32421875, "step": 16369, "time_per_iteration": 2.387934684753418 }, { "auxiliary_loss_clip": 0.01052292, "auxiliary_loss_mlp": 0.01038852, "balance_loss_clip": 1.01497483, "balance_loss_mlp": 1.01544714, "epoch": 0.984217646174658, "flos": 24462872282880.0, "grad_norm": 1.6489402260598014, "language_loss": 0.69834191, "learning_rate": 2.6024227876886295e-09, "loss": 0.7192533, "num_input_tokens_seen": 353254940, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3671875, "step": 16370, "time_per_iteration": 2.391291379928589 }, { "auxiliary_loss_clip": 0.010523, "auxiliary_loss_mlp": 0.01036726, "balance_loss_clip": 1.01388597, "balance_loss_mlp": 1.0158987, "epoch": 0.984277769427326, "flos": 16434140630400.0, "grad_norm": 1.6945437524693494, "language_loss": 0.74585634, "learning_rate": 2.582599145159792e-09, "loss": 0.76674664, "num_input_tokens_seen": 353272590, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36328125, "step": 16371, "time_per_iteration": 2.3580658435821533 }, { "auxiliary_loss_clip": 0.0100671, "auxiliary_loss_mlp": 0.01003283, "balance_loss_clip": 1.00120878, "balance_loss_mlp": 1.00046444, "epoch": 0.9843378926799939, "flos": 64527136513920.0, "grad_norm": 0.7804969087272537, "language_loss": 0.65293753, "learning_rate": 2.562851244898745e-09, "loss": 0.67303753, "num_input_tokens_seen": 353334380, "router_z_loss_clip": 0.02075195, "router_z_loss_mlp": 0.0625, "step": 16372, "time_per_iteration": 3.062891960144043 }, { "auxiliary_loss_clip": 0.0104958, "auxiliary_loss_mlp": 0.01032441, "balance_loss_clip": 1.01163876, "balance_loss_mlp": 1.0155468, "epoch": 0.984398015932662, "flos": 17381892931200.0, "grad_norm": 3.197183451187209, "language_loss": 0.71823049, "learning_rate": 2.5431790876544456e-09, "loss": 0.73905069, "num_input_tokens_seen": 353351640, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.33984375, "step": 16373, "time_per_iteration": 3.8125150203704834 }, { "auxiliary_loss_clip": 0.01051274, "auxiliary_loss_mlp": 0.01035305, "balance_loss_clip": 1.01388347, "balance_loss_mlp": 1.01677942, "epoch": 0.9844581391853299, "flos": 23877937497600.0, "grad_norm": 1.8119811309082339, "language_loss": 0.82228935, "learning_rate": 2.523582674173186e-09, "loss": 0.84315515, "num_input_tokens_seen": 353372555, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34570312, "step": 16374, "time_per_iteration": 2.4055557250976562 }, { "auxiliary_loss_clip": 0.01053227, "auxiliary_loss_mlp": 0.01038252, "balance_loss_clip": 1.01668704, "balance_loss_mlp": 1.01687217, "epoch": 0.9845182624379979, "flos": 19864322265600.0, "grad_norm": 1.6948332393450802, "language_loss": 0.70059633, "learning_rate": 2.504062005197927e-09, "loss": 0.72151113, "num_input_tokens_seen": 353391385, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.36328125, "step": 16375, "time_per_iteration": 2.3628551959991455 }, { "auxiliary_loss_clip": 0.01053874, "auxiliary_loss_mlp": 0.0104035, "balance_loss_clip": 1.01545906, "balance_loss_mlp": 1.01687169, "epoch": 0.9845783856906659, "flos": 28253462549760.0, "grad_norm": 1.8038506376845742, "language_loss": 0.81753165, "learning_rate": 2.484617081468521e-09, "loss": 0.83847392, "num_input_tokens_seen": 353411630, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.36914062, "step": 16376, "time_per_iteration": 2.448657751083374 }, { "auxiliary_loss_clip": 0.01049874, "auxiliary_loss_mlp": 0.01033528, "balance_loss_clip": 1.01137865, "balance_loss_mlp": 1.01557243, "epoch": 0.9846385089433338, "flos": 28327164163200.0, "grad_norm": 2.7224078595634276, "language_loss": 0.63177055, "learning_rate": 2.4652479037228224e-09, "loss": 0.65260458, "num_input_tokens_seen": 353432895, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34375, "step": 16377, "time_per_iteration": 2.425400733947754 }, { "auxiliary_loss_clip": 0.01052046, "auxiliary_loss_mlp": 0.0103675, "balance_loss_clip": 1.01286077, "balance_loss_mlp": 1.01589584, "epoch": 0.9846986321960018, "flos": 24315608701440.0, "grad_norm": 1.923162811279243, "language_loss": 0.74360383, "learning_rate": 2.445954472695133e-09, "loss": 0.7644918, "num_input_tokens_seen": 353454195, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.36132812, "step": 16378, "time_per_iteration": 2.4479565620422363 }, { "auxiliary_loss_clip": 0.01050197, "auxiliary_loss_mlp": 0.01037946, "balance_loss_clip": 1.01533198, "balance_loss_mlp": 1.01548088, "epoch": 0.9847587554486698, "flos": 27270762111360.0, "grad_norm": 1.678696161280776, "language_loss": 0.71934736, "learning_rate": 2.426736789116868e-09, "loss": 0.74022883, "num_input_tokens_seen": 353475125, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34765625, "step": 16379, "time_per_iteration": 2.4347543716430664 }, { "auxiliary_loss_clip": 0.01051446, "auxiliary_loss_mlp": 0.01039855, "balance_loss_clip": 1.01579881, "balance_loss_mlp": 1.01512051, "epoch": 0.9848188787013378, "flos": 16541847774720.0, "grad_norm": 1.8959567621396247, "language_loss": 0.69682419, "learning_rate": 2.407594853716999e-09, "loss": 0.71773726, "num_input_tokens_seen": 353493265, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36328125, "step": 16380, "time_per_iteration": 2.3840978145599365 }, { "auxiliary_loss_clip": 0.01052682, "auxiliary_loss_mlp": 0.01038774, "balance_loss_clip": 1.01669669, "balance_loss_mlp": 1.01606941, "epoch": 0.9848790019540057, "flos": 20192610579840.0, "grad_norm": 2.008953450416004, "language_loss": 0.79923964, "learning_rate": 2.38852866722139e-09, "loss": 0.82015419, "num_input_tokens_seen": 353511650, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.3671875, "step": 16381, "time_per_iteration": 2.4361753463745117 }, { "auxiliary_loss_clip": 0.01052848, "auxiliary_loss_mlp": 0.01039896, "balance_loss_clip": 1.01704383, "balance_loss_mlp": 1.01651645, "epoch": 0.9849391252066737, "flos": 28258385051520.0, "grad_norm": 1.4061687513289898, "language_loss": 0.83037704, "learning_rate": 2.3695382303527965e-09, "loss": 0.85130447, "num_input_tokens_seen": 353534035, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36328125, "step": 16382, "time_per_iteration": 2.4688243865966797 }, { "auxiliary_loss_clip": 0.01051979, "auxiliary_loss_mlp": 0.01038965, "balance_loss_clip": 1.0148133, "balance_loss_mlp": 1.01506865, "epoch": 0.9849992484593416, "flos": 22453865251200.0, "grad_norm": 2.0619820185821904, "language_loss": 0.74925745, "learning_rate": 2.3506235438315316e-09, "loss": 0.77016687, "num_input_tokens_seen": 353549950, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.36914062, "step": 16383, "time_per_iteration": 2.3873274326324463 }, { "auxiliary_loss_clip": 0.0105189, "auxiliary_loss_mlp": 0.01033746, "balance_loss_clip": 1.01075017, "balance_loss_mlp": 1.01647317, "epoch": 0.9850593717120096, "flos": 34495723376640.0, "grad_norm": 1.524843689590293, "language_loss": 0.67417383, "learning_rate": 2.3317846083750203e-09, "loss": 0.69503021, "num_input_tokens_seen": 353573745, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35546875, "step": 16384, "time_per_iteration": 2.5061910152435303 }, { "auxiliary_loss_clip": 0.01055787, "auxiliary_loss_mlp": 0.01042855, "balance_loss_clip": 1.01678395, "balance_loss_mlp": 1.01828146, "epoch": 0.9851194949646775, "flos": 38835741709440.0, "grad_norm": 1.7839418461030894, "language_loss": 0.71569312, "learning_rate": 2.313021424697359e-09, "loss": 0.73667955, "num_input_tokens_seen": 353595335, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.375, "step": 16385, "time_per_iteration": 2.5588550567626953 }, { "auxiliary_loss_clip": 0.01054014, "auxiliary_loss_mlp": 0.01042238, "balance_loss_clip": 1.01878941, "balance_loss_mlp": 1.01832306, "epoch": 0.9851796182173456, "flos": 17711472965760.0, "grad_norm": 1.860387565497663, "language_loss": 0.82609993, "learning_rate": 2.294333993509978e-09, "loss": 0.84706247, "num_input_tokens_seen": 353614270, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35546875, "step": 16386, "time_per_iteration": 2.3769240379333496 }, { "auxiliary_loss_clip": 0.01053968, "auxiliary_loss_mlp": 0.01040287, "balance_loss_clip": 1.01698184, "balance_loss_mlp": 1.01737571, "epoch": 0.9852397414700135, "flos": 27453078564480.0, "grad_norm": 1.9674754153159077, "language_loss": 0.69001901, "learning_rate": 2.2757223155216442e-09, "loss": 0.71096158, "num_input_tokens_seen": 353634900, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.36523438, "step": 16387, "time_per_iteration": 2.455188751220703 }, { "auxiliary_loss_clip": 0.01048103, "auxiliary_loss_mlp": 0.01032565, "balance_loss_clip": 1.01252627, "balance_loss_mlp": 1.01465118, "epoch": 0.9852998647226815, "flos": 18295709523840.0, "grad_norm": 1.8928150182760208, "language_loss": 0.75666195, "learning_rate": 2.257186391438237e-09, "loss": 0.77746862, "num_input_tokens_seen": 353652890, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.33398438, "step": 16388, "time_per_iteration": 2.3488147258758545 }, { "auxiliary_loss_clip": 0.01049533, "auxiliary_loss_mlp": 0.01038254, "balance_loss_clip": 1.01430464, "balance_loss_mlp": 1.01448452, "epoch": 0.9853599879753495, "flos": 19641646414080.0, "grad_norm": 1.8982439609514365, "language_loss": 0.83575523, "learning_rate": 2.238726221962528e-09, "loss": 0.85663307, "num_input_tokens_seen": 353671295, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.3515625, "step": 16389, "time_per_iteration": 2.3926444053649902 }, { "auxiliary_loss_clip": 0.01050527, "auxiliary_loss_mlp": 0.01032021, "balance_loss_clip": 1.0091095, "balance_loss_mlp": 1.01568162, "epoch": 0.9854201112280174, "flos": 23840824855680.0, "grad_norm": 1.8106626462497077, "language_loss": 0.68398798, "learning_rate": 2.2203418077946234e-09, "loss": 0.70481348, "num_input_tokens_seen": 353690560, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.34765625, "step": 16390, "time_per_iteration": 2.405388355255127 }, { "auxiliary_loss_clip": 0.01050998, "auxiliary_loss_mlp": 0.01039756, "balance_loss_clip": 1.01652217, "balance_loss_mlp": 1.01557934, "epoch": 0.9854802344806854, "flos": 30079280344320.0, "grad_norm": 1.5876857948526333, "language_loss": 0.77615261, "learning_rate": 2.2020331496312994e-09, "loss": 0.79706013, "num_input_tokens_seen": 353710660, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35546875, "step": 16391, "time_per_iteration": 3.7885658740997314 }, { "auxiliary_loss_clip": 0.01047593, "auxiliary_loss_mlp": 0.01035564, "balance_loss_clip": 1.01595449, "balance_loss_mlp": 1.01499617, "epoch": 0.9855403577333534, "flos": 21906357310080.0, "grad_norm": 2.321728727887405, "language_loss": 0.69515967, "learning_rate": 2.1838002481673333e-09, "loss": 0.71599126, "num_input_tokens_seen": 353730440, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.32421875, "step": 16392, "time_per_iteration": 2.398983955383301 }, { "auxiliary_loss_clip": 0.01053378, "auxiliary_loss_mlp": 0.01037602, "balance_loss_clip": 1.01195979, "balance_loss_mlp": 1.01611471, "epoch": 0.9856004809860214, "flos": 15412896184320.0, "grad_norm": 1.8655368722873553, "language_loss": 0.56875718, "learning_rate": 2.1656431040937286e-09, "loss": 0.58966696, "num_input_tokens_seen": 353748360, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.37304688, "step": 16393, "time_per_iteration": 2.370502233505249 }, { "auxiliary_loss_clip": 0.01055312, "auxiliary_loss_mlp": 0.01038757, "balance_loss_clip": 1.01433086, "balance_loss_mlp": 1.01722121, "epoch": 0.9856606042386893, "flos": 13652610744960.0, "grad_norm": 2.3633803376390476, "language_loss": 0.80490965, "learning_rate": 2.1475617180990444e-09, "loss": 0.82585037, "num_input_tokens_seen": 353760880, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.38085938, "step": 16394, "time_per_iteration": 2.3431262969970703 }, { "auxiliary_loss_clip": 0.01053286, "auxiliary_loss_mlp": 0.01037655, "balance_loss_clip": 1.01357448, "balance_loss_mlp": 1.01579428, "epoch": 0.9857207274913573, "flos": 23477972428800.0, "grad_norm": 1.4868486526828084, "language_loss": 0.76620162, "learning_rate": 2.129556090869178e-09, "loss": 0.78711104, "num_input_tokens_seen": 353782255, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.375, "step": 16395, "time_per_iteration": 2.433015823364258 }, { "auxiliary_loss_clip": 0.01049485, "auxiliary_loss_mlp": 0.01037065, "balance_loss_clip": 1.01479673, "balance_loss_mlp": 1.01548529, "epoch": 0.9857808507440252, "flos": 21064531674240.0, "grad_norm": 2.0375313154086925, "language_loss": 0.76209235, "learning_rate": 2.1116262230866933e-09, "loss": 0.78295791, "num_input_tokens_seen": 353803580, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.33984375, "step": 16396, "time_per_iteration": 2.428804636001587 }, { "auxiliary_loss_clip": 0.01049989, "auxiliary_loss_mlp": 0.01029676, "balance_loss_clip": 1.00926781, "balance_loss_mlp": 1.01610541, "epoch": 0.9858409739966932, "flos": 25300194353280.0, "grad_norm": 1.4852655977870333, "language_loss": 0.72166562, "learning_rate": 2.0937721154317133e-09, "loss": 0.74246228, "num_input_tokens_seen": 353824200, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.33984375, "step": 16397, "time_per_iteration": 2.418898582458496 }, { "auxiliary_loss_clip": 0.01049122, "auxiliary_loss_mlp": 0.01029907, "balance_loss_clip": 1.00906897, "balance_loss_mlp": 1.01634789, "epoch": 0.9859010972493611, "flos": 20557697333760.0, "grad_norm": 1.6638806611965085, "language_loss": 0.72773969, "learning_rate": 2.0759937685810304e-09, "loss": 0.74852991, "num_input_tokens_seen": 353843350, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.328125, "step": 16398, "time_per_iteration": 2.406045436859131 }, { "auxiliary_loss_clip": 0.0105116, "auxiliary_loss_mlp": 0.01035892, "balance_loss_clip": 1.01323032, "balance_loss_mlp": 1.01632845, "epoch": 0.9859612205020292, "flos": 24753803575680.0, "grad_norm": 1.3566524389275658, "language_loss": 0.74671513, "learning_rate": 2.058291183208771e-09, "loss": 0.76758564, "num_input_tokens_seen": 353864520, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34765625, "step": 16399, "time_per_iteration": 3.8361988067626953 }, { "auxiliary_loss_clip": 0.01051484, "auxiliary_loss_mlp": 0.01037962, "balance_loss_clip": 1.01499045, "balance_loss_mlp": 1.01630771, "epoch": 0.9860213437546971, "flos": 21104786338560.0, "grad_norm": 2.1651239772926756, "language_loss": 0.58111942, "learning_rate": 2.0406643599863993e-09, "loss": 0.60201395, "num_input_tokens_seen": 353882240, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.3515625, "step": 16400, "time_per_iteration": 2.3698480129241943 }, { "auxiliary_loss_clip": 0.01055265, "auxiliary_loss_mlp": 0.01042616, "balance_loss_clip": 1.0170095, "balance_loss_mlp": 1.01719236, "epoch": 0.9860814670073651, "flos": 19135056453120.0, "grad_norm": 2.197781445422773, "language_loss": 0.81264442, "learning_rate": 2.023113299582491e-09, "loss": 0.83362329, "num_input_tokens_seen": 353901590, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38085938, "step": 16401, "time_per_iteration": 2.3640732765197754 }, { "auxiliary_loss_clip": 0.01051328, "auxiliary_loss_mlp": 0.01032703, "balance_loss_clip": 1.00979114, "balance_loss_mlp": 1.01686144, "epoch": 0.9861415902600331, "flos": 17236130538240.0, "grad_norm": 1.7605566684864031, "language_loss": 0.79322815, "learning_rate": 2.005638002662069e-09, "loss": 0.81406844, "num_input_tokens_seen": 353918785, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.34375, "step": 16402, "time_per_iteration": 2.416285991668701 }, { "auxiliary_loss_clip": 0.01053724, "auxiliary_loss_mlp": 0.01041415, "balance_loss_clip": 1.01717949, "balance_loss_mlp": 1.01693904, "epoch": 0.986201713512701, "flos": 27781331967360.0, "grad_norm": 2.1815008010941126, "language_loss": 0.71726942, "learning_rate": 1.9882384698881596e-09, "loss": 0.73822081, "num_input_tokens_seen": 353940390, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.3671875, "step": 16403, "time_per_iteration": 2.4480297565460205 }, { "auxiliary_loss_clip": 0.01049145, "auxiliary_loss_mlp": 0.01035809, "balance_loss_clip": 1.0139817, "balance_loss_mlp": 1.0152204, "epoch": 0.986261836765369, "flos": 28729154090880.0, "grad_norm": 1.6684205997386472, "language_loss": 0.75671315, "learning_rate": 1.9709147019204566e-09, "loss": 0.77756268, "num_input_tokens_seen": 353962180, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.33984375, "step": 16404, "time_per_iteration": 3.792034387588501 }, { "auxiliary_loss_clip": 0.01051799, "auxiliary_loss_mlp": 0.01038986, "balance_loss_clip": 1.01659822, "balance_loss_mlp": 1.01649475, "epoch": 0.986321960018037, "flos": 34312045380480.0, "grad_norm": 1.7615518714837703, "language_loss": 0.71007299, "learning_rate": 1.953666699415768e-09, "loss": 0.73098087, "num_input_tokens_seen": 353984305, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35351562, "step": 16405, "time_per_iteration": 2.4825515747070312 }, { "auxiliary_loss_clip": 0.01050432, "auxiliary_loss_mlp": 0.01042139, "balance_loss_clip": 1.02046633, "balance_loss_mlp": 1.01687133, "epoch": 0.986382083270705, "flos": 25188646959360.0, "grad_norm": 1.7321461608844917, "language_loss": 0.70779246, "learning_rate": 1.93649446302846e-09, "loss": 0.72871816, "num_input_tokens_seen": 354004495, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.3359375, "step": 16406, "time_per_iteration": 2.39078950881958 }, { "auxiliary_loss_clip": 0.01050277, "auxiliary_loss_mlp": 0.01036203, "balance_loss_clip": 1.01288557, "balance_loss_mlp": 1.01596785, "epoch": 0.9864422065233729, "flos": 11025396535680.0, "grad_norm": 2.8071377339465236, "language_loss": 0.74977183, "learning_rate": 1.9193979934095663e-09, "loss": 0.77063656, "num_input_tokens_seen": 354015985, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.34375, "step": 16407, "time_per_iteration": 2.3261735439300537 }, { "auxiliary_loss_clip": 0.01050107, "auxiliary_loss_mlp": 0.01036702, "balance_loss_clip": 1.0141952, "balance_loss_mlp": 1.01539922, "epoch": 0.9865023297760409, "flos": 16544640683520.0, "grad_norm": 1.8406105648914812, "language_loss": 0.78635061, "learning_rate": 1.9023772912072357e-09, "loss": 0.80721867, "num_input_tokens_seen": 354033260, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34765625, "step": 16408, "time_per_iteration": 2.339364528656006 }, { "auxiliary_loss_clip": 0.01053087, "auxiliary_loss_mlp": 0.01033783, "balance_loss_clip": 1.00822437, "balance_loss_mlp": 1.01623058, "epoch": 0.9865624530287088, "flos": 18879178032000.0, "grad_norm": 1.7591575208731958, "language_loss": 0.68894213, "learning_rate": 1.8854323570669515e-09, "loss": 0.70981085, "num_input_tokens_seen": 354052825, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.36914062, "step": 16409, "time_per_iteration": 2.377699136734009 }, { "auxiliary_loss_clip": 0.01006955, "auxiliary_loss_mlp": 0.01006433, "balance_loss_clip": 1.00457323, "balance_loss_mlp": 1.00073314, "epoch": 0.9866225762813768, "flos": 68884786949760.0, "grad_norm": 0.810480237239613, "language_loss": 0.61117274, "learning_rate": 1.8685631916313118e-09, "loss": 0.63130659, "num_input_tokens_seen": 354113920, "router_z_loss_clip": 0.01855469, "router_z_loss_mlp": 0.06225586, "step": 16410, "time_per_iteration": 3.0539097785949707 }, { "auxiliary_loss_clip": 0.01052222, "auxiliary_loss_mlp": 0.0103538, "balance_loss_clip": 1.01188374, "balance_loss_mlp": 1.01587903, "epoch": 0.9866826995340447, "flos": 29021830951680.0, "grad_norm": 2.1481513751438226, "language_loss": 0.67338413, "learning_rate": 1.8517697955400258e-09, "loss": 0.69426012, "num_input_tokens_seen": 354134210, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.36328125, "step": 16411, "time_per_iteration": 2.4786083698272705 }, { "auxiliary_loss_clip": 0.01006738, "auxiliary_loss_mlp": 0.01002858, "balance_loss_clip": 1.00090253, "balance_loss_mlp": 1.00045002, "epoch": 0.9867428227867128, "flos": 65373116601600.0, "grad_norm": 0.7300893932992246, "language_loss": 0.56260908, "learning_rate": 1.8350521694299182e-09, "loss": 0.58270502, "num_input_tokens_seen": 354198010, "router_z_loss_clip": 0.01953125, "router_z_loss_mlp": 0.0625, "step": 16412, "time_per_iteration": 3.172781229019165 }, { "auxiliary_loss_clip": 0.0105272, "auxiliary_loss_mlp": 0.01039891, "balance_loss_clip": 1.01426077, "balance_loss_mlp": 1.01605225, "epoch": 0.9868029460393807, "flos": 26505081884160.0, "grad_norm": 2.0363060590549673, "language_loss": 0.74241221, "learning_rate": 1.818410313934926e-09, "loss": 0.76333827, "num_input_tokens_seen": 354220000, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.3671875, "step": 16413, "time_per_iteration": 3.835279703140259 }, { "auxiliary_loss_clip": 0.01050879, "auxiliary_loss_mlp": 0.0103331, "balance_loss_clip": 1.01054144, "balance_loss_mlp": 1.01469946, "epoch": 0.9868630692920487, "flos": 22966145763840.0, "grad_norm": 1.3808273630098753, "language_loss": 0.72461587, "learning_rate": 1.8018442296858782e-09, "loss": 0.74545777, "num_input_tokens_seen": 354240910, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.36132812, "step": 16414, "time_per_iteration": 2.4205312728881836 }, { "auxiliary_loss_clip": 0.01050758, "auxiliary_loss_mlp": 0.01033859, "balance_loss_clip": 1.01159072, "balance_loss_mlp": 1.0162971, "epoch": 0.9869231925447167, "flos": 19827663471360.0, "grad_norm": 1.620470092236507, "language_loss": 0.71556932, "learning_rate": 1.7853539173111608e-09, "loss": 0.73641551, "num_input_tokens_seen": 354259430, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34375, "step": 16415, "time_per_iteration": 2.38403058052063 }, { "auxiliary_loss_clip": 0.0104774, "auxiliary_loss_mlp": 0.01032836, "balance_loss_clip": 1.01291561, "balance_loss_mlp": 1.01476336, "epoch": 0.9869833157973846, "flos": 20194146679680.0, "grad_norm": 1.5205793366655047, "language_loss": 0.75940329, "learning_rate": 1.7689393774362737e-09, "loss": 0.78020906, "num_input_tokens_seen": 354279490, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.33007812, "step": 16416, "time_per_iteration": 2.367724657058716 }, { "auxiliary_loss_clip": 0.01051062, "auxiliary_loss_mlp": 0.01034052, "balance_loss_clip": 1.01026976, "balance_loss_mlp": 1.01593173, "epoch": 0.9870434390500527, "flos": 16098800221440.0, "grad_norm": 2.088130091066895, "language_loss": 0.71793807, "learning_rate": 1.7526006106833858e-09, "loss": 0.7387892, "num_input_tokens_seen": 354295080, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.3515625, "step": 16417, "time_per_iteration": 2.3567676544189453 }, { "auxiliary_loss_clip": 0.0105368, "auxiliary_loss_mlp": 0.01043843, "balance_loss_clip": 1.01690197, "balance_loss_mlp": 1.01687193, "epoch": 0.9871035623027206, "flos": 21759722133120.0, "grad_norm": 1.517006056349825, "language_loss": 0.71416926, "learning_rate": 1.7363376176720013e-09, "loss": 0.7351445, "num_input_tokens_seen": 354314610, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.3671875, "step": 16418, "time_per_iteration": 2.396503210067749 }, { "auxiliary_loss_clip": 0.01006558, "auxiliary_loss_mlp": 0.0100493, "balance_loss_clip": 1.00301087, "balance_loss_mlp": 1.00038457, "epoch": 0.9871636855553886, "flos": 70216024291200.0, "grad_norm": 0.66152945441132, "language_loss": 0.53756404, "learning_rate": 1.7201503990189603e-09, "loss": 0.55767888, "num_input_tokens_seen": 354383115, "router_z_loss_clip": 0.01916504, "router_z_loss_mlp": 0.06176758, "step": 16419, "time_per_iteration": 3.1348884105682373 }, { "auxiliary_loss_clip": 0.01053539, "auxiliary_loss_mlp": 0.01039949, "balance_loss_clip": 1.01529634, "balance_loss_mlp": 1.01606679, "epoch": 0.9872238088080565, "flos": 25044665045760.0, "grad_norm": 1.5621829674368377, "language_loss": 0.7906515, "learning_rate": 1.7040389553382162e-09, "loss": 0.81158638, "num_input_tokens_seen": 354403115, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.375, "step": 16420, "time_per_iteration": 2.425593376159668 }, { "auxiliary_loss_clip": 0.0105147, "auxiliary_loss_mlp": 0.01037627, "balance_loss_clip": 1.01631284, "balance_loss_mlp": 1.01733971, "epoch": 0.9872839320607245, "flos": 19464776133120.0, "grad_norm": 1.5180943146734398, "language_loss": 0.71580184, "learning_rate": 1.6880032872403916e-09, "loss": 0.73669279, "num_input_tokens_seen": 354424520, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.33984375, "step": 16421, "time_per_iteration": 2.4043691158294678 }, { "auxiliary_loss_clip": 0.01053324, "auxiliary_loss_mlp": 0.01042799, "balance_loss_clip": 1.01604855, "balance_loss_mlp": 1.01658678, "epoch": 0.9873440553133924, "flos": 26941705747200.0, "grad_norm": 2.0145977897968224, "language_loss": 0.83416724, "learning_rate": 1.6720433953338886e-09, "loss": 0.85512847, "num_input_tokens_seen": 354444800, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.3671875, "step": 16422, "time_per_iteration": 2.433784008026123 }, { "auxiliary_loss_clip": 0.01051235, "auxiliary_loss_mlp": 0.01033171, "balance_loss_clip": 1.01178527, "balance_loss_mlp": 1.01664507, "epoch": 0.9874041785660604, "flos": 19061215194240.0, "grad_norm": 1.6201090059473358, "language_loss": 0.87122768, "learning_rate": 1.656159280223779e-09, "loss": 0.89207178, "num_input_tokens_seen": 354464590, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.34570312, "step": 16423, "time_per_iteration": 2.382376194000244 }, { "auxiliary_loss_clip": 0.01053515, "auxiliary_loss_mlp": 0.01037484, "balance_loss_clip": 1.01252198, "balance_loss_mlp": 1.01698327, "epoch": 0.9874643018187284, "flos": 21104751427200.0, "grad_norm": 1.9333850585285173, "language_loss": 0.71762133, "learning_rate": 1.6403509425122475e-09, "loss": 0.73853129, "num_input_tokens_seen": 354484145, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36523438, "step": 16424, "time_per_iteration": 2.360119342803955 }, { "auxiliary_loss_clip": 0.01050939, "auxiliary_loss_mlp": 0.01034595, "balance_loss_clip": 1.01186132, "balance_loss_mlp": 1.01543021, "epoch": 0.9875244250713964, "flos": 24424886856960.0, "grad_norm": 1.766349284181673, "language_loss": 0.81861985, "learning_rate": 1.6246183827990366e-09, "loss": 0.83947521, "num_input_tokens_seen": 354502475, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35546875, "step": 16425, "time_per_iteration": 2.414161205291748 }, { "auxiliary_loss_clip": 0.01052187, "auxiliary_loss_mlp": 0.01037463, "balance_loss_clip": 1.0131681, "balance_loss_mlp": 1.01602817, "epoch": 0.9875845483240643, "flos": 25116481445760.0, "grad_norm": 2.0360563338449875, "language_loss": 0.81601882, "learning_rate": 1.6089616016803364e-09, "loss": 0.83691531, "num_input_tokens_seen": 354521855, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.36132812, "step": 16426, "time_per_iteration": 2.40673828125 }, { "auxiliary_loss_clip": 0.01052019, "auxiliary_loss_mlp": 0.01044242, "balance_loss_clip": 1.0195297, "balance_loss_mlp": 1.01637959, "epoch": 0.9876446715767323, "flos": 16580845630080.0, "grad_norm": 1.8578627110663601, "language_loss": 0.8602435, "learning_rate": 1.593380599750338e-09, "loss": 0.8812061, "num_input_tokens_seen": 354539535, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.35546875, "step": 16427, "time_per_iteration": 2.377323627471924 }, { "auxiliary_loss_clip": 0.01050914, "auxiliary_loss_mlp": 0.01035221, "balance_loss_clip": 1.01332259, "balance_loss_mlp": 1.01648927, "epoch": 0.9877047948294003, "flos": 21615076903680.0, "grad_norm": 3.55336755998467, "language_loss": 0.71039045, "learning_rate": 1.577875377599458e-09, "loss": 0.73125184, "num_input_tokens_seen": 354557430, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34375, "step": 16428, "time_per_iteration": 2.3823671340942383 }, { "auxiliary_loss_clip": 0.01051118, "auxiliary_loss_mlp": 0.01034511, "balance_loss_clip": 1.01326823, "balance_loss_mlp": 1.01647091, "epoch": 0.9877649180820682, "flos": 21177440611200.0, "grad_norm": 1.958637142355758, "language_loss": 0.81533808, "learning_rate": 1.5624459358158926e-09, "loss": 0.8361944, "num_input_tokens_seen": 354574735, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34570312, "step": 16429, "time_per_iteration": 2.3799517154693604 }, { "auxiliary_loss_clip": 0.01050691, "auxiliary_loss_mlp": 0.01037543, "balance_loss_clip": 1.01578701, "balance_loss_mlp": 1.01612222, "epoch": 0.9878250413347363, "flos": 39747673088640.0, "grad_norm": 1.5189904413766122, "language_loss": 0.63345683, "learning_rate": 1.5470922749845073e-09, "loss": 0.65433913, "num_input_tokens_seen": 354597050, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34570312, "step": 16430, "time_per_iteration": 2.526319742202759 }, { "auxiliary_loss_clip": 0.01052748, "auxiliary_loss_mlp": 0.01042089, "balance_loss_clip": 1.01928413, "balance_loss_mlp": 1.0172143, "epoch": 0.9878851645874042, "flos": 29424309638400.0, "grad_norm": 1.390196703517565, "language_loss": 0.73718691, "learning_rate": 1.531814395687725e-09, "loss": 0.75813532, "num_input_tokens_seen": 354619095, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35546875, "step": 16431, "time_per_iteration": 3.7655041217803955 }, { "auxiliary_loss_clip": 0.01050171, "auxiliary_loss_mlp": 0.01038516, "balance_loss_clip": 1.01491237, "balance_loss_mlp": 1.01535726, "epoch": 0.9879452878400722, "flos": 15805599690240.0, "grad_norm": 2.096132272706326, "language_loss": 0.82503998, "learning_rate": 1.5166122985048602e-09, "loss": 0.84592676, "num_input_tokens_seen": 354633790, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.34765625, "step": 16432, "time_per_iteration": 2.4247753620147705 }, { "auxiliary_loss_clip": 0.0104869, "auxiliary_loss_mlp": 0.01037337, "balance_loss_clip": 1.01639152, "balance_loss_mlp": 1.01463985, "epoch": 0.9880054110927401, "flos": 22232900056320.0, "grad_norm": 1.5500754997832111, "language_loss": 0.81132251, "learning_rate": 1.5014859840123405e-09, "loss": 0.83218277, "num_input_tokens_seen": 354653180, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.33984375, "step": 16433, "time_per_iteration": 2.38690447807312 }, { "auxiliary_loss_clip": 0.0105019, "auxiliary_loss_mlp": 0.01036861, "balance_loss_clip": 1.01297116, "balance_loss_mlp": 1.01581526, "epoch": 0.9880655343454081, "flos": 28762670862720.0, "grad_norm": 1.9474129145018453, "language_loss": 0.6603868, "learning_rate": 1.4864354527837075e-09, "loss": 0.68125731, "num_input_tokens_seen": 354669900, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.34375, "step": 16434, "time_per_iteration": 2.438962697982788 }, { "auxiliary_loss_clip": 0.01050995, "auxiliary_loss_mlp": 0.01035064, "balance_loss_clip": 1.01107895, "balance_loss_mlp": 1.01474714, "epoch": 0.988125657598076, "flos": 32852012567040.0, "grad_norm": 1.5829391262260213, "language_loss": 0.70495975, "learning_rate": 1.4714607053896154e-09, "loss": 0.7258203, "num_input_tokens_seen": 354693165, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.36328125, "step": 16435, "time_per_iteration": 2.473853588104248 }, { "auxiliary_loss_clip": 0.01052346, "auxiliary_loss_mlp": 0.01040043, "balance_loss_clip": 1.01576042, "balance_loss_mlp": 1.01689148, "epoch": 0.988185780850744, "flos": 19389678065280.0, "grad_norm": 1.7962635004058187, "language_loss": 0.76872921, "learning_rate": 1.4565617423980548e-09, "loss": 0.78965306, "num_input_tokens_seen": 354711915, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.35546875, "step": 16436, "time_per_iteration": 2.3992791175842285 }, { "auxiliary_loss_clip": 0.01052012, "auxiliary_loss_mlp": 0.01039144, "balance_loss_clip": 1.01617277, "balance_loss_mlp": 1.01640487, "epoch": 0.988245904103412, "flos": 22527322485120.0, "grad_norm": 2.170621893126387, "language_loss": 0.74950749, "learning_rate": 1.4417385643741286e-09, "loss": 0.770419, "num_input_tokens_seen": 354729135, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35546875, "step": 16437, "time_per_iteration": 2.366015672683716 }, { "auxiliary_loss_clip": 0.01049125, "auxiliary_loss_mlp": 0.0103541, "balance_loss_clip": 1.01345158, "balance_loss_mlp": 1.01564479, "epoch": 0.98830602735608, "flos": 28657896272640.0, "grad_norm": 1.9331709521358371, "language_loss": 0.61003041, "learning_rate": 1.4269911718796103e-09, "loss": 0.63087583, "num_input_tokens_seen": 354752530, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.33398438, "step": 16438, "time_per_iteration": 2.4445104598999023 }, { "auxiliary_loss_clip": 0.01050808, "auxiliary_loss_mlp": 0.01034596, "balance_loss_clip": 1.01244712, "balance_loss_mlp": 1.01554596, "epoch": 0.9883661506087479, "flos": 20994984512640.0, "grad_norm": 1.8630177330407571, "language_loss": 0.73682898, "learning_rate": 1.4123195654738295e-09, "loss": 0.75768304, "num_input_tokens_seen": 354771135, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.3515625, "step": 16439, "time_per_iteration": 3.796140432357788 }, { "auxiliary_loss_clip": 0.01049352, "auxiliary_loss_mlp": 0.01040712, "balance_loss_clip": 1.01836002, "balance_loss_mlp": 1.01506066, "epoch": 0.9884262738614159, "flos": 32704783896960.0, "grad_norm": 1.5976280554722637, "language_loss": 0.60882068, "learning_rate": 1.3977237457134528e-09, "loss": 0.62972128, "num_input_tokens_seen": 354791800, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.34375, "step": 16440, "time_per_iteration": 2.4942867755889893 }, { "auxiliary_loss_clip": 0.01051366, "auxiliary_loss_mlp": 0.01035424, "balance_loss_clip": 1.01198769, "balance_loss_mlp": 1.01541674, "epoch": 0.9884863971140839, "flos": 17563790448000.0, "grad_norm": 2.2635624710283815, "language_loss": 0.77716428, "learning_rate": 1.3832037131513707e-09, "loss": 0.79803216, "num_input_tokens_seen": 354809200, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.359375, "step": 16441, "time_per_iteration": 2.3266947269439697 }, { "auxiliary_loss_clip": 0.01052122, "auxiliary_loss_mlp": 0.01038251, "balance_loss_clip": 1.01482654, "balance_loss_mlp": 1.01664472, "epoch": 0.9885465203667518, "flos": 40550919805440.0, "grad_norm": 1.880305858835398, "language_loss": 0.69860953, "learning_rate": 1.3687594683386982e-09, "loss": 0.7195133, "num_input_tokens_seen": 354829945, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35546875, "step": 16442, "time_per_iteration": 2.573410749435425 }, { "auxiliary_loss_clip": 0.01051752, "auxiliary_loss_mlp": 0.01033089, "balance_loss_clip": 1.01067734, "balance_loss_mlp": 1.01638424, "epoch": 0.9886066436194199, "flos": 13807135889280.0, "grad_norm": 3.592945895111179, "language_loss": 0.7580958, "learning_rate": 1.3543910118227753e-09, "loss": 0.77894419, "num_input_tokens_seen": 354845055, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35546875, "step": 16443, "time_per_iteration": 3.7365148067474365 }, { "auxiliary_loss_clip": 0.01051772, "auxiliary_loss_mlp": 0.01034926, "balance_loss_clip": 1.01249087, "balance_loss_mlp": 1.01602936, "epoch": 0.9886667668720878, "flos": 23324180423040.0, "grad_norm": 1.9174728672332348, "language_loss": 0.7486062, "learning_rate": 1.3400983441487213e-09, "loss": 0.7694732, "num_input_tokens_seen": 354864680, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35742188, "step": 16444, "time_per_iteration": 2.431607723236084 }, { "auxiliary_loss_clip": 0.01051063, "auxiliary_loss_mlp": 0.01036179, "balance_loss_clip": 1.01473284, "balance_loss_mlp": 1.0160687, "epoch": 0.9887268901247558, "flos": 22705065550080.0, "grad_norm": 1.7151853740269392, "language_loss": 0.69723791, "learning_rate": 1.325881465858547e-09, "loss": 0.71811032, "num_input_tokens_seen": 354885685, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34960938, "step": 16445, "time_per_iteration": 2.398573160171509 }, { "auxiliary_loss_clip": 0.01053183, "auxiliary_loss_mlp": 0.01036835, "balance_loss_clip": 1.01331472, "balance_loss_mlp": 1.01720035, "epoch": 0.9887870133774237, "flos": 13040478144000.0, "grad_norm": 2.5497322415446084, "language_loss": 0.62588871, "learning_rate": 1.311740377491155e-09, "loss": 0.6467889, "num_input_tokens_seen": 354901505, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.359375, "step": 16446, "time_per_iteration": 2.379716396331787 }, { "auxiliary_loss_clip": 0.01052261, "auxiliary_loss_mlp": 0.01037778, "balance_loss_clip": 1.01577199, "balance_loss_mlp": 1.01642764, "epoch": 0.9888471366300917, "flos": 15157960369920.0, "grad_norm": 5.694985151971942, "language_loss": 0.72333562, "learning_rate": 1.297675079582783e-09, "loss": 0.74423599, "num_input_tokens_seen": 354920060, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.359375, "step": 16447, "time_per_iteration": 2.3430516719818115 }, { "auxiliary_loss_clip": 0.01050788, "auxiliary_loss_mlp": 0.0103515, "balance_loss_clip": 1.01270318, "balance_loss_mlp": 1.0162226, "epoch": 0.9889072598827596, "flos": 25117633520640.0, "grad_norm": 1.8761633392726447, "language_loss": 0.85369349, "learning_rate": 1.2836855726667818e-09, "loss": 0.87455285, "num_input_tokens_seen": 354938690, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34570312, "step": 16448, "time_per_iteration": 2.419823169708252 }, { "auxiliary_loss_clip": 0.01049279, "auxiliary_loss_mlp": 0.0103703, "balance_loss_clip": 1.01583493, "balance_loss_mlp": 1.01491666, "epoch": 0.9889673831354276, "flos": 16727690275200.0, "grad_norm": 1.8914100700921106, "language_loss": 0.71703279, "learning_rate": 1.26977185727406e-09, "loss": 0.73789591, "num_input_tokens_seen": 354956955, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.34375, "step": 16449, "time_per_iteration": 2.3459885120391846 }, { "auxiliary_loss_clip": 0.01053033, "auxiliary_loss_mlp": 0.01038297, "balance_loss_clip": 1.01598048, "balance_loss_mlp": 1.01595449, "epoch": 0.9890275063880956, "flos": 35583861720960.0, "grad_norm": 2.3974074133582235, "language_loss": 0.75395608, "learning_rate": 1.25593393393153e-09, "loss": 0.77486938, "num_input_tokens_seen": 354976800, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.37109375, "step": 16450, "time_per_iteration": 2.5247578620910645 }, { "auxiliary_loss_clip": 0.01052688, "auxiliary_loss_mlp": 0.01040394, "balance_loss_clip": 1.01630151, "balance_loss_mlp": 1.01529872, "epoch": 0.9890876296407636, "flos": 18951378456960.0, "grad_norm": 1.778131819077099, "language_loss": 0.80461675, "learning_rate": 1.242171803164549e-09, "loss": 0.82554758, "num_input_tokens_seen": 354996625, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.37304688, "step": 16451, "time_per_iteration": 3.8472650051116943 }, { "auxiliary_loss_clip": 0.01052137, "auxiliary_loss_mlp": 0.01040403, "balance_loss_clip": 1.01755047, "balance_loss_mlp": 1.01547623, "epoch": 0.9891477528934315, "flos": 23768170583040.0, "grad_norm": 1.8999544910984703, "language_loss": 0.71861279, "learning_rate": 1.2284854654946996e-09, "loss": 0.73953819, "num_input_tokens_seen": 355014535, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.3671875, "step": 16452, "time_per_iteration": 2.436432361602783 }, { "auxiliary_loss_clip": 0.01049406, "auxiliary_loss_mlp": 0.01034292, "balance_loss_clip": 1.01334691, "balance_loss_mlp": 1.01567316, "epoch": 0.9892078761460995, "flos": 20771994458880.0, "grad_norm": 1.6804233244916746, "language_loss": 0.74504727, "learning_rate": 1.2148749214409004e-09, "loss": 0.76588428, "num_input_tokens_seen": 355033280, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.33789062, "step": 16453, "time_per_iteration": 2.3605263233184814 }, { "auxiliary_loss_clip": 0.01050984, "auxiliary_loss_mlp": 0.01040475, "balance_loss_clip": 1.01752734, "balance_loss_mlp": 1.01503813, "epoch": 0.9892679993987675, "flos": 23366704325760.0, "grad_norm": 1.858412779935992, "language_loss": 0.72369039, "learning_rate": 1.2013401715191828e-09, "loss": 0.74460495, "num_input_tokens_seen": 355053320, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.359375, "step": 16454, "time_per_iteration": 2.4079082012176514 }, { "auxiliary_loss_clip": 0.01048714, "auxiliary_loss_mlp": 0.01032585, "balance_loss_clip": 1.01160383, "balance_loss_mlp": 1.01617825, "epoch": 0.9893281226514354, "flos": 22704472056960.0, "grad_norm": 1.8728113833447253, "language_loss": 0.76526284, "learning_rate": 1.1878812162433583e-09, "loss": 0.78607583, "num_input_tokens_seen": 355070230, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.32617188, "step": 16455, "time_per_iteration": 2.3817455768585205 }, { "auxiliary_loss_clip": 0.01049193, "auxiliary_loss_mlp": 0.01035671, "balance_loss_clip": 1.01313996, "balance_loss_mlp": 1.01489806, "epoch": 0.9893882459041035, "flos": 21795647788800.0, "grad_norm": 1.7634346509215462, "language_loss": 0.66370571, "learning_rate": 1.1744980561230188e-09, "loss": 0.68455434, "num_input_tokens_seen": 355090125, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34375, "step": 16456, "time_per_iteration": 2.4328806400299072 }, { "auxiliary_loss_clip": 0.01052067, "auxiliary_loss_mlp": 0.01035639, "balance_loss_clip": 1.01335847, "balance_loss_mlp": 1.01657188, "epoch": 0.9894483691567714, "flos": 18112799577600.0, "grad_norm": 2.0296371867812977, "language_loss": 0.74619919, "learning_rate": 1.161190691666203e-09, "loss": 0.76707625, "num_input_tokens_seen": 355107890, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35546875, "step": 16457, "time_per_iteration": 2.3447089195251465 }, { "auxiliary_loss_clip": 0.01051115, "auxiliary_loss_mlp": 0.01036391, "balance_loss_clip": 1.01318073, "balance_loss_mlp": 1.0155673, "epoch": 0.9895084924094394, "flos": 31210291704960.0, "grad_norm": 2.186533251603272, "language_loss": 0.70660162, "learning_rate": 1.1479591233773954e-09, "loss": 0.72747666, "num_input_tokens_seen": 355126340, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35546875, "step": 16458, "time_per_iteration": 2.4939398765563965 }, { "auxiliary_loss_clip": 0.01050806, "auxiliary_loss_mlp": 0.01036199, "balance_loss_clip": 1.01354909, "balance_loss_mlp": 1.01630235, "epoch": 0.9895686156621073, "flos": 19677153133440.0, "grad_norm": 4.801756309589411, "language_loss": 0.79756331, "learning_rate": 1.1348033517581956e-09, "loss": 0.8184334, "num_input_tokens_seen": 355144025, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34375, "step": 16459, "time_per_iteration": 2.356660842895508 }, { "auxiliary_loss_clip": 0.01052128, "auxiliary_loss_mlp": 0.01037366, "balance_loss_clip": 1.01375055, "balance_loss_mlp": 1.01662445, "epoch": 0.9896287389147753, "flos": 23580687248640.0, "grad_norm": 2.4303182584443306, "language_loss": 0.72389829, "learning_rate": 1.1217233773075373e-09, "loss": 0.74479324, "num_input_tokens_seen": 355163125, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35546875, "step": 16460, "time_per_iteration": 2.4447414875030518 }, { "auxiliary_loss_clip": 0.01052677, "auxiliary_loss_mlp": 0.01031882, "balance_loss_clip": 1.00868416, "balance_loss_mlp": 1.01575971, "epoch": 0.9896888621674432, "flos": 29604077562240.0, "grad_norm": 1.6303830324145399, "language_loss": 0.88177305, "learning_rate": 1.1087192005214685e-09, "loss": 0.90261871, "num_input_tokens_seen": 355184060, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36914062, "step": 16461, "time_per_iteration": 2.432691812515259 }, { "auxiliary_loss_clip": 0.01050974, "auxiliary_loss_mlp": 0.01044886, "balance_loss_clip": 1.0205797, "balance_loss_mlp": 1.0157423, "epoch": 0.9897489854201112, "flos": 23693945299200.0, "grad_norm": 1.7475699793329003, "language_loss": 0.63910568, "learning_rate": 1.09579082189315e-09, "loss": 0.66006422, "num_input_tokens_seen": 355204505, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.3515625, "step": 16462, "time_per_iteration": 2.4454967975616455 }, { "auxiliary_loss_clip": 0.01053105, "auxiliary_loss_mlp": 0.01042155, "balance_loss_clip": 1.01956463, "balance_loss_mlp": 1.01727307, "epoch": 0.9898091086727792, "flos": 13224295785600.0, "grad_norm": 2.277039326084464, "language_loss": 0.7406621, "learning_rate": 1.0829382419126343e-09, "loss": 0.76161468, "num_input_tokens_seen": 355223055, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35742188, "step": 16463, "time_per_iteration": 2.3818283081054688 }, { "auxiliary_loss_clip": 0.01052175, "auxiliary_loss_mlp": 0.01037814, "balance_loss_clip": 1.01500976, "balance_loss_mlp": 1.01684809, "epoch": 0.9898692319254472, "flos": 22929277501440.0, "grad_norm": 1.693241997588554, "language_loss": 0.71012038, "learning_rate": 1.0701614610675314e-09, "loss": 0.73102033, "num_input_tokens_seen": 355242000, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35351562, "step": 16464, "time_per_iteration": 2.427647590637207 }, { "auxiliary_loss_clip": 0.01052074, "auxiliary_loss_mlp": 0.01038323, "balance_loss_clip": 1.01418269, "balance_loss_mlp": 1.01575947, "epoch": 0.9899293551781151, "flos": 12457533306240.0, "grad_norm": 3.419526649472281, "language_loss": 0.74501026, "learning_rate": 1.0574604798421204e-09, "loss": 0.76591426, "num_input_tokens_seen": 355260175, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36328125, "step": 16465, "time_per_iteration": 2.3523447513580322 }, { "auxiliary_loss_clip": 0.0104992, "auxiliary_loss_mlp": 0.01037118, "balance_loss_clip": 1.01618505, "balance_loss_mlp": 1.01508725, "epoch": 0.9899894784307831, "flos": 26869889347200.0, "grad_norm": 1.609535067901223, "language_loss": 0.87546068, "learning_rate": 1.0448352987182386e-09, "loss": 0.89633107, "num_input_tokens_seen": 355281930, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.34765625, "step": 16466, "time_per_iteration": 2.4353411197662354 }, { "auxiliary_loss_clip": 0.01050965, "auxiliary_loss_mlp": 0.01031748, "balance_loss_clip": 1.00978935, "balance_loss_mlp": 1.01556659, "epoch": 0.990049601683451, "flos": 21541061088000.0, "grad_norm": 1.9364921641085109, "language_loss": 0.72647011, "learning_rate": 1.0322859181743915e-09, "loss": 0.74729717, "num_input_tokens_seen": 355301555, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.35546875, "step": 16467, "time_per_iteration": 2.3762786388397217 }, { "auxiliary_loss_clip": 0.01049049, "auxiliary_loss_mlp": 0.0103686, "balance_loss_clip": 1.01667798, "balance_loss_mlp": 1.01504052, "epoch": 0.990109724936119, "flos": 28770421184640.0, "grad_norm": 1.2947714049917085, "language_loss": 0.6578666, "learning_rate": 1.019812338686643e-09, "loss": 0.67872572, "num_input_tokens_seen": 355324925, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.33984375, "step": 16468, "time_per_iteration": 2.4969394207000732 }, { "auxiliary_loss_clip": 0.01054899, "auxiliary_loss_mlp": 0.0103789, "balance_loss_clip": 1.01436973, "balance_loss_mlp": 1.01735473, "epoch": 0.9901698481887871, "flos": 29273101073280.0, "grad_norm": 2.3372459801976695, "language_loss": 0.63044477, "learning_rate": 1.0074145607281704e-09, "loss": 0.65137267, "num_input_tokens_seen": 355343875, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.375, "step": 16469, "time_per_iteration": 2.4165854454040527 }, { "auxiliary_loss_clip": 0.01052665, "auxiliary_loss_mlp": 0.01035869, "balance_loss_clip": 1.01209807, "balance_loss_mlp": 1.01613653, "epoch": 0.990229971441455, "flos": 15958169798400.0, "grad_norm": 2.5588588558731615, "language_loss": 0.73851955, "learning_rate": 9.950925847685976e-10, "loss": 0.75940484, "num_input_tokens_seen": 355358835, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.36523438, "step": 16470, "time_per_iteration": 2.368504285812378 }, { "auxiliary_loss_clip": 0.01006848, "auxiliary_loss_mlp": 0.01003025, "balance_loss_clip": 1.00090301, "balance_loss_mlp": 1.00055552, "epoch": 0.990290094694123, "flos": 69776782076160.0, "grad_norm": 0.6867722422435837, "language_loss": 0.55574894, "learning_rate": 9.828464112755509e-10, "loss": 0.57584763, "num_input_tokens_seen": 355431225, "router_z_loss_clip": 0.02124023, "router_z_loss_mlp": 0.06298828, "step": 16471, "time_per_iteration": 4.559534311294556 }, { "auxiliary_loss_clip": 0.01051879, "auxiliary_loss_mlp": 0.01038811, "balance_loss_clip": 1.01474309, "balance_loss_mlp": 1.01609254, "epoch": 0.9903502179467909, "flos": 16251544886400.0, "grad_norm": 2.0758515267114626, "language_loss": 0.85377383, "learning_rate": 9.706760407131032e-10, "loss": 0.8746807, "num_input_tokens_seen": 355448250, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.35742188, "step": 16472, "time_per_iteration": 2.3839216232299805 }, { "auxiliary_loss_clip": 0.01050878, "auxiliary_loss_mlp": 0.01035771, "balance_loss_clip": 1.01262021, "balance_loss_mlp": 1.01583195, "epoch": 0.9904103411994589, "flos": 21687347151360.0, "grad_norm": 1.8922013676686273, "language_loss": 0.86685491, "learning_rate": 9.585814735431075e-10, "loss": 0.88772142, "num_input_tokens_seen": 355467040, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.34960938, "step": 16473, "time_per_iteration": 2.385730743408203 }, { "auxiliary_loss_clip": 0.01049752, "auxiliary_loss_mlp": 0.01035599, "balance_loss_clip": 1.01541722, "balance_loss_mlp": 1.01552463, "epoch": 0.9904704644521268, "flos": 25738249582080.0, "grad_norm": 1.7016588467145164, "language_loss": 0.85627317, "learning_rate": 9.465627102240859e-10, "loss": 0.87712675, "num_input_tokens_seen": 355487825, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.34179688, "step": 16474, "time_per_iteration": 2.4347822666168213 }, { "auxiliary_loss_clip": 0.01049308, "auxiliary_loss_mlp": 0.01037383, "balance_loss_clip": 1.01535273, "balance_loss_mlp": 1.01435614, "epoch": 0.9905305877047949, "flos": 21907265005440.0, "grad_norm": 5.923288563003934, "language_loss": 0.77048028, "learning_rate": 9.346197512116738e-10, "loss": 0.79134715, "num_input_tokens_seen": 355507445, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.34960938, "step": 16475, "time_per_iteration": 2.408653497695923 }, { "auxiliary_loss_clip": 0.01050489, "auxiliary_loss_mlp": 0.01038565, "balance_loss_clip": 1.01579583, "balance_loss_mlp": 1.01469254, "epoch": 0.9905907109574628, "flos": 21391493356800.0, "grad_norm": 1.4463607383785344, "language_loss": 0.76385832, "learning_rate": 9.227525969588423e-10, "loss": 0.78474885, "num_input_tokens_seen": 355527205, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35742188, "step": 16476, "time_per_iteration": 2.404799699783325 }, { "auxiliary_loss_clip": 0.01054627, "auxiliary_loss_mlp": 0.01036147, "balance_loss_clip": 1.01063621, "balance_loss_mlp": 1.01653099, "epoch": 0.9906508342101308, "flos": 20520584691840.0, "grad_norm": 2.122970700054227, "language_loss": 0.68313503, "learning_rate": 9.109612479154538e-10, "loss": 0.70404279, "num_input_tokens_seen": 355544740, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.38085938, "step": 16477, "time_per_iteration": 2.3602728843688965 }, { "auxiliary_loss_clip": 0.01055157, "auxiliary_loss_mlp": 0.01044113, "balance_loss_clip": 1.01843524, "balance_loss_mlp": 1.01768017, "epoch": 0.9907109574627987, "flos": 21360141089280.0, "grad_norm": 1.9755446152284668, "language_loss": 0.73181057, "learning_rate": 8.992457045289282e-10, "loss": 0.75280333, "num_input_tokens_seen": 355564385, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.375, "step": 16478, "time_per_iteration": 3.8066673278808594 }, { "auxiliary_loss_clip": 0.01051753, "auxiliary_loss_mlp": 0.01042124, "balance_loss_clip": 1.01773357, "balance_loss_mlp": 1.01542413, "epoch": 0.9907710807154667, "flos": 17337902751360.0, "grad_norm": 2.2191726786891226, "language_loss": 0.82901597, "learning_rate": 8.876059672433545e-10, "loss": 0.84995472, "num_input_tokens_seen": 355579260, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.36328125, "step": 16479, "time_per_iteration": 2.324389934539795 }, { "auxiliary_loss_clip": 0.01053318, "auxiliary_loss_mlp": 0.01039353, "balance_loss_clip": 1.0153563, "balance_loss_mlp": 1.01672435, "epoch": 0.9908312039681346, "flos": 28620609073920.0, "grad_norm": 1.6418495672889333, "language_loss": 0.67532659, "learning_rate": 8.760420364999355e-10, "loss": 0.6962533, "num_input_tokens_seen": 355599790, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.3671875, "step": 16480, "time_per_iteration": 2.4523398876190186 }, { "auxiliary_loss_clip": 0.01051491, "auxiliary_loss_mlp": 0.01033424, "balance_loss_clip": 1.01284862, "balance_loss_mlp": 1.01650321, "epoch": 0.9908913272208026, "flos": 35769250373760.0, "grad_norm": 1.636763602068245, "language_loss": 0.72981977, "learning_rate": 8.645539127374313e-10, "loss": 0.75066888, "num_input_tokens_seen": 355620925, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.34960938, "step": 16481, "time_per_iteration": 2.500537633895874 }, { "auxiliary_loss_clip": 0.01050218, "auxiliary_loss_mlp": 0.01030365, "balance_loss_clip": 1.00928926, "balance_loss_mlp": 1.01630259, "epoch": 0.9909514504734707, "flos": 19901155616640.0, "grad_norm": 2.0327553229614135, "language_loss": 0.79446554, "learning_rate": 8.531415963912713e-10, "loss": 0.81527138, "num_input_tokens_seen": 355639165, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.33984375, "step": 16482, "time_per_iteration": 3.768002986907959 }, { "auxiliary_loss_clip": 0.01053068, "auxiliary_loss_mlp": 0.01039778, "balance_loss_clip": 1.0161984, "balance_loss_mlp": 1.01690149, "epoch": 0.9910115737261386, "flos": 20003940259200.0, "grad_norm": 1.912557535839443, "language_loss": 0.76740807, "learning_rate": 8.418050878944427e-10, "loss": 0.78833652, "num_input_tokens_seen": 355657320, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.36132812, "step": 16483, "time_per_iteration": 2.365248918533325 }, { "auxiliary_loss_clip": 0.01006592, "auxiliary_loss_mlp": 0.0100484, "balance_loss_clip": 1.00275421, "balance_loss_mlp": 1.00050306, "epoch": 0.9910716969788066, "flos": 70685012851200.0, "grad_norm": 0.6822996060653648, "language_loss": 0.53698361, "learning_rate": 8.305443876768237e-10, "loss": 0.55709791, "num_input_tokens_seen": 355726370, "router_z_loss_clip": 0.02087402, "router_z_loss_mlp": 0.06103516, "step": 16484, "time_per_iteration": 3.1693859100341797 }, { "auxiliary_loss_clip": 0.01049217, "auxiliary_loss_mlp": 0.01035339, "balance_loss_clip": 1.01242721, "balance_loss_mlp": 1.0159657, "epoch": 0.9911318202314745, "flos": 21432969918720.0, "grad_norm": 1.6328124082951856, "language_loss": 0.82757664, "learning_rate": 8.19359496165184e-10, "loss": 0.84842217, "num_input_tokens_seen": 355745840, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.33203125, "step": 16485, "time_per_iteration": 2.359832763671875 }, { "auxiliary_loss_clip": 0.01051835, "auxiliary_loss_mlp": 0.0103917, "balance_loss_clip": 1.01691413, "balance_loss_mlp": 1.01685619, "epoch": 0.9911919434841425, "flos": 19825848080640.0, "grad_norm": 1.5937759566285437, "language_loss": 0.82357979, "learning_rate": 8.082504137836288e-10, "loss": 0.84448993, "num_input_tokens_seen": 355763385, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3515625, "step": 16486, "time_per_iteration": 2.4138617515563965 }, { "auxiliary_loss_clip": 0.01051209, "auxiliary_loss_mlp": 0.01035996, "balance_loss_clip": 1.01357293, "balance_loss_mlp": 1.01577759, "epoch": 0.9912520667368104, "flos": 41717752087680.0, "grad_norm": 1.3739196914737983, "language_loss": 0.6689117, "learning_rate": 7.972171409538209e-10, "loss": 0.68978369, "num_input_tokens_seen": 355786075, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.35546875, "step": 16487, "time_per_iteration": 2.5413870811462402 }, { "auxiliary_loss_clip": 0.0104876, "auxiliary_loss_mlp": 0.0103444, "balance_loss_clip": 1.0137217, "balance_loss_mlp": 1.01489711, "epoch": 0.9913121899894785, "flos": 23768519696640.0, "grad_norm": 1.7318356897993288, "language_loss": 0.77711767, "learning_rate": 7.862596780936481e-10, "loss": 0.79794967, "num_input_tokens_seen": 355806295, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.33789062, "step": 16488, "time_per_iteration": 2.425225257873535 }, { "auxiliary_loss_clip": 0.01053263, "auxiliary_loss_mlp": 0.01034552, "balance_loss_clip": 1.0106982, "balance_loss_mlp": 1.0156213, "epoch": 0.9913723132421464, "flos": 23768519696640.0, "grad_norm": 2.8472199324442946, "language_loss": 0.69841266, "learning_rate": 7.753780256190001e-10, "loss": 0.71929085, "num_input_tokens_seen": 355825730, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.375, "step": 16489, "time_per_iteration": 2.4532511234283447 }, { "auxiliary_loss_clip": 0.01007206, "auxiliary_loss_mlp": 0.01001766, "balance_loss_clip": 0.99973929, "balance_loss_mlp": 1.0008347, "epoch": 0.9914324364948144, "flos": 71264117439360.0, "grad_norm": 0.6060145273869829, "language_loss": 0.52697647, "learning_rate": 7.645721839424357e-10, "loss": 0.54706609, "num_input_tokens_seen": 355891545, "router_z_loss_clip": 0.02026367, "router_z_loss_mlp": 0.06347656, "step": 16490, "time_per_iteration": 3.128629684448242 }, { "auxiliary_loss_clip": 0.01054161, "auxiliary_loss_mlp": 0.01036942, "balance_loss_clip": 1.01164556, "balance_loss_mlp": 1.01723766, "epoch": 0.9914925597474823, "flos": 23694329324160.0, "grad_norm": 1.6064166689339243, "language_loss": 0.76273918, "learning_rate": 7.538421534734052e-10, "loss": 0.78365022, "num_input_tokens_seen": 355909920, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.36914062, "step": 16491, "time_per_iteration": 3.8346545696258545 }, { "auxiliary_loss_clip": 0.0105455, "auxiliary_loss_mlp": 0.0104023, "balance_loss_clip": 1.0140276, "balance_loss_mlp": 1.01691353, "epoch": 0.9915526830001503, "flos": 13433251472640.0, "grad_norm": 2.3513792111782625, "language_loss": 0.70770943, "learning_rate": 7.431879346191383e-10, "loss": 0.72865725, "num_input_tokens_seen": 355923130, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.375, "step": 16492, "time_per_iteration": 2.3726956844329834 }, { "auxiliary_loss_clip": 0.01050564, "auxiliary_loss_mlp": 0.0103734, "balance_loss_clip": 1.01415348, "balance_loss_mlp": 1.01540971, "epoch": 0.9916128062528182, "flos": 20739909052800.0, "grad_norm": 1.8662431848414098, "language_loss": 0.70366824, "learning_rate": 7.326095277837563e-10, "loss": 0.72454727, "num_input_tokens_seen": 355941960, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.3515625, "step": 16493, "time_per_iteration": 2.36232852935791 }, { "auxiliary_loss_clip": 0.0105211, "auxiliary_loss_mlp": 0.01039572, "balance_loss_clip": 1.01652884, "balance_loss_mlp": 1.01565886, "epoch": 0.9916729295054862, "flos": 22486125214080.0, "grad_norm": 1.7235124053253759, "language_loss": 0.72396523, "learning_rate": 7.221069333678276e-10, "loss": 0.74488205, "num_input_tokens_seen": 355961640, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.36328125, "step": 16494, "time_per_iteration": 2.421680450439453 }, { "auxiliary_loss_clip": 0.01052284, "auxiliary_loss_mlp": 0.0103842, "balance_loss_clip": 1.0148766, "balance_loss_mlp": 1.01681185, "epoch": 0.9917330527581543, "flos": 14791616807040.0, "grad_norm": 2.093319702565956, "language_loss": 0.69256794, "learning_rate": 7.116801517701443e-10, "loss": 0.71347499, "num_input_tokens_seen": 355977980, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35546875, "step": 16495, "time_per_iteration": 2.3437657356262207 }, { "auxiliary_loss_clip": 0.01006871, "auxiliary_loss_mlp": 0.01002467, "balance_loss_clip": 1.00035739, "balance_loss_mlp": 1.00066948, "epoch": 0.9917931760108222, "flos": 59188602896640.0, "grad_norm": 0.7196328161486152, "language_loss": 0.53524882, "learning_rate": 7.013291833859458e-10, "loss": 0.55534214, "num_input_tokens_seen": 356042900, "router_z_loss_clip": 0.02111816, "router_z_loss_mlp": 0.06201172, "step": 16496, "time_per_iteration": 3.1472251415252686 }, { "auxiliary_loss_clip": 0.01051893, "auxiliary_loss_mlp": 0.01041416, "balance_loss_clip": 1.01640606, "balance_loss_mlp": 1.01569521, "epoch": 0.9918532992634902, "flos": 26760401723520.0, "grad_norm": 1.8162424490965101, "language_loss": 0.72732371, "learning_rate": 6.91054028607585e-10, "loss": 0.74825674, "num_input_tokens_seen": 356063000, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.36132812, "step": 16497, "time_per_iteration": 2.4181582927703857 }, { "auxiliary_loss_clip": 0.01053786, "auxiliary_loss_mlp": 0.01041035, "balance_loss_clip": 1.01453519, "balance_loss_mlp": 1.01542497, "epoch": 0.9919134225161581, "flos": 14974840955520.0, "grad_norm": 2.0389644375831035, "language_loss": 0.83352733, "learning_rate": 6.808546878249721e-10, "loss": 0.8544755, "num_input_tokens_seen": 356078130, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.3828125, "step": 16498, "time_per_iteration": 2.3694000244140625 }, { "auxiliary_loss_clip": 0.01053557, "auxiliary_loss_mlp": 0.01035949, "balance_loss_clip": 1.01428854, "balance_loss_mlp": 1.01728725, "epoch": 0.9919735457688261, "flos": 27816978332160.0, "grad_norm": 1.5693453934100117, "language_loss": 0.68640089, "learning_rate": 6.707311614246869e-10, "loss": 0.70729589, "num_input_tokens_seen": 356101655, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.36328125, "step": 16499, "time_per_iteration": 2.4395530223846436 }, { "auxiliary_loss_clip": 0.01053233, "auxiliary_loss_mlp": 0.01036919, "balance_loss_clip": 1.01423359, "balance_loss_mlp": 1.01755714, "epoch": 0.992033669021494, "flos": 22561746952320.0, "grad_norm": 1.8815836248100426, "language_loss": 0.83313125, "learning_rate": 6.606834497904223e-10, "loss": 0.85403281, "num_input_tokens_seen": 356121425, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35546875, "step": 16500, "time_per_iteration": 2.4384145736694336 }, { "auxiliary_loss_clip": 0.0105403, "auxiliary_loss_mlp": 0.01038995, "balance_loss_clip": 1.01523626, "balance_loss_mlp": 1.0169574, "epoch": 0.9920937922741621, "flos": 25373407207680.0, "grad_norm": 1.6803088256279328, "language_loss": 0.82835019, "learning_rate": 6.507115533036511e-10, "loss": 0.84928048, "num_input_tokens_seen": 356140710, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.37109375, "step": 16501, "time_per_iteration": 2.3931164741516113 }, { "auxiliary_loss_clip": 0.01051608, "auxiliary_loss_mlp": 0.01034974, "balance_loss_clip": 1.01205015, "balance_loss_mlp": 1.01614738, "epoch": 0.99215391552683, "flos": 22053376512000.0, "grad_norm": 1.8510840806730784, "language_loss": 0.78120786, "learning_rate": 6.408154723420711e-10, "loss": 0.80207372, "num_input_tokens_seen": 356159835, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35351562, "step": 16502, "time_per_iteration": 2.4098494052886963 }, { "auxiliary_loss_clip": 0.01053774, "auxiliary_loss_mlp": 0.01036169, "balance_loss_clip": 1.01149321, "balance_loss_mlp": 1.01640487, "epoch": 0.992214038779498, "flos": 15413035829760.0, "grad_norm": 2.2631846167393426, "language_loss": 0.73021197, "learning_rate": 6.309952072811597e-10, "loss": 0.75111133, "num_input_tokens_seen": 356177555, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.375, "step": 16503, "time_per_iteration": 2.397703170776367 }, { "auxiliary_loss_clip": 0.01007131, "auxiliary_loss_mlp": 0.01002899, "balance_loss_clip": 1.00087214, "balance_loss_mlp": 1.0008707, "epoch": 0.9922741620321659, "flos": 62011050762240.0, "grad_norm": 0.6403820923587432, "language_loss": 0.55178642, "learning_rate": 6.212507584932858e-10, "loss": 0.57188672, "num_input_tokens_seen": 356244975, "router_z_loss_clip": 0.02026367, "router_z_loss_mlp": 0.0625, "step": 16504, "time_per_iteration": 3.1349101066589355 }, { "auxiliary_loss_clip": 0.01049642, "auxiliary_loss_mlp": 0.01030006, "balance_loss_clip": 1.00945425, "balance_loss_mlp": 1.01544094, "epoch": 0.9923342852848339, "flos": 17164523606400.0, "grad_norm": 1.7338415560850131, "language_loss": 0.70777673, "learning_rate": 6.115821263481536e-10, "loss": 0.7285732, "num_input_tokens_seen": 356262605, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.34179688, "step": 16505, "time_per_iteration": 2.35383677482605 }, { "auxiliary_loss_clip": 0.01052662, "auxiliary_loss_mlp": 0.01037876, "balance_loss_clip": 1.01175725, "balance_loss_mlp": 1.01557446, "epoch": 0.9923944085375018, "flos": 23182153545600.0, "grad_norm": 1.9792621296291446, "language_loss": 0.66055727, "learning_rate": 6.019893112119146e-10, "loss": 0.68146271, "num_input_tokens_seen": 356278935, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.37109375, "step": 16506, "time_per_iteration": 2.419633150100708 }, { "auxiliary_loss_clip": 0.01051386, "auxiliary_loss_mlp": 0.0103316, "balance_loss_clip": 1.00857854, "balance_loss_mlp": 1.01627064, "epoch": 0.9924545317901698, "flos": 20812807704960.0, "grad_norm": 2.0808993957122333, "language_loss": 0.6433537, "learning_rate": 5.924723134487219e-10, "loss": 0.66419911, "num_input_tokens_seen": 356295675, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.3515625, "step": 16507, "time_per_iteration": 2.3738150596618652 }, { "auxiliary_loss_clip": 0.01052517, "auxiliary_loss_mlp": 0.0103869, "balance_loss_clip": 1.01339436, "balance_loss_mlp": 1.01653695, "epoch": 0.9925146550428379, "flos": 20082424728960.0, "grad_norm": 1.8261062524598641, "language_loss": 0.73778886, "learning_rate": 5.830311334193983e-10, "loss": 0.75870091, "num_input_tokens_seen": 356312885, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.359375, "step": 16508, "time_per_iteration": 2.4077346324920654 }, { "auxiliary_loss_clip": 0.01052868, "auxiliary_loss_mlp": 0.01037877, "balance_loss_clip": 1.01320052, "balance_loss_mlp": 1.01633239, "epoch": 0.9925747782955058, "flos": 24972534443520.0, "grad_norm": 1.6369102504319144, "language_loss": 0.71225953, "learning_rate": 5.736657714818793e-10, "loss": 0.73316693, "num_input_tokens_seen": 356334070, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.36523438, "step": 16509, "time_per_iteration": 2.3955023288726807 }, { "auxiliary_loss_clip": 0.01052625, "auxiliary_loss_mlp": 0.01035961, "balance_loss_clip": 1.01321578, "balance_loss_mlp": 1.0162859, "epoch": 0.9926349015481738, "flos": 60470405752320.0, "grad_norm": 1.873815565074535, "language_loss": 0.6992147, "learning_rate": 5.643762279912146e-10, "loss": 0.72010058, "num_input_tokens_seen": 356359410, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.36328125, "step": 16510, "time_per_iteration": 2.762310028076172 }, { "auxiliary_loss_clip": 0.01052829, "auxiliary_loss_mlp": 0.0103761, "balance_loss_clip": 1.01424503, "balance_loss_mlp": 1.01684022, "epoch": 0.9926950248008417, "flos": 20740642191360.0, "grad_norm": 2.7202154483712326, "language_loss": 0.83179635, "learning_rate": 5.551625032997886e-10, "loss": 0.85270071, "num_input_tokens_seen": 356378345, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.359375, "step": 16511, "time_per_iteration": 3.7451868057250977 }, { "auxiliary_loss_clip": 0.01049959, "auxiliary_loss_mlp": 0.01032594, "balance_loss_clip": 1.01123214, "balance_loss_mlp": 1.01458156, "epoch": 0.9927551480535097, "flos": 24351813648000.0, "grad_norm": 1.915278254996496, "language_loss": 0.9270786, "learning_rate": 5.460245977570998e-10, "loss": 0.94790423, "num_input_tokens_seen": 356397345, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.35351562, "step": 16512, "time_per_iteration": 2.413013458251953 }, { "auxiliary_loss_clip": 0.01006882, "auxiliary_loss_mlp": 0.01005918, "balance_loss_clip": 1.00385523, "balance_loss_mlp": 1.00070262, "epoch": 0.9928152713061776, "flos": 71272042318080.0, "grad_norm": 0.6986452908464177, "language_loss": 0.55286467, "learning_rate": 5.369625117095378e-10, "loss": 0.57299268, "num_input_tokens_seen": 356459160, "router_z_loss_clip": 0.02062988, "router_z_loss_mlp": 0.06176758, "step": 16513, "time_per_iteration": 3.1287012100219727 }, { "auxiliary_loss_clip": 0.01049611, "auxiliary_loss_mlp": 0.01033991, "balance_loss_clip": 1.01091242, "balance_loss_mlp": 1.01484776, "epoch": 0.9928753945588457, "flos": 57807440444160.0, "grad_norm": 1.4500248961640303, "language_loss": 0.66083801, "learning_rate": 5.279762455006054e-10, "loss": 0.681674, "num_input_tokens_seen": 356486405, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.34765625, "step": 16514, "time_per_iteration": 2.6887924671173096 }, { "auxiliary_loss_clip": 0.01053682, "auxiliary_loss_mlp": 0.01038221, "balance_loss_clip": 1.01302004, "balance_loss_mlp": 1.01661086, "epoch": 0.9929355178115136, "flos": 19568084446080.0, "grad_norm": 1.7715384984141704, "language_loss": 0.73813522, "learning_rate": 5.190657994713632e-10, "loss": 0.75905418, "num_input_tokens_seen": 356502905, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37109375, "step": 16515, "time_per_iteration": 2.362807035446167 }, { "auxiliary_loss_clip": 0.01052162, "auxiliary_loss_mlp": 0.01035642, "balance_loss_clip": 1.01374364, "balance_loss_mlp": 1.01695657, "epoch": 0.9929956410641816, "flos": 22963597234560.0, "grad_norm": 1.533060671236971, "language_loss": 0.78081566, "learning_rate": 5.102311739593191e-10, "loss": 0.80169368, "num_input_tokens_seen": 356523830, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.3515625, "step": 16516, "time_per_iteration": 2.388577461242676 }, { "auxiliary_loss_clip": 0.01050562, "auxiliary_loss_mlp": 0.01031101, "balance_loss_clip": 1.0102638, "balance_loss_mlp": 1.01547718, "epoch": 0.9930557643168495, "flos": 22565272999680.0, "grad_norm": 1.7367284201761441, "language_loss": 0.7871142, "learning_rate": 5.014723692997602e-10, "loss": 0.80793083, "num_input_tokens_seen": 356543965, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.3515625, "step": 16517, "time_per_iteration": 3.8320353031158447 }, { "auxiliary_loss_clip": 0.01054349, "auxiliary_loss_mlp": 0.01039594, "balance_loss_clip": 1.01358271, "balance_loss_mlp": 1.01637697, "epoch": 0.9931158875695175, "flos": 17200100148480.0, "grad_norm": 2.227524210168445, "language_loss": 0.69218057, "learning_rate": 4.927893858248655e-10, "loss": 0.71311998, "num_input_tokens_seen": 356561530, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.37890625, "step": 16518, "time_per_iteration": 2.350573778152466 }, { "auxiliary_loss_clip": 0.01007158, "auxiliary_loss_mlp": 0.010063, "balance_loss_clip": 1.00420213, "balance_loss_mlp": 1.00088692, "epoch": 0.9931760108221854, "flos": 63708214596480.0, "grad_norm": 0.7381855297801038, "language_loss": 0.5350194, "learning_rate": 4.84182223863483e-10, "loss": 0.55515397, "num_input_tokens_seen": 356616845, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.06298828, "step": 16519, "time_per_iteration": 2.8889756202697754 }, { "auxiliary_loss_clip": 0.01050969, "auxiliary_loss_mlp": 0.01038429, "balance_loss_clip": 1.01591015, "balance_loss_mlp": 1.01661777, "epoch": 0.9932361340748534, "flos": 15303897319680.0, "grad_norm": 1.9714136539775473, "language_loss": 0.61024296, "learning_rate": 4.756508837426842e-10, "loss": 0.63113695, "num_input_tokens_seen": 356633560, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34375, "step": 16520, "time_per_iteration": 2.3462846279144287 }, { "auxiliary_loss_clip": 0.01051025, "auxiliary_loss_mlp": 0.01036967, "balance_loss_clip": 1.01398325, "balance_loss_mlp": 1.01605904, "epoch": 0.9932962573275215, "flos": 36063428423040.0, "grad_norm": 1.7793011150696274, "language_loss": 0.63288486, "learning_rate": 4.671953657853223e-10, "loss": 0.65376478, "num_input_tokens_seen": 356657600, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.34960938, "step": 16521, "time_per_iteration": 2.519740581512451 }, { "auxiliary_loss_clip": 0.01053087, "auxiliary_loss_mlp": 0.01037745, "balance_loss_clip": 1.01285362, "balance_loss_mlp": 1.0160867, "epoch": 0.9933563805801894, "flos": 21469454156160.0, "grad_norm": 1.6415532292762327, "language_loss": 0.75530446, "learning_rate": 4.5881567031225145e-10, "loss": 0.77621281, "num_input_tokens_seen": 356675880, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.37109375, "step": 16522, "time_per_iteration": 3.7900197505950928 }, { "auxiliary_loss_clip": 0.01052077, "auxiliary_loss_mlp": 0.01034674, "balance_loss_clip": 1.01244152, "balance_loss_mlp": 1.01720369, "epoch": 0.9934165038328574, "flos": 23985435173760.0, "grad_norm": 1.6468408584631735, "language_loss": 0.74101126, "learning_rate": 4.5051179764143964e-10, "loss": 0.76187873, "num_input_tokens_seen": 356696000, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.34765625, "step": 16523, "time_per_iteration": 2.383441925048828 }, { "auxiliary_loss_clip": 0.01051045, "auxiliary_loss_mlp": 0.01034801, "balance_loss_clip": 1.01181746, "balance_loss_mlp": 1.01586974, "epoch": 0.9934766270855253, "flos": 21906741335040.0, "grad_norm": 1.6471219832714696, "language_loss": 0.71916431, "learning_rate": 4.422837480875241e-10, "loss": 0.74002278, "num_input_tokens_seen": 356716845, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.3515625, "step": 16524, "time_per_iteration": 2.3766725063323975 }, { "auxiliary_loss_clip": 0.01050693, "auxiliary_loss_mlp": 0.01036013, "balance_loss_clip": 1.01368475, "balance_loss_mlp": 1.01527691, "epoch": 0.9935367503381933, "flos": 17128179014400.0, "grad_norm": 2.5444257779415147, "language_loss": 0.80100977, "learning_rate": 4.341315219624775e-10, "loss": 0.82187682, "num_input_tokens_seen": 356732100, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35351562, "step": 16525, "time_per_iteration": 2.3410911560058594 }, { "auxiliary_loss_clip": 0.01050227, "auxiliary_loss_mlp": 0.01035021, "balance_loss_clip": 1.01258588, "balance_loss_mlp": 1.01564264, "epoch": 0.9935968735908612, "flos": 22345145677440.0, "grad_norm": 1.7386842694761222, "language_loss": 0.76350033, "learning_rate": 4.2605511957582995e-10, "loss": 0.78435278, "num_input_tokens_seen": 356751480, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34570312, "step": 16526, "time_per_iteration": 2.3898701667785645 }, { "auxiliary_loss_clip": 0.01049294, "auxiliary_loss_mlp": 0.01034676, "balance_loss_clip": 1.01374292, "balance_loss_mlp": 1.01554036, "epoch": 0.9936569968435293, "flos": 29459257776000.0, "grad_norm": 1.8364653836878828, "language_loss": 0.7334305, "learning_rate": 4.180545412333369e-10, "loss": 0.75427026, "num_input_tokens_seen": 356772650, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.33789062, "step": 16527, "time_per_iteration": 2.428284168243408 }, { "auxiliary_loss_clip": 0.01052076, "auxiliary_loss_mlp": 0.01039544, "balance_loss_clip": 1.01626229, "balance_loss_mlp": 1.01589131, "epoch": 0.9937171200961972, "flos": 16543139495040.0, "grad_norm": 2.698635870111302, "language_loss": 0.76916313, "learning_rate": 4.1012978723875547e-10, "loss": 0.79007936, "num_input_tokens_seen": 356788510, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36132812, "step": 16528, "time_per_iteration": 2.351348638534546 }, { "auxiliary_loss_clip": 0.01051671, "auxiliary_loss_mlp": 0.0104076, "balance_loss_clip": 1.01670337, "balance_loss_mlp": 1.01549339, "epoch": 0.9937772433488652, "flos": 24389100846720.0, "grad_norm": 2.377174281750105, "language_loss": 0.69990468, "learning_rate": 4.022808578922898e-10, "loss": 0.72082907, "num_input_tokens_seen": 356809115, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.36132812, "step": 16529, "time_per_iteration": 2.4071598052978516 }, { "auxiliary_loss_clip": 0.01054318, "auxiliary_loss_mlp": 0.01040621, "balance_loss_clip": 1.01295233, "balance_loss_mlp": 1.01623476, "epoch": 0.9938373666015331, "flos": 15668984073600.0, "grad_norm": 2.1380623992440584, "language_loss": 0.66526085, "learning_rate": 3.9450775349170186e-10, "loss": 0.68621022, "num_input_tokens_seen": 356826410, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.38085938, "step": 16530, "time_per_iteration": 2.375223398208618 }, { "auxiliary_loss_clip": 0.01050984, "auxiliary_loss_mlp": 0.01033195, "balance_loss_clip": 1.01177335, "balance_loss_mlp": 1.01514733, "epoch": 0.9938974898542011, "flos": 19495290528000.0, "grad_norm": 2.680685666715921, "language_loss": 0.72723246, "learning_rate": 3.8681047433186676e-10, "loss": 0.74807423, "num_input_tokens_seen": 356844990, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.359375, "step": 16531, "time_per_iteration": 3.8510658740997314 }, { "auxiliary_loss_clip": 0.01051911, "auxiliary_loss_mlp": 0.0103633, "balance_loss_clip": 1.01345384, "balance_loss_mlp": 1.0155586, "epoch": 0.993957613106869, "flos": 26905675357440.0, "grad_norm": 1.366043657897605, "language_loss": 0.75083601, "learning_rate": 3.791890207045512e-10, "loss": 0.77171838, "num_input_tokens_seen": 356866530, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36328125, "step": 16532, "time_per_iteration": 2.4291305541992188 }, { "auxiliary_loss_clip": 0.01045898, "auxiliary_loss_mlp": 0.01037447, "balance_loss_clip": 1.01709771, "balance_loss_mlp": 1.01398778, "epoch": 0.994017736359537, "flos": 14938705831680.0, "grad_norm": 1.5833368361317355, "language_loss": 0.71623987, "learning_rate": 3.7164339289885717e-10, "loss": 0.7370733, "num_input_tokens_seen": 356884660, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.3203125, "step": 16533, "time_per_iteration": 2.387833595275879 }, { "auxiliary_loss_clip": 0.01053518, "auxiliary_loss_mlp": 0.01037171, "balance_loss_clip": 1.01168466, "balance_loss_mlp": 1.01674867, "epoch": 0.9940778596122051, "flos": 15376970528640.0, "grad_norm": 1.9607247985543856, "language_loss": 0.85703009, "learning_rate": 3.641735912007782e-10, "loss": 0.87793696, "num_input_tokens_seen": 356900895, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.3671875, "step": 16534, "time_per_iteration": 2.3824098110198975 }, { "auxiliary_loss_clip": 0.01048818, "auxiliary_loss_mlp": 0.01035094, "balance_loss_clip": 1.01441085, "balance_loss_mlp": 1.01525831, "epoch": 0.994137982864873, "flos": 25226946587520.0, "grad_norm": 1.5772627743867174, "language_loss": 0.67076039, "learning_rate": 3.567796158934211e-10, "loss": 0.69159949, "num_input_tokens_seen": 356920985, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.3359375, "step": 16535, "time_per_iteration": 2.393564224243164 }, { "auxiliary_loss_clip": 0.01050285, "auxiliary_loss_mlp": 0.01032878, "balance_loss_clip": 1.01151562, "balance_loss_mlp": 1.0164336, "epoch": 0.994198106117541, "flos": 18441157714560.0, "grad_norm": 1.5888027388980284, "language_loss": 0.65913123, "learning_rate": 3.4946146725767235e-10, "loss": 0.67996287, "num_input_tokens_seen": 356939800, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.33984375, "step": 16536, "time_per_iteration": 2.3527514934539795 }, { "auxiliary_loss_clip": 0.01049952, "auxiliary_loss_mlp": 0.01039914, "balance_loss_clip": 1.01722884, "balance_loss_mlp": 1.01564372, "epoch": 0.9942582293702089, "flos": 16653185700480.0, "grad_norm": 1.709519813335413, "language_loss": 0.80246598, "learning_rate": 3.4221914557064357e-10, "loss": 0.82336462, "num_input_tokens_seen": 356957780, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.34375, "step": 16537, "time_per_iteration": 2.335855484008789 }, { "auxiliary_loss_clip": 0.01055468, "auxiliary_loss_mlp": 0.0104011, "balance_loss_clip": 1.0149684, "balance_loss_mlp": 1.01710868, "epoch": 0.9943183526228769, "flos": 21943504863360.0, "grad_norm": 1.5339870724846063, "language_loss": 0.69705683, "learning_rate": 3.35052651107004e-10, "loss": 0.71801257, "num_input_tokens_seen": 356979185, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.3828125, "step": 16538, "time_per_iteration": 2.416221857070923 }, { "auxiliary_loss_clip": 0.01049986, "auxiliary_loss_mlp": 0.01035954, "balance_loss_clip": 1.01380515, "balance_loss_mlp": 1.01508188, "epoch": 0.9943784758755448, "flos": 23841418348800.0, "grad_norm": 1.9202060782181256, "language_loss": 0.76796436, "learning_rate": 3.2796198413853614e-10, "loss": 0.78882378, "num_input_tokens_seen": 356997735, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34960938, "step": 16539, "time_per_iteration": 2.3893511295318604 }, { "auxiliary_loss_clip": 0.0105223, "auxiliary_loss_mlp": 0.01039455, "balance_loss_clip": 1.01527894, "balance_loss_mlp": 1.01609337, "epoch": 0.9944385991282129, "flos": 21468930485760.0, "grad_norm": 2.1632463974003375, "language_loss": 0.72516906, "learning_rate": 3.209471449341361e-10, "loss": 0.74608588, "num_input_tokens_seen": 357015660, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.36132812, "step": 16540, "time_per_iteration": 2.3841147422790527 }, { "auxiliary_loss_clip": 0.01049959, "auxiliary_loss_mlp": 0.01032396, "balance_loss_clip": 1.01135528, "balance_loss_mlp": 1.01548886, "epoch": 0.9944987223808808, "flos": 22925995833600.0, "grad_norm": 4.121484587426536, "language_loss": 0.7662214, "learning_rate": 3.140081337600353e-10, "loss": 0.78704488, "num_input_tokens_seen": 357034800, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.34375, "step": 16541, "time_per_iteration": 2.3598291873931885 }, { "auxiliary_loss_clip": 0.01050055, "auxiliary_loss_mlp": 0.01037642, "balance_loss_clip": 1.01488471, "balance_loss_mlp": 1.01465034, "epoch": 0.9945588456335488, "flos": 22381909205760.0, "grad_norm": 1.643422203558223, "language_loss": 0.77610779, "learning_rate": 3.0714495087891255e-10, "loss": 0.79698473, "num_input_tokens_seen": 357053785, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.35351562, "step": 16542, "time_per_iteration": 2.3773975372314453 }, { "auxiliary_loss_clip": 0.01053373, "auxiliary_loss_mlp": 0.01035472, "balance_loss_clip": 1.0111531, "balance_loss_mlp": 1.01634741, "epoch": 0.9946189688862167, "flos": 21396450769920.0, "grad_norm": 2.0748769930489823, "language_loss": 0.76459378, "learning_rate": 3.0035759655122615e-10, "loss": 0.78548217, "num_input_tokens_seen": 357072025, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.37109375, "step": 16543, "time_per_iteration": 2.355687379837036 }, { "auxiliary_loss_clip": 0.01054464, "auxiliary_loss_mlp": 0.01038691, "balance_loss_clip": 1.01364517, "balance_loss_mlp": 1.01669121, "epoch": 0.9946790921388847, "flos": 12415882187520.0, "grad_norm": 2.857682452209603, "language_loss": 0.83903992, "learning_rate": 2.9364607103454785e-10, "loss": 0.85997146, "num_input_tokens_seen": 357086960, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.37695312, "step": 16544, "time_per_iteration": 2.3367371559143066 }, { "auxiliary_loss_clip": 0.01051383, "auxiliary_loss_mlp": 0.0103509, "balance_loss_clip": 1.01233256, "balance_loss_mlp": 1.01631284, "epoch": 0.9947392153915526, "flos": 19057409856000.0, "grad_norm": 1.9325998597359795, "language_loss": 0.80035365, "learning_rate": 2.870103745831187e-10, "loss": 0.82121837, "num_input_tokens_seen": 357105095, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.3515625, "step": 16545, "time_per_iteration": 2.3446030616760254 }, { "auxiliary_loss_clip": 0.01054609, "auxiliary_loss_mlp": 0.01041371, "balance_loss_clip": 1.01742232, "balance_loss_mlp": 1.01758015, "epoch": 0.9947993386442207, "flos": 27307560551040.0, "grad_norm": 2.2531104762781338, "language_loss": 0.73899984, "learning_rate": 2.8045050744873733e-10, "loss": 0.7599597, "num_input_tokens_seen": 357125065, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37109375, "step": 16546, "time_per_iteration": 2.4292893409729004 }, { "auxiliary_loss_clip": 0.01050112, "auxiliary_loss_mlp": 0.01037766, "balance_loss_clip": 1.01664197, "balance_loss_mlp": 1.01595426, "epoch": 0.9948594618968887, "flos": 20805650876160.0, "grad_norm": 2.8873900927491514, "language_loss": 0.78471982, "learning_rate": 2.739664698798716e-10, "loss": 0.80559862, "num_input_tokens_seen": 357141600, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34179688, "step": 16547, "time_per_iteration": 2.3404369354248047 }, { "auxiliary_loss_clip": 0.0105054, "auxiliary_loss_mlp": 0.0103774, "balance_loss_clip": 1.01548362, "balance_loss_mlp": 1.01546764, "epoch": 0.9949195851495566, "flos": 23291885548800.0, "grad_norm": 2.360478078893796, "language_loss": 0.71825904, "learning_rate": 2.67558262122769e-10, "loss": 0.73914182, "num_input_tokens_seen": 357157880, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.3515625, "step": 16548, "time_per_iteration": 2.369936466217041 }, { "auxiliary_loss_clip": 0.01051257, "auxiliary_loss_mlp": 0.01030028, "balance_loss_clip": 1.0089643, "balance_loss_mlp": 1.01596081, "epoch": 0.9949797084022246, "flos": 18514475303040.0, "grad_norm": 4.988725612805417, "language_loss": 0.7625221, "learning_rate": 2.6122588442012427e-10, "loss": 0.78333497, "num_input_tokens_seen": 357176705, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.35351562, "step": 16549, "time_per_iteration": 2.3468565940856934 }, { "auxiliary_loss_clip": 0.01054454, "auxiliary_loss_mlp": 0.01035533, "balance_loss_clip": 1.01053452, "balance_loss_mlp": 1.0167799, "epoch": 0.9950398316548925, "flos": 30406451495040.0, "grad_norm": 1.602124964142184, "language_loss": 0.75402999, "learning_rate": 2.5496933701241177e-10, "loss": 0.77492988, "num_input_tokens_seen": 357197630, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.375, "step": 16550, "time_per_iteration": 3.6663777828216553 }, { "auxiliary_loss_clip": 0.01050649, "auxiliary_loss_mlp": 0.01034455, "balance_loss_clip": 1.01187634, "balance_loss_mlp": 1.01520562, "epoch": 0.9950999549075605, "flos": 19899863896320.0, "grad_norm": 2.0206212902985956, "language_loss": 0.78524315, "learning_rate": 2.4878862013655297e-10, "loss": 0.80609417, "num_input_tokens_seen": 357215445, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35546875, "step": 16551, "time_per_iteration": 2.3656516075134277 }, { "auxiliary_loss_clip": 0.01047735, "auxiliary_loss_mlp": 0.0103513, "balance_loss_clip": 1.01584244, "balance_loss_mlp": 1.01515305, "epoch": 0.9951600781602284, "flos": 17602578835200.0, "grad_norm": 1.441477904336404, "language_loss": 0.67355371, "learning_rate": 2.426837340270271e-10, "loss": 0.69438243, "num_input_tokens_seen": 357234285, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.32617188, "step": 16552, "time_per_iteration": 2.3832452297210693 }, { "auxiliary_loss_clip": 0.01051558, "auxiliary_loss_mlp": 0.01034537, "balance_loss_clip": 1.01158929, "balance_loss_mlp": 1.01596928, "epoch": 0.9952202014128965, "flos": 28949421058560.0, "grad_norm": 1.4204943972233643, "language_loss": 0.81840599, "learning_rate": 2.3665467891520465e-10, "loss": 0.8392669, "num_input_tokens_seen": 357257565, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.35546875, "step": 16553, "time_per_iteration": 2.4452714920043945 }, { "auxiliary_loss_clip": 0.01006723, "auxiliary_loss_mlp": 0.01002789, "balance_loss_clip": 1.00102425, "balance_loss_mlp": 1.00049901, "epoch": 0.9952803246655644, "flos": 70806614716800.0, "grad_norm": 0.7244857306798538, "language_loss": 0.57359666, "learning_rate": 2.3070145503001348e-10, "loss": 0.59369177, "num_input_tokens_seen": 357320205, "router_z_loss_clip": 0.0177002, "router_z_loss_mlp": 0.06225586, "step": 16554, "time_per_iteration": 3.159752130508423 }, { "auxiliary_loss_clip": 0.01051623, "auxiliary_loss_mlp": 0.01035591, "balance_loss_clip": 1.01381123, "balance_loss_mlp": 1.0157423, "epoch": 0.9953404479182324, "flos": 21797986849920.0, "grad_norm": 1.5055956709779863, "language_loss": 0.7808187, "learning_rate": 2.24824062597051e-10, "loss": 0.80169082, "num_input_tokens_seen": 357340695, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.359375, "step": 16555, "time_per_iteration": 2.4009222984313965 }, { "auxiliary_loss_clip": 0.01051039, "auxiliary_loss_mlp": 0.01041698, "balance_loss_clip": 1.01739168, "balance_loss_mlp": 1.01543474, "epoch": 0.9954005711709003, "flos": 21936522591360.0, "grad_norm": 2.141277241643714, "language_loss": 0.86823559, "learning_rate": 2.1902250183902793e-10, "loss": 0.88916296, "num_input_tokens_seen": 357357505, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.35546875, "step": 16556, "time_per_iteration": 2.369748592376709 }, { "auxiliary_loss_clip": 0.01050415, "auxiliary_loss_mlp": 0.01036283, "balance_loss_clip": 1.01341891, "balance_loss_mlp": 1.01619124, "epoch": 0.9954606944235683, "flos": 19353542941440.0, "grad_norm": 1.8794048945468074, "language_loss": 0.7420122, "learning_rate": 2.132967729762125e-10, "loss": 0.76287919, "num_input_tokens_seen": 357375395, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34179688, "step": 16557, "time_per_iteration": 3.8729825019836426 }, { "auxiliary_loss_clip": 0.0105052, "auxiliary_loss_mlp": 0.01037035, "balance_loss_clip": 1.01511228, "balance_loss_mlp": 1.01596987, "epoch": 0.9955208176762362, "flos": 30517300661760.0, "grad_norm": 1.8951657730807943, "language_loss": 0.77218401, "learning_rate": 2.0764687622554233e-10, "loss": 0.79305959, "num_input_tokens_seen": 357397375, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34570312, "step": 16558, "time_per_iteration": 2.435725450515747 }, { "auxiliary_loss_clip": 0.01051108, "auxiliary_loss_mlp": 0.01036492, "balance_loss_clip": 1.01315093, "balance_loss_mlp": 1.01517534, "epoch": 0.9955809409289043, "flos": 30006940273920.0, "grad_norm": 2.4634520635712107, "language_loss": 0.64392465, "learning_rate": 2.0207281180129044e-10, "loss": 0.66480064, "num_input_tokens_seen": 357418880, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.359375, "step": 16559, "time_per_iteration": 2.4451541900634766 }, { "auxiliary_loss_clip": 0.01051186, "auxiliary_loss_mlp": 0.01036954, "balance_loss_clip": 1.01408982, "balance_loss_mlp": 1.01681316, "epoch": 0.9956410641815723, "flos": 21542213162880.0, "grad_norm": 2.0867080274402583, "language_loss": 0.75309604, "learning_rate": 1.965745799148433e-10, "loss": 0.7739774, "num_input_tokens_seen": 357438310, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.34375, "step": 16560, "time_per_iteration": 2.3613672256469727 }, { "auxiliary_loss_clip": 0.010503, "auxiliary_loss_mlp": 0.0103538, "balance_loss_clip": 1.01334977, "balance_loss_mlp": 1.01600516, "epoch": 0.9957011874342402, "flos": 21688394492160.0, "grad_norm": 1.62969815390942, "language_loss": 0.79620671, "learning_rate": 1.9115218077470073e-10, "loss": 0.81706357, "num_input_tokens_seen": 357457155, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.34375, "step": 16561, "time_per_iteration": 2.3648927211761475 }, { "auxiliary_loss_clip": 0.01048872, "auxiliary_loss_mlp": 0.0103531, "balance_loss_clip": 1.01473427, "balance_loss_mlp": 1.01538503, "epoch": 0.9957613106869082, "flos": 17701977075840.0, "grad_norm": 2.5459361186105607, "language_loss": 0.66938078, "learning_rate": 1.8580561458647614e-10, "loss": 0.69022262, "num_input_tokens_seen": 357468060, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.33398438, "step": 16562, "time_per_iteration": 3.6739256381988525 }, { "auxiliary_loss_clip": 0.01052778, "auxiliary_loss_mlp": 0.01039769, "balance_loss_clip": 1.01500964, "balance_loss_mlp": 1.01577938, "epoch": 0.9958214339395761, "flos": 30554657683200.0, "grad_norm": 2.0097908343019086, "language_loss": 0.65354985, "learning_rate": 1.805348815528962e-10, "loss": 0.67447531, "num_input_tokens_seen": 357489665, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.37109375, "step": 16563, "time_per_iteration": 2.440884590148926 }, { "auxiliary_loss_clip": 0.01050573, "auxiliary_loss_mlp": 0.01031223, "balance_loss_clip": 1.00822711, "balance_loss_mlp": 1.01598966, "epoch": 0.9958815571922441, "flos": 24168065829120.0, "grad_norm": 2.557924873161957, "language_loss": 0.65371037, "learning_rate": 1.7533998187380105e-10, "loss": 0.67452836, "num_input_tokens_seen": 357511975, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.34570312, "step": 16564, "time_per_iteration": 2.414936065673828 }, { "auxiliary_loss_clip": 0.01051241, "auxiliary_loss_mlp": 0.01036013, "balance_loss_clip": 1.01244569, "balance_loss_mlp": 1.01660132, "epoch": 0.995941680444912, "flos": 15486109038720.0, "grad_norm": 1.8260159806340535, "language_loss": 0.74973392, "learning_rate": 1.7022091574636633e-10, "loss": 0.77060646, "num_input_tokens_seen": 357529345, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.34570312, "step": 16565, "time_per_iteration": 2.3570871353149414 }, { "auxiliary_loss_clip": 0.01051366, "auxiliary_loss_mlp": 0.01034766, "balance_loss_clip": 1.01240253, "balance_loss_mlp": 1.01590562, "epoch": 0.9960018036975801, "flos": 18620332145280.0, "grad_norm": 3.2311357440848334, "language_loss": 0.80504149, "learning_rate": 1.6517768336443694e-10, "loss": 0.82590282, "num_input_tokens_seen": 357547615, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.35546875, "step": 16566, "time_per_iteration": 2.3560965061187744 }, { "auxiliary_loss_clip": 0.01049053, "auxiliary_loss_mlp": 0.01037398, "balance_loss_clip": 1.01567817, "balance_loss_mlp": 1.0147717, "epoch": 0.996061926950248, "flos": 20083088044800.0, "grad_norm": 1.6823427014517887, "language_loss": 0.72081131, "learning_rate": 1.6021028491941535e-10, "loss": 0.74167585, "num_input_tokens_seen": 357567380, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.34179688, "step": 16567, "time_per_iteration": 2.370073080062866 }, { "auxiliary_loss_clip": 0.01053348, "auxiliary_loss_mlp": 0.01038162, "balance_loss_clip": 1.01300895, "balance_loss_mlp": 1.01682663, "epoch": 0.996122050202916, "flos": 24346821323520.0, "grad_norm": 2.084244954583539, "language_loss": 0.80799186, "learning_rate": 1.5531872059959538e-10, "loss": 0.82890701, "num_input_tokens_seen": 357586435, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.36523438, "step": 16568, "time_per_iteration": 2.37907338142395 }, { "auxiliary_loss_clip": 0.01049349, "auxiliary_loss_mlp": 0.01030307, "balance_loss_clip": 1.01019669, "balance_loss_mlp": 1.01588547, "epoch": 0.9961821734555839, "flos": 24198370755840.0, "grad_norm": 1.67551543137921, "language_loss": 0.82949042, "learning_rate": 1.5050299059060634e-10, "loss": 0.85028696, "num_input_tokens_seen": 357604720, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.33398438, "step": 16569, "time_per_iteration": 2.406996250152588 }, { "auxiliary_loss_clip": 0.01050595, "auxiliary_loss_mlp": 0.01036868, "balance_loss_clip": 1.0155654, "balance_loss_mlp": 1.01628983, "epoch": 0.9962422967082519, "flos": 22632760391040.0, "grad_norm": 2.064249725054532, "language_loss": 0.7116192, "learning_rate": 1.457630950747468e-10, "loss": 0.73249388, "num_input_tokens_seen": 357622345, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.34375, "step": 16570, "time_per_iteration": 3.8101067543029785 }, { "auxiliary_loss_clip": 0.01050699, "auxiliary_loss_mlp": 0.01034187, "balance_loss_clip": 1.01125073, "balance_loss_mlp": 1.01542306, "epoch": 0.9963024199609198, "flos": 26394826210560.0, "grad_norm": 1.6069440295952848, "language_loss": 0.76398087, "learning_rate": 1.4109903423209502e-10, "loss": 0.78482974, "num_input_tokens_seen": 357642710, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.3515625, "step": 16571, "time_per_iteration": 2.416417360305786 }, { "auxiliary_loss_clip": 0.01051234, "auxiliary_loss_mlp": 0.01038029, "balance_loss_clip": 1.01363909, "balance_loss_mlp": 1.01547384, "epoch": 0.9963625432135879, "flos": 16580601250560.0, "grad_norm": 1.8231272132076841, "language_loss": 0.81244022, "learning_rate": 1.3651080823939843e-10, "loss": 0.83333278, "num_input_tokens_seen": 357659870, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.35742188, "step": 16572, "time_per_iteration": 2.3412251472473145 }, { "auxiliary_loss_clip": 0.0105151, "auxiliary_loss_mlp": 0.01036631, "balance_loss_clip": 1.01468444, "balance_loss_mlp": 1.01622939, "epoch": 0.9964226664662559, "flos": 26467340837760.0, "grad_norm": 1.76280921311369, "language_loss": 0.71461475, "learning_rate": 1.3199841727074e-10, "loss": 0.7354961, "num_input_tokens_seen": 357677075, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.35351562, "step": 16573, "time_per_iteration": 2.4155402183532715 }, { "auxiliary_loss_clip": 0.01053322, "auxiliary_loss_mlp": 0.01041315, "balance_loss_clip": 1.01567352, "balance_loss_mlp": 1.01603794, "epoch": 0.9964827897189238, "flos": 27447248367360.0, "grad_norm": 1.7050903892801517, "language_loss": 0.64442003, "learning_rate": 1.275618614968721e-10, "loss": 0.66536641, "num_input_tokens_seen": 357696715, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.375, "step": 16574, "time_per_iteration": 2.4193148612976074 }, { "auxiliary_loss_clip": 0.01054076, "auxiliary_loss_mlp": 0.01039999, "balance_loss_clip": 1.01491761, "balance_loss_mlp": 1.01613212, "epoch": 0.9965429129715918, "flos": 11720971019520.0, "grad_norm": 2.2581131711859244, "language_loss": 0.78038186, "learning_rate": 1.2320114108654856e-10, "loss": 0.80132252, "num_input_tokens_seen": 357712345, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.37890625, "step": 16575, "time_per_iteration": 2.3530516624450684 }, { "auxiliary_loss_clip": 0.01051399, "auxiliary_loss_mlp": 0.01038792, "balance_loss_clip": 1.01566517, "balance_loss_mlp": 1.01540589, "epoch": 0.9966030362242597, "flos": 19754974287360.0, "grad_norm": 2.21943445749183, "language_loss": 0.71169198, "learning_rate": 1.1891625620474855e-10, "loss": 0.73259389, "num_input_tokens_seen": 357731815, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.359375, "step": 16576, "time_per_iteration": 2.3688220977783203 }, { "auxiliary_loss_clip": 0.01049997, "auxiliary_loss_mlp": 0.01034517, "balance_loss_clip": 1.01190281, "balance_loss_mlp": 1.01579452, "epoch": 0.9966631594769277, "flos": 23914037710080.0, "grad_norm": 1.573029299593729, "language_loss": 0.72724378, "learning_rate": 1.1470720701400871e-10, "loss": 0.7480889, "num_input_tokens_seen": 357751640, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.34179688, "step": 16577, "time_per_iteration": 2.401250123977661 }, { "auxiliary_loss_clip": 0.01051835, "auxiliary_loss_mlp": 0.01034845, "balance_loss_clip": 1.01204014, "balance_loss_mlp": 1.01564169, "epoch": 0.9967232827295956, "flos": 15558693488640.0, "grad_norm": 1.9050211130784744, "language_loss": 0.7937783, "learning_rate": 1.1057399367397912e-10, "loss": 0.81464505, "num_input_tokens_seen": 357769850, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.36132812, "step": 16578, "time_per_iteration": 2.3328428268432617 }, { "auxiliary_loss_clip": 0.01052065, "auxiliary_loss_mlp": 0.01036829, "balance_loss_clip": 1.01301122, "balance_loss_mlp": 1.0165, "epoch": 0.9967834059822637, "flos": 20812004743680.0, "grad_norm": 2.1849923030623613, "language_loss": 0.77196419, "learning_rate": 1.0651661634142328e-10, "loss": 0.79285318, "num_input_tokens_seen": 357789550, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.35546875, "step": 16579, "time_per_iteration": 2.380669355392456 }, { "auxiliary_loss_clip": 0.0105391, "auxiliary_loss_mlp": 0.01037933, "balance_loss_clip": 1.01244652, "balance_loss_mlp": 1.01685953, "epoch": 0.9968435292349316, "flos": 36717805635840.0, "grad_norm": 3.057740186128159, "language_loss": 0.70457727, "learning_rate": 1.0253507516999604e-10, "loss": 0.7254957, "num_input_tokens_seen": 357809525, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.37109375, "step": 16580, "time_per_iteration": 2.487881660461426 }, { "auxiliary_loss_clip": 0.01051383, "auxiliary_loss_mlp": 0.0103976, "balance_loss_clip": 1.01711011, "balance_loss_mlp": 1.01594841, "epoch": 0.9969036524875996, "flos": 26759703496320.0, "grad_norm": 1.8887586160955068, "language_loss": 0.81067926, "learning_rate": 9.862937031113184e-11, "loss": 0.83159065, "num_input_tokens_seen": 357829795, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.35546875, "step": 16581, "time_per_iteration": 2.4047374725341797 }, { "auxiliary_loss_clip": 0.01049719, "auxiliary_loss_mlp": 0.01029885, "balance_loss_clip": 1.00889266, "balance_loss_mlp": 1.01550937, "epoch": 0.9969637757402675, "flos": 24826073823360.0, "grad_norm": 1.6892049916756566, "language_loss": 0.81922716, "learning_rate": 9.479950191249031e-11, "loss": 0.84002316, "num_input_tokens_seen": 357851655, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.34179688, "step": 16582, "time_per_iteration": 2.4090566635131836 }, { "auxiliary_loss_clip": 0.01049586, "auxiliary_loss_mlp": 0.01036287, "balance_loss_clip": 1.0143528, "balance_loss_mlp": 1.0152775, "epoch": 0.9970238989929355, "flos": 23037647961600.0, "grad_norm": 1.7155947294782368, "language_loss": 0.61321664, "learning_rate": 9.104547011951069e-11, "loss": 0.6340754, "num_input_tokens_seen": 357871205, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.34375, "step": 16583, "time_per_iteration": 2.3857479095458984 }, { "auxiliary_loss_clip": 0.01051965, "auxiliary_loss_mlp": 0.01040658, "balance_loss_clip": 1.01769781, "balance_loss_mlp": 1.01640689, "epoch": 0.9970840222456034, "flos": 25297715646720.0, "grad_norm": 1.6721892728611192, "language_loss": 0.79229361, "learning_rate": 8.736727507452357e-11, "loss": 0.81321979, "num_input_tokens_seen": 357892145, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.35546875, "step": 16584, "time_per_iteration": 2.426241159439087 }, { "auxiliary_loss_clip": 0.01048554, "auxiliary_loss_mlp": 0.01037375, "balance_loss_clip": 1.01616728, "balance_loss_mlp": 1.01526308, "epoch": 0.9971441454982715, "flos": 21614553233280.0, "grad_norm": 1.6406007244497745, "language_loss": 0.70220906, "learning_rate": 8.376491691697297e-11, "loss": 0.7230683, "num_input_tokens_seen": 357911205, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.33203125, "step": 16585, "time_per_iteration": 2.385852575302124 }, { "auxiliary_loss_clip": 0.01050716, "auxiliary_loss_mlp": 0.01039601, "balance_loss_clip": 1.01606929, "balance_loss_mlp": 1.0156883, "epoch": 0.9972042687509394, "flos": 14974736221440.0, "grad_norm": 2.981933283960767, "language_loss": 0.8215698, "learning_rate": 8.023839578363834e-11, "loss": 0.84247303, "num_input_tokens_seen": 357928190, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.34960938, "step": 16586, "time_per_iteration": 2.342684507369995 }, { "auxiliary_loss_clip": 0.01051681, "auxiliary_loss_mlp": 0.01034718, "balance_loss_clip": 1.01157904, "balance_loss_mlp": 1.01495337, "epoch": 0.9972643920036074, "flos": 25805632239360.0, "grad_norm": 1.7308406418858153, "language_loss": 0.78701735, "learning_rate": 7.678771180796851e-11, "loss": 0.80788136, "num_input_tokens_seen": 357946985, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.3671875, "step": 16587, "time_per_iteration": 2.4000422954559326 }, { "auxiliary_loss_clip": 0.01055086, "auxiliary_loss_mlp": 0.01039194, "balance_loss_clip": 1.01711631, "balance_loss_mlp": 1.01769638, "epoch": 0.9973245152562754, "flos": 23325262675200.0, "grad_norm": 1.8544809896588617, "language_loss": 0.7361927, "learning_rate": 7.341286512074773e-11, "loss": 0.75713551, "num_input_tokens_seen": 357966720, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.375, "step": 16588, "time_per_iteration": 2.3835129737854004 }, { "auxiliary_loss_clip": 0.01053534, "auxiliary_loss_mlp": 0.01035315, "balance_loss_clip": 1.01198614, "balance_loss_mlp": 1.01553798, "epoch": 0.9973846385089433, "flos": 12165415027200.0, "grad_norm": 2.625649474134828, "language_loss": 0.84007698, "learning_rate": 7.011385585031781e-11, "loss": 0.86096543, "num_input_tokens_seen": 357981375, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.37890625, "step": 16589, "time_per_iteration": 3.5893871784210205 }, { "auxiliary_loss_clip": 0.0105344, "auxiliary_loss_mlp": 0.01046173, "balance_loss_clip": 1.01951754, "balance_loss_mlp": 1.01655829, "epoch": 0.9974447617616113, "flos": 20044194923520.0, "grad_norm": 2.4441203599462313, "language_loss": 0.72375965, "learning_rate": 6.689068412168986e-11, "loss": 0.74475574, "num_input_tokens_seen": 358000290, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.36914062, "step": 16590, "time_per_iteration": 2.3636531829833984 }, { "auxiliary_loss_clip": 0.0105325, "auxiliary_loss_mlp": 0.01039692, "balance_loss_clip": 1.01598096, "balance_loss_mlp": 1.01614296, "epoch": 0.9975048850142793, "flos": 32013259130880.0, "grad_norm": 2.1832074471126326, "language_loss": 0.65079105, "learning_rate": 6.374335005676634e-11, "loss": 0.67172045, "num_input_tokens_seen": 358022075, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.37109375, "step": 16591, "time_per_iteration": 2.4642603397369385 }, { "auxiliary_loss_clip": 0.01050845, "auxiliary_loss_mlp": 0.010357, "balance_loss_clip": 1.0136466, "balance_loss_mlp": 1.01545966, "epoch": 0.9975650082669473, "flos": 36932416963200.0, "grad_norm": 1.6249904975664007, "language_loss": 0.74463296, "learning_rate": 6.067185377522933e-11, "loss": 0.7654984, "num_input_tokens_seen": 358043940, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.35351562, "step": 16592, "time_per_iteration": 2.482407331466675 }, { "auxiliary_loss_clip": 0.01050862, "auxiliary_loss_mlp": 0.01036549, "balance_loss_clip": 1.01432872, "balance_loss_mlp": 1.01500273, "epoch": 0.9976251315196152, "flos": 16471183449600.0, "grad_norm": 1.6452748713975907, "language_loss": 0.85781348, "learning_rate": 5.767619539343016e-11, "loss": 0.87868762, "num_input_tokens_seen": 358062720, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.359375, "step": 16593, "time_per_iteration": 2.370751142501831 }, { "auxiliary_loss_clip": 0.01048283, "auxiliary_loss_mlp": 0.01031121, "balance_loss_clip": 1.01060545, "balance_loss_mlp": 1.01561582, "epoch": 0.9976852547722832, "flos": 19645800865920.0, "grad_norm": 1.7692448035259756, "language_loss": 0.70439672, "learning_rate": 5.4756375024833656e-11, "loss": 0.72519076, "num_input_tokens_seen": 358081560, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.328125, "step": 16594, "time_per_iteration": 2.3823812007904053 }, { "auxiliary_loss_clip": 0.01052133, "auxiliary_loss_mlp": 0.01037482, "balance_loss_clip": 1.01503527, "balance_loss_mlp": 1.01567554, "epoch": 0.9977453780249511, "flos": 20447406748800.0, "grad_norm": 2.2219170678976927, "language_loss": 0.74281514, "learning_rate": 5.1912392780462113e-11, "loss": 0.76371133, "num_input_tokens_seen": 358099065, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.36523438, "step": 16595, "time_per_iteration": 2.3725314140319824 }, { "auxiliary_loss_clip": 0.01006767, "auxiliary_loss_mlp": 0.01003707, "balance_loss_clip": 1.00170457, "balance_loss_mlp": 1.00055349, "epoch": 0.9978055012776191, "flos": 65452266253440.0, "grad_norm": 0.7895314572637466, "language_loss": 0.60387748, "learning_rate": 4.9144248768007156e-11, "loss": 0.62398219, "num_input_tokens_seen": 358156095, "router_z_loss_clip": 0.02001953, "router_z_loss_mlp": 0.06201172, "step": 16596, "time_per_iteration": 2.887441873550415 }, { "auxiliary_loss_clip": 0.01051879, "auxiliary_loss_mlp": 0.01041194, "balance_loss_clip": 1.01775753, "balance_loss_mlp": 1.01624358, "epoch": 0.997865624530287, "flos": 20630456340480.0, "grad_norm": 1.8729761914053906, "language_loss": 0.78548157, "learning_rate": 4.645194309227385e-11, "loss": 0.80641234, "num_input_tokens_seen": 358175230, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.35546875, "step": 16597, "time_per_iteration": 3.82503080368042 }, { "auxiliary_loss_clip": 0.01053683, "auxiliary_loss_mlp": 0.01042926, "balance_loss_clip": 1.01906085, "balance_loss_mlp": 1.01612425, "epoch": 0.9979257477829551, "flos": 29385835453440.0, "grad_norm": 1.6980918599394474, "language_loss": 0.83088642, "learning_rate": 4.383547585562475e-11, "loss": 0.85185254, "num_input_tokens_seen": 358197075, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.375, "step": 16598, "time_per_iteration": 2.4146275520324707 }, { "auxiliary_loss_clip": 0.01053124, "auxiliary_loss_mlp": 0.01038764, "balance_loss_clip": 1.01249039, "balance_loss_mlp": 1.01565742, "epoch": 0.997985871035623, "flos": 22634052111360.0, "grad_norm": 2.3621204448696114, "language_loss": 0.65646046, "learning_rate": 4.129484715709175e-11, "loss": 0.67737937, "num_input_tokens_seen": 358215925, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.375, "step": 16599, "time_per_iteration": 2.399435043334961 }, { "auxiliary_loss_clip": 0.01006765, "auxiliary_loss_mlp": 0.01003489, "balance_loss_clip": 1.00153399, "balance_loss_mlp": 1.00048113, "epoch": 0.998045994288291, "flos": 61804226534400.0, "grad_norm": 0.8540147572191198, "language_loss": 0.62388647, "learning_rate": 3.8830057093264256e-11, "loss": 0.64398903, "num_input_tokens_seen": 358269035, "router_z_loss_clip": 0.01953125, "router_z_loss_mlp": 0.06298828, "step": 16600, "time_per_iteration": 2.9344897270202637 }, { "auxiliary_loss_clip": 0.01051295, "auxiliary_loss_mlp": 0.01037152, "balance_loss_clip": 1.01606369, "balance_loss_mlp": 1.01672804, "epoch": 0.998106117540959, "flos": 19244509165440.0, "grad_norm": 1.5785463759568659, "language_loss": 0.79639083, "learning_rate": 3.644110575717896e-11, "loss": 0.81727529, "num_input_tokens_seen": 358287680, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.34570312, "step": 16601, "time_per_iteration": 3.6506690979003906 }, { "auxiliary_loss_clip": 0.01052655, "auxiliary_loss_mlp": 0.0103887, "balance_loss_clip": 1.01579106, "balance_loss_mlp": 1.01535285, "epoch": 0.9981662407936269, "flos": 21105135452160.0, "grad_norm": 2.7323943294607167, "language_loss": 0.83518046, "learning_rate": 3.412799323987414e-11, "loss": 0.85609567, "num_input_tokens_seen": 358304080, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.37304688, "step": 16602, "time_per_iteration": 2.36869215965271 }, { "auxiliary_loss_clip": 0.01053094, "auxiliary_loss_mlp": 0.0104078, "balance_loss_clip": 1.01764202, "balance_loss_mlp": 1.01698816, "epoch": 0.998226364046295, "flos": 24315678524160.0, "grad_norm": 2.148807205422047, "language_loss": 0.64074594, "learning_rate": 3.189071962883538e-11, "loss": 0.66168469, "num_input_tokens_seen": 358323670, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.36132812, "step": 16603, "time_per_iteration": 2.422283411026001 }, { "auxiliary_loss_clip": 0.01051161, "auxiliary_loss_mlp": 0.01034766, "balance_loss_clip": 1.00999415, "balance_loss_mlp": 1.01442075, "epoch": 0.9982864872989629, "flos": 23835413594880.0, "grad_norm": 1.7758104468031144, "language_loss": 0.71897662, "learning_rate": 2.972928500866168e-11, "loss": 0.73983592, "num_input_tokens_seen": 358341980, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.3671875, "step": 16604, "time_per_iteration": 2.4389402866363525 }, { "auxiliary_loss_clip": 0.01049571, "auxiliary_loss_mlp": 0.01033296, "balance_loss_clip": 1.01082516, "balance_loss_mlp": 1.01469016, "epoch": 0.9983466105516309, "flos": 18332123938560.0, "grad_norm": 1.60166590914381, "language_loss": 0.65852296, "learning_rate": 2.7643689461953613e-11, "loss": 0.67935163, "num_input_tokens_seen": 358360400, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.34960938, "step": 16605, "time_per_iteration": 2.3867013454437256 }, { "auxiliary_loss_clip": 0.0105063, "auxiliary_loss_mlp": 0.01032538, "balance_loss_clip": 1.01119995, "balance_loss_mlp": 1.01707256, "epoch": 0.9984067338042988, "flos": 17235851247360.0, "grad_norm": 1.8891311316556585, "language_loss": 0.71784663, "learning_rate": 2.5633933067092938e-11, "loss": 0.73867834, "num_input_tokens_seen": 358378990, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.3359375, "step": 16606, "time_per_iteration": 2.3932013511657715 }, { "auxiliary_loss_clip": 0.01052363, "auxiliary_loss_mlp": 0.01036157, "balance_loss_clip": 1.01388836, "balance_loss_mlp": 1.01659214, "epoch": 0.9984668570569668, "flos": 20666835843840.0, "grad_norm": 1.9181256287367567, "language_loss": 0.83503181, "learning_rate": 2.370001590090709e-11, "loss": 0.85591704, "num_input_tokens_seen": 358395970, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.35742188, "step": 16607, "time_per_iteration": 2.3787946701049805 }, { "auxiliary_loss_clip": 0.01051762, "auxiliary_loss_mlp": 0.01038196, "balance_loss_clip": 1.01409149, "balance_loss_mlp": 1.01468182, "epoch": 0.9985269803096347, "flos": 30261247683840.0, "grad_norm": 1.740544355641763, "language_loss": 0.67780882, "learning_rate": 2.184193803622669e-11, "loss": 0.69870836, "num_input_tokens_seen": 358417355, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.37109375, "step": 16608, "time_per_iteration": 2.4853529930114746 }, { "auxiliary_loss_clip": 0.01052376, "auxiliary_loss_mlp": 0.0103382, "balance_loss_clip": 1.01106322, "balance_loss_mlp": 1.01666641, "epoch": 0.9985871035623027, "flos": 10560213313920.0, "grad_norm": 3.001978947404949, "language_loss": 0.81982499, "learning_rate": 2.0059699543883978e-11, "loss": 0.84068692, "num_input_tokens_seen": 358434345, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35742188, "step": 16609, "time_per_iteration": 3.8795828819274902 }, { "auxiliary_loss_clip": 0.0105028, "auxiliary_loss_mlp": 0.01049954, "balance_loss_clip": 1.02661312, "balance_loss_mlp": 1.01489878, "epoch": 0.9986472268149706, "flos": 16872510061440.0, "grad_norm": 1.6257287334137476, "language_loss": 0.6403774, "learning_rate": 1.8353300491158462e-11, "loss": 0.66137981, "num_input_tokens_seen": 358452870, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.35351562, "step": 16610, "time_per_iteration": 2.3731353282928467 }, { "auxiliary_loss_clip": 0.01050697, "auxiliary_loss_mlp": 0.01036106, "balance_loss_clip": 1.01456451, "balance_loss_mlp": 1.01536846, "epoch": 0.9987073500676387, "flos": 22053446334720.0, "grad_norm": 3.9446694888340867, "language_loss": 0.68684506, "learning_rate": 1.672274094288717e-11, "loss": 0.70771313, "num_input_tokens_seen": 358472210, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.35351562, "step": 16611, "time_per_iteration": 2.3932766914367676 }, { "auxiliary_loss_clip": 0.01051885, "auxiliary_loss_mlp": 0.01039526, "balance_loss_clip": 1.01412213, "balance_loss_mlp": 1.01600552, "epoch": 0.9987674733203066, "flos": 30481549562880.0, "grad_norm": 1.5134086601207857, "language_loss": 0.70626664, "learning_rate": 1.5168020961020544e-11, "loss": 0.72718072, "num_input_tokens_seen": 358493840, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.359375, "step": 16612, "time_per_iteration": 2.4830305576324463 }, { "auxiliary_loss_clip": 0.01048354, "auxiliary_loss_mlp": 0.01037742, "balance_loss_clip": 1.0156045, "balance_loss_mlp": 1.01522839, "epoch": 0.9988275965729746, "flos": 27744009857280.0, "grad_norm": 1.5752497637223863, "language_loss": 0.75244451, "learning_rate": 1.3689140604400407e-11, "loss": 0.77330542, "num_input_tokens_seen": 358515060, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.33203125, "step": 16613, "time_per_iteration": 2.446263074874878 }, { "auxiliary_loss_clip": 0.01052605, "auxiliary_loss_mlp": 0.01039184, "balance_loss_clip": 1.0143764, "balance_loss_mlp": 1.01666212, "epoch": 0.9988877198256426, "flos": 17523396138240.0, "grad_norm": 2.138552975294756, "language_loss": 0.74666679, "learning_rate": 1.2286099928981996e-11, "loss": 0.76758468, "num_input_tokens_seen": 358528200, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.359375, "step": 16614, "time_per_iteration": 2.3415441513061523 }, { "auxiliary_loss_clip": 0.01051252, "auxiliary_loss_mlp": 0.01037219, "balance_loss_clip": 1.01466441, "balance_loss_mlp": 1.01571035, "epoch": 0.9989478430783105, "flos": 20995438360320.0, "grad_norm": 1.605833188603838, "language_loss": 0.73889935, "learning_rate": 1.0958898988278065e-11, "loss": 0.75978398, "num_input_tokens_seen": 358548360, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.35546875, "step": 16615, "time_per_iteration": 2.389955520629883 }, { "auxiliary_loss_clip": 0.01053131, "auxiliary_loss_mlp": 0.010398, "balance_loss_clip": 1.01703095, "balance_loss_mlp": 1.01675344, "epoch": 0.9990079663309785, "flos": 13369778887680.0, "grad_norm": 2.1459221654044325, "language_loss": 0.7934528, "learning_rate": 9.70753783247069e-12, "loss": 0.81438208, "num_input_tokens_seen": 358566270, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.36328125, "step": 16616, "time_per_iteration": 2.3717541694641113 }, { "auxiliary_loss_clip": 0.01051345, "auxiliary_loss_mlp": 0.01035374, "balance_loss_clip": 1.01318955, "balance_loss_mlp": 1.0167439, "epoch": 0.9990680895836465, "flos": 17309308481280.0, "grad_norm": 2.46601451547681, "language_loss": 0.83377492, "learning_rate": 8.532016508855378e-12, "loss": 0.85464215, "num_input_tokens_seen": 358584710, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.34570312, "step": 16617, "time_per_iteration": 2.3321564197540283 }, { "auxiliary_loss_clip": 0.01049965, "auxiliary_loss_mlp": 0.01031991, "balance_loss_clip": 1.00888789, "balance_loss_mlp": 1.01516509, "epoch": 0.9991282128363145, "flos": 24206819304960.0, "grad_norm": 1.5024403267944428, "language_loss": 0.79315591, "learning_rate": 7.43233506206309e-12, "loss": 0.81397545, "num_input_tokens_seen": 358606750, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.34765625, "step": 16618, "time_per_iteration": 2.4143178462982178 }, { "auxiliary_loss_clip": 0.01050796, "auxiliary_loss_mlp": 0.01038965, "balance_loss_clip": 1.01604092, "balance_loss_mlp": 1.01518524, "epoch": 0.9991883360889824, "flos": 21174333500160.0, "grad_norm": 1.6461092931747305, "language_loss": 0.75733626, "learning_rate": 6.408493534060255e-12, "loss": 0.77823389, "num_input_tokens_seen": 358624675, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.35546875, "step": 16619, "time_per_iteration": 2.356168508529663 }, { "auxiliary_loss_clip": 0.01048561, "auxiliary_loss_mlp": 0.01033843, "balance_loss_clip": 1.01345778, "balance_loss_mlp": 1.01525736, "epoch": 0.9992484593416504, "flos": 19900143187200.0, "grad_norm": 2.5299689132960475, "language_loss": 0.87344885, "learning_rate": 5.460491963260594e-12, "loss": 0.89427286, "num_input_tokens_seen": 358640715, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.33203125, "step": 16620, "time_per_iteration": 2.3617000579833984 }, { "auxiliary_loss_clip": 0.01048509, "auxiliary_loss_mlp": 0.01032352, "balance_loss_clip": 1.01145434, "balance_loss_mlp": 1.0148344, "epoch": 0.9993085825943183, "flos": 24856832597760.0, "grad_norm": 2.303121441465776, "language_loss": 0.73033917, "learning_rate": 4.58833038607942e-12, "loss": 0.75114775, "num_input_tokens_seen": 358659630, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.3359375, "step": 16621, "time_per_iteration": 2.3869481086730957 }, { "auxiliary_loss_clip": 0.01006963, "auxiliary_loss_mlp": 0.01003176, "balance_loss_clip": 1.00126839, "balance_loss_mlp": 1.00072789, "epoch": 0.9993687058469863, "flos": 71280700335360.0, "grad_norm": 0.7382641114313386, "language_loss": 0.56558859, "learning_rate": 3.79200883515729e-12, "loss": 0.5856899, "num_input_tokens_seen": 358727840, "router_z_loss_clip": 0.01904297, "router_z_loss_mlp": 0.0625, "step": 16622, "time_per_iteration": 3.2371137142181396 }, { "auxiliary_loss_clip": 0.01051269, "auxiliary_loss_mlp": 0.01033005, "balance_loss_clip": 1.00979447, "balance_loss_mlp": 1.01572907, "epoch": 0.9994288290996542, "flos": 12198861976320.0, "grad_norm": 1.7898881538013114, "language_loss": 0.72773027, "learning_rate": 3.071527340914315e-12, "loss": 0.74857301, "num_input_tokens_seen": 358744125, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.35546875, "step": 16623, "time_per_iteration": 2.559809684753418 }, { "auxiliary_loss_clip": 0.01051026, "auxiliary_loss_mlp": 0.01032792, "balance_loss_clip": 1.00904608, "balance_loss_mlp": 1.01600718, "epoch": 0.9994889523523223, "flos": 17889111296640.0, "grad_norm": 1.8956237333838624, "language_loss": 0.75629091, "learning_rate": 2.4268859304399368e-12, "loss": 0.77712911, "num_input_tokens_seen": 358761420, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.34960938, "step": 16624, "time_per_iteration": 2.4585185050964355 }, { "auxiliary_loss_clip": 0.01050857, "auxiliary_loss_mlp": 0.01033641, "balance_loss_clip": 1.01013291, "balance_loss_mlp": 1.01548636, "epoch": 0.9995490756049902, "flos": 26577666334080.0, "grad_norm": 1.6150226341336997, "language_loss": 0.74553597, "learning_rate": 1.8580846286031514e-12, "loss": 0.76638091, "num_input_tokens_seen": 358782600, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.35351562, "step": 16625, "time_per_iteration": 2.498415231704712 }, { "auxiliary_loss_clip": 0.01050294, "auxiliary_loss_mlp": 0.01033708, "balance_loss_clip": 1.01225019, "balance_loss_mlp": 1.01582503, "epoch": 0.9996091988576582, "flos": 22198964348160.0, "grad_norm": 2.1528677229734066, "language_loss": 0.77779073, "learning_rate": 1.3651234567202408e-12, "loss": 0.79863077, "num_input_tokens_seen": 358801220, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.34570312, "step": 16626, "time_per_iteration": 2.484957456588745 }, { "auxiliary_loss_clip": 0.01052587, "auxiliary_loss_mlp": 0.01038548, "balance_loss_clip": 1.01540959, "balance_loss_mlp": 1.01794553, "epoch": 0.9996693221103262, "flos": 27372185210880.0, "grad_norm": 1.9442722822712908, "language_loss": 0.8299917, "learning_rate": 9.480024334429515e-13, "loss": 0.85090303, "num_input_tokens_seen": 358819190, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.34765625, "step": 16627, "time_per_iteration": 2.4946837425231934 }, { "auxiliary_loss_clip": 0.01054537, "auxiliary_loss_mlp": 0.01036619, "balance_loss_clip": 1.01183534, "balance_loss_mlp": 1.01711273, "epoch": 0.9997294453629941, "flos": 26869191120000.0, "grad_norm": 1.8155562901067694, "language_loss": 0.71910471, "learning_rate": 6.067215747584952e-13, "loss": 0.74001628, "num_input_tokens_seen": 358839850, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.375, "step": 16628, "time_per_iteration": 2.5552148818969727 }, { "auxiliary_loss_clip": 0.01051624, "auxiliary_loss_mlp": 0.01037803, "balance_loss_clip": 1.01417601, "balance_loss_mlp": 1.01620984, "epoch": 0.9997895686156621, "flos": 23475877747200.0, "grad_norm": 1.5155299183913533, "language_loss": 0.76431108, "learning_rate": 3.4128089332341456e-13, "loss": 0.78520536, "num_input_tokens_seen": 358859805, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.35546875, "step": 16629, "time_per_iteration": 3.7055492401123047 }, { "auxiliary_loss_clip": 0.01053934, "auxiliary_loss_mlp": 0.01040972, "balance_loss_clip": 1.01703477, "balance_loss_mlp": 1.01638842, "epoch": 0.9998496918683301, "flos": 20224067581440.0, "grad_norm": 1.6683973943061003, "language_loss": 0.6142379, "learning_rate": 1.5168039935176126e-13, "loss": 0.63518697, "num_input_tokens_seen": 358877900, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.375, "step": 16630, "time_per_iteration": 2.4113969802856445 }, { "auxiliary_loss_clip": 0.01053071, "auxiliary_loss_mlp": 0.01039839, "balance_loss_clip": 1.01722455, "balance_loss_mlp": 1.01667798, "epoch": 0.9999098151209981, "flos": 21651840432000.0, "grad_norm": 2.0743731405423356, "language_loss": 0.61565042, "learning_rate": 3.792010017100722e-14, "loss": 0.63657951, "num_input_tokens_seen": 358897285, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.36523438, "step": 16631, "time_per_iteration": 2.3413243293762207 }, { "auxiliary_loss_clip": 0.01049699, "auxiliary_loss_mlp": 0.01036634, "balance_loss_clip": 1.01523566, "balance_loss_mlp": 1.01644707, "epoch": 0.999969938373666, "flos": 11543856359040.0, "grad_norm": 1.941040473940969, "language_loss": 0.73196745, "learning_rate": 0.0, "loss": 0.7528308, "num_input_tokens_seen": 358911570, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.33203125, "step": 16632, "time_per_iteration": 2.3182687759399414 } ], "logging_steps": 1.0, "max_steps": 16632, "num_input_tokens_seen": 358911570, "num_train_epochs": 1, "save_steps": 3328, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3991449643649597e+18, "train_batch_size": 5, "trial_name": null, "trial_params": null }